# DBpedia Data Extraction

In [1]:
import pandas as pd
import numpy as np
import json
from SPARQLWrapper import SPARQLWrapper, JSON

In [2]:
# Load the topics sheet (exported to a semicolon-separated CSV) whose entries
# will be looked up in DBpedia.
data = pd.read_csv(
    '../data_fundamentosdeanalisisdedatos/topicsDbpedia.csv',
    delimiter=';',
)

In [3]:
# Preview the first five rows of the loaded topics.
data.head(n=5)

Unnamed: 0,topic,topic_dbpedia,uri_dbpedia
0,Data science concepts,Data_science,<http://dbpedia.org/resource/Data_science>
1,Big data concepts,Big_data,<http://dbpedia.org/resource/Big_data>
2,Data mining concepts,Data_mining,<http://dbpedia.org/resource/Data_mining>
3,Machine learning algorithms,Machine_learning,<http://dbpedia.org/resource/Machine_learning>
4,Python,Python_(programming_language),<http://dbpedia.org/resource/Python_(programmi...


In [4]:
# Summarize the columns, their non-null counts and dtypes for the loaded frame.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 62 entries, 0 to 61
Data columns (total 3 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   topic          58 non-null     object
 1   topic_dbpedia  58 non-null     object
 2   uri_dbpedia    58 non-null     object
dtypes: object(3)
memory usage: 1.6+ KB


In [5]:
# (rows, columns) of the frame as loaded, before rows with nulls are dropped.
data.shape

(62, 3)

In [6]:
# Check for nulls introduced by the conversion to CSV (count per column).
data.isna().sum()

topic            4
topic_dbpedia    4
uri_dbpedia      4
dtype: int64

In [7]:
# Drop the rows that contain missing values and renumber the index 0..n-1.
# The column-wise pd.concat calls later in the notebook align rows by index
# label (not by position) against frames that carry a fresh 0..n-1 index, so
# leaving gaps in the labels here would attach descriptions/images to the wrong
# topics and introduce NaN rows.
data = data.dropna().reset_index(drop=True)

In [8]:
# Confirm that no missing values remain after dropna().
data.isna().sum()

topic            0
topic_dbpedia    0
uri_dbpedia      0
dtype: int64

In [9]:
# (rows, columns) after removing the rows with missing values.
data.shape

(58, 3)

In [10]:
# Create a SPARQL client pointed at the DBpedia query endpoint.
# NOTE(review): `sparql` is rebound to a new SPARQLWrapper in the query-loop cell
# further down before any query is sent, so this instance does not appear to be
# used — confirm and consider removing this cell.
sparql = SPARQLWrapper("http://dbpedia.org/sparql")

In [11]:
# Pull the DBpedia resource URIs (stored already wrapped in <...>) out of the
# frame, in row order, as a plain Python list.
uri = data.loc[:, 'uri_dbpedia'].to_list()

In [12]:
# Display the list of resource URIs that will be queried.
uri

['<http://dbpedia.org/resource/Data_science>',
 '<http://dbpedia.org/resource/Big_data>',
 '<http://dbpedia.org/resource/Data_mining>',
 '<http://dbpedia.org/resource/Machine_learning>',
 '<http://dbpedia.org/resource/Python_(programming_language)>',
 '<http://dbpedia.org/resource/R_(programming_language)>',
 '<http://dbpedia.org/resource/Project_Jupyter>',
 '<http://dbpedia.org/resource/Apache_Spark>',
 '<http://dbpedia.org/resource/Kaggle>',
 '<http://dbpedia.org/resource/GitHub>',
 '<http://dbpedia.org/resource/Data_extraction>',
 '<http://dbpedia.org/resource/Web_API>',
 '<http://dbpedia.org/resource/Scrapy>',
 '<http://dbpedia.org/resource/Data_storage>',
 '<http://dbpedia.org/resource/Data_quality>',
 '<http://dbpedia.org/resource/Data_management>',
 '<http://dbpedia.org/resource/Data_manipulation_language>',
 '<http://dbpedia.org/resource/Comma-separated_values>',
 '<http://dbpedia.org/resource/JSON>',
 '<http://dbpedia.org/resource/Text_file>',
 '<http://dbpedia.org/resource/Da

In [13]:
# URL of the DBpedia SPARQL endpoint that the query loop sends requests to.
endpoint_url = 'http://dbpedia.org/sparql'

In [14]:
# SPARQL template that fetches the English abstract and, when present, the
# thumbnail of one resource. Both %s placeholders receive the same resource URI.
query = "\n".join([
    "",
    "SELECT ?description ?image",
    "WHERE {",
    "  %s dbo:abstract ?description .",
    '  FILTER (langMatches(lang(?description), "en"))',
    "  OPTIONAL {",
    "    %s dbo:thumbnail ?image .",
    "  }",
    "}",
    "",
])

In [15]:
# Accumulators for the query results: one description and one image URL per resource.
descriptions, images = [], []

In [16]:
# Query DBpedia once per resource and collect its English abstract and thumbnail URL.
# After this cell, descriptions[i] and images[i] belong to uri[i]; an empty string
# marks a value that DBpedia did not return.

# Empty the accumulators first so that re-running this cell does not append a
# second copy of the results on top of a previous run.
descriptions.clear()
images.clear()

# The endpoint and the return format are the same for every request, so the client
# is configured once here instead of being rebuilt on each iteration.
sparql = SPARQLWrapper(endpoint_url)
sparql.setReturnFormat(JSON)

for u in uri:
    # Substitute the current resource into both placeholders of the template.
    formatted_query = query % (u, u)
    sparql.setQuery(formatted_query)

    # Run the query and parse the JSON response.
    results = sparql.query().convert()
    bindings = results["results"]["bindings"]

    if bindings:
        # Only the first result row is used.
        first = bindings[0]
        descriptions.append(first["description"]["value"])
        # ?image comes from an OPTIONAL clause, so the key may be missing from the row.
        images.append(first["image"]["value"] if "image" in first else '')
    else:
        # No English abstract for this resource: append placeholders to both lists
        # so they stay aligned with `uri`.
        descriptions.append('')
        images.append('')

In [17]:
# Print the collected descriptions (the image URLs are printed in the next cell).
print(descriptions)

['Data science is an interdisciplinary field that uses scientific methods, processes, algorithms and systems to extract or extrapolate knowledge and insights from noisy, structured and unstructured data, and apply knowledge from data across a broad range of application domains. Data science is related to data mining, machine learning and big data. Data science is a "concept to unify statistics, data analysis, informatics, and their related methods" in order to "understand and analyse actual phenomena" with data. It uses techniques and theories drawn from many fields within the context of mathematics, statistics, computer science, information science, and domain knowledge. However, data science is different from computer science and information science. Turing Award winner Jim Gray imagined data science as a "fourth paradigm" of science (empirical, theoretical, computational, and now data-driven) and asserted that "everything about science is changing because of the impact of informatio

In [18]:
# Print the collected thumbnail URLs; '' marks a resource for which none was returned.
print(images)

['http://commons.wikimedia.org/wiki/Special:FilePath/PIA23792-1600x1200(1).jpg?width=300', 'http://commons.wikimedia.org/wiki/Special:FilePath/Hilbert_InfoGrowth.png?width=300', 'http://commons.wikimedia.org/wiki/Special:FilePath/Spurious_correlations_-_spelling_bee_spiders.svg?width=300', 'http://commons.wikimedia.org/wiki/Special:FilePath/AI_hierarchy.svg?width=300', 'http://commons.wikimedia.org/wiki/Special:FilePath/Python-logo-notext.svg?width=300', 'http://commons.wikimedia.org/wiki/Special:FilePath/R_logo.svg?width=300', 'http://commons.wikimedia.org/wiki/Special:FilePath/Jupyter_logo.svg?width=300', 'http://commons.wikimedia.org/wiki/Special:FilePath/Apache_Spark_logo.svg?width=300', 'http://commons.wikimedia.org/wiki/Special:FilePath/Kaggle_logo.png?width=300', 'http://commons.wikimedia.org/wiki/Special:FilePath/GitHub_logo_2013.svg?width=300', '', '', 'http://commons.wikimedia.org/wiki/Special:FilePath/Scrapy_logo.jpg?width=300', 'http://commons.wikimedia.org/wiki/Special:Fil

In [19]:
# Number of descriptions collected (one is appended per queried URI).
len(descriptions)

58

In [20]:
# Number of image entries collected (one is appended per queried URI).
len(images)

58

In [21]:
# Wrap the descriptions list in a one-column DataFrame so it can be joined to `data`.
new_df = pd.DataFrame(descriptions, columns=['descriptions'])

In [22]:
# Attach the descriptions as a new column of `data`.
# pd.concat(axis=1) matches rows by index label, not by position. `data` can carry
# non-contiguous labels after dropna(), while new_df has a default 0..n-1 index,
# so `data` is renumbered first. The descriptions were collected in `data`'s row
# order, which makes positional alignment the correct one.
data = pd.concat([data.reset_index(drop=True), new_df], axis=1)

In [23]:
# Wrap the images list in a one-column DataFrame so it can be joined to `data`.
new_df2 = pd.DataFrame(images, columns=['images'])

In [24]:
# Append the images frame to `data` as an additional column (rows are matched
# by index label).
data = pd.concat([data, new_df2], axis='columns')

In [25]:
# Inspect the combined frame; 60 is above the 58 rows reported earlier, so
# presumably every row is shown.
data.head(n=60)

Unnamed: 0,topic,topic_dbpedia,uri_dbpedia,descriptions,images
0,Data science concepts,Data_science,<http://dbpedia.org/resource/Data_science>,Data science is an interdisciplinary field tha...,http://commons.wikimedia.org/wiki/Special:File...
1,Big data concepts,Big_data,<http://dbpedia.org/resource/Big_data>,Big data refers to data sets that are too larg...,http://commons.wikimedia.org/wiki/Special:File...
2,Data mining concepts,Data_mining,<http://dbpedia.org/resource/Data_mining>,Data mining is the process of extracting and d...,http://commons.wikimedia.org/wiki/Special:File...
3,Machine learning algorithms,Machine_learning,<http://dbpedia.org/resource/Machine_learning>,Machine learning (ML) is a field of inquiry de...,http://commons.wikimedia.org/wiki/Special:File...
4,Python,Python_(programming_language),<http://dbpedia.org/resource/Python_(programmi...,"Python is a high-level, general-purpose progra...",http://commons.wikimedia.org/wiki/Special:File...
5,R,R_(programming_language),<http://dbpedia.org/resource/R_(programming_la...,R is a programming language for statistical co...,http://commons.wikimedia.org/wiki/Special:File...
6,Jupyter notebook,Project_Jupyter,<http://dbpedia.org/resource/Project_Jupyter>,Project Jupyter (/ˈdʒuːpɪtər/) is a project wi...,http://commons.wikimedia.org/wiki/Special:File...
7,Apache spark,Apache_Spark,<http://dbpedia.org/resource/Apache_Spark>,Apache Spark is an open-source unified analyti...,http://commons.wikimedia.org/wiki/Special:File...
8,Kaggle,Kaggle,<http://dbpedia.org/resource/Kaggle>,"Kaggle, a subsidiary of Google LLC, is an onli...",http://commons.wikimedia.org/wiki/Special:File...
9,Github,GitHub,<http://dbpedia.org/resource/GitHub>,"GitHub, Inc., is an Internet hosting service f...",http://commons.wikimedia.org/wiki/Special:File...


In [28]:
# Write all the extracted metadata to a CSV file (with pandas' defaults the
# index is written as the first column).
data.to_csv(path_or_buf="api_dataExtraction/dbpedia.csv")