In [44]:
import pandas as pd
from SPARQLWrapper import SPARQLWrapper, JSON
from tqdm import tqdm

tqdm.pandas()

sparql = SPARQLWrapper("https://dbpedia.org/sparql")

In [150]:
ask_query = """
@prefix dbr:    <http://dbpedia.org/resource/> .

ASK  {{dbr:{} a owl:Thing. }}
"""

select_query1 = """
@prefix dbr:    <http://dbpedia.org/resource/> .


SELECT DISTINCT ?label WHERE {{
    {{ 
          dbr:{} <http://purl.org/linguistics/gold/hypernym> ?label. 
          ?label a owl:Thing .
    }}

}}
"""

select_query2 = """
@prefix dbr:    <http://dbpedia.org/resource/> .


SELECT DISTINCT ?label WHERE {{
    {{ 
    
          dbr:{} rdf:type ?class. 
          ?class rdfs:subClassOf* ?label .
          ?label a owl:Class.
          FILTER(?class != ?label) . 
    }}
}}
"""

In [151]:
def query_dbpedia(query):
    sparql.setQuery(query)
    sparql.setReturnFormat(JSON)
    results = sparql.query().convert()
    return results
    
def query_item(item):
    try:
        result = query_dbpedia(ask_query.format(item))
        if result['boolean'] == 'true':
            return [f'http://dbpedia.org/resource/{item}']
        else:
            results = query_dbpedia(select_query1.format(item))
            df = pd.json_normalize(results['results']['bindings'])
            if df.shape[0] > 0:
                return df['label.value'].tolist()
            else:
                results = query_dbpedia(select_query2.format(item))
                df = pd.json_normalize(results['results']['bindings'])
                if df.shape[0] > 0:
                    return df['label.value'].tolist()
    except Exception as e:
        print(item, e)
        return []

In [62]:
def replace_characters(string):
    for ch in ["(", ")", "'", "_"]:
        string = string.replace(ch, f'\\{ch}')
    return string

In [116]:
resources = pd.read_csv('../data/topics.csv', index_col=False)
resources['name'] = resources['uri'].apply(lambda x: x.split('/')[-1])
resources['name'] = resources['name'].apply(lambda x: replace_characters(x))

In [117]:
resources_to_map = resources[resources['count'] >= 10]
resources_to_map

Unnamed: 0,uri,count,name
0,http://dbpedia.org/resource/White_supremacy,8664,White\_supremacy
1,http://dbpedia.org/resource/VAX,6656,VAX
2,http://dbpedia.org/resource/Hurricane_Floyd,5319,Hurricane\_Floyd
3,http://dbpedia.org/resource/Self-defense,3558,Self-defense
4,http://dbpedia.org/resource/Kenosha_Comets,3348,Kenosha\_Comets
...,...,...,...
1006,http://dbpedia.org/resource/Rosa_Brooks,10,Rosa\_Brooks
1007,http://dbpedia.org/resource/Imperialism,10,Imperialism
1008,http://dbpedia.org/resource/Clapping,10,Clapping
1009,http://dbpedia.org/resource/House_arrest,10,House\_arrest


In [118]:
resources_to_map['uri_list'] = resources_to_map['name'].progress_apply(query_item)

 33%|██████████████████████████                                                     | 333/1011 [04:34<06:46,  1.67it/s]

Darrell\_Wallace\_Jr. QueryBadFormed: a bad request has been sent to the endpoint, probably the sparql query is bad formed. 

Response:
b"Virtuoso 37000 Error SP030: SPARQL compiler, line 5: syntax error at '.' before 'a'\n\nSPARQL query:\n#output-format:application/sparql-results+json\n\n@prefix dbr:    <http://dbpedia.org/resource/> .\n\nASK  {dbr:Darrell\\_Wallace\\_Jr. a owl:Thing. }\n"


 91%|████████████████████████████████████████████████████████████████████████▎      | 925/1011 [12:39<01:03,  1.36it/s]

.223\_Remington QueryBadFormed: a bad request has been sent to the endpoint, probably the sparql query is bad formed. 

Response:
b"Virtuoso 37000 Error SP030: SPARQL compiler, line 0: Bad character '\\' (0x5c) in SPARQL expression at '\\'\n\nSPARQL query:\n#output-format:application/sparql-results+json\n\n@prefix dbr:    <http://dbpedia.org/resource/> .\n\nASK  {dbr:.223\\_Remington a owl:Thing. }\n"


 97%|████████████████████████████████████████████████████████████████████████████▎  | 977/1011 [13:22<00:21,  1.59it/s]

Orange\_S.A. QueryBadFormed: a bad request has been sent to the endpoint, probably the sparql query is bad formed. 

Response:
b"Virtuoso 37000 Error SP030: SPARQL compiler, line 5: syntax error at '.' before 'a'\n\nSPARQL query:\n#output-format:application/sparql-results+json\n\n@prefix dbr:    <http://dbpedia.org/resource/> .\n\nASK  {dbr:Orange\\_S.A. a owl:Thing. }\n"


100%|██████████████████████████████████████████████████████████████████████████████| 1011/1011 [13:48<00:00,  1.22it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [127]:
resources_to_map

Unnamed: 0,uri,count,name,uri_list
0,http://dbpedia.org/resource/White_supremacy,8664,White\_supremacy,"[http://www.w3.org/2002/07/owl#Thing, http://d..."
1,http://dbpedia.org/resource/VAX,6656,VAX,"[http://www.w3.org/2002/07/owl#Thing, http://d..."
2,http://dbpedia.org/resource/Hurricane_Floyd,5319,Hurricane\_Floyd,
3,http://dbpedia.org/resource/Self-defense,3558,Self-defense,
4,http://dbpedia.org/resource/Kenosha_Comets,3348,Kenosha\_Comets,"[http://www.w3.org/2002/07/owl#Thing, http://d..."
...,...,...,...,...
1006,http://dbpedia.org/resource/Rosa_Brooks,10,Rosa\_Brooks,[http://dbpedia.org/resource/Professor]
1007,http://dbpedia.org/resource/Imperialism,10,Imperialism,[http://dbpedia.org/resource/Advocacy]
1008,http://dbpedia.org/resource/Clapping,10,Clapping,[http://dbpedia.org/resource/Sound]
1009,http://dbpedia.org/resource/House_arrest,10,House\_arrest,"[http://www.w3.org/2002/07/owl#Thing, http://d..."


In [152]:
query_item('Arizona_State_University')

['http://dbpedia.org/resource/University']

In [131]:
resources_to_map.set_index('uri')[['uri_list']].explode('uri_list').to_csv('../data/dbpedia_resource_mapping.csv')