In [1]:
import pandas as pd
import dask.dataframe as dd
from dask.diagnostics import ProgressBar
from SPARQLWrapper import SPARQLWrapper, JSON

from tqdm import tqdm
# Register tqdm's `progress_apply` on pandas objects; it is used on the
# person table further down in this notebook.
tqdm.pandas()

import ssl
# SECURITY NOTE(review): this swaps the default HTTPS context factory for the
# unverified one, so HTTPS connections that rely on that default stop
# validating server certificates for the rest of the kernel session.
# Presumably a workaround for a local certificate problem -- TODO confirm, and
# prefer fixing the trust store over disabling verification.
ssl._create_default_https_context = ssl._create_unverified_context

In [2]:
# User-Agent string sent with every request to the query service.
# NOTE(review): this value looks like a placeholder -- TODO replace it with
# details that identify the real client and a reachable contact.
user_agent = 'CoolBot/0.0 (https://example.org/coolbot/; coolbot@example.org)'

# Single shared SPARQL client for the Wikidata endpoint, returning JSON.
sparqlwd = SPARQLWrapper(
    "https://query.wikidata.org/sparql",
    agent=user_agent,
)
sparqlwd.setReturnFormat(JSON)

In [3]:
def _sparql_quote(text):
    """Return ``text`` as a single-quoted SPARQL string literal.

    Backslashes, single quotes, line feeds and carriage returns are escaped
    so the value cannot terminate the literal early or break the query.
    """
    escaped = (
        text.replace('\\', '\\\\')  # backslash first, so later escapes are not doubled
            .replace("'", "\\'")
            .replace('\n', '\\n')
            .replace('\r', '\\r')
    )
    return "'" + escaped + "'"


def find_wiki_entity(name):
    """Search Wikidata for items matching ``name`` that are humans.

    The query calls the ``wikibase:mwapi`` "EntitySearch" service on
    www.wikidata.org (language "en") and keeps only items with
    ``wdt:P31 wd:Q5``.

    Parameters
    ----------
    name : str
        Search text. It is escaped before being embedded in the query, so
        names containing apostrophes or backslashes (e.g. "Tip O'Neill")
        produce a well-formed query instead of a syntax error.

    Returns
    -------
    dict
        The converted JSON response of the endpoint. If anything goes wrong
        (bad input, HTTP error, timeout, ...) the name and the error are
        printed and an empty result
        ``{'head': {'vars': ['item']}, 'results': {'bindings': []}}`` is
        returned, so callers can always read ``res['results']['bindings']``.
    """

    try:
        # Built inside the try so that a non-string `name` is reported and
        # turned into an empty result, like any other failure.
        query = """
        SELECT ?item WHERE {
        SERVICE wikibase:mwapi {
            bd:serviceParam wikibase:endpoint "www.wikidata.org";
                            wikibase:api "EntitySearch";
                            mwapi:search  """ + _sparql_quote(name) + """;
                            mwapi:language "en".
            ?item wikibase:apiOutputItem mwapi:item.
            ?num wikibase:apiOrdinal true.
        }
        ?item wdt:P31 wd:Q5
        }
        """

        sparqlwd.setQuery(query)

        return sparqlwd.query().convert()

    except Exception as e:
        # Deliberate best-effort: one failed lookup must not abort a long
        # batch run, so report it and hand back an empty result set.
        print(f'name: {name}')
        print(f'error message: {e}')
        return {'head': {'vars': ['item']}, 'results': {'bindings': []}}


def process_name_list(row):
    """Collect the Wikidata entity values matched by any name of a row.

    Parameters
    ----------
    row : mapping
        Must provide ``row['name_list']``: an iterable of name strings, or
        ``None`` (treated as "no names").

    Returns
    -------
    list
        The distinct ``binding['item']['value']`` strings returned by
        ``find_wiki_entity`` over all names, sorted so that the result is
        the same on every run (set iteration order is not stable between
        interpreter sessions).
    """

    name_list = row['name_list']
    if name_list is None:
        return []

    wiki_tag = set()

    # dict.fromkeys drops repeated names while keeping first-seen order, so
    # each distinct name costs only one network lookup.
    for name in dict.fromkeys(name_list):
        res = find_wiki_entity(name)

        for binding in res['results']['bindings']:
            wiki_tag.add(binding['item']['value'])

    return sorted(wiki_tag)

In [4]:
# Load the person table that the Wikidata lookup below iterates over.
person_table_path = 'tables/new_unified_person_df.parquet'
new_unified_person_df = pd.read_parquet(person_table_path)

In [5]:
# Run the Wikidata lookup row by row with a progress bar. Every name in a row
# triggers a network request, so this cell is slow.
wiki_col = new_unified_person_df.progress_apply(process_name_list, axis=1)

  1%|          | 34/4775 [03:04<2:35:37,  1.97s/it] 

name: Eliot Theodore
error message: EndPointInternalError: The endpoint returned the HTTP status code 500. 

Response:
b'SPARQL-QUERY: queryStr=\n        SELECT ?item WHERE {\n        SERVICE wikibase:mwapi {\n            bd:serviceParam wikibase:endpoint "www.wikidata.org";\n                            wikibase:api "EntitySearch";\n                            mwapi:search  \'Eliot Theodore\';\n                            mwapi:language "en".\n            ?item wikibase:apiOutputItem mwapi:item.\n            ?num wikibase:apiOrdinal true.\n        }\n        ?item wdt:P31 wd:Q5\n        }\n        \njava.util.concurrent.ExecutionException: java.util.concurrent.ExecutionException: org.openrdf.query.QueryEvaluationException: java.lang.RuntimeException: java.util.concurrent.ExecutionException: java.lang.Exception: task=ChunkTask{query=1464779d-d0f2-437b-88d2-46bcaa1860fc,bopId=1,partitionId=-1,sinkId=2,altSinkId=null}, cause=java.util.concurrent.ExecutionException: java.util.concurrent.Ex

 35%|███▍      | 1656/4775 [24:26<17:36:57, 20.33s/it]

name: Joel M. Fisher
error message: HTTP Error 504: Gateway Timeout


 37%|███▋      | 1783/4775 [33:45<1:34:34,  1.90s/it] 

name: Ivan Guryevich Niklessa
error message: EndPointInternalError: The endpoint returned the HTTP status code 500. 

Response:
b'SPARQL-QUERY: queryStr=\n        SELECT ?item WHERE {\n        SERVICE wikibase:mwapi {\n            bd:serviceParam wikibase:endpoint "www.wikidata.org";\n                            wikibase:api "EntitySearch";\n                            mwapi:search  \'Ivan Guryevich Niklessa\';\n                            mwapi:language "en".\n            ?item wikibase:apiOutputItem mwapi:item.\n            ?num wikibase:apiOrdinal true.\n        }\n        ?item wdt:P31 wd:Q5\n        }\n        \njava.util.concurrent.ExecutionException: java.util.concurrent.ExecutionException: org.openrdf.query.QueryEvaluationException: java.lang.RuntimeException: java.util.concurrent.ExecutionException: java.lang.Exception: task=ChunkTask{query=e7f5524e-8409-4895-8328-e2cb924af135,bopId=1,partitionId=-1,sinkId=2,altSinkId=null}, cause=java.util.concurrent.ExecutionException: java.

 38%|███▊      | 1791/4775 [34:55<16:38:47, 20.08s/it]

name: Anthony Derrick Parsons
error message: HTTP Error 504: Gateway Timeout


 57%|█████▋    | 2741/4775 [45:59<29:06,  1.16it/s]   

name: Sultan ibn Abd al-Aziz al Saud Prince
error message: EndPointInternalError: The endpoint returned the HTTP status code 500. 

Response:
b'SPARQL-QUERY: queryStr=\n        SELECT ?item WHERE {\n        SERVICE wikibase:mwapi {\n            bd:serviceParam wikibase:endpoint "www.wikidata.org";\n                            wikibase:api "EntitySearch";\n                            mwapi:search  \'Sultan ibn Abd al-Aziz al Saud Prince\';\n                            mwapi:language "en".\n            ?item wikibase:apiOutputItem mwapi:item.\n            ?num wikibase:apiOrdinal true.\n        }\n        ?item wdt:P31 wd:Q5\n        }\n        \njava.util.concurrent.ExecutionException: java.util.concurrent.ExecutionException: org.openrdf.query.QueryEvaluationException: java.lang.RuntimeException: java.util.concurrent.ExecutionException: java.lang.Exception: task=ChunkTask{query=9a8ad134-296f-4b8e-945a-e563297540a5,bopId=1,partitionId=-1,sinkId=2,altSinkId=null}, cause=java.util.concurre

 58%|█████▊    | 2792/4775 [46:47<1:10:53,  2.15s/it]

name: Sayyid ’Umar (Omar) al-Saqqaf
error message: EndPointInternalError: The endpoint returned the HTTP status code 500. 

Response:
b'SPARQL-QUERY: queryStr=\n        SELECT ?item WHERE {\n        SERVICE wikibase:mwapi {\n            bd:serviceParam wikibase:endpoint "www.wikidata.org";\n                            wikibase:api "EntitySearch";\n                            mwapi:search  \'Sayyid \xe2\x80\x99Umar (Omar) al-Saqqaf\';\n                            mwapi:language "en".\n            ?item wikibase:apiOutputItem mwapi:item.\n            ?num wikibase:apiOrdinal true.\n        }\n        ?item wdt:P31 wd:Q5\n        }\n        \njava.util.concurrent.ExecutionException: java.util.concurrent.ExecutionException: org.openrdf.query.QueryEvaluationException: java.lang.RuntimeException: java.util.concurrent.ExecutionException: java.lang.Exception: task=ChunkTask{query=ae0eed81-3e97-4e22-9025-c072381872bd,bopId=1,partitionId=-1,sinkId=2,altSinkId=null}, cause=java.util.concurrent.Ex

100%|██████████| 4775/4775 [1:02:08<00:00,  1.28it/s]


In [9]:
# Attach the lookup results as a column and persist the enriched table.
wikicol_table_path = 'tables/new_unified_person_df_wikicol.parquet'
new_unified_person_df['wiki_col'] = wiki_col
new_unified_person_df.to_parquet(wikicol_table_path)

In [24]:
# Invert the `wiki_col` column: map each entity to the list of row labels
# whose `wiki_col` contains it, in row order.
tag_d = {}

for row_label, entities in new_unified_person_df['wiki_col'].items():
    for ent in entities:
        tag_d.setdefault(ent, []).append(row_label)


In [37]:
# Inspect two rows by index label; in the output below both rows carry the
# same Wikidata entity in `wiki_col`.
x = [5293, 5929]
new_unified_person_df.loc[x]

Unnamed: 0,name_set,name_list,id_list,description_list,wiki_col
5293,Champfeu Jacques Larosière de de,[Jacques de Larosière de Champfeu],[frus1969-76v37_p_LCJ_1],"[Counselor, French Ministry of Economic Affair...",[http://www.wikidata.org/entity/Q1363641]
5929,Jacques Larosiere de,[Jacques de Larosiere],[frus1969-76v31_p_LJ1],"[Director of the Treasury, French Ministry of ...",[http://www.wikidata.org/entity/Q1363641]


In [27]:
# For every entity found in more than one row, print the list of row labels
# that share it (one line per entity).
for shared_rows in tag_d.values():
    if len(shared_rows) > 1:
        print(shared_rows)

[29, 218]
[42, 479]
[45, 5631]
[56, 500]
[59, 1423]
[59, 1423]
[59, 1423]
[59, 1423]
[59, 1423]
[59, 1423]
[59, 1423]
[59, 1423, 2414]
[59, 1423]
[59, 1423]
[59, 1423]
[59, 1423]
[59, 1423]
[59, 1423]
[59, 1423]
[59, 1423]
[59, 1423]
[59, 1423]
[59, 1423]
[59, 1423]
[59, 1423]
[59, 1423]
[59, 1423]
[59, 1423]
[59, 1423]
[59, 1423]
[59, 1423]
[59, 1423]
[59, 1423]
[59, 1423]
[59, 1423]
[59, 1423]
[59, 1423]
[59, 1423]
[59, 1423]
[59, 1423, 2414]
[59, 1423]
[59, 1423]
[59, 1423]
[59, 1423]
[59, 1423, 2414]
[59, 1423]
[59, 1423]
[59, 1423]
[59, 1423]
[59, 1423]
[59, 1423]
[59, 1423]
[61, 734, 3351, 4306]
[62, 4306]
[77, 741]
[89, 271]
[107, 3796]
[140, 3023]
[140, 3023]
[140, 3023]
[149, 4532]
[162, 526]
[165, 2297]
[186, 3647]
[188, 1973]
[196, 1272]
[201, 1557]
[210, 4715]
[221, 5189]
[231, 3178]
[234, 3727]
[255, 2478]
[290, 2174]
[294, 1335]
[311, 2206]
[325, 1348]
[339, 3222]
[343, 2618]
[343, 2618]
[345, 1614]
[351, 1117]
[355, 5115]
[355, 5115]
[362, 3579]
[386, 5848]
[387, 2916]
[