In [148]:
import pandas as pd
import dask.dataframe as dd
from dask.diagnostics import ProgressBar
from SPARQLWrapper import SPARQLWrapper, JSON

from tqdm import tqdm
# Register tqdm's `progress_apply` on pandas objects; the apply cells below rely on it.
tqdm.pandas()

import ssl
# SECURITY NOTE(review): this swaps the process-wide default HTTPS context factory for the
# unverified one, so certificate checks are skipped for every HTTPS request made through it
# (including the Wikidata calls below). Presumably a workaround for a local certificate
# problem -- TODO confirm and prefer fixing the CA bundle instead.
ssl._create_default_https_context = ssl._create_unverified_context

In [149]:
# Shared SPARQL client for the Wikidata Query Service; every query cell below reuses it.
WIKIDATA_SPARQL_ENDPOINT = "https://query.wikidata.org/sparql"

# NOTE(review): this looks like a placeholder User-Agent -- confirm it identifies the real client.
user_agent = 'CoolBot/0.0 (https://example.org/coolbot/; coolbot@example.org)'

sparqlwd = SPARQLWrapper(WIKIDATA_SPARQL_ENDPOINT, agent=user_agent)
# Ask for JSON so that `.query().convert()` yields plain dicts.
sparqlwd.setReturnFormat(JSON)

## Person matching (TODO: move to person_unify)

In [None]:
def find_wiki_entity(name):
    """Search Wikidata for human entities matching ``name``.

    Runs the MWAPI ``EntitySearch`` service (English) through the shared
    ``sparqlwd`` client and keeps only items that are ``wdt:P31 wd:Q5``
    (instance of: human).

    Parameters
    ----------
    name : str
        Person name to search for. It is escaped before being embedded in the
        query, so names containing apostrophes or backslashes (e.g. "O'Neill")
        no longer produce a malformed query.

    Returns
    -------
    dict
        The converted SPARQL JSON result. On any failure the error is printed
        and an empty result (no bindings) is returned, so callers can always
        read ``res['results']['bindings']``.
    """

    try:
        # The name is placed inside a single-quoted SPARQL literal, which may not
        # contain a raw backslash, single quote or line break. Escape the backslash
        # first so the escapes added afterwards are not doubled.
        escaped_name = (
            name.replace('\\', '\\\\')
                .replace("'", "\\'")
                .replace('\n', '\\n')
                .replace('\r', '\\r')
        )

        query = """
        SELECT ?item WHERE {
        SERVICE wikibase:mwapi {
            bd:serviceParam wikibase:endpoint "www.wikidata.org";
                            wikibase:api "EntitySearch";
                            mwapi:search  \'"""+escaped_name+"""\';
                            mwapi:language "en".
            ?item wikibase:apiOutputItem mwapi:item.
            ?num wikibase:apiOrdinal true.
        }
        ?item wdt:P31 wd:Q5
        }
        """

        sparqlwd.setQuery(query)

        return sparqlwd.query().convert()

    except Exception as e:
        # Best effort: report the failing name and fall back to "no candidates"
        # so a single bad lookup does not abort a long-running apply.
        print(f'name: {name}')
        print(f'error message: {e}')
        return {'head': {'vars': ['item']}, 'results': {'bindings': []}}


def process_name_list(row):
    """Collect the distinct Wikidata entity URIs found for all names of one row.

    Every entry of ``row['name_list']`` is looked up with ``find_wiki_entity``;
    the ``item`` value of each returned binding is gathered into a set, so an
    entity hit by several name variants appears only once.

    Returns a list of entity URIs (empty when nothing matched).
    """
    entity_uris = {
        hit['item']['value']
        for candidate_name in row['name_list']
        for hit in find_wiki_entity(candidate_name)['results']['bindings']
    }

    return list(entity_uris)

In [None]:
# Load the unified-person table; each row must provide a `name_list` column,
# which `process_name_list` reads.
new_unified_person_df = pd.read_parquet('tables/new_unified_person_df.parquet')

In [None]:
# One Wikidata lookup per name per row (network-bound); the progress bar comes from tqdm.pandas().
wiki_col = new_unified_person_df.progress_apply(process_name_list, axis=1)

In [None]:
# Attach the candidate entity lists and persist them so the lookups above
# do not have to be repeated.
new_unified_person_df['wiki_col'] = wiki_col
new_unified_person_df.to_parquet('tables/new_unified_person_df_wikicol.parquet')

### processing ends.

### sentence transformers

In [173]:
import numpy as np
from sentence_transformers import SentenceTransformer,util

# Sentence-embedding model; `process_wiki_col` uses it to encode both the row's
# descriptions and the Wikidata candidate descriptions.
model = SentenceTransformer('all-MiniLM-L6-v2')

In [175]:
# Reload the table that already carries the `wiki_col` candidate lists.
new_unified_person_df_wikicol = pd.read_parquet('tables/new_unified_person_df_wikicol.parquet')

In [176]:
# Rows with more than one Wikidata candidate, i.e. the ones that need disambiguation.
has_multiple_candidates = new_unified_person_df_wikicol['wiki_col'].apply(len) > 1
new_unified_person_df_wikicol[has_multiple_candidates]

Unnamed: 0,name_set,name_list,id_list,description_list,wiki_col
4,Allen Richard V.,"[Richard Allen, Richard V. Allen]","[frus1969-76v29_p_AR1, frus1969-76v03_p_ARV1, ...","[Member, National Security Council Staff, 1969...","[http://www.wikidata.org/entity/Q30122355, htt..."
10,Blee David H.,"[David H. Blee, David Blee]","[frus1969-76v14_p_BDH2, frus1969-76ve08_p_BDH2...","[Chief of the Soviet/Eastern Europe Division, ...","[http://www.wikidata.org/entity/Q15804769, htt..."
11,Brandt Willy,[Willy Brandt],"[frus1969-76v14_p_BWHF1, frus1969-76v29_p_BW1,...",[Chancellor of the Federal Republic of Germany...,"[http://www.wikidata.org/entity/Q29168166, htt..."
18,Castro Fidel Ruz,"[Fidel Castro, Fidel Castro Ruz, Castro Ruz Fi...","[frus1969-76v16_p_CF_1, frus1969-76ve16_p_CF_1...","[Premier of Cuba, Cuban Prime Minister, Cuban ...","[http://www.wikidata.org/entity/Q11256, http:/..."
19,Chancellor John,"[John Chancellor, Chancellor John]","[frus1969-76v14_p_CJ8, frus1969-76v13_p_CJ1]","[anchor on the NBC Nightly News, anchor on NBC...","[http://www.wikidata.org/entity/Q1770797, http..."
...,...,...,...,...,...
5925,Jackson John,[John Jackson],[frus1969-76v31_p_JJ1],"[General Counsel, Office of the Special Repres...","[http://www.wikidata.org/entity/Q19325443, htt..."
5930,Long Olivier,[Olivier Long],[frus1969-76v31_p_LO1],"[Director-General, General Agreement on Tariff...","[http://www.wikidata.org/entity/Q64789172, htt..."
5980,Farouk I,[Farouk I],[frus1969-76v25_p_FI_2],"[King of Egypt, 1936–1952]","[http://www.wikidata.org/entity/Q60577842, htt..."
6023,Sabah al-Ahmad al-Jabir al-Sabah,[Sabah al-Ahmad al-Jabir al-Sabah],[frus1969-76v25_p_SAAA_1],[Kuwaiti Foreign Minister],"[http://www.wikidata.org/entity/Q57555, http:/..."


In [157]:
# helpers for using sbert for deciding among wikidata entries
def get_entity_descp(Q):
    """Fetch the English ``schema:description`` of a Wikidata entity.

    Parameters
    ----------
    Q : str
        Bare entity ID (e.g. ``'Q42013'``); it is appended to the ``wd:`` prefix.

    Returns
    -------
    dict
        The converted SPARQL JSON result with variable ``descp``. On any failure
        the error is printed and an empty result is returned whose header names
        the same ``descp`` variable the query selects (previously it said
        ``item``, copied from the entity-search helper).
    """

    try:
        query = """
        SELECT ?descp
        WHERE 
        {
        wd:"""+Q+""" schema:description ?descp.
        FILTER ( lang(?descp) = "en" )
        }"""

        sparqlwd.setQuery(query)

        return sparqlwd.query().convert()

    except Exception as e:
        # Best effort: report the failing entity and fall back to "no description".
        print(f'name: {Q}')
        print(f'error message: {e}')
        return {'head': {'vars': ['descp']}, 'results': {'bindings': []}}


def process_candidate_entities(row):
    """Return one English description per candidate entity in ``row['wiki_col']``.

    The result is positionally aligned with ``row['wiki_col']``: entry ``i`` is
    the description of candidate ``i``, or ``''`` when none was found.
    ``process_wiki_col`` indexes ``wiki_col`` with a position computed from this
    list, so exactly one entry is emitted per candidate. Previously every
    returned binding was appended, which could shift the alignment.
    """

    wiki_descp = []

    for q in row['wiki_col']:
        # Candidates are entity URIs; the ID is the last path segment.
        res = get_entity_descp(q.split('/')[-1])
        bindings = res['results']['bindings']

        # Keep only the first binding so the list stays aligned with wiki_col.
        wiki_descp.append(bindings[0]['descp']['value'] if bindings else '')

    return wiki_descp

In [177]:
def process_wiki_col(row):
    """Pick the single Wikidata entity that best matches a row.

    * no candidates in ``row['wiki_col']``  -> ``None``
    * exactly one candidate                 -> that candidate
    * several candidates                    -> the one whose Wikidata description
      embedding has the highest cosine similarity to the mean embedding of
      ``row['description_list']``.
    """
    candidates = row['wiki_col']
    n_candidates = len(candidates)

    # Trivial cases first: nothing to choose from, or nothing to disambiguate.
    if n_candidates == 0:
        return None
    if n_candidates == 1:
        return candidates[0]

    # Collapse all of the row's descriptions into one vector by averaging.
    frus_vector = np.mean(model.encode(row['description_list']), axis=0)

    # One embedding per candidate description, in candidate order.
    candidate_vectors = model.encode(process_candidate_entities(row))

    similarity = util.cos_sim(frus_vector, candidate_vectors)
    best_position = np.argmax(similarity, axis=1)[0]

    return row["wiki_col"][best_position]

In [182]:
# Start from the frame reloaded from 'tables/new_unified_person_df_wikicol.parquet' so this
# cell also works on a fresh kernel where the (expensive) entity-search cells were skipped;
# it previously relied on `new_unified_person_df` still being in memory with `wiki_col` set.
new_unified_person_df = new_unified_person_df_wikicol.copy()

# Disambiguate the candidates of every row (one description lookup per candidate, so this is slow).
selected_wiki_entity = new_unified_person_df.progress_apply(process_wiki_col, axis=1)

new_unified_person_df['selected_wiki_entity'] = selected_wiki_entity
new_unified_person_df.to_parquet('tables/new_unified_person_df_sbert.parquet')

100%|██████████| 4775/4775 [36:45<00:00,  2.16it/s]  


## Merge names that resolve to exactly the same Wikidata entity

In [256]:
# `t` maps every row label to the set of row labels that resolved to the same
# Wikidata entity; rows without an entity map to a singleton containing only themselves.
entity_col = new_unified_person_df['selected_wiki_entity']

# Single pass: collect the row labels per entity instead of re-filtering the
# whole frame once per row (which was quadratic in the number of rows).
rows_by_entity = {}
for idx, ent in entity_col.items():
    # Missing entities may show up as None or NaN; neither may be used as a group key.
    if pd.notna(ent) and ent:
        rows_by_entity.setdefault(ent, set()).add(idx)

t = {}
for idx, ent in entity_col.items():
    if pd.notna(ent) and ent:
        # Copy so that every key owns an independent set.
        t[idx] = set(rows_by_entity[ent])
    else:
        t[idx] = {idx}

In [257]:
import copy
import itertools

# Collapse `t` so that each group of mutually matched rows is kept under a single key.
# `scratch_t` is a snapshot of `t` that records which keys are still "alive" in the
# current pass: a key popped from it has already been absorbed by another key.
scratch_t = copy.deepcopy(t)
changed_flag = True

# Repeat full passes until one pass performs no merge.
while changed_flag:

    changed_flag = False

    for key in t:
        
        # Note: `t[key]` is rebound (not mutated) below, so this loop keeps
        # iterating over the set object it started with.
        for matched_idx in t[key]:

            if key != matched_idx:
                # Merge only when both sides are still alive (and non-empty) in this pass.
                if scratch_t.get(key, None) and scratch_t.get(matched_idx, None):
                    changed_flag = True
                    t[key] = t[key].union(t[matched_idx])
                    # Mark the absorbed key so it is neither merged again nor used as a merge target.
                    scratch_t.pop(matched_idx, None)
        
    # Drop every key that was absorbed during this pass, then refresh the snapshot.
    unwanted = set(t.keys()) - set(scratch_t.keys())
    print(f'removing {len(unwanted)} keys.')
    for unwanted_key in unwanted: del t[unwanted_key]
    scratch_t = copy.deepcopy(t)
    print('---')

removing 85 keys.
---
removing 0 keys.
---


In [258]:
# For every surviving key, concatenate the list columns of all rows in its group
# and store the result on the surviving row.
for temp_key, member_ids in t.items():

    te_df = new_unified_person_df.loc[list(member_ids)]

    # Flatten the per-row lists into one list per column.
    name_list = [name for names in te_df['name_list'].values for name in names]
    id_list = [person_id for ids in te_df['id_list'].values for person_id in ids]
    description_list = [desc for descs in te_df['description_list'].values for desc in descs]

    new_unified_person_df.at[temp_key, 'name_list'] = name_list
    new_unified_person_df.at[temp_key, 'id_list'] = id_list
    new_unified_person_df.at[temp_key, 'description_list'] = description_list

# Keep only the surviving (representative) rows.
new_unified_person_df = new_unified_person_df.loc[t.keys()]

In [266]:
# Persist the de-duplicated table (one row per surviving key of `t`).
new_unified_person_df.to_parquet('tables/new_unified_person_df_final.parquet')

### extracting extra info from wikidata

In [300]:
# Wikidata entity ID that the example queries below interpolate after the `wd:` prefix.
Q='Q42013'
# Alternative entity kept for manual experiments:
#Q='Q9588'

In [281]:
# sex query
# Sex or gender (P21) of entity Q, with its English label.
# The triple now ends with '.' like the sibling queries; it previously ended with a
# dangling ';' directly before the SERVICE clause.
query="""
SELECT ?sex ?sexLabel
WHERE 
{
wd:"""+Q+""" wdt:P21 ?sex.

SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }
}"""

sparqlwd.setQuery(query)

sparqlwd.query().convert()

{'head': {'vars': ['sex', 'sexLabel']},
 'results': {'bindings': [{'sex': {'type': 'uri',
     'value': 'http://www.wikidata.org/entity/Q6581097'},
    'sexLabel': {'xml:lang': 'en', 'type': 'literal', 'value': 'male'}}]}}

In [286]:
# citizenship query
# Country of citizenship (P27) statements of entity Q, each with its optional
# start (P580) and end (P582) qualifiers and English labels.
query = """
SELECT ?citizenship ?citizenshipLabel ?startyearLabel ?endyearLabel
WHERE 
{
wd:%s p:P27 ?statement1.
?statement1 ps:P27 ?citizenship.
OPTIONAL{?statement1 pq:P580 ?startyear.}
OPTIONAL{?statement1 pq:P582 ?endyear.}

SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }
}""" % Q

sparqlwd.setQuery(query)

sparqlwd.query().convert()

{'head': {'vars': ['citizenship',
   'citizenshipLabel',
   'startyearLabel',
   'endyearLabel']},
 'results': {'bindings': [{'citizenship': {'type': 'uri',
     'value': 'http://www.wikidata.org/entity/Q370173'},
    'citizenshipLabel': {'xml:lang': 'en',
     'type': 'literal',
     'value': 'Sultanate of Egypt'},
    'startyearLabel': {'type': 'literal', 'value': '1918-01-01T00:00:00Z'},
    'endyearLabel': {'type': 'literal', 'value': '1922-01-01T00:00:00Z'}},
   {'citizenship': {'type': 'uri',
     'value': 'http://www.wikidata.org/entity/Q124943'},
    'citizenshipLabel': {'xml:lang': 'en',
     'type': 'literal',
     'value': 'Kingdom of Egypt'},
    'startyearLabel': {'type': 'literal', 'value': '1922-01-01T00:00:00Z'},
    'endyearLabel': {'type': 'literal', 'value': '1953-01-01T00:00:00Z'}},
   {'citizenship': {'type': 'uri',
     'value': 'http://www.wikidata.org/entity/Q3087763'},
    'citizenshipLabel': {'xml:lang': 'en',
     'type': 'literal',
     'value': 'Republic of

In [293]:
# positionheld query
# Position held (P39) statements of entity Q, each with its optional
# start (P580) and end (P582) qualifiers and English labels.
query = """
SELECT ?positionheld ?positionheldLabel ?startyearLabel ?endyearLabel
WHERE 
{
wd:%s p:P39 ?statement1.
?statement1 ps:P39 ?positionheld.
OPTIONAL{?statement1 pq:P580 ?startyear.}
OPTIONAL{?statement1 pq:P582 ?endyear.}

SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }
}""" % Q

sparqlwd.setQuery(query)

sparqlwd.query().convert()

{'head': {'vars': ['positionheld',
   'positionheldLabel',
   'startyearLabel',
   'endyearLabel']},
 'results': {'bindings': [{'positionheld': {'type': 'uri',
     'value': 'http://www.wikidata.org/entity/Q2669413'},
    'positionheldLabel': {'xml:lang': 'en',
     'type': 'literal',
     'value': 'Vice President of Egypt'},
    'startyearLabel': {'type': 'literal', 'value': '1969-12-19T00:00:00Z'},
    'endyearLabel': {'type': 'literal', 'value': '1970-10-14T00:00:00Z'}},
   {'positionheld': {'type': 'uri',
     'value': 'http://www.wikidata.org/entity/Q1571396'},
    'positionheldLabel': {'xml:lang': 'en',
     'type': 'literal',
     'value': 'Prime Minister of Egypt'},
    'startyearLabel': {'type': 'literal', 'value': '1980-05-15T00:00:00Z'},
    'endyearLabel': {'type': 'literal', 'value': '1981-10-06T00:00:00Z'}},
   {'positionheld': {'type': 'uri',
     'value': 'http://www.wikidata.org/entity/Q15618993'},
    'positionheldLabel': {'xml:lang': 'en',
     'type': 'literal',
   

In [290]:
# occupation query
# Occupations (P106) of entity Q, with English labels.
query = """
SELECT ?occupation ?occupationLabel
WHERE 
{
wd:%s wdt:P106 ?occupation.

SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }
}""" % Q

sparqlwd.setQuery(query)

sparqlwd.query().convert()

{'head': {'vars': ['occupation', 'occupationLabel']},
 'results': {'bindings': [{'occupation': {'type': 'uri',
     'value': 'http://www.wikidata.org/entity/Q40348'},
    'occupationLabel': {'xml:lang': 'en',
     'type': 'literal',
     'value': 'lawyer'}},
   {'occupation': {'type': 'uri',
     'value': 'http://www.wikidata.org/entity/Q82955'},
    'occupationLabel': {'xml:lang': 'en',
     'type': 'literal',
     'value': 'politician'}},
   {'occupation': {'type': 'uri',
     'value': 'http://www.wikidata.org/entity/Q189290'},
    'occupationLabel': {'xml:lang': 'en',
     'type': 'literal',
     'value': 'military officer'}},
   {'occupation': {'type': 'uri',
     'value': 'http://www.wikidata.org/entity/Q372436'},
    'occupationLabel': {'xml:lang': 'en',
     'type': 'literal',
     'value': 'statesperson'}},
   {'occupation': {'type': 'uri',
     'value': 'http://www.wikidata.org/entity/Q18814623'},
    'occupationLabel': {'xml:lang': 'en',
     'type': 'literal',
     'value': 

In [294]:
# political party query
# Political party membership (P102) statements of entity Q, each with its optional
# start (P580) and end (P582) qualifiers and English labels.
query = """
SELECT ?party ?partyLabel ?startyearLabel ?endyearLabel
WHERE 
{
wd:%s p:P102 ?statement1.
?statement1 ps:P102 ?party.
OPTIONAL{?statement1 pq:P580 ?startyear.}
OPTIONAL{?statement1 pq:P582 ?endyear.}

SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }
}""" % Q

sparqlwd.setQuery(query)

sparqlwd.query().convert()

{'head': {'vars': ['party', 'partyLabel', 'startyearLabel', 'endyearLabel']},
 'results': {'bindings': [{'party': {'type': 'uri',
     'value': 'http://www.wikidata.org/entity/Q841133'},
    'partyLabel': {'xml:lang': 'en',
     'type': 'literal',
     'value': 'National Democratic Party'},
    'startyearLabel': {'type': 'literal', 'value': '1977-01-01T00:00:00Z'},
    'endyearLabel': {'type': 'literal', 'value': '1981-01-01T00:00:00Z'}},
   {'party': {'type': 'uri',
     'value': 'http://www.wikidata.org/entity/Q624111'},
    'partyLabel': {'xml:lang': 'en',
     'type': 'literal',
     'value': 'Arab Socialist Union'},
    'startyearLabel': {'type': 'literal', 'value': '1962-01-01T00:00:00Z'},
    'endyearLabel': {'type': 'literal', 'value': '1977-01-01T00:00:00Z'}}]}}

In [295]:
# memberof query
# "Member of" (P463) statements of entity Q, each with its optional
# start (P580) and end (P582) qualifiers and English labels.
query = """
SELECT ?memberof ?memberofLabel ?startyearLabel ?endyearLabel
WHERE 
{
wd:%s p:P463 ?statement1.
?statement1 ps:P463 ?memberof.
OPTIONAL{?statement1 pq:P580 ?startyear.}
OPTIONAL{?statement1 pq:P582 ?endyear.}

SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }
}""" % Q

sparqlwd.setQuery(query)

sparqlwd.query().convert()

{'head': {'vars': ['memberof',
   'memberofLabel',
   'startyearLabel',
   'endyearLabel']},
 'results': {'bindings': [{'memberof': {'type': 'uri',
     'value': 'http://www.wikidata.org/entity/Q781607'},
    'memberofLabel': {'xml:lang': 'en',
     'type': 'literal',
     'value': 'Free Officers Movement'}}]}}

In [301]:
# educatedat query
# Institutions entity Q was educated at (P69), with English labels.
query = """
SELECT ?educatedat ?educatedatLabel
WHERE 
{
wd:%s wdt:P69 ?educatedat.

SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }
}""" % Q

sparqlwd.setQuery(query)

sparqlwd.query().convert()

{'head': {'vars': ['educatedat', 'educatedatLabel']},
 'results': {'bindings': [{'educatedat': {'type': 'uri',
     'value': 'http://www.wikidata.org/entity/Q3603942'},
    'educatedatLabel': {'xml:lang': 'en',
     'type': 'literal',
     'value': 'Egyptian Military Academy'}}]}}

In [302]:
# religion query
# Religion (P140) of entity Q, with English labels.
query = """
SELECT ?religion ?religionLabel
WHERE 
{
wd:%s wdt:P140 ?religion.

SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }
}""" % Q

sparqlwd.setQuery(query)

sparqlwd.query().convert()

{'head': {'vars': ['religion', 'religionLabel']},
 'results': {'bindings': [{'religion': {'type': 'uri',
     'value': 'http://www.wikidata.org/entity/Q483654'},
    'religionLabel': {'xml:lang': 'en',
     'type': 'literal',
     'value': 'Sunni Islam'}}]}}