In [7]:
import pandas as pd
import dask.dataframe as dd
from dask.diagnostics import ProgressBar
from SPARQLWrapper import SPARQLWrapper, JSON

from tqdm import tqdm
tqdm.pandas()

import ssl
# SECURITY: replaces the ssl module's default HTTPS context factory with the
# unverified one, so code relying on that default no longer checks certificates.
# NOTE(review): presumably a workaround for a local certificate problem — confirm
# it is still needed, and prefer fixing the trust store over disabling verification.
ssl._create_default_https_context = ssl._create_unverified_context

In [8]:
# Identity string sent with every request made through the client below.
# NOTE(review): this looks like a placeholder identity — confirm before heavy use.
user_agent = 'CoolBot/0.0 (https://example.org/coolbot/; coolbot@example.org)'

# Single shared client used by every query helper in this notebook; responses
# are requested as JSON.
sparqlwd = SPARQLWrapper(endpoint="https://query.wikidata.org/sparql", agent=user_agent)
sparqlwd.setReturnFormat(JSON)

## Person matching (TODO: move to person_unify)

In [None]:
def find_wiki_entity(name):
    """Search Wikidata for human entities (``wdt:P31 wd:Q5``) matching ``name``.

    The name is sent to the ``EntitySearch`` API through the shared ``sparqlwd``
    client. It is escaped first so that names containing apostrophes or
    backslashes (e.g. ``O'Neill``) still form a valid single-quoted SPARQL
    string literal instead of a malformed query.

    Parameters
    ----------
    name : str
        Free-text person name to search for.

    Returns
    -------
    dict
        The converted response of the SPARQL client. On any failure the error
        is printed and an empty result (no bindings) is returned instead.
    """

    try:
        # SPARQL string escapes: backslash first, then the quote and line breaks.
        literal = (
            name.replace('\\', '\\\\')
                .replace("'", "\\'")
                .replace('\n', '\\n')
                .replace('\r', '\\r')
        )

        query = """
        SELECT ?item WHERE {
        SERVICE wikibase:mwapi {
            bd:serviceParam wikibase:endpoint "www.wikidata.org";
                            wikibase:api "EntitySearch";
                            mwapi:search  '""" + literal + """';
                            mwapi:language "en".
            ?item wikibase:apiOutputItem mwapi:item.
            ?num wikibase:apiOrdinal true.
        }
        ?item wdt:P31 wd:Q5
        }
        """

        sparqlwd.setQuery(query)

        return sparqlwd.query().convert()

    except Exception as e:
        # Best effort: one failing lookup must not abort the whole apply() run.
        print(f'name: {name}')
        print(f'error message: {e}')
        return {'head': {'vars': ['item']}, 'results': {'bindings': []}}


def process_name_list(row):
    """Collect the distinct Wikidata entity URIs found for every name of a row.

    Each entry of ``row['name_list']`` is looked up with ``find_wiki_entity``.
    The URIs are de-duplicated but kept in first-seen order, so the result is
    the same from run to run for the same responses (a plain ``set`` would
    yield an arbitrary order).

    Parameters
    ----------
    row : mapping
        Must provide ``'name_list'``, an iterable of name strings.

    Returns
    -------
    list of str
        Unique entity URIs in order of first appearance; empty if nothing matched.
    """

    # dict keys are unique and remember insertion order.
    seen = {}

    for name in row['name_list']:
        res = find_wiki_entity(name)

        for binding in res['results']['bindings']:
            seen.setdefault(binding['item']['value'], None)

    return list(seen)

In [None]:
# Input table for the Wikidata lookup; process_name_list reads its 'name_list' column.
new_unified_person_df = pd.read_parquet('tables/new_unified_person_df.parquet')

In [None]:
# Row-wise Wikidata candidate lookup; progress_apply adds a tqdm progress bar.
wiki_col = new_unified_person_df.progress_apply(process_name_list, axis=1)

In [None]:
# Attach the candidate lists and write the frame to disk; a later cell reloads this file.
wikicol_parquet_path = 'tables/new_unified_person_df_wikicol.parquet'
new_unified_person_df['wiki_col'] = wiki_col
new_unified_person_df.to_parquet(wikicol_parquet_path)

### processing ends.

### sentence transformers

In [173]:
import numpy as np
from sentence_transformers import SentenceTransformer,util

# Sentence-embedding model shared by process_wiki_col (via model.encode).
model = SentenceTransformer('all-MiniLM-L6-v2')

In [175]:
# Reload the table written by the lookup cells above (includes the 'wiki_col' column).
new_unified_person_df_wikicol = pd.read_parquet('tables/new_unified_person_df_wikicol.parquet')

In [176]:
# Display only the rows that have more than one Wikidata candidate.
has_multiple_candidates = new_unified_person_df_wikicol['wiki_col'].apply(
    lambda candidates: len(candidates) > 1
)
new_unified_person_df_wikicol[has_multiple_candidates]

Unnamed: 0,name_set,name_list,id_list,description_list,wiki_col
4,Allen Richard V.,"[Richard Allen, Richard V. Allen]","[frus1969-76v29_p_AR1, frus1969-76v03_p_ARV1, ...","[Member, National Security Council Staff, 1969...","[http://www.wikidata.org/entity/Q30122355, htt..."
10,Blee David H.,"[David H. Blee, David Blee]","[frus1969-76v14_p_BDH2, frus1969-76ve08_p_BDH2...","[Chief of the Soviet/Eastern Europe Division, ...","[http://www.wikidata.org/entity/Q15804769, htt..."
11,Brandt Willy,[Willy Brandt],"[frus1969-76v14_p_BWHF1, frus1969-76v29_p_BW1,...",[Chancellor of the Federal Republic of Germany...,"[http://www.wikidata.org/entity/Q29168166, htt..."
18,Castro Fidel Ruz,"[Fidel Castro, Fidel Castro Ruz, Castro Ruz Fi...","[frus1969-76v16_p_CF_1, frus1969-76ve16_p_CF_1...","[Premier of Cuba, Cuban Prime Minister, Cuban ...","[http://www.wikidata.org/entity/Q11256, http:/..."
19,Chancellor John,"[John Chancellor, Chancellor John]","[frus1969-76v14_p_CJ8, frus1969-76v13_p_CJ1]","[anchor on the NBC Nightly News, anchor on NBC...","[http://www.wikidata.org/entity/Q1770797, http..."
...,...,...,...,...,...
5925,Jackson John,[John Jackson],[frus1969-76v31_p_JJ1],"[General Counsel, Office of the Special Repres...","[http://www.wikidata.org/entity/Q19325443, htt..."
5930,Long Olivier,[Olivier Long],[frus1969-76v31_p_LO1],"[Director-General, General Agreement on Tariff...","[http://www.wikidata.org/entity/Q64789172, htt..."
5980,Farouk I,[Farouk I],[frus1969-76v25_p_FI_2],"[King of Egypt, 1936–1952]","[http://www.wikidata.org/entity/Q60577842, htt..."
6023,Sabah al-Ahmad al-Jabir al-Sabah,[Sabah al-Ahmad al-Jabir al-Sabah],[frus1969-76v25_p_SAAA_1],[Kuwaiti Foreign Minister],"[http://www.wikidata.org/entity/Q57555, http:/..."


In [157]:
# helpers for using sbert for deciding among wikidata entries
def get_entity_descp(Q):
    """Query Wikidata for the English ``schema:description`` of entity ``Q``.

    Parameters
    ----------
    Q : str
        Bare entity id that is appended to the ``wd:`` prefix (e.g. ``'Q42'``).

    Returns
    -------
    dict
        The converted response of the shared ``sparqlwd`` client. If anything
        raises, the id and the error are printed and a result with no
        bindings is returned.
    """
    no_result = {'head': {'vars': ['item']}, 'results': {'bindings': []}}

    try:
        # Same query text as a triple-quoted block, spelled out line by line.
        query = (
            "\n        SELECT ?descp\n"
            "        WHERE \n"
            "        {\n"
            "        wd:" + Q + " schema:description ?descp.\n"
            "        FILTER ( lang(?descp) = \"en\" )\n"
            "        }"
        )
        sparqlwd.setQuery(query)
        response = sparqlwd.query()
        return response.convert()

    except Exception as e:
        print(f'name: {Q}')
        print(f'error message: {e}')
        return no_result


def process_candidate_entities(row):
    """Return one description string per candidate entity in ``row['wiki_col']``.

    The result is positionally aligned with ``row['wiki_col']``: element ``i``
    describes candidate ``i``. Callers index back into ``wiki_col`` with a
    position taken from this list, so exactly one entry is emitted per
    candidate — the first binding if the query returned any, otherwise ``''``.

    Parameters
    ----------
    row : mapping
        Must provide ``'wiki_col'``, an iterable of entity URIs whose last
        ``/``-separated segment is the entity id.

    Returns
    -------
    list of str
        Descriptions, same length and order as ``row['wiki_col']``.
    """

    wiki_descp = []

    for q in row['wiki_col']:
        res = get_entity_descp(q.split('/')[-1])
        bindings = res['results']['bindings']

        # Never append more than one item per candidate, or positions drift.
        wiki_descp.append(bindings[0]['descp']['value'] if bindings else '')

    return wiki_descp

In [177]:
def process_wiki_col(row):
    """Pick a single Wikidata entity for a row out of its candidate list.

    * no candidates  -> ``None``
    * one candidate  -> that candidate
    * several        -> the candidate whose description embedding has the
      highest cosine similarity to the mean embedding of
      ``row['description_list']``.

    Parameters
    ----------
    row : mapping
        Must provide ``'wiki_col'`` and, when there are several candidates,
        ``'description_list'``.

    Returns
    -------
    The selected element of ``row['wiki_col']``, or ``None``.
    """
    candidates = row['wiki_col']
    n_candidates = len(candidates)

    # Guard clauses for the trivial cases: no model call needed.
    if n_candidates == 0:
        return None
    if n_candidates == 1:
        return candidates[0]

    # Average all descriptions of the row into one reference vector.
    frus_embedding = np.mean(model.encode(row['description_list']), axis=0)

    # One description per candidate, embedded in candidate order.
    wiki_embeddings = model.encode(process_candidate_entities(row))

    cos_sim = util.cos_sim(frus_embedding, wiki_embeddings)
    selected_idx = np.argmax(cos_sim, axis=1)[0]

    return candidates[selected_idx]

In [182]:
# Start from the frame reloaded from 'tables/new_unified_person_df_wikicol.parquet'
# rather than whatever `new_unified_person_df` happens to hold in the kernel:
# process_wiki_col needs the 'wiki_col' column, which only exists in memory if the
# lookup cells were executed in this session.
new_unified_person_df = new_unified_person_df_wikicol.copy()

# Resolve every row to at most one Wikidata entity (None when there is no candidate).
selected_wiki_entity = new_unified_person_df.progress_apply(process_wiki_col, axis=1)

new_unified_person_df['selected_wiki_entity'] = selected_wiki_entity
new_unified_person_df.to_parquet('tables/new_unified_person_df_sbert.parquet')

100%|██████████| 4775/4775 [36:45<00:00,  2.16it/s]  


## Merge rows whose names resolve to exactly the same Wikidata entity

In [312]:
# Reload the table that carries the 'selected_wiki_entity' column written above.
new_unified_person_df = pd.read_parquet('tables/new_unified_person_df_sbert.parquet')

In [313]:
# t maps every row label to the set of row labels sharing its selected Wikidata
# entity. Rows without an entity (None, NaN or empty string) only match themselves.
entity_col = new_unified_person_df['selected_wiki_entity']

# Bucket the row labels per entity in one pass instead of re-filtering the
# whole frame once per row.
rows_by_entity = {}
for idx, ent in entity_col.items():
    if pd.isna(ent) or not ent:
        continue
    rows_by_entity.setdefault(ent, set()).add(idx)

t = {}
for idx, ent in entity_col.items():
    if pd.isna(ent) or not ent:
        # NaN is truthy, so it needs the explicit isna check to land here.
        t[idx] = {idx}
    else:
        # Fresh set per row so no two entries of t share one object.
        t[idx] = set(rows_by_entity[ent])

In [314]:
import copy
import itertools

# Collapse t so that each group of matching rows is represented by a single key.
# scratch_t is a working copy used to record which keys were absorbed in a pass.
scratch_t = copy.deepcopy(t)
changed_flag = True

# Repeat full passes until a pass performs no merge.
while changed_flag:

    changed_flag = False

    for key in t:
        
        # Iterates the set object bound to t[key] at loop start; the rebinding
        # of t[key] below creates a new set and does not affect this iteration.
        for matched_idx in t[key]:

            if key != matched_idx:
                # Merge only while both keys are still present (with a non-empty
                # set) in scratch_t, i.e. neither was absorbed earlier in this pass.
                if scratch_t.get(key, None) and scratch_t.get(matched_idx, None):
                    changed_flag = True
                    t[key] = t[key].union(t[matched_idx])
                    # Mark matched_idx as absorbed; it is deleted from t after the pass.
                    scratch_t.pop(matched_idx, None)
        
    # Keys popped from scratch_t during this pass are removed from t, then the
    # working copy is rebuilt from the reduced t for the next pass.
    unwanted = set(t.keys()) - set(scratch_t.keys())
    print(f'removing {len(unwanted)} keys.')
    for unwanted_key in unwanted: del t[unwanted_key]
    scratch_t = copy.deepcopy(t)
    print('---')

removing 85 keys.
---
removing 0 keys.
---


In [315]:
# For every surviving key, concatenate the list columns of all rows in its
# match set into the key's own row, then keep only the surviving rows.
for rep_idx, member_idxs in t.items():

    # .loc with a list returns a copy, so the writes below do not feed back into it.
    members = new_unified_person_df.loc[list(member_idxs)]

    for list_col in ('name_list', 'id_list', 'description_list'):
        merged = list(itertools.chain.from_iterable(members[list_col].values))
        new_unified_person_df.at[rep_idx, list_col] = merged

new_unified_person_df = new_unified_person_df.loc[t.keys()]

In [317]:
# Write the merged table to disk; the extraction section below reloads this file.
new_unified_person_df.to_parquet('tables/new_unified_person_df_final.parquet')

### Extracting extra info from Wikidata (TODO: move this into a separate file)

In [300]:
# Sample Wikidata entity id, presumably for trying the query builders by hand.
# No visible cell reads this global; the `Q` in the builders is their own parameter.
Q='Q42013'
#Q='Q9588'

In [24]:
def _simple_property_query(prop):
    """Return a builder ``f(Q) -> str`` for a direct-property (``wdt:``) query.

    The built query selects every value (``?item``) of property ``prop`` on
    entity ``Q`` together with its English label (``?itemLabel``).
    """
    def build(Q):
        return (
            "\nSELECT ?item ?itemLabel\n"
            "WHERE \n"
            "{\n"
            "wd:" + Q + " wdt:" + prop + " ?item.\n"
            "SERVICE wikibase:label { bd:serviceParam wikibase:language \"en\". }\n"
            "}"
        )
    return build


def _qualified_property_query(prop):
    """Return a builder ``f(Q) -> str`` for a statement-level (``p:``/``ps:``) query.

    In addition to ``?item`` / ``?itemLabel`` the built query optionally selects
    the ``pq:P580`` and ``pq:P582`` qualifiers of each statement as
    ``?startyearLabel`` / ``?endyearLabel``.
    """
    def build(Q):
        return (
            "\nSELECT ?item ?itemLabel ?startyearLabel ?endyearLabel\n"
            "WHERE \n"
            "{\n"
            "wd:" + Q + " p:" + prop + " ?statement1.\n"
            "?statement1 ps:" + prop + " ?item.\n"
            "OPTIONAL{?statement1 pq:P580 ?startyear.}\n"
            "OPTIONAL{?statement1 pq:P582 ?endyear.}\n"
            "SERVICE wikibase:label { bd:serviceParam wikibase:language \"en\". }\n"
            "}"
        )
    return build


# Each *_f takes a bare entity id (e.g. 'Q42') and returns the SPARQL query text.
# All direct-property queries now end their triple with '.', gender_f included.
gender_f = _simple_property_query('P21')
religion_f = _simple_property_query('P140')
educated_f = _simple_property_query('P69')
occupation_f = _simple_property_query('P106')

citizenship_f = _qualified_property_query('P27')
party_f = _qualified_property_query('P102')
memberof_f = _qualified_property_query('P463')
positionheld_f = _qualified_property_query('P39')


In [25]:
# Attribute name -> query builder; execute_query looks builders up by these keys.
function_dict = dict(
    gender=gender_f,
    religion=religion_f,
    educated=educated_f,
    occupation=occupation_f,
    positionheld=positionheld_f,
    citizenship=citizenship_f,
    memberof=memberof_f,
    party=party_f,
)

In [139]:
def execute_query(type,entity):
    """Run the query registered under ``type`` in ``function_dict`` for ``entity``.

    Parameters
    ----------
    type : str
        Key into ``function_dict`` selecting the query builder.
    entity : str
        Value handed to the selected builder.

    Returns
    -------
    dict
        The converted response of the shared ``sparqlwd`` client. On any
        exception (including an unknown ``type``) the entity and the error are
        printed and a result with no bindings is returned.
    """
    try:
        build_query = function_dict[type]
        sparqlwd.setQuery(build_query(entity))
        response = sparqlwd.query()
        return response.convert()

    except Exception as e:
        print(f'name: {entity}')
        print(f'error message: {e}')
        return {'head': {'vars': ['item']}, 'results': {'bindings': []}}


def process_query(row,type,group_by_binding=False):
    """Fetch one Wikidata attribute for the entity selected for ``row``.

    Parameters
    ----------
    row : mapping
        Must provide ``'selected_wiki_entity'``: an entity URI, or a missing
        value (``None``, NaN, empty string) when the row has no entity.
    type : str
        Attribute key understood by ``execute_query``.
    group_by_binding : bool, default False
        ``False`` keeps the historical shape: all values of all bindings
        flattened into ONE inner list, ``[[uri, label, (start), (end), uri, ...]]``.
        Absent years are simply skipped there, so a start year cannot be told
        from an end year. ``True`` returns one fixed-width record per binding,
        ``[[uri, label, start_or_None, end_or_None], ...]``.

    Returns
    -------
    list of list, or None
        ``None`` when the row has no entity or the query returned no bindings.
    """

    entity = row['selected_wiki_entity']

    # A missing entity may arrive as NaN, which is truthy and has no .split().
    if not isinstance(entity, str) or not entity:
        return None

    res = execute_query(type, entity.split('/')[-1])

    records = []
    for binding in res['results']['bindings']:
        record = [binding['item']['value'], binding['itemLabel']['value']]

        for year_key in ('startyearLabel', 'endyearLabel'):
            if binding.get(year_key, None):
                record.append(binding[year_key]['value'])
            elif group_by_binding:
                # Placeholder keeps every record four fields wide.
                record.append(None)

        records.append(record)

    if not records:
        return None

    if group_by_binding:
        return records

    # Historical shape: a single flat list wrapped in an outer list.
    return [[value for record in records for value in record]]

In [22]:
# Reload the merged table; process_query reads its 'selected_wiki_entity' column.
new_unified_person_df = pd.read_parquet('tables/new_unified_person_df_final.parquet')

In [86]:
def _attribute_series(attribute):
    """Apply process_query row by row for one attribute key, with a progress bar."""
    return new_unified_person_df.progress_apply(process_query, axis=1, args=(attribute,))


# One full pass over the table per attribute, in this order.
gender_series = _attribute_series('gender')
religion_series = _attribute_series('religion')
educated_series = _attribute_series('educated')
occupation_series = _attribute_series('occupation')
positionheld_series = _attribute_series('positionheld')
citizenship_series = _attribute_series('citizenship')
memberof_series = _attribute_series('memberof')
party_series = _attribute_series('party')

100%|██████████| 4690/4690 [08:57<00:00,  8.73it/s]


In [144]:
# Display the table for visual inspection.
new_unified_person_df

Unnamed: 0,name_set,name_list,id_list,description_list,wiki_col,selected_wiki_entity
0,Abrams Creighton General Major W.,"[Major General Creighton W. Abrams, Creighton ...","[frus1969-76v14_p_AGCWJ1, frus1969-76v07_p_ACW...","[USA, Commander of the United States Military ...",[http://www.wikidata.org/entity/Q280290],http://www.wikidata.org/entity/Q280290
1,Agnew Spiro T.,"[Spiro T. Agnew, Spiro Agnew]","[frus1969-76v14_p_AST1, frus1969-76v28_p_AST_1...","[Vice President of the United States, Vice Pre...",[http://www.wikidata.org/entity/Q203433],http://www.wikidata.org/entity/Q203433
2,Aleksandrov-Agentov Andrei M.,"[Andrei M. Aleksandrov, Andrei M. Aleksandrov-...","[frus1969-76v15_p_AAM_1, frus1969-76v25_p_AAM_...",[Assistant to Soviet General Secretary Brezhne...,[],
3,Alkhimov S. Vladimir,"[Alkhimov Vladimir, Vladimir S. Alkhimov]","[frus1969-76v15_p_AV_1, frus1969-76v14_p_AVS1]","[Soviet Deputy Foreign Trade Minister, Soviet ...",[],
4,Allen Richard V.,"[Richard Allen, Richard V. Allen]","[frus1969-76v29_p_AR1, frus1969-76v03_p_ARV1, ...","[Member, National Security Council Staff, 1969...","[http://www.wikidata.org/entity/Q30122355, htt...",http://www.wikidata.org/entity/Q517683
...,...,...,...,...,...,...
6031,Sirri Umar,[Umar Sirri],[frus1969-76v25_p_SU_1],"[Minister, Egyptian Ministry of Foreign Affairs]",[],
6033,Colonel Donald Lieutenant Stukel,[Lieutenant Colonel Donald Stukel],[frus1969-76v25_p_SD_1],"[member, National Security Council staff]",[],
6035,Cyrus L. Sulzberger,[Cyrus L. Sulzberger],[frus1969-76v25_p_SCL_1],"[II, U.S. newspaper journalist]","[http://www.wikidata.org/entity/Q23681106, htt...",http://www.wikidata.org/entity/Q5201118
6036,General Gordon Major Sumner,[Major General Gordon Sumner],[frus1969-76v25_p_SG_1],"[Director, Near East and South Asia Region, Of...",[],


In [140]:
# Trial run of the 'memberof' query on the first 100 rows only.
temp_series = new_unified_person_df.iloc[:100].progress_apply(process_query,axis=1,args=('memberof',))

100%|██████████| 100/100 [00:16<00:00,  6.02it/s]


In [141]:
# Display the trial result.
temp_series

0                                                  None
1                                                  None
2                                                  None
3                                                  None
4                                                  None
                            ...                        
95    [[http://www.wikidata.org/entity/Q463303, Amer...
96                                                 None
97    [[http://www.wikidata.org/entity/Q47932, Jewis...
98                                                 None
99    [[http://www.wikidata.org/entity/Q2839513, Alp...
Length: 100, dtype: object

In [135]:
# Inspect a single result to see the nested-list shape returned by process_query.
# NOTE(review): the recorded output lists universities, while the 'memberof' preview
# above shows None at position 4, and this cell's execution count is lower than the
# trial run's — the output is presumably from an earlier run; re-run to refresh.
temp_series.iloc[4]

[['http://www.wikidata.org/entity/Q13371',
  'Harvard University',
  'http://www.wikidata.org/entity/Q55044',
  'Ludwig Maximilian University of Munich',
  'http://www.wikidata.org/entity/Q178848',
  'University of Notre Dame']]