In [59]:
import pandas as pd
import dask.dataframe as dd
from dask.diagnostics import ProgressBar
from SPARQLWrapper import SPARQLWrapper, JSON

from tqdm import tqdm
# Registers `progress_apply` on pandas objects; the row-wise Wikidata lookup
# further down calls it.
tqdm.pandas()

import ssl
# NOTE(review): this replaces the stdlib's default HTTPS context factory with
# the unverified one, so TLS certificates are not checked for HTTPS requests
# that use the default context for the rest of the kernel session.
# Presumably a workaround for a local certificate problem — confirm it is still
# needed, and prefer fixing the CA bundle over disabling verification globally.
ssl._create_default_https_context = ssl._create_unverified_context

In [60]:
# NOTE(review): this User-Agent is a placeholder (example.org contact details).
# Presumably it should identify the real project and a reachable contact before
# running bulk queries — confirm against the endpoint's usage policy.
user_agent = 'CoolBot/0.0 (https://example.org/coolbot/; coolbot@example.org)'

# Shared client for the Wikidata SPARQL endpoint; the query helpers in this
# notebook reuse this single instance and read its results as JSON.
sparqlwd = SPARQLWrapper("https://query.wikidata.org/sparql", agent=user_agent)
sparqlwd.setReturnFormat(JSON)

## person matching

In [None]:
def find_wiki_entity(name, client=None):
    """Search Wikidata for human entities (instance of Q5) matching ``name``.

    The name is embedded in a single-quoted SPARQL string literal, so it is
    escaped first; without that, names containing an apostrophe or a backslash
    produce a malformed query and silently yield no candidates.

    Parameters
    ----------
    name : str
        Person name to look up through the ``EntitySearch`` MediaWiki API service.
    client : object, optional
        Object exposing ``setQuery(query)`` and ``query().convert()``.
        Defaults to the notebook-level ``sparqlwd`` client.

    Returns
    -------
    dict
        The converted SPARQL JSON result. On any error (bad input, network or
        query failure) the error is printed and an empty result with the same
        shape is returned, so callers can always read ``['results']['bindings']``.
    """
    try:
        # Backslash must be escaped first, otherwise the escapes added for the
        # other characters would themselves be doubled.
        escaped = (
            name.replace('\\', '\\\\')
            .replace("'", "\\'")
            .replace('\n', '\\n')
            .replace('\r', '\\r')
        )

        query = """
        SELECT ?item WHERE {
        SERVICE wikibase:mwapi {
            bd:serviceParam wikibase:endpoint "www.wikidata.org";
                            wikibase:api "EntitySearch";
                            mwapi:search  '""" + escaped + """';
                            mwapi:language "en".
            ?item wikibase:apiOutputItem mwapi:item.
            ?num wikibase:apiOrdinal true.
        }
        ?item wdt:P31 wd:Q5
        }
        """

        endpoint = sparqlwd if client is None else client
        endpoint.setQuery(query)

        return endpoint.query().convert()

    except Exception as e:
        # Best-effort lookup: report the failing name and keep the batch going.
        print(f'name: {name}')
        print(f'error message: {e}')
        return {'head': {'vars': ['item']}, 'results': {'bindings': []}}


def process_name_list(row):
    """Collect the Wikidata candidates for every name variant of one person.

    Parameters
    ----------
    row : mapping
        A table row whose ``'name_list'`` entry is an iterable of name strings.

    Returns
    -------
    list of str
        Unique entity URIs returned by ``find_wiki_entity`` across all names,
        in first-seen order.
    """
    # A dict is used as an ordered set: duplicates collapse, but the result
    # order no longer depends on string hashing, which differs between Python
    # processes and made the stored column vary from run to run.
    found = {}

    for name in row['name_list']:
        res = find_wiki_entity(name)

        for binding in res['results']['bindings']:
            found[binding['item']['value']] = None

    return list(found)

In [None]:
# Input table for entity linking; the cells below read its `name_list` column
# (one row per person, presumably — confirm against the upstream notebook).
new_unified_person_df = pd.read_parquet('tables/new_unified_person_df.parquet')

In [None]:
# Resolve every row's name variants to Wikidata candidates, row by row, with a
# tqdm progress bar.
wiki_col = new_unified_person_df.progress_apply(process_name_list, axis=1)

In [None]:
# Attach the candidate lists as a column and write the enriched table to disk.
new_unified_person_df = new_unified_person_df.assign(wiki_col=wiki_col)
new_unified_person_df.to_parquet('tables/new_unified_person_df_wikicol.parquet')

In [None]:
# Inverted index: Wikidata entity URI -> index labels of all rows listing it.
tag_d = {}

for idx, entities in new_unified_person_df['wiki_col'].items():
    for ent in entities:
        tag_d.setdefault(ent, []).append(idx)


In [None]:
# Display two hand-picked rows side by side.
x = [5293, 5929]
new_unified_person_df.loc[x, :]

In [None]:
# Print the row labels of every entity that more than one row points to.
for rows in tag_d.values():
    if len(rows) > 1:
        print(rows)

### sentence transformers

In [48]:
import numpy as np
from sentence_transformers import SentenceTransformer,util

# Sentence encoder used below to embed both the table's descriptions and the
# Wikidata descriptions before comparing them.
model = SentenceTransformer('all-MiniLM-L6-v2')

In [49]:
# Reload the table that the person-matching section saved with its `wiki_col`
# candidate lists, so this section does not need to re-query Wikidata.
new_unified_person_df_wikicol = pd.read_parquet('tables/new_unified_person_df_wikicol.parquet')

In [141]:
# Rows with more than one Wikidata candidate, i.e. those needing disambiguation.
new_unified_person_df_wikicol.loc[
    new_unified_person_df_wikicol['wiki_col'].map(lambda candidates: len(candidates) > 1)
]

Unnamed: 0,name_set,name_list,id_list,description_list,wiki_col
4,Allen Richard V.,"[Richard Allen, Richard V. Allen]","[frus1969-76v29_p_AR1, frus1969-76v03_p_ARV1, ...","[Member, National Security Council Staff, 1969...","[http://www.wikidata.org/entity/Q30122355, htt..."
10,Blee David H.,"[David H. Blee, David Blee]","[frus1969-76v14_p_BDH2, frus1969-76ve08_p_BDH2...","[Chief of the Soviet/Eastern Europe Division, ...","[http://www.wikidata.org/entity/Q15804769, htt..."
11,Brandt Willy,[Willy Brandt],"[frus1969-76v14_p_BWHF1, frus1969-76v29_p_BW1,...",[Chancellor of the Federal Republic of Germany...,"[http://www.wikidata.org/entity/Q29168166, htt..."
18,Castro Fidel Ruz,"[Fidel Castro, Fidel Castro Ruz, Castro Ruz Fi...","[frus1969-76v16_p_CF_1, frus1969-76ve16_p_CF_1...","[Premier of Cuba, Cuban Prime Minister, Cuban ...","[http://www.wikidata.org/entity/Q11256, http:/..."
19,Chancellor John,"[John Chancellor, Chancellor John]","[frus1969-76v14_p_CJ8, frus1969-76v13_p_CJ1]","[anchor on the NBC Nightly News, anchor on NBC...","[http://www.wikidata.org/entity/Q1770797, http..."
...,...,...,...,...,...
5925,Jackson John,[John Jackson],[frus1969-76v31_p_JJ1],"[General Counsel, Office of the Special Repres...","[http://www.wikidata.org/entity/Q19325443, htt..."
5930,Long Olivier,[Olivier Long],[frus1969-76v31_p_LO1],"[Director-General, General Agreement on Tariff...","[http://www.wikidata.org/entity/Q64789172, htt..."
5980,Farouk I,[Farouk I],[frus1969-76v25_p_FI_2],"[King of Egypt, 1936–1952]","[http://www.wikidata.org/entity/Q60577842, htt..."
6023,Sabah al-Ahmad al-Jabir al-Sabah,[Sabah al-Ahmad al-Jabir al-Sabah],[frus1969-76v25_p_SAAA_1],[Kuwaiti Foreign Minister],"[http://www.wikidata.org/entity/Q57555, http:/..."


In [126]:
# TODO: consider moving this step into the entity-linking section above and storing the result as a new column, `wiki_descp`.
def get_entity_descp(Q, client=None):
    """Fetch the English description(s) of one Wikidata entity.

    Parameters
    ----------
    Q : str
        Bare entity id such as ``'Q42'`` (no URI prefix); it is appended to the
        ``wd:`` prefix in the query.
    client : object, optional
        Object exposing ``setQuery(query)`` and ``query().convert()``.
        Defaults to the notebook-level ``sparqlwd`` client.

    Returns
    -------
    dict
        The converted SPARQL JSON result, whose bindings carry a ``'descp'``
        value. On any error the error is printed and an empty result with the
        same shape is returned, so callers can always read
        ``['results']['bindings']``.
    """
    try:
        query = """
        SELECT ?descp
        WHERE 
        {
        wd:""" + Q + """ schema:description ?descp.
        FILTER ( lang(?descp) = "en" )
        }"""

        endpoint = sparqlwd if client is None else client
        endpoint.setQuery(query)

        return endpoint.query().convert()

    except Exception as e:
        # Best-effort lookup: report the failing entity and keep the batch going.
        print(f'entity: {Q}')
        print(f'error message: {e}')
        # The fallback header names the variable this query selects (?descp).
        return {'head': {'vars': ['descp']}, 'results': {'bindings': []}}


def process_candidate_entities(row):
    """Return one Wikidata description per candidate entity of a row.

    Parameters
    ----------
    row : mapping
        A table row whose ``'wiki_col'`` entry is an iterable of entity URIs.

    Returns
    -------
    list of str
        Descriptions positionally aligned with ``row['wiki_col']``: entry ``i``
        describes candidate ``i``. Candidates without a description get ``''``.
    """
    wiki_descp = []

    for q in row['wiki_col']:
        # The entity id is the last path segment of the URI.
        res = get_entity_descp(q.split('/')[-1])
        bindings = res['results']['bindings']

        # Always append exactly one entry per candidate. Callers index the
        # candidate list with a position taken from this list, so several
        # bindings for one entity must not add extra entries. Joining yields ''
        # for no bindings and the plain description for a single binding.
        wiki_descp.append('; '.join(binding['descp']['value'] for binding in bindings))

    return wiki_descp

In [147]:
# Worked example for a single row: choose the Wikidata candidate whose
# description is most similar to the row's own descriptions.
row = new_unified_person_df_wikicol.loc[10] # 6035

# Encode every description of the row and average them into one vector.
desc_list = row['description_list']
frus_embedding = np.mean(model.encode(desc_list), axis=0)

# Descriptions of the row's candidates; the lookup at the end assumes
# wiki_descs[i] describes row["wiki_col"][i] — verify against
# process_candidate_entities.
wiki_descs = process_candidate_entities(row)
wiki_embeddings = model.encode(wiki_descs)

# Cosine similarity between the averaged row vector and each candidate
# description (printed as a single row of scores, one per candidate).
cos_sim = util.cos_sim(frus_embedding, wiki_embeddings)
print(cos_sim)

# Position of the best-scoring candidate.
# NOTE(review): a row with no candidates presumably fails here — guard before
# applying this to the whole table.
selected_idx = np.argmax(cos_sim,axis=1)[0]

print(f'desc_list:{desc_list}\n\nwiki_descs:{wiki_descs}\n\nselected:{wiki_descs[selected_idx]}\n\nQ:{row["wiki_col"][selected_idx]}')

tensor([[0.1281, 0.5819]])
desc_list:['Chief of the Soviet/Eastern Europe Division, Directorate of Operations, Central Intelligence Agency'
 'Associate Director of Operations, Central Intelligence Agency'
 'Chief, Near East and South Asia Division, Directorate of Operations, Central Intelligence Agency'
 'Central Intelligence Agency'
 'Chief, Near East and South Asia Division, Directorate of Operations, Central Intelligence Agency'
 'Chief, Near East and South Asia Division, Directorate of Operations, Central Intelligence Agency']

wiki_descs:['American mathematician', 'CIA officer (1916-2000)']

selected:CIA officer (1916-2000)

Q:http://www.wikidata.org/entity/Q28162662


### extracting extra info from wikidata

In [None]:
# `Q` has to be bound in this cell: elsewhere in the notebook it only exists as
# a function parameter, so a fresh kernel has no module-level `Q`.
# Use the candidate selected in the sentence-transformers example; the entity
# id is the last path segment of its URI.
Q = row["wiki_col"][selected_idx].split('/')[-1]

query="""
SELECT ?descp
WHERE 
{
wd:"""+Q+""" schema:description ?descp.
FILTER ( lang(?descp) = "en" )
}"""

sparqlwd.setQuery(query)

sparqlwd.query().convert()