In [1]:
import pandas as pd
import dask.dataframe as dd
from dask.diagnostics import ProgressBar
from SPARQLWrapper import SPARQLWrapper, JSON

from tqdm import tqdm
tqdm.pandas()

import ssl
ssl._create_default_https_context = ssl._create_unverified_context

In [2]:
user_agent = 'CoolBot/0.0 (https://example.org/coolbot/; coolbot@example.org)'

sparqlwd = SPARQLWrapper("https://query.wikidata.org/sparql", agent=user_agent)
sparqlwd.setReturnFormat(JSON)

## person matching (move to person_unify)

In [None]:
def find_wiki_entity(name):

    try:
        query = """
        SELECT ?item WHERE {
        SERVICE wikibase:mwapi {
            bd:serviceParam wikibase:endpoint "www.wikidata.org";
                            wikibase:api "EntitySearch";
                            mwapi:search  \'"""+name+"""\';
                            mwapi:language "en".
            ?item wikibase:apiOutputItem mwapi:item.
            ?num wikibase:apiOrdinal true.
        }
        ?item wdt:P31 wd:Q5
        }
        """
        
        sparqlwd.setQuery(query)

        return sparqlwd.query().convert()

    except Exception as e:
        print(f'name: {name}')
        print(f'error message: {e}')
        return {'head': {'vars': ['item']}, 'results': {'bindings': []}}


def process_name_list(row):

    name_list = row['name_list']

    wiki_tag = set()

    for name in name_list:
        res = find_wiki_entity(name)

        for binding in res['results']['bindings']:
            wiki_tag.add(binding['item']['value'])

    return list(wiki_tag)

In [None]:
new_unified_person_df = pd.read_parquet('tables/new_unified_person_df.parquet')

In [None]:
wiki_col = new_unified_person_df.progress_apply(lambda x: process_name_list(x),axis=1)

In [None]:
new_unified_person_df['wiki_col'] = wiki_col
new_unified_person_df.to_parquet('tables/new_unified_person_df_wikicol.parquet')

### processing ends.

### sentence transformers

In [None]:
import numpy as np
from sentence_transformers import SentenceTransformer,util

model = SentenceTransformer('all-MiniLM-L6-v2')

In [None]:
new_unified_person_df_wikicol = pd.read_parquet('tables/new_unified_person_df_wikicol.parquet')

In [None]:
new_unified_person_df_wikicol[new_unified_person_df_wikicol['wiki_col'].apply(lambda x: True if len(x)>1 else False)]

In [None]:
# helpers for using sbert for deciding among wikidata entries
def get_entity_descp(Q):

    try:
        query = """
        SELECT ?descp
        WHERE 
        {
        wd:"""+Q+""" schema:description ?descp.
        FILTER ( lang(?descp) = "en" )
        }"""
        
        sparqlwd.setQuery(query)

        return sparqlwd.query().convert()

    except Exception as e:
        print(f'name: {Q}')
        print(f'error message: {e}')
        return {'head': {'vars': ['item']}, 'results': {'bindings': []}}


def process_candidate_entities(row):

    q_list = row['wiki_col']
    
    wiki_descp = []

    for q in q_list:
        
        res = get_entity_descp(q.split('/')[-1])
        
        if len(res['results']['bindings'])==0:
            wiki_descp.append('')
        else:      
            for binding in res['results']['bindings']:

                wiki_descp.append(binding['descp']['value'])

    return wiki_descp

In [None]:
def process_wiki_col(row):

    wiki_col = row['wiki_col']
    
    if len(wiki_col)==0:
        return None

    elif len(wiki_col)==1:
        return wiki_col[0]

    else:
        desc_list = row['description_list']
        frus_embedding = np.mean(model.encode(desc_list), axis=0)

        wiki_descs = process_candidate_entities(row)
        wiki_embeddings = model.encode(wiki_descs)

        cos_sim = util.cos_sim(frus_embedding, wiki_embeddings)

        selected_idx = np.argmax(cos_sim,axis=1)[0]
        
        return row["wiki_col"][selected_idx]

In [None]:
selected_wiki_entity = new_unified_person_df.progress_apply(lambda x: process_wiki_col(x),axis=1)

new_unified_person_df['selected_wiki_entity'] = selected_wiki_entity
new_unified_person_df.to_parquet('tables/new_unified_person_df_sbert.parquet')

## reduce names with exactly same wikidata entries

In [None]:
new_unified_person_df = pd.read_parquet('tables/new_unified_person_df_sbert.parquet')

In [None]:
t = {}

for idx, key in new_unified_person_df.iterrows():

    ent = key['selected_wiki_entity']

    if not ent:
        t[idx]=set([idx])
    else:
        t[idx]=set(new_unified_person_df[new_unified_person_df['selected_wiki_entity']==ent].index)

In [None]:
import copy
import itertools

scratch_t = copy.deepcopy(t)
changed_flag = True

while changed_flag:

    changed_flag = False

    for key in t:
        
        for matched_idx in t[key]:

            if key != matched_idx:
                if scratch_t.get(key, None) and scratch_t.get(matched_idx, None):
                    changed_flag = True
                    t[key] = t[key].union(t[matched_idx])
                    scratch_t.pop(matched_idx, None)
        
    unwanted = set(t.keys()) - set(scratch_t.keys())
    print(f'removing {len(unwanted)} keys.')
    for unwanted_key in unwanted: del t[unwanted_key]
    scratch_t = copy.deepcopy(t)
    print('---')

In [None]:
for temp_key in t:
    
    te_df = new_unified_person_df.loc[list(t[temp_key])]

    name_list = list(itertools.chain.from_iterable(te_df['name_list'].values))
    id_list = list(itertools.chain.from_iterable(te_df['id_list'].values))
    description_list = list(itertools.chain.from_iterable(te_df['description_list'].values))

    new_unified_person_df.at[temp_key, 'name_list'] = name_list
    new_unified_person_df.at[temp_key, 'id_list'] = id_list
    new_unified_person_df.at[temp_key, 'description_list'] = description_list

new_unified_person_df = new_unified_person_df.loc[t.keys()]

In [None]:
new_unified_person_df.to_parquet('tables/new_unified_person_df_final.parquet')

### extracting extra info from wikidata (make this separate file)

In [None]:
new_unified_person_df = pd.read_parquet('tables/new_unified_person_df_final.parquet')

In [3]:
gender_f=lambda Q:"""
SELECT ?item ?itemLabel
WHERE 
{
wd:"""+Q+""" wdt:P21 ?item;
SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }
}"""

religion_f=lambda Q:"""
SELECT ?item ?itemLabel
WHERE 
{
wd:"""+Q+""" wdt:P140 ?item.
SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }
}"""

educated_f=lambda Q:"""
SELECT ?item ?itemLabel
WHERE 
{
wd:"""+Q+""" wdt:P69 ?item.
SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }
}"""

occupation_f=lambda Q:"""
SELECT ?item ?itemLabel
WHERE 
{
wd:"""+Q+""" wdt:P106 ?item.
SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }
}"""

citizenship_f=lambda Q:"""
SELECT ?item ?itemLabel ?startyearLabel ?endyearLabel
WHERE 
{
wd:"""+Q+""" p:P27 ?statement1.
?statement1 ps:P27 ?item.
OPTIONAL{?statement1 pq:P580 ?startyear.}
OPTIONAL{?statement1 pq:P582 ?endyear.}
SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }
}"""

party_f=lambda Q:"""
SELECT ?item ?itemLabel ?startyearLabel ?endyearLabel
WHERE 
{
wd:"""+Q+""" p:P102 ?statement1.
?statement1 ps:P102 ?item.
OPTIONAL{?statement1 pq:P580 ?startyear.}
OPTIONAL{?statement1 pq:P582 ?endyear.}
SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }
}"""

memberof_f=lambda Q:"""
SELECT ?item ?itemLabel ?startyearLabel ?endyearLabel
WHERE 
{
wd:"""+Q+""" p:P463 ?statement1.
?statement1 ps:P463 ?item.
OPTIONAL{?statement1 pq:P580 ?startyear.}
OPTIONAL{?statement1 pq:P582 ?endyear.}
SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }
}"""

positionheld_f=lambda Q:"""
SELECT ?item ?itemLabel ?startyearLabel ?endyearLabel
WHERE 
{
wd:"""+Q+""" p:P39 ?statement1.
?statement1 ps:P39 ?item.
OPTIONAL{?statement1 pq:P580 ?startyear.}
OPTIONAL{?statement1 pq:P582 ?endyear.}
SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }
}"""


In [4]:
function_dict={'gender':gender_f,
               'religion':religion_f,
               'educated':educated_f,
               'occupation':occupation_f,
               'positionheld':positionheld_f,
               'citizenship':citizenship_f,
               'memberof':memberof_f,
               'party':party_f}

In [51]:
def execute_query(type,entity):

    try:
        sparqlwd.setQuery(function_dict[type](entity))

        return sparqlwd.query().convert()

    except Exception as e:
        print(f'name: {entity}')
        print(f'error message: {e}')
        return {'head': {'vars': ['item']}, 'results': {'bindings': []}}


def process_query(row,type):
    
    entity = row['selected_wiki_entity']

    retrieved = []

    if entity:

        entity = entity.split('/')[-1]

        res = execute_query(type,entity)

        for binding in res['results']['bindings']:
            temp = []
            temp.append(binding['item']['value'])
            temp.append(binding['itemLabel']['value'])
            if binding.get('startyearLabel',None):
                temp.append(binding['startyearLabel']['value'])
            if binding.get('endyearLabel',None):
                temp.append(binding['endyearLabel']['value'])
        
            if len(temp)>0:
                retrieved.append(temp)

    if len(retrieved)>0:
        return retrieved
    else:
        return None

In [56]:
gender_series = new_unified_person_df.progress_apply(process_query,axis=1,args=('gender',))
religion_series = new_unified_person_df.progress_apply(process_query,axis=1,args=('religion',))
educated_series = new_unified_person_df.progress_apply(process_query,axis=1,args=('educated',))
occupation_series = new_unified_person_df.progress_apply(process_query,axis=1,args=('occupation',))
positionheld_series = new_unified_person_df.progress_apply(process_query,axis=1,args=('positionheld',))
citizenship_series = new_unified_person_df.progress_apply(process_query,axis=1,args=('citizenship',))
party_series = new_unified_person_df.progress_apply(process_query,axis=1,args=('party',))
memberof_series = new_unified_person_df.progress_apply(process_query,axis=1,args=('memberof',))

100%|██████████| 4690/4690 [09:27<00:00,  8.27it/s] 
100%|██████████| 4690/4690 [09:06<00:00,  8.58it/s]
100%|██████████| 4690/4690 [09:33<00:00,  8.17it/s]
100%|██████████| 4690/4690 [09:29<00:00,  8.24it/s]
100%|██████████| 4690/4690 [09:29<00:00,  8.23it/s] 
100%|██████████| 4690/4690 [09:31<00:00,  8.20it/s]
100%|██████████| 4690/4690 [09:14<00:00,  8.46it/s]


In [62]:
# no need to this!
merged_extra_df = pd.DataFrame.from_dict({'gender':gender_series,
                        'religion':religion_series,
                        'educated_at':educated_series,
                        'occupation':occupation_series,
                        'position_held':positionheld_series,
                        'citizenship':citizenship_series,
                        'member_of':memberof_series,
                        'political_party':party_series})

merged_extra_df.to_parquet('tables/person_wikidata_extras.parquet')

In [79]:
new_unified_person_df['gender'] = list(map(lambda x:x[0][1] if x else None,gender_series))
new_unified_person_df.to_parquet('tables/new_unified_person_df_final.parquet')

In [155]:
name_series_map = {'religion':religion_series,
                    'school':educated_series,
                    'occupation':occupation_series,
                    'role':positionheld_series,
                    'citizenship':citizenship_series,
                    'political_party':party_series}

In [162]:
# create dataframes of extra information

for series_name in ['religion', 'school', 'occupation']:

    series = name_series_map[series_name]

    temp_df = pd.concat([new_unified_person_df['name_set'],series],axis=1)
    temp_df.rename(columns={0:'info_list'},inplace=True)

    info_df = pd.DataFrame(columns=['name_set','info_name','info_tag'])

    def aux(row):
        global info_df

        name_set = row['name_set']
        info_list = row['info_list']

        if not info_list:
            info_df = pd.concat((info_df,pd.DataFrame({'name_set':[name_set],'info_name':[None],'info_tag':[None]})))
        else:
            for info in info_list:
                info_df = pd.concat((info_df,pd.DataFrame({'name_set':[name_set],'info_name':[info[1]],'info_tag':[info[0]]})))
        
        return

    temp_df.apply(lambda x: aux(x),axis=1)

    info_df.to_parquet('tables/person_'+series_name+'_69_76.parquet')
    


In [165]:
school_df = pd.read_parquet('tables/person_occupation_69_76.parquet')

In [166]:
school_df

Unnamed: 0,name_set,info_name,info_tag
0,Abrams Creighton General Major W.,military officer,http://www.wikidata.org/entity/Q189290
0,Agnew Spiro T.,lawyer,http://www.wikidata.org/entity/Q40348
0,Agnew Spiro T.,politician,http://www.wikidata.org/entity/Q82955
0,Aleksandrov-Agentov Andrei M.,,
0,Alkhimov S. Vladimir,,
...,...,...,...
0,Colonel Donald Lieutenant Stukel,,
0,Cyrus L. Sulzberger,journalist,http://www.wikidata.org/entity/Q1930187
0,Cyrus L. Sulzberger,diarist,http://www.wikidata.org/entity/Q18939491
0,General Gordon Major Sumner,,
