In [1]:
import pandas as pd
from SPARQLWrapper import SPARQLWrapper, JSON
import ray
from tqdm import tqdm
tqdm.pandas()

import ssl
# SECURITY NOTE(review): this swaps the default HTTPS context factory for the
# unverified one, so HTTPS requests that rely on the default context in this
# kernel are made without certificate verification. Presumably a workaround for
# a local certificate problem — confirm, and prefer fixing the CA setup instead.
ssl._create_default_https_context = ssl._create_unverified_context

In [2]:
# User-Agent string passed to the SPARQL client below.
# NOTE(review): this looks like a placeholder (example.org URL and address) —
# replace it with a real tool name and contact before running at scale.
user_agent = 'CoolBot/0.0 (https://example.org/coolbot/; coolbot@example.org)'

# Shared client for the Wikidata SPARQL endpoint; every query in this notebook
# goes through this object, and responses are requested as JSON.
sparqlwd = SPARQLWrapper("https://query.wikidata.org/sparql", agent=user_agent)
sparqlwd.setReturnFormat(JSON)

### Extracting extra info from Wikidata (TODO: move this into a separate file)

In [3]:
# Directory (with trailing slash) holding every parquet table read or written below.
tables_path = 'tables/tables_52_88_demo/'

# Person table that the Wikidata lookups in this notebook are run against.
new_unified_person_df = pd.read_parquet(f'{tables_path}unified_person_df_final.parquet')

In [8]:
# Helper functions for extracting specific person information.
#
# Every builder takes a Wikidata Q-id string (e.g. 'Q42') and returns the text
# of a SPARQL query. The queries come in two shapes, so the shared text lives in
# the two private templates below instead of being copy-pasted per property.

_LABEL_SERVICE = 'SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }'


def _direct_value_query(Q, prop):
    """Build a query selecting ?item / ?itemLabel for `wd:Q wdt:prop ?item`."""
    return "\n".join([
        "",
        "SELECT ?item ?itemLabel",
        "WHERE ",
        "{",
        "wd:" + Q + " wdt:" + prop + " ?item.",
        _LABEL_SERVICE,
        "}",
    ])


def _dated_statement_query(Q, prop):
    """Build a query over the `prop` statements of `Q`, including the optional
    P580 / P582 qualifiers exposed as ?startyearLabel / ?endyearLabel."""
    return "\n".join([
        "",
        "SELECT ?item ?itemLabel ?startyearLabel ?endyearLabel",
        "WHERE ",
        "{",
        "wd:" + Q + " p:" + prop + " ?statement1.",
        "?statement1 ps:" + prop + " ?item.",
        "OPTIONAL{?statement1 pq:P580 ?startyear.}",
        "OPTIONAL{?statement1 pq:P582 ?endyear.}",
        _LABEL_SERVICE,
        "}",
    ])


def gender_f(Q):
    """Query for property P21 of entity `Q`."""
    # The previous version ended this triple with ';' while every other builder
    # used '.'; it now shares the common template.
    return _direct_value_query(Q, 'P21')


def religion_f(Q):
    """Query for property P140 of entity `Q`."""
    return _direct_value_query(Q, 'P140')


def educated_f(Q):
    """Query for property P69 of entity `Q`."""
    return _direct_value_query(Q, 'P69')


def occupation_f(Q):
    """Query for property P106 of entity `Q`."""
    return _direct_value_query(Q, 'P106')


def citizenship_f(Q):
    """Query for the P27 statements of entity `Q`, with optional start/end qualifiers."""
    return _dated_statement_query(Q, 'P27')


def party_f(Q):
    """Query for the P102 statements of entity `Q`, with optional start/end qualifiers."""
    return _dated_statement_query(Q, 'P102')


def memberof_f(Q):
    """Query for the P463 statements of entity `Q`, with optional start/end qualifiers."""
    return _dated_statement_query(Q, 'P463')


def positionheld_f(Q):
    """Query for the P39 statements of entity `Q`, with optional start/end qualifiers."""
    return _dated_statement_query(Q, 'P39')


In [7]:
# Dispatch table: information type -> function that builds its SPARQL query.
function_dict = dict(
    gender=gender_f,
    religion=religion_f,
    educated=educated_f,
    occupation=occupation_f,
    positionheld=positionheld_f,
    citizenship=citizenship_f,
    memberof=memberof_f,
    party=party_f,
)

In [9]:
def execute_query(type, entity, max_retries=3, backoff=1.0):
    """Run the SPARQL query of kind `type` for `entity` and return the converted result.

    Parameters
    ----------
    type : str
        Key into `function_dict` selecting the query builder.
    entity : str
        Identifier handed to the query builder (callers pass a Q-id).
    max_retries : int, optional
        How many times a rate-limited request (HTTP 429) is retried.
    backoff : float, optional
        Base delay in seconds; attempt `k` (0-based) waits `backoff * 2**k`.

    Returns
    -------
    dict
        The converted response of the shared `sparqlwd` client. If the request
        fails (or stays rate-limited after the retries) the error is printed and
        a response-shaped dict with an empty `bindings` list is returned, so
        callers never see an exception.
    """
    import time  # function-scope import keeps this cell self-contained

    attempt = 0
    while True:
        try:
            sparqlwd.setQuery(function_dict[type](entity))

            return sparqlwd.query().convert()

        except Exception as e:
            # The 429 errors logged by this notebook print in urllib's HTTPError
            # format, which carries the status in `.code`. Giving up on them
            # immediately silently dropped data, so wait and retry a few times.
            if getattr(e, 'code', None) == 429 and attempt < max_retries:
                time.sleep(backoff * (2 ** attempt))
                attempt += 1
                continue

            print(f'name: {entity}')
            print(f'error message: {e}')
            return {'head': {'vars': ['item']}, 'results': {'bindings': []}}


def process_query(row,type):
    """Fetch one kind of Wikidata information for a single person.

    Parameters
    ----------
    row : mapping, pandas.Series, str or None
        Either a row exposing 'selected_wiki_entity' (as passed by
        `DataFrame.apply(axis=1)`) or the entity value itself. Both call styles
        appear in this notebook, so both are accepted.
    type : str
        Information type, forwarded to `execute_query`.

    Returns
    -------
    list[list] or None
        One `[item, itemLabel]` list per result binding, extended with the start
        and end values when the binding carries them. A missing start is stored
        as None whenever an end is present, so index 2 is always the start and
        index 3 always the end. None when the person has no entity or no results.
    """
    if row is None or isinstance(row, str):
        entity = row
    else:
        entity = row['selected_wiki_entity']

    # Re-applied on every call rather than only once at the top of the notebook —
    # presumably so it also takes effect in worker processes; confirm.
    # SECURITY NOTE(review): this disables HTTPS certificate verification.
    ssl._create_default_https_context = ssl._create_unverified_context

    # Missing entities can arrive as None, '' or NaN; none of them can be queried.
    if not isinstance(entity, str) or not entity:
        return None

    # Only the last path segment of the entity value is used in the query.
    entity = entity.split('/')[-1]

    res = execute_query(type,entity)

    retrieved = []
    for binding in res['results']['bindings']:
        record = [binding['item']['value'], binding['itemLabel']['value']]

        start = binding.get('startyearLabel')
        end = binding.get('endyearLabel')
        if start or end:
            # Keep the start slot even when only the end is known; otherwise the
            # end value would slide into the start position.
            record.append(start['value'] if start else None)
        if end:
            record.append(end['value'])

        retrieved.append(record)

    return retrieved or None

In [10]:
# Sequential trial run over the first 100 persons, with a tqdm progress bar.
gender_series = new_unified_person_df.head(100).progress_apply(
    lambda person_row: process_query(person_row, 'gender'), axis=1)

 15%|█▌        | 15/100 [00:04<00:23,  3.60it/s]


KeyboardInterrupt: 

In [7]:
# Parallel retrieval of every information type with Ray.

# NOTE(review): the output of an earlier run shows "HTTP Error 429: Too Many
# Requests", so this degree of parallelism may exceed what the endpoint
# tolerates — consider lowering it.
NUM_RAY_CPUS = 13

ray.init(num_cpus=NUM_RAY_CPUS)

try:
    # `process_query` is defined as a plain function, which has no `.remote`
    # attribute; wrap it as a Ray task unless it already is one.
    process_query_task = process_query if hasattr(process_query, 'remote') else ray.remote(process_query)

    def fetch_info_series(info_type):
        """Query `info_type` for every person and return the results as a Series
        that shares the index of `new_unified_person_df`."""
        futures = [
            # `process_query` reads row['selected_wiki_entity'], so hand it a
            # one-key mapping rather than the bare entity string.
            process_query_task.remote({'selected_wiki_entity': entity}, info_type)
            for entity in new_unified_person_df['selected_wiki_entity'].values
        ]
        # Reuse the dataframe's index so later index-aligned operations pair
        # every result with the person it was fetched for.
        return pd.Series(ray.get(futures), index=new_unified_person_df.index)

    gender_series = fetch_info_series('gender')
    religion_series = fetch_info_series('religion')
    educated_series = fetch_info_series('educated')
    occupation_series = fetch_info_series('occupation')
    positionheld_series = fetch_info_series('positionheld')
    citizenship_series = fetch_info_series('citizenship')
    party_series = fetch_info_series('party')
    memberof_series = fetch_info_series('memberof')
finally:
    # Release the Ray workers even when a batch fails or is interrupted.
    ray.shutdown()

[2m[36m(process_query pid=5633)[0m name: Q1538373
[2m[36m(process_query pid=5633)[0m error message: HTTP Error 429: Too Many Requests
[2m[36m(process_query pid=5633)[0m name: Q45785
[2m[36m(process_query pid=5633)[0m error message: HTTP Error 429: Too Many Requests


In [9]:
# Keep only the label (index 1) of the first gender hit; persons without hits get None.
new_unified_person_df['gender'] = [hits[0][1] if hits else None for hits in gender_series]
# Note: this writes back to the same parquet file the person table was loaded from.
new_unified_person_df.to_parquet(tables_path+'unified_person_df_final.parquet')

In [10]:
# Output table name -> series of raw query results written under that name.
# (gender_series and memberof_series are not part of this map.)
name_series_map = dict(
    religion=religion_series,
    school=educated_series,
    occupation=occupation_series,
    role=positionheld_series,
    citizenship=citizenship_series,
    political_party=party_series,
)

In [11]:
# create dataframes of extra information

def _build_info_df(name_sets, info_lists, with_years):
    """Turn per-person result lists into a long table with one row per result.

    Parameters
    ----------
    name_sets : iterable
        One 'name_set' value per person.
    info_lists : iterable
        For each person, in the same order, either None (no results) or a list
        of `[tag, name, (start), (end)]` lists.
    with_years : bool
        Whether to add the 'start_year' / 'end_year' columns (index 2 / 3 of
        each result, None when absent).

    Returns
    -------
    pandas.DataFrame
        Columns 'name_set', 'info_name', 'info_tag' (plus the two year columns
        when requested). Persons without results contribute no rows.
    """
    columns = ['name_set', 'info_name', 'info_tag']
    if with_years:
        columns = columns + ['start_year', 'end_year']

    # Collect plain records and build the frame once; growing a DataFrame with
    # pd.concat for every row re-copies the whole frame each time.
    records = []
    for name_set, info_list in zip(name_sets, info_lists):
        # None means no results; a float can only be a NaN placeholder.
        if info_list is None or isinstance(info_list, float):
            continue
        for info in info_list:
            record = {'name_set': name_set, 'info_name': info[1], 'info_tag': info[0]}
            if with_years:
                record['start_year'] = info[2] if len(info) > 2 else None
                record['end_year'] = info[3] if len(info) > 3 else None
            records.append(record)

    return pd.DataFrame(records, columns=columns)


# (table name, whether the series carries start-end year information)
for series_name, with_years in [('religion', False),
                                ('school', False),
                                ('occupation', False),
                                ('role', True),
                                ('citizenship', True),
                                ('political_party', True)]:

    # Persons and results are paired by position: each series was built by
    # iterating over the person table in order.
    info_df = _build_info_df(new_unified_person_df['name_set'],
                             name_series_map[series_name],
                             with_years)

    info_df = info_df.dropna(thresh=2) # exclude persons with no info
    info_df.to_parquet(tables_path+'person_'+series_name+'.parquet')

### adding country information to political party and school (comes with above)

In [None]:
def get_country_tag(Q):
    """Query the shared SPARQL client for property P17 of entity `Q`.

    Returns the converted response. If anything raises, the entity and the
    error are printed and a response-shaped dict with no bindings is returned.
    """
    try:
        query_head = """
        SELECT ?country ?countryLabel
        WHERE 
        {
        wd:"""
        query_tail = """ wdt:P17 ?country.
        SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }
        }"""

        sparqlwd.setQuery(query_head + Q + query_tail)
        response = sparqlwd.query()
        return response.convert()
    except Exception as err:
        print(f'name: {Q}')
        print(f'error message: {err}')
        return {'head': {'vars': ['item']}, 'results': {'bindings': []}}


def process_entities(entity):
    """Return the country label for `entity`, or '' when the lookup has no results.

    Only the last '/'-separated segment of `entity` is sent to `get_country_tag`.
    """
    qid = entity.rsplit('/', 1)[-1]
    bindings = get_country_tag(qid)['results']['bindings']

    if not bindings:
        return ''

    # Only the first binding is used; additional countries are ignored on purpose.
    return bindings[0]['countryLabel']['value']

In [None]:
def add_country_info(df):
    """Return a Series giving, for each row of `df`, the country of its 'info_tag'.

    Each distinct tag is resolved once through `process_entities`; the result is
    then looked up per row.
    """
    tags = df['info_tag']
    country_by_tag = {tag: process_entities(tag) for tag in pd.unique(tags)}
    return tags.apply(lambda tag: country_by_tag[tag])

In [None]:
# Add a 'country' column to the political-party table and write it back in place.
person_party_df = pd.read_parquet(tables_path+'person_political_party.parquet').assign(country=add_country_info)
person_party_df.to_parquet(tables_path+'person_political_party.parquet')

In [None]:
# Add a 'country' column to the school table and write it back in place.
person_school_df = pd.read_parquet(tables_path+'person_school.parquet').assign(country=add_country_info)
person_school_df.to_parquet(tables_path+'person_school.parquet')