In [10]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from qwikidata.linked_data_interface import get_entity_dict_from_api
from qwikidata.entity import WikidataItem
import itertools
from collections import Counter
from tqdm.notebook import tqdm

from datetime import datetime


# Use plotly as the backend for pandas' `.plot` accessors in this notebook.
pd.options.plotting.backend = "plotly"

In [7]:
# Read the bz2-compressed speakers JSON file into a DataFrame.
df = pd.read_json(path_or_buf='data\\speakers.json.bz2', compression='bz2')

In [12]:
# Columns that are discarded before building the speaker feature table.
unused_columns = [
    'aliases', 'label', 'US_congress_bio_ID',
    'lastrevid', 'type',
    'candidacy', 'academic_degree',
    'religion', 'n_unique_quotes',
    'ethnic_group', 'party',
]

# Drop those columns, then drop every row that has a missing value in any
# of the remaining columns.
speakers_dropped = df.drop(columns=unused_columns).dropna(axis=0)
print(speakers_dropped.shape)
speakers_dropped.head(2)

(442811, 5)


Unnamed: 0,date_of_birth,nationality,gender,occupation,n_quotes
Q270316,[+1947-01-28T00:00:00Z],[Q30],[Q6581072],[Q82955],21060
Q1253,[+1944-06-13T00:00:00Z],[Q884],[Q6581097],"[Q82955, Q193391]",94704


In [17]:
# Inspect the Python type of the first `date_of_birth` entry of the row at
# position 12 (used here simply as a sample row).
type(speakers_dropped['date_of_birth'].iloc[12][0])

str

In [18]:
# Derive an approximate age for every speaker from the first `date_of_birth`
# entry, a string such as '+1947-01-28T00:00:00Z'.
# Only the year is used, so the result is "current year minus birth year"
# and ignores month and day.
# NOTE: the value depends on the year in which the notebook is executed.

# Read the reference year once so every row is measured against the same value.
current_year = datetime.now().year

ages = [
    # Characters 1..4 are the four-digit year; character 0 is the sign.
    # Missing or empty birth lists yield None instead of raising.
    current_year - int(birth[0][1:5]) if birth is not None and len(birth) > 0 else None
    for birth in speakers_dropped.date_of_birth.values
]

speakers_dropped['age'] = ages
speakers_dropped.head(2)

Unnamed: 0,date_of_birth,nationality,gender,occupation,n_quotes,age
Q270316,[+1947-01-28T00:00:00Z],[Q30],[Q6581072],[Q82955],21060,74
Q1253,[+1944-06-13T00:00:00Z],[Q884],[Q6581097],"[Q82955, Q193391]",94704,77


In [19]:
# Build the feature table without the `date_of_birth` column.
speakers_features = speakers_dropped.drop(columns='date_of_birth')
speakers_features.head(2)

Unnamed: 0,nationality,gender,occupation,n_quotes,age
Q270316,[Q30],[Q6581072],[Q82955],21060,74
Q1253,[Q884],[Q6581097],"[Q82955, Q193391]",94704,77


In [22]:
# Build a flat feature table: `n_quotes` and `age` are copied unchanged,
# every other column is reduced to the first element of its list value.
scalar_columns = ['n_quotes', 'age']

speakers_features_full = pd.DataFrame()
speakers_features_full['n_quotes'] = speakers_features['n_quotes']
speakers_features_full['age'] = speakers_features['age']

# `DataFrame.items()` is used instead of `iteritems()`, which was deprecated
# in pandas 1.5 and removed in pandas 2.0.
for name, values in speakers_features.items():
    if name in scalar_columns:
        continue
    # Keep only the first entry of each list. Missing or empty lists become
    # None so that the dropna below removes those rows instead of the cell
    # failing with an IndexError.
    speakers_features_full[name] = [
        val[0] if val is not None and len(val) > 0 else None
        for val in values
    ]

# Remove every row in which any column value is missing.
speakers_features_preprocessed = speakers_features_full.dropna(axis=0)
print(speakers_features_preprocessed.shape)
speakers_features_preprocessed.head(2)

(442811, 5)


Unnamed: 0,n_quotes,age,nationality,gender,occupation
Q270316,21060,74,Q30,Q6581072,Q82955
Q1253,94704,77,Q884,Q6581097,Q82955


In [23]:
# Persist the flattened speaker features. No `orient`/`lines` arguments are
# passed, so pandas' default JSON layout is used.
# NOTE(review): the `.jsonl` extension suggests JSON Lines, but `lines=True`
# is not set -- confirm the intended on-disk format.
speakers_features_preprocessed.to_json(path_or_buf='data\\clean_speakers.jsonl')

In [24]:
# Keep speakers whose age lies strictly between 0 and 150 and who have
# more than 5 quotes.
age_is_plausible = (
    (speakers_features_preprocessed['age'] > 0)
    & (speakers_features_preprocessed['age'] < 150)
)
has_enough_quotes = speakers_features_preprocessed['n_quotes'] > 5

speakers_features_final = speakers_features_preprocessed[age_is_plausible & has_enough_quotes]
print(speakers_features_final.shape)
speakers_features_final.head(2)

(326437, 5)


Unnamed: 0,n_quotes,age,nationality,gender,occupation
Q270316,21060,74,Q30,Q6581072,Q82955
Q1253,94704,77,Q884,Q6581097,Q82955


In [25]:
# Persist the filtered speaker features. No `orient`/`lines` arguments are
# passed, so pandas' default JSON layout is used.
# NOTE(review): the `.jsonl` extension suggests JSON Lines, but `lines=True`
# is not set -- confirm the intended on-disk format.
speakers_features_final.to_json(path_or_buf='data\\clean_speakers_5.jsonl')

In [None]:
# Load the filtered speaker features back from disk.
# Note: this rebinds `df`, which earlier in the notebook held the raw speakers table.
df = pd.read_json(path_or_buf='data\\clean_speakers_5.jsonl')
print(df.shape)

In [None]:
# Spot check: fetch one QID with qwikidata's `get_entity_dict_from_api` and
# read its English label from the returned dict.
qid = 'Q6581097'
get_entity_dict_from_api(qid)['labels']['en']['value']

'male'

In [None]:
# Tally how many rows carry each occupation QID.
ctr = Counter(df['occupation'])

# One row per QID with its count, most frequent first, limited to the top 100.
relevant_occupations = (
    pd.DataFrame.from_dict(ctr, orient='index')
    .reset_index()
    .rename(columns={'index': 'qid', 0: 'count'})
    .sort_values('count', ascending=False)
    .iloc[0:100]
)

# Look up the English label of each QID (one `get_entity_dict_from_api`
# call per row).
relevant_occupations['meaning'] = [
    get_entity_dict_from_api(occupation_qid)['labels']['en']['value']
    for occupation_qid in relevant_occupations['qid']
]
relevant_occupations.head(20)

Unnamed: 0,qid,count,meaning
0,Q82955,33781,politician
6,Q937857,29266,association football player
42,Q33999,21880,actor
7,Q19204627,10649,American football player
2,Q1930187,9077,journalist
19,Q177220,8616,singer
63,Q36180,8466,writer
11,Q3665646,7288,basketball player
26,Q10871364,6683,baseball player
14,Q11774891,6413,ice hockey player


In [None]:
tqdm.pandas()

# Occupation QIDs grouped into coarse domains. The comment above each set
# lists the occupations the QIDs stand for.
# association football player, American football player, basketball player, ice hockey player, cricketer, baseball player, rugby union player, athletics competitor, sport cyclist
sports = {'Q937857', 'Q19204627', 'Q3665646', 'Q11774891', 'Q12299841', 'Q10871364', 'Q14089670', 'Q11513337', 'Q2309784'}
# actor, singer, film director, musician, singer-songwriter, screenwriter, composer, painter
culture = {'Q33999', 'Q177220', 'Q2526255', 'Q639669', 'Q488205', 'Q28389', 'Q36834', 'Q1028181'}
# journalist, writer, economist, lawyer
academy = {'Q1930187', 'Q36180', 'Q188094', 'Q40348'}

# 1/0 indicator columns, computed with vectorised comparisons instead of a
# Python lambda per row. The dtype is pinned to int64 so the result does not
# depend on the platform's default integer width.
df['politics'] = df['occupation'].eq('Q82955').astype('int64')
df['sport'] = df['occupation'].isin(sports).astype('int64')
df['culture'] = df['occupation'].isin(culture).astype('int64')
df['academic'] = df['occupation'].isin(academy).astype('int64')

# Translate the two known gender QIDs to labels. Any other value becomes the
# integer 0, exactly as before, so previously written files stay comparable.
gender_labels = {'Q6581097': 'male', 'Q6581072': 'female'}
df['gender'] = df['gender'].map(lambda x: gender_labels.get(x, 0))

# The raw occupation QID is no longer needed once the indicator columns exist.
# NOTE: this cell is not re-runnable on the same `df`, because `occupation`
# is gone after the first execution.
df = df.drop(columns=['occupation'])

In [None]:
# Tally how many rows carry each nationality QID.
ctr = Counter(df['nationality'])

# One row per QID with its count, most frequent first, limited to the top 20.
relevant_nationality = (
    pd.DataFrame.from_dict(ctr, orient='index')
    .reset_index()
    .rename(columns={'index': 'qid', 0: 'count'})
    .sort_values('count', ascending=False)
    .iloc[0:20]
)

# Look up the English label of each QID (one `get_entity_dict_from_api`
# call per row).
relevant_nationality['meaning'] = [
    get_entity_dict_from_api(nationality_qid)['labels']['en']['value']
    for nationality_qid in relevant_nationality['qid']
]
relevant_nationality.head(5)

Unnamed: 0,qid,count,meaning
0,Q30,117674,United States of America
3,Q145,40031,United Kingdom
9,Q16,18202,Canada
2,Q408,14889,Australia
5,Q183,11361,Germany


In [None]:
# Replace nationality QIDs by their English label when the QID is listed in
# `relevant_nationality`; every other value becomes 'Other'.
# The QID -> label table is built once, instead of filtering the
# `relevant_nationality` frame again for every row of `df`.
# NOTE: running this cell a second time on the same `df` maps every row to
# 'Other', because the column then holds labels rather than QIDs.
qid_to_label = dict(zip(relevant_nationality['qid'], relevant_nationality['meaning']))
df['nationality'] = df['nationality'].map(lambda s: qid_to_label.get(s, 'Other'))

In [None]:
df.head()

Unnamed: 0,n_quotes,age,nationality,gender,politics,sport,culture,academic
Q270316,21060,74,United States of America,female,1,0,0,0
Q1253,94704,77,Other,male,1,0,0,0
Q19874690,1207,62,Australia,male,0,0,0,0
Q5271548,1587,83,United States of America,female,0,0,0,1
Q2287947,132971,28,United States of America,male,0,0,0,0


In [None]:
df.to_json('data\\nice_df_speakers.jsonl')