In [1]:
import numpy as np
import pandas as pd

# Folder holding the input files, relative to the notebook's working directory.
DATA_PATH = 'Data/'
# bz2-compressed, line-delimited JSON file with the 2020 quotes.
FILE2020 = DATA_PATH + 'quotes-2020.json.bz2'
# Only the first 100 records are read (nrows=100), so the cells below work on a small sample.
Quotes2020 = pd.read_json(FILE2020, lines=True, compression='bz2', nrows=100)
# Speaker attribute table read from parquet; previewed a few cells below.
Wikidata = pd.read_parquet(DATA_PATH + 'speaker_attributes.parquet')

In [134]:
# Preview the first rows of the loaded quote sample.
Quotes2020.head(20)

Unnamed: 0,quoteID,quotation,speaker,qids,date,numOccurrences,probas,urls,phase
0,2020-01-28-000082,[ D ] espite the efforts of the partners to cr...,,[],2020-01-28 08:04:05,1,"[[None, 0.7272], [Prime Minister Netanyahu, 0....",[http://israelnationalnews.com/News/News.aspx/...,E
1,2020-01-16-000088,[ Department of Homeland Security ] was livid ...,Sue Myrick,[Q367796],2020-01-16 12:00:13,1,"[[Sue Myrick, 0.8867], [None, 0.0992], [Ron Wy...",[http://thehill.com/opinion/international/4782...,E
2,2020-02-10-000142,... He (Madhav) also disclosed that the illega...,,[],2020-02-10 23:45:54,1,"[[None, 0.8926], [Prakash Rai, 0.1074]]",[https://indianexpress.com/article/business/ec...,E
3,2020-02-15-000053,"... [ I ] f it gets to the floor,",,[],2020-02-15 14:12:51,2,"[[None, 0.581], [Andy Harris, 0.4191]]",[https://patriotpost.us/opinion/68622-trump-bu...,E
4,2020-01-24-000168,[ I met them ] when they just turned 4 and 7. ...,Meghan King Edmonds,[Q20684375],2020-01-24 20:37:09,4,"[[Meghan King Edmonds, 0.5446], [None, 0.2705]...",[https://people.com/parents/meghan-king-edmond...,E
5,2020-02-27-000223,[ one's ] individual's physical characteristic...,,[],2020-02-27 08:27:00,1,"[[None, 0.7164], [Prince Charles, 0.2836]]",[https://ukhumanrightsblog.com/2020/02/27/the-...,E
6,2020-04-15-000176,[ Queen ] can. He can cover.,,[],2020-04-15 17:30:45,1,"[[None, 0.8956], [Eric DeCosta, 0.0665], [Kenn...",[https://www.pennlive.com/baltimore-ravens/202...,E
7,2020-01-17-000357,[ The delay ] will have an impact [ on Slough ...,Dexter Smith,[Q5268447],2020-01-17 13:03:00,1,"[[Dexter Smith, 0.924], [None, 0.076]]",[http://www.sloughexpress.co.uk/gallery/slough...,E
8,2020-04-02-000239,[ The scheme ] treats addiction as an illness ...,Barry Coppinger,[Q4864119],2020-04-02 14:18:20,1,"[[Barry Coppinger, 0.9017], [None, 0.0983]]",[http://www.theweek.co.uk/106479/why-police-ar...,E
9,2020-03-19-000276,[ These ] actions will allow households who ha...,Ben Carson,[Q816459],2020-03-19 19:14:00,1,"[[Ben Carson, 0.9227], [None, 0.0773]]",[https://mortgageorb.com/hud-fha-suspend-forec...,E


In [129]:
# Preview the first rows of the speaker attribute table.
Wikidata.head()

Unnamed: 0,aliases,date_of_birth,nationality,gender,lastrevid,ethnic_group,US_congress_bio_ID,occupation,party,academic_degree,id,label,candidacy,type,religion
0,"[Washington, President Washington, G. Washingt...",[+1732-02-22T00:00:00Z],"[Q161885, Q30]",[Q6581097],1395141751,,W000178,"[Q82955, Q189290, Q131512, Q1734662, Q294126, ...",[Q327591],,Q23,George Washington,"[Q698073, Q697949]",item,[Q682443]
1,"[Douglas Noel Adams, Douglas Noël Adams, Dougl...",[+1952-03-11T00:00:00Z],[Q145],[Q6581097],1395737157,[Q7994501],,"[Q214917, Q28389, Q6625963, Q4853732, Q1884422...",,,Q42,Douglas Adams,,item,
2,"[Paul Marie Ghislain Otlet, Paul Marie Otlet]",[+1868-08-23T00:00:00Z],[Q31],[Q6581097],1380367296,,,"[Q36180, Q40348, Q182436, Q1265807, Q205375, Q...",,,Q1868,Paul Otlet,,item,
3,"[George Walker Bush, Bush Jr., Dubya, GWB, Bus...",[+1946-07-06T00:00:00Z],[Q30],[Q6581097],1395142029,,,"[Q82955, Q15982858, Q18814623, Q1028181, Q1408...",[Q29468],,Q207,George W. Bush,"[Q327959, Q464075, Q3586276, Q4450587]",item,"[Q329646, Q682443, Q33203]"
4,"[Velázquez, Diego Rodríguez de Silva y Velázqu...",[+1599-06-06T00:00:00Z],[Q29],[Q6581097],1391704596,,,[Q1028181],,,Q297,Diego Velázquez,,item,


## Functions and their utility
#### FromQidsToUsefulQids:
- The qids extracted from the Quote database are stored as an array of sub-arrays, where each sub-array belongs to one quote. This function stacks the sub-arrays on top of each other, producing a single Series of qids (column `0` of the stacked frame), which is what we work with afterwards.
- By specifying 'ALL' as the condition, we make sure that all the qids of each sub-array are kept; otherwise only the first one is used and the others are dropped. A missing qid is marked as 'NaN' in the new Series.

#### FromQidsToName:
- From the array of useful qids, we want the names. This function uses "https://www.wikidata.org/wiki/Special:EntityData/"+str(qid)+".json" to fetch the JSON file (and hence the name) of each qid passed in.
- Missing qids are simply marked as 'NaN' in the output list.

#### FromDataQidsToDataName:
- Using the two functions described above, it transforms a DataFrame with the columns 'gender', 'nationality' and 'date of birth' from qids into human-readable names (the date of birth is copied over as is).
- Columns of interest can easily be included or excluded.


In [130]:
def FromQidsToUsefulQids(qids_array, cond):
    """Flatten a Series of qid arrays into one Series of qids.

    Every cell of ``qids_array`` is treated as one row: ``None`` is an empty
    row, a list-like cell contributes its elements, and any other scalar
    (for example a lone string or NaN) is a one-element row. Rows are padded
    with NaN up to the length of the longest row, so a row without any qid
    always yields at least one NaN instead of disappearing.

    Parameters
    ----------
    qids_array : pandas.Series (or any iterable of cells)
        One cell per record, each cell holding zero or more qids.
    cond : str
        ``'ALL'`` keeps every (padded) slot of every row; any other value
        keeps only the first slot of each row.

    Returns
    -------
    pandas.Series
        Object-dtype Series named ``0``. With ``'ALL'`` the index is
        ``0..n*width-1``; otherwise it holds one entry per input row, labelled
        ``0, width, 2*width, ...`` (the row's offset in the flattened layout).
    """
    rows = []
    for cell in qids_array:
        if cell is None:
            rows.append([])
        elif pd.api.types.is_list_like(cell):
            rows.append(list(cell))
        else:
            # Scalars (a single qid string, NaN, ...) count as one element.
            rows.append([cell])

    # At least one slot per row, so rows without qids surface as NaN even
    # when *every* row is empty.
    width = max([len(row) for row in rows] + [1])

    # Row-major flattening with NaN padding. Built by hand rather than with
    # DataFrame.stack(dropna=False), whose `dropna` argument is deprecated and
    # rejected by the new stack implementation.
    flattened = [
        row[slot] if slot < len(row) else np.nan
        for row in rows
        for slot in range(width)
    ]
    qids_final = pd.Series(flattened, dtype=object, name=0)

    if cond != 'ALL':
        # The first slot of each row sits at every `width`-th position.
        qids_final = qids_final.iloc[::width]
    return qids_final
    

In [126]:
def FromQidsToName(qids_array):
    """Translate Wikidata qids into their English labels.

    Each string qid is looked up at
    ``https://www.wikidata.org/wiki/Special:EntityData/<qid>.json`` and
    replaced by the entity's English label. Each distinct qid is fetched only
    once per call, however often it occurs in ``qids_array``.

    Parameters
    ----------
    qids_array : iterable
        Qids to translate. Entries that are not strings (NaN, None, ...) are
        treated as missing.

    Returns
    -------
    list
        A one-element list wrapping the list of names, i.e. ``[names]``, with
        ``names`` in the same order as ``qids_array``. Missing qids and
        entities without an English label are reported as the string
        ``'NaN'``.

    Raises
    ------
    urllib.error.URLError
        Network and HTTP failures are not swallowed.
    """
    # Kept local (as in the original cell) so the function is self-contained.
    import json
    from urllib.request import urlopen

    label_by_qid = {}  # per-call cache: the same qid typically repeats a lot
    name_array = []
    for qid in qids_array:
        if not isinstance(qid, str):
            name_array.append('NaN')
            continue

        if qid not in label_by_qid:
            url = "https://www.wikidata.org/wiki/Special:EntityData/" + qid + ".json"
            # Context manager closes the HTTP response even if parsing fails.
            with urlopen(url) as response:
                entities = json.loads(response.read()).get('entities', {})

            # The payload may be keyed by a different id than the one asked
            # for (e.g. when the requested qid redirects to another entity),
            # so fall back to whatever entity was returned.
            entity = entities.get(qid) or next(iter(entities.values()), {})
            label_by_qid[qid] = entity.get('labels', {}).get('en', {}).get('value', 'NaN')

        name_array.append(label_by_qid[qid])

    return [name_array]

In [114]:
def FromDataQidsToDataName(data):
    """Build a readable copy of the speaker attributes in ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Must provide the columns ``id``, ``label``, ``gender``,
        ``nationality`` and ``date_of_birth``. ``gender`` and ``nationality``
        hold arrays of qids; only the first qid of each cell is used.

    Returns
    -------
    pandas.DataFrame
        Columns ``id``, ``label``, ``gender`` (``'male'``, ``'female'`` or
        ``'undefined'``), ``nationality`` (name returned by
        ``FromQidsToName``) and ``date of birth`` (copied unchanged from
        ``date_of_birth``).
    """
    gender_by_qid = {'Q6581097': 'male', 'Q6581072': 'female'}

    # initialization of new data
    data1 = pd.DataFrame()
    data1['id'] = data['id']
    data1['label'] = data['label']

    # transform gender: any qid other than the two above (including a missing
    # one) becomes the string 'undefined'. The original appended the bare name
    # `undefined`, which raised NameError on the first such row.
    gender_qids = FromQidsToUsefulQids(data['gender'], 'only first')
    data1['gender'] = [gender_by_qid.get(qid, 'undefined') for qid in gender_qids]

    # transform nationalities: FromQidsToName returns [names], so the inner
    # list is taken directly instead of being re-stacked through pandas.
    nationality_qids = FromQidsToUsefulQids(data['nationality'], 'only first')
    data1['nationality'] = FromQidsToName(nationality_qids)[0]

    # transform date of birth (copied as is)
    data1['date of birth'] = data['date_of_birth']

    return data1

In [127]:
# Convert a label-based slice of speaker rows into readable gender / nationality names.
# NOTE(review): `Wikidata_interest` is not defined in any cell visible here — confirm it is
# created earlier, otherwise this cell fails on a fresh kernel (Restart & Run All).
# FromQidsToName downloads one JSON document per nationality qid, so this cell needs network access.
Wikidata_name = FromDataQidsToDataName(Wikidata_interest.loc[1218846:1478578])
Wikidata_name

<class 'str'>
<class 'str'>
<class 'float'>
<class 'str'>
<class 'float'>


Unnamed: 0,id,label,gender,nationality,date of birth
1218846,Q3018737,David Rubenstein,male,United States of America,[+1949-08-11T00:00:00Z]
1326960,Q7199798,Piyush Goyal,male,India,[+1964-06-13T00:00:00Z]
1330885,Q7356416,Rod Young,male,,
1434677,Q16225529,Harmeet Singh,male,India,[+1987-10-09T00:00:00Z]
1478578,Q19664192,Bettina Korek,female,,[+1978-00-00T00:00:00Z]
