In [3]:
import pandas as pd
import numpy as np
import bz2
import json

In [4]:
# read the large compressed file in 1 go (may not work)
path = 'D:/ADA_quotebank/quotes-2020_preprocessed.json.bz2' #local location
df_quotes = pd.read_json(path, lines=True, compression='bz2')
df_quotes = df_quotes.set_index('qid') #Change and rename index for future merging
df_quotes.index.rename('id', inplace=True)
df_quotes.head(5)

Unnamed: 0_level_0,quoteID,quotation,speaker,prob
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Q367796,2020-01-16-000088,[ Department of Homeland Security ] was livid ...,Sue Myrick,0.8867
Q20684375,2020-01-24-000168,[ I met them ] when they just turned 4 and 7. ...,Meghan King Edmonds,0.5446
Q5268447,2020-01-17-000357,[ The delay ] will have an impact [ on Slough ...,Dexter Smith,0.924
Q4864119,2020-04-02-000239,[ The scheme ] treats addiction as an illness ...,Barry Coppinger,0.9017
Q816459,2020-03-19-000276,[ These ] actions will allow households who ha...,Ben Carson,0.9227


In [3]:
#Load speaker attributes parquet file
df_attrib = pd.read_parquet("speaker_attributes.parquet\speaker_attributes.parquet")
df_attrib.drop(columns=['aliases','US_congress_bio_ID','type','lastrevid','candidacy','label'],inplace=True)
#df_attrib = df_attrib.set_index('id')

#Load QID correspondance file and remove description column
df_labels = pd.read_csv('wikidata_labels_descriptions_quotebank.csv.bz2', compression='bz2', index_col = 'QID')
df_labels = df_labels.drop(['Description'], axis=1)

In [None]:
#Merge quotes file and speaker attributes file based on the unique qid of the speaker
df = df_quotes.merge(df_attrib, left_index=True, right_index=True)
df.head(5)

In [4]:
def get_qid_label(df, labels):
    '''
    Replaces Wikidata Qids with the corresponding text
    :param df: dataframe to modify
    :param dflabels: 
    :return: the updated dataframe with text instead of qids
    '''
    for col in df.columns:
        try:
            df.loc[:, 'label'] = df.loc[:, col].apply(lambda x: [labels.loc[cat].item() for cat in x] if x is not None else x)     
        except:
            continue
    return df

In [6]:
df_out = get_qid_label(df_attrib,df_labels)

In [136]:
df_out.head(10)

Unnamed: 0_level_0,quoteID,quotation,speaker,prob,date_of_birth,nationality,gender,ethnic_group,occupation,party,academic_degree,religion,label
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
Q1000053,2020-01-30-115748,We stress that any attempts to fuel the confli...,Vasily Nebenzya,0.9429,[+1962-02-26T00:00:00Z],"[Q159, Q15180]",[Q6581097],,"[Q193391, Q82955]",,,,
Q1000053,2020-01-30-110614,We consistently send signals to abandon aggres...,Vasily Nebenzya,0.5209,[+1962-02-26T00:00:00Z],"[Q159, Q15180]",[Q6581097],,"[Q193391, Q82955]",,,,
Q1000053,2020-02-26-086429,We understand the concerns of a number of non-...,Vasily Nebenzya,0.9361,[+1962-02-26T00:00:00Z],"[Q159, Q15180]",[Q6581097],,"[Q193391, Q82955]",,,,
Q1000053,2020-02-25-076403,"We cannot agree to that kind of approach,",Vasily Nebenzya,0.968,[+1962-02-26T00:00:00Z],"[Q159, Q15180]",[Q6581097],,"[Q193391, Q82955]",,,,
Q1000053,2020-01-09-098083,violation and noncompliance of international law.,Vasily Nebenzya,0.9364,[+1962-02-26T00:00:00Z],"[Q159, Q15180]",[Q6581097],,"[Q193391, Q82955]",,,,
Q1000053,2020-01-28-105408,"We were not consulted, we don't know what this...",Vasily Nebenzya,0.9512,[+1962-02-26T00:00:00Z],"[Q159, Q15180]",[Q6581097],,"[Q193391, Q82955]",,,,
Q1000053,2020-02-11-012952,But what if one side-Palestine-considers the p...,Vasily Nebenzya,0.8329,[+1962-02-26T00:00:00Z],"[Q159, Q15180]",[Q6581097],,"[Q193391, Q82955]",,,,
Q1000053,2020-02-28-048691,not to bury Astana process,Vasily Nebenzya,0.5201,[+1962-02-26T00:00:00Z],"[Q159, Q15180]",[Q6581097],,"[Q193391, Q82955]",,,,
Q1000053,2020-01-30-075841,Russia has no hidden agenda in the Libyan conf...,Vasily Nebenzya,0.8704,[+1962-02-26T00:00:00Z],"[Q159, Q15180]",[Q6581097],,"[Q193391, Q82955]",,,,
Q1000053,2020-02-28-048692,not to bury the Astana process.,Vasily Nebenzya,0.9692,[+1962-02-26T00:00:00Z],"[Q159, Q15180]",[Q6581097],,"[Q193391, Q82955]",,,,


In [84]:
df_attrib = df_attrib.set_index('id')
df_attrib.head(10)

Unnamed: 0_level_0,date_of_birth,nationality,gender,ethnic_group,occupation,party,academic_degree,religion
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Q23,[+1732-02-22T00:00:00Z],"[Q161885, Q30]",[Q6581097],,"[Q82955, Q189290, Q131512, Q1734662, Q294126, ...",[Q327591],,[Q682443]
Q42,[+1952-03-11T00:00:00Z],[Q145],[Q6581097],[Q7994501],"[Q214917, Q28389, Q6625963, Q4853732, Q1884422...",,,
Q1868,[+1868-08-23T00:00:00Z],[Q31],[Q6581097],,"[Q36180, Q40348, Q182436, Q1265807, Q205375, Q...",,,
Q207,[+1946-07-06T00:00:00Z],[Q30],[Q6581097],,"[Q82955, Q15982858, Q18814623, Q1028181, Q1408...",[Q29468],,"[Q329646, Q682443, Q33203]"
Q297,[+1599-06-06T00:00:00Z],[Q29],[Q6581097],,[Q1028181],,,
Q368,[+1915-11-25T00:00:00Z],[Q298],[Q6581097],,"[Q189290, Q82955]",[Q327591],,[Q1841]
Q501,[+1821-04-09T00:00:00Z],[Q142],[Q6581097],[Q121842],"[Q49757, Q4164507, Q11774202, Q333634, Q36180,...",,,[Q1841]
Q619,[+1473-02-19T00:00:00Z],[Q1649871],[Q6581097],[Q1026],"[Q11063, Q185351, Q188094, Q170790, Q16012028,...",,,[Q1841]
Q633,[+1945-11-12T00:00:00Z],"[Q16, Q30]",[Q6581097],,"[Q177220, Q488205, Q2526255, Q639669, Q1881462...",,,
Q640,[+1969-00-00T00:00:00Z],[Q183],[Q6581097],,"[Q33231, Q41546637]",,,


In [97]:
genders = df.gender.value_counts().rename_axis('qid').reset_index(name='counts')
genders.head(5)

Unnamed: 0,qid,counts
0,[Q6581097],1899635
1,[Q6581072],563199
2,[Q1052281],2721
3,[Q48270],1448
4,[Q18116794],895


In [None]:
genders_exp = genders["qid"].apply(pd.Series)

In [120]:
genders = get_qid_label(genders,df_labels)
genders[]

Unnamed: 0,qid,counts,label
0,[Q6581097],1899635,[male]
1,[Q6581072],563199,[female]
2,[Q1052281],2721,[transgender female]
3,[Q48270],1448,[non-binary]
4,[Q18116794],895,[genderfluid]
5,[Q2449503],392,[transgender male]
6,"[Q6581072, Q48270]",175,"[female, non-binary]"
7,[Q1984232],112,[shemale]
8,"[Q48270, Q6581072]",111,"[non-binary, female]"
9,[Q12964198],106,[genderqueer]


In [122]:
genders_exp = genders["label"].apply(pd.Series)
genders_exp

Unnamed: 0,0,1,2
0,male,,
1,female,,
2,transgender female,,
3,non-binary,,
4,genderfluid,,
5,transgender male,,
6,female,non-binary,
7,shemale,,
8,non-binary,female,
9,genderqueer,,


In [119]:
for idx, row in genders_exp.iterrows():
    if genders_exp[1] != np.Nan or genders_exp[2] != np.Nan:
        genders['label'][idx] = 'other'
    elif genders_exp

0                    NaN
1                    NaN
2                    NaN
3                    NaN
4                    NaN
5                    NaN
6             non-binary
7                    NaN
8                 female
9                    NaN
10                   NaN
11                   NaN
12              intersex
13    transgender person
14              bigender
15                  male
16                female
17            non-binary
18                female
19                  male
20                female
21                  māhū
22           genderfluid
23              intersex
24                  male
25                   NaN
26            non-binary
27      transgender male
28            non-binary
29                  male
30      cisgender female
31                  male
32                   NaN
33                female
34                female
35                   NaN
36    transgender female
37    transgender female
38                  male
39              intersex


In [9]:
def gender_labels(df, labels):
    '''
    Replaces Wikidata Qids with the corresponding text
    :param df: dataframe to modify
    :param dflabels: 
    :return: the updated dataframe with text instead of qids
    '''
    for col in df.columns:
        try:
            df.loc[:, 'label'] = df.loc[:, col].apply(lambda x: labels.loc[cat].item() for cat in x if x is not None else x)     
        except:
            continue
    return df

In [134]:
#genders.loc[genders['label'] == ['male']]
genders['qid'][2].shape[0]
#genders[['qid']].to_numpy()

1

In [133]:
np.shape(genders['qid'].head(10)[6])[0]

2

In [7]:
dfmid = pd.DataFrame({'QID': ['Q15991263'], 'Label': ['auctioneer']})
dfmid = dfmid.set_index('QID')
pd.concat([dflabels, dfmid])

Unnamed: 0_level_0,Label
QID,Unnamed: 1_level_1
Q31,Belgium
Q45,Portugal
Q75,Internet
Q148,People's Republic of China
Q155,Brazil
...,...
Q106341153,biochemistry teacher
Q106368830,2018 Wigan Metropolitan Borough Council electi...
Q106369692,2018 Wigan Metropolitan Borough Council electi...
Q106376887,


In [57]:
dffinal

Unnamed: 0_level_0,quoteID,quotation,speaker,prob,aliases,date_of_birth,nationality,gender,lastrevid,ethnic_group,US_congress_bio_ID,occupation,party,academic_degree,label,candidacy,type,religion
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
Q1000053,2020-01-30-115748,We stress that any attempts to fuel the confli...,Vasily Nebenzya,0.9429,,[+1962-02-26T00:00:00Z],"[Russia, Soviet Union]",[male],1391394799,,,"[Q193391, Q82955]",,,Vasily Nebenzya,,item,
Q1000053,2020-01-30-110614,We consistently send signals to abandon aggres...,Vasily Nebenzya,0.5209,,[+1962-02-26T00:00:00Z],"[Russia, Soviet Union]",[male],1391394799,,,"[Q193391, Q82955]",,,Vasily Nebenzya,,item,
Q1000053,2020-02-26-086429,We understand the concerns of a number of non-...,Vasily Nebenzya,0.9361,,[+1962-02-26T00:00:00Z],"[Russia, Soviet Union]",[male],1391394799,,,"[Q193391, Q82955]",,,Vasily Nebenzya,,item,
Q1000053,2020-02-25-076403,"We cannot agree to that kind of approach,",Vasily Nebenzya,0.9680,,[+1962-02-26T00:00:00Z],"[Russia, Soviet Union]",[male],1391394799,,,"[Q193391, Q82955]",,,Vasily Nebenzya,,item,
Q1000053,2020-01-09-098083,violation and noncompliance of international law.,Vasily Nebenzya,0.9364,,[+1962-02-26T00:00:00Z],"[Russia, Soviet Union]",[male],1391394799,,,"[Q193391, Q82955]",,,Vasily Nebenzya,,item,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Q999711,2020-03-19-074326,This is his home racetrack and that also makes...,Steve Fitzsimmons,0.7526,,[+1976-09-07T00:00:00Z],[Australia],[male],1313714618,,,[Q937857],,,Steve Fitzsimmons,,item,
Q999711,2020-02-27-066900,The Hanover Bentinck and Brant Agricultural So...,Steve Fitzsimmons,0.5578,,[+1976-09-07T00:00:00Z],[Australia],[male],1313714618,,,[Q937857],,,Steve Fitzsimmons,,item,
Q999711,2020-03-05-074250,This is a very unique opportunity for me. My f...,Steve Fitzsimmons,0.7475,,[+1976-09-07T00:00:00Z],[Australia],[male],1313714618,,,[Q937857],,,Steve Fitzsimmons,,item,
Q999711,2020-02-27-077480,This is a very unique opportunity for me. My f...,Steve Fitzsimmons,0.6221,,[+1976-09-07T00:00:00Z],[Australia],[male],1313714618,,,[Q937857],,,Steve Fitzsimmons,,item,


In [32]:
for item in dffinal['gender']:
    if item is not None:
        item = item[0]
    else:
        item=item

In [37]:
dffinal

Unnamed: 0_level_0,quoteID,quotation,speaker,prob,aliases,date_of_birth,nationality,gender,lastrevid,ethnic_group,US_congress_bio_ID,occupation,party,academic_degree,label,candidacy,type,religion
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
Q1000053,2020-01-30-115748,We stress that any attempts to fuel the confli...,Vasily Nebenzya,0.9429,,[+1962-02-26T00:00:00Z],"[Russia, Soviet Union]",[male],1391394799,,,"[Q193391, Q82955]",,,Vasily Nebenzya,,item,
Q1000053,2020-01-30-110614,We consistently send signals to abandon aggres...,Vasily Nebenzya,0.5209,,[+1962-02-26T00:00:00Z],"[Russia, Soviet Union]",[male],1391394799,,,"[Q193391, Q82955]",,,Vasily Nebenzya,,item,
Q1000053,2020-02-26-086429,We understand the concerns of a number of non-...,Vasily Nebenzya,0.9361,,[+1962-02-26T00:00:00Z],"[Russia, Soviet Union]",[male],1391394799,,,"[Q193391, Q82955]",,,Vasily Nebenzya,,item,
Q1000053,2020-02-25-076403,"We cannot agree to that kind of approach,",Vasily Nebenzya,0.9680,,[+1962-02-26T00:00:00Z],"[Russia, Soviet Union]",[male],1391394799,,,"[Q193391, Q82955]",,,Vasily Nebenzya,,item,
Q1000053,2020-01-09-098083,violation and noncompliance of international law.,Vasily Nebenzya,0.9364,,[+1962-02-26T00:00:00Z],"[Russia, Soviet Union]",[male],1391394799,,,"[Q193391, Q82955]",,,Vasily Nebenzya,,item,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Q999711,2020-03-19-074326,This is his home racetrack and that also makes...,Steve Fitzsimmons,0.7526,,[+1976-09-07T00:00:00Z],[Australia],[male],1313714618,,,[Q937857],,,Steve Fitzsimmons,,item,
Q999711,2020-02-27-066900,The Hanover Bentinck and Brant Agricultural So...,Steve Fitzsimmons,0.5578,,[+1976-09-07T00:00:00Z],[Australia],[male],1313714618,,,[Q937857],,,Steve Fitzsimmons,,item,
Q999711,2020-03-05-074250,This is a very unique opportunity for me. My f...,Steve Fitzsimmons,0.7475,,[+1976-09-07T00:00:00Z],[Australia],[male],1313714618,,,[Q937857],,,Steve Fitzsimmons,,item,
Q999711,2020-02-27-077480,This is a very unique opportunity for me. My f...,Steve Fitzsimmons,0.6221,,[+1976-09-07T00:00:00Z],[Australia],[male],1313714618,,,[Q937857],,,Steve Fitzsimmons,,item,


In [10]:
dffinal.gender.value_counts()

TypeError: unhashable type: 'list'

Exception ignored in: 'pandas._libs.index.IndexEngine._call_map_locations'
Traceback (most recent call last):
  File "pandas\_libs\hashtable_class_helper.pxi", line 5231, in pandas._libs.hashtable.PyObjectHashTable.map_locations
TypeError: unhashable type: 'list'


[male]                                         1899635
[female]                                        563199
[transgender female]                              2721
[non-binary]                                      1448
[genderfluid]                                      895
[transgender male]                                 392
[female, non-binary]                               182
[shemale]                                          112
[non-binary, female]                               111
[genderqueer]                                      106
[cisgender male]                                    73
[cisgender female]                                  57
[cisgender female, intersex]                        47
[non-binary, transgender person]                    33
[male, female]                                      24
[transmasculine, bigender]                          23
[transgender female, male]                          19
[female, male]                                       7
[female, i

In [67]:
dffinal['nationality'].value_counts()

TypeError: unhashable type: 'list'

Exception ignored in: 'pandas._libs.index.IndexEngine._call_map_locations'
Traceback (most recent call last):
  File "pandas\_libs\hashtable_class_helper.pxi", line 5231, in pandas._libs.hashtable.PyObjectHashTable.map_locations
TypeError: unhashable type: 'list'


[United States of America]             1066919
[United Kingdom]                        248436
[India]                                 125727
[Canada]                                103010
[Australia]                              99489
                                        ...   
[Austria, Tunisia]                           1
[Kingdom of the Netherlands, Aruba]          1
[France, Hungary]                            1
[São Tomé and Príncipe]                      1
[Norway, Malaysia, Canada]                   1
Name: nationality, Length: 1662, dtype: int64

In [55]:
dfmerged.loc[:, 'occupation'] = dfmerged.loc[:, 'occupation'].apply(lambda x: [dflabels.loc[nat].item() for nat in x] if x is not None else x)

KeyError: 'Q15991263'