In [1]:
#import 
import pandas as pd
import numpy as np
import bz2
import json

In [2]:
# read the large compressed file in 1 go
dfquotes = pd.read_json('quotes-2020-speaker.json.bz2', lines=True, compression='bz2')
dfquotes = dfquotes.set_index('qid') #Change and rename index for future merging
dfquotes.index.rename('id', inplace=True)

In [3]:
dfquotes

Unnamed: 0_level_0,quoteID,quotation,speaker,prob
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Q367796,2020-01-16-000088,[ Department of Homeland Security ] was livid ...,Sue Myrick,0.8867
Q20684375,2020-01-24-000168,[ I met them ] when they just turned 4 and 7. ...,Meghan King Edmonds,0.5446
Q5268447,2020-01-17-000357,[ The delay ] will have an impact [ on Slough ...,Dexter Smith,0.9240
Q4864119,2020-04-02-000239,[ The scheme ] treats addiction as an illness ...,Barry Coppinger,0.9017
Q816459,2020-03-19-000276,[ These ] actions will allow households who ha...,Ben Carson,0.9227
...,...,...,...,...
Q6279,2020-03-03-079268,you're going to take care of the gun problem w...,Joe Biden,0.6777
Q18115465,2020-02-24-080186,"you're seeing a young team that's maturing, th...",Brendan Whittet,0.7077
Q3635235,2020-02-07-122251,"You're talking about African-Americans, right?...",Barry Michael Cooper,0.5605
Q896796,2020-02-04-118820,You've got to sometimes take that leap of fait...,Brad Gushue,0.7060


In [3]:
#Load speaker attributes parquet file
dfattrib = pd.read_parquet("speaker_attributes.parquet\speaker_attributes.parquet")

#Load QID correspondance file and remove description column
dflabels = pd.read_csv('wikidata_labels_descriptions_quotebank.csv.bz2', compression='bz2', index_col = 'QID')
dflabels = dflabels.drop(['Description'], axis=1)

In [5]:
def get_qid_label(df, dflabels):
    '''
    Replace Wikidata Qids with the corresponding text
    '''
    for col in df.columns:
        try:
            df.loc[:, col] = df.loc[:, col].apply(lambda x: [dflabels.loc[cat].item() for cat in x] if x is not None else x)
        except:
            continue

    return df

Functions done to make the dataset cleaner, will be used later in the project

In [33]:
def CleanGender(df):
    for i in range(int(df['gender'].count())):
        try:
            if np.shape(df['gender'].loc[i])[0]>1: # More than one element in the list
                df['gender'].loc[i] = 'other'
            else:
                df['gender'].loc[i] = df['gender'].loc[i][0] # Take element out of the list
        except:
            df['gender'].loc[i] = 'None' # Exception
            continue
    return df

In [34]:
def CleanNationality(df):
    for i in range(int(df['nationality'].count())):
        try:
            if np.shape(df['nationality'].loc[i])[0]>1: # More than one element in the list
                df['nationality'].loc[i] = 'Mixed'
            else:
                df['nationality'].loc[i] = df['nationality'].loc[i][0] # Take element out of the list
        except:
            df['nationality'].loc[i] = 'None' # Exception
            continue
    return df

In [35]:
def CleanBirthDate(df):
    for i in range(int(df['date_of_birth'].count())):
        try:
            if np.shape(df['date_of_birth'].loc[i])[0]>1: # More than one element in the list
                df['date_of_birth'].loc[i] = 'None'
            s = df['date_of_birth'].loc[i][0]
            df['date_of_birth'].loc[i] = int(s[1:5])
        except:
            df['date_of_birth'].loc[i] = 'None' # Exception
            continue
    return df

In [36]:
def CleanEthnicGroup(df):
    for i in range(int(df['ethnic_group'].count())):
        try:
            if np.shape(df['ethnic_group'].loc[i])[0]>1: # More than one element in the list
                df['ethnic_group'].loc[i] = 'None'
            else:
                df['ethnic_group'].loc[i] = df['ethnic_group'].loc[i][0] # Take element out of the list
        except:
            df['ethnic_group'].loc[i] = 'None' # Exception
            continue
    return df

In [37]:
def CleanReligion(df):
    for i in range(int(df['religion'].count())):
        try:
            if np.shape(df['religion'].loc[i])[0]>1: # More than one element in the list
                df['religion'].loc[i] = 'None'
            else:
                df['religion'].loc[i] = df['religion'].loc[i][0] # Take element out of the list
        except:
            df['religion'].loc[i] = 'None' # Exception
            continue
    return df

In [38]:
def CleanAcademicDegree(df):
    for i in range(int(df['academic_degree'].count())):
        try:
            df['academic_degree'].loc[i] = df['academic_degree'].loc[i][0] # Take element out of the list
        except:
            df['academic_degree'].loc[i] = 'None' # Exception
            continue
    return df

In [6]:
dflabels = pd.read_csv('wikidata_labels_descriptions_quotebank.csv.bz2', compression='bz2', index_col = 'QID')
dflabels = dflabels.drop(['Description'], axis=1)
dflabels

Unnamed: 0_level_0,Label
QID,Unnamed: 1_level_1
Q31,Belgium
Q45,Portugal
Q75,Internet
Q148,People's Republic of China
Q155,Brazil
...,...
Q106302506,didgeridooist
Q106341153,biochemistry teacher
Q106368830,2018 Wigan Metropolitan Borough Council electi...
Q106369692,2018 Wigan Metropolitan Borough Council electi...


In [7]:
#Merge quotes file and speaker attributes file based on the unique qid of the speaker
dfmerged = dfquotes.merge(dfattrib.set_index('id'), left_index=True, right_index=True)
dfmerged.head(5)

Unnamed: 0_level_0,quoteID,quotation,speaker,prob,aliases,date_of_birth,nationality,gender,lastrevid,ethnic_group,US_congress_bio_ID,occupation,party,academic_degree,label,candidacy,type,religion
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
Q1000053,2020-01-30-115748,We stress that any attempts to fuel the confli...,Vasily Nebenzya,0.9429,,[+1962-02-26T00:00:00Z],"[Q159, Q15180]",[Q6581097],1391394799,,,"[Q193391, Q82955]",,,Vasily Nebenzya,,item,
Q1000053,2020-01-30-110614,We consistently send signals to abandon aggres...,Vasily Nebenzya,0.5209,,[+1962-02-26T00:00:00Z],"[Q159, Q15180]",[Q6581097],1391394799,,,"[Q193391, Q82955]",,,Vasily Nebenzya,,item,
Q1000053,2020-02-26-086429,We understand the concerns of a number of non-...,Vasily Nebenzya,0.9361,,[+1962-02-26T00:00:00Z],"[Q159, Q15180]",[Q6581097],1391394799,,,"[Q193391, Q82955]",,,Vasily Nebenzya,,item,
Q1000053,2020-02-25-076403,"We cannot agree to that kind of approach,",Vasily Nebenzya,0.968,,[+1962-02-26T00:00:00Z],"[Q159, Q15180]",[Q6581097],1391394799,,,"[Q193391, Q82955]",,,Vasily Nebenzya,,item,
Q1000053,2020-01-09-098083,violation and noncompliance of international law.,Vasily Nebenzya,0.9364,,[+1962-02-26T00:00:00Z],"[Q159, Q15180]",[Q6581097],1391394799,,,"[Q193391, Q82955]",,,Vasily Nebenzya,,item,


In [8]:
#Apply the function that changes the qids by their corresponding label
dffinal = get_qid_label(dfmerged, dflabels)

In [9]:
dffinal

Unnamed: 0_level_0,quoteID,quotation,speaker,prob,aliases,date_of_birth,nationality,gender,lastrevid,ethnic_group,US_congress_bio_ID,occupation,party,academic_degree,label,candidacy,type,religion
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
Q1000053,2020-01-30-115748,We stress that any attempts to fuel the confli...,Vasily Nebenzya,0.9429,,[+1962-02-26T00:00:00Z],"[Russia, Soviet Union]",[male],1391394799,,,"[Q193391, Q82955]",,,Vasily Nebenzya,,item,
Q1000053,2020-01-30-110614,We consistently send signals to abandon aggres...,Vasily Nebenzya,0.5209,,[+1962-02-26T00:00:00Z],"[Russia, Soviet Union]",[male],1391394799,,,"[Q193391, Q82955]",,,Vasily Nebenzya,,item,
Q1000053,2020-02-26-086429,We understand the concerns of a number of non-...,Vasily Nebenzya,0.9361,,[+1962-02-26T00:00:00Z],"[Russia, Soviet Union]",[male],1391394799,,,"[Q193391, Q82955]",,,Vasily Nebenzya,,item,
Q1000053,2020-02-25-076403,"We cannot agree to that kind of approach,",Vasily Nebenzya,0.9680,,[+1962-02-26T00:00:00Z],"[Russia, Soviet Union]",[male],1391394799,,,"[Q193391, Q82955]",,,Vasily Nebenzya,,item,
Q1000053,2020-01-09-098083,violation and noncompliance of international law.,Vasily Nebenzya,0.9364,,[+1962-02-26T00:00:00Z],"[Russia, Soviet Union]",[male],1391394799,,,"[Q193391, Q82955]",,,Vasily Nebenzya,,item,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Q999711,2020-03-19-074326,This is his home racetrack and that also makes...,Steve Fitzsimmons,0.7526,,[+1976-09-07T00:00:00Z],[Australia],[male],1313714618,,,[Q937857],,,Steve Fitzsimmons,,item,
Q999711,2020-02-27-066900,The Hanover Bentinck and Brant Agricultural So...,Steve Fitzsimmons,0.5578,,[+1976-09-07T00:00:00Z],[Australia],[male],1313714618,,,[Q937857],,,Steve Fitzsimmons,,item,
Q999711,2020-03-05-074250,This is a very unique opportunity for me. My f...,Steve Fitzsimmons,0.7475,,[+1976-09-07T00:00:00Z],[Australia],[male],1313714618,,,[Q937857],,,Steve Fitzsimmons,,item,
Q999711,2020-02-27-077480,This is a very unique opportunity for me. My f...,Steve Fitzsimmons,0.6221,,[+1976-09-07T00:00:00Z],[Australia],[male],1313714618,,,[Q937857],,,Steve Fitzsimmons,,item,


In [None]:
dffinal = dffinal.drop(['aliases', 'lastrevid', 'US_congress_bio_ID', 'party', 'label', 'occupation', 'candidacy', 'type'], axis=1)

In [45]:
dffinal

Unnamed: 0_level_0,quoteID,quotation,speaker,prob,date_of_birth,nationality,gender,ethnic_group,academic_degree,religion
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Q1000053,2020-01-30-115748,We stress that any attempts to fuel the confli...,Vasily Nebenzya,0.9429,[+1962-02-26T00:00:00Z],"[Russia, Soviet Union]",[male],,,
Q1000053,2020-01-30-110614,We consistently send signals to abandon aggres...,Vasily Nebenzya,0.5209,[+1962-02-26T00:00:00Z],"[Russia, Soviet Union]",[male],,,
Q1000053,2020-02-26-086429,We understand the concerns of a number of non-...,Vasily Nebenzya,0.9361,[+1962-02-26T00:00:00Z],"[Russia, Soviet Union]",[male],,,
Q1000053,2020-02-25-076403,"We cannot agree to that kind of approach,",Vasily Nebenzya,0.9680,[+1962-02-26T00:00:00Z],"[Russia, Soviet Union]",[male],,,
Q1000053,2020-01-09-098083,violation and noncompliance of international law.,Vasily Nebenzya,0.9364,[+1962-02-26T00:00:00Z],"[Russia, Soviet Union]",[male],,,
...,...,...,...,...,...,...,...,...,...,...
Q999711,2020-03-19-074326,This is his home racetrack and that also makes...,Steve Fitzsimmons,0.7526,[+1976-09-07T00:00:00Z],[Australia],[male],,,
Q999711,2020-02-27-066900,The Hanover Bentinck and Brant Agricultural So...,Steve Fitzsimmons,0.5578,[+1976-09-07T00:00:00Z],[Australia],[male],,,
Q999711,2020-03-05-074250,This is a very unique opportunity for me. My f...,Steve Fitzsimmons,0.7475,[+1976-09-07T00:00:00Z],[Australia],[male],,,
Q999711,2020-02-27-077480,This is a very unique opportunity for me. My f...,Steve Fitzsimmons,0.6221,[+1976-09-07T00:00:00Z],[Australia],[male],,,


In [67]:
dffinal['nationality'].value_counts()

TypeError: unhashable type: 'list'

Exception ignored in: 'pandas._libs.index.IndexEngine._call_map_locations'
Traceback (most recent call last):
  File "pandas\_libs\hashtable_class_helper.pxi", line 5231, in pandas._libs.hashtable.PyObjectHashTable.map_locations
TypeError: unhashable type: 'list'


[United States of America]             1066919
[United Kingdom]                        248436
[India]                                 125727
[Canada]                                103010
[Australia]                              99489
                                        ...   
[Austria, Tunisia]                           1
[Kingdom of the Netherlands, Aruba]          1
[France, Hungary]                            1
[São Tomé and Príncipe]                      1
[Norway, Malaysia, Canada]                   1
Name: nationality, Length: 1662, dtype: int64

In [10]:
dffinal.gender.value_counts()

TypeError: unhashable type: 'list'

Exception ignored in: 'pandas._libs.index.IndexEngine._call_map_locations'
Traceback (most recent call last):
  File "pandas\_libs\hashtable_class_helper.pxi", line 5231, in pandas._libs.hashtable.PyObjectHashTable.map_locations
TypeError: unhashable type: 'list'


[male]                                         1899635
[female]                                        563199
[transgender female]                              2721
[non-binary]                                      1448
[genderfluid]                                      895
[transgender male]                                 392
[female, non-binary]                               182
[shemale]                                          112
[non-binary, female]                               111
[genderqueer]                                      106
[cisgender male]                                    73
[cisgender female]                                  57
[cisgender female, intersex]                        47
[non-binary, transgender person]                    33
[male, female]                                      24
[transmasculine, bigender]                          23
[transgender female, male]                          19
[female, male]                                       7
[female, i

In [None]:
#Clean Data, which will be performed when merging the speakers and quotes features
CleanBirthDate(dffinal)


#Clean Gender
CleanGender(dffinal)


#Clean Nationality
CleanNationality(dffinal)


#Clean Ethnic Group
CleanEthnicGroup(dffinal)


#Clean Religion
CleanReligion(dffinal)

#Clean AcademicDegree
CleanAcademicDegree(dffinal)
print(dffinal.head(5))