# Enrich Dataset

In [1]:
%load_ext autoreload
%autoreload 2
#firstly we need to load all dependencies we need
import warnings; warnings.simplefilter('ignore')
import os, codecs, string, random
import numpy as np
import pandas as pd

In [2]:
import pandas as pd
import glob
#Here we load the data which have been cleaned and preprocessed
#Thus all quotations here are related to the Brexit
path = r'D:\Learning\Master3\Ada\ada-2021-project-skrw\ada_M3' # use your path
all_files = glob.glob(path + "/*.csv")

li = []

for filename in all_files:
    df = pd.read_csv(filename, index_col=None, header=0)
    li.append(df)

frame = pd.concat(li, axis=0, ignore_index=True)
frame.head()

Unnamed: 0,date,quotation,speaker,qids,probas
0,2016-09-07,Corbyn shows his true colours. Rather than att...,Tim Farron,['Q304251'],"[['Tim Farron', '0.8865'], ['None', '0.0683'],..."
1,2016-08-11,People are nervous and universities are being ...,Steve Smith,"['Q11894442', 'Q16885757', 'Q23418614', 'Q2347...","[['Steve Smith', '0.887'], ['None', '0.113']]"
2,2016-07-05,best placed to help forge a great post Brexit ...,Boris Johnson,['Q180589'],"[['Boris Johnson', '0.7713'], ['Andrea Leadsom..."
3,2016-12-14,In the run-up to the Brexit vote earlier this ...,Janet Yellen,['Q263725'],"[['Janet Yellen', '0.6759'], ['None', '0.3241']]"
4,2016-07-25,"On the issue of Brexit, I speak for the people...",Martin McGuinness,['Q57689'],"[['Martin McGuinness', '0.924'], ['None', '0.0..."


In [3]:
import pyarrow
# Here we load the data which gives us additional information about specific speaker from speaker_attributes.parquet
speaker = pd.read_parquet("D:/Learning/Master3/Ada/ada-2021-project-skrw/speaker_attributes.parquet")
speaker = speaker[['date_of_birth','nationality', 'gender', 'occupation', 'label','id']].copy()
speaker.head()

Unnamed: 0,date_of_birth,nationality,gender,occupation,label,id
0,[+1732-02-22T00:00:00Z],"[Q161885, Q30]",[Q6581097],"[Q82955, Q189290, Q131512, Q1734662, Q294126, ...",George Washington,Q23
1,[+1952-03-11T00:00:00Z],[Q145],[Q6581097],"[Q214917, Q28389, Q6625963, Q4853732, Q1884422...",Douglas Adams,Q42
2,[+1868-08-23T00:00:00Z],[Q31],[Q6581097],"[Q36180, Q40348, Q182436, Q1265807, Q205375, Q...",Paul Otlet,Q1868
3,[+1946-07-06T00:00:00Z],[Q30],[Q6581097],"[Q82955, Q15982858, Q18814623, Q1028181, Q1408...",George W. Bush,Q207
4,[+1599-06-06T00:00:00Z],[Q29],[Q6581097],[Q1028181],Diego Velázquez,Q297


In [4]:
#load data from wikidatabase to resolve Q_codes in speaker_attributes.parquet
df_wikidata = pd.read_csv('wikidata_labels_descriptions_quotebank.csv.bz2', compression='bz2', index_col='QID')
df_wikidata.head()

Unnamed: 0_level_0,Label,Description
QID,Unnamed: 1_level_1,Unnamed: 2_level_1
Q31,Belgium,country in western Europe
Q45,Portugal,country in southwestern Europe
Q75,Internet,global system of connected computer networks
Q148,People's Republic of China,sovereign state in East Asia
Q155,Brazil,country in South America


### Resolve Q-codes in speaker_attributes.parquet

In [5]:
#cols are columns which we need to solve their Q_codes
cols = ['gender', 'nationality', 'occupation']

def resolve_q_code(x):
    try:
        if x is not None:
            return list(map(lambda q_code: df_wikidata.loc[q_code, 'Label'], x))
        else:
            return None
    except KeyError:
        return None

for col in cols:
    speaker[col] = speaker[col].apply(resolve_q_code)

In [6]:
speaker.head()

Unnamed: 0,date_of_birth,nationality,gender,occupation,label,id
0,[+1732-02-22T00:00:00Z],"[Great Britain, United States of America]",[male],"[politician, military officer, farmer, cartogr...",George Washington,Q23
1,[+1952-03-11T00:00:00Z],[United Kingdom],[male],"[playwright, screenwriter, novelist, children'...",Douglas Adams,Q42
2,[+1868-08-23T00:00:00Z],[Belgium],[male],"[writer, lawyer, librarian, information scient...",Paul Otlet,Q1868
3,[+1946-07-06T00:00:00Z],[United States of America],[male],"[politician, motivational speaker, autobiograp...",George W. Bush,Q207
4,[+1599-06-06T00:00:00Z],[Spain],[male],[painter],Diego Velázquez,Q297


### Merge our dataset with additional speaker information

In [7]:
#process the speaker qid into the first qid in the qid lists
frame["num_of_qids"] = frame["qids"].apply(lambda x: len(x.split(",")))
frame["short_qids"] = frame.qids.apply(lambda x: x[1:-1].split(','))
frame['speaker_qid'] = frame.short_qids.apply(lambda short_qids: short_qids[0])
frame['speaker_qid'] = frame.speaker_qid.apply(lambda x: x.replace("'",""))
frame['speaker_qid'] = frame.speaker_qid.apply(lambda x: x.replace(" ",""))

In [8]:
#exclude all quotations which doesn't have specific speaker
frame_no_NA = frame[frame["speaker"] != "None"]

In [9]:
frame_no_NA.head(5)

Unnamed: 0,date,quotation,speaker,qids,probas,num_of_qids,short_qids,speaker_qid
0,2016-09-07,Corbyn shows his true colours. Rather than att...,Tim Farron,['Q304251'],"[['Tim Farron', '0.8865'], ['None', '0.0683'],...",1,['Q304251'],Q304251
1,2016-08-11,People are nervous and universities are being ...,Steve Smith,"['Q11894442', 'Q16885757', 'Q23418614', 'Q2347...","[['Steve Smith', '0.887'], ['None', '0.113']]",36,"['Q11894442', 'Q16885757', 'Q23418614', 'Q2...",Q11894442
2,2016-07-05,best placed to help forge a great post Brexit ...,Boris Johnson,['Q180589'],"[['Boris Johnson', '0.7713'], ['Andrea Leadsom...",1,['Q180589'],Q180589
3,2016-12-14,In the run-up to the Brexit vote earlier this ...,Janet Yellen,['Q263725'],"[['Janet Yellen', '0.6759'], ['None', '0.3241']]",1,['Q263725'],Q263725
4,2016-07-25,"On the issue of Brexit, I speak for the people...",Martin McGuinness,['Q57689'],"[['Martin McGuinness', '0.924'], ['None', '0.0...",1,['Q57689'],Q57689


In [11]:
#merge the dataframe with speaker_attributes.parquet
df = frame_no_NA.merge(speaker, how="left", left_on="speaker_qid", right_on="id").drop("id", axis=1)
df = df[['date','quotation','speaker','qids','probas','speaker_qid','date_of_birth','nationality','gender','occupation','label']]
df.head()
#df.to_csv("brexit_enriched.csv.bz2", compression="bz2")

Unnamed: 0,date,quotation,speaker,qids,probas,speaker_qid,date_of_birth,nationality,gender,occupation,label
0,2016-09-07,Corbyn shows his true colours. Rather than att...,Tim Farron,['Q304251'],"[['Tim Farron', '0.8865'], ['None', '0.0683'],...",Q304251,[+1970-05-27T00:00:00Z],[United Kingdom],[male],[politician],Tim Farron
1,2016-08-11,People are nervous and universities are being ...,Steve Smith,"['Q11894442', 'Q16885757', 'Q23418614', 'Q2347...","[['Steve Smith', '0.887'], ['None', '0.113']]",Q11894442,[+1981-07-28T00:00:00Z],[United States of America],[male],[basketball player],Steve Smith
2,2016-07-05,best placed to help forge a great post Brexit ...,Boris Johnson,['Q180589'],"[['Boris Johnson', '0.7713'], ['Andrea Leadsom...",Q180589,[+1964-06-19T00:00:00Z],"[United States of America, United Kingdom]",[male],"[journalist, politician, editor, writer, essay...",Boris Johnson
3,2016-12-14,In the run-up to the Brexit vote earlier this ...,Janet Yellen,['Q263725'],"[['Janet Yellen', '0.6759'], ['None', '0.3241']]",Q263725,[+1946-08-13T00:00:00Z],[United States of America],[female],"[economist, professor, banker, politician]",Janet Yellen
4,2016-07-25,"On the issue of Brexit, I speak for the people...",Martin McGuinness,['Q57689'],"[['Martin McGuinness', '0.924'], ['None', '0.0...",Q57689,[+1950-05-23T00:00:00Z],[Ireland],[male],[politician],Martin McGuinness


### Resolve speakers' date_of_birth into ages

In [12]:
#use try_join to change lists into string
def try_join(l):
    try:
        return ','.join(map(str, l))
    except TypeError:
        return np.nan

df['date_of_birth'] = [try_join(l) for l in df['date_of_birth']]

In [13]:
import datetime
def calculate_age(date):
    date = date.replace("+","")
    birth  = date.split('T')[0]
    birth = datetime.datetime.strptime(birth, "%Y-%m-%d")
    birth_year = birth.year
    age = 2021 - birth_year
    return age

In [14]:
def parse_date(date):
    try:
        if not isinstance(date, str):
            return None
        elif date == 'None' or date[0] == '-':
            return None
        else:
            return calculate_age(date)
    except ValueError as e:
        return None

In [15]:
df['age'] = df['date_of_birth'].map(parse_date)
#df.to_json("brexit_enriched_age.json.bz2", compression="bz2")

In [16]:
df.head()

Unnamed: 0,date,quotation,speaker,qids,probas,speaker_qid,date_of_birth,nationality,gender,occupation,label,age
0,2016-09-07,Corbyn shows his true colours. Rather than att...,Tim Farron,['Q304251'],"[['Tim Farron', '0.8865'], ['None', '0.0683'],...",Q304251,+1970-05-27T00:00:00Z,[United Kingdom],[male],[politician],Tim Farron,51.0
1,2016-08-11,People are nervous and universities are being ...,Steve Smith,"['Q11894442', 'Q16885757', 'Q23418614', 'Q2347...","[['Steve Smith', '0.887'], ['None', '0.113']]",Q11894442,+1981-07-28T00:00:00Z,[United States of America],[male],[basketball player],Steve Smith,40.0
2,2016-07-05,best placed to help forge a great post Brexit ...,Boris Johnson,['Q180589'],"[['Boris Johnson', '0.7713'], ['Andrea Leadsom...",Q180589,+1964-06-19T00:00:00Z,"[United States of America, United Kingdom]",[male],"[journalist, politician, editor, writer, essay...",Boris Johnson,57.0
3,2016-12-14,In the run-up to the Brexit vote earlier this ...,Janet Yellen,['Q263725'],"[['Janet Yellen', '0.6759'], ['None', '0.3241']]",Q263725,+1946-08-13T00:00:00Z,[United States of America],[female],"[economist, professor, banker, politician]",Janet Yellen,75.0
4,2016-07-25,"On the issue of Brexit, I speak for the people...",Martin McGuinness,['Q57689'],"[['Martin McGuinness', '0.924'], ['None', '0.0...",Q57689,+1950-05-23T00:00:00Z,[Ireland],[male],[politician],Martin McGuinness,71.0


In [17]:
df.isna().sum()

date                 0
quotation            0
speaker              0
qids                 0
probas               0
speaker_qid          0
date_of_birth     6707
nationality      11201
gender             693
occupation        2448
label              219
age              10516
dtype: int64