# Setup

In [None]:
# Root folder of the input data; the project datasets read below live in a sub-folder of it.
PATH_ROOT = 'data'
PATH_PARQUET = f'{PATH_ROOT}/project_datasets'

In [None]:
# Folder that receives derived artifacts (e.g. the labeled speakers CSV written further down).
PATH_OUTPUT = "output"

In [None]:
!pip install pyarrow

In [None]:
%load_ext autoreload
%autoreload 2

import os

import pandas as pd
import numpy as np
import seaborn as sns

#NLP libraries
import spacy, nltk, gensim, sklearn
import pyLDAvis.gensim_models

import matplotlib.pyplot as plt
%matplotlib inline

# 1. Loading speakers data

## Reading wikidata labels

In [None]:
# Wikidata lookup table indexed by QID; its 'Label' column is used further down
# to translate attribute QIDs into human-readable names.
df_wikidata_labels = pd.read_csv(
    PATH_PARQUET + '/wikidata_labels_descriptions_quotebank.csv.bz2',
    index_col='QID',
    compression='bz2',
)
df_wikidata_labels.head()

## Reading speakers parquet file

In [None]:
# Load the speaker attributes and index the frame by its 'id' column.
df_speakers = (
    pd.read_parquet(PATH_PARQUET + '/speaker_attributes.parquet')
    .set_index(keys='id')
)
df_speakers.head()

In [None]:
# Keep only the speakers for which every attribute is filled in, and report
# how many rows survive that strict filter.
df_speakers_cleaned = df_speakers.dropna()
n_speakers, n_speakers_complete = len(df_speakers), len(df_speakers_cleaned)
print('Total number of speakers : ', n_speakers)
print('Total number of speakers with filled attributes : ', n_speakers_complete)
print('{} rows dropped'.format(n_speakers - n_speakers_complete))

As we can see, dropping every speaker with at least one undefined attribute is unrealistic: we would be left with only 3 speakers whose attributes are all defined. Instead of dropping whole rows, we will have to do some fine-grained, column-wise filtering of undefined values.

## 2 - Merging speakers attributes qids with wikidata labels

In [None]:
# Speaker attribute columns that get exploded (and later labeled) below.
attributes_columns = [
    'date_of_birth',
    'nationality',
    'gender',
    'ethnic_group',
    'occupation',
    'party',
    'academic_degree',
    'candidacy',
    'religion',
]

# Explode one attribute at a time so that every row carries a single value per
# attribute. Exploding several columns in sequence produces one row per
# combination of attribute values.
for column in attributes_columns:
    df_speakers = df_speakers.explode(column)

df_speakers.head()

#### Quick stop before proceeding

Before we continue, we need to convert the date of birth to something meaningful like the speaker's age.

In [None]:
def normalize_date_of_birth(date):
    '''
    Normalize the date of birth format of our speakers. The raw format is
    (+/-)YYYY-MM-DDTHH:MM:SSZ, e.g. '+1732-02-22T00:00:00Z'.
    We convert it simply to YYYY-MM-DD. However the date sometimes has a month or day value of 00,
    and in that case we just output the year.

    Note that the leading sign character is dropped, so a negative (BC) year
    comes out looking like a positive one.

    Params:
        date: the raw date of birth string, or a missing value (None / NaN)
    returns:
        the date formatted as YYYY-MM-DD, or YYYY when the month or day is '00';
        any non-string input (e.g. None, NaN) is returned unchanged
    '''
    # Missing values may be None or a float NaN; neither can be sliced like a
    # string, so hand them back untouched.
    if not isinstance(date, str):
        return date

    birth_date = date[1:11]  # skip the sign, keep the 10 characters of YYYY-MM-DD
    year, month, day = birth_date[0:4], birth_date[5:7], birth_date[8:10]
    if month == '00' or day == '00':
        return year
    return birth_date

# Quick sanity check: a fully specified date, then one whose month is the '00' placeholder.
test_data = '+1732-02-22T00:00:00Z'
test_data_invalid_month = '+1732-00-22T00:00:00Z'
for raw_date in (test_data, test_data_invalid_month):
    display(normalize_date_of_birth(raw_date))

In [None]:
# Reduce the raw date strings to YYYY-MM-DD (or YYYY) and parse them into
# datetimes. The result is stored in a new 'age' column, which a later cell
# converts into an actual age; values that cannot be parsed become NaT.
normalized_birth_dates = df_speakers['date_of_birth'].apply(normalize_date_of_birth)
df_speakers['age'] = pd.to_datetime(normalized_birth_dates, errors='coerce')
df_speakers['age'].head()

In [None]:
def compute_age(date_of_birth):
    '''
    Compute an indicative age, in years, from a single date of birth.

    The age is the difference between the current year and the birth year; it
    does not check whether the birthday has already occurred this year.

    Params:
        date_of_birth: a scalar date of birth (pd.Timestamp, datetime, or a
            string that pd.to_datetime can parse); may be None / NaT
    returns:
        the age in years as an int, or np.nan when the date is missing or
        cannot be parsed
    '''
    birth = pd.to_datetime(date_of_birth, errors='coerce')
    if pd.isna(birth):
        return np.nan
    now = pd.Timestamp('now')
    return now.year - birth.year

In [None]:
# Calculate the age as the difference between the current year and the birth year.
# A Series exposes datetime components through the .dt accessor (Series.year
# does not exist). The dtype guard makes the cell safe to re-run: once 'age'
# has been converted to a number there is nothing left to do.
now = pd.Timestamp('now')
if pd.api.types.is_datetime64_any_dtype(df_speakers['age']):
    df_speakers['age'] = now.year - df_speakers['age'].dt.year
df_speakers['age'].head()

In [None]:
# Spot check on a well-known speaker: raw date of birth next to the derived age.
df_speakers.loc[df_speakers['label'] == 'Donald Trump', ['date_of_birth', 'age']]

In [None]:
# Small example of how pandas parses a YYYY-MM-DD string.
sample_birth_date = '1990-02-01'
pd.to_datetime(sample_birth_date)

Of course, ages are only indicative: people such as President George Washington come out as being over 200 years old.

#### Now let's merge attributes with their wikidata labels

In [None]:
df_speakers_labeled = df_speakers.copy()

# 'date_of_birth' holds raw timestamps rather than Wikidata QIDs, so looking it
# up in the labels table would replace every value with NaN. Only the
# QID-valued attributes are translated.
qid_columns = [col for col in attributes_columns if col != 'date_of_birth']

for col in qid_columns:
    # Left join on the attribute's QID; unmatched QIDs get a NaN label.
    merged = df_speakers_labeled.merge(
        df_wikidata_labels[['Label']], left_on=col, right_index=True, how='left'
    )
    # Replace the QID with its human-readable label and drop the helper column.
    merged[col] = merged['Label']
    df_speakers_labeled = merged.drop(columns=['Label'])

In [None]:
# Preview the first rows of the labeled speakers.
df_speakers_labeled.head(n=5)

Now we have one row per speaker for every combination of their attribute values. This over-represents speakers with many attribute values (e.g. several occupations, ethnic groups, etc.), which may bias statistics computed on this dataframe.

In [None]:
# Column types and total row count of the labeled dataframe.
labeled_dtypes = df_speakers_labeled.dtypes
display(labeled_dtypes)
print(f'Length of final labeled speakers dataframe {len(df_speakers_labeled)}')

## 3 - Save result to disk for quick future retrieval

In [None]:
# to_csv does not create missing folders, so make sure the output directory exists first.
os.makedirs(PATH_OUTPUT, exist_ok=True)
# NOTE(review): index=False means the dataframe index is not written to the CSV;
# if the index still holds the speaker 'id' at this point, it is lost here — confirm this is intended.
df_speakers_labeled.to_csv(PATH_OUTPUT + '/speakers_labeled.csv', index=False)

## 4 - Explore the speakers data (Continue work from here)

Let's do some quick analysis on the speakers' professions. First retrieve our dataframe saved in csv.

In [None]:
# Reload the labeled speakers from the CSV saved in the previous section.
retrieved_labeled_speakers = pd.read_csv(f'{PATH_OUTPUT}/speakers_labeled.csv')
retrieved_labeled_speakers.head()

In [None]:
# Column types and row count after the round trip through CSV.
retrieved_dtypes = retrieved_labeled_speakers.dtypes
display(retrieved_dtypes)
print(f'Length of final labeled speakers dataframe {len(retrieved_labeled_speakers)}')

### 4.1 - Speakers occupations

Now let's see the distribution of the speakers' occupations.

In [None]:
# Number of rows (non-null labels) per occupation, most frequent first.
# These are row counts of the exploded dataframe, not counts of unique speakers.
grouped_by_occupations = (
    retrieved_labeled_speakers
    .groupby(by='occupation')['label']
    .agg(['count'])
    .sort_values(by='count', ascending=False)
)
grouped_by_occupations.head()

In [None]:
# Plot the distribution of the top 20 occupations
top_n = 20
top_occupations = grouped_by_occupations[:top_n]

fig, ax = plt.subplots(figsize=(14, 8))
ax.pie(top_occupations['count'], labels=top_occupations.index, autopct='%1.1f%%',
       shadow=True, startangle=90)
# Percentages are relative to the plotted slices only, not to all occupations.
ax.set_title('Top {} occupations of the speakers'.format(top_n))

fig.tight_layout()

Someone like Donald Trump has many occupations, which means that many rows of the dataframe are dedicated to him, since we exploded the multi-valued attributes. (This can be **problematic when training our model**.)

In [None]:
# Some examples of prominent US politicians
for politician in ('Donald Trump', 'Barack Obama', 'Alexandria Ocasio-Cortez', 'Hillary Clinton', 'Bill Clinton'):
    is_politician = retrieved_labeled_speakers.label == politician
    display(retrieved_labeled_speakers[is_politician].occupation.unique())

### 4.2 - Speaker's gender 

In [None]:
# Number of rows (non-null labels) per gender, most frequent first.
grouped_by_gender = (
    retrieved_labeled_speakers
    .groupby(by='gender')['label']
    .agg(['count'])
    .sort_values(by='count', ascending=False)
)
grouped_by_gender.head()

In [None]:
# Share of rows whose gender is neither 'male' nor 'female'.
# The ratio is weighted by the per-gender row counts; dividing the number of
# gender categories instead would only measure how many distinct gender labels
# exist, not how many speakers carry them.
not_male_mask = grouped_by_gender.index != 'male'
not_female_mask = grouped_by_gender.index != 'female'
other_genders_count = grouped_by_gender.loc[not_male_mask & not_female_mask, 'count'].sum()
other_genders_count / grouped_by_gender['count'].sum()

The majority of speakers in Quotebank are male (almost 70%). All other genders constitute about 5% of the speakers.

### 4.3 - Speaker's age

So far we have only had the speakers' dates of birth. Let's convert them to ages.

## 5 - Constructing a feature matrix

If we want to train a model on our speakers' attributes, we have to think of a way to construct a feature matrix. The issue is that many of the speakers' attributes are categorical. One way to construct features from them is to one-hot encode them.