# Setup

In [None]:
# Root folder of the input data, and the sub-folder holding the project datasets
PATH_ROOT = 'data'
PATH_PARQUET = f'{PATH_ROOT}/project_datasets'

In [None]:
# Folder the labeled speakers CSV is written to (and read back from) later in the notebook
PATH_OUTPUT = 'output'

In [None]:
!pip install pyarrow

In [None]:
%load_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np
import seaborn as sns

#NLP libraries
import spacy, nltk, gensim, sklearn
import pyLDAvis.gensim_models

import matplotlib.pyplot as plt
%matplotlib inline 

import pickle

# 1. Loading speakers data

## Reading wikidata labels

In [None]:
# QID-indexed lookup table used further down to turn attribute QIDs into labels
df_wikidata_labels = pd.read_csv(
    f'{PATH_PARQUET}/wikidata_labels_descriptions_quotebank.csv.bz2',
    compression='bz2',
    index_col='QID',
)
df_wikidata_labels.head()

## Reading speakers parquet file

In [None]:
# Load the raw speaker attributes and index them by their 'id' column
df_speakers_original = (
    pd.read_parquet(f'{PATH_PARQUET}/speaker_attributes.parquet')
    .set_index(keys='id')
)
df_speakers_original.head()

In [None]:
# Keep only the rows in which every attribute is filled in
df_speakers_cleaned = df_speakers_original.dropna()

# Compare against the raw frame loaded above (no other speakers frame exists at this point)
n_speakers = len(df_speakers_original)
n_speakers_complete = len(df_speakers_cleaned)
print('Total number of speakers : ', n_speakers)
print('Total number of speakers with filled attributes : ', n_speakers_complete)
print('{} rows dropped'.format(n_speakers - n_speakers_complete))

As we can see, dropping all speakers with undefined attributes is unrealistic, since we're only left with 3 speakers that have all their attributes defined. We're going to have to do some fine-grained filtering of undefined values column-wise instead of on whole rows (all attributes).

## 2 - Merging all speakers attributes qids with wikidata labels

Some columns store their values as lists, such as a speaker who has multiple occupations or nationalities. We need to explode those so that we have a row for each unique value in each column.

In [None]:
attributes_columns_to_be_labeled = ['nationality', 'gender', 'ethnic_group', 'occupation', 'party', 'academic_degree', 'candidacy', 'religion']

# Always start from the raw frame: this defines df_speakers on a fresh kernel and
# makes the cell give the same result every time it is re-run.
# (explode returns a new frame, so df_speakers_original itself is left untouched.)
df_speakers = df_speakers_original

# Let's first explode their attributes, one column at a time
for col in attributes_columns_to_be_labeled:
    df_speakers = df_speakers.explode(col)

# Let's also explode the date of birth
df_speakers = df_speakers.explode('date_of_birth')

# The speaker 'id' lives in the index: move it back to a regular column and call it 'qid'
df_speakers = df_speakers.reset_index().rename(columns={'id': 'qid'})
df_speakers.head()

#### Quick stop before proceeding: determining speaker's age

Before we continue, we need to convert the date of birth to something meaningful like the speaker's age. To avoid tedious computations using *datetime* and carefully considering none values on the month and day of birth, we're only going to approximate the age by doing calculations **based only on the year of birth** (which is good enough for our purposes).

In [None]:
def get_year_of_birth(date):
    '''
    Helper function to extract the year of birth of the speaker from the format we're given.

    Params:
        date: the date of birth in the given format (e.g. '+1732-02-22T00:00:00Z').
              A leading '-' is honoured, so '-0044-03-15T00:00:00Z' gives -44.
    Returns:
        the year of birth as an int, or None when the date is missing
        (None, NaN, an empty string or any other non-string value)
    Raises:
        ValueError: if the part before the first '-' separator is not a number
    '''
    # A missing date can be None or a float NaN. NaN is truthy, so a plain
    # `if date:` check would let it through and then fail on slicing.
    if not isinstance(date, str) or not date:
        return None

    sign = -1 if date.startswith('-') else 1
    # Strip the sign, then keep what precedes the first '-' separator: the year digits.
    # This does not assume the year is exactly four characters wide.
    year_digits = date.lstrip('+-').split('-', 1)[0]
    return sign * int(year_digits)

test_data = '+1732-02-22T00:00:00Z'
get_year_of_birth(test_data)

In [None]:
# Normalize date of birth into a new column 'age' in which we'll calculate the age of the speaker
df_speakers['age'] = df_speakers.date_of_birth.apply(get_year_of_birth)
df_speakers.age.head()

In [None]:
# Calculate the age by computing the difference between now and their birth dates.
now = pd.Timestamp('now')
df_speakers.age = (now.year - df_speakers.age)
df_speakers.age.head()

Of course ages are just indicative, we can have people like president George Washington who are over 200 years old.

#### Let's also categorize each speaker by their age group

For example, are they in their 20s, 30s or 70s.

In [None]:
def get_age_group(age):
    '''
    Map an age to its age group, expressed as a string.

    Ages from 10 up to and including 100 are reduced to their decade
    (e.g. 75 -> '70s', 20 -> '20s'; an age of exactly 100 gives '100s').
    Ages above 100 give '>100' and ages below 10 give '<10'.

    Params:
        age: the age
    Returns
        the age group in string format; a missing age (None or NaN) is returned unchanged
    '''
    # Missing ages are passed through untouched
    if age is None or np.isnan(age):
        return age

    if age < 10:
        return '<10'
    if age > 100:
        return '>100'

    decade = int(age / 10) * 10
    return f'{decade}s'

# Bucket every speaker's age into its age group
df_speakers['age_group'] = df_speakers['age'].apply(get_age_group)
df_speakers['age_group']

In [None]:
# Spot-check the derived age columns on one well-known speaker
is_donald_trump = df_speakers['label'] == 'Donald Trump'
df_speakers.loc[is_donald_trump, ['date_of_birth', 'age', 'age_group']]

#### Now let's merge attributes with their wikidata labels

In [None]:
df_speakers_labeled = df_speakers.copy()

# For each attribute: look its QIDs up in the wikidata table, overwrite the QIDs with
# the matching label, then discard the helper columns the merge brought in.
for attribute in attributes_columns_to_be_labeled:
    merged = df_speakers_labeled.merge(
        df_wikidata_labels,
        how='left',
        left_on=attribute,
        right_index=True,
    )
    merged[attribute] = merged['Label']
    df_speakers_labeled = merged.drop(columns=['Label', 'Description'])

df_speakers_labeled

Now we have a row per speaker for every combination of his attributes. This might be biased for speakers with a lot of attributes (e.g. several occupations, ethnic_group, etc...).

In [None]:
# Column dtypes and row count of the labeled frame
display(df_speakers_labeled.dtypes)
n_labeled_rows = len(df_speakers_labeled)
print(f'Length of final labeled speakers dataframe {n_labeled_rows}')

# Extras

There is **one important issue** we haven't addressed yet. For each speaker, We have **exploded all their attributes** so that each row contains attributes that have only one value in them (so not a list). However, for speakers with several attributes that have multiple values in them (e.g. a speaker with several occupations and several genders, etc...) **the explosion of one attribute may affect the overall distribution in the original data of another attribute**. For example, a speaker in the original data that has as *gender* 'transexual' and as *occupation* both 'politician' and 'basketball player', will appear twice in the final exploded dataframe, once as 'politician' and once as 'basketball player', however in both rows he will be 'transexual' and this means 'transexual' will be counted twice, meaning the distribution of gender in the final dataframe will not be the same as the one in the original dataframe.  

So the answer is yes: distributions will differ but by how much and will it affect the final result greatly?
Let's do some analysis and try to compare for example for *occupation*. We will compare the speakers data where we exploded only 'occupation' and the one where we exploded all attributes.

In [None]:
# Top 10 occupations in the dataframe where we exploded all attributes 
grouped_by_occupations = retrieved_labeled_speakers.groupby(by='occupation').label.agg(['count'])
grouped_by_occupations = grouped_by_occupations.sort_values(by='count', ascending=False)
grouped_by_occupations.head(n=10)

In [None]:
# Retrieve the original speakers data
original_speakers_df = pd.read_parquet(PATH_PARQUET + '/speaker_attributes.parquet')
original_speakers_df.set_index(keys='id', inplace=True)
original_speakers_df

In [None]:
# Let's only explode the attribute we're comparing here (occupation)
original_speakers_df = original_speakers_df.explode('occupation')
# Let's merge again on the occupation column
original_speakers_df = original_speakers_df.merge(df_wikidata_labels, left_on='occupation', right_index=True, how='left')
original_speakers_df['occupation'] = original_speakers_df['Label']
original_speakers_df.drop(columns=['Label'], inplace=True)

display(original_speakers_df)

# sort the occupations to get the top occupations
original_grouped_by_occupations = original_speakers_df.groupby(by='occupation').label.agg(['count'])
original_grouped_by_occupations = original_grouped_by_occupations.sort_values(by='count', ascending=False)

In [None]:
# Compare the distribution of the top 20 occupations between both dataframes
fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize =  (14, 8))
ax1.pie(grouped_by_occupations[:10]['count'], labels=grouped_by_occupations[:10].index, autopct='%1.1f%%',
        shadow=True, startangle=90)
ax1.set_title('Distribution of occupations after exploding all attributes')

ax2.pie(original_grouped_by_occupations[:10]['count'], labels=grouped_by_occupations[:10].index, autopct='%1.1f%%',
        shadow=True, startangle=90)
ax2.set_title('Distribution of occupations after exploding occupation only')

fig.tight_layout()

We can see that the distribution may more or less change, but the top 10 occupations stay the same and maintain the same order. However **since we only exploded *occupation*** the right chart represents the real distribution of occupations.

**So the final rule of thumb is**: only explode attributes that we're using as features or doing analysis on, to minimize distortions in the original distributions.

## 3 - Save result to disk for quick future retrieval

In [None]:
# Persist the labeled speakers (without the index) for later retrieval
df_speakers_labeled.to_csv(
    f'{PATH_OUTPUT}/speakers_all_attributes_labeled.csv',
    index=False,
)

In [None]:
# Save the dtypes of the speakers labeled. This should help when reading them in csv knowing their dtypes
speakers_labeled_dtypes = dict(df_speakers_labeled.dtypes)

with open('output/speakers_all_attributes_labeled_dtypes.pickle', 'wb') as handle:
    pickle.dump(speakers_labeled_dtypes, handle, protocol=pickle.HIGHEST_PROTOCOL)

## 4 - Explore the speakers data (Continue work from here)

Let's do some quick analysis on the speakers' professions. First retrieve our dataframe saved in csv.

In [None]:
# Retrieve the labeled speakers dataframe columns datatypes
with open('output/speakers_all_attributes_labeled_dtypes.pickle', 'rb') as handle:
    speakers_labeled_dtypes = pickle.load(handle)

In [None]:
# Read the saved CSV back, forcing the dtypes recorded when it was written
retrieved_labeled_speakers = pd.read_csv(
    f'{PATH_OUTPUT}/speakers_all_attributes_labeled.csv',
    dtype=speakers_labeled_dtypes,
)
retrieved_labeled_speakers

In [None]:
# Column dtypes and row count of the frame read back from disk
display(retrieved_labeled_speakers.dtypes)
n_retrieved_rows = len(retrieved_labeled_speakers)
print(f'Length of final labeled speakers dataframe {n_retrieved_rows}')

### 4.1 - Speakers occupations

Now let's see the distribution of the speakers' occupations.

In [None]:
# Number of labeled rows per occupation, most frequent first
grouped_by_occupations = (
    retrieved_labeled_speakers
    .groupby(by='occupation')['label']
    .agg(['count'])
    .sort_values(by='count', ascending=False)
)
grouped_by_occupations.head(n=10)

In [None]:
# Plot the distribution of the top 20 occupations
fig, ax = plt.subplots(figsize =  (14, 8))
ax.pie(grouped_by_occupations[:20]['count'], labels=grouped_by_occupations[:20].index, autopct='%1.1f%%',
        shadow=True, startangle=90)

fig.tight_layout()

Someone like Donald Trump has a lot of occupations, which means he's going to have a lot of rows dedicated to him in the dataframe, since we exploded the multi-values attributes. (This can be **problematic when training our model**)

In [None]:
# Some examples of prominent US politicans
display(retrieved_labeled_speakers[retrieved_labeled_speakers.label == 'Donald Trump'].occupation.unique())
display(retrieved_labeled_speakers[retrieved_labeled_speakers.label == 'Barack Obama'].occupation.unique())
display(retrieved_labeled_speakers[retrieved_labeled_speakers.label == 'Alexandria Ocasio-Cortez'].occupation.unique())
display(retrieved_labeled_speakers[retrieved_labeled_speakers.label == 'Hillary Clinton'].occupation.unique())
display(retrieved_labeled_speakers[retrieved_labeled_speakers.label == 'Bill Clinton'].occupation.unique())

### 4.2 - Speaker's gender 

In [None]:
# Number of labeled rows per gender, most frequent first
grouped_by_gender = (
    retrieved_labeled_speakers
    .groupby(by='gender')['label']
    .agg(['count'])
    .sort_values(by='count', ascending=False)
)
grouped_by_gender.head()

In [None]:
# Share of rows whose gender is neither 'male' nor 'female'.
# The ratio is weighted by the per-gender counts: dividing the number of gender
# categories instead would measure how many distinct labels exist, not how often they occur.
# NOTE: the counts are rows of the exploded frame, so a speaker with many attribute
# combinations weighs more than a speaker with few.
other_genders_mask = ~grouped_by_gender.index.isin(['male', 'female'])
grouped_by_gender.loc[other_genders_mask, 'count'].sum() / grouped_by_gender['count'].sum()

The majority of speakers in the Quotebank are male (almost 70%). All other genders constitute about 5% of the speakers. So if we were to encode gender as features, we would have ['male', 'female', 'other'] where 'other' refers to the rest of the genders.

### 4.3 - Speaker's age

So far we've had the speakers date of birth. Let's convert that to age.

In [None]:
# TODO: plot the distribution of the speakers' ages.
# NOTE(review): the draft call below is disabled; Series.dropna does not accept
# axis='columns', which is presumably why it was commented out - confirm before enabling.
#sns.histplot(retrieved_labeled_speakers.age.dropna(axis='columns'))

### 4.4 - Speaker's party

In [None]:
# nunique() leaves missing values out, so speakers without a party are not counted as one more party
print('There are {} parties recorded for the speakers'.format(retrieved_labeled_speakers['party'].nunique()))

# The ten parties with the most labeled rows
top_10_parties = (
    retrieved_labeled_speakers
    .groupby(by='party')['label']
    .agg(['count'])
    .sort_values(by='count', ascending=False)
    .head(n=10)
)
top_10_parties

### 4.5 - Speaker's nationality

In [None]:
# nunique() leaves missing values out, so speakers without a nationality are not counted as one more nationality
print('There are {} nationalities recorded for the speakers'.format(retrieved_labeled_speakers['nationality'].nunique()))

# The ten nationalities with the most labeled rows
top_10_nationalities = (
    retrieved_labeled_speakers
    .groupby(by='nationality')['label']
    .agg(['count'])
    .sort_values(by='count', ascending=False)
    .head(n=10)
)
top_10_nationalities

## 5 - Constructing a feature matrix

If we're trying to train a model using our speakers attributes, we're going to have to think of a way to construct a feature matrix. The issue is a lot of the attributes of the speakers are categorical. 

Here are some pointers for constructing the feature matrix:
- Use a subset of the attributes as feature columns e.g. let's train our model using the speaker's gender, age (or age_group), nationality, occupation and party.
- Each attribute has a very large number of unique values. We could retain the top unique values (not a very good approach, especially if they're more or less uniformly distributed). 
- We have to find a way to encode them. (https://towardsdatascience.com/smarter-ways-to-encode-categorical-data-for-machine-learning-part-1-of-3-6dca2f71b159)

# Extras

There is **one important issue** we haven't addressed yet. For each speaker, We have **exploded all their attributes** so that each row contains attributes that have only one value in them (so not a list). However, for speakers with several attributes that have multiple values in them (e.g. a speaker with several occupations and several genders, etc...) **the explosion of one attribute may affect the overall distribution in the original data of another attribute**. For example, a speaker in the original data that has as *gender* 'transexual' and as *occupation* both 'politician' and 'basketball player', will appear twice in the final exploded dataframe, once as 'politician' and once as 'basketball player', however in both rows he will be 'transexual' and this means 'transexual' will be counted twice, meaning the distribution of gender in the final dataframe will not be the same as the one in the original dataframe.  

So the answer is yes: distributions will differ but by how much and will it affect the final result greatly?
Let's do some analysis and try to compare for example for *occupation*. We will compare the speakers data where we exploded only 'occupation' and the one where we exploded all attributes.

In [None]:
# Top 10 occupations in the dataframe where we exploded all attributes 
grouped_by_occupations = retrieved_labeled_speakers.groupby(by='occupation').label.agg(['count'])
grouped_by_occupations = grouped_by_occupations.sort_values(by='count', ascending=False)
grouped_by_occupations.head(n=10)

In [None]:
# Retrieve the original speakers data
original_speakers_df = pd.read_parquet(PATH_PARQUET + '/speaker_attributes.parquet')
original_speakers_df.set_index(keys='id', inplace=True)
original_speakers_df

In [None]:
# Let's only explode the attribute we're comparing here (occupation)
original_speakers_df = original_speakers_df.explode('occupation')
# Let's merge again on the occupation column
original_speakers_df = original_speakers_df.merge(df_wikidata_labels, left_on='occupation', right_index=True, how='left')
original_speakers_df['occupation'] = original_speakers_df['Label']
original_speakers_df.drop(columns=['Label'], inplace=True)

display(original_speakers_df)

# sort the occupations to get the top occupations
original_grouped_by_occupations = original_speakers_df.groupby(by='occupation').label.agg(['count'])
original_grouped_by_occupations = original_grouped_by_occupations.sort_values(by='count', ascending=False)

In [None]:
# Compare the distribution of the top 20 occupations between both dataframes
fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize =  (14, 8))
ax1.pie(grouped_by_occupations[:10]['count'], labels=grouped_by_occupations[:10].index, autopct='%1.1f%%',
        shadow=True, startangle=90)
ax1.set_title('Distribution of occupations after exploding all attributes')

ax2.pie(original_grouped_by_occupations[:10]['count'], labels=grouped_by_occupations[:10].index, autopct='%1.1f%%',
        shadow=True, startangle=90)
ax2.set_title('Distribution of occupations after exploding occupation only')

fig.tight_layout()

We can see that the distribution may more or less change, but the top 10 occupations stay the same and maintain the same order. However **since we only exploded *occupation*** the right chart represents the real distribution of occupations.

**So the final rule of thumb is**: only explode attributes that we're using as features or doing analysis on, to minimize distortions in the original distributions.