# Ada Final Project - EDA

In [None]:
# Python Standard Libraries
import re
import csv
import bz2
import json
import string
import pickle
from tqdm.notebook import tqdm

# Install using conda
# conda install matplotlib pandas ipywidgets beautifulsoup4 nltk
import nltk
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import matplotlib.lines as mlines
import plotly.graph_objects as go

import pandas as pd
import numpy as np

import numpy as np
import plotly.express as px

from nltk.sentiment import SentimentIntensityAnalyzer
from bs4 import BeautifulSoup


#NLP libraries
#
import spacy, nltk, gensim, sklearn
import pyLDAvis.gensim_models
import gensim.corpora as corpora
from gensim.models import LdaMulticore

from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

import syllables


%matplotlib inline

nltk.download([
     "names",
     "stopwords",
     "averaged_perceptron_tagger",
     "vader_lexicon",
     "punkt",
 ]);

In [None]:
PATH_TO_FILE = 'data/hillary-and-trump-quotes-2016.json.bz2'

CEFR_HTML_IN = "data/cefr_data.html"

CEFR_CLEAN_CSV_IN = "data/cefr_data_clean.csv"
CEFR_CSV_OUT = "data/cefr_data.csv"

JEKYLL_PLOTS_PATH = "docs/_includes/plots/"
    
CHUNK_SIZE = 100_000

RANDOM_SAMPLE_SIZE = 3_000

SEED = 92813

HC = "Hillary Clinton"
DT = "Donald Trump"

## 1. Load Data

### Initial Data Extraction

For reference we include the code we executed on Google Colab to extract all quotes by **Hillary Clinton** and **Donald Trump** during the **year 2016** from the Quotebank dataset. This was a one time operation, which is why we did it outside of this notebook. All other algorithms we apply to the data will be/have been possibly iterated on for improvement, which is where the notebook format comes in handy.

```python
PATH_TO_FILE = '/content/drive/MyDrive/Quotebank/quotes-2016.json.bz2'
PATH_TO_OUT = '/content/drive/MyDrive/hillary-and-trump-quotes-2016.json.bz2'

SPEAKER_NAMES = ['Hillary Clinton', 'Donald Trump']

hits = 0

with bz2.open(PATH_TO_FILE, 'rb') as s_file:
    with bz2.open(PATH_TO_OUT, 'wb') as d_file:

        for instance in s_file:

            instance = json.loads(instance)
            speaker = instance['speaker']

            if any(map(speaker.__contains__, SPEAKER_NAMES)):

                d_file.write((json.dumps(instance)+'\n').encode('utf-8'))

                hits += 1


print(f"Hits: {hits}")
```

In [None]:
df_1 = pd.read_json(PATH_TO_FILE, lines=True, compression='bz2') #chunksize=CHUNK_SIZE)

In [None]:
display(df_1.head())

In [None]:
#
# Produces a sample from the entire dataset as well one for each candidate
#
def get_df_samples(df, sample_size=RANDOM_SAMPLE_SIZE):
    """Draw reproducible random samples from the whole frame and from each candidate.

    Args:
        df: DataFrame with a 'speaker' column.
        sample_size: Requested number of rows per sample. If a (sub)frame has
            fewer rows than this, all of its rows are returned (shuffled)
            instead of raising.

    Returns:
        Tuple ``(sample, dt_sample, hc_sample)``: a sample of ``df``, a sample of
        the rows whose speaker is 'Donald Trump', and a sample of the rows whose
        speaker is 'Hillary Clinton'. All draws use ``random_state=SEED``.
    """
    def _draw(frame):
        # DataFrame.sample raises ValueError when n exceeds the number of rows
        # (sampling without replacement), e.g. on small filtered subsets.
        return frame.sample(n=min(sample_size, len(frame)), random_state=SEED)

    sample = _draw(df)
    dt_sample = _draw(df[df['speaker'] == 'Donald Trump'])
    hc_sample = _draw(df[df['speaker'] == 'Hillary Clinton'])

    return sample, dt_sample, hc_sample

## 2. Enhance Data

In [None]:
df_2 = df_1.copy()
#
# Take the score of the first entry in `probas` and store it as a float in the new `proba` column.
# NOTE(review): presumably the first entry is the highest-scoring one and belongs to `speaker` - confirm against the dataset docs.
#
df_2['proba'] = df_2['probas'].apply(lambda probas : float(probas[0][1]))

#
# Parse the `date` column into pandas datetimes.
# NOTE(review): this does not strip the time-of-day; later cells use `.dt.date` where a plain date is needed.
#
df_2['date'] = pd.to_datetime(df_2['date'])

df_2.head()

### Sample of enhanced, but dirty data

In [None]:
sample, dt_sample, hc_sample = get_df_samples(df_2)

## 3. EDA and  Data Cleaning

In [None]:
df_3 = df_2.copy()

We are only interested in quotes by **Hillary Clinton** and **Donald Trump** during the **year 2016**, specifically from 01/01/2016 - 01/01/2017. The subset loaded only contains the Quotebank quotes which have one (or both) of them as a possible speaker in the _speaker_ columns list and that lie in the specified time frame.

Since the dataset was obtained using a ML model to extract and assign the quotes there will most likely be quotes which are faulty and quotes which have been assigned to the wrong speaker. The goal of the data cleaning is to remove such data points so that we can focus on working with as good data as possible.

We must specify what makes a quote faulty and motivate this so that we remove as many bad quotes as possible while not removing any or as little actually correct quotes as possible.

### General EDA

In [None]:
blue_patch = mpatches.Patch(color='blue', alpha=0.4, label='Hillary Clinton')
red_patch = mpatches.Patch(color='red', alpha=0.4, label='Donald Trump')

In [None]:
#
# Histogram of all quotes grouped by Trump and CLinton for each day in 2016
#
def quotes_hist_split(df, weighted=False, use_log=False, n_bins=366):
    """Plot a two-series histogram of quote dates, one series per candidate.

    Args:
        df: DataFrame with 'speaker', 'date' and 'numOccurrences' columns.
        weighted: If True, each quote is weighted by its 'numOccurrences'
            (title says "all"); otherwise every row counts once ("unique").
        use_log: If True, use a logarithmic count axis.
        n_bins: Number of bins; the default of 366 is one per day of 2016.
    """
    per_speaker = [df[df['speaker'] == name] for name in ('Hillary Clinton', 'Donald Trump')]
    dates = [frame['date'] for frame in per_speaker]

    if weighted:
        weights = [frame['numOccurrences'] for frame in per_speaker]
        quote_kind = 'all'
    else:
        weights = None
        quote_kind = 'unique'

    y_label = "Frequency"
    if use_log:
        y_label += ' (log)'

    fig, ax = plt.subplots(figsize=(10, 5))
    fig.suptitle("Histogram of {} Quotes by H.C. and D.T during 2016".format(quote_kind))

    ax.hist(dates, weights=weights, bins=n_bins, color=["blue", "red"], alpha=0.4, log=use_log)
    ax.set_ylabel(y_label)
    # Legend handles are the module-level patches defined in an earlier cell.
    ax.legend(handles=[blue_patch, red_patch])

    plt.show()
    plt.clf()

quotes_hist_split(df_3)

![Histogram of all (Quotebank) Quotes in 2016](assets/histogram-quotebank-quotes-2016.png)

After seeing that there are suspicious periods of no or barely any quotes in the graph above, we went back to the unfiltered (whole) dataset and plotted the distribution of all quote dates. The result can be seen in the graph above (computed in Google Colab) and shows that this pattern is also present in the entire dataset, suggesting that it does not stem from a faulty filtration.

**Note: The sharp dips are due to data outages on behalf of Spinn3r (From: Quotebank - A Corpus of Quotations from a Decade of News.pdf)**

In [None]:
#
# Histogram of all occurances per quote grouped by Trump and CLinton for each day in 2016
#
def occ_hist_split(df, n_bins=100, use_log=True):
    """Plot a two-series histogram of 'numOccurrences', one series per candidate.

    Args:
        df: DataFrame with 'speaker' and 'numOccurrences' columns.
        n_bins: Number of histogram bins.
        use_log: If True, use a logarithmic count axis.
    """
    occurrences = [
        df.loc[df['speaker'] == speaker, 'numOccurrences']
        for speaker in ('Hillary Clinton', 'Donald Trump')
    ]

    fig, ax = plt.subplots(figsize=(10, 5))
    fig.suptitle("Histogram of the number of occurances of single quotes by H.C. and D.T during 2016", fontsize=14)

    ax.hist(occurrences, log=use_log, bins=n_bins, color=["blue", "red"], alpha=0.4)
    # Legend handles are the module-level patches defined in an earlier cell.
    ax.legend(handles=[blue_patch, red_patch])

    plt.show()
    plt.clf()

df_test = df_3.sort_values('numOccurrences', ascending=False)

display(df_test.head())
print()

#
#print(df_3[df_3['numOccurrences'] < 300].loc[85071])
#

occ_hist_split(df_test)

Check for duplicates with regard to the quote content.

In [None]:
df_3[df_3.duplicated(subset=['quotation'], keep=False)]

No duplicate quotes were found; hence all quotations which have been used by multiple sources are indicated by the `numOccurrences` column and further detailed by the list in the `urls` column.

### Proba assigned to Quotes

#### Investigation

**Motivation**

Certain quotes that the model assigned to Trump and Clinton have very low probabilities to actually be quotes by them inside of the text as by the computation of the model. We want to learn about the distribution of the probability of the assigned quotes so that we can take a decision on if and when to filter out certain quotes due to a too low probability computed for them by the model.

**Distribution of the Proba**

Here we plot the distribution of a random sample of Trump and Clinton quotes respectively (`RANDOM_SAMPLE_SIZE` quotes per candidate). The reason we sample separately is that we want to ensure a large enough sample pool for both candidates, which is needed since there are more quotes assigned to Trump than Clinton. We also plot the two separately to make sure that we do not miss any differences in the distribution.

In [None]:
def probas_hist_split(df, on_sample=True, n_bins=20):
    """Plot histogram and box plot of the 'proba' column for each candidate.

    The figure is a 2x2 grid: the top row holds the histograms, the bottom row
    the horizontal box plots; the left column is Hillary Clinton, the right
    column Donald Trump.

    Args:
        df: DataFrame with 'speaker' and 'proba' columns.
        on_sample: If True, plot the per-candidate samples returned by
            ``get_df_samples(df)``; otherwise plot all rows of each candidate.
        n_bins: Number of equal-width histogram bins across [0, 1].
    """
    if on_sample:
        _, dt_df, hc_df = get_df_samples(df)
    else:
        dt_df = df[df['speaker'] == 'Donald Trump']
        hc_df = df[df['speaker'] == 'Hillary Clinton']

    # Evenly spaced edges that always span exactly [0, 1]. Rounding the bin
    # width to two decimals (e.g. 1/30 -> 0.03) would make the last edge fall
    # short of 1 and silently drop the highest probabilities.
    proba_bins = np.linspace(0, 1, n_bins + 1)

    fig, axs = plt.subplots(2, 2, figsize=(10, 5))
    fig.suptitle("Distribution of the probability computed by the model to the assigned speaker", fontsize=14)

    panels = [
        (0, hc_df, 'blue', 'Hillary Clinton (N={:,})'),
        (1, dt_df, 'red', 'Donald Trump (N={:,})'),
    ]

    for col, frame, color, title in panels:
        hist_ax, box_ax = axs[0, col], axs[1, col]

        hist_ax.set(xlim=(0, 1))
        hist_ax.hist(frame['proba'], bins=proba_bins, color=color, alpha=0.4)
        hist_ax.title.set_text(title.format(len(frame)))

        box_ax.set(xlim=(0, 1))
        box_ax.boxplot(frame['proba'], vert=False)
        box_ax.set_xlabel('Computed Probability')

    axs[0, 0].set_ylabel('Frequency')

    plt.show()
    plt.clf()


#
# Run on a random sample of RANDOM_SAMPLE_SIZE quotes per speaker (for speed)
#
example = df_3.copy()
probas_hist_split(example, on_sample=True)

# The summary statistics use the per-speaker samples drawn from df_2 in an earlier cell.
print("Hillary Clinton Summary Statistics")
display(hc_sample['proba'].describe())
print()
print("Donald Trump Summary Statistics")
# Use the Trump sample here; showing hc_sample repeated Clinton's statistics.
display(dt_sample['proba'].describe())

From plotting the distribution of probabilities which the model computed to the quotes it assigned to Trump and Hillary respectively we could now act in at least 3 ways:

1. Remove any quote which is below the min probability minus a small margin because they are outliers. **I.e. Only Remove outliers.**

2. Set the cutoff even higher because we decide to consider quotes with, ex. less than 0.4 probability assigned to the candidate being the speaker too weak to consider it in further analysis.

3. A further possibility could be to remove quotes where the next best speaker assigned has a similar/close probability compared to the number one.
   
   Example: `[ [ 'Trump', 0.41 ], [ 'Kanye West', 0.39 ], ...]`

**TODO for later:** What should we do? What is scientifically sound? How do we motivate it?



**Filtering out datapoints with too low probability**

#### Filter Function

In [None]:
#
# Function to remove quote with proba below a threshold.
#
def remove_low_proba_quotes(df, threshold):
    """Return a copy of the rows whose 'proba' is at least ``threshold``.

    Args:
        df: DataFrame with a numeric 'proba' column.
        threshold: Minimum (inclusive) value of 'proba' for a row to be kept.

    Returns:
        A new DataFrame; ``df`` itself is not modified.
    """
    confident_enough = df['proba'].ge(threshold)
    return df.loc[confident_enough].copy()

example = df_3.copy()

example_threshold = 0.7

example = remove_low_proba_quotes(df_3, threshold=example_threshold)

n_removed_lines = len(df_3) - len(example)
percentage_removed = (n_removed_lines / len(df_3)) * 100

print("Result of filter with threshold: {:}\n".format(example_threshold))
print("Removed {:,.0f} datapoints or {:,.2f}% of the original dataset.\n".format(n_removed_lines, percentage_removed))

probas_hist_split(example, on_sample=True)

### Who are the speakers?

#### Investigation

In [None]:
def display_speaker_breakdown(df):
    """Display the number of non-null quotations per speaker, largest first.

    Args:
        df: DataFrame with 'speaker' and 'quotation' columns.
    """
    counts_per_speaker = df.groupby(['speaker']).count()
    ranked = counts_per_speaker.sort_values('quotation', ascending=False)
    display(ranked['quotation'])

#
# Check out the unique speakers in our dataset
#
display_speaker_breakdown(df_3)

We see that our initial, rough extraction algorithm extracted quotes by Trump's son and also some where Trump is titled "President". We will assign the "President"-titled quotes to Trump and filter out the ones by his son.

#### Re-assignment Function

In [None]:
#
# Function to assign quotes, where Trump is titlte "President" to his speaker name.
#
def assign_quotes_for_president_dt_to_dt(df):
    """Rename the "President"-titled Trump speaker labels to 'Donald Trump'.

    Only the 'speaker' column is rewritten. A frame-wide ``df.replace`` would
    also rewrite any other cell (e.g. a quotation) that happens to equal one of
    the titled variants.

    Args:
        df: DataFrame with a 'speaker' column.

    Returns:
        A new DataFrame with the normalised 'speaker' column; ``df`` is not
        modified and no rows are added or removed.
    """
    titled_variants = ['president Donald Trump', 'PRESIDENT Donald Trump', 'President Donald Trump']

    return df.assign(speaker=df['speaker'].replace(titled_variants, 'Donald Trump'))

example = df_3.copy()
example = assign_quotes_for_president_dt_to_dt(df_3)

n_removed_lines = len(df_3) - len(example)
percentage_removed = (n_removed_lines / len(df_3)) * 100

print("Result of filter:\n")
print("Removed {:,.0f} datapoints or {:,.2f}% of the original dataset.\n".format(n_removed_lines, percentage_removed))


display_speaker_breakdown(example)

#### Filter Function

In [None]:
#
# Removes quotes by different speakers
#
def only_keep_dt_and_hc_quotes(df):
    """Keep only the rows whose speaker is exactly 'Hillary Clinton' or 'Donald Trump'.

    Args:
        df: DataFrame with a 'speaker' column.

    Returns:
        The filtered rows of ``df`` (original index preserved).
    """
    wanted_speakers = ['Hillary Clinton', 'Donald Trump']
    is_candidate = df['speaker'].isin(wanted_speakers)
    return df[is_candidate]

example = only_keep_dt_and_hc_quotes(example)

print("Result of filter: \n")
display_speaker_breakdown(example)

n_removed_lines = len(df_3) - len(example)
percentage_removed = (n_removed_lines / len(df_3)) * 100

print("\n")

print("Removed {:,.0f} datapoints or {:,.2f}% of the original dataset".format(n_removed_lines, percentage_removed))

### "Nonsense" content

**TODO**: We should ideally check the quality of the quotes. There are certainly some faulty quotes and maybe even gibberish in the dataset, but it is hopefully very limited in scope. We should nonetheless attempt to look for faulty/gibberish quotes which were extracted by the model and remove them.

### The "data-gap"

In [None]:
# Unique Quotes per day of entire 2016 Quotebbank
df_original_hist = pd.read_csv("data/unique-quotes-per-day-2016.csv")[['date', 'n_unique_quotes']]
df_original_hist['date'] = pd.to_datetime(df_original_hist['date'])

fig, (left_ax, right_ax) = plt.subplots(1, 2, figsize=(15,5))

fig.suptitle("Distribution of Unique Quotes in Quotebank for 2016", fontsize=16)

left_ax.set_title("Trump and Clinton only")
left_ax.hist(df_3['date'], bins=366, color='purple', alpha=0.4)

right_ax.set_title("All Quotes")
right_ax.hist(df_original_hist['date'], weights=df_original_hist['n_unique_quotes'], bins=366, color='gray', alpha=0.4)

xticks = ['2016-03-15', '2016-06-15', '2016-09-15', '2016-12-15']
xticks_labels = ['15 March', '15 June', '15 September', '15 December']

left_ax.set_xticks(xticks)
left_ax.set_xticklabels(xticks_labels)

right_ax.set_xticks(xticks)
right_ax.set_xticklabels(xticks_labels)

plt.show()

plt.clf()

#### Patches for the Visualisations

In [None]:
#
# Each panel zooms in on one period of the per-day unique-quote counts and marks
# the chosen cutoff date(s) with a red vertical line and a red label.
# The six panels share one helper instead of six copy-pasted sections.
#
CUTOFF_LINE_HEIGHT = 50_000  # y-extent of the red lines and y-position of their labels

# One entry per panel:
#   (axes position, title, window start, window end, x ticks, x tick labels,
#    [(cutoff date, index of the histogram bin edge used as label x, x shift), ...])
GAP_PANELS = [
    ((0, 0), "Gap: February - March", '2016-02-01', '2016-03-01',
     ['2016-02-01', '2016-02-15', '2016-03-01'], ['01 Feb', '15 Feb', '01 March'],
     [('2016-02-12', 6, -0.5), ('2016-02-25', 18, -0.5)]),
    ((0, 1), "Gap: April", '2016-04-07', '2016-05-01',
     ['2016-04-07', '2016-04-15', '2016-05-01'], ['07 April', '15 April', '01 May'],
     [('2016-04-17', 3, 0)]),
    ((0, 2), "Gap: May - June", '2016-05-15', '2016-06-15',
     ['2016-05-15', '2016-06-15'], ['15 May', '15 June'],
     [('2016-06-01', 19, 0)]),
    ((1, 0), "Gap: June - July", '2016-06-15', '2016-07-15',
     ['2016-06-15', '2016-07-01', '2016-07-15'], ['15 June', '01 July', '15 July'],
     [('2016-06-30', 6, 0)]),
    ((1, 1), "Gap: September - October", '2016-09-15', '2016-10-15',
     ['2016-09-15', '2016-10-01', '2016-10-15'], ['15 September', '01 October', '15 October'],
     [('2016-10-01', 18, 0)]),
    ((1, 2), "Gap: November - December", '2016-11-15', '2016-12-15',
     ['2016-11-15', '2016-12-01', '2016-12-15'], ['15 November', '01 December', '15 December'],
     [('2016-11-30', 6, 0)]),
]


def _plot_gap_panel(ax, title, start, end, xticks, xticks_labels, cutoffs):
    """Draw one zoomed-in histogram of `df_original_hist` with its cutoff markers.

    Args:
        ax: Matplotlib axes to draw on.
        title: Panel title.
        start, end: Inclusive date window (strings) selected from `df_original_hist`.
        xticks, xticks_labels: Tick positions (date strings) and their labels.
        cutoffs: Iterable of ``(date, bin_index, x_shift)``; a red vertical line
            is drawn at ``date`` and its label is placed at
            ``bins[bin_index] + x_shift``.
    """
    ax.set_title(title)

    in_window = (df_original_hist['date'] >= start) & (df_original_hist['date'] <= end)
    gap = df_original_hist[in_window]

    # One bin per row of the selected window, weighted by that row's quote count.
    _, bins, _ = ax.hist(gap['date'], weights=gap['n_unique_quotes'], bins=len(gap),
                         color='gray', alpha=0.4, edgecolor='black', linewidth=1.2)

    ax.set_xticks(xticks)
    ax.set_xticklabels(xticks_labels)

    for cutoff, bin_index, x_shift in cutoffs:
        ax.add_line(mlines.Line2D([cutoff, cutoff], [0, CUTOFF_LINE_HEIGHT], color='red'))
        ax.text(bins[bin_index] + x_shift, CUTOFF_LINE_HEIGHT, cutoff, style='italic',
                bbox={'facecolor': 'red', 'alpha': 0.8, 'pad': 10})


fig, axs = plt.subplots(2, 3, figsize=(15,8))

fig.suptitle("Identifying the 'Cutoff' Dates", fontsize=16)

for position, *panel_spec in GAP_PANELS:
    _plot_gap_panel(axs[position], *panel_spec)

plt.show()

plt.clf()

In [None]:
#
# Removes any quotes which are inside the data gaps as found above.
#
DATE_CUTOFFS = [
    ('2016-02-12', '2016-02-25'),
    ('2016-04-17', '2016-06-01'),
    ('2016-06-30', '2016-10-01'),
    ('2016-11-30', '2017-01-01')
]

def remove_quotes_inside_data_gaps(df, date_ranges=None):
    """Keep only the rows whose 'date' lies strictly inside one of the date ranges.

    NOTE(review): despite the function name, the mask *keeps* the rows that
    fall inside the ``DATE_CUTOFFS`` ranges and drops everything else. Confirm
    that the ranges denote the periods with usable data, not the gaps.

    Args:
        df: DataFrame with a datetime 'date' column.
        date_ranges: Optional iterable of ``(start, end)`` pairs; both bounds
            are exclusive. Defaults to ``DATE_CUTOFFS``.

    Returns:
        A new DataFrame with the retained rows; ``df`` is not modified.
    """
    if date_ranges is None:
        date_ranges = DATE_CUTOFFS

    dates = df['date']
    keep = np.zeros(len(df), dtype=bool)

    # Iterate over every configured range rather than hard-coding indices 0..3,
    # so that a range added to the list later is not silently ignored.
    for start, end in date_ranges:
        keep |= ((dates > start) & (dates < end)).to_numpy()

    return df[keep].copy()

print("Pre Filter")
quotes_hist_split(df_3, weighted=False, use_log=True)
print("Post Filter")
example = remove_quotes_inside_data_gaps(df_3)
quotes_hist_split(example, weighted=False, use_log=True)

### Clean Data

Using our insights from the EDA we apply the different filters to "clean" our dataset, which is then ready for proper analysis.

In [None]:
df_cleaned = df_2.copy()
print("Pre-Cleaning:  {:>10,} Quotes".format(len(df_cleaned)))

# Reassign quotes to Trump
df_cleaned = assign_quotes_for_president_dt_to_dt(df_cleaned)

# Remove quotes which are of  other speakers
df_cleaned = only_keep_dt_and_hc_quotes(df_cleaned)

# Remove any quote inside  our identified data gaps.
df_cleaned = remove_quotes_inside_data_gaps(df_cleaned)

n_removed = len(df_2) - len(df_cleaned)
per_removed = n_removed / len(df_2)
print("Removed:       {:>10,} Quotes or {:.2%} of the Original Data".format(n_removed, per_removed))
print("Post-Cleaning: {:>10,} Quotes".format(len(df_cleaned)))

display(df_cleaned.head())
quotes_hist_split(df_cleaned, weighted=False, use_log=True)

In [None]:
# NOTE: This should  be removed and is just so the code doesn't break after (Dean) having resturctured the cleaning process.
df_3_2 = df_cleaned.copy()

#### Sample

In [None]:
cleaned_sample, dt_cleaned_sample, hc_cleaned_sample = get_df_samples(df_cleaned)

## 4. Our Focus Questions

###  Introduction & Background

#### What data  do we have?

In [None]:
n_hc_quotes = sum(df_cleaned[df_cleaned['speaker'] == 'Hillary Clinton']['numOccurrences'])
n_dt_quotes = sum(df_cleaned[df_cleaned['speaker'] == 'Donald Trump']['numOccurrences'])

labels = 'Hillary', 'Trump'
sizes = [n_hc_quotes, n_dt_quotes]
explode = (0, 0.10)

fig, (left_ax, right_ax) = plt.subplots(nrows=1, ncols=2, figsize=(10,5))

fig.suptitle("Breakdown of Quotes",  fontsize=16)

#
# Amounted Quoted
#
left_ax.set_title("Times Quoted")

n_hc_quotes = sum(df_cleaned[df_cleaned['speaker'] == 'Hillary Clinton']['numOccurrences'])
n_dt_quotes = sum(df_cleaned[df_cleaned['speaker'] == 'Donald Trump']['numOccurrences'])
total_times_quoted = sum(df_cleaned['numOccurrences'])

left_ax.pie(
    [n_hc_quotes, n_dt_quotes],
    labels=labels,
    #autopct='%1.1f%%',
    autopct=lambda p: '{:,}'.format(round(p * total_times_quoted / 100)),
    colors=['blue', 'red'],
    startangle=90,
    textprops={'size': 'larger'},
    wedgeprops = {"alpha": 0.4})

#
# Unique  Quotes
#
right_ax.set_title("Unique Quotes")

n_unique_hc_quotes = len(df_cleaned[df_cleaned['speaker'] == 'Hillary Clinton'])
n_unique_dt_quotes = len(df_cleaned[df_cleaned['speaker'] == 'Donald Trump'])

right_ax.pie(
    [n_unique_hc_quotes, n_unique_dt_quotes],
    labels=labels,
    #autopct='%1.1f%%',
    autopct=lambda p: '{:,}'.format(round(p * len(df_cleaned) / 100)),
    colors=['blue', 'red'],
    startangle=90,
    textprops={'size': 'larger'},
    wedgeprops = {"alpha": 0.4})

plt.show()

### Q1: Media Bias

Do media outlets portray Trump and Clinton differently? Do media outlets quote the two candidates equally much? Does the bias of the news outlet correlate with the quotes they report?

Let's take two of the biggest outlets with political leaning views, CNN and Breitbart, and compare the distribution of Trump quotes with Clinton quotes, and also see how positive or negative they are.

The NLTK library will also be used for sentiment analysis later on.

In [None]:
import sys
sys.path

In [None]:
import itertools
from scripts.allsides import strip_url

def flatten(_list):
    """Concatenate the elements of an iterable of iterables into one flat list (one level deep)."""
    return list(itertools.chain.from_iterable(_list))

sia = SentimentIntensityAnalyzer()
def valence(quote):
    return sia.polarity_scores(quote)["compound"]

In [None]:
full_urls = flatten(cleaned_sample["urls"].to_list())
stripped_urls = list({strip_url(url) for url in full_urls})
pd.DataFrame(pd.Series(stripped_urls).unique())

In [None]:
quotes_by_outlet = dict()

for i, row in cleaned_sample.iterrows():
    quote, speaker, urls = row["quotation"], row["speaker"], row["urls"]
    
    for url in urls:
        outlet = strip_url(url)
        
        v = quotes_by_outlet.get(outlet)
        if v is None:
            v = {"Hillary Clinton": [], "Donald Trump": []}
            
        # A quote cannot appear more than once per outlet.
        if quote not in v[speaker]:
            v[speaker].append(quote)
        quotes_by_outlet[outlet] = v

df_by_outlet = pd.DataFrame.from_dict(quotes_by_outlet, orient="index")
df_by_outlet

We load the media bias dataset. The bias is translated from survey answers ("left-center", "center", "right"...)
and the weight is based on the confidence in the bias rating (a non-linear function combining the number of votes and the agreement rate).
You can find the cleaning and transformation process in `scripts/allsides.py`.

In [None]:
df_mb = pd.read_csv("data/allsides.csv")
df_mb

In [None]:
df_mb.groupby("bias").count()["total_votes"].plot(kind="bar")

In [None]:
df_mb_valence = df_by_outlet.merge(df_mb, left_index=True, right_on="url")
df_mb_valence.index = df_mb_valence["name"]
df_mb_valence.drop("name", axis=1, inplace=True)

map_quotes = lambda quotes: [valence(q) for q in quotes]
df_mb_valence["hc_valence"] = df_mb_valence[HC].apply(map_quotes)
df_mb_valence["dt_valence"] = df_mb_valence[DT].apply(map_quotes)
            
df_mb_valence.head(5)

In [None]:
df_mb_valence[df_mb_valence.bias == 2]

In [None]:
df_hc = df_mb_valence.copy()
df_hc["valence"] = df_hc["hc_valence"]
df_hc = df_hc.drop([HC, DT, "hc_valence", "dt_valence"], axis=1)
df_hc = df_hc.explode("valence")
df_hc.head()

In [None]:
df_dt = df_mb_valence.copy()
df_dt["valence"] = df_dt["dt_valence"]
df_dt = df_dt.drop([HC, DT, "hc_valence", "dt_valence"], axis=1)
df_dt = df_dt.explode("valence")
df_dt.head()


In [None]:
from plotly.subplots import make_subplots
import plotly.graph_objects as go

In [None]:
def plot_hist2d(df, candidate):
    """Build a 2D histogram trace of media bias (x) against sentiment (y) for one candidate.

    Each row contributes its 'weight' (``histfunc="sum"``), and the result is
    normalised with ``histnorm="probability"``.

    Args:
        df: DataFrame with 'bias', 'valence' and 'weight' columns.
        candidate: Either ``DT`` or ``HC``; selects the colour and the
            horizontal position of the colour bar.

    Returns:
        A ``go.Histogram2d`` trace.

    Raises:
        ValueError: If ``candidate`` is neither ``DT`` nor ``HC``.
    """
    if candidate == DT:
        color = px.colors.qualitative.Plotly[1]
        x = 1
    elif candidate == HC:
        color = px.colors.qualitative.Plotly[0]
        x = 0.4
    else:
        # Raise a real exception: raising a plain string is itself a TypeError
        # and would hide this message.
        raise ValueError(f"Expected candidate to be one of [{DT}, {HC}]")

    args = {
        "x": df["bias"],
        "y": df["valence"],
        "z": df["weight"],
        "histfunc": "sum",
        "histnorm": "probability",
        "colorbar": dict(
            len=1.05,
            x=x,
            y=0.49,
            dtick=0.04
        ),
        # Colour scale runs from white to the candidate's colour.
        "colorscale": ["white", color],
        "showscale": True,
    }
    return go.Histogram2d(args)

In [None]:
fig = make_subplots(
    cols=2,
    shared_yaxes="columns",
    x_title='Media bias',
    y_title='Sentiment',
    horizontal_spacing=0.2,
    subplot_titles=[HC, DT])
           
# customize font and legend orientation & position
fig.update_layout(
    font_family="Rockwell",
    legend=dict(
        title=None,
        orientation="h",
        y=1,
        yanchor="bottom",
        x=0.5,
        xanchor="center",
    ),
    title="Distribution of sentiment and media bias",
    title_x=0.5,
)

fig.update_xaxes(
    tickmode = 'array',
    tickvals = [-2, -1, 0, 1, 2],
    ticktext = ["Left", "LC", "Center", "RC", "Right"],    
    tickangle = 0,
)

fig.add_trace(plot_hist2d(df_hc, HC), row=1, col=1)
fig.add_trace(plot_hist2d(df_dt, DT), row=1, col=2)

fig.show()

In [None]:
def plot_hist(fig, outlet_name):
    """Add the sentiment histograms of both candidates for one outlet to ``fig``.

    The outlet is looked up by name in the module-level ``df_mb_valence``; its
    traces go into row 1, column ``bias + 3`` of the subplot grid.
    NOTE(review): this presumably maps a bias in -2..2 onto columns 1..5 - confirm.

    Args:
        fig: Plotly figure created with subplots.
        outlet_name: Index label of the outlet in ``df_mb_valence``.

    Raises:
        ValueError: If ``outlet_name`` is not in ``df_mb_valence.index``.
    """
    if outlet_name not in df_mb_valence.index:
        raise ValueError(f"{outlet_name} is not a valid outlet name")

    outlet = df_mb_valence.loc[outlet_name]
    bias = outlet.bias
    subplot_col = bias + 3

    # Settings shared by both traces; the legend is only shown for bias == -2.
    shared = dict(
        histnorm="probability",
        showlegend=bool(bias == -2),
        ybins=dict(start=-1.1, end=1.1, size=0.2),
    )

    traces = [
        dict(**shared, name=speaker,
             marker=dict(color=px.colors.qualitative.Plotly[palette_idx]),
             y=outlet[valence_col])
        for speaker, palette_idx, valence_col in ((HC, 0, "hc_valence"), (DT, 1, "dt_valence"))
    ]

    for trace_args in traces:
        fig.add_trace(go.Histogram(trace_args), col=subplot_col, row=1)

    # The axis title is the outlet name without a trailing " (...)" qualifier.
    short_name = f"{outlet_name.split(' (')[0]}"
    fig.update_xaxes(title_text=short_name, col=subplot_col, row=1)


outlet_names = ["CNN (Online News)", "Washington Post", "The Hill", "Newsmax (News)", "Breitbart News"]
    
fig = make_subplots(
    cols=5,
    shared_yaxes=True,
    x_title='Probability of occurrence',
    y_title='Sentiment',
)

fig.update_xaxes(
    tickmode = 'array',
    tickvals = [0.2, 0.4],
    ticktext = ["", ""],    
)

fig.update_yaxes(
    tickmode = 'array',
    tickvals = [-1, -0.5, 0, 0.5, 1],
)

fig.update_layout(
    barmode='stack',
    font_family="Rockwell",
    legend=dict(
        title="Speaker",
        orientation="h",
        y=1,
        yanchor="bottom",
        x=0.5,
        xanchor="center",
    ),
    title="Distribution of sentiment for certain media outlets",
    title_x=0.5,
)

for name in outlet_names:
    plot_hist(fig, name)
    
fig.show()

### Q2: Political Topics

We want to track the different political topics that the candidates focused on according to the content of their quotes. From this we aim to learn the importance of the topics in an absolute and relative context, but also with regard to when a certain topic might have been very present and then disappeared for some time. We furthermore want to see if there might be a candidate who sparked a topic, or at least started talking/being quoted about it first.

Below we show an example of a topic regarding Obamacare, which we all remember to be an important topic during the debate.

#### Example: Obama Care

In [None]:
# Matches "obamacare" and "obama care" (case-insensitively, see flags below).
# No capture group is used: pandas' str.contains emits a "match groups"
# UserWarning for patterns that contain one.
OBAMACARE_REGEX = 'obama ?care' # Not perfect, proof of concept...

mentions_obamacare = df_3_2['quotation'].str.contains(pat=OBAMACARE_REGEX, regex=True, flags=re.IGNORECASE)

# Sort chronologically, then reduce the timestamps to plain calendar dates.
df_obamacare = df_3_2[mentions_obamacare].sort_values('date')
df_obamacare = df_obamacare.assign(date=df_obamacare['date'].dt.date)

df_obamacare_hc = df_obamacare[df_obamacare['speaker'] == 'Hillary Clinton']
df_obamacare_dt = df_obamacare[df_obamacare['speaker'] == 'Donald Trump']

In [None]:
quotes_hist_split(df_obamacare)

In [None]:
def quote_topics_histogram(df_hc, df_dt, bins=52, topic_name=""):
    """Plot side-by-side, occurrence-weighted date histograms of a topic for both candidates.

    Args:
        df_hc: Hillary Clinton rows, with 'date' and 'numOccurrences' columns.
        df_dt: Donald Trump rows, with the same columns.
        bins: Number of histogram bins.
        topic_name: Topic label inserted into the figure title.
    """
    fig, axs = plt.subplots(1, 2, figsize=(10, 5))
    fig.suptitle("Mentions of the Topic {} in 2016 by candidate".format(topic_name), fontsize=16)

    # Quarterly tick marks: 1 March, 1 June, 1 September and 1 December 2016.
    ticks = ["2016-03-01", "2016-06-01", "2016-09-01", "2016-12-01"]

    panels = (
        (axs[0], df_hc, "b", 'Hillary Clinton'),
        (axs[1], df_dt, "r", 'Donald Trump'),
    )

    for ax, frame, color, title in panels:
        ax.hist(frame['date'], weights=frame['numOccurrences'], color=color, alpha=0.4, bins=bins)
        ax.title.set_text(title)
        ax.set_xlabel('Date')
        ax.set_xticks(ticks)

    # Only the left panel carries the y-axis label.
    axs[0].set_ylabel('Frequency')

    plt.show()
    plt.clf()

In [None]:
# Example:
quote_topics_histogram(df_obamacare_hc, df_obamacare_dt, topic_name="Obamacare")

##### Observation from Example

Hillary Clinton talked less about Obamacare but instead more about Healthcare... To be explored further!

#### Topic Extraction

In [None]:
#
# If set to True the entire topic extraction pipeline will be executed which takes a lot of time.
# If set to False if is possible to execute parts of the pipeline (see sections below).
#
TOPICS_FROM_SCRATCH = False

We get a quick look of the textual data we have available.

In [None]:
#
# We create one blob of text, consisting of all quotes in the dataset.
#
merged_quotes =  ' '.join(df_cleaned['quotation'].values)
#
print("Quotes in Dataset:          {:>10,}".format(len(df_cleaned)))
print("Characters in Text:         {:>10,}".format(len(merged_quotes)))
print("Spacy maximum:              {:>10,}".format(1_000_000))
print()
print("Clinton Quotes in Dataset:  {:>10,}".format(len(df_cleaned[df_cleaned['speaker'] == 'Hillary Clinton'])))
print("Trump Quotes in Dataset:    {:>10,}".format(len(df_cleaned[df_cleaned['speaker'] == 'Donald Trump'])))

Since the amount of data is too big to run on my computer we need to break it down.

##### Prepare Data

In [None]:
daily_quotes_aggregate = df_cleaned.groupby([df_cleaned['date'].dt.date]).apply(lambda group : group['quotation'].values).to_frame(name='quotes') #, columns=['date', 'quotes'])

print("{:} days with Quotes.".format(len(daily_quotes_aggregate)))
display(daily_quotes_aggregate)

##### Defining our NLP Model

We use this model to process each quote in preparation for the topic extraction model (LDA).

In [None]:
nlp = spacy.load('en_core_web_sm') # Load Spacy English

def lemmatizer(doc):
    """Pipeline component: rebuild `doc` from the lemmas of its tokens.

    Tokens whose lemma is the '-PRON-' placeholder are dropped; the remaining
    lemmas are joined with single spaces and re-tokenized with `nlp.make_doc`.
    """
    lemmas = []
    for token in doc:
        if token.lemma_ == '-PRON-':
            continue
        lemmas.append(token.lemma_)
    return nlp.make_doc(u' '.join(lemmas))

def remove_stopwords(doc):
    """Pipeline component: return the text of every token worth keeping.

    A token is discarded when it is a stopword, is punctuation, or has fewer
    than 3 characters. The result is a plain list of strings (not a Doc).
    """
    kept = []
    for token in doc:
        if token.is_stop or token.is_punct or len(token) <= 2:
            continue
        kept.append(token.text)
    return kept

# Register the two custom components: lemmatization right after NER, stopword removal last.
# NOTE(review): passing the function object itself to add_pipe (and the '-PRON-' lemma checked
# in lemmatizer) looks like the spaCy v2 calling convention — confirm the pinned spaCy version.
nlp.add_pipe(lemmatizer,name='lemmatizer',after='ner')
nlp.add_pipe(remove_stopwords, name="stopwords", last=True)

In [None]:
NLP_PROCESS_QUOTES = False or TOPICS_FROM_SCRATCH # Should take < 15 minutes.

NLP_MODEL_PATH = 'topics/nlp-model'
DOCUMENTS_FILE = 'topics/documents.pickle'

if NLP_PROCESS_QUOTES:

    print("Processing quotes using Spacy NLP..")

    daily_quotes = daily_quotes_aggregate['quotes'].values

    # One processed document per day. The quotes of a day are joined with a
    # space: joining with '' glued the last word of one quote to the first word
    # of the next. Documents cached before this fix must be regenerated to benefit.
    documents = [nlp(' '.join(day_quotes)) for day_quotes in tqdm(daily_quotes)]

    # Write to disk; the context manager guarantees the pickle is flushed and closed.
    nlp.to_disk(NLP_MODEL_PATH)
    with open(DOCUMENTS_FILE, 'wb') as docs_file:
        pickle.dump(documents, docs_file)

else:
    print("Reading from file...")
    nlp = nlp.from_disk(NLP_MODEL_PATH)

    # NOTE: pickle.load can execute arbitrary code; only load a file this notebook produced.
    with open(DOCUMENTS_FILE, "rb") as docs_file:
        documents = pickle.load(docs_file)

print("\nSample of 1 document:\n")
print(documents[0][0:40])

##### Build Corpus and ID2words for LDA

In [None]:
def build_word_dictionary(documents, max_freq=0.5, min_wordcount=5):
    """Build the gensim id <-> word dictionary used by the LDA model.

    Parameters
    ----------
    documents : iterable of token lists, one per document.
    max_freq : passed to `filter_extremes` as `no_above`.
    min_wordcount : passed to `filter_extremes` as `no_below`.

    Returns
    -------
    The filtered `corpora.Dictionary`. The vocabulary size before and after
    filtering is printed.
    """
    words = corpora.Dictionary(documents)
    print("Words in Corpus:     {:>8,}     |  After Spacy NLP".format(len(words)))

    words.filter_extremes(no_below=min_wordcount, no_above=max_freq)
    # The message previously ended with an unmatched ')'.
    print("Words in Corpus:     {:>8,}     |  After removing words with too low/high frequency\n".format(len(words)))

    return words

def build_corpus(words, documents):
    """Convert every document into its bag-of-words form via `words.doc2bow`.

    Prints the number of documents and returns the list of bags, in the same
    order as `documents`.
    """
    corpus = []
    for document in documents:
        corpus.append(words.doc2bow(document))

    print("Documents in Corpus: {:>8}     |  1 document per day in our data range\n".format(len(corpus)))
    return corpus

In [None]:
# Build the vocabulary (default frequency cut-offs) and the bag-of-words corpus fed to LDA.
words = build_word_dictionary(documents)
corpus = build_corpus(words, documents)

##### Train LDA model to extract topics

In [None]:
#
# Read model from disk or train from scratch.
#
NEW_MODEL = False or TOPICS_FROM_SCRATCH

N_TOPICS = 50  # Number of topics to extract

GENSIM_MODEL_FILE_NAME='topics/gensim-model/lda-model.gensim'

if NEW_MODEL:
    print("Training new model...")
    model = LdaMulticore(
        corpus=corpus,
        id2word=words,
        num_topics=N_TOPICS,
        workers=6,
        passes=50,
        random_state=SEED  # fixed seed so the run is repeatable
    )
    model.save(GENSIM_MODEL_FILE_NAME)
    print("Done training new model. Saved to file...")
else:
    print("Reading from file...")
    model = LdaMulticore.load(GENSIM_MODEL_FILE_NAME)

EXAMPLE_TOPIC_ID = 0

print("Example:\n")
print("Displaying the function of  topic with ID: {:}\n".format(EXAMPLE_TOPIC_ID))
# print_topic only *returns* the formatted topic; in the middle of a cell the
# value was silently discarded, so it has to be printed explicitly.
print(model.print_topic(EXAMPLE_TOPIC_ID))
print("\nShow  top 3 words of some topics:")
display(model.show_topics(num_words=3))

In [None]:
# Visualise the extracted topics.

#visualizable_data = pyLDAvis.gensim_models.prepare(model, corpus, words)
#pyLDAvis.display(visualizable_data)

##### Assign each quote to a topic

In [None]:
#
# If set to True, every quote is re-assigned to a topic using the current model;
# otherwise the previously saved assignment is read from disk.
#
RE_ASSIGN = False or TOPICS_FROM_SCRATCH # takes a lot of time (>15 minutes).

#
# Extracting the most likely topic out of a quote (or any text).
#
def assign_topic_id(quote):
    """Return the id of the topic the LDA model considers most probable for `quote`.

    The quote is run through the `nlp` pipeline, converted to a bag of words
    with the `words` dictionary, and scored by `model`.
    """
    bag_of_words = words.doc2bow(nlp(quote))

    # (topic_id, probability) pairs, most probable first.
    ranked = sorted(
        model.get_document_topics(bag_of_words),
        key=lambda pair: pair[1],
        reverse=True,
    )

    best_pair = ranked[0]
    return best_pair[0]

#
# Dataframe with a topic id for every quote/row:
# read back from the cached CSV unless a re-assignment was requested.
#
if not RE_ASSIGN:
    print("Reading from file...")
    df_cleaned_with_topics = pd.read_csv('data/df_cleaned_with_topics.csv')
else:
    print("Re-assigning each quote/row based  on current model. (will take a while!)")
    df_cleaned['topic_id'] = df_cleaned['quotation'].apply(assign_topic_id)

    # Snapshot the annotated frame and persist it for later runs.
    df_cleaned_with_topics = df_cleaned.copy()
    df_cleaned_with_topics.to_csv('data/df_cleaned_with_topics.csv')

In [None]:
from collections import Counter

fig, ax = plt.subplots(figsize=(10,5))

fig.suptitle("Histogram of Quotes per Topic", fontsize=16)

# Number of quotes per topic id. `minlength` guarantees one bin per topic even
# when the highest topic ids received no quote; without it, np.bincount returns
# fewer values than the hard-coded range(50) and ax.bar raises a shape mismatch.
hist = np.bincount(df_cleaned_with_topics['topic_id'], minlength=N_TOPICS)

ax.bar(range(len(hist)), hist, color='green', alpha=0.4)
ax.set_xlabel("Topic  Number")
ax.set_ylabel("Frequency")

plt.show()
plt.clf()

## Q3: Language

### a) Intellectuality of Language

Idea 1: Count the syllables in each candidate's consolidated quotes and compare distributions

In [None]:
def syllables_hist(dt_text: str, hc_text: str, max_syllables: int = 4):
    """Plot the normalized distribution of syllables per word for both candidates.

    Parameters
    ----------
    dt_text, hc_text : whitespace-separated text of Donald Trump's and
        Hillary Clinton's quotes respectively.
    max_syllables : number of bars (and x ticks) to draw, starting at one
        syllable. The default of 4 reproduces the previously hard-coded plot.

    Notes
    -----
    Syllable counts come from `syllables.estimate`, i.e. they are estimates.
    The last histogram bin is closed on both sides, so words estimated at
    `max_syllables + 1` syllables are counted in the last bar; longer words
    are left out of the histogram.
    """
    dt_syl = list(map(syllables.estimate, dt_text.split()))
    hc_syl = list(map(syllables.estimate, hc_text.split()))

    kwargs = {
        "x"    : [hc_syl, dt_syl],
        "label" : ["Hillary Clinton", "Donald Trump"],
        "color" : ["b", "r"],
        "alpha" : 0.4,
        # Integer bin edges 1 .. max_syllables + 1 -> max_syllables bars.
        "bins" : range(1, max_syllables + 2),
        "density" : True,
        # Left-aligned so each bar is centred on its integer tick.
        "align": "left",
    }

    plt.hist(**kwargs)
    plt.legend()
    plt.title("Histogram of number of syllables", fontsize=16)
    plt.xlabel("Number of syllables in a Word")
    plt.ylabel("Probability of Occuring")

    ticks = range(1, max_syllables + 1)
    plt.xticks(ticks, ticks)

    plt.show()

In [None]:
# Consolidate each candidate's sampled quotes into one space-separated string.
hc_text = ' '.join(hc_cleaned_sample["quotation"])
dt_text = ' '.join(dt_cleaned_sample["quotation"])

In [None]:
# Note the argument order: Trump's text first, Clinton's second.
syllables_hist(dt_text, hc_text)

Idea 2: Look at the CEFR language level. We can measure the language level for all words in each candidate's consolidated quotes and compare the distributions.

But first we need to get data on words. We have the HTML for a web page containing these data, and we can put them in the form of a CSV file.

In [None]:
def bs_to_csv(soup, csv_writer):
    """Write the rows of the first <tbody> found in `soup` through `csv_writer`.

    A fixed header row is written first. Each <tr> then becomes one CSV row
    made of the text of its <td> cells, with the last cell dropped.
    """
    header = ["word", "guideword", "level", "part of speech", "topic"]
    csv_writer.writerow(header)

    body_rows = []
    for row in soup.find("tbody").find_all("tr"):
        cell_texts = [cell.text for cell in row.find_all("td")]
        body_rows.append(cell_texts[:-1])

    csv_writer.writerows(body_rows)

In [None]:
# Scrape the HTML file and write in CSV format to file.
# - newline='' is what the csv module expects for files handed to csv.writer;
#   without it, extra blank rows appear on platforms that translate '\n'.
# - The parser is named explicitly so the result does not depend on which
#   optional parser happens to be installed (and to avoid bs4's parser warning).
with open(CEFR_HTML_IN) as fp, open(CEFR_CSV_OUT, 'w', newline='') as csv_out:
    soup = BeautifulSoup(fp, "html.parser")
    csv_writer = csv.writer(csv_out, delimiter=',')
    bs_to_csv(soup, csv_writer)

In [None]:
# Load the CSV that the scraping cell has just written.
df_cefr = pd.read_csv(CEFR_CSV_OUT)

display(df_cefr)

The data includes multi-word idioms as well as single words; these are intelligible to a human, but matching them against our text would take extra work, so we remove every entry that consists of more than one word.
We'll only need the `word` and `level` columns, so we remove the others as well.

In [None]:
# Keep only entries made of exactly one whitespace-separated word,
# and only the two columns needed further down.
_is_single_word = [len(entry.split()) == 1 for entry in df_cefr["word"]]
df_cefr = df_cefr[_is_single_word].filter(items=["word", "level"])

Now we set everything to lowercase. Also it seems that some words have punctuation surrounding them, which is not desirable.

In [None]:
# Lowercase every word, then delete all ASCII punctuation characters from it.
df_cefr["word"] = (
    df_cefr["word"]
    .str.lower()
    .str.translate(str.maketrans('', '', string.punctuation))
)

display(df_cefr.head())

Also, the same word can appear multiple times (as there might be different phrases in which the word means something slightly different) so we aggregate the levels using the median level.

In [None]:
# CEFR label -> ordinal rank, from 1 (A1) up to 6 (C2).
cefr_level_map = {
    label: rank
    for rank, label in enumerate(("A1", "A2", "B1", "B2", "C1", "C2"), start=1)
}

In [None]:
# Independent copy of the frame; the next cell reads the original level labels from it.
df_cefr_copy = df_cefr.copy(deep=True)

In [None]:
# Replace the CEFR labels with their ordinal ranks, reading the labels from the untouched copy.

df_cefr["level"] = df_cefr_copy["level"].map(cefr_level_map)

In [None]:
# Aggregate with the median: one row per distinct word, holding the median of its
# numeric levels. reset_index turns the grouped 'word' key back into a column.

df_cefr = df_cefr.groupby("word").agg("median").reset_index()

display(df_cefr.head())

Finally we change the index to the word itself to facilitate searching.

In [None]:
# Index the table by the word itself and keep the 'level' column only.
df_cefr = df_cefr.set_index(df_cefr.word).filter(items=["level"])

Now we can finally apply this to our dataset.

In [None]:
# Read the cleaned CEFR table from CEFR_CLEAN_CSV_IN, indexed by word.
# NOTE(review): no visible cell writes df_cefr to this path — confirm how the file is produced.
cefr_data = pd.read_csv(CEFR_CLEAN_CSV_IN, index_col="word")
# Sanity check: display the level stored for the word "he".
cefr_data.loc["he"].loc["level"]

In [None]:
def cefr_level_hist(dt_text: str, hc_text: str):
    """Plot the normalized distribution of CEFR word levels for both candidates.

    Parameters
    ----------
    dt_text, hc_text : whitespace-separated text of Donald Trump's and
        Hillary Clinton's quotes. Words are looked up exactly as given, so
        callers should normalize case beforehand.

    Notes
    -----
    Levels are read from the CSV at `CEFR_CLEAN_CSV_IN` (columns `word`,
    `level`). A word missing from the table gets level 0, shown as "NA".
    """
    cefr_data = pd.read_csv(CEFR_CLEAN_CSV_IN, index_col="word")

    # A plain dict lookup replaces a pandas `.loc` call per word, which is far
    # slower over a whole corpus. The leftover `cefr_data.loc["he"]` debug
    # lookup (a no-op) was dropped.
    level_of = cefr_data["level"].to_dict()

    dt_level = [level_of.get(word, 0) for word in dt_text.split()]
    hc_level = [level_of.get(word, 0) for word in hc_text.split()]

    labels = ["NA", "A1", "A2", "B1", "B2", "C1", "C2"]

    kwargs = {
        "x"    : [hc_level, dt_level],
        "label" : ["Hillary Clinton", "Donald Trump"],
        "color" : ["b", "r"],
        "alpha" : 0.4,
        # One bin per label needs len(labels) + 1 edges (0 .. 7). The previous
        # range(0, 7) produced only six bins, so C2 words (level 6) were merged
        # into the C1 bar and the "C2" tick had no bar of its own.
        "bins" : range(0, len(labels) + 1),
        "density" : True,
        "align": "left",
    }
    plt.hist(**kwargs)
    plt.legend()
    plt.title("Histogram of English level of words", fontsize=16)
    plt.xlabel("English level")
    plt.ylabel("Probability")

    plt.xticks(range(len(labels)), labels)

    plt.show()

In [None]:
# Texts are lowercased before the lookup.
cefr_level_hist(dt_text.lower(), hc_text.lower())

There seems to be little difference in terms of language levels, but note how many words were not accounted for; this is due in part to limitations of our CEFR dataset, and also to the fact that some words appear in text in different forms (e.g. a verb conjugated to the past tense). This particular issue will be remedied by "stemming" the words to reduce them to their "base" form.

### Ideas

- Who are Trump and Clinton most often confused with? This could be answered using the other speaker attributions in the quotes list.

### b) Sentiment Analysis

In [None]:
# Number of (non-null) quotations per speaker in the sample, largest first.
cleaned_sample.groupby('speaker')['quotation'].count().sort_values(ascending=False)

#### Using NLTK’s Pre-Trained Sentiment Analyzer
We use NLTK VADER (Valence Aware Dictionary and sEntiment Reasoner) sentiment analysis tools

In [None]:
# Single analyzer instance shared by all sentiment cells below.
sia = SentimentIntensityAnalyzer()

In [None]:
# Score every sampled quote with VADER and keep only the 'compound' value
# in a new 'polarityScore' column.
trump_polarity_scores = dt_cleaned_sample['quotation'].map(sia.polarity_scores)
dt_cleaned_sample['polarityScore'] = [scores['compound'] for scores in trump_polarity_scores]

clinton_polarity_scores = hc_cleaned_sample['quotation'].map(sia.polarity_scores)
hc_cleaned_sample['polarityScore'] = [scores['compound'] for scores in clinton_polarity_scores]

In [None]:
def print_topk_quotes(df, sentiment, k):
    """Print the `k` quotes of `df` with the most extreme polarity scores.

    With `sentiment == "positive"` the highest-scoring quotes are printed,
    each prefixed by "[+]". Any other value prints the lowest-scoring quotes,
    prefixed by "[-]". `df` needs 'quotation' and 'polarityScore' columns.
    """
    wants_positive = sentiment == "positive"
    marker = "\n[+]" if wants_positive else "\n[-]"

    print("Top {} {} quotes:".format(k, sentiment))

    ranked = df.sort_values('polarityScore', ascending=not wants_positive)
    for quote in ranked['quotation'].head(k).tolist():
        print(marker, quote)

#### Donald Trump's 3 most positive and negative quotes

In [None]:
# Trump: three highest-scoring quotes, a separator, then the three lowest-scoring ones.
print_topk_quotes(dt_cleaned_sample, "positive", 3)
print("\n---\n")
print_topk_quotes(dt_cleaned_sample, "negative", 3)

#### Hillary Clinton's 3 most positive and negative quotes

In [None]:
# Clinton: three highest-scoring quotes, a separator, then the three lowest-scoring ones.
print_topk_quotes(hc_cleaned_sample, "positive", 3)
print("\n---\n")
print_topk_quotes(hc_cleaned_sample, "negative", 3)

In [None]:
# Split Trump's sampled quotes by the sign of the compound score (zero counts as positive).
_trump_is_nonneg = dt_cleaned_sample['polarityScore'] >= 0
trump_df_pos = dt_cleaned_sample[_trump_is_nonneg]
trump_df_neg = dt_cleaned_sample[dt_cleaned_sample['polarityScore'] < 0]

# One scatter series per sign: green for positive, red for negative.
for _subset, _colour, _label in ((trump_df_pos, 'g', "positive"), (trump_df_neg, 'r', "negative")):
    plt.scatter(x=_subset['date'], y=_subset['polarityScore'], color=_colour, alpha=0.3, label=_label)

plt.title("Trump's quotes compound polarity scores by date in 2016")
plt.xlabel('Date')
plt.ylabel('Polarity score')
plt.legend()

plt.show()

In [None]:
# Split Clinton's sampled quotes by the sign of the compound score (zero counts as positive).
_clinton_is_nonneg = hc_cleaned_sample['polarityScore'] >= 0
clinton_df_pos = hc_cleaned_sample[_clinton_is_nonneg]
clinton_df_neg = hc_cleaned_sample[hc_cleaned_sample['polarityScore'] < 0]

# One scatter series per sign: green for positive, red for negative.
for _subset, _colour, _label in ((clinton_df_pos, 'g', "positive"), (clinton_df_neg, 'r', "negative")):
    plt.scatter(x=_subset['date'], y=_subset['polarityScore'], color=_colour, alpha=0.3, label=_label)

plt.title("Clinton's quotes compound polarity scores by date in 2016")
plt.xlabel('Date')
plt.ylabel('Polarity score')
plt.legend()

plt.show()

#### Distribution of Hillary Clinton's and Donald Trump's compound sentiment

In [None]:
# calculate totals
# A quote is positive when its compound score is >= 0.05, negative when it is
# <= -0.05, and neutral in between (the usual VADER cut-offs).
dt_tot_quotes_count = len(dt_cleaned_sample)
dt_pos_quotes_count = int((dt_cleaned_sample['polarityScore'] >= 0.05).sum())
dt_neg_quotes_count = int((dt_cleaned_sample['polarityScore'] <= -0.05).sum())
dt_neu_quotes_count = int((np.abs(dt_cleaned_sample['polarityScore']) < 0.05).sum())

hc_tot_quotes_count = len(hc_cleaned_sample)
hc_pos_quotes_count = int((hc_cleaned_sample['polarityScore'] >= 0.05).sum())
hc_neg_quotes_count = int((hc_cleaned_sample['polarityScore'] <= -0.05).sum())
hc_neu_quotes_count = int((np.abs(hc_cleaned_sample['polarityScore']) < 0.05).sum())

# summary
print()
# Header: name spelling fixed ("Hillary"); one leading space fewer keeps the '|' aligned.
print(' ' * 36 + 'Hillary Clinton     |      Donald Trump      ')
print('                                ------------------------|------------------------')
print('Total number quotes in sample:        {:>4} ({:.1%})     |      {:>4} ({:.1%})'.format(hc_tot_quotes_count, hc_tot_quotes_count/hc_tot_quotes_count,
                                                                                               dt_tot_quotes_count, dt_tot_quotes_count/dt_tot_quotes_count))
print('Number of negative quotes:            {:>4} ({:.1%})      |      {:>4} ({:.1%})'.format(hc_neg_quotes_count, hc_neg_quotes_count/hc_tot_quotes_count,
                                                                                                dt_neg_quotes_count, dt_neg_quotes_count/dt_tot_quotes_count ))
print('Number of neutral quotes:             {:>4} ({:.1%})      |      {:>4} ({:.1%})'.format(hc_neu_quotes_count, hc_neu_quotes_count/hc_tot_quotes_count,
                                                                                                dt_neu_quotes_count, dt_neu_quotes_count/dt_tot_quotes_count))
print('Number of positive quotes:            {:>4} ({:.1%})      |      {:>4} ({:.1%})'.format(hc_pos_quotes_count, hc_pos_quotes_count/hc_tot_quotes_count,
                                                                                                dt_pos_quotes_count, dt_pos_quotes_count/dt_tot_quotes_count ))

# The footnote now states the thresholds exactly as the code applies them.
print("\n(Positive: score >= 0.05, Negative: score <= -0.05, Neutral: -0.05 < score < 0.05)")

### Sentiment analysis by topic

In [None]:
def filter_n_concatenate_dfs(hc_sample, dt_sample, topic_keywords):
    """Select the quotes matching `topic_keywords` and stack both candidates.

    `topic_keywords` is used as a case-insensitive pattern on the 'quotation'
    column of each sample. The returned DataFrame has a fresh integer index
    and the columns 'candidate', 'polarityScore' and 'quotation', with
    Hillary Clinton's rows first and Donald Trump's rows after them.
    """
    labelled_samples = [("Hillary Clinton", hc_sample), ("Donald Trump", dt_sample)]

    # Keep, per candidate, only the rows whose quotation matches the pattern.
    matches = []
    for candidate, sample_df in labelled_samples:
        keep = sample_df['quotation'].str.contains(topic_keywords, case=False)
        matches.append((candidate, sample_df[keep]))

    columns = {
        "candidate": np.concatenate([[candidate] * len(rows) for candidate, rows in matches]),
        "polarityScore": np.concatenate([rows['polarityScore'] for _, rows in matches]),
        "quotation": np.concatenate([rows['quotation'] for _, rows in matches]),
    }
    return pd.DataFrame(columns)

In [None]:
def sentimentHistogram(hc_sample, dt_sample, topic_keywords):
    """Plot overlaid sentiment histograms of both candidates for one topic.

    The quotes matching `topic_keywords` are selected with
    `filter_n_concatenate_dfs`. The figure is shown and also written as an
    HTML fragment to `JEKYLL_PLOTS_PATH`; the file name uses only the first
    five characters of `topic_keywords`.
    """
    hc_dt_filtered = filter_n_concatenate_dfs(hc_sample, dt_sample, topic_keywords)

    # plot with plotly
    fig = px.histogram(hc_dt_filtered, x="polarityScore", color="candidate", histnorm='probability',
                       barmode="overlay", nbins=40, marginal="box",
                       width=600, height=400,
                       log_y=True
    )

    # customize font and legend orientation & position
    fig.update_layout(
        font_family="Rockwell",
        legend=dict(
            title=None, orientation="h", y=1, yanchor="bottom", x=0.5, xanchor="center"
        ),
        title_text='Sentiment distribution (topic = '+topic_keywords+')', title_x=0.5,
        xaxis_title_text='Sentiment (polarity compound score)',
        # histnorm='probability' plots each candidate's share of quotes per bin,
        # not raw counts, so the axis is labelled accordingly.
        yaxis_title_text='Share of quotes (log scale)',
        # bargap=0.2, # gap between bars of adjacent location coordinates
        bargroupgap=0.05, # gap between bars of the same location coordinates
    )
    fig.update_xaxes(range=[-1, 1])
    plot_filename = JEKYLL_PLOTS_PATH + "sentiment_hist_" + topic_keywords[:5] + ".html"
    fig.write_html(plot_filename, include_plotlyjs=False, full_html=False, default_width='66%', default_height='66%')
    fig.show()

In [None]:
def quotesCountPerSpeaker(hc_sample, dt_sample, topic_keywords):
    """Bar chart of how many quotes of each candidate match `topic_keywords`.

    The figure is shown and also written as an HTML fragment to
    `JEKYLL_PLOTS_PATH`; the file name uses only the first five characters
    of `topic_keywords`.
    """
    matching_quotes = filter_n_concatenate_dfs(hc_sample, dt_sample, topic_keywords)

    # One row per candidate; descending name order puts Clinton left of Trump.
    quote_counts = (
        matching_quotes
        .groupby(['candidate'])
        .size()
        .reset_index(name='counts')
        .sort_values('candidate', ascending=False)
    )

    palette = px.colors.qualitative.Plotly
    candidate_colours = {
        'Hillary Clinton': palette[0],
        'Donald Trump': palette[1],
    }

    fig = px.bar(
        quote_counts,
        x="candidate",
        y="counts",
        color='candidate',
        color_discrete_map=candidate_colours,
        width=600,
        height=400,
        text="counts",
    )

    # Font, horizontal legend centred above the plot, titles.
    legend_style = dict(title=None, orientation="h", y=1, yanchor="bottom", x=0.5, xanchor="center")
    fig.update_layout(
        font_family="Rockwell",
        legend=legend_style,
        title_text='Total number of quotes (topic = '+topic_keywords+')',
        title_x=0.5,
        xaxis_title_text='Candidate',
        yaxis_title_text='Number of quotes',
    )

    plot_filename = JEKYLL_PLOTS_PATH + "quotes_count_per_speaker_" + topic_keywords[:5] + ".html"
    fig.write_html(plot_filename, include_plotlyjs=False, full_html=False, default_width='66%', default_height='66%')
    fig.show()

In [None]:
def sentimentClassifier(hc_sample, dt_sample, topic_keywords):
    """Grouped bar chart of negative / neutral / positive quote shares per candidate.

    The quotes matching `topic_keywords` are selected with
    `filter_n_concatenate_dfs` and classified by their compound score:
    <= -0.05 negative, >= 0.05 positive, neutral otherwise. Bars show each
    class as a fraction of that candidate's matching quotes. The figure is
    shown and also written as an HTML fragment to `JEKYLL_PLOTS_PATH`; the
    file name uses only the first five characters of `topic_keywords`.
    """
    hc_dt_filtered = filter_n_concatenate_dfs(hc_sample, dt_sample, topic_keywords)

    # create a list of our conditions
    sentiment_conditions = [
        (hc_dt_filtered['polarityScore'] <= -0.05),
        (np.abs(hc_dt_filtered['polarityScore']) < 0.05),
        (hc_dt_filtered['polarityScore'] >= 0.05)
    ]

    # create a list of the values we want to assign for each condition
    values = ['Negative &#128545;<br>(score < -0.05)', 'Neutral &#128528; <br>(-0.05 < score < 0.05)', 'Positive &#128515;<br>(score > 0.05)']

    # create a new column and use np.select to assign values to it using our lists as arguments.
    # The default must be a string as well: np.select's implicit default is the
    # integer 0, which has no common dtype with string choices and makes recent
    # NumPy versions raise TypeError. It only applies to rows matching no
    # condition (e.g. a missing score).
    hc_dt_filtered['sentiment'] = np.select(sentiment_conditions, values, default='Unknown')

    # aggregate our dataframe by sentiment and candidate
    df_sentiment_agg = hc_dt_filtered.groupby(['sentiment', 'candidate']).size().reset_index(name='counts')

    # NORMALIZATION
    # total number of matching quotes per candidate
    quote_counts = hc_dt_filtered.groupby(['candidate']).size()

    # Divide every count by its own candidate's total. Looking the total up per
    # row (instead of indexing both names directly) avoids a KeyError when one
    # candidate has no quote on the topic.
    df_sentiment_agg['counts_pct'] = (
        df_sentiment_agg['counts'] / df_sentiment_agg['candidate'].map(quote_counts)
    )

    # sort to have Clinton on the left and Trump on the right
    df_sentiment_agg = df_sentiment_agg.sort_values('candidate', ascending=False)

    fig = px.bar(df_sentiment_agg, x="sentiment", y="counts_pct",
                 color='candidate', barmode='group',
                 color_discrete_map={
                    'Hillary Clinton': px.colors.qualitative.Plotly[0],
                    'Donald Trump': px.colors.qualitative.Plotly[1],
                 },
                 width=600, height=400,
                 text="counts_pct",
                 hover_data={
                    'candidate': True,
                    'sentiment': True,
                    'counts_pct': False,
                    'counts': True,
                 }
    )

    # customize font and legend orientation & position
    fig.update_layout(
        font_family="Rockwell",
        legend=dict(
            title=None, orientation="h", y=1, yanchor="bottom", x=0.5, xanchor="center"
        ),
        title_text='Quotes sentiment classification (topic = '+topic_keywords+')', title_x=0.5,
        xaxis_title_text='Sentiment',
        yaxis_title_text='% of quotes per candidate'
    )

    # Show the fraction inside each bar, formatted as a whole percentage.
    fig.update_traces(texttemplate='%{text:.0%}', textposition='inside')
    plot_filename = JEKYLL_PLOTS_PATH + "sentiment_class_" + topic_keywords[:5] + ".html"
    fig.write_html(plot_filename, include_plotlyjs=False, full_html=False, default_width='66%', default_height='66%')
    fig.show()

In [None]:
# An empty pattern matches every quote, so these are the plots for the whole sample.
topic_keywords = ''

# Same order as before: counts, classification, histogram.
for _plot in (quotesCountPerSpeaker, sentimentClassifier, sentimentHistogram):
    _plot(hc_cleaned_sample, dt_cleaned_sample, topic_keywords)

In [None]:
topic_keywords = 'economy'

# Same order as before: counts, classification, histogram.
for _plot in (quotesCountPerSpeaker, sentimentClassifier, sentimentHistogram):
    _plot(hc_cleaned_sample, dt_cleaned_sample, topic_keywords)

# Most extreme quotes on the topic, both candidates pooled.
hc_dt_filtered = filter_n_concatenate_dfs(hc_cleaned_sample, dt_cleaned_sample, topic_keywords)
print_topk_quotes(hc_dt_filtered, "positive", 3)
print("\n---\n")
print_topk_quotes(hc_dt_filtered, "negative", 3)

In [None]:
# The pattern matches quotes mentioning either word.
topic_keywords = 'immigration|border'

# Same order as before: counts, classification, histogram.
for _plot in (quotesCountPerSpeaker, sentimentClassifier, sentimentHistogram):
    _plot(hc_cleaned_sample, dt_cleaned_sample, topic_keywords)

# Most extreme quotes on the topic, both candidates pooled.
hc_dt_filtered = filter_n_concatenate_dfs(hc_cleaned_sample, dt_cleaned_sample, topic_keywords)
print_topk_quotes(hc_dt_filtered, "positive", 3)
print("\n---\n")
print_topk_quotes(hc_dt_filtered, "negative", 3)

In [None]:
topic_keywords = 'obamacare'

# Same order as before: counts, classification, histogram.
for _plot in (quotesCountPerSpeaker, sentimentClassifier, sentimentHistogram):
    _plot(hc_cleaned_sample, dt_cleaned_sample, topic_keywords)

# Most extreme quotes on the topic, both candidates pooled.
hc_dt_filtered = filter_n_concatenate_dfs(hc_cleaned_sample, dt_cleaned_sample, topic_keywords)
print_topk_quotes(hc_dt_filtered, "positive", 3)
print("\n---\n")
print_topk_quotes(hc_dt_filtered, "negative", 3)

In [None]:
topic_keywords = 'abortion'

# Same order as before: counts, classification, histogram.
for _plot in (quotesCountPerSpeaker, sentimentClassifier, sentimentHistogram):
    _plot(hc_cleaned_sample, dt_cleaned_sample, topic_keywords)

# Most extreme quotes on the topic, both candidates pooled.
hc_dt_filtered = filter_n_concatenate_dfs(hc_cleaned_sample, dt_cleaned_sample, topic_keywords)
print_topk_quotes(hc_dt_filtered, "positive", 3)
print("\n---\n")
print_topk_quotes(hc_dt_filtered, "negative", 3)

### Sentiment analysis on the target of quotes. 

For instance, Trump calling Clinton "Crooked Hillary" would be a negative statement about Presidential candidate Clinton. This will be done for a number of politicians, including: Hillary Clinton, Nancy Pelosi, Barack Obama, Bernie Sanders, Elizabeth Warren, Ted Cruz, Mike Pence, and Mitch McConnell.

In [None]:
# Quotes of each candidate in the sample.
trump_df = sample.loc[sample['speaker'] == 'Donald Trump']
clinton_df = sample.loc[sample['speaker'] == 'Hillary Clinton']

# Quotes that mention a given politician by first or last name (case-insensitive).
# Each subset is an explicit copy: a 'polarityScore' column is assigned to these
# frames later, which on a filtered slice triggers pandas' SettingWithCopyWarning.
trump_on_clinton = trump_df[trump_df['quotation'].str.contains('clinton|hillary', case=False)].copy()
trump_on_obama = trump_df[trump_df['quotation'].str.contains('obama|barack', case=False)].copy()
trump_on_sanders = trump_df[trump_df['quotation'].str.contains('bernie|sanders', case=False)].copy()

clinton_on_trump = clinton_df[clinton_df['quotation'].str.contains('trump|donald', case=False)].copy()

In [None]:
# Score Trump's quotes about Clinton and about Obama with VADER and keep
# the 'compound' value in a 'polarityScore' column.
trump_on_clinton_PS = trump_on_clinton['quotation'].map(sia.polarity_scores)
trump_on_clinton['polarityScore'] = [scores['compound'] for scores in trump_on_clinton_PS]

trump_on_obama_PS = trump_on_obama['quotation'].map(sia.polarity_scores)
trump_on_obama['polarityScore'] = [scores['compound'] for scores in trump_on_obama_PS]

In [None]:
# Split Trump's Clinton-related quotes by the sign of the compound score (zero counts as positive).
_on_clinton_is_nonneg = trump_on_clinton['polarityScore'] >= 0
trump_on_clinton_pos = trump_on_clinton[_on_clinton_is_nonneg]
trump_on_clinton_neg = trump_on_clinton[trump_on_clinton['polarityScore'] < 0]

# One scatter series per sign: green for positive, red for negative.
for _subset, _colour, _label in ((trump_on_clinton_pos, 'g', "positive"), (trump_on_clinton_neg, 'r', "negative")):
    plt.scatter(x=_subset['date'], y=_subset['polarityScore'], color=_colour, alpha=0.3, label=_label)

plt.title("Trump's Hillary-targeted quotes compound polarity scores by date in 2016")
plt.xlabel('Date')
plt.ylabel('Polarity score')
plt.legend()

plt.show()

In [None]:
# Split Trump's Obama-related quotes by the sign of the compound score (zero counts as positive).
_on_obama_is_nonneg = trump_on_obama['polarityScore'] >= 0
trump_on_obama_pos = trump_on_obama[_on_obama_is_nonneg]
trump_on_obama_neg = trump_on_obama[trump_on_obama['polarityScore'] < 0]

# One scatter series per sign: green for positive, red for negative.
for _subset, _colour, _label in ((trump_on_obama_pos, 'g', "positive"), (trump_on_obama_neg, 'r', "negative")):
    plt.scatter(x=_subset['date'], y=_subset['polarityScore'], color=_colour, alpha=0.3, label=_label)

plt.title("Trump's Obama-targeted quotes compound polarity scores by date in 2016")
plt.xlabel('Date')
plt.ylabel('Polarity score')
plt.legend()

plt.show()

Perhaps a next step would be to compare this with a baseline ratio of positive/negative words. For instance, maybe on average people say 70% positive things and 30% negative. Knowing this could help compare results with Trump and Clinton.

### Pronoun analysis

A large part of analyzing someone's speech is not only about what they say, but how they say it. People reveal who they are through their own words. Now besides content words such as nouns, regular & action verbs, and modifiers (adjectives and adverbs), there is a separate class of words called style or function words that, on their own, do not signify anything.

However, it turns out these function words are very good at indicating the current emotion of the speaker as well as how they think since they are processed differently in the brain and their use follows a power law in most languages. For instance, when someone is depressed, they will use the pronoun "I" more frequently. There are also gender differences when it comes to function words. Women use more first-person words such as "I" or "we" whereas men prefer to use articles like "a" and "the".

How do these differences fare between Hillary Clinton and Donald Trump? Do they talk more about themselves with "I" or "me" or about others?

In [None]:
#Counting number of most frequently used pronouns in the English language by candidate

freq_pronouns = ['it','I','you','he','they','we','she','who','them','me']


def _pronoun_counts_per_quote(quotes, pronouns):
    """Return {pronoun: [number of occurrences in each quote]}.

    Each pronoun is matched as a whole word, in the spelling given or with its
    first letter capitalized. The capitalized form is needed for pronouns that
    open a sentence ("It", "He", "We", ...): a purely case-sensitive match
    missed all of them. All-caps words such as "IT" or "WHO" are still ignored.
    """
    counts = {}
    for pronoun in pronouns:
        pattern = re.compile(
            r'\b(?:%s|%s)\b' % (re.escape(pronoun), re.escape(pronoun.capitalize()))
        )
        counts[pronoun] = [len(pattern.findall(quote)) for quote in quotes]
    return counts


_trump_counts = _pronoun_counts_per_quote(trump_df['quotation'], freq_pronouns)
_clinton_counts = _pronoun_counts_per_quote(clinton_df['quotation'], freq_pronouns)

# Total occurrences of each pronoun, in the order of freq_pronouns.
trump_pronouns_spc = [sum(_trump_counts[p]) for p in freq_pronouns]
clinton_pronouns_spc = [sum(_clinton_counts[p]) for p in freq_pronouns]

# Total occurrences over all listed pronouns.
trump_nb_pronouns = sum(trump_pronouns_spc)
clinton_nb_pronouns = sum(clinton_pronouns_spc)

# Per-quote counts of "I" and "who", aligned with the rows of trump_df / clinton_df.
trump_I_count, trump_who_count = _trump_counts['I'], _trump_counts['who']
clinton_I_count, clinton_who_count = _clinton_counts['I'], _clinton_counts['who']

# Scratch accumulators of the previous loop-based version; it left them at 0.
trump_spc_count, clinton_spc_count = 0, 0

In [None]:
import plotly.graph_objects as go

# NOTE(review): `scaled_clinton_pronouns` / `scaled_trump_pronouns` are not defined in any
# visible cell, so this cell fails on a fresh kernel. They are reconstructed here as each
# pronoun's share of all counted pronouns for that candidate — confirm that this is the
# scaling originally intended.
scaled_clinton_pronouns = [
    count / clinton_nb_pronouns if clinton_nb_pronouns else 0
    for count in clinton_pronouns_spc
]
scaled_trump_pronouns = [
    count / trump_nb_pronouns if trump_nb_pronouns else 0
    for count in trump_pronouns_spc
]

fig = go.Figure().update_xaxes(categoryorder = "total descending")
fig.add_trace(go.Bar(
    x=freq_pronouns,
    y=scaled_clinton_pronouns,
    name='Clinton pronoun count',
    marker_color='blue'
))
fig.add_trace(go.Bar(
    x=freq_pronouns,
    y=scaled_trump_pronouns,
    name='Trump pronoun count',
    marker_color='red'
))

# Here we modify the tickangle of the xaxis, resulting in rotated labels.
fig.update_layout(barmode='group', xaxis_tickangle=-45)
fig.show()

Another interesting finding relates to the social class differences in language patterns. People from higher social classes tend to use more articles and prepositions compared to their lower class counterparts which use more pronouns and auxiliary verbs. This difference is known to be statistically significant.

Does Donald Trump target the lower and middle classes better? The heartland of America? It certainly helped Trump in his case, since listeners feel closer to the speaker when the word "I" is used more often, even though political advisors usually suggest to use more "we" which unfortunately creates the opposite effect, such as during John Kerry's 2004 Presidential run. However, it is very likely that knowing this wouldn't have changed the outcome in 2004 nor in 2016. Language is a powerful reflection of a person's personality and character but does not change a person on its own.

This is in fact a counterintuitive finding as well since, as a man, Donald Trump would be statistically much more likely to use more articles and nouns and less likely to use pronouns than Hillary Clinton, his female adversary. Donald Trump's high social status also does not account for this pronoun use by the Republican candidate.

#### Plotting average sentiment in sentences with select pronouns

In [None]:
# Mean compound score of the quotes containing each pronoun, in the order of freq_pronouns.
trump_sentiment_pronouns, clinton_sentiment_pronouns = [],[]

for pronoun in freq_pronouns:
    # Match the pronoun as a whole word (as given, or capitalized at the start of a
    # sentence). The previous plain substring test also matched inside other words:
    # 'he' in "the", 'me' in "some", 'it' in "with", 'I' in any capitalized "I...".
    pronoun_pattern = r'\b(?:%s|%s)\b' % (re.escape(pronoun), re.escape(pronoun.capitalize()))

    temp_trump = dt_cleaned_sample[dt_cleaned_sample['quotation'].str.contains(pronoun_pattern)]
    trump_sentiment_pronouns.append(temp_trump['polarityScore'].mean())
    temp_clinton = hc_cleaned_sample[hc_cleaned_sample['quotation'].str.contains(pronoun_pattern)]
    clinton_sentiment_pronouns.append(temp_clinton['polarityScore'].mean())

In [None]:
fig = go.Figure().update_xaxes(categoryorder = "total descending")
# The traces show the mean compound score of the quotes containing each pronoun;
# they were labelled "pronoun count", copied over from the pronoun count chart.
fig.add_trace(go.Bar(
    x=freq_pronouns,
    y=clinton_sentiment_pronouns,
    name='Clinton mean sentiment',
    marker_color='blue'
))
fig.add_trace(go.Bar(
    x=freq_pronouns,
    y=trump_sentiment_pronouns,
    name='Trump mean sentiment',
    marker_color='red'
))

# Here we modify the tickangle of the xaxis, resulting in rotated labels.
fig.update_layout(
    barmode='group',
    xaxis_tickangle=-45,
    xaxis_title_text='Pronoun',
    yaxis_title_text='Mean compound polarity score',
)
fig.show()

#### Showing random negative Trump quote containing "who"

In [None]:
import numpy as np
import random

# Row positions (within trump_df) of the quotes that contain "who" at least once.
trump_who = np.array(trump_who_count)
trump_who_indices = trump_who.nonzero()

# random.randint includes its upper bound, so randint(0, len(...)) could produce an
# index one past the end and raise IndexError; randrange excludes the upper bound.
# NOTE(review): the section heading says "negative" quote, but no sentiment filter
# is applied here — confirm which one is intended.
random_quote = random.randrange(len(trump_who_indices[0]))
trump_df['quotation'].iloc[trump_who_indices[0][random_quote]]

### "Crooked Hillary"?

Trump famously popularized the harsh phrase "Crooked Hillary" to denote the dishonesty that he perceived from his opponent. But was there some basis for this statement? It turns out, deception can also be captured by language to some degree. Most people, when telling the truth about an important situation, will use more often the pronoun "I" (single best predictor of a person's honesty*) as well as more negative emotion. However, in our case, Clinton used both less "I" and less negative emotion than Trump for most pronouns. This gives some evidence for the validity of Trump's nickname for Clinton, cruel as it is.

*Taken from Pennebaker, James W. The Secret Life of Pronouns. Bloomsbury Publishing.