In [44]:
from helpers import *

import sys

import bz2
import json

import pickle

import numpy as np
# import scipy

import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.pipeline import Pipeline

from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
# from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
# from sklearn.linear_model import SGDClassifier
# from sklearn.neural_network import MLPClassifier
from sklearn.svm import LinearSVC

# from xgboost import XGBClassifier

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.model_selection import GridSearchCV

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [16]:
# Printing without trunctions
# np.set_printoptions(threshold=sys.maxsize)
# pd.set_option('display.max_colwidth', None)

---

**TODO describe a bit about party labeling and how we only keep politicians!!**

----

## Predicting which political party does a quote's message lean towards

Our first task is to create a model that can classify the political inclinations of a single quote.

Since, our classification task strongly resembles that of NLP sentiment analysis
we applied a corresponding methodology.

Which can be summed up by the following steps by the following steps:
1. Label data (done in part2)
2. Clean quotations (augmented in part3, this part)
3. Vectorize quotations
4. Train models and select optimal model for prediction

Due to the complex nature of the task, which is to predict whether a single quote
was said by a republican or democrat politician reaching a high accuracy is very
difficult and so we had to optimize all the steps described above.

Also, as noted in the course and on various online resources. It is sometimes
better to have less data cleaning and text preprocessing in an
NLP sentiment analysis tasks.

We therfore had to find the optimal pipeline. This required finding the best 
combination of text preprocessor/cleaner, vectorizer and ML model. Given, that
this isn't an ML class we tested a few computationally simple models
(which can also be trained in a reasonable time frame) and focused rather on
optimizing the preprocessing.

### Finding the optimal level of preprocessing

Our strategy in order to complete this task is to generate a dataset of quotes
where each quote has with 5 different levels of text preprocessing, ranging 
from very light to very strong. Then we run cross validation with a few simple models 
(to keep execution time reasonable) and aggregate a few performance metrics to
identify which level of preprocessing yielded the best performance with our model.

Unlike for the final classification pipeline/model we perform all this only with 
quotes from 2020 since it is a reasonably sized snapshot of the data for this task.

Each level of preprocessing was given a 1 letter name A,B,...,E.
Here is a description of the 5 different levels of preprocessing:
- A: Some trivial cleanup, removing digits and diacritics.
- B: All steps in A + casefolding and removing punctuation.
- C: All steps in B + removing stopwords.
- D: All steps in C + stemming. Using the snowball stemmer.
- E: All steps in C + lemmatization. Using nltk's WordNetLemmatizer.

We perform the analysis in the cells below. We first prepare the data
before running out tests.

In [17]:
# Load preprocessed data containing all variants of text processing
path = fixpath(QUOTES_2020_LABELED_CLEANED_VARIANTS)
# path = fixpath(QUOTES_2020_LABELED_CLEANED_VARIANTS)

df_raw = pd.read_json(path, orient='records', lines=True)
print(df_raw.shape)
df_raw.head()

(349146, 13)


Unnamed: 0,quoteID,quotation,speaker,date,numOccurrences,id,party_label,US_congress_bio_ID,quotation_cleanA,quotation_cleanB,quotation_cleanC,quotation_cleanD,quotation_cleanE
0,2020-01-16-000088,[ Department of Homeland Security ] was livid ...,sue myrick,2020-01-16 12:00:13,1,Q367796,R,M001134,[ Department of Homeland Security ] was livid ...,department of homeland security was livid and ...,department homeland security livid strongly ur...,depart homeland secur livid strong urg agenda ...,department homeland security livid strongly ur...
1,2020-01-26-000499,a few of the candidates who will do better in ...,dave loebsack,2020-01-26 13:21:36,11,Q771586,D,L000565,a few of the candidates who will do better in ...,a few of the candidates who will do better in ...,candidates better part world,candid better part world,candidate better part world
2,2020-01-26-040663,"The generational thing is important, quite hon...",dave loebsack,2020-01-26 13:21:36,11,Q771586,D,L000565,"The generational thing is important, quite hon...",the generational thing is important quite hone...,generational thing important quite honestly th...,generat thing import quit honest think everyth...,generational thing important quite honestly th...
3,2020-01-20-000982,a host of other protections,debbie lesko,2020-01-20 15:32:48,1,Q16731415,R,L000589,a host of other protections,a host of other protections,host protections,host protect,host protection
4,2020-01-09-060095,"No, are you kidding? The way that this place l...",debbie lesko,2020-01-09 23:15:21,1,Q16731415,R,L000589,"No, are you kidding? The way that this place l...",no are you kidding the way that this place lea...,kidding way place leaks around,kid way place leak around,kidding way place leak around


In [18]:
df = df_raw.copy()

We drop very short quotes as a quote that is particularly short will give us
little information and would most likely be irrelevant in the classification
task. We just drop quotes shorter than 90% of all other quotes.

In [19]:
# Droping short quotes. Quotes shorter than 90% of all other quotes
df = drop_short_quotes(df, threshold_quantile=0.1, quote_col_name='quotation_cleanE')
df.shape

(311839, 13)

In [20]:
# Checking if dataset is balanced
df['party_label'].value_counts()

D    194756
R    117083
Name: party_label, dtype: int64

In [21]:
# Data is unbalanced. Since we have a lot of data we just downsample
df = downsample(df, 'party_label')

In [22]:
# Checking if the data is well balanced now
df['party_label'].value_counts()

D    117083
R    117083
Name: party_label, dtype: int64

In [23]:
# Several different sized version of the data for convenience. Since some
# models we test take long to train. Our final prediction for best level of
# preprocessing will be done on the full data frame (~220k quotes from 2020)
# that we generated above.

df_micro = df.sample(1000)
df_mini = df.sample(10000)
df = df.sample(frac=1)

In [30]:
def test_classifer(df, pipeline, break_after_one_iter=False):
    """
    Function to test different all version of preprocessed quotes with a given
    classifer.
    """
    
    cols = [
        'quotation_cleanA',
        'quotation_cleanB',
        'quotation_cleanC',
        'quotation_cleanD',
        'quotation_cleanE',
    ]

    for col in cols:
        
        # Get quotation preprocessing variant
        X = df[col].values

        # Get label and convert to useful format
        y = df['party_label'].values
        y = convert_labels(y)

        # Run cross validation with different metrics
        # scoring=['accuracy', 'precision', 'recall', 'f1']
        scoring=['accuracy', 'f1']
        res = cross_validate(pipeline, X, y, scoring=scoring, cv=3)
        res.pop('score_time')

        # Print results
        print(f'Col: {col}')
        print_cross_validate_results(res)
        
        if break_after_one_iter:
            break

On each level of preprocessed text we run cross validations with 3 different ML models. 
Multinomial Naive Bayes, LogisticRegression and Gradient Boosted Trees.

We only use Tfidf vectorization but test different levels of N-gram expansion.

In [31]:
pipeline = Pipeline([
            ('vect', TfidfVectorizer()),
            ('clf', MultinomialNB()),
        ])

test_classifer(df, pipeline)

Col: quotation_cleanA
	fit_time             - 	avg: 3.628	std: 0.163
	test_accuracy        - 	avg: 0.671	std: 0.001
	test_f1              - 	avg: 0.673	std: 0.001
Col: quotation_cleanB
	fit_time             - 	avg: 3.505	std: 0.100
	test_accuracy        - 	avg: 0.671	std: 0.001
	test_f1              - 	avg: 0.673	std: 0.001
Col: quotation_cleanC
	fit_time             - 	avg: 2.122	std: 0.022
	test_accuracy        - 	avg: 0.671	std: 0.002
	test_f1              - 	avg: 0.671	std: 0.001
Col: quotation_cleanD
	fit_time             - 	avg: 2.341	std: 0.304
	test_accuracy        - 	avg: 0.662	std: 0.002
	test_f1              - 	avg: 0.661	std: 0.001
Col: quotation_cleanE
	fit_time             - 	avg: 2.533	std: 0.188
	test_accuracy        - 	avg: 0.668	std: 0.002
	test_f1              - 	avg: 0.667	std: 0.001


In [32]:
pipeline = Pipeline([
            ('vect', TfidfVectorizer()),
            ('clf', LogisticRegression(max_iter=1000)),
        ])

test_classifer(df, pipeline)

Col: quotation_cleanA
	fit_time             - 	avg: 10.666	std: 0.746
	test_accuracy        - 	avg: 0.670	std: 0.001
	test_f1              - 	avg: 0.666	std: 0.000
Col: quotation_cleanB
	fit_time             - 	avg: 11.970	std: 2.888
	test_accuracy        - 	avg: 0.670	std: 0.001
	test_f1              - 	avg: 0.666	std: 0.000
Col: quotation_cleanC
	fit_time             - 	avg: 7.622	std: 2.836
	test_accuracy        - 	avg: 0.669	std: 0.001
	test_f1              - 	avg: 0.665	std: 0.000
Col: quotation_cleanD
	fit_time             - 	avg: 4.686	std: 0.382
	test_accuracy        - 	avg: 0.664	std: 0.001
	test_f1              - 	avg: 0.659	std: 0.000
Col: quotation_cleanE
	fit_time             - 	avg: 5.684	std: 0.295
	test_accuracy        - 	avg: 0.668	std: 0.001
	test_f1              - 	avg: 0.663	std: 0.000


In [40]:
pipeline = Pipeline([
            ('vect', TfidfVectorizer()),
            ('clf', GradientBoostingClassifier()),
        ])

test_classifer(df, pipeline)

Col: quotation_cleanA
	fit_time             - 	avg: 157.447	std: 2.448
	test_accuracy        - 	avg: 0.589	std: 0.001
	test_f1              - 	avg: 0.494	std: 0.003


KeyboardInterrupt: 

In [45]:
pipeline = Pipeline([
            ('vect', TfidfVectorizer()),
            ('clf', LinearSVC()),
        ])

test_classifer(df, pipeline)

Col: quotation_cleanA
	fit_time             - 	avg: 6.733	std: 0.309
	test_accuracy        - 	avg: 0.674	std: 0.001
	test_f1              - 	avg: 0.672	std: 0.001
Col: quotation_cleanB
	fit_time             - 	avg: 5.830	std: 0.211
	test_accuracy        - 	avg: 0.674	std: 0.001
	test_f1              - 	avg: 0.672	std: 0.001
Col: quotation_cleanC
	fit_time             - 	avg: 4.404	std: 0.248
	test_accuracy        - 	avg: 0.672	std: 0.001
	test_f1              - 	avg: 0.670	std: 0.000
Col: quotation_cleanD
	fit_time             - 	avg: 4.410	std: 0.185
	test_accuracy        - 	avg: 0.666	std: 0.001
	test_f1              - 	avg: 0.664	std: 0.001
Col: quotation_cleanE
	fit_time             - 	avg: 4.574	std: 0.207
	test_accuracy        - 	avg: 0.669	std: 0.001
	test_f1              - 	avg: 0.667	std: 0.000


Now we'll try using using unigrams and bigrams!

In [33]:
pipeline = Pipeline([
            ('vect', TfidfVectorizer(ngram_range=(1,2))),
            ('clf', MultinomialNB()),
        ])

test_classifer(df, pipeline)

Col: quotation_cleanA
	fit_time             - 	avg: 11.847	std: 0.019
	test_accuracy        - 	avg: 0.707	std: 0.002
	test_f1              - 	avg: 0.709	std: 0.002
Col: quotation_cleanB
	fit_time             - 	avg: 11.856	std: 0.215
	test_accuracy        - 	avg: 0.707	std: 0.002
	test_f1              - 	avg: 0.709	std: 0.002
Col: quotation_cleanC
	fit_time             - 	avg: 8.975	std: 0.102
	test_accuracy        - 	avg: 0.716	std: 0.002
	test_f1              - 	avg: 0.715	std: 0.002
Col: quotation_cleanD
	fit_time             - 	avg: 8.662	std: 0.410
	test_accuracy        - 	avg: 0.713	std: 0.002
	test_f1              - 	avg: 0.712	std: 0.002
Col: quotation_cleanE
	fit_time             - 	avg: 8.806	std: 0.152
	test_accuracy        - 	avg: 0.715	std: 0.002
	test_f1              - 	avg: 0.714	std: 0.002


In [42]:
pipeline = Pipeline([
            ('vect', TfidfVectorizer(ngram_range=(1,2))),
            ('clf', LogisticRegression(max_iter=1000)),
        ])

test_classifer(df, pipeline)

Col: quotation_cleanA
	fit_time             - 	avg: 28.409	std: 1.658
	test_accuracy        - 	avg: 0.695	std: 0.002
	test_f1              - 	avg: 0.690	std: 0.002
Col: quotation_cleanB
	fit_time             - 	avg: 29.686	std: 5.530
	test_accuracy        - 	avg: 0.695	std: 0.002
	test_f1              - 	avg: 0.690	std: 0.002
Col: quotation_cleanC
	fit_time             - 	avg: 40.314	std: 4.547
	test_accuracy        - 	avg: 0.700	std: 0.002
	test_f1              - 	avg: 0.694	std: 0.001
Col: quotation_cleanD
	fit_time             - 	avg: 36.972	std: 4.372
	test_accuracy        - 	avg: 0.699	std: 0.002
	test_f1              - 	avg: 0.693	std: 0.002
Col: quotation_cleanE
	fit_time             - 	avg: 43.261	std: 2.992
	test_accuracy        - 	avg: 0.700	std: 0.003
	test_f1              - 	avg: 0.695	std: 0.002


In [46]:
pipeline = Pipeline([
            ('vect', TfidfVectorizer(ngram_range=(1,2))),
            ('clf', LinearSVC()),
        ])

test_classifer(df, pipeline)

Col: quotation_cleanA
	fit_time             - 	avg: 18.149	std: 1.085
	test_accuracy        - 	avg: 0.708	std: 0.002
	test_f1              - 	avg: 0.706	std: 0.002
Col: quotation_cleanB
	fit_time             - 	avg: 17.297	std: 1.203
	test_accuracy        - 	avg: 0.708	std: 0.002
	test_f1              - 	avg: 0.706	std: 0.002
Col: quotation_cleanC
	fit_time             - 	avg: 12.830	std: 0.049
	test_accuracy        - 	avg: 0.718	std: 0.003
	test_f1              - 	avg: 0.716	std: 0.002
Col: quotation_cleanD
	fit_time             - 	avg: 10.893	std: 0.191
	test_accuracy        - 	avg: 0.714	std: 0.003
	test_f1              - 	avg: 0.712	std: 0.002
Col: quotation_cleanE
	fit_time             - 	avg: 11.049	std: 0.225
	test_accuracy        - 	avg: 0.717	std: 0.003
	test_f1              - 	avg: 0.714	std: 0.003


Once again the best performing classifer is Multinomial Naive Bayes, its also
the fastest one to train.

How about adding trigrams too! We don't train all 3 models as training times
get out of hand.

In [41]:
pipeline = Pipeline([
            ('vect', TfidfVectorizer(ngram_range=(1,3))),
            ('clf', MultinomialNB()),
        ])

test_classifer(df, pipeline)

Col: quotation_cleanA
	fit_time             - 	avg: 28.222	std: 0.143
	test_accuracy        - 	avg: 0.716	std: 0.003
	test_f1              - 	avg: 0.717	std: 0.003
Col: quotation_cleanB
	fit_time             - 	avg: 27.320	std: 0.790
	test_accuracy        - 	avg: 0.716	std: 0.003
	test_f1              - 	avg: 0.717	std: 0.003
Col: quotation_cleanC
	fit_time             - 	avg: 18.252	std: 0.441
	test_accuracy        - 	avg: 0.724	std: 0.002
	test_f1              - 	avg: 0.721	std: 0.002
Col: quotation_cleanD
	fit_time             - 	avg: 17.606	std: 0.456
	test_accuracy        - 	avg: 0.722	std: 0.002
	test_f1              - 	avg: 0.720	std: 0.002
Col: quotation_cleanE
	fit_time             - 	avg: 17.822	std: 0.225
	test_accuracy        - 	avg: 0.724	std: 0.002
	test_f1              - 	avg: 0.721	std: 0.002


In [47]:
pipeline = Pipeline([
            ('vect', TfidfVectorizer(ngram_range=(1,3))),
            ('clf', LinearSVC()),
        ])

test_classifer(df, pipeline)

Col: quotation_cleanA
	fit_time             - 	avg: 37.191	std: 1.032
	test_accuracy        - 	avg: 0.718	std: 0.001
	test_f1              - 	avg: 0.715	std: 0.001
Col: quotation_cleanB
	fit_time             - 	avg: 32.412	std: 0.392
	test_accuracy        - 	avg: 0.719	std: 0.001
	test_f1              - 	avg: 0.715	std: 0.001
Col: quotation_cleanC
	fit_time             - 	avg: 21.432	std: 0.398
	test_accuracy        - 	avg: 0.724	std: 0.002
	test_f1              - 	avg: 0.721	std: 0.002
Col: quotation_cleanD
	fit_time             - 	avg: 20.245	std: 0.462
	test_accuracy        - 	avg: 0.723	std: 0.002
	test_f1              - 	avg: 0.720	std: 0.002
Col: quotation_cleanE
	fit_time             - 	avg: 21.807	std: 1.688
	test_accuracy        - 	avg: 0.724	std: 0.002
	test_f1              - 	avg: 0.720	std: 0.002


Here we achieve our best accuracy of 72.4%, with both LinearSVC and MultinomialNB. Eventhough, it is a rather low score we found it to be acceptable given the difficulty of the task. After all predicting predicting the political party of a speaker based on one quote alone is quite a feat. We assume this could be further improved by using word2vec or BERT but that would be beyond the scope of this project.

Later, we will also aggregate our predictions to predict the political affiliation of a speaker based on all the quotes that are attributed to them and as such we can expect a better performance there!

In [None]:
pipeline = Pipeline([
            ('vect', TfidfVectorizer(ngram_range=(1,4))),
            ('clf', LinearSVC()),
        ])

test_classifer(df, pipeline)

In [None]:
pipeline = Pipeline([
            ('vect', TfidfVectorizer(ngram_range=(2,2))),
            ('clf', MultinomialNB()),
        ])

test_classifer(df, pipeline)

## Training our model on all our dataset

Now that we have established the best level of preprocessing and chosen 
a vectorizer and model we can train it on all the relevant quotebank data (6 million quotes).

From now and onwards we only use our most optimal text
cleaning/preprocessing (option E) which as mentioned before is our most thorough
version of cleaning and includes lemmatization.

We start by loading and preparing the data like we did for the preprocessed file
that contained all the text preprocessing variants.

In [3]:
path = fixpath(QUOTES_LABELED_CLEANED)
# path = fixpath(QUOTES_2020_LABELED_CLEANED)
df_raw = pd.read_json(path, orient='records', lines=True)
df_raw

Unnamed: 0,quoteID,quotation,speaker,date,numOccurrences,id,party_label,US_congress_bio_ID,quotation_clean
0,2020-01-16-000088,[ Department of Homeland Security ] was livid ...,sue myrick,2020-01-16 12:00:13,1,Q367796,R,M001134,department homeland security livid strongly ur...
1,2020-01-26-000499,a few of the candidates who will do better in ...,dave loebsack,2020-01-26 13:21:36,11,Q771586,D,L000565,candidate better part world
2,2020-01-26-040663,"The generational thing is important, quite hon...",dave loebsack,2020-01-26 13:21:36,11,Q771586,D,L000565,generational thing important quite honestly th...
3,2020-01-20-000982,a host of other protections,debbie lesko,2020-01-20 15:32:48,1,Q16731415,R,L000589,host protection
4,2020-01-09-060095,"No, are you kidding? The way that this place l...",debbie lesko,2020-01-09 23:15:21,1,Q16731415,R,L000589,kidding way place leak around
...,...,...,...,...,...,...,...,...,...
349141,2020-02-10-100845,We're just worried about making sure we keep t...,sherrie sprenger,2020-02-10 00:00:00,1,Q7495360,R,,worried making sure keep balance expression ri...
349142,2020-03-16-079753,We've all embraced strict proper hygiene proce...,robert abrams,2020-03-16 12:00:00,2,Q2156314,D,,embraced strict proper hygiene procedure heard...
349143,2020-01-13-091997,What's important is that we keep moving forward.,laurie jinkins,2020-01-13 19:51:15,1,Q6501617,D,,important keep moving forward
349144,2020-02-20-093793,"When they want a bill, they bring the bill to ...",j.t. wilcox,2020-02-20 22:12:45,1,Q6104393,R,,want bill bring bill floor


Dropping unneeded columns and again we drop short quotes.

In [4]:
df = df_raw.copy()
df.drop(['numOccurrences','US_congress_bio_ID'], axis=1, inplace=True)
df = drop_short_quotes(df, 0.1)
df

Unnamed: 0,quoteID,quotation,speaker,date,id,party_label,quotation_clean
0,2020-01-16-000088,[ Department of Homeland Security ] was livid ...,sue myrick,2020-01-16 12:00:13,Q367796,R,department homeland security livid strongly ur...
1,2020-01-26-000499,a few of the candidates who will do better in ...,dave loebsack,2020-01-26 13:21:36,Q771586,D,candidate better part world
2,2020-01-26-040663,"The generational thing is important, quite hon...",dave loebsack,2020-01-26 13:21:36,Q771586,D,generational thing important quite honestly th...
3,2020-01-20-000982,a host of other protections,debbie lesko,2020-01-20 15:32:48,Q16731415,R,host protection
4,2020-01-09-060095,"No, are you kidding? The way that this place l...",debbie lesko,2020-01-09 23:15:21,Q16731415,R,kidding way place leak around
...,...,...,...,...,...,...,...
349141,2020-02-10-100845,We're just worried about making sure we keep t...,sherrie sprenger,2020-02-10 00:00:00,Q7495360,R,worried making sure keep balance expression ri...
349142,2020-03-16-079753,We've all embraced strict proper hygiene proce...,robert abrams,2020-03-16 12:00:00,Q2156314,D,embraced strict proper hygiene procedure heard...
349143,2020-01-13-091997,What's important is that we keep moving forward.,laurie jinkins,2020-01-13 19:51:15,Q6501617,D,important keep moving forward
349144,2020-02-20-093793,"When they want a bill, they bring the bill to ...",j.t. wilcox,2020-02-20 22:12:45,Q6104393,R,want bill bring bill floor


In [8]:
df['party_label'].value_counts()

D    194756
R    117083
Name: party_label, dtype: int64

Rebalance the data by downsampling

In [10]:
df = downsample(df, 'party_label')

In [11]:
df['party_label'].value_counts()

D    117083
R    117083
Name: party_label, dtype: int64

In [None]:
df_bcp = df.copy()

## Model Training

In [13]:
# df_mini = df_filt.sample(100000)
# df_mini = df_filt.sample(frac=1)

# df

df

In [14]:
X = df['quotation_clean'].values

y = df['party_label'].values
y = convert_labels(y)

In [15]:
# X_train, X_test, y_train, y_test = train_test_split(
#     X, y, test_size=0.33, random_state=42)

split_frac = 0.9

bound = round(len(X)*split_frac)

X_train = X[:bound]
y_train = y[:bound]

X_test = X[bound:]
y_test = y[bound:]

In [16]:
X_train.shape

(90000,)

In [16]:
def vectorize_with_file(X, vectorizer):
    path_temp = TEMP_FILE

    with open(path_temp, 'w') as d_file:
        d_file.writelines(X + '\n')

    with open(path_temp, 'r') as s_file:
        X_vect=vectorizer.fit_transform(s_file)

    return X_vect

In [26]:
# Vectorize by writing to file
# vectorizer=TfidfVectorizer(ngram_range=(1,3))
# X_vect = vectorize_with_file(X_train, vectorizer)

In [18]:
# Vectorize in memory
vectorizer=TfidfVectorizer()
X_vect = vectorizer.fit_transform(X_train)

In [19]:
# with open(TEMP_FOLDER + 'quotes_vectorized_ngram=(1,3).pkl', 'wb') as file:
#     pickle.dump(X_vect, file)

In [20]:
X_test_vect = vectorizer.transform(X_test)

In [21]:
X_vect.shape

(90000, 29034)