In [1]:
from helpers import *

import sys

import bz2
import json

import pickle

import numpy as np

import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC

%load_ext autoreload
%autoreload 2

## Training our model on our entire dataset

Now that we have established the best level of preprocessing and chosen 
a vectorizer and model, we can train it on all the filtered Quotebank data (about 6 million quotes).
We will use an 80:20 train:test split. 

From now on we only use our best text
cleaning/preprocessing (option E), which, as mentioned before, is our most thorough
version of cleaning and includes lemmatization.

We start by loading and preparing the data like we did for the preprocessed file
that contained all the text preprocessing variants.

In [2]:
# Resolve the on-disk location of the labeled, cleaned quotes via the project helper.
path = fixpath(QUOTES_LABELED_CLEANED)
# Alternative input kept for quick experiments (presumably the 2020-only subset; confirm):
# path = fixpath(QUOTES_2020_LABELED_CLEANED)
# lines=True: the file is read as JSON Lines, one record per line.
df_raw = pd.read_json(path, orient='records', lines=True)
df_raw

Unnamed: 0,quoteID,quotation,speaker,date,id,party_label,quotation_clean
0,2015-08-31-000271,... a great day for veterans here in Littleton...,Jeanne Shaheen,2015-08-31 02:10:00,Q270316,D,great day veteran littleton across north count...
1,2015-02-03-074650,The safety and security of our nation is at ri...,Jeanne Shaheen,2015-02-03 20:02:24,Q270316,D,safety security nation risk hold funding homel...
2,2015-05-10-027625,It's been my experience that the faith communi...,Jeanne Shaheen,2015-05-10 22:48:46,Q270316,D,experience faith community willing partner gov...
3,2015-09-01-113454,we need to bring the parents in so they can se...,Jeanne Shaheen,2015-09-01 02:12:00,Q270316,D,need bring parent see manufacturing worked sho...
4,2015-10-25-000242,"' It is not now, nor has it ever been, the gol...",Bernie Sanders,2015-10-25 14:12:35,Q359442,D,ever gold standard trade agreement sander said...
...,...,...,...,...,...,...,...
6449996,2020-02-09-057933,"Who will respect different types of people, di...",Susan Wild,2020-02-09 15:08:23,Q58323072,D,respect different type people different opinio...
6449997,2020-03-27-006868,Breaking news: Congressman Massie has tested p...,John Kerry,2020-03-27 21:57:02,Q22316,D,breaking news congressman massie tested positi...
6449998,2020-02-02-027604,"I've got news for you, Nancy Pelosi is 79,",John Kerry,2020-02-02 14:07:56,Q22316,D,got news nancy pelosi
6449999,2020-01-08-026410,I did not succeed without the help of others,George W. Bush,2020-01-08 20:08:51,Q207,R,succeed without help others


We drop the rows we do not need and, as before, drop short quotes.

In [3]:
# Leave out quotes from speakers labeled 'RD' (people who belonged to both parties).
# The most popular such speakers were labeled manually, so they carry 'R' or 'D' instead.
# Boolean indexing returns a new frame, so df_raw itself is left untouched.
df = df_raw.loc[df_raw['party_label'].ne('RD')]

# Remove short quotes with the project helper.
# NOTE(review): the meaning of the 0.1 argument is defined in helpers; confirm there.
df = drop_short_quotes(df, 0.1)
df

Unnamed: 0,quoteID,quotation,speaker,date,id,party_label,quotation_clean
0,2015-08-31-000271,... a great day for veterans here in Littleton...,Jeanne Shaheen,2015-08-31 02:10:00,Q270316,D,great day veteran littleton across north count...
1,2015-02-03-074650,The safety and security of our nation is at ri...,Jeanne Shaheen,2015-02-03 20:02:24,Q270316,D,safety security nation risk hold funding homel...
2,2015-05-10-027625,It's been my experience that the faith communi...,Jeanne Shaheen,2015-05-10 22:48:46,Q270316,D,experience faith community willing partner gov...
3,2015-09-01-113454,we need to bring the parents in so they can se...,Jeanne Shaheen,2015-09-01 02:12:00,Q270316,D,need bring parent see manufacturing worked sho...
4,2015-10-25-000242,"' It is not now, nor has it ever been, the gol...",Bernie Sanders,2015-10-25 14:12:35,Q359442,D,ever gold standard trade agreement sander said...
...,...,...,...,...,...,...,...
6449995,2020-02-20-093793,"When they want a bill, they bring the bill to ...",J.T. Wilcox,2020-02-20 22:12:45,Q6104393,R,want bill bring bill floor
6449996,2020-02-09-057933,"Who will respect different types of people, di...",Susan Wild,2020-02-09 15:08:23,Q58323072,D,respect different type people different opinio...
6449997,2020-03-27-006868,Breaking news: Congressman Massie has tested p...,John Kerry,2020-03-27 21:57:02,Q22316,D,breaking news congressman massie tested positi...
6449999,2020-01-08-026410,I did not succeed without the help of others,George W. Bush,2020-01-08 20:08:51,Q207,R,succeed without help others


In [4]:
# Count quotes per party label to check the class balance before rebalancing.
df['party_label'].value_counts()

R    2990528
D    2650684
Name: party_label, dtype: int64

Rebalance the data by downsampling

In [5]:
# Rebalance the classes with the project helper; the counts displayed afterwards
# show both labels at the size of the smaller class.
# NOTE(review): this rebinds `df`, so re-running the cell operates on the already
# downsampled frame. Whether downsample2 is seeded is not visible here; confirm.
df=downsample2(df)
df.party_label.value_counts()

D    2650684
R    2650684
Name: party_label, dtype: int64

In [6]:
# Display the rebalanced frame (pandas truncates the rendering to the first and last rows).
df

Unnamed: 0,quoteID,quotation,speaker,date,id,party_label,quotation_clean
0,2015-08-31-000271,... a great day for veterans here in Littleton...,Jeanne Shaheen,2015-08-31 02:10:00,Q270316,D,great day veteran littleton across north count...
1,2015-02-03-074650,The safety and security of our nation is at ri...,Jeanne Shaheen,2015-02-03 20:02:24,Q270316,D,safety security nation risk hold funding homel...
2,2015-05-10-027625,It's been my experience that the faith communi...,Jeanne Shaheen,2015-05-10 22:48:46,Q270316,D,experience faith community willing partner gov...
3,2015-09-01-113454,we need to bring the parents in so they can se...,Jeanne Shaheen,2015-09-01 02:12:00,Q270316,D,need bring parent see manufacturing worked sho...
4,2015-10-25-000242,"' It is not now, nor has it ever been, the gol...",Bernie Sanders,2015-10-25 14:12:35,Q359442,D,ever gold standard trade agreement sander said...
...,...,...,...,...,...,...,...
788906,2015-12-16-127076,"We have to take their oil, shut down all of th...",Ben Carson,2015-12-16 06:30:00,Q816459,R,take oil shut mechanism whereby disperse money
579518,2015-12-08-033827,"I don't think Donald Trump is serious,",Jeb Bush,2015-12-08 07:20:41,Q221997,R,think donald trump serious
105848,2015-03-10-078924,"There are nothing but hardliners in Iran, noth...",Tom Cotton,2015-03-10 00:45:02,Q3090307,R,nothing hardliner iran nothing hardline islami...
941412,2015-12-22-023084,"He's a strong leader, he's a powerful leader.....",Donald Trump,2015-12-22 15:37:00,Q22686,R,strong leader powerful leader actually got pop...


In [7]:
# Keep an independent backup copy of the balanced frame.
# NOTE(review): df_bcp is not used again in the visible part of this notebook.
df_bcp = df.copy()

## Model Training

In [8]:
train_fraction = 0.8

# Randomly pick 80% of the balanced set for training (seeded for reproducibility).
df_train = df.sample(frac=train_fraction, random_state=1)
# Everything that was not drawn for training becomes the test set.
df_test = df.loc[~df.index.isin(df_train.index)]
# Shuffle the test rows; seeded so that a Restart & Run All yields the same order.
df_test = df_test.sample(frac=1, random_state=1)

# The split is index-based, so it is only a true partition if index labels are unique.
assert len(df_train) + len(df_test) == len(df), \
    "train/test split does not partition df (duplicate index labels?)"

# Encode the party labels numerically with the project helper
# (the value counts shown later in the notebook contain -1 and 1).
df_train.party_label = convert_labels(df_train.party_label)
df_test.party_label = convert_labels(df_test.party_label)

df_train.shape, df_test.shape

((4241094, 7), (1060274, 7))

In [9]:
# Features are the cleaned quote texts; targets are the numeric party labels.
X_train, y_train = df_train['quotation_clean'].values, df_train['party_label'].values
X_test, y_test = df_test['quotation_clean'].values, df_test['party_label'].values

In [10]:
# Class balance of the training split.
df_train.party_label.value_counts()

-1    2121044
 1    2120050
Name: party_label, dtype: int64

In [11]:
# Class balance of the test split.
df_test.party_label.value_counts()

 1    530634
-1    529640
Name: party_label, dtype: int64

In [12]:
# Vectorize in memory: TF-IDF features over word n-grams.
# Variants kept for reference (unigrams only, and unigrams + bigrams):
# vectorizer=TfidfVectorizer()
# vectorizer=TfidfVectorizer(ngram_range=(1,2))
# Selected configuration: unigrams, bigrams and trigrams.
vectorizer=TfidfVectorizer(ngram_range=(1,3))
# Learn vocabulary and IDF weights from the training texts only, and transform them.
X_train_vect = vectorizer.fit_transform(X_train)

In [13]:
# Transform only (no fitting): the test texts reuse the vocabulary/IDF fitted on the training set.
X_test_vect = vectorizer.transform(X_test)

In [14]:
# Show the two sparse matrices (shape and number of stored elements).
X_train_vect, X_test_vect

(<4241094x35977444 sparse matrix of type '<class 'numpy.float64'>'
 	with 137359599 stored elements in Compressed Sparse Row format>,
 <1060274x35977444 sparse matrix of type '<class 'numpy.float64'>'
 	with 27670207 stored elements in Compressed Sparse Row format>)

In [15]:
# Multinomial Naive Bayes; alpha is the additive smoothing parameter.
# NOTE(review): alpha=1.8 presumably comes from the earlier model-selection step; confirm.
clf = MultinomialNB(alpha=1.8)
clf.fit(X_train_vect, y_train)

MultinomialNB(alpha=1.8)

Now we'll test the performance of our model!

In [26]:
# Mean accuracy of the classifier on the held-out test set.
clf.score(X_test_vect, y_test)

0.7569611762272044


We get 75.7% accuracy on the test set!

---

We will save our model (vectorizer and classifier) now using pickle to be able to use it without having
to rerun this whole notebook which takes a while.

In [18]:
# save_pickle(obj, destination): the classifier goes to MODEL_FINAL and the
# vectorizer to VECTORIZER_FINAL.
# NOTE(review): these two destinations were previously swapped. Pickles written by
# earlier runs therefore hold the other object; re-run this cell and check any
# loader that compensated for the swap.
save_pickle(clf, MODEL_FINAL)
save_pickle(vectorizer, VECTORIZER_FINAL)

---------

To save further time later, we also run predictions on all our data now and store them
in a data frame, which we save to a pickle as well.

In [19]:
df_final = df_raw.copy()
# Membership is decided by index label: the split frames are matched against
# the index of the raw frame.
df_t = df_final.index.isin(df_train.index)
df_nt = df_final.index.isin(df_test.index)

# For convenience, tag every quote with how it was used:
#   'train' - used for training our model
#   'test'  - used for testing our model
#   'none'  - in neither split: quotes that were too short, quotes whose speaker was
#             part of both political parties at some point and was not labeled
#             manually, and quotes removed when downsampling to balance the classes.
# 'test' is listed first so it takes precedence if a row ever matched both masks.
df_final['data_use'] = np.select([df_nt, df_t], ['test', 'train'], default='none')
df_final

Unnamed: 0,quoteID,quotation,speaker,date,id,party_label,quotation_clean,data_use
0,2015-08-31-000271,... a great day for veterans here in Littleton...,Jeanne Shaheen,2015-08-31 02:10:00,Q270316,D,great day veteran littleton across north count...,train
1,2015-02-03-074650,The safety and security of our nation is at ri...,Jeanne Shaheen,2015-02-03 20:02:24,Q270316,D,safety security nation risk hold funding homel...,test
2,2015-05-10-027625,It's been my experience that the faith communi...,Jeanne Shaheen,2015-05-10 22:48:46,Q270316,D,experience faith community willing partner gov...,train
3,2015-09-01-113454,we need to bring the parents in so they can se...,Jeanne Shaheen,2015-09-01 02:12:00,Q270316,D,need bring parent see manufacturing worked sho...,train
4,2015-10-25-000242,"' It is not now, nor has it ever been, the gol...",Bernie Sanders,2015-10-25 14:12:35,Q359442,D,ever gold standard trade agreement sander said...,test
...,...,...,...,...,...,...,...,...
6449996,2020-02-09-057933,"Who will respect different types of people, di...",Susan Wild,2020-02-09 15:08:23,Q58323072,D,respect different type people different opinio...,train
6449997,2020-03-27-006868,Breaking news: Congressman Massie has tested p...,John Kerry,2020-03-27 21:57:02,Q22316,D,breaking news congressman massie tested positi...,train
6449998,2020-02-02-027604,"I've got news for you, Nancy Pelosi is 79,",John Kerry,2020-02-02 14:07:56,Q22316,D,got news nancy pelosi,none
6449999,2020-01-08-026410,I did not succeed without the help of others,George W. Bush,2020-01-08 20:08:51,Q207,R,succeed without help others,train


Now we just vectorize all our quotes, run predictions, and extract the assigned probability that a quote was said by a Democrat. We then add this as a column to our quotes data frame and save the data frame to both a pickle file and a compressed JSON file.

In [20]:
# Vectorize every quote in df_final (train, test and 'none' rows alike) with the fitted vectorizer.
vectorized_quotes = vectorizer.transform(df_final.quotation_clean.values)

In [21]:
# Column 1 of predict_proba corresponds to clf.classes_[1]; scikit-learn sorts classes_,
# so with labels {-1, 1} this is the probability of label 1.
# NOTE(review): this assumes convert_labels maps 'D' to 1; confirm in helpers.
dem_probas = clf.predict_proba(vectorized_quotes)[:,1]

In [22]:
# Attach the predicted probability to every quote. Rows with data_use == 'train'
# were seen during fitting, so their probabilities are in-sample.
df_final['prob_dem'] = dem_probas

Let's see the resulting data frame.

In [23]:
# Display df_final, which now includes the data_use and prob_dem columns.
df_final

Unnamed: 0,quoteID,quotation,speaker,date,id,party_label,quotation_clean,data_use,prob_dem
0,2015-08-31-000271,... a great day for veterans here in Littleton...,Jeanne Shaheen,2015-08-31 02:10:00,Q270316,D,great day veteran littleton across north count...,train,0.428277
1,2015-02-03-074650,The safety and security of our nation is at ri...,Jeanne Shaheen,2015-02-03 20:02:24,Q270316,D,safety security nation risk hold funding homel...,test,0.415816
2,2015-05-10-027625,It's been my experience that the faith communi...,Jeanne Shaheen,2015-05-10 22:48:46,Q270316,D,experience faith community willing partner gov...,train,0.636808
3,2015-09-01-113454,we need to bring the parents in so they can se...,Jeanne Shaheen,2015-09-01 02:12:00,Q270316,D,need bring parent see manufacturing worked sho...,train,0.586204
4,2015-10-25-000242,"' It is not now, nor has it ever been, the gol...",Bernie Sanders,2015-10-25 14:12:35,Q359442,D,ever gold standard trade agreement sander said...,test,0.780465
...,...,...,...,...,...,...,...,...,...
6449996,2020-02-09-057933,"Who will respect different types of people, di...",Susan Wild,2020-02-09 15:08:23,Q58323072,D,respect different type people different opinio...,train,0.544174
6449997,2020-03-27-006868,Breaking news: Congressman Massie has tested p...,John Kerry,2020-03-27 21:57:02,Q22316,D,breaking news congressman massie tested positi...,train,0.748537
6449998,2020-02-02-027604,"I've got news for you, Nancy Pelosi is 79,",John Kerry,2020-02-02 14:07:56,Q22316,D,got news nancy pelosi,none,0.428595
6449999,2020-01-08-026410,I did not succeed without the help of others,George W. Bush,2020-01-08 20:08:51,Q207,R,succeed without help others,train,0.531386


In [24]:
# Persist the full frame (including data_use and prob_dem) with the project's pickle helper.
save_pickle(df_final, QUOTES_LABELED_CLEANED_PREDICTED_PKL)

In [25]:
# Export the frame that carries the predictions (df_final, with data_use and prob_dem)
# as bz2-compressed JSON Lines. Previously this wrote `df`, the downsampled frame
# without predictions.
# NOTE(review): unlike the input path, this destination is not passed through fixpath;
# confirm the constant is already a usable path.
with bz2.open(QUOTES_LABELED_CLEANED_PREDICTED, 'wb') as d_file:
    df_final.to_json(d_file, orient='records', lines=True)

We will now use `df_final` to perform most of our data analysis in the next notebooks. 
In the [next notebook](part3_3-political_distribution_analysis.ipynb) we will be doing a big-picture analysis of the distributions of political ideas of both parties.