In [7]:
from helpers import *

import sys

import bz2
import json

import pickle

import numpy as np
# import scipy

import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Training our model on all our dataset

Now that we have established the best level of preprocessing and chosen
a vectorizer and model, we can train it on all the filtered quotebank data (6 million quotes).
We will use an 80:20 train:test split.

From now on we only use our best text
cleaning/preprocessing (option E), which, as mentioned before, is our most thorough
version of cleaning and includes lemmatization.

We start by loading and preparing the data like we did for the preprocessed file
that contained all the text preprocessing variants.

In [3]:
# Resolve the dataset location, then read the line-delimited JSON records
# (one quote per line) into a data frame.
path = fixpath(QUOTES_2020_LABELED_CLEANED)
df_raw = pd.read_json(
    path,
    lines=True,
    orient='records',
)
df_raw

Unnamed: 0,quoteID,quotation,speaker,date,id,party_label,quotation_clean
0,2020-01-16-000088,[ Department of Homeland Security ] was livid ...,Sue Myrick,2020-01-16 12:00:13,Q367796,R,department homeland security livid strongly ur...
1,2020-03-19-000276,[ These ] actions will allow households who ha...,Ben Carson,2020-03-19 19:14:00,Q816459,R,action allow household fha insured mortgage me...
2,2020-01-22-009723,be pivotal in addressing financial frustrations,Ben Carson,2020-01-22 21:07:39,Q816459,R,pivotal addressing financial frustration
3,2020-02-04-110477,We're talking about `Do we want to continue th...,Ben Carson,2020-02-04 23:02:36,Q816459,R,talking want continue lifestyle characterized ...
4,2020-01-28-051506,It's not just a matter of throwing more and mo...,Ben Carson,2020-01-28 19:23:36,Q816459,R,matter throwing money voucher service getting ...
...,...,...,...,...,...,...,...
371866,2020-02-09-057933,"Who will respect different types of people, di...",Susan Wild,2020-02-09 15:08:23,Q58323072,D,respect different type people different opinio...
371867,2020-03-27-006868,Breaking news: Congressman Massie has tested p...,John Kerry,2020-03-27 21:57:02,Q22316,D,breaking news congressman massie tested positi...
371868,2020-02-02-027604,"I've got news for you, Nancy Pelosi is 79,",John Kerry,2020-02-02 14:07:56,Q22316,D,got news nancy pelosi
371869,2020-01-08-026410,I did not succeed without the help of others,George W. Bush,2020-01-08 20:08:51,Q207,R,succeed without help others


We drop the rows we do not need and, as before, drop short quotes.

In [11]:
# Work on a copy so that df_raw stays untouched; it is reused later to build df_final.
df = df_raw.copy()

# Dropping quotes of people listed in both parties, i.e. rows labeled 'RD'
# (the most popular such members were labeled manually and are therefore kept).
df = df.loc[df.party_label != 'RD']

# Drop short quotes with the helper from `helpers`.
# NOTE(review): the meaning of the 0.1 argument is defined by drop_short_quotes — confirm.
df = drop_short_quotes(df, 0.1)
df

Unnamed: 0,quoteID,quotation,speaker,date,id,party_label,quotation_clean
0,2020-01-16-000088,[ Department of Homeland Security ] was livid ...,Sue Myrick,2020-01-16 12:00:13,Q367796,R,department homeland security livid strongly ur...
1,2020-03-19-000276,[ These ] actions will allow households who ha...,Ben Carson,2020-03-19 19:14:00,Q816459,R,action allow household fha insured mortgage me...
2,2020-01-22-009723,be pivotal in addressing financial frustrations,Ben Carson,2020-01-22 21:07:39,Q816459,R,pivotal addressing financial frustration
3,2020-02-04-110477,We're talking about `Do we want to continue th...,Ben Carson,2020-02-04 23:02:36,Q816459,R,talking want continue lifestyle characterized ...
4,2020-01-28-051506,It's not just a matter of throwing more and mo...,Ben Carson,2020-01-28 19:23:36,Q816459,R,matter throwing money voucher service getting ...
...,...,...,...,...,...,...,...
371865,2020-02-20-093793,"When they want a bill, they bring the bill to ...",J.T. Wilcox,2020-02-20 22:12:45,Q6104393,R,want bill bring bill floor
371866,2020-02-09-057933,"Who will respect different types of people, di...",Susan Wild,2020-02-09 15:08:23,Q58323072,D,respect different type people different opinio...
371867,2020-03-27-006868,Breaking news: Congressman Massie has tested p...,John Kerry,2020-03-27 21:57:02,Q22316,D,breaking news congressman massie tested positi...
371869,2020-01-08-026410,I did not succeed without the help of others,George W. Bush,2020-01-08 20:08:51,Q207,R,succeed without help others


In [12]:
# Number of quotes per party label before rebalancing.
df.party_label.value_counts()

D    186604
R    140651
Name: party_label, dtype: int64

Rebalance the data by downsampling

In [13]:
# Rebalance the classes with the downsample2 helper, then re-check the counts per party.
# Note: this cell overwrites df, so re-running it feeds the already downsampled frame back in.
df = downsample2(df)
df['party_label'].value_counts()

D    140651
R    140651
Name: party_label, dtype: int64

In [14]:
# Display the data frame after downsampling.
df

Unnamed: 0,quoteID,quotation,speaker,date,id,party_label,quotation_clean
264157,2020-01-14-051866,"Look, the mayor wants billionaires to pay one ...",Pete Buttigieg,2020-01-14 10:57:18,Q7173106,D,look mayor want billionaire pay one tuition ki...
214017,2020-01-21-004813,"and I don't consider drunk driving a felony,",Joe Biden,2020-01-21 17:39:00,Q6279,D,consider drunk driving felony
367179,2020-01-16-098761,We are simply the administrators of a process ...,Jocelyn Benson,2020-01-16 14:06:52,Q13562375,D,simply administrator process citizen created c...
8546,2020-04-06-062644,"When New York City sneezes, Connecticut catche...",Ned Lamont,2020-04-06 19:00:00,Q1973878,D,new york city sneeze connecticut catch cold
329174,2020-03-18-052269,possibility of a shelter-in-place order. It ha...,Bill de Blasio,2020-03-18 07:39:58,Q4911497,D,possibility shelter place order happened yet d...
...,...,...,...,...,...,...,...
371859,2020-04-03-059924,There are people calling for all flights to st...,John F. Kennedy,2020-04-03 00:00:00,Q28836957,R,people calling flight stop travel contribute s...
371862,2020-02-10-100845,We're just worried about making sure we keep t...,Sherrie Sprenger,2020-02-10 00:00:00,Q7495360,R,worried making sure keep balance expression ri...
371865,2020-02-20-093793,"When they want a bill, they bring the bill to ...",J.T. Wilcox,2020-02-20 22:12:45,Q6104393,R,want bill bring bill floor
371869,2020-01-08-026410,I did not succeed without the help of others,George W. Bush,2020-01-08 20:08:51,Q207,R,succeed without help others


In [15]:
# Snapshot of the balanced data frame (presumably a backup; df_bcp is not
# referenced again in the visible part of this notebook).
df_bcp = df.copy()

## Model Training

In [16]:
# Fraction of the balanced data used for training; the remainder is the test set.
train_fraction = 0.8

# Draw a reproducible random 80% of the rows for training.
df_train = df.sample(frac=train_fraction, random_state=1)

# Everything that was not sampled for training becomes the test set.
df_test = df.loc[~df.index.isin(df_train.index)]
# Shuffle the test rows; seeded so that the row order (and therefore X_test / y_test)
# is identical on every run instead of changing with each execution.
df_test = df_test.sample(frac=1, random_state=1)

# Sanity check: the two splits must partition the data without overlap or loss.
assert len(df_train) + len(df_test) == len(df), "train/test split does not partition df"

# Convert the party labels with the convert_labels helper
# (the value counts shown further below are reported for the labels 1 and -1).
df_train.party_label = convert_labels(df_train.party_label)
df_test.party_label = convert_labels(df_test.party_label)

df_train.shape, df_test.shape

((225042, 7), (56260, 7))

In [17]:
# Features are the cleaned quote texts, targets are the converted party labels.
X_train, y_train = df_train['quotation_clean'].values, df_train['party_label'].values
X_test, y_test = df_test['quotation_clean'].values, df_test['party_label'].values

In [18]:
# Class balance of the training split.
df_train['party_label'].value_counts()

 1    112713
-1    112329
Name: party_label, dtype: int64

In [19]:
# Class balance of the test split.
df_test['party_label'].value_counts()

-1    28322
 1    27938
Name: party_label, dtype: int64

In [20]:
# Vectorize in memory: learn the TF-IDF vocabulary from the training quotes only,
# using unigrams, bigrams and trigrams.
# Alternatives kept for reference: TfidfVectorizer() and TfidfVectorizer(ngram_range=(1, 2)).
vectorizer = TfidfVectorizer(
    ngram_range=(1, 3),
)
X_train_vect = vectorizer.fit_transform(X_train)

In [21]:
# Transform the test quotes with the already fitted vectorizer (transform only, no refit).
X_test_vect = vectorizer.transform(X_test)

In [23]:
# Display both vectorized matrices (their repr reports shape and number of stored elements).
X_train_vect, X_test_vect

(<225042x2707471 sparse matrix of type '<class 'numpy.float64'>'
 	with 7202941 stored elements in Compressed Sparse Row format>,
 <56260x2707471 sparse matrix of type '<class 'numpy.float64'>'
 	with 1276008 stored elements in Compressed Sparse Row format>)

In [24]:
# Fit a multinomial naive Bayes classifier on the TF-IDF training matrix.
# NOTE(review): alpha=1.8 was presumably selected in an earlier notebook — confirm.
clf = MultinomialNB(alpha=1.8)
clf.fit(X_train_vect, y_train)

MultinomialNB(alpha=1.8)

Now we'll test the performance of our model!

In [25]:
# Score on the same data the classifier was fitted on (optimistic by construction).
clf.score(X_train_vect, y_train)

0.9632868531207508

In [26]:

# Score on the held-out test split.
clf.score(X_test_vect, y_test)

0.7605758976182012

---

We will save our model (vectorizer and classifier) now using pickle to be able to use it without having
to rerun this whole notebook which takes a while.

In [51]:
# Persist both parts of the model so later notebooks can load them without retraining.
# Each object goes to the constant that matches it: the fitted vectorizer to
# VECTORIZER_FINAL and the fitted classifier to MODEL_FINAL (they were previously swapped).
save_pickle(vectorizer, VECTORIZER_FINAL)
save_pickle(clf, MODEL_FINAL)

---------

To save even more time later, we also run predictions on all our data now and save the
resulting data frame to a pickle as well.

In [44]:
df_final = df_raw.copy()

# Boolean masks over df_final: was the row part of the training / test split?
df_t = df_final.index.isin(df_train.index)
df_nt = df_final.index.isin(df_test.index)

# We also add a label specifying how we used the data, for convenience.
# Default is 'none': quotes that weren't good for training data, such as quotes that
# are too short or whose speaker was part of both political parties at some point
# but whom we didn't manually label like Trump or Clinton.
df_final['data_use'] = 'none'
df_final.loc[df_t, 'data_use'] = 'train'   # data was used for training our model
df_final.loc[df_nt, 'data_use'] = 'test'   # data was used for testing our model
df_final

Unnamed: 0,quoteID,quotation,speaker,date,id,party_label,quotation_clean,data_use
0,2020-01-16-000088,[ Department of Homeland Security ] was livid ...,Sue Myrick,2020-01-16 12:00:13,Q367796,R,department homeland security livid strongly ur...,train
1,2020-03-19-000276,[ These ] actions will allow households who ha...,Ben Carson,2020-03-19 19:14:00,Q816459,R,action allow household fha insured mortgage me...,train
2,2020-01-22-009723,be pivotal in addressing financial frustrations,Ben Carson,2020-01-22 21:07:39,Q816459,R,pivotal addressing financial frustration,train
3,2020-02-04-110477,We're talking about `Do we want to continue th...,Ben Carson,2020-02-04 23:02:36,Q816459,R,talking want continue lifestyle characterized ...,train
4,2020-01-28-051506,It's not just a matter of throwing more and mo...,Ben Carson,2020-01-28 19:23:36,Q816459,R,matter throwing money voucher service getting ...,train
...,...,...,...,...,...,...,...,...
371866,2020-02-09-057933,"Who will respect different types of people, di...",Susan Wild,2020-02-09 15:08:23,Q58323072,D,respect different type people different opinio...,train
371867,2020-03-27-006868,Breaking news: Congressman Massie has tested p...,John Kerry,2020-03-27 21:57:02,Q22316,D,breaking news congressman massie tested positi...,train
371868,2020-02-02-027604,"I've got news for you, Nancy Pelosi is 79,",John Kerry,2020-02-02 14:07:56,Q22316,D,got news nancy pelosi,none
371869,2020-01-08-026410,I did not succeed without the help of others,George W. Bush,2020-01-08 20:08:51,Q207,R,succeed without help others,test


Now we just vectorize all our quotes, run predictions, and extract the assigned probability that a quote was said by a Democrat. We then add this as a column to our quotes data frame and save the data frame to both a pickle file and compressed JSON.

In [45]:
# Vectorize every quote (train, test and unused rows alike) with the fitted vectorizer.
vectorized_quotes = vectorizer.transform(df_final['quotation_clean'].values)

In [46]:
# Take the second probability column, i.e. the probability of clf.classes_[1].
# NOTE(review): this assumes convert_labels maps Democrat to the larger label (1) — confirm.
dem_probas = clf.predict_proba(vectorized_quotes)[:,1]

In [47]:
# Attach the predicted probabilities as a new column; rows are in the same order
# as df_final because the vectorized matrix was built from df_final itself.
df_final['prob_dem'] = dem_probas

Let's see the resulting data frame.

In [48]:
# Display the data frame including the new data_use and prob_dem columns.
df_final

Unnamed: 0,quoteID,quotation,speaker,date,id,party_label,quotation_clean,data_use,prob_dem
0,2020-01-16-000088,[ Department of Homeland Security ] was livid ...,Sue Myrick,2020-01-16 12:00:13,Q367796,R,department homeland security livid strongly ur...,train,0.353979
1,2020-03-19-000276,[ These ] actions will allow households who ha...,Ben Carson,2020-03-19 19:14:00,Q816459,R,action allow household fha insured mortgage me...,train,0.236290
2,2020-01-22-009723,be pivotal in addressing financial frustrations,Ben Carson,2020-01-22 21:07:39,Q816459,R,pivotal addressing financial frustration,train,0.471626
3,2020-02-04-110477,We're talking about `Do we want to continue th...,Ben Carson,2020-02-04 23:02:36,Q816459,R,talking want continue lifestyle characterized ...,train,0.375666
4,2020-01-28-051506,It's not just a matter of throwing more and mo...,Ben Carson,2020-01-28 19:23:36,Q816459,R,matter throwing money voucher service getting ...,train,0.458933
...,...,...,...,...,...,...,...,...,...
371866,2020-02-09-057933,"Who will respect different types of people, di...",Susan Wild,2020-02-09 15:08:23,Q58323072,D,respect different type people different opinio...,train,0.578025
371867,2020-03-27-006868,Breaking news: Congressman Massie has tested p...,John Kerry,2020-03-27 21:57:02,Q22316,D,breaking news congressman massie tested positi...,train,0.782462
371868,2020-02-02-027604,"I've got news for you, Nancy Pelosi is 79,",John Kerry,2020-02-02 14:07:56,Q22316,D,got news nancy pelosi,none,0.322602
371869,2020-01-08-026410,I did not succeed without the help of others,George W. Bush,2020-01-08 20:08:51,Q207,R,succeed without help others,test,0.530596


In [None]:
# Persist the data frame with predictions via the save_pickle helper.
save_pickle(df_final, QUOTES_LABELED_CLEANED_PREDICTED_PKL)

In [None]:
# Export the data frame with predictions as bz2-compressed, line-delimited JSON.
# df_final (not df) is written: df is only the downsampled training data and has
# neither the data_use nor the prob_dem column.
# The stream is opened in text mode because to_json writes str, not bytes.
# NOTE(review): the input path above is wrapped in fixpath(); confirm whether this
# output constant needs the same treatment.
with bz2.open(QUOTES_LABELED_CLEANED_PREDICTED, 'wt', encoding='utf-8') as d_file:
    df_final.to_json(d_file, orient='records', lines=True)

Now we will use `df_final` to perform most of our data analysis in the next notebooks.
In the [next notebook](part3_3-political_distribution_analysis.ipynb) we will be doing a big-picture analysis of the distributions of political ideas of both parties.