In [2]:
from helpers import *

import sys

import bz2
import json

import pickle

import numpy as np
# import scipy

import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC

%load_ext autoreload
%autoreload 2

## Training our model on our entire dataset

Now that we have established the best level of preprocessing and chosen 
a vectorizer and model we can train it on all the filtered quotebank data (6 million quotes).
We will use an 80:20 train:test split.

From now on we only use our best text
cleaning/preprocessing (option E), which, as mentioned before, is our most thorough
version of cleaning and includes lemmatization.

We start by loading and preparing the data like we did for the preprocessed file
that contained all the text preprocessing variants.

In [3]:
# Read the labeled + cleaned quotes; lines=True means one JSON record per line.
path = fixpath(QUOTES_LABELED_CLEANED)
df_raw = pd.read_json(path, orient='records', lines=True)
df_raw

Unnamed: 0,quoteID,quotation,speaker,date,id,party_label,quotation_clean
0,2015-08-31-000271,... a great day for veterans here in Littleton...,Jeanne Shaheen,2015-08-31 02:10:00,Q270316,D,great day veteran littleton across north count...
1,2015-02-03-074650,The safety and security of our nation is at ri...,Jeanne Shaheen,2015-02-03 20:02:24,Q270316,D,safety security nation risk hold funding homel...
2,2015-05-10-027625,It's been my experience that the faith communi...,Jeanne Shaheen,2015-05-10 22:48:46,Q270316,D,experience faith community willing partner gov...
3,2015-09-01-113454,we need to bring the parents in so they can se...,Jeanne Shaheen,2015-09-01 02:12:00,Q270316,D,need bring parent see manufacturing worked sho...
4,2015-10-25-000242,"' It is not now, nor has it ever been, the gol...",Bernie Sanders,2015-10-25 14:12:35,Q359442,D,ever gold standard trade agreement sander said...
...,...,...,...,...,...,...,...
7090867,2020-02-10-100845,We're just worried about making sure we keep t...,Sherrie Sprenger,2020-02-10 00:00:00,Q7495360,R,worried making sure keep balance expression ri...
7090868,2020-03-16-079753,We've all embraced strict proper hygiene proce...,Robert Abrams,2020-03-16 12:00:00,Q2156314,D,embraced strict proper hygiene procedure heard...
7090869,2020-01-13-091997,What's important is that we keep moving forward.,Laurie Jinkins,2020-01-13 19:51:15,Q6501617,D,important keep moving forward
7090870,2020-02-20-093793,"When they want a bill, they bring the bill to ...",J.T. Wilcox,2020-02-20 22:12:45,Q6104393,R,want bill bring bill floor


We drop quotes from speakers labeled with both parties (`RD`), and again we drop short quotes.

In [4]:
df = df_raw.copy()
# Dropping quotes of people in both parties (except most popular members who were labeled manually)
df = df[df['party_label'] != 'RD']

# NOTE(review): the meaning/units of the 0.1 threshold are defined inside
# drop_short_quotes (presumably from helpers) — confirm before changing it.
df = drop_short_quotes(df, 0.1)
df

Unnamed: 0,quoteID,quotation,speaker,date,id,party_label,quotation_clean
0,2015-08-31-000271,... a great day for veterans here in Littleton...,Jeanne Shaheen,2015-08-31 02:10:00,Q270316,D,great day veteran littleton across north count...
1,2015-02-03-074650,The safety and security of our nation is at ri...,Jeanne Shaheen,2015-02-03 20:02:24,Q270316,D,safety security nation risk hold funding homel...
2,2015-05-10-027625,It's been my experience that the faith communi...,Jeanne Shaheen,2015-05-10 22:48:46,Q270316,D,experience faith community willing partner gov...
3,2015-09-01-113454,we need to bring the parents in so they can se...,Jeanne Shaheen,2015-09-01 02:12:00,Q270316,D,need bring parent see manufacturing worked sho...
4,2015-10-25-000242,"' It is not now, nor has it ever been, the gol...",Bernie Sanders,2015-10-25 14:12:35,Q359442,D,ever gold standard trade agreement sander said...
...,...,...,...,...,...,...,...
7090867,2020-02-10-100845,We're just worried about making sure we keep t...,Sherrie Sprenger,2020-02-10 00:00:00,Q7495360,R,worried making sure keep balance expression ri...
7090868,2020-03-16-079753,We've all embraced strict proper hygiene proce...,Robert Abrams,2020-03-16 12:00:00,Q2156314,D,embraced strict proper hygiene procedure heard...
7090869,2020-01-13-091997,What's important is that we keep moving forward.,Laurie Jinkins,2020-01-13 19:51:15,Q6501617,D,important keep moving forward
7090870,2020-02-20-093793,"When they want a bill, they bring the bill to ...",J.T. Wilcox,2020-02-20 22:12:45,Q6104393,R,want bill bring bill floor


In [5]:
df['party_label'].value_counts()

R    3310474
D    2896896
Name: party_label, dtype: int64

Rebalance the data by downsampling

In [7]:
# Balance the classes: keep every 'D' quote and draw an equally sized random
# sample of 'R' quotes. The draw is seeded so that the balanced set (and every
# number computed from it below) is the same on each Restart & Run All.
df_d = df[df.party_label == 'D']
df_r = df[df.party_label == 'R'].sample(n=len(df_d), random_state=1)

In [11]:
df_d

Unnamed: 0,quoteID,quotation,speaker,date,id,party_label,quotation_clean
0,2015-08-31-000271,... a great day for veterans here in Littleton...,Jeanne Shaheen,2015-08-31 02:10:00,Q270316,D,great day veteran littleton across north count...
1,2015-02-03-074650,The safety and security of our nation is at ri...,Jeanne Shaheen,2015-02-03 20:02:24,Q270316,D,safety security nation risk hold funding homel...
2,2015-05-10-027625,It's been my experience that the faith communi...,Jeanne Shaheen,2015-05-10 22:48:46,Q270316,D,experience faith community willing partner gov...
3,2015-09-01-113454,we need to bring the parents in so they can se...,Jeanne Shaheen,2015-09-01 02:12:00,Q270316,D,need bring parent see manufacturing worked sho...
4,2015-10-25-000242,"' It is not now, nor has it ever been, the gol...",Bernie Sanders,2015-10-25 14:12:35,Q359442,D,ever gold standard trade agreement sander said...
...,...,...,...,...,...,...,...
7090862,2020-04-16-051820,These outbreaks not only put seniors and other...,Josh Gottheimer,2020-04-16 00:00:00,Q6288908,D,outbreak put senior resident long term care fa...
7090864,2020-01-10-086593,This is a recommendation from our Capitol Poli...,Charniele Herring,2020-01-10 21:00:05,Q5086555,D,recommendation capitol police trust every day ...
7090868,2020-03-16-079753,We've all embraced strict proper hygiene proce...,Robert Abrams,2020-03-16 12:00:00,Q2156314,D,embraced strict proper hygiene procedure heard...
7090869,2020-01-13-091997,What's important is that we keep moving forward.,Laurie Jinkins,2020-01-13 19:51:15,Q6501617,D,important keep moving forward


In [16]:
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported equivalent.
# The original index labels are kept on purpose (no ignore_index): later cells
# match rows back to df_raw through the index.
df = pd.concat([df_d, df_r])

In [17]:
# df = downsample(df, 'party_label')

In [18]:
df['party_label'].value_counts()

D    2896896
R    2896896
Name: party_label, dtype: int64

In [19]:
# Backup of the balanced frame; a later cell restores `df` from it.
df_bcp = df.copy()

In [20]:
# df_mini = df_filt.sample(100000)
# df_mini = df_filt.sample(frac=1)

# df

# df = df.sample(frac=1)
df

Unnamed: 0,quoteID,quotation,speaker,date,id,party_label,quotation_clean
0,2015-08-31-000271,... a great day for veterans here in Littleton...,Jeanne Shaheen,2015-08-31 02:10:00,Q270316,D,great day veteran littleton across north count...
1,2015-02-03-074650,The safety and security of our nation is at ri...,Jeanne Shaheen,2015-02-03 20:02:24,Q270316,D,safety security nation risk hold funding homel...
2,2015-05-10-027625,It's been my experience that the faith communi...,Jeanne Shaheen,2015-05-10 22:48:46,Q270316,D,experience faith community willing partner gov...
3,2015-09-01-113454,we need to bring the parents in so they can se...,Jeanne Shaheen,2015-09-01 02:12:00,Q270316,D,need bring parent see manufacturing worked sho...
4,2015-10-25-000242,"' It is not now, nor has it ever been, the gol...",Bernie Sanders,2015-10-25 14:12:35,Q359442,D,ever gold standard trade agreement sander said...
...,...,...,...,...,...,...,...
2262691,2017-11-12-075115,"Unfortunately, the funders that pay for [ Metr...",Ray LaHood,2017-11-12 12:01:00,Q467891,R,unfortunately funders pay metro capital progra...
1586017,2016-07-28-050203,I'm hearing the reluctance because a lot of th...,Paul Manafort,2016-07-28 18:19:20,Q3371760,R,hearing reluctance lot unlike live time
36535,2015-12-17-048735,If this amendment is adopted to the current bi...,Ted Cruz,2015-12-17 16:42:16,Q2036942,R,amendment adopted current bill effect would mi...
928450,2015-01-16-089707,We need to make sure our patients are getting ...,Ann Rivers,2015-01-16 02:50:08,Q4766645,R,need make sure patient getting promised


## Model Training

In [21]:
train_fraction = 0.8
split_seed = 1  # shared by the split and the shuffle so reruns give identical frames

# Random 80% of the balanced set is used for training.
df_train = df.sample(frac=train_fraction, random_state=split_seed)
# Every row whose index label was not drawn for training forms the test set;
# it is shuffled with a fixed seed so its row order is reproducible too.
df_test = df.loc[~df.index.isin(df_train.index)]
df_test = df_test.sample(frac=1, random_state=split_seed)

# The index-based complement is only a clean partition when index labels are
# unique; fail loudly instead of silently losing or leaking rows.
assert len(df_train) + len(df_test) == len(df), 'train/test split does not partition df'

# assign() returns new frames instead of writing into the sampled ones, which
# avoids chained-assignment warnings on derived frames.
df_train = df_train.assign(party_label=convert_labels(df_train.party_label))
df_test = df_test.assign(party_label=convert_labels(df_test.party_label))

df_train.shape, df_test.shape

((4635034, 7), (1158758, 7))

In [22]:
# Features are the cleaned quote strings; targets are the converted party labels.
X_train, y_train = df_train['quotation_clean'].values, df_train['party_label'].values

X_test, y_test = df_test['quotation_clean'].values, df_test['party_label'].values

In [23]:
df_train.party_label.value_counts()

-1    2317978
 1    2317056
Name: party_label, dtype: int64

In [24]:
df_test.party_label.value_counts()

 1    579840
-1    578918
Name: party_label, dtype: int64

In [44]:
# Vectorize in memory
# vectorizer=TfidfVectorizer()
# vectorizer=TfidfVectorizer(ngram_range=(1,2))
vectorizer=TfidfVectorizer(ngram_range=(1,3), max_features=5_000_000)
X_train_vect = vectorizer.fit_transform(X_train)

In [45]:
# transform (not fit_transform): reuse the vocabulary fitted on the training set.
X_test_vect = vectorizer.transform(X_test)

In [46]:
X_train_vect, X_test_vect

(<4635034x5000000 sparse matrix of type '<class 'numpy.float64'>'
 	with 110615363 stored elements in Compressed Sparse Row format>,
 <1158758x5000000 sparse matrix of type '<class 'numpy.float64'>'
 	with 26392000 stored elements in Compressed Sparse Row format>)

In [47]:
# Multinomial Naive Bayes with default hyper-parameters, fitted on the TF-IDF features.
clf = MultinomialNB()
clf.fit(X_train_vect, y_train)

MultinomialNB()

Now we'll test the performance of our model!

In [48]:
clf.score(X_train_vect, y_train)

0.812784760586438

In [49]:

clf.score(X_test_vect, y_test)

0.7135855804231772

In [None]:
# Linear SVM with default hyper-parameters on the same TF-IDF features, for comparison with `clf`.
clf2 = LinearSVC()
clf2.fit(X_train_vect, y_train)

LinearSVC()

In [None]:
clf2.score(X_train_vect, y_train)

0.9252085313721539

In [None]:
clf2.score(X_test_vect, y_test)

0.7163868555815796

In [51]:
# save_pickle(vectorizer, MODELS_FOLDER + 'vectorizer_B.pkl')
# Persist the fitted MultinomialNB (`clf`); the LinearSVC (`clf2`) is not saved by this cell.
save_pickle(clf, MODELS_FOLDER + 'classifier_B.pkl')

Models:
|   | vectorizer                                                 | classifier      | train_accuracy | test_accuracy |
|---|------------------------------------------------------------|-----------------|----------------|---------------|
| A | TfidfVectorizer(ngram_range=(1,2), max_features=1_000_000) | LinearSVC()     | 80.3%          | 69.5%         |
| B | TfidfVectorizer(ngram_range=(1,3), max_features=5_000_000) | MultinomialNB() | 81.2%          | 71.2%         |
| C |                      uses vectorizer B                     | LinearSVC()     | 92.5%          | 71.6%         |
    

---

We will save our model (vectorizer and classifier) now using pickle to be able to use it without having
to rerun this whole notebook which takes a while.

In [None]:
# Persist the fitted TF-IDF vectorizer so it can be reused without rerunning this notebook.
save_pickle(vectorizer, VECTORIZER_NGRAM13_V2)

In [None]:
# Persist the fitted MultinomialNB classifier that pairs with the saved vectorizer.
save_pickle(clf, MODEL_MULTINOMIALNB_NGRAM13_V2)

To save further time later, we also run predictions on all our data now and store them
in a data frame, which we save as a pickle as well.

In [None]:
# find how to label the df by test or train

In [None]:
# Restore the balanced, pre-split frame from the df_bcp backup.
df = df_bcp.copy()

Unnamed: 0,quoteID,quotation,speaker,date,id,party_label,quotation_clean
0,2016-08-12-027018,I am disappointed the governor chose to veto t...,Andy Manar,2016-08-12 22:18:00,Q4760984,D,disappointed governor chose veto good importan...
1,2019-08-26-003606,Although we are all working hard to address th...,Phil Murphy,2019-08-26 13:32:27,Q260042,D,although working hard address near term priori...
2,2019-10-03-073537,overturn the limited version of collective bar...,Elizabeth Warren,2019-10-03 14:53:33,Q434706,D,overturn limited version collective bargaining
3,2015-03-02-073426,"there is nothing gloomy about this announcement,",Barbara Mikulski,2015-03-02 19:10:26,Q261147,D,nothing gloomy announcement
4,2019-11-19-023104,gives that candidate the ability to say that t...,Yvanna Cancela,2019-11-19 20:34:00,Q28595299,D,give candidate ability say vetted someone figh...
...,...,...,...,...,...,...,...
5793787,2019-05-24-072159,Our model says that an across-the-board 25 per...,Donald Trump,2019-05-24 00:53:04,Q22686,R,model say across board percent tariff china li...
5793788,2015-09-30-141287,work to make a difference for the people of Ea...,Cathy McMorris Rodgers,2015-09-30 23:49:46,Q293343,R,work make difference people eastern washington
5793789,2015-10-30-090828,That is an insult to every woman and man who p...,Kelly Ayotte,2015-10-30 20:09:33,Q22354,R,insult every woman man put life line serve cou...
5793790,2018-01-26-128626,We are not down and out yet. We'll make a stro...,Cathrynn Brown,2018-01-26 22:47:54,Q16200773,R,yet make strong case funding need granted reve...


---------

In [40]:
# Tag every raw row with the split it ended up in. Rows that are in neither
# split are left as NaN by this cell.
df_temp = df_raw.copy()
df_t = df_temp.index.isin(df_train.index)
df_nt = df_temp.index.isin(df_test.index)
for split_mask, split_name in ((df_t, 'train'), (df_nt, 'test')):
    df_temp.loc[split_mask, 'data_use'] = split_name
df_temp

Unnamed: 0,quoteID,quotation,speaker,date,id,party_label,quotation_clean,data_use
0,2015-08-31-000271,... a great day for veterans here in Littleton...,Jeanne Shaheen,2015-08-31 02:10:00,Q270316,D,great day veteran littleton across north count...,train
1,2015-02-03-074650,The safety and security of our nation is at ri...,Jeanne Shaheen,2015-02-03 20:02:24,Q270316,D,safety security nation risk hold funding homel...,test
2,2015-05-10-027625,It's been my experience that the faith communi...,Jeanne Shaheen,2015-05-10 22:48:46,Q270316,D,experience faith community willing partner gov...,train
3,2015-09-01-113454,we need to bring the parents in so they can se...,Jeanne Shaheen,2015-09-01 02:12:00,Q270316,D,need bring parent see manufacturing worked sho...,train
4,2015-10-25-000242,"' It is not now, nor has it ever been, the gol...",Bernie Sanders,2015-10-25 14:12:35,Q359442,D,ever gold standard trade agreement sander said...,test
...,...,...,...,...,...,...,...,...
7090867,2020-02-10-100845,We're just worried about making sure we keep t...,Sherrie Sprenger,2020-02-10 00:00:00,Q7495360,R,worried making sure keep balance expression ri...,train
7090868,2020-03-16-079753,We've all embraced strict proper hygiene proce...,Robert Abrams,2020-03-16 12:00:00,Q2156314,D,embraced strict proper hygiene procedure heard...,train
7090869,2020-01-13-091997,What's important is that we keep moving forward.,Laurie Jinkins,2020-01-13 19:51:15,Q6501617,D,important keep moving forward,train
7090870,2020-02-20-093793,"When they want a bill, they bring the bill to ...",J.T. Wilcox,2020-02-20 22:12:45,Q6104393,R,want bill bring bill floor,train


In [41]:
# Rows used in neither split get the explicit marker 'none' instead of NaN.
df_temp.loc[df_temp.data_use.isna(), 'data_use'] = 'none'

In [42]:
df_temp[df_temp.data_use == 'none']

Unnamed: 0,quoteID,quotation,speaker,date,id,party_label,quotation_clean,data_use
6,2015-04-15-038299,"If they destroy you, who is there left?",Bernie Sanders,2015-04-15 17:49:00,Q359442,D,destroy left,none
8,2015-10-14-073757,"labor is the source of all wealth,",Bernie Sanders,2015-10-14 20:09:23,Q359442,D,labor source wealth,none
10,2015-04-21-040961,"No, it's America,",Bernie Sanders,2015-04-21 20:01:11,Q359442,D,america,none
16,2015-05-28-003378,"And that is: you can't have it all,",Bernie Sanders,2015-05-28 23:24:24,Q359442,D,,none
17,2015-08-29-005147,"As our campaign progresses,",Bernie Sanders,2015-08-29 04:46:14,Q359442,D,campaign progress,none
...,...,...,...,...,...,...,...,...
7090842,2020-02-27-030322,I treat every race the same. I run on my recor...,Bobby Kaufmann,2020-02-27 17:37:10,Q22278538,R,treat every race run record talk people run po...,none
7090847,2020-02-18-050375,"It's really, for the first time gaining some t...",Matt Muratore,2020-02-18 18:02:24,Q19518518,R,really first time gaining traction beacon hill...,none
7090858,2020-01-09-079332,"The focus should be on the person, not the obj...",Stephanie Hansen,2020-01-09 00:00:00,Q28842862,D,focus person object,none
7090861,2020-04-03-059924,There are people calling for all flights to st...,John F. Kennedy,2020-04-03 00:00:00,Q28836957,R,people calling flight stop travel contribute s...,none


In [43]:
# Checkpoint the frame carrying the data_use column into the temp folder.
save_pickle(df_temp, TEMP_FOLDER + 'wow_data.pkl')

---

In [3]:
# df = load_pickle(MODELS_FOLDER + 'train_test_data.pkl')

In [57]:
# df
# Work on a copy so the prediction column is added without modifying df_temp.
df_temp_b = df_temp.copy()

In [59]:
df_temp_b

Unnamed: 0,quoteID,quotation,speaker,date,id,party_label,quotation_clean,data_use
0,2015-08-31-000271,... a great day for veterans here in Littleton...,Jeanne Shaheen,2015-08-31 02:10:00,Q270316,D,great day veteran littleton across north count...,train
1,2015-02-03-074650,The safety and security of our nation is at ri...,Jeanne Shaheen,2015-02-03 20:02:24,Q270316,D,safety security nation risk hold funding homel...,test
2,2015-05-10-027625,It's been my experience that the faith communi...,Jeanne Shaheen,2015-05-10 22:48:46,Q270316,D,experience faith community willing partner gov...,train
3,2015-09-01-113454,we need to bring the parents in so they can se...,Jeanne Shaheen,2015-09-01 02:12:00,Q270316,D,need bring parent see manufacturing worked sho...,train
4,2015-10-25-000242,"' It is not now, nor has it ever been, the gol...",Bernie Sanders,2015-10-25 14:12:35,Q359442,D,ever gold standard trade agreement sander said...,test
...,...,...,...,...,...,...,...,...
7090867,2020-02-10-100845,We're just worried about making sure we keep t...,Sherrie Sprenger,2020-02-10 00:00:00,Q7495360,R,worried making sure keep balance expression ri...,train
7090868,2020-03-16-079753,We've all embraced strict proper hygiene proce...,Robert Abrams,2020-03-16 12:00:00,Q2156314,D,embraced strict proper hygiene procedure heard...,train
7090869,2020-01-13-091997,What's important is that we keep moving forward.,Laurie Jinkins,2020-01-13 19:51:15,Q6501617,D,important keep moving forward,train
7090870,2020-02-20-093793,"When they want a bill, they bring the bill to ...",J.T. Wilcox,2020-02-20 22:12:45,Q6104393,R,want bill bring bill floor,train


array([[0.6369649 , 0.3630351 ],
       [0.61755714, 0.38244286],
       [0.42058631, 0.57941369],
       ...,
       [0.33707316, 0.66292684],
       [0.58546035, 0.41453965],
       [0.42206348, 0.57793652]])

In [53]:
# Vectorize every row of df_temp ('train', 'test' and 'none' rows alike) with the fitted vectorizer.
vectorized_quotes = vectorizer.transform(df_temp.quotation_clean.values)

In [54]:
# Per-class probabilities; columns are ordered like clf.classes_.
dem_probas = clf.predict_proba(vectorized_quotes)

In [63]:
# predict_proba columns follow clf.classes_, so column 0 belongs to the first
# (lowest) label.
# NOTE(review): this treats column 0 as the Democrat class — confirm that
# convert_labels maps 'D' to the lower of the two labels.
# The ndim guard keeps the cell idempotent: re-running it on the already
# reduced 1-D array would otherwise raise IndexError.
if dem_probas.ndim == 2:
    dem_probas = dem_probas[:, 0]

In [64]:
# Positional assignment: dem_probas was computed from df_temp, so this relies on
# df_temp_b having the same rows in the same order (it is a copy of df_temp).
df_temp_b['prob_dem'] = dem_probas

In [65]:
# Persist the frame that carries the data_use and prob_dem columns.
save_pickle(df_temp_b, QUOTES_LABELED_CLEANED_PREDICTED_PKL)

In [None]:
# Export the predictions as bz2-compressed JSON Lines as well.
# Write df_temp_b (the frame holding data_use and prob_dem), not `df`: `df` was
# reset to the pre-prediction backup and has no prediction column.
# Text mode ('wt') is used because to_json emits str, not bytes.
with bz2.open(QUOTES_LABELED_CLEANED_PREDICTED, 'wt', encoding='utf-8') as d_file:
    df_temp_b.to_json(d_file, orient='records', lines=True)

In [None]:
def vectorize_with_file(X, vectorizer, path_temp=None):
    """Fit ``vectorizer`` on documents streamed from a scratch text file.

    The documents are written one per line to ``path_temp`` and the open file
    is then handed to ``vectorizer.fit_transform``, which iterates over it
    line by line instead of receiving the in-memory collection.

    Parameters
    ----------
    X : iterable of str
        Documents to vectorize. Any iterable of strings works (list, pandas
        Series, numpy array of any string dtype).
    vectorizer : object
        Anything exposing ``fit_transform(iterable_of_str)``.
    path_temp : str or path-like, optional
        Scratch file to write to. Defaults to the module-level ``TEMP_FILE``.
        The file is overwritten and left on disk afterwards.

    Returns
    -------
    Whatever ``vectorizer.fit_transform`` returns, with one row per document.
    """
    if path_temp is None:
        path_temp = TEMP_FILE

    # Line breaks inside a document are flattened to spaces: the reader below
    # treats every line as one document, so an embedded newline would split a
    # quote into several rows and misalign the result with X.
    with open(path_temp, 'w', encoding='utf-8') as d_file:
        d_file.writelines(' '.join(doc.splitlines()) + '\n' for doc in X)

    # Same explicit encoding for reading, so the round trip does not depend on
    # the platform's default locale.
    with open(path_temp, 'r', encoding='utf-8') as s_file:
        X_vect = vectorizer.fit_transform(s_file)

    return X_vect

In [None]:
# Vectorize by writing to file
# vectorizer=TfidfVectorizer(ngram_range=(1,3))
# X_vect = vectorize_with_file(X_train, vectorizer)