In [None]:
from helpers import *

import sys

import bz2
import json

import pickle

import numpy as np
# import scipy

import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC

%load_ext autoreload
%autoreload 2

## Training our model on all our dataset

Now that we have established the best level of preprocessing and chosen
a vectorizer and model, we can train it on all the filtered Quotebank data (about 6 million quotes).
We will use an 80:20 train:test split.

From now on we only use our best text
cleaning/preprocessing (option E), which, as mentioned before, is our most thorough
version of cleaning and includes lemmatization.

We start by loading and preparing the data like we did for the preprocessed file
that contained all the text preprocessing variants.

In [None]:
# Resolve the location of the labelled + cleaned quotes and read them as
# JSON Lines (one record per line).
# NOTE(review): fixpath and QUOTES_LABELED_CLEANED come from `helpers`; their exact semantics are not visible here.
path = fixpath(QUOTES_LABELED_CLEANED)
df_raw = pd.read_json(path, orient='records', lines=True)
# Display the raw frame for a quick sanity check.
df_raw

Unnamed: 0,quoteID,quotation,speaker,date,id,party_label,quotation_clean
0,2015-08-31-000271,... a great day for veterans here in Littleton...,Jeanne Shaheen,2015-08-31 02:10:00,Q270316,D,great day veteran littleton across north count...
1,2015-02-03-074650,The safety and security of our nation is at ri...,Jeanne Shaheen,2015-02-03 20:02:24,Q270316,D,safety security nation risk hold funding homel...
2,2015-05-10-027625,It's been my experience that the faith communi...,Jeanne Shaheen,2015-05-10 22:48:46,Q270316,D,experience faith community willing partner gov...
3,2015-09-01-113454,we need to bring the parents in so they can se...,Jeanne Shaheen,2015-09-01 02:12:00,Q270316,D,need bring parent see manufacturing worked sho...
4,2015-10-25-000242,"' It is not now, nor has it ever been, the gol...",Bernie Sanders,2015-10-25 14:12:35,Q359442,D,ever gold standard trade agreement sander said...
...,...,...,...,...,...,...,...
7090867,2020-02-10-100845,We're just worried about making sure we keep t...,Sherrie Sprenger,2020-02-10 00:00:00,Q7495360,R,worried making sure keep balance expression ri...
7090868,2020-03-16-079753,We've all embraced strict proper hygiene proce...,Robert Abrams,2020-03-16 12:00:00,Q2156314,D,embraced strict proper hygiene procedure heard...
7090869,2020-01-13-091997,What's important is that we keep moving forward.,Laurie Jinkins,2020-01-13 19:51:15,Q6501617,D,important keep moving forward
7090870,2020-02-20-093793,"When they want a bill, they bring the bill to ...",J.T. Wilcox,2020-02-20 22:12:45,Q6104393,R,want bill bring bill floor


We drop the quotes of speakers labelled with both parties (`RD`) and, as before, we drop short quotes.

In [None]:
# Work on a copy so that df_raw stays untouched.
df = df_raw.copy()

# Drop quotes of people in both parties, i.e. rows labelled 'RD'
# (the most popular members were labelled manually and carry a single label).
is_single_party = df['party_label'].ne('RD')
df = df.loc[is_single_party]

# Remove short quotes.
# NOTE(review): the meaning of the 0.1 argument is defined in helpers.drop_short_quotes — confirm there.
df = drop_short_quotes(df, 0.1)
df

Unnamed: 0,quoteID,quotation,speaker,date,id,party_label,quotation_clean
0,2015-08-31-000271,... a great day for veterans here in Littleton...,Jeanne Shaheen,2015-08-31 02:10:00,Q270316,D,great day veteran littleton across north count...
1,2015-02-03-074650,The safety and security of our nation is at ri...,Jeanne Shaheen,2015-02-03 20:02:24,Q270316,D,safety security nation risk hold funding homel...
2,2015-05-10-027625,It's been my experience that the faith communi...,Jeanne Shaheen,2015-05-10 22:48:46,Q270316,D,experience faith community willing partner gov...
3,2015-09-01-113454,we need to bring the parents in so they can se...,Jeanne Shaheen,2015-09-01 02:12:00,Q270316,D,need bring parent see manufacturing worked sho...
4,2015-10-25-000242,"' It is not now, nor has it ever been, the gol...",Bernie Sanders,2015-10-25 14:12:35,Q359442,D,ever gold standard trade agreement sander said...
...,...,...,...,...,...,...,...
7090867,2020-02-10-100845,We're just worried about making sure we keep t...,Sherrie Sprenger,2020-02-10 00:00:00,Q7495360,R,worried making sure keep balance expression ri...
7090868,2020-03-16-079753,We've all embraced strict proper hygiene proce...,Robert Abrams,2020-03-16 12:00:00,Q2156314,D,embraced strict proper hygiene procedure heard...
7090869,2020-01-13-091997,What's important is that we keep moving forward.,Laurie Jinkins,2020-01-13 19:51:15,Q6501617,D,important keep moving forward
7090870,2020-02-20-093793,"When they want a bill, they bring the bill to ...",J.T. Wilcox,2020-02-20 22:12:45,Q6104393,R,want bill bring bill floor


In [None]:
# Class balance before rebalancing.
df['party_label'].value_counts()

R    3310474
D    2896896
Name: party_label, dtype: int64

Rebalance the data by downsampling

In [None]:
# Equalise the number of quotes per party label.
# NOTE(review): downsample comes from `helpers`; whether it samples with a fixed seed is not visible here.
df = downsample(df, 'party_label')

In [None]:
# Class balance after downsampling; both labels should now have the same count.
df['party_label'].value_counts()

D    2896896
R    2896896
Name: party_label, dtype: int64

In [None]:
# Keep a backup of the balanced frame before the labels are converted;
# it is copied back into `df` later, before predictions are added.
df_bcp = df.copy()

## Model Training

In [None]:
# Leftover experiment switches (disabled).
# NOTE(review): `df_filt` is not defined in the visible cells, so the next two
# lines would fail if re-enabled as written.
# df_mini = df_filt.sample(100000)
# df_mini = df_filt.sample(frac=1)

# Optional shuffle of the full frame (disabled).
# df = df.sample(frac=1)

# Show the balanced frame that goes into training.
df

Unnamed: 0,quoteID,quotation,speaker,date,id,party_label,quotation_clean
0,2016-08-12-027018,I am disappointed the governor chose to veto t...,Andy Manar,2016-08-12 22:18:00,Q4760984,D,disappointed governor chose veto good importan...
1,2019-08-26-003606,Although we are all working hard to address th...,Phil Murphy,2019-08-26 13:32:27,Q260042,D,although working hard address near term priori...
2,2019-10-03-073537,overturn the limited version of collective bar...,Elizabeth Warren,2019-10-03 14:53:33,Q434706,D,overturn limited version collective bargaining
3,2015-03-02-073426,"there is nothing gloomy about this announcement,",Barbara Mikulski,2015-03-02 19:10:26,Q261147,D,nothing gloomy announcement
4,2019-11-19-023104,gives that candidate the ability to say that t...,Yvanna Cancela,2019-11-19 20:34:00,Q28595299,D,give candidate ability say vetted someone figh...
...,...,...,...,...,...,...,...
5793787,2019-05-24-072159,Our model says that an across-the-board 25 per...,Donald Trump,2019-05-24 00:53:04,Q22686,R,model say across board percent tariff china li...
5793788,2015-09-30-141287,work to make a difference for the people of Ea...,Cathy McMorris Rodgers,2015-09-30 23:49:46,Q293343,R,work make difference people eastern washington
5793789,2015-10-30-090828,That is an insult to every woman and man who p...,Kelly Ayotte,2015-10-30 20:09:33,Q22354,R,insult every woman man put life line serve cou...
5793790,2018-01-26-128626,We are not down and out yet. We'll make a stro...,Cathrynn Brown,2018-01-26 22:47:54,Q16200773,R,yet make strong case funding need granted reve...


In [None]:
# Replace the string party labels by numeric class labels
# (in the displayed frame 'D' rows become 1 and 'R' rows become -1).
df['party_label'] = convert_labels(df['party_label'])

In [None]:
# Check that party_label now holds the numeric labels.
df

Unnamed: 0,quoteID,quotation,speaker,date,id,party_label,quotation_clean
0,2016-08-12-027018,I am disappointed the governor chose to veto t...,Andy Manar,2016-08-12 22:18:00,Q4760984,1,disappointed governor chose veto good importan...
1,2019-08-26-003606,Although we are all working hard to address th...,Phil Murphy,2019-08-26 13:32:27,Q260042,1,although working hard address near term priori...
2,2019-10-03-073537,overturn the limited version of collective bar...,Elizabeth Warren,2019-10-03 14:53:33,Q434706,1,overturn limited version collective bargaining
3,2015-03-02-073426,"there is nothing gloomy about this announcement,",Barbara Mikulski,2015-03-02 19:10:26,Q261147,1,nothing gloomy announcement
4,2019-11-19-023104,gives that candidate the ability to say that t...,Yvanna Cancela,2019-11-19 20:34:00,Q28595299,1,give candidate ability say vetted someone figh...
...,...,...,...,...,...,...,...
5793787,2019-05-24-072159,Our model says that an across-the-board 25 per...,Donald Trump,2019-05-24 00:53:04,Q22686,-1,model say across board percent tariff china li...
5793788,2015-09-30-141287,work to make a difference for the people of Ea...,Cathy McMorris Rodgers,2015-09-30 23:49:46,Q293343,-1,work make difference people eastern washington
5793789,2015-10-30-090828,That is an insult to every woman and man who p...,Kelly Ayotte,2015-10-30 20:09:33,Q22354,-1,insult every woman man put life line serve cou...
5793790,2018-01-26-128626,We are not down and out yet. We'll make a stro...,Cathrynn Brown,2018-01-26 22:47:54,Q16200773,-1,yet make strong case funding need granted reve...


In [None]:
train_fraction = 0.8
# One seed for both sampling steps so the whole split, including row order,
# is reproducible on a fresh kernel.
split_seed = 1

# Draw a random 80% of the entire set for training.
df_train = df.sample(frac=train_fraction, random_state=split_seed)
# The left-out portion of the dataset becomes the test set.
# Membership is decided by index label, so this relies on the index of `df`
# being unique — TODO confirm that downsample() resets the index.
df_test = df.loc[~df.index.isin(df_train.index)]
# Shuffle the test rows; seeded, because an unseeded sample(frac=1) yields a
# different row order on every run.
df_test = df_test.sample(frac=1, random_state=split_seed)

df_train.shape, df_test.shape

((4635034, 7), (1158758, 7))

In [None]:
# Features are the cleaned quote texts, targets are the numeric party labels.
X_train, y_train = df_train['quotation_clean'].values, df_train['party_label'].values
X_test, y_test = df_test['quotation_clean'].values, df_test['party_label'].values

In [None]:
# Class balance inside the training split.
df_train['party_label'].value_counts()

-1    2317978
 1    2317056
Name: party_label, dtype: int64

In [None]:
# Class balance inside the test split.
df_test['party_label'].value_counts()

 1    579840
-1    578918
Name: party_label, dtype: int64

In [None]:
# Vectorize in memory: TF-IDF over 1- to 3-grams, capped at 5 million features.
# (Earlier variant: TfidfVectorizer(ngram_range=(1,2)))
tfidf_options = {'ngram_range': (1, 3), 'max_features': 5_000_000}
vectorizer = TfidfVectorizer(**tfidf_options)
# Fit on the training texts only and transform them in the same pass.
X_train_vect = vectorizer.fit_transform(X_train)

In [None]:
# Transform only (no re-fit): the test texts are encoded with the vectorizer
# that was fitted on the training texts.
X_test_vect = vectorizer.transform(X_test)

In [None]:
# Show shape and sparsity of both feature matrices.
X_train_vect, X_test_vect

(<4635034x5000000 sparse matrix of type '<class 'numpy.float64'>'
 	with 110596736 stored elements in Compressed Sparse Row format>,
 <1158758x5000000 sparse matrix of type '<class 'numpy.float64'>'
 	with 26397333 stored elements in Compressed Sparse Row format>)

In [None]:
# Train a multinomial naive Bayes classifier with default settings;
# fit() returns the estimator itself, so it can be bound in one step.
clf = MultinomialNB().fit(X_train_vect, y_train)
clf

MultinomialNB()

Now we'll test the performance of our model!

In [None]:
# Naive Bayes score on the data it was trained on (optimistic reference value).
clf.score(X_train_vect, y_train)

0.8128345984085553

In [None]:

# Naive Bayes score on the held-out test split.
clf.score(X_test_vect, y_test)

0.7128313245733794

In [None]:
# Train a linear SVM with default settings on the same features for comparison;
# fit() returns the estimator itself, so it can be bound in one step.
clf2 = LinearSVC().fit(X_train_vect, y_train)
clf2

LinearSVC()

In [None]:
# LinearSVC score on the data it was trained on (optimistic reference value).
clf2.score(X_train_vect, y_train)

0.9252085313721539

In [None]:
# LinearSVC score on the held-out test split; compare with the training score
# above to judge how much the model overfits.
clf2.score(X_test_vect, y_test)

0.7163868555815796

In [None]:
# Persist the LinearSVC classifier (clf2) as model variant "C".
# NOTE(review): saving the matching vectorizer is disabled in this cell, so the
# pickled classifier is only usable together with a vectorizer saved elsewhere.
# save_pickle(vectorizer, MODELS_FOLDER + 'vectorizer_B.pkl')
save_pickle(clf2, MODELS_FOLDER + 'classifier_C.pkl')

Models:
- A. TfidfVectorizer(ngram_range=(1,2), max_features=1_000_000) - LinearSVC() - 
- B. 
    

---

We will save our model (vectorizer and classifier) now using pickle to be able to use it without having
to rerun this whole notebook which takes a while.

In [None]:
# Persist the fitted TF-IDF vectorizer (1- to 3-grams).
save_pickle(vectorizer, VECTORIZER_NGRAM13_V2)

In [None]:
# Persist the MultinomialNB classifier (clf), not the LinearSVC (clf2).
save_pickle(clf, MODEL_MULTINOMIALNB_NGRAM13_V2)

To save more time later, we also run predictions on all our data now and store them
in a data frame, which we save as a pickle too.

In [None]:
# find how to label the df by test or train

In [None]:
# Restore the balanced frame from the backup taken before label conversion,
# so party_label holds the original 'D'/'R' strings again.
df = df_bcp.copy()

Unnamed: 0,quoteID,quotation,speaker,date,id,party_label,quotation_clean
0,2016-08-12-027018,I am disappointed the governor chose to veto t...,Andy Manar,2016-08-12 22:18:00,Q4760984,D,disappointed governor chose veto good importan...
1,2019-08-26-003606,Although we are all working hard to address th...,Phil Murphy,2019-08-26 13:32:27,Q260042,D,although working hard address near term priori...
2,2019-10-03-073537,overturn the limited version of collective bar...,Elizabeth Warren,2019-10-03 14:53:33,Q434706,D,overturn limited version collective bargaining
3,2015-03-02-073426,"there is nothing gloomy about this announcement,",Barbara Mikulski,2015-03-02 19:10:26,Q261147,D,nothing gloomy announcement
4,2019-11-19-023104,gives that candidate the ability to say that t...,Yvanna Cancela,2019-11-19 20:34:00,Q28595299,D,give candidate ability say vetted someone figh...
...,...,...,...,...,...,...,...
5793787,2019-05-24-072159,Our model says that an across-the-board 25 per...,Donald Trump,2019-05-24 00:53:04,Q22686,R,model say across board percent tariff china li...
5793788,2015-09-30-141287,work to make a difference for the people of Ea...,Cathy McMorris Rodgers,2015-09-30 23:49:46,Q293343,R,work make difference people eastern washington
5793789,2015-10-30-090828,That is an insult to every woman and man who p...,Kelly Ayotte,2015-10-30 20:09:33,Q22354,R,insult every woman man put life line serve cou...
5793790,2018-01-26-128626,We are not down and out yet. We'll make a stro...,Cathrynn Brown,2018-01-26 22:47:54,Q16200773,R,yet make strong case funding need granted reve...


In [None]:
# Flag every row that was drawn into the training split (matched on index labels).
# Assigning the boolean mask in one step gives a proper bool column; filling the
# column through two partial .loc writes first creates it with missing values,
# which can leave it with object dtype.
df['is_train'] = df.index.isin(df_train.index)

In [None]:
# Check the new is_train column.
df

Unnamed: 0,quoteID,quotation,speaker,date,id,party_label,quotation_clean,is_train
0,2016-08-12-027018,I am disappointed the governor chose to veto t...,Andy Manar,2016-08-12 22:18:00,Q4760984,D,disappointed governor chose veto good importan...,True
1,2019-08-26-003606,Although we are all working hard to address th...,Phil Murphy,2019-08-26 13:32:27,Q260042,D,although working hard address near term priori...,False
2,2019-10-03-073537,overturn the limited version of collective bar...,Elizabeth Warren,2019-10-03 14:53:33,Q434706,D,overturn limited version collective bargaining,True
3,2015-03-02-073426,"there is nothing gloomy about this announcement,",Barbara Mikulski,2015-03-02 19:10:26,Q261147,D,nothing gloomy announcement,True
4,2019-11-19-023104,gives that candidate the ability to say that t...,Yvanna Cancela,2019-11-19 20:34:00,Q28595299,D,give candidate ability say vetted someone figh...,False
...,...,...,...,...,...,...,...,...
5793787,2019-05-24-072159,Our model says that an across-the-board 25 per...,Donald Trump,2019-05-24 00:53:04,Q22686,R,model say across board percent tariff china li...,True
5793788,2015-09-30-141287,work to make a difference for the people of Ea...,Cathy McMorris Rodgers,2015-09-30 23:49:46,Q293343,R,work make difference people eastern washington,True
5793789,2015-10-30-090828,That is an insult to every woman and man who p...,Kelly Ayotte,2015-10-30 20:09:33,Q22354,R,insult every woman man put life line serve cou...,False
5793790,2018-01-26-128626,We are not down and out yet. We'll make a stro...,Cathrynn Brown,2018-01-26 22:47:54,Q16200773,R,yet make strong case funding need granted reve...,True


In [None]:
# Persist the frame together with the is_train flag so the split can be recovered later.
save_pickle(df, MODELS_FOLDER + 'train_test_data.pkl')

In [None]:
# Encode every quote (train and test rows alike) with the fitted vectorizer.
vectorized_quotes = vectorizer.transform(df.quotation_clean.values)

In [None]:
# Class probabilities from the naive Bayes model for every quote.
# NOTE(review): despite the name, this array has one column per class (two here),
# not only the Democrat probability.
dem_probas = clf.predict_proba(vectorized_quotes)

In [None]:
# Number of prediction rows; should equal the number of rows in df.
len(dem_probas)

5793792

In [None]:
# predict_proba returns one column per class, ordered like clf.classes_, so the
# 2-D array cannot be assigned to a single column (this raised the ValueError).
# Pick the column of the Democrat class, which is encoded as label 1.
dem_col = list(clf.classes_).index(1)
df['prob_dem'] = dem_probas[:, dem_col]

ValueError: Wrong number of items passed 2, placement implies 1

In [None]:
# Persist the frame (intended to carry the prob_dem predictions) as a pickle.
save_pickle(df, QUOTES_LABELED_CLEANED_PREDICTED_PKL)

In [None]:
# Also export the frame as bz2-compressed JSON Lines.
# The handle is opened in text mode because DataFrame.to_json emits str; pandas
# versions that write straight to the given handle fail on a binary ('wb') one.
with bz2.open(QUOTES_LABELED_CLEANED_PREDICTED, 'wt', encoding='utf-8') as d_file:
    df.to_json(d_file, orient='records', lines=True)

In [None]:
def vectorize_with_file(X, vectorizer, path_temp=None):
    """Fit ``vectorizer`` on the documents in ``X`` by streaming them through a file.

    The documents are written to a temporary text file, one per line, and the
    open file is then handed to ``vectorizer.fit_transform`` so that it iterates
    over the lines instead of an in-memory sequence.

    Parameters
    ----------
    X : iterable of str
        The documents; any iterable works (list, numpy array, pandas Series).
    vectorizer : object
        Anything exposing ``fit_transform(iterable_of_str)``.
    path_temp : str or path-like, optional
        Location of the intermediate file. Defaults to the module-level
        ``TEMP_FILE`` when omitted.

    Returns
    -------
    Whatever ``vectorizer.fit_transform`` returns, with one row per document.
    """
    if path_temp is None:
        path_temp = TEMP_FILE

    # Explicit encoding keeps the round trip independent of the platform default.
    with open(path_temp, 'w', encoding='utf-8') as d_file:
        # Line breaks inside a document are replaced by spaces: the file is read
        # back line by line, so an embedded break would split one document into
        # several rows and misalign them with the labels.
        d_file.writelines(
            doc.replace('\r', ' ').replace('\n', ' ') + '\n' for doc in X
        )

    with open(path_temp, 'r', encoding='utf-8') as s_file:
        X_vect = vectorizer.fit_transform(s_file)

    return X_vect

In [None]:
# Vectorize by writing to file
# vectorizer=TfidfVectorizer(ngram_range=(1,3))
# X_vect = vectorize_with_file(X_train, vectorizer)