In [1]:
from helpers import *

import sys

import bz2
import json

import numpy as np
import scipy

import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.neural_network import MLPClassifier

from sklearn.model_selection import cross_val_score, cross_validate

%load_ext autoreload
%autoreload 2

In [2]:
# XGBoost

from xgboost import XGBClassifier

# This code prevents the kernel from stopping when XGBoost is running
import os
os.environ['KMP_DUPLICATE_LIB_OK']='True'

In [3]:
# Printing without trunctions
# np.set_printoptions(threshold=sys.maxsize)
pd.set_option('display.max_colwidth', None)

In [4]:
path = fixpath(QUOTES_2020_LABELED_CLEANED_VARIANTS_MINI)
# path = fixpath(QUOTES_2020_LABELED_CLEANED_VARIANTS)

df = pd.read_json(path, orient='records', lines=True)

In [5]:
df['party_label'].value_counts()

D    41657
R    24835
Name: party_label, dtype: int64

In [6]:
def downsample(df:pd.DataFrame, label_col_name:str) -> pd.DataFrame:
    # find the number of observations in the smallest group
    nmin = df[label_col_name].value_counts().min()
    return (df
            # split the dataframe per group
            .groupby(label_col_name)
            # sample nmin observations from each group
            .apply(lambda x: x.sample(nmin))
            # recombine the dataframes
            .reset_index(drop=True)
            )

df = downsample(df, 'party_label')

In [7]:
df['party_label'].value_counts()

D    24835
R    24835
Name: party_label, dtype: int64

### Example usage of sklean Model

In [8]:
X = TfidfVectorizer().fit_transform(df['quotation_cleanD'].values)
y = df['party_label'].values
y = np.array([0 if label=='R' else 1 for label in y])
y, X

(array([1, 1, 1, ..., 0, 0, 0]),
 <49670x15997 sparse matrix of type '<class 'numpy.float64'>'
 	with 525853 stored elements in Compressed Sparse Row format>)

In [9]:
clf = MultinomialNB()
# clf.fit(X, y)
scores = cross_val_score(clf, X, y, cv=5)

print(f'Avg: {scores.mean():.3f}\tStd: {scores.std():.3f}')

Avg: 0.632	Std: 0.005


In [10]:
clf = MultinomialNB()

scoring=['accuracy', 'precision', 'recall', 'f1']
res = cross_validate(clf, X, y, scoring=scoring, cv=3)
{item: (value.mean(), value.std()) for (item, value) in res.items()}

{'fit_time': (0.01331639289855957, 0.003579976189041066),
 'score_time': (0.017519235610961914, 0.0014197159291590676),
 'test_accuracy': (0.628649125403162, 0.0021289266412349043),
 'test_precision': (0.6291958377123803, 0.0016518327517330285),
 'test_recall': (0.626575915218493, 0.00990880867503464),
 'test_f1': (0.6278394641953976, 0.004840235052109225)}

### Appling model to quotation variants

In [11]:
# import sklearn
# sorted(sklearn.metrics.SCORERS.keys())

In [12]:
cols = [
    'quotation_cleanA', 
    'quotation_cleanB',
    'quotation_cleanC',
    'quotation_cleanD',
    # 'quotation_cleanE'
    ]

def test_classifer(clf, break_after_one_iter=False):
    for col in cols:
        X = TfidfVectorizer().fit_transform(df[col].values)
        y = df['party_label'].values
        y = np.array([0.0 if label=='R' else 1.0 for label in y])

        # scores = cross_val_score(clf, X, y, cv=3)
        # print(f'Col: {col}\tAvg: {scores.mean():.3f}\tStd: {scores.std():.3f}')

        # scoring=['accuracy']
        # scoring=['f1']
        scoring=['accuracy', 'precision', 'recall', 'f1']
        res = cross_validate(clf, X, y, scoring=scoring, cv=3)
        res = {item: (value.mean(), value.std()) for (item, value) in res.items()}

        # Code isn't pretty but prints nice output!
        print(f'Col: {col}')
        print(f'\tfit_time - \t\tavg:{res["fit_time"][0]:.3f}\tstd:{res["fit_time"][1]:.3f}')
        print(f'\tscore_time - \t\tavg:{res["score_time"][0]:.3f}\tstd:{res["score_time"][1]:.3f}')
        print(f'\ttest_accuracy - \tavg:{res["test_accuracy"][0]:.3f}\tstd:{res["test_accuracy"][1]:.3f}')
        print(f'\ttest_precision - \tavg:{res["test_precision"][0]:.3f}\tstd:{res["test_precision"][1]:.3f}')
        print(f'\ttest_recall - \t\tavg:{res["test_recall"][0]:.3f}\tstd:{res["test_recall"][1]:.3f}')
        print(f'\ttest_f1 - \t\tavg:{res["test_f1"][0]:.3f}\tstd:{res["test_f1"][1]:.3f}')
        
        if break_after_one_iter:
            break

        
    # break

In [13]:
clf = MultinomialNB()
test_classifer(clf)

Col: quotation_cleanA
	fit_time - 		avg:0.013	std:0.000
	score_time - 		avg:0.025	std:0.000
	test_accuracy - 	avg:0.633	std:0.003
	test_precision - 	avg:0.628	std:0.001
	test_recall - 		avg:0.649	std:0.011
	test_f1 - 		avg:0.638	std:0.006
Col: quotation_cleanB
	fit_time - 		avg:0.018	std:0.004
	score_time - 		avg:0.033	std:0.006
	test_accuracy - 	avg:0.633	std:0.003
	test_precision - 	avg:0.628	std:0.001
	test_recall - 		avg:0.649	std:0.011
	test_f1 - 		avg:0.638	std:0.006
Col: quotation_cleanC
	fit_time - 		avg:0.011	std:0.001
	score_time - 		avg:0.024	std:0.001
	test_accuracy - 	avg:0.630	std:0.003
	test_precision - 	avg:0.628	std:0.001
	test_recall - 		avg:0.637	std:0.010
	test_f1 - 		avg:0.632	std:0.005
Col: quotation_cleanD
	fit_time - 		avg:0.011	std:0.000
	score_time - 		avg:0.025	std:0.001
	test_accuracy - 	avg:0.629	std:0.002
	test_precision - 	avg:0.629	std:0.002
	test_recall - 		avg:0.627	std:0.010
	test_f1 - 		avg:0.628	std:0.005


In [122]:
clf = LogisticRegression(n_jobs=-1, C=1e5)
test_classifer(clf)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Col: quotation_cleanA)
            	fit_time - 		avg:2.257	std:0.650
            	score_time - 		avg:0.033	std:0.006
            	test_accuracy - 	avg:0.603	std:0.003
            	test_precision - 	avg:0.603	std:0.003
            	test_recall - 		avg:0.603	std:0.004
            	test_f1 - 		avg:0.603	std:0.003            


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Col: quotation_cleanB)
            	fit_time - 		avg:1.220	std:0.203
            	score_time - 		avg:0.029	std:0.003
            	test_accuracy - 	avg:0.602	std:0.003
            	test_precision - 	avg:0.602	std:0.004
            	test_recall - 		avg:0.603	std:0.004
            	test_f1 - 		avg:0.602	std:0.003            


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Col: quotation_cleanC)
            	fit_time - 		avg:0.971	std:0.101
            	score_time - 		avg:0.037	std:0.004
            	test_accuracy - 	avg:0.598	std:0.004
            	test_precision - 	avg:0.598	std:0.004
            	test_recall - 		avg:0.597	std:0.009
            	test_f1 - 		avg:0.598	std:0.005            


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Col: quotation_cleanD)
            	fit_time - 		avg:0.932	std:0.227
            	score_time - 		avg:0.027	std:0.001
            	test_accuracy - 	avg:0.608	std:0.003
            	test_precision - 	avg:0.607	std:0.003
            	test_recall - 		avg:0.612	std:0.003
            	test_f1 - 		avg:0.609	std:0.001            


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [81]:
clf = RandomForestClassifier(n_estimators=40, n_jobs=-1)
test_classifer(clf)

Col: quotation_cleanA
            	fit_time - 		avg:10.727	std:0.406
            	score_time - 		avg:0.158	std:0.002
            	test_accuracy - 	avg:0.603	std:0.004
            	test_precision - 	avg:0.618	std:0.002
            	test_recall - 		avg:0.541	std:0.012
            	test_f1 - 		avg:0.577	std:0.008            
Col: quotation_cleanB
            	fit_time - 		avg:11.434	std:0.082
            	score_time - 		avg:0.159	std:0.002
            	test_accuracy - 	avg:0.601	std:0.006
            	test_precision - 	avg:0.614	std:0.007
            	test_recall - 		avg:0.544	std:0.006
            	test_f1 - 		avg:0.577	std:0.006            
Col: quotation_cleanC
            	fit_time - 		avg:15.625	std:0.095
            	score_time - 		avg:0.284	std:0.005
            	test_accuracy - 	avg:0.610	std:0.000
            	test_precision - 	avg:0.615	std:0.001
            	test_recall - 		avg:0.588	std:0.005
            	test_f1 - 		avg:0.601	std:0.002            
Col: quotation_cleanD
      

In [83]:
clf.fit(X[100:], y[100:])
clf.predict_proba(X[0:5])

array([[0.225     , 0.775     ],
       [0.46659091, 0.53340909],
       [0.625     , 0.375     ],
       [0.625     , 0.375     ],
       [0.375     , 0.625     ]])

In [102]:
clf = GradientBoostingClassifier(n_estimators=10, learning_rate=1.0,
    max_depth=1, random_state=0)

test_classifer(clf)

Col: quotation_cleanA
            	fit_time - 		avg:4.219	std:0.073
            	score_time - 		avg:0.028	std:0.000
            	test_accuracy - 	avg:0.522	std:0.002
            	test_precision - 	avg:0.676	std:0.021
            	test_recall - 		avg:0.085	std:0.013
            	test_f1 - 		avg:0.150	std:0.020            
Col: quotation_cleanB
            	fit_time - 		avg:4.117	std:0.066
            	score_time - 		avg:0.027	std:0.001
            	test_accuracy - 	avg:0.522	std:0.002
            	test_precision - 	avg:0.676	std:0.021
            	test_recall - 		avg:0.085	std:0.013
            	test_f1 - 		avg:0.150	std:0.020            
Col: quotation_cleanC
            	fit_time - 		avg:4.073	std:0.304
            	score_time - 		avg:0.023	std:0.002
            	test_accuracy - 	avg:0.519	std:0.001
            	test_precision - 	avg:0.688	std:0.071
            	test_recall - 		avg:0.085	std:0.038
            	test_f1 - 		avg:0.147	std:0.056            
Col: quotation_cleanD
         

In [14]:
clf = MLPClassifier(alpha=1e-5, hidden_layer_sizes=(5, 2), random_state=1)

test_classifer(clf, break_after_one_iter=True)

Col: quotation_cleanA
	fit_time - 		avg:75.513	std:7.367
	score_time - 		avg:0.032	std:0.002
	test_accuracy - 	avg:0.595	std:0.006
	test_precision - 	avg:0.605	std:0.006
	test_recall - 		avg:0.546	std:0.007
	test_f1 - 		avg:0.574	std:0.006


In [None]:
clf = SGDClassifier(loss="hinge", penalty="l2", max_iter=5)
test_classifer(clf, break_after_one_iter=True)

In [12]:
# clf = XGBClassifier()
# test_classifer(clf)

NICKY's CELLS

In [91]:
nicky_clf = MultinomialNB()
nicky_X = X.copy()
nicky_y = y.copy()
nicky_scoring=['accuracy', 'precision', 'recall', 'f1']
nicky_res = cross_validate(clf, nicky_X, nicky_y, scoring=nicky_scoring, cv=3)
{item: (value.mean(), value.std()) for (item, value) in nicky_res.items()}

{'fit_time': (0.012681325276692709, 0.001947405681681884),
 'score_time': (0.020110925038655598, 0.0017169714300468797),
 'test_accuracy': (0.6276826330008158, 0.002578911298261747),
 'test_precision': (0.627421680742947, 0.0028552820459956225),
 'test_recall': (0.6287499254744705, 0.005024174382339343),
 'test_f1': (0.6280728267951473, 0.0029866956070025387)}

In [96]:
nicky_clf.fit(nicky_X,nicky_y)


array([1, 1, 0, ..., 1, 1, 0])

In [97]:
y_pred = nicky_clf.predict_proba(nicky_X)
y_pred

array([[0.36970839, 0.63029161],
       [0.49420464, 0.50579536],
       [0.56957207, 0.43042793],
       ...,
       [0.49779887, 0.50220113],
       [0.33527857, 0.66472143],
       [0.62765024, 0.37234976]])

In [None]:
df.columns

(49670, 2)

In [103]:
nicky_df = df.copy()
nicky_df['pred'] = y_pred[:,0]
nicky_df

Unnamed: 0,quoteID,quotation,speaker,date,numOccurrences,id,party_label,US_congress_bio_ID,quotation_cleanA,quotation_cleanB,quotation_cleanC,quotation_cleanD,pred
0,2020-04-14-003330,And I praised him for his actions.,andrew cuomo,2020-04-14 20:57:21,1,Q11673,D,,And I praised him for his actions.,and i praised him for his actions,praised actions,prais action,0.369708
1,2020-02-17-079523,"We loved his optimism,",tina kotek,2020-02-17 02:52:08,1,Q1452454,D,,"We loved his optimism,",we loved his optimism,loved optimism,love optim,0.494205
2,2020-01-13-034733,"I'm part of the process and they're part of the process,",michael madigan,2020-01-13 23:28:22,1,Q961894,D,,"I'm part of the process and they're part of the process,",i m part of the process and they re part of the process,part process part process,part process part process,0.569572
3,2020-01-30-118952,"We're looking at all of our parliamentary options to try to force as many votes as we can to put Republicans on the record,",chris murphy,2020-01-30 23:20:17,3,Q1077594,D,M001169,"We're looking at all of our parliamentary options to try to force as many votes as we can to put Republicans on the record,",we re looking at all of our parliamentary options to try to force as many votes as we can to put republicans on the record,looking parliamentary options try force many votes put republicans record,look parliamentari option tri forc mani vote put republican record,0.516648
4,2020-01-17-041823,"In a unique idiosyncracy of San Diego Election Law, we have another public fundraising deadline this Friday for the first 18 days of January.... Look I know these fundraising emails can sometimes seem tedious, but I really do need help. Media outlets and pundits are going to be looking at this fundraising number to try and make predictions about the state of the mayoral race, so I cannot let my opponents raise more online grassroots donations than our campaign ahead of this deadline.",todd gloria,2020-01-17 07:12:40,1,Q7812385,D,,"In a unique idiosyncracy of San Diego Election Law, we have another public fundraising deadline this Friday for the first days of January.... Look I know these fundraising emails can sometimes seem tedious, but I really do need help. Media outlets and pundits are going to be looking at this fundraising number to try and make predictions about the state of the mayoral race, so I cannot let my opponents raise more online grassroots donations than our campaign ahead of this deadline.",in a unique idiosyncracy of san diego election law we have another public fundraising deadline this friday for the first days of january look i know these fundraising emails can sometimes seem tedious but i really do need help media outlets and pundits are going to be looking at this fundraising number to try and make predictions about the state of the mayoral race so i cannot let my opponents raise more online grassroots donations than our campaign ahead of this deadline,unique idiosyncracy san diego election law another public fundraising deadline friday first days january look know fundraising emails sometimes seem tedious really need help media outlets pundits going looking fundraising number try make predictions state mayoral race cannot let opponents raise online grassroots donations campaign ahead deadline,uniqu idiosyncraci san diego elect law anoth public fundrais deadlin friday first day januari look know fundrais email sometim seem tedious realli need help media outlet pundit go look fundrais number tri make predict state mayor race cannot let oppon rais onlin grassroot donat campaign ahead deadlin,0.196959
...,...,...,...,...,...,...,...,...,...,...,...,...,...
49665,2020-01-05-033758,the new policy tools are effective.,ben bernanke,2020-01-05 00:00:00,13,Q201795,R,,the new policy tools are effective.,the new policy tools are effective,new policy tools effective,new polici tool effect,0.511507
49666,2020-03-20-007821,"Because of the corona crisis, those kids have been sent home,",arnold schwarzenegger,2020-03-20 13:28:14,1,Q2685,R,,"Because of the corona crisis, those kids have been sent home,",because of the corona crisis those kids have been sent home,corona crisis kids sent home,corona crisi kid sent home,0.547786
49667,2020-03-06-022434,"I have a message for Joe Biden and Beto O’Rourke, if you want to take everyone's AR-15s in America, why don't you swing by my office in Washington, D.C., and start with this one?",ken buck,2020-03-06 16:30:00,10,Q1439421,R,B001297,"I have a message for Joe Biden and Beto O'Rourke, if you want to take everyone's AR-15s in America, why don't you swing by my office in Washington, D.C., and start with this one?",i have a message for joe biden and beto o'rourke if you want to take everyone s ar 15s in america why don t you swing by my office in washington d c and start with this one,message joe biden beto 'rourke want take everyone ar 15s america swing office washington c start one,messag joe biden beto rourk want take everyon ar 15s america swing offic washington c start one,0.497799
49668,2020-02-26-041628,"It's another avenue to highlight this issue, this rural, urban divide and how the city [ of Chicago ] continues to drive legislation down and how it impacts the rest of us,",brad halbrook,2020-02-26 11:09:33,1,Q4953971,R,,"It's another avenue to highlight this issue, this rural, urban divide and how the city [ of Chicago ] continues to drive legislation down and how it impacts the rest of us,",it s another avenue to highlight this issue this rural urban divide and how the city of chicago continues to drive legislation down and how it impacts the rest of us,another avenue highlight issue rural urban divide city chicago continues drive legislation impacts rest us,anoth avenu highlight issu rural urban divid citi chicago continu drive legisl impact rest us,0.335279


In [107]:
nicky_df['rounded_pred'] = np.argmax(y_pred,axis = 1)

In [111]:
import plotly.express as px
fig = px.scatter(nicky_df.sample(1000), x = 'party_label', y = 'pred', color = 'rounded_pred' ,hover_data= ['party_label','speaker','quotation'])
fig.show()

In [None]:
def get_result_df(regressor,filename):
    
    results = []
    
    with pd.read_json(filename, lines=True, compression='bz2', chunksize=500) as df_reader:
        for df in df_reader:
            df = df.sample(50)
            x = df['glove_embedding'].values
            y = df[['party_number']].values[:,0]
            x = np.array(list(map(lambda val: np.array(val),x))).reshape(x.shape[0],-1)

            ypred = regressor.predict(x)

            tmp_df = pd.DataFrame(data = {'party_number': df['party_number'].values, \
                                        'prediction': ypred, 'quotation': df['quotation'].values, 'speaker': df['speaker'].values} )
            
            
            results.append(tmp_df.copy())
        
            result_df = pd.concat(results)
                
            result_df['rounded_pred'] = result_df['prediction'].apply(lambda x: 0 if x <= 0.5 else 1)
    display(result_df.head(1))
    
    fig = px.scatter(result_df, x = 'party_number', y = 'prediction', color = 'rounded_pred' ,hover_data= ['party_number','speaker','quotation'])
    fig.show()