In [1]:
import pandas as pd 
import numpy as np 
from sklearn.utils import resample
from sklearn.model_selection import train_test_split

from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
import tqdm
import pprint as pp

from sklearn.metrics import confusion_matrix, classification_report

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import RFECV

In [2]:
df = pd.read_csv("./Data/final_dummies_add.csv")

In [12]:
df.columns

Index(['bill_id', 'title', 'sponsor_title', 'sponsor_name', 'sponsor_state',
       'introduced_date', 'cosponsors', 'cosponsors_by_party', 'committees',
       'summary',
       ...
       'clean_committees_Rules and Administration', 'clean_committees_Science',
       'clean_committees_Science and Technology',
       'clean_committees_Science, Space, and Technology',
       'clean_committees_Small Business',
       'clean_committees_Small Business and Entrepreneurship',
       'clean_committees_Standards of Official Conduct',
       'clean_committees_Transportation and Infrastructure',
       'clean_committees_Ways and Means', 'law_Law'],
      dtype='object', length=123)

In [4]:
x = df.select_dtypes([int, float, bool]).drop("law_Law", axis = 1)
y = df.law_Law

x_train, x_test, y_train, y_test = train_test_split(x, y)

In [27]:
train = x_train.join(y_train)

In [60]:
sample = resample(train[train['law_Law'] == 1], replace = True, n_samples = 50_000, random_state = 13)

In [61]:
train_samples = pd.concat([train, sample])

In [62]:
train_samples['law_Law'].value_counts(normalize = True)

0    0.606378
1    0.393622
Name: law_Law, dtype: float64

In [63]:
x_train = train.drop("law_Law", axis = 1)
y_train = train.law_Law

# Feature Selection

In [64]:
rfe = RFECV(lr, step = 1, cv = 5, n_jobs = 4)

In [65]:
rfe.fit(x_train, y_train)



RFECV(cv=5,
      estimator=LogisticRegression(C=1.0, class_weight=None, dual=False,
                                   fit_intercept=True, intercept_scaling=1,
                                   l1_ratio=None, max_iter=100,
                                   multi_class='warn', n_jobs=None,
                                   penalty='l2', random_state=None,
                                   solver='warn', tol=0.0001, verbose=0,
                                   warm_start=False),
      min_features_to_select=1, n_jobs=4, scoring=None, step=1, verbose=0)

In [66]:
rfe.support_

array([ True,  True,  True,  True,  True,  True, False,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True, False,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True])

In [67]:
rfe.ranking_

array([1, 1, 1, 1, 1, 1, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

In [68]:
arr = rfe.support_
arr = np.where(arr)[0]

In [69]:
x_train_good = x_train.iloc[:, arr]

In [70]:
x_test_good = x_test.iloc[:, arr]

# Logistic Regression

In [71]:
lr = LogisticRegression(penalty="l2")

lr.fit(x_train_good, y_train)

print(lr.score(x_train_good, y_train))

lr.score(x_test_good, y_test)



0.9664711111111111


0.9643377777777777

In [72]:
preds = lr.predict(x_test_good)

In [73]:
#high precision -- low false positives
#high recall -- low false negatives

In [74]:
clm = classification_report(y_test, preds)
pp.pprint(clm)

('              precision    recall  f1-score   support\n'
 '\n'
 '           0       0.97      1.00      0.98     27111\n'
 '           1       0.56      0.05      0.09      1014\n'
 '\n'
 '    accuracy                           0.96     28125\n'
 '   macro avg       0.77      0.52      0.53     28125\n'
 'weighted avg       0.95      0.96      0.95     28125\n')


In [75]:
cm = confusion_matrix(y_test, preds)
cm

array([[27074,    37],
       [  966,    48]])

# Random Forest Model

In [76]:
rf = RandomForestClassifier()

rf.fit(x_train_good, y_train)
print(rf.score(x_train_good, y_train))

rf.score(x_test_good, y_test)



0.9916918518518518


0.9632711111111111

In [77]:
preds = rf.predict(x_test_good)

clm = classification_report(y_test, preds)
pp.pprint(clm)

('              precision    recall  f1-score   support\n'
 '\n'
 '           0       0.97      0.99      0.98     27111\n'
 '           1       0.48      0.20      0.28      1014\n'
 '\n'
 '    accuracy                           0.96     28125\n'
 '   macro avg       0.72      0.60      0.63     28125\n'
 'weighted avg       0.95      0.96      0.96     28125\n')


In [78]:
preds = rf.predict(x_test)

clm = classification_report(y_test, preds)
pp.pprint(clm)

ValueError: Number of features of the model must match the input. Model n_features is 102 and input n_features is 104 

In [None]:
params = 

In [None]:
predict_probas = rf.predict_proba

In [None]:
cm = confusion_matrix(y_test, preds)
cm

In [None]:
plt.xticks(rotation = 45)
sns.countplot(y_test, color = "crimson", alpha = .5)
sns.countplot(preds, color = "slateblue", alpha = .5);
#the model is overpredicting referred to committee. Especially bad at referred to full chamber

In [None]:
probas = rf.predict_proba(x_test)

In [None]:
plt.hist(probas);

In [None]:
probas

In [None]:
probas_df = pd.DataFrame(probas)

In [None]:
probas_df

In [None]:
probas_df[probas_df[0] < .9]

# precision recall curve

In [None]:
thresh = np.arange(0, 1.05, .05)

In [None]:
precisions = []
recalls = []

mu = np.mean(y_test)
bogus = np.random.uniform(0, 1, size=probas.shape[0])

for t in thresh:
    predict_pos = probas[:,1]>=t
    #predict_pos = bogus >= t
    trues = ((predict_pos == 1) & (y_test == 1)).sum()
    falses = ((predict_pos == 1) & (y_test == 0)).sum()
    false_neg = ((predict_pos == 0) & (y_test == 1)).sum()
    precision = trues/(trues + falses)
    precisions.append(precision)
    recall = trues / (trues + false_neg)
    recalls.append(recall)
    
    print(f"{t} -- precision: {precision}, recall:{recall}")

In [None]:
plt.plot([0, 1], [1, 0])
plt.plot(recalls, precisions);

In [None]:
arr = probas_df[(probas_df[0] < .9) & (probas_df[0] > 0)].index

In [None]:
#looking at uncertain ones

In [None]:
x_test.iloc[arr]

# Adaboost model

In [79]:
ada = AdaBoostClassifier()

ada.fit(x_train, y_train)

print(ada.score(x_train, y_train))

ada.score(x_test, y_test)

0.9650133333333333


0.9632355555555555

In [80]:
preds = ada.predict(x_test)

In [81]:
clm = classification_report(y_test, preds)
pp.pprint(clm)

('              precision    recall  f1-score   support\n'
 '\n'
 '           0       0.97      1.00      0.98     27111\n'
 '           1       0.43      0.06      0.11      1014\n'
 '\n'
 '    accuracy                           0.96     28125\n'
 '   macro avg       0.70      0.53      0.55     28125\n'
 'weighted avg       0.95      0.96      0.95     28125\n')
