In [1]:
import pandas as pd
import numpy as np
from env import api_key
import requests
import json
from acquire import *
from bs4 import BeautifulSoup
import os
from prepare import *
import matplotlib.pyplot as plt
import re
from nlp_modeling_functions import *
import IPython
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier, plot_tree

#Removes warnings and imporves asthenics
import warnings
warnings.filterwarnings("ignore")

# Acquire

In [2]:
# function calling the acquired df
links = get_links_to_bills()

In [3]:
df = acquire_bills(links, filename="master_df.csv")

In [4]:
df = df.sample(3000)

In [5]:
df = df[(df.party == "D") | (df.party == "R")]

# Prepare

In [6]:
df = prepare_bills_for_processing(df)

In [7]:
# creating a lemmatized column and cleaning the df
df['lem']= df.bill_text.apply(clean_text)

In [8]:
df['model']= df.lem.apply(join)
df.head()

Unnamed: 0,sponsor,party,bill_text,lem,model
586,"Napolitano, Grace F.",D,\n\n\n \n To expand and improve access to ...,"[expand, improve, access, traumainformed, ment...",expand improve access traumainformed mental he...
6041,"Biggs, Andy",R,\n\n\n \n To repeal the National Voter...,"[repeal, national, voter, registration, act, 1...",repeal national voter registration act 1993 en...
22938,"Wild, Susan",D,\n\n\n \nTo amend the Higher Education Act of ...,"[amend, higher, education, act, 1965, establis...",amend higher education act 1965 establish inco...
17903,"Connolly, Gerald E.",D,\n\n\n \n Calling on the United States Gove...,"[calling, united, state, government, uphold, f...",calling united state government uphold foundin...
20111,"Stabenow, Debbie",D,\n\n\n \n Supporting the goals and ideals of S...,"[supporting, goal, ideal, social, work, month,...",supporting goal ideal social work month world ...


In [9]:
# splitting the data
train, X_train, y_train, X_val, y_val, X_test, y_test = split_data(df, 'party')

In [10]:
train.shape

(1666, 5)

# Modeling

In [11]:
X_train.head()

Unnamed: 0,sponsor,bill_text,lem,model
16064,"Reed, Jack",\n\n\n \nTo amend the Public Health Service Ac...,"[amend, public, health, service, act, ensure, ...",amend public health service act ensure provisi...
10890,"Sanchez, Linda T.",\n\n\n \n To amend titles XVIII and XIX of the...,"[amend, title, xviii, xix, social, security, a...",amend title xviii xix social security act proh...
3744,"Murray, Patty","\n\n\n \n To amend the Federal Food, Drug, and...","[amend, federal, food, drug, cosmetic, act, re...",amend federal food drug cosmetic act respect l...
7312,"Lummis, Cynthia M.",\n\n\n \n To prohibit the President from is...,"[prohibit, president, issuing, moratorium, lea...",prohibit president issuing moratorium leasing ...
2494,"Steube, W. Gregory",\n\n\n \nTo amend title 11 of the United State...,"[amend, title, 11, united, state, code, prohib...",amend title 11 united state code prohibit paym...


In [12]:
X_train, X_val, X_test = vectorize_data(X_train, X_val, X_test, target_col = "bill_text")

## Grid Search

In [14]:
params = {'solver': ['lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky', 'sag', 'saga'],
          'max_iter': [100, 500, 1000],
          'penalty': ['l1', 'l2', 'elasticnet', None],
          'n_jobs': [-1],
          'random_state':[1969]}

In [15]:
logit = LogisticRegression()


In [16]:
grid = GridSearchCV(logit, params, cv=3)

In [17]:
grid.fit(X_train, y_train)

Further options are to use another solver or to avoid such situation in the first place. Possible remedies are removing collinear features of X or increasing the penalization strengths.
The original Linear Algebra message was:
Matrix is singular.
Further options are to use another solver or to avoid such situation in the first place. Possible remedies are removing collinear features of X or increasing the penalization strengths.
The original Linear Algebra message was:
Matrix is singular.
Further options are to use another solver or to avoid such situation in the first place. Possible remedies are removing collinear features of X or increasing the penalization strengths.
The original Linear Algebra message was:
Matrix is singular.
Further options are to use another solver or to avoid such situation in the first place. Possible remedies are removing collinear features of X or increasing the penalization strengths.
The original Linear Algebra message was:
Matrix is singular.
Further opti

In [18]:
results = grid.cv_results_
results.keys()

dict_keys(['mean_fit_time', 'std_fit_time', 'mean_score_time', 'std_score_time', 'param_max_iter', 'param_n_jobs', 'param_penalty', 'param_random_state', 'param_solver', 'params', 'split0_test_score', 'split1_test_score', 'split2_test_score', 'mean_test_score', 'std_test_score', 'rank_test_score'])

In [19]:
test_scores = results['mean_test_score']
test_scores

array([       nan, 0.61644738,        nan,        nan,        nan,
       0.61464666, 0.66206602, 0.66206602, 0.66206602, 0.66206602,
       0.66206602, 0.66326722,        nan,        nan,        nan,
              nan,        nan,        nan, 0.6584635 ,        nan,
       0.67047011, 0.6614719 , 0.67827252, 0.68307516,        nan,
       0.61644738,        nan,        nan,        nan, 0.61464666,
       0.66206602, 0.66206602, 0.66206602, 0.66206602, 0.66206602,
       0.66326722,        nan,        nan,        nan,        nan,
              nan,        nan, 0.6584635 ,        nan, 0.67047011,
       0.6614719 , 0.67526951, 0.6764718 ,        nan, 0.61644738,
              nan,        nan,        nan, 0.61464666, 0.66206602,
       0.66206602, 0.66206602, 0.66206602, 0.66206602, 0.66326722,
              nan,        nan,        nan,        nan,        nan,
              nan, 0.6584635 ,        nan, 0.67047011, 0.6614719 ,
       0.67707456, 0.67647072])

In [20]:
params = results['params']
params

[{'max_iter': 100,
  'n_jobs': -1,
  'penalty': 'l1',
  'random_state': 1969,
  'solver': 'lbfgs'},
 {'max_iter': 100,
  'n_jobs': -1,
  'penalty': 'l1',
  'random_state': 1969,
  'solver': 'liblinear'},
 {'max_iter': 100,
  'n_jobs': -1,
  'penalty': 'l1',
  'random_state': 1969,
  'solver': 'newton-cg'},
 {'max_iter': 100,
  'n_jobs': -1,
  'penalty': 'l1',
  'random_state': 1969,
  'solver': 'newton-cholesky'},
 {'max_iter': 100,
  'n_jobs': -1,
  'penalty': 'l1',
  'random_state': 1969,
  'solver': 'sag'},
 {'max_iter': 100,
  'n_jobs': -1,
  'penalty': 'l1',
  'random_state': 1969,
  'solver': 'saga'},
 {'max_iter': 100,
  'n_jobs': -1,
  'penalty': 'l2',
  'random_state': 1969,
  'solver': 'lbfgs'},
 {'max_iter': 100,
  'n_jobs': -1,
  'penalty': 'l2',
  'random_state': 1969,
  'solver': 'liblinear'},
 {'max_iter': 100,
  'n_jobs': -1,
  'penalty': 'l2',
  'random_state': 1969,
  'solver': 'newton-cg'},
 {'max_iter': 100,
  'n_jobs': -1,
  'penalty': 'l2',
  'random_state': 1969,

In [21]:
for p, s in zip(params, test_scores):
    p['score'] = s

pd.DataFrame(params).sort_values(by='score')

Unnamed: 0,max_iter,n_jobs,penalty,random_state,solver,score
5,100,-1,l1,1969,saga,0.614647
29,500,-1,l1,1969,saga,0.614647
53,1000,-1,l1,1969,saga,0.614647
1,100,-1,l1,1969,liblinear,0.616447
25,500,-1,l1,1969,liblinear,0.616447
...,...,...,...,...,...,...
62,1000,-1,elasticnet,1969,newton-cg,
63,1000,-1,elasticnet,1969,newton-cholesky,
64,1000,-1,elasticnet,1969,sag,
65,1000,-1,elasticnet,1969,saga,


## Baseline

In [22]:
len(y_train[y_train == 'D'])/ len(y_train)

0.5966386554621849

## Modeling

In [23]:
lr_mod(X_train, y_train, X_val, y_val, metric = 1, print_scores = True)

Accuracy for Logistic Regression classifier on training set:   0.8523
Accuracy for Logistic Regression classifier on validation set: 0.7003


(0.8523409363745498, 0.7002801120448179)

In [24]:
rand_forest(X_train, y_train, X_val, y_val, metric = 1, print_scores = True)

Accuracy for Random Forest classifier on training set:   0.6753
Accuracy for Random Forest classifier on validation set: 0.6331


(0.6752701080432173, 0.6330532212885154)

In [25]:
dec_tree(X_train, y_train, X_val, y_val, metric = 1, print_scores = True)

Accuracy for Decision Tree classifier on training set:   0.7059
Accuracy for Decision Tree classifier on validation set: 0.6261


(0.7058823529411765, 0.6260504201680672)

In [26]:
IPython.display.Audio("ding.mp3", autoplay=True)