In [1]:
import pandas as pd
import numpy as np
from env import api_key
import requests
import json
from acquire import *
from bs4 import BeautifulSoup
import os
from prepare import *
import matplotlib.pyplot as plt
import re
from nlp_modeling_functions import *
import IPython
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier, plot_tree

# Suppresses warnings to improve the notebook's aesthetics
import warnings
warnings.filterwarnings("ignore")

# Acquire

In [2]:
# Collect the links that acquire_bills() consumes in the next cell.
# NOTE(review): get_links_to_bills presumably comes from the star-imported
# `acquire` module and may hit the network — confirm.
links = get_links_to_bills()

In [3]:
# Build the working DataFrame of bills from the collected links.
# NOTE(review): filename="master_df.csv" looks like a local cache/output
# path — confirm in acquire_bills.
df = acquire_bills(links, filename="master_df.csv")

In [4]:
# Down-sample to keep the later vectorizing / grid search tractable.
# Seeded so a Restart & Run All reproduces the same rows (and therefore the
# same splits and scores); n is capped so a smaller source frame does not raise.
df = df.sample(n=min(3000, len(df)), random_state=42)

In [5]:
# Keep only bills whose sponsor's party is "D" or "R".
df = df[df["party"].isin(["D", "R"])]

# Prepare

In [6]:
# Run the project's preparation step; the result replaces `df`.
# NOTE(review): the exact transformations live in prepare_bills_for_processing
# and are not visible here.
df = prepare_bills_for_processing(df)

In [7]:
# Add a `lem` column: clean_text applied to each bill's raw text
# (the displayed output shows a list of tokens per row).
df["lem"] = df["bill_text"].apply(clean_text)

In [8]:
# Add a `model` column: each row's token list passed through `join`
# (the displayed output shows a single space-separated string per row).
df["model"] = df["lem"].apply(join)
df.head()

Unnamed: 0,sponsor,party,bill_text,lem,model
7599,"Bost, Mike",R,\n\n\n \n To establish an advisory commission ...,"[establish, advisory, commission, regarding, e...",establish advisory commission regarding eligib...
9653,"Sanchez, Linda T.",D,\n\n\n \n Expressing support for designation o...,"[expressing, support, designation, welcome, ho...",expressing support designation welcome home vi...
2320,"Kildee, Daniel T.",D,\n\n\n \nTo direct the Neighborhood Reinvestme...,"[direct, neighborhood, reinvestment, corporati...",direct neighborhood reinvestment corporation r...
22964,"Davidson, Warren",R,\n\n\n \nTo prevent class-based loan forgivene...,"[prevent, classbased, loan, forgiveness, feder...",prevent classbased loan forgiveness federal st...
4500,"Cook, Paul",R,\n\n\n \nTo amend the Bridgeport Indian Colony...,"[amend, bridgeport, indian, colony, land, trus...",amend bridgeport indian colony land trust heal...


In [9]:
# Split the data on the 'party' target: the full train frame, plus
# feature / label pairs for the train, validate and test subsets.
# NOTE(review): split proportions and seeding are decided inside split_data —
# confirm it is seeded so the splits are reproducible.
train, X_train, y_train, X_val, y_val, X_test, y_test = split_data(df, 'party')

In [10]:
# Sanity check: (rows, columns) of the training split.
train.shape

(1669, 5)

# Modeling

In [11]:
# Peek at the training features before they are vectorized.
X_train.head()

Unnamed: 0,sponsor,bill_text,lem,model
6617,"Wittman, Robert J.",\n\n\n \n To improve the provision of health c...,"[improve, provision, health, care, department,...",improve provision health care department veter...
18570,"Green, Mark E.",\n\n\n \n To require educational agencies and ...,"[require, educational, agency, institution, pr...",require educational agency institution provide...
9748,"Sullivan, Dan",\n\n\n \nTo establish the American Fisheries A...,"[establish, american, fishery, advisory, commi...",establish american fishery advisory committee ...
22800,"Whitehouse, Sheldon",\n\n\n \n To establish a process for expedit...,"[establish, process, expedited, consideration,...",establish process expedited consideration legi...
4676,"Guthrie, Brett",\n\n\n \nTo designate certain future interstat...,"[designate, certain, future, interstate, high,...",designate certain future interstate high prior...


In [12]:
# Vectorize the text column of each split; the vectorized results replace the
# original X_* frames.
# NOTE(review): this vectorizes the raw `bill_text` column, while the cleaned
# `lem` / `model` columns built during preparation are not used anywhere else
# in the notebook — confirm whether target_col should be "model".
# NOTE(review): because X_train/X_val/X_test are rebound to the outputs, this
# cell presumably cannot be re-run without re-running the split cell first.
X_train, X_val, X_test = vectorize_data(X_train, X_val, X_test, target_col = "bill_text")

## Grid Search

In [20]:
# Hyper-parameter grids for KNeighborsClassifier, split by search algorithm so
# that every candidate is a combination scikit-learn can actually fit.
#
# Changes from a single crossed grid:
#   * "cosine" is only valid with algorithm="brute", so it lives in the brute grid.
#   * "haversine" is dropped: it is defined for 2-D (lat, lon) data only.
#   * "nan_euclidean" is dropped: brute-only, and equal to "euclidean" when the
#     features contain no missing values.
#   * Aliases are collapsed: "l1"/"cityblock" == "manhattan" and
#     "l2"/"minkowski" (default p=2) == "euclidean".
#   * leaf_size only affects BallTree/KDTree, so the brute grid pins it to the
#     library default instead of multiplying identical candidates.
# Both grids expose the same keys so the results table has no missing columns.
params = [
    {
        "n_neighbors": [1, 3, 5, 7, 9],
        "weights": ["uniform", "distance"],
        "algorithm": ["auto", "ball_tree", "kd_tree"],
        "leaf_size": [10, 20, 30, 50, 100],
        "n_jobs": [-1],
        "metric": ["euclidean", "manhattan"],
    },
    {
        "n_neighbors": [1, 3, 5, 7, 9],
        "weights": ["uniform", "distance"],
        "algorithm": ["brute"],
        "leaf_size": [30],
        "n_jobs": [-1],
        "metric": ["euclidean", "manhattan", "cosine"],
    },
]

In [21]:
# Base estimator for the grid search.
# NOTE: despite the name `rf`, this is a k-nearest-neighbours classifier, not a
# random forest; the name is what the GridSearchCV cell refers to.
rf = KNeighborsClassifier()


In [22]:
# Exhaustive search over `params` for the estimator above, scored with
# 3-fold cross-validation.
grid = GridSearchCV(estimator=rf, param_grid=params, cv=3)

In [None]:
# Fit every candidate in the grid on the training data (3 CV folds each, per
# the cv=3 passed to GridSearchCV). Expect this to be the slow cell.
grid.fit(X_train, y_train)

In [None]:
# Per-candidate results of the search; show which result keys are available.
results = grid.cv_results_
results.keys()

In [None]:
# Mean cross-validated score for each candidate.
test_scores = results['mean_test_score']
test_scores

In [None]:
# The parameter settings of each candidate (one entry per candidate).
# NOTE: this rebinds `params`, which previously held the search grid — re-run
# the grid-definition cell before re-creating the GridSearchCV object.
params = results['params']
params

In [None]:
# One row per candidate: its parameter settings plus its mean CV score,
# best score first.
# The score is attached as a new DataFrame column rather than written into the
# parameter dicts, because those dicts are the same objects stored in
# grid.cv_results_['params'] and should stay valid estimator parameters.
score_df = (
    pd.DataFrame(params)
    .assign(score=test_scores)
    .sort_values(by='score', ascending=False)
)

In [None]:
# Display only the rows with no missing values; the result is not assigned,
# so score_df itself is left unchanged.
score_df.dropna()

In [None]:
# Show every candidate that ties for the highest mean CV score.
score_df.loc[score_df["score"].eq(score_df["score"].max())]

## Baseline

In [None]:
# Baseline accuracy: the share of the most frequent class in the training
# labels, i.e. the score of always predicting the majority party.
# Computed from the data instead of assuming "D" is the majority class.
pd.Series(y_train).value_counts(normalize=True).max()

## Modeling

In [None]:
# Project helper, presumably a logistic-regression model, given the train and
# validate splits; print_scores=True asks it to print its scores.
# NOTE(review): the meaning of metric=1 is defined in the helper module — confirm.
lr_mod(X_train, y_train, X_val, y_val, metric = 1, print_scores = True)

In [None]:
# Project helper, presumably a random-forest model, given the train and
# validate splits; print_scores=True asks it to print its scores.
# NOTE(review): the meaning of metric=1 is defined in the helper module — confirm.
rand_forest(X_train, y_train, X_val, y_val, metric = 1, print_scores = True)

In [None]:
# Project helper, presumably a decision-tree model, given the train and
# validate splits; print_scores=True asks it to print its scores.
# NOTE(review): the meaning of metric=1 is defined in the helper module — confirm.
dec_tree(X_train, y_train, X_val, y_val, metric = 1, print_scores = True)

In [None]:
# Project helper, presumably a k-nearest-neighbours model, given the train and
# validate splits; print_scores=True asks it to print its scores.
# NOTE(review): the meaning of metric=1 is defined in the helper module — confirm.
knn_mod(X_train, y_train, X_val, y_val, metric = 1, print_scores = True)

In [None]:
# Audible "done" signal: auto-plays ding.mp3 (relative path, so the file is
# expected in the notebook's working directory) once this last cell runs.
IPython.display.Audio("ding.mp3", autoplay=True)