In [2]:
import psycopg2
from sqlalchemy import create_engine

import pandas as pd
import numpy as np
import string

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import log_loss, roc_curve, roc_auc_score
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

import nltk
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer

from create_staging_tables import create_loyalty_scores_df

import seaborn as sns
import matplotlib.pyplot as plt
% matplotlib inline

In [3]:
# SQLAlchemy engine for the local staging database; the read_sql_query calls
# in this notebook all go through it.
staging_db_url = 'postgresql://localhost:5432/wa_leg_staging'
engine = create_engine(staging_db_url)

In [15]:
def _read_query(query):
    '''Run one SQL query through the shared engine and return a DataFrame.'''
    return pd.read_sql_query(query, con=engine)


# Tables that are passed to create_clean_train.
merged_final_df = _read_query('select * from "merged_final"')
bill_text_df = _read_query('select * from "bill_text"')
rep_score_df = _read_query('select * from "rep_score"')
loyalty_df = _read_query('select * from "loyalty"')

# Tables that are passed to create_clean_test (loyalty_df is reused there).
current_df = _read_query('select * from "current_clean"')
current_bill_text = _read_query('select * from "current_bill_text"')

In [12]:
def create_clean_train(merged_final_df, bill_text_df, rep_score_df, loyalty_df):
    '''Build the training frame by joining bill text, rep scores and loyalty
    scores onto merged_final_df and adding engineered features.

    All joins are left joins, so every row of merged_final_df is kept. The
    inputs are not modified.

    Parameters
    ----------
    merged_final_df : DataFrame
        Base table. Columns read here: unique_id, htm_url, sponsor_party,
        bill_id, biennium, voter_id, secondary_sponsors.
    bill_text_df : DataFrame
        Joined on (unique_id, htm_url); must supply bill_text.
    rep_score_df : DataFrame
        Joined on bill_num_unique ("<biennium> <bill number>"); must supply
        rep_score.
    loyalty_df : DataFrame
        Joined on voter_id.

    Returns
    -------
    DataFrame
        The joined frame with sponsor_party renamed to primary_sponsor_party
        and these columns added: bill_num, bill_num_unique, rep_score
        (with -1 replaced), dem_score, bill_length, congress_num, num_sponsors.
    '''

    # Feature Engineering functions
    def create_congress_nums(biennium):
        '''Numbers bienniums so that 1991 is 1 and 1993 is 3.'''
        first_year = int(biennium.split('-')[0])
        return first_year - 1990

    def find_num_sponsors(secondary_sponsors):
        '''Estimate the sponsor count from the secondary_sponsors string.

        Non-string values (missing sponsors) count as 1, i.e. the primary
        sponsor only.
        '''
        if isinstance(secondary_sponsors, str):
            # NOTE(review): presumably each secondary sponsor takes about five
            # characters in the string; confirm against the source data.
            return round((len(secondary_sponsors) / 5) + 1)
        return 1

    def text_length(bill_text):
        '''Length of the bill text; 0 when the text is missing.'''
        # The left merge leaves NaN for bills without text, and len(NaN)
        # raises TypeError.
        return len(bill_text) if isinstance(bill_text, str) else 0

    # Add bill_text
    clean = merged_final_df.merge(bill_text_df, how='left', on=['unique_id', 'htm_url'])
    # index_x / index_y only appear when both inputs carry an 'index' column,
    # so their absence is not an error.
    clean = clean.drop(['index_x', 'index_y'], axis=1, errors='ignore')
    clean = clean.rename(columns={'sponsor_party': 'primary_sponsor_party'})

    # Add rep_score and replace -1 values, then make dem_score
    clean['bill_num'] = clean['bill_id'].apply(lambda bill_id: bill_id.split()[1])
    clean['bill_num_unique'] = clean['biennium'] + ' ' + clean['bill_num']
    clean = clean.merge(rep_score_df, how='left', on='bill_num_unique')

    # A rep_score of -1 falls back to the primary sponsor's party. This is
    # vectorised because a row-wise DataFrame.apply is very slow on large frames.
    clean['rep_score'] = clean['rep_score'].mask(
        clean['rep_score'] == -1, clean['primary_sponsor_party'])
    clean['dem_score'] = 1 - clean['rep_score']

    # Add loyalty scores
    clean = clean.merge(loyalty_df, how='left', on='voter_id')
    clean = clean.drop(['index_x', 'index_y'], axis=1, errors='ignore')

    # Feature Engineering
    clean['bill_length'] = clean['bill_text'].apply(text_length)
    clean['congress_num'] = clean['biennium'].apply(create_congress_nums)
    clean['num_sponsors'] = clean['secondary_sponsors'].apply(find_num_sponsors)

    return clean

In [34]:
def create_clean_test(current_df, current_bill_text_df, loyalty_df):
    '''Build the prediction frame by joining bill text and loyalty scores
    onto current_df and adding engineered features.

    Both joins are left joins, so every row of current_df is kept. The inputs
    are not modified.

    Parameters
    ----------
    current_df : DataFrame
        Base table. Columns read here: bill_id, biennium, htm_url,
        sponsor_party, rep_score, voter_id and, when present,
        secondary_sponsors / num_sponsors.
    current_bill_text_df : DataFrame
        Joined on (bill_id, biennium, htm_url); must supply bill_text.
    loyalty_df : DataFrame
        Joined on voter_id.

    Returns
    -------
    DataFrame
        The joined frame with sponsor_party renamed to primary_sponsor_party
        and dem_score, bill_length and congress_num added. num_sponsors is
        added when it is missing and secondary_sponsors is available.
    '''

    # Feature Engineering functions
    def create_congress_nums(biennium):
        '''Numbers bienniums so that 1991 is 1 and 1993 is 3.'''
        first_year = int(biennium.split('-')[0])
        return first_year - 1990

    def find_num_sponsors(secondary_sponsors):
        '''Estimate the sponsor count from the secondary_sponsors string.

        Non-string values (missing sponsors) count as 1, i.e. the primary
        sponsor only.
        '''
        if isinstance(secondary_sponsors, str):
            # NOTE(review): presumably each secondary sponsor takes about five
            # characters in the string; confirm against the source data.
            return round((len(secondary_sponsors) / 5) + 1)
        return 1

    def text_length(bill_text):
        '''Length of the bill text; 0 when the text is missing.'''
        # The left merge leaves NaN for bills without text, and len(NaN)
        # raises TypeError.
        return len(bill_text) if isinstance(bill_text, str) else 0

    # Add bill_text
    clean = current_df.merge(current_bill_text_df, how='left',
                             on=['bill_id', 'biennium', 'htm_url'])
    clean = clean.rename(columns={'sponsor_party': 'primary_sponsor_party'})

    # Make dem_score
    clean['dem_score'] = 1 - clean['rep_score']

    # Add loyalty scores
    clean = clean.merge(loyalty_df, how='left', on='voter_id')
    # 'index' is only present when one of the inputs carries it.
    clean = clean.drop('index', axis=1, errors='ignore')

    # Feature Engineering
    clean['bill_length'] = clean['bill_text'].apply(text_length)
    clean['congress_num'] = clean['biennium'].apply(create_congress_nums)
    # The model's feature list selects num_sponsors from this frame, and
    # create_clean_train derives it the same way. An existing num_sponsors
    # column is never overwritten.
    if 'num_sponsors' not in clean.columns and 'secondary_sponsors' in clean.columns:
        clean['num_sponsors'] = clean['secondary_sponsors'].apply(find_num_sponsors)

    return clean

In [13]:
# Assemble the training frame from the four staging tables loaded above.
train_clean = create_clean_train(
    merged_final_df=merged_final_df,
    bill_text_df=bill_text_df,
    rep_score_df=rep_score_df,
    loyalty_df=loyalty_df,
)

In [36]:
# Assemble the prediction frame from the current-session tables.
test_clean = create_clean_test(
    current_df=current_df,
    current_bill_text_df=current_bill_text,
    loyalty_df=loyalty_df,
)

### Current Top Model

In [37]:
# One column list for both matrices, so the train and test features cannot
# drift apart.
feature_cols = [
    'voter_id', 'voting_agency', 'sponsor_agency', 'district',
    'party', 'is_primary_sponsor', 'is_secondary_sponsor', 'is_minority_party',
    'primary_sponsor_party', 'rep_score', 'loyalty_score', 'bill_length', 'bill_num',
    'num_sponsors',
]

# TRAIN
X_train_t = train_clean[feature_cols]
y_train_t = train_clean['vote']

# TEST
X_test_t = test_clean[feature_cols]

# No target is built for the test frame:
# y_test_t = sample_test['vote']

In [39]:
top_model = RandomForestClassifier(n_estimators=1000, max_depth=7, n_jobs=2, random_state=709)
top_model.fit(X_train_t, y_train_t)
y_pred_t = (top_model.predict_proba(X_test_t))[:, 1]
% time

CPU times: user 4 µs, sys: 1 µs, total: 5 µs
Wall time: 11.9 µs


In [40]:
# Sanity check: smallest predicted probability across all test rows.
np.min(y_pred_t)

0.4304161936612869

In [41]:
# Attach the predicted probabilities to the rows they were computed from.
test_clean['predicted_vote'] = pd.Series(y_pred_t, index=test_clean.index)

In [44]:
# Identifying columns for each prediction, plus the predicted probability.
# district, first_name, last_name and party are not selected here; they are
# brought in by the legislator merge further down.
label_cols = ['bill_id', 'bill_num', 'biennium', 'voter_id', 'voting_agency', 'predicted_vote']
label_df = test_clean.loc[:, label_cols]

In [46]:
# Legislator lookup table, used below to attach names, district and party.
legislator_df = pd.read_sql_query(sql='select * from "legislator"', con=engine)

In [49]:
def change_agency_to_int(agency):
    '''Encode a chamber name as an integer: 'House' -> 0, 'Senate' -> 1.

    Values that are already 0 or 1 are returned as that integer, so applying
    the function twice to the same column (e.g. re-running the cell) does not
    wipe it. Any other value returns None.
    '''
    agency_codes = {'House': 0, 'Senate': 1}
    if isinstance(agency, str):
        return agency_codes.get(agency)
    if agency in (0, 1):
        return int(agency)
    return None

In [50]:
# Encode the chamber names as integers (House -> 0, Senate -> 1) so the column
# can be joined against voting_agency.
legislator_df['agency'] = legislator_df['agency'].map(change_agency_to_int)

In [51]:
# Align the key names with label_df so the two frames can be merged on them.
key_renames = {'id': 'voter_id', 'agency': 'voting_agency'}
legislator_df = legislator_df.rename(columns=key_renames)

In [53]:
# Attach legislator details to each prediction.
# NOTE(review): the displayed result repeats some (bill, voter, agency) rows
# with a different party value, which suggests legislator_df holds more than
# one row per (voter_id, voting_agency). Confirm and de-duplicate before
# relying on row counts.
pd.merge(label_df, legislator_df, how='left', on=['voter_id', 'voting_agency'])

Unnamed: 0,bill_id,bill_num,biennium,voter_id,voting_agency,predicted_vote,district,first_name,party,last_name
0,SHB 2264,2264,2017-18,3478,1,0.864147,30,Mark,0,Miloscia
1,SHB 2264,2264,2017-18,3478,1,0.864147,30,Mark,1,Miloscia
2,SHB 2264,2264,2017-18,17226,1,0.921304,5,Mark,0,Mullet
3,SHB 2264,2264,2017-18,13161,0,0.944950,34,Sharon,0,Nelson
4,SHB 2264,2264,2017-18,13161,1,0.936559,34,Sharon,0,Nelson
5,SHB 2264,2264,2017-18,12002,0,0.944867,43,Jamie,0,Pedersen
6,SHB 2264,2264,2017-18,12002,1,0.937289,43,Jamie,0,Pedersen
7,SHB 2264,2264,2017-18,14074,1,0.936222,40,Kevin,0,Ranker
8,SHB 2264,2264,2017-18,15814,0,0.817996,18,Ann,1,Rivers
9,SHB 2264,2264,2017-18,15814,1,0.835347,18,Ann,1,Rivers
