In [1]:
import psycopg2
from sqlalchemy import create_engine

import pandas as pd
import numpy as np
import string

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import log_loss, roc_curve, roc_auc_score
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

import nltk
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer

from create_staging_tables import create_loyalty_scores_df
from edit_current_legislator import filter_out_duplicates_from_current_leg

import seaborn as sns
import matplotlib.pyplot as plt
% matplotlib inline

In [2]:
# Engine for the local `wa_leg_staging` Postgres database; the table loads below read through it.
STAGING_DB_URL = 'postgresql://localhost:5432/wa_leg_staging'
engine = create_engine(STAGING_DB_URL)

In [3]:
def _read_table(table_name):
    '''Load every row of `table_name` through the staging `engine` into a DataFrame.'''
    return pd.read_sql_query('select * from "{}"'.format(table_name), con=engine)


# Tables passed to create_clean_train
merged_final_df = _read_table('merged_final')
bill_text_df = _read_table('bill_text')
rep_score_df = _read_table('rep_score')
loyalty_df = _read_table('loyalty')

# `current_*` tables: inputs to create_clean_test and to the label merge further down
current_df = _read_table('current_clean')
current_bill_text = _read_table('current_bill_text')
current_legislator_df = _read_table('current_legislator')

In [4]:
# Number of rows loaded from the `current_clean` table.
len(current_df)

445116

In [5]:
def create_clean_train(merged_final_df, bill_text_df, rep_score_df, loyalty_df):
    '''Join the training tables and engineer the model features.

    Parameters
    ----------
    merged_final_df : pandas.DataFrame
        Base table. Needs `unique_id`, `htm_url`, `index`, `bill_id`,
        `biennium`, `sponsor_party`, `voter_id` and `secondary_sponsors`.
    bill_text_df : pandas.DataFrame
        Needs `unique_id`, `htm_url`, `index` and `bill_text`.
    rep_score_df : pandas.DataFrame
        Needs `bill_num_unique`, `rep_score` and `index`.
    loyalty_df : pandas.DataFrame
        Needs `voter_id` and `index`, plus whatever loyalty columns it carries.

    Returns
    -------
    pandas.DataFrame
        A new frame (the inputs are not modified) that additionally holds
        `primary_sponsor_party` (renamed from `sponsor_party`), `bill_num`,
        `bill_num_unique`, `rep_score`, `dem_score`, `bill_length`,
        `congress_num` and `num_sponsors`.

    Raises
    ------
    KeyError
        If the `index_x` / `index_y` columns expected after a merge are absent.
    '''

    # Feature Engineering functions
    def create_congress_nums(biennium):
        '''Numbers bienniums so that 1991 is 1 and 1993 is 3.'''
        first_year = int(biennium.split('-')[0])
        return first_year - 1990

    def find_num_sponsors(secondary_sponsors):
        '''Estimate the sponsor count from the length of the secondary-sponsor string.

        NOTE(review): the divisor 5 presumably reflects the width of one
        sponsor entry in the string -- confirm against the source data.
        Non-string values (no secondary sponsors recorded) count as 1 sponsor.
        '''
        if isinstance(secondary_sponsors, str):
            return round((len(secondary_sponsors) / 5) + 1)
        return 1

    # Add bill_text
    clean = merged_final_df.merge(bill_text_df, how='left', on=['unique_id', 'htm_url'])
    # Both inputs carry an `index` column, which the merge suffixes to index_x / index_y.
    clean = clean.drop(['index_x', 'index_y'], axis=1)
    clean = clean.rename(columns={'sponsor_party': 'primary_sponsor_party'})

    # Add rep_score and replace -1 values, then make dem_score
    # bill_num is the second whitespace-separated token of bill_id (e.g. 'ESHB 2299' -> '2299').
    clean['bill_num'] = clean['bill_id'].apply(lambda x: x.split()[1])
    clean['bill_num_unique'] = clean['biennium'] + ' ' + clean['bill_num']
    clean = clean.merge(rep_score_df, how='left', on='bill_num_unique')

    # Where rep_score is -1, fall back to the primary sponsor's party.
    # Vectorised equivalent of a row-wise apply; other values (including NaN) are kept.
    clean['rep_score'] = clean['rep_score'].where(
        clean['rep_score'] != -1, clean['primary_sponsor_party']
    )
    clean['dem_score'] = 1 - clean['rep_score']

    # Add loyalty scores
    clean = clean.merge(loyalty_df, how='left', on='voter_id')
    # `index` from rep_score_df collides with `index` from loyalty_df here.
    clean = clean.drop(['index_x', 'index_y'], axis=1)

    # Feature Engineering
    # The left merge leaves bill_text missing for rows without a matching text;
    # count those as length 0 instead of raising on len(NaN).
    clean['bill_length'] = clean['bill_text'].str.len().fillna(0).astype(int)
    clean['congress_num'] = clean['biennium'].apply(create_congress_nums)
    clean['num_sponsors'] = clean['secondary_sponsors'].apply(find_num_sponsors)

    return clean

In [6]:
def create_clean_test(current_df, current_bill_text_df, loyalty_df):
    '''Join the `current_*` tables and engineer the features used at prediction time.

    Unlike the training builder, this does not derive `rep_score`, `bill_num`
    or `num_sponsors`; `rep_score` is read directly from `current_df`.

    Parameters
    ----------
    current_df : pandas.DataFrame
        Base table. Needs `bill_id`, `biennium`, `htm_url`, `sponsor_party`,
        `rep_score` and `voter_id`.
    current_bill_text_df : pandas.DataFrame
        Needs `bill_id`, `biennium`, `htm_url` and `bill_text`.
    loyalty_df : pandas.DataFrame
        Needs `voter_id` and `index`, plus whatever loyalty columns it carries.

    Returns
    -------
    pandas.DataFrame
        A new frame (the inputs are not modified) that additionally holds
        `primary_sponsor_party` (renamed from `sponsor_party`), `dem_score`,
        `bill_length` and `congress_num`.

    Raises
    ------
    KeyError
        If no `index` column is present after the loyalty merge.
    '''

    # Feature Engineering function
    def create_congress_nums(biennium):
        '''Numbers bienniums so that 1991 is 1 and 1993 is 3.'''
        first_year = int(biennium.split('-')[0])
        return first_year - 1990

    # Add bill_text
    clean = current_df.merge(current_bill_text_df, how='left', on=['bill_id', 'biennium', 'htm_url'])
    clean = clean.rename(columns={'sponsor_party': 'primary_sponsor_party'})

    # Make dem_score
    clean['dem_score'] = 1 - clean['rep_score']

    # Add loyalty scores
    clean = clean.merge(loyalty_df, how='left', on='voter_id')
    # Only loyalty_df carries an `index` column here, so it arrives unsuffixed.
    clean = clean.drop('index', axis=1)

    # Feature Engineering
    # The left merge leaves bill_text missing for rows without a matching text;
    # count those as length 0 instead of raising on len(NaN).
    clean['bill_length'] = clean['bill_text'].str.len().fillna(0).astype(int)
    clean['congress_num'] = clean['biennium'].apply(create_congress_nums)

    return clean

In [7]:
# Build the training frame from the four tables loaded above.
train_clean = create_clean_train(merged_final_df, bill_text_df, rep_score_df, loyalty_df)

In [8]:
# Build the prediction frame from the `current_*` tables plus the loyalty scores.
test_clean = create_clean_test(current_df, current_bill_text, loyalty_df)

In [9]:
# Flag every row with the project helper, then keep only the rows it marks True.
test_clean['current'] = test_clean.apply(filter_out_duplicates_from_current_leg, axis=1)
# .copy() makes the filtered result an independent frame, so the later
# `test_clean['predicted_vote'] = ...` assignment does not trigger pandas'
# SettingWithCopyWarning on a boolean-filtered slice.
test_clean = test_clean[test_clean['current'] == True].copy()

In [10]:
# Spot-check one row: bill number '2299' for the legislator with first name 'Michelle'.
is_bill_2299 = test_clean['bill_num'] == '2299'
is_michelle = test_clean['first_name'] == 'Michelle'
test_clean[is_bill_2299 & is_michelle]

Unnamed: 0,biennium,bill_id,bill_num,bill_unique,current,district,first_name,htm_create_date,htm_url,last_name,...,num_sponsors,is_minority_party,is_secondary_sponsor,primary_sponsor_party,rep_score,bill_text,dem_score,loyalty_score,bill_length,congress_num
149954,2017-18,ESHB 2299,2299,2017-18 ESHB 2299,True,26,Michelle,2018-02-24T18:02:44.91,http://app.leg.wa.gov/documents/billdocs/2017-...,Caldier,...,1,1,False,0,0.0,ENGROSSED SUBSTITUTE HOUSE BILL 2299State of W...,1.0,-0.142184,909286,27


In [11]:
# Sanity check: list any rows of test_clean whose voting_agency is null.
missing_agency = test_clean['voting_agency'].isnull()
test_clean.loc[missing_agency, 'voting_agency']

Series([], Name: voting_agency, dtype: int64)

In [12]:
# Same null check on the raw current_df, before any merging or filtering.
missing_agency_raw = current_df['voting_agency'].isnull()
current_df.loc[missing_agency_raw, 'voting_agency']

Series([], Name: voting_agency, dtype: int64)

### Current Top Model

In [13]:
# TRAIN
X_train_t = train_clean[['voter_id', 'voting_agency', 'sponsor_agency', 'district', 
                     'party', 'is_primary_sponsor', 'is_secondary_sponsor', 'is_minority_party', 
                     'primary_sponsor_party', 'rep_score', 'loyalty_score', 'bill_length', 'bill_num', 
                     'num_sponsors']]

y_train_t = train_clean['vote']

# TEST
X_test_t = test_clean[['voter_id', 'voting_agency', 'sponsor_agency', 'district', 
                     'party', 'is_primary_sponsor', 'is_secondary_sponsor', 'is_minority_party', 
                     'primary_sponsor_party', 'rep_score', 'loyalty_score', 'bill_length', 'bill_num', 
                     'num_sponsors']]

# y_test_t = sample_test['vote']

In [14]:
top_model = RandomForestClassifier(n_estimators=1000, max_depth=7, n_jobs=2, random_state=709)
top_model.fit(X_train_t, y_train_t)
y_pred_t = (top_model.predict_proba(X_test_t))[:, 1]
% time

CPU times: user 3 µs, sys: 1e+03 ns, total: 4 µs
Wall time: 23.1 µs


In [15]:
# Smallest predicted probability across all rows of X_test_t.
y_pred_t.min()

0.4304161936612869

In [16]:
# Attach the predictions as a new column. y_pred_t was computed from X_test_t,
# a column subset of test_clean, so it is in the same row order.
test_clean['predicted_vote'] = y_pred_t

In [17]:
# Columns kept for the label table.
LABEL_COLUMNS = [
    'bill_id', 'bill_num', 'biennium', 'voter_id', 'voting_agency',
    'predicted_vote', 'rep_score', 'htm_url', 'secondary_sponsors',
]
label_df = test_clean[LABEL_COLUMNS]

In [18]:
# Left-join the legislator columns onto each prediction row.
legislator_keys = ['voter_id', 'voting_agency']
label_df = pd.merge(label_df, current_legislator_df, how='left', on=legislator_keys)

In [19]:
# Drop rows with a null last_name; that column comes from the left merge with
# current_legislator_df, so unmatched rows have it null.
label_df = label_df.dropna(subset=['last_name'])

In [21]:
# Inspect the labelled predictions for bill number '2299'.
label_df[label_df['bill_num'] == '2299']

Unnamed: 0,bill_id,bill_num,biennium,voter_id,voting_agency,predicted_vote,rep_score,htm_url,secondary_sponsors,district,first_name,party,last_name,current
149949,ESHB 2299,2299,2017-18,10031,0,0.497933,0.0,http://app.leg.wa.gov/documents/billdocs/2017-...,,23,Sherry,0,Appleton,True
149950,ESHB 2299,2299,2017-18,24075,0,0.704095,0.0,http://app.leg.wa.gov/documents/billdocs/2017-...,,2,Andrew,1,Barkis,True
149951,ESHB 2299,2299,2017-18,17227,0,0.453527,0.0,http://app.leg.wa.gov/documents/billdocs/2017-...,,11,Steve,0,Bergquist,True
149952,ESHB 2299,2299,2017-18,8317,0,0.502746,0.0,http://app.leg.wa.gov/documents/billdocs/2017-...,,19,Brian,0,Blake,True
149953,ESHB 2299,2299,2017-18,15820,0,0.697767,0.0,http://app.leg.wa.gov/documents/billdocs/2017-...,,42,Vincent,1,Buys,True
149954,ESHB 2299,2299,2017-18,20760,0,0.719471,0.0,http://app.leg.wa.gov/documents/billdocs/2017-...,,26,Michelle,1,Caldier,True
149955,ESHB 2299,2299,2017-18,3469,0,0.695055,0.0,http://app.leg.wa.gov/documents/billdocs/2017-...,,15,Bruce,1,Chandler,True
149956,ESHB 2299,2299,2017-18,26176,0,0.431744,0.0,http://app.leg.wa.gov/documents/billdocs/2017-...,,24,Mike,0,Chapman,True
149957,ESHB 2299,2299,2017-18,1659,0,0.502858,0.0,http://app.leg.wa.gov/documents/billdocs/2017-...,,43,Frank,0,Chopp,True
149958,ESHB 2299,2299,2017-18,8209,0,0.499192,0.0,http://app.leg.wa.gov/documents/billdocs/2017-...,,41,Judy,0,Clibborn,True


In [22]:
# Engine for the local `wa_leg_label` Postgres database, where the predictions are written.
LABEL_DB_URL = 'postgresql://localhost:5432/wa_leg_label'
engine_label = create_engine(LABEL_DB_URL)

In [23]:
# Open a connection to the label database for the write below.
# NOTE(review): this connection is never closed anywhere in the visible notebook.
con = engine_label.connect()

In [24]:
# Write label_df to the `label` table, replacing the table if it already exists;
# the DataFrame index is not written.
label_df.to_sql('label', con, if_exists='replace', index=False)

In [None]:
# Number of rows in label_df.
len(label_df)

In [None]:
# NOTE(review): this rebinds `engine` (the staging engine until now) to the label
# database. The read that follows uses `engine_label`, so this cell looks
# redundant -- confirm before running it.
engine = create_engine('postgresql://localhost:5432/wa_leg_label')

In [25]:
# Read the `label` table back from the label database (presumably a round-trip check of the write).
labels = pd.read_sql_query('select * from "label"',con=engine_label)

In [26]:
# Take the rows for bill number '8408' as an example bill.
label_bill_example = labels[labels['bill_num'] == '8408']

In [27]:
# Order the example rows by legislator last name (ascending).
sorted_label = label_bill_example.sort_values('last_name')

In [29]:
# Display the sorted example rows.
sorted_label

Unnamed: 0,bill_id,bill_num,biennium,voter_id,voting_agency,predicted_vote,rep_score,htm_url,secondary_sponsors,district,first_name,party,last_name,current
445070,SCR 8408,8408,2017-18,14208,1,0.878938,1.0,http://app.leg.wa.gov/documents/billdocs/2017-...,,26,Jan,1,Angel,True
444978,SCR 8408,8408,2017-18,10031,0,0.883090,1.0,http://app.leg.wa.gov/documents/billdocs/2017-...,,23,Sherry,0,Appleton,True
445071,SCR 8408,8408,2017-18,8238,1,0.878891,1.0,http://app.leg.wa.gov/documents/billdocs/2017-...,,10,Barbara,1,Bailey,True
444979,SCR 8408,8408,2017-18,24075,0,0.877445,1.0,http://app.leg.wa.gov/documents/billdocs/2017-...,,2,Andrew,1,Barkis,True
445072,SCR 8408,8408,2017-18,15780,1,0.880005,1.0,http://app.leg.wa.gov/documents/billdocs/2017-...,,6,Michael,1,Baumgartner,True
445073,SCR 8408,8408,2017-18,14083,1,0.876378,1.0,http://app.leg.wa.gov/documents/billdocs/2017-...,,2,Randi,1,Becker,True
444980,SCR 8408,8408,2017-18,17227,0,0.880274,1.0,http://app.leg.wa.gov/documents/billdocs/2017-...,,11,Steve,0,Bergquist,True
445074,SCR 8408,8408,2017-18,15811,1,0.836074,1.0,http://app.leg.wa.gov/documents/billdocs/2017-...,,3,Andy,0,Billig,True
444981,SCR 8408,8408,2017-18,8317,0,0.882459,1.0,http://app.leg.wa.gov/documents/billdocs/2017-...,,19,Brian,0,Blake,True
445075,SCR 8408,8408,2017-18,17289,1,0.878721,1.0,http://app.leg.wa.gov/documents/billdocs/2017-...,,20,John,1,Braun,True


In [None]:
# Bind `engine` to the staging database again for the legislator lookups below.
engine = create_engine('postgresql://localhost:5432/wa_leg_staging')

In [None]:
# Load the full `legislator` table from the database `engine` points at.
legislator_df = pd.read_sql_query('select * from "legislator"',con=engine)

In [None]:
# Look up id 17279 in the `legislator` table.
legislator_df[legislator_df['id'] == 17279]

In [None]:
# Look up the same id (as voter_id) in the `current_legislator` table for comparison.
current_legislator_df[current_legislator_df['voter_id'] == 17279]