In [7]:
import os
import string
import sys

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import psycopg2
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sqlalchemy import create_engine

# Put the parent directory on sys.path so the project-local modules below can be imported.
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)


from data_cleaning.create_staging_tables import create_loyalty_scores_df
from data_cleaning.edit_current_legislator import filter_out_duplicates_from_current_leg
from clean_for_model import (
    prepare_bill_text_for_clean_df,
    create_clean_train,
    create_clean_test,
    create_label_df,
)
% matplotlib inline

In [39]:
# SQLAlchemy engine for the local staging database; every read_sql_query call below uses it.
staging_db_url = 'postgresql://localhost:5432/wa_leg_staging'
engine = create_engine(staging_db_url)

In [77]:
def _read_staging(query):
    """Run `query` against the staging engine and return the result as a DataFrame."""
    return pd.read_sql_query(query, con=engine)


# Tables passed to create_clean_train further down (loyalty_df is also reused for the test set).
merged_final_df = _read_staging('select * from "merged_final"')
bill_text_df_dirty = _read_staging('select * from "bill_text"')
rep_score_df = _read_staging('select * from "rep_score"')
loyalty_df = _read_staging('select * from "loyalty"')

# `current_*` tables, used for the test set and the final labels.
current_df = _read_staging('select * from "current_clean"')
current_bill_text_dirty = _read_staging('select * from "current_bill_text"')
current_legislator_df = _read_staging('select * from "current_legislator"')

In [85]:
# Build the cleaned training frame and the bill-type mapping that the test set reuses.
# The original cell referenced `bill_text_df`, which no cell defines (only `bill_text_df_dirty`
# is loaded), so it failed with a NameError on a fresh kernel.
# NOTE(review): assumes prepare_bill_text_for_clean_df takes the raw bill_text frame and returns
# the cleaned one — confirm against clean_for_model before relying on this.
bill_text_df = prepare_bill_text_for_clean_df(bill_text_df_dirty)
train_clean, bill_type_dct = create_clean_train(merged_final_df, bill_text_df, rep_score_df, loyalty_df)

In [89]:
# Build the cleaned test frame from the current tables, reusing bill_type_dct from training.
# The original cell referenced `current_bill_text_df`, which no cell defines (only
# `current_bill_text_dirty` is loaded), so it failed with a NameError on a fresh kernel.
# NOTE(review): assumes prepare_bill_text_for_clean_df takes the raw bill-text frame and returns
# the cleaned one — confirm against clean_for_model before relying on this.
current_bill_text_df = prepare_bill_text_for_clean_df(current_bill_text_dirty)
test_clean = create_clean_test(current_df, current_bill_text_df, loyalty_df, bill_type_dct)

### Use Current Top Model

In [90]:
# Single list of model inputs, shared by the train and test selections below.
feature_cols = [
    'voter_id', 'voting_agency', 'sponsor_agency', 'district',
    'party', 'is_primary_sponsor', 'is_secondary_sponsor', 'is_minority_party',
    'primary_sponsor_party', 'rep_score', 'loyalty_score', 'bill_length', 'bill_num',
    'num_sponsors', 'years_until_vote', 'percent_yea', 'num_sections', 'bill_type_score',
]

# TRAIN: features plus the 'vote' target column.
X_train_t = train_clean[feature_cols]
y_train_t = train_clean['vote']

# TEST: features only; no target is selected here.
X_test_t = test_clean[feature_cols]

In [91]:
top_model = RandomForestClassifier(n_estimators=1000, max_depth=11, n_jobs=2, random_state=709)
top_model.fit(X_train_t, y_train_t)
y_pred_t = (top_model.predict_proba(X_test_t))[:, 1]
% time

CPU times: user 5 µs, sys: 1e+03 ns, total: 6 µs
Wall time: 9.78 µs


In [92]:
# Sanity check: smallest predicted probability in y_pred_t.
np.min(y_pred_t)

0.10278695233763821

In [None]:
# Build label_df from the cleaned test rows, the predicted probabilities and the
# current-legislator table, using the project helper create_label_df.
# NOTE(review): this cell has no execution count (In [None]) although label_df is written to
# the database below — re-run it on a fresh kernel before saving.
label_df = create_label_df(test_clean, y_pred_t, current_legislator_df)

# Save to wa_leg_label database

In [98]:
# Separate SQLAlchemy engine for the database that receives the predicted labels.
label_db_url = 'postgresql://localhost:5432/wa_leg_label'
engine_label = create_engine(label_db_url)

In [99]:
# Open a connection on the label engine; it is passed to label_df.to_sql in the next cell.
con = engine_label.connect()

In [100]:
# Write the labels to the 'label_second' table, replacing any existing table of that name.
# The connection opened above was never released in the original notebook; close it once the
# write has finished (or failed) so the database session does not stay open.
try:
    label_df.to_sql('label_second', con, if_exists='replace', index=False)
finally:
    con.close()