In [57]:
import psycopg2
from sqlalchemy import create_engine
import pandas as pd
import numpy as np

import os
import sys
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)
    
from data_cleaning.create_staging_tables import create_staging_legislator_df_STEP_ONE, create_staging_bill_df_STEP_THREE

In [2]:
engine = create_engine('postgresql://localhost:5432/wa_leg_raw')

In [5]:
raw_bill_df = pd.read_sql_query('select * from "bill_api"', con=engine)
raw_sponsor_df = pd.read_sql_query('select * from "sponsor_api"', con=engine)
raw_vote_df = pd.read_sql_query('select * from "vote_api"', con=engine)
raw_committee_member_df = pd.read_sql_query('select * from "committee_member_api"', con=engine)
missing_leg_info_df = pd.read_csv('../data/missing_legislators.csv', sep = '|')
raw_sponsor_df = pd.read_sql_query('select * from "sponsor_api"', con=engine)

In [34]:
legislator_df = create_staging_legislator_df_STEP_ONE(raw_vote_df, raw_committee_member_df, missing_leg_info_df)

In [55]:
legislator_df['party'] = legislator_df['party'].apply(change_party_word_to_int)

In [14]:
bill_df = create_staging_bill_df_STEP_THREE(raw_bill_df, raw_sponsor_df)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  sponsor_df_reformatted['bill_num'] = sponsor_df_reformatted['bill_id'].apply(lambda x: x.split()[1])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  sponsor_df_reformatted['bill_num_unique'] = sponsor_df_reformatted['biennium'] + ' ' + sponsor_df_reformatted['bill_num']


In [None]:
def create_final_bill_df(staging_bill_df, raw_vote_df):
    """Creates a dataframe with all bills and bill info including a column labeling whether the bill has
    been voted on or not. This dataframe will be used to model whether a bill will make it to a vote."""
    
    raw_vote_df['bill_unique'] = raw_vote_df['biennium'] + ' ' + raw_vote_df['bill_id']
    bills_already_voted_on = raw_vote_df['bill_unique'].unique()
    
    def determine_if_reached_vote(bill_unique):
        if bill_unique in bills_already_voted_on:
            return True
        return False
    
    def filter_out_old_versions(row):
        count = len(bill_df[bill_df['bill_num_unique'] == row['bill_num_unique']])
        if count > 1 and row['reached_vote'] == False:
            return False
        return True
    
    def count_sponsors(secondary_sponsors):
        """Counts secondary sponsors, adding one for the primary sponsors"""
        if type(secondary_sponsors) == list:
            return len(secondary_sponsors) + 1
        else: return 1
        
    def change_agency_to_int(agency):
        if agency == 'House':
            return 0
        if agency == 'Senate':
            return 1
        
    def create_bill_rep_score(row):
        '''Return # of republican sponsors / total sponsors. If there are no secondary sponsors return -1 so that
        later, when this table is joined with merged_final, the primary sponsor party can fill this field.'''
        sponsors = row['secondary_sponsors']
        if type(sponsors) == list:
            sponsors.append(row['primary_sponsor_id'])

            sponsor_parties = []
            for s in sponsors:
                subset_leg = legislator_df[(legislator_df['id'] == int(s))]
                if len(subset_leg) > 0:
                    sponsor_party = sum(subset_leg.iloc[:, 4]) / len(subset_leg)
                    sponsor_parties.append(sponsor_party)
            bill_rep_score = np.mean(sponsor_parties)
            return bill_rep_score
        else:
            return -1
    
    staging_bill_df['reached_vote'] = staging_bill_df['bill_unique'].apply(determine_if_reached_vote)
    staging_bill_df['is_final_version'] = staging_bill_df.apply(filter_out_old_versions, axis=1)
    bill_df = staging_bill_df[staging_bill_df['is_final_version'] == True]
    bill_df = bill_df.drop(['is_final_version', 'description', 'long_friendly_name', 'name', 
                            'htm_last_modified_date'], axis=1)
    bill_df['sponsor_agency'] = bill_df['sponsor_agency'].apply(change_agency_to_int)
    bill_df['num_sponsors'] = bill_df['secondary_sponsors'].apply(count_sponsors)
    bill_df['rep_score'] = bill_df.apply(create_bill_rep_score, axis=1)
    
    return bill_df

In [38]:
bill_df.head()

Unnamed: 0,biennium,bill_id,class,htm_create_date,htm_url,type,bill_unique,bill_num,bill_num_unique,sponsor_agency,primary_sponsor_id,secondary_sponsors,reached_vote
1,1991-92,SHB 1001,Bills,1991-02-01T00:00:00,http://app.leg.wa.gov/documents/billdocs/1991-...,House Bills,1991-92 SHB 1001,1001,1991-92 1001,House,251,"[444, 72, 34, 54, 48, 7, 481, 301, 478, 264, 2...",True
2,1991-92,HB 1002,Bills,1991-01-14T00:00:00,http://app.leg.wa.gov/documents/billdocs/1991-...,House Bills,1991-92 HB 1002,1002,1991-92 1002,House,251,"[332, 474, 475, 219, 285, 227, 180, 110, 484, ...",False
4,1991-92,SHB 1003,Bills,1991-02-21T00:00:00,http://app.leg.wa.gov/documents/billdocs/1991-...,House Bills,1991-92 SHB 1003,1003,1991-92 1003,House,311,"[474, 54, 110]",True
5,1991-92,HB 1004,Bills,1991-01-14T00:00:00,http://app.leg.wa.gov/documents/billdocs/1991-...,House Bills,1991-92 HB 1004,1004,1991-92 1004,House,311,"[475, 304, 110, 32, 54]",False
6,1991-92,HB 1005,Bills,1991-01-14T00:00:00,http://app.leg.wa.gov/documents/billdocs/1991-...,House Bills,1991-92 HB 1005,1005,1991-92 1005,House,311,"[23, 475]",False


In [47]:
bill_df.head()

Unnamed: 0,biennium,bill_id,class,htm_create_date,htm_url,type,bill_unique,bill_num,bill_num_unique,sponsor_agency,primary_sponsor_id,secondary_sponsors,reached_vote,num_sponsors
1,1991-92,SHB 1001,Bills,1991-02-01T00:00:00,http://app.leg.wa.gov/documents/billdocs/1991-...,House Bills,1991-92 SHB 1001,1001,1991-92 1001,0,251,"[444, 72, 34, 54, 48, 7, 481, 301, 478, 264, 2...",True,34
2,1991-92,HB 1002,Bills,1991-01-14T00:00:00,http://app.leg.wa.gov/documents/billdocs/1991-...,House Bills,1991-92 HB 1002,1002,1991-92 1002,0,251,"[332, 474, 475, 219, 285, 227, 180, 110, 484, ...",False,18
4,1991-92,SHB 1003,Bills,1991-02-21T00:00:00,http://app.leg.wa.gov/documents/billdocs/1991-...,House Bills,1991-92 SHB 1003,1003,1991-92 1003,0,311,"[474, 54, 110]",True,4
5,1991-92,HB 1004,Bills,1991-01-14T00:00:00,http://app.leg.wa.gov/documents/billdocs/1991-...,House Bills,1991-92 HB 1004,1004,1991-92 1004,0,311,"[475, 304, 110, 32, 54]",False,6
6,1991-92,HB 1005,Bills,1991-01-14T00:00:00,http://app.leg.wa.gov/documents/billdocs/1991-...,House Bills,1991-92 HB 1005,1005,1991-92 1005,0,311,"[23, 475]",False,3


In [58]:
bill_df['rep_score'] = bill_df.apply(create_bill_rep_score, axis=1)

In [60]:
bill_df.head()

Unnamed: 0,biennium,bill_id,class,htm_create_date,htm_url,type,bill_unique,bill_num,bill_num_unique,sponsor_agency,primary_sponsor_id,secondary_sponsors,reached_vote,num_sponsors,rep_score
1,1991-92,SHB 1001,Bills,1991-02-01T00:00:00,http://app.leg.wa.gov/documents/billdocs/1991-...,House Bills,1991-92 SHB 1001,1001,1991-92 1001,0,251,"[444, 72, 34, 54, 48, 7, 481, 301, 478, 264, 2...",True,34,0.684211
2,1991-92,HB 1002,Bills,1991-01-14T00:00:00,http://app.leg.wa.gov/documents/billdocs/1991-...,House Bills,1991-92 HB 1002,1002,1991-92 1002,0,251,"[332, 474, 475, 219, 285, 227, 180, 110, 484, ...",False,18,0.388889
4,1991-92,SHB 1003,Bills,1991-02-21T00:00:00,http://app.leg.wa.gov/documents/billdocs/1991-...,House Bills,1991-92 SHB 1003,1003,1991-92 1003,0,311,"[474, 54, 110, 311]",True,4,0.25
5,1991-92,HB 1004,Bills,1991-01-14T00:00:00,http://app.leg.wa.gov/documents/billdocs/1991-...,House Bills,1991-92 HB 1004,1004,1991-92 1004,0,311,"[475, 304, 110, 32, 54, 311]",False,6,0.666667
6,1991-92,HB 1005,Bills,1991-01-14T00:00:00,http://app.leg.wa.gov/documents/billdocs/1991-...,House Bills,1991-92 HB 1005,1005,1991-92 1005,0,311,"[23, 475, 311]",False,3,0.333333


In [51]:
def create_bill_rep_score(row):
    '''Return # of republican sponsors / total sponsors. If there are no secondary sponsors return -1 so that
    later, when this table is joined with merged_final, the primary sponsor party can fill this field.'''
    sponsors = row['secondary_sponsors']
    if type(sponsors) == list:
        sponsors.append(row['primary_sponsor_id'])

        sponsor_parties = []
        for s in sponsors:
            subset_leg = legislator_df[(legislator_df['id'] == int(s))]
            if len(subset_leg) > 0:
                sponsor_party = sum(subset_leg.iloc[:, 4]) / len(subset_leg)
                sponsor_parties.append(sponsor_party)
        bill_rep_score = np.mean(sponsor_parties)
        return bill_rep_score
    else:
        return -1

In [45]:
def change_agency_to_int(agency):
    if agency == 'House':
        return 0
    if agency == 'Senate':
        return 1

In [54]:
def change_party_word_to_int(party):
        if party == 'Democrat':
            return 0
        if party == 'Republican':
            return 1

#### features to create:
rep_score DONE
minority_score
num_sponsors DONE
date_introduced
type_dummies
bill_length


features to clean:
sponsor_agency DONE