In [1]:
import requests
import xml.etree.ElementTree as ET
import pandas as pd
import numpy as np

import psycopg2 as pg2
import sqlalchemy
from functools import reduce

from WA_state_API_functions import get_status_data
from web_scrape_functions import scrape_bill_url
from create_staging_tables import load_and_clean_party_minority_history_df

In [2]:
# Connection URLs for the two local PostgreSQL databases this notebook talks to:
# wa_leg_raw is read from, wa_leg_staging is both read from and written to.
RAW_DB_URL = 'postgresql://emilykarboski@localhost:5432/wa_leg_raw'
STAGING_DB_URL = 'postgresql://emilykarboski@localhost:5432/wa_leg_staging'

engine_raw = sqlalchemy.create_engine(RAW_DB_URL)
engine_staging = sqlalchemy.create_engine(STAGING_DB_URL)

In [3]:
# Load every table this notebook works with into memory.
# status_api comes from the raw database; the rest come from the staging database.
status_df = pd.read_sql_query(sql='select * from "status_api"', con=engine_raw)
staging_bill_df = pd.read_sql_query(sql='select * from "bill"', con=engine_staging)
current_legislator_df = pd.read_sql_query(sql='select * from "current_legislator"', con=engine_staging)
legislator_df = pd.read_sql_query(sql='select * from "legislator"', con=engine_staging)
# NOTE(review): current_bill_text_df is loaded here but not used by any cell visible below.
current_bill_text_df = pd.read_sql_query(sql='select * from "current_bill_text"', con=engine_staging)

In [4]:
def create_staging_current_bill_text_df(current_bill_df, scraper=None):
    '''Create staging_current_bill_text by scraping the url of every bill in current_bill_df.

    Input
    -----
    current_bill_df: pandas dataframe loaded from wa_leg_raw database, current_bill table.
        Must contain the columns 'biennium', 'bill_id' and 'htm_url'.
    scraper: optional callable that takes a url and returns the bill text. Defaults to
        scrape_bill_url; exposed so the network step can be replaced or stubbed.

    Output
    ------
    current_bill_text_df: new pandas dataframe with the columns 'biennium', 'bill_id',
        'htm_url' and 'bill_text', keeping the row order and index of current_bill_df.
        A bill whose url could not be scraped gets '' as its text.
        current_bill_df itself is not modified.
    '''
    if scraper is None:
        scraper = scrape_bill_url

    # .copy() so the new column is added to an independent frame, not to a slice of the input
    current_bill_text_df = current_bill_df[['biennium', 'bill_id', 'htm_url']].copy()

    # Collect the texts positionally. Writing with .iloc[label, -1] (as iterrows labels would
    # suggest) hits the wrong row, or raises, whenever the index is not exactly 0..n-1.
    bill_texts = []
    for url in current_bill_text_df['htm_url']:
        try:
            bill_texts.append(scraper(url))
        except Exception:
            # Best effort: one failing url must not abort the whole scrape.
            # Exception (not a bare except) so Ctrl-C still interrupts a long run.
            bill_texts.append('')

    current_bill_text_df['bill_text'] = bill_texts
    return current_bill_text_df

In [5]:
def _agency_to_int(agency):
    '''Encode a chamber name as an int: 'House' -> 0, 'Senate' -> 1.

    Values that are already 0 or 1 pass through unchanged, so applying the encoding to a
    column that was encoded earlier does not wipe it out. Anything else, including a
    missing value, returns None.
    '''
    if isinstance(agency, str):
        return {'House': 0, 'Senate': 1}.get(agency)
    if isinstance(agency, (int, float, np.integer, np.floating)) and agency in (0, 1):
        return int(agency)
    return None


def _parse_secondary_sponsors(sponsors):
    '''Turn a '{id,id,...}' string into a list of id strings.

    Empty tokens are dropped, so '{}' gives [] rather than ['']. A missing value
    (None / NaN) becomes ''. Any other value, e.g. an existing list, is returned unchanged.
    '''
    if isinstance(sponsors, str):
        tokens = [token.strip() for token in sponsors.strip('{}').split(',')]
        return [token for token in tokens if token]
    if sponsors is None or (isinstance(sponsors, float) and np.isnan(sponsors)):
        return ''
    return sponsors


def _party_flag(party):
    '''Return party unchanged when it is 0 or 1, otherwise None.'''
    if party == 0:
        return 0
    if party == 1:
        return 1
    return None


def _is_secondary_sponsor(voter_id, secondary_sponsors):
    '''True when str(voter_id) is one of the secondary sponsor ids.'''
    if len(secondary_sponsors) > 0:
        return str(voter_id) in secondary_sponsors
    return False


def _keep_latest_row_per_bill(bill_df):
    '''Keep one row per bill_id: the one with the greatest htm_last_modified_date.

    Ties keep the earliest row (the sort is stable). Rows without a bill_id are dropped.
    The result is ordered by the position the kept rows had in bill_df.
    '''
    positional = bill_df.reset_index(drop=True)
    newest_first = positional.sort_values('htm_last_modified_date', ascending=False, kind='mergesort')
    newest = newest_first.dropna(subset=['bill_id']).drop_duplicates(subset='bill_id', keep='first')
    return newest.sort_index()


def _pair_bills_with_legislators(bill_df, current_legislator_df):
    '''Build one row per (bill, legislator) pair, bill-major.

    Legislator columns come first, followed by the bill-only columns. When both frames
    share a column name the bill's value wins.
    '''
    legislator_records = current_legislator_df.to_dict('records')
    columns = list(current_legislator_df.columns)
    columns += [col for col in bill_df.columns if col not in current_legislator_df.columns]

    paired = []
    for bill in bill_df.to_dict('records'):
        for legislator in legislator_records:
            record = dict(legislator)
            record.update(bill)
            paired.append(record)
    # columns= keeps the frame well-formed (right columns, zero rows) when there are no pairs
    return pd.DataFrame(paired, columns=columns)


def _build_sponsor_lookups(legislator_df):
    '''Index legislator_df for sponsor lookups without modifying it.

    Returns (first_party, parties_by_id):
    first_party maps (id, encoded agency) to the party of the first matching row;
    parties_by_id maps id to the list of party values of every row with that id.
    '''
    agencies = legislator_df['agency'].apply(_agency_to_int)
    # The party column used to be read by position (4).
    # NOTE(review): assumed to be the column named 'party' - confirm against the legislator table.
    party_col = 'party' if 'party' in legislator_df.columns else legislator_df.columns[4]

    first_party = {}
    parties_by_id = {}
    for leg_id, agency, party in zip(legislator_df['id'], agencies, legislator_df[party_col]):
        first_party.setdefault((leg_id, agency), party)
        parties_by_id.setdefault(leg_id, []).append(party)
    return first_party, parties_by_id


def _sponsor_party(primary_sponsor_id, sponsor_agency, first_party):
    '''Party of the primary sponsor, or 2 when the sponsor (or its agency) is unknown.'''
    if pd.isna(sponsor_agency):
        return 2
    return first_party.get((int(primary_sponsor_id), int(sponsor_agency)), 2)


def _bill_rep_score(primary_sponsor_id, secondary_sponsors, parties_by_id):
    '''Mean party value over the bill's sponsors (secondary sponsors plus the primary one).

    Each sponsor contributes the average party value of its rows in the legislator table;
    sponsors that are not in the table are skipped. Returns NaN when no sponsor is found.
    '''
    # Copy first: the list is the very object stored in the dataframe cell, so appending
    # to it directly would add the primary sponsor to the bill's secondary sponsors.
    sponsors = list(secondary_sponsors) if isinstance(secondary_sponsors, list) else []
    sponsors.append(primary_sponsor_id)

    sponsor_scores = []
    for sponsor in sponsors:
        parties = parties_by_id.get(int(sponsor))
        if parties:
            sponsor_scores.append(sum(parties) / len(parties))
    if not sponsor_scores:
        return np.nan
    return np.mean(sponsor_scores)


def create_current_clean(status_df, staging_bill_df, current_legislator_df, legislator_df):
    '''
    Create current_df pandas dataframe: one row per (current bill, current legislator) pair.

    Input
    -----
    status_df: pandas dataframe loaded from wa_leg_raw database, status_api table
    staging_bill_df: pandas dataframe loaded from wa_leg_staging database, bill table
    current_legislator_df: pandas dataframe loaded from wa_leg_staging database, current_legislator table
    legislator_df: pandas dataframe loaded from wa_leg_staging database, legislator table

    None of the input dataframes is modified.

    Output
    ------
    current_X: pandas dataframe. Columns 'id' and 'agency' of current_legislator_df appear as
        'voter_id' and 'voting_agency'. Added columns:
        is_primary_sponsor, num_sponsors, is_minority_party, is_secondary_sponsor,
        sponsor_party (2 = sponsor not found) and rep_score.
        secondary_sponsors holds a list of id strings, or '' when the bill has no value.

    NOTE(review): is_minority_party currently just mirrors the 0/1 value of 'party'.
    The previous version also called load_and_clean_party_minority_history_df() here and
    never used the result; that call is gone. Confirm whether the minority history was
    meant to feed is_minority_party.
    '''
    # Merge status with bill info
    current_bill_df = status_df.merge(staging_bill_df, how='left', on=['bill_id', 'biennium', 'bill_num'])

    # Filter out duplicate rows, keeping the most recently modified version of each bill
    current_bill_df = _keep_latest_row_per_bill(current_bill_df)

    # Match every current bill to every current legislator
    current_X_all = _pair_bills_with_legislators(current_bill_df, current_legislator_df)

    # rename and drop columns
    current_X_all = current_X_all.rename(columns={"id": "voter_id", "agency": "voting_agency"})
    current_X = current_X_all.drop(['action_date', 'bill_in_process', 'status', 'type', 'class',
                'htm_last_modified_date', 'history_line', 'bill_num_unique', 'description'], axis=1)

    # feature engineering
    current_X['sponsor_agency'] = current_X['sponsor_agency'].apply(_agency_to_int)
    current_X['voting_agency'] = current_X['voting_agency'].apply(_agency_to_int)
    current_X['primary_sponsor_id'] = current_X['primary_sponsor_id'].apply(int)
    current_X['secondary_sponsors'] = current_X['secondary_sponsors'].apply(_parse_secondary_sponsors)
    current_X['is_primary_sponsor'] = current_X['voter_id'] == current_X['primary_sponsor_id']

    current_X['num_sponsors'] = current_X['secondary_sponsors'].apply(lambda sponsors: len(sponsors) + 1)
    current_X['is_minority_party'] = current_X['party'].apply(_party_flag)
    current_X['is_secondary_sponsor'] = [
        _is_secondary_sponsor(voter_id, sponsors)
        for voter_id, sponsors in zip(current_X['voter_id'], current_X['secondary_sponsors'])
    ]

    # Index the legislator table once instead of filtering it for every row
    first_party, parties_by_id = _build_sponsor_lookups(legislator_df)
    current_X['sponsor_party'] = [
        _sponsor_party(sponsor_id, agency, first_party)
        for sponsor_id, agency in zip(current_X['primary_sponsor_id'], current_X['sponsor_agency'])
    ]

    # Score each unique bill once, then attach the score to every row of that bill
    bill_keys = ['bill_id', 'biennium', 'primary_sponsor_id']
    unique_bills = current_X.drop_duplicates(subset=bill_keys, keep='first')
    unique_bills = unique_bills[bill_keys + ['secondary_sponsors']].copy()
    unique_bills['rep_score'] = [
        _bill_rep_score(sponsor_id, sponsors, parties_by_id)
        for sponsor_id, sponsors in zip(unique_bills['primary_sponsor_id'], unique_bills['secondary_sponsors'])
    ]
    unique_bills = unique_bills.drop('secondary_sponsors', axis=1)

    current_X = current_X.merge(unique_bills, how='left', on=bill_keys)

    return current_X

In [6]:
# Build the bill x legislator feature table from the frames loaded above.
current_df = create_current_clean(
    status_df=status_df,
    staging_bill_df=staging_bill_df,
    current_legislator_df=current_legislator_df,
    legislator_df=legislator_df,
)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [7]:
# Spot check: show the rows generated for bill number 2299.
current_df.loc[current_df['bill_num'] == '2299']

Unnamed: 0,biennium,bill_id,bill_num,bill_unique,current,district,first_name,htm_create_date,htm_url,last_name,...,secondary_sponsors,sponsor_agency,voter_id,voting_agency,is_primary_sponsor,num_sponsors,is_minority_party,is_secondary_sponsor,sponsor_party,rep_score
149940,2017-18,ESHB 2299,2299,2017-18 ESHB 2299,True,23,Sherry,2018-02-24T18:02:44.91,http://app.leg.wa.gov/documents/billdocs/2017-...,Appleton,...,,0,10031,,False,1,0,False,0,0.0
149941,2017-18,ESHB 2299,2299,2017-18 ESHB 2299,True,2,Andrew,2018-02-24T18:02:44.91,http://app.leg.wa.gov/documents/billdocs/2017-...,Barkis,...,,0,24075,,False,1,1,False,0,0.0
149942,2017-18,ESHB 2299,2299,2017-18 ESHB 2299,True,11,Steve,2018-02-24T18:02:44.91,http://app.leg.wa.gov/documents/billdocs/2017-...,Bergquist,...,,0,17227,,False,1,0,False,0,0.0
149943,2017-18,ESHB 2299,2299,2017-18 ESHB 2299,True,19,Brian,2018-02-24T18:02:44.91,http://app.leg.wa.gov/documents/billdocs/2017-...,Blake,...,,0,8317,,False,1,0,False,0,0.0
149944,2017-18,ESHB 2299,2299,2017-18 ESHB 2299,True,42,Vincent,2018-02-24T18:02:44.91,http://app.leg.wa.gov/documents/billdocs/2017-...,Buys,...,,0,15820,,False,1,1,False,0,0.0
149945,2017-18,ESHB 2299,2299,2017-18 ESHB 2299,True,26,Michelle,2018-02-24T18:02:44.91,http://app.leg.wa.gov/documents/billdocs/2017-...,Caldier,...,,0,20760,,False,1,1,False,0,0.0
149946,2017-18,ESHB 2299,2299,2017-18 ESHB 2299,True,15,Bruce,2018-02-24T18:02:44.91,http://app.leg.wa.gov/documents/billdocs/2017-...,Chandler,...,,0,3469,,False,1,1,False,0,0.0
149947,2017-18,ESHB 2299,2299,2017-18 ESHB 2299,True,24,Mike,2018-02-24T18:02:44.91,http://app.leg.wa.gov/documents/billdocs/2017-...,Chapman,...,,0,26176,,False,1,0,False,0,0.0
149948,2017-18,ESHB 2299,2299,2017-18 ESHB 2299,True,43,Frank,2018-02-24T18:02:44.91,http://app.leg.wa.gov/documents/billdocs/2017-...,Chopp,...,,0,1659,,False,1,0,False,0,0.0
149949,2017-18,ESHB 2299,2299,2017-18 ESHB 2299,True,41,Judy,2018-02-24T18:02:44.91,http://app.leg.wa.gov/documents/billdocs/2017-...,Clibborn,...,,0,8209,,False,1,0,False,0,0.0


In [8]:
# Open a connection to the staging database for the write in the next cell.
# NOTE(review): this connection is not closed anywhere in the visible notebook.
con = engine_staging.connect()

In [None]:
# Persist the result as the current_clean table; an existing table of that name is replaced.
current_df.to_sql(name='current_clean', con=con, if_exists='replace', index=False)