In [110]:
import requests
import xml.etree.ElementTree as ET
import pandas as pd
import numpy as np

import psycopg2 as pg2
import sqlalchemy

from WA_state_API_functions import get_status_data
from web_scrape_functions import scrape_bill_url
from create_staging_tables import load_and_clean_party_minority_history_df

In [3]:
# Connection URLs for the two local PostgreSQL databases used by this notebook.
# Kept as named constants so the targets are visible (and editable) in one place.
RAW_DB_URL = 'postgresql://emilykarboski@localhost:5432/wa_leg_raw'
STAGING_DB_URL = 'postgresql://emilykarboski@localhost:5432/wa_leg_staging'

# One SQLAlchemy engine per database; later cells pass these as `con=`.
engine_raw = sqlalchemy.create_engine(RAW_DB_URL)
engine_staging = sqlalchemy.create_engine(STAGING_DB_URL)

In [10]:
# The status table is the only one read from the raw database.
status_df = pd.read_sql_query(sql='select * from "status_api"', con=engine_raw)

# The remaining tables all come from the staging database; they are read in the
# order listed and unpacked into one dataframe per table.
staging_bill_df, current_legislator_df, legislator_df, current_bill_text_df = [
    pd.read_sql_query(sql=staging_query, con=engine_staging)
    for staging_query in (
        'select * from "bill"',
        'select * from "current_legislator"',
        'select * from "legislator"',
        'select * from "current_bill_text"',
    )
]

In [9]:
def create_staging_current_bill_text_df(current_bill_df):
    '''Create staging_current_bill_text by scraping the text behind each bill's htm_url.

    Input
    -----
    current_bill_df: pandas dataframe loaded from wa_leg_raw database, current_bill table.
        Must contain the columns 'biennium', 'bill_id' and 'htm_url'. It is not modified.

    Output
    ------
    current_bill_text_df: pandas dataframe with the columns 'biennium', 'bill_id',
        'htm_url' and 'bill_text', one row per input row (same index, same order).
        'bill_text' is the value returned by scrape_bill_url for that row's url, or
        '' when scraping that url raised an exception.
    '''
    # Explicit copy so the new column is added to an independent frame rather than
    # to a slice of the caller's dataframe.
    current_bill_text_df = current_bill_df[['biennium', 'bill_id', 'htm_url']].copy()

    # Collect results positionally: the original wrote with .iloc[label, -1], which
    # only hits the right row when the index is the default 0..n-1 range.
    bill_texts = []
    for url in current_bill_text_df['htm_url']:
        try:
            bill_texts.append(scrape_bill_url(url))
        except Exception:
            # Best effort: a url that cannot be scraped keeps an empty text.
            # `Exception` (not a bare except) so Ctrl-C still interrupts a long scrape.
            bill_texts.append('')

    current_bill_text_df['bill_text'] = bill_texts
    return current_bill_text_df

In [161]:
def create_current_clean(status_df, staging_bill_df, current_legislator_df, legislator_df):
    '''
    Create current_clean pandas dataframe: one row per (current bill, current legislator)
    pair, with engineered sponsor / party features.

    None of the input dataframes is modified.

    Input
    -----
    status_df: pandas dataframe loaded from wa_leg_raw database, status_api table
    staging_bill_df: pandas dataframe loaded from wa_leg_staging database, bill table
    current_legislator_df: pandas dataframe loaded from wa_leg_staging database, current_legislator table
    legislator_df: pandas dataframe loaded from wa_leg_staging database, legislator table.
        Needs 'id' and 'agency' columns; the column at position 4 is read as the
        party code (presumably the 'party' column -- TODO confirm against the table schema).

    Output
    ------
    current_X: pandas dataframe with the merged bill / legislator columns (minus the
        dropped bookkeeping columns) plus:
        sponsor_agency, voting_agency : 0 for 'House', 1 for 'Senate', missing otherwise
        secondary_sponsors            : list of id strings ('' where the source value was missing)
        is_primary_sponsor            : voter_id == primary_sponsor_id
        num_sponsors                  : number of secondary sponsors + 1
        is_minority_party             : legislator's party equals the minority party for the
                                        bill's biennium and the legislator's agency
                                        (None when no history row matches)
        is_secondary_sponsor          : voter_id appears in secondary_sponsors
        sponsor_party                 : party code of the primary sponsor, 2 when no
                                        matching legislator is found
        rep_score                     : mean party code over the bill's sponsors
    '''
    # Position of the party code in legislator_df (see docstring caveat).
    party_col = 4

    # ---- feature engineering helpers -------------------------------------
    def change_agency_to_int(agency):
        '''Map 'House' -> 0 and 'Senate' -> 1; any other value maps to None.'''
        if agency == 'House':
            return 0
        if agency == 'Senate':
            return 1
        return None

    def make_sec_sponsors_a_list(sponsors):
        '''Parse a '{id,id,...}' string into a list of id strings.

        '{}' (or an empty string) gives [] rather than [''], so that empty sponsor
        lists are not counted as one sponsor. Non-string values are returned unchanged.
        '''
        if isinstance(sponsors, str):
            pieces = sponsors.strip('{}').split(',')
            return [piece.strip() for piece in pieces if piece.strip()]
        return sponsors

    def make_num_sponsors(secondary_sponsors):
        '''Secondary sponsors plus the primary sponsor.'''
        return len(secondary_sponsors) + 1

    def make_is_minority_party(row):
        '''True when the voter's party is the minority party for this biennium/agency.'''
        subset_m_history = minority_hist[(minority_hist['biennium'] == row['biennium']) &
                                         (minority_hist['agency'] == row['voting_agency'])]
        if len(subset_m_history) == 0:
            # No history for this biennium/agency: report "unknown" instead of
            # failing with an out-of-bounds indexer error.
            return None
        return (subset_m_history['minority_party'] == row['party']).iloc[0]

    def make_is_secondary_sponsor(row):
        '''True when the voter's id (as a string) is among the secondary sponsors.'''
        secondary_sponsors = row['secondary_sponsors']
        if len(secondary_sponsors) > 0:
            return str(row['voter_id']) in secondary_sponsors
        return False

    def find_sponsor_party(row):
        '''Party code of the primary sponsor; 2 when the sponsor cannot be looked up.'''
        ps_id = row['primary_sponsor_id']
        ps_agency = row['sponsor_agency']
        if pd.isna(ps_agency):
            # Agency was neither 'House' nor 'Senate'; int() would raise on it.
            return 2
        subset_leg = legislator_lookup[(legislator_lookup['id'] == int(ps_id)) &
                                       (legislator_lookup['agency'] == int(ps_agency))]
        if len(subset_leg) > 0:
            return subset_leg.iloc[0, party_col]
        return 2

    def create_bill_rep_score(secondary_sponsors, primary_sponsor_id):
        '''Return the mean party code over the bill's sponsors (secondary + primary).

        With party coded as 1 for Republican this is # of republican sponsors /
        total sponsors (coding not visible here -- TODO confirm). A sponsor id that
        matches several legislator rows contributes the average of those rows;
        ids with no match are skipped. Returns NaN when no sponsor can be matched.
        '''
        # Work on a copy: the list is the very object stored in current_X, and
        # appending to it in place would corrupt the secondary_sponsors column.
        sponsors = list(secondary_sponsors) if isinstance(secondary_sponsors, list) else []
        sponsors.append(primary_sponsor_id)

        sponsor_parties = []
        for sponsor_id in sponsors:
            subset_leg = legislator_lookup[legislator_lookup['id'] == int(sponsor_id)]
            if len(subset_leg) > 0:
                sponsor_parties.append(sum(subset_leg.iloc[:, party_col]) / len(subset_leg))
        if not sponsor_parties:
            return np.nan
        return np.mean(sponsor_parties)

    def get_secondary_sponsors(bill_id, biennium):
        '''Secondary sponsors of the first current_X row for this bill ([] if none).'''
        subset = current_X[(current_X['bill_id'] == bill_id) & (current_X['biennium'] == biennium)]
        if len(subset) > 0:
            # .iloc: positional access, independent of the index labels.
            return subset['secondary_sponsors'].iloc[0]
        return []

    # ---- lookup tables ----------------------------------------------------
    minority_hist = load_and_clean_party_minority_history_df()
    # Private copy with integer agency codes. Converting the caller's frame in place
    # made a second call map the already-converted codes to None.
    legislator_lookup = legislator_df.copy()
    legislator_lookup['agency'] = legislator_lookup['agency'].apply(change_agency_to_int)

    # ---- build the bill x legislator table --------------------------------
    # Merge with bill info and bill text
    current_bill_df = status_df.merge(staging_bill_df, how='left', on=['bill_id', 'biennium', 'bill_num'])

    # Match every current bill to every current legislator. Legislator rows are
    # converted to dicts once, not once per bill; on a column-name clash the bill
    # value wins.
    legislator_records = current_legislator_df.to_dict('records')
    final = []
    for bill_dct in current_bill_df.to_dict('records'):
        for leg_dct in legislator_records:
            combined = dict(leg_dct)
            combined.update(bill_dct)
            final.append(combined)
    current_X_all = pd.DataFrame(final)

    # rename and drop columns
    current_X_all = current_X_all.rename(columns={"id": "voter_id", "agency": "voting_agency"})
    current_X = current_X_all.drop(['action_date', 'bill_in_process', 'status', 'type', 'class',
                'htm_last_modified_date', 'history_line', 'bill_num_unique', 'description'], axis=1)

    # ---- feature engineering ----------------------------------------------
    current_X['sponsor_agency'] = current_X['sponsor_agency'].apply(change_agency_to_int)
    current_X['voting_agency'] = current_X['voting_agency'].apply(change_agency_to_int)
    current_X['primary_sponsor_id'] = current_X['primary_sponsor_id'].apply(int)
    current_X['secondary_sponsors'] = current_X['secondary_sponsors'].apply(make_sec_sponsors_a_list)
    # Assign the result back: fillna(inplace=True) on a selected column is not
    # guaranteed to reach the parent frame.
    current_X['secondary_sponsors'] = current_X['secondary_sponsors'].fillna('')
    current_X['is_primary_sponsor'] = current_X['voter_id'] == current_X['primary_sponsor_id']

    current_X['num_sponsors'] = current_X['secondary_sponsors'].apply(make_num_sponsors)
    current_X['is_minority_party'] = current_X.apply(make_is_minority_party, axis=1)
    current_X['is_secondary_sponsor'] = current_X.apply(make_is_secondary_sponsor, axis=1)
    current_X['sponsor_party'] = current_X.apply(find_sponsor_party, axis=1)

    # Create smaller dataframe with only unique bills, score each bill once, then
    # broadcast the score back onto every (bill, legislator) row.
    bill_keys = ['bill_id', 'biennium', 'primary_sponsor_id']
    current_X_unique_bills = current_X[bill_keys].drop_duplicates(keep='first').copy()
    current_X_unique_bills['rep_score'] = [
        create_bill_rep_score(get_secondary_sponsors(bill_id, biennium), primary_sponsor_id)
        for bill_id, biennium, primary_sponsor_id
        in current_X_unique_bills.itertuples(index=False, name=None)
    ]

    current_X = current_X.merge(current_X_unique_bills, how='left', on=bill_keys)

    return current_X