In [2]:
import xml.etree.ElementTree as ET

import numpy as np
import pandas as pd
import psycopg2 as pg2
import requests
import sqlalchemy

from WA_state_API_functions import get_status_data
from web_scrape_functions import scrape_bill_url
from create_staging_tables import load_and_clean_party_minority_history_df

In [3]:
# Connection URLs for the two local PostgreSQL databases used in this notebook.
RAW_DB_URL = 'postgresql://emilykarboski@localhost:5432/wa_leg_raw'
STAGING_DB_URL = 'postgresql://emilykarboski@localhost:5432/wa_leg_staging'

# One SQLAlchemy engine per database; later cells pass these as `con` to pandas.
engine_raw = sqlalchemy.create_engine(RAW_DB_URL)
engine_staging = sqlalchemy.create_engine(STAGING_DB_URL)

In [10]:
# Table read through the raw-database engine.
status_df = pd.read_sql_query(sql='select * from "status_api"', con=engine_raw)

# Tables read through the staging-database engine.
staging_bill_df = pd.read_sql_query(sql='select * from "bill"', con=engine_staging)
current_bill_text_df = pd.read_sql_query(sql='select * from "current_bill_text"', con=engine_staging)
legislator_df = pd.read_sql_query(sql='select * from "legislator"', con=engine_staging)
current_legislator_df = pd.read_sql_query(sql='select * from "current_legislator"', con=engine_staging)
rep_score_df = pd.read_sql_query(sql='select * from "rep_score"', con=engine_staging)

In [5]:
# Preview the first rows of the frame read from the raw "status_api" table.
status_df.head()

Unnamed: 0,action_date,biennium,bill_id,bill_num,history_line,status,bill_in_process
0,2018-01-08T00:00:00,2017-18,HB 1000,1000,"By resolution, reintroduced and retained in pr...",HPublic Safety,1
1,2018-01-08T00:00:00,2017-18,HB 1002,1002,"By resolution, reintroduced and retained in pr...",HHC/Wellness,1
2,2018-01-08T00:00:00,2017-18,HB 1003,1003,"By resolution, reintroduced and retained in pr...",HHC/Wellness,1
3,2018-01-08T00:00:00,2017-18,HB 1004,1004,"By resolution, reintroduced and retained in pr...",HJudiciary,1
4,2018-01-08T00:00:00,2017-18,HB 1005,1005,"By resolution, reintroduced and retained in pr...","HState Govt, El",1


In [6]:
# Preview the first rows of the frame read from the staging "bill" table.
staging_bill_df.head()

Unnamed: 0,biennium,bill_id,class,description,htm_create_date,htm_last_modified_date,htm_url,long_friendly_name,name,type,bill_num,bill_num_unique,sponsor_agency,primary_sponsor_id,secondary_sponsors,bill_unique
0,1991-92,HB 1001,Bills,,1991-08-30T00:00:00,2006-07-10T17:13:53.543,http://app.leg.wa.gov/documents/billdocs/1991-...,House Bill 1001,1001,House Bills,1001,1991-92 1001,House,251,"{188,32,7,17,286,11,34,140,72,254,328,325,54,4...",1991-92 HB 1001
1,1991-92,SHB 1001,Bills,,1991-02-01T00:00:00,2006-07-10T17:13:54.903,http://app.leg.wa.gov/documents/billdocs/1991-...,Substitute House Bill 1001,1001-S,House Bills,1001,1991-92 1001,House,251,"{188,32,7,17,286,11,34,140,72,254,328,325,54,4...",1991-92 SHB 1001
2,1991-92,HB 1002,Bills,,1991-01-14T00:00:00,2006-07-10T17:13:11.637,http://app.leg.wa.gov/documents/billdocs/1991-...,House Bill 1002,1002,House Bills,1002,1991-92 1002,House,251,"{443,285,320,219,7,484,180,472,475,394,23,110,...",1991-92 HB 1002
3,1991-92,HB 1003,Bills,,1991-01-14T00:00:00,2006-07-10T17:13:11.747,http://app.leg.wa.gov/documents/billdocs/1991-...,House Bill 1003,1003,House Bills,1003,1991-92 1003,House,311,"{54,110,474}",1991-92 HB 1003
4,1991-92,SHB 1003,Bills,,1991-02-21T00:00:00,2006-07-10T17:14:07.357,http://app.leg.wa.gov/documents/billdocs/1991-...,Substitute House Bill 1003,1003-S,House Bills,1003,1991-92 1003,House,311,"{54,110,474}",1991-92 SHB 1003


In [7]:
# Attach staging bill attributes to each status row. The left join keeps every
# status row even when no staging bill shares all three key columns.
bill_merge_keys = ['bill_id', 'biennium', 'bill_num']
current_bill_df = pd.merge(status_df, staging_bill_df, how='left', on=bill_merge_keys)

In [8]:
# Number of distinct bill numbers in the merged frame (a missing value, if any, counts as one).
current_bill_df['bill_num'].nunique(dropna=False)

3028

In [9]:
def create_staging_current_bill_text_df(current_bill_df, scraper=None):
    '''Build the current_bill_text frame by scraping the text behind every bill url.

    One output row is produced per input row, in the same order and with the same
    index; rows are NOT de-duplicated here.

    Input
    current_bill_df: pandas dataframe with at least the columns 'biennium',
        'bill_id' and 'htm_url'. It is not modified.
    scraper: optional callable taking a url and returning the bill text.
        Defaults to scrape_bill_url.

    Output
    pandas dataframe with columns 'biennium', 'bill_id', 'htm_url', 'bill_text'.
    'bill_text' is '' for every row whose scrape raised an exception.
    '''
    if scraper is None:
        scraper = scrape_bill_url

    # Copy so the new column is added to an independent frame rather than to a
    # slice of the caller's frame (avoids SettingWithCopyWarning).
    current_bill_text_df = current_bill_df[['biennium', 'bill_id', 'htm_url']].copy()

    # Collect the texts positionally. Writing through .iloc with the index label
    # is only correct for a default 0..n-1 index.
    bill_texts = []
    for count, url in enumerate(current_bill_text_df['htm_url'], start=1):
        try:
            bill_texts.append(scraper(url))
        except Exception:
            # Best-effort scrape: a failing url keeps an empty text. Exception
            # (not a bare except) lets KeyboardInterrupt stop a long run.
            bill_texts.append('')
        print(count)

    current_bill_text_df['bill_text'] = bill_texts
    return current_bill_text_df

In [None]:
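# Not run in this session: `current_bill_text_df` is read from the staging database in the load cell instead.
# Uncomment the next line to re-scrape the bill text from each `htm_url`.
# current_bill_text_df = create_staging_current_bill_text_df(current_bill_df)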

In [11]:
# Preview the first rows of the frame read from the staging "current_bill_text" table.
current_bill_text_df.head()

Unnamed: 0,biennium,bill_id,htm_url,bill_text
0,2017-18,HB 1000,http://app.leg.wa.gov/documents/billdocs/2017-...,H-0360.1HOUSE BILL 1000State of Washington65th...
1,2017-18,HB 1002,http://app.leg.wa.gov/documents/billdocs/2017-...,H-0289.1HOUSE BILL 1002State of Washington65th...
2,2017-18,HB 1003,http://app.leg.wa.gov/documents/billdocs/2017-...,H-0240.1HOUSE BILL 1003State of Washington65th...
3,2017-18,HB 1004,http://app.leg.wa.gov/documents/billdocs/2017-...,H-0401.1HOUSE BILL 1004State of Washington65th...
4,2017-18,HB 1005,http://app.leg.wa.gov/documents/billdocs/2017-...,H-0166.2HOUSE BILL 1005State of Washington65th...


In [12]:
# Add the bill text to each current bill. The left join keeps bills that have
# no matching row in current_bill_text_df.
text_merge_keys = ['biennium', 'bill_id', 'htm_url']
current_bill_merged_df = pd.merge(current_bill_df, current_bill_text_df, how='left', on=text_merge_keys)

In [13]:
# Display the merged bill frame (status + staging bill attributes + bill text).
current_bill_merged_df

Unnamed: 0,action_date,biennium,bill_id,bill_num,history_line,status,bill_in_process,class,description,htm_create_date,...,htm_url,long_friendly_name,name,type,bill_num_unique,sponsor_agency,primary_sponsor_id,secondary_sponsors,bill_unique,bill_text
0,2018-01-08T00:00:00,2017-18,HB 1000,1000,"By resolution, reintroduced and retained in pr...",HPublic Safety,1,Bills,,2016-12-05T13:15:05.217,...,http://app.leg.wa.gov/documents/billdocs/2017-...,House Bill 1000,1000,House Bills,2017-18 1000,House,26175,"{23902,26174,10031,3483,17224,17241,20755,1543...",2017-18 HB 1000,H-0360.1HOUSE BILL 1000State of Washington65th...
1,2018-01-08T00:00:00,2017-18,HB 1002,1002,"By resolution, reintroduced and retained in pr...",HHC/Wellness,1,Bills,,2016-12-05T13:15:16.217,...,http://app.leg.wa.gov/documents/billdocs/2017-...,House Bill 1002,1002,House Bills,2017-18 1002,House,15006,"{20752,15813,17223,24075,20759,20741,3469,1851...",2017-18 HB 1002,H-0289.1HOUSE BILL 1002State of Washington65th...
2,2018-01-08T00:00:00,2017-18,HB 1003,1003,"By resolution, reintroduced and retained in pr...",HHC/Wellness,1,Bills,,2016-12-05T13:15:16.607,...,http://app.leg.wa.gov/documents/billdocs/2017-...,House Bill 1003,1003,House Bills,2017-18 1003,House,14202,"{15006,17223,20757,13209,24075,20741,3469,1851...",2017-18 HB 1003,H-0240.1HOUSE BILL 1003State of Washington65th...
3,2018-01-08T00:00:00,2017-18,HB 1004,1004,"By resolution, reintroduced and retained in pr...",HJudiciary,1,Bills,,2016-12-05T13:15:16.967,...,http://app.leg.wa.gov/documents/billdocs/2017-...,House Bill 1004,1004,House Bills,2017-18 1004,House,14202,"{15006,20741,18516,17221,15820,9996}",2017-18 HB 1004,H-0401.1HOUSE BILL 1004State of Washington65th...
4,2018-01-08T00:00:00,2017-18,HB 1005,1005,"By resolution, reintroduced and retained in pr...","HState Govt, El",1,Bills,,2016-12-05T13:15:27.31,...,http://app.leg.wa.gov/documents/billdocs/2017-...,House Bill 1005,1005,House Bills,2017-18 1005,House,15006,"{20752,17223,20757,20761,17213,13209,24075,823...",2017-18 HB 1005,H-0166.2HOUSE BILL 1005State of Washington65th...
5,2018-01-08T00:00:00,2017-18,HB 1006,1006,"By resolution, reintroduced and retained in pr...",HLabor & Workpl,1,Bills,,2016-12-05T13:15:27.67,...,http://app.leg.wa.gov/documents/billdocs/2017-...,House Bill 1006,1006,House Bills,2017-18 1006,House,14202,"{15820,18516,15006,20741}",2017-18 HB 1006,H-0116.1HOUSE BILL 1006State of Washington65th...
6,2018-01-08T00:00:00,2017-18,HB 1007,1007,"By resolution, reintroduced and retained in pr...",HLabor & Workpl,1,Bills,,2016-12-05T13:15:28.09,...,http://app.leg.wa.gov/documents/billdocs/2017-...,House Bill 1007,1007,House Bills,2017-18 1007,House,14202,"{15820,15006,20741,3469}",2017-18 HB 1007,H-0115.1HOUSE BILL 1007State of Washington65th...
7,2018-01-08T00:00:00,2017-18,HB 1008,1008,"By resolution, reintroduced and retained in pr...",HCap Budget,1,Bills,,2016-12-05T13:15:38.497,...,http://app.leg.wa.gov/documents/billdocs/2017-...,House Bill 1008,1008,House Bills,2017-18 1008,House,14202,"{15006,13209,20741,11952,15820,9996}",2017-18 HB 1008,H-0039.1HOUSE BILL 1008State of Washington65th...
8,2018-01-08T00:00:00,2017-18,HB 1009,1009,"By resolution, reintroduced and retained in pr...",HEnvironment,1,Bills,,2016-12-05T13:15:38.843,...,http://app.leg.wa.gov/documents/billdocs/2017-...,House Bill 1009,1009,House Bills,2017-18 1009,House,14202,"{15006,20741,18516,11952,15820,17158,9996}",2017-18 HB 1009,H-0439.1HOUSE BILL 1009State of Washington65th...
9,2018-01-08T00:00:00,2017-18,HB 1011,1011,"By resolution, reintroduced and retained in pr...",HJudiciary,1,Bills,,2016-12-05T13:15:39.547,...,http://app.leg.wa.gov/documents/billdocs/2017-...,House Bill 1011,1011,House Bills,2017-18 1011,House,15006,"{17223,17213,13209,20741,18516,14207,1649,1195...",2017-18 HB 1011,H-0248.1HOUSE BILL 1011State of Washington65th...


In [14]:
# Build one record per (bill, current legislator) pair.

# Convert the legislator rows to dicts once; they do not depend on the bill,
# so doing this inside the bill loop would repeat the same work for every bill.
legislator_records = [leg_row.to_dict() for _, leg_row in current_legislator_df.iterrows()]

final = []
for _, bill_row in current_bill_merged_df.iterrows():
    bill_dct = bill_row.to_dict()
    for leg_record in legislator_records:
        # Start from a copy of the legislator fields, then overlay the bill
        # fields; on a shared key the bill value wins.
        leg_dct = dict(leg_record)
        leg_dct.update(bill_dct)
        final.append(leg_dct)
current_X_all = pd.DataFrame(final)

In [15]:
# List the columns of the bill x legislator frame.
current_X_all.columns

Index(['action_date', 'agency', 'biennium', 'bill_id', 'bill_in_process',
       'bill_num', 'bill_num_unique', 'bill_text', 'bill_unique', 'class',
       'description', 'district', 'first_name', 'history_line',
       'htm_create_date', 'htm_last_modified_date', 'htm_url', 'id',
       'last_name', 'long_friendly_name', 'name', 'party',
       'primary_sponsor_id', 'secondary_sponsors', 'sponsor_agency', 'status',
       'type'],
      dtype='object')

In [16]:
# Make it explicit that these two columns describe the voting legislator.
# NOTE: index=str also converts every index label to a string.
voter_column_names = {"id": "voter_id", "agency": "voting_agency"}
current_X_all = current_X_all.rename(index=str, columns=voter_column_names)

In [38]:
# Columns that are not carried forward into current_X.
dropped_columns = [
    'action_date', 'bill_in_process', 'status', 'type', 'class',
    'htm_last_modified_date', 'history_line', 'bill_num_unique', 'description',
]
current_X = current_X_all.drop(dropped_columns, axis=1)

In [39]:
# Display current_X after the unused columns were dropped.
current_X

Unnamed: 0,voting_agency,biennium,bill_id,bill_num,bill_text,bill_unique,district,first_name,htm_create_date,htm_url,voter_id,last_name,long_friendly_name,name,party,primary_sponsor_id,secondary_sponsors,sponsor_agency
0,House,2017-18,HB 1000,1000,H-0360.1HOUSE BILL 1000State of Washington65th...,2017-18 HB 1000,23,Sherry,2016-12-05T13:15:05.217,http://app.leg.wa.gov/documents/billdocs/2017-...,10031,Appleton,House Bill 1000,1000,0,26175,"{23902,26174,10031,3483,17224,17241,20755,1543...",House
1,House,2017-18,HB 1000,1000,H-0360.1HOUSE BILL 1000State of Washington65th...,2017-18 HB 1000,2,Andrew,2016-12-05T13:15:05.217,http://app.leg.wa.gov/documents/billdocs/2017-...,24075,Barkis,House Bill 1000,1000,1,26175,"{23902,26174,10031,3483,17224,17241,20755,1543...",House
2,House,2017-18,HB 1000,1000,H-0360.1HOUSE BILL 1000State of Washington65th...,2017-18 HB 1000,11,Steve,2016-12-05T13:15:05.217,http://app.leg.wa.gov/documents/billdocs/2017-...,17227,Bergquist,House Bill 1000,1000,0,26175,"{23902,26174,10031,3483,17224,17241,20755,1543...",House
3,House,2017-18,HB 1000,1000,H-0360.1HOUSE BILL 1000State of Washington65th...,2017-18 HB 1000,19,Brian,2016-12-05T13:15:05.217,http://app.leg.wa.gov/documents/billdocs/2017-...,8317,Blake,House Bill 1000,1000,0,26175,"{23902,26174,10031,3483,17224,17241,20755,1543...",House
4,House,2017-18,HB 1000,1000,H-0360.1HOUSE BILL 1000State of Washington65th...,2017-18 HB 1000,42,Vincent,2016-12-05T13:15:05.217,http://app.leg.wa.gov/documents/billdocs/2017-...,15820,Buys,House Bill 1000,1000,1,26175,"{23902,26174,10031,3483,17224,17241,20755,1543...",House
5,House,2017-18,HB 1000,1000,H-0360.1HOUSE BILL 1000State of Washington65th...,2017-18 HB 1000,26,Michelle,2016-12-05T13:15:05.217,http://app.leg.wa.gov/documents/billdocs/2017-...,20760,Caldier,House Bill 1000,1000,1,26175,"{23902,26174,10031,3483,17224,17241,20755,1543...",House
6,House,2017-18,HB 1000,1000,H-0360.1HOUSE BILL 1000State of Washington65th...,2017-18 HB 1000,15,Bruce,2016-12-05T13:15:05.217,http://app.leg.wa.gov/documents/billdocs/2017-...,3469,Chandler,House Bill 1000,1000,1,26175,"{23902,26174,10031,3483,17224,17241,20755,1543...",House
7,House,2017-18,HB 1000,1000,H-0360.1HOUSE BILL 1000State of Washington65th...,2017-18 HB 1000,24,Mike,2016-12-05T13:15:05.217,http://app.leg.wa.gov/documents/billdocs/2017-...,26176,Chapman,House Bill 1000,1000,0,26175,"{23902,26174,10031,3483,17224,17241,20755,1543...",House
8,House,2017-18,HB 1000,1000,H-0360.1HOUSE BILL 1000State of Washington65th...,2017-18 HB 1000,43,Frank,2016-12-05T13:15:05.217,http://app.leg.wa.gov/documents/billdocs/2017-...,1659,Chopp,House Bill 1000,1000,0,26175,"{23902,26174,10031,3483,17224,17241,20755,1543...",House
9,House,2017-18,HB 1000,1000,H-0360.1HOUSE BILL 1000State of Washington65th...,2017-18 HB 1000,41,Judy,2016-12-05T13:15:05.217,http://app.leg.wa.gov/documents/billdocs/2017-...,8209,Clibborn,House Bill 1000,1000,0,26175,"{23902,26174,10031,3483,17224,17241,20755,1543...",House


In [40]:
def change_agency_to_int(agency):
    '''Encode a chamber name as an integer: 'House' -> 0, 'Senate' -> 1.

    Any other value (including an already-encoded 0 or 1) yields None.
    '''
    for code, chamber_name in enumerate(('House', 'Senate')):
        if agency == chamber_name:
            return code
    return None

In [41]:
def make_sec_sponsors_a_list(sponsors):
    '''Turn a brace-wrapped, comma-separated id string into a list of id strings.

    Input
    sponsors: a string such as '{188,32,7}', or any non-string value.

    Output
    For a string: the list of ids as strings, e.g. ['188', '32', '7'].
    Surrounding whitespace is stripped from each id and blank entries are
    dropped, so '{}' and '' give [] rather than [''].
    For a non-string (e.g. None / NaN): the value itself, unchanged.
    '''
    if not isinstance(sponsors, str):
        return sponsors
    id_tokens = (token.strip() for token in sponsors.strip('{}').split(','))
    # Dropping blanks keeps an empty array from being counted as one sponsor
    # and from reaching int('') in later lookups.
    return [token for token in id_tokens if token]

In [42]:
# Encode both chamber columns as integers (House -> 0, Senate -> 1).
current_X['sponsor_agency'] = current_X['sponsor_agency'].apply(change_agency_to_int)
current_X['voting_agency'] = current_X['voting_agency'].apply(change_agency_to_int)
current_X['primary_sponsor_id'] = current_X['primary_sponsor_id'].apply(int)

# Parse the secondary sponsors and replace missing values with ''.
# The result is assigned back to the column: calling fillna(inplace=True) on
# current_X[...] acts on an intermediate object and is not guaranteed to
# update current_X (it does not under pandas Copy-on-Write).
current_X['secondary_sponsors'] = (
    current_X['secondary_sponsors']
    .apply(make_sec_sponsors_a_list)
    .fillna('')
)

# True where the voting legislator is the bill's primary sponsor.
current_X['is_primary_sponsor'] = current_X['voter_id'] == current_X['primary_sponsor_id']

In [43]:
# Load the party-minority history through the project helper.
# make_is_minority_party reads this frame as a global.
minority_hist = load_and_clean_party_minority_history_df()

In [49]:
# Preview the first rows of the minority-party history.
minority_hist.head()

Unnamed: 0,agency,biennium,minority_party
0,1,1991-92,0
1,0,1991-92,1
2,1,1993-94,1
3,0,1993-94,1
4,1,1995-96,1


In [65]:
def make_num_sponsors(secondary_sponsors):
    '''Return the number of secondary sponsors plus one.

    The extra one presumably accounts for the primary sponsor. An empty list or
    an empty string gives 1.
    '''
    secondary_count = len(secondary_sponsors)
    return secondary_count + 1

def make_is_minority_party(row):
    '''Return whether the row's party equals the recorded minority party.

    Looks up the global minority_hist entry for the row's biennium and voting
    agency and compares its 'minority_party' with row['party']. Only the first
    matching entry is used; IndexError is raised when nothing matches.
    '''
    same_biennium = minority_hist['biennium'] == row['biennium']
    same_agency = minority_hist['agency'] == row['voting_agency']
    matching_history = minority_hist[same_biennium & same_agency]
    party_is_minority = matching_history['minority_party'] == row['party']
    return party_is_minority.iloc[0]

def make_is_secondary_sponsor(row):
    '''Return True when the row's voter_id, as a string, is among its secondary sponsors.

    An empty 'secondary_sponsors' value (empty list or empty string) gives False.
    '''
    sponsor_ids = row['secondary_sponsors']
    if len(sponsor_ids) == 0:
        return False
    return str(row['voter_id']) in sponsor_ids

def find_sponsor_party(row):
    '''Look up the party of the row's primary sponsor in the global legislator_df.

    A legislator matches when both 'id' and 'agency' equal the row's
    primary_sponsor_id and sponsor_agency (compared as ints). Returns the value
    in column position 4 of the first match, or 2 when nobody matches.
    '''
    ps_id = row['primary_sponsor_id']
    ps_agency = row['sponsor_agency']
    id_matches = legislator_df['id'] == int(ps_id)
    agency_matches = legislator_df['agency'] == int(ps_agency)
    candidates = legislator_df[id_matches & agency_matches]
    if len(candidates) == 0:
        # 2 is presumably an "unknown party" code -- TODO confirm.
        return 2
    # Column position 4 is assumed to hold the party -- TODO confirm against the legislator table.
    return candidates.iloc[0, 4]

In [46]:
# Sponsor count per row: length of the secondary sponsor value plus one.
current_X['num_sponsors'] = current_X['secondary_sponsors'].apply(make_num_sponsors)

In [79]:
# Row count of the bill x legislator frame.
len(current_X)

603856

In [83]:
# First 30,000 rows as a smaller sample; `test` is not referenced by any other cell shown here.
test = current_X.iloc[0:30000, :]

In [86]:
# Row-wise lookup against minority_hist; requires minority_hist to be loaded first.
current_X['is_minority_party'] = current_X.apply(make_is_minority_party, axis=1)

In [88]:
# True where the voting legislator's id appears among the bill's secondary sponsors.
current_X['is_secondary_sponsor'] = current_X.apply(make_is_secondary_sponsor, axis=1)

In [103]:
# Encode legislator_df['agency'] as integers (House -> 0, Senate -> 1) so that
# find_sponsor_party can compare it with the integer sponsor_agency.
# The guard makes the cell safe to re-run: change_agency_to_int maps an
# already-encoded 0/1 to None, which would otherwise wipe the column.
if legislator_df['agency'].isin(['House', 'Senate']).any():
    legislator_df['agency'] = legislator_df['agency'].apply(change_agency_to_int)

In [None]:
# Row-wise lookup of the primary sponsor's party in legislator_df.
# find_sponsor_party compares legislator_df['agency'] with an int, so that column must already be integer-encoded.
current_X['sponsor_party'] = current_X.apply(find_sponsor_party, axis=1)

In [91]:
# Display current_X with the engineered sponsor / party columns added.
current_X

Unnamed: 0,voting_agency,biennium,bill_id,bill_num,bill_text,bill_unique,district,first_name,htm_create_date,htm_url,...,name,party,primary_sponsor_id,secondary_sponsors,sponsor_agency,is_primary_sponsor,num_sponsors,is_minority_party,is_secondary_sponsor,sponsor_party
0,0,2017-18,HB 1000,1000,H-0360.1HOUSE BILL 1000State of Washington65th...,2017-18 HB 1000,23,Sherry,2016-12-05T13:15:05.217,http://app.leg.wa.gov/documents/billdocs/2017-...,...,1000,0,26175,"[23902, 26174, 10031, 3483, 17224, 17241, 2075...",0,False,10,False,True,2
1,0,2017-18,HB 1000,1000,H-0360.1HOUSE BILL 1000State of Washington65th...,2017-18 HB 1000,2,Andrew,2016-12-05T13:15:05.217,http://app.leg.wa.gov/documents/billdocs/2017-...,...,1000,1,26175,"[23902, 26174, 10031, 3483, 17224, 17241, 2075...",0,False,10,True,False,2
2,0,2017-18,HB 1000,1000,H-0360.1HOUSE BILL 1000State of Washington65th...,2017-18 HB 1000,11,Steve,2016-12-05T13:15:05.217,http://app.leg.wa.gov/documents/billdocs/2017-...,...,1000,0,26175,"[23902, 26174, 10031, 3483, 17224, 17241, 2075...",0,False,10,False,False,2
3,0,2017-18,HB 1000,1000,H-0360.1HOUSE BILL 1000State of Washington65th...,2017-18 HB 1000,19,Brian,2016-12-05T13:15:05.217,http://app.leg.wa.gov/documents/billdocs/2017-...,...,1000,0,26175,"[23902, 26174, 10031, 3483, 17224, 17241, 2075...",0,False,10,False,False,2
4,0,2017-18,HB 1000,1000,H-0360.1HOUSE BILL 1000State of Washington65th...,2017-18 HB 1000,42,Vincent,2016-12-05T13:15:05.217,http://app.leg.wa.gov/documents/billdocs/2017-...,...,1000,1,26175,"[23902, 26174, 10031, 3483, 17224, 17241, 2075...",0,False,10,True,False,2
5,0,2017-18,HB 1000,1000,H-0360.1HOUSE BILL 1000State of Washington65th...,2017-18 HB 1000,26,Michelle,2016-12-05T13:15:05.217,http://app.leg.wa.gov/documents/billdocs/2017-...,...,1000,1,26175,"[23902, 26174, 10031, 3483, 17224, 17241, 2075...",0,False,10,True,False,2
6,0,2017-18,HB 1000,1000,H-0360.1HOUSE BILL 1000State of Washington65th...,2017-18 HB 1000,15,Bruce,2016-12-05T13:15:05.217,http://app.leg.wa.gov/documents/billdocs/2017-...,...,1000,1,26175,"[23902, 26174, 10031, 3483, 17224, 17241, 2075...",0,False,10,True,False,2
7,0,2017-18,HB 1000,1000,H-0360.1HOUSE BILL 1000State of Washington65th...,2017-18 HB 1000,24,Mike,2016-12-05T13:15:05.217,http://app.leg.wa.gov/documents/billdocs/2017-...,...,1000,0,26175,"[23902, 26174, 10031, 3483, 17224, 17241, 2075...",0,False,10,False,False,2
8,0,2017-18,HB 1000,1000,H-0360.1HOUSE BILL 1000State of Washington65th...,2017-18 HB 1000,43,Frank,2016-12-05T13:15:05.217,http://app.leg.wa.gov/documents/billdocs/2017-...,...,1000,0,26175,"[23902, 26174, 10031, 3483, 17224, 17241, 2075...",0,False,10,False,False,2
9,0,2017-18,HB 1000,1000,H-0360.1HOUSE BILL 1000State of Washington65th...,2017-18 HB 1000,41,Judy,2016-12-05T13:15:05.217,http://app.leg.wa.gov/documents/billdocs/2017-...,...,1000,0,26175,"[23902, 26174, 10031, 3483, 17224, 17241, 2075...",0,False,10,False,False,2


In [None]:
def create_bill_rep_score(row, legislators=None):
    '''Return the mean party value over a bill's secondary and primary sponsors.

    For each sponsor id the party value is the average of column position 4 over
    all legislator rows with that id; the score is the mean of those per-sponsor
    values. With party presumably coded 0/1 (1 = Republican -- TODO confirm),
    this is the share of Republican sponsors.

    Input
    row: mapping / pandas Series with 'secondary_sponsors' and 'primary_sponsor_id'.
    legislators: optional legislator dataframe with an 'id' column and the party
        in column position 4. Defaults to the global legislator_df.

    Output
    The score as a float; NaN when no sponsor is found in the legislator frame;
    None when 'secondary_sponsors' is not a list, so the caller can fill the
    field from the primary sponsor's party later.
    '''
    if legislators is None:
        legislators = legislator_df

    sponsors = row['secondary_sponsors']
    if not isinstance(sponsors, list):
        return None

    # Build a new list instead of appending to `sponsors`: that list lives in
    # the dataframe cell, so an in-place append would add the primary sponsor
    # again on every run.
    all_sponsors = list(sponsors) + [row['primary_sponsor_id']]

    sponsor_parties = []
    for sponsor in all_sponsors:
        if isinstance(sponsor, str) and not sponsor.strip():
            # A blank id (e.g. from an empty '{}' array) cannot be looked up.
            continue
        subset_leg = legislators[legislators['id'] == int(sponsor)]
        if len(subset_leg) > 0:
            # Column position 4 is assumed to be the party -- TODO confirm.
            sponsor_parties.append(sum(subset_leg.iloc[:, 4]) / len(subset_leg))

    if not sponsor_parties:
        # Same value np.mean([]) would give, without the RuntimeWarning.
        return np.nan
    return np.mean(sponsor_parties)

In [None]:
# Row-wise sponsor party score for every row of the frame read from the staging "rep_score" table.
rep_score_df['rep_score'] = rep_score_df.apply(create_bill_rep_score, axis=1)