In [92]:
import psycopg2
import pandas as pd
import sqlalchemy
from sqlalchemy import create_engine

from collections import defaultdict
import pandas as pd

from selenium.webdriver import (Chrome, Firefox)
import urllib
from bs4 import BeautifulSoup

In [93]:
def scrape_bill_topic_table(year):
    '''Scrape data from apps.leg.wa.gov/billsbytopic organize into a dataframe with the following columns:
       bill_id, bill_topic, bill_topic_expanded.
       
       year: str in format '2015' 
       '''
    
    browser = Firefox()
    browser.get('http://apps.leg.wa.gov/billsbytopic/Results.aspx?year={}'.format(year))

    table = browser.find_elements_by_css_selector('div#divContent table')[1]
    html = table.get_attribute('innerHTML')
    soup = BeautifulSoup(html, 'html.parser')
    rows = soup.select('tr')
    data = []
    heading = None
    
    for i, row in enumerate(rows):
        row_data = {}
        tds = row.select('td')
        
        if not tds: 
            continue
        if tds[0].attrs.get('width') != '5%':
            heading = row.select_one('td a').text.strip()
            continue
            
        row_data['bill_topic'] = heading
        row_data['bill_topic_expanded'] = tds[1].text.partition(':')[0]
        bill_ids = [a.text for a in tds[1].select('a')]

        for bill_id in bill_ids:
            row_data['bill_id'] = bill_id
            data.append(row_data)

    return pd.DataFrame(data)

In [107]:
def save_topic_data_STEP_FIVE():
    '''The API contains data that dates back to 1991. This data will 
    not change and therefore only needs to be retrieved once.

    Creates table: topic_scrape
    '''  
    # Create connection to wa_leg_raw postgres database
    engine = sqlalchemy.create_engine('postgresql://emilykarboski@localhost:5432/wa_leg_raw')
    con = engine.connect()

    # Scrape topic data and put in postgres table
    years = ['1991', '1992', '1993', '1994', '1995', '1996', '1997', '1998', '1999', '2000',
             '2001', '2002', '2003', '2004', '2005', '2006', '2007', '2008', '2009', '2010', 
             '2011', '2012', '2013', '2014', '2015', '2016', '2017', '2018']

    topic_dfs = []
    for year in years:
        topic_df = scrape_bill_topic_table(year)
        topic_df['year'] = year
        topic_dfs.append(topic_df)
        
    topic_dfs[0].to_sql('topic_scrape', con, if_exists='replace', index=False)
    
    for df in topic_dfs[1:]:
        df.to_sql('topic_scrape', con, if_exists='append', index=False)

In [128]:
topic_2000 = topic_df_n[0]
topic_2001 = topic_df_n[1]

In [132]:
topic_df_sub = topic_df[topic_df['year'] != '20002001']

In [135]:
topic_df_sub = pd.concat([topic_df_sub, topic_2000])

In [136]:
topic_df_sub = pd.concat([topic_df_sub, topic_2001])

In [138]:
topic_df_sub['year'].unique()

array(['1991', '1992', '1993', '1994', '1995', '1996', '1997', '1998',
       '1999', '2005', '2002', '2003', '2004', '2006', '2007', '2008',
       '2009', '2010', '2011', '2012', '2013', '2014', '2015', '2016',
       '2017', '2018', '2000', '2001'], dtype=object)

In [140]:
topic_df_sub.to_sql('topic_scrape', con, if_exists='replace', index=False)

# Load data

In [158]:
engine = create_engine('postgresql://localhost:5432/wa_leg_staging')
con = engine.connect()

In [76]:
bill_df = pd.read_sql_query('select * from "bill_api"',con=engine)

In [141]:
topic_df = pd.read_sql_query('select * from "topic_scrape"',con=engine)

# Prepare topic_data

In [142]:
topic_df['bill_id'] = topic_df['bill_id'].apply(lambda x: x.split(',')[0])

In [143]:
topic_df.head()

Unnamed: 0,bill_id,bill_topic,bill_topic_expanded,year
0,HB 2682,ABANDONED PROPERTY,"Intangible property, when presumed abandoned a...",1991
1,SB 5049,ABANDONED PROPERTY,"Junk vehicles, disposal of abandoned vehicles ...",1991
2,SHB 1137,ABANDONED PROPERTY,"Uncashed checks, duties of local governments h...",1991
3,SSB 5185,ABANDONED PROPERTY,"Uncashed checks, local governments authorized ...",1991
4,SSB 5185,ABANDONED PROPERTY,"Uncashed checks, local governments authorized ...",1991


In [144]:
topic_df['year'].unique()

array(['1991', '1992', '1993', '1994', '1995', '1996', '1997', '1998',
       '1999', '2002', '2005', '2003', '2004', '2006', '2007', '2008',
       '2009', '2010', '2011', '2012', '2013', '2014', '2015', '2016',
       '2017', '2018', '2000', '2001'], dtype=object)

In [145]:
def change_year_to_biennium(year):
    '''Changes year to biennium. Input is string, output is string'''
    if year == '1991' or year == '1992':
        return '1991-92'
    if year == '1993' or year == '1994':
        return '1993-94'
    if year == '1995' or year == '1996':
        return '1995-96'
    if year == '1997' or year == '1998':
        return '1997-98'
    if year == '1999' or year == '2000':
        return '1999-00'
    if year == '2001' or year == '2002':
        return '2001-02'
    if year == '2003' or year == '2004':
        return '2003-04'
    if year == '2005' or year == '2006':
        return '2005-06'
    if year == '2007' or year == '2008':
        return '2007-08'
    if year == '2009' or year == '2010':
        return '2003-04'
    if year == '2011' or year == '2012':
        return '2011-12'
    if year == '2013' or year == '2014':
        return '2013-14'
    if year == '2015' or year == '2016':
        return '2015-16'
    if year == '2017' or year == '2018':
        return '2017-18'

In [146]:
topic_df['biennium'] = topic_df['year'].apply(change_year_to_biennium)

In [147]:
topic_df['bill_num'] = topic_df['bill_id'].str[-4:]

In [148]:
topic_df = topic_df.drop(['bill_topic_expanded', 'year'], axis = 1)

In [149]:
topic_df.drop_duplicates(keep='first', inplace=True)

In [150]:
topic_df[topic_df['bill_num'] == '2293']

Unnamed: 0,bill_id,bill_topic,biennium,bill_num
15,ESHB 2293,"ACCOUNTANCY, BOARD",1991-92,2293
27,ESHB 2293,ACCOUNTANTS AND ACCOUNTING,1991-92,2293
2930,ESHB 2293,COLLEGES AND UNIVERSITIES,1991-92,2293
8669,ESHB 2293,GOVERNOR,1991-92,2293
13041,ESHB 2293,MINORITIES,1991-92,2293
15279,ESHB 2293,PUBLIC FUNDS AND ACCOUNTS,1991-92,2293
51760,HB 2293,"COMMUNITY AND TECHNICAL COLLEGES, BOARD",1993-94,2293
66503,HB 2293,"PUBLIC INSTRUCTION, SUPERINTENDENT",1993-94,2293
68465,HB 2293,SCHOOLS AND SCHOOL DISTRICTS,1993-94,2293
72700,HB 2293,TELECOMMUNICATIONS,1993-94,2293


In [151]:
t = topic_df.loc[:, ['bill_id', 'biennium', 'bill_num']]

In [152]:
t = t.drop_duplicates(keep='first')

In [153]:
len(t), len(topic_df)

(51375, 140500)

In [154]:
unique_topics = topic_df['bill_topic'].unique()
for topic in unique_topics:
    t[topic] = 0

In [155]:
for row in topic_df.iterrows():
#     print(row[1]['bill_topic'])
    t.loc[(t['bill_num'] == row[1]['bill_num']) & (t['biennium'] == row[1]['biennium']), row[1]['bill_topic']] = 1

In [None]:
t.loc[(t['bill_num'] == '1001') & (t['biennium'] == '2015-16'), 'BUDGETS']

In [157]:
t.shape

(51375, 1784)

In [160]:
t.to_csv('topic_staging.csv', sep="|")

In [None]:
bill_df['bill_num'] = bill_df['bill_id'].str[-4:]

# Make small data sets to play with

In [None]:
topic_df_1991 = topic_df[topic_df['biennium'] == '1991-92']

In [None]:
bill_df_1991 = bill_df[bill_df['biennium'] == '1991-92']

In [None]:
merged_df = bill_df.merge(topic_df, how='outer', on=['bill_num', 'biennium'])

In [None]:
len(merged_df)

In [None]:
len(merged_df[merged_df['bill_topic'].isnull()])

In [None]:
len(merged_df['bill_topic'].unique())

In [None]:
merged_df['bill_topic']