In [6]:
import os
import re
import wget
import PyPDF2
import requests
import pandas as pd
from bs4 import BeautifulSoup
from bs4.element import Comment

# Summary

This notebook walks through scraping bills from the official government sites of CA, NH, NY, PA, and SD. It is broken up into a set of initial text extraction functions followed by broadly similar scraping functions for each state. It demonstrates how to use requests, Beautiful Soup, and regex to find links to pages with bill text from navigation pages. We also show the code to extract raw bill text from either HTML or PDF.

## 0. Raw text extraction functions

In [17]:
# Basic functions to extract all visible text on a page

def tag_visible(element):
    """Return True when a text node should count as visible page text.

    A node is hidden when its parent tag is one of the non-rendered
    containers listed below, or when the node itself is an HTML comment.
    """
    hidden_parents = ('style', 'script', 'head', 'title', 'meta', '[document]')
    is_hidden = element.parent.name in hidden_parents or isinstance(element, Comment)
    return not is_hidden


def text_from_html(soup):
    """
    Joins every visible text node of a parsed page into one string.

    Parameters
    ----------
    soup : BeautifulSoup
        parsed document to pull text from

    Returns
    -------
    str with each visible text node stripped and separated by a single space

    """
    # find_all(string=True) is the current spelling of findAll(text=True) and
    # matches the find_all calls used by the scrapers in this notebook.
    text_nodes = soup.find_all(string=True)
    return " ".join(node.strip() for node in text_nodes if tag_visible(node))


def get_raw_text(link, pdf=False):
    """
    Downloads a bill and returns its text.

    Parameters
    ----------
    link : str
        url of the bill text

    pdf : bool
        if True the link is downloaded to a file and read with PyPDF2,
        otherwise it is fetched with requests and reduced to its visible text

    Returns
    -------
    str of raw text

    """
    if pdf:
        # wget writes the download to disk and returns the local file name
        filename = wget.download(link)
        try:
            pdfReader = PyPDF2.PdfFileReader(filename)
            pages = (pdfReader.getPage(i) for i in range(pdfReader.numPages))
            return "".join(page.extractText() for page in pages)
        finally:
            # always delete the download, even when the pdf cannot be parsed
            os.remove(filename)

    r = requests.get(link)
    soup = BeautifulSoup(r.text, 'html.parser')
    return text_from_html(soup)

## 1. Pennsylvania Scraper



In [126]:
# Get PA Links

def get_pa_links(start_year, end_year):
    """
    Gets links to full text data of bills from PA bill site. 

    Parameters
    ----------
    start_year : int
        first year of PA bill session to scrape

    end_year: int
        year to stop at; it is exclusive, so (2018, 2019) scrapes 2018 only

    Returns
    -------
    dataframe of links with associated year, with columns
    bill_num, date, bill_id, title, year and link
    (an empty dataframe with those columns when the year range is empty)
    
    """
    columns = ['bill_num', 'date', 'bill_id', 'title', 'year', 'link']
    base_link = "https://www.legis.state.pa.us/cfdocs/legis/CL/Public/cl_view_action2.cfm?sess_yr={0}&sess_ind=0&cl_typ=GA&cl_nbr="
    text_link = "https://www.legis.state.pa.us//WU01/LI/LI/US/HTM/{0}/0/{1}..HTM"
    dfs = []
    for year in range(start_year, end_year):
        response = requests.get(base_link.format(year))
        soup = BeautifulSoup(response.text, 'html.parser')
        table = soup.find('table')
        res = []
        for tr in table.find_all('tr'):
            # keep only the non-empty cell texts of each table row
            row = [td.text.strip() for td in tr.find_all('td') if td.text.strip()]
            if row:
                res.append(row)
        xf = pd.DataFrame(res)
        xf['year'] = year
        dfs.append(xf)
    if not dfs:
        # pd.concat raises on an empty list, so hand back an empty frame instead
        return pd.DataFrame(columns=columns)
    df = pd.concat(dfs).reset_index(drop=True)
    # the first table column is zero-padded to four digits inside the text url
    df['link'] = df.apply(lambda row: text_link.format(row['year'], str(row[0]).zfill(4)), axis=1)
    df.columns = columns
    return df

In [127]:
# Collect the PA bill links for the requested year range
pa_test = get_pa_links(start_year=2018, end_year=2019)

In [128]:
# Fetch the html text behind every PA link
pa_test['raw_text'] = pa_test['link'].apply(get_raw_text)

In [129]:
# Preview the first five PA rows
pa_test.head(n=5)

Unnamed: 0,bill_num,date,bill_id,title,year,link,raw_text
0,1,Feb 1,"HB 1608\nAddBillSummaryTooltip('#Bill_1','2017...",Trooper Landon E. Weaver Memorial Bridge - des...,2018,https://www.legis.state.pa.us//WU01/LI/LI/US/H...,TROOPER LANDON E. WEAVER MEMORIAL BRIDG...
1,2,Feb 14,"HB 1175\nAddBillSummaryTooltip('#Bill_2','2017...","Public Officers (65 Pa.C.S.) - registration, r...",2018,https://www.legis.state.pa.us//WU01/LI/LI/US/H...,PUBLIC OFFICERS (65 PA.C.S.) - REGISTRA...
2,3,Feb 15,"HB 359\nAddBillSummaryTooltip('#Bill_3','2017'...",Game and Wildlife Code (34 Pa.C.S.) - powers a...,2018,https://www.legis.state.pa.us//WU01/LI/LI/US/H...,GAME AND WILDLIFE CODE (34 PA.C.S.) - P...
3,4,Feb 15,"HB 1602\nAddBillSummaryTooltip('#Bill_4','2017...",Multiple designations in multiple counties - d...,2018,https://www.legis.state.pa.us//WU01/LI/LI/US/H...,MULTIPLE DESIGNATIONS IN MULTIPLE COUNT...
4,5,Feb 15,"HB 1653\nAddBillSummaryTooltip('#Bill_5','2017...",Higher Education Scholarship Law - eligibility...,2018,https://www.legis.state.pa.us//WU01/LI/LI/US/H...,HIGHER EDUCATION SCHOLARSHIP LAW - ELIG...


In [130]:
# Show the scraped text of the row labelled 0
pa_test.loc[0, 'raw_text']

"       TROOPER LANDON E. WEAVER MEMORIAL BRIDGE - DESIGNATION       Act of Feb. 1, 2018,\n                  P.L. 1,\n                  No. 1  Cl. 87    20180001ua  An Act    Designating the bridge carrying U.S. Route 22 over the Juniata River, Norfolk Southern\n            Corporation rail tracks and State Route 1010 (Penn Street) in Smithfield and Henderson\n            Townships, Huntingdon County, as the Trooper Landon E. Weaver Memorial Bridge.    The General Assembly of the Commonwealth of Pennsylvania hereby enacts as follows:    20180001u1s  Section 1. \xa0Trooper Landon E. Weaver Memorial Bridge.  (a) \xa0Findings.--The General Assembly finds and declares as follows:  (1) \xa0Trooper Landon E. Weaver was born November 11, 1993, in Altoona, Blair County, son\n            of Eric E. and Christine L. Weaver.  (2) \xa0Trooper Weaver, who grew up in rural Blair County, lived with his wife Macy Weaver\n            in Martinsburg.  (3) \xa0Trooper Weaver was a 2012 graduate of Centra

## 2. South Dakota Scraper

In [132]:
# Get SD Links

def get_sd_links(start_year, end_year):
    """
    Gets links to full text data of bills from SD bill site. 

    Parameters
    ----------
    start_year : int
        first year of SD bill session to scrape

    end_year: int
        year to stop at; it is exclusive, so (2018, 2019) scrapes 2018 only

    Returns
    -------
    dataframe of links with associated year, with columns
    year, bill_id and link (empty when no bills are found)
    
    """
    base_link = "https://sdlegislature.gov/docs/legsession/{0}/Bills/{1}{2}ENR.pdf"
    report_link = "https://sdlegislature.gov/Legislative_Session/Bill_Reports/default.aspx?Session={0}"
    rows = []

    for year in range(start_year, end_year):
        r = requests.get(report_link.format(year))
        soup = BeautifulSoup(r.text, 'html.parser')
        for a in soup.find_all(attrs={'data-title': 'Bill'}):
            # first two characters of the element text, e.g. 'HB'
            prefix = a.text[0:2]
            rows.append([year, prefix + a['id'], base_link.format(year, prefix, a['id'])])
    # naming the columns in the constructor keeps an empty result valid;
    # assigning three names to an empty frame afterwards raises ValueError
    return pd.DataFrame(rows, columns=['year', 'bill_id', 'link'])

In [133]:
# Collect the SD bill links for the requested year range
sd_test = get_sd_links(start_year=2018, end_year=2019)

In [134]:
# SD links point at pdf files, so read each one through the pdf branch
sd_test['raw_text'] = sd_test['link'].apply(lambda link: get_raw_text(link, pdf=True))

In [135]:
# Preview the first five SD rows
sd_test.head(n=5)

Unnamed: 0,year,bill_id,link,raw_text
0,2018,HB1002,https://sdlegislature.gov/docs/legsession/2018...,"AN ACT\nENTITLED, An Act to revise \ncertain p..."
1,2018,HB1003,https://sdlegislature.gov/docs/legsession/2018...,"AN ACT\nENTITLED, An Act to revise certain pro..."
2,2018,HB1004,https://sdlegislature.gov/docs/legsession/2018...,"AN ACT\nENTITLED, An Act to revise certain pro..."
3,2018,HB1005,https://sdlegislature.gov/docs/legsession/2018...,"AN ACT\nENTITLED, An Act to revise certain req..."
4,2018,HB1006,https://sdlegislature.gov/docs/legsession/2018...,"AN ACT\nENTITLED, An Act to revise the extent ..."


In [137]:
# Show the scraped text of the row labelled 0
sd_test.loc[0, 'raw_text']



## 3. New Hampshire Scraper

In [138]:
# Get nh Links

def get_nh_links(start_year, end_year):
    """
    Gets links to full text data of bills from nh bill site. 

    Parameters
    ----------
    start_year : int
        first year of nh bill session to scrape

    end_year: int
        year to stop at; it is exclusive, so (2017, 2019) scrapes 2017 and 2018

    Returns
    -------
    dataframe of links with associated year, with columns
    year, bill_id and link (empty when no bills are found)
    
    """
    base_link = "http://www.gencourt.state.nh.us{0}"
    rows = []

    for year in range(start_year, end_year):
        r = requests.get("http://www.gencourt.state.nh.us/legislation/{0}/".format(year))
        # name the parser explicitly: features="html" picks whichever html
        # parser happens to be installed, so results could differ per machine
        soup = BeautifulSoup(r.text, 'html.parser')
        for bill in soup.find_all('a', {'href': re.compile(r'html')}):
            href = bill['href']
            # last path segment without its extension is used as the bill id
            bill_id = href.split('/')[-1].split('.')[0]
            rows.append([year, bill_id, base_link.format(href)])
    # naming the columns in the constructor keeps an empty result valid;
    # assigning three names to an empty frame afterwards raises ValueError
    return pd.DataFrame(rows, columns=['year', 'bill_id', 'link'])

In [139]:
# Collect the NH bill links for the requested year range
nh_test = get_nh_links(start_year=2017, end_year=2019)

In [140]:
# Fetch the html text behind every NH link
nh_test['raw_text'] = nh_test['link'].apply(get_raw_text)

In [141]:
# Preview the first five NH rows
nh_test.head(n=5)

Unnamed: 0,year,bill_id,link,raw_text
0,2017,cacr0001,http://www.gencourt.state.nh.us/legislation/20...,CACR 1 1 - AS INTRODUCED 2017 SESSION 17-...
1,2017,cacr0002,http://www.gencourt.state.nh.us/legislation/20...,CACR 2 2 - AS INTRODUCED 2017 SESSION 17-...
2,2017,cacr0003,http://www.gencourt.state.nh.us/legislation/20...,CACR 3 3 - AS INTRODUCED 2017 SESSION 17-...
3,2017,cacr0004,http://www.gencourt.state.nh.us/legislation/20...,CACR 4 4 - AS INTRODUCED 2017 SESSION 17-...
4,2017,cacr0005,http://www.gencourt.state.nh.us/legislation/20...,CACR 5 5 - AS INTRODUCED 2017 SESSION 17-...


In [142]:
# Show the scraped text of the row labelled 0
nh_test.loc[0, 'raw_text']

'   CACR 1 1 - AS INTRODUCED   2017 SESSION 17-0412 06/10  CONSTITUTIONAL AMENDMENT CONCURRENT RESOLUTION 1  RELATING TO:\tthe general court.  PROVIDING THAT:\tthe general court shall hold sessions biennially.  SPONSORS:\tRep. Silber, Belk. 2  COMMITTEE:\tLegislative Administration  -----------------------------------------------------------------  ANALYSIS   This constitutional amendment concurrent resolution provides that the general court meet biennially.  - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -  Explanation:\tMatter added to current law appears in bold italics. Matter removed from current law appears [ in brackets and struckthrough. ] Matter which is either (a) all new or (b) repealed and reenacted appears in regular type  17-0412 06/10 STATE OF NEW HAMPSHIRE  In the Year of Our Lord Two Thousand Seventeen  CONCURRENT RESOLUTION PROPOSING CONSITUTIONAL AMENDMENT  RELATING T

## 4. California Scraper

In [85]:
# Get ca Links

def get_ca_links(start_year, end_year):
    """
    Gets links to full text data of bills from ca bill site. 

    Parameters
    ----------
    start_year : int
        first year of ca bill session to scrape

    end_year: int
        year to stop at; it is exclusive

    Each year y in the range is searched as session_year "{y}{y+1}".

    Returns
    -------
    dataframe of links with associated year, with columns
    year, bill_id and link (empty when no bills are found)
    
    """
    base_link = "http://leginfo.legislature.ca.gov"
    rows = []

    for year in range(start_year, end_year):
        r = requests.get("http://leginfo.legislature.ca.gov/faces/billSearchClient.xhtml?session_year={0}{1}&house=Both&author=All&lawCode=All".format(year, year+1))
        # name the parser explicitly: features="html" picks whichever html
        # parser happens to be installed, so results could differ per machine
        soup = BeautifulSoup(r.text, 'html.parser')
        for bill in soup.find_all('a', {'href': re.compile(r'bill_id')}):
            href = bill['href']
            # split at the first occurrence only: an unlimited split returns ''
            # when the bill number itself ends with the same digits
            # (e.g. AB2018 searched under session 20172018)
            bill_id = href.split(str(year + 1), 1)[-1]
            rows.append([year, bill_id, base_link + href])
    # naming the columns in the constructor keeps an empty result valid;
    # assigning three names to an empty frame afterwards raises ValueError
    return pd.DataFrame(rows, columns=['year', 'bill_id', 'link'])

In [89]:
# Collect the CA bill links for the requested year range
ca_test = get_ca_links(start_year=2017, end_year=2020)

In [102]:
# CA takes a long time to hit raw text, many links and may block. I suggest introducing a sleep
# in the loop if you intend to extract lots of raw text
# A fixed random_state makes the 10-bill demo sample the same on every run
ca_demo = ca_test.sample(10, random_state=0)

In [103]:
# Fetch the html text behind each sampled CA link
ca_demo['raw_text'] = ca_demo['link'].apply(get_raw_text)

In [106]:
# Display the sampled CA bills together with their scraped text
ca_demo

Unnamed: 0,year,bill_id,link,raw_text
2075,2017,0AB2076,http://leginfo.legislature.ca.gov/faces/billNa...,skip to content home accessibilit...
2100,2017,0AB2101,http://leginfo.legislature.ca.gov/faces/billNa...,skip to content home accessibilit...
7619,2019,0AJR15,http://leginfo.legislature.ca.gov/faces/billNa...,skip to content home accessibilit...
8475,2019,0SB780,http://leginfo.legislature.ca.gov/faces/billNa...,skip to content home accessibilit...
5004,2017,0SB1248,http://leginfo.legislature.ca.gov/faces/billNa...,skip to content home accessibilit...
1445,2017,0AB1446,http://leginfo.legislature.ca.gov/faces/billNa...,skip to content home accessibilit...
2109,2017,0AB2110,http://leginfo.legislature.ca.gov/faces/billNa...,skip to content home accessibilit...
1706,2017,0AB1707,http://leginfo.legislature.ca.gov/faces/billNa...,skip to content home accessibilit...
576,2017,0AB577,http://leginfo.legislature.ca.gov/faces/billNa...,skip to content home accessibilit...
2571,2017,0AB2572,http://leginfo.legislature.ca.gov/faces/billNa...,skip to content home accessibilit...


In [105]:
# Show the scraped text of the first sampled CA bill (by position)
ca_demo['raw_text'].iloc[0]

"         skip to content    home  accessibility  FAQ  feedback  sitemap   login   x                   Quick Search:    Bill Number  Bill Keyword             Home   Bill Information   California Law   Publications   Other Resources   My Subscriptions   My Favorites                  Bill Information >>  Bill Search >>  Text        Bill Text Bill Information         PDF2  PDF    | Add To My Favorites | Version:  07/16/18 - Chaptered  07/05/18 - Enrolled  02/27/18 - Amended Assembly  02/07/18 - Introduced         AB-2076 County employees’ retirement: disability: date of retirement. (2017-2018)      Text   >>   Votes   >>   History  >>   Bill Analysis  >>   Today's Law As Amended  >>    Compare Versions  >>   Status  >>   Comments To Author   >>   Add To My Favorites  >>            SHARE THIS:    Date Published: 07/16/2018 09:00 PM   Bill Start Assembly Bill\n              \n                        No. 2076 CHAPTER 97 An act to add Section 31541.1 to the Government Code, relating to county

## 5. New York Scraper

In [144]:
# Get ny Links

def get_ny_links(start_year, end_year):
    """
    Gets links to full text data of bills from ny bill site. 

    Parameters
    ----------
    start_year : int
        first year of ny bill session to scrape

    end_year: int
        year to stop at; it is exclusive

    Each year in the range is posted to the advanced search form as `term`.

    Returns
    -------
    dataframe of links with associated year, with columns
    year, bill_id and link (empty when no bills are found)
    
    """
    base_link = "https://nyassembly.gov/leg/"
    rows = []

    for year in range(start_year, end_year):
        form = dict(evt_fld='Search',
                    by='a',
                    term=str(year),
                    leg_type='B',
                    comm_status='C',
                    bill_status='L')
        r = requests.post('https://nyassembly.gov/leg/?sh=advanced', data=form)
        # name the parser explicitly: features="html" picks whichever html
        # parser happens to be installed, so results could differ per machine
        soup = BeautifulSoup(r.text, 'html.parser')
        for bill in soup.find_all('a', {'href': re.compile(r'&term=')}):
            href = bill['href']
            # the bill id sits between "?bn=" and "&term" in the href
            bill_id = href.split("?bn=")[1].split("&term")[0]
            rows.append([year, bill_id, base_link + href + "&Text=Y"])
    # naming the columns in the constructor keeps an empty result valid;
    # assigning three names to an empty frame afterwards raises ValueError
    return pd.DataFrame(rows, columns=['year', 'bill_id', 'link'])

In [118]:
# Collect the NY bill links for the requested year range
ny_test = get_ny_links(start_year=2015, end_year=2020)

In [120]:
# NY takes a long time to hit raw text, many links and may block. I suggest introducing a sleep
# in the loop if you intend to extract lots of raw text
# A fixed random_state makes the 10-bill demo sample the same on every run
ny_demo = ny_test.sample(10, random_state=0)
ny_demo['raw_text'] = ny_demo['link'].apply(lambda x: get_raw_text(x))

In [121]:
# Display the sampled NY bills together with their scraped text
ny_demo

Unnamed: 0,year,bill_id,link,raw_text
836,2015,A09349,https://nyassembly.gov/leg/?bn=A09349&term=201...,New York State Assembly ...
3906,2017,S07292,https://nyassembly.gov/leg/?bn=S07292&term=201...,New York State Assembly ...
3372,2017,S02495,https://nyassembly.gov/leg/?bn=S02495&term=201...,New York State Assembly ...
1760,2015,S05892,https://nyassembly.gov/leg/?bn=S05892&term=201...,New York State Assembly ...
259,2015,A05816,https://nyassembly.gov/leg/?bn=A05816&term=201...,New York State Assembly ...
4400,2019,A05269,https://nyassembly.gov/leg/?bn=A05269&term=201...,New York State Assembly ...
3960,2017,S07418,https://nyassembly.gov/leg/?bn=S07418&term=201...,New York State Assembly ...
1186,2015,S01697,https://nyassembly.gov/leg/?bn=S01697&term=201...,New York State Assembly ...
3682,2017,S05943,https://nyassembly.gov/leg/?bn=S05943&term=201...,New York State Assembly ...
2519,2017,A06801,https://nyassembly.gov/leg/?bn=A06801&term=201...,New York State Assembly ...


In [124]:
# Show the scraped text of the first sampled NY bill (by position)
ny_demo['raw_text'].iloc[0]

"                    New York State  Assembly  Speaker Carl E. Heastie               WATCH LIVE         Assembly Members  Bill Search & Legislative Info  Standing Committee Public Hearing Calendar  Speaker's Press Releases  Assembly Reports  Committees, Commissions & Task Forces                 Javascript must be enabled to properly view this page.     Bill Search Home Laws  Legislative Calendar Public Hearing Schedule Assembly Calendars Assembly Committee Agenda          Bill No.:       Summary   Actions  Committee&nbspVotes  Floor&nbspVotes  Memo   Text   A09349 Summary: BILL NO A09349  SAME AS SAME AS UNI. S06819   SPONSOR Santabarbara  COSPNSR Steck  MLTSPNSR   Relates to eligibility for real property tax credit for payments in lieu of taxes made by a Qualified Empire Zone Enterprise for property located at 560 Broadway, Schenectady, New York. Go to top A09349 Text:     STATE OF NEW YORK ________________________________________________________________________\n \n            S. 681