# Senate Bills Webscraper
This program can collect Philippine bills filed by the senate from the 13th Congress to the the current 18th Congress from this Government website: https://www.senate.gov.ph/lis/leg_sys.aspx
<br>
The collected data is then saved to a sql database.

In [1]:
import re
import sqlite3
from collections import OrderedDict

import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.common.exceptions import NoSuchElementException

Note: Install and setup chromedriver first if this is the first time using it. You can download it here: https://sites.google.com/a/chromium.org/chromedriver/downloads
<br>
Also, make sure chromedriver is compatible with installed Google Chrome App.<br>
Go to Help -> About Google Chrome -> Chrome will automatically look for updates _(update Chrome to the latest version)_.

In [2]:
# Instantiate a chrome options object.
chrome_options = Options()
chrome_options.add_argument("--headless")
chrome_options.add_argument("--window-size=1920x1080")

In [3]:
# Instantiate a web driver object.
driver_path = '/Users/emilolbinado/opt/miniconda3/bin/chromedriver'
driver = webdriver.Chrome(options=chrome_options, executable_path=driver_path)

### Define Bill class

In [4]:
class Bill:
    '''This class help define an object to be a bill with specified
    bill attributes.'''

    def __init__(self, bill_id, num, link, congress, short_title, long_title,
                 date_filed, scope, status, author, subject, pri_committee,
                 date_lastUpdate, logs, ra):
        self.bill_id = bill_id
        self.num = num
        self.link = link
        self.congress = congress
        self.short_title = short_title
        self.long_title = long_title
        self.date_filed = date_filed
        self.scope = scope
        self.status = status
        self.author = author
        self.subject = subject
        self.pri_committee = pri_committee
        self.date_lastUpdate = date_lastUpdate
        self.logs = logs
        self.ra = ra
        self.entities = (bill_id, num, link, congress, short_title, long_title,
                         date_filed, scope, status, author, subject,
                         pri_committee, date_lastUpdate, logs, ra)

    def __str__(self):
        '''This function will show details of the class Bill
        if an instance of Bill is passed on a print statement.'''
        return ('Congress: ' + str(self.congress) + ' ' + self.num + ': '
                + self.short_title)

    def insert_bill():
        pass

    def update_bill():
        pass

    def remove_bill():
        pass

### Compile Regex Patterns
These regex patterns will be used to extract relevant strings only from the webscraping function.

In [5]:
bill_number_regex = re.compile(r'(?<=\d\d&q=).*')
filed_date_regex = re.compile(r'(?<=Filed on ).*(?= by )')
author_regex = re.compile(r'(?<= by ).*(?=\n)')
status_regex = re.compile(r'.*(?= \()')
last_update_regex = re.compile(r'(?<=\().*(?=\))')
logs_regex = re.compile(r'(?<=\n\[ \d\d\d\d \]\n).*(?=\(The (legislative history)|(LEGISLATIVE HISTORY))', re.DOTALL)
logs_wFootNote_regex = re.compile(r'(?<=\[ \d\d\d\d ]\n).*', re.DOTALL)
ra_regex = re.compile(r'(?<=\nRepublic Act No\. )\d*(?=(\n)| )', re.DOTALL)

### Define Web Scraper functions
getBill function is our main web scraping script. It follows the following workflow: <br>
1. Access Home URL of 'senate.gov.ph' <br>
2. Collect all the bills' links on that page.<br>
3. Iterate over the gathered links.<br>
4. Click the "All Information" button and then scrape all the bills' informations from that frame.<br>
5. Go to next page from the Home URL and repeat all steps again.<br>

In [6]:
def getBill(driver, congress_num, page_num):
    '''The getBill function accepts two arguments; 1. congress_num(type: int)
    to specify nth congress and 2. page_num(type: int) to specify the page
    number. Both will help Selenium navigate specific page to scrape.'''
    home_url = ('http://www.senate.gov.ph/lis/leg_sys.aspx?congress=' +
                str(congress_num)+'&type=bill&p='+str(page_num))
    res = requests.get(home_url)
    res.raise_for_status()
    print('Accessing ' +
          'http://www.senate.gov.ph' + ' ...\n')

    soup = BeautifulSoup(res.content, 'html.parser')
    sublinks = [str(bill.attrs['href']) for bill in soup.select('a')
                if 'bill_res' in str(bill.attrs['href'])]
    print('There are ' + str(len(sublinks)) + ' bills on this page, ' +
          'p' + str(page_num) + '.\n')

    bills_dict = OrderedDict()
    fetch_errors = []
    for sublink in sublinks:
        link = 'http://www.senate.gov.ph/lis/'+sublink
        num = bill_number_regex.search(sublink).group()
        print("Trying to access child link " + link + ' ...')
        driver.get(link)
        driver.implicitly_wait(100)
        try:
            allInfo_btn = driver.find_element_by_xpath("//a[@id='lbAll']")
        except NoSuchElementException:
            print('\t' + str(num) + ' is missing! Skipping...\n')
            fetch_errors.append(num)
            continue

        allInfo_btn.click()
        element = WebDriverWait(driver,
                                10).until(lambda x:
                                          x.find_element_by_xpath("//p[contains(text(), 'Legislative History')]"))
        soup_allinfo = BeautifulSoup(driver.page_source, 'html.parser')
        short_title = soup_allinfo.select('p[class="h1_bold"]')[0].getText()
        allinfo_frame_text = soup_allinfo.select('td[id="content"]')[0].getText()
        date_filed = filed_date_regex.search(allinfo_frame_text).group()
        author = author_regex.search(allinfo_frame_text).group()
        long_title = soup_allinfo.select('blockquote')[0].getText()
        scope = soup_allinfo.select('blockquote')[1].getText()
        legis_status = soup_allinfo.select('blockquote')[2].getText()
        # Place holder for bills w/ "House counterpart bill no."
        blkqt_idx = 3 if 'HBN-' in legis_status else 2
        legis_status = soup_allinfo.select('blockquote')[blkqt_idx].getText()
        status = status_regex.search(legis_status).group()
        date_lastUpdate = last_update_regex.search(legis_status).group()
        # Special case if a bill was 'Withdrawn.'
        if 'Withdrawn' in status:
            subject, pri_committee, logs, ra = ('', '', '', '')
        else:
            subject = soup_allinfo.select('blockquote')[blkqt_idx + 1].getText()
            pri_committee = (soup_allinfo.select('blockquote')
                             [blkqt_idx + 2].getText())

            all_blkquotes = '\n'.join([soup_allinfo.select('blockquote')[i].
                                       getText()
                                       for i in range(len(soup_allinfo.select('blockquote')))])
            logs = bills_logScraper(all_blkquotes)

            try:
                ra = ra_regex.search(all_blkquotes).group()
            except AttributeError:
                ra = ''
        # Create unique id for this record.
        bill_id = str(congress_num) + str(num)
        bills_dict[bill_id] = (bill_id, num, link, congress_num, short_title,
                               long_title, date_filed, scope, status, author,
                               subject, pri_committee, date_lastUpdate, logs,
                               ra)
        print('\t' + str(num) + ' has been successfully scraped.\n')
    print('All Bills have been successfully scraped on this page.')
    return bills_dict, fetch_errors

In [7]:
def bills_logScraper(logs):
    '''Scraper for special cases of bill's logs wherein
    the footnote maybe missing.'''
    try:
        return logs_regex.search(logs).group()
    except AttributeError:
        try:
            return logs_wFootNote_regex.search(logs).group()
        except AttributeError:
            return 0

In [8]:
def getMax_page(driver, congress_num):
    '''This function will get the maximum/last page number of a
    Congressional Bill list.'''
    home_url = ('http://www.senate.gov.ph/lis/leg_sys.aspx?congress=' +
                str(congress_num)+'&type=bill&p=999')
    driver.get(home_url)
    driver.implicitly_wait(100)
    soup_pagination = BeautifulSoup(driver.page_source, 'html.parser')
    maxPage = (int(soup_pagination.
                   select('div[class="lis_pagenav"] a')[-1].getText()) + 1)
    return maxPage

In [9]:
def scrape_thisCongress(congress, start_page):
    '''This function accepts nth Congress as an argument, (type: int),
    scrapes through all the bills in all the available pages,
    starting from 'start_page', (type: int), under that Congress.'''
    driver = webdriver.Chrome(options=chrome_options,
                              executable_path=driver_path)
    pages = list(range(start_page, getMax_page(driver, congress) + 1, 1))
    conn, cursor = connect_db('phBills.db')
    create_bills_table(conn, cursor)
    bill_count = 0
    scraping_failures = []
    for page in pages:
        thisPage_bills, scrape_failures = getBill(driver, congress, page)
        scraping_failures.extend(scrape_failures)
        for bill, contents in thisPage_bills.items():
            some_bill = Bill(*contents)
            if check_bill_exists(some_bill.bill_id, conn, cursor):
                print(str(some_bill) + ''' already exists in our database.
                Updating 'date_lastUpdate' field instead.''')
                update_date_lastUpdate(some_bill.bill_id,
                                       some_bill.date_lastUpdate, conn, cursor)
            else:
                insert_bill(some_bill.entities, conn, cursor)
            bill_count += 1
    print('\nCollected ' + str(bill_count) + ' bills from ' +
          str(congress) + 'th congress.')
    print('Disconnecting from http://www.senate.gov.ph...')
    driver.quit()
    conn.close()
    print('Done!')
    return bill_count, scraping_failures

### Define Database functions
Functions will create a database connection instance. It will create table within the database, check for records, insert records, as well as update existing records.

In [10]:
def connect_db(database):
    '''Establish a connection to the database.'''
    try:
        conn = sqlite3.connect(database)
        cursor = conn.cursor()
        return conn, cursor
    except sqlite3.Error as error:
        print("Error in connecting to sqlite3", error)


def create_bills_table(conn, cursor):
    '''If not existing, create a database table to house the senate bills data.
    '''
    cursor.execute("""CREATE TABLE if not exists senateBills(
                    bill_id text PRIMARY KEY,
                    num text,
                    link text,
                    congress integer,
                    short_title text,
                    long_title text,
                    date_filed text,
                    scope text,
                    status integer,
                    author text,
                    subject text,
                    pri_committee text,
                    date_lastUpdate text,
                    logs text,
                    ra text)""")
    conn.commit()


def check_bill_exists(bill_id, conn, cursor):
    '''Check if certain scraped bill already exist on the database.'''
    with conn:
        cursor.execute("SELECT bill_id FROM senateBills WHERE bill_id = (?)",
                       (bill_id,))
    rows = cursor.fetchall()
    return True if len(rows) else False


def insert_bill(entities, conn, cursor):
    '''Insert the scraped bill to the database.'''
    with conn:
        cursor.execute("""INSERT INTO senateBills(
                        bill_id, num, link, congress, short_title, long_title,
                        date_filed, scope, status, author, subject,
                        pri_committee, date_lastUpdate, logs, ra)
                        VALUES(?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)""", (entities))
        conn.commit()


def update_date_lastUpdate(bill_id, date_lastUpdate, conn, cursor):
    '''If bill is already existing to database, update the record instead. '''
    with conn:
        cursor.execute('''UPDATE senateBills
        SET date_lastUpdate = (?) WHERE bill_id = (?)''', (date_lastUpdate,
                                                           bill_id))
        conn.commit()

### Main Program

In [11]:
# Scrape bills from 335th page to last page of the 13th Congress Senate Bills.
scrape_thisCongress(13, 335)

Accessing http://www.senate.gov.ph ...

There are 8 bills on this page, p335.

Trying to access child link http://www.senate.gov.ph/lis/bill_res.aspx?congress=13&q=SBN-10 ...
	SBN-10 has been successfully scraped.

Trying to access child link http://www.senate.gov.ph/lis/bill_res.aspx?congress=13&q=SBN-9 ...
	SBN-9 has been successfully scraped.

Trying to access child link http://www.senate.gov.ph/lis/bill_res.aspx?congress=13&q=SBN-8 ...
	SBN-8 has been successfully scraped.

Trying to access child link http://www.senate.gov.ph/lis/bill_res.aspx?congress=13&q=SBN-7 ...
	SBN-7 has been successfully scraped.

Trying to access child link http://www.senate.gov.ph/lis/bill_res.aspx?congress=13&q=SBN-6 ...
	SBN-6 has been successfully scraped.

Trying to access child link http://www.senate.gov.ph/lis/bill_res.aspx?congress=13&q=SBN-5 ...
	SBN-5 has been successfully scraped.

Trying to access child link http://www.senate.gov.ph/lis/bill_res.aspx?congress=13&q=SBN-4 ...
	SBN-4 has been succ

(10, [])