# bills_webscraper.ipynb

### Import python packages

In [10]:
import warnings
warnings.filterwarnings('ignore')

import re, sqlite3, sys
from collections import OrderedDict

import requests
from bs4 import BeautifulSoup
import selenium.webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.common.exceptions import NoSuchElementException

### Define Bill class

In [11]:
class Bill:
    """In-memory record of a single scraped Senate bill.

    The constructor takes the fourteen scraped fields in the same order as the
    columns of the ``senateBills`` table, stores each one as an attribute and
    also keeps them together in ``entities`` so the whole record can be passed
    to a parameterized SQL statement.
    """

    def __init__(self, num, link, congress, short_title, long_title, date_filed, scope, status, author,
                 subject, pri_committee, sec_committee, date_lastUpdate, logs):
        """Store every field of the bill.

        Args:
            num: Bill number, e.g. 'SBN-1153'.
            link: URL of the bill's page.
            congress: Congress number the bill was scraped under.
            short_title: Short title of the bill.
            long_title: Long title of the bill.
            date_filed: Filing date as scraped text.
            scope: Scope of the bill.
            status: Legislative status text.
            author: Author text taken from the "Filed on ... by ..." line.
            subject: Subject text (may be empty).
            pri_committee: Primary committee (may be empty).
            sec_committee: Secondary committee (may be empty).
            date_lastUpdate: Date found in parentheses after the status.
            logs: Legislative history text.
        """
        self.num = num
        self.link = link
        self.congress = congress
        self.short_title = short_title
        self.long_title = long_title
        self.date_filed = date_filed
        self.scope = scope
        self.status = status
        self.author = author
        self.subject = subject
        self.pri_committee = pri_committee
        self.sec_committee = sec_committee
        self.date_lastUpdate = date_lastUpdate
        self.logs = logs
        # Same values, in constructor order, ready to be bound to SQL placeholders.
        self.entities = (num, link, congress, short_title, long_title, date_filed, scope, status, author,
                         subject, pri_committee, sec_committee, date_lastUpdate, logs)

    def __str__(self):
        """Return 'from <congress>_<num>: <short_title>'."""
        # str() on every part so a non-string num/title cannot raise TypeError.
        return 'from ' + str(self.congress) + '_' + str(self.num) + ': ' + str(self.short_title)

    # The three methods below are unimplemented placeholders. They take ``self``
    # so that calling them on an instance no longer raises TypeError.
    def insert_bill(self):
        """Placeholder: not implemented yet."""
        pass

    def update_bill(self):
        """Placeholder: not implemented yet."""
        pass

    def remove_bill(self):
        """Placeholder: not implemented yet."""
        pass

### Compile Regex Patterns
The web-scraping function uses these regex patterns to extract only the relevant strings from the scraped text.

In [12]:
# Bill number: everything that follows "<two digits>&q=" in a bill link
# (e.g. 'SBN-1153' in '...congress=18&q=SBN-1153').
bill_number_regex = re.compile(
    r'(?<=\d\d&q=).*'
)

# Filing date: the text between "Filed on " and " by ".
filed_date_regex = re.compile(
    r'(?<=Filed on ).*(?= by )'
)

# Author: the rest of the line after " by ", up to the newline.
author_regex = re.compile(
    r'(?<= by ).*(?=\n)'
)

# Status: the text before " (" in the legislative-status string.
status_regex = re.compile(
    r'.*(?= \()'
)

# Last update: the text between the parentheses of the legislative-status string.
last_update_regex = re.compile(
    r'(?<=\().*(?=\))'
)

# Legislative history: everything after the session/year header and before the
# "(The legislative history" footnote. DOTALL lets '.' span multiple lines.
# NOTE(review): the header is matched literally as "FIRST REGULAR SESSION" with a
# two-digit Congress ordinal - confirm other sessions never appear first.
logs_regex = re.compile(
    r'(?<=\[ FIRST REGULAR SESSION, \d\d\w\w CONGRESS ]\n\[ \d\d\d\d ]\n).*(?=\(The legislative history)',
    flags=re.DOTALL,
)

# Fallback for when the footnote is absent: same header, but the capture ends
# right before the last ';' in the text.
logs_wFootNote_regex = re.compile(
    r'(?<=\[ FIRST REGULAR SESSION, \d\d\w\w CONGRESS ]\n\[ \d\d\d\d ]\n).*(?=;)',
    flags=re.DOTALL,
)

### Define getBill function
The `getBill` function is our web-scraping routine. It follows this workflow: <br>
1. Access the Home URL of 'senate.gov.ph'. <br>
2. Collect all the bills' links on that page.<br>
3. Iterate over the gathered links.<br>
4. Click the "All Information" button and then scrape all of the bill's information from that frame.<br>
5. Go to the next page from the Home URL and repeat all the steps (that is, call `getBill` again with the next page number).<br>

#### Define bills_logScraper function
Scraper for special case of logs wherein the footnote is missing.

In [13]:
def bills_logScraper(logs):
    """Extract the legislative-history text from a bill's last blockquote.

    The footnote-terminated pattern (``logs_regex``) is tried first; when it
    does not match, the pattern that ends at the last ';'
    (``logs_wFootNote_regex``) is used instead.

    Args:
        logs: Raw text of the blockquote holding the legislative history.

    Returns:
        The matched history text.

    Raises:
        AttributeError: If neither pattern matches ``logs``.
    """
    found = logs_regex.search(logs)
    if found is None:
        # No footnote in the text: fall back to the ';'-terminated pattern.
        found = logs_wFootNote_regex.search(logs)
    return found.group()

In [14]:
def getBill(congress_num, page_num, request_timeout=30):
    """Scrape every bill listed on one page of a Congress' bill index.

    The index page is fetched with ``requests`` to collect the bill links; each
    bill page is then opened in a PhantomJS browser, its "All Information" view
    is loaded, and the fields are pulled out of the rendered HTML.

    Args:
        congress_num: Congress number placed in the ``congress=`` query parameter.
        page_num: Page of the bill index to scrape (``p=`` query parameter).
        request_timeout: Seconds to wait for the index page before
            ``requests`` gives up. Without a timeout a stalled server would
            block the scrape indefinitely.

    Returns:
        OrderedDict mapping each bill number to a 14-tuple
        ``(num, link, congress_num, short_title, long_title, date_filed, scope,
        status, author, subject, pri_committee, sec_committee,
        date_lastUpdate, logs)``, in page order.

    Raises:
        requests.HTTPError: If the index page returns an error status.
        SystemExit: If a bill that is not withdrawn has an unrecognized number
            of blockquotes.
    """
    home_url = 'http://www.senate.gov.ph/lis/leg_sys.aspx?congress='+str(congress_num)+'&type=bill&p='+str(page_num)
    print('Accessing '+ 'http://www.senate.gov.ph' + ' ...\n')
    # Fetch the index before starting a browser, so a failed request spawns nothing.
    res = requests.get(home_url, timeout=request_timeout)
    res.raise_for_status()

    soup = BeautifulSoup(res.content, 'html.parser')
    # .get() so anchors without an href are skipped instead of raising KeyError.
    hrefs = (str(anchor.attrs.get('href', '')) for anchor in soup.select('a'))
    sublinks = [href for href in hrefs if 'bill_res' in href]
    print('There are ' + str(len(sublinks)) + ' bills on this page, '+ 'p' + str(page_num) + '.\n')

    bills_dict = OrderedDict()
    driver = selenium.webdriver.PhantomJS()
    try:
        # The implicit wait is a driver-wide setting; configuring it once is enough.
        driver.implicitly_wait(100)

        for sublink in sublinks:
            link = 'http://www.senate.gov.ph/lis/'+sublink
            num = bill_number_regex.search(sublink).group()
            print("Trying to access child link "+ link + ' ...')
            driver.get(link)

            allInfo_btn = driver.find_element_by_xpath("//a[@id='lbAll']")
            allInfo_btn.click()
            # Block until the "All Information" view has rendered its Subject(s) section.
            WebDriverWait(driver, 10).until(
                lambda x: x.find_element_by_xpath("//p[contains(text(),'Subject(s)')]"))
            soup_allinfo = BeautifulSoup(driver.page_source, 'html.parser')

            short_title = soup_allinfo.select('p[class="h1_bold"]')[0].getText()
            allinfo_frame_text = soup_allinfo.select('td[id="content"]')[0].getText()
            date_filed = filed_date_regex.search(allinfo_frame_text).group()
            author = author_regex.search(allinfo_frame_text).group()

            # Blockquote order: [0] long title, [1] scope, [2] status with date,
            # [3] subject, [4] primary committee, ..., [-1] legislative history.
            blockquotes = soup_allinfo.select('blockquote')
            long_title = blockquotes[0].getText()
            scope = blockquotes[1].getText()
            legis_status = blockquotes[2].getText()
            status = status_regex.search(legis_status).group()
            date_lastUpdate = last_update_regex.search(legis_status).group()

            if 'Withdrawn' in status:
                # Special case: a withdrawn bill carries no subject or committees.
                # The layout check below is skipped on purpose; running it here
                # would overwrite sec_committee or abort the scrape.
                subject, pri_committee, sec_committee = ('', '', '')
            else:
                # Validate the layout before indexing so an unknown format ends
                # with the explanatory message rather than an IndexError.
                blockquote_length = len(blockquotes)
                if blockquote_length == 7:
                    # Secondary committee present, second to last.
                    sec_committee = blockquotes[-2].getText()
                elif blockquote_length == 6:
                    # Secondary committee missing.
                    sec_committee = ''
                else:
                    print('Unrecognized blockquote format. Format should be added to this program. Exiting...')
                    sys.exit()
                subject = blockquotes[3].getText()
                pri_committee = blockquotes[4].getText()

            logs = bills_logScraper(blockquotes[-1].getText())
            bills_dict[num] = (num, link, congress_num, short_title, long_title, date_filed, scope, status,
                               author, subject, pri_committee, sec_committee, date_lastUpdate, logs)
            print('\t' + str(num) + ' has been successfully scraped.\n')

        print('All Bills have been successfully scraped on this page.')
    finally:
        # Always shut the ghost driver down, including on errors and sys.exit(),
        # so no orphaned PhantomJS process is left behind.
        driver.quit()

    return bills_dict

### Define getMax_page function
This function will get the maximum/last page number of a Congressional Bill list.

In [15]:
def getMax_page(congress_num):
    """Return the last page number of a Congress' bill index.

    Opens page 1 in a PhantomJS browser and keeps clicking the bottom "Next"
    link until it no longer exists, counting the pages visited.

    Args:
        congress_num: Congress number placed in the ``congress=`` query parameter.

    Returns:
        The number of the last page (1 when there is no "Next" link at all).
    """
    home_url = 'http://www.senate.gov.ph/lis/leg_sys.aspx?congress='+str(congress_num)+'&type=bill&p=1'
    clicks = 1
    driver = selenium.webdriver.PhantomJS()
    try:
        driver.get(home_url)

        while True:
            try:
                more_btn = driver.find_element_by_xpath("//div[@id='pnl_NavBottom']//a[contains(text(),'Next')]")
            except NoSuchElementException:
                # No "Next" link left: the current page is the last one.
                print('We reached last page... that is page ' + str(clicks) + '.')
                break
            more_btn.click()
            # Wait for the page's sub-heading element before counting the page.
            WebDriverWait(driver, 10).until(lambda x: x.find_element_by_xpath("//span[@class='h1_sub']"))
            clicks += 1
    finally:
        # Quit even when the wait times out, so the browser process never leaks.
        driver.quit()
    return clicks

### Define Database functions
These database functions create a database connection, create the bills table within the database, check for existing records, insert new records, and update existing records.

In [16]:
def connect_db(database):
    """Open a SQLite database and return its connection and a cursor.

    Args:
        database: Path of the SQLite database file (or ':memory:').

    Returns:
        Tuple ``(conn, cursor)``.

    Raises:
        sqlite3.Error: If the database cannot be opened. The error is printed
            and then re-raised, so the caller does not receive ``None`` and
            fail later while unpacking the result.
    """
    try:
        conn = sqlite3.connect(database)
        cursor = conn.cursor()
    except sqlite3.Error as error:
        print("Error in connecting to sqlite3", error)
        raise
    return conn, cursor

def create_bills_table(conn, cursor):
    """Create the ``senateBills`` table if it does not exist yet.

    Columns follow the order of the tuples built by the scraper. ``status`` is
    declared as text because the scraper stores the status description there,
    not a number. Databases whose table already exists are left untouched.

    NOTE(review): ``num`` is the only primary-key column. If bill numbers
    restart with every Congress, rows from different Congresses would collide;
    confirm against the site.

    Args:
        conn: Open sqlite3 connection; used to commit the statement.
        cursor: Cursor belonging to ``conn``.
    """
    with conn:
        cursor.execute("""CREATE TABLE if not exists senateBills(
                        num text PRIMARY KEY,
                        link text,
                        congress integer,
                        short_title text,
                        long_title text,
                        date_filed text,
                        scope text,
                        status text,
                        author text,
                        subject text,
                        pri_committee text,
                        sec_committee text,
                        date_lastUpdate text,
                        logs text)""")
    
def check_bill_exists(bill_num, conn, cursor):
    """Tell whether a bill number is already stored in ``senateBills``.

    Args:
        bill_num: Bill number to look up in the ``num`` column.
        conn: Open sqlite3 connection, used as a transaction context.
        cursor: Cursor belonging to ``conn``.

    Returns:
        True if at least one row has that number, otherwise False.
    """
    lookup_sql = "SELECT num FROM senateBills WHERE num = (?)"
    with conn:
        cursor.execute(lookup_sql, (bill_num,))
    matches = cursor.fetchall()
    return len(matches) > 0

def insert_bill(entities, conn, cursor):
    """Insert one bill row into ``senateBills``.

    Args:
        entities: Sequence of the 14 column values, in table column order.
        conn: Open sqlite3 connection; the insert is committed when the
            ``with`` block exits without an exception.
        cursor: Cursor belonging to ``conn``.
    """
    insert_sql = """INSERT INTO senateBills(
                        num, link, congress, short_title, long_title, date_filed, 
                        scope, status, author, subject, pri_committee, sec_committee, 
                        date_lastUpdate, logs) VALUES(?,?,?,?,?,?,?,?,?,?,?,?,?,?)"""
    with conn:
        cursor.execute(insert_sql, entities)
        
def update_date_lastUpdate(bill_num, date_lastUpdate, conn, cursor):
    """Overwrite the ``date_lastUpdate`` column of one stored bill.

    Args:
        bill_num: Value of ``num`` identifying the row to change.
        date_lastUpdate: New value for the ``date_lastUpdate`` column.
        conn: Open sqlite3 connection; the update is committed when the
            ``with`` block exits without an exception.
        cursor: Cursor belonging to ``conn``.
    """
    update_sql = "UPDATE senateBills SET date_lastUpdate = (?) WHERE num = (?)"
    params = (date_lastUpdate, bill_num)
    with conn:
        cursor.execute(update_sql, params)

### Define scrape_thisCongress function
This function accepts the number _n_ of a Congress (an integer) as its argument and scrapes all the bills on every available page under that Congress.

In [17]:
def scrape_thisCongress(congress):
    """Scrape every bill page of one Congress into the ``phBills.db`` database.

    For each page from 1 to the last page reported by ``getMax_page``, the
    bills are scraped with ``getBill``. A bill whose number is already stored
    only gets its ``date_lastUpdate`` refreshed; any other bill is inserted.

    Args:
        congress: Congress number (an integer such as 18).
    """
    last_page = getMax_page(congress)
    conn, cursor = connect_db('phBills.db')
    bill_count = 0
    try:
        create_bills_table(conn, cursor)
        for page in range(1, last_page + 1):
            thisPage_bills = getBill(congress, page)
            for contents in thisPage_bills.values():
                some_bill = Bill(*contents)
                if check_bill_exists(some_bill.num, conn, cursor):
                    print(str(some_bill) + " already exists in our database. Updating 'date_lastUpdate' instead.")
                    update_date_lastUpdate(some_bill.num, some_bill.date_lastUpdate, conn, cursor)
                else:
                    insert_bill(some_bill.entities, conn, cursor)
                bill_count += 1
        # str(congress): the argument is an int, so plain concatenation raised TypeError.
        print('Collected ' + str(bill_count) + ' bills from ' + str(congress) + 'th congress.')
        print('Disconnecting from http://www.senate.gov.ph...')
    finally:
        # Close the database even when scraping aborts part-way through.
        conn.close()
    print('Done!')

### Main Program

In [18]:
# Run the full scrape for Congress number 18.
scrape_thisCongress(18)

We reached last page... that is page 145.
Accessing http://www.senate.gov.ph ...

There are 8 bills on this page, p1.

Trying to access child link http://www.senate.gov.ph/lis/bill_res.aspx?congress=18&q=SBN-1153 ...
	SBN-1153 has been successfully scraped.

Trying to access child link http://www.senate.gov.ph/lis/bill_res.aspx?congress=18&q=SBN-1152 ...
	SBN-1152 has been successfully scraped.

Trying to access child link http://www.senate.gov.ph/lis/bill_res.aspx?congress=18&q=SBN-1151 ...
	SBN-1151 has been successfully scraped.

Trying to access child link http://www.senate.gov.ph/lis/bill_res.aspx?congress=18&q=SBN-1150 ...
	SBN-1150 has been successfully scraped.

Trying to access child link http://www.senate.gov.ph/lis/bill_res.aspx?congress=18&q=SBN-1149 ...
	SBN-1149 has been successfully scraped.

Trying to access child link http://www.senate.gov.ph/lis/bill_res.aspx?congress=18&q=SBN-1148 ...
	SBN-1148 has been successfully scraped.

Trying to access child link http://www.se

SystemExit: 