# houseBills_webscraper.ipynb

### Import python packages

In [1]:
import warnings
warnings.filterwarnings('ignore')

import re, sqlite3, sys
from collections import OrderedDict

import requests
from bs4 import BeautifulSoup
import selenium.webdriver
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support.ui import WebDriverWait
from selenium.common.exceptions import NoSuchElementException

### Define Bill class

In [4]:
class Bill:
    
    def __init__(self, bill_id, num, link, congress, short_title, long_title, date_filed, scope, status, author,
                 subject, pri_committee, date_lastUpdate, logs, ra):
        self.bill_id = bill_id
        self.num = num
        self.link = link
        self.congress = congress
        self.short_title = short_title
        self.long_title = long_title
        self.date_filed = date_filed
        self.scope = scope
        self.status = status
        self.author = author
        self.subject = subject
        self.pri_committee = pri_committee
        self.date_lastUpdate = date_lastUpdate
        self.logs = logs
        self.ra = ra
        self.entities = (bill_id, num, link, congress, short_title, long_title, date_filed, scope, status, author,
                 subject, pri_committee, date_lastUpdate, logs, ra)
        
    def __str__(self):
        return 'Congress: ' + str(self.congress) + ' ' + self.num + ': ' + self.short_title
    
    def insert_bill():
        pass
    
    def update_bill():
        pass
    
    def remove_bill():
        pass

### Compile Regex Patterns
These regex patterns will be used to extract relevant strings only from the webscraping function.

In [5]:
bill_number_regex = re.compile(r'(?<=\d\d&q=).*')
filed_date_regex = re.compile(r'(?<=Filed on ).*(?= by )')
author_regex = re.compile(r'(?<= by ).*(?=\n)')
status_regex = re.compile(r'.*(?= \()')
last_update_regex = re.compile(r'(?<=\().*(?=\))')
logs_regex = re.compile(r'(?<=\n\[ \d\d\d\d \]\n).*(?=\(The (legislative history)|(LEGISLATIVE HISTORY))',re.DOTALL)
logs_wFootNote_regex = re.compile(r'(?<=\[ \d\d\d\d ]\n).*',re.DOTALL)
ra_regex = re.compile(r'(?<=\nRepublic Act No\. )\d*(?=(\n)| )',re.DOTALL)

### Define Scraper functions

getBill function is our main web scraping script. It follows the following workflow: <br>
1. Access Home URL of 'senate.gov.ph' <br>
2. Collect all the bills' links on that page.<br>
3. Iterate over the gathered links.<br>
4. Click the "All Information" button and then scrape all the bills' informations from that frame.<br>
5. Go to next page from the Home URL and repeat all steps again.<br>

#### bills_logScraper
Scraper for special case of logs wherein the footnote is missing.

In [6]:
def bills_logScraper(logs):
    try:
        return logs_regex.search(logs).group()
    except AttributeError:
        try:
            return logs_wFootNote_regex.search(logs).group()
        except AttributeError:
            return 0

#### getBill function
Our main scraping function.

In [14]:
def getBill(congress_num, page_num):
    
    driver = selenium.webdriver.PhantomJS()
    
    home_url = 'http://www.congress.gov.ph/legisdocs/?v=bills'
    res = requests.get(home_url)
    res.raise_for_status()
    print('Accessing '+ 'http://www.congress.gov.ph' + ' ...\n')
    
    # Select th congress from dropdown.
    select = Select(driver.find_element_by_xpath("//select[@name='congress']"))
    select.select_by_value(congress_num)
    go_btn = driver.find_element_by_xpath("//form[@class='form-inline pull-right']//input[@class='btn btn-default input-sm']")
    go_btn.click()
    element = WebDriverWait(driver,10).until(lambda x:x.find_element_by_xpath("//p[contains(text(),'Legislative History')]")) 
    related_links_element = WebDriverWait(driver, 10).until(lambda x:x.find_element_by_xpath=(""))
    
    res = requests.get(home_url)
    res.raise_for_status()
    print('Selecting House Bills...')
    
    soup = BeautifulSoup(res.content, 'html.parser')
    sublinks=[str(bill.attrs['href']) for bill in soup.select('a') if 'bill_res' in str(bill.attrs['href'])]
    print('There are ' + str(len(sublinks)) + ' bills on this page, '+ 'p' + str(page_num) + '.\n')
    
    bills_dict = OrderedDict()
    fetch_errors = []
    for sublink in sublinks:
        #
        link = 'http://www.senate.gov.ph/lis/'+sublink
        num = bill_number_regex.search(sublink).group()
        print("Trying to access child link "+ link + ' ...')
        driver.get(link)
        driver.implicitly_wait(100)
        try:
            allInfo_btn = driver.find_element_by_xpath("//a[@id='lbAll']")
        except NoSuchElementException:
            print('\t' + str(num) + ' is missing! Skipping...\n')
            fetch_errors.append(num)
            continue
        
        allInfo_btn.click()
        element = WebDriverWait(driver,10).until(lambda x:x.find_element_by_xpath("//p[contains(text(),'Legislative History')]")) 
        soup_allinfo = BeautifulSoup(driver.page_source, 'html.parser')
        
        short_title = soup_allinfo.select('p[class="h1_bold"]')[0].getText()
        allinfo_frame_text = soup_allinfo.select('td[id="content"]')[0].getText()
        date_filed = filed_date_regex.search(allinfo_frame_text).group()
        author = author_regex.search(allinfo_frame_text).group()
        long_title = soup_allinfo.select('blockquote')[0].getText()
        scope = soup_allinfo.select('blockquote')[1].getText()
        
        legis_status = soup_allinfo.select('blockquote')[2].getText()
        blkqt_idx = 3 if 'HBN-' in legis_status else 2 # place holder for bills w/ "House counterpart bill no."
            
        legis_status = soup_allinfo.select('blockquote')[blkqt_idx].getText()
        status = status_regex.search(legis_status).group()
        date_lastUpdate = last_update_regex.search(legis_status).group()
        
        # Special case if a bill was 'Withdrawn.'
        if 'Withdrawn' in status:
            subject, pri_committee, logs, ra = ('','','','')
        else: 
            subject = soup_allinfo.select('blockquote')[blkqt_idx + 1].getText()       
            pri_committee = soup_allinfo.select('blockquote')[blkqt_idx + 2].getText()
        
            all_blkquotes = '\n'.join([soup_allinfo.select('blockquote')[i].
                               getText() for i in range(len(soup_allinfo.select('blockquote')))])        
            logs = bills_logScraper(all_blkquotes)

            try:
                ra = ra_regex.search(all_blkquotes).group()
            except AttributeError:
                ra = ''
        
        # Create unique id for this record.
        bill_id = str(congress_num) + str(num)
        bills_dict[bill_id]=(bill_id, num, link, congress_num, short_title, long_title, date_filed, scope, status, author,
                 subject, pri_committee, date_lastUpdate, logs, ra)
        print('\t' + str(num) + ' has been successfully scraped.\n')
        
    print('All Bills have been successfully scraped on this page.')
    # Must close the ghostDriver before exiting this function...
    driver.quit()
    
    return bills_dict, fetch_errors

### Define getMax_page function
This function will get the maximum/last page number of a Congressional Bill list.

In [19]:
def getMax_page(congress_num):
    driver = selenium.webdriver.PhantomJS()
    home_url = 'http://www.senate.gov.ph/lis/leg_sys.aspx?congress='+str(congress_num)+'&type=bill&p=999'
    driver.get(home_url)
    driver.implicitly_wait(100)
    
    select = Select(driver.find_element_by_xpath("//select[@id='dlBillType']"))
    select.select_by_value('HBN')

    print('Selecting House Bills...')

    soup_pagination = BeautifulSoup(driver.page_source, 'html.parser')
    
    maxPage = int(soup_pagination.select('div[class="lis_pagenav"] a')[-1].getText()) + 1
    
    driver.quit()
    return maxPage

### Define Database functions
These database functions will create a database connection instance. It will create table within the database, check for records, insert records, as well as update existing records.

In [9]:
def connect_db(database):
    try:
        conn = sqlite3.connect(database)
        cursor = conn.cursor()
        return conn, cursor
    except sqlite3.Error as error:
        print("Error in connecting to sqlite3", error)

def create_bills_table(conn, cursor):
    cursor.execute("""CREATE TABLE if not exists houseBills(
                    bill_id text PRIMARY KEY,
                    num text,
                    link text,
                    congress integer,
                    short_title text,
                    long_title text,
                    date_filed text,
                    scope text,
                    status integer,
                    author text,
                    subject text,
                    pri_committee text,
                    date_lastUpdate text,
                    logs text,
                    ra text)""")
    conn.commit()
    
def check_bill_exists(bill_id, conn, cursor):
    with conn:
        cursor.execute("SELECT bill_id FROM houseBills WHERE bill_id = (?)",(bill_id,))
    rows = cursor.fetchall()
    return True if len(rows) else False

def insert_bill(entities, conn, cursor):
    with conn:
        cursor.execute("""INSERT INTO houseBills(
                        bill_id, num, link, congress, short_title, long_title, date_filed, 
                        scope, status, author, subject, pri_committee, 
                        date_lastUpdate, logs, ra) VALUES(?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)""",(entities))
        conn.commit()
        
def update_date_lastUpdate(bill_id, date_lastUpdate, conn, cursor):
    with conn:
        cursor.execute("UPDATE houseBills SET date_lastUpdate = (?) WHERE bill_id = (?)",(date_lastUpdate, bill_id))
        conn.commit()

### Define scrape_thisCongress function
This function accepts nth Congress as an argument, _and where n is an integer,_ and scrapes through all the bills in all the available pages under that Congress. 

In [10]:
def scrape_thisCongress(congress, start_page):
    pages = list(range(start_page, getMax_page(congress) + 1, 1))
    conn, cursor = connect_db('phBills.db')
    create_bills_table(conn, cursor)
    bill_count = 0
    scraping_failures = []
    for page in pages:
        thisPage_bills, scrape_failures = getBill(congress, page)
        scraping_failures.extend(scrape_failures)
        for bill, contents in thisPage_bills.items():
            some_bill = Bill(*contents)
            if check_bill_exists(some_bill.bill_id, conn, cursor):
                print(str(some_bill) + " already exists in our database. Updating 'date_lastUpdate' field instead.")
                update_date_lastUpdate(some_bill.bill_id, some_bill.date_lastUpdate, conn, cursor)
            else:
                insert_bill(some_bill.entities, conn, cursor)
            bill_count += 1
    print('\nCollected ' + str(bill_count) + ' bills from ' + str(congress) + 'th congress.')
    print('Disconnecting from http://www.senate.gov.ph...')
    conn.close()
    print('Done!')
    return bill_count, scraping_failures

### Main Program

In [2]:
congress_num = 17
driver = selenium.webdriver.PhantomJS()
home_url = 'http://www.congress.gov.ph/legisdocs/?v=bills'

print('Accessing '+ 'http://www.congress.gov.ph' + ' ...\n')

driver.get(home_url)
driver.implicitly_wait(100)

WebDriverWait(driver,10).until(lambda x:x.find_element_by_xpath("//li[contains(text(),'RELATED LINKS')]"))
# rl_element = WebDriverWait(driver,10).until(lambda x:x.find_element_by_xpath("//a[contains(text(),'ABOUT US')]")) 

# Select th congress from dropdown.
dropdown = Select(driver.find_element_by_name('congress'))
dropdown.select_by_value(str(congress_num))
print(f"{congress_num}th Congress selected.")

go_btn = driver.find_element_by_xpath("//form[@class='form-inline pull-right']//input[@class='btn btn-default input-sm']")
go_btn.click()
print("go button clicked!")

WebDriverWait(driver,10).until(lambda x:x.find_element_by_xpath("//li[contains(text(),'RELATED LINKS')]"))

history_btn = driver.find_element_by_xpath("/html[1]/body[1]/div[2]/div[1]/div[1]/div[2]/div[3]/span[1]/a[1]")
history_btn.click()
WebDriverWait(driver,10).until(lambda x:x.find_element_by_xpath("//div[@class='modal-header']"))
print("History view shown...")

soup = BeautifulSoup(driver.page_source, 'html.parser')



driver.quit()

print("Soup is delivered.")
    
    
# /html[1]/body[1]/div[2]/div[1]/div[1]/div[2]/div[3]/span[1]/a[1]
# /html[1]/body[1]/div[2]/div[1]/div[1]/div[2]/div[5]/span[1]/a[1]
# /html[1]/body[1]/div[2]/div[1]/div[1]/div[2]/div[7]/span[1]/a[1]
# /html[1]/body[1]/div[2]/div[1]/div[1]/div[2]/div[9]/span[1]/a[1]
# /html[1]/body[1]/div[2]/div[1]/div[1]/div[2]/div[11]/span[1]/a[1]
# /html[1]/body[1]/div[2]/div[1]/div[1]/div[2]/div[10133]/span[1]/a[1]
# /html[1]/body[1]/div[2]/div[1]/div[1]/div[2]/div[10135]/span[1]/a[1]
    

Accessing http://www.congress.gov.ph ...

17th Congress selected.
go button clicked!
History view shown...
Soup is delivered.


In [13]:
soup.select("div[class='modal-body'] div[class='fetched-data'] tbody")

[]

In [22]:
soup.find('tbody')

In [24]:
table_body = soup.find('tbody')
rows = table_body.find_all('tr')

AttributeError: 'NoneType' object has no attribute 'find_all'

In [25]:
type(table_body)

NoneType