# Scrape Appointees from Orders in Council

## Get Search Page

In [1]:
import logging
import re
from pathlib import Path

import mechanicalsoup

In [2]:
# Send INFO-and-above log records to a file in the current user's home
# directory, so the notebook is not tied to one machine's absolute path.
logging.basicConfig(filename=Path.home() / "oic_scraping.log", level=logging.INFO)

In [3]:
# One browser object is shared by every later cell; it carries the current
# page, the selected form and the current URL from one call to the next.
browser = mechanicalsoup.StatefulBrowser()

In [4]:
# Search page of the Orders in Council site, English version (lang=en).
url = "https://orders-in-council.canada.ca/index.php?lang=en"

In [5]:
# Load the search page; the response is displayed so the HTTP status can be checked.
browser.open(url)

<Response [200]>

## Enter Form Search Criteria

In [6]:
# Select the form to fill in. No selector is passed, so MechanicalSoup's
# default choice of form applies.
browser.select_form()

<mechanicalsoup.form.Form at 0x7f779c101eb0>

In [7]:
# Fill in the form's "fromDate" field (yyyy-mm-dd); every other field keeps its default.
browser["fromDate"] = "1993-01-01"

In [8]:
# Print the form's fields to confirm that fromDate now holds the value just set.
browser.form.print_summary()

<input class="form-control inputText" id="pcNumber" name="pcNumber" placeholder="yyyy-nnnn" type="text"/>
<input class="form-control inputText" name="fromDate" placeholder="yyyy-mm-dd" type="text" value="1993-01-01"/>
<input class="form-control inputText" name="toDate" placeholder="yyyy-mm-dd" type="text"/>
<input class="form-control inputText" name="keywords" type="text"/>
<input class="form-control inputText" name="department" type="text"/>
<input class="form-control inputText" name="act" type="text"/>
<input class="form-control inputText" name="chapterNumber" type="text"/>
<input class="form-control inputText" name="chapterYear" type="text"/>
<input class="form-control inputText" name="billNumber" type="text"/>
<input name="foa" type="radio" value="1"/>
<input name="foa" type="radio" value="0"/>
<input checked="" name="foa" type="radio" value="na"/>
<input class="btn btn-primary" id="btnSearch" name="searchList" type="submit" value="Search/ List"/>
<input class="btn btn-default" nam

In [9]:
# Debugging aid: uncomment to view the current page in a local web browser.
# browser.launch_browser()

## Submit Form and Retrieve Results Page

In [10]:
# Submit the selected form; the browser then holds the first page of search results.
response = browser.submit_selected()

In [11]:
# Debugging aid: uncomment to dump the raw HTML of the results page.
# print(response.text)

In [12]:
# Print every link on the results page: the pagination buttons and the attachment links.
browser.list_links()

Links in the current page:
     <a class="pagebutton" href="results.php?pageNum=1&amp;lang=en"><span class="currentpage btn btn-default">1</span></a>
     <a class="pagebutton" href="results.php?pageNum=2&amp;lang=en"><span class="btn btn-default">2</span></a>
     <a class="pagebutton" href="results.php?pageNum=3&amp;lang=en"><span class="btn btn-default">3</span></a>
     <a class="pagebutton" href="results.php?pageNum=4&amp;lang=en"><span class="btn btn-default">4</span></a>
     <a class="pagebutton" href="results.php?pageNum=5&amp;lang=en"><span class="btn btn-default">5</span></a>
     <a class="pagebutton" href="results.php?pageNum=6&amp;lang=en"><span class="btn btn-default">6</span></a>
     <a class="pagebutton" href="results.php?pageNum=10561&amp;lang=en"><span class="btn btn-default first">10561</span></a>
     <a href="attachment.php?attach=40168&amp;lang=en">Attachment</a>
     <a href="attachment.php?attach=40165&amp;lang=en">Attachment</a>
     <a href="attachment.php?a

## Find Highest Page Number Returned

In [13]:
# The pagination button whose class is "btn btn-default first" displays the
# highest page number of the result set; read its text and convert it to int.
last_page_button = browser.page.find("span", "btn btn-default first")
highest_page_number = int(str(last_page_button.string))
print(highest_page_number)

10561


## Get Next Page Link

In [14]:
# Pick the pagination link by its target (pageNum=2) rather than by its
# position among the page's links, so a change in link order cannot silently
# select a different link. The negative lookahead stops "pageNum=2" from
# matching page numbers that merely start with 2 (20, 21, ...).
# The download loop rewrites this tag's href to step through the result pages.
next_page = browser.links(url_regex=r"pageNum=2(?!\d)")[0]
print(next_page)

<a class="pagebutton" href="results.php?pageNum=2&amp;lang=en"><span class="btn btn-default">2</span></a>


## Find all Links to Attachments Within the Page

In [15]:
# The argument is matched against each link's href, so only the attachment links are returned.
browser.links("attachment")

[<a href="attachment.php?attach=40168&amp;lang=en">Attachment</a>,
 <a href="attachment.php?attach=40165&amp;lang=en">Attachment</a>,
 <a href="attachment.php?attach=40164&amp;lang=en">Attachment</a>,
 <a href="attachment.php?attach=40163&amp;lang=en">Attachment</a>,
 <a href="attachment.php?attach=40162&amp;lang=en">Attachment</a>]

## For Each Page in Search Results, Download Each Attachment Containing 'appoint'

In [16]:
# Walk the result pages in order. On each page, open every attachment and save
# a copy of those whose text contains "appoint".
#
# Uses names set by earlier cells: `browser` (positioned on the first results
# page), `next_page` (pagination link tag, rewritten in place below) and
# `highest_page_number`.

# Saved attachments go to a folder in the current user's home directory; it is
# created on first use so the run does not fail on a missing directory.
output_dir = Path.home() / "OrdersInCouncilAppointments"
output_dir.mkdir(parents=True, exist_ok=True)

for pages in range(highest_page_number):
    actual_page_num = pages + 1
    next_page_num = actual_page_num + 1

    # Point the pagination link at the page after the current one. The page
    # number is substituted by pattern, not by replacing digits, so the result
    # is correct whatever page number the href held beforehand.
    next_page["href"] = re.sub(r"pageNum=\d+", "pageNum=" + str(next_page_num), next_page["href"])
    next_page.string = str(next_page_num)

    logging.info("Processing page %s, next page is %s", actual_page_num, next_page.string)

    # The list of links is built once, before any of them is followed, so
    # navigating inside the loop does not disturb the iteration.
    for attachment_link in browser.links("attachment"):
        attachment_response = browser.follow_link(attachment_link)
        soup = browser.page

        if soup is None:
            # MechanicalSoup only parses HTML responses; there is no text to search otherwise.
            logging.warning("Skipping %s: response is not HTML", browser.url)
            continue

        # Membership test instead of find() > 0, which misses a match at index 0.
        if "appoint" not in soup.get_text().lower():
            continue

        try:
            # PC number: second child of the first <p>; date: second child of the second <p>.
            pc_number = soup.p.contents[1]
            oic_date = soup.find_all("p")[1].contents[1]
            # NOTE(review): both values are used verbatim in the file name;
            # confirm they never contain path separators.
            filename = oic_date + "_" + "PCNumber" + "_" + pc_number + ".html"
        except (AttributeError, IndexError, TypeError):
            # Page does not have the expected layout; skip it rather than abort the whole run.
            logging.warning("Skipping %s: could not read PC number and date", browser.url)
            continue

        logging.info("Found search string 'appoint', downloading %s to %s", browser.url, filename)
        # Save the response already fetched above instead of requesting the same URL a second time.
        (output_dir / filename).write_bytes(attachment_response.content)

    logging.info("Page %s complete...", actual_page_num)

    # There is nothing to follow after the last page. The href is relative and
    # resolves correctly from an attachment URL because attachment.php and
    # results.php share the same directory.
    if actual_page_num < highest_page_number:
        browser.follow_link(next_page)