In [1]:
# scraping the Ontario SIU Directors Reports from 2017 to 2018
# the reports to scrape are in an csv file in reports_to_scrape folder downloaded from their site

# import libraries
import pandas as pd
import numpy as np
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re

In [2]:
# import pages to scrape from the csv file
df = pd.read_csv("reports_to_scrape/on_siu_2018.csv")
df.head()

Unnamed: 0,Case number,Special Investigations Unit incident date,Special Investigations Unit published date
0,"<a href=""/page/siu-directors-report-case-17-of...",2017-12-30,2018-10-05
1,"<a href=""/page/siu-directors-report-case-17-ov...",2017-10-11,2018-10-02
2,"<a href=""/page/siu-directors-report-case-17-tc...",2017-11-08,2018-10-02
3,"<a href=""/page/siu-directors-report-case-17-of...",2017-10-27,2018-09-26
4,"<a href=""/page/siu-directors-report-case-18-pv...",2018-06-09,2018-09-25


In [3]:
# base url for page is: https://www.ontario.ca/
baseurl = "https://www.ontario.ca"

In [4]:
# get the link from the case number part of the df

cases = df["Case number"].tolist()
print(cases[0])

soup = BeautifulSoup(cases[0], "html.parser")
link = soup.find("a")['href']

print(link)

<a href="/page/siu-directors-report-case-17-ofd-379">17-OFD-379</a>
/page/siu-directors-report-case-17-ofd-379


In [5]:
url = baseurl + link
url

'https://www.ontario.ca//page/siu-directors-report-case-17-ofd-379'

In [6]:
# scrape the first directors report 17-OFD-379
page = urlopen(url)
html = page.read().decode("utf-8")
soup = BeautifulSoup(html, "html.parser")
print(soup.find("title"))

<title>SIU Director’s Report - Case # 17-OFD-379 | ontario.ca</title>


In [7]:
# Information we are looking for:
# Province, city, police service, officer name, year, consequences for civilians (injuries), 
# consequences, if any, for officers (fine, dismissal, none, etc.), 
# investigation outcome (charge, acquittal complaint dismissal, etc.), and reason for police call. 

province = "Ontario"

# a lot of the information we might be looking for can be found in section 3 of the report
text = soup.find_all("h3")[3].next_sibling.get_text()
text

'At approximately 2:09\xa0a.m. on December\xa030,\xa02017, the Peel Regional Police (PRP) notified the SIU of the firearm death of a man in Mississauga. At the time of the notification, the man had not yet been identified.footnote 1[1]'

In [8]:
year = re.search(r"(\d{4})", text).group(1)
year

'2017'

In [9]:
pattern = "the.*?notified"
match_results = re.search(pattern, text, re.IGNORECASE)
police_service = match_results.group()
police_service = re.sub("the ", "", police_service)
police_service = re.sub(" notified", "", police_service)
police_service

'Peel Regional Police (PRP)'

In [10]:
import string
# find the city
webpage = soup.get_text()

#text = webpage.split("City of ")[1]
split_str = webpage.partition('City of')
word_after_cityof = split_str[2].strip().split(" ")
city = word_after_cityof[0].strip(string.punctuation)
city
# row['beds'] = number_of_beds #append column to existing row

'Mississauga'

In [11]:
# officer name is not included in accordance with section 21 of FIPPA
# moving onto civilian consquences, this can be found more easily in section 4
text = soup.find_all("h3")[9].next_sibling.get_text()
text

'The Post-Mortem Report, which the SIU received on October\xa02,\xa02018, determined the cause of death to be a gunshot wound to the left chest.'