In [3]:
# scraping the Ontario SIU Directors Reports from 2017 to 2018
# the reports to scrape are listed in a CSV file in the reports_to_scrape folder, downloaded from their site

# import libraries
import re
import string
from urllib.request import Request, urlopen

import numpy as np
import pandas as pd
from bs4 import BeautifulSoup

In [4]:
# load the list of report pages to scrape from the CSV file
reports_csv = "reports_to_scrape/on_siu_2018_w_keywords.csv"
df = pd.read_csv(reports_csv)
df.head()

Unnamed: 0.1,Unnamed: 0,Case number,Special Investigations Unit incident date,Special Investigations Unit published date
0,0,"<a href=""/page/siu-directors-report-case-17-of...",2017-12-30,2018-10-05
1,1,"<a href=""/page/siu-directors-report-case-17-ov...",2017-10-11,2018-10-02
2,2,"<a href=""/page/siu-directors-report-case-17-tc...",2017-11-08,2018-10-02
3,3,"<a href=""/page/siu-directors-report-case-17-of...",2017-10-27,2018-09-26
4,4,"<a href=""/page/siu-directors-report-case-18-pv...",2018-06-09,2018-09-25


In [5]:
# base url for the site, written without a trailing slash: https://www.ontario.ca
# (the hrefs in the "Case number" column start with "/", so they are appended as-is)
baseurl = "https://www.ontario.ca"

In [6]:
# the "Case number" cells hold an HTML anchor, not plain text;
# parse one of them and pull out its href

cases = df["Case number"].tolist()
case_html = cases[4]
print(case_html)

soup = BeautifulSoup(case_html, "html.parser")
anchor = soup.find("a")
link = anchor['href']

print(link)

<a href="/page/siu-directors-report-case-18-pvi-175">18-PVI-175</a>
/page/siu-directors-report-case-18-pvi-175


In [7]:
# full address of the report page: site root followed by the relative link
url = "".join([baseurl, link])
url

'https://www.ontario.ca/page/siu-directors-report-case-18-pvi-175'

In [8]:
# scrape one director's report: the page at `url` (case 18-PVI-175 here)

# send a browser-like User-Agent header with the request
hdr = {'User-Agent': 'Mozilla/5.0'}
req = Request(url, headers=hdr)
# use the response as a context manager so the HTTP connection is always
# closed, even if reading or decoding fails
with urlopen(req) as page:
    html = page.read().decode("utf-8")
soup = BeautifulSoup(html, "html.parser")
# sanity check: the page title should name the case we asked for
print(soup.find("title"))

<title>SIU Director’s Report - Case # 18-PVI-175 | ontario.ca</title>


In [9]:
# Fields to extract from each report:
#   province, city, police service, officer name, year,
#   consequences for civilians (injuries),
#   consequences, if any, for officers (fine, dismissal, none, etc.),
#   investigation outcome (charge, acquittal complaint dismissal, etc.),
#   and reason for police call.

province = "Ontario"

# much of this sits in the node right after the page's fourth <h3> heading
# (presumably section 3 of the report -- the index is tied to the page layout)
h3_headings = soup.find_all("h3")
text = h3_headings[3].next_sibling.get_text()
text

'On June\xa010,\xa02018, at 12:29\xa0a.m., the Ontario Provincial Police (OPP) reported the vehicle injury of the Complainant.'

In [10]:
# year: the first standalone 4-digit number in the section text
# (\b keeps a longer run of digits from being cut down to its first four)
year_match = re.search(r"\b(\d{4})\b", text)
# fall back to NaN (the notebook's missing-value marker) instead of raising
# AttributeError when the paragraph contains no year
year = year_match.group(1) if year_match else np.nan
year

'2018'

In [11]:
# find the police service: the words between "the" and the verb in
# "... the <service> notified ..." / "... the <service> reported ..."
police_service = np.nan  # missing-value marker if neither phrasing is present
# "notified" is tried first, then "reported" (same priority as the old try/except)
for verb in ("notified", "reported"):
    # whole-word "the", then lazily up to the first following verb;
    # \s also matches the non-breaking spaces that occur in the scraped text
    pattern = r"\bthe\s+(.*?)\s+" + verb + r"\b"
    match_results = re.search(pattern, text, re.IGNORECASE)
    if match_results:
        # the capture group already excludes the leading article and the verb,
        # so nothing inside the service name gets stripped
        police_service = match_results.group(1)
        break
police_service

'Ontario Provincial Police (OPP)'

In [12]:
# find the city: the first word that follows "City of" anywhere in the page text
# NOTE: only one word is captured, so multi-word city names are truncated
webpage = soup.get_text()

# partition() leaves an empty tail when "City of" does not occur
after_city_of = webpage.partition('City of')[2]
# split() with no argument breaks on any whitespace run (newlines and the
# non-breaking spaces found in these pages included) and returns [] for an
# empty tail, so the first item really is a single word
words_after_cityof = after_city_of.split()
city = words_after_cityof[0].strip(string.punctuation) if words_after_cityof else ""
if city == "":
    city = np.nan
city

nan

In [13]:
# officer name is not included in accordance with section 21 of FIPPA
# moving onto civilian consequences, this can be found in the text we have
print(text)

# NaN (the notebook's missing-value marker) when nothing can be extracted,
# instead of a NameError on the final line
civilian_consequences = np.nan
# police_service may be missing (non-string) or empty; skip the search then
if isinstance(police_service, str) and police_service:
    # naive sentence split: abbreviations such as "a.m." also split here, which
    # is harmless because only fragments naming the police service are used
    for sentence in text.split("."):
        if police_service in sentence:
            # keep whatever follows the police service's name in that fragment;
            # this no longer depends on the name ending in a ")" abbreviation
            remainder = sentence.partition(police_service)[2]
            # drop the article "the" as a whole word only
            # if several fragments name the service, the last one wins
            civilian_consequences = re.sub(r"\bthe\s+", "", remainder).strip()
civilian_consequences

On June 10, 2018, at 12:29 a.m., the Ontario Provincial Police (OPP) reported the vehicle injury of the Complainant.


'reported vehicle injury of Complainant'

In [14]:
# find consequences for officer
# personally, I can't find a section of these reports that mentions consequences for the officers

In [15]:
# investigation outcome
# this is found in the director's analysis section/last paragraph of the report
# NOTE: for now this only lists every <h2> heading on the page (site chrome
# included); the outcome text itself is not extracted yet
outcome = soup.find_all("h2")
outcome

[<h2 class="alert__header-title h4">Ontario.ca needs JavaScript to function properly and provide you with a fast, stable experience.</h2>,
 <h2 class="small"> On this page <a class="show-on-focus text-right small margin-left-16-!" href="#toc-end">Skip this page navigation</a></h2>,
 <h2>Mandate of the <abbr title="Special Investigations Unit">SIU</abbr></h2>,
 <h2>Information restrictions</h2>,
 <h2>Mandate engaged</h2>,
 <h2>The investigation</h2>,
 <h2>Incident narrative</h2>,
 <h2>Evidence</h2>,
 <h2>Relevant legislation</h2>,
 <h2>Analysis and director’s decision</h2>,
 <h2 class="h5 sidebar__header" id="section-related"> Related <span class="show-for-sr">information</span></h2>,
 <h2 class="h3">Footnotes</h2>,
 <h2 class="h4 footer-ministry__heading"><div>Ministry of the Attorney General</div></h2>,
 <h2 class="h4 footer-ministry__heading">Questions or comments</h2>]

In [16]:
# reason for police investigation
