In [168]:
# scraping the Ontario SIU Directors Reports from 2017 to 2018
# the reports to scrape are in an csv file in reports_to_scrape folder downloaded from their site

# import libraries
import pandas as pd
import numpy as np
from urllib.request import Request, urlopen
from bs4 import BeautifulSoup
import re
from tqdm import tqdm

In [4]:
# import pages to scrape from the csv file
df = pd.read_csv("reports_to_scrape/on_siu_2018_w_keywords.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,Case number,Special Investigations Unit incident date,Special Investigations Unit published date
0,0,"<a href=""/page/siu-directors-report-case-17-of...",2017-12-30,2018-10-05
1,1,"<a href=""/page/siu-directors-report-case-17-ov...",2017-10-11,2018-10-02
2,2,"<a href=""/page/siu-directors-report-case-17-tc...",2017-11-08,2018-10-02
3,3,"<a href=""/page/siu-directors-report-case-17-of...",2017-10-27,2018-09-26
4,4,"<a href=""/page/siu-directors-report-case-18-pv...",2018-06-09,2018-09-25


In [5]:
# base url for page is: https://www.ontario.ca/
baseurl = "https://www.ontario.ca"

In [179]:
# get the link from the case number part of the df

cases = df["Case number"].tolist()
print(cases[4])

soup = BeautifulSoup(cases[4], "html.parser")
link = soup.find("a")['href']

print(link)

<a href="/page/siu-directors-report-case-18-pvi-175">18-PVI-175</a>
/page/siu-directors-report-case-18-pvi-175


In [180]:
url = baseurl + link
url

'https://www.ontario.ca/page/siu-directors-report-case-18-pvi-175'

In [181]:
# scrape the first directors report 17-OFD-379

hdr = {'User-Agent': 'Mozilla/5.0'}
req = Request(url,headers=hdr)
page = urlopen(req)
html = page.read().decode("utf-8")
soup = BeautifulSoup(html, "html.parser")
print(soup.find("title"))

<title>SIU Director’s Report - Case # 18-PVI-175 | ontario.ca</title>


In [182]:
# Information we are looking for:
# Province, city, police service, officer name, year, consequences for civilians (injuries), 
# consequences, if any, for officers (fine, dismissal, none, etc.), 
# investigation outcome (charge, acquittal complaint dismissal, etc.), and reason for police call. 

province = "Ontario"

# a lot of the information we might be looking for can be found in section 3 of the report
text = soup.find_all("h3")[3].next_sibling.get_text()
text

'On June\xa010,\xa02018, at 12:29\xa0a.m., the Ontario Provincial Police (OPP) reported the vehicle injury of the Complainant.'

In [164]:
def find_year(text):
    match = re.search(r"(20\d{2})", text)
    if match is not None:
        return match.group(1)


year = find_year(text)
print(year)

2018


In [11]:
try:
    pattern = "the.*?notified"
    match_results = re.search(pattern, text, re.IGNORECASE)
    police_service = match_results.group()
    police_service = re.sub("the ", "", police_service)
    police_service = re.sub(" notified", "", police_service)
except:
    pattern = "the.*?reported"
    match_results = re.search(pattern, text, re.IGNORECASE)
    police_service = match_results.group()
    police_service = re.sub("the ", "", police_service)
    police_service = re.sub(" reported", "", police_service)
police_service

'Ontario Provincial Police (OPP)'

In [110]:
municipalities_df = pd.read_csv("mmah-list-of-ontario-municipalities-en-utf8-2022-10-05.csv")

def get_municipality_name(tag):
    a_soup = BeautifulSoup(tag, "html.parser")
    
    if isinstance(a_soup, bs4.element.Tag):
        full_name = get_inner_text(a_soup)
    else:
        full_name = tag
        
    
        
    return full_name.split(", ")[0]
    

list_of_municipalities_in_ontario = list(municipalities_df["Municipality"].apply(get_municipality_name))

In [124]:
incident_narrative_title = soup.find("h2", string="Incident narrative")

In [128]:
next_sibling = incident_narrative_title.next_sibling
incident_narrative = [next_sibling]

while next_sibling.next_sibling.name == "p":
    next_sibling = next_sibling.next_sibling
    incident_narrative.append(next_sibling)
    
print(incident_narrative)

[<p>On June 9, 2018, at around 10:00 p.m., the Complainant and the <abbr title="Subject Officer">SO</abbr> were involved in a motor vehicle collision at the intersection of Bruce County Road 3 and Bruce County Road 2 near Walkerton. The Complainant was operating a motorcycle northbound on Bruce County Road 3. The <abbr title="Subject Officer">SO</abbr> was driving southbound on Bruce County Road 3 and made a sudden left hand turn towards Bruce County Road 2 – travelling into the path of the Complainant. The Complainant was travelling at a speed slightly over the 80 km/h speed limit and was unable to stop. He collided with the side of the <abbr title="Subject Officer">SO</abbr>’s police cruiser, was propelled off his motorcycle and landed in the roadway. As a result of the collision, the Complainant sustained serious injuries. The <abbr title="Subject Officer">SO</abbr> says he did not see the Complainant and takes responsibility for the collision.</p>]


In [225]:
def find_city(soup):
    incident_narrative_title = soup.find("h2", string="Incident narrative")
    
    
    next_sibling = incident_narrative_title.next_sibling
    incident_narratives = [next_sibling]
    

    while next_sibling.next_sibling.name == "p":
        next_sibling = next_sibling.next_sibling
        incident_narratives.append(next_sibling)
        
    for p in incident_narratives:
        for municipality in list_of_municipalities_in_ontario:
            if municipality in get_inner_text(p):
                return municipality

In [226]:
find_city(soup)

'Hamilton'

In [139]:
# officer name is not included in accordance with section 21 of FIPPA
# moving onto civilian consquences, this can be found in the text we have
# apparent its not always consistent where this info is
print(text)
temp = text.split(".")
for i in temp:
    if police_service in i:
        pattern = "\).*"
        match_results = re.search(pattern, i, re.IGNORECASE)
        civilian_consequences = match_results.group()
        civilian_consequences = re.sub("\) ", "", civilian_consequences).strip()
        civilian_consequences = re.sub("the ", "", civilian_consequences).strip()
civilian_consequences

On June 10, 2018, at 12:29 a.m., the Ontario Provincial Police (OPP) reported the vehicle injury of the Complainant.


'reported vehicle injury of Complainant'

In [14]:
# find consequences for officer
# personally, I can't find an area of these reports that mention consequences for the officers

In [166]:
def find_decision_date(soup):
    body = soup.find("div", class_="body-field")
    decision_date_element = body.contents[-3]
    decision_date = decision_date_element.string
    
    if decision_date is not None:
        decision_date = decision_date.replace("\xa0", " ")

In [210]:
def find_investigation_outcome(soup):
    # investigation outcome
    # this is found in the director's analysis section/last paragraph of the report
    all_h2 = soup.find_all("h2")

    
    mandate = None
    for h2 in all_h2:
        if h2.find(text=re.compile("Mandate of the")):
            mandate = h2
            break
            
    if mandate is None:
        return None
    
    
    
    decision_paragraph = mandate.parent.contents[-4]
    decision_final_paragraph_text = get_inner_text(decision_paragraph)

    return decision_final_paragraph_text

In [211]:

find_investigation_outcome(soup)

'In the final analysis, I am satisfied for the foregoing reasons that the man’s detention and the manner in which it was carried out were lawful notwithstanding the injury which he suffered, even were I to find that the officers caused the injury, which I am not inclined to do. I am, therefore, satisfied on reasonable grounds on this record that the actions exercised by the officers fell within the limits prescribed by the criminal law and there are no grounds for proceeding with charges in this case.'

In [16]:
# reason for police investigation
# not really sure about this one


In [227]:
results = []
for case in tqdm(cases):
    link_soup = BeautifulSoup(case, "html.parser")
    link = link_soup.find("a")['href']
    url = baseurl + link
    
    hdr = {'User-Agent': 'Mozilla/5.0'}
    req = Request(url,headers=hdr)
    page = urlopen(req)
    html = page.read().decode("utf-8")
    soup = BeautifulSoup(html, "html.parser")
    
    province = "Ontario"

    # a lot of the information we might be looking for can be found in section 3 of the report
    text = soup.find_all("h3")[3].next_sibling.get_text()
    
    year = find_year(text)
    police_service = find_police_service(text)
    
    city = find_city(soup)
    
    investigation_outcome = find_investigation_outcome(soup)
    
    results.append([year, police_service, city, province, investigation_outcome])
    
results_df = pd.DataFrame(results, columns=['Year', 'Police Service', 'City', 'Province', 'Investigation Final Decision'])
    

100%|████████████████████████████████████████████████████████████████████████████████| 290/290 [01:39<00:00,  2.90it/s]


In [228]:
results_df.head()

Unnamed: 0,Year,Police Service,City,Province,Investigation Final Decision
0,2017,Peel Regional Police,Mississauga,Ontario,"I find, therefore, on this record, that the th..."
1,2017,Ottawa Police Service,Ottawa,Ontario,"In conclusion, on the evidence before me, I fi..."
2,2017,Toronto Police Service,Toronto,Ontario,"In conclusion, I find that the evidence is ins..."
3,2017,Cobourg Police Service,,Ontario,criminal law and instead find there are no gro...
4,2018,Ontario Provincial Police,Bruce,Ontario,"In sum, while it is clear that the SO’s left t..."


In [230]:
results_df.to_csv("results_on_2018.csv", index=False)