In [66]:
# scraping the Ontario SIU Directors Reports from 2017 to 2018
# the reports to scrape are listed in a CSV file, downloaded from the SIU site, in the reports_to_scrape folder

# import libraries
import pandas as pd
import numpy as np
from urllib.request import Request, urlopen
import bs4
from bs4 import BeautifulSoup
import re

In [107]:
def get_inner_text(tag):
    """Concatenate all text found inside ``tag``, descending into child tags.

    A ``NavigableString`` is returned unchanged. For an element with children,
    the text of every child is collected recursively and joined with no
    separator; children that produce ``None`` are left out. An element with
    no children yields whatever its ``.string`` attribute is.
    """
    # Leaf case: a text node is its own inner text.
    if isinstance(tag, bs4.element.NavigableString):
        return tag

    # Nothing to recurse into: defer to the element's own .string.
    if not tag.contents:
        return tag.string

    child_texts = (get_inner_text(child) for child in tag.contents)
    return ''.join(piece for piece in child_texts if piece is not None)

In [2]:
# Load the list of report pages to scrape from the CSV file.
# As the preview shows, the "Case number" column holds an HTML <a> snippet
# whose href is the site-relative path of the report page.
df = pd.read_csv("reports_to_scrape/on_siu_2018_w_keywords.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,Case number,Special Investigations Unit incident date,Special Investigations Unit published date
0,0,"<a href=""/page/siu-directors-report-case-17-of...",2017-12-30,2018-10-05
1,1,"<a href=""/page/siu-directors-report-case-17-ov...",2017-10-11,2018-10-02
2,2,"<a href=""/page/siu-directors-report-case-17-tc...",2017-11-08,2018-10-02
3,3,"<a href=""/page/siu-directors-report-case-17-of...",2017-10-27,2018-09-26
4,4,"<a href=""/page/siu-directors-report-case-18-pv...",2018-06-09,2018-09-25


In [3]:
# Site root, deliberately without a trailing slash: it is concatenated with
# hrefs that already begin with "/" (e.g. "/page/siu-directors-report-...").
baseurl = "https://www.ontario.ca"

In [4]:
# Get the report link from the "Case number" column of the df.
# Each entry is an HTML anchor string, so it is parsed to read its href.

cases = df["Case number"].tolist()
# Index 4 is a hard-coded sample row used for prototyping the extraction.
print(cases[4])

# NOTE: `soup` here is only the parsed anchor snippet; a later cell rebinds
# `soup` to the full report page.
soup = BeautifulSoup(cases[4], "html.parser")
link = soup.find("a")['href']

print(link)

<a href="/page/siu-directors-report-case-18-pvi-175">18-PVI-175</a>
/page/siu-directors-report-case-18-pvi-175


In [5]:
# Absolute URL of the report page: site root + site-relative href.
url = baseurl + link
url

'https://www.ontario.ca/page/siu-directors-report-case-18-pvi-175'

In [6]:
# Scrape a single director's report: the page at `url`.
# With `url` built from cases[4] this is case 18-PVI-175 (see the printed
# <title>), not 17-OFD-379 as this comment previously stated.

# presumably the site rejects urllib's default User-Agent — TODO confirm
hdr = {'User-Agent': 'Mozilla/5.0'}
req = Request(url, headers=hdr)
# Use the response as a context manager so the connection is closed
# deterministically, and set a timeout so a stalled server cannot hang the kernel.
with urlopen(req, timeout=30) as page:
    html = page.read().decode("utf-8")
# From here on `soup` is the full report page (it replaces the anchor snippet
# parsed earlier under the same name).
soup = BeautifulSoup(html, "html.parser")
print(soup.find("title"))

<title>SIU Director’s Report - Case # 18-PVI-175 | ontario.ca</title>


In [7]:
# Information we are looking for:
# province, city, police service, officer name, year, consequences for civilians (injuries),
# consequences, if any, for officers (fine, dismissal, none, etc.),
# investigation outcome (charge, acquittal, complaint dismissal, etc.), and reason for police call.

# Every report handled by this notebook comes from the Ontario SIU.
province = "Ontario"

# A lot of the information we are looking for is in the notification sentence.
# It is taken positionally: the node right after the h3 at index 3 (the fourth
# <h3> on the page, zero-based).
# NOTE(review): this assumes every report has the same heading layout and that
# no whitespace node sits between the heading and the paragraph — confirm.
text = soup.find_all("h3")[3].next_sibling.get_text()
text

'On June\xa010,\xa02018, at 12:29\xa0a.m., the Ontario Provincial Police (OPP) reported the vehicle injury of the Complainant.'

In [81]:
def find_year(text):
    """Return the first standalone four-digit number in ``text`` as a string.

    Parameters
    ----------
    text : str
        Text to search, e.g. the notification sentence of a report.

    Returns
    -------
    str or None
        The four digits found, or ``None`` when ``text`` contains no such
        number. (Previously a missing match raised ``AttributeError``.)
    """
    # The lookarounds require exactly four digits, so the leading digits of a
    # longer number (a case or file number, say) are not mistaken for a year.
    match = re.search(r"(?<!\d)(\d{4})(?!\d)", text)
    if match is None:
        return None
    return match.group(1)

# Pull the year out of the notification sentence held in `text`.
year = find_year(text)
year

'2018'

In [73]:
# Police services that a report may name.

# List from https://www.oacp.ca/en/about-us/ontario-police-organizations.aspx
# find_police_service() returns the first entry found in the text, so the order
# of this list decides which name wins if more than one entry matches.
ontario_police_services = [
    'Akwesasne Mohawk Police Service',
    'Anishinabek Police Service',
    'Aylmer Police Service',
    'Barrie Police Service',
    'Belleville Police Service',
    'Brantford Police Service',
    'Brockville Police Service',
    'Chatham Kent Police Service',
    'City of Kawartha Lakes Police Service',
    'Cobourg Police Service',
    'Cornwall Police Service',
    'Deep River Police Service',
    'Dryden Police Service',
    'Greater Sudbury Police Service',
    'Guelph Police Service',
    'Halton Regional Police Service',
    'Hamilton Police Service',
    'Hanover Police Service',
    'Kingston Police',
    'Lac Seul Police Service',
    'LaSalle Police Service',
    'London Police Service',
    'Niagara Parks Police',
    'Niagara Regional Police Service',
    'Nishnawbe-Aski Police Service',
    'North Bay Police Service',
    'Ontario Provincial Police',
    'Owen Sound Police Service',
    'Ottawa Police Service',
    'Peterborough Police Service',
    'Peel Regional Police',
    'Port Hope Police Service',
    'Rama Police Service',
    'Sarnia Police Service',
    'Royal Canadian Mounted Police',
    'Sault Ste. Marie Police Service',
    'Saugeen Shores Police Service',
    'Smiths Falls Police Service',
    'Six Nations Police Service',
    'St. Thomas Police Service',
    'Strathroy-Caradoc Police Service',
    'South Simcoe Police Service',
    'Timmins Police Service',
    'Stratford Police Service',
    'Thunder Bay Police Service',
    'Treaty Three Police Service',
    'Toronto Police Service',
    'Waterloo Regional Police Service',
    'U.C.C.M. Anishnaabe Police',
    'Wikwemikong Tribal Police Service',
    'West Grey Police Service',
    'Woodstock Police Service',
    'Windsor Police Service',
    'York Regional Police'
]

In [77]:
def find_police_service(text, services=None):
    """Return the first known police service whose name appears in ``text``.

    Parameters
    ----------
    text : str
        Text to search, e.g. the notification sentence of a report.
    services : iterable of str, optional
        Candidate service names, checked in order. Defaults to the notebook's
        ``ontario_police_services`` list.

    Returns
    -------
    str or None
        The matching entry exactly as written in ``services``, or ``None``
        when no entry occurs in ``text``.
    """
    if services is None:
        services = ontario_police_services

    # The scraped pages contain non-breaking spaces (\xa0). str.split() with no
    # argument splits on any Unicode whitespace, so re-joining with single
    # spaces lets multi-word names match however the page spaced them.
    normalized = " ".join(text.split())

    for service_name in services:
        if service_name in normalized:
            return service_name
    return None
        
# Look up which known police service is named in the notification sentence.
police_service = find_police_service(text)
police_service

'Ontario Provincial Police'

In [9]:
# Regex-based alternative: take the words between the first "the" and the verb
# "notified" (tried first) or, failing that, "reported".
# NOTE: this rebinds `police_service`, and unlike the list lookup it keeps any
# parenthesised abbreviation, e.g. 'Ontario Provincial Police (OPP)'.
match_results = None
for verb in ("notified", "reported"):
    pattern = "the.*?" + verb
    match_results = re.search(pattern, text, re.IGNORECASE)
    if match_results is not None:
        break

if match_results is None:
    # Previously a bare `except:` hid every error from the first attempt and
    # a double miss surfaced as an opaque AttributeError on None.
    raise ValueError("no 'the ... notified' or 'the ... reported' phrase found in text")

police_service = match_results.group()
police_service = re.sub("the ", "", police_service)
police_service = re.sub(" " + verb, "", police_service)
police_service

'Ontario Provincial Police (OPP)'

In [110]:
# MMAH list of Ontario municipalities; the "Municipality" column is reduced to
# bare names below.
municipalities_df = pd.read_csv("mmah-list-of-ontario-municipalities-en-utf8-2022-10-05.csv")


def get_municipality_name(tag):
    """Return the short name of a municipality from one "Municipality" cell.

    The cell is parsed as HTML so that any markup is dropped and only its text
    remains; the part before the first ", " is returned.
    NOTE(review): presumably cells look like '<a ...>Name, Type of</a>' or
    plain 'Name, Type of' — confirm against the CSV.

    Parameters
    ----------
    tag : str
        Raw cell content (HTML snippet or plain text).

    Returns
    -------
    str
        The text before the first ", ".
    """
    # BeautifulSoup(...) always returns a BeautifulSoup object, which is a
    # subclass of bs4.element.Tag. The former isinstance()/else fallback could
    # therefore never run and has been removed; plain-text cells are handled
    # by the same path because their text becomes the document's only child.
    full_name = get_inner_text(BeautifulSoup(tag, "html.parser"))
    return full_name.split(", ")[0]


list_of_municipalities_in_ontario = list(municipalities_df["Municipality"].apply(get_municipality_name))

In [124]:
# Locate the <h2> heading whose text is "Incident narrative"; soup.find()
# returns None when the page has no such heading.
incident_narrative_title = soup.find("h2", string="Incident narrative")

In [128]:
# Collect the node right after the "Incident narrative" heading, plus every
# <p> element that follows it without interruption.
next_sibling = incident_narrative_title.next_sibling
incident_narrative = [next_sibling]

# The last node of a parent has next_sibling None, and None has no .name.
# getattr() turns that former AttributeError into a normal end of the loop.
while next_sibling is not None and getattr(next_sibling.next_sibling, "name", None) == "p":
    next_sibling = next_sibling.next_sibling
    incident_narrative.append(next_sibling)

print(incident_narrative)

[<p>On June 9, 2018, at around 10:00 p.m., the Complainant and the <abbr title="Subject Officer">SO</abbr> were involved in a motor vehicle collision at the intersection of Bruce County Road 3 and Bruce County Road 2 near Walkerton. The Complainant was operating a motorcycle northbound on Bruce County Road 3. The <abbr title="Subject Officer">SO</abbr> was driving southbound on Bruce County Road 3 and made a sudden left hand turn towards Bruce County Road 2 – travelling into the path of the Complainant. The Complainant was travelling at a speed slightly over the 80 km/h speed limit and was unable to stop. He collided with the side of the <abbr title="Subject Officer">SO</abbr>’s police cruiser, was propelled off his motorcycle and landed in the roadway. As a result of the collision, the Complainant sustained serious injuries. The <abbr title="Subject Officer">SO</abbr> says he did not see the Complainant and takes responsibility for the collision.</p>]


In [137]:
def find_city(soup):
    """Return the first Ontario municipality named in the incident narrative.

    The narrative is the node right after the "Incident narrative" <h2> plus
    the consecutive <p> elements that follow it. Paragraphs are scanned in
    document order; within a paragraph, municipalities are tried in the order
    of ``list_of_municipalities_in_ontario`` and the first hit wins.

    Parameters
    ----------
    soup : bs4.BeautifulSoup
        Parsed report page.

    Returns
    -------
    str or None
        The municipality as it is written in the narrative, or ``None`` when
        the heading is missing or no municipality is mentioned.
    """
    incident_narrative_title = soup.find("h2", string="Incident narrative")
    if incident_narrative_title is None:
        return None

    next_sibling = incident_narrative_title.next_sibling
    incident_narratives = [next_sibling]

    # getattr() guards the end of the parent, where next_sibling is None and
    # reading .name used to raise AttributeError.
    while next_sibling is not None and getattr(next_sibling.next_sibling, "name", None) == "p":
        next_sibling = next_sibling.next_sibling
        incident_narratives.append(next_sibling)

    for p in incident_narratives:
        if p is None:
            continue

        # Search the visible text only, so tag names and attribute values
        # (e.g. title="Subject Officer") cannot produce a match.
        narrative_text = p.get_text() if isinstance(p, bs4.element.Tag) else str(p)
        # Non-breaking spaces would stop multi-word names from matching.
        narrative_text = narrative_text.replace("\xa0", " ")

        for municipality in list_of_municipalities_in_ontario:
            if not municipality:
                # An empty pattern would match every paragraph.
                continue
            # re.escape: names are literals, not regexes ("St. Thomas", ...).
            # The lookarounds keep a short name from matching inside a longer
            # word (e.g. "Erin" inside "veterinarian").
            pattern = r"(?<!\w)" + re.escape(municipality) + r"(?!\w)"
            match = re.search(pattern, narrative_text, re.IGNORECASE)
            if match is not None:
                return match[0]

    return None

In [138]:
# NOTE(review): the recorded result 'Bruce' occurs in the narrative only as part
# of "Bruce County Road", while the collision is described as "near Walkerton".
# The hit is probably a county/road name rather than the city — confirm before
# relying on this value.
find_city(soup)

'Bruce'

In [10]:
import string
# Alternative city lookup: take the first word that follows the first
# occurrence of "City of" anywhere in the page text.
webpage = soup.get_text()

# partition() returns an empty third element when "City of" is absent, so
# `city` ends up as "" and is replaced by NaN below.
split_str = webpage.partition('City of')
word_after_cityof = split_str[2].strip().split(" ")
# Only the first space-delimited token is kept (multi-word names are cut
# short); surrounding punctuation is stripped from it.
city = word_after_cityof[0].strip(string.punctuation)
if city == "":
    city = np.nan
city

nan

In [11]:
# The officer's name is not included, in accordance with section 21 of FIPPA.
# Moving on to civilian consequences: these can be found in the text we have.
print(text)

# Stays None when no sentence names the police service (previously the final
# line raised NameError in that case).
civilian_consequences = None

# Naive sentence split on "."; abbreviations such as "a.m." create extra
# fragments, which is harmless because fragments without the service name
# are skipped.
sentences = text.split(".")
for sentence in sentences:
    if police_service in sentence:
        # Keep everything from the first ")" onwards, i.e. what follows the
        # parenthesised abbreviation of the service. Raw strings are used
        # because "\)" in a normal literal is an invalid escape sequence.
        match_results = re.search(r"\).*", sentence, re.IGNORECASE)
        if match_results is None:
            # No ")" in this sentence: nothing to extract from it.
            continue
        civilian_consequences = match_results.group()
        civilian_consequences = re.sub(r"\) ", "", civilian_consequences).strip()
        civilian_consequences = re.sub("the ", "", civilian_consequences).strip()
civilian_consequences

On June 10, 2018, at 12:29 a.m., the Ontario Provincial Police (OPP) reported the vehicle injury of the Complainant.


'reported vehicle injury of Complainant'

In [12]:
# find consequences for officer
# personally, I can't find any section of these reports that mentions consequences for the officers

In [29]:
# Investigation outcome.
# This is found in the director's analysis section, in the last paragraph of the report.
# The heading text is matched exactly; note that it is written with a
# typographic apostrophe (’), not a plain ASCII one.
analysis_and_directors_decision = soup.find("h2", string="Analysis and director’s decision")
analysis_and_directors_decision

<h2>Analysis and director’s decision</h2>

In [49]:
# The "Date: ..." line is taken positionally: third node from the end of the
# element that contains the analysis heading.
# NOTE(review): the -3 offset presumably skips trailing signature/whitespace
# nodes — confirm that it holds for other reports.
decision_date_element = analysis_and_directors_decision.parent.contents[-3]

# .string is None when the element holds more than one child; fall back to ""
# so the check below reports the problem instead of .replace() raising
# AttributeError.
decision_date = decision_date_element.string or ""
# The page uses non-breaking spaces inside the date.
decision_date = decision_date.replace("\xa0", " ")

if not decision_date.startswith("Date:"):
    print("Unable to find date element")

In [50]:
decision_date

'Date: September 25, 2018'

In [56]:
# The node immediately before the date element is taken as the closing
# paragraph of the director's decision; its raw children are shown for inspection.
decision_final_paragraph = decision_date_element.previous_sibling
decision_final_paragraph.contents

['In sum, while it is clear that the ',
 <abbr title="Subject Officer">SO</abbr>,
 '’s left turn created a risk of danger which had very significant consequences',
 <onesite-ref number="8"><sup number="8"><a href="#foot-8" id="ref-8" onclick="(onesiteRef(this));" rel="footnote"><span class="show-for-sr">footnote 8</span><span aria-hidden="true">[8]</span></a></sup></onesite-ref>,
 ', there is simply insufficient evidence that the ',
 <abbr title="Subject Officer">SO</abbr>,
 '’s driving meets the high threshold required to find a marked departure from the standard of care. I am therefore unable to form grounds to believe the ',
 <abbr title="Subject Officer">SO</abbr>,
 ' committed a criminal offence in relation to the collision and the file will be closed.']

In [72]:
# Flatten the closing paragraph to plain text.
# NOTE: footnote markers are part of the paragraph's children, so their text is
# concatenated into the result (e.g. "...consequencesfootnote 8[8], ...").
decision_final_paragraph_text = get_inner_text(decision_final_paragraph)
decision_final_paragraph_text

'In sum, while it is clear that the SO’s left turn created a risk of danger which had very significant consequencesfootnote 8[8], there is simply insufficient evidence that the SO’s driving meets the high threshold required to find a marked departure from the standard of care. I am therefore unable to form grounds to believe the SO committed a criminal offence in relation to the collision and the file will be closed.'

In [16]:
# reason for police investigation
# not really sure about this one
