In [1]:
# scraping the Ontario SIU Directors Reports from 2017 to 2018
# the reports to scrape are in an csv file in reports_to_scrape folder downloaded from their site

# import libraries
import pandas as pd
import numpy as np
from urllib.request import Request, urlopen
from bs4 import BeautifulSoup
import bs4
import re
from tqdm import tqdm

In [2]:
def get_inner_text(tag):
    if isinstance(tag, bs4.element.NavigableString):
        return tag
    
    if len(tag.contents) > 0:
        results = []
        
        for content in tag.contents:
            text = get_inner_text(content)
            if text is not None:
                results.append(text)
            
        return ''.join(results)
    
    return tag.string  

In [3]:
# import pages to scrape from the csv file
df = pd.read_csv("reports_to_scrape/on_siu_2018_w_keywords.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,Case number,Special Investigations Unit incident date,Special Investigations Unit published date
0,0,"<a href=""/page/siu-directors-report-case-17-of...",2017-12-30,2018-10-05
1,1,"<a href=""/page/siu-directors-report-case-17-ov...",2017-10-11,2018-10-02
2,2,"<a href=""/page/siu-directors-report-case-17-tc...",2017-11-08,2018-10-02
3,3,"<a href=""/page/siu-directors-report-case-17-of...",2017-10-27,2018-09-26
4,4,"<a href=""/page/siu-directors-report-case-18-pv...",2018-06-09,2018-09-25


In [4]:
# base url for page is: https://www.ontario.ca/
baseurl = "https://www.ontario.ca"

In [5]:
# get the link from the case number part of the df

cases = df["Case number"].tolist()
print(cases[1])

soup = BeautifulSoup(cases[1], "html.parser")
link = soup.find("a")['href']

print(link)

<a href="/page/siu-directors-report-case-17-ovi-295">17-OVI-295</a>
/page/siu-directors-report-case-17-ovi-295


In [6]:
url = baseurl + link
url

'https://www.ontario.ca/page/siu-directors-report-case-17-ovi-295'

In [7]:
# scrape the first directors report 17-OFD-379

hdr = {'User-Agent': 'Mozilla/5.0'}
req = Request(url,headers=hdr)
page = urlopen(req)
html = page.read().decode("utf-8")
soup = BeautifulSoup(html, "html.parser")
print(soup.find("title"))

<title>SIU Director’s Report - Case # 17-OVI-295 | ontario.ca</title>


In [8]:
# Information we are looking for:
# Province, city, police service, officer name, year, consequences for civilians (injuries), 
# consequences, if any, for officers (fine, dismissal, none, etc.), 
# investigation outcome (charge, acquittal complaint dismissal, etc.), and reason for police call. 

province = "Ontario"

# a lot of the information we might be looking for can be found in section 3 of the report
text = soup.find_all("h3")[3].next_sibling.get_text()
text

'At approximately 12:25\xa0a.m. on October\xa012,\xa02017, the Ottawa Police Service (OPS) notified the SIU and of the serious injury sustained by the Complainant following an attempted vehicle stop by the police.'

In [9]:
def find_year(text):
    match = re.search(r"(20\d{2})", text)
    if match is not None:
        return match.group(1)


year = find_year(text)
print(year)

2017


In [10]:
# list from https://www.oacp.ca/en/about-us/ontario-police-organizations.aspx
ontario_police_services = [
    'Akwesasne Mohawk Police Service',
    'Anishinabek Police Service',
    'Aylmer Police Service',
    'Barrie Police Service',
    'Belleville Police Service',
    'Brantford Police Service',
    'Brockville Police Service',
    'Chatham Kent Police Service',
    'City of Kawartha Lakes Police Service',
    'Cobourg Police Service',
    'Cornwall Police Service',
    'Deep River Police Service',
    'Dryden Police Service',
    'Greater Sudbury Police Service',
    'Guelph Police Service',
    'Halton Regional Police Service',
    'Hamilton Police Service',
    'Hanover Police Service',
    'Kingston Police',
    'Lac Seul Police Service',
    'LaSalle Police Service',
    'London Police Service',
    'Niagara Parks Police',
    'Niagara Regional Police Service',
    'Nishnawbe-Aski Police Service',
    'North Bay Police Service',
    'Ontario Provincial Police',
    'Owen Sound Police Service',
    'Ottawa Police Service',
    'Peterborough Police Service',
    'Peel Regional Police',
    'Port Hope Police Service',
    'Rama Police Service',
    'Sarnia Police Service',
    'Royal Canadian Mounted Police',
    'Sault Ste. Marie Police Service',
    'Saugeen Shores Police Service',
    'Smiths Falls Police Service',
    'Six Nations Police Service',
    'St. Thomas Police Service',
    'Strathroy-Caradoc Police Service',
    'South Simcoe Police Service',
    'Timmins Police Service',
    'Stratford Police Service',
    'Thunder Bay Police Service',
    'Treaty Three Police Service',
    'Toronto Police Service',
    'Waterloo Regional Police Service',
    'U.C.C.M. Anishnaabe Police',
    'Wikwemikong Tribal Police Service',
    'West Grey Police Service',
    'Woodstock Police Service',
    'Windsor Police Service',
    'York Regional Police'
]

In [11]:
def find_police_service(text):
    for service_name in ontario_police_services:
        if service_name in text:
            return service_name

In [12]:
municipalities_df = pd.read_csv("mmah-list-of-ontario-municipalities-en-utf8-2022-10-05.csv")

def get_municipality_name(tag):
    a_soup = BeautifulSoup(tag, "html.parser")
    
    if isinstance(a_soup, bs4.element.Tag):
        full_name = get_inner_text(a_soup)
    else:
        full_name = tag
        
    
        
    return full_name.split(", ")[0]
    

list_of_municipalities_in_ontario = list(municipalities_df["Municipality"].apply(get_municipality_name))

In [13]:
incident_narrative_title = soup.find("h2", string="Incident narrative")

In [14]:
next_sibling = incident_narrative_title.next_sibling
incident_narrative = [next_sibling]

while next_sibling.next_sibling.name == "p":
    next_sibling = next_sibling.next_sibling
    incident_narrative.append(next_sibling)
    
print(incident_narrative)

[<p>Shortly after 6:00 p.m. on October 11, 2017, two <abbr title="Ottawa Police Service">OPS</abbr> officers were operating radar on Bank Street, south of Hunt Club Road, in the City of Ottawa, in separate vehicles. While <abbr title="Witness Officer">WO</abbr> #1 was busy dealing with one motorist, the <abbr title="Subject Officer">SO</abbr> was using a hand-held laser unit (<abbr>LIDAR</abbr>) to monitor traffic on Bank Street.</p>, <p>A black Honda motor vehicle travelling northwest on Bank Street passed the <abbr title="Subject Officer">SO</abbr> at a high rate of speed. The <abbr title="Subject Officer">SO</abbr> entered his police vehicle and gave pursuit with his emergency lights activated. The Honda vehicle turned right (north) onto Albion Road and the <abbr title="Subject Officer">SO</abbr> was able to catch up to it. The Honda the pulled over on Albion Road, at the south side of the intersection with Hunt Club Road.</p>, <p>The <abbr title="Subject Officer">SO</abbr> then als

In [15]:
def find_city(soup):
    incident_narrative_title = soup.find("h2", string="Incident narrative")
    
    
    next_sibling = incident_narrative_title.next_sibling
    incident_narratives = [next_sibling]
    

    while next_sibling.next_sibling.name == "p":
        next_sibling = next_sibling.next_sibling
        incident_narratives.append(next_sibling)
        
    for p in incident_narratives:
        for municipality in list_of_municipalities_in_ontario:
            regex = r"\b(?=\w)" + re.escape(municipality) + r"\b(?!\w)"
            match = re.search(regex, get_inner_text(p), re.IGNORECASE)
            if match:
                return municipality

In [16]:
find_city(soup)

'Ottawa'

In [17]:
# officer name is not included in accordance with section 21 of FIPPA
# moving onto civilian consquences, this can be found in the text we have
# apparent its not always consistent where this info is
    
from string import punctuation

def find_civilian_consequences(soup):
    
    keywords = ['death', 'injury', 'fracture', 'diagnosed', 
                'diagnoses', 'dead', 'post-mortem', 'cause of death', 
                'injuries', 'CT scan', 'assessed', 'assessment', 'treated']
    
    if soup.find("h3", string = "Nature of Injuries / Treatment"):
        temp = soup.find("h3", string = "Nature of Injuries / Treatment")
    elif soup.find("h3", string = "Nature of Injury/Treatment"): 
        temp = soup.find("h3", string = "Nature of Injury/Treatment")
    elif soup.find("h3", string = "Cause of Death"):
        temp = soup.find("h3", string = "Cause of Death")
    elif soup.find("h3", string = "Cause of death"):
        temp = soup.find("h3", string = "Cause of death")
    elif soup.find("h3", string = "Nature of Injuries / Treatment/Cause of Death"):
        temp = soup.find("h3", string = "Nature of Injuries / Treatment/Cause of Death")
    elif soup.find("h3", string = "Nature of injury/treatment"):
        temp = soup.find("h3", string = "Nature of injury/treatment")
    else:
        temp = soup.find("h2", string="Incident narrative")

    next_sibling = temp.next_sibling
    incident = [next_sibling]


    while next_sibling.next_sibling.name == "p":
        next_sibling = next_sibling.next_sibling
        incident.append(next_sibling.text)
    
    civilian_consequences = ""
    
    CLEANR = re.compile('<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});')
    cleantext = re.sub(CLEANR, '', str(incident))
    temp = cleantext.split('.')
    for i in temp:
        for j in keywords:
            if j in i:
                civilian_consequences = i
    
    if civilian_consequences == "":
        civilian_consequences = cleantext
    
    civilian_consequences=[i.strip(punctuation) for i in civilian_consequences.split()]
    civilian_consequences = " ".join(civilian_consequences)
    
    return civilian_consequences

In [18]:
print(find_civilian_consequences(soup))

The Complainant suffered a fracture of the third vertebra C3 in his neck and was still in hospital with some paralysis in his right arm and both legs at the time of the writing of this report


In [19]:
# find consequences for officer
# personally, I can't find an area of these reports that mention consequences for the officers

In [20]:
def find_decision_date(soup):
    body = soup.find("div", class_="body-field")
    decision_date_element = body.contents[-3]
    decision_date = decision_date_element.string
    
    if decision_date is not None:
        decision_date = decision_date.replace("\xa0", " ")

In [21]:
def find_investigation_outcome(soup):
    # investigation outcome
    # this is found in the director's analysis section/last paragraph of the report
    all_h2 = soup.find_all("h2")

    
    mandate = None
    for h2 in all_h2:
        if h2.find(text=re.compile("Mandate of the")):
            mandate = h2
            break
            
    if mandate is None:
        return None
    
    
    
    decision_paragraph = mandate.parent.contents[-4]
    decision_final_paragraph_text = get_inner_text(decision_paragraph)

    return decision_final_paragraph_text

In [22]:

find_investigation_outcome(soup)

'In conclusion, on the evidence before me, I find that I lack the reasonable grounds upon which I can be satisfied that the SO’s driving conduct amounted to a criminal offence and therefore no charges shall issue.'

In [23]:
# reason for police investigation
# not really sure about this one


In [24]:
results = []
for case in tqdm(cases):
    link_soup = BeautifulSoup(case, "html.parser")
    link = link_soup.find("a")['href']
    url = baseurl + link
    
    hdr = {'User-Agent': 'Mozilla/5.0'}
    req = Request(url,headers=hdr)
    page = urlopen(req)
    html = page.read().decode("utf-8")
    soup = BeautifulSoup(html, "html.parser")
    
    province = "Ontario"

    # a lot of the information we might be looking for can be found in section 3 of the report
    text = soup.find_all("h3")[3].next_sibling.get_text()
    
    year = find_year(text)
    police_service = find_police_service(text)
    
    civilian_consequences = find_civilian_consequences(soup)
    
    city = find_city(soup)
    
    investigation_outcome = find_investigation_outcome(soup)
    
    results.append([year, police_service, city, province, civilian_consequences, investigation_outcome, url])
    
results_df = pd.DataFrame(results, columns=['Year',  'Police Service', 'City', 'Province', 'Civilian Consequences', 'Investigation Final Decision', 'Link'])
    

100%|███████████████████████████████████████████████████████████████████████| 290/290 [01:27<00:00,  3.32it/s]


In [25]:
results_df.head()

Unnamed: 0,Year,Police Service,City,Province,Civilian Consequences,Investigation Final Decision,Link
0,2017,Peel Regional Police,Mississauga,Ontario,The Post-Mortem Report which the SIU received ...,"I find, therefore, on this record, that the th...",https://www.ontario.ca/page/siu-directors-repo...
1,2017,Ottawa Police Service,Ottawa,Ontario,The Complainant suffered a fracture of the thi...,"In conclusion, on the evidence before me, I fi...",https://www.ontario.ca/page/siu-directors-repo...
2,2017,Toronto Police Service,Toronto,Ontario,There are no additional facial bone fractures,"In conclusion, I find that the evidence is ins...",https://www.ontario.ca/page/siu-directors-repo...
3,2017,Cobourg Police Service,Head,Ontario,The cause of death for CW\xa0#12 was determin...,criminal law and instead find there are no gro...,https://www.ontario.ca/page/siu-directors-repo...
4,2018,Ontario Provincial Police,Bruce,Ontario,As a result of the collision the Complainant s...,"In sum, while it is clear that the SO’s left t...",https://www.ontario.ca/page/siu-directors-repo...


In [26]:
results_df.to_csv("results_on_2018.csv", index=False)