In [1]:
# scraping the Ontario SIU Directors Reports from 2017 to 2018
# the reports to scrape are in an csv file in reports_to_scrape folder downloaded from their site

# import libraries
import pandas as pd
import numpy as np
from urllib.request import Request, urlopen
from bs4 import BeautifulSoup
import bs4
import re
from tqdm import tqdm

In [2]:
def get_inner_text(tag):
    if isinstance(tag, bs4.element.NavigableString):
        return tag
    
    if len(tag.contents) > 0:
        results = []
        
        for content in tag.contents:
            text = get_inner_text(content)
            if text is not None:
                results.append(text)
            
        return ''.join(results)
    
    return tag.string  

In [4]:
# import pages to scrape from the csv file
df = pd.read_csv("reports_to_scrape/on_siu_2005_2017_w_keywords.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,Case number,Special Investigations Unit incident date,Special Investigations Unit published date
0,2,"<a href=""/page/siu-directors-report-case-05-tf...",2005-05-27,2005-08-25
1,12,"<a href=""/page/siu-directors-report-case-15-of...",2015-04-04,2015-11-26
2,14,"<a href=""/page/siu-directors-report-case-10-pf...",2010-05-10,2010-09-29
3,25,"<a href=""/page/siu-directors-report-case-10-of...",2010-08-25,2011-01-11
4,32,"<a href=""/page/siu-directors-report-case-11-of...",2011-06-22,2011-10-25


In [5]:
# base url for page is: https://www.ontario.ca/
baseurl = "https://www.ontario.ca"

In [6]:
# get the link from the case number part of the df

cases = df["Case number"].tolist()
print(cases[1])

soup = BeautifulSoup(cases[1], "html.parser")
link = soup.find("a")['href']

print(link)

<a href="/page/siu-directors-report-case-15-ofd-061">15-OFD-061</a>
/page/siu-directors-report-case-15-ofd-061


In [7]:
url = baseurl + link
url

'https://www.ontario.ca/page/siu-directors-report-case-15-ofd-061'

In [8]:
# scrape the first directors report 17-OFD-379

hdr = {'User-Agent': 'Mozilla/5.0'}
req = Request(url,headers=hdr)
page = urlopen(req)
html = page.read().decode("utf-8")
soup = BeautifulSoup(html, "html.parser")
print(soup.find("title"))

<title>SIU Director’s Report - Case # 15-OFD-061 | ontario.ca</title>


In [19]:
soup.find_all("h2")[3]

[<h2 class="alert__header-title h4">Ontario.ca needs JavaScript to function properly and provide you with a fast, stable experience.</h2>,
 <h2 class="small"> On this page <a class="show-on-focus text-right small margin-left-16-!" href="#toc-end">Skip this page navigation</a></h2>,
 <h2>Explanatory note</h2>,
 <h2>Director’s report</h2>,
 <h2>Notification of the <abbr title="Special Investigations Unit">SIU</abbr></h2>,
 <h2>Overview - Tim Hortons</h2>,
 <h2>The investigation</h2>,
 <h2>Director’s decision under <abbr title="section">s.</abbr> 113(7) of the <a href="/laws/statute/90p15"><cite>Police Services Act</cite></a></h2>,
 <h2 class="h5 sidebar__header" id="section-related"> Related <span class="show-for-sr">information</span></h2>,
 <h2 class="h3">Footnotes</h2>,
 <h2 class="h4 footer-ministry__heading"><div>Ministry of the Attorney General</div></h2>,
 <h2 class="h4 footer-ministry__heading">Questions or comments</h2>]

In [26]:
# Information we are looking for:
# Province, city, police service, officer name, year, consequences for civilians (injuries), 
# consequences, if any, for officers (fine, dismissal, none, etc.), 
# investigation outcome (charge, acquittal complaint dismissal, etc.), and reason for police call. 

province = "Ontario"


section_headers = soup.find_all("h2")
for header in section_headers:
    if "Notification" in header.text:
        notification_of_the_siu = header
        print(notification_of_the_siu)


<h2>Notification of the <abbr title="Special Investigations Unit">SIU</abbr></h2>


In [33]:
next_sibling = notification_of_the_siu.next_sibling
notification_of_the_siu_paragraphs = []

while next_sibling.name == "p":
    notification_of_the_siu_paragraphs.append(next_sibling.get_text())
    next_sibling = next_sibling.next_sibling
    
notification_of_the_siu_text = " ".join(notification_of_the_siu_paragraphs)
notification_of_the_siu_text

'Notification Date and Time: 04/05/2015 at 0030\xa0hours Notified By: Police On April\xa05,\xa02015, at 0030\xa0hrs, Notifying Officer of the Peterborough Police Service (PPS) reported the following. On April\xa04,\xa02015, at 2344\xa0hrs, the PPS received a call for service at the Tim Hortons restaurant at 157 George Street North, Peterborough, regarding a man later identified by the PPS investigation to be [Deceased] waving a knife around. Civilian witnesses followed Deceased to the area of a location, Peterborough. A police officer later identified as [Subject Officer] arrived and at 2352\xa0hrs, the officer reported shots fired. Deceased was pronounced dead at the scene. A police officer was taken to the hospital for treatment of a stab wound. Deceased was identified by a health card as bearing the name Deceased with a date of birth of ----redacted,\xa01991.'

In [83]:
def get_notification_of_the_siu_text(soup):
    section_headers = soup.find_all("h2")
    for header in section_headers:
        if "Notification" in header.text:
            notification_of_the_siu = header
            
            next_sibling = notification_of_the_siu.next_sibling
            notification_of_the_siu_paragraphs = []

            while next_sibling.name == "p":
                notification_of_the_siu_paragraphs.append(next_sibling.get_text())
                next_sibling = next_sibling.next_sibling

            notification_of_the_siu_text = " ".join(notification_of_the_siu_paragraphs)
            
            return  notification_of_the_siu_text            

In [34]:
def find_year(text):
    match = re.search(r"(20\d{2})", text)
    if match is not None:
        return match.group(1)


year = find_year(notification_of_the_siu_text)
print(year)

2015


In [11]:
# list from https://www.oacp.ca/en/about-us/ontario-police-organizations.aspx
ontario_police_services = [
    'Akwesasne Mohawk Police Service',
    'Anishinabek Police Service',
    'Aylmer Police Service',
    'Barrie Police Service',
    'Belleville Police Service',
    'Brantford Police Service',
    'Brockville Police Service',
    'Chatham Kent Police Service',
    'City of Kawartha Lakes Police Service',
    'Cobourg Police Service',
    'Cornwall Police Service',
    'Deep River Police Service',
    'Dryden Police Service',
    'Greater Sudbury Police Service',
    'Guelph Police Service',
    'Halton Regional Police Service',
    'Hamilton Police Service',
    'Hanover Police Service',
    'Kingston Police',
    'Lac Seul Police Service',
    'LaSalle Police Service',
    'London Police Service',
    'Niagara Parks Police',
    'Niagara Regional Police Service',
    'Nishnawbe-Aski Police Service',
    'North Bay Police Service',
    'Ontario Provincial Police',
    'Owen Sound Police Service',
    'Ottawa Police Service',
    'Peterborough Police Service',
    'Peel Regional Police',
    'Port Hope Police Service',
    'Rama Police Service',
    'Sarnia Police Service',
    'Royal Canadian Mounted Police',
    'Sault Ste. Marie Police Service',
    'Saugeen Shores Police Service',
    'Smiths Falls Police Service',
    'Six Nations Police Service',
    'St. Thomas Police Service',
    'Strathroy-Caradoc Police Service',
    'South Simcoe Police Service',
    'Timmins Police Service',
    'Stratford Police Service',
    'Thunder Bay Police Service',
    'Treaty Three Police Service',
    'Toronto Police Service',
    'Waterloo Regional Police Service',
    'U.C.C.M. Anishnaabe Police',
    'Wikwemikong Tribal Police Service',
    'West Grey Police Service',
    'Woodstock Police Service',
    'Windsor Police Service',
    'York Regional Police'
]

In [35]:
def find_police_service(text):
    for service_name in ontario_police_services:
        if service_name in notification_of_the_siu_text:
            return service_name

In [37]:
find_police_service(notification_of_the_siu_text)

'Peterborough Police Service'

In [38]:
municipalities_df = pd.read_csv("mmah-list-of-ontario-municipalities-en-utf8-2022-10-05.csv")

def get_municipality_name(tag):
    a_soup = BeautifulSoup(tag, "html.parser")
    
    if isinstance(a_soup, bs4.element.Tag):
        full_name = get_inner_text(a_soup)
    else:
        full_name = tag
        
    
        
    return full_name.split(", ")[0]
    

list_of_municipalities_in_ontario = list(municipalities_df["Municipality"].apply(get_municipality_name))

In [40]:
def find_city(text):
    for municipality in list_of_municipalities_in_ontario:
        regex = r"\b(?=\w)" + re.escape(municipality) + r"\b(?!\w)"
        match = re.search(regex, text, re.IGNORECASE)
        if match:
            return municipality

In [42]:
find_city(notification_of_the_siu_text)

'Peterborough'

In [43]:
# officer name is not included in accordance with section 21 of FIPPA
# moving onto civilian consquences, this can be found in the text we have
# apparent its not always consistent where this info is
    
from string import punctuation

def find_civilian_consequences(soup):
    
    keywords = ['death', 'injury', 'fracture', 'diagnosed', 
                'diagnoses', 'dead', 'post-mortem', 'cause of death', 
                'injuries', 'CT scan', 'assessed', 'assessment', 'treated']
    
    if soup.find("h3", string = "Nature of Injuries / Treatment"):
        temp = soup.find("h3", string = "Nature of Injuries / Treatment")
    elif soup.find("h3", string = "Nature of Injury/Treatment"): 
        temp = soup.find("h3", string = "Nature of Injury/Treatment")
    elif soup.find("h3", string = "Cause of Death"):
        temp = soup.find("h3", string = "Cause of Death")
    elif soup.find("h3", string = "Cause of death"):
        temp = soup.find("h3", string = "Cause of death")
    elif soup.find("h3", string = "Nature of Injuries / Treatment/Cause of Death"):
        temp = soup.find("h3", string = "Nature of Injuries / Treatment/Cause of Death")
    elif soup.find("h3", string = "Nature of injury/treatment"):
        temp = soup.find("h3", string = "Nature of injury/treatment")
    else:
        temp = soup.find("h2", string="Incident narrative")

    next_sibling = temp.next_sibling
    incident = [next_sibling]


    while next_sibling.next_sibling.name == "p":
        next_sibling = next_sibling.next_sibling
        incident.append(next_sibling.text)
    
    civilian_consequences = ""
    
    CLEANR = re.compile('<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});')
    cleantext = re.sub(CLEANR, '', str(incident))
    temp = cleantext.split('.')
    for i in temp:
        for j in keywords:
            if j in i:
                civilian_consequences = i
    
    if civilian_consequences == "":
        civilian_consequences = cleantext
    
    civilian_consequences=[i.strip(punctuation) for i in civilian_consequences.split()]
    civilian_consequences = " ".join(civilian_consequences)
    
    return civilian_consequences

In [44]:
print(find_civilian_consequences(soup)) # looks like this is non-existent for pre 2017 cases

AttributeError: 'NoneType' object has no attribute 'next_sibling'

In [22]:
# find consequences for officer
# personally, I can't find an area of these reports that mention consequences for the officers

In [45]:
def find_decision_date(soup):
    body = soup.find("div", class_="body-field")
    decision_date_element = body.contents[-3]
    decision_date = decision_date_element.string
    
    if decision_date is not None:
        decision_date = decision_date.replace("\xa0", " ")

In [95]:
def get_all_following_tags_till_next_header(tag):
    if tag.next_sibling is None:
        return
    
    results = []
    next_sibling = tag.next_sibling
    
    while next_sibling is not None and next_sibling.name not in ["h1", "h2", "h3"]:
        results.append(next_sibling.get_text())
        next_sibling = next_sibling.next_sibling
        
    return results
    

In [100]:
def find_investigation_outcome(soup):
    # investigation outcome
    # this is found in the director's analysis section/last paragraph of the report
    all_h2 = soup.find_all("h2")

    
    decision = None
    for h2 in all_h2:
        if "decision" in h2.get_text():
            decision = h2
            break

    if decision is None:
        return None
    
    text = decision.next_sibling.get_text()
    text.replace("\xa0", " ")
    
    return text

In [101]:
find_investigation_outcome(soup)

'In my view, there are no reasonable grounds to believe that the named subject officer, Subject Officer, committed a criminal offence in relation to the firearms fatality of Mr. Darren Burnside on May\xa010,\xa02010. The following are the facts as I understand them.'

In [26]:
# reason for police investigation
# not really sure about this one


In [102]:
results = []
for case in tqdm(cases):
    link_soup = BeautifulSoup(case, "html.parser")
    link = link_soup.find("a")['href']
    url = baseurl + link
    
    hdr = {'User-Agent': 'Mozilla/5.0'}
    req = Request(url,headers=hdr)
    page = urlopen(req)
    html = page.read().decode("utf-8")
    soup = BeautifulSoup(html, "html.parser")
    
    province = "Ontario"

    text = get_notification_of_the_siu_text(soup)
    
    year = find_year(text)
    police_service = find_police_service(text)
    
    # civilian_consequences = find_civilian_consequences(soup)
    
    city = find_city(text)
    
    investigation_outcome = find_investigation_outcome(soup)
    
    # results.append([year, police_service, city, province, civilian_consequences, investigation_outcome, url])
    results.append([year, police_service, city, province, investigation_outcome, url])
    
# results_df = pd.DataFrame(results, columns=['Year',  'Police Service', 'City', 'Province', 'Civilian Consequences', 'Investigation Final Decision', 'Link'])
results_df = pd.DataFrame(results, columns=['Year',  'Police Service', 'City', 'Province', 'Investigation Final Decision', 'Link'])  

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:04<00:00,  2.23it/s]


In [103]:
results_df

Unnamed: 0,Year,Police Service,City,Province,Investigation Final Decision,Link
0,2005,Peterborough Police Service,Toronto,Ontario,There are no reasonable grounds to believe tha...,https://www.ontario.ca/page/siu-directors-repo...
1,2015,Peterborough Police Service,Peterborough,Ontario,"There are no reasonable grounds, in my view, t...",https://www.ontario.ca/page/siu-directors-repo...
2,2010,Peterborough Police Service,,Ontario,"In my view, there are no reasonable grounds to...",https://www.ontario.ca/page/siu-directors-repo...
3,2010,Peterborough Police Service,Brantford,Ontario,"In my view, there are no reasonable grounds to...",https://www.ontario.ca/page/siu-directors-repo...
4,2011,Peterborough Police Service,Armstrong,Ontario,"In my view, there are no reasonable grounds to...",https://www.ontario.ca/page/siu-directors-repo...
5,2009,Peterborough Police Service,Toronto,Ontario,"In my view, there are no reasonable grounds to...",https://www.ontario.ca/page/siu-directors-repo...
6,2013,Peterborough Police Service,,Ontario,The investigation by this Unit has been comple...,https://www.ontario.ca/page/siu-directors-repo...
7,2015,Peterborough Police Service,St. Joseph,Ontario,,https://www.ontario.ca/page/siu-directors-repo...
8,2010,Peterborough Police Service,Ottawa,Ontario,,https://www.ontario.ca/page/siu-directors-repo...
9,2013,Peterborough Police Service,Toronto,Ontario,"In my view, there are no reasonable grounds to...",https://www.ontario.ca/page/siu-directors-repo...


In [104]:
results_df.to_csv("results_on_2005.csv", index=False)