In [1]:
%pip install pandas numpy beautifulsoup4 tqdm

Collecting pandas
  Downloading pandas-2.0.3-cp39-cp39-win_amd64.whl (10.8 MB)
     ---------------------------------------- 10.8/10.8 MB 6.2 MB/s eta 0:00:00
Collecting numpy
  Downloading numpy-1.25.2-cp39-cp39-win_amd64.whl (15.6 MB)
     ---------------------------------------- 15.6/15.6 MB 6.9 MB/s eta 0:00:00
Collecting tqdm
  Downloading tqdm-4.66.1-py3-none-any.whl (78 kB)
     ---------------------------------------- 78.3/78.3 KB 4.5 MB/s eta 0:00:00
Collecting pytz>=2020.1
  Downloading pytz-2023.3-py2.py3-none-any.whl (502 kB)
     -------------------------------------- 502.3/502.3 KB 7.8 MB/s eta 0:00:00
Collecting tzdata>=2022.1
  Downloading tzdata-2023.3-py2.py3-none-any.whl (341 kB)
     ------------------------------------- 341.8/341.8 KB 10.7 MB/s eta 0:00:00
Installing collected packages: pytz, tzdata, tqdm, numpy, pandas
Successfully installed numpy-1.25.2 pandas-2.0.3 pytz-2023.3 tqdm-4.66.1 tzdata-2023.3
Note: you may need to restart the kernel to use updated pack

You should consider upgrading via the 'C:\Users\rohan\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip' command.


In [2]:
# scraping the Ontario SIU Directors Reports from 2017 to 2018
# the reports to scrape are in an csv file in reports_to_scrape folder downloaded from their site

# import libraries
import pandas as pd
import numpy as np
from urllib.request import Request, urlopen
from bs4 import BeautifulSoup
import bs4
import re
from tqdm import tqdm

In [3]:
# import pages to scrape from the csv file
df = pd.read_csv("reports_to_scrape/on_siu_2018_2023_w_keywords.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,Case Number,Date Report Signed,Link Text,URLs
0,2,23-PVI-099,11-Jul-23,Read Full Text,https://www.siu.on.ca/en/directors_report_deta...
1,6,23-OCI-091,10-Jul-23,Read Full Text,https://www.siu.on.ca/en/directors_report_deta...
2,10,23-OCI-083,10-Jul-23,Read Full Text,https://www.siu.on.ca/en/directors_report_deta...
3,13,23-OVI-078,07-Jul-23,Read Full Text,https://www.siu.on.ca/en/directors_report_deta...
4,14,23-OCI-077,07-Jul-23,Read Full Text,https://www.siu.on.ca/en/directors_report_deta...


In [4]:
# get the link from the case number part of the df

case_urls = df["URLs"].tolist()

In [5]:
case_urls[1]

'https://www.siu.on.ca/en/directors_report_details.php?drid=2488'

In [6]:
# scrape the first directors report 23-PVI-099 	

hdr = {'User-Agent': 'Mozilla/5.0'}
req = Request(case_urls[0],headers=hdr)
page = urlopen(req)
html = page.read().decode("utf-8")
soup = BeautifulSoup(html, "html.parser")
print(soup.find("title"))

<title>Special Investigations Unit -- Director's Report Details, Case Number: 23-PVI-099</title>


In [7]:
# Information we are looking for:
# Province, city, police service, officer name, year, consequences for civilians (injuries), 
# consequences, if any, for officers (fine, dismissal, none, etc.), 
# investigation outcome (charge, acquittal complaint dismissal, etc.), and reason for police call. 

province = "Ontario"

# a lot of the information we might be looking for can be found in section 3 of the report
text = soup.find_all("h3")[5].next_sibling.get_text()
text

'On March 30, 2023, at 8:41 p.m., the '

In [8]:
def find_year(text):
    match = re.search(r"(20\d{2})", text)
    if match is not None:
        return match.group(1)


year = find_year(text)
print(year)

2023


In [9]:
# list from https://www.oacp.ca/en/about-us/ontario-police-organizations.aspx
ontario_police_services = [
    'Akwesasne Mohawk Police Service',
    'Anishinabek Police Service',
    'Aylmer Police Service',
    'Barrie Police Service',
    'Belleville Police Service',
    'Brantford Police Service',
    'Brockville Police Service',
    'Chatham Kent Police Service',
    'City of Kawartha Lakes Police Service',
    'Cobourg Police Service',
    'Cornwall Police Service',
    'Deep River Police Service',
    'Dryden Police Service',
    'Greater Sudbury Police Service',
    'Guelph Police Service',
    'Halton Regional Police Service',
    'Hamilton Police Service',
    'Hanover Police Service',
    'Kingston Police',
    'Lac Seul Police Service',
    'LaSalle Police Service',
    'London Police Service',
    'Niagara Parks Police',
    'Niagara Regional Police Service',
    'Nishnawbe-Aski Police Service',
    'North Bay Police Service',
    'OPP',
    'Ontario Provincial Police',
    'Owen Sound Police Service',
    'Ottawa Police Service',
    'Peterborough Police Service',
    'Peel Regional Police',
    'Port Hope Police Service',
    'Rama Police Service',
    'Sarnia Police Service',
    'RCMP',
    'Royal Canadian Mounted Police',
    'Sault Ste. Marie Police Service',
    'Saugeen Shores Police Service',
    'Smiths Falls Police Service',
    'Six Nations Police Service',
    'St. Thomas Police Service',
    'Strathroy-Caradoc Police Service',
    'South Simcoe Police Service',
    'Timmins Police Service',
    'Stratford Police Service',
    'Thunder Bay Police Service',
    'Treaty Three Police Service',
    'Toronto Police Service',
    'Waterloo Regional Police Service',
    'U.C.C.M. Anishnaabe Police',
    'Wikwemikong Tribal Police Service',
    'West Grey Police Service',
    'Woodstock Police Service',
    'Windsor Police Service',
    'York Regional Police'
]

In [10]:
def find_police_service(text):
    for service_name in ontario_police_services:
        if service_name in text:
            if service_name == "OPP":
                return "Ontario Provincial Police"
            elif service_name == "RCMP":
                return "Royal Canadian Mounted Police"
            return service_name

In [11]:
municipalities_df = pd.read_csv("mmah-list-of-ontario-municipalities-en-utf8-2022-10-05.csv")

def get_municipality_name(tag):
    a_soup = BeautifulSoup(tag, "html.parser")
    
    if isinstance(a_soup, bs4.element.Tag):
        full_name = a_soup.get_text()
    else:
        full_name = tag
            
    municipality_name = ", ".join(full_name.split(", ")[:-1])
    return municipality_name
    

list_of_municipalities_in_ontario = list(municipalities_df["Municipality"].apply(get_municipality_name))

list_of_municipalities_in_ontario = list(filter(lambda x: True if len(x) > 0 else False, list_of_municipalities_in_ontario))

In [12]:
list_of_municipalities_in_ontario

['Addington Highlands',
 'Adelaide-Metcalfe',
 'Adjala-Tosorontio',
 'Admaston/Bromley',
 'Ajax',
 'Alberton',
 'Alfred and Plantagenet',
 'Algonquin Highlands',
 'Alnwick/Haldimand',
 'Amaranth',
 'Amherstburg',
 'Armour',
 'Armstrong',
 'Arnprior',
 'Arran-Elderslie',
 'Ashfield-Colborne-Wawanosh',
 'Asphodel-Norwood',
 'Assiginack',
 'Athens',
 'Atikokan',
 'Augusta',
 'Aurora',
 'Aylmer',
 'Baldwin',
 'Bancroft',
 'Barrie',
 'Bayham',
 'Beckwith',
 'Belleville',
 'Billings',
 'Black River-Matheson',
 'Blandford-Blenheim',
 'Blind River',
 'Bluewater',
 'Bonfield',
 'Bonnechere Valley',
 'Bracebridge',
 'Bradford West Gwillimbury',
 'Brampton',
 'Brant',
 'Brantford',
 'Brethour',
 'Brighton',
 'Brock',
 'Brockton',
 'Brockville',
 'Brooke-Alvinston',
 'Bruce',
 'Bruce Mines',
 'Brudenell, Lyndoch and Raglan',
 'Burk’s Falls',
 'Burlington',
 'Burpee and Mills',
 'Caledon',
 'Callander',
 'Calvin',
 'Cambridge',
 'Carleton Place',
 'Carling',
 'Carlow/Mayo',
 'Casey',
 'Casselman',


In [13]:
def get_all_headings(soup):
    section_headers = soup.findAll(re.compile('^h[2-5]$'))
    
    def match_relevant_ids(section_header):
        header_id = section_header.get("id")
        
        if header_id == None:
            return False
        
        regex = re.compile('^section_[1-9]$')
        match = regex.search(section_header.get("id"))
        if match:
            return True
        else:
            return False
        
    section_headers = tuple(filter(match_relevant_ids, section_headers))
    
    return section_headers

In [14]:
get_all_headings(soup)

(<h2 class="mt-3" id="section_1">Mandate of the SIU</h2>,
 <h2 class="mt-3" id="section_2">Information Restrictions</h2>,
 <h2 class="mt-3" id="section_3">Mandate Engaged</h2>,
 <h2 class="mt-3" id="section_4">The Investigation</h2>,
 <h2 class="mt-3" id="section_5">Evidence</h2>,
 <h2 class="mt-3" id="section_6">Incident Narrative</h2>,
 <h2 class="mt-3" id="section_7">Relevant Legislation</h2>,
 <h2 class="mt-3" id="section_8">Analysis and Director's Decision</h2>)

In [15]:
def get_heading_text(heading):
    next_sibling = heading.next_sibling
    section_texts = []
    counter = 0
    
    while True:
        # if more than 20 elements have been checked without reaching next header, stop searching
        if counter > 20:
            break

        inner_text = next_sibling.get_text(strip=True)

        if len(inner_text) > 0:
            section_texts.append(inner_text)
            
        if next_sibling.next_sibling is None:
            break
        
        if next_sibling.next_sibling.name == "h2":
            break

        next_sibling = next_sibling.next_sibling
        counter += 1
    
    return '\n'.join(section_texts)

In [16]:
get_heading_text(get_all_headings(soup)[3])

'Notification of theSIU[1]On March 30, 2023, at 8:41 p.m., theOPPcontacted theSIUwith the following information.At 5:02 p.m., the Subject Official (SO) was off-duty and operating an unmarkedOPPcruiser eastbound on Highway 401 in Oxford County.  A vehicle dove past theSOat a high rate of speed, estimated to have been about 180 km/h.  TheSOactivated the emergency lighting of his cruiser and attempted to follow the speeding vehicle. After a brief period, he aborted the pursuit, pulled over to the shoulder, and contacted the Provincial Communication Centre (PCC), advising that he had disengaged from a vehicle that had failed to stop.  A short time later, the speeding vehicle was observed leaving Highway 401 at Exit 218 near Ingersoll, rolling multiple times and ending up in a ditch.  The driver was extricated by the fire department and taken to the London Health Sciences Centre Victoria Hospital with what was believed to be multiple fractures and a possible brain bleed.The TeamDate and tim

In [17]:
def get_all_text_in_report(soup):
    headings = get_all_headings(soup)
    
    all_texts = []
    
    for heading in headings:
        all_texts.append(get_heading_text(heading))
        
    return "\n".join(all_texts)

In [18]:
get_all_text_in_report(soup)



In [23]:
def find_section_text(soup, section_header_text = None):
    if(section_header_text):
        section_headers = soup.findAll(re.compile('^h[2-5]$'), string="section_header_text")

        if len(section_headers) == 0:
            raise Exception(f"Header tag with text '{section_header_text}' not found in document")

        section_header = section_headers[0]
    else:
        section_header = soup

    next_sibling = section_header.next_sibling

    section_texts = []
    counter = 0

    while True:
        # if more than 20 elements have been checked without reaching next header, stop searching
        if counter > 20:
            break

        inner_text = next_sibling.get_text(strip=True)

        if len(inner_text) > 0:
            section_texts.append(inner_text)

        if next_sibling.next_sibling is None:
            break

        if next_sibling.next_sibling.name in ['h1', 'h2', 'h3', 'h4']:
            break

        next_sibling = next_sibling.next_sibling
        counter += 1

    return ' '.join(section_texts)

In [24]:
find_section_text(soup, "Incident Narrative")

Exception: Header tag with text 'Incident Narrative' not found in document

In [25]:
def find_city(text):
    for municipality in list_of_municipalities_in_ontario:
        regex = r"\b(?=\w)" + re.escape(municipality) + r"\b(?!\w)"
        match = re.search(regex, text, re.IGNORECASE)
        if match:
            return municipality

In [26]:
find_city(get_heading_text(get_all_headings(soup)[3]))

'Ingersoll'

In [35]:
# officer name is not included in accordance with section 21 of FIPPA
# moving onto civilian consquences, this can be found in the text we have
# apparent its not always consistent where this info is
    
from string import punctuation

def find_civilian_consequences(soup):
    
    keywords = ['death', 'injury', 'fracture', 'diagnosed', 
                'diagnoses', 'dead', 'post-mortem', 'cause of death', 
                'injuries', 'CT scan', 'assessed', 'assessment', 'treated']
    
    try:
    
        if soup.find("h3", string = "Nature of Injuries / Treatment"):
            temp = soup.find("h3", string = "Nature of Injuries / Treatment")
        elif soup.find("h3", string = "Nature of Injury/Treatment"): 
            temp = soup.find("h3", string = "Nature of Injury/Treatment")
        elif soup.find("h3", string = "Cause of Death"):
            temp = soup.find("h3", string = "Cause of Death")
        elif soup.find("h3", string = "Cause of death"):
            temp = soup.find("h3", string = "Cause of death")
        elif soup.find("h3", string = "Nature of Injuries / Treatment/Cause of Death"):
            temp = soup.find("h3", string = "Nature of Injuries / Treatment/Cause of Death")
        elif soup.find("h3", string = "Nature of injury/treatment"):
            temp = soup.find("h3", string = "Nature of injury/treatment")
        else:
            temp = soup.find("h2", string="Incident Narrative")

        next_sibling = temp.next_sibling
        incident = [next_sibling]


        while next_sibling.next_sibling.name == "div":
            next_sibling = next_sibling.next_sibling
            incident.append(next_sibling.text)

        civilian_consequences = ""

        CLEANR = re.compile('<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});')
        cleantext = re.sub(CLEANR, '', str(incident))
        temp = cleantext.split('.')
        for i in temp:
            for j in keywords:
                if j in i:
                    civilian_consequences = i

        if civilian_consequences == "":
            civilian_consequences = cleantext

        civilian_consequences=[i.strip(punctuation) for i in civilian_consequences.split()]
        civilian_consequences = " ".join(civilian_consequences)
    
    except:
        print(soup.find("title"))
        civilian_consequences = "Error when extracting consequences."
        # I think it is because they use the Event Chronology title instead 
        # but I want to see how many use it before fixing it all over
    
    return civilian_consequences

In [36]:
print(find_civilian_consequences(soup))

He was taken to hospital with a fractured back and possibly other injuries


In [17]:
# find consequences for officer
# personally, I can't find an area of these reports that mention consequences for the officers

In [53]:
# investigation outcome
def find_investigation_outcome(soup):
    heading = get_all_headings(soup)[-1]
    next_sibling = heading.next_sibling

    counter = 0

    while counter < 5 and next_sibling.name != "div":
        next_sibling = next_sibling.next_sibling

    contents = [content.get_text(strip=True) for content in next_sibling.contents]
    contents[-1]

In [54]:
find_investigation_outcome(soup)

'The offence that arises for consideration isdangerous driving causing bodily harmcontrary to section 320.13(2) of theCriminal Code.  As an offence of penal negligence, a simple want of care will not suffice to give rise to liability.  Rather, the offence is predicated, in part, on conduct that amounts to a marked departure from the level of care that a reasonable person would have observed in the circumstances.  In the instant case, the issue is whether there was a want of care in the manner in which theSOoperated his vehicle, sufficiently egregious to attract criminal sanction, that caused or contributed to the collision.  In my view, there was not.There is no evidence of any want of care on the part of theSO.  Having observed the Mercedes-Benz pass him at dangerously high speeds, the officer was within his rights in trying to stop the vehicle to issue an offence notice.  He too accelerated to about 180 km/h to try and narrow the gap with the Mercedes-Benz but only for a short period

In [21]:
# reason for police investigation
# not really sure about this one


In [27]:
results = []
for case in tqdm(case_urls):
    
    hdr = {'User-Agent': 'Mozilla/5.0'}
    req = Request(case,headers=hdr)
    page = urlopen(req)
    html = page.read().decode("utf-8")
    soup = BeautifulSoup(html, "html.parser")
    
    province = "Ontario"
    
    year = find_year(soup)
    police_service = find_police_service(soup)
    
    civilian_consequences = find_civilian_consequences(soup)
    
    city = find_city(soup)
    
    investigation_outcome = find_investigation_outcome(soup)
    
    results.append([year, police_service, city, province, civilian_consequences, investigation_outcome, case])
    
results_df = pd.DataFrame(results, columns=['Year',  'Police Service', 'City', 'Province', 'Civilian Consequences', 'Investigation Final Decision', 'Link'])
    

  0%|                                                                                          | 0/697 [00:00<?, ?it/s]


TypeError: expected string or bytes-like object

In [70]:
results_df.head()

Unnamed: 0,Year,Police Service,City,Province,Civilian Consequences,Investigation Final Decision,Link
0,,,Ingersoll,Ontario,He was taken to hospital with a fractured back...,The offence that arises for consideration isda...,https://www.siu.on.ca/en/directors_report_deta...
1,,Kingston Police,Kingston,Ontario,The Complainant was transported to hospital in...,,https://www.siu.on.ca/en/directors_report_deta...
2,,London Police Service,London,Ontario,Following his arrest the Complainant was taken...,,https://www.siu.on.ca/en/directors_report_deta...
3,,Peel Regional Police,Brampton,Ontario,His wife and son were fortunate to have escape...,,https://www.siu.on.ca/en/directors_report_deta...
4,,Niagara Regional Police Service,Niagara,Ontario,The Complainant was taken into custody and tra...,,https://www.siu.on.ca/en/directors_report_deta...


In [71]:
results_df.to_csv("results_on_2018_2023.csv", index=False)