In [1]:
# scraping the Ontario SIU Director's Reports from 2018 to 2023
# the reports to scrape are listed in a CSV file in the reports_to_scrape folder, downloaded from the SIU website

# import libraries
import pandas as pd
import numpy as np
from urllib.request import Request, urlopen
from bs4 import BeautifulSoup
import bs4
import re
from tqdm import tqdm

In [2]:
def get_inner_text(tag):
    """Return the concatenated text of ``tag`` and all of its descendants.

    Parameters
    ----------
    tag : str or tag-like object
        Either a text node (any ``str`` instance, which includes
        ``bs4.element.NavigableString``) or an object whose ``contents``
        attribute lists its child nodes.

    Returns
    -------
    str
        A text node is returned unchanged. For a tag, the text of every
        descendant is joined in document order; a tag without children
        yields ``""``.
    """
    # NavigableString is a str subclass, so this check covers bs4 text nodes
    # as well as plain Python strings.
    if isinstance(tag, str):
        return tag

    # An empty tag used to yield None here, which broke callers that hand the
    # result straight to re.search() or .split(); joining nothing gives "".
    return "".join(get_inner_text(child) for child in tag.contents)

In [3]:
# Load the list of director's reports to scrape (one row per case, including its URL).
reports_csv = "reports_to_scrape/on_siu_2018_2023_w_keywords.csv"
df = pd.read_csv(reports_csv)
df.head()

Unnamed: 0.1,Unnamed: 0,Case Number,Date Report Signed,Link Text,URLs
0,2,23-PVI-099,11-Jul-23,Read Full Text,https://www.siu.on.ca/en/directors_report_deta...
1,6,23-OCI-091,10-Jul-23,Read Full Text,https://www.siu.on.ca/en/directors_report_deta...
2,10,23-OCI-083,10-Jul-23,Read Full Text,https://www.siu.on.ca/en/directors_report_deta...
3,13,23-OVI-078,07-Jul-23,Read Full Text,https://www.siu.on.ca/en/directors_report_deta...
4,14,23-OCI-077,07-Jul-23,Read Full Text,https://www.siu.on.ca/en/directors_report_deta...


In [4]:
# Collect the report URL of every case listed in the CSV (the "URLs" column).

url_column = df["URLs"]
case_urls = url_column.to_list()

In [5]:
# Fetch and parse the first director's report (23-PVI-099) as a worked example;
# the extractor functions below are tried out against this `soup`.

hdr = {'User-Agent': 'Mozilla/5.0'}  # sent with every request as the User-Agent header
req = Request(case_urls[0], headers=hdr)
# Read the body inside a context manager so the HTTP response is always closed.
with urlopen(req) as page:
    html = page.read().decode("utf-8")
soup = BeautifulSoup(html, "html.parser")
print(soup.find("title"))

<title>Special Investigations Unit -- Director's Report Details, Case Number: 23-PVI-099</title>


In [6]:
def find_investigation_section(soup):
    """Collect the nodes that follow the "The Investigation" heading.

    Starting at the node immediately after the ``<h2>`` whose string is
    exactly "The Investigation", following siblings are added for as long as
    they are ``<div>`` elements.

    Parameters
    ----------
    soup : parsed report (anything offering ``find("h2", string=...)``)

    Returns
    -------
    list or str
        The collected nodes, or the string
        "Error finding Investigation Section" when the heading is missing or
        nothing follows it. Callers compare against that exact string.
    """
    try:
        investigation_title = soup.find("h2", string="The Investigation")

        # Raises AttributeError (handled below) when the heading was not found.
        current = investigation_title.next_sibling
        notification = [current]

        while True:
            following = current.next_sibling
            # Stop at the first non-<div> sibling, or at the end of the
            # sibling chain. The end-of-chain case used to raise on
            # `None.name` and throw away everything collected so far.
            if following is None or following.name != "div":
                break
            current = following
            notification.append(current)

        return notification
    except Exception:
        # Still best-effort, but no longer swallows KeyboardInterrupt/SystemExit.
        return "Error finding Investigation Section"

In [7]:
# Information we are looking for:
# province, city, police service, officer name, year, consequences for civilians (injuries),
# consequences, if any, for officers (fine, dismissal, none, etc.),
# investigation outcome (charge, acquittal, complaint dismissal, etc.), and reason for police call.

province = "Ontario"

# A lot of the information we might be looking for can be found in section 3 of the report.
# Peek at the node that follows the sixth <h3> heading of the example report.
h3_headings = soup.find_all("h3")
text = h3_headings[5].next_sibling.get_text()
text

'On March 30, 2023, at 8:41 p.m., the '

In [8]:
def find_year(soup):
    """Return the first 20xx year found in the "The Investigation" section.

    The section is stringified and searched for the first run of four
    characters matching ``20\\d{2}``.

    Returns
    -------
    str or None
        The matched year as a string, ``None`` when the section contains no
        such match, or "Error finding Investigation Section" when the section
        itself could not be located.
    """
    section_markup = str(find_investigation_section(soup))

    # Pass the section-lookup error message straight through to the caller.
    if section_markup == "Error finding Investigation Section":
        return section_markup

    year_match = re.search(r"(20\d{2})", section_markup)
    return year_match.group(1) if year_match is not None else None

# Try the year extractor on the example report parsed earlier.
year = find_year(soup)
print(year)

2023


In [9]:
# Names of Ontario police services that find_police_service() looks for in a report.
# list from https://www.oacp.ca/en/about-us/ontario-police-organizations.aspx
# Matching is a plain substring test and the first hit in list order is returned,
# so the order of entries matters if a report mentions more than one service.
ontario_police_services = [
    'Akwesasne Mohawk Police Service',
    'Anishinabek Police Service',
    'Aylmer Police Service',
    'Barrie Police Service',
    'Belleville Police Service',
    'Brantford Police Service',
    'Brockville Police Service',
    'Chatham Kent Police Service',
    'City of Kawartha Lakes Police Service',
    'Cobourg Police Service',
    'Cornwall Police Service',
    'Deep River Police Service',
    'Durham Regional Police Service',
    'Dryden Police Service',
    'Greater Sudbury Police Service',
    'Guelph Police Service',
    'Halton Regional Police Service',
    'Hamilton Police Service',
    'Hanover Police Service',
    'Kingston Police',
    'Lac Seul Police Service',
    'LaSalle Police Service',
    'London Police Service',
    'Niagara Parks Police',
    'Niagara Regional Police Service',
    'Nishnawbe-Aski Police Service',
    'North Bay Police Service',
    'Ontario Provincial Police',
    'Owen Sound Police Service',
    'Ottawa Police Service',
    'Peterborough Police Service',
    'Peel Regional Police',
    'Port Hope Police Service',
    'Rama Police Service',
    'Sarnia Police Service',
    'Royal Canadian Mounted Police',
    'Sault Ste. Marie Police Service',
    'Saugeen Shores Police Service',
    'Smiths Falls Police Service',
    'Six Nations Police Service',
    'St. Thomas Police Service',
    'Strathroy-Caradoc Police Service',
    'South Simcoe Police Service',
    'Timmins Police Service',
    'Stratford Police Service',
    'Thunder Bay Police Service',
    'Treaty Three Police Service',
    'Toronto Police Service',
    'Waterloo Regional Police Service',
    'U.C.C.M. Anishnaabe Police',
    'Wikwemikong Tribal Police Service',
    'West Grey Police Service',
    'Woodstock Police Service',
    'Windsor Police Service',
    'York Regional Police'
]

In [10]:
def find_police_service(soup):
    """Return the first known police service named in the "The Investigation" section.

    The section is stringified and each entry of ``ontario_police_services``
    is tested, in list order, as a plain substring.

    Returns
    -------
    str or None
        The first matching service name, ``None`` when no entry occurs in the
        section, or "Error finding Investigation Section" when the section
        itself could not be located.
    """
    section_markup = str(find_investigation_section(soup))

    # Pass the section-lookup error message straight through to the caller.
    if section_markup == "Error finding Investigation Section":
        return section_markup

    return next(
        (name for name in ontario_police_services if name in section_markup),
        None,
    )

In [11]:
# Try the police-service extractor on the example report.
police = find_police_service(soup)
print(police)
#print(text)

Ontario Provincial Police


In [12]:
# Load the list of Ontario municipalities; its "Municipality" column is used below.
municipalities_df = pd.read_csv("mmah-list-of-ontario-municipalities-en-utf8-2022-10-05.csv")

def get_municipality_name(tag):
    """Return the bare municipality name from one "Municipality" cell.

    The cell is parsed as HTML so that any markup is dropped, and everything
    from the first ``", "`` onward is discarded.

    Parameters
    ----------
    tag : str
        Raw cell content (plain text or an HTML fragment).

    Returns
    -------
    str
        The text before the first ``", "``; ``""`` when the cell has no text.
    """
    a_soup = BeautifulSoup(tag, "html.parser")

    # A BeautifulSoup object is always a bs4 Tag, so the previous isinstance
    # check always passed and its fallback branch could never run.
    # `or ""` covers a cell without any text, for which get_inner_text may
    # return None; that case used to crash on `None.split`.
    full_name = get_inner_text(a_soup) or ""

    return full_name.split(", ")[0]
    

# Clean every "Municipality" cell down to the bare municipality name.
list_of_municipalities_in_ontario = [
    get_municipality_name(cell) for cell in municipalities_df["Municipality"]
]

In [13]:
def find_incident_narrative(soup=None):
    """Collect the nodes that follow the report's narrative heading.

    The first ``<h2>`` found among "Incident Narrative", "Incident narrative",
    "Event Chronology" and "The Investigation" (tried in that order) is used
    as the anchor. The node right after it is collected, followed by further
    siblings for as long as they are ``<div>`` elements.

    Parameters
    ----------
    soup : parsed report, optional
        When omitted, the notebook-level ``soup`` variable is used, which is
        what existing zero-argument calls rely on.

    Returns
    -------
    list or str
        The collected nodes, or the string
        "Issue with finding Incident Narrative" (also printed) when no heading
        is found or nothing follows it.
    """
    if soup is None:
        # Backward compatibility for callers that pass nothing.
        soup = globals().get("soup")

    try:
        incident_narrative_title = None
        # The "Incident narrative" variant used to look up the capitalised
        # heading again, so reports with that spelling always failed.
        for heading in ("Incident Narrative", "Incident narrative",
                        "Event Chronology", "The Investigation"):
            incident_narrative_title = soup.find("h2", string=heading)
            if incident_narrative_title is not None:
                break

        # Raises AttributeError (handled below) when no heading was found.
        current = incident_narrative_title.next_sibling
        incident_narrative = [current]

        while True:
            following = current.next_sibling
            # Stop at the first non-<div> sibling or at the end of the chain;
            # the latter used to raise and discard the collected nodes.
            if following is None or following.name != "div":
                break
            current = following
            incident_narrative.append(current)

        return incident_narrative
    except Exception:
        # Still best-effort, but no longer swallows KeyboardInterrupt/SystemExit.
        print("Issue with finding Incident Narrative")
        return "Issue with finding Incident Narrative"

In [14]:
def find_city(soup):
    """Return the first Ontario municipality mentioned in the incident narrative.

    Narrative nodes are scanned in order; for each one every entry of
    ``list_of_municipalities_in_ontario`` is tried, in list order, as a
    case-insensitive whole-word match.

    NOTE: ``soup`` is not used directly here. find_incident_narrative() is
    called without arguments, so the notebook-level ``soup`` must be the
    report being processed.

    Returns
    -------
    str or None
        The matching municipality name, ``None`` when the narrative was found
        but names no known municipality, or "Municipality not found" when the
        narrative itself could not be located.
    """
    incident_narratives = find_incident_narrative()
    if incident_narratives == "Issue with finding Incident Narrative":
        return "Municipality not found"

    for p in incident_narratives:
        # Extract the text once per node instead of once per municipality.
        # A node without text may yield None, which re.search() would reject.
        paragraph_text = get_inner_text(p) or ""
        if not paragraph_text:
            continue

        for municipality in list_of_municipalities_in_ontario:
            regex = r"\b(?=\w)" + re.escape(municipality) + r"\b(?!\w)"
            if re.search(regex, paragraph_text, re.IGNORECASE):
                return municipality

    return None

In [15]:
# Try the city extractor on the example report (no output is shown when it returns None).
find_city(soup)

In [16]:
# officer name is not included in accordance with section 21 of FIPPA
# moving on to civilian consequences, this can be found in the text we have
# apparently it's not always consistent where this info is
    
from string import punctuation

def find_civilian_consequences(soup):
    """Extract a one-sentence description of what happened to the civilian.

    The text under the first matching injury / cause-of-death ``<h3>`` heading
    is used, falling back to the "Incident Narrative" ``<h2>``. That text is
    split on "." and the last sentence containing any keyword is kept; when
    no sentence matches, the whole text is used. Leading and trailing
    punctuation is stripped from every word of the result.

    Parameters
    ----------
    soup : parsed report (anything offering ``find(name, string=...)``)

    Returns
    -------
    str
        The cleaned sentence (or whole text), or
        "Error when extracting consequences." when no heading is found or
        nothing follows it; in that case the report's <title> is printed.
    """
    keywords = ['death', 'injury', 'fracture', 'diagnosed',
                'diagnoses', 'dead', 'post-mortem', 'cause of death',
                'injuries', 'CT scan', 'assessed', 'assessment', 'treated']

    # <h3> headings tried in this order; each is now looked up only once.
    section_headings = (
        "Nature of Injuries / Treatment",
        "Nature of Injury/Treatment",
        "Cause of Death",
        "Cause of death",
        "Nature of Injuries / Treatment/Cause of Death",
        "Nature of injury/treatment",
    )

    try:
        heading = None
        for title in section_headings:
            heading = soup.find("h3", string=title)
            if heading is not None:
                break
        if heading is None:
            heading = soup.find("h2", string="Incident Narrative")

        # Raises AttributeError (handled below) when no heading was found.
        node = heading.next_sibling
        # Collect plain text for every node. The previous version stringified
        # a list that mixed a node with strings, which leaked list brackets,
        # quotes and escape sequences such as "\\n" into the text.
        pieces = [node.get_text() if hasattr(node, "get_text") else str(node)]

        while True:
            following = node.next_sibling
            # Stop at the first non-<div> sibling or at the end of the chain;
            # the latter used to raise and turn a usable section into an error.
            if following is None or following.name != "div":
                break
            node = following
            pieces.append(node.text)

        # The pieces are already markup-free, so no tag/entity stripping is needed.
        cleantext = " ".join(pieces)

        civilian_consequences = ""
        for sentence in cleantext.split('.'):
            # No early exit on purpose: the last matching sentence wins.
            if any(keyword in sentence for keyword in keywords):
                civilian_consequences = sentence

        if civilian_consequences == "":
            civilian_consequences = cleantext

        civilian_consequences = " ".join(
            word.strip(punctuation) for word in civilian_consequences.split()
        )

    except Exception:
        # Still best-effort, but no longer swallows KeyboardInterrupt/SystemExit.
        print(soup.find("title"))
        civilian_consequences = "Error when extracting consequences."
        # I think it is because they use the Event Chronology title instead
        # but I want to see how many use it before fixing it all over

    return civilian_consequences

In [17]:
# Try the civilian-consequences extractor on the example report.
print(find_civilian_consequences(soup))

He was taken to hospital with a fractured back and possibly other injuries


In [18]:
# find consequences for officer
# personally, I can't find an area of these reports that mention consequences for the officers

In [19]:
def find_decision_date(soup):
    """Return the string of the third-from-last node of the report body.

    The body is the first ``<div class="body-field">``; non-breaking spaces
    in the extracted string are replaced with ordinary spaces.
    NOTE(review): presumably the third-from-last node is where reports place
    the decision date line; confirm against a few reports.

    Returns
    -------
    str or None
        The cleaned string, or ``None`` when the body is missing, has fewer
        than three child nodes, or the selected node has no single string.
    """
    body = soup.find("div", class_="body-field")
    if body is None or len(body.contents) < 3:
        return None

    decision_date = body.contents[-3].string

    if decision_date is not None:
        decision_date = decision_date.replace("\xa0", " ")

    # The value was previously computed but never returned.
    return decision_date

In [20]:
def find_investigation_outcome(soup):
    """Extract the sentence stating the director's decision.

    The ``<h2>`` containing "Mandate of the" is located and the ninth node
    from the end of its parent is used as the decision paragraph.
    NOTE(review): the -9 offset is tied to the report layout; confirm it
    holds for all reports.
    Within that paragraph the last sentence containing "basis" or
    "reasonable grounds" is kept (the whole paragraph when none matches), and
    leading and trailing punctuation is stripped from every word.

    Returns
    -------
    str or None
        The cleaned decision sentence, or ``None`` when the heading is missing
        or its parent has fewer than nine child nodes.
    """
    # investigation outcome
    # this is found in the director's analysis section/last paragraph of the report
    mandate = None
    for h2 in soup.find_all("h2"):
        # `string=` replaces the deprecated `text=` argument and matches the
        # other lookups in this notebook.
        if h2.find(string=re.compile("Mandate of the")):
            mandate = h2
            break

    if mandate is None:
        return None

    siblings = mandate.parent.contents
    if len(siblings) < 9:
        # Previously an IndexError that aborted the whole scraping loop.
        return None

    decision_paragraph = siblings[-9]
    # `or ""` keeps a paragraph without text from turning into the literal
    # word "None" once it is stringified below.
    decision_final_paragraph_text = get_inner_text(decision_paragraph) or ""

    keywords = ['basis', 'reasonable grounds']

    decision = ""

    CLEANR = re.compile('<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});')
    cleantext = re.sub(CLEANR, '', str(decision_final_paragraph_text))
    for sentence in cleantext.split('.'):
        # No early exit on purpose: the last matching sentence wins.
        if any(keyword in sentence for keyword in keywords):
            decision = sentence

    if decision == "":
        decision = cleantext

    decision = " ".join(word.strip(punctuation) for word in decision.split())

    return decision

In [21]:

# Try the outcome extractor on the example report.
find_investigation_outcome(soup)

'As such there is no basis for proceeding with criminal charges in this case'

In [22]:
# reason for police investigation
# not really sure about this one


In [23]:
# Scrape every report and collect one row of extracted fields per case.

# Loop-invariant values, built once instead of on every iteration.
hdr = {'User-Agent': 'Mozilla/5.0'}
province = "Ontario"

results = []
for case in tqdm(case_urls):

    req = Request(case, headers=hdr)
    # Read the body inside a context manager so each HTTP response is closed.
    with urlopen(req) as page:
        html = page.read().decode("utf-8")
    # NOTE: this must stay a notebook-level variable named `soup`:
    # find_city() calls find_incident_narrative() without arguments, and that
    # lookup depends on the global `soup`.
    soup = BeautifulSoup(html, "html.parser")

    year = find_year(soup)
    police_service = find_police_service(soup)

    civilian_consequences = find_civilian_consequences(soup)

    city = find_city(soup)

    investigation_outcome = find_investigation_outcome(soup)

    results.append([year, police_service, city, province, civilian_consequences, investigation_outcome, case])

results_df = pd.DataFrame(results, columns=['Year',  'Police Service', 'City', 'Province', 'Civilian Consequences', 'Investigation Final Decision', 'Link'])
    

 37%|██████████████████████████▏                                            | 257/697 [01:05<02:11,  3.34it/s]

<title>Special Investigations Unit -- Director's Report Details, Case Number: 20-TCD-124</title>


 67%|███████████████████████████████████████████████▎                       | 464/697 [01:56<00:56,  4.11it/s]

<title>Special Investigations Unit -- Director's Report Details, Case Number: 19-TCI-073a</title>


 74%|████████████████████████████████████████████████████▋                  | 517/697 [02:09<00:41,  4.32it/s]

<title>Special Investigations Unit -- Director's Report Details, Case Number: 18-OCI-342</title>
Issue with finding Incident Narrative


100%|███████████████████████████████████████████████████████████████████████| 697/697 [02:53<00:00,  4.02it/s]


In [24]:
# Preview the first rows of the scraped results.
results_df.head()

Unnamed: 0,Year,Police Service,City,Province,Civilian Consequences,Investigation Final Decision,Link
0,2023,Ontario Provincial Police,,Ontario,He was taken to hospital with a fractured back...,As such there is no basis for proceeding with ...,https://www.siu.on.ca/en/directors_report_deta...
1,2023,Kingston Police,Kingston,Ontario,The Complainant was transported to hospital in...,In the result as there are no reasonable groun...,https://www.siu.on.ca/en/directors_report_deta...
2,2023,London Police Service,London,Ontario,Following his arrest the Complainant was taken...,In the result whether or not the Complainant’s...,https://www.siu.on.ca/en/directors_report_deta...
3,2023,Peel Regional Police,,Ontario,His wife and son were fortunate to have escape...,Be that as it may as there are no reasonable g...,https://www.siu.on.ca/en/directors_report_deta...
4,2023,Niagara Regional Police Service,Niagara,Ontario,The Complainant was taken into custody and tra...,4 As such there is no basis for proceeding wit...,https://www.siu.on.ca/en/directors_report_deta...


In [25]:
# Save the scraped results to CSV, without the DataFrame index column.
results_df.to_csv("results_on_2018_2023.csv", index=False)