# Checking occurrence numbers

Because a single occurrence ID can have several rows of associated data, we check how often this happens.

In [1]:
# Dependencies
import pandas as pd
from pathlib import Path

In [2]:
# Read in datasets and merge them into a single df.
# Each year's frame is collected in a list and concatenated once at the end:
# concatenating onto a growing (initially empty) frame inside the loop re-copies
# all previously loaded rows on every pass and relies on pandas' deprecated
# handling of empty frames in concat.
yearly_frames = []
total_rows = 0
# Loop through years
for year in ['2018', '2020', '2022']:
    # Read in year data
    year_data = pd.read_csv(Path('resources', f'mci_{year}.csv'))
    yearly_frames.append(year_data)
    total_rows += len(year_data)
    # Log number of rows for this year and the running total across years
    print(f"After reading in {len(year_data)} rows of data from year {year}, the total number of rows is: {total_rows}")

# ignore_index=True gives the merged frame unique row labels (0..n-1) instead of
# repeating each file's own 0-based labels.
df = pd.concat(yearly_frames, ignore_index=True)

# The merged frame must contain exactly the rows that were logged above.
assert len(df) == total_rows

After reading in 37490 rows of data from year 2018, the total number of rows is: 37490
After reading in 35065 rows of data from year 2020, the total number of rows is: 72555
After reading in 40226 rows of data from year 2022, the total number of rows is: 112781


In [3]:
# Preview the first five rows of the merged frame
df.head(n=5)

Unnamed: 0,_id,EVENT_UNIQUE_ID,REPORT_DATE,OCC_DATE,REPORT_YEAR,REPORT_MONTH,REPORT_DAY,REPORT_DOY,REPORT_DOW,REPORT_HOUR,...,LOCATION_TYPE,PREMISES_TYPE,UCR_CODE,UCR_EXT,OFFENCE,MCI_CATEGORY,HOOD_158,NEIGHBOURHOOD_158,HOOD_140,NEIGHBOURHOOD_140
0,134022,GO-20181532,2018-01-01,2018-01-01,2018,January,1,1,Monday,4.0,...,Other Commercial / Corporate Places (For Profi...,Commercial,1430,100,Assault,Assault,168,Downtown Yonge East,75,Church-Yonge Corridor (75)
1,134023,GO-20184352,2018-01-01,2018-01-01,2018,January,1,1,Monday,18.0,...,"Apartment (Rooming House, Condo)",Apartment,1430,100,Assault,Assault,150,Fenside-Parkwoods,45,Parkwoods-Donalda (45)
2,134024,GO-20184878,2018-01-01,2018-01-01,2018,January,1,1,Monday,21.0,...,"Apartment (Rooming House, Condo)",Apartment,1420,100,Assault With Weapon,Assault,106,Humewood-Cedarvale,106,Humewood-Cedarvale (106)
3,134025,GO-20183609,2018-01-01,2018-01-01,2018,January,1,1,Monday,15.0,...,"Streets, Roads, Highways (Bicycle Path, Privat...",Outside,2130,210,Theft Over,Theft Over,95,Annex,95,Annex (95)
4,134026,GO-20184674,2018-01-01,2018-01-01,2018,January,1,1,Monday,20.0,...,Convenience Stores,Commercial,1610,210,Robbery - Business,Robbery,123,Cliffcrest,123,Cliffcrest (123)


In [25]:
# Number of rows recorded for each event id, largest counts first
occ_count = df[['EVENT_UNIQUE_ID']].value_counts(ascending=False)

# Number of event ids that appear on more than one row
n_repeats = int((occ_count > 1).sum())

In [26]:
# Count rows for each (event, offence) combination, largest counts first
event_offence_cols = ['EVENT_UNIQUE_ID', 'OFFENCE']
occ_count_offence = df.value_counts(subset=event_offence_cols, ascending=False)

# Show the 20 combinations with the most rows
occ_count_offence.head(20)


EVENT_UNIQUE_ID  OFFENCE                      
GO-20222426282   Theft Of Motor Vehicle           20
GO-20222160871   Robbery With Weapon              12
GO-20201148574   Theft Of Motor Vehicle           11
GO-20182281596   Assault                          10
GO-2018778601    Assault                          10
GO-20201296123   Theft Of Motor Vehicle           10
GO-20222227053   Assault Peace Officer Wpn/Cbh     9
GO-20201494819   Assault With Weapon               9
GO-2022248788    B&E                               9
GO-20201641497   Theft Of Motor Vehicle            8
GO-20201810011   Discharge Firearm With Intent     8
GO-20221126212   Assault                           8
GO-20181809539   Assault                           8
GO-2022322288    Robbery - Business                8
GO-20201630856   Assault Peace Officer             8
GO-20221044736   Assault With Weapon               8
GO-2018752060    Assault Peace Officer             8
GO-2020403614    Assault                           8

In [28]:
# Count how many (event, offence) combinations have more than 1 row
n_repeats_offence = int((occ_count_offence > 1).sum())

# Difference between the number of repeated events and the number of
# repeated (event, offence) combinations
n_repeats - n_repeats_offence

4252

Conclusion: We decided to drop the rows where the same offence was reported as part of the same event, and keep the events that have different offences as separate.

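Method: We will drop every row that repeats an (event ID, offence) combination already present in the dataset, keeping the first row of each combination. Rows that share an event ID but have different offences are not affected, so those events keep one row per offence.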

In [30]:
# Keep only the first row of every (event, offence) combination; an event that
# has several different offences keeps one row per offence.
key_cols = ['EVENT_UNIQUE_ID', 'OFFENCE']
no_duplicates_df = df.drop_duplicates(subset=key_cols)

# Fail loudly if any (event, offence) combination still appears more than once,
# instead of relying only on reading the truncated counts displayed below.
assert not no_duplicates_df.duplicated(subset=key_cols).any(), \
    "duplicate (event, offence) rows remain after drop_duplicates"

# confirm that there are no duplicates of the same event and offence
no_duplicates_df[key_cols].value_counts(ascending=False)


EVENT_UNIQUE_ID  OFFENCE                       
GO-20181000120   Assault - Resist/ Prevent Seiz    1
GO-20221166225   B&E W'Intent                      1
GO-20221166777   Assault Bodily Harm               1
GO-20221166766   Assault                           1
GO-20221166695   Theft Over                        1
                                                  ..
GO-2020102257    Robbery - Mugging                 1
GO-20201022550   Assault Bodily Harm               1
GO-20201022513   Theft Over                        1
GO-2020102247    Theft Of Motor Vehicle            1
GO-2022999907    Assault                           1
Length: 104454, dtype: int64

In [32]:
# Write the de-duplicated data to CSV, leaving out the DataFrame index
output_path = Path('resources', 'mci_no_duplicates_2018_20_22.csv')
no_duplicates_df.to_csv(output_path, index=False)