In [None]:
import pandas as pd
import os
import tqdm as tq
import string
import statistics as stats
import re
# show every column when a wide dataframe is rendered (None removes the column limit)
pd.options.display.max_columns = None

In [None]:
# The data from 2010 and 2012 only has "street" files. The "stop-and-search" files start in 2015.
# (by "file type" I mean either 'street', 'outcomes' or 'stop-and-search' report files)

# input here the (complete) path to the folder from which you want to extract the files
directory = "/work/Jan_2010_Oct_2021"

# into this list, input the names of the folders ('yyyy-mm') from which you want to extract;
# expect runtime limitations if you try to do that for the whole dataset
folders = ['2020-09']

# one list of per-file frames for each file type; they are concatenated once after the
# loop, because DataFrame.append no longer exists in pandas >= 2.0 and growing a frame
# file by file re-copies everything read so far
frames_street = []
frames_outcome = []
frames_sas = []

for folder in folders:
    # path to the monthly folder, kept as a plain string so that os.listdir returns
    # plain string names (no bytes round trip, no character stripping of the names)
    folder_direc = os.path.join(directory, str(folder))

    # file names of each type found in this folder
    files_street = []
    files_outcome = []
    files_sas = []

    # sort each file name into the appropriate list; 'outcomes' is tested first,
    # exactly as before, and names matching none of the three markers are ignored
    for file in tq.tqdm(os.listdir(folder_direc)):
        if 'outcomes' in file:
            files_outcome.append(file)
        elif 'street' in file:
            files_street.append(file)
        elif 'stop-and-search' in file:
            files_sas.append(file)

    # read every file of each type, in directory-listing order
    frames_street.extend(pd.read_csv(os.path.join(folder_direc, name)) for name in files_street)
    frames_outcome.extend(pd.read_csv(os.path.join(folder_direc, name)) for name in files_outcome)
    frames_sas.extend(pd.read_csv(os.path.join(folder_direc, name)) for name in files_sas)

# data frames populated with the data from all files of the given type; each file keeps
# its own row labels (no ignore_index), and a type without any file gives an empty frame
# (pd.concat raises on an empty list)
df_street = pd.concat(frames_street) if frames_street else pd.DataFrame()
df_outcome = pd.concat(frames_outcome) if frames_outcome else pd.DataFrame()
df_sas = pd.concat(frames_sas) if frames_sas else pd.DataFrame()

100%|██████████| 120/120 [00:00<00:00, 93692.57it/s]


In [None]:
# Share of 'street' rows whose "LSOA code" is missing ("Crime type" is checked separately).
# Around 6% of the LSOA code entries are missing - acceptable.
int(df_street['LSOA code'].isna().sum()) / len(df_street)

0.05923641810232434

In [None]:
# Share of 'street' rows whose "Crime type" is missing.
# Crime type is filled out everywhere.
int(df_street['Crime type'].isna().sum()) / len(df_street)

0.0

In [None]:
# distinct values of the attribute; put any other attribute name here to see what values it may take
df_street.loc[:, 'Crime type'].unique()

array(['Anti-social behaviour', 'Drugs', 'Other theft', 'Public order',
       'Violence and sexual offences', 'Other crime',
       'Criminal damage and arson', 'Vehicle crime', 'Burglary',
       'Bicycle theft', 'Possession of weapons', 'Shoplifting',
       'Theft from the person', 'Robbery'], dtype=object)

In [None]:
# display the combined 'street' frame (the rendering elides the middle rows)
df_street

Unnamed: 0,Crime ID,Month,Reported by,Falls within,Longitude,Latitude,Location,LSOA code,LSOA name,Crime type,Last outcome category,Context
0,,2020-09,Dyfed-Powys Police,Dyfed-Powys Police,-4.264037,51.997506,On or near Maes Cader,W01000685,Carmarthenshire 001A,Anti-social behaviour,,
1,b267eefa1d1eb337b621de90d5760d92c363cd412fce00...,2020-09,Dyfed-Powys Police,Dyfed-Powys Police,-4.266697,51.999118,On or near Maes Derwenydd,W01000685,Carmarthenshire 001A,Drugs,Court result unavailable,
2,210397977f9e987763464cacb7b13e20f6826b3e86e8f2...,2020-09,Dyfed-Powys Police,Dyfed-Powys Police,-4.266653,51.994506,On or near Rhandir Wen,W01000685,Carmarthenshire 001A,Other theft,Investigation complete; no suspect identified,
3,b9793a8f607a449f7bebea470fcbae12566ef82cd528a9...,2020-09,Dyfed-Powys Police,Dyfed-Powys Police,-4.266653,51.994506,On or near Rhandir Wen,W01000685,Carmarthenshire 001A,Other theft,Investigation complete; no suspect identified,
4,eeab3387f73521f608d64c6502cffdf1f7c8e256698533...,2020-09,Dyfed-Powys Police,Dyfed-Powys Police,-4.264037,51.997506,On or near Maes Cader,W01000685,Carmarthenshire 001A,Public order,Unable to prosecute suspect,
...,...,...,...,...,...,...,...,...,...,...,...,...
12424,517600d43845c4c6712482951c62ce87cd3bd985867099...,2020-09,South Wales Police,South Wales Police,,,No Location,,,Violence and sexual offences,Offender given a caution,
12425,51f37f650d4cb8c43973aea53967510b156f10ae9652ec...,2020-09,South Wales Police,South Wales Police,,,No Location,,,Violence and sexual offences,Offender given a caution,
12426,8cd41a534135bc5dc073b462c3c85554fcec0f25ca3997...,2020-09,South Wales Police,South Wales Police,,,No Location,,,Violence and sexual offences,Unable to prosecute suspect,
12427,9c4b505ca2376b8fa26cc7aa4c53d67e110eccef3c0a37...,2020-09,South Wales Police,South Wales Police,,,No Location,,,Other crime,Status update unavailable,


In [None]:
# display the combined 'outcome' frame (the rendering elides the middle rows)
df_outcome

Unnamed: 0,Crime ID,Month,Reported by,Falls within,Longitude,Latitude,Location,LSOA code,LSOA name,Outcome type
0,6218c92fc68f7c5f2a10fa08b6a65c31b59b0eedf2d12f...,2020-09,Bedfordshire Police,Bedfordshire Police,-0.454797,52.126757,On or near BARFORD AVENUE,E01017523,Bedford 014F,Investigation complete; no suspect identified
1,cae7a8ca58ecf18d36a0d21366881c80c66a14f062be21...,2020-09,Bedfordshire Police,Bedfordshire Police,-0.419811,51.912476,On or near RINGWOOD ROAD,E01015747,Luton 004C,Local resolution
2,e768eccecfea72598d3b27738502bef164643ea9099967...,2020-09,Bedfordshire Police,Bedfordshire Police,-0.406447,51.873515,On or near PARK STREET,E01015789,Luton 018E,Suspect charged
3,af86eb04e1b09e1e6c4421d0473abf48f44e85015916c9...,2020-09,Bedfordshire Police,Bedfordshire Police,-0.517208,51.889511,On or near THE MALL,E01017570,Central Bedfordshire 029E,Investigation complete; no suspect identified
4,5de1bc42ba0c6e8697bfe195982a660cb9061ad35f40f0...,2020-09,Bedfordshire Police,Bedfordshire Police,-0.543713,52.224597,On or near HOME CLOSE,E01017544,Bedford 001C,Investigation complete; no suspect identified
...,...,...,...,...,...,...,...,...,...,...
7146,6b4b1922463231c0125b4ba55a0f39e3bfa0f6fbf97eff...,2020-09,Derbyshire Constabulary,Derbyshire Constabulary,-1.512480,52.908272,On or near BALMORAL CLOSE,E01013543,Derby 021C,Suspect charged
7147,00627dbc43282b72bcd2b07ab9741064c1eadaf92de934...,2020-09,Derbyshire Constabulary,Derbyshire Constabulary,-1.493638,52.900849,On or near FOREMARK AVENUE,E01013491,Derby 022D,Suspect charged
7148,78298e6740b649d5b51da24f9aff8e63c5a24b7a87a83d...,2020-09,Derbyshire Constabulary,Derbyshire Constabulary,-1.685357,52.865751,On or near SCROPTON ROAD,E01019846,South Derbyshire 001C,Suspect charged
7149,5da085c500507ed56d4b7142c625410fe29fc805ecbf51...,2020-09,Derbyshire Constabulary,Derbyshire Constabulary,-1.334561,53.099897,On or near WILLOW CLOSE,E01019517,Bolsover 010E,Suspect charged


In [None]:
# display the combined 'stop-and-search' frame (the rendering elides the middle rows)
df_sas

Unnamed: 0,Type,Date,Part of a policing operation,Policing operation,Latitude,Longitude,Gender,Age range,Self-defined ethnicity,Officer-defined ethnicity,Legislation,Object of search,Outcome,Outcome linked to object of search,Removal of more than just outer clothing
0,Person search,2020-08-31T23:30:19+00:00,0.0,,52.602701,0.376608,Male,over 34,White - English/Welsh/Scottish/Northern Irish/...,White,Police and Criminal Evidence Act 1984 (section 1),Article for use in theft,A no further action disposal,0.0,0.0
1,Person search,2020-09-01T08:35:31+00:00,0.0,,52.416985,0.743852,Male,25-34,White - English/Welsh/Scottish/Northern Irish/...,White,Misuse of Drugs Act 1971 (section 23),Controlled drugs,A no further action disposal,1.0,0.0
2,Person search,2020-09-01T12:14:22+00:00,0.0,,52.607668,1.726090,Male,25-34,Black/African/Caribbean/Black British - African,Black,Misuse of Drugs Act 1971 (section 23),Controlled drugs,A no further action disposal,0.0,0.0
3,Person search,2020-09-01T12:23:05+00:00,0.0,,52.412250,0.732793,Male,over 34,White - English/Welsh/Scottish/Northern Irish/...,White,Police and Criminal Evidence Act 1984 (section 1),Article for use in theft,A no further action disposal,0.0,0.0
4,Person search,2020-09-01T13:54:00+00:00,0.0,,52.616957,1.721028,Male,25-34,White - English/Welsh/Scottish/Northern Irish/...,White,Misuse of Drugs Act 1971 (section 23),Controlled drugs,A no further action disposal,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
668,Person and Vehicle search,2020-09-30T17:30:00+00:00,False,,,,Male,25-34,Asian/Asian British - Any other Asian background,Asian,Misuse of Drugs Act 1971 (section 23),Controlled drugs,A no further action disposal,,False
669,Person and Vehicle search,2020-09-30T17:30:00+00:00,False,,,,Male,over 34,Asian/Asian British - Any other Asian background,Asian,Misuse of Drugs Act 1971 (section 23),Controlled drugs,A no further action disposal,,False
670,Person and Vehicle search,2020-09-30T18:34:00+00:00,False,,51.889463,-0.183444,Male,18-24,Other ethnic group - Not stated,Black,Misuse of Drugs Act 1971 (section 23),Controlled drugs,A no further action disposal,,False
671,Person search,2020-09-30T18:45:00+00:00,False,,51.889463,-0.183444,Female,18-24,White - English/Welsh/Scottish/Northern Irish/...,White,Misuse of Drugs Act 1971 (section 23),Controlled drugs,A no further action disposal,,False


In [None]:
# Share of 'stop-and-search' rows whose "Type" is missing.
# Type is filled out everywhere.
int(df_sas['Type'].isna().sum()) / len(df_sas)

0.0

In [None]:
# To get a rough indication of the differences between the 'street' and 'outcome' files,
# merge the two frames and compare. Most attributes are repeated across these files.
# The attribute that seems most suitable as the "key" is 'Crime ID' - but not every row
# has one, so only the rows with a 'Crime ID' are kept on both sides before merging.
# Only 'Outcome type' is taken over from the 'outcome' frame.
df_merge = pd.merge(
    df_street.dropna(subset=['Crime ID']),
    df_outcome.dropna(subset=['Crime ID'])[['Crime ID', 'Outcome type']],
    on='Crime ID',
)

In [None]:
# 'street' outcome categories that occur on rows where the two files disagree,
# to see if a specific outcome type has more differences
df_merge.loc[
    df_merge['Last outcome category'] != df_merge['Outcome type'],
    'Last outcome category',
].unique()

array(['Court result unavailable',
       'Action to be taken by another organisation',
       'Unable to prosecute suspect',
       'Formal action is not in the public interest',
       'Further action is not in the public interest',
       'Status update unavailable',
       'Investigation complete; no suspect identified',
       'Further investigation is not in the public interest',
       'Awaiting court outcome', 'Offender given a caution',
       'Local resolution', 'Suspect charged as part of another case',
       'Offender given penalty notice'], dtype=object)

In [None]:
# Fraction (0-1, not a percentage) of merged rows whose outcome differs between 'street' and 'outcome'
int((df_merge['Last outcome category'] != df_merge['Outcome type']).sum()) / len(df_merge)

0.08603118553997832

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=40789b9a-1c62-45b9-9d9c-b1a39ebe3dfd' target="_blank">
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>