# Approach 1 - use street data

Postcode files can be downloaded from https://www.doogal.co.uk/PostcodeDownloads.php.

The analysis looks at England, Wales and Northern Ireland (NI).

## Load libraries and data

In [None]:
import pandas as pd
import os
import tqdm as tq
import string
import statistics as stats
import re
import numpy as np
pd.set_option('display.max_columns', None)

Use the parsing function defined in "Data aggregation"

In [None]:
def extract_street_for_force(
    pf: str,
    directory: str = "/Users/hannabroszczak/Desktop/uni/DataChallenge2/Jan_2012_Oct_2021",  # Change directory
):
    """Build one dataframe with the cleaned street records of a single police force.

    The data directory is expected to contain one sub-folder per period whose
    name starts with a four-digit year. Only folders with a year after 2014
    are read. Inside each folder, every file whose name matches ``pf`` and
    contains ``'street'`` is loaded.

    Parameters
    ----------
    pf : str
        Police force name as it appears in the file names. It is used as a
        regular-expression pattern (``re.search``), so a plain name behaves
        as a substring match.
    directory : str, optional
        Root folder holding the period sub-folders. Defaults to the path that
        was previously hard-coded, so existing calls keep working.

    Returns
    -------
    pandas.DataFrame
        All matching files stacked on top of each other (each file keeps its
        own row index), without the columns 'Reported by', 'Location',
        'Context' and 'LSOA name' and without rows that lack an 'LSOA code'
        or a 'Crime type'. An empty dataframe if no file matches.

    Raises
    ------
    KeyError
        If a matching file does not contain the columns that are dropped or
        checked for missing values.
    """
    force_pattern = re.compile(str(pf))
    frames = []

    # Sorted so that the row order of the result does not depend on the
    # arbitrary order in which the operating system lists directory entries.
    for folder in sorted(os.listdir(directory)):
        # skip hidden entries such as .DS_Store
        if folder.startswith('.'):
            continue

        # Here you can change the date range
        try:
            year = int(folder[:4])
        except ValueError:
            # not a dated data folder
            continue
        if year <= 2014:
            continue

        folder_path = os.path.join(directory, folder)
        if not os.path.isdir(folder_path):
            continue

        for file_name in sorted(os.listdir(folder_path)):
            # keep only the street files of the requested police force
            if not (force_pattern.search(file_name) and 'street' in file_name):
                continue

            current_data_street = pd.read_csv(os.path.join(folder_path, file_name))
            frames.append(
                current_data_street
                # drop the attributes considered irrelevant
                .drop(columns=['Reported by', 'Location', 'Context', 'LSOA name'])
                # rows without an LSOA code or a crime type are removed
                .dropna(subset=['LSOA code', 'Crime type'])
            )

    if not frames:
        return pd.DataFrame()
    # One concat at the end instead of DataFrame.append in the loop: append
    # was removed in pandas 2.0 and re-copied the growing frame on every call.
    return pd.concat(frames)

def extract_sas_for_force(
    pf,
    directory="/Users/hannabroszczak/Desktop/uni/DataChallenge2/Jan_2012_Oct_2021",  # Change directory
):
    """Build one dataframe with the cleaned stop-and-search records of a single police force.

    The data directory is expected to contain one sub-folder per period whose
    name starts with a four-digit year. Only folders with a year after 2014
    are read. Inside each folder, every file whose name matches ``pf`` and
    contains ``'stop-and-search'`` is loaded.

    Parameters
    ----------
    pf : str
        Police force name as it appears in the file names. It is used as a
        regular-expression pattern (``re.search``), so a plain name behaves
        as a substring match.
    directory : str, optional
        Root folder holding the period sub-folders. Defaults to the path that
        was previously hard-coded, so existing calls keep working.

    Returns
    -------
    pandas.DataFrame
        All matching files stacked on top of each other (each file keeps its
        own row index), without the demographic / operation columns listed
        below and without rows that lack a 'Type', 'Longitude' or 'Latitude'.
        An empty dataframe if no file matches.

    Raises
    ------
    KeyError
        If a matching file does not contain the columns that are dropped or
        checked for missing values.
    """
    irrelevant_columns = [
        'Policing operation', 'Gender', 'Object of search',
        'Outcome linked to object of search',
        'Removal of more than just outer clothing',
        'Self-defined ethnicity', 'Officer-defined ethnicity',
    ]
    force_pattern = re.compile(str(pf))
    frames = []

    # Sorted so that the row order of the result does not depend on the
    # arbitrary order in which the operating system lists directory entries.
    for folder in sorted(os.listdir(directory)):
        # skip hidden entries such as .DS_Store
        if folder.startswith('.'):
            continue

        # Here you can change the date range
        try:
            year = int(folder[:4])
        except ValueError:
            # not a dated data folder
            continue
        if year <= 2014:
            continue

        folder_path = os.path.join(directory, folder)
        if not os.path.isdir(folder_path):
            continue

        for file_name in sorted(os.listdir(folder_path)):
            # keep only the stop-and-search files of the requested police force
            if not (force_pattern.search(file_name) and 'stop-and-search' in file_name):
                continue

            current_data_sas = pd.read_csv(os.path.join(folder_path, file_name))
            frames.append(
                current_data_sas
                # drop the attributes considered irrelevant
                .drop(columns=irrelevant_columns)
                # a record is only usable with a search type and a position
                .dropna(subset=['Type', 'Longitude', 'Latitude'])
            )

    if not frames:
        return pd.DataFrame()
    # One concat at the end instead of DataFrame.append in the loop: append
    # was removed in pandas 2.0 and re-copied the growing frame on every call.
    return pd.concat(frames)

In [None]:
# extract all street data for one police force (Essex); this frame is the
# starting point of the data used to look up LSOA codes by coordinates
df_str_dict = extract_street_for_force("essex")

In [None]:
# Add street data from some other police forces to generate a more
# extensive dataset-dictionary.
# - Essex is not loaded again here: it is already in df_str_dict from the
#   previous cell, so loading it a second time only duplicated its rows.
# - pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
# NOTE: re-running this cell adds the same forces again; re-run the previous
# cell first to start from a clean frame.
df_str_dict = pd.concat(
    [df_str_dict]
    + [extract_street_for_force(force) for force in ('cheshire', 'norfolk')]
)

In [None]:
# Add some more data. pd.concat replaces DataFrame.append, which was removed
# in pandas 2.0, and stacks all forces in a single step.
# Further candidates that are currently left out:
# 'cleveland', 'durham', 'wales', 'thames'
# NOTE: re-running this cell adds the same forces again.
df_str_dict = pd.concat(
    [df_str_dict]
    + [extract_street_for_force(force) for force in ('avon', 'london', 'devon')]
)


In [None]:
# extract the sas (stop-and-search) data for one police force
# NOTE(review): despite the "btp" suffix in the variable name, this frame
# holds the data of the force requested here (Essex)
df_sas_btp = extract_sas_for_force("essex")

## Creating a "dictionary" (data frame) for recreating LSOA in sas

In [None]:
# Lookup table with one row per LSOA code: the mean Longitude and Latitude
# of all street records that carry this LSOA code
df_lsoa = (
    df_str_dict
    .groupby('LSOA code')[['Longitude', 'Latitude']]
    .mean()
)

In [None]:
# round the mean Longitude and Latitude values to 3 decimal places
df_lsoa['Longitude'] = df_lsoa['Longitude'].round(3)
df_lsoa['Latitude'] = df_lsoa['Latitude'].round(3)

In [None]:
# turn the 'LSOA code' group index back into a regular column
# NOTE: run this cell only once per kernel session; a second run would add
# an extra 'index' column
df_lsoa = df_lsoa.reset_index()

In [None]:
# preview the lookup table: LSOA code with its rounded mean coordinates
df_lsoa

Unnamed: 0,LSOA code,Longitude,Latitude
0,E01000001,-0.097,51.518
1,E01000002,-0.092,51.519
2,E01000003,-0.095,51.521
3,E01000005,-0.076,51.514
4,E01000006,0.087,51.540
...,...,...,...
8064,W01001937,-4.052,52.503
8065,W01001939,-3.180,51.483
8066,W01001942,-3.168,51.461
8067,W01001943,-3.165,51.468


## Recreate LSOA codes in df_sas

In [None]:
# round Longitude and Latitude values in sas to 3 decimal places
# (the same precision as the coordinates in df_lsoa)
df_sas_btp['Longitude'] = df_sas_btp['Longitude'].round(3)
df_sas_btp['Latitude'] = df_sas_btp['Latitude'].round(3)

In [None]:
# Merge the 2 dataframes based on longitude and latitude - results in
# adding LSOA code to the sas df (NaN where no coordinate pair matches).
# The lookup is de-duplicated on (Latitude, Longitude) first: after rounding,
# several LSOA codes can end up with the same coordinate pair, and a left
# join against repeated keys would repeat the matching sas rows and inflate
# every count computed from the result. Where codes collide, the first one
# in df_lsoa is kept. validate= makes pandas raise if the lookup keys are
# not unique after all.
df_sas_lsoa = pd.merge(
    df_sas_btp,
    df_lsoa.drop_duplicates(subset=['Latitude', 'Longitude']),
    on=['Latitude', 'Longitude'],
    how='left',
    validate='many_to_one',
)

In [None]:
# show only the sas records for which an LSOA code was found by the merge
df_sas_lsoa[df_sas_lsoa['LSOA code'].notna()]

Unnamed: 0,Type,Date,Part of a policing operation,Latitude,Longitude,Age range,Legislation,Outcome,LSOA code
6,Person search,2020-01-01T01:01:00+00:00,False,51.614,0.320,25-34,Misuse of Drugs Act 1971 (section 23),Khat or Cannabis warning,E01021435
19,Person search,2020-01-01T06:01:00+00:00,False,51.881,0.912,18-24,Police and Criminal Evidence Act 1984 (section 1),A no further action disposal,E01021688
203,Person search,2020-01-04T23:01:00+00:00,False,51.886,0.938,over 34,Misuse of Drugs Act 1971 (section 23),A no further action disposal,E01021700
204,Person search,2020-01-04T23:01:00+00:00,False,51.886,0.938,25-34,Misuse of Drugs Act 1971 (section 23),A no further action disposal,E01021700
207,Person search,2020-01-04T23:01:00+00:00,False,51.886,0.938,over 34,Misuse of Drugs Act 1971 (section 23),A no further action disposal,E01021700
...,...,...,...,...,...,...,...,...,...
64407,Person and Vehicle search,2018-12-16T02:01:00+00:00,False,51.603,0.528,25-34,Misuse of Drugs Act 1971 (section 23),Khat or Cannabis warning,E01021343
64408,Person search,2018-12-16T02:01:00+00:00,False,51.603,0.528,25-34,Misuse of Drugs Act 1971 (section 23),A no further action disposal,E01021343
64453,Vehicle search,2018-12-17T21:01:00+00:00,False,51.751,0.071,,Police and Criminal Evidence Act 1984 (section 1),A no further action disposal,E01021865
64509,Person and Vehicle search,2018-12-20T01:01:00+00:00,True,51.526,0.596,over 34,Misuse of Drugs Act 1971 (section 23),A no further action disposal,E01021493


In [None]:
# add the column with Month (format yyyy-mm) to sas df: the first 7
# characters of the Date string. The vectorised .str accessor replaces the
# per-row lambda and yields NaN instead of raising on a missing Date.
df_sas_lsoa['Month'] = df_sas_lsoa['Date'].str[:7]

In [None]:
# number of sas records with a non-missing Type per Month and per LSOA code
# (the LSOA code being the one generated based on the street df)
(
    df_sas_lsoa
    .groupby(['Month', 'LSOA code'])['Type']
    .count()
    .to_frame()
)

Unnamed: 0_level_0,Unnamed: 1_level_0,Type
Month,LSOA code,Unnamed: 2_level_1
2015-04,E01021577,2
2015-04,E01021664,1
2015-04,E01022079,5
2015-04,E01022081,5
2015-04,E01033140,2
...,...,...
2021-10,E01033614,2
2021-10,E01033719,13
2021-10,E01033722,9
2021-10,E01033723,2


# Approach 2 - use external file

In [None]:
# round the sas coordinates to 3 decimal places; if this was already done
# in Approach 1, rounding again leaves the values unchanged
df_sas_btp['Longitude'] = df_sas_btp['Longitude'].round(3)
df_sas_btp['Latitude'] = df_sas_btp['Latitude'].round(3)

In [None]:
# read the csv files with LSOA codes and long/lat values and stack them.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
postcodes = pd.concat([
    pd.read_csv('Wales postcodes.csv'),
    pd.read_csv('BT postcodes.csv'),
])

In [None]:
# add the England postcodes to the ones loaded so far.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
# NOTE: re-running this cell adds the England rows again.
postcodes = pd.concat([postcodes, pd.read_csv('England postcodes.csv')])

  exec(code_obj, self.user_global_ns, self.user_ns)


In [None]:
# keep only the columns needed to look up an LSOA code by coordinates
postcodes = postcodes.loc[:, ['LSOA Code', 'Latitude', 'Longitude']]

Unnamed: 0,LSOA Code,Latitude,Longitude
0,W01001767,51.469744,-3.187692
1,W01001767,51.469744,-3.187692
2,W01001939,51.477907,-3.180704
3,W01001767,51.469744,-3.187692
4,W01001939,51.479690,-3.182190
...,...,...,...
2230312,E01013406,53.978208,-1.065358
2230313,E01013406,53.978208,-1.065358
2230314,E01013406,53.978208,-1.065358
2230315,E01013406,53.978235,-1.065381


In [None]:
# inspect the first 50 sas records
df_sas_btp[:50]

Unnamed: 0,Type,Date,Part of a policing operation,Latitude,Longitude,Age range,Legislation,Outcome
0,Person search,2020-01-01T00:01:00+00:00,False,51.84,0.78,10-17,Misuse of Drugs Act 1971 (section 23),Community resolution
1,Person search,2020-01-01T00:01:00+00:00,False,51.84,0.78,10-17,Misuse of Drugs Act 1971 (section 23),Community resolution
2,Person search,2020-01-01T00:01:00+00:00,False,51.84,0.78,10-17,Misuse of Drugs Act 1971 (section 23),Community resolution
3,Vehicle search,2020-01-01T00:01:00+00:00,False,51.84,0.78,,Misuse of Drugs Act 1971 (section 23),Community resolution
4,Person search,2020-01-01T01:01:00+00:00,False,51.54,0.69,over 34,Police and Criminal Evidence Act 1984 (section 1),
5,Person search,2020-01-01T01:01:00+00:00,False,51.84,0.78,18-24,Misuse of Drugs Act 1971 (section 23),Khat or Cannabis warning
6,Person search,2020-01-01T01:01:00+00:00,False,51.61,0.32,25-34,Misuse of Drugs Act 1971 (section 23),Khat or Cannabis warning
7,Person search,2020-01-01T01:01:00+00:00,False,51.84,0.78,10-17,Misuse of Drugs Act 1971 (section 23),A no further action disposal
8,Person search,2020-01-01T01:01:00+00:00,False,51.84,0.78,18-24,Misuse of Drugs Act 1971 (section 23),A no further action disposal
9,Vehicle search,2020-01-01T02:01:00+00:00,False,51.56,0.42,,Police and Criminal Evidence Act 1984 (section 1),Arrest


In [None]:
# round the postcode coordinates to 3 decimal places, the same precision
# as the sas coordinates they are merged with
postcodes['Latitude'] = postcodes['Latitude'].round(3)
postcodes['Longitude'] = postcodes['Longitude'].round(3)

In [None]:
# Add the LSOA code to every sas record whose rounded coordinates occur in
# the postcode file (inner join: records without a match are dropped).
# The postcode table holds many rows per rounded coordinate pair, so it is
# de-duplicated on (Latitude, Longitude) first; otherwise each sas record
# would be repeated once per matching postcode row. Where several LSOA codes
# share a coordinate pair, the first one in postcodes is kept. validate=
# makes pandas raise if the lookup keys are not unique after all.
df_sas_lsoa_postcodes = pd.merge(
    df_sas_btp,
    postcodes.drop_duplicates(subset=['Latitude', 'Longitude']),
    on=['Latitude', 'Longitude'],
    how='inner',
    validate='many_to_one',
)

In [None]:
# inspect the first 50 sas records that received an LSOA code from the postcode file
df_sas_lsoa_postcodes[:50]

Unnamed: 0,Type,Date,Part of a policing operation,Latitude,Longitude,Age range,Legislation,Outcome,LSOA Code
0,Person search,2020-01-01T00:01:00+00:00,False,51.84,0.78,10-17,Misuse of Drugs Act 1971 (section 23),Community resolution,E01021644
1,Person search,2020-01-01T00:01:00+00:00,False,51.84,0.78,10-17,Misuse of Drugs Act 1971 (section 23),Community resolution,E01021644
2,Person search,2020-01-01T00:01:00+00:00,False,51.84,0.78,10-17,Misuse of Drugs Act 1971 (section 23),Community resolution,E01021644
3,Vehicle search,2020-01-01T00:01:00+00:00,False,51.84,0.78,,Misuse of Drugs Act 1971 (section 23),Community resolution,E01021644
4,Person search,2020-01-01T01:01:00+00:00,False,51.84,0.78,18-24,Misuse of Drugs Act 1971 (section 23),Khat or Cannabis warning,E01021644
5,Person search,2020-01-01T01:01:00+00:00,False,51.84,0.78,10-17,Misuse of Drugs Act 1971 (section 23),A no further action disposal,E01021644
6,Person search,2020-01-01T01:01:00+00:00,False,51.84,0.78,18-24,Misuse of Drugs Act 1971 (section 23),A no further action disposal,E01021644
7,Person search,2020-07-26T00:01:00+00:00,False,51.84,0.78,25-34,Misuse of Drugs Act 1971 (section 23),A no further action disposal,E01021644
8,Person and Vehicle search,2020-07-26T00:01:00+00:00,False,51.84,0.78,25-34,Misuse of Drugs Act 1971 (section 23),A no further action disposal,E01021644
9,Person search,2020-07-26T00:01:00+00:00,False,51.84,0.78,25-34,Misuse of Drugs Act 1971 (section 23),A no further action disposal,E01021644


<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=40789b9a-1c62-45b9-9d9c-b1a39ebe3dfd' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>