In [None]:
import pandas as pd
import os
import tqdm as tq
import string
import statistics as stats
import re
# Show every column when a DataFrame is displayed (None removes the column limit)
pd.set_option('display.max_columns', None)

In [None]:
# To make a dataframe for street for one district only (district name must correspond to the one in file name)

def extract_street_for_district(district: str,
                                directory: str = "Jan_2012_Oct_2021",
                                min_year: int = 2015) -> pd.DataFrame:
    """Build one DataFrame with the cleaned street-level records of a district.

    The function walks the sub-folders of ``directory``, reads every CSV file
    whose name contains both ``district`` and ``'street'``, removes the columns
    that are not used in the analysis and the rows without an LSOA code or a
    crime type, and concatenates the results.

    Parameters
    ----------
    district : str
        Text that must appear in the file name (e.g. ``"btp"``). It is matched
        literally as a substring, not as a regular expression.
    directory : str, default "Jan_2012_Oct_2021"
        Root folder holding one sub-folder per period. The first four
        characters of each sub-folder name are read as the year.
    min_year : int, default 2015
        Earliest year (inclusive) to load.

    Returns
    -------
    pandas.DataFrame
        All matching files stacked in sorted folder/file order. Each file
        keeps its own row index (the index is not reset). An empty DataFrame
        is returned when no file matches.

    Raises
    ------
    KeyError
        If a matching file lacks one of the columns that are dropped or
        checked for missing values.
    """
    dropped_columns = ['Reported by', 'Longitude', 'Latitude', 'Location', 'Context']
    required_columns = ['LSOA code', 'Crime type']
    district = str(district)

    # Collect every matching path first so that one progress bar covers the whole load.
    paths = []
    for folder in sorted(os.listdir(directory)):
        # Hidden entries (e.g. .DS_Store) are never data folders.
        if folder.startswith('.'):
            continue
        # Entries without a numeric year prefix are skipped instead of crashing int().
        try:
            year = int(folder[:4])
        except ValueError:
            continue
        folder_path = os.path.join(directory, folder)
        if year < min_year or not os.path.isdir(folder_path):
            continue
        for file in sorted(os.listdir(folder_path)):
            # Plain substring tests: the district is a literal name, not a pattern.
            if not file.startswith('.') and district in file and 'street' in file:
                paths.append(os.path.join(folder_path, file))

    # Accumulate the frames in a list and concatenate once; growing a DataFrame
    # inside the loop is quadratic and DataFrame.append no longer exists in pandas 2.
    frames = []
    for path in tq.tqdm(paths, desc="street files"):
        current_data_street = pd.read_csv(path)
        frames.append(
            current_data_street
            .drop(columns=dropped_columns)
            .dropna(subset=required_columns)
        )

    if not frames:
        return pd.DataFrame()
    return pd.concat(frames)


In [None]:
# Call the function and store the result under a name that carries the district,
# following the pattern: df_str_<district> = extract_street_for_district("<district>")
df_str_btp = extract_street_for_district("btp")

100%|█████████████████████████████████████| 119/119 [00:00<00:00, 487424.00it/s]
100%|█████████████████████████████████████████████| 1/1 [00:00<00:00, 14.97it/s]
100%|█████████████████████████████████████████████| 1/1 [00:00<00:00, 99.21it/s]
100%|█████████████████████████████████████████████| 1/1 [00:00<00:00, 54.83it/s]
100%|█████████████████████████████████████████████| 1/1 [00:00<00:00, 89.00it/s]
100%|█████████████████████████████████████████████| 1/1 [00:00<00:00, 75.17it/s]
100%|█████████████████████████████████████████████| 1/1 [00:00<00:00, 87.74it/s]
100%|█████████████████████████████████████████████| 1/1 [00:00<00:00, 48.66it/s]
100%|█████████████████████████████████████████████| 1/1 [00:00<00:00, 35.93it/s]
100%|█████████████████████████████████████████████| 1/1 [00:00<00:00, 99.86it/s]
100%|█████████████████████████████████████████████| 1/1 [00:00<00:00, 83.91it/s]
100%|█████████████████████████████████████████████| 1/1 [00:00<00:00, 77.11it/s]
100%|███████████████████████

In [None]:
# Display the street-level frame built for the "btp" district
df_str_btp

Unnamed: 0,Crime ID,Month,Falls within,LSOA code,LSOA name,Crime type,Last outcome category
0,,2020-01,British Transport Police,E01031365,Adur 002D,Bicycle theft,
1,,2020-01,British Transport Police,E01031365,Adur 002D,Bicycle theft,
2,,2020-01,British Transport Police,E01031375,Adur 004G,Possession of weapons,
3,,2020-01,British Transport Police,E01031375,Adur 004G,Violence and sexual offences,
4,,2020-01,British Transport Police,E01031375,Adur 004G,Violence and sexual offences,
...,...,...,...,...,...,...,...
5371,,2018-12,British Transport Police,E01033068,York 013G,Public order,
5372,,2018-12,British Transport Police,E01033068,York 013G,Theft from the person,
5373,,2018-12,British Transport Police,E01033068,York 013G,Violence and sexual offences,
5374,,2018-12,British Transport Police,E01033068,York 013G,Violence and sexual offences,


In [None]:
# List the distinct values of the outcome column to see whether it holds any information
df_str_btp['Last outcome category'].unique()

array([nan])

In [None]:
# To export the district frame to CSV, uncomment and adapt:
# df_str_btp.to_csv('btp_street.csv')

In [None]:
# To make a dataframe for stop-and-search for one district only 
# (district name must correspond to the one in file name)

def extract_sas_for_district(district: str,
                             directory: str = "Jan_2012_Oct_2021",
                             min_year: int = 2015) -> pd.DataFrame:
    """Build one DataFrame with the cleaned stop-and-search records of a district.

    The function walks the sub-folders of ``directory``, reads every CSV file
    whose name contains both ``district`` and ``'stop-and-search'``, removes
    the columns that are not used in the analysis and the rows without a
    search type or coordinates, and concatenates the results.

    Parameters
    ----------
    district : str
        Text that must appear in the file name (e.g. ``"btp"``). It is matched
        literally as a substring, not as a regular expression.
    directory : str, default "Jan_2012_Oct_2021"
        Root folder holding one sub-folder per period. The first four
        characters of each sub-folder name are read as the year.
    min_year : int, default 2015
        Earliest year (inclusive) to load.

    Returns
    -------
    pandas.DataFrame
        All matching files stacked in sorted folder/file order. Each file
        keeps its own row index (the index is not reset). An empty DataFrame
        is returned when no file matches.

    Raises
    ------
    KeyError
        If a matching file lacks one of the columns that are dropped or
        checked for missing values.
    """
    dropped_columns = ['Policing operation', 'Gender', 'Object of search',
                       'Outcome linked to object of search',
                       'Removal of more than just outer clothing',
                       'Self-defined ethnicity', 'Officer-defined ethnicity']
    required_columns = ['Type', 'Longitude', 'Latitude']
    district = str(district)

    # Collect every matching path first so that one progress bar covers the whole load.
    paths = []
    for folder in sorted(os.listdir(directory)):
        # Hidden entries (e.g. .DS_Store) are never data folders.
        if folder.startswith('.'):
            continue
        # Entries without a numeric year prefix are skipped instead of crashing int().
        try:
            year = int(folder[:4])
        except ValueError:
            continue
        folder_path = os.path.join(directory, folder)
        if year < min_year or not os.path.isdir(folder_path):
            continue
        for file in sorted(os.listdir(folder_path)):
            # Keep files whose name holds both the district and 'stop-and-search'
            # (literal substring tests, not regular expressions).
            if not file.startswith('.') and district in file and 'stop-and-search' in file:
                paths.append(os.path.join(folder_path, file))

    # Accumulate the frames in a list and concatenate once; growing a DataFrame
    # inside the loop is quadratic and DataFrame.append no longer exists in pandas 2.
    frames = []
    for path in tq.tqdm(paths, desc="stop-and-search files"):
        current_data_sas = pd.read_csv(path)
        frames.append(
            current_data_sas
            .drop(columns=dropped_columns)
            # A row is only usable when the search type and both coordinates are known.
            .dropna(subset=required_columns)
        )

    if not frames:
        return pd.DataFrame()
    return pd.concat(frames)


In [None]:
# Call the function and store the result under a name that carries the district,
# following the pattern: df_sas_<district> = extract_sas_for_district("<district>")
df_sas_btp = extract_sas_for_district("btp")

100%|█████████████████████████████████████| 119/119 [00:00<00:00, 414897.90it/s]
100%|█████████████████████████████████████████████| 1/1 [00:00<00:00, 19.69it/s]
100%|█████████████████████████████████████████████| 1/1 [00:00<00:00, 52.66it/s]
100%|████████████████████████████████████████████| 1/1 [00:00<00:00, 115.58it/s]
100%|████████████████████████████████████████████| 1/1 [00:00<00:00, 107.43it/s]
100%|████████████████████████████████████████████| 1/1 [00:00<00:00, 176.52it/s]
100%|████████████████████████████████████████████| 1/1 [00:00<00:00, 145.56it/s]
100%|████████████████████████████████████████████| 1/1 [00:00<00:00, 130.82it/s]
100%|████████████████████████████████████████████| 1/1 [00:00<00:00, 178.60it/s]
100%|████████████████████████████████████████████| 1/1 [00:00<00:00, 132.94it/s]
100%|████████████████████████████████████████████| 1/1 [00:00<00:00, 178.98it/s]
100%|████████████████████████████████████████████| 1/1 [00:00<00:00, 196.78it/s]
100%|███████████████████████

In [None]:
# Summary statistics for every column, numeric and non-numeric alike
df_sas_btp.describe(include='all')

Unnamed: 0,Type,Date,Part of a policing operation,Latitude,Longitude,Age range,Legislation,Outcome
count,35764,35764,0.0,35764.0,35764.0,32468,35537,35496
unique,3,31279,,,,5,9,15
top,Person search,2019-03-26T18:20:00+00:00,,,,18-24,Misuse of Drugs Act 1971 (section 23),A no further action disposal
freq,35357,15,,,,11561,23685,24937
mean,,,,52.022798,-0.929537,,,
std,,,,1.070966,1.437629,,,
min,,,,49.7668,-7.55717,,,
25%,,,,51.489793,-1.78532,,,
50%,,,,51.5325,-0.170493,,,
75%,,,,52.5245,-0.087053,,,


<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=40789b9a-1c62-45b9-9d9c-b1a39ebe3dfd' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>