In [5]:
import pandas as pd
import os
import glob

In [6]:
# Load all street crime files.
# glob returns matches in arbitrary (filesystem-dependent) order, so sort them
# to keep the concatenated row order reproducible between runs and machines.
street_files = sorted(glob.glob('../data/raw/**/*street*.csv', recursive=True))

if not street_files:
    # pd.concat([]) would raise a ValueError with an opaque message; fail with
    # the same exception type but say what is actually wrong.
    raise ValueError("No street crime CSV files found under ../data/raw")


def force_from_filename(path):
    """Return the police-force slug embedded in a street-crime CSV file name.

    Assumes names of the form ``YYYY-MM-<force>-street.csv`` (TODO confirm
    against the raw data folder). ``<force>`` may itself contain hyphens,
    e.g. ``avon-and-somerset``, so every token between the month and the
    trailing dataset kind is kept, not just the first one.

    Falls back to the third hyphen-separated token of the full file name when
    there is nothing between the month and the last token.
    """
    name = os.path.basename(path)
    parts = os.path.splitext(name)[0].split('-')
    # parts[:2] are year and month; parts[-1] is the dataset kind.
    return '-'.join(parts[2:-1]) or name.split('-')[2]


dfs = []
for file in street_files:
    df = pd.read_csv(file)
    # Tag every row with the force the file belongs to
    df['force'] = force_from_filename(file)
    dfs.append(df)

street = pd.concat(dfs, ignore_index=True)
print(f"Loaded {len(street):,} rows from {len(street_files)} files")
street.head()

Loaded 470,228 rows from 50 files


Unnamed: 0,Crime ID,Month,Reported by,Falls within,Longitude,Latitude,Location,LSOA code,LSOA name,Crime type,Last outcome category,Context,force
0,3805030082ced818125d730fd7c0a1c6087320be923ce7...,2023-12,Derbyshire Constabulary,Derbyshire Constabulary,-1.376936,53.098607,On or near Mansfield Road,E01019400,Amber Valley 001A,Drugs,Court result unavailable,,derbyshire
1,ecb65a50a49b09cc5378e4f83459050e8e05db315eea08...,2023-12,Derbyshire Constabulary,Derbyshire Constabulary,-1.373582,53.09921,On or near Catherine Street,E01019400,Amber Valley 001A,Public order,Unable to prosecute suspect,,derbyshire
2,49ae18115c1bd911c2a6bc178553681a5800b64b59b7ba...,2023-12,Derbyshire Constabulary,Derbyshire Constabulary,-1.374915,53.103055,On or near Tavistock Square,E01019400,Amber Valley 001A,Violence and sexual offences,Court result unavailable,,derbyshire
3,,2023-12,Derbyshire Constabulary,Derbyshire Constabulary,-1.381946,53.10124,On or near Cedar Avenue,E01019400,Amber Valley 001A,Anti-social behaviour,,,derbyshire
4,d2530e8c1f64fb5638533662d9c22e219b2cc4834ae57d...,2023-12,Derbyshire Constabulary,Derbyshire Constabulary,-1.373886,53.099904,On or near Catherine Court,E01019400,Amber Valley 001A,Other theft,Investigation complete; no suspect identified,,derbyshire


In [7]:
# Tidy the combined frame in a single chain:
#   1. discard the columns that are not needed (ignored if already absent),
#   2. turn the headers into lower-case snake_case,
#   3. remove records that have no coordinates,
#   4. parse the month strings into datetimes.
unused_columns = ['Context', 'Falls within', 'Reported by']

street = (
    street
    .drop(columns=unused_columns, errors='ignore')
    .rename(columns=lambda name: name.lower().replace(' ', '_'))
    .dropna(subset=['latitude', 'longitude'])
    .assign(month=lambda frame: pd.to_datetime(frame['month']))
)

print(f"Clean dataset: {len(street):,} rows")
print(street.columns.tolist())
street.head()

Clean dataset: 468,342 rows
['crime_id', 'month', 'longitude', 'latitude', 'location', 'lsoa_code', 'lsoa_name', 'crime_type', 'last_outcome_category', 'force']


Unnamed: 0,crime_id,month,longitude,latitude,location,lsoa_code,lsoa_name,crime_type,last_outcome_category,force
0,3805030082ced818125d730fd7c0a1c6087320be923ce7...,2023-12-01,-1.376936,53.098607,On or near Mansfield Road,E01019400,Amber Valley 001A,Drugs,Court result unavailable,derbyshire
1,ecb65a50a49b09cc5378e4f83459050e8e05db315eea08...,2023-12-01,-1.373582,53.09921,On or near Catherine Street,E01019400,Amber Valley 001A,Public order,Unable to prosecute suspect,derbyshire
2,49ae18115c1bd911c2a6bc178553681a5800b64b59b7ba...,2023-12-01,-1.374915,53.103055,On or near Tavistock Square,E01019400,Amber Valley 001A,Violence and sexual offences,Court result unavailable,derbyshire
3,,2023-12-01,-1.381946,53.10124,On or near Cedar Avenue,E01019400,Amber Valley 001A,Anti-social behaviour,,derbyshire
4,d2530e8c1f64fb5638533662d9c22e219b2cc4834ae57d...,2023-12-01,-1.373886,53.099904,On or near Catherine Court,E01019400,Amber Valley 001A,Other theft,Investigation complete; no suspect identified,derbyshire


In [8]:
# Persist the cleaned frame; create the output folder first if it is missing.
processed_dir = '../data/processed'
output_csv = '../data/processed/street_clean.csv'

os.makedirs(processed_dir, exist_ok=True)
street.to_csv(output_csv, index=False)  # index=False: do not write the row index
print("Saved to data/processed/street_clean.csv")

Saved to data/processed/street_clean.csv
