In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from datetime import datetime
import matplotlib.pyplot as plt

In [2]:
crimes = pd.read_csv('data\crimes.csv')
data = crimes[["dispatch_date", "dispatch_time","location_block", "text_general_code", "point_x", "point_y"]]

In [3]:
data.head()

Unnamed: 0,dispatch_date,dispatch_time,location_block,text_general_code,point_x,point_y
0,2023-10-04,22:58:00,1300 BLOCK N ALLISON ST,Thefts,-75.232271,39.972757
1,2023-10-04,22:42:00,200 BLOCK W CHELTEN AV,DRIVING UNDER THE INFLUENCE,-75.178605,40.030809
2,2023-10-04,22:25:00,600 BLOCK N 52ND ST,All Other Offenses,-75.226049,39.971074
3,2023-10-04,22:23:00,3400 BLOCK ARBOR ST,All Other Offenses,-75.121491,40.001767
4,2023-10-04,22:17:00,5900 BLOCK N BROAD ST,Other Assaults,-75.143587,40.044345


# Feature Preprocessing

Dissect Date and Time into a set of categorical features

In [4]:
date_format = "%Y:%m:%d"
data['dispatch_date'] = pd.to_datetime(data['dispatch_date'])
data['dispatch_day'] = data['dispatch_date'].dt.day
data['dispatch_month'] = data['dispatch_date'].dt.month
data['dispatch_year'] = data['dispatch_date'].dt.year
data['dispatch_dayOfWeek'] = data['dispatch_date'].dt.dayofweek
data = data.drop("dispatch_time", axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['dispatch_date'] = pd.to_datetime(data['dispatch_date'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['dispatch_day'] = data['dispatch_date'].dt.day
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['dispatch_month'] = data['dispatch_date'].dt.month
A value is trying to be set on a co

In [5]:
data.head()

Unnamed: 0,dispatch_date,location_block,text_general_code,point_x,point_y,dispatch_day,dispatch_month,dispatch_year,dispatch_dayOfWeek
0,2023-10-04,1300 BLOCK N ALLISON ST,Thefts,-75.232271,39.972757,4,10,2023,2
1,2023-10-04,200 BLOCK W CHELTEN AV,DRIVING UNDER THE INFLUENCE,-75.178605,40.030809,4,10,2023,2
2,2023-10-04,600 BLOCK N 52ND ST,All Other Offenses,-75.226049,39.971074,4,10,2023,2
3,2023-10-04,3400 BLOCK ARBOR ST,All Other Offenses,-75.121491,40.001767,4,10,2023,2
4,2023-10-04,5900 BLOCK N BROAD ST,Other Assaults,-75.143587,40.044345,4,10,2023,2


Extracted street names and block numbers

In [6]:
#  Extracting street names and block numbers allows for more granular analysis of crime locations.
# While streets don't have an inherent order, the block numbers can provide some ordinal information
# his information can pave the way for more advanced geospatial analysis or clustering based on proximity
data['street_name'] = data['location_block'].str.extract('(BLOCK [A-Z]+)')
data['block_number'] = data['location_block'].str.extract('(\d+) BLOCK')
data = data.drop("location_block", axis=1)

In [7]:
data.head()

Unnamed: 0,dispatch_date,text_general_code,point_x,point_y,dispatch_day,dispatch_month,dispatch_year,dispatch_dayOfWeek,street_name,block_number
0,2023-10-04,Thefts,-75.232271,39.972757,4,10,2023,2,BLOCK N,1300
1,2023-10-04,DRIVING UNDER THE INFLUENCE,-75.178605,40.030809,4,10,2023,2,BLOCK W,200
2,2023-10-04,All Other Offenses,-75.226049,39.971074,4,10,2023,2,BLOCK N,600
3,2023-10-04,All Other Offenses,-75.121491,40.001767,4,10,2023,2,BLOCK ARBOR,3400
4,2023-10-04,Other Assaults,-75.143587,40.044345,4,10,2023,2,BLOCK N,5900


One Hot Encoding Crime Type, since there is no inherent order in the crime types

In [8]:
data.rename(columns={'text_general_code':'crime_type'}, inplace=True)
data["crime_type"].value_counts()

All Other Offenses                         567720
Other Assaults                             435605
Thefts                                     431659
Vandalism/Criminal Mischief                297890
Theft from Vehicle                         259640
Narcotic / Drug Law Violations             180360
Fraud                                      176292
Burglary Residential                       122241
Aggravated Assault No Firearm              103848
Motor Vehicle Theft                         91499
Robbery No Firearm                          71680
DRIVING UNDER THE INFLUENCE                 66617
Robbery Firearm                             56052
Aggravated Assault Firearm                  48277
Disorderly Conduct                          44115
Weapon Violations                           38339
Burglary Non-Residential                    35676
Other Sex Offenses (Not Commercialized)     21524
Rape                                        17617
Prostitution and Commercialized Vice        15934


In [9]:
# combining Homicide categories together
data["crime_type"] = data["crime_type"].apply(lambda x: "Homicides" if isinstance(x, str) and "Homicide" in x else x)
data["crime_type"].value_counts()

All Other Offenses                         567720
Other Assaults                             435605
Thefts                                     431659
Vandalism/Criminal Mischief                297890
Theft from Vehicle                         259640
Narcotic / Drug Law Violations             180360
Fraud                                      176292
Burglary Residential                       122241
Aggravated Assault No Firearm              103848
Motor Vehicle Theft                         91499
Robbery No Firearm                          71680
DRIVING UNDER THE INFLUENCE                 66617
Robbery Firearm                             56052
Aggravated Assault Firearm                  48277
Disorderly Conduct                          44115
Weapon Violations                           38339
Burglary Non-Residential                    35676
Other Sex Offenses (Not Commercialized)     21524
Rape                                        17617
Prostitution and Commercialized Vice        15934


In [10]:
data.head()

Unnamed: 0,dispatch_date,crime_type,point_x,point_y,dispatch_day,dispatch_month,dispatch_year,dispatch_dayOfWeek,street_name,block_number
0,2023-10-04,Thefts,-75.232271,39.972757,4,10,2023,2,BLOCK N,1300
1,2023-10-04,DRIVING UNDER THE INFLUENCE,-75.178605,40.030809,4,10,2023,2,BLOCK W,200
2,2023-10-04,All Other Offenses,-75.226049,39.971074,4,10,2023,2,BLOCK N,600
3,2023-10-04,All Other Offenses,-75.121491,40.001767,4,10,2023,2,BLOCK ARBOR,3400
4,2023-10-04,Other Assaults,-75.143587,40.044345,4,10,2023,2,BLOCK N,5900


# Feature Engineering

Distance of crime to nearest police station

In [11]:
import json

min_distances = json.load(open("data\min_distances.json"))["min_distances"]
data["distance_to_nearest_police_station"] = min_distances
data.head()

Unnamed: 0,dispatch_date,crime_type,point_x,point_y,dispatch_day,dispatch_month,dispatch_year,dispatch_dayOfWeek,street_name,block_number,distance_to_nearest_police_station
0,2023-10-04,Thefts,-75.232271,39.972757,4,10,2023,2,BLOCK N,1300,0.010203
1,2023-10-04,DRIVING UNDER THE INFLUENCE,-75.178605,40.030809,4,10,2023,2,BLOCK W,200,0.00752
2,2023-10-04,All Other Offenses,-75.226049,39.971074,4,10,2023,2,BLOCK N,600,0.014742
3,2023-10-04,All Other Offenses,-75.121491,40.001767,4,10,2023,2,BLOCK ARBOR,3400,0.007706
4,2023-10-04,Other Assaults,-75.143587,40.044345,4,10,2023,2,BLOCK N,5900,0.000326


In [12]:
weather = pd.read_csv('data/weather.csv')
# Droping the first two rows
weather_cleaned = weather.drop([0, 1]).reset_index(drop=True)

# Setting the third row as the header
weather_cleaned.columns = weather.iloc[1]

# Dropping the now redundant third row
weather_cleaned = weather_cleaned.drop(2).reset_index(drop=True)

# Displaying the first few rows of the cleaned weather dataframe
weather_cleaned.head()

1,time,temperature_2m_max (°F),temperature_2m_min (°F),temperature_2m_mean (°F),precipitation_sum (mm),precipitation_hours (h)
0,2006-01-01,45.5,30.5,36.5,0.0,0.0
1,2006-01-02,46.9,31.1,38.0,12.6,9.0
2,2006-01-04,42.7,31.5,36.3,0.3,1.0
3,2006-01-05,50.5,37.5,42.5,0.0,0.0
4,2006-01-06,41.3,32.6,37.0,0.0,0.0


In [13]:
# Converting the 'time' column in weather_cleaned to datetime format
weather_cleaned['time'] = pd.to_datetime(weather_cleaned['time'])

# Converting the 'dispatch_date' column in data to datetime format
data['dispatch_date'] = pd.to_datetime(data['dispatch_date'])

# Merging the two dataframes based on the date
merged_data = pd.merge(data, weather_cleaned, left_on='dispatch_date', right_on='time', how='left')
data = merged_data

# Displaying the first few rows of the merged dataframe
data.head()

Unnamed: 0,dispatch_date,crime_type,point_x,point_y,dispatch_day,dispatch_month,dispatch_year,dispatch_dayOfWeek,street_name,block_number,distance_to_nearest_police_station,time,temperature_2m_max (°F),temperature_2m_min (°F),temperature_2m_mean (°F),precipitation_sum (mm),precipitation_hours (h)
0,2023-10-04,Thefts,-75.232271,39.972757,4,10,2023,2,BLOCK N,1300,0.010203,2023-10-04,82.0,60.2,69.9,0.0,0.0
1,2023-10-04,DRIVING UNDER THE INFLUENCE,-75.178605,40.030809,4,10,2023,2,BLOCK W,200,0.00752,2023-10-04,82.0,60.2,69.9,0.0,0.0
2,2023-10-04,All Other Offenses,-75.226049,39.971074,4,10,2023,2,BLOCK N,600,0.014742,2023-10-04,82.0,60.2,69.9,0.0,0.0
3,2023-10-04,All Other Offenses,-75.121491,40.001767,4,10,2023,2,BLOCK ARBOR,3400,0.007706,2023-10-04,82.0,60.2,69.9,0.0,0.0
4,2023-10-04,Other Assaults,-75.143587,40.044345,4,10,2023,2,BLOCK N,5900,0.000326,2023-10-04,82.0,60.2,69.9,0.0,0.0


In [14]:
unemp = pd.read_csv('data/unemployment rate.csv')
# Converting 'label' column in unemp dataframe to string type
unemp['label'] = unemp['label'].astype(str)

# Extracting month and year from dispatch_date
merged_data['Year-Month'] = merged_data['dispatch_date'].dt.strftime('%Y-%m')

# Merging with unemployment data
merged_data = pd.merge(merged_data, unemp, left_on='Year-Month', right_on='label', how='left')

# If there are any missing values, filling them with the most recent unemployment rate
most_recent_rate = unemp['Unemployment Rate of a Population'].iloc[0]
merged_data['Unemployment Rate of a Population'].fillna(most_recent_rate, inplace=True)

# Dropping unnecessary columns
merged_data.drop(columns=["label", "dispatch_date", "label", "Year-Month"], inplace=True)
merged_data.rename({"Unemployment Rate of a Population": "Unemployment Rate"})
data = merged_data
# Displaying the cleaned dataframe
data.head()

Unnamed: 0,crime_type,point_x,point_y,dispatch_day,dispatch_month,dispatch_year,dispatch_dayOfWeek,street_name,block_number,distance_to_nearest_police_station,time,temperature_2m_max (°F),temperature_2m_min (°F),temperature_2m_mean (°F),precipitation_sum (mm),precipitation_hours (h),Unemployment Rate of a Population
0,Thefts,-75.232271,39.972757,4,10,2023,2,BLOCK N,1300,0.010203,2023-10-04,82.0,60.2,69.9,0.0,0.0,5.4
1,DRIVING UNDER THE INFLUENCE,-75.178605,40.030809,4,10,2023,2,BLOCK W,200,0.00752,2023-10-04,82.0,60.2,69.9,0.0,0.0,5.4
2,All Other Offenses,-75.226049,39.971074,4,10,2023,2,BLOCK N,600,0.014742,2023-10-04,82.0,60.2,69.9,0.0,0.0,5.4
3,All Other Offenses,-75.121491,40.001767,4,10,2023,2,BLOCK ARBOR,3400,0.007706,2023-10-04,82.0,60.2,69.9,0.0,0.0,5.4
4,Other Assaults,-75.143587,40.044345,4,10,2023,2,BLOCK N,5900,0.000326,2023-10-04,82.0,60.2,69.9,0.0,0.0,5.4


In [20]:
data.drop(columns=["time"], inplace=True)

Taking care of missing values

In [25]:
data.isna().sum()

crime_type                                 0
point_x                                43047
point_y                                43047
dispatch_day                               0
dispatch_month                             0
dispatch_year                              0
dispatch_dayOfWeek                         0
street_name                           270169
block_number                          259602
distance_to_nearest_police_station         0
temperature_2m_max (°F)                  596
temperature_2m_min (°F)                  596
temperature_2m_mean (°F)                 596
precipitation_sum (mm)                   596
precipitation_hours (h)                  596
Unemployment Rate of a Population          0
dtype: int64

In [28]:
data.dropna(inplace=True)

In [29]:
data.to_csv("data\data_full_features.csv")

# Preprocessing Pipeline

* One hot encoding crime types
* Nominally encoding street names
* Standardizing (Z score) different numerical features

In [31]:
data_pipeline = ColumnTransformer([
    ("numerical", StandardScaler(), ["point_x", "point_y", "distance_to_nearest_police_station", 
                                     "temperature_2m_max (°F)", "temperature_2m_min (°F)", 
                                     "temperature_2m_mean (°F)", "precipitation_sum (mm)", "precipitation_hours (h)",
                                     "Unemployment Rate of a Population", "dispatch_dayOfWeek", "dispatch_day", 
                                     "dispatch_month", "dispatch_year", "block_number"]),
    ("categorical", OneHotEncoder(), ["crime_type"]),
    ("ordinal", OrdinalEncoder(), ["street_name"])
])

transformed = data_pipeline.fit_transform(data)
transformed

array([[-1.39420859e+00, -4.53719594e-01, -8.11058867e-01, ...,
         0.00000000e+00,  0.00000000e+00,  2.07000000e+03],
       [-4.97176810e-01,  8.31935666e-01, -1.04910579e+00, ...,
         0.00000000e+00,  0.00000000e+00,  2.98900000e+03],
       [-1.29021644e+00, -4.90990957e-01, -4.08383318e-01, ...,
         0.00000000e+00,  0.00000000e+00,  2.07000000e+03],
       ...,
       [-1.36689669e+00, -1.23419800e+00, -3.62725355e-01, ...,
         0.00000000e+00,  1.00000000e+00,  2.55400000e+03],
       [-1.39572283e+00, -8.84983582e-01, -1.58063510e+00, ...,
         0.00000000e+00,  1.00000000e+00,  2.55400000e+03],
       [-1.39623732e+00, -6.64735882e-01, -8.76361465e-01, ...,
         0.00000000e+00,  0.00000000e+00,  5.52000000e+02]])

In [32]:
transformed.shape

(2841446, 45)