# Capstone: Exploratory Prediction Modeling

In [2]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns

import time

# Export dataFrame's as images
import dataframe_image as dfi

# import project utils
import sys
sys.path.append('../src')

import data_utils
from data_utils import Config

import graph_utils

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.dummy import DummyClassifier
from sklearn.metrics import accuracy_score, log_loss

from xgboost import XGBClassifier
import xgboost as xgb

In [3]:
# Configure logging
import logging
logging.basicConfig(level=logging.INFO, format='%(levelname)s: %(message)s')

# logging.getLogger().setLevel(logging.DEBUG)
# logging.getLogger().setLevel(logging.INFO)

In [4]:
def time_secs_to_msg(lapse_time_secs, mins_label='m', secs_label='s'):
    if lapse_time_secs <= 60:
        return f'{lapse_time_secs%60:.2f}{secs_label}'
    else:
        return f'{lapse_time_secs//60:,.0f}{mins_label} {lapse_time_secs%60:.2f}{secs_label}'

## The Data: San Francisco Police Department Incident Reports

### Read the Data

In [7]:
# Which dataset to work from?

sample_file = data_utils.select_sample_csv_file(pct=10)
print(f'Selected sample file: {sample_file}')

Selected sample file: ../data/incidents_clean_10_pct.csv


In [8]:
current_raw_df, current_clean_df = data_utils.get_clean_data_from_csv(sample_file)

Reading file: ../data/incidents_clean_10_pct.csv ... Done: 89,458 rows, 37 columns
... Converting datetime to timeseries ... Done
... Setting index to datetime ... Done
Done


In [9]:
data = data_utils.preprocess_data(current_raw_df.copy())

Pre-processing ... 
... Dropping unwanted columns ... 
... preprocess_drop_cols: Column Unnamed: 0 dropped
... preprocess_drop_cols: Column esncag_-_boundary_file dropped
... preprocess_drop_cols: Column central_market/tenderloin_boundary_polygon_-_updated dropped
... preprocess_drop_cols: Column civic_center_harm_reduction_project_boundary dropped
... preprocess_drop_cols: Column hsoc_zones_as_of_2018-06-05 dropped
... preprocess_drop_cols: Column invest_in_neighborhoods_(iin)_areas dropped
... preprocess_drop_cols: Column report_type_code dropped
... preprocess_drop_cols: Column report_type_description dropped
... preprocess_drop_cols: Column filed_online dropped
... preprocess_drop_cols: Column intersection dropped
... preprocess_drop_cols: Column cnn dropped
... preprocess_drop_cols: Column point dropped
... preprocess_drop_cols: Column supervisor_district dropped
... preprocess_drop_cols: Column supervisor_district_2012 dropped
... preprocess_drop_cols: Column current_supervisor_d

In [10]:
# Fix data value artifacts that were discovered during EDA
data = data_utils.fix_data_artifacts(data)

Fixing data artifacts (in-place) ... 
... Category column:
    ..."Human Trafficking*"
    ..."Motor Vehicle Theft"
    ..."Weapons Offence"
Done


In [11]:
data.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 82888 entries, 2024-08-01 08:01:00 to 2018-10-02 16:53:00
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   date             82888 non-null  object 
 1   time             82888 non-null  object 
 2   year             82888 non-null  int64  
 3   day_of_week      82888 non-null  object 
 4   category         82888 non-null  object 
 5   resolution       82888 non-null  object 
 6   police_district  82888 non-null  object 
 7   neighborhood     82888 non-null  object 
 8   latitude         82888 non-null  float64
 9   longitude        82888 non-null  float64
dtypes: float64(2), int64(1), object(7)
memory usage: 7.0+ MB


## Summary of EDA

After cleaning the data and performing basic EDA, we have established the following:

1. Target variable `category`
   * Evenly spread across time
   * Incidence of crimes is extremely skewed/unbalanced by category. Larceny (29.02%) by far outweighing the other top-10 categories with each being in the single digits
3. Features impacting `category`
   * Affected by incident time and date components: date, time, day of week, month, year, etc
   * Affected by police disctrict
   * Affect by latitude and logitude (TODO: need visualization)
4. We artificially removed nulls (TODO: will come back to impute data later)

## Feature Engineering

In [15]:
data.head(2)

Unnamed: 0_level_0,date,time,year,day_of_week,category,resolution,police_district,neighborhood,latitude,longitude
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2024-08-01 08:01:00,2024/08/01,08:01,2024,Thursday,Other Miscellaneous,Open or Active,Mission,Mission,37.768272,-122.419983
2021-11-25 23:30:00,2021/11/25,23:30,2021,Thursday,Burglary,Open or Active,Northern,Haight Ashbury,37.773757,-122.432467


In [16]:
data.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 82888 entries, 2024-08-01 08:01:00 to 2018-10-02 16:53:00
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   date             82888 non-null  object 
 1   time             82888 non-null  object 
 2   year             82888 non-null  int64  
 3   day_of_week      82888 non-null  object 
 4   category         82888 non-null  object 
 5   resolution       82888 non-null  object 
 6   police_district  82888 non-null  object 
 7   neighborhood     82888 non-null  object 
 8   latitude         82888 non-null  float64
 9   longitude        82888 non-null  float64
dtypes: float64(2), int64(1), object(7)
memory usage: 7.0+ MB


### Encoding: Time-based columns

Let's unpack the date and time into their components that are still missing so there is less to encode:

In [19]:
data['hour'] = data.index.map(lambda x: x.hour)
data['minute'] = data.index.map(lambda x: x.minute)
data['day'] = data.index.map(lambda x: x.day)
data['month'] = data.index.map(lambda x: x.month)

Now let's encode day_of_week to numeric values:

In [21]:
enc_dow = LabelEncoder()
enc_dow.fit(data.day_of_week.unique())
data['dow'] = enc_dow.transform(data.day_of_week)

Let's mark the redundant columns to be dropped after feature engineering:

In [23]:
drop_encoded_cols = ['date', 'time', 'day_of_week']

### Encoding: Resolution

We will also drop the resolution column since it doesn't impact crime prediction:

In [26]:
data.resolution.value_counts()

resolution
Open or Active          66265
Cite or Arrest Adult    16623
Name: count, dtype: int64

In [27]:
drop_encoded_cols.append('resolution')

### Encoding: Category

In [29]:
enc_cat = LabelEncoder()
enc_cat.fit(data.category.unique())
data.category = enc_cat.transform(data.category)

### Encoding: Police District

In [31]:
enc_pd = LabelEncoder()
enc_pd.fit(data.police_district.unique())
data['pd'] = enc_pd.transform(data.police_district)

### Encoding: Neighborhood

In [33]:
enc_hood = LabelEncoder()
enc_hood.fit(data.neighborhood.unique())
data.neighborhood = enc_hood.transform(data.neighborhood)

### Dropping Redundant Columns

We can now drop the redundant encoded columns:

In [36]:
drop_encoded_cols.append('police_district')

print(f'Dropping encoded columns: {drop_encoded_cols}')
data.drop(columns=drop_encoded_cols, inplace=True)

Dropping encoded columns: ['date', 'time', 'day_of_week', 'resolution', 'police_district']


In [37]:
data.head(2)

Unnamed: 0_level_0,year,category,neighborhood,latitude,longitude,hour,minute,day,month,dow,pd
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2024-08-01 08:01:00,2024,26,18,37.768272,-122.419983,8,1,1,8,4,3
2021-11-25 23:30:00,2021,2,8,37.773757,-122.432467,23,30,25,11,4,4


In [38]:
data.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 82888 entries, 2024-08-01 08:01:00 to 2018-10-02 16:53:00
Data columns (total 11 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   year          82888 non-null  int64  
 1   category      82888 non-null  int64  
 2   neighborhood  82888 non-null  int64  
 3   latitude      82888 non-null  float64
 4   longitude     82888 non-null  float64
 5   hour          82888 non-null  int64  
 6   minute        82888 non-null  int64  
 7   day           82888 non-null  int64  
 8   month         82888 non-null  int64  
 9   dow           82888 non-null  int64  
 10  pd            82888 non-null  int64  
dtypes: float64(2), int64(9)
memory usage: 7.6 MB


## Data Preparation

### Create Train/Test Splits

In [41]:
X = data.drop('category', axis='columns')
y = data['category']

In [42]:
# OneHot Encode the features and drop the first value to reduce multicollinearity
X = pd.get_dummies(X, drop_first=True)

In [43]:
# Consistent random_state for the project
print(f'Project-wide random_state: {Config.RANDOM_STATE}')

Project-wide random_state: 42


In [44]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, 
                                                    stratify=y, random_state=Config.RANDOM_STATE)

In [45]:
# spot-check feature encoding
X.T.iloc[:, 0:5]

datetime,2024-08-01 08:01:00,2021-11-25 23:30:00,2018-06-20 21:00:00,2022-07-06 12:41:00,2021-02-27 23:02:00
year,2024.0,2021.0,2018.0,2022.0,2021.0
neighborhood,18.0,8.0,23.0,33.0,19.0
latitude,37.768272,37.773757,37.723642,37.777457,37.770063
longitude,-122.419983,-122.432467,-122.461251,-122.413158,-122.403878
hour,8.0,23.0,21.0,12.0,23.0
minute,1.0,30.0,0.0,41.0,2.0
day,1.0,25.0,20.0,6.0,27.0
month,8.0,11.0,6.0,7.0,2.0
dow,4.0,4.0,6.0,6.0,2.0
pd,3.0,4.0,8.0,9.0,7.0


### Feature Scaling

In [47]:
# Scale the data - we'll use StandardScaler for the baseline model
logging.debug('Scaling data')
scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train)

## Model Exploration

We will now evaluate different models for predicting the Crime Category from our features:

In [50]:
X_train.columns

Index(['year', 'neighborhood', 'latitude', 'longitude', 'hour', 'minute',
       'day', 'month', 'dow', 'pd'],
      dtype='object')

We will evaluate the following models:

* Logistic Regression with L1 Regularization
* K Nearest Neighbors
* Stochastic Gradient Descent

### Evaluation Metrics

In this project, we are predicting or classifyig across 49 crime categories. We will use two evaluation metrics to compare our models:

1. **Accuracy**: Measures the proportion of correct predictions over all predictions made. The accuracy benchmark is 1/49 or 2.04% given our crime categories
2. **Log_Loss**: Measures the accuracy of a classifier by penalizing false classifications. It does this by taking the negative logarithm of the predicted probability for the true class. The goal is to minimize this loss, meaning that higher probabilities are assigned to the correct classes
   * TODO: Benchmark???

While accuracy provides a simple measure of correctness, log-loss offers a more nuanced view by considering how confident those predictions are. We'll use them together for a comprehensive evaluation and to learn more about them

### Baseline DummyClassifier

In [55]:
results = []

In [56]:
name='DummyClassifier'
print(f'{name}: Starting', flush=True)
start_time = time.time()

all_classes = np.unique(y)

base = DummyClassifier(strategy='uniform', random_state=Config.RANDOM_STATE)#, classes=all_classes)
base.fit(X_train_scaled, y_train)
y_preds = base.predict(X_test)
pred_probs = base.predict_proba(X_test)
base_acc = accuracy_score(y_test, y_preds)
base_loss = log_loss(y_test, pred_probs)

label='Baseline: DummyClassifier - strategy=uniform'
results.append([label, base_acc, base_loss])
print(f'{label}: accuracy: {base_acc}, log_loss: {base_loss}')

print(f'{name}: Done: {time_secs_to_msg(time.time()-start_time)}')

DummyClassifier: Starting
Baseline: DummyClassifier - strategy=uniform: accuracy: 0.022740982024369645, log_loss: 3.8066624897703183
DummyClassifier: Done: 0.31s


### LogisticRegresson (L1)

In [58]:
name='LogisticRegression'
print(f'{name}: Starting', flush=True)
start_time = time.time()

lr = LogisticRegression(penalty='l1', solver='saga',
                        max_iter=1000, verbose=1, n_jobs=3, random_state=Config.RANDOM_STATE)

lr.fit(X_train_scaled, y_train)
y_preds = lr.predict(X_test)
pred_probs = lr.predict_proba(X_test)
lr_acc = accuracy_score(y_test, y_preds)
lr_loss = log_loss(y_test, pred_probs)

label='LogisticRegression (L1)'
results.append([label, lr_acc, lr_loss])
print(f'{label}: accuracy: {lr_acc}, log_loss: {lr_loss}')

print(f'{name}: Done: {time_secs_to_msg(time.time()-start_time)}')

LogisticRegression: Starting


[Parallel(n_jobs=3)]: Using backend ThreadingBackend with 3 concurrent workers.


convergence after 60 epochs took 18 seconds
LogisticRegression (L1): accuracy: 0.00030160453613222344, log_loss: 36.03278245975621
LogisticRegression: Done: 17.87s




### K-Nearest Neighbors

In [60]:
name='KNeighborsClassifier'
print(f'{name}: Starting', flush=True)
start_time = time.time()

knn = KNeighborsClassifier()
knn.fit(X_train_scaled, y_train)
y_preds = knn.predict(X_test)
pred_probs = knn.predict_proba(X_test)
knn_acc = accuracy_score(y_test, y_preds)
knn_loss = log_loss(y_test, pred_probs)

label='K-Nearest Neighbors'
results.append([label, knn_acc, knn_loss])
print(f'{label}: accuracy: {knn_acc}, log_loss: {knn_loss}')

print(f'{name}: Done: {time_secs_to_msg(time.time()-start_time)}')

KNeighborsClassifier: Starting




K-Nearest Neighbors: accuracy: 0.2827844130775727, log_loss: 20.035105376974474
KNeighborsClassifier: Done: 9.06s


### Random Forest Ensemble

In [62]:
name='RandomForestClassifier'
print(f'{name}: Starting', flush=True)
start_time = time.time()

rf = RandomForestClassifier(n_estimators=500, max_depth=15,
                            min_samples_leaf=5, min_samples_split=25, 
                            random_state=Config.RANDOM_STATE, verbose=1, n_jobs=2)
rf.fit(X_train_scaled, y_train)
y_preds = rf.predict(X_test)
pred_probs = rf.predict_proba(X_test)
rf_acc = accuracy_score(y_test, y_preds)
rf_loss = log_loss(y_test, pred_probs)

label='Random Forest'
results.append([label, rf_acc, rf_loss])
print(f'{label}: accuracy: {rf_acc}, log_loss: {rf_loss}')

print(f'{name}: Done: {time_secs_to_msg(time.time()-start_time)}')

RandomForestClassifier: Starting


[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    2.7s
[Parallel(n_jobs=2)]: Done 196 tasks      | elapsed:   11.5s
[Parallel(n_jobs=2)]: Done 446 tasks      | elapsed:   26.9s
[Parallel(n_jobs=2)]: Done 500 out of 500 | elapsed:   32.7s finished
[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    0.5s
[Parallel(n_jobs=2)]: Done 196 tasks      | elapsed:    2.4s
[Parallel(n_jobs=2)]: Done 446 tasks      | elapsed:    3.1s
[Parallel(n_jobs=2)]: Done 500 out of 500 | elapsed:    3.2s finished
[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    0.1s
[Parallel(n_jobs=2)]: Done 196 tasks      | elapsed:    0.3s


Random Forest: accuracy: 0.2896609965013874, log_loss: 2.9338705945673396
RandomForestClassifier: Done: 36.94s


[Parallel(n_jobs=2)]: Done 446 tasks      | elapsed:    0.7s
[Parallel(n_jobs=2)]: Done 500 out of 500 | elapsed:    0.8s finished


### XGBoost Ensemble

In [64]:
name='XGBClassifier'
print(f'{name}: Starting', flush=True)
start_time = time.time()

xgb_clf = XGBClassifier(n_estimators=500, objective="multi:softprob", 
                        verbose=1, n_jobs=2, random_state=Config.RANDOM_STATE)
xgb_clf.fit(X_train, y_train)
y_preds = xgb_clf.predict(X_test)
pred_probs = xgb_clf.predict_proba(X_test)
xgb_acc = accuracy_score(y_test, y_preds)
xgb_loss = log_loss(y_test, pred_probs)

label='XGBoost'
results.append([label, xgb_acc, xgb_loss])
print(f'{label}: accuracy: {xgb_acc}, log_loss: {xgb_loss}')

print(f'{name}: Done: {time_secs_to_msg(time.time()-start_time)}')

XGBClassifier: Starting


Parameters: { "verbose" } are not used.



XGBoost: accuracy: 0.30190614066835564, log_loss: 2.9832450078660924
XGBClassifier: Done: 3m 18.16s


In [65]:
results_df = pd.DataFrame(results,
                          columns=['Label', 'Accuracy', 'Log_Loss']
                         ).set_index('Label')


results_df

Unnamed: 0_level_0,Accuracy,Log_Loss
Label,Unnamed: 1_level_1,Unnamed: 2_level_1
Baseline: DummyClassifier - strategy=uniform,0.022741,3.806662
LogisticRegression (L1),0.000302,36.032782
K-Nearest Neighbors,0.282784,20.035105
Random Forest,0.289661,2.933871
XGBoost,0.301906,2.983245


In [66]:
if not Config.SUPPRESS_OUTPUT_FILES:
    dfi.export(results_df, '../images/results.png')