# Capstone: Exploratory Prediction Modeling

In [2]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns


# Import utilities
# import pathlib
import time

# Export dataFrame's as images
import dataframe_image as dfi

# import project utils
import sys
sys.path.append('../src')

import data_utils
from data_utils import Config

import graph_utils

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.dummy import DummyClassifier
from sklearn.metrics import accuracy_score, precision_score, f1_score, recall_score, roc_auc_score, log_loss
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay, roc_curve

from xgboost import XGBClassifier
import xgboost as xgb

In [3]:
# Configure logging
import logging
logging.basicConfig(level=logging.INFO, format='%(levelname)s: %(message)s')

# logging.getLogger().setLevel(logging.DEBUG)
# logging.getLogger().setLevel(logging.INFO)

In [4]:
def time_secs_to_msg(lapse_time_secs, mins_label='m', secs_label='s'):
    if lapse_time_secs <= 60:
        return f'{lapse_time_secs%60:.2f}{secs_label}'
    else:
        return f'{lapse_time_secs//60:,.0f}{mins_label} {lapse_time_secs%60:.2f}{secs_label}'

## The Data: San Francisco Police Department Incident Reports

### Read the Data

In [7]:
# Which dataset to work from?

sample_file = data_utils.select_sample_csv_file(pct=100)
# sample_file = data_utils.select_sample_csv_file(pct=10)
print(f'Selected sample file: {sample_file}')

Selected sample file: ../data/incidents_clean.csv


In [8]:
current_raw_df, current_clean_df = data_utils.get_clean_data_from_csv(sample_file)

Reading file: ../data/incidents_clean.csv ... Done: 894,585 rows, 36 columns
... Converting datetime to timeseries ... Done
... Setting index to datetime ... Done
Done


In [9]:
data = data_utils.preprocess_data(current_raw_df.copy())

Pre-processing ... 
... Dropping unwanted columns ... 
... preprocess_drop_cols: Column Unnamed: 0 not dropped: KeyError("['Unnamed: 0'] not found in axis")
... preprocess_drop_cols: Column esncag_-_boundary_file dropped
... preprocess_drop_cols: Column central_market/tenderloin_boundary_polygon_-_updated dropped
... preprocess_drop_cols: Column civic_center_harm_reduction_project_boundary dropped
... preprocess_drop_cols: Column hsoc_zones_as_of_2018-06-05 dropped
... preprocess_drop_cols: Column invest_in_neighborhoods_(iin)_areas dropped
... preprocess_drop_cols: Column report_type_code dropped
... preprocess_drop_cols: Column report_type_description dropped
... preprocess_drop_cols: Column filed_online dropped
... preprocess_drop_cols: Column intersection dropped
... preprocess_drop_cols: Column cnn dropped
... preprocess_drop_cols: Column point dropped
... preprocess_drop_cols: Column supervisor_district dropped
... preprocess_drop_cols: Column supervisor_district_2012 dropped
...

In [10]:
# Fix data value artifacts that were discovered during EDA
data = data_utils.fix_data_artifacts(data)

Fixing data artifacts (in-place) ... 
... Category column:
    ..."Human Trafficking*"
    ..."Motor Vehicle Theft"
    ..."Weapons Offence"
Done


In [11]:
data.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 829328 entries, 2023-03-11 14:00:00 to 2023-03-21 17:42:00
Data columns (total 10 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   date             829328 non-null  object 
 1   time             829328 non-null  object 
 2   year             829328 non-null  int64  
 3   day_of_week      829328 non-null  object 
 4   category         829328 non-null  object 
 5   resolution       829328 non-null  object 
 6   police_district  829328 non-null  object 
 7   neighborhood     829328 non-null  object 
 8   latitude         829328 non-null  float64
 9   longitude        829328 non-null  float64
dtypes: float64(2), int64(1), object(7)
memory usage: 69.6+ MB


## Summary of EDA

After cleaning the data and performing basic EDA, we have established the following:

1. Target variable `category`
   * Evenly spread across time
   * Incidence of crimes is extremely skewed/unbalanced by category. Larceny (29.02%) by far outweighing the other top-10 categories with each being in the single digits
3. Features impacting `category`
   * Affected by incident time and date components: date, time, day of week, month, year, etc
   * Affected by police disctrict
   * Affect by latitude and logitude (TODO: need visualization)
4. We artificially removed nulls (TODO: will come back to impute data later)

## Feature Engineering

In [15]:
data.head(2)

Unnamed: 0_level_0,date,time,year,day_of_week,category,resolution,police_district,neighborhood,latitude,longitude
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2023-03-11 14:00:00,2023/03/11,14:00,2023,Saturday,Assault,Open or Active,Park,Golden Gate Park,37.772895,-122.454285
2022-06-27 12:00:00,2022/06/27,12:00,2022,Monday,Lost Property,Open or Active,Central,Financial District/South Beach,37.787359,-122.408227


In [16]:
data.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 829328 entries, 2023-03-11 14:00:00 to 2023-03-21 17:42:00
Data columns (total 10 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   date             829328 non-null  object 
 1   time             829328 non-null  object 
 2   year             829328 non-null  int64  
 3   day_of_week      829328 non-null  object 
 4   category         829328 non-null  object 
 5   resolution       829328 non-null  object 
 6   police_district  829328 non-null  object 
 7   neighborhood     829328 non-null  object 
 8   latitude         829328 non-null  float64
 9   longitude        829328 non-null  float64
dtypes: float64(2), int64(1), object(7)
memory usage: 69.6+ MB


### Encoding: Time-based columns

Let's unpack the date and time into their components that are still missing so there is less to encode:

In [19]:
data['hour'] = data.index.map(lambda x: x.hour)
data['minute'] = data.index.map(lambda x: x.minute)
data['day'] = data.index.map(lambda x: x.day)
data['month'] = data.index.map(lambda x: x.month)

Now let's encode day_of_week to numeric values:

In [21]:
enc_dow = LabelEncoder()
enc_dow.fit(data.day_of_week.unique())
data['dow'] = enc_dow.transform(data.day_of_week)

Let's mark the redundant columns to be dropped after feature engineering:

In [23]:
drop_encoded_cols = ['date', 'time', 'day_of_week']

### Encoding: Resolution

We will also drop the resolution column since it doesn't impact crime prediction:

In [26]:
data.resolution.value_counts()

resolution
Open or Active          662581
Cite or Arrest Adult    166747
Name: count, dtype: int64

In [27]:
drop_encoded_cols.append('resolution')

### Encoding: Category

In [29]:
enc_cat = LabelEncoder()
enc_cat.fit(data.category.unique())
data.category = enc_cat.transform(data.category)

### Encoding: Police District

In [31]:
enc_pd = LabelEncoder()
enc_pd.fit(data.police_district.unique())
data['pd'] = enc_pd.transform(data.police_district)

### Encoding: Neighborhood

In [33]:
enc_hood = LabelEncoder()
enc_hood.fit(data.neighborhood.unique())
data.neighborhood = enc_hood.transform(data.neighborhood)

### Dropping Redundant Columns

We can now drop the redundant encoded columns:

In [36]:
drop_encoded_cols.append('police_district')

print(f'Dropping encoded columns: {drop_encoded_cols}')
data.drop(columns=drop_encoded_cols, inplace=True)

Dropping encoded columns: ['date', 'time', 'day_of_week', 'resolution', 'police_district']


In [37]:
data.head(2)

Unnamed: 0_level_0,year,category,neighborhood,latitude,longitude,hour,minute,day,month,dow,pd
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2023-03-11 14:00:00,2023,1,7,37.772895,-122.454285,14,0,11,3,2,5
2022-06-27 12:00:00,2022,18,5,37.787359,-122.408227,12,0,27,6,1,1


In [38]:
data.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 829328 entries, 2023-03-11 14:00:00 to 2023-03-21 17:42:00
Data columns (total 11 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   year          829328 non-null  int64  
 1   category      829328 non-null  int64  
 2   neighborhood  829328 non-null  int64  
 3   latitude      829328 non-null  float64
 4   longitude     829328 non-null  float64
 5   hour          829328 non-null  int64  
 6   minute        829328 non-null  int64  
 7   day           829328 non-null  int64  
 8   month         829328 non-null  int64  
 9   dow           829328 non-null  int64  
 10  pd            829328 non-null  int64  
dtypes: float64(2), int64(9)
memory usage: 75.9 MB


## Data Preparation

### Create Train/Test Splits

In [41]:
X = data.drop('category', axis='columns')
y = data['category']

In [42]:
# OneHot Encode the features and drop the first value to reduce multicollinearity
X = pd.get_dummies(X, drop_first=True)

In [43]:
# Consistent random_state for the project
print(f'Project-wide random_state: {Config.RANDOM_STATE}')

Project-wide random_state: 42


In [44]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, 
                                                    stratify=y, random_state=Config.RANDOM_STATE)

In [45]:
# spot-check feature encoding
X.T.iloc[:, 0:5]

datetime,2023-03-11 14:00:00,2022-06-27 12:00:00,2023-03-16 17:30:00,2023-03-21 15:50:00,2021-08-22 09:40:00
year,2023.0,2022.0,2023.0,2023.0,2021.0
neighborhood,7.0,5.0,28.0,35.0,26.0
latitude,37.772895,37.787359,37.76229,37.787038,37.793977
longitude,-122.454285,-122.408227,-122.401324,-122.418271,-122.429804
hour,14.0,12.0,17.0,15.0,9.0
minute,0.0,0.0,30.0,50.0,40.0
day,11.0,27.0,16.0,21.0,22.0
month,3.0,6.0,3.0,3.0,8.0
dow,2.0,1.0,4.0,5.0,3.0
pd,5.0,1.0,0.0,4.0,4.0


### Feature Scaling

In [47]:
# Scale the data - we'll use StandardScaler for the baseline model
logging.debug('Scaling data')
scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

## Model Exploration

We will now evaluate different models for predicting the Crime Category from our features:

In [50]:
X_train.columns

Index(['year', 'neighborhood', 'latitude', 'longitude', 'hour', 'minute',
       'day', 'month', 'dow', 'pd'],
      dtype='object')

The task of classifying the incident types based on a set of historical attrirbutes (features) and predicting on similar attributes is a **multiclass classification** problem. We will now experiment on some ML models that are generally used for similar problems to see what would be the best choice for us.

We will evaluate the following models:

* Simple classification models
  * `DummyClassifier` to get a baseline for our project
  * `LogisticRegression` with L1 Regularization
* Multiclass classifiers
  * `KNeighborsClassifier`
* Ensemble methods: Since our dataset has high variability with a lot of numerical and cagtegorical features with a range of mean and variance, we plan to employ ensemble methods and tune them for best results
  * `RandomForestClassifier`
  * `XGBClassifier`: We considered `XGLite` but selected XGBoost as it provides better model explainability features like SHAP values, which we expect to be able to use in explaining our results

### Evaluation Metrics

In this project, we are predicting or classifyig across 49 crime categories. We will use two evaluation metrics to compare our models:

1. **Accuracy**: Measures the proportion of correct predictions over all predictions made. The accuracy benchmark is 1/49 or 2.04% given our crime categories
2. **Log_Loss**: Measures the accuracy of a classifier by penalizing false classifications. It does this by taking the negative logarithm of the predicted probability for the true class. The goal is to minimize this loss, meaning that higher probabilities are assigned to the correct classes
   * TODO: Benchmark???

While accuracy provides a simple measure of correctness, log-loss offers a more nuanced view by considering how confident those predictions are. We'll use them together for a comprehensive evaluation and to learn more about them

In [54]:
def build_results_row(name, model, Xtr, Xte, ytr, yte, use_best=False):
    """
    Given the model and training/test sets, builds a row of metrics for reporting the results

    :param name: Name/Description of model
    :param model: Fully constructed model instance - will call fit() and predict() to get metrics
    :param Xtr: X_train - scale before calling
    :param Xte: X_test - scale before calling
    :param ytr: Y_train set
    :param yteL: Y_test set
    """
    
    print(f'{name}: Starting', flush=True)
    start_time = time.time()

    # train the model
    clf = model.fit(Xtr, ytr)

    # if we're tuning then use best_estimator
    if use_best:
        clf = model.best_estimator_
        logging.debug(f'{name}: Best Model={clf}')
        logging.debug(f'{name}: Best Params={model.best_params_}')

    # Save fit time
    fit_time = time.time() - start_time
    logging.debug(f'{name}: Fitted')

    # get the predictions / probabilities
    y_preds = clf.predict(Xte)
    y_probs_full = clf.predict_proba(Xte)
    y_probs = y_probs_full[:, 1]

    print(f'>>> {yte.shape} {y_preds.shape} {y_probs_full.shape}')
    print(f'>>> {np.unique(yte)}')
    print(f'>>> {np.unique(y_preds)}')
    print(f'>>> {np.unique(y_probs_full)}')

    cm = confusion_matrix(yte, y_preds)
    logging.debug(f'{name}: Got preds/probs')

    # Get metrics
    row = {
        'Train Time': time_secs_to_msg(fit_time),
        'Train Accuracy': f'{model.score(Xtr, ytr)*100:.2f}%',
        'Test Accuracy': f'{model.score(Xte, yte)*100:.2f}%',
        'Precision': f'{precision_score(yte, y_preds, average="weighted")*100:.2f}%',  # for multi-class with imbalance
        'Recall': f'{recall_score(yte, y_preds, average="weighted")*100:.2f}%',
        'F1': f'{f1_score(yte, y_preds, average="weighted")*100:.2f}%',
        'AUC': f'{roc_auc_score(yte, y_probs_full, average="weighted", multi_class="ovr")*100:.2f}%',    # faster with imbalanced multi-class cases
        'Log_loss': f'{log_loss(yte, y_probs_full, labels=np.unique(yte))}',
        'preds': y_preds,
        'probs': y_probs,
        'cm': cm,
        'TN': f'{cm[0, 0]:,d}',
        'FP': f'{cm[0, 1]:,d}',
        'FN': f'{cm[1, 0]:,d}',
        'TP': f'{cm[1, 1]:,d}',
        'params': model.get_params(),
        'best_params': None,
        'best_model': clf,
    }
    if use_best:
        row.update({'best_params': model.best_params_})
        
    logging.debug(f'{name}: Got metrics')
    
    print(f'{name}: Done: {time_secs_to_msg(time.time()-start_time)}')

    return row

### Baseline DummyClassifier

In [56]:
results = []

n_estimators = 100

In [57]:
# let's start saving the results for reporting out
results_defaults = {}

# Reports won't print all the columns
report_cols = ['Train Time', 'Train Accuracy', 'Test Accuracy', 
               'Precision', 'Recall', 'F1', 'AUC', 'Log_loss', 'TN', 'FP', 'FN', 'TP']
report_cols_tuned = ['best_params', 'Train Time', 'Train Accuracy', 'Test Accuracy', 
                     'Precision', 'Recall', 'F1', 'AUC', 'Log_loss', 'TN', 'FP', 'FN', 'TP']

### DummyClassifier

In [59]:
name='DummyClassifier'
print(f'{name}: Starting', flush=True)
start_time = time.time()

base = DummyClassifier(strategy='uniform', random_state=Config.RANDOM_STATE)
base.fit(X_train_scaled, y_train)
y_preds = base.predict(X_test_scaled)
pred_probs = base.predict_proba(X_test_scaled)
base_acc = accuracy_score(y_test, y_preds)
base_loss = log_loss(y_test, pred_probs)

label='Baseline: DummyClassifier - strategy=uniform'
results.append([label, base_acc, base_loss, time_secs_to_msg(time.time()-start_time), base.get_params()])
print(f'{label}: accuracy: {base_acc}, log_loss: {base_loss}')

print(f'{name}: Done: {time_secs_to_msg(time.time()-start_time)}')

DummyClassifier: Starting
Baseline: DummyClassifier - strategy=uniform: accuracy: 0.022120265756695165, log_loss: 3.8066624897703183
DummyClassifier: Done: 0.40s


In [60]:
# Define the default models
models_dummy = {
    'DummyClassifier: uniform': DummyClassifier(strategy='uniform', random_state=Config.RANDOM_STATE),
    'DummyClassifier: most_frequent': DummyClassifier(strategy='most_frequent', random_state=Config.RANDOM_STATE),
    'DummyClassifier: stratified': DummyClassifier(strategy='stratified', random_state=Config.RANDOM_STATE),
}

In [61]:
for name, model in models_dummy.items():
    # Get metrics row for the report - will fit() and predict() to generate metrics
    results_defaults[name] = build_results_row(name, model, X_train_scaled, X_test_scaled, y_train, y_test)

DummyClassifier: uniform: Starting
>>> (165866,) (165866,) (165866, 45)
>>> [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44]
>>> [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44]
>>> [0.02222222]
DummyClassifier: uniform: Done: 2.34s
DummyClassifier: most_frequent: Starting
>>> (165866,) (165866,) (165866, 45)
>>> [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44]
>>> [16]
>>> [0. 1.]


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


DummyClassifier: most_frequent: Done: 2.76s
DummyClassifier: stratified: Starting
>>> (165866,) (165866,) (165866, 45)
>>> [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44]
>>> [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44]
>>> [0. 1.]
DummyClassifier: stratified: Done: 5.19s


In [62]:
# results_defaults_df = pd.DataFrame(results_defaults).T.sort_values(by='Test Accuracy', ascending=False)
results_defaults_df = pd.DataFrame(results_defaults).T.sort_values(by='Test Accuracy', ascending=False)

results_defaults_df[report_cols]

Unnamed: 0,Train Time,Train Accuracy,Test Accuracy,Precision,Recall,F1,AUC,Log_loss,TN,FP,FN,TP
DummyClassifier: most_frequent,0.07s,28.93%,28.93%,8.37%,28.93%,12.98%,50.00%,25.617535643729592,0,0,0,0
DummyClassifier: uniform,0.07s,2.21%,2.21%,11.62%,2.21%,3.12%,50.00%,3.8066624897703183,16,19,240,242
DummyClassifier: stratified,0.09s,11.55%,11.44%,11.46%,11.44%,11.45%,49.95%,31.920274739734527,1,35,43,722


In [63]:
# if not Config.SUPPRESS_OUTPUT_FILES:
#     dfi.export(results_defaults_df[report_cols], data_utils.Config.IMAGE_DIR / 'table_models_defaults.png')

In [64]:
results_defaults

{'DummyClassifier: uniform': {'Train Time': '0.07s',
  'Train Accuracy': '2.21%',
  'Test Accuracy': '2.21%',
  'Precision': '11.62%',
  'Recall': '2.21%',
  'F1': '3.12%',
  'AUC': '50.00%',
  'Log_loss': '3.8066624897703183',
  'preds': array([38, 28, 14, ..., 42, 14, 11]),
  'probs': array([0.02222222, 0.02222222, 0.02222222, ..., 0.02222222, 0.02222222,
         0.02222222]),
  'cm': array([[ 16,  19,  11, ...,  13,   9,   9],
         [240, 242, 237, ..., 252, 230, 253],
         [217, 219, 216, ..., 223, 227, 204],
         ...,
         [122, 121, 127, ...,  96, 146, 118],
         [ 19,  22,  14, ...,  16,  27,  29],
         [ 30,  30,  40, ...,  33,  22,  30]]),
  'TN': '16',
  'FP': '19',
  'FN': '240',
  'TP': '242',
  'params': {'constant': None, 'random_state': 42, 'strategy': 'uniform'},
  'best_params': None,
  'best_model': DummyClassifier(random_state=42, strategy='uniform')},
 'DummyClassifier: most_frequent': {'Train Time': '0.07s',
  'Train Accuracy': '28.93%',
  '

### LogisticRegresson (L1)

In [66]:
name='LogisticRegression'
print(f'{name}: Starting', flush=True)
start_time = time.time()

lr = LogisticRegression(penalty='l1', solver='saga',
                        max_iter=1000, verbose=1, n_jobs=3, random_state=Config.RANDOM_STATE)

lr.fit(X_train_scaled, y_train)
y_preds = lr.predict(X_test_scaled)
pred_probs = lr.predict_proba(X_test_scaled)
lr_acc = accuracy_score(y_test, y_preds)
lr_loss = log_loss(y_test, pred_probs)

label='LogisticRegression (L1)'
results.append([label, lr_acc, lr_loss, time_secs_to_msg(time.time()-start_time), lr.get_params()])
print(f'{label}: accuracy: {lr_acc}, log_loss: {lr_loss}')

print(f'{name}: Done: {time_secs_to_msg(time.time()-start_time)}')

LogisticRegression: Starting


[Parallel(n_jobs=3)]: Using backend ThreadingBackend with 3 concurrent workers.


convergence after 30 epochs took 104 seconds
LogisticRegression (L1): accuracy: 0.29198268481786505, log_loss: 2.6362135489065714
LogisticRegression: Done: 1m 45.78s


In [67]:
# Define the default models
models_lr = {
    'LogisticRegression: Default': LogisticRegression(random_state=Config.RANDOM_STATE),
    'LogisticRegression: L1 / saga': LogisticRegression(penalty='l1', solver='saga', max_iter=1000, 
                                                        verbose=1, n_jobs=3, random_state=Config.RANDOM_STATE),
}

In [68]:
for name, model in models_lr.items():
    # Get metrics row for the report - will fit() and predict() to generate metrics
    results_defaults[name] = build_results_row(name, model, X_train_scaled, X_test_scaled, y_train, y_test)

LogisticRegression: Default: Starting
>>> (165866,) (165866,) (165866, 45)
>>> [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44]
>>> [ 1  7 12 16 22 26 30]
>>> [1.75641513e-08 1.85709154e-08 1.87343785e-08 ... 6.57328892e-01
 6.72390699e-01 6.76321225e-01]


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


LogisticRegression: Default: Done: 2m 3.55s
LogisticRegression: L1 / saga: Starting


[Parallel(n_jobs=3)]: Using backend ThreadingBackend with 3 concurrent workers.


convergence after 30 epochs took 89 seconds
>>> (165866,) (165866,) (165866, 45)
>>> [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44]
>>> [ 1  7 12 16 22 26 30]
>>> [5.43353199e-09 5.75068710e-09 5.84737997e-09 ... 6.57009082e-01
 6.72288703e-01 6.76193414e-01]


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


LogisticRegression: L1 / saga: Done: 1m 34.24s


In [69]:
label=f'Model ({sample_file})'
# results_defaults_df = pd.DataFrame(results_defaults).T.sort_values(by='Test Accuracy', ascending=False)
results_defaults_df = pd.DataFrame(results_defaults).T.sort_values(by='Test Accuracy', ascending=False)

results_defaults_df[report_cols]

Unnamed: 0,Train Time,Train Accuracy,Test Accuracy,Precision,Recall,F1,AUC,Log_loss,TN,FP,FN,TP
LogisticRegression: L1 / saga,1m 29.27s,29.19%,29.20%,14.15%,29.20%,14.66%,63.23%,2.6362135489065714,0,4,0,89
LogisticRegression: Default,1m 58.27s,29.19%,29.19%,13.94%,29.19%,14.65%,63.22%,2.6363059389563923,0,4,0,87
DummyClassifier: most_frequent,0.07s,28.93%,28.93%,8.37%,28.93%,12.98%,50.00%,25.617535643729592,0,0,0,0
DummyClassifier: uniform,0.07s,2.21%,2.21%,11.62%,2.21%,3.12%,50.00%,3.8066624897703183,16,19,240,242
DummyClassifier: stratified,0.09s,11.55%,11.44%,11.46%,11.44%,11.45%,49.95%,31.920274739734527,1,35,43,722


### K-Nearest Neighbors

In [71]:
name='KNeighborsClassifier'
print(f'{name}: Starting', flush=True)
start_time = time.time()

knn = KNeighborsClassifier()
knn.fit(X_train_scaled, y_train)
y_preds = knn.predict(X_test_scaled)
pred_probs = knn.predict_proba(X_test_scaled)
knn_acc = accuracy_score(y_test, y_preds)
knn_loss = log_loss(y_test, pred_probs)

label='K-Nearest Neighbors'
results.append([label, knn_acc, knn_loss, time_secs_to_msg(time.time()-start_time), knn.get_params()])
print(f'{label}: accuracy: {knn_acc}, log_loss: {knn_loss}')

print(f'{name}: Done: {time_secs_to_msg(time.time()-start_time)}')

KNeighborsClassifier: Starting
K-Nearest Neighbors: accuracy: 0.24825461517128283, log_loss: 18.790414742791288
KNeighborsClassifier: Done: 2m 35.89s


In [72]:
# Define the default models
models_knn = {
    'KNeighborsClassifier: Default': KNeighborsClassifier(),
}

In [73]:
for name, model in models_knn.items():
    # Get metrics row for the report - will fit() and predict() to generate metrics
    results_defaults[name] = build_results_row(name, model, X_train_scaled, X_test_scaled, y_train, y_test)

KNeighborsClassifier: Default: Starting
>>> (165866,) (165866,) (165866, 45)
>>> [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44]
>>> [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 42 43 44]
>>> [0.  0.2 0.4 0.6 0.8 1. ]


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


KNeighborsClassifier: Default: Done: 8m 5.54s


In [74]:
label=f'Model ({sample_file})'
# results_defaults_df = pd.DataFrame(results_defaults).T.sort_values(by='Test Accuracy', ascending=False)
results_defaults_df = pd.DataFrame(results_defaults).T.sort_values(by='Test Accuracy', ascending=False)

results_defaults_df[report_cols]

Unnamed: 0,Train Time,Train Accuracy,Test Accuracy,Precision,Recall,F1,AUC,Log_loss,TN,FP,FN,TP
LogisticRegression: L1 / saga,1m 29.27s,29.19%,29.20%,14.15%,29.20%,14.66%,63.23%,2.6362135489065714,0,4,0,89
LogisticRegression: Default,1m 58.27s,29.19%,29.19%,13.94%,29.19%,14.65%,63.22%,2.6363059389563923,0,4,0,87
DummyClassifier: most_frequent,0.07s,28.93%,28.93%,8.37%,28.93%,12.98%,50.00%,25.617535643729592,0,0,0,0
KNeighborsClassifier: Default,1.72s,44.13%,24.83%,19.91%,24.83%,20.96%,61.19%,18.790414742791288,36,81,92,2648
DummyClassifier: uniform,0.07s,2.21%,2.21%,11.62%,2.21%,3.12%,50.00%,3.8066624897703183,16,19,240,242
DummyClassifier: stratified,0.09s,11.55%,11.44%,11.46%,11.44%,11.45%,49.95%,31.920274739734527,1,35,43,722


### Random Forest Ensemble

In [76]:
name='RandomForestClassifier'
print(f'{name}: Starting', flush=True)
start_time = time.time()

rf = RandomForestClassifier(n_estimators=n_estimators, max_depth=15,
                            min_samples_leaf=5, min_samples_split=25, 
                            random_state=Config.RANDOM_STATE, verbose=1, n_jobs=2)
rf.fit(X_train_scaled, y_train)
y_preds = rf.predict(X_test_scaled)
pred_probs = rf.predict_proba(X_test_scaled)
rf_acc = accuracy_score(y_test, y_preds)
rf_loss = log_loss(y_test, pred_probs)

label='Random Forest'
results.append([label, rf_acc, rf_loss, time_secs_to_msg(time.time()-start_time), rf.get_params()])
print(f'{label}: accuracy: {rf_acc}, log_loss: {rf_loss}')

print(f'{name}: Done: {time_secs_to_msg(time.time()-start_time)}')

RandomForestClassifier: Starting


[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:   46.3s
[Parallel(n_jobs=2)]: Done 100 out of 100 | elapsed:  1.7min finished
[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    1.6s
[Parallel(n_jobs=2)]: Done 100 out of 100 | elapsed:    3.2s finished
[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    2.0s


Random Forest: accuracy: 0.3304715854967263, log_loss: 2.4068490030416556
RandomForestClassifier: Done: 1m 49.93s


[Parallel(n_jobs=2)]: Done 100 out of 100 | elapsed:    3.5s finished


In [77]:
# Define the default models
models_rf = {
    'RandomForestClassifier: Default': RandomForestClassifier(random_state=Config.RANDOM_STATE),
    f'RandomForestClassifier: n_e={n_estimators}, mx_d={15}': RandomForestClassifier(n_estimators=n_estimators, max_depth=15,
                                                              min_samples_leaf=5, min_samples_split=25, 
                                                              random_state=Config.RANDOM_STATE, 
                                                              verbose=1, n_jobs=2),
}

In [78]:
for name, model in models_rf.items():
    # Get metrics row for the report - will fit() and predict() to generate metrics
    results_defaults[name] = build_results_row(name, model, X_train_scaled, X_test_scaled, y_train, y_test)

RandomForestClassifier: Default: Starting
>>> (165866,) (165866,) (165866, 45)
>>> [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44]
>>> [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44]
>>> [0.00000000e+00 4.76190476e-04 5.00000000e-04 ... 9.96666667e-01
 9.96666667e-01 1.00000000e+00]
RandomForestClassifier: Default: Done: 9m 13.14s
RandomForestClassifier: n_e=100, mx_d=15: Starting


[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:   47.8s
[Parallel(n_jobs=2)]: Done 100 out of 100 | elapsed:  1.5min finished
[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    1.6s
[Parallel(n_jobs=2)]: Done 100 out of 100 | elapsed:    3.0s finished
[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    1.2s
[Parallel(n_jobs=2)]: Done 100 out of 100 | elapsed:    2.7s finished


>>> (165866,) (165866,) (165866, 45)
>>> [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44]
>>> [ 1  2  4  6  7 12 16 19 20 21 22 23 24 25 26 27 28 30 31 32 36 38 42 43
 44]
>>> [0.00000000e+00 5.14271021e-07 6.02083208e-07 ... 9.25122769e-01
 9.32655323e-01 9.33339876e-01]


[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    9.9s
[Parallel(n_jobs=2)]: Done 100 out of 100 | elapsed:   19.4s finished
[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    1.2s
[Parallel(n_jobs=2)]: Done 100 out of 100 | elapsed:    2.7s finished
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


RandomForestClassifier: n_e=100, mx_d=15: Done: 2m 3.05s


In [79]:
label=f'Model ({sample_file})'
# results_defaults_df = pd.DataFrame(results_defaults).T.sort_values(by='Test Accuracy', ascending=False)
results_defaults_df = pd.DataFrame(results_defaults).T.sort_values(by='Test Accuracy', ascending=False)

results_defaults_df[report_cols]

Unnamed: 0,Train Time,Train Accuracy,Test Accuracy,Precision,Recall,F1,AUC,Log_loss,TN,FP,FN,TP
RandomForestClassifier: Default,4m 50.12s,87.94%,33.97%,28.66%,33.97%,29.36%,72.13%,5.68160131627927,40,45,11,2236
"RandomForestClassifier: n_e=100, mx_d=15",1m 31.71s,35.67%,33.05%,28.66%,33.05%,21.77%,74.16%,2.406849003041656,0,43,0,1543
LogisticRegression: L1 / saga,1m 29.27s,29.19%,29.20%,14.15%,29.20%,14.66%,63.23%,2.6362135489065714,0,4,0,89
LogisticRegression: Default,1m 58.27s,29.19%,29.19%,13.94%,29.19%,14.65%,63.22%,2.6363059389563923,0,4,0,87
DummyClassifier: most_frequent,0.07s,28.93%,28.93%,8.37%,28.93%,12.98%,50.00%,25.617535643729592,0,0,0,0
KNeighborsClassifier: Default,1.72s,44.13%,24.83%,19.91%,24.83%,20.96%,61.19%,18.790414742791288,36,81,92,2648
DummyClassifier: uniform,0.07s,2.21%,2.21%,11.62%,2.21%,3.12%,50.00%,3.8066624897703183,16,19,240,242
DummyClassifier: stratified,0.09s,11.55%,11.44%,11.46%,11.44%,11.45%,49.95%,31.920274739734527,1,35,43,722


### XGBoost Ensemble

In [81]:
name='XGBClassifier'
print(f'{name}: Starting', flush=True)
start_time = time.time()

xgb_clf = XGBClassifier(n_estimators=n_estimators, objective="multi:softprob", 
                        n_jobs=2, random_state=Config.RANDOM_STATE)
xgb_clf.fit(X_train, y_train)
y_preds = xgb_clf.predict(X_test_scaled)
pred_probs = xgb_clf.predict_proba(X_test_scaled)
xgb_acc = accuracy_score(y_test, y_preds)
xgb_loss = log_loss(y_test, pred_probs)

label='XGBoost'
results.append([label, xgb_acc, xgb_loss, time_secs_to_msg(time.time()-start_time), xgb_clf.get_params()])
print(f'{label}: accuracy: {xgb_acc}, log_loss: {xgb_loss}')

print(f'{name}: Done: {time_secs_to_msg(time.time()-start_time)}')

XGBClassifier: Starting
XGBoost: accuracy: 0.061634090169172706, log_loss: 3.560720231745073
XGBClassifier: Done: 4m 4.79s


In [82]:
# Define the default models
models_xgb = {
    'XGBClassifier: Default': XGBClassifier(random_state=Config.RANDOM_STATE),
    f'XGBClassifier: n_e={n_estimators}, mx_d={15}': XGBClassifier(n_estimators=n_estimators, objective="multi:softprob", 
                        n_jobs=2, random_state=Config.RANDOM_STATE),
}

In [83]:
for name, model in models_xgb.items():
    # Get metrics row for the report - will fit() and predict() to generate metrics
    results_defaults[name] = build_results_row(name, model, X_train_scaled, X_test_scaled, y_train, y_test)

XGBClassifier: Default: Starting
>>> (165866,) (165866,) (165866, 45)
>>> [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44]
>>> [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 42 43 44]
>>> [1.3565548e-08 1.5720673e-08 1.9377072e-08 ... 9.5342827e-01 9.5907706e-01
 9.6252662e-01]


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


XGBClassifier: Default: Done: 4m 39.14s
XGBClassifier: n_e=100, mx_d=15: Starting
>>> (165866,) (165866,) (165866, 45)
>>> [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44]
>>> [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 42 43 44]
>>> [1.3565548e-08 1.5720673e-08 1.9377072e-08 ... 9.5342827e-01 9.5907706e-01
 9.6252662e-01]


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


XGBClassifier: n_e=100, mx_d=15: Done: 4m 46.58s


In [84]:
# results_defaults_df = pd.DataFrame(results_defaults).T.sort_values(by='Test Accuracy', ascending=False)
results_defaults_df = pd.DataFrame(results_defaults).T.sort_values(by='Test Accuracy', ascending=False)

results_defaults_df[report_cols]

Unnamed: 0,Train Time,Train Accuracy,Test Accuracy,Precision,Recall,F1,AUC,Log_loss,TN,FP,FN,TP
RandomForestClassifier: Default,4m 50.12s,87.94%,33.97%,28.66%,33.97%,29.36%,72.13%,5.68160131627927,40,45,11,2236
XGBClassifier: Default,3m 37.21s,35.92%,33.64%,27.36%,33.64%,24.63%,74.45%,2.3760751180898074,4,35,0,1552
"XGBClassifier: n_e=100, mx_d=15",3m 44.18s,35.92%,33.64%,27.36%,33.64%,24.63%,74.45%,2.3760751180898074,4,35,0,1552
"RandomForestClassifier: n_e=100, mx_d=15",1m 31.71s,35.67%,33.05%,28.66%,33.05%,21.77%,74.16%,2.406849003041656,0,43,0,1543
LogisticRegression: L1 / saga,1m 29.27s,29.19%,29.20%,14.15%,29.20%,14.66%,63.23%,2.6362135489065714,0,4,0,89
LogisticRegression: Default,1m 58.27s,29.19%,29.19%,13.94%,29.19%,14.65%,63.22%,2.6363059389563923,0,4,0,87
DummyClassifier: most_frequent,0.07s,28.93%,28.93%,8.37%,28.93%,12.98%,50.00%,25.617535643729592,0,0,0,0
KNeighborsClassifier: Default,1.72s,44.13%,24.83%,19.91%,24.83%,20.96%,61.19%,18.790414742791288,36,81,92,2648
DummyClassifier: uniform,0.07s,2.21%,2.21%,11.62%,2.21%,3.12%,50.00%,3.8066624897703183,16,19,240,242
DummyClassifier: stratified,0.09s,11.55%,11.44%,11.46%,11.44%,11.45%,49.95%,31.920274739734527,1,35,43,722


## Summarize Results

In [86]:
reports_cols = ['Fit Time', 'Accuracy', 'Log Loss', 'Params']

In [87]:
label=f'Model ({sample_file})'
results_df = pd.DataFrame(results,
                          columns=[label, 'Accuracy', 'Log Loss', 'Fit Time', 'Params']
                         ).set_index(label)

results_styled = results_df.style.set_table_styles(
    [{'selector': 'td', 'props': [('max-width', '350px'), 
                                  ('white-space', 'normal'), 
                                  ('word-wrap', 'break-word')]}]
)
results_styled

Unnamed: 0_level_0,Accuracy,Log Loss,Fit Time,Params
Model (../data/incidents_clean.csv),Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Baseline: DummyClassifier - strategy=uniform,0.02212,3.806662,0.39s,"{'constant': None, 'random_state': 42, 'strategy': 'uniform'}"
LogisticRegression (L1),0.291983,2.636214,1m 45.78s,"{'C': 1.0, 'class_weight': None, 'dual': False, 'fit_intercept': True, 'intercept_scaling': 1, 'l1_ratio': None, 'max_iter': 1000, 'multi_class': 'deprecated', 'n_jobs': 3, 'penalty': 'l1', 'random_state': 42, 'solver': 'saga', 'tol': 0.0001, 'verbose': 1, 'warm_start': False}"
K-Nearest Neighbors,0.248255,18.790415,2m 35.89s,"{'algorithm': 'auto', 'leaf_size': 30, 'metric': 'minkowski', 'metric_params': None, 'n_jobs': None, 'n_neighbors': 5, 'p': 2, 'weights': 'uniform'}"
Random Forest,0.330472,2.406849,1m 49.93s,"{'bootstrap': True, 'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'gini', 'max_depth': 15, 'max_features': 'sqrt', 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 5, 'min_samples_split': 25, 'min_weight_fraction_leaf': 0.0, 'monotonic_cst': None, 'n_estimators': 100, 'n_jobs': 2, 'oob_score': False, 'random_state': 42, 'verbose': 1, 'warm_start': False}"
XGBoost,0.061634,3.56072,4m 4.79s,"{'objective': 'multi:softprob', 'base_score': None, 'booster': None, 'callbacks': None, 'colsample_bylevel': None, 'colsample_bynode': None, 'colsample_bytree': None, 'device': None, 'early_stopping_rounds': None, 'enable_categorical': False, 'eval_metric': None, 'feature_types': None, 'gamma': None, 'grow_policy': None, 'importance_type': None, 'interaction_constraints': None, 'learning_rate': None, 'max_bin': None, 'max_cat_threshold': None, 'max_cat_to_onehot': None, 'max_delta_step': None, 'max_depth': None, 'max_leaves': None, 'min_child_weight': None, 'missing': nan, 'monotone_constraints': None, 'multi_strategy': None, 'n_estimators': 100, 'n_jobs': 2, 'num_parallel_tree': None, 'random_state': 42, 'reg_alpha': None, 'reg_lambda': None, 'sampling_method': None, 'scale_pos_weight': None, 'subsample': None, 'tree_method': None, 'validate_parameters': None, 'verbosity': None}"


In [88]:
if not Config.SUPPRESS_OUTPUT_FILES:
    dfi.export(results_styled, data_utils.Config.IMAGE_DIR / 'results_model_exploration.png')

In [89]:
if not Config.SUPPRESS_OUTPUT_FILES:
    dfi.export(results_defaults_df[report_cols], data_utils.Config.IMAGE_DIR / 'table_models_defaults.png')