# Capstone: Exploratory Prediction Modeling

## Imports & Utilities

### Imports

In [4]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns


# Import utilities
# import pathlib
import time

# Export dataFrame's as images
import dataframe_image as dfi

# import project utils
import sys
sys.path.append('../src')

import data_utils
from data_utils import Config

import graph_utils

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.dummy import DummyClassifier
from sklearn.metrics import accuracy_score, precision_score, f1_score, recall_score, roc_auc_score, log_loss
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay, roc_curve

from xgboost import XGBClassifier
import xgboost as xgb

from skopt import BayesSearchCV
from skopt.space import Real, Integer, Categorical

In [5]:
# Configure logging
import logging
logging.basicConfig(level=logging.INFO, format='%(levelname)s: %(message)s')

# logging.getLogger().setLevel(logging.DEBUG)
# logging.getLogger().setLevel(logging.INFO)

### Utility Functions

In [7]:
def time_secs_to_msg(lapse_time_secs, mins_label='m', secs_label='s'):
    if lapse_time_secs <= 60:
        return f'{lapse_time_secs%60:.2f}{secs_label}'
    else:
        return f'{lapse_time_secs//60:,.0f}{mins_label} {lapse_time_secs%60:.2f}{secs_label}'

In [8]:
def log_optimizer_status_cv(optimizer_results):
    """
    Summarizes optimizer results for each iteration so we can see what params are impactful

    :params optimizer_results: Optimizer data passed in the callback
    """

    iter_num = len(optimizer_results["x_iters"])
    # print(f'... Iteration #{iter_num} Best(score: {optimizer_results["func_vals"][iter_num-1]:,.4f}, best_params: {optimizer_results["x"]})')
    print('... Iteration #{} Best(score: {:,.4f}, best_params: {})'
          .format(iter_num, optimizer_results["func_vals"][iter_num-1], optimizer_results["x"]))
    # print(f'... opt_r={optimizer_results}')

In [9]:
def build_results_row(name, model, Xtr, Xte, ytr, yte, optimizer=None):
    """
    Given the model and training/test sets, builds a row of metrics for reporting the results

    :param name: Name/Description of model
    :param model: Fully constructed model instance - will call fit() and predict() to get metrics
    :param Xtr: X_train - scale before calling
    :param Xte: X_test - scale before calling
    :param ytr: Y_train set
    :param yteL: Y_test set
    :param optimizer: If using CV for optimization, pass in the optimizer here
    """

    if optimizer is None:
        use_best = False
    else:
        use_best = True
        # clf = optimizer
        
    print(f'{name}: Starting (use_best={use_best})', flush=True)
    start_time = time.time()

    # train the model
    clf = None
    if (use_best):
        clf = optimizer.fit(Xtr, ytr, callback=log_optimizer_status_cv)
    else:
        clf = model.fit(Xtr, ytr)
        
    # Save fit time
    fit_time = time.time() - start_time
    logging.debug(f'{name}: Fitted: {fit_time} secs')

    # if we're tuning then use best_estimator
    if use_best:
        clf = optimizer.best_estimator_
        logging.debug(f'{name}: Best Model={clf}')
        logging.debug(f'{name}: Best Params={optimizer.best_params_}')

    # get the predictions / probabilities
    y_preds = clf.predict(Xte)
    y_probs_full = clf.predict_proba(Xte)
    y_probs = y_probs_full[:, 1]

    logging.debug(f'{name}: Got preds/probs')

    cm = confusion_matrix(yte, y_preds)
    logging.debug(f'{name}: cm.shape: {cm.shape}')

    # Get metrics
    row = {
        'Train Time': time_secs_to_msg(fit_time),
        'Train Accuracy': f'{clf.score(Xtr, ytr)*100:.2f}%',
        'Test Accuracy': f'{clf.score(Xte, yte)*100:.2f}%',
        'Precision': f'{precision_score(yte, y_preds, average="weighted")*100:.2f}%',  # for multi-class with imbalance
        'Recall': f'{recall_score(yte, y_preds, average="weighted")*100:.2f}%',
        'F1': f'{f1_score(yte, y_preds, average="weighted")*100:.2f}%',
        'AUC': f'{roc_auc_score(yte, y_probs_full, average="weighted", multi_class="ovr")*100:.2f}%',    # faster with imbalanced multi-class cases
        'LogLoss': f'{log_loss(yte, y_probs_full, labels=np.unique(yte)):.4f}',
        'preds': y_preds,
        'probs': y_probs,
        'cm': cm,
        'params': clf.get_params(),
        'best_params': None,
        'best_model': clf,
    }
    if use_best:
        row.update({'best_params': dict(optimizer.best_params_)})
        
    logging.debug(f'{name}: Got metrics')
    
    print(f'{name}: Done: {time_secs_to_msg(time.time()-start_time)}')

    return row

## The Data: San Francisco Police Department Incident Reports

### Read the Data

In [12]:
# Which dataset to work from? Select sample size percentage

# sample_file = data_utils.select_sample_csv_file(pct=10)
sample_file = data_utils.select_sample_csv_file(pct=100)
# sample_file = data_utils.select_sample_csv_file(pct=75)
# sample_file = data_utils.select_sample_csv_file(pct=50)
# sample_file = data_utils.select_sample_csv_file(pct=25)

print(f'Selected sample file: {sample_file}')

Selected sample file: ../data/incidents_clean.csv


In [13]:
current_raw_df, current_clean_df = data_utils.get_clean_data_from_csv(sample_file)

Reading file: ../data/incidents_clean.csv ... Done: 894,585 rows, 36 columns
... Converting datetime to timeseries ... Done
... Setting index to datetime ... Done
Done


In [14]:
data = data_utils.preprocess_data(current_raw_df.copy())

Pre-processing ... 
... Dropping unwanted columns ... 
... preprocess_drop_cols: Column Unnamed: 0 not dropped: KeyError("['Unnamed: 0'] not found in axis")
... preprocess_drop_cols: Column esncag_-_boundary_file dropped
... preprocess_drop_cols: Column central_market/tenderloin_boundary_polygon_-_updated dropped
... preprocess_drop_cols: Column civic_center_harm_reduction_project_boundary dropped
... preprocess_drop_cols: Column hsoc_zones_as_of_2018-06-05 dropped
... preprocess_drop_cols: Column invest_in_neighborhoods_(iin)_areas dropped
... preprocess_drop_cols: Column report_type_code dropped
... preprocess_drop_cols: Column report_type_description dropped
... preprocess_drop_cols: Column filed_online dropped
... preprocess_drop_cols: Column intersection dropped
... preprocess_drop_cols: Column cnn dropped
... preprocess_drop_cols: Column point dropped
... preprocess_drop_cols: Column supervisor_district dropped
... preprocess_drop_cols: Column supervisor_district_2012 dropped
...

In [15]:
# Fix data value artifacts that were discovered during EDA
data = data_utils.fix_data_artifacts(data)

Fixing data artifacts (in-place) ... 
... Category column:
    ..."Human Trafficking*"
    ..."Motor Vehicle Theft"
    ..."Weapons Offence"
Done


In [16]:
data.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 829328 entries, 2023-03-11 14:00:00 to 2023-03-21 17:42:00
Data columns (total 10 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   date             829328 non-null  object 
 1   time             829328 non-null  object 
 2   year             829328 non-null  int64  
 3   day_of_week      829328 non-null  object 
 4   category         829328 non-null  object 
 5   resolution       829328 non-null  object 
 6   police_district  829328 non-null  object 
 7   neighborhood     829328 non-null  object 
 8   latitude         829328 non-null  float64
 9   longitude        829328 non-null  float64
dtypes: float64(2), int64(1), object(7)
memory usage: 69.6+ MB


## Summary of EDA

After cleaning the data and performing basic EDA, we have established the following:

1. Target variable `category`
   * Evenly spread across time
   * Incidence of crimes is extremely skewed/unbalanced by category. Larceny (29.02%) by far outweighing the other top-10 categories with each being in the single digits
3. Features impacting `category`
   * Affected by incident time and date components: date, time, day of week, month, year, etc
   * Affected by police disctrict
   * Affect by latitude and logitude (TODO: need visualization)
4. We artificially removed nulls (TODO: will come back to impute data later)

## Feature Engineering

In [20]:
data.head(2)

Unnamed: 0_level_0,date,time,year,day_of_week,category,resolution,police_district,neighborhood,latitude,longitude
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2023-03-11 14:00:00,2023/03/11,14:00,2023,Saturday,Assault,Open or Active,Park,Golden Gate Park,37.772895,-122.454285
2022-06-27 12:00:00,2022/06/27,12:00,2022,Monday,Lost Property,Open or Active,Central,Financial District/South Beach,37.787359,-122.408227


In [21]:
data.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 829328 entries, 2023-03-11 14:00:00 to 2023-03-21 17:42:00
Data columns (total 10 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   date             829328 non-null  object 
 1   time             829328 non-null  object 
 2   year             829328 non-null  int64  
 3   day_of_week      829328 non-null  object 
 4   category         829328 non-null  object 
 5   resolution       829328 non-null  object 
 6   police_district  829328 non-null  object 
 7   neighborhood     829328 non-null  object 
 8   latitude         829328 non-null  float64
 9   longitude        829328 non-null  float64
dtypes: float64(2), int64(1), object(7)
memory usage: 69.6+ MB


### Encoding: Time-based columns

Let's unpack the date and time into their components that are still missing so there is less to encode:

In [24]:
data['hour'] = data.index.map(lambda x: x.hour)
data['minute'] = data.index.map(lambda x: x.minute)
data['day'] = data.index.map(lambda x: x.day)
data['month'] = data.index.map(lambda x: x.month)

Now let's encode day_of_week to numeric values:

In [26]:
enc_dow = LabelEncoder()
enc_dow.fit(data.day_of_week.unique())
data['dow'] = enc_dow.transform(data.day_of_week)

Let's mark the redundant columns to be dropped after feature engineering:

In [28]:
drop_encoded_cols = ['date', 'time', 'day_of_week']

### Encoding: Resolution

We will also drop the resolution column since it doesn't impact crime prediction:

In [31]:
data.resolution.value_counts()

resolution
Open or Active          662581
Cite or Arrest Adult    166747
Name: count, dtype: int64

In [32]:
drop_encoded_cols.append('resolution')

### Encoding: Category

In [34]:
enc_cat = LabelEncoder()
enc_cat.fit(data.category.unique())
data.category = enc_cat.transform(data.category)

### Encoding: Police District

In [36]:
enc_pd = LabelEncoder()
enc_pd.fit(data.police_district.unique())
data['pd'] = enc_pd.transform(data.police_district)

### Encoding: Neighborhood

In [38]:
enc_hood = LabelEncoder()
enc_hood.fit(data.neighborhood.unique())
data.neighborhood = enc_hood.transform(data.neighborhood)

### Dropping Redundant Columns

We can now drop the redundant encoded columns:

In [41]:
drop_encoded_cols.append('police_district')

print(f'Dropping encoded columns: {drop_encoded_cols}')
data.drop(columns=drop_encoded_cols, inplace=True)

Dropping encoded columns: ['date', 'time', 'day_of_week', 'resolution', 'police_district']


In [42]:
data.head(2)

Unnamed: 0_level_0,year,category,neighborhood,latitude,longitude,hour,minute,day,month,dow,pd
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2023-03-11 14:00:00,2023,1,7,37.772895,-122.454285,14,0,11,3,2,5
2022-06-27 12:00:00,2022,18,5,37.787359,-122.408227,12,0,27,6,1,1


In [43]:
data.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 829328 entries, 2023-03-11 14:00:00 to 2023-03-21 17:42:00
Data columns (total 11 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   year          829328 non-null  int64  
 1   category      829328 non-null  int64  
 2   neighborhood  829328 non-null  int64  
 3   latitude      829328 non-null  float64
 4   longitude     829328 non-null  float64
 5   hour          829328 non-null  int64  
 6   minute        829328 non-null  int64  
 7   day           829328 non-null  int64  
 8   month         829328 non-null  int64  
 9   dow           829328 non-null  int64  
 10  pd            829328 non-null  int64  
dtypes: float64(2), int64(9)
memory usage: 75.9 MB


In [44]:
# data.to_csv('../data/incidents_10.csv')

## Data Preparation

### Create Train/Test Splits

In [47]:
X = data.drop('category', axis='columns')
y = data['category']

In [48]:
# OneHot Encode the features and drop the first value to reduce multicollinearity
X = pd.get_dummies(X, drop_first=True)

In [49]:
# Consistent random_state for the project
print(f'Project-wide random_state: {Config.RANDOM_STATE}')

Project-wide random_state: 42


In [50]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, 
                                                    stratify=y, random_state=Config.RANDOM_STATE)

In [51]:
print('AFTER TRAIN_TEST_SPLIT: Data{}, X_train{}, X_test{}, y_train{}, y_test{}'
      .format(data.shape, X_train.shape, X_test.shape, y_train.shape, y_test.shape))

AFTER TRAIN_TEST_SPLIT: Data(829328, 11), X_train(663462, 10), X_test(165866, 10), y_train(663462,), y_test(165866,)


In [52]:
# spot-check feature encoding
X.T.iloc[:, 0:5]

datetime,2023-03-11 14:00:00,2022-06-27 12:00:00,2023-03-16 17:30:00,2023-03-21 15:50:00,2021-08-22 09:40:00
year,2023.0,2022.0,2023.0,2023.0,2021.0
neighborhood,7.0,5.0,28.0,35.0,26.0
latitude,37.772895,37.787359,37.76229,37.787038,37.793977
longitude,-122.454285,-122.408227,-122.401324,-122.418271,-122.429804
hour,14.0,12.0,17.0,15.0,9.0
minute,0.0,0.0,30.0,50.0,40.0
day,11.0,27.0,16.0,21.0,22.0
month,3.0,6.0,3.0,3.0,8.0
dow,2.0,1.0,4.0,5.0,3.0
pd,5.0,1.0,0.0,4.0,4.0


### Feature Scaling

In [54]:
# Scale the data - we'll use StandardScaler for the baseline model
logging.debug('Scaling data')
scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

print('AFTER SCALING: Data{}, X_train_scaled{}, X_test_scaled{}, y_train{}, y_test{}'
      .format(data.shape, X_train_scaled.shape, X_test_scaled.shape, y_train.shape, y_test.shape))

AFTER SCALING: Data(829328, 11), X_train_scaled(663462, 10), X_test_scaled(165866, 10), y_train(663462,), y_test(165866,)


## Model Exploration

The task of classifying the incident types based on a set of historical attrirbutes (features) and predicting on similar attributes is a **multiclass classification** problem. We will now experiment on some ML models that are generally used for similar problems to see what would be the best choice for us.

We will evaluate the following models:

* Simple classification models
  * `DummyClassifier` to get a baseline for our project
  * `LogisticRegression` with L1 Regularization
* Multiclass classifiers
  * `KNeighborsClassifier`
* Ensemble methods: Since our dataset has high variability with a lot of numerical and cagtegorical features with a range of mean and variance, we plan to employ ensemble methods and tune them for best results
  * `RandomForestClassifier`
  * `XGBClassifier`: We considered `XGLite` but selected XGBoost as it provides better model explainability features like SHAP values, which we expect to be able to use in explaining our results

We will now evaluate different models for predicting the Crime Category from our features:

In [58]:
X_train.columns

Index(['year', 'neighborhood', 'latitude', 'longitude', 'hour', 'minute',
       'day', 'month', 'dow', 'pd'],
      dtype='object')

### Evaluation Metrics

In this project, we are predicting or classifyig across 49 crime categories. We will use two evaluation metrics to compare our models:

1. **Accuracy**: Measures the proportion of correct predictions over all predictions made. The accuracy benchmark is 1/49 or 2.04% given our crime categories
2. **Log_Loss**: Measures the accuracy of a classifier by penalizing false classifications. It does this by taking the negative logarithm of the predicted probability for the true class. The goal is to minimize this loss, meaning that higher probabilities are assigned to the correct classes. Log loss is a powerful way to evaluate not just if the model is making the right predictions, but how confident it is in those predictions. A lower log loss indicates a model that is both accurate and confident.
   * TODO: Benchmark???

While accuracy provides a simple measure of correctness, log-loss offers a more nuanced view by considering how confident those predictions are. A model that predicts with 51% confidence for the correct class will have the same accuracy as one that predicts with 99% confidence, but their log loss will be very different. The 99%-confident model will have a much lower log loss.

We'll use them together for a comprehensive evaluation and to learn more about them.

The `build_results_row` utility function will be used to standardize the recording and reporting of our model exploration: 

### Establishing a Baseline

#### DummyClassifier

We will use the Scikit-Learn DummyClassifier method to get a baseline for our predictions using the different strategies provided by the library:

In [65]:
# let's start saving the results for reporting out
results_defaults = {}

# Reports won't print all the columns
report_cols = ['Train Time', 
               'Train Accuracy', 'Test Accuracy', 'LogLoss',
               'Precision', 'Recall', 'F1', 'AUC']

In [66]:
# Define the DummyClassifier models we want to test
models_dummy = {
    'DummyClassifier: uniform': DummyClassifier(strategy='uniform', random_state=Config.RANDOM_STATE),
    'DummyClassifier: most_frequent': DummyClassifier(strategy='most_frequent', random_state=Config.RANDOM_STATE),
    'DummyClassifier: stratified': DummyClassifier(strategy='stratified', random_state=Config.RANDOM_STATE),
}

In [67]:
# logging.getLogger().setLevel(logging.DEBUG)

# Get metrics row for the report - will fit() and predict() to generate metrics
for name, model in models_dummy.items():
    results_defaults[name] = build_results_row(name, model, X_train_scaled, X_test_scaled, y_train, y_test)

# logging.getLogger().setLevel(logging.INFO)

DummyClassifier: uniform: Starting (use_best=False)
DummyClassifier: uniform: Done: 3.21s
DummyClassifier: most_frequent: Starting (use_best=False)


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


DummyClassifier: most_frequent: Done: 3.15s
DummyClassifier: stratified: Starting (use_best=False)
DummyClassifier: stratified: Done: 6.18s


The warning above is from the precision calculation within scikit-learn, and highlights that some labels have no predicted samples, which results in precision being undefined for those labels. We can ignore the warning since we're using accuracy as our key evaluation metric. We could use `prescion_score(zero_division=0)` to suppress the warning, but we'll ignore it instead to ensure we're aware of the condition.

In [69]:
# View results by highest Test Accuracy and lowest log_loss
results_defaults_df = pd.DataFrame(results_defaults).T.sort_values(by=['Test Accuracy', 'LogLoss'], ascending=[False, True])

In [70]:
# format the output to hilite results
hilite=Config.TBL_HILITE_COLOR
results_defaults_styled = results_defaults_df[report_cols].style.map(lambda val: f'background-color: {hilite}',
                                                                     subset=['Test Accuracy','LogLoss'])
results_defaults_styled = results_defaults_styled.set_table_styles({
    'LogLoss': [{'selector': 'th', 'props': [('background-color', hilite)]}],
    'Test Accuracy': [{'selector': 'th', 'props': [('background-color', hilite)]}]
}, overwrite=False)
results_defaults_styled

Unnamed: 0,Train Time,Train Accuracy,Test Accuracy,LogLoss,Precision,Recall,F1,AUC
DummyClassifier: most_frequent,0.13s,28.93%,28.93%,25.6175,8.37%,28.93%,12.98%,50.00%
DummyClassifier: uniform,0.19s,2.21%,2.21%,3.8067,11.62%,2.21%,3.12%,50.00%
DummyClassifier: stratified,0.10s,11.55%,11.44%,31.9203,11.46%,11.44%,11.45%,49.95%


#### Default Models

We will now explore the selected models with out-of-the-box default settings of their hyperparameters to get a baseline per model class.

In [73]:
# Define the default models
models_default = {
    'LogisticRegression (Default)': LogisticRegression(random_state=Config.RANDOM_STATE),
    'KNeighborsClassifier (Default)': KNeighborsClassifier(),
    'RandomForestClassifier (Default)': RandomForestClassifier(random_state=Config.RANDOM_STATE),
    'XGBClassifier (Default)': XGBClassifier(random_state=Config.RANDOM_STATE)
}

In [74]:
# logging.getLogger().setLevel(logging.DEBUG)

# Get metrics row for the report - will fit() and predict() to generate metrics
for name, model in models_default.items():
    results_defaults[name] = build_results_row(name, model, X_train_scaled, X_test_scaled, y_train, y_test)

# logging.getLogger().setLevel(logging.INFO)

LogisticRegression (Default): Starting (use_best=False)


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


LogisticRegression (Default): Done: 3m 36.23s
KNeighborsClassifier (Default): Starting (use_best=False)


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


KNeighborsClassifier (Default): Done: 17m 53.62s
RandomForestClassifier (Default): Starting (use_best=False)
RandomForestClassifier (Default): Done: 13m 14.98s
XGBClassifier (Default): Starting (use_best=False)


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


XGBClassifier (Default): Done: 7m 28.20s


In [75]:
# View results by highest Test Accuracy and lowest log_loss
results_defaults_df = pd.DataFrame(results_defaults).T.sort_values(by=['Test Accuracy', 'LogLoss'], ascending=[False, True])

In [76]:
# format the output to hilite results
hilite=Config.TBL_HILITE_COLOR
results_defaults_styled = results_defaults_df[report_cols].style.map(lambda val: f'background-color: {hilite}',
                                                                     subset=['Test Accuracy','LogLoss'])
results_defaults_styled = results_defaults_styled.set_table_styles({
    'LogLoss': [{'selector': 'th', 'props': [('background-color', hilite)]}],
    'Test Accuracy': [{'selector': 'th', 'props': [('background-color', hilite)]}]
}, overwrite=False)
results_defaults_styled

Unnamed: 0,Train Time,Train Accuracy,Test Accuracy,LogLoss,Precision,Recall,F1,AUC
RandomForestClassifier (Default),7m 28.22s,87.94%,33.97%,5.6816,28.66%,33.97%,29.36%,72.13%
XGBClassifier (Default),5m 26.39s,35.92%,33.64%,2.3761,27.36%,33.64%,24.63%,74.45%
LogisticRegression (Default),3m 30.57s,29.19%,29.19%,2.6363,13.94%,29.19%,14.65%,63.22%
DummyClassifier: most_frequent,0.13s,28.93%,28.93%,25.6175,8.37%,28.93%,12.98%,50.00%
KNeighborsClassifier (Default),2.85s,44.13%,24.83%,18.7904,19.91%,24.83%,20.96%,61.19%
DummyClassifier: uniform,0.19s,2.21%,2.21%,3.8067,11.62%,2.21%,3.12%,50.00%
DummyClassifier: stratified,0.10s,11.55%,11.44%,31.9203,11.46%,11.44%,11.45%,49.95%


In [77]:
# Save results to file for final report
if not Config.SUPPRESS_OUTPUT_FILES:
    dfi.export(results_defaults_styled, Config.IMAGE_DIR / 'table_models_defaults.png')

### Candidate Models

In [79]:
# parameters
VERBOSE=2             # 0: None, 1: Iteration end, 2: Iteration scores
JOBS=-1               # Use all cores

In [80]:
# let's start saving the results for reporting out
results_tuned = {}

# Reports won't print all the columns
report_cols_tuned = ['params', 'Train Time', 
                     'Train Accuracy', 'Test Accuracy', 'LogLoss',
                     'Precision', 'Recall', 'F1', 'AUC']

In [81]:
# Define the candidate models with starting params
models_tuned = {
    
    'LogisticRegression': LogisticRegression(penalty='l1', solver='saga', max_iter=1000, 
                                             verbose=VERBOSE, n_jobs=JOBS, random_state=Config.RANDOM_STATE),

    'RandomForestClassifier': RandomForestClassifier(n_estimators=100, max_depth=15,
                                                     min_samples_leaf=5, min_samples_split=25, 
                                                     random_state=Config.RANDOM_STATE, 
                                                     verbose=VERBOSE, n_jobs=JOBS),

    'XGBClassifier': XGBClassifier(n_estimators=100, objective="multi:softprob", 
                                   n_jobs=JOBS, random_state=Config.RANDOM_STATE),
}

In [82]:
# logging.getLogger().setLevel(logging.DEBUG)

# Get metrics row for the report - will fit() and predict() to generate metrics
for name, model in models_tuned.items():
    results_tuned[name] = build_results_row(name, model, X_train_scaled, X_test_scaled, y_train, y_test)

# logging.getLogger().setLevel(logging.INFO)

LogisticRegression: Starting (use_best=False)


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.


convergence after 30 epochs took 273 seconds


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


LogisticRegression: Done: 4m 42.07s
RandomForestClassifier: Starting (use_best=False)


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.


building tree 1 of 100building tree 2 of 100

building tree 3 of 100
building tree 4 of 100
building tree 5 of 100
building tree 6 of 100
building tree 7 of 100
building tree 8 of 100
building tree 9 of 100
building tree 10 of 100
building tree 11 of 100
building tree 12 of 100
building tree 13 of 100
building tree 14 of 100
building tree 15 of 100
building tree 16 of 100
building tree 17 of 100
building tree 18 of 100
building tree 19 of 100
building tree 20 of 100
building tree 21 of 100
building tree 22 of 100
building tree 23 of 100
building tree 24 of 100
building tree 25 of 100
building tree 26 of 100
building tree 27 of 100
building tree 28 of 100
building tree 29 of 100
building tree 30 of 100
building tree 31 of 100
building tree 32 of 100
building tree 33 of 100
building tree 34 of 100
building tree 35 of 100
building tree 36 of 100
building tree 37 of 100


[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  1.4min


building tree 38 of 100
building tree 39 of 100
building tree 40 of 100
building tree 41 of 100
building tree 42 of 100building tree 43 of 100

building tree 44 of 100
building tree 45 of 100
building tree 46 of 100
building tree 47 of 100
building tree 48 of 100
building tree 49 of 100
building tree 50 of 100
building tree 51 of 100
building tree 52 of 100
building tree 53 of 100
building tree 54 of 100
building tree 55 of 100
building tree 56 of 100
building tree 57 of 100
building tree 58 of 100
building tree 59 of 100
building tree 60 of 100
building tree 61 of 100
building tree 62 of 100
building tree 63 of 100
building tree 64 of 100
building tree 65 of 100
building tree 66 of 100
building tree 67 of 100
building tree 68 of 100
building tree 69 of 100
building tree 70 of 100
building tree 71 of 100
building tree 72 of 100
building tree 73 of 100
building tree 74 of 100
building tree 75 of 100
building tree 76 of 100
building tree 77 of 100
building tree 78 of 100
building tree 79

[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:  3.7min finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  33 tasks      | elapsed:    2.5s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    6.4s finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  33 tasks      | elapsed:    2.2s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    6.4s finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  33 tasks      | elapsed:   14.7s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:   51.2s finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  33 tasks      | elapsed:    3.3s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    9.9s finished
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


RandomForestClassifier: Done: 5m 9.49s
XGBClassifier: Starting (use_best=False)


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


XGBClassifier: Done: 22m 0.55s


In [83]:
# View results by highest Test Accuracy and lowest log_loss
results_tuned_df = pd.DataFrame(results_tuned).T.sort_values(by=['Test Accuracy', 'LogLoss'], ascending=[False, True])

In [84]:
# format the output to hilite results
hilite=Config.TBL_HILITE_COLOR
results_tuned_styled = results_tuned_df[report_cols_tuned].style.map(lambda val: f'background-color: {hilite}', 
                                                                     subset=['Test Accuracy','LogLoss'])
results_tuned_styled = results_tuned_styled.set_table_styles({
    'params': [{'selector': 'td', 'props': [('max-width', '300px'), 
                                  ('white-space', 'normal'), 
                                  ('word-wrap', 'break-word')]}],
    'LogLoss': [{'selector': 'th', 'props': [('background-color', hilite)]}],
    'Test Accuracy': [{'selector': 'th', 'props': [('background-color', hilite)]}]
}, overwrite=False)
results_tuned_styled

Unnamed: 0,params,Train Time,Train Accuracy,Test Accuracy,LogLoss,Precision,Recall,F1,AUC
XGBClassifier,"{'objective': 'multi:softprob', 'base_score': None, 'booster': None, 'callbacks': None, 'colsample_bylevel': None, 'colsample_bynode': None, 'colsample_bytree': None, 'device': None, 'early_stopping_rounds': None, 'enable_categorical': False, 'eval_metric': None, 'feature_types': None, 'gamma': None, 'grow_policy': None, 'importance_type': None, 'interaction_constraints': None, 'learning_rate': None, 'max_bin': None, 'max_cat_threshold': None, 'max_cat_to_onehot': None, 'max_delta_step': None, 'max_depth': None, 'max_leaves': None, 'min_child_weight': None, 'missing': nan, 'monotone_constraints': None, 'multi_strategy': None, 'n_estimators': 100, 'n_jobs': -1, 'num_parallel_tree': None, 'random_state': 42, 'reg_alpha': None, 'reg_lambda': None, 'sampling_method': None, 'scale_pos_weight': None, 'subsample': None, 'tree_method': None, 'validate_parameters': None, 'verbosity': None}",17m 19.34s,35.92%,33.64%,2.3761,27.36%,33.64%,24.63%,74.45%
RandomForestClassifier,"{'bootstrap': True, 'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'gini', 'max_depth': 15, 'max_features': 'sqrt', 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 5, 'min_samples_split': 25, 'min_weight_fraction_leaf': 0.0, 'monotonic_cst': None, 'n_estimators': 100, 'n_jobs': -1, 'oob_score': False, 'random_state': 42, 'verbose': 2, 'warm_start': False}",3m 44.77s,35.67%,33.05%,2.4068,28.66%,33.05%,21.77%,74.16%
LogisticRegression,"{'C': 1.0, 'class_weight': None, 'dual': False, 'fit_intercept': True, 'intercept_scaling': 1, 'l1_ratio': None, 'max_iter': 1000, 'multi_class': 'deprecated', 'n_jobs': -1, 'penalty': 'l1', 'random_state': 42, 'solver': 'saga', 'tol': 0.0001, 'verbose': 2, 'warm_start': False}",4m 33.13s,29.19%,29.20%,2.6362,14.15%,29.20%,14.66%,63.23%


In [85]:
# Save results to file for final report
if not Config.SUPPRESS_OUTPUT_FILES:
    dfi.export(results_tuned_styled, Config.IMAGE_DIR / 'table_models_tuned.png')

## Model Tuning: BayesSearchCV

In [87]:
# BayesSearchCV Tuning parameters
CV=3             # cross-validation splitting strategy: StratifiedKFold=3
VERBOSE=2        # 0: None, 1: Iteration end, 2: Iteration scores
JOBS=-1          # Use all cores
#ITERATIONS=20    # Num of param settings that are sampled (Def=50). Trades off runtime vs quality of the solution
ITERATIONS=2     # Num of param settings that are sampled (Def=50). Trades off runtime vs quality of the solution

In [88]:
results_CV = {}

# let's start saving the results for reporting out
results_tuned = {}

# Reports won't print all the columns
report_cols_CV = ['best_params', 'Train Time', 
                  'Train Accuracy', 'Test Accuracy', 'LogLoss',
                  'Precision', 'Recall', 'F1', 'AUC']

In [89]:
# Initialize our models for BayesSearchCV
models = {
    'RandomForestClassifier': RandomForestClassifier(random_state=Config.RANDOM_STATE),
    'XGBClassifier': XGBClassifier(objective="multi:softprob", random_state=Config.RANDOM_STATE),
}

# Define the search spaces for hyperparameter tuning 
model_search_spaces = {
    
    # Define search spaces for RandomForestClassifier
    'RandomForestClassifier': {
        'n_estimators': Integer(50, 100),
        'max_depth': Integer(3, 20),
        'min_samples_split': Integer(2, 20),
        'min_samples_leaf': Integer(1, 10),
        'max_features': Categorical(['sqrt', 'log2', None]),       # None=n_features
    },
    
    # Define search spaces for XGBClassifier
    'XGBClassifier': {
        'n_estimators': Integer(50, 1000),
        'max_depth': Integer(3, 100),
        'learning_rate': Real(0.01, 0.3, prior='log-uniform'),
        'min_child_weight': Integer(1, 10),
        'subsample': Real(0.5, 1.0),
        'colsample_bytree': Real(0.5, 1.0, 'uniform'),
        'colsample_bylevel': Real(0.01, 1.0, 'uniform'),
        'reg_lambda': (1e-9, 1000, 'log-uniform'),
        'reg_alpha': (1e-9, 1.0, 'log-uniform'),
    }
}

In [90]:
logging.getLogger().setLevel(logging.DEBUG)

# Get metrics row for the report - will fit() and predict() to generate metrics
for name, model in models.items():
    
    print(f'{name}: Starting BayesSearchCV optimization ({CV}-fold)', flush=True)
    opt = BayesSearchCV(
        estimator=model,
        search_spaces=model_search_spaces[name],
        scoring = 'neg_log_loss',
        n_iter=ITERATIONS,
        cv=CV,
        n_jobs=JOBS,
        random_state=Config.RANDOM_STATE,
        verbose=VERBOSE
    )

    results_CV[name] = build_results_row(name, model, X_train_scaled, X_test_scaled, y_train, y_test, optimizer=opt)

    print(f'{name}: Done BayesSearchCV optimization ({CV}-fold)', flush=True)

logging.getLogger().setLevel(logging.INFO)

RandomForestClassifier: Starting BayesSearchCV optimization (3-fold)
RandomForestClassifier: Starting (use_best=True)
Fitting 3 folds for each of 1 candidates, totalling 3 fits
... Iteration #1 Best(score: 2.5062, best_params: [10, None, 9, 8, 84])
Fitting 3 folds for each of 1 candidates, totalling 3 fits
... Iteration #2 Best(score: 2.4721, best_params: [17, None, 4, 19, 93])
[CV] END max_depth=10, max_features=None, min_samples_leaf=9, min_samples_split=8, n_estimators=84; total time=19.9min
[CV] END max_depth=17, max_features=None, min_samples_leaf=4, min_samples_split=19, n_estimators=93; total time=19.8min
[CV] END max_depth=10, max_features=None, min_samples_leaf=9, min_samples_split=8, n_estimators=84; total time=19.8min
[CV] END max_depth=17, max_features=None, min_samples_leaf=4, min_samples_split=19, n_estimators=93; total time=19.9min
[CV] END max_depth=10, max_features=None, min_samples_leaf=9, min_samples_split=8, n_estimators=84; total time=19.9min
[CV] END max_depth=17,

DEBUG: RandomForestClassifier: Fitted: 3173.6253917217255 secs
DEBUG: RandomForestClassifier: Best Model=RandomForestClassifier(max_depth=17, max_features=None, min_samples_leaf=4,
                       min_samples_split=19, n_estimators=93, random_state=42)
DEBUG: RandomForestClassifier: Best Params=OrderedDict([('max_depth', 17), ('max_features', None), ('min_samples_leaf', 4), ('min_samples_split', 19), ('n_estimators', 93)])
DEBUG: RandomForestClassifier: Got preds/probs
DEBUG: RandomForestClassifier: cm.shape: (45, 45)
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
DEBUG: RandomForestClassifier: Got metrics


RandomForestClassifier: Done: 54m 8.63s
RandomForestClassifier: Done BayesSearchCV optimization (3-fold)
XGBClassifier: Starting BayesSearchCV optimization (3-fold)
XGBClassifier: Starting (use_best=True)
Fitting 3 folds for each of 1 candidates, totalling 3 fits
... Iteration #1 Best(score: 3.1450, best_params: [0.4160029192647807, 0.8638628715886625, 0.2387586688716479, 34, 7, 443, 1.440064730980368e-06, 0.7482570377261556, 0.6522316555182531])
Fitting 3 folds for each of 1 candidates, totalling 3 fits
[CV] END colsample_bylevel=0.4160029192647807, colsample_bytree=0.8638628715886625, learning_rate=0.2387586688716479, max_depth=34, min_child_weight=7, n_estimators=443, reg_alpha=1.440064730980368e-06, reg_lambda=0.7482570377261556, subsample=0.6522316555182531; total time=110.6min
... Iteration #2 Best(score: 2.4544, best_params: [0.8390144719977516, 0.9416576386904312, 0.02806554771929606, 95, 9, 109, 1.7570205641667407e-08, 1.7500432085329334e-05, 0.8178645509395852])
[CV] END cols

DEBUG: XGBClassifier: Fitted: 9158.986104726791 secs
DEBUG: XGBClassifier: Best Model=XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=0.8390144719977516, colsample_bynode=None,
              colsample_bytree=0.9416576386904312, device=None,
              early_stopping_rounds=None, enable_categorical=False,
              eval_metric=None, feature_types=None, gamma=None,
              grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=0.02806554771929606,
              max_bin=None, max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=95, max_leaves=None,
              min_child_weight=9, missing=nan, monotone_constraints=None,
              multi_strategy=None, n_estimators=109, n_jobs=None,
              num_parallel_tree=None, objective='multi:softprob', ...)
DEBUG: XGBClassifier: Best Params=OrderedDict([('colsample_bylevel', 0.8390144719977516), ('cols

XGBClassifier: Done: 160m 52.31s
XGBClassifier: Done BayesSearchCV optimization (3-fold)


In [91]:
# View results by highest Test Accuracy and lowest log_loss
results_CV_df = pd.DataFrame(results_CV).T.sort_values(by=['Test Accuracy', 'LogLoss'], ascending=[False, True])

In [92]:
# format the output to hilite results
hilite=Config.TBL_HILITE_COLOR
results_CV_styled = results_CV_df[report_cols_CV].style.map(lambda val: f'background-color: {hilite}', 
                                                                     subset=['Test Accuracy','LogLoss'])
results_CV_styled = results_CV_styled.set_table_styles({
    'best_params': [{'selector': 'td', 'props': [('max-width', '300px'), 
                                                 ('white-space', 'normal'), 
                                                 ('word-wrap', 'break-word')]}],
    'LogLoss': [{'selector': 'th', 'props': [('background-color', hilite)]}],
    'Test Accuracy': [{'selector': 'th', 'props': [('background-color', hilite)]}]
}, overwrite=False)
results_CV_styled

Unnamed: 0,best_params,Train Time,Train Accuracy,Test Accuracy,LogLoss,Precision,Recall,F1,AUC
XGBClassifier,"{'colsample_bylevel': 0.8390144719977516, 'colsample_bytree': 0.9416576386904312, 'learning_rate': 0.02806554771929606, 'max_depth': 95, 'min_child_weight': 9, 'n_estimators': 109, 'reg_alpha': 1.7570205641667407e-08, 'reg_lambda': 1.7500432085329334e-05, 'subsample': 0.8178645509395852}",152m 38.99s,48.57%,34.59%,2.429,28.94%,34.59%,26.23%,75.63%
RandomForestClassifier,"{'max_depth': 17, 'max_features': None, 'min_samples_leaf': 4, 'min_samples_split': 19, 'n_estimators': 93}",52m 53.63s,39.73%,33.56%,2.4402,28.20%,33.56%,23.89%,74.34%


In [93]:
# Save results to file for final report
if not Config.SUPPRESS_OUTPUT_FILES:
    dfi.export(results_CV_styled, Config.IMAGE_DIR / 'table_models_CV.png')