# ML Model Training

In [1]:
# | echo: false
%load_ext lab_black

In [2]:
import json
import os
from datetime import datetime
from typing import Dict, Union

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sklearn.metrics as skm
from imblearn.pipeline import Pipeline
from imblearn.under_sampling import RandomUnderSampler
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression

# from sklearn.model_selection import GridSearchCV, PredefinedSplit
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder

## About

### Objective

This step trains a ML model for predicting visitors' propensity to make a purchase on a future visit to the merchandise store.

### Data
Same as for baseline model development.

### Evaluation Metrics
Same as for baseline model development.

### Assumptions
Same as for baseline model development.

### Notation
Same as for baseline model development.

## User Inputs

Get relative path to project root directory

In [3]:
# | code-fold: false
PROJ_ROOT_DIR = os.path.join(os.pardir)

Define the different types of features in the transformed data

1. categoricals
2. numericals
3. categorical features not used in this step
4. metadata features (visit ID, visit number, etc.) for each visit
5. `datetime` features not used in this step

In [4]:
# | code-fold: false
categorical_features = [
    "deviceCategory",
    "bounces",
    "channelGrouping",
    "medium",
    "source",
]
numerical_features = [
    "hits",
    "promos_displayed",
    "promos_clicked",
    "product_views",
    "product_clicks",
    "pageviews",
    "time_on_site",
]

categorical_features_numerical = ["last_action"]
metadata_features_unused = [
    "fullvisitorid",
    "visitId",
    "visitNumber",
    "visitStartTime",
]
datetime_features_unused = [
    "quarter",
    "month",
    "day_of_month",
    "day_of_week",
    "hour",
    "minute",
    "second",
]

Get path to `data/processed` in which the transformed data splits (training, validation and test) produced by the (preceding) data transformation step were exported

In [5]:
# | code-fold: false
data_dir = os.path.join(PROJ_ROOT_DIR, "data")
processed_data_dir = os.path.join(PROJ_ROOT_DIR, "data", "processed")
models_dir = os.path.join(PROJ_ROOT_DIR, "models")

Define a Python helper function to change probabilities into labels, using a user-specified discrimination threshold

In [6]:
# | code-fold: false
def convert_soft_to_hard_labels(
    y_pred_proba: pd.Series, disc_threshold: float = 0.5
) -> pd.Series:
    """Convert probabilities to labels."""
    y_pred = (y_pred_proba > disc_threshold).astype(int)
    return y_pred

A helper function is defined to retrieve the area under precision recall curve

In [7]:
# | code-fold: false
def get_pr_auc(y_true, y_pred_proba, sample_weights) -> float:
    """."""
    precision, recall, _ = skm.precision_recall_curve(
        y_true, y_pred_proba, pos_label=1, sample_weight=sample_weights
    )
    auc_score = skm.auc(recall, precision)
    return auc_score

A helper function is defined to get the same metrics used in development of the baseline model, without including uplift due to the difficulties we had in using that metric with the baseline model

In [8]:
# | code-fold: false
def get_metrics(
    y_true,
    y_pred,
    y_pred_proba,
    ds_factor: float = 1.0,
    average="binary",
    zero_division="warn",
) -> Dict[str, float]:
    """Calculate sklearn evaluation metrics."""
    if ds_factor != 1.0:
        sample_weights = get_sample_weight(y_pred, ds_factor)
    else:
        sample_weights = None

    # get area under precision-recall curve
    pr_auc_score = get_pr_auc(y_true, y_pred_proba, sample_weights)

    # assemble summary dict to compute metrics
    metrics_dict = dict(
        # accuracy
        accuracy=skm.accuracy_score(y_true, y_pred, sample_weight=sample_weights),
        # balanced accuracy
        balanced_accuracy=skm.balanced_accuracy_score(
            y_true, y_pred, sample_weight=sample_weights
        ),
        # precision
        precision=skm.precision_score(
            y_true,
            y_pred,
            average=average,
            sample_weight=sample_weights,
            zero_division=zero_division,
        ),
        # recall
        recall=skm.recall_score(
            y_true,
            y_pred,
            average=average,
            sample_weight=sample_weights,
            zero_division=zero_division,
        ),
        # f1
        f1=skm.f1_score(
            y_true,
            y_pred,
            average=average,
            sample_weight=sample_weights,
            zero_division=zero_division,
        ),
        # f-0.5
        f05=skm.fbeta_score(
            y_true,
            y_pred,
            beta=0.5,
            average=average,
            sample_weight=sample_weights,
            zero_division=zero_division,
        ),
        # f2
        f2=skm.fbeta_score(
            y_true,
            y_pred,
            beta=2.0,
            average=average,
            sample_weight=sample_weights,
            zero_division=zero_division,
        ),
        # brier score
        brier=skm.brier_score_loss(
            y_true, y_pred, sample_weight=sample_weights, pos_label=1
        ),
        # area under precision-recall curve (calculated above)
        pr_auc=pr_auc_score,
    )
    return metrics_dict

::: {.callout-note title="Notes"}

1. For `average`, the value chosen is binary so that the metric is calculated and returned for the minority class (visitor made purchase on return visit to merchandise store) only.
2. sample weights are not used to calculate the metrics.
:::

Below is a helper function to customize the axes of a `matplotlib` plot

In [9]:
# | code-fold: false
def customize_axis(ax) -> None:
    """Customize matplotlib axis properties."""
    ax.spines["left"].set_edgecolor("black")
    ax.spines["left"].set_linewidth(1.5)
    ax.spines["bottom"].set_edgecolor("black")
    ax.spines["bottom"].set_linewidth(1.5)
    ax.spines["top"].set_edgecolor("whitesmoke")
    ax.spines["top"].set_linewidth(1.5)
    ax.spines["right"].set_edgecolor("whitesmoke")
    ax.spines["right"].set_linewidth(1.5)
    ax.grid(which="both", axis="both", color="gainsboro", zorder=3)

## Get Data

Load the transformed data for the training, validation and test data splits

In [10]:
# | code-fold: false
df_train = pd.read_parquet(
    os.path.join(processed_data_dir, "train_processed.parquet.gzip")
).astype({"bounces": pd.CategoricalDtype()})
df_val = pd.read_parquet(
    os.path.join(processed_data_dir, "val_processed.parquet.gzip")
).astype({"bounces": pd.CategoricalDtype()})
df_test = pd.read_parquet(
    os.path.join(processed_data_dir, "test_processed.parquet.gzip")
).astype({"bounces": pd.CategoricalDtype()})
df_train.head()

Unnamed: 0,fullvisitorid,visitId,visitNumber,visitStartTime,country,quarter,month,day_of_month,day_of_week,hour,...,added_to_cart,made_purchase_on_future_visit,bounces,last_action,source,medium,channelGrouping,browser,os,deviceCategory
0,4180680121446408775,1476943855,1,2016-10-19 23:10:55,United States,4,10,19,4,23,...,0,False,0,0,google,organic,Organic Search,Chrome,Android,mobile
1,3072592563711482446,1476880065,1,2016-10-19 05:27:45,United States,4,10,19,4,5,...,0,False,0,0,google,organic,Organic Search,Chrome,Android,mobile
2,1687301606877489412,1477794145,1,2016-10-29 19:22:25,United States,4,10,29,7,19,...,0,False,0,0,youtube.com,referral,other,other,Windows,desktop
3,796191439564725883,1473279331,1,2016-09-07 13:15:31,United States,3,9,7,4,13,...,0,False,0,0,google,organic,Organic Search,Chrome,Windows,desktop
4,9194147359170837949,1478035636,1,2016-11-01 14:27:16,United States,4,11,1,3,14,...,0,False,0,0,youtube.com,referral,other,Chrome,Android,mobile


We need categorical features to be numeric for convenience (eg. checking for collinearity). So, we will use the `.cat.categories` attribute of `pandas.CategoricalDtype()` columns to create mapper dict that maps each unique category (string) in these columns to an integer.

Map strings in categorical columns in copy of training data, using the `.cat.categories attribute`

In [11]:
cat_mapper_dicts_train = []
for cat_col in categorical_features:
    cat_mapper_dict = dict(
        zip(
            df_train[cat_col].cat.categories.tolist(),
            range(df_train[cat_col].nunique()),
        )
    )
    df_train[cat_col] = df_train[cat_col].map(cat_mapper_dict)
    cat_mapper_dicts_train.append({cat_col: cat_mapper_dict})
df_train[categorical_features].head()

Unnamed: 0,deviceCategory,bounces,channelGrouping,medium,source
0,1,0,1,1,1
1,1,0,1,1,1
2,0,0,3,3,4
3,0,0,1,1,1
4,1,0,3,3,4


::: {.callout-tip title="Observations"}

1. The categorical columns now contain integers instead of strings.
:::

Map strings in categorical columns in copy of validation data, using the `.cat.categories attribute`

In [12]:
cat_mapper_dicts_val = []
for cat_col in categorical_features:
    cat_mapper_dict = dict(
        zip(
            df_val[cat_col].cat.categories.tolist(),
            range(df_val[cat_col].nunique()),
        )
    )
    df_val[cat_col] = df_val[cat_col].map(cat_mapper_dict)
    cat_mapper_dicts_val.append({cat_col: cat_mapper_dict})
df_val[categorical_features].head()

Unnamed: 0,deviceCategory,bounces,channelGrouping,medium,source
0,0,0,1,1,1
1,2,0,1,1,1
2,0,0,1,1,1
3,0,0,1,1,1
4,0,0,1,1,1


Map strings in categorical columns in copy of test data, using the `.cat.categories attribute`

In [13]:
cat_mapper_dicts_test = []
for cat_col in categorical_features:
    cat_mapper_dict = dict(
        zip(
            df_test[cat_col].cat.categories.tolist(),
            range(df_test[cat_col].nunique()),
        )
    )
    df_test[cat_col] = df_test[cat_col].map(cat_mapper_dict)
    cat_mapper_dicts_test.append({cat_col: cat_mapper_dict})
df_test[categorical_features].head()

Unnamed: 0,deviceCategory,bounces,channelGrouping,medium,source
0,0,0,1,1,1
1,0,0,1,1,1
2,0,0,1,1,1
3,0,0,1,1,1
4,0,0,3,2,1


Show the mapper dictionary that has been created for each categorical column in the training data

In [14]:
cat_mapper_dicts_train

[{'deviceCategory': {'desktop': 0, 'mobile': 1, 'tablet': 2}},
 {'bounces': {0: 0, 1: 1}},
 {'channelGrouping': {'Direct': 0,
   'Organic Search': 1,
   'Referral': 2,
   'other': 3}},
 {'medium': {'(none)': 0, 'organic': 1, 'other': 2, 'referral': 3}},
 {'source': {'(direct)': 0,
   'google': 1,
   'mall.googleplex.com': 2,
   'other': 3,
   'youtube.com': 4}}]

Show the mapper dictionary that has been created for each categorical column in the validation data

In [15]:
cat_mapper_dicts_val

[{'deviceCategory': {'desktop': 0, 'mobile': 1, 'tablet': 2}},
 {'bounces': {0: 0, 1: 1}},
 {'channelGrouping': {'Direct': 0,
   'Organic Search': 1,
   'Referral': 2,
   'other': 3}},
 {'medium': {'(none)': 0, 'organic': 1, 'other': 2, 'referral': 3}},
 {'source': {'(direct)': 0,
   'google': 1,
   'mall.googleplex.com': 2,
   'other': 3,
   'youtube.com': 4}}]

Show the mapper dictionary that has been created for each categorical column in the test data

In [16]:
cat_mapper_dicts_test

[{'deviceCategory': {'desktop': 0, 'mobile': 1, 'tablet': 2}},
 {'bounces': {0: 0, 1: 1}},
 {'channelGrouping': {'Direct': 0,
   'Organic Search': 1,
   'Referral': 2,
   'other': 3}},
 {'medium': {'(none)': 0, 'organic': 1, 'other': 2, 'referral': 3}},
 {'source': {'(direct)': 0,
   'google': 1,
   'mall.googleplex.com': 2,
   'other': 3,
   'youtube.com': 4}}]

::: {.callout-tip title="Observations"}

1.The categorical columns now contain integers instead of strings.
:::

We'll also select the necessary columns (numericals, categoricals and label) that will be used for ML model development

In [17]:
cols_to_use = numerical_features + categorical_features + ["last_action"]
label = "made_purchase_on_future_visit"

::: {.callout-note title="Notes"}

1. `last_action` is a categorical but is already present in the transformed data as an integer, so it does not need to be encoded.
2. `made_purchase_on_future_visit` is the label column and it is a `boolean`, so it does not need to be encoded.
:::

Get features from the transformed data splits

In [18]:
features_used = (
    numerical_features
    + categorical_features
    + ["last_action", "made_purchase_on_future_visit"]
)
features_used

['hits',
 'promos_displayed',
 'promos_clicked',
 'product_views',
 'product_clicks',
 'pageviews',
 'time_on_site',
 'deviceCategory',
 'bounces',
 'channelGrouping',
 'medium',
 'source',
 'last_action',
 'made_purchase_on_future_visit']

In [19]:
df_train = df_train[features_used].copy()
df_val = df_val[features_used].copy()
df_test = df_test[features_used].copy()

Create a new split with the combination of the training and validation data splits

In [20]:
# | code-fold: false
df_train_val = pd.concat(
    [df_train.assign(split="train"), df_val.assign(split="val")], ignore_index=True
)

Shuffle the data in the

1. combined training and validation
2. test

data splits

In [21]:
# | code-fold: false
df_train_val = df_train_val.sample(frac=1.0, random_state=88)
df_test = df_test.sample(frac=1.0, random_state=88)

## Get Features and Label

Separate features from the target in the train and validation splits

In [22]:
# | code-fold: false
X_train = df_train_val.query("split == 'train'").drop(
    columns=["made_purchase_on_future_visit", "split"]
)
y_train = df_train_val.query("split == 'train'")[
    "made_purchase_on_future_visit"
].astype(int)

X_val = df_train_val.query("split == 'val'").drop(
    columns=["made_purchase_on_future_visit", "split"]
)
y_val = df_train_val.query("split == 'val'")["made_purchase_on_future_visit"].astype(
    int
)

Separate features from the target in the combined train-validation split

In [23]:
# | code-fold: false
X_train_val = df_train_val.drop(columns=["made_purchase_on_future_visit", "split"])
y_train_val = df_train_val["made_purchase_on_future_visit"].astype(int)

Separate features from the target in the test split

In [24]:
# | code-fold: false
X_test = df_test.drop(columns=["made_purchase_on_future_visit"])
y_test = df_test["made_purchase_on_future_visit"].astype(int)

## Resampling Due to Class Imbalance

Due to the class imbalance, we will undersample the majority class using `imblearn.RandomUnderSampler()`. To compensate for this, when evaluating predictions against this resampled data, we will apply a weighting factor.

The following order is used for resampling and data splitting/preprocessing

1. Resampling is done after splitting the data ([link](https://towardsdatascience.com/4-tips-for-advanced-feature-engineering-and-preprocessing-ec11575c09ea))
2. Normalizing numerical features will be performed after resampling ([1](https://datascience.stackexchange.com/a/71519/17543), [2](https://stats.stackexchange.com/a/363325/144450))

Resampling will be performed as follows

1. ML model development (selecting best model using training and validation splits)
   - undersample majority class in training data
     - do not make predictions against undersampled validation data
     - no evaluation necessary
   - do not undersample majority class in validataion data
     - make predictions against original data
     - evaluate without using sample weights
2. ML model evaluation (evaluating performance of best model using combined training + validation and test splits)
   - undersample majority class in combined training and validation data
     - make predictions against undersampled combined training and validation data
     - evaluate using sample weights
   - do not undersample majority class in test data
     - make predictions against original data
     - evaluate without using sample weights

Below is a helper function to get the sample weights based on a downsampling factor. The downsampling factor is multiplied by the value of the label (0 or 1). For the majority class, this gives a weight greater than 1. For the minority class, this gives a weight of zero, so we have replaced zeros by a sample weight of 1 ([1](https://stackoverflow.com/a/34477381/4057186), [2](https://stackoverflow.com/a/71686189/4057186)).

This is shown below

In [97]:
def get_sample_weight(y: pd.Series, ds_factor: float) -> pd.Series:
    """."""
    y = pd.Series(y, name="test")
    y_name = y.name
    sample_weight = y.to_frame().assign(
        sample_weight=lambda df: ((df[y_name] == 0) * ds_factor).replace(0, 1)
    )["sample_weight"]
    return sample_weight

We will undersample using a sampling strategy with a ratio of 10:1 (or 1/10). In the original training data, the ratio is approximately 19:1, as we saw in earlier steps in the analysis.

A `RandomUnderSampler` object is defined below with this sampling strategy

In [26]:
us = RandomUnderSampler(sampling_strategy=1 / 10)

Downsample the training data

In [27]:
X_train_us, y_train_us = us.fit_resample(X_train, y_train)
df_train_us = pd.concat([X_train_us, y_train_us], axis=1).sample(frac=1.0)
X_train_us, y_train_us = [
    df_train_us.drop(columns=["made_purchase_on_future_visit"]),
    df_train_us["made_purchase_on_future_visit"],
]
y_train_us.value_counts(normalize=True).reset_index()

Unnamed: 0,made_purchase_on_future_visit,proportion
0,0,0.909091
1,1,0.090909


Calculate the downsampling factor for the training split, as the ratio of the class imbalance after downsampling to before downsampling

In [28]:
ds_factor_train = (y_train_us.value_counts().loc[1] / len(y_train_us)) / (
    y_train.value_counts().loc[1] / len(y_train)
)
print(ds_factor_train)

1.9797005347593584


Downsample the combined training and validation data

In [29]:
X_train_val_us, y_train_val_us = us.fit_resample(X_train_val, y_train_val)
df_train_val_us = pd.concat([X_train_val_us, y_train_val_us], axis=1).sample(frac=1.0)
X_train_val_us, y_train_val_us = [
    df_train_val_us.drop(columns=["made_purchase_on_future_visit"]),
    df_train_val_us["made_purchase_on_future_visit"],
]
y_train_val_us.value_counts(normalize=True).reset_index()

Unnamed: 0,made_purchase_on_future_visit,proportion
0,0,0.909091
1,1,0.090909


Calculate the downsampling factor for the combined training and validation split

In [30]:
ds_factor_train_val = (y_train_val_us.value_counts().loc[1] / len(y_train_val_us)) / (
    y_train_val.value_counts().loc[1] / len(y_train_val)
)
print(ds_factor_train_val)

2.0794266071820378


Combine features and label in all splits

In [31]:
df_train_combo = pd.concat([X_train_us, y_train_us], axis=1)
df_train_val_combo = pd.concat([X_train_val_us, y_train_val_us], axis=1)

df_val_combo = pd.concat([X_val, y_val], axis=1)
df_test_combo = pd.concat([X_test, y_test], axis=1)

In [32]:
# from sklearn.datasets import make_classification

# X, y = make_classification(n_samples=10000, weights=[0.95], flip_y=0)
# X = pd.DataFrame(X)
# y = pd.Series(y, name="true")

# us = RandomUnderSampler(sampling_strategy=1 / 10)
# X_us, y_us = us.fit_resample(X, y)
# ds_factor = (y_us.value_counts().loc[1] / len(y_us)) / (
#     y.value_counts().loc[1] / len(y)
# )
# print(ds_factor)
# display(pd.Series(y).value_counts(normalize=False).reset_index())
# display(pd.Series(y_us).value_counts(normalize=False).reset_index())
# sample_weight = get_sample_weight(y_us, ds_factor)
# sample_weight

## Feature Processing

### Handling Missing Values

Missing values are not present in this dataset. See the discussion in the data transformation step for more details.

### Feature Selection

We'll be starting with `LogisticRegression` as one of the classifiers and [this type of model cannot handle multi-collinearity between the features](https://stats.stackexchange.com/a/583034/144450). We will also briefly explore tree-based models, [which can handle multi-collinearity](https://datascience.stackexchange.com/a/12597/17543).

Multi-collinear features makes it difficult to interpret model's coefficients ([1](https://www.tandfonline.com/doi/abs/10.1080/09720502.2010.10700699?journalCode=tjim20), [2](https://towardsdatascience.com/how-to-avoid-multicollinearity-in-categorical-data-46eb39d9cd0d)). This is a functionality we would like to provide to the non-technical marketing team (our client for this use-case), so we must remove multi-collinear features from the transformed data.

Genreally, a correlation coefficient between two features of 0.7 or higher is considered as high (features are highly correlated), while a value between 0.5 and 0.7 is moderate ([1](https://www.andrews.edu/~calkins/math/edrm611/edrm05.htm), [2](https://www.westga.edu/academics/research/vrc/assets/docs/scatterplots_and_correlation_notes.pdf)). With this in mind, we will perform feature selection based on the inter-correlation between features. To do this, we will select features that are not correlated to each other (correlation coefficient less than 0.7) and drop those that are correlated.

Now, we'll show the collinearity between all numerical and categorical features

In [33]:
df_corr = df_train_combo.corr()
display(df_corr.drop(columns=[label]).reset_index().rename(columns={"index": "column"}))

Unnamed: 0,column,hits,promos_displayed,promos_clicked,product_views,product_clicks,pageviews,time_on_site,deviceCategory,bounces,channelGrouping,medium,source,last_action
0,hits,1.0,0.426498,0.246171,0.746177,0.853819,0.986334,0.69891,-0.085487,-0.349008,-0.02866,0.012319,-0.062944,0.647588
1,promos_displayed,0.426498,1.0,0.462642,0.331835,0.200887,0.455489,0.326451,0.056031,-0.36141,0.119008,0.160851,0.092925,0.196579
2,promos_clicked,0.246171,0.462642,1.0,0.237362,0.122976,0.217341,0.154906,0.137447,-0.196248,0.036655,0.029673,0.017272,0.052187
3,product_views,0.746177,0.331835,0.237362,1.0,0.592322,0.764883,0.543804,-0.052768,-0.327465,0.002974,0.033979,-0.04299,0.367652
4,product_clicks,0.853819,0.200887,0.122976,0.592322,1.0,0.791095,0.520807,-0.082364,-0.220815,-0.022741,0.008344,-0.044522,0.5098
5,pageviews,0.986334,0.455489,0.217341,0.764883,0.791095,1.0,0.720267,-0.092173,-0.372873,-0.029736,0.013788,-0.067647,0.655521
6,time_on_site,0.69891,0.326451,0.154906,0.543804,0.520807,0.720267,1.0,-0.063548,-0.303877,-0.039434,-0.008904,-0.05558,0.519868
7,deviceCategory,-0.085487,0.056031,0.137447,-0.052768,-0.082364,-0.092173,-0.063548,1.0,0.071771,-0.114469,-0.198789,-0.171836,-0.136539
8,bounces,-0.349008,-0.36141,-0.196248,-0.327465,-0.220815,-0.372873,-0.303877,0.071771,1.0,-0.042276,-0.097357,0.006067,-0.348434
9,channelGrouping,-0.02866,0.119008,0.036655,0.002974,-0.022741,-0.029736,-0.039434,-0.114469,-0.042276,1.0,0.896407,0.844177,-0.108924


::: {.callout-tip title="Observations"}

1. Unfortunately, several attributes that were extracted from the raw visits data are correlated to each other.
2. The following numerical features demonstrate multi-collinearity
   - `product_views`
   - `product_clicks`
   - `hits`
   - `time_on_site`
3. The following categorical features demonstrate multi-collinearity
   - `channelGrouping`
   - `medium`

   We also noted the correlation between `channelGrouping` and `medium` in the EDA step of the analysis.
:::

Based on these observations, we'll create a list of features to be dropped

In [34]:
cols_to_drop = [
    "channelGrouping",
    "medium",
    "hits",
    "product_views",
    "product_clicks",
    "time_on_site",
]

Showing the correlations after dropping these features

In [35]:
(
    df_train_combo.drop(columns=cols_to_drop + [label])
    .corr()
    .reset_index()
    .rename(columns={"index": "column"})
)

Unnamed: 0,column,promos_displayed,promos_clicked,pageviews,deviceCategory,bounces,source,last_action
0,promos_displayed,1.0,0.462642,0.455489,0.056031,-0.36141,0.092925,0.196579
1,promos_clicked,0.462642,1.0,0.217341,0.137447,-0.196248,0.017272,0.052187
2,pageviews,0.455489,0.217341,1.0,-0.092173,-0.372873,-0.067647,0.655521
3,deviceCategory,0.056031,0.137447,-0.092173,1.0,0.071771,-0.171836,-0.136539
4,bounces,-0.36141,-0.196248,-0.372873,0.071771,1.0,0.006067,-0.348434
5,source,0.092925,0.017272,-0.067647,-0.171836,0.006067,1.0,-0.11054
6,last_action,0.196579,0.052187,0.655521,-0.136539,-0.348434,-0.11054,1.0


::: {.callout-tip title="Observations"}

1. The remaining features are moderately (`last_action` and `pageviews`) or weakly (all ther combinations) correlated to each other.
:::

Finally, we'll show the correlation between the selected features and the label

In [36]:
display(
    df_train_combo.drop(columns=cols_to_drop)
    .corr()[["made_purchase_on_future_visit"]]
    .reset_index()
    .rename(columns={"index": "column"})
    .query(f"column != '{label}'")
)

Unnamed: 0,column,made_purchase_on_future_visit
0,promos_displayed,0.045566
1,promos_clicked,-0.024906
2,pageviews,0.196882
3,deviceCategory,-0.150762
4,bounces,-0.134878
5,source,-0.027029
6,last_action,0.296827


::: {.callout-tip title="Observations"}

1. The remaining features are weakly correlated to the label (`made_purchase_on_future_visit`), with `last_action`, `pageviews` and `promos_displayed` showing the highest correlation.
:::

### Feature Engineering

For each numerical feature, we'll extract three new features as the ratio to the mean. For `pageviews`, this will give
```python
df = df['pageviews'] / df['pageviews'].mean()
```

The intuition behind this transformation is that visitors with a higher-than-average number of pages viewed on their first visit have a higher likelihood of making a purchase on a return visit.

A custom `sklearn` transformer is used to define this below

In [37]:
class AboveAveragePagePromoEngager(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        above_avg_pageviews = (
            0 if X["pageviews"].mean() == 0 else X["pageviews"] / X["pageviews"].mean()
        )
        above_avg_promos_clicked = (
            0
            if X["promos_clicked"].mean() == 0
            else X["promos_clicked"] / X["promos_clicked"].mean()
        )
        above_avg_promos_displayed = (
            0
            if X["promos_displayed"].mean() == 0
            else X["promos_displayed"] / X["promos_displayed"].mean()
        )

        X = (
            X.assign(above_avg_pageviews=lambda df: above_avg_pageviews)
            .assign(above_avg_promos_clicked=lambda df: above_avg_promos_clicked)
            .assign(above_avg_promos_displayed=lambda df: above_avg_promos_displayed)
        )
        return X

### Feature Processing for Model Development Using Training and Validation Data

An overall feature processing pipeline is now defined to process the selected features.

Get the names of the features after selecting (non-correlated) features

In [38]:
numerical_features_after_dropping = [
    c for c in numerical_features if c not in cols_to_drop
]
categorical_features_after_dropping = [
    c for c in categorical_features + ["last_action"] if c not in cols_to_drop
]

A preprocessor is defined to perform the following

1. normalize all numerical features (using `MinMaxScaler`)
2. dummy encoding for all categorical features (using `OneHotEncoder`)
   - similar to one-hot encoding, but with [one less category per categorical feature](https://datascience.stackexchange.com/a/98173/17543)

In [39]:
numeric_transformer = Pipeline(
    steps=[("aboveavg", AboveAveragePagePromoEngager()), ("scaler", MinMaxScaler())]
)
categorical_transformer = Pipeline(
    steps=[("encoder", OneHotEncoder(handle_unknown="ignore", dtype=int, drop="first"))]
)
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numerical_features_after_dropping),
        ("cat", categorical_transformer, categorical_features_after_dropping),
    ]
)

The feature processing pipeline is now defined

In [40]:
pipe_trans = Pipeline(steps=[("preprocessor", preprocessor)])

Train the pipeline on the undersampled training data

In [41]:
_ = pipe_trans.fit(
    X_train_us[numerical_features_after_dropping + categorical_features_after_dropping]
)

Extract the processed categorical feature names from the trained processing pipeline

In [42]:
categoricals_processed = (
    pipe_trans.named_steps["preprocessor"]
    .named_transformers_["cat"]
    .get_feature_names_out(categorical_features_after_dropping)
    .tolist()
)
categoricals_processed

['deviceCategory_1',
 'deviceCategory_2',
 'bounces_1',
 'source_1',
 'source_2',
 'source_3',
 'source_4',
 'last_action_1',
 'last_action_2',
 'last_action_3',
 'last_action_4',
 'last_action_5',
 'last_action_6']

Extract all processed feature names

In [43]:
features_processed = (
    numerical_features_after_dropping
    + [f"above_avg_{c}" for c in numerical_features_after_dropping]
    + categoricals_processed
)
features_processed

['promos_displayed',
 'promos_clicked',
 'pageviews',
 'above_avg_promos_displayed',
 'above_avg_promos_clicked',
 'above_avg_pageviews',
 'deviceCategory_1',
 'deviceCategory_2',
 'bounces_1',
 'source_1',
 'source_2',
 'source_3',
 'source_4',
 'last_action_1',
 'last_action_2',
 'last_action_3',
 'last_action_4',
 'last_action_5',
 'last_action_6']

Create a datatype mapping dictionary to change the dummy-encoded categorical features to integers in the processed data

In [44]:
categoricals_processed_dtypes = dict(
    zip(categoricals_processed, [pd.Int8Dtype() for _ in categoricals_processed])
)
categoricals_processed_dtypes

{'deviceCategory_1': Int8Dtype(),
 'deviceCategory_2': Int8Dtype(),
 'bounces_1': Int8Dtype(),
 'source_1': Int8Dtype(),
 'source_2': Int8Dtype(),
 'source_3': Int8Dtype(),
 'source_4': Int8Dtype(),
 'last_action_1': Int8Dtype(),
 'last_action_2': Int8Dtype(),
 'last_action_3': Int8Dtype(),
 'last_action_4': Int8Dtype(),
 'last_action_5': Int8Dtype(),
 'last_action_6': Int8Dtype()}

Process the features in the undersampled training data

In [45]:
X_train_trans = pd.DataFrame(
    pipe_trans.transform(
        X_train_us[
            numerical_features_after_dropping + categorical_features_after_dropping
        ]
    ),
    columns=features_processed,
).astype(categoricals_processed_dtypes)
display(X_train_trans.head())
display(X_train_trans.tail())

Unnamed: 0,promos_displayed,promos_clicked,pageviews,above_avg_promos_displayed,above_avg_promos_clicked,above_avg_pageviews,deviceCategory_1,deviceCategory_2,bounces_1,source_1,source_2,source_3,source_4,last_action_1,last_action_2,last_action_3,last_action_4,last_action_5,last_action_6
0,0.0,0.0,0.018349,0.018349,0.0,0.0,0,0,0,0,0,1,0,0,0,1,0,0,0
1,0.0,0.0,0.073394,0.073394,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,1
2,0.0,0.0,0.0,0.0,0.0,0.0,1,0,1,0,0,0,0,0,0,0,0,0,0
3,0.0,0.0,0.013761,0.013761,0.0,0.0,1,0,0,1,0,0,0,0,0,0,0,0,0
4,0.012658,0.0,0.004587,0.004587,0.0,0.012658,1,0,0,1,0,0,0,0,0,0,0,0,0


Unnamed: 0,promos_displayed,promos_clicked,pageviews,above_avg_promos_displayed,above_avg_promos_clicked,above_avg_pageviews,deviceCategory_1,deviceCategory_2,bounces_1,source_1,source_2,source_3,source_4,last_action_1,last_action_2,last_action_3,last_action_4,last_action_5,last_action_6
46745,0.0,0.0,0.018349,0.018349,0.0,0.0,0,0,0,0,0,1,0,0,0,1,0,0,0
46746,0.012658,0.0,0.004587,0.004587,0.0,0.012658,0,0,0,1,0,0,0,0,0,0,0,0,0
46747,0.025316,0.029412,0.009174,0.009174,0.029412,0.025316,0,0,0,0,0,1,0,0,0,0,0,0,0
46748,0.012658,0.0,0.027523,0.027523,0.0,0.012658,0,0,0,1,0,0,0,0,0,0,0,0,0
46749,0.0,0.0,0.0,0.0,0.0,0.0,1,0,1,0,0,0,0,0,0,0,0,0,0


Process the features in the validation data, using the preprocessing pipeline that was trained on the undersampled training data

In [46]:
X_val_trans = pd.DataFrame(
    pipe_trans.transform(
        X_val[numerical_features_after_dropping + categorical_features_after_dropping]
    ),
    columns=features_processed,
).astype(categoricals_processed_dtypes)
display(X_val_trans.head())
display(X_val_trans.tail())

Unnamed: 0,promos_displayed,promos_clicked,pageviews,above_avg_promos_displayed,above_avg_promos_clicked,above_avg_pageviews,deviceCategory_1,deviceCategory_2,bounces_1,source_1,source_2,source_3,source_4,last_action_1,last_action_2,last_action_3,last_action_4,last_action_5,last_action_6
0,0.012658,0.0,0.013761,0.016072,0.0,0.01592,0,0,0,1,0,0,0,0,0,0,0,0,0
1,0.037975,0.0,0.068807,0.07805,0.0,0.04776,0,0,0,1,0,0,0,0,0,1,0,0,0
2,0.012658,0.0,0.0,0.000578,0.0,0.01592,0,0,1,0,0,1,0,0,0,0,0,0,0
3,0.0,0.0,0.004587,0.005742,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0.012658,0.0,0.013761,0.016072,0.0,0.01592,0,0,0,0,1,0,0,0,0,0,0,0,0


Unnamed: 0,promos_displayed,promos_clicked,pageviews,above_avg_promos_displayed,above_avg_promos_clicked,above_avg_pageviews,deviceCategory_1,deviceCategory_2,bounces_1,source_1,source_2,source_3,source_4,last_action_1,last_action_2,last_action_3,last_action_4,last_action_5,last_action_6
21172,0.012658,0.0,0.192661,0.217499,0.0,0.01592,0,0,0,0,1,0,0,0,0,0,0,0,1
21173,0.012658,0.0,0.004587,0.005742,0.0,0.01592,0,0,0,0,0,0,0,0,0,0,0,0,0
21174,0.0,0.0,0.0,0.000578,0.0,0.0,0,0,1,1,0,0,0,0,0,0,0,0,0
21175,0.0,0.0,0.0,0.000578,0.0,0.0,1,0,1,1,0,0,0,0,0,0,0,0,0
21176,0.012658,0.0,0.0,0.000578,0.0,0.01592,0,0,1,0,0,0,1,0,0,0,0,0,0


### Feature Processing for Model Evaluation Using Training + Validation and Testing Data

Train the pipeline on the undersampled combined training and validation data

In [47]:
_ = pipe_trans.fit(
    X_train_val_us[
        numerical_features_after_dropping + categorical_features_after_dropping
    ]
)

Extract the processed categorical feature names from the trained processing pipeline

In [48]:
categoricals_processed_train_val = (
    pipe_trans.named_steps["preprocessor"]
    .named_transformers_["cat"]
    .get_feature_names_out(categorical_features_after_dropping)
    .tolist()
)
categoricals_processed_train_val

['deviceCategory_1',
 'deviceCategory_2',
 'bounces_1',
 'source_1',
 'source_2',
 'source_3',
 'source_4',
 'last_action_1',
 'last_action_2',
 'last_action_3',
 'last_action_4',
 'last_action_5',
 'last_action_6']

Extract all processed feature names

In [49]:
features_processed_train_val = (
    numerical_features_after_dropping
    + [f"above_avg_{c}" for c in numerical_features_after_dropping]
    + categoricals_processed_train_val
)
features_processed_train_val

['promos_displayed',
 'promos_clicked',
 'pageviews',
 'above_avg_promos_displayed',
 'above_avg_promos_clicked',
 'above_avg_pageviews',
 'deviceCategory_1',
 'deviceCategory_2',
 'bounces_1',
 'source_1',
 'source_2',
 'source_3',
 'source_4',
 'last_action_1',
 'last_action_2',
 'last_action_3',
 'last_action_4',
 'last_action_5',
 'last_action_6']

Create a datatype mapping dictionary to change the dummy-encoded categorical features to integers in the processed data

In [50]:
categoricals_processed_dtypes_train_val = dict(
    zip(
        categoricals_processed_train_val,
        [pd.Int8Dtype() for _ in categoricals_processed_train_val],
    )
)
categoricals_processed_dtypes_train_val

{'deviceCategory_1': Int8Dtype(),
 'deviceCategory_2': Int8Dtype(),
 'bounces_1': Int8Dtype(),
 'source_1': Int8Dtype(),
 'source_2': Int8Dtype(),
 'source_3': Int8Dtype(),
 'source_4': Int8Dtype(),
 'last_action_1': Int8Dtype(),
 'last_action_2': Int8Dtype(),
 'last_action_3': Int8Dtype(),
 'last_action_4': Int8Dtype(),
 'last_action_5': Int8Dtype(),
 'last_action_6': Int8Dtype()}

Process the features in the undersampled combined training and validation data

In [51]:
X_train_val_trans = pd.DataFrame(
    pipe_trans.transform(
        X_train_val_us[
            numerical_features_after_dropping + categorical_features_after_dropping
        ]
    ),
    columns=features_processed,
).astype(categoricals_processed_dtypes)
display(X_train_val_trans.head())
display(X_train_val_trans.tail())

Unnamed: 0,promos_displayed,promos_clicked,pageviews,above_avg_promos_displayed,above_avg_promos_clicked,above_avg_pageviews,deviceCategory_1,deviceCategory_2,bounces_1,source_1,source_2,source_3,source_4,last_action_1,last_action_2,last_action_3,last_action_4,last_action_5,last_action_6
0,0.017241,0.047619,0.028112,0.028112,0.047619,0.017241,0,0,0,1,0,0,0,0,0,0,0,0,0
1,0.0,0.0,0.004016,0.004016,0.0,0.0,1,0,1,1,0,0,0,0,0,0,0,0,0
2,0.0,0.0,0.008032,0.008032,0.0,0.0,1,0,0,1,0,0,0,0,1,0,0,0,0
3,0.017241,0.047619,0.024096,0.024096,0.047619,0.017241,0,0,0,1,0,0,0,0,1,0,0,0,0
4,0.0,0.0,0.016064,0.016064,0.0,0.0,1,0,0,1,0,0,0,0,0,0,0,0,0


Unnamed: 0,promos_displayed,promos_clicked,pageviews,above_avg_promos_displayed,above_avg_promos_clicked,above_avg_pageviews,deviceCategory_1,deviceCategory_2,bounces_1,source_1,source_2,source_3,source_4,last_action_1,last_action_2,last_action_3,last_action_4,last_action_5,last_action_6
54687,0.017241,0.0,0.012048,0.012048,0.0,0.017241,0,0,0,1,0,0,0,0,0,0,0,0,0
54688,0.0,0.0,0.004016,0.004016,0.0,0.0,0,0,1,1,0,0,0,0,0,0,0,0,0
54689,0.017241,0.047619,0.012048,0.012048,0.047619,0.017241,0,0,0,0,0,0,1,0,0,1,0,0,0
54690,0.034483,0.0,0.012048,0.012048,0.0,0.034483,0,0,0,1,0,0,0,0,0,0,0,0,0
54691,0.0,0.0,0.004016,0.004016,0.0,0.0,1,0,1,1,0,0,0,0,0,0,0,0,0


Process the features in the test data, using the preprocessing pipeline that was trained on the undersampled combined training and validation data

In [52]:
X_test_trans = pd.DataFrame(
    pipe_trans.transform(
        X_test[numerical_features_after_dropping + categorical_features_after_dropping]
    ),
    columns=features_processed,
).astype(categoricals_processed_dtypes)
display(X_test_trans.head())
display(X_test_trans.tail())

Unnamed: 0,promos_displayed,promos_clicked,pageviews,above_avg_promos_displayed,above_avg_promos_clicked,above_avg_pageviews,deviceCategory_1,deviceCategory_2,bounces_1,source_1,source_2,source_3,source_4,last_action_1,last_action_2,last_action_3,last_action_4,last_action_5,last_action_6
0,0.017241,0.0,0.004016,0.005013,0.0,0.021259,1,0,1,1,0,0,0,0,0,0,0,0,0
1,0.017241,0.0,0.004016,0.005013,0.0,0.021259,0,0,1,0,0,0,0,0,0,0,0,0,0
2,0.017241,0.0,0.024096,0.030077,0.0,0.021259,0,0,0,0,0,1,0,0,0,0,0,0,0
3,0.017241,0.0,0.004016,0.005013,0.0,0.021259,0,0,1,0,0,0,1,0,0,0,0,0,0
4,0.0,0.0,0.008032,0.010026,0.0,0.0,0,0,0,1,0,0,0,0,1,0,0,0,0


Unnamed: 0,promos_displayed,promos_clicked,pageviews,above_avg_promos_displayed,above_avg_promos_clicked,above_avg_pageviews,deviceCategory_1,deviceCategory_2,bounces_1,source_1,source_2,source_3,source_4,last_action_1,last_action_2,last_action_3,last_action_4,last_action_5,last_action_6
20159,0.034483,0.0,0.02008,0.025065,0.0,0.042519,0,0,0,0,0,0,0,0,0,0,0,0,0
20160,0.0,0.0,0.004016,0.005013,0.0,0.0,0,0,1,0,0,0,0,0,0,0,0,0,0
20161,0.0,0.0,0.008032,0.010026,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0
20162,0.0,0.0,0.004016,0.005013,0.0,0.0,0,0,1,0,1,0,0,0,0,0,0,0,0
20163,0.017241,0.0,0.004016,0.005013,0.0,0.021259,0,0,1,0,1,0,0,0,0,0,0,0,0


Verify that there are no missing values in the transformed training data

In [53]:
assert (X_train_trans.isna().sum() == 0).all()
X_train_trans.isna().sum().reset_index().rename(
    columns={0: "missing", "index": "transformed_feature"}
)

Unnamed: 0,transformed_feature,missing
0,promos_displayed,0
1,promos_clicked,0
2,pageviews,0
3,above_avg_promos_displayed,0
4,above_avg_promos_clicked,0
5,above_avg_pageviews,0
6,deviceCategory_1,0
7,deviceCategory_2,0
8,bounces_1,0
9,source_1,0


Verify that there are no missing values in the transformed combined training and validation data

In [54]:
assert (X_train_val_trans.isna().sum() == 0).all()
X_train_val_trans.isna().sum().reset_index().rename(
    columns={0: "missing", "index": "transformed_feature"}
)

Unnamed: 0,transformed_feature,missing
0,promos_displayed,0
1,promos_clicked,0
2,pageviews,0
3,above_avg_promos_displayed,0
4,above_avg_promos_clicked,0
5,above_avg_pageviews,0
6,deviceCategory_1,0
7,deviceCategory_2,0
8,bounces_1,0
9,source_1,0


::: {.callout-tip title="Observations"}

1. The processed training data has no missing values so our feature processing pipeline has not produced errors when transforming the data. In order to avoid data leakage/lookahead bias, we will assume this is also the case with the validation split.
2. The same is true for the combined training and validation split after transformation using the feature processing pipeline. Here, we will assume this is also the case with the test split.
:::

## ML Training using Validation Data

Define ML pipelines to be compared in this step

In [55]:
pipe_lr = Pipeline([("clf", LogisticRegression())])
pipe_rf = Pipeline([("clf", RandomForestClassifier(n_estimators=500))])
pipe_gb = Pipeline([("clf", GradientBoostingClassifier(n_estimators=200))])

### `LogisticRegression`

Train using undersampled training data

In [61]:
%%time
_ = pipe_lr.fit(X_train_trans, y_train_us)

CPU times: user 503 ms, sys: 1.43 s, total: 1.93 s
Wall time: 176 ms


Make predictions on validation data

In [65]:
y_val_pred = pipe_lr.predict(X_val_trans)
y_val_pred_proba = pipe_lr.predict_proba(X_val_trans)[:, 1:]

Evaluate predictions on the validation data

In [67]:
scores_lr = get_metrics(
    y_val,
    y_val_pred,
    y_val_pred_proba,
    ds_factor=1.0,
    average="binary",
    zero_division="warn",
)
df_scores_lr = pd.DataFrame.from_records([scores_lr]).assign(model_type="lr")
df_scores_lr

Unnamed: 0,accuracy,balanced_accuracy,precision,recall,f1,f05,f2,brier,pr_auc,model_type
0,0.960901,0.554864,0.309353,0.119114,0.172,0.23446,0.135818,0.039099,0.19994,lr


### `RandomForestClassifier`

Repeat for `RandomForestClassifier`

In [72]:
# train
_ = pipe_rf.fit(X_train_trans, y_train_us)

# predict
y_val_pred = pipe_rf.predict(X_val_trans)
y_val_pred_proba = pipe_rf.predict_proba(X_val_trans)[:, 1:]

# evaluate
scores_rf = get_metrics(
    y_val,
    y_val_pred,
    y_val_pred_proba,
    ds_factor=1.0,
    average="binary",
    zero_division="warn",
)
df_scores_rf = pd.DataFrame.from_records([scores_rf]).assign(model_type="rf")
df_scores_rf

Unnamed: 0,accuracy,balanced_accuracy,precision,recall,f1,f05,f2,brier,pr_auc,model_type
0,0.95514,0.595974,0.285714,0.210526,0.242424,0.266667,0.222222,0.04486,0.182232,rf


### `GradientBoostingClassifier`

Repeat for `GradientBoostingClassifier`

In [77]:
# train
_ = pipe_gb.fit(X_train_trans, y_train_us)

# predict
y_val_pred = pipe_gb.predict(X_val_trans)
y_val_pred_proba = pipe_gb.predict_proba(X_val_trans)[:, 1:]

# evaluate
scores_gb = get_metrics(
    y_val,
    y_val_pred,
    y_val_pred_proba,
    ds_factor=1.0,
    average="binary",
    zero_division="warn",
)
df_scores_gb = pd.DataFrame.from_records([scores_gb]).assign(model_type="gb")
df_scores_gb

Unnamed: 0,accuracy,balanced_accuracy,precision,recall,f1,f05,f2,brier,pr_auc,model_type
0,0.958918,0.603943,0.342553,0.222992,0.270134,0.309377,0.239726,0.041082,0.23629,gb


### Extracting Best ML Model

Combine validation scores for all models and rank the models based on the validation split scores

In [78]:
# | code-fold: false
df_validation_scores = (
    pd.concat([df_scores_lr, df_scores_rf, df_scores_gb])
    .assign(
        rank=lambda df: df["f05"]
        .rank(ascending=False)
        .astype(int)
        .astype(pd.Int8Dtype())
    )
    .sort_values(by=["rank"], ascending=True)
)
df_validation_scores

Unnamed: 0,accuracy,balanced_accuracy,precision,recall,f1,f05,f2,brier,pr_auc,model_type,rank
0,0.958918,0.603943,0.342553,0.222992,0.270134,0.309377,0.239726,0.041082,0.23629,gb,1
0,0.95514,0.595974,0.285714,0.210526,0.242424,0.266667,0.222222,0.04486,0.182232,rf,2
0,0.960901,0.554864,0.309353,0.119114,0.172,0.23446,0.135818,0.039099,0.19994,lr,3


Finally, we'll extract the best ML model

In [79]:
# | code-fold: false
df_validation_scores.query("rank == 1")

Unnamed: 0,accuracy,balanced_accuracy,precision,recall,f1,f05,f2,brier,pr_auc,model_type,rank
0,0.958918,0.603943,0.342553,0.222992,0.270134,0.309377,0.239726,0.041082,0.23629,gb,1


## ML Evaluation

The best ML model is now used to make predictions on the test split and on the combined train and validation split. Recall that the combined train and validation split has been undersampled. The test split has not been undersampled.

### Make and Score Predictions

Repeat training and evaluation using the best model found above and evaluate the predictions on both splits

In [80]:
pipe_best = Pipeline([("clf", GradientBoostingClassifier(n_estimators=200))])

In [91]:
# train
_ = pipe_best.fit(X_train_val_trans, y_train_val_us)

# predict combined train and validation
y_train_val_us_pred = pipe_best.predict(X_test_trans)
y_train_val_us_proba = pipe_best.predict_proba(X_test_trans)[:, 1:]

# predict test
y_test_pred = pipe_best.predict(X_test_trans)
y_test_pred_proba = pipe_best.predict_proba(X_test_trans)[:, 1:]

# # evaluate train
# scores_train_val = get_metrics(
#     y_train_val_us,
#     y_train_val_us_pred,
#     y_train_val_us_proba,
#     ds_factor=ds_factor,
#     average="binary",
#     zero_division="warn",
# )
scores_test = get_metrics(
    y_test,
    y_test_pred,
    y_test_pred_proba,
    ds_factor=1.0,
    average="binary",
    zero_division="warn",
)

The evaluation scores from both splits are now combined and shown below

In [90]:
df_scores_eval = pd.concat(
    [
        # pd.DataFrame.from_records([scores_train_val]).assign(split='train+val'),
        pd.DataFrame.from_records([scores_test]).assign(split="test")
    ]
)
df_scores_eval

Unnamed: 0,accuracy,balanced_accuracy,precision,recall,f1,f05,f2,brier,pr_auc,split
0,0.968161,0.584748,0.244957,0.182796,0.20936,0.229358,0.192569,0.031839,0.162201,test


['promos_displayed',
 'promos_clicked',
 'pageviews',
 'above_avg_promos_displayed',
 'above_avg_promos_clicked',
 'above_avg_pageviews',
 'deviceCategory_1',
 'deviceCategory_2',
 'bounces_1',
 'source_1',
 'source_2',
 'source_3',
 'source_4',
 'last_action_1',
 'last_action_2',
 'last_action_3',
 'last_action_4',
 'last_action_5',
 'last_action_6']

### Feature Importances

In [105]:
dict(
    zip(list(X_test_trans), pipe_best.named_steps["clf"].feature_importances_.tolist())
)

{'promos_displayed': 0.019885111057703055,
 'promos_clicked': 0.0022416910516807036,
 'pageviews': 0.09548847570837914,
 'above_avg_promos_displayed': 0.09999988830954988,
 'above_avg_promos_clicked': 0.0028009122223698175,
 'above_avg_pageviews': 0.021791269245015957,
 'deviceCategory_1': 0.08599058985304658,
 'deviceCategory_2': 0.004802666487717654,
 'bounces_1': 0.0,
 'source_1': 0.21743892994936045,
 'source_2': 0.05442867188756295,
 'source_3': 0.026799358179278504,
 'source_4': 0.046058195523088236,
 'last_action_1': 0.00016048090843583915,
 'last_action_2': 0.04215936755630716,
 'last_action_3': 0.14067749475288735,
 'last_action_4': 0.024812321171727224,
 'last_action_5': 0.1057741750560125,
 'last_action_6': 0.008690401079876871}

**Observations**
1. Traffic source, promotions displayed on the screen and last action performed during the first visit are the most important predictors of likelihood for a purchase during a return visit.

## Summary

1. All ML models have outperformed the baseline model.
2. The best model shows promise but requires optimization to help improve evaluation scores.