In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import brier_score_loss, precision_score, recall_score, f1_score
from sklearn.inspection import permutation_importance
import numpy as np
from skl2onnx import convert_sklearn
from skl2onnx.common.data_types import FloatTensorType
from sklearn.calibration import CalibratedClassifierCV
import zipfile
import os

random_seed = 1

## 1. Data Preprocessing

### Load the data

In [2]:
dataset = pd.read_csv("./data/bank_dataset (3) (1) (1) (3) (1).csv")

### Preprocess the data
- Check for NAs or missing values;
- Separate features from target variable;
- Determine numeric and non-numeric features;
- One-hot encode non-numeric features;
- Turn target variable into 0/1 (useful for some algorithms).

In [3]:
dataset.isnull().any().any()

False

In [4]:
X = dataset.loc[:, dataset.columns != 'target']
y = dataset.loc[:, 'target']

In [5]:
numeric_features = X.select_dtypes(include=['int64', 'float64']).columns.tolist()
non_numeric_features = X.select_dtypes(exclude=['int64', 'float64']).columns.tolist()
print(f"There are {len(numeric_features)} numeric features out of {len(X.columns)}.")

There are 10 numeric features out of 20.


In [6]:
X = pd.get_dummies(X, drop_first=True)

In [7]:
y = y.replace("yes", 1)
y = y.replace("no", 0)

### Split the data intro Train and Test sets

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=random_seed)

## 2. Machine learning model training

### Train Model

Classical model choices for binary classification problems would be, for instance: Logistic Regression, Random Forests and Neural Networks.
- Given the number of categorical variables and possibility of complex relationships between the features (e.g. multicollinearity), we avoid logistic regression.
- We avoid Neural Networks given the limited amount of data vs number of features.

We choose to use **Random Forests** given their predictive power, ability to handle categorical data and ease of use (e.g., via the scikit-learn library).

In [None]:
classifier = RandomForestClassifier(random_state=random_seed)
classifier.fit(X_train,y_train)

We re-calibrate the classifier using isotonic regression.

In [None]:
calibrated_classifier = CalibratedClassifierCV(classifier, cv=20, method='isotonic')
calibrated_classifier.fit(X_train, y_train)

### Determine feature importance

Using permutation importance.

In [11]:
r = permutation_importance(
    calibrated_classifier,
    X_test,
    y_test,
    random_state=random_seed,
    scoring="neg_brier_score",
)
for i in r.importances_mean.argsort()[::-1]:
    print(X_train.columns[i])
    print(f"{r.importances_mean[i]:.4f}")
    print(f" +/- {r.importances_std[i]:.4f}")

last_contact_duration
0.0395
 +/- 0.0007
euri_3_month
0.0053
 +/- 0.0003
emp_var_rate
0.0045
 +/- 0.0002
nb_employees
0.0039
 +/- 0.0002
N_last_days
0.0010
 +/- 0.0000
month_oct
0.0010
 +/- 0.0000
contact_mode_telephone
0.0009
 +/- 0.0001
cons_conf_index
0.0009
 +/- 0.0001
cons_price_index
0.0007
 +/- 0.0001
previous_outcome_success
0.0007
 +/- 0.0000
month_may
0.0006
 +/- 0.0001
contacts_per_campaign
0.0005
 +/- 0.0001
month_mar
0.0002
 +/- 0.0000
age
0.0002
 +/- 0.0003
has_credit_unknown
0.0001
 +/- 0.0001
occupation_self-employed
0.0001
 +/- 0.0000
education_university.degree
0.0001
 +/- 0.0001
week_day_thu
0.0001
 +/- 0.0001
week_day_tue
0.0001
 +/- 0.0001
week_day_wed
0.0001
 +/- 0.0001
week_day_mon
0.0001
 +/- 0.0001
occupation_technician
0.0000
 +/- 0.0001
month_nov
0.0000
 +/- 0.0000
previous_outcome_nonexistent
0.0000
 +/- 0.0001
occupation_housemaid
0.0000
 +/- 0.0000
occupation_blue-collar
0.0000
 +/- 0.0001
marital_status_unknown
0.0000
 +/- 0.0000
personal_loan_unknown
0.0

Most important features seem to be _last_contact_duration_ , _euri_3_month_ and _emp_var_rate_ .

## 3. Model’s performance evaluation

"Proper" evaluation of model's performance would require information regarding model usage. In particular, information related to the "cost" of targeting a customer with a marketing campaign and the "benefit" of having a customer buying a financial product.

Because this is not available, we evaluate how correctly calibrated is our model, and we use a _strictly proper scoring rule for this_: **Brier Score**.

We compute the score for the entire test set and for the positive and negative classes.

In [12]:
y_pred_prob = calibrated_classifier.predict_proba(X_test)
brier_score = brier_score_loss(y_test, y_pred_prob[:,0])
brier_score_positive_class = brier_score_loss(y_test.iloc[np.where(y_test == 1)], y_pred_prob[:,0][np.where(y_test == 1)])
brier_score_negative_class = brier_score_loss(y_test.iloc[np.where(y_test == 0)], y_pred_prob[:,0][np.where(y_test == 0)])
print("Brier score:", brier_score)
print("Brier score positive class:", brier_score_positive_class)
print("Brier score negative class:", brier_score_negative_class)

Brier score: 0.8298187150168627
Brier score positive class: 0.30180480585643504
Brier score negative class: 0.8971792279572809


We also show here the precision, recall and F1 score in case it is relevant for discussion.

In [13]:
precision = precision_score(calibrated_classifier.predict(X_test), y_test)
recall = recall_score(calibrated_classifier.predict(X_test), y_test) 
F1_score = f1_score(calibrated_classifier.predict(X_test), y_test)
print("Precision:", precision)
print("Recall:", recall)
print("F1:", F1_score)

Precision: 0.5133047210300429
Recall: 0.6542669584245077
F1: 0.5752765752765753


## 4. Release model into production

In [14]:
# Export the model to ONNX format
initial_type = [('float_input', FloatTensorType([None, X.shape[1]]))]
onnx_model = convert_sklearn(calibrated_classifier, initial_types=initial_type)

# Save the model to a file
with open('calibrated_classifier.onnx', 'wb') as f:
    f.write(onnx_model.SerializeToString())

# Code to zip the file
zip_filename = 'calibrated_classifier.zip'
with zipfile.ZipFile(zip_filename, 'w') as zipf:
    zipf.write('calibrated_classifier.onnx', compress_type=zipfile.ZIP_DEFLATED)

os.remove('calibrated_classifier.onnx')