In [2]:
import os
import glob
import pandas as pd
import pickle
import matplotlib.pyplot as plt
import numpy as np
import random
from datetime import date, datetime, timedelta
from dateutil.relativedelta import relativedelta
import pprint
import pyspark
import pyspark.sql.functions as F

from pyspark.sql.functions import col, to_date, lit
from pyspark.sql.types import StringType, IntegerType, FloatType, DateType

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

import xgboost as xgb
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import make_scorer, f1_score, roc_auc_score
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split


In [3]:
# Build a .py script that takes a snapshot date, trains a model and outputs artefact into storage.

## set up pyspark session

In [4]:
# Initialize SparkSession
spark = pyspark.sql.SparkSession.builder \
    .appName("dev") \
    .master("local[*]") \
    .getOrCreate()

# Set log level to ERROR to hide warnings
spark.sparkContext.setLogLevel("ERROR")

Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/11/09 12:38:03 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


## set up config

In [5]:
# set up config
model_train_date_str = "2024-12-01"
train_test_period_months = 12
oot_period_months = 2
train_test_ratio = 0.8

config = {}
config["model_train_date_str"] = model_train_date_str
config["train_test_period_months"] = train_test_period_months
config["oot_period_months"] = oot_period_months
config["train_test_ratio"] = train_test_ratio

# Standardize all dates to datetime.date objects for consistency
config["model_train_date"] = datetime.strptime(model_train_date_str, "%Y-%m-%d").date()

# For monthly data (1st of each month), use month boundaries instead of day subtraction
# Get the first day of the month for OOT period
config["oot_end_date"] = (config['model_train_date'].replace(day=1) - timedelta(days=1)).replace(day=1)
config["oot_start_date"] = (config["oot_end_date"] - relativedelta(months=oot_period_months - 1)).replace(day=1)

# Training period - ensure we use 1st of months
config["train_test_end_date"] = (config["oot_start_date"] - timedelta(days=1)).replace(day=1)
config["train_test_start_date"] = (config["train_test_end_date"] - relativedelta(months=train_test_period_months - 1)).replace(day=1)

# Convert all dates to string format for Spark SQL operations
config["model_train_date_str"] = config["model_train_date"].strftime("%Y-%m-%d")
config["oot_end_date_str"] = config["oot_end_date"].strftime("%Y-%m-%d")
config["oot_start_date_str"] = config["oot_start_date"].strftime("%Y-%m-%d")
config["train_test_end_date_str"] = config["train_test_end_date"].strftime("%Y-%m-%d")
config["train_test_start_date_str"] = config["train_test_start_date"].strftime("%Y-%m-%d")

print("=== MONTHLY DATA CONFIG ===")
pprint.pprint(config)

# Validate the dates make sense for monthly data
print("\n=== DATE VALIDATION ===")
print(f"OOT Period: {config['oot_start_date_str']} to {config['oot_end_date_str']} ({oot_period_months} months)")
print(f"Train-Test Period: {config['train_test_start_date_str']} to {config['train_test_end_date_str']} ({train_test_period_months} months)")

=== MONTHLY DATA CONFIG ===
{'model_train_date': datetime.date(2024, 12, 1),
 'model_train_date_str': '2024-12-01',
 'oot_end_date': datetime.date(2024, 11, 1),
 'oot_end_date_str': '2024-11-01',
 'oot_period_months': 2,
 'oot_start_date': datetime.date(2024, 10, 1),
 'oot_start_date_str': '2024-10-01',
 'train_test_end_date': datetime.date(2024, 9, 1),
 'train_test_end_date_str': '2024-09-01',
 'train_test_period_months': 12,
 'train_test_ratio': 0.8,
 'train_test_start_date': datetime.date(2023, 10, 1),
 'train_test_start_date_str': '2023-10-01'}

=== DATE VALIDATION ===
OOT Period: 2024-10-01 to 2024-11-01 (2 months)
Train-Test Period: 2023-10-01 to 2024-09-01 (12 months)


## get label store

In [15]:
# connect to label store
folder_path = "/app/datamart/gold/label_store/"
files_list = [folder_path+os.path.basename(f) for f in glob.glob(os.path.join(folder_path, '*'))]
label_store_sdf = spark.read.option("header", "true").parquet(*files_list)

# Ensure snapshot_date is in proper DateType
label_store_sdf = label_store_sdf.withColumn("snapshot_date", to_date(col("snapshot_date"), "yyyy-MM-dd"))

print("row_count:", label_store_sdf.count())
label_store_sdf.show()

                                                                                

row_count: 39868
+--------------------+-----------+-----+----------------+-------------------+-------------+---+---+
|             loan_id|Customer_ID|label|label_definition|label_snapshot_date|snapshot_date|mob|dpd|
+--------------------+-----------+-----+----------------+-------------------+-------------+---+---+
|CUS_0x100b_2024_0...| CUS_0x100b|    0|  DPD_30+_MOB_6+|         2024-10-01|   2024-10-01|  7|  0|
|CUS_0x1013_2023_1...| CUS_0x1013|    0|  DPD_30+_MOB_6+|         2024-10-01|   2024-10-01| 10|  0|
|CUS_0x102d_2024_0...| CUS_0x102d|    0|  DPD_30+_MOB_6+|         2024-10-01|   2024-10-01|  9|  0|
|CUS_0x102e_2024_0...| CUS_0x102e|    1|  DPD_30+_MOB_6+|         2024-10-01|   2024-10-01|  6|120|
|CUS_0x1048_2024_0...| CUS_0x1048|    1|  DPD_30+_MOB_6+|         2024-10-01|   2024-10-01|  8|210|
|CUS_0x104a_2023_1...| CUS_0x104a|    0|  DPD_30+_MOB_6+|         2024-10-01|   2024-10-01| 10|  0|
|CUS_0x1051_2024_0...| CUS_0x1051|    0|  DPD_30+_MOB_6+|         2024-10-01|   202

In [16]:
# extract label store using string dates for Spark SQL
labels_sdf = label_store_sdf.filter(
    (col("snapshot_date") >= to_date(lit(config["train_test_start_date_str"]))) & 
    (col("snapshot_date") <= to_date(lit(config["oot_end_date_str"])))
)

print("extracted labels_sdf", labels_sdf.count(), config["train_test_start_date_str"], config["oot_end_date_str"])

[Stage 10:>                                                         (0 + 9) / 9]

extracted labels_sdf 34239 2023-10-01 2024-11-01


                                                                                

## get features

In [42]:
# connect to feature store
folder_path = "/app/datamart/gold/feature_store/"
files_list = [folder_path+os.path.basename(f) for f in glob.glob(os.path.join(folder_path, '*'))]
feature_sdf = spark.read.option("header", "true").parquet(*files_list)

# Ensure snapshot_date is in proper DateType
feature_sdf = feature_sdf.withColumn("snapshot_date", to_date(col("snapshot_date"), "yyyy-MM-dd"))

print("row_count:", feature_sdf.count())
feature_sdf.show()

                                                                                

row_count: 104288
+-----------+-------------+---------------------+---+----------+-------------------+----------------+-----------------------+-------------+-----------------+---------------+-------------------+-----------------------+----------------------+-------------------+-----------------------+-----------------------------+--------------------------+---------------------------------+-----------------------+-----------------------+---------------------+------------+-------------+------------+----------------+---------------+--------------------+---------------------+--------------------+----------------+-----------------+--------+-------+----------+------------------------+---------------------+--------------------+--------------------+---------------------+--------------------+---------------------+--------------------+----------------------+---------------------+
|Customer_ID|snapshot_date|feature_snapshot_date|Age|Occupation|Delay_from_due_date|Outstanding_Debt|Amount_invested

In [21]:
# extract feature store using string dates
feature_sdf = feature_sdf.filter(
    (col("snapshot_date") >= to_date(lit(config["train_test_start_date_str"]))) & 
    (col("snapshot_date") <= to_date(lit(config["oot_end_date_str"])))
)

print("extracted feature_sdf", feature_sdf.count(), config["train_test_start_date_str"], config["oot_end_date_str"])
feature_sdf.show()

                                                                                

extracted feature_sdf 75875 2023-10-01 2024-11-01
+-----------+-------------+---------------------+---+----------+-------------------+----------------+-----------------------+-------------+-----------------+---------------+-------------------+-----------------------+----------------------+-------------------+-----------------------+-----------------------------+--------------------------+---------------------------------+-----------------------+-----------------------+---------------------+------------+-------------+------------+----------------+---------------+--------------------+---------------------+--------------------+----------------+-----------------+--------+-------+----------+------------------------+---------------------+--------------------+--------------------+---------------------+--------------------+---------------------+--------------------+----------------------+---------------------+
|Customer_ID|snapshot_date|feature_snapshot_date|Age|Occupation|Delay_from_due_date|

## prepare data for modeling

In [43]:
# prepare data for modeling
data_sdf = labels_sdf.join(feature_sdf, on=["Customer_ID", "snapshot_date"], how="inner")
data_pdf = data_sdf.toPandas()
data_pdf

                                                                                

Unnamed: 0,Customer_ID,snapshot_date,loan_id,label,label_definition,label_snapshot_date,mob,dpd,feature_snapshot_date,Age,...,clickstream_total_events,clickstream_fe_5_mean,clickstream_fe_5_sum,clickstream_fe_5_std,clickstream_fe_9_mean,clickstream_fe_9_min,clickstream_fe_4_mean,clickstream_fe_4_min,clickstream_fe_10_mean,clickstream_fe_10_min
0,CUS_0x1011,2024-08-01,CUS_0x1011_2023_11_01,0,DPD_30+_MOB_6+,2024-08-01,9,0,2024-08-01,0,...,1,44.0,44,0.0,263.0,263,97.0,97,27.0,27
1,CUS_0x1013,2024-08-01,CUS_0x1013_2023_12_01,0,DPD_30+_MOB_6+,2024-08-01,8,0,2024-08-01,0,...,1,224.0,224,0.0,235.0,235,-55.0,-55,117.0,117
2,CUS_0x1018,2024-08-01,CUS_0x1018_2023_11_01,1,DPD_30+_MOB_6+,2024-08-01,9,180,2024-08-01,0,...,1,318.0,318,0.0,117.0,117,190.0,190,-43.0,-43
3,CUS_0x1026,2024-08-01,CUS_0x1026_2023_10_01,0,DPD_30+_MOB_6+,2024-08-01,10,0,2024-08-01,0,...,1,102.0,102,0.0,226.0,226,113.0,113,175.0,175
4,CUS_0x102d,2024-08-01,CUS_0x102d_2024_01_01,0,DPD_30+_MOB_6+,2024-08-01,7,0,2024-08-01,0,...,1,117.0,117,0.0,82.0,82,-91.0,-91,125.0,125
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34234,CUS_0xfaf,2023-10-01,CUS_0xfaf_2023_04_01,0,DPD_30+_MOB_6+,2023-10-01,6,0,2023-10-01,0,...,1,94.0,94,0.0,325.0,325,15.0,15,333.0,333
34235,CUS_0xfb6,2023-10-01,CUS_0xfb6_2023_04_01,0,DPD_30+_MOB_6+,2023-10-01,6,0,2023-10-01,0,...,1,-79.0,-79,0.0,59.0,59,74.0,74,238.0,238
34236,CUS_0xfc9,2023-10-01,CUS_0xfc9_2023_01_01,1,DPD_30+_MOB_6+,2023-10-01,9,240,2023-10-01,0,...,1,44.0,44,0.0,91.0,91,124.0,124,67.0,67
34237,CUS_0xfcb,2023-10-01,CUS_0xfcb_2023_04_01,0,DPD_30+_MOB_6+,2023-10-01,6,0,2023-10-01,0,...,1,180.0,180,0.0,52.0,52,159.0,159,-74.0,-74


In [23]:
# split data into train - test - oot using Spark DataFrame
oot_sdf = data_sdf.filter(
    (col("snapshot_date") >= to_date(lit(config["oot_start_date_str"]))) & 
    (col("snapshot_date") <= to_date(lit(config["oot_end_date_str"])))
)

train_test_sdf = data_sdf.filter(
    (col("snapshot_date") >= to_date(lit(config["train_test_start_date_str"]))) & 
    (col("snapshot_date") <= to_date(lit(config["train_test_end_date_str"])))
)

# Convert to pandas for modeling
oot_pdf = oot_sdf.toPandas()
train_test_pdf = train_test_sdf.toPandas()

# Define what columns are NOT features
non_feature_cols = [
    'Customer_ID', 'snapshot_date', 'label', 'loan_id', 
    'label_definition', 'label_snapshot_date', 'feature_snapshot_date', 'Occupation',
    'mob', 'dpd', 'dpd_mean', 'dpd_max', 'Loan_overdue_amt_sum', 'Loan_overdue_amt_mean',
    'Loan_overdue_amt_max', 'Loan_amt_sum', 'Loan_amt_mean', 'Loan_amt_std',
    'Loan_balance_sum', 'Loan_balance_mean', 'loan_count', 'Delay_from_due_date',
    'clickstream_total_events',
    'Age'
]

# All other columns are features 
feature_cols = [column for column in train_test_pdf.columns if column not in non_feature_cols]

X_oot = oot_pdf[feature_cols]
y_oot = oot_pdf["label"]
X_train, X_test, y_train, y_test = train_test_split(
    train_test_pdf[feature_cols], train_test_pdf["label"], 
    test_size= 1 - config["train_test_ratio"],
    random_state=88,
    shuffle=True,
    stratify=train_test_pdf["label"]
)

print('X_train', X_train.shape[0])
print('X_test', X_test.shape[0])
print('X_oot', X_oot.shape[0])
print('y_train', y_train.shape[0], round(y_train.mean(),2))
print('y_test', y_test.shape[0], round(y_test.mean(),2))
print('y_oot', y_oot.shape[0], round(y_oot.mean(),2))

X_train.head()

                                                                                

X_train 23364
X_test 5841
X_oot 5034
y_train 23364 0.28
y_test 5841 0.28
y_oot 5034 0.3


Unnamed: 0,Outstanding_Debt,Amount_invested_monthly,Interest_Rate,Num_Bank_Accounts,Num_Credit_Card,Loan_Type_Home_Loan,Loan_Type_Personal_Loan,Loan_Type_Student_Loan,Loan_Type_Auto_Loan,Loan_Type_Business_Loan,...,Loan_tenure_max,clickstream_fe_5_mean,clickstream_fe_5_sum,clickstream_fe_5_std,clickstream_fe_9_mean,clickstream_fe_9_min,clickstream_fe_4_mean,clickstream_fe_4_min,clickstream_fe_10_mean,clickstream_fe_10_min
18845,0.0,0.0,0.0,0,0,0,0,0,0,0,...,10,9.0,9,0.0,231.0,231,228.0,228,76.0,76
18314,0.0,0.0,0.0,0,0,0,0,0,0,0,...,10,180.0,180,0.0,152.0,152,100.0,100,18.0,18
7053,0.0,0.0,0.0,0,0,0,0,0,0,0,...,10,-98.0,-98,0.0,181.0,181,236.0,236,303.0,303
22602,0.0,0.0,0.0,0,0,0,0,0,0,0,...,10,-114.0,-114,0.0,239.0,239,129.0,129,-89.0,-89
16876,0.0,0.0,0.0,0,0,0,0,0,0,0,...,10,179.0,179,0.0,137.0,137,179.0,179,69.0,69


In [24]:
oot_sdf.show()

                                                                                

+-----------+-------------+--------------------+-----+----------------+-------------------+---+---+---------------------+---+----------+-------------------+----------------+-----------------------+-------------+-----------------+---------------+-------------------+-----------------------+----------------------+-------------------+-----------------------+-----------------------------+--------------------------+---------------------------------+-----------------------+-----------------------+---------------------+------------+-------------+------------+----------------+---------------+--------------------+---------------------+--------------------+----------------+-----------------+--------+-------+----------+------------------------+---------------------+--------------------+--------------------+---------------------+--------------------+---------------------+--------------------+----------------------+---------------------+
|Customer_ID|snapshot_date|             loan_id|label|label_de

In [25]:
# Check feature columns and first few rows
print("Feature columns in X_train:")
print(X_train.columns.tolist())
print(f"\nNumber of features: {len(X_train.columns)}")

# Display the first few rows with all columns
print("\nX_train head (all columns):")
X_train.head()

Feature columns in X_train:
['Outstanding_Debt', 'Amount_invested_monthly', 'Interest_Rate', 'Num_Bank_Accounts', 'Num_Credit_Card', 'Loan_Type_Home_Loan', 'Loan_Type_Personal_Loan', 'Loan_Type_Student_Loan', 'Loan_Type_Auto_Loan', 'Loan_Type_Business_Loan', 'Loan_Type_Credit-Builder_Loan', 'Loan_Type_Home_Equity_Loan', 'Loan_Type_Debt_Consolidation_Loan', 'Loan_Type_Mortgage_Loan', 'Loan_Type_Not_Specified', 'Loan_Type_Payday_Loan', 'Loan_tenure_mean', 'Loan_tenure_max', 'clickstream_fe_5_mean', 'clickstream_fe_5_sum', 'clickstream_fe_5_std', 'clickstream_fe_9_mean', 'clickstream_fe_9_min', 'clickstream_fe_4_mean', 'clickstream_fe_4_min', 'clickstream_fe_10_mean', 'clickstream_fe_10_min']

Number of features: 27

X_train head (all columns):


Unnamed: 0,Outstanding_Debt,Amount_invested_monthly,Interest_Rate,Num_Bank_Accounts,Num_Credit_Card,Loan_Type_Home_Loan,Loan_Type_Personal_Loan,Loan_Type_Student_Loan,Loan_Type_Auto_Loan,Loan_Type_Business_Loan,...,Loan_tenure_max,clickstream_fe_5_mean,clickstream_fe_5_sum,clickstream_fe_5_std,clickstream_fe_9_mean,clickstream_fe_9_min,clickstream_fe_4_mean,clickstream_fe_4_min,clickstream_fe_10_mean,clickstream_fe_10_min
18845,0.0,0.0,0.0,0,0,0,0,0,0,0,...,10,9.0,9,0.0,231.0,231,228.0,228,76.0,76
18314,0.0,0.0,0.0,0,0,0,0,0,0,0,...,10,180.0,180,0.0,152.0,152,100.0,100,18.0,18
7053,0.0,0.0,0.0,0,0,0,0,0,0,0,...,10,-98.0,-98,0.0,181.0,181,236.0,236,303.0,303
22602,0.0,0.0,0.0,0,0,0,0,0,0,0,...,10,-114.0,-114,0.0,239.0,239,129.0,129,-89.0,-89
16876,0.0,0.0,0.0,0,0,0,0,0,0,0,...,10,179.0,179,0.0,137.0,137,179.0,179,69.0,69


In [63]:
train_test_sdf.show()

                                                                                

+-----------+-------------+--------------------+-----+----------------+-------------------+---+---+---------------------+---+----------+-------------------+----------------+-----------------------+-------------+-----------------+---------------+-------------------+-----------------------+----------------------+-------------------+-----------------------+-----------------------------+--------------------------+---------------------------------+-----------------------+-----------------------+---------------------+------------+-------------+------------+----------------+---------------+--------------------+---------------------+--------------------+----------------+-----------------+--------+-------+----------+------------------------+---------------------+--------------------+--------------------+---------------------+--------------------+---------------------+--------------------+----------------------+---------------------+
|Customer_ID|snapshot_date|             loan_id|label|label_de

                                                                                

## preprocess data

In [27]:
scaler = StandardScaler() 
X_train_processed = scaler.fit_transform(X_train.fillna(0))
X_test_processed = scaler.transform(X_test.fillna(0))
X_oot_processed = scaler.transform(X_oot.fillna(0))

print('X_train_processed', X_train_processed.shape[0])
print('X_test_processed', X_test_processed.shape[0])
print('X_oot_processed', X_oot_processed.shape[0])

pd.DataFrame(X_train_processed)

X_train_processed 23364
X_test_processed 5841
X_oot_processed 5034


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,17,18,19,20,21,22,23,24,25,26
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,-0.970396,-0.970396,0.0,1.163361,1.163361,1.223867,1.223867,-0.426017,-0.426017
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.732326,0.732326,0.0,0.373515,0.373515,-0.056326,-0.056326,-1.005656,-1.005656
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,-2.035843,-2.035843,0.0,0.663459,0.663459,1.303879,1.303879,1.842571,1.842571
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,-2.195162,-2.195162,0.0,1.243345,1.243345,0.233718,0.233718,-2.074990,-2.074990
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.722368,0.722368,0.0,0.223544,0.223544,0.733793,0.733793,-0.495973,-0.495973
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23359,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.240155,1.240155,0.0,0.243540,0.243540,-0.356371,-0.356371,0.173610,0.173610
23360,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.469176,1.469176,0.0,0.173554,0.173554,-0.896452,-0.896452,-1.665245,-1.665245
23361,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,-0.840950,-0.840950,0.0,0.243540,0.243540,1.353886,1.353886,0.813212,0.813212
23362,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.244411,0.244411,0.0,1.343326,1.343326,0.913820,0.913820,-0.046253,-0.046253


## train model

In [28]:
# Define the XGBoost classifier
xgb_clf = xgb.XGBClassifier(eval_metric='logloss', random_state=88)

# Define the hyperparameter space to search
param_dist = {
    'n_estimators': [25, 50],
    'max_depth': [2, 3],  # lower max_depth to simplify the model
    'learning_rate': [0.01, 0.1],
    'subsample': [0.6, 0.8],
    'colsample_bytree': [0.6, 0.8],
    'gamma': [0, 0.1],
    'min_child_weight': [1, 3, 5],
    'reg_alpha': [0, 0.1, 1],
    'reg_lambda': [1, 1.5, 2]
}

# Create a scorer based on AUC score
auc_scorer = make_scorer(roc_auc_score)

# Set up the random search with cross-validation
random_search = RandomizedSearchCV(
    estimator=xgb_clf,
    param_distributions=param_dist,
    scoring=auc_scorer,
    n_iter=100,  # Number of iterations for random search
    cv=3,       # Number of folds in cross-validation
    verbose=1,
    random_state=42,
    n_jobs=-1   # Use all available cores
)

# Perform the random search
random_search.fit(X_train_processed, y_train)

# Output the best parameters and best score
print("Best parameters found: ", random_search.best_params_)
print("Best AUC score: ", random_search.best_score_)

# Evaluate the model on the train set
best_model = random_search.best_estimator_
y_pred_proba = best_model.predict_proba(X_train_processed)[:, 1]
train_auc_score = roc_auc_score(y_train, y_pred_proba)
print("Train AUC score: ", train_auc_score)

# Evaluate the model on the test set
best_model = random_search.best_estimator_
y_pred_proba = best_model.predict_proba(X_test_processed)[:, 1]
test_auc_score = roc_auc_score(y_test, y_pred_proba)
print("Test AUC score: ", test_auc_score)

# Evaluate the model on the oot set
best_model = random_search.best_estimator_
y_pred_proba = best_model.predict_proba(X_oot_processed)[:, 1]
oot_auc_score = roc_auc_score(y_oot, y_pred_proba)
print("OOT AUC score: ", oot_auc_score)

print("TRAIN GINI score: ", round(2*train_auc_score-1,3))
print("Test GINI score: ", round(2*test_auc_score-1,3))
print("OOT GINI score: ", round(2*oot_auc_score-1,3))

Fitting 3 folds for each of 100 candidates, totalling 300 fits
Best parameters found:  {'subsample': 0.6, 'reg_lambda': 1.5, 'reg_alpha': 0, 'n_estimators': 50, 'min_child_weight': 3, 'max_depth': 3, 'learning_rate': 0.1, 'gamma': 0, 'colsample_bytree': 0.8}
Best AUC score:  0.5095300383331
Train AUC score:  0.6390395124562616
Test AUC score:  0.6225546987056476
OOT AUC score:  0.6155376259362753
TRAIN GINI score:  0.278
Test GINI score:  0.245
OOT GINI score:  0.231


## prepare model artefact to save

In [31]:
model_artefact = {}

model_artefact['model'] = best_model
model_artefact['model_version'] = "credit_model_"+config["model_train_date_str"].replace('-','_')
model_artefact['preprocessing_transformers'] = {}
model_artefact['preprocessing_transformers']['stdscaler'] = scaler
model_artefact["feature_names"] = X_train.columns.tolist()
model_artefact['data_dates'] = config
model_artefact['data_stats'] = {}
model_artefact['data_stats']['X_train'] = X_train.shape[0]
model_artefact['data_stats']['X_test'] = X_test.shape[0]
model_artefact['data_stats']['X_oot'] = X_oot.shape[0]
model_artefact['data_stats']['y_train'] = round(y_train.mean(),2)
model_artefact['data_stats']['y_test'] = round(y_test.mean(),2)
model_artefact['data_stats']['y_oot'] = round(y_oot.mean(),2)
model_artefact['results'] = {}
model_artefact['results']['auc_train'] = train_auc_score
model_artefact['results']['auc_test'] = test_auc_score
model_artefact['results']['auc_oot'] = oot_auc_score
model_artefact['results']['gini_train'] = round(2*train_auc_score-1,3)
model_artefact['results']['gini_test'] = round(2*test_auc_score-1,3)
model_artefact['results']['gini_oot'] = round(2*oot_auc_score-1,3)
model_artefact['hp_params'] = random_search.best_params_


pprint.pprint(model_artefact)

{'data_dates': {'model_train_date': datetime.date(2024, 12, 1),
                'model_train_date_str': '2024-12-01',
                'oot_end_date': datetime.date(2024, 11, 1),
                'oot_end_date_str': '2024-11-01',
                'oot_period_months': 2,
                'oot_start_date': datetime.date(2024, 10, 1),
                'oot_start_date_str': '2024-10-01',
                'train_test_end_date': datetime.date(2024, 9, 1),
                'train_test_end_date_str': '2024-09-01',
                'train_test_period_months': 12,
                'train_test_ratio': 0.8,
                'train_test_start_date': datetime.date(2023, 10, 1),
                'train_test_start_date_str': '2023-10-01'},
 'data_stats': {'X_oot': 5034,
                'X_test': 5841,
                'X_train': 23364,
                'y_oot': np.float64(0.3),
                'y_test': np.float64(0.28),
                'y_train': np.float64(0.28)},
 'feature_names': ['Outstanding_Debt',
         

## save artefact to model bank

In [39]:
# create model_bank dir
model_bank_directory = "/app/model_bank/"

if not os.path.exists(model_bank_directory):
    os.makedirs(model_bank_directory)

In [40]:
# Full path to the file
file_path = os.path.join(model_bank_directory, model_artefact['model_version'] + '.pkl')

# Write the model to a pickle file
with open(file_path, 'wb') as file:
    pickle.dump(model_artefact, file)

print(f"Model saved to {file_path}")


Model saved to /app/model_bank/credit_model_2024_12_01.pkl


## test load pickle and make model inference

In [41]:
# Load the model from the pickle file
with open(file_path, 'rb') as file:
    loaded_model_artefact = pickle.load(file)

y_pred_proba = loaded_model_artefact['model'].predict_proba(X_oot_processed)[:, 1]
oot_auc_score = roc_auc_score(y_oot, y_pred_proba)
print("OOT AUC score: ", oot_auc_score)

print("Model loaded successfully!")

OOT AUC score:  0.6155376259362753
Model loaded successfully!
