In [1]:
import os
import glob
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import random
from datetime import datetime, timedelta
from dateutil.relativedelta import relativedelta
import pprint
import pyspark
import pyspark.sql.functions as F
from pyspark.sql.functions import col
from pyspark.sql.types import StringType, IntegerType, FloatType, DateType

from pyspark.sql import functions as F
from pyspark.sql import types as T
from pyspark.ml.feature import Imputer, StringIndexer
from pyspark.ml import Pipeline
from pyspark.sql import functions as F
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler, StandardScaler
from pyspark.ml import Pipeline

In [2]:
# Load Gold Table
# Initialize SparkSession
spark = pyspark.sql.SparkSession.builder \
    .appName("dev") \
    .master("local[*]") \
    .getOrCreate()

#Pyspark remove warnings
spark.sparkContext.setLogLevel("ERROR")

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/11/09 06:04:51 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
# set up config
model_train_date_str = "2024-09-01"
train_test_period_months = 12
oot_period_months = 2
train_test_ratio = 0.8

config = {}
config["model_train_date_str"] = model_train_date_str
config["train_test_period_months"] = train_test_period_months
config["oot_period_months"] =  oot_period_months
config["model_train_date"] =  datetime.strptime(model_train_date_str, "%Y-%m-%d")
config["oot_end_date"] =  config['model_train_date'] - timedelta(days = 1)
config["oot_start_date"] =  config['model_train_date'] - relativedelta(months = oot_period_months)
config["train_test_end_date"] =  config["oot_start_date"] - timedelta(days = 1)
config["train_test_start_date"] =  config["oot_start_date"] - relativedelta(months = train_test_period_months)
config["train_test_ratio"] = train_test_ratio

pprint.pprint(config)

{'model_train_date': datetime.datetime(2024, 9, 1, 0, 0),
 'model_train_date_str': '2024-09-01',
 'oot_end_date': datetime.datetime(2024, 8, 31, 0, 0),
 'oot_period_months': 2,
 'oot_start_date': datetime.datetime(2024, 7, 1, 0, 0),
 'train_test_end_date': datetime.datetime(2024, 6, 30, 0, 0),
 'train_test_period_months': 12,
 'train_test_ratio': 0.8,
 'train_test_start_date': datetime.datetime(2023, 7, 1, 0, 0)}


# Load Labels

In [4]:
gold_label_directory = "/app/datamart/gold/label_store/"

# Read all CSV files into a single DataFrame
files_list = [gold_label_directory+os.path.basename(f) for f in glob.glob(os.path.join(gold_label_directory, '*'))]
df_labels = spark.read.option("header", "true").parquet(*files_list)

# extract label store
df_labels = df_labels.filter((col("snapshot_date") >= config["train_test_start_date"]) & (col("snapshot_date") <= config["oot_end_date"]))

print("extracted df_labels", df_labels.count(), config["train_test_start_date"], config["oot_end_date"])



extracted df_labels 6443 2023-07-01 00:00:00 2024-08-31 00:00:00


                                                                                

In [5]:
df_labels.show()

df_labels.count()

+--------------------+-----------+-----+----------+-------------+
|             loan_id|Customer_ID|label| label_def|snapshot_date|
+--------------------+-----------+-----+----------+-------------+
|CUS_0x1037_2023_0...| CUS_0x1037|    0|90dpd_7mob|   2023-08-01|
|CUS_0x1069_2023_0...| CUS_0x1069|    0|90dpd_7mob|   2023-08-01|
|CUS_0x114a_2023_0...| CUS_0x114a|    0|90dpd_7mob|   2023-08-01|
|CUS_0x1184_2023_0...| CUS_0x1184|    0|90dpd_7mob|   2023-08-01|
|CUS_0x1297_2023_0...| CUS_0x1297|    1|90dpd_7mob|   2023-08-01|
|CUS_0x12fb_2023_0...| CUS_0x12fb|    0|90dpd_7mob|   2023-08-01|
|CUS_0x1325_2023_0...| CUS_0x1325|    0|90dpd_7mob|   2023-08-01|
|CUS_0x1341_2023_0...| CUS_0x1341|    0|90dpd_7mob|   2023-08-01|
|CUS_0x1375_2023_0...| CUS_0x1375|    1|90dpd_7mob|   2023-08-01|
|CUS_0x13a8_2023_0...| CUS_0x13a8|    0|90dpd_7mob|   2023-08-01|
|CUS_0x13ef_2023_0...| CUS_0x13ef|    0|90dpd_7mob|   2023-08-01|
|CUS_0x1440_2023_0...| CUS_0x1440|    0|90dpd_7mob|   2023-08-01|
|CUS_0x144

6443

# Load Features

In [6]:
gold_feature_directory = "/app/datamart/gold/feature_store/"

# Read all CSV files into a single DataFrame
files_list = [gold_feature_directory+os.path.basename(f) for f in glob.glob(os.path.join(gold_feature_directory, '*'))]
df_features = spark.read.option("header", "true").parquet(*files_list)

df_features = df_features.filter((col("snapshot_date") >= config["train_test_start_date"]) & (col("snapshot_date") <= config["oot_end_date"]))

print("extracted df_features", df_features.count(), config["train_test_start_date"], config["oot_end_date"])

extracted df_features 125636 2023-07-01 00:00:00 2024-08-31 00:00:00


In [7]:
df_features.show()

+-----------+-------------+---------------------+-----------------+---------------+-------------+-----------+-------------------+----------------------+--------------------+--------------------+----------+------------------------+--------------------+------------------+-------------+----+----+----+----+----+----+----+----+----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-------------+
|Customer_ID|Annual_Income|Monthly_Inhand_Salary|Num_Bank_Accounts|Num_Credit_Card|Interest_Rate|Num_of_Loan|Delay_from_due_date|Num_of_Delayed_Payment|Changed_Credit_Limit|Num_Credit_Inquiries|Credit_Mix|Credit_Utilization_Ratio|   Payment_Behaviour|Credit_History_Age|   Occupation|fe_1|fe_2|fe_3|fe_4|fe_5|fe_6|fe_7|fe_8|fe_9|fe_10|fe_11|fe_12|fe_13|fe_14|fe_15|fe_16|fe_17|fe_18|fe_19|fe_20|snapshot_date|
+-----------+-------------+---------------------+-----------------+---------------+-------------+-----------+-------------------+----------------------+--------------------+-------

# Clean Features

In [8]:
num_null_ids = df_features.filter(F.col("Customer_ID").isNull()).count()
total_rows = df_features.count()
print(f"Customer_ID nulls: {num_null_ids} / {total_rows}")

Customer_ID nulls: 0 / 125636


In [9]:
data_pdf = df_labels.join(df_features, on=["Customer_ID", "snapshot_date"], how="left").toPandas()
data_pdf = data_pdf.dropna()

print("Row count:", data_pdf.count())



Row count: Customer_ID                 4987
snapshot_date               4987
loan_id                     4987
label                       4987
label_def                   4987
Annual_Income               4987
Monthly_Inhand_Salary       4987
Num_Bank_Accounts           4987
Num_Credit_Card             4987
Interest_Rate               4987
Num_of_Loan                 4987
Delay_from_due_date         4987
Num_of_Delayed_Payment      4987
Changed_Credit_Limit        4987
Num_Credit_Inquiries        4987
Credit_Mix                  4987
Credit_Utilization_Ratio    4987
Payment_Behaviour           4987
Credit_History_Age          4987
Occupation                  4987
fe_1                        4987
fe_2                        4987
fe_3                        4987
fe_4                        4987
fe_5                        4987
fe_6                        4987
fe_7                        4987
fe_8                        4987
fe_9                        4987
fe_10                       4987

                                                                                

In [10]:
from sklearn.model_selection import train_test_split


# split data into train - test - oot
oot_pdf = data_pdf[(data_pdf['snapshot_date'] >= config["oot_start_date"].date()) & (data_pdf['snapshot_date'] <= config["oot_end_date"].date())]
train_test_pdf = data_pdf[(data_pdf['snapshot_date'] >= config["train_test_start_date"].date()) & (data_pdf['snapshot_date'] <= config["train_test_end_date"].date())]

num_cols = [
    "Annual_Income",
    "Monthly_Inhand_Salary",
    "Num_Bank_Accounts",
    "Num_Credit_Card",
    "Interest_Rate",
    "Num_of_Loan",
    "Delay_from_due_date",
    "Num_of_Delayed_Payment",
    "Changed_Credit_Limit",
    "Num_Credit_Inquiries",
    "Credit_Utilization_Ratio",
    "Credit_History_Age",
    "fe_1","fe_2","fe_3","fe_4","fe_5","fe_6","fe_7","fe_8","fe_9","fe_10",
    "fe_11","fe_12","fe_13","fe_14","fe_15","fe_16","fe_17","fe_18","fe_19","fe_20"
]

cat_cols = ["Credit_Mix", "Payment_Behaviour", "Occupation"]

feature_cols = cat_cols + num_cols

X_oot = oot_pdf[feature_cols]
y_oot = oot_pdf["label"]
X_train, X_test, y_train, y_test = train_test_split(
    train_test_pdf[feature_cols], train_test_pdf["label"], 
    test_size= 1 - config["train_test_ratio"],
    random_state=88,     # Ensures reproducibility
    shuffle=True,        # Shuffle the data before splitting
    stratify=train_test_pdf["label"]           # Stratify based on the label column
)


print('X_train', X_train.shape[0])
print('X_test', X_test.shape[0])
print('X_oot', X_oot.shape[0])
print('y_train', y_train.shape[0], round(y_train.mean(),2))
print('y_test', y_test.shape[0], round(y_test.mean(),2))
print('y_oot', y_oot.shape[0], round(y_oot.mean(),2))

X_train

X_train 3396
X_test 850
X_oot 741
y_train 3396 0.27
y_test 850 0.27
y_oot 741 0.28


Unnamed: 0,Credit_Mix,Payment_Behaviour,Occupation,Annual_Income,Monthly_Inhand_Salary,Num_Bank_Accounts,Num_Credit_Card,Interest_Rate,Num_of_Loan,Delay_from_due_date,...,fe_11,fe_12,fe_13,fe_14,fe_15,fe_16,fe_17,fe_18,fe_19,fe_20
2364,Bad,Low_spent_Medium_value_payments,Mechanic,73083.437500,6264.286621,7.0,9.0,28.0,6.0,38.0,...,42,67,125,149,62,159,177,158,171,84
4783,Good,Low_spent_Medium_value_payments,Journalist,77224.882812,6221.406738,2.0,1.0,12.0,4.0,10.0,...,118,21,81,265,-11,127,207,-29,237,20
6182,Good,High_spent_Large_value_payments,Journalist,85038.867188,6836.572266,0.0,2.0,6.0,3.0,6.0,...,212,182,163,220,115,104,44,116,94,134
1420,Standard,High_spent_Large_value_payments,Engineer,88014.390625,7172.532715,4.0,5.0,17.0,2.0,13.0,...,79,139,103,135,-46,-17,-104,125,114,94
2556,Standard,Unknown,Journalist,44985.718750,3791.810059,6.0,4.0,15.0,4.0,23.0,...,144,98,-94,78,110,56,108,150,31,8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1550,Bad,Unknown,Manager,15837.309570,1019.775818,8.0,6.0,16.0,8.0,54.0,...,108,24,209,81,274,66,127,52,50,175
662,Standard,Low_spent_Small_value_payments,Doctor,44103.179688,3416.264893,3.0,7.0,13.0,4.0,5.0,...,146,240,154,209,264,228,77,80,72,88
5522,Standard,Low_spent_Large_value_payments,Accountant,28163.570312,2466.964111,6.0,6.0,16.0,6.0,14.0,...,-83,172,27,102,123,46,233,57,164,242
4696,Standard,High_spent_Small_value_payments,Developer,29375.679688,2728.973389,4.0,8.0,27.0,6.0,5.0,...,117,11,85,10,166,-119,98,302,39,43


In [11]:
import numpy as np
import pandas as pd  # ✅ missing import
from sklearn.preprocessing import StandardScaler  # ✅ indentation fixed

def process_features(input_df):
    
    # Replace "_" with NaN
    string_cols = input_df.select_dtypes(include="object").columns.tolist()
    for c in string_cols:
        input_df[c] = input_df[c].replace("_", np.nan)

    # 3️⃣ One-hot encode categoricals
    onehot_cols = ["Credit_Mix", "Payment_Behaviour", "Occupation"]
    input_df = pd.get_dummies(input_df, columns=onehot_cols, drop_first=False)

    # Convert boolean columns to 0/1
    bool_cols = input_df.select_dtypes(include="bool").columns
    input_df[bool_cols] = input_df[bool_cols].astype(int)


    scaler = StandardScaler()

    num_cols = [
        "Annual_Income","Monthly_Inhand_Salary","Num_Bank_Accounts","Num_Credit_Card",
        "Interest_Rate","Num_of_Loan","Delay_from_due_date","Num_of_Delayed_Payment",
        "Changed_Credit_Limit","Num_Credit_Inquiries","Credit_Utilization_Ratio",
        "Credit_History_Age",
        "fe_1","fe_2","fe_3","fe_4","fe_5","fe_6","fe_7","fe_8","fe_9","fe_10",
        "fe_11","fe_12","fe_13","fe_14","fe_15","fe_16","fe_17","fe_18","fe_19","fe_20"
    ]

    input_df[num_cols] = scaler.fit_transform(input_df[num_cols])

    return input_df


In [12]:
X_train_processed = process_features(X_train)
X_test_processed = process_features(X_test)
X_oot_processed = process_features(X_oot)


print('X_train_processed', X_train_processed.shape[0])
print('X_test_processed', X_test_processed.shape[0])
print('X_oot_processed', X_oot_processed.shape[0])

pd.DataFrame(X_train_processed)

X_train_processed 3396
X_test_processed 850
X_oot_processed 741


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  input_df[c] = input_df[c].replace("_", np.nan)


Unnamed: 0,Annual_Income,Monthly_Inhand_Salary,Num_Bank_Accounts,Num_Credit_Card,Interest_Rate,Num_of_Loan,Delay_from_due_date,Num_of_Delayed_Payment,Changed_Credit_Limit,Num_Credit_Inquiries,...,Occupation_Journalist,Occupation_Lawyer,Occupation_Manager,Occupation_Mechanic,Occupation_Media_Manager,Occupation_Musician,Occupation_Scientist,Occupation_Teacher,Occupation_Unemployed,Occupation_Writer
2364,0.719012,0.790887,0.704714,1.804179,1.678644,1.120857,1.491725,0.347630,-1.349270,-0.096474,...,0,0,0,1,0,0,0,0,0,0
4783,0.840700,0.775711,-1.252760,-2.187672,-0.210717,0.268544,-0.708960,-0.632217,-1.509875,-0.096474,...,1,0,0,0,0,0,0,0,0,0
6182,1.070299,0.993429,-2.035750,-1.688690,-0.919227,-0.157613,-1.023343,-0.795524,-0.970815,-1.131629,...,1,0,0,0,0,0,0,0,0,0
1420,1.157729,1.112331,-0.469771,-0.191746,0.379708,-0.583769,-0.473172,0.021015,0.962805,-0.355263,...,0,0,0,0,0,0,0,0,0,0
2556,-0.106585,-0.084163,0.313219,-0.690728,0.143538,0.268544,0.312787,0.837554,0.603432,0.679892,...,1,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1550,-0.963054,-1.065231,1.096209,0.307235,0.261623,1.973170,2.749259,1.490785,1.417587,1.456258,...,0,0,1,0,0,0,0,0,0,0
662,-0.132517,-0.217074,-0.861266,0.806216,-0.092632,0.268544,-1.101939,0.510938,0.263140,0.162315,...,0,0,0,0,0,0,0,0,0,0
5522,-0.600871,-0.553048,0.313219,0.307235,0.261623,1.120857,-0.394576,-0.632217,0.794250,0.421103,...,0,0,0,0,0,0,0,0,0,0
4696,-0.565255,-0.460318,-0.469771,1.305198,1.560559,1.120857,-1.101939,-0.142293,0.364910,2.232625,...,0,0,0,0,0,0,0,0,0,0


In [16]:
from sklearn.metrics import make_scorer, roc_auc_score
from sklearn.model_selection import RandomizedSearchCV
import xgboost as xgb



# Define the XGBoost classifier
xgb_clf = xgb.XGBClassifier(eval_metric='logloss', random_state=88)

# Define the hyperparameter space to search
param_dist = {
    'n_estimators': [25, 50],
    'max_depth': [2, 3],  # lower max_depth to simplify the model
    'learning_rate': [0.01, 0.1],
    'subsample': [0.6, 0.8],
    'colsample_bytree': [0.6, 0.8],
    'gamma': [0, 0.1],
    'min_child_weight': [1, 3, 5],
    'reg_alpha': [0, 0.1, 1],
    'reg_lambda': [1, 1.5, 2]
}

# Create a scorer based on AUC score
auc_scorer = make_scorer(roc_auc_score)

# Set up the random search with cross-validation
random_search = RandomizedSearchCV(
    estimator=xgb_clf,
    param_distributions=param_dist,
    scoring=auc_scorer,
    n_iter=100,  # Number of iterations for random search
    cv=3,       # Number of folds in cross-validation
    verbose=1,
    random_state=42,
    n_jobs=-1   # Use all available cores
)

# Perform the random search
random_search.fit(X_train_processed, y_train)

# Output the best parameters and best score
print("Best parameters found: ", random_search.best_params_)
print("Best AUC score: ", random_search.best_score_)

# Evaluate the model on the train set
best_model = random_search.best_estimator_
y_pred_proba = best_model.predict_proba(X_train_processed)[:, 1]
train_auc_score = roc_auc_score(y_train, y_pred_proba)
print("Train AUC score: ", train_auc_score)

# Evaluate the model on the test set
best_model = random_search.best_estimator_
y_pred_proba = best_model.predict_proba(X_test_processed)[:, 1]
test_auc_score = roc_auc_score(y_test, y_pred_proba)
print("Test AUC score: ", test_auc_score)

# Evaluate the model on the oot set
best_model = random_search.best_estimator_
y_pred_proba = best_model.predict_proba(X_oot_processed)[:, 1]
oot_auc_score = roc_auc_score(y_oot, y_pred_proba)
print("OOT AUC score: ", oot_auc_score)

print("TRAIN GINI score: ", round(2*train_auc_score-1,3))
print("Test GINI score: ", round(2*test_auc_score-1,3))
print("OOT GINI score: ", round(2*oot_auc_score-1,3))

Fitting 3 folds for each of 100 candidates, totalling 300 fits
Best parameters found:  {'subsample': 0.8, 'reg_lambda': 1, 'reg_alpha': 1, 'n_estimators': 50, 'min_child_weight': 3, 'max_depth': 3, 'learning_rate': 0.1, 'gamma': 0, 'colsample_bytree': 0.8}
Best AUC score:  0.685450912651809
Train AUC score:  0.8745471396073806
Test AUC score:  0.799541519875676
OOT AUC score:  0.804681194511703
TRAIN GINI score:  0.749
Test GINI score:  0.599
OOT GINI score:  0.609


In [17]:
scaler=StandardScaler()

model_artefact = {}

model_artefact['model'] = best_model
model_artefact['model_name'] = "xgb"
model_artefact['model_version'] = "credit_model_"+config["model_train_date_str"].replace('-','_')
model_artefact['preprocessing_transformers'] = {}
model_artefact['preprocessing_transformers']['stdscaler'] = scaler
model_artefact['data_dates'] = config
model_artefact['data_stats'] = {}
model_artefact['data_stats']['X_train'] = X_train.shape[0]
model_artefact['data_stats']['X_test'] = X_test.shape[0]
model_artefact['data_stats']['X_oot'] = X_oot.shape[0]
model_artefact['data_stats']['y_train'] = round(y_train.mean(),2)
model_artefact['data_stats']['y_test'] = round(y_test.mean(),2)
model_artefact['data_stats']['y_oot'] = round(y_oot.mean(),2)
model_artefact['results'] = {}
model_artefact['results']['auc_train'] = train_auc_score
model_artefact['results']['auc_test'] = test_auc_score
model_artefact['results']['auc_oot'] = oot_auc_score
model_artefact['results']['gini_train'] = round(2*train_auc_score-1,3)
model_artefact['results']['gini_test'] = round(2*test_auc_score-1,3)
model_artefact['results']['gini_oot'] = round(2*oot_auc_score-1,3)
model_artefact['hp_params'] = random_search.best_params_


pprint.pprint(model_artefact)

{'data_dates': {'model_train_date': datetime.datetime(2024, 9, 1, 0, 0),
                'model_train_date_str': '2024-09-01',
                'oot_end_date': datetime.datetime(2024, 8, 31, 0, 0),
                'oot_period_months': 2,
                'oot_start_date': datetime.datetime(2024, 7, 1, 0, 0),
                'train_test_end_date': datetime.datetime(2024, 6, 30, 0, 0),
                'train_test_period_months': 12,
                'train_test_ratio': 0.8,
                'train_test_start_date': datetime.datetime(2023, 7, 1, 0, 0)},
 'data_stats': {'X_oot': 741,
                'X_test': 850,
                'X_train': 3396,
                'y_oot': np.float64(0.28),
                'y_test': np.float64(0.27),
                'y_train': np.float64(0.27)},
 'hp_params': {'colsample_bytree': 0.8,
               'gamma': 0,
               'learning_rate': 0.1,
               'max_depth': 3,
               'min_child_weight': 3,
               'n_estimators': 50,
            

In [18]:
# create model_bank dir
model_bank_directory = "model_bank/"

if not os.path.exists(model_bank_directory):
    os.makedirs(model_bank_directory)

In [19]:
import pickle

# Full path to the file
file_name = f"{model_artefact['model_name']}_{model_artefact['model_version']}.pkl"
file_path = os.path.join("model_bank/", file_name)

# Write the model to a pickle file
with open(file_path, 'wb') as file:
    pickle.dump(model_artefact, file)

print(f"Model saved to {file_path}")


Model saved to model_bank/xgb_credit_model_2024_09_01.pkl
