In [None]:
import os
import glob
import pandas as pd
import pickle
import matplotlib.pyplot as plt
import numpy as np
import random
from datetime import datetime, timedelta
from dateutil.relativedelta import relativedelta
import pprint
import pyspark
import pyspark.sql.functions as F

from pyspark.sql.functions import col
from pyspark.sql.types import StringType, IntegerType, FloatType, DateType

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

import xgboost as xgb
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import make_scorer, f1_score, roc_auc_score
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

# import model_inference


In [2]:
# Start (or reuse) a Spark session named "dev" that runs locally with master "local[*]".
spark = (
    pyspark.sql.SparkSession.builder
    .appName("dev")
    .master("local[*]")
    .getOrCreate()
)

# Only ERROR-level Spark messages are logged from here on, which keeps cell output readable.
spark.sparkContext.setLogLevel("ERROR")

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/11/08 22:11:47 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


# Config


In [3]:
# Snapshot to score, written as YYYY-MM-DD; it is parsed into a datetime in the config cell
# and used to filter the feature store on its snapshot_date column.
snapshot_date_str = "2024-01-01"
# File name of the pickled model artefact; it is resolved against the model bank directory.
model_name = "xgb_credit_model_2024_09_01.pkl"


In [4]:
# Collect every run parameter in a single dictionary that the later cells read from.
config = {
    "snapshot_date_str": snapshot_date_str,
    "snapshot_date": datetime.strptime(snapshot_date_str, "%Y-%m-%d"),
    "model_name": model_name,
    "model_bank_directory": "model_bank/",
}
# The artefact path is the model bank directory followed directly by the model file name.
config["model_artefact_filepath"] = config["model_bank_directory"] + config["model_name"]

pprint.pprint(config)

{'model_artefact_filepath': 'model_bank/xgb_credit_model_2024_09_01.pkl',
 'model_bank_directory': 'model_bank/',
 'model_name': 'xgb_credit_model_2024_09_01.pkl',
 'snapshot_date': datetime.datetime(2024, 1, 1, 0, 0),
 'snapshot_date_str': '2024-01-01'}


In [5]:
# Deserialize the model artefact stored in the model bank.
# NOTE(review): pickle.load can execute arbitrary code; only load artefacts from a trusted location.
with open(config["model_artefact_filepath"], "rb") as artefact_file:
    model_artefact = pickle.load(artefact_file)

print("Model loaded successfully! " + config["model_artefact_filepath"])

Model loaded successfully! model_bank/xgb_credit_model_2024_09_01.pkl


In [6]:
gold_feature_directory = "/app/datamart/gold/feature_store/"

# List every entry of the gold feature store. glob already returns paths that
# include the directory, so they can be handed to Spark as-is; sorting keeps
# the read order deterministic.
files_list = sorted(glob.glob(os.path.join(gold_feature_directory, '*')))
if not files_list:
    # Fail early with a clear message instead of an opaque Spark error on an empty path list.
    raise FileNotFoundError(f"No feature store files found in {gold_feature_directory}")

# Read all parquet partitions into a single Spark DataFrame.
# (The CSV-only "header" option was dropped: it has no effect on parquet reads.)
features_store_sdf = spark.read.parquet(*files_list)

# Keep only the rows that belong to the snapshot being scored.
features_sdf = features_store_sdf.filter(col("snapshot_date") == config["snapshot_date"])
print("extracted features_sdf", features_sdf.count(), config["snapshot_date"])



[Stage 1:>                                                        (0 + 12) / 12]

extracted features_sdf 8974 2024-01-01 00:00:00


                                                                                

# Preprocess data for modeling

In [7]:

# 1️⃣ Drop all rows with ANY nulls
# A row is removed as soon as a single one of its columns is null.
# NOTE(review): customers removed here receive no prediction; confirm this is intended.
features_sdf = features_sdf.na.drop(how="any")

features_sdf.show()

+-----------+-------------+---------------------+-----------------+---------------+-------------+-----------+-------------------+----------------------+--------------------+--------------------+----------+------------------------+--------------------+------------------+-------------+----+----+----+----+----+----+----+----+----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-------------+
|Customer_ID|Annual_Income|Monthly_Inhand_Salary|Num_Bank_Accounts|Num_Credit_Card|Interest_Rate|Num_of_Loan|Delay_from_due_date|Num_of_Delayed_Payment|Changed_Credit_Limit|Num_Credit_Inquiries|Credit_Mix|Credit_Utilization_Ratio|   Payment_Behaviour|Credit_History_Age|   Occupation|fe_1|fe_2|fe_3|fe_4|fe_5|fe_6|fe_7|fe_8|fe_9|fe_10|fe_11|fe_12|fe_13|fe_14|fe_15|fe_16|fe_17|fe_18|fe_19|fe_20|snapshot_date|
+-----------+-------------+---------------------+-----------------+---------------+-------------+-----------+-------------------+----------------------+--------------------+-------

In [8]:
import numpy as np
import pandas as pd  # ✅ missing import
from sklearn.preprocessing import StandardScaler  # ✅ indentation fixed

def process_features(input_df, scaler=None):
    """Turn raw feature-store rows into the numeric matrix used for scoring.

    Steps, in order:
      1. Replace the placeholder string "_" with NaN in every object-dtype column.
      2. One-hot encode Credit_Mix, Payment_Behaviour and Occupation.
      3. Cast boolean columns (the dummy columns) to 0/1 integers.
      4. Standardize the numeric feature columns.

    Parameters
    ----------
    input_df : pandas.DataFrame
        Must contain the three categorical columns and every numeric column
        listed in ``num_cols`` below. The frame is NOT modified; all work
        happens on a copy.
    scaler : object, optional
        An already-fitted scaler exposing ``transform`` (for example the
        StandardScaler fitted at training time). When given, it is applied
        as-is so inference uses the same scaling as training. When omitted,
        a new StandardScaler is fitted on ``input_df`` itself, which is the
        historical behaviour of this function.

    Returns
    -------
    pandas.DataFrame
        A new frame with dummy columns added and numeric columns scaled.
        Columns not involved in encoding or scaling are passed through.

    Raises
    ------
    KeyError
        If any required column is missing from ``input_df``.

    Notes
    -----
    The dummy columns depend on the categories present in ``input_df``;
    NOTE(review): confirm the resulting columns match what the model was
    trained on.
    """
    onehot_cols = ["Credit_Mix", "Payment_Behaviour", "Occupation"]
    num_cols = [
        "Annual_Income","Monthly_Inhand_Salary","Num_Bank_Accounts","Num_Credit_Card",
        "Interest_Rate","Num_of_Loan","Delay_from_due_date","Num_of_Delayed_Payment",
        "Changed_Credit_Limit","Num_Credit_Inquiries","Credit_Utilization_Ratio",
        "Credit_History_Age",
        "fe_1","fe_2","fe_3","fe_4","fe_5","fe_6","fe_7","fe_8","fe_9","fe_10",
        "fe_11","fe_12","fe_13","fe_14","fe_15","fe_16","fe_17","fe_18","fe_19","fe_20"
    ]

    # Report every missing column at once instead of failing on the first one.
    missing_cols = [c for c in onehot_cols + num_cols if c not in input_df.columns]
    if missing_cols:
        raise KeyError(f"process_features: missing required columns {missing_cols}")

    # Work on a copy so the caller's DataFrame is never altered as a side effect.
    output_df = input_df.copy()

    # Replace "_" with NaN in text columns ("_" is treated as a missing-value placeholder).
    string_cols = output_df.select_dtypes(include="object").columns.tolist()
    for c in string_cols:
        output_df[c] = output_df[c].replace("_", np.nan)

    # One-hot encode categoricals; NaN values produce no dummy column of their own.
    output_df = pd.get_dummies(output_df, columns=onehot_cols, drop_first=False)

    # Convert boolean columns to 0/1.
    bool_cols = output_df.select_dtypes(include="bool").columns
    if len(bool_cols) > 0:
        output_df[bool_cols] = output_df[bool_cols].astype(int)

    if scaler is None:
        # No fitted scaler supplied: fit one on this data (original behaviour).
        scaler = StandardScaler()
        output_df[num_cols] = scaler.fit_transform(output_df[num_cols])
    else:
        # Re-use the supplied, already-fitted scaler without re-fitting it.
        output_df[num_cols] = scaler.transform(output_df[num_cols])

    return output_df


In [9]:
# Collect the filtered Spark DataFrame into a pandas DataFrame, which the
# pandas-based preprocessing and the model scoring below operate on.
features_pdf = features_sdf.toPandas()
features_pdf

Unnamed: 0,Customer_ID,Annual_Income,Monthly_Inhand_Salary,Num_Bank_Accounts,Num_Credit_Card,Interest_Rate,Num_of_Loan,Delay_from_due_date,Num_of_Delayed_Payment,Changed_Credit_Limit,...,fe_12,fe_13,fe_14,fe_15,fe_16,fe_17,fe_18,fe_19,fe_20,snapshot_date
0,CUS_0x6ba9,20272.230469,1776.352539,4.0,3.0,12.0,1.0,-1.0,5.0,4.83,...,309,164,180,77,-48,6,248,95,236,2024-01-01
1,CUS_0x74c7,87853.562500,7527.129883,4.0,1.0,4.0,2.0,2.0,2.0,9.99,...,21,58,143,79,35,-2,206,39,209,2024-01-01
2,CUS_0x78d3,17773.775391,1388.147949,3.0,1.0,2.0,2.0,5.0,2.0,4.57,...,-106,172,205,49,-83,29,63,12,219,2024-01-01
3,CUS_0x7f5d,20615.789062,1761.982544,7.0,7.0,21.0,7.0,56.0,13.0,10.01,...,256,109,268,52,48,182,106,176,124,2024-01-01
4,CUS_0xf6c,76046.250000,6082.187500,7.0,3.0,16.0,3.0,17.0,19.0,3.29,...,176,-48,-66,80,149,174,162,13,218,2024-01-01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6991,CUS_0x32ba,115735.757812,9468.646484,5.0,7.0,13.0,0.0,22.0,11.0,5.41,...,52,-123,158,110,-18,25,103,189,199,2024-01-01
6992,CUS_0x9066,21761.490234,1615.457520,5.0,4.0,4.0,3.0,10.0,11.0,8.77,...,160,144,-88,171,331,134,110,50,91,2024-01-01
6993,CUS_0x915,19562.470703,1713.205811,3.0,3.0,9.0,3.0,8.0,9.0,19.35,...,127,286,20,116,-36,-15,77,99,214,2024-01-01
6994,CUS_0x219c,30812.960938,2422.746582,1.0,7.0,3.0,1.0,10.0,5.0,8.21,...,139,124,99,-56,36,57,14,168,299,2024-01-01


In [10]:
# Build the model input: preprocess the features, then remove the identifier
# and snapshot columns, which are not model features.
# NOTE(review): process_features fits a fresh StandardScaler on this snapshot;
# confirm this matches how the model was trained.
X_inference = process_features(features_pdf).drop(columns=["Customer_ID", "snapshot_date"])

X_inference

Unnamed: 0,Annual_Income,Monthly_Inhand_Salary,Num_Bank_Accounts,Num_Credit_Card,Interest_Rate,Num_of_Loan,Delay_from_due_date,Num_of_Delayed_Payment,Changed_Credit_Limit,Num_Credit_Inquiries,...,Occupation_Journalist,Occupation_Lawyer,Occupation_Manager,Occupation_Mechanic,Occupation_Media_Manager,Occupation_Musician,Occupation_Scientist,Occupation_Teacher,Occupation_Unemployed,Occupation_Writer
0,-0.813354,-0.779692,-0.490827,-1.209349,-0.227239,-1.001846,-1.592938,-1.295457,-0.820424,-0.105445,...,0,1,0,0,0,0,0,0,0,0
1,1.135819,1.219423,-0.490827,-2.198951,-1.170020,-0.581496,-1.358028,-1.779342,-0.007660,-0.618442,...,0,0,0,0,0,0,0,0,1,0
2,-0.885415,-0.914642,-0.880681,-2.198951,-1.405715,-0.581496,-1.123118,-1.779342,-0.861377,-0.618442,...,1,0,0,0,0,0,0,0,0,0
3,-0.803446,-0.784687,0.678733,0.769856,0.833390,1.520254,2.870346,-0.005095,-0.004510,0.407553,...,0,0,0,0,0,0,0,1,0,0
4,0.795274,0.717125,0.678733,-1.209349,0.244151,-0.161146,-0.183480,0.962676,-1.062993,0.664052,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6991,1.939995,1.894343,-0.100974,0.769856,-0.109391,-1.422197,0.208036,-0.327686,-0.729067,-1.387939,...,0,1,0,0,0,0,0,0,0,0
6992,-0.770401,-0.835623,-0.100974,-0.714548,-1.170020,-0.161146,-0.731602,-0.327686,-0.199825,-1.131440,...,0,0,0,0,0,1,0,0,0,0
6993,-0.833825,-0.801643,-0.880681,-1.209349,-0.580782,-0.161146,-0.888209,-0.650276,1.466656,0.920551,...,1,0,0,0,0,0,0,0,0,0
6994,-0.509340,-0.554989,-1.660387,0.769856,-1.287868,-1.001846,-0.731602,-1.295457,-0.288032,-0.618442,...,0,0,0,0,0,0,0,0,1,0


# Inference

In [11]:
# Pull the estimator out of the loaded artefact dictionary.
model = model_artefact["model"]

# Score every row and keep the second column of predict_proba
# (presumably the positive-class probability; confirm against the model's classes_).
y_inference = model.predict_proba(X_inference)[:, 1]

# Assemble the output table: identifiers from the feature frame plus the model
# name and the scores. Rows line up by position with features_pdf.
y_inference_pdf = features_pdf[["Customer_ID", "snapshot_date"]].assign(
    model_name=config["model_name"],
    model_predictions=y_inference,
)
y_inference_pdf



Unnamed: 0,Customer_ID,snapshot_date,model_name,model_predictions
0,CUS_0x6ba9,2024-01-01,xgb_credit_model_2024_09_01.pkl,0.059874
1,CUS_0x74c7,2024-01-01,xgb_credit_model_2024_09_01.pkl,0.051707
2,CUS_0x78d3,2024-01-01,xgb_credit_model_2024_09_01.pkl,0.080133
3,CUS_0x7f5d,2024-01-01,xgb_credit_model_2024_09_01.pkl,0.259417
4,CUS_0xf6c,2024-01-01,xgb_credit_model_2024_09_01.pkl,0.050153
...,...,...,...,...
6991,CUS_0x32ba,2024-01-01,xgb_credit_model_2024_09_01.pkl,0.113659
6992,CUS_0x9066,2024-01-01,xgb_credit_model_2024_09_01.pkl,0.145555
6993,CUS_0x915,2024-01-01,xgb_credit_model_2024_09_01.pkl,0.057995
6994,CUS_0x219c,2024-01-01,xgb_credit_model_2024_09_01.pkl,0.200528


## Save model inference to datamart gold table

In [12]:

# Create the gold-layer directory that holds this model's predictions.
# splitext strips the file extension whatever its length (the old [:-4] slice
# assumed a 3-letter extension).
model_stem = os.path.splitext(config["model_name"])[0]
# Built from a plain variable: reusing double quotes inside an f-string
# replacement field is a SyntaxError before Python 3.12.
gold_directory = f"datamart/gold/model_predictions/{model_stem}/"
print(gold_directory)

# exist_ok=True makes this safe to re-run and avoids the check-then-create race.
os.makedirs(gold_directory, exist_ok=True)

# save gold table - IRL connect to database to write
# One parquet output per (model, snapshot); mode "overwrite" replaces a previous run's output.
partition_name = model_stem + "_predictions_" + config["snapshot_date_str"].replace('-', '_') + '.parquet'
filepath = gold_directory + partition_name
spark.createDataFrame(y_inference_pdf).write.mode("overwrite").parquet(filepath)
print('saved to:', filepath)

datamart/gold/model_predictions/xgb_credit_model_2024_09_01/


                                                                                

saved to: datamart/gold/model_predictions/xgb_credit_model_2024_09_01/xgb_credit_model_2024_09_01_predictions_2024_01_01.parquet


# Check Datamart

In [13]:
# Read back every prediction partition written for the configured model.
# The folder comes from gold_directory (built in the save cell) rather than a
# hard-coded copy of the path, so this check follows config["model_name"].
folder_path = gold_directory
# glob already returns paths that include the folder; sorting keeps the order deterministic.
files_list = sorted(glob.glob(os.path.join(folder_path, '*')))
# (The CSV-only "header" option was dropped: it has no effect on parquet reads.)
df = spark.read.parquet(*files_list)
print("row_count:",df.count())

df.show()

row_count: 6996
+-----------+-------------+--------------------+-------------------+
|Customer_ID|snapshot_date|          model_name|  model_predictions|
+-----------+-------------+--------------------+-------------------+
| CUS_0x9576|   2024-01-01|xgb_credit_model_...|0.07228091359138489|
| CUS_0xc072|   2024-01-01|xgb_credit_model_...|0.23394308984279633|
| CUS_0x9425|   2024-01-01|xgb_credit_model_...| 0.2983936071395874|
| CUS_0xb37a|   2024-01-01|xgb_credit_model_...| 0.4886404573917389|
| CUS_0xb5da|   2024-01-01|xgb_credit_model_...| 0.1836709827184677|
| CUS_0x8247|   2024-01-01|xgb_credit_model_...|0.44099560379981995|
| CUS_0x7b09|   2024-01-01|xgb_credit_model_...| 0.1731003075838089|
| CUS_0x83a1|   2024-01-01|xgb_credit_model_...|0.08000364899635315|
|  CUS_0xa6b|   2024-01-01|xgb_credit_model_...|0.18841594457626343|
| CUS_0x4ab2|   2024-01-01|xgb_credit_model_...|0.19059191644191742|
| CUS_0x4c2c|   2024-01-01|xgb_credit_model_...|  0.086251400411129|
| CUS_0x2ef8|   20