# Model monitoring

In [8]:
import os
import glob
import random
import pprint

from datetime import datetime, timedelta
from dateutil.relativedelta import relativedelta

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import pyspark

In [9]:
# Initialize SparkSession
spark = pyspark.sql.SparkSession.builder \
    .appName("dev") \
    .master("local[*]") \
    .getOrCreate()

#Pyspark remove warnings
spark.sparkContext.setLogLevel("ERROR")

In [20]:
# Load predictions
predictions_directory = "/app/datamart/gold/model_predictions/"
files_list = [predictions_directory+os.path.basename(f) for f in glob.glob(os.path.join(predictions_directory, '*'))]
df_predictions = spark.read.option("header", "true").parquet(*files_list)
print("predictions:")
df_predictions.show()
# Load Gold Features
gold_label_directory = "/app/datamart/gold/label_store/"
files_list = [gold_label_directory+os.path.basename(f) for f in glob.glob(os.path.join(gold_label_directory, '*'))]
label_df = spark.read.option("header", "true").parquet(*files_list)
print("True labels:")
label_df.show()
# Load Features
gold_feature_directory = "/app/datamart/gold/feature_store/"
df_features = spark.read.option("header", "true").parquet(gold_feature_directory)
print("Features:")
df_features.show()

predictions:
+-----------+-------------+--------------------+-------------------+
|Customer_ID|snapshot_date|          model_name|  model_predictions|
+-----------+-------------+--------------------+-------------------+
| CUS_0x58f7|   2024-12-01|xgb_credit_model_...|0.15767906606197357|
| CUS_0x8ae0|   2024-12-01|xgb_credit_model_...| 0.6158557534217834|
| CUS_0x8d91|   2024-12-01|xgb_credit_model_...|0.08418285101652145|
| CUS_0x4596|   2024-12-01|xgb_credit_model_...|0.21586577594280243|
| CUS_0x7dab|   2024-12-01|xgb_credit_model_...| 0.5243225693702698|
| CUS_0x7226|   2024-12-01|xgb_credit_model_...|0.08298828452825546|
| CUS_0x7c59|   2024-12-01|xgb_credit_model_...|0.09743735194206238|
| CUS_0x67cb|   2024-12-01|xgb_credit_model_...|0.07109204679727554|
| CUS_0x612f|   2024-12-01|xgb_credit_model_...| 0.8014252781867981|
| CUS_0x8b12|   2024-12-01|xgb_credit_model_...|0.11519676446914673|
| CUS_0x2e9e|   2024-12-01|xgb_credit_model_...| 0.1076139286160469|
| CUS_0x3ce3|   2024-

In [21]:
df_predictions.show(truncate=False)

+-----------+-------------+-------------------------------+-------------------+
|Customer_ID|snapshot_date|model_name                     |model_predictions  |
+-----------+-------------+-------------------------------+-------------------+
|CUS_0x58f7 |2024-12-01   |xgb_credit_model_2024_09_01.pkl|0.15767906606197357|
|CUS_0x8ae0 |2024-12-01   |xgb_credit_model_2024_09_01.pkl|0.6158557534217834 |
|CUS_0x8d91 |2024-12-01   |xgb_credit_model_2024_09_01.pkl|0.08418285101652145|
|CUS_0x4596 |2024-12-01   |xgb_credit_model_2024_09_01.pkl|0.21586577594280243|
|CUS_0x7dab |2024-12-01   |xgb_credit_model_2024_09_01.pkl|0.5243225693702698 |
|CUS_0x7226 |2024-12-01   |xgb_credit_model_2024_09_01.pkl|0.08298828452825546|
|CUS_0x7c59 |2024-12-01   |xgb_credit_model_2024_09_01.pkl|0.09743735194206238|
|CUS_0x67cb |2024-12-01   |xgb_credit_model_2024_09_01.pkl|0.07109204679727554|
|CUS_0x612f |2024-12-01   |xgb_credit_model_2024_09_01.pkl|0.8014252781867981 |
|CUS_0x8b12 |2024-12-01   |xgb_credit_mo

In [22]:
label_df.groupBy("snapshot_date") \
        .agg(F.count("*").alias("count")) \
        .orderBy("snapshot_date") \
        .show(truncate=False)

+-------------+-----+
|snapshot_date|count|
+-------------+-----+
|2023-08-01   |530  |
|2023-09-01   |501  |
|2023-10-01   |506  |
|2023-11-01   |510  |
|2023-12-01   |521  |
|2024-01-01   |517  |
|2024-02-01   |471  |
|2024-03-01   |481  |
|2024-04-01   |454  |
|2024-05-01   |487  |
|2024-06-01   |491  |
|2024-07-01   |489  |
|2024-08-01   |485  |
|2024-09-01   |518  |
|2024-10-01   |511  |
|2024-11-01   |513  |
|2024-12-01   |491  |
|2025-01-01   |498  |
|2025-02-01   |505  |
|2025-03-01   |543  |
+-------------+-----+
only showing top 20 rows



In [23]:
merged_df = df_predictions.alias("p").join(
    df_features.alias("x"),
    on=["Customer_ID","snapshot_date"],
    how="inner"
).join(
    label_df.select("Customer_ID","snapshot_date","label").alias("y"),
    on=["Customer_ID","snapshot_date"],
    how="inner"   # labels may be missing
)

merged_df.show()


+-----------+-------------+--------------------+--------------------+-------------+---------------------+-----------------+---------------+-------------+-----------+-------------------+----------------------+--------------------+--------------------+----------+------------------------+--------------------+------------------+-------------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+-----+
|Customer_ID|snapshot_date|          model_name|   model_predictions|Annual_Income|Monthly_Inhand_Salary|Num_Bank_Accounts|Num_Credit_Card|Interest_Rate|Num_of_Loan|Delay_from_due_date|Num_of_Delayed_Payment|Changed_Credit_Limit|Num_Credit_Inquiries|Credit_Mix|Credit_Utilization_Ratio|   Payment_Behaviour|Credit_History_Age|   Occupation|fe_1_5m_avg|fe_2_5m_avg|fe_3_5m_avg|fe_4_5m_avg|fe_5_5m_avg|fe

In [25]:
# assume y_true_pdf and y_pred_pdf are pandas Series aligned by Customer_ID+snapshot_date
from sklearn.metrics import roc_auc_score, precision_score, recall_score, f1_score, brier_score_loss


from scipy.stats import ks_2samp


baseline = merged_df.filter(F.col("snapshot_date") == "2024-11-01").toPandas()
current  = merged_df.filter(F.col("snapshot_date") == "2024-12-01").toPandas()
baseline.head()



Unnamed: 0,Customer_ID,snapshot_date,model_name,model_predictions,Annual_Income,Monthly_Inhand_Salary,Num_Bank_Accounts,Num_Credit_Card,Interest_Rate,Num_of_Loan,...,fe_12_5m_avg,fe_13_5m_avg,fe_14_5m_avg,fe_15_5m_avg,fe_16_5m_avg,fe_17_5m_avg,fe_18_5m_avg,fe_19_5m_avg,fe_20_5m_avg,label


In [None]:


for col in ["Annual_Income","Num_of_Loan","Credit_History_Age","fe_1","fe_2","fe_3"]:
    stat, p = ks_2samp(baseline[col].dropna(), current[col].dropna())
    print(col, "KS stat:", stat, "p-value:", p)

