# Model monitoring

In [7]:
import os
import glob
import random
import pprint

from datetime import datetime, timedelta
from dateutil.relativedelta import relativedelta

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import pyspark

import pyspark.sql.functions as F

In [2]:
# Initialize SparkSession
spark = pyspark.sql.SparkSession.builder \
    .appName("dev") \
    .master("local[*]") \
    .getOrCreate()

#Pyspark remove warnings
spark.sparkContext.setLogLevel("ERROR")

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/11/09 08:10:04 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
25/11/09 08:10:04 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [24]:
# Load predictions
predictions_directory = "/app/datamart/gold/model_predictions/"
files_list = [predictions_directory+os.path.basename(f) for f in glob.glob(os.path.join(predictions_directory, '*'))]
df_predictions = spark.read.option("header", "true").parquet(*files_list)
print("predictions:")
df_predictions.show(truncate=False)

# Load Gold Features
gold_label_directory = "/app/datamart/gold/label_store/"
files_list = [gold_label_directory+os.path.basename(f) for f in glob.glob(os.path.join(gold_label_directory, '*'))]
label_df = spark.read.option("header", "true").parquet(*files_list)
print("True labels:")
label_df.show(truncate=False)

# load features (same snapshot granularity as predictions)
features_dir = "/app/datamart/gold/feature_store/"
files = [features_dir + os.path.basename(f) for f in glob.glob(os.path.join(features_dir, '*'))]
df_features = spark.read.parquet(*files)
print("Features:")
df_features.show(truncate=False)


predictions:
+-----------+-------------+-------------------------------+--------------------+
|Customer_ID|snapshot_date|model_name                     |model_predictions   |
+-----------+-------------+-------------------------------+--------------------+
|CUS_0x1d9e |2024-11-01   |xgb_credit_model_2024_09_01.pkl|0.08877021819353104 |
|CUS_0x7ea2 |2024-11-01   |xgb_credit_model_2024_09_01.pkl|0.1391143500804901  |
|CUS_0xedb  |2024-11-01   |xgb_credit_model_2024_09_01.pkl|0.6090103983879089  |
|CUS_0x4b91 |2024-11-01   |xgb_credit_model_2024_09_01.pkl|0.2919446527957916  |
|CUS_0x6b0c |2024-11-01   |xgb_credit_model_2024_09_01.pkl|0.04694114997982979 |
|CUS_0x971a |2024-11-01   |xgb_credit_model_2024_09_01.pkl|0.2924809753894806  |
|CUS_0x6a1a |2024-11-01   |xgb_credit_model_2024_09_01.pkl|0.2607428729534149  |
|CUS_0x7c64 |2024-11-01   |xgb_credit_model_2024_09_01.pkl|0.14412736892700195 |
|CUS_0xab92 |2024-11-01   |xgb_credit_model_2024_09_01.pkl|0.1766100525856018  |
|CUS_0xc16d |20

In [25]:
label_df.groupBy("snapshot_date") \
        .agg(F.count("*").alias("count")) \
        .orderBy("snapshot_date") \
        .show(truncate=False)

+-------------+-----+
|snapshot_date|count|
+-------------+-----+
|2023-08-01   |530  |
|2023-09-01   |501  |
|2023-10-01   |506  |
|2023-11-01   |510  |
|2023-12-01   |521  |
|2024-01-01   |517  |
|2024-02-01   |471  |
|2024-03-01   |481  |
|2024-04-01   |454  |
|2024-05-01   |487  |
|2024-06-01   |491  |
|2024-07-01   |489  |
|2024-08-01   |485  |
|2024-09-01   |518  |
|2024-10-01   |511  |
|2024-11-01   |513  |
|2024-12-01   |491  |
|2025-01-01   |498  |
|2025-02-01   |505  |
|2025-03-01   |543  |
+-------------+-----+
only showing top 20 rows



In [26]:
merged_df = df_predictions.alias("p").join(
    df_features.alias("x"),
    on=["Customer_ID","snapshot_date"],
    how="inner"
).join(
    label_df.select("Customer_ID","snapshot_date","label").alias("y"),
    on=["Customer_ID","snapshot_date"],
    how="inner"   # labels may be missing
)

merged_df.show()

+-----------+-------------+--------------------+--------------------+-------------+---------------------+-----------------+---------------+-------------+-----------+-------------------+----------------------+--------------------+--------------------+----------+------------------------+--------------------+------------------+------------+----+----+----+----+----+----+----+----+----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+
|Customer_ID|snapshot_date|          model_name|   model_predictions|Annual_Income|Monthly_Inhand_Salary|Num_Bank_Accounts|Num_Credit_Card|Interest_Rate|Num_of_Loan|Delay_from_due_date|Num_of_Delayed_Payment|Changed_Credit_Limit|Num_Credit_Inquiries|Credit_Mix|Credit_Utilization_Ratio|   Payment_Behaviour|Credit_History_Age|  Occupation|fe_1|fe_2|fe_3|fe_4|fe_5|fe_6|fe_7|fe_8|fe_9|fe_10|fe_11|fe_12|fe_13|fe_14|fe_15|fe_16|fe_17|fe_18|fe_19|fe_20|label|
+-----------+-------------+--------------------+--------------------+-------------+---

# Data Drift (feature distribution shift)

compare feature distributions now vs a baseline snapshot

Kolmogorov–Smirnov test per numeric feature

p < 0.05 → feature has drifted

KS stat closer to 1 → heavy drift

In [30]:
from scipy.stats import ks_2samp

baseline = merged_df.filter(F.col("snapshot_date") == "2024-11-01").toPandas()
current  = merged_df.filter(F.col("snapshot_date") == "2024-12-01").toPandas()

for col in ["Annual_Income","Num_of_Loan","Credit_History_Age","fe_1","fe_2","fe_3"]:
    stat, p = ks_2samp(baseline[col].dropna(), current[col].dropna())
    print(col, "KS stat:", stat, "p-value:", p)



Annual_Income KS stat: 0.11257651056139721 p-value: 0.010990015007604005
Num_of_Loan KS stat: 0.028982726715724196 p-value: 0.9932526646117849
Credit_History_Age KS stat: 0.06969346264056592 p-value: 0.26547589465285126
fe_1 KS stat: 0.04579184176161506 p-value: 0.767998302949824
fe_2 KS stat: 0.0619696866548252 p-value: 0.40013012873589193
fe_3 KS stat: 0.07980616292958863 p-value: 0.14345904071452828


# Concept Drift (model behavior or learned relationship changed)

In [31]:
df = merged_df.withColumn("prediction_error", F.abs(F.col("label") - F.col("model_predictions")))
df.groupBy("snapshot_date").agg(F.avg("prediction_error")).show()


+-------------+---------------------+
|snapshot_date|avg(prediction_error)|
+-------------+---------------------+
|   2024-11-01|   0.3155889545889979|
|   2024-12-01|    0.299262545102039|
+-------------+---------------------+



In [32]:
import numpy as np

def psi(expected, actual, bins=10):
    e_perc, _ = np.histogram(expected, bins=bins)
    a_perc, _ = np.histogram(actual, bins=bins)
    e_perc = e_perc/len(expected)
    a_perc = a_perc/len(actual)
    psi_val = sum((e - a) * np.log(e/a) for e, a in zip(e_perc, a_perc) if e!=0 and a!=0)
    return psi_val

base = baseline["model_predictions"]
curr = current["model_predictions"]
print("PSI:", psi(base, curr))


PSI: 0.0567709270854883
