# Model Monitoring KPI Example
This notebook demonstrates how to use the `predlab` module to define and analyze Key Performance Indicators (KPIs) across different data segments.

In [12]:
%load_ext autoreload
%autoreload 2

import numpy as np
import pandas as pd

from predlab import (
    SegmentCustom,
)

# Import the new model analyses builder
from predlab.model_analyses import ModelAnalysisDataBuilder

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [17]:
import inspect

import predlab

print("predlab file:", predlab.__file__)
try:
    import predlab.segmentation as seg

    print("has compute_percentile_bins:", hasattr(seg, "compute_percentile_bins"))
    print(
        "SegmentCustom has round_bounds:",
        "round_bounds" in str(inspect.signature(seg.SegmentCustom.__init__)),
    )
    print(
        "apply uses percentile:",
        "compute_percentile_bins" in inspect.getsource(seg.SegmentCustom.apply),
    )
except Exception as e:
    print("segmentation import/inspect error:", e)

predlab file: /home/diego/Dropbox/DropboxGit/VarTester/src/predlab/__init__.py
has compute_percentile_bins: True
SegmentCustom has round_bounds: True
apply uses percentile: True


## 1. Generate Synthetic Data

First, we'll create a synthetic dataset. This dataset mimics a typical insurance scenario with multiple coverages, predictions (risk premiums), and observed outcomes (targets/claims). The data is saved to a parquet file.

In [2]:
# --- Create a Synthetic Dataset ---
np.random.seed(42)
n_samples = 10000

# Create some numerical features
features = pd.DataFrame(
    {
        "age": np.random.randint(18, 70, n_samples),
        "income": np.random.gamma(2, 40000, n_samples),
        "credit_score": np.random.randint(300, 850, n_samples),
        "market_premium": np.random.uniform(0.1, 0.5, n_samples),
    }
)

# Create a categorical feature
features["region"] = np.random.choice(
    ["North", "South", "East", "West"], n_samples, p=[0.3, 0.2, 0.25, 0.25]
)

# --- Generate data for N=3 coverages in a wide format ---
N_coverages = 3
df = features.copy()

for i in range(N_coverages):
    cov_suffix = f"_{chr(65 + i)}"  # e.g., _A, _B, _C

    # --- Generate True Risk (rate for Poisson) ---
    true_risk_formula = (
        -4.0
        + (i * 0.1)
        + df["age"] / (20 + i * 2)
        - df["credit_score"] / (500 + i * 20)
        + df["income"] / 100000
    )
    true_risk_index = np.exp(true_risk_formula)

    # --- Generate a Slightly Incorrect Prediction ---
    prediction_formula = (
        -3.9
        + (i * 0.1)
        + df["age"] / (22 + i * 2)
        - df["credit_score"] / (550 + i * 20)
        + df["income"] / 110000
    )
    df[f"prediction{cov_suffix}"] = np.exp(prediction_formula)
    df[f"prediction{cov_suffix}_comp"] = np.exp(
        prediction_formula + 0.1 * +df["age"] / (22 + i * 2)
    )
    # --- Generate Target (claims) from the true risk ---
    df[f"target{cov_suffix}"] = np.random.poisson(true_risk_index)

# Add a single weight column
df["weight"] = 1.0

# Save to parquet
output_path = "../../data/segmentation_data_comp.parquet"
df.to_parquet(output_path)

print(f"Synthetic data generated and saved to {output_path}")
df.head()

Synthetic data generated and saved to ../../data/segmentation_data_comp.parquet


Unnamed: 0,age,income,credit_score,market_premium,region,prediction_A,prediction_A_comp,target_A,prediction_B,prediction_B_comp,target_B,prediction_C,prediction_C_comp,target_C,weight
0,56,31626.419807,693,0.295256,West,0.097585,0.125872,0,0.091178,0.11514,0,0.087754,0.108845,0,1.0
1,69,94350.67679,565,0.230289,East,0.393303,0.538192,0,0.346978,0.462553,1,0.317891,0.414508,0,1.0
2,46,34226.773664,803,0.112133,South,0.051926,0.064002,0,0.050744,0.061465,0,0.050761,0.060585,0,1.0
3,32,189107.488476,433,0.410269,North,0.220123,0.254587,0,0.221539,0.253137,0,0.226736,0.256432,1,1.0
4,60,32588.089714,325,0.305161,South,0.230528,0.302809,0,0.207231,0.26609,0,0.192646,0.242651,0,1.0


In [3]:
from catboost import CatBoostRegressor

# Define features and target
feature_cols = ["age", "income", "credit_score"]
target_col = "target_A"

X = df[feature_cols]
y = df[target_col]

# Initialize and train the CatBoost Regressor model
# Using some default parameters for this example
cat_model = CatBoostRegressor(
    iterations=200,
    learning_rate=0.05,
    depth=5,
    verbose=50,
    loss_function="Poisson",
)

cat_model.fit(X, y)

print("CatBoost model training complete.")

0:	learn: 0.9188294	total: 50ms	remaining: 9.95s
50:	learn: 0.3913641	total: 252ms	remaining: 736ms
100:	learn: 0.3766409	total: 419ms	remaining: 411ms
150:	learn: 0.3725671	total: 640ms	remaining: 208ms
199:	learn: 0.3690083	total: 798ms	remaining: 0us
CatBoost model training complete.


## 2. Configure the Analysis

Now, we set up the analysis by defining the coverages, segmentation strategies, and data treatments.

### Prediction & Target Mapping
We create a dictionary to map each coverage to its corresponding prediction, target, and weight columns.

In [4]:
pred_dict = {
    "A": {"sel_col": "weight", "pred_col": "prediction_A", "target_col": "target_A"},
    "B": {"sel_col": "weight", "pred_col": "prediction_B", "target_col": "target_B"},
    "C": {"sel_col": "weight", "pred_col": "prediction_C", "target_col": "target_C"},
}

### Segmentation Strategies
We define how to segment the data. We can create segments from categorical features or by binning numerical features.

In [15]:
segments = [
    # Segment for Age with custom bins
    SegmentCustom(
        seg_col="age",
        seg_name="age_group",
        bins=[18, 30, 45, 60, 75],
        bin_labels=["18-29", "30-44", "45-59", "60+"],
    ),
    # Segment for Income with 5 equal-width bins
    SegmentCustom(seg_col="income", seg_name="income_level", bins=5),
    # Segment for Credit Score (binned)
    SegmentCustom(seg_col="credit_score", seg_name="credit_score_level", bins=5),
]
print(f"{len(segments)} segments defined.")

3 segments defined.


### Initialize Analysis and Apply Treatments
We initialize the `AnalysisDataBuilder` object, which orchestrates the data loading, treatment, and segmentation. Treatments are applied to the data, such as iso resources scaling of predictions or aggregating totals.

In [6]:
# Define a configuration dict for model analysis (PDP)
func_dict_pdp = {
    "model": {
        "model": cat_model,
        "name": "model A",
        # Provide feature_cols explicitly so PDP knows input layout
        "feature_cols": ["age", "income", "credit_score"],
    },
    # Optional global weight/target
    "target_col": "target_A",
    "weight_col": "weight",
}

In [19]:
# Initialize ModelAnalysisDataBuilder with the data path
lr_analysis = ModelAnalysisDataBuilder(
    data=output_path, extra_cols=["market_premium"]
)  # extra_cols optional here

# Register analyses
lr_analysis.add_analysis("PDP", func_dict_pdp)

# Add segments (used to derive PDP grids for the corresponding features)
for s in segments:
    lr_analysis.add_segment(s)

# Load data
lr_analysis.load_data()

# Calculate all
lr_analysis.calculate()

# Collect analysis objects
analyses_objs = lr_analysis.get_analyses_objects()

In [18]:
# Inspect DB after calculation
cols = [c for c in lr_analysis.db.columns if "income" in c or "level" in c]
print("DB columns (income-related):", cols)
if "income_level" in lr_analysis.db.columns:
    s = lr_analysis.db["income_level"]
    print("dtype:", s.dtype)
    try:
        cats = list(s.dtype.categories[:5])
        print("categories sample:", cats)
        print("category type:", type(cats[0]))
    except Exception as e:
        print("no categories or error:", e)
    print("value sample:", s.astype(str).head().tolist())
else:
    print("income_level not found in DB")

DB columns (income-related): ['income']
income_level not found in DB


In [20]:
analyses_objs

Unnamed: 0,segment,bin,eval_value,prediction,pdp
0,age_group,18-29,24.0,0.051606,0.051606
1,age_group,30-44,37.5,0.105462,0.105462
2,age_group,45-59,52.5,0.226281,0.226281
3,age_group,60+,67.5,0.463256,0.463256
4,income_level,"[221, 33200)",16698.653094,0.08538,0.08538
5,income_level,"[33200, 54800)",44009.225526,0.111193,0.111193
6,income_level,"[54800, 80300)",67586.423705,0.1311,0.1311
7,income_level,"[80300, 121000)",100652.244478,0.195735,0.195735
8,income_level,"[121000, 548000)",334380.032911,3.031624,3.031624
9,credit_score_level,"[300, 409)",354.5,0.28629,0.28629


In [21]:
# Show unique bins for the income_level PDP to verify labels
try:
    bins_series = analyses_objs.loc[analyses_objs["segment"] == "income_level", "bin"]
    print("unique bins (income_level):", sorted(set(bins_series.tolist()))[:8])
except Exception as e:
    print("Error extracting bins:", e)

unique bins (income_level): ['[121000, 548000)', '[221, 33200)', '[33200, 54800)', '[54800, 80300)', '[80300, 121000)']


In [16]:
# Debug: inspect the dtype and first categories of the income_level segment
seg_name = "income_level"
raw = lr_analysis.db[seg_name]
print("dtype:", raw.dtype)
try:
    cats = list(raw.dtype.categories[:5])
    print("categories sample:", cats)
    print("category type:", type(cats[0]))
except Exception as e:
    print("no categories or error:", e)

# Show unique values (limited)
print("unique sample:", list(pd.Series(raw).astype(str).unique())[:5])

KeyError: 'income_level'