# Model Monitoring KPI Example
This notebook demonstrates how to use the `model_monitoring` module to define and analyze Key Performance Indicators (KPIs) across different data segments.

In [1]:
# Install missing dependencies
%pip install pyarrow scipy fastparquet matplotlib seaborn plotly

# Workaround for PyArrow extension type conflict
import warnings
warnings.filterwarnings('ignore', category=UserWarning, module='pyarrow')

# Clear any existing PyArrow extension types to avoid conflicts
try:
    import pyarrow as pa
    # This helps avoid the extension type registration conflict
    import importlib
    if hasattr(pa, '_extension_types'):
        pa._extension_types.clear()
except:
    pass

%load_ext autoreload
%autoreload 2

import numpy as np
import pandas as pd

from model_monitoring import (
    AnalysisDataBuilder,
    SegmentCategorical,
    SegmentCustom,
    calculate_statistics,
)
from model_monitoring.plotting import plot_segment_statistics, set_plot_theme

print("✓ All imports successful! model_monitoring package is now available.")

Collecting scipy
  Using cached scipy-1.16.1-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (61 kB)
Collecting scipy
  Using cached scipy-1.16.1-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (61 kB)
Using cached scipy-1.16.1-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (35.2 MB)
Using cached scipy-1.16.1-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (35.2 MB)
Installing collected packages: scipy
Installing collected packages: scipy
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
vartester 0.1.0 requires catboost>=1.2.8, which is not installed.
vartester 0.1.0 requires interpret==0.6.12, which is not installed.
vartester 0.1.0 requires mkdocs>=1.6.1, which is not installed.
vartester 0.1.0 requires mkdocs-material>=9.6.15, which is not installed.
vartester 0.1.0 requires mkdocstrings-pyth

ModuleNotFoundError: No module named 'matplotlib'

## 1. Generate Synthetic Data

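First, we'll create a synthetic dataset that mimics a typical insurance scenario with multiple coverages: for each coverage it contains a model prediction (risk premium) and an observed outcome (target, i.e. claim count). The data is saved to a Parquet file at `../data/raw/segmentation_data.parquet`; if both Parquet engines fail, it is saved as a pickle file instead.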

In [None]:
# --- Create a Synthetic Dataset ---
# Builds `features` (4 numeric columns + `region`) and the wide frame `df`,
# which adds a prediction/target column pair per coverage plus a unit weight.
np.random.seed(42)  # legacy global seed: every draw below consumes this one stream
n_samples = 10_000

# Numerical features. The draw order (age, income, credit_score,
# market_premium) determines the values produced under the fixed seed.
features = pd.DataFrame(
    dict(
        age=np.random.randint(18, 70, size=n_samples),
        income=np.random.gamma(shape=2, scale=40000, size=n_samples),
        credit_score=np.random.randint(300, 850, size=n_samples),
        market_premium=np.random.uniform(low=0.1, high=0.5, size=n_samples),
    )
)

# Categorical feature, drawn with unequal region probabilities.
features["region"] = np.random.choice(
    ["North", "South", "East", "West"], size=n_samples, p=[0.3, 0.2, 0.25, 0.25]
)


def _log_rate(frame, cov_idx, intercept, age_div, credit_div, income_div):
    """Return the per-row log rate for coverage number ``cov_idx``.

    The result is ``intercept + 0.1 * cov_idx`` plus an age term, minus a
    credit-score term, plus an income term. The age and credit-score divisors
    grow with ``cov_idx`` (by 2 and 20 per coverage respectively). Terms are
    combined in a fixed left-to-right order.
    """
    return (
        intercept
        + (cov_idx * 0.1)
        + frame["age"] / (age_div + cov_idx * 2)
        - frame["credit_score"] / (credit_div + cov_idx * 20)
        + frame["income"] / income_div
    )


# --- Generate data for N=3 coverages in a wide format ---
N_coverages = 3
df = features.copy()

for i in range(N_coverages):
    cov_suffix = "_" + chr(ord("A") + i)  # _A, _B, _C

    # True risk: the Poisson rate the targets are drawn from.
    true_risk_formula = _log_rate(df, i, -4.0, 20, 500, 100000)
    true_risk_index = np.exp(true_risk_formula)

    # Prediction: same functional form with slightly different coefficients,
    # so the "model" is deliberately a little wrong.
    prediction_formula = _log_rate(df, i, -3.9, 22, 550, 110000)
    df["prediction" + cov_suffix] = np.exp(prediction_formula)

    # Target (claims) sampled from the true risk, not from the prediction.
    df["target" + cov_suffix] = np.random.poisson(lam=true_risk_index)

# Single weight column shared by all coverages.
df["weight"] = 1.0

# Save to parquet with error handling
# Persists `df` to disk, trying pyarrow, then fastparquet, then pickle.
# `output_path` ends up pointing at whichever file was actually written.
from pathlib import Path  # stdlib; imported here so this step is self-contained

output_path = "../data/raw/segmentation_data.parquet"

# The writers below do not create missing directories. Without this, a fresh
# checkout lacking ../data/raw would fail in all three fallbacks for the same
# reason.
Path(output_path).parent.mkdir(parents=True, exist_ok=True)

try:
    # Try with pyarrow engine first
    df.to_parquet(output_path, engine='pyarrow')
    print(f"Synthetic data generated and saved to {output_path} (using pyarrow)")
except Exception as e:
    # Broad catch is deliberate: any engine failure should fall through to the
    # next writer rather than stop the notebook.
    print(f"PyArrow failed with error: {e}")
    try:
        # Fallback to fastparquet engine
        df.to_parquet(output_path, engine='fastparquet')
        print(f"Synthetic data generated and saved to {output_path} (using fastparquet)")
    except Exception as e2:
        print(f"FastParquet also failed: {e2}")
        # Final fallback to pickle
        output_path_pickle = "../data/raw/segmentation_data.pkl"
        df.to_pickle(output_path_pickle)
        print(f"Saved as pickle instead: {output_path_pickle}")
        # Update output_path for consistency
        # NOTE(review): any later cell that reads `output_path` with a parquet
        # reader will not be able to read this pickle file — verify downstream.
        output_path = output_path_pickle

df.head()

ArrowKeyError: A type extension with name pandas.period already defined