In [1]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
import warnings
import re
warnings.filterwarnings('ignore')

# Set display options
pd.set_option('display.float_format', '{:,.4f}'.format)
pd.set_option('display.max_rows', 50)

df = pd.read_csv("car_age_0_updated_dataset.csv", low_memory=False)
print(f"Initial shape: {df.shape}")

counts = df['od_claim_count'].value_counts().sort_index()
print(counts)

df['policy_start_date'] = pd.to_datetime(df['policy_start_date'])
start_date = df['policy_start_date'].min()
end_date = df['policy_start_date'].max()

print(f"Start date: {start_date}")
print(f"End date:   {end_date}")

print("\n Data Types")
with pd.option_context('display.max_rows', None):
    df.info()

print("\n Top Missing Value Columns")
missing_percent = (df.isnull().sum() / len(df)) * 100
print(missing_percent.sort_values(ascending=False).head(15))

print("\n Numerical Feature Summary")
with pd.option_context('display.max_rows', None, 'display.float_format', '{:,.2f}'.format):
    print(df.describe(include='number').transpose())

Successfully loaded data. Initial shape: (82994, 57)
od_claim_count
0.0000    61784
1.0000    15169
2.0000     4498
3.0000     1092
4.0000      315
5.0000       85
6.0000       31
7.0000       11
8.0000        6
9.0000        3
Name: count, dtype: int64
Start date: 2018-09-10 00:00:00
End date:   2025-09-30 00:00:00

 Data Types
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 82994 entries, 0 to 82993
Data columns (total 57 columns):
 #   Column                      Non-Null Count  Dtype         
---  ------                      --------------  -----         
 0   Unnamed: 0                  82994 non-null  int64         
 1   base_policy                 82994 non-null  object        
 2   od_claim_count              82994 non-null  float64       
 3   car_age                     82994 non-null  float64       
 4   experian_rank_final         77375 non-null  float64       
 5   quarterly_service           37 non-null     float64       
 6   pps_mapped                  81321 non-null 

In [2]:
# 1. Preprocessing + Feature Engineering

# 1a. Drop columns
missing_percent = (df.isnull().sum() / len(df)) * 100
cols_mostly_empty = missing_percent[missing_percent > 70].index.tolist()
unique_counts = df.nunique()
cols_single_value = unique_counts[unique_counts == 1].index.tolist()

cols_to_drop = list(set(cols_mostly_empty + cols_single_value))

cols_to_drop.extend([
    'Unnamed: 0', 
    'policy_start_period', 
    'policy_start_year', 
    'user_agent', 
    'device_vendor_category',
    'base_policy',           
    'recommended_idv',
    'previous_policy_expired',
    'policy_start_month'
])

key_cols = ['od_claim_count', 'exposure', 'policy_start_date']
cols_to_drop = [col for col in cols_to_drop if col not in key_cols]

df_cleaned = df.drop(columns=cols_to_drop)
print(f"Shape after dropping columns: {df_cleaned.shape}")

# 1b. Sanitize Feature Names
original_cols = df_cleaned.columns.tolist()
sanitized_cols = [re.sub(r'[^A-Za-z0-9_]+', '_', col) for col in original_cols]
df_cleaned.columns = sanitized_cols

renamed_cols_dict = {orig: new for orig, new in zip(original_cols, sanitized_cols) if orig != new}
if renamed_cols_dict:
    print("Renamed columns:")
    for orig, new in renamed_cols_dict.items():
        print(f"  '{orig}'  =>  '{new}'")

# 1c. Handle date Column
DATE_COL_NAME = 'policy_start_date'
df_cleaned[DATE_COL_NAME] = pd.to_datetime(df_cleaned[DATE_COL_NAME])

# 1d. Define Feature, Target, and Weight
TARGET = 'od_claim_count'
WEIGHT = 'exposure'
DATE_COL = 'policy_start_date'
features = [col for col in df_cleaned.columns if col not in [TARGET, WEIGHT, DATE_COL]]

# 1e. Handle Categorical Features & NaNs
categorical_features = df_cleaned[features].select_dtypes(include=['object', 'category']).columns.tolist()

for col in categorical_features:
    df_cleaned[col] = df_cleaned[col].astype('category')

# 1f. Print Feature Lists
print(f"\nFinal Features for Model Training ({len(features)})")
numerical_features = df_cleaned[features].select_dtypes(include=np.number).columns.tolist()
print(f"\nNumerical Features ({len(numerical_features)}):")
for col in numerical_features:
    print(f"  - {col}")

print(f"\nCategorical Features ({len(categorical_features)}):")
for col in categorical_features:
    print(f"  - {col}")

Shape after dropping columns: (82994, 31)
Renamed columns:
  'Product+Plan'  =>  'Product_Plan'

Final Features for Model Training (28)

Numerical Features (13):
  - experian_rank_final
  - variant_bracket_mapped
  - cc_group_ordinal
  - customer_age_group_ordinal
  - hit_flag_service
  - personal_loan_flag
  - embedded_red_flag
  - is_rsa
  - ex_showroom_price
  - customer_age
  - exposure_calculated
  - cc
  - intermediary_id

Categorical Features (15):
  - corrected_body_type
  - fuel_type
  - transmission_type
  - city_mapped
  - Product_Type2
  - make_mapped
  - model
  - variant_bracket
  - cc_group
  - customer_age_group
  - Product_Plan
  - make
  - policy_created_on
  - Status2
  - recommended_idv_grouped


In [3]:
# 2. Time-Based Data Splitting
print("\nSplitting data into new Train, Validation, and OOT sets...")

TRAIN_START = '2023-01-01'
TRAIN_END = '2024-08-31'
VALID_START = '2024-09-01'
VALID_END = '2024-12-31'
OOT_START = '2025-01-01'
OOT_END = '2025-03-31'

# Create the sets
train_df = df_cleaned[
    (df_cleaned[DATE_COL] >= TRAIN_START) &
    (df_cleaned[DATE_COL] <= TRAIN_END)
].copy()

test_df = df_cleaned[
    (df_cleaned[DATE_COL] >= VALID_START) &
    (df_cleaned[DATE_COL] <= VALID_END)
].copy()

oot_df = df_cleaned[
    (df_cleaned[DATE_COL] >= OOT_START) &
    (df_cleaned[DATE_COL] <= OOT_END)
].copy()

print(f"  Train set shape: {train_df.shape} (Dates: {train_df[DATE_COL].min().date()} to {train_df[DATE_COL].max().date()})")
print(f"  Test set shape: {test_df.shape} (Dates: {test_df[DATE_COL].min().date()} to {test_df[DATE_COL].max().date()})")
print(f"  OOT set shape:   {oot_df.shape} (Dates: {oot_df[DATE_COL].min().date()} to {oot_df[DATE_COL].max().date()})")


Splitting data into new Train, Validation, and OOT sets...
  Train set shape: (36558, 31) (Dates: 2023-01-01 to 2024-08-31)
  Test set shape: (8614, 31) (Dates: 2024-09-01 to 2024-12-31)
  OOT set shape:   (5809, 31) (Dates: 2025-01-01 to 2025-03-31)


In [4]:
# 3. Model Training (LightGBM Poisson Regressor)

# Prepare data splits
X_train, y_train, w_train = train_df[features], train_df[TARGET], train_df[WEIGHT]
X_test, y_test, w_test = test_df[features], test_df[TARGET], test_df[WEIGHT]
X_oot, y_oot, w_oot = oot_df[features], oot_df[TARGET], oot_df[WEIGHT]

lgb_params = {
    'objective': 'poisson',
    'metric': 'poisson',
    'n_estimators': 4000,         
    'learning_rate': 0.01,       
    'feature_fraction': 0.8,
    'bagging_fraction': 0.8,
    'bagging_freq': 1,
    'num_leaves': 20,            
    'min_data_in_leaf': 500,     
    'n_jobs': -1,
    'seed': 42,
    'boosting_type': 'gbdt',
}

print("\nTraining LightGBM model (Recent Window)...")

model = lgb.LGBMRegressor(**lgb_params)

model.fit(
    X_train, y_train,
    sample_weight=w_train,
    eval_set=[(X_test, y_test)],
    eval_sample_weight=[w_test],
    eval_metric='poisson',
    callbacks=[lgb.early_stopping(150, verbose=100)],
    categorical_feature=categorical_features
)

print("Model training complete.")

# Prediction
print("Generating predictions...")
pred_train = model.predict(X_train) 
pred_test = model.predict(X_test) 
pred_oot = model.predict(X_oot)


Training LightGBM model (Recent Window)...
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001520 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 630
[LightGBM] [Info] Number of data points in the train set: 36558, number of used features: 26
[LightGBM] [Info] Start training from score -0.909493
Training until validation scores don't improve for 150 rounds
Early stopping, best iteration is:
[568]	valid_0's poisson: 0.696702
Model training complete.
Generating predictions...


In [5]:
# 4. Decile Analysis Function
def create_decile_analysis(y_true, y_pred_count, exposure, set_name):
    print(f"\n- Decile Analysis: {set_name} Set -")
    
    eval_df = pd.DataFrame({
        'actual_count': y_true,
        'predicted_count': y_pred_count,
        'exposure': exposure
    })
    eval_df = eval_df[eval_df['exposure'] > 0].copy()
    
    if eval_df.empty:
        print(f"Warning: No data with exposure > 0 in {set_name} set. Skipping analysis.")
        return

    eval_df['predicted_ir_record'] = eval_df['predicted_count'] / eval_df['exposure']
    
    try:
        eval_df['decile'] = pd.qcut(eval_df['predicted_ir_record'], 10, labels=False, duplicates='drop')
    except ValueError:
        print("Warning: Could not create 10 unique deciles. Using fewer bins.")
        try:
            eval_df['decile'] = pd.qcut(eval_df['predicted_ir_record'], 5, labels=False, duplicates='drop') # Fallback
        except ValueError:
             print("Error: Could not create any deciles. Data may be too uniform. Skipping decile table.")
             return

    decile_groups = eval_df.groupby('decile')
    
    decile_summary = pd.DataFrame({
        'Policies': decile_groups.size(),
        'Total Exposure': decile_groups['exposure'].sum(),
        'Actual Claims': decile_groups['actual_count'].sum(),
        'Predicted Claims': decile_groups['predicted_count'].sum()
    })
    
    decile_summary['Actual IR'] = decile_summary['Actual Claims'] / decile_summary['Total Exposure']
    decile_summary['Predicted IR'] = decile_summary['Predicted Claims'] / decile_summary['Total Exposure']
    decile_summary['Delta'] = decile_summary['Actual IR'] - decile_summary['Predicted IR']
    
    mean_abs_delta = (decile_summary['Actual IR'] - decile_summary['Predicted IR']).abs().mean() * 100

    format_cols_pct = ['Actual IR', 'Predicted IR', 'Delta']
    for col in format_cols_pct:
        decile_summary[col] = (decile_summary[col] * 100).map('{:,.2f}%'.format)

    display_cols = ['Policies', 'Actual IR', 'Predicted IR', 'Delta']
    print(decile_summary[display_cols])
    
    overall_actual_ir = eval_df['actual_count'].sum() / eval_df['exposure'].sum()
    overall_pred_ir = eval_df['predicted_count'].sum() / eval_df['exposure'].sum()
    
    print(f"\nOverall Actual IR:    {overall_actual_ir:.4%}")
    print(f"Overall Predicted IR: {overall_pred_ir:.4%}")
    print(f"Mean Absolute Delta:  {mean_abs_delta:.4f}%") 
    
    return mean_abs_delta

# 5. Run Analysis & Report
create_decile_analysis(y_train, pred_train, w_train, "Train")
create_decile_analysis(y_test, pred_test, w_test, "Test")
create_decile_analysis(y_oot, pred_oot, w_oot, "OOT")


- Decile Analysis: Train Set -
        Policies Actual IR Predicted IR   Delta
decile                                         
0           3656    17.29%       23.86%  -6.57%
1           3656    22.72%       28.88%  -6.16%
2           3656    26.84%       31.95%  -5.11%
3           3655    31.96%       34.63%  -2.67%
4           3656    35.85%       37.12%  -1.26%
5           3656    39.38%       39.71%  -0.33%
6           3655    41.79%       42.68%  -0.89%
7           3656    50.49%       46.46%   4.02%
8           3656    56.74%       51.88%   4.86%
9           3656    78.99%       64.77%  14.23%

Overall Actual IR:    40.2071%
Overall Predicted IR: 40.1939%
Mean Absolute Delta:  4.6105%

- Decile Analysis: Test Set -
        Policies Actual IR Predicted IR    Delta
decile                                          
0            862    23.73%       25.57%   -1.84%
1            861    26.55%       31.70%   -5.15%
2            861    31.82%       35.62%   -3.80%
3            862    32.

26.07182475602491

In [6]:
# 6. Post-Model Analysis
def create_feature_level_analysis(X_data, y_true, y_pred_count, exposure, model, set_name):
    """
    Generates a feature-level analysis for top model features.
    """
    print(f"\n--- Feature-Level Analysis: {set_name} Set ---")
    
    # Combine all data needed
    eval_df = X_data.copy()
    eval_df['actual_count'] = y_true
    eval_df['predicted_count'] = y_pred_count
    eval_df['exposure'] = exposure
    
    eval_df = eval_df[eval_df['exposure'] > 0].copy()
    
    if eval_df.empty:
        print(f"Warning: No data in {set_name} set for this analysis.")
        return
        
    try:
        top_features = model.booster_.feature_name()[:10]
        print(f"Analyzing top 10 features: {top_features}")
    except Exception as e:
        print(f"Could not get feature names, skipping: {e}")
        return

    for col in top_features:
        if col not in eval_df.columns:
            print(f"Skipping '{col}', not in X_data.")
            continue
            
        print(f"\nAnalyzing Feature: '{col}'")
        
        feature_data = eval_df[col]
        if feature_data.isnull().any():
            if feature_data.dtype.name == 'category':
                feature_data = feature_data.cat.add_categories('__NaN__').fillna('__NaN__')
            else:
                feature_data = feature_data.fillna('__NaN__')
                
        groups = eval_df.groupby(feature_data)
        
        summary_df = pd.DataFrame({
            'Policies': groups.size(),
            'Total Exposure': groups['exposure'].sum(),
            'Actual Claims': groups['actual_count'].sum(),
            'Predicted Claims': groups['predicted_count'].sum()
        })
        
        summary_df['Actual IR'] = summary_df['Actual Claims'] / summary_df['Total Exposure']
        summary_df['Predicted IR'] = summary_df['Predicted Claims'] / summary_df['Total Exposure']
        summary_df['Delta'] = summary_df['Actual IR'] - summary_df['Predicted IR']
        
        summary_df = summary_df.sort_values(by='Policies', ascending=False)
        
        format_cols_pct = ['Actual IR', 'Predicted IR', 'Delta']
        for pct_col in format_cols_pct:
            summary_df[pct_col] = (summary_df[pct_col] * 100).map('{:,.2f}%'.format)

        with pd.option_context('display.max_rows', 25):
             print(summary_df[['Policies', 'Actual IR', 'Predicted IR', 'Delta']].head(20))
        
        if len(summary_df) > 20:
            print(f"  ... (and {len(summary_df) - 20} more values)")

# Run the analysis on the OOT set
print("\nRunning Feature-Level Analysis on OOT Set...")
create_feature_level_analysis(
    X_oot, 
    y_oot, 
    pred_oot, 
    w_oot, 
    model, 
    "OOT (2025 Data)"
)


Running Feature-Level Analysis on OOT Set...

--- Feature-Level Analysis: OOT (2025 Data) Set ---
Analyzing top 10 features: ['experian_rank_final', 'variant_bracket_mapped', 'cc_group_ordinal', 'customer_age_group_ordinal', 'hit_flag_service', 'personal_loan_flag', 'embedded_red_flag', 'is_rsa', 'ex_showroom_price', 'corrected_body_type']

Analyzing Feature: 'experian_rank_final'
                     Policies Actual IR Predicted IR    Delta
experian_rank_final                                          
7.0000                   1887    36.66%       59.48%  -22.82%
5.0000                   1459    41.33%       69.64%  -28.30%
6.0000                   1251    38.69%       66.01%  -27.32%
 __NaN__                  443    33.33%       55.47%  -22.14%
4.0000                    423    42.25%       68.46%  -26.20%
3.0000                    329    42.46%       69.76%  -27.29%
2.0000                     17    37.29%       83.31%  -46.02%

Analyzing Feature: 'variant_bracket_mapped'
            