# setup and configuration

In [43]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import os
import pickle

# --- Notebook display configuration ---
# Suppress every warning category so cell output stays readable.
warnings.filterwarnings('ignore')
# Render floats with three decimals, show up to 200 rows, never truncate columns.
pd.set_option('display.float_format', lambda value: f'{value:.3f}')
pd.set_option('display.max_rows', 200)
pd.set_option('display.max_columns', None)
plt.style.use('seaborn-v0_8-darkgrid')

# --- Reproducibility ---
# One seed shared by NumPy's global RNG and every estimator that accepts
# `random_state` further down in the notebook.
RANDOM_STATE = 123
os.environ["PYTHONHASHSEED"] = str(RANDOM_STATE)
np.random.seed(RANDOM_STATE)

# --- Model/Metrics ---
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.metrics import f1_score, classification_report, confusion_matrix, roc_auc_score
from sklearn.metrics import precision_recall_curve, average_precision_score, precision_score, recall_score
from sklearn.linear_model import LogisticRegression
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

# --- ML Models ---
import lightgbm as lgb
import xgboost as xgb

# --- Tuning ---
try:
    import optuna
except ImportError:
    print("Installing Optuna...")
    !pip install optuna -q
    import optuna

optuna.logging.set_verbosity(optuna.logging.WARNING)

try:
    import imblearn
except ImportError:
    print("Installing imbalanced-learn...")
    !pip install imbalanced-learn -q
    import imblearn

# --- Feature Engineering Helpers ---
# REMOVED pyzipcode - we won't use 'state' feature due to data quality issues
# try:
#     from pyzipcode import ZipCodeDatabase
# except ImportError:
#     print("Installing pyzipcode...")
#     !pip install pyzipcode -q
#     from pyzipcode import ZipCodeDatabase

print("\n‚úì Setup complete. All libraries loaded.")


‚úì Setup complete. All libraries loaded.


# data loading


In [44]:
# --- Load Data ---
# Produces `train_df`, `test_df` and `test_ids` for the rest of the notebook.
# Detect Google Colab by trying to import its helper package.
try:
    import google.colab
    IN_COLAB = True
except ImportError:
    IN_COLAB = False

if IN_COLAB:
    from google.colab import files
    try:
        # Files may already be present in the Colab working directory.
        train_df = pd.read_csv('Training_TriGuard.csv')
        test_df = pd.read_csv('Testing_TriGuard.csv')
        print("‚úì Files loaded from local environment.")
    except FileNotFoundError:
        # Otherwise ask the user to upload them through the browser widget.
        print("Please upload Training_TriGuard.csv:")
        uploaded_train = files.upload()
        train_file = list(uploaded_train.keys())[0]
        train_df = pd.read_csv(train_file)

        print("\nPlease upload Testing_TriGuard.csv:")
        uploaded_test = files.upload()
        test_file = list(uploaded_test.keys())[0]
        test_df = pd.read_csv(test_file)
        print("‚úì Files uploaded successfully.")
else:
    # Local environment: probe a few conventional locations for the training
    # file; the testing file is expected under the matching name beside it.
    possible_paths = [
        'Training_TriGuard.csv',
        'data/Training_TriGuard.csv',
        '../Training_TriGuard.csv',
        './Training_TriGuard.csv'
    ]

    train_path = None
    test_path = None

    for path in possible_paths:
        candidate_test = path.replace('Training', 'Testing')
        # Accept a location only when BOTH files exist. Previously the paths
        # were assigned before the test file was checked, so a training file
        # without its testing sibling left a non-existent `test_path` behind,
        # skipped the prompt below and crashed in `pd.read_csv`.
        if os.path.exists(path) and os.path.exists(candidate_test):
            train_path = path
            test_path = candidate_test
            break

    if train_path and test_path:
        train_df = pd.read_csv(train_path)
        test_df = pd.read_csv(test_path)
        print(f"‚úì Files loaded from: {train_path} and {test_path}")
    else:
        # Nothing usable was found automatically: fall back to asking the user.
        train_path = input("Enter path to Training_TriGuard.csv: ").strip()
        test_path = input("Enter path to Testing_TriGuard.csv: ").strip()
        train_df = pd.read_csv(train_path)
        test_df = pd.read_csv(test_path)
        print("‚úì Files loaded successfully.")

# --- Critical Cleaning ---
# Rows without a target cannot be used for supervised training, so drop them
# and report how many were removed.
initial_train_count = len(train_df)
train_df = train_df.dropna(subset=['subrogation'])
print(f"\nCleaned training data: Removed {initial_train_count - len(train_df)} rows with NaN target.")

# With the NaN targets gone the label can be stored as an integer.
train_df['subrogation'] = train_df['subrogation'].astype(int)

print(f"\n‚úì Train shape: {train_df.shape}")
print(f"‚úì Test shape: {test_df.shape}")
print(f"\nTarget distribution (after cleaning):")
print(train_df['subrogation'].value_counts(normalize=True).to_string())

# Keep an independent copy of the test identifiers before any later cell
# drops or transforms columns of `test_df`.
test_ids = test_df['claim_number'].copy()

print("‚úì Data loading complete.")

‚úì Files loaded from local environment.

Cleaned training data: Removed 2 rows with NaN target.

‚úì Train shape: (17999, 29)
‚úì Test shape: (12000, 28)

Target distribution (after cleaning):
subrogation
0   0.771
1   0.229
‚úì Data loading complete.


# feature engineering


In [46]:
def feature_engineer(df):
    """Derive the full engineered feature set from a raw claims DataFrame.

    The input is copied first, so the caller's frame is not modified. Columns
    are appended in a fixed order; code that builds a feature list from the
    returned frame's column order therefore depends on the statement order here.

    vehicle_made_year / vehicle_age / state are deliberately NOT used
    (data quality issues).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns read below: claim_date, year_of_born,
        witness_present_ind, policy_report_filed_ind, accident_type,
        accident_site, liab_prct, claim_est_payout, vehicle_mileage,
        vehicle_price, vehicle_weight, vehicle_category, annual_income,
        age_of_DL, safety_rating, past_num_of_claims, email_or_tel_available,
        high_education_ind, address_change_ind, living_status, gender,
        channel and in_network_bodyshop.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with ``claim_date`` converted to datetime,
        out-of-range ``year_of_born`` values set to NaN, and all derived
        feature columns added.

    Notes
    -----
    Threshold flags are built as ``(comparison).astype(int)``; a comparison
    against NaN is False, so rows with a missing source value get 0 in every
    such flag rather than NaN.
    """
    df_fe = df.copy()

    # ========================================================================
    # TEMPORAL FEATURES
    # ========================================================================
    # Unparseable dates become NaT (errors='coerce'), which yields NaN parts below.
    df_fe['claim_date'] = pd.to_datetime(df_fe['claim_date'], errors='coerce')
    df_fe['claim_year'] = df_fe['claim_date'].dt.year
    df_fe['claim_month'] = df_fe['claim_date'].dt.month
    df_fe['claim_day'] = df_fe['claim_date'].dt.day
    df_fe['claim_quarter'] = df_fe['claim_date'].dt.quarter
    df_fe['claim_dayofweek'] = df_fe['claim_date'].dt.dayofweek
    # dayofweek is 0=Monday ... 6=Sunday, so >= 5 means Saturday/Sunday.
    df_fe['is_weekend'] = (df_fe['claim_dayofweek'] >= 5).astype(int)
    df_fe['is_monday'] = (df_fe['claim_dayofweek'] == 0).astype(int)
    df_fe['is_friday'] = (df_fe['claim_dayofweek'] == 4).astype(int)
    df_fe['is_q4'] = (df_fe['claim_quarter'] == 4).astype(int)

    # NEW: Time-of-day features from Doc 8
    # NOTE(review): if claim_date carries no time component, the hour is 0 for
    # every parsed row, making late_night constant 1 and rush_hour constant 0
    # — confirm the raw column actually includes a time.
    df_fe['claim_hour'] = df_fe['claim_date'].dt.hour
    df_fe['rush_hour'] = df_fe['claim_hour'].isin([7, 8, 9, 16, 17, 18]).astype(int)
    df_fe['late_night'] = df_fe['claim_hour'].isin([0, 1, 2, 3, 4, 5]).astype(int)

    # Month number -> meteorological season; rows with no month map to 'Unknown'.
    season_map = {
        3: 'Spring', 4: 'Spring', 5: 'Spring',
        6: 'Summer', 7: 'Summer', 8: 'Summer',
        9: 'Fall', 10: 'Fall', 11: 'Fall',
        12: 'Winter', 1: 'Winter', 2: 'Winter'
    }
    df_fe['season'] = df_fe['claim_month'].map(season_map).fillna('Unknown')

    # ========================================================================
    # DATA CLEANING
    # ========================================================================
    # Birth years outside 1900-2025 are treated as invalid and blanked out.
    df_fe.loc[(df_fe['year_of_born'] < 1900) | (df_fe['year_of_born'] > 2025), 'year_of_born'] = np.nan

    # ========================================================================
    # BINARY CONVERSIONS (for interactions)
    # ========================================================================
    df_fe['witness_binary'] = (df_fe['witness_present_ind'] == 'Y').astype(int)
    # Copied as-is, unlike witness_binary above.
    # NOTE(review): assumes policy_report_filed_ind is already numeric 0/1 — TODO confirm.
    df_fe['police_binary'] = df_fe['policy_report_filed_ind']
    df_fe['multicar_binary'] = df_fe['accident_type'].isin(['multi_vehicle_clear', 'multi_vehicle_unclear']).astype(int)
    df_fe['highrisk_site_binary'] = df_fe['accident_site'].isin(['Highway/Intersection', 'Local']).astype(int)

    # ========================================================================
    # CRITICAL INTERACTION FEATURES (2-way)
    # ========================================================================
    df_fe['liab_x_witness'] = df_fe['liab_prct'] * df_fe['witness_binary']
    df_fe['liab_x_police'] = df_fe['liab_prct'] * df_fe['police_binary']
    df_fe['liab_x_multicar'] = df_fe['liab_prct'] * df_fe['multicar_binary']
    df_fe['liab_x_highrisk_site'] = df_fe['liab_prct'] * df_fe['highrisk_site_binary']
    df_fe['liab_x_evidence'] = df_fe['liab_prct'] * (df_fe['witness_binary'] + df_fe['police_binary'])
    df_fe['liab_x_payout'] = df_fe['liab_prct'] * df_fe['claim_est_payout']
    df_fe['liab_x_mileage'] = df_fe['liab_prct'] * df_fe['vehicle_mileage']

    df_fe['witness_x_police'] = df_fe['witness_binary'] * df_fe['police_binary']
    df_fe['witness_x_multicar'] = df_fe['witness_binary'] * df_fe['multicar_binary']
    df_fe['police_x_multicar'] = df_fe['police_binary'] * df_fe['multicar_binary']
    df_fe['multicar_x_highrisk'] = df_fe['multicar_binary'] * df_fe['highrisk_site_binary']
    df_fe['weekend_highway'] = (df_fe['claim_dayofweek'] >= 5).astype(int) * (df_fe['accident_site'] == 'Highway/Intersection').astype(int)

    # 3-way interaction
    df_fe['witness_police_multicar'] = df_fe['witness_binary'] * df_fe['police_binary'] * df_fe['multicar_binary']

    # ========================================================================
    # POLYNOMIAL FEATURES (liability & key variables)
    # ========================================================================
    df_fe['liab_prct_squared'] = df_fe['liab_prct'] ** 2
    df_fe['liab_prct_cubed'] = df_fe['liab_prct'] ** 3
    df_fe['liab_prct_sqrt'] = np.sqrt(df_fe['liab_prct'])
    df_fe['liab_prct_log'] = np.log1p(df_fe['liab_prct'])
    # "inverse" here means the complement on a 0-100 scale (100 - liab_prct).
    df_fe['liab_inverse'] = 100 - df_fe['liab_prct']
    df_fe['liab_inverse_squared'] = (100 - df_fe['liab_prct']) ** 2

    df_fe['log_claim_est_payout'] = np.log1p(df_fe['claim_est_payout'])
    df_fe['log_vehicle_mileage'] = np.log1p(df_fe['vehicle_mileage'])
    df_fe['log_vehicle_price'] = np.log1p(df_fe['vehicle_price'])
    df_fe['log_annual_income'] = np.log1p(df_fe['annual_income'])
    df_fe['sqrt_vehicle_mileage'] = np.sqrt(df_fe['vehicle_mileage'])

    # ========================================================================
    # ACCIDENT TYPE FEATURES
    # ========================================================================
    df_fe['is_multi_vehicle_clear'] = (df_fe['accident_type'] == 'multi_vehicle_clear').astype(int)
    df_fe['is_multi_vehicle_unclear'] = (df_fe['accident_type'] == 'multi_vehicle_unclear').astype(int)
    df_fe['is_single_car'] = (df_fe['accident_type'] == 'single_car').astype(int)
    # Alias of multicar_binary.
    df_fe['has_recovery_target'] = df_fe['multicar_binary']

    # Ordinal clarity: 3 = multi-vehicle clear, 1 = multi-vehicle unclear, 0 = anything else.
    df_fe['recovery_case_clarity'] = 0
    df_fe.loc[df_fe['is_multi_vehicle_clear'] == 1, 'recovery_case_clarity'] = 3
    df_fe.loc[df_fe['is_multi_vehicle_unclear'] == 1, 'recovery_case_clarity'] = 1

    # ========================================================================
    # LIABILITY BUCKETS (fine-grained)
    # ========================================================================
    # Half-open, non-overlapping bins [low, high) over liab_prct.
    df_fe['liab_under_10'] = (df_fe['liab_prct'] < 10).astype(int)
    df_fe['liab_10_to_15'] = ((df_fe['liab_prct'] >= 10) & (df_fe['liab_prct'] < 15)).astype(int)
    df_fe['liab_15_to_20'] = ((df_fe['liab_prct'] >= 15) & (df_fe['liab_prct'] < 20)).astype(int)
    df_fe['liab_20_to_25'] = ((df_fe['liab_prct'] >= 20) & (df_fe['liab_prct'] < 25)).astype(int)
    df_fe['liab_25_to_30'] = ((df_fe['liab_prct'] >= 25) & (df_fe['liab_prct'] < 30)).astype(int)
    df_fe['liab_30_to_35'] = ((df_fe['liab_prct'] >= 30) & (df_fe['liab_prct'] < 35)).astype(int)
    df_fe['liab_35_to_40'] = ((df_fe['liab_prct'] >= 35) & (df_fe['liab_prct'] < 40)).astype(int)
    df_fe['liab_40_to_50'] = ((df_fe['liab_prct'] >= 40) & (df_fe['liab_prct'] < 50)).astype(int)
    df_fe['liab_over_50'] = (df_fe['liab_prct'] >= 50).astype(int)

    # Coarser, cumulative fault levels (minimal_fault and low_fault overlap by design of the thresholds).
    df_fe['not_at_fault'] = df_fe['liab_under_10']
    df_fe['minimal_fault'] = (df_fe['liab_prct'] < 25).astype(int)
    df_fe['low_fault'] = (df_fe['liab_prct'] < 35).astype(int)
    df_fe['shared_fault'] = ((df_fe['liab_prct'] >= 35) & (df_fe['liab_prct'] < 50)).astype(int)
    df_fe['high_fault'] = (df_fe['liab_prct'] >= 50).astype(int)

    # ========================================================================
    # EVIDENCE QUALITY FEATURES
    # ========================================================================
    # Aliases of the binary indicators built above.
    df_fe['witness_present'] = df_fe['witness_binary']
    df_fe['police_report'] = df_fe['police_binary']

    # none = neither source, weak = exactly one, strong = both,
    # very_strong = both AND liability below 20.
    df_fe['evidence_none'] = ((df_fe['witness_present'] == 0) & (df_fe['police_report'] == 0)).astype(int)
    df_fe['evidence_weak'] = (((df_fe['witness_present'] == 1) & (df_fe['police_report'] == 0)) |
                              ((df_fe['witness_present'] == 0) & (df_fe['police_report'] == 1))).astype(int)
    df_fe['evidence_strong'] = ((df_fe['witness_present'] == 1) & (df_fe['police_report'] == 1)).astype(int)
    df_fe['evidence_very_strong'] = ((df_fe['witness_present'] == 1) & (df_fe['police_report'] == 1) &
                                      (df_fe['liab_prct'] < 20)).astype(int)
    df_fe['evidence_score'] = df_fe['witness_present'] + df_fe['police_report']

    # ========================================================================
    # ACCIDENT SITE FEATURES
    # ========================================================================
    df_fe['high_risk_site'] = df_fe['highrisk_site_binary']
    df_fe['parking_accident'] = (df_fe['accident_site'] == 'Parking Area').astype(int)
    df_fe['unknown_site'] = (df_fe['accident_site'] == 'Unknown').astype(int)
    df_fe['highway_accident'] = (df_fe['accident_site'] == 'Highway/Intersection').astype(int)
    df_fe['local_accident'] = (df_fe['accident_site'] == 'Local').astype(int)

    # ========================================================================
    # DRIVER AGE & EXPERIENCE
    # ========================================================================
    # Age at claim time; values outside 16-100 are treated as invalid.
    df_fe['driver_age'] = df_fe['claim_year'] - df_fe['year_of_born']
    df_fe.loc[(df_fe['driver_age'] < 16) | (df_fe['driver_age'] > 100), 'driver_age'] = np.nan

    df_fe['young_driver'] = ((df_fe['driver_age'] >= 16) & (df_fe['driver_age'] <= 25)).astype(int)
    df_fe['prime_driver'] = ((df_fe['driver_age'] > 25) & (df_fe['driver_age'] <= 45)).astype(int)
    df_fe['middle_age_driver'] = ((df_fe['driver_age'] > 45) & (df_fe['driver_age'] <= 65)).astype(int)
    df_fe['senior_driver'] = (df_fe['driver_age'] > 65).astype(int)

    # Negative differences are clipped to 0 here, so the `< 0` assignment on the
    # next line can never match any row.
    # NOTE(review): decide whether negative experience should be 0 (clip) or NaN.
    df_fe['driving_experience'] = (df_fe['driver_age'] - df_fe['age_of_DL']).clip(lower=0)
    df_fe.loc[df_fe['driving_experience'] < 0, 'driving_experience'] = np.nan

    df_fe['novice_driver'] = (df_fe['driving_experience'] < 3).astype(int)
    df_fe['experienced_driver'] = ((df_fe['driving_experience'] >= 3) & (df_fe['driving_experience'] <= 10)).astype(int)
    df_fe['veteran_driver'] = (df_fe['driving_experience'] > 10).astype(int)

    df_fe['experience_x_safety'] = df_fe['driving_experience'] * df_fe['safety_rating']
    df_fe['driver_age_x_safety'] = df_fe['driver_age'] * df_fe['safety_rating']

    # NEW: Driver risk interactions from Doc 8
    df_fe['young_novice'] = df_fe['young_driver'] * df_fe['novice_driver']

    # ========================================================================
    # VEHICLE FEATURES (without vehicle_age)
    # ========================================================================
    df_fe['luxury_vehicle'] = (df_fe['vehicle_price'] > 50000).astype(int)
    df_fe['mid_price_vehicle'] = ((df_fe['vehicle_price'] >= 20000) & (df_fe['vehicle_price'] <= 50000)).astype(int)
    df_fe['economy_vehicle'] = (df_fe['vehicle_price'] < 20000).astype(int)

    df_fe['heavy_vehicle'] = (df_fe['vehicle_weight'] > 30000).astype(int)
    df_fe['light_vehicle'] = (df_fe['vehicle_weight'] < 15000).astype(int)
    df_fe['medium_weight'] = ((df_fe['vehicle_weight'] >= 15000) & (df_fe['vehicle_weight'] <= 30000)).astype(int)

    df_fe['is_large_vehicle'] = (df_fe['vehicle_category'] == 'Large').astype(int)
    df_fe['is_compact_vehicle'] = (df_fe['vehicle_category'] == 'Compact').astype(int)
    df_fe['is_medium_vehicle'] = (df_fe['vehicle_category'] == 'Medium').astype(int)

    # ========================================================================
    # CLAIM CHARACTERISTICS
    # ========================================================================
    df_fe['high_mileage'] = (df_fe['vehicle_mileage'] > 100000).astype(int)
    df_fe['low_mileage'] = (df_fe['vehicle_mileage'] < 50000).astype(int)
    df_fe['very_high_mileage'] = (df_fe['vehicle_mileage'] > 150000).astype(int)
    df_fe['medium_mileage'] = ((df_fe['vehicle_mileage'] >= 50000) & (df_fe['vehicle_mileage'] <= 100000)).astype(int)

    df_fe['frequent_claimer'] = (df_fe['past_num_of_claims'] > 5).astype(int)
    df_fe['moderate_claimer'] = ((df_fe['past_num_of_claims'] >= 1) & (df_fe['past_num_of_claims'] <= 5)).astype(int)
    df_fe['first_time_claimer'] = (df_fe['past_num_of_claims'] == 0).astype(int)
    df_fe['very_frequent_claimer'] = (df_fe['past_num_of_claims'] > 10).astype(int)

    df_fe['large_payout'] = (df_fe['claim_est_payout'] > 5000).astype(int)
    df_fe['medium_payout'] = ((df_fe['claim_est_payout'] >= 2000) & (df_fe['claim_est_payout'] <= 5000)).astype(int)
    df_fe['small_payout'] = (df_fe['claim_est_payout'] < 2000).astype(int)
    df_fe['very_large_payout'] = (df_fe['claim_est_payout'] > 8000).astype(int)

    # Despite the "_x_" name this is a ratio: safety rating per (1 + prior claims).
    df_fe['safety_x_prior_claims'] = df_fe['safety_rating'] / (1 + df_fe['past_num_of_claims'])
    df_fe['mileage_x_claims'] = df_fe['vehicle_mileage'] * df_fe['past_num_of_claims']

    # NEW: Claims risk interactions from Doc 8
    df_fe['senior_frequent_claimer'] = df_fe['senior_driver'] * df_fe['frequent_claimer']
    df_fe['low_safety_high_claims'] = ((df_fe['safety_rating'] < 60) & (df_fe['past_num_of_claims'] > 3)).astype(int)

    # ========================================================================
    # RATIO FEATURES
    # ========================================================================
    # Denominators get +1 so a zero value cannot cause a division by zero.
    df_fe['payout_to_price_ratio'] = df_fe['claim_est_payout'] / (df_fe['vehicle_price'] + 1)
    df_fe['severe_damage'] = (df_fe['payout_to_price_ratio'] > 0.3).astype(int)
    df_fe['moderate_damage'] = ((df_fe['payout_to_price_ratio'] >= 0.1) & (df_fe['payout_to_price_ratio'] <= 0.3)).astype(int)
    df_fe['minor_damage'] = (df_fe['payout_to_price_ratio'] < 0.1).astype(int)

    df_fe['income_to_vehicle_price'] = df_fe['annual_income'] / (df_fe['vehicle_price'] + 1)
    df_fe['can_afford_vehicle'] = (df_fe['income_to_vehicle_price'] >= 0.5).astype(int)
    df_fe['expensive_for_income'] = (df_fe['income_to_vehicle_price'] < 0.3).astype(int)

    df_fe['claims_per_year_driving'] = df_fe['past_num_of_claims'] / (df_fe['driving_experience'] + 1)
    df_fe['claim_frequency_high'] = (df_fe['claims_per_year_driving'] > 0.5).astype(int)

    df_fe['safety_to_liability'] = df_fe['safety_rating'] / (df_fe['liab_prct'] + 1)
    df_fe['payout_to_income'] = df_fe['claim_est_payout'] / (df_fe['annual_income'] + 1)
    df_fe['mileage_to_price'] = df_fe['vehicle_mileage'] / (df_fe['vehicle_price'] + 1)
    df_fe['weight_to_price'] = df_fe['vehicle_weight'] / (df_fe['vehicle_price'] + 1)

    # ========================================================================
    # POLICYHOLDER CHARACTERISTICS
    # ========================================================================
    df_fe['high_income'] = (df_fe['annual_income'] > 70000).astype(int)
    df_fe['mid_income'] = ((df_fe['annual_income'] >= 40000) & (df_fe['annual_income'] <= 70000)).astype(int)
    df_fe['low_income'] = (df_fe['annual_income'] < 40000).astype(int)
    df_fe['very_high_income'] = (df_fe['annual_income'] > 100000).astype(int)

    df_fe['high_safety_rating'] = (df_fe['safety_rating'] > 80).astype(int)
    df_fe['low_safety_rating'] = (df_fe['safety_rating'] < 60).astype(int)
    df_fe['very_high_safety'] = (df_fe['safety_rating'] > 90).astype(int)
    df_fe['medium_safety'] = ((df_fe['safety_rating'] >= 60) & (df_fe['safety_rating'] <= 80)).astype(int)

    # Plain copies of raw indicator columns under friendlier names.
    df_fe['contact_available'] = df_fe['email_or_tel_available']
    df_fe['has_education'] = df_fe['high_education_ind']
    df_fe['recent_move'] = df_fe['address_change_ind']
    df_fe['home_owner'] = (df_fe['living_status'] == 'Own').astype(int)
    df_fe['renter'] = (df_fe['living_status'] == 'Rent').astype(int)
    df_fe['female'] = (df_fe['gender'] == 'F').astype(int)

    # ========================================================================
    # CHANNEL FEATURES
    # ========================================================================
    df_fe['via_broker'] = (df_fe['channel'] == 'Broker').astype(int)
    df_fe['via_online'] = (df_fe['channel'] == 'Online').astype(int)
    df_fe['via_phone'] = (df_fe['channel'] == 'Phone').astype(int)
    df_fe['in_network_repair'] = (df_fe['in_network_bodyshop'] == 'yes').astype(int)
    df_fe['out_network_repair'] = (df_fe['in_network_bodyshop'] == 'no').astype(int)

    # ========================================================================
    # COMPOSITE RECOVERY SCORES
    # ========================================================================
    # Hand-weighted blend of liability, recovery target, evidence, clarity and
    # site components; the component weights below sum to 1.0.
    liability_score = np.sqrt((100 - df_fe['liab_prct']) / 100.0)
    evidence_score_composite = (df_fe['evidence_none'] * 0.0 + df_fe['evidence_weak'] * 0.4 +
                      df_fe['evidence_strong'] * 0.7 + df_fe['evidence_very_strong'] * 1.0)
    clarity_score = df_fe['recovery_case_clarity'] / 3.0
    site_score = df_fe['high_risk_site'] * 0.7 + (1 - df_fe['unknown_site']) * 0.3

    df_fe['recovery_feasibility_score'] = (0.35 * liability_score + 0.30 * df_fe['has_recovery_target'] +
                                           0.20 * evidence_score_composite + 0.10 * clarity_score + 0.05 * site_score)

    # NEW: Alternative recovery potential score from Doc 8
    df_fe['recovery_potential'] = (
        (100 - df_fe['liab_prct']) * 0.4 +
        df_fe['evidence_score'] * 20 * 0.3 +
        df_fe['multicar_binary'] * 30 * 0.2 +
        (df_fe['claim_est_payout'] / 100) * 0.1
    )

    # ========================================================================
    # DOMAIN LOGIC FLAGS (CRITICAL FOR F1)
    # ========================================================================
    df_fe['perfect_case'] = ((df_fe['liab_prct'] < 15) & (df_fe['witness_present'] == 1) &
                             (df_fe['police_report'] == 1) & (df_fe['has_recovery_target'] == 1)).astype(int)

    df_fe['strong_case'] = ((df_fe['liab_prct'] < 25) & (df_fe['evidence_strong'] == 1) &
                            (df_fe['has_recovery_target'] == 1)).astype(int)

    df_fe['good_case'] = ((df_fe['liab_prct'] < 35) & (df_fe['evidence_score'] >= 1) &
                          (df_fe['has_recovery_target'] == 1)).astype(int)

    df_fe['weak_case'] = ((df_fe['liab_prct'] > 40) | (df_fe['is_single_car'] == 1) |
                          (df_fe['evidence_none'] == 1)).astype(int)

    df_fe['no_case'] = ((df_fe['liab_prct'] > 60) | ((df_fe['is_single_car'] == 1) & (df_fe['evidence_none'] == 1))).astype(int)

    df_fe['high_value_opportunity'] = ((df_fe['claim_est_payout'] > 3000) & (df_fe['liab_prct'] < 30) &
                                       (df_fe['has_recovery_target'] == 1)).astype(int)

    df_fe['slam_dunk_case'] = ((df_fe['liab_prct'] < 10) & (df_fe['witness_present'] == 1) &
                               (df_fe['police_report'] == 1) & (df_fe['multicar_binary'] == 1) &
                               (df_fe['high_risk_site'] == 1)).astype(int)

    df_fe['low_liab_high_payout'] = ((df_fe['liab_prct'] < 20) & (df_fe['claim_est_payout'] > 5000)).astype(int)
    df_fe['clear_fault_case'] = ((df_fe['liab_prct'] < 15) & (df_fe['multicar_binary'] == 1)).astype(int)
    df_fe['high_mileage_low_fault'] = ((df_fe['vehicle_mileage'] > 100000) & (df_fe['liab_prct'] < 30)).astype(int)

    # NEW: More interaction flags from Doc 8
    df_fe['low_liab_witness_police'] = ((df_fe['liab_prct'] < 20) & (df_fe['witness_binary'] == 1) &
                                         (df_fe['police_binary'] == 1)).astype(int)
    df_fe['multicar_low_liab'] = ((df_fe['multicar_binary'] == 1) & (df_fe['liab_prct'] < 25)).astype(int)
    df_fe['high_payout_evidence'] = ((df_fe['claim_est_payout'] > 5000) & (df_fe['evidence_score'] >= 1)).astype(int)
    df_fe['severe_damage_low_fault'] = ((df_fe['payout_to_price_ratio'] > 0.3) & (df_fe['liab_prct'] < 30)).astype(int)
    df_fe['minor_damage_high_fault'] = ((df_fe['payout_to_price_ratio'] < 0.1) & (df_fe['liab_prct'] > 50)).astype(int)

    # --- Temporal & Behavior Dynamics ---
    df_fe['claim_early_in_year'] = (df_fe['claim_month'] <= 3).astype(int)
    df_fe['claim_end_of_year'] = (df_fe['claim_month'] >= 10).astype(int)
    df_fe['weekend_parking'] = df_fe['is_weekend'] * (df_fe['accident_site'] == 'Parking Area').astype(int)
    df_fe['winter_claim_high_payout'] = ((df_fe['season'] == 'Winter') & (df_fe['claim_est_payout'] > 5000)).astype(int)

    # --- Vehicle Utilization Proxies (without vehicle_age) ---
    df_fe['mileage_x_weight'] = df_fe['vehicle_mileage'] * df_fe['vehicle_weight']
    # Same expression as 'mileage_to_price' above, so the two columns are identical.
    df_fe['mileage_per_dollar'] = df_fe['vehicle_mileage'] / (df_fe['vehicle_price'] + 1)
    df_fe['payout_to_weight'] = df_fe['claim_est_payout'] / (df_fe['vehicle_weight'] + 1)

    # --- Policyholder Risk Profile ---
    df_fe['unstable_policyholder'] = ((df_fe['recent_move'] == 1) & (df_fe['renter'] == 1)).astype(int)
    df_fe['financial_stress_risk'] = ((df_fe['expensive_for_income'] == 1) & (df_fe['large_payout'] == 1)).astype(int)
    df_fe['young_driver_highway'] = df_fe['young_driver'] * df_fe['highway_accident']
    df_fe['senior_driver_parking'] = df_fe['senior_driver'] * df_fe['parking_accident']

    # --- Liability & Evidence Interaction Insights ---
    df_fe['low_liab_weak_evidence'] = ((df_fe['liab_prct'] < 20) & (df_fe['evidence_weak'] == 1)).astype(int)
    df_fe['high_liab_strong_evidence'] = ((df_fe['liab_prct'] > 50) & (df_fe['evidence_strong'] == 1)).astype(int)

    # Composite confidence / case quality index
    df_fe['case_confidence_score'] = (
        0.4 * (100 - df_fe['liab_prct']) / 100 +
        0.4 * df_fe['evidence_score'] / 2 +
        0.2 * df_fe['recovery_case_clarity'] / 3
    )

    # --- Statistical Normalization & Percentile Features ---
    # Mean/std are taken from the frame passed in, so separate calls (e.g. one
    # for train, one for test) standardize with different statistics.
    # The 1e-9 term guards against a zero standard deviation.
    for col in ['claim_est_payout', 'vehicle_mileage', 'annual_income']:
        df_fe[f'{col}_z'] = (df_fe[col] - df_fe[col].mean()) / (df_fe[col].std() + 1e-9)

    # Decile index (0-based) per row; bin edges likewise come from this frame only.
    # Any failure in either qcut silently sets BOTH columns to NaN.
    try:
        df_fe['liab_percentile'] = pd.qcut(df_fe['liab_prct'], 10, labels=False, duplicates='drop')
        df_fe['payout_percentile'] = pd.qcut(df_fe['claim_est_payout'], 10, labels=False, duplicates='drop')
    except Exception:
        df_fe['liab_percentile'] = np.nan
        df_fe['payout_percentile'] = np.nan

    # --- Aggregate / Hybrid Indices ---
    df_fe['case_strength_index'] = df_fe['evidence_score'] * (1 - df_fe['liab_prct'] / 100)
    df_fe['financial_exposure_index'] = (
        (df_fe['claim_est_payout'] / (df_fe['annual_income'] + 1)) * (1 + df_fe['liab_prct'] / 100)
    )
    df_fe['behavioral_risk_index'] = (
        df_fe['claims_per_year_driving'] * (100 - df_fe['safety_rating']) / 100
    )

    return df_fe

print("‚úì Feature engineering function defined (190+ features)")

‚úì Feature engineering function defined (190+ features)


# pre-modeling with target encoding


In [47]:
# Pre-modeling: engineer features, target-encode the categoricals, assemble the
# final design matrices (X_all / X_test_all), impute, and derive the class weight.
print("="*80)
print("Running Feature Engineering on train and test data...")

train_fe = feature_engineer(train_df)
test_fe = feature_engineer(test_df)
print("‚úì Feature engineering complete.")

# Define Categorical Feature Lists
features_to_target_encode = [
    'gender', 'living_status', 'accident_site',
    'channel', 'vehicle_category', 'vehicle_color', 'accident_type',
    'in_network_bodyshop', 'season', 'zip_code'
]

# Apply Target Encoding
print(f"\nApplying Smoothed Target Encoding to {len(features_to_target_encode)} features...")
global_mean = train_fe['subrogation'].mean()
# Pseudo-count pulling small categories toward the global positive rate.
# It does not depend on the column, so it is set once instead of per iteration.
smoothing = 20
# Names of the generated *_target_enc columns. Despite the variable name these
# are numeric encodings, not categorical-dtype columns.
categorical_features_for_lgbm = []

# NOTE(review): the encoding for training rows is computed from those rows' own
# targets (no out-of-fold scheme), which can leak label information, most of
# all for a high-cardinality column such as zip_code.
for col in features_to_target_encode:
    target_mean = train_fe.groupby(col)['subrogation'].mean()
    category_counts = train_fe.groupby(col).size()

    # Shrinkage estimate: (sum of targets + prior * m) / (n + m).
    smoothed_mean = (target_mean * category_counts + global_mean * smoothing) / (category_counts + smoothing)

    new_col_name = f'{col}_target_enc'
    # groupby drops NaN keys, so rows whose category is missing map to NaN.
    # Fill them with the prior in BOTH frames. Previously only the test frame
    # was filled, and missing training rows were later given the column median
    # by the generic imputation step, so the same "missing category" state was
    # encoded differently in train and test.
    train_fe[new_col_name] = train_fe[col].map(smoothed_mean).fillna(global_mean)
    # Categories unseen in training also fall back to the prior.
    test_fe[new_col_name] = test_fe[col].map(smoothed_mean).fillna(global_mean)

    categorical_features_for_lgbm.append(new_col_name)

print("‚úì Target encoding complete.")

# Create Final X, y, and X_test
y_all = train_fe['subrogation'].copy()

# Columns excluded from the model: target, identifiers, raw fields that were
# re-expressed as engineered features, and the raw target-encoded categoricals.
drop_cols = [
    'subrogation', 'claim_number', 'claim_date', 'year_of_born',
    'witness_present_ind', 'policy_report_filed_ind',
    'vehicle_made_year',  # Bad data quality
    'claim_hour'  # Drop raw hour (we keep rush_hour and late_night flags)
]
drop_cols.extend(features_to_target_encode)

# Feature order follows the training frame's column order; the test matrix is
# built with the same list so both share an identical layout.
feature_cols = [col for col in train_fe.columns if col not in drop_cols]
X_all = train_fe[feature_cols].copy()
X_test_all = test_fe[feature_cols].copy()

# Apply Label Encoding (if any object columns remain)
other_cat_cols = X_all.select_dtypes(include='object').columns.tolist()
if other_cat_cols:
    print(f"\nApplying Label Encoding to {len(other_cat_cols)} remaining features...")
    for col in other_cat_cols:
        le = LabelEncoder()
        # Fit on the union of train and test values so transform() cannot hit
        # an unseen label; astype(str) turns NaN into the literal 'nan' class.
        all_values = pd.concat([X_all[col].astype(str), X_test_all[col].astype(str)]).unique()
        le.fit(all_values)
        X_all[col] = le.transform(X_all[col].astype(str))
        X_test_all[col] = le.transform(X_test_all[col].astype(str))
    print("‚úì Label encoding complete.")

# Impute NaN values with median
# Medians come from the training matrix only and are reused for the test matrix.
print("\nImputing NaN values with the median from the training data...")
X_all_median = X_all.median()
X_all = X_all.fillna(X_all_median)
X_test_all = X_test_all.fillna(X_all_median)
print("‚úì NaN values imputed.")

# Calculate scale_pos_weight
# Ratio of negatives to positives in the training target.
scale_pos_weight = (y_all == 0).sum() / (y_all == 1).sum()

print("\n" + "="*80)
print("PRE-MODELING COMPLETE")
print(f"‚úì X_all shape: {X_all.shape}")
print(f"‚úì y_all shape: {y_all.shape}")
print(f"‚úì X_test_all shape: {X_test_all.shape}")
print(f"‚úì Total features: {len(feature_cols)}")
print(f"‚úì scale_pos_weight (for F1 score): {scale_pos_weight:.4f}")

Running Feature Engineering on train and test data...
‚úì Feature engineering complete.

Applying Smoothed Target Encoding to 10 features...
‚úì Target encoding complete.

Applying Label Encoding to 1 remaining features...
‚úì Label encoding complete.

Imputing NaN values with the median from the training data...
‚úì NaN values imputed.

PRE-MODELING COMPLETE
‚úì X_all shape: (17999, 201)
‚úì y_all shape: (17999,)
‚úì X_test_all shape: (12000, 201)
‚úì Total features: 201
‚úì scale_pos_weight (for F1 score): 3.3740


# optuna + smote




In [49]:
# Install if needed
# SMOTE comes from imbalanced-learn. If the import fails, install the package
# with the pip that belongs to the interpreter running this kernel
# (`sys.executable -m pip`); a bare `pip` resolved via PATH may install into a
# different Python environment, leaving the retry import below to fail.
try:
    from imblearn.over_sampling import SMOTE
except ImportError:
    print("\nInstalling imbalanced-learn...")
    import subprocess
    import sys
    subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'imbalanced-learn', '-q'])
    from imblearn.over_sampling import SMOTE
    print("‚úì imbalanced-learn installed")

import json
import numpy as np
import optuna
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import (
    average_precision_score, roc_auc_score, precision_recall_curve,
    f1_score, precision_score, recall_score, confusion_matrix
)
import pandas as pd

# Section banner: blank line, rule, title, rule.
print(
    "\n" + "="*90,
    "STEP 1: OPTUNA HYPERPARAMETER TUNING WITH SMOTE",
    "="*90,
    sep="\n",
)

def objective_smote(trial):
    """Optuna objective: mean PR-AUC of LightGBM over 5 SMOTE-balanced CV folds.

    Reads the notebook-level ``X_all``, ``y_all`` and ``RANDOM_STATE``. In each
    fold only the training rows are oversampled; the held-out rows keep their
    original class mix and are used both as the early-stopping set and as the
    data the fold's average precision is computed on. After every fold the
    running mean is reported to the trial so the study's pruner can stop it.

    Args:
        trial: Optuna trial used to draw hyperparameters and report
            intermediate values.

    Returns:
        float: mean average precision over the evaluated folds.

    Raises:
        optuna.TrialPruned: when the pruner asks for the trial to be stopped.
    """
    # Suggestions are drawn in the same order as the keys appear below.
    lgbm_kwargs = dict(
        n_estimators=trial.suggest_int('n_estimators', 500, 2000),
        learning_rate=trial.suggest_float('learning_rate', 0.01, 0.1, log=True),
        num_leaves=trial.suggest_int('num_leaves', 20, 100),
        max_depth=trial.suggest_int('max_depth', 4, 15),
        min_child_samples=trial.suggest_int('min_child_samples', 10, 100),
        subsample=trial.suggest_float('subsample', 0.6, 1.0),
        colsample_bytree=trial.suggest_float('colsample_bytree', 0.6, 1.0),
        reg_alpha=trial.suggest_float('reg_alpha', 0.0, 1.0),
        reg_lambda=trial.suggest_float('reg_lambda', 0.0, 1.0),
        # NOTE: scale_pos_weight is intentionally absent when using SMOTE
        random_state=RANDOM_STATE,
        verbose=-1,
        n_jobs=-1,
    )

    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)
    oversampler = SMOTE(random_state=RANDOM_STATE, sampling_strategy=1.0)  # 1:1 class balance
    ap_by_fold = []

    for step, (fit_rows, holdout_rows) in enumerate(cv.split(X_all, y_all), start=1):
        X_fit, y_fit = X_all.iloc[fit_rows], y_all.iloc[fit_rows]
        X_holdout, y_holdout = X_all.iloc[holdout_rows], y_all.iloc[holdout_rows]

        # Synthetic rows are generated from the training part of the fold only.
        X_balanced, y_balanced = oversampler.fit_resample(X_fit, y_fit)

        clf = lgb.LGBMClassifier(**lgbm_kwargs)
        clf.fit(
            X_balanced, y_balanced,
            eval_set=[(X_holdout, y_holdout)],
            eval_metric="auc",
            callbacks=[lgb.early_stopping(100, verbose=False), lgb.log_evaluation(period=0)],
        )

        # Score with the early-stopped iteration when one was recorded.
        stopped_at = getattr(clf, "best_iteration_", None)
        if stopped_at is None:
            holdout_proba = clf.predict_proba(X_holdout)[:, 1]
        else:
            holdout_proba = clf.predict_proba(X_holdout, num_iteration=stopped_at)[:, 1]

        ap_by_fold.append(average_precision_score(y_holdout, holdout_proba))

        # Report the running mean so the pruner can compare partial trials.
        trial.report(float(np.mean(ap_by_fold)), step=step)
        if trial.should_prune():
            raise optuna.TrialPruned()

    return float(np.mean(ap_by_fold))

# Run the SMOTE-aware Optuna search and collect the winning hyperparameters.
print("\nRunning Optuna with SMOTE (50 trials)...")
print("This will take longer due to synthetic sample generation...")

# The study is stored in a local SQLite file and created with
# load_if_exists=True.
# NOTE(review): presumably this makes a re-run of the cell continue the stored
# study (adding another 50 trials) rather than start from scratch — confirm
# against the Optuna docs, and delete the .db file when the objective changes.
storage_smote = "sqlite:///lgbm_optuna_smote.db"
study_smote = optuna.create_study(
    direction="maximize",
    study_name="lgbm_smote_prauc",
    storage=storage_smote,
    load_if_exists=True,
    pruner=optuna.pruners.SuccessiveHalvingPruner(min_resource=1, reduction_factor=3)
)

# A manual interrupt is swallowed on purpose so the cell carries on with
# whatever trials have finished so far.
# NOTE(review): if no trial has completed yet, the best_value / best_params
# lookups below will presumably fail — verify.
try:
    study_smote.optimize(objective_smote, n_trials=50, show_progress_bar=True)
except KeyboardInterrupt:
    pass

print("\n" + "="*90)
print("OPTUNA RESULTS (SMOTE)")
print("="*90)
print(f"‚úì Best Mean PR-AUC (5-fold, SMOTE): {study_smote.best_value:.4f}")
print("\n‚úì Best parameters:")
for k, v in study_smote.best_params.items():
    print(f"  - {k}: {v}")

# best_params holds only the values suggested inside the objective, so the
# fixed settings are added back here. 'n_jobs' from the objective is not
# re-added.
best_lgbm_params_smote = study_smote.best_params.copy()
best_lgbm_params_smote.update({
    'random_state': RANDOM_STATE,
    'verbose': -1
})



STEP 1: OPTUNA HYPERPARAMETER TUNING WITH SMOTE

Running Optuna with SMOTE (50 trials)...
This will take longer due to synthetic sample generation...


  0%|          | 0/50 [00:00<?, ?it/s]


OPTUNA RESULTS (SMOTE)
‚úì Best Mean PR-AUC (5-fold, SMOTE): 0.6063

‚úì Best parameters:
  - n_estimators: 1531
  - learning_rate: 0.038660328981200656
  - num_leaves: 100
  - max_depth: 4
  - min_child_samples: 82
  - subsample: 0.7274453329581444
  - colsample_bytree: 0.6636273421672354
  - reg_alpha: 0.7603683245942066
  - reg_lambda: 0.7904730731479439


# calibrated oof


In [51]:
## CELL 6: GET ROBUST CALIBRATED OOF PREDICTIONS (LGBM + SMOTE)
# Defines the OOF helper and runs it with the tuned parameters; the banner
# below marks the start of this step in the cell output.

print("="*90)
print("STEP 2: GENERATE CALIBRATED OOF PREDICTIONS WITH SMOTE")
print("="*90)

from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.calibration import CalibratedClassifierCV
# NEW: Import F1 metrics
from sklearn.metrics import (
    precision_recall_curve, f1_score, precision_score, recall_score,
    roc_auc_score, average_precision_score
)
import lightgbm as lgb
import numpy as np
import pandas as pd

def _best_f1_from_pr_curve(y_true, proba):
    """Return ``(threshold, f1)`` for the F1-maximizing point of the PR curve."""
    prec, rec, thr = precision_recall_curve(y_true, proba)
    # F1 = 2PR / (P + R); the epsilon only guards the 0/0 case.
    f1s = 2 * (prec * rec) / (prec + rec + 1e-12)
    # The final precision/recall pair has no matching threshold, so it is
    # excluded from the search to keep indices aligned with `thr`.
    best_idx = int(np.nanargmax(f1s[:-1]))
    return float(thr[best_idx]), float(f1s[best_idx])


def get_calibrated_oof_preds_lgbm_smote(params, X, y, random_state=123):
    """
    5-fold CV with SMOTE and calibration inside each fold.

    For every fold the training part is split again: 20% is held back for
    sigmoid calibration and 15% of the remainder for early stopping. SMOTE is
    applied only to what is left, so early stopping, calibration and the
    out-of-fold (OOF) predictions all use original, non-synthetic rows.

    Args:
        params: keyword arguments forwarded to ``lgb.LGBMClassifier``.
        X: feature DataFrame, indexed positionally via ``.iloc``.
        y: binary target Series aligned with ``X`` (1 = positive class).
        random_state: seed shared by the fold splitter, the inner
            train/calibration/early-stopping splits and SMOTE.

    Returns:
        tuple: ``(oof_preds, global_thresh)`` where ``oof_preds`` is an array
        with one calibrated positive-class probability per row of ``X`` and
        ``global_thresh`` is the threshold that maximizes F1 over all OOF
        predictions.
    """
    n_splits = 5
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    oof_preds = np.zeros(len(y), dtype=float)
    smote = SMOTE(random_state=random_state, sampling_strategy=1.0)

    print(f"\nGenerating calibrated OOF predictions with SMOTE...")
    print(f"Total samples: {len(y)}")
    print(f"Original class balance: {(y==1).sum()/len(y)*100:.2f}% positive\n")

    fold_metrics = []

    for fold, (train_idx, val_idx) in enumerate(skf.split(X, y), 1):
        print(f"{'='*90}")
        print(f"Fold {fold}/{n_splits}")
        print(f"{'='*90}")

        X_train_fold, X_val_fold = X.iloc[train_idx], X.iloc[val_idx]
        y_train_fold, y_val_fold = y.iloc[train_idx], y.iloc[val_idx]

        print(f"  Original fold: {len(X_train_fold)} train, {len(X_val_fold)} val")

        # Split for train/calibration (80/20)
        X_tr, X_cal, y_tr, y_cal = train_test_split(
            X_train_fold, y_train_fold,
            test_size=0.20,
            random_state=random_state,
            stratify=y_train_fold
        )

        # Further split for early stopping
        X_tr_base, X_es, y_tr_base, y_es = train_test_split(
            X_tr, y_tr,
            test_size=0.15,
            random_state=random_state,
            stratify=y_tr
        )

        print(f"  Before SMOTE: {len(X_tr_base)} samples ({(y_tr_base==1).sum()} pos, {(y_tr_base==0).sum()} neg)")

        # Apply SMOTE to the innermost training part only
        X_tr_resampled, y_tr_resampled = smote.fit_resample(X_tr_base, y_tr_base)
        print(f"  After SMOTE:  {len(X_tr_resampled)} samples ({(y_tr_resampled==1).sum()} pos, {(y_tr_resampled==0).sum()} neg)")
        print(f"  Ratio increase: {len(X_tr_resampled)/len(X_tr_base):.2f}x")

        # Train on SMOTE data, early-stop on original (non-synthetic) rows
        base_model = lgb.LGBMClassifier(**params)
        base_model.fit(
            X_tr_resampled, y_tr_resampled,
            eval_set=[(X_es, y_es)],
            eval_metric="auc",
            callbacks=[lgb.early_stopping(100, verbose=False), lgb.log_evaluation(period=0)]
        )

        best_iter = base_model.best_iteration_ if hasattr(base_model, 'best_iteration_') else params.get('n_estimators', 1000)
        print(f"  Base model trained (best iteration: {best_iter})")

        # Calibrate the already-fitted model on the original calibration set
        calibrated_model = CalibratedClassifierCV(base_model, method='sigmoid', cv='prefit')
        calibrated_model.fit(X_cal, y_cal)
        print(f"  Model calibrated on {len(X_cal)} original samples")

        # Get OOF predictions on original validation data
        oof_preds[val_idx] = calibrated_model.predict_proba(X_val_fold)[:, 1]

        # Best achievable F1 (and its threshold) on this fold's validation rows
        fold_thresh, fold_f1 = _best_f1_from_pr_curve(y_val_fold, oof_preds[val_idx])

        # Threshold-free fold performance
        fold_auc = roc_auc_score(y_val_fold, oof_preds[val_idx])
        fold_ap = average_precision_score(y_val_fold, oof_preds[val_idx])

        fold_metrics.append({
            'fold': fold,
            'auc': fold_auc,
            'ap': fold_ap,
            'f1': fold_f1,
            'threshold': fold_thresh
        })

        print(f"  Fold ROC-AUC: {fold_auc:.4f} | PR-AUC: {fold_ap:.4f} | Best F1: {fold_f1:.4f} (at Thresh={fold_thresh:.4f})")
        print(f"  ‚úì Fold {fold} complete\n")

    print("="*90)
    print("‚úì CALIBRATED OOF PREDICTIONS (SMOTE) COMPLETE")
    print("="*90)

    # Overall metrics
    oof_auc = roc_auc_score(y, oof_preds)
    oof_ap = average_precision_score(y, oof_preds)

    print(f"\nOverall OOF Performance (Threshold-Free):")
    print(f"  ROC-AUC: {oof_auc:.4f}")
    print(f"  PR-AUC:  {oof_ap:.4f}")

    print(f"\nPer-Fold Breakdown:")
    print(f"{'Fold':<8} {'ROC-AUC':<10} {'PR-AUC':<10} {'Best F1':<10} {'Threshold':<10}")
    print("-"*50)
    for m in fold_metrics:
        print(f"{m['fold']:<8} {m['auc']:<10.4f} {m['ap']:<10.4f} {m['f1']:<10.4f} {m['threshold']:<10.4f}")

    # Global OOF F1 optimization
    print("\n" + "="*90)
    print("GLOBAL OOF THRESHOLD OPTIMIZATION (on Calibrated SMOTE Probs)")
    print("="*90)

    # Same search as per fold, so the global and fold numbers are comparable.
    global_thresh, global_f1 = _best_f1_from_pr_curve(y, oof_preds)

    global_preds = (oof_preds >= global_thresh).astype(int)
    global_prec = precision_score(y, global_preds)
    global_rec = recall_score(y, global_preds)

    print(f"üéØ Best Global Threshold: {global_thresh:.4f}")
    print(f"   - OOF F1 Score:  {global_f1:.4f}")
    print(f"   - OOF Precision: {global_prec:.4f}")
    print(f"   - OOF Recall:    {global_rec:.4f}")

    print(f"\nOOF Probability Statistics:")
    print(f"  Mean:   {oof_preds.mean():.4f}")
    print(f"  Std:    {oof_preds.std():.4f}")
    print(f"  Median: {np.median(oof_preds):.4f}")
    print(f"  25th percentile: {np.percentile(oof_preds, 25):.4f}")
    print(f"  75th percentile: {np.percentile(oof_preds, 75):.4f}")
    print(f"  Min:    {oof_preds.min():.4f}")
    print(f"  Max:    {oof_preds.max():.4f}")

    return oof_preds, global_thresh

# Generate OOF and get the final threshold
# The tuned n_estimators is overwritten in place with 2000 before the call; the
# helper trains with early stopping (patience 100), so this acts as an upper
# bound on the number of trees.
best_lgbm_params_smote['n_estimators'] = 2000
oof_lgbm_calibrated_smote, final_lgbm_threshold = get_calibrated_oof_preds_lgbm_smote(
    best_lgbm_params_smote,
    X_all,
    y_all,
    random_state=RANDOM_STATE
)

# NEW: Print a final confirmation of the threshold to be used
# Note: the threshold-selection cell that follows reassigns
# final_lgbm_threshold from its own search over the same OOF probabilities.
print("\n" + "="*90)
print(f"‚úÖ FINAL THRESHOLD SELECTED: {final_lgbm_threshold:.4f}")
print("="*90)

STEP 2: GENERATE CALIBRATED OOF PREDICTIONS WITH SMOTE

Generating calibrated OOF predictions with SMOTE...
Total samples: 17999
Original class balance: 22.86% positive

Fold 1/5
  Original fold: 14399 train, 3600 val
  Before SMOTE: 9791 samples (2239 pos, 7552 neg)
  After SMOTE:  15104 samples (7552 pos, 7552 neg)
  Ratio increase: 1.54x
  Base model trained (best iteration: 241)
  Model calibrated on 2880 original samples
  Fold ROC-AUC: 0.8309 | PR-AUC: 0.5808 | Best F1: 0.5828 (at Thresh=0.2152)
  ‚úì Fold 1 complete

Fold 2/5
  Original fold: 14399 train, 3600 val
  Before SMOTE: 9791 samples (2239 pos, 7552 neg)
  After SMOTE:  15104 samples (7552 pos, 7552 neg)
  Ratio increase: 1.54x
  Base model trained (best iteration: 244)
  Model calibrated on 2880 original samples
  Fold ROC-AUC: 0.8418 | PR-AUC: 0.6133 | Best F1: 0.5980 (at Thresh=0.2296)
  ‚úì Fold 2 complete

Fold 3/5
  Original fold: 14399 train, 3600 val
  Before SMOTE: 9791 samples (2239 pos, 7552 neg)
  After SMOT

# threshold selection

In [52]:
# Compare two ways of picking the F1-maximizing threshold on the calibrated
# OOF probabilities and keep the better one in final_lgbm_threshold.
print("\n" + "="*90)
print("STEP 3: OPTIMIZE THRESHOLD ON CALIBRATED OOF (SMOTE)")
print("="*90)

print("\nSearching for optimal threshold...")

# Method 1: PR Curve
# F1 is evaluated at every threshold returned by precision_recall_curve; the
# epsilon guards the 0/0 case and the last curve point is dropped via [:-1]
# before the argmax.
prec, rec, thr = precision_recall_curve(y_all, oof_lgbm_calibrated_smote)
f1s = 2 * (prec * rec) / (prec + rec + 1e-12)
best_idx_pr = int(np.nanargmax(f1s[:-1]))
threshold_pr = float(thr[max(0, best_idx_pr)])
f1_pr = float(f1s[best_idx_pr])

# Method 2: Grid Search
# Candidate thresholds start at 0.15 and advance in steps of 0.01 towards 0.55.
thresholds_grid = np.arange(0.15, 0.55, 0.01)
f1_scores_grid = []
for t in thresholds_grid:
    preds = (oof_lgbm_calibrated_smote >= t).astype(int)
    f1_scores_grid.append(f1_score(y_all, preds))

best_idx_grid = int(np.argmax(f1_scores_grid))
threshold_grid = float(thresholds_grid[best_idx_grid])
f1_grid = float(f1_scores_grid[best_idx_grid])

print("\n" + "="*90)
print("THRESHOLD COMPARISON (SMOTE Calibrated OOF)")
print("="*90)
print(f"{'Method':<20} {'Threshold':<12} {'F1 Score':<12} {'Precision':<12} {'Recall':<10}")
print("="*90)

methods = [
    ("PR Curve", threshold_pr, f1_pr),
    ("Grid Search", threshold_grid, f1_grid),
]

# Keep the method with the strictly highest F1; on a tie the earlier entry in
# `methods` wins. From here on `prec` and `rec` are rebound from the PR-curve
# arrays above to scalar scores.
# NOTE(review): best_method / final_prec / final_rec are only assigned inside
# the `if`, so they stay undefined if no method reaches F1 > 0.
best_f1_overall = 0
for method_name, thresh, f1 in methods:
    preds = (oof_lgbm_calibrated_smote >= thresh).astype(int)
    prec = precision_score(y_all, preds, zero_division=0)
    rec = recall_score(y_all, preds, zero_division=0)

    marker = ""
    if f1 > best_f1_overall:
        best_f1_overall = f1
        final_lgbm_threshold = thresh
        best_method = method_name
        final_prec = prec
        final_rec = rec
        marker = " ‚Üê BEST"

    print(f"{method_name:<20} {thresh:<12.4f} {f1:<12.4f} {prec:<12.4f} {rec:<10.4f}{marker}")

print("="*90)

print(f"\n{'='*90}")
print("FINAL THRESHOLD SELECTION (SMOTE)")
print(f"{'='*90}")
print(f"‚úÖ Selected Method: {best_method}")
print(f"‚úÖ Optimal Threshold: {final_lgbm_threshold:.4f}")
print(f"‚úÖ Expected OOF F1: {best_f1_overall:.4f}")
print(f"\nPerformance Breakdown:")
print(f"  Precision: {final_prec:.4f}")
print(f"  Recall:    {final_rec:.4f}")
print(f"  F1 Score:  {best_f1_overall:.4f}")

# Confusion matrix of the OOF predictions at the selected threshold.
preds_final = (oof_lgbm_calibrated_smote >= final_lgbm_threshold).astype(int)
tn, fp, fn, tp = confusion_matrix(y_all, preds_final).ravel()
print(f"\nConfusion Matrix:")
print(f"  TN: {tn:5d} | FP: {fp:5d}")
print(f"  FN: {fn:5d} | TP: {tp:5d}")
print(f"{'='*90}\n")


STEP 3: OPTIMIZE THRESHOLD ON CALIBRATED OOF (SMOTE)

Searching for optimal threshold...

THRESHOLD COMPARISON (SMOTE Calibrated OOF)
Method               Threshold    F1 Score     Precision    Recall    
PR Curve             0.2349       0.5909       0.5055       0.7111     ‚Üê BEST
Grid Search          0.2200       0.5908       0.4956       0.7312    

FINAL THRESHOLD SELECTION (SMOTE)
‚úÖ Selected Method: PR Curve
‚úÖ Optimal Threshold: 0.2349
‚úÖ Expected OOF F1: 0.5909

Performance Breakdown:
  Precision: 0.5055
  Recall:    0.7111
  F1 Score:  0.5909

Confusion Matrix:
  TN: 11022 | FP:  2862
  FN:  1189 | TP:  2926



# train final model + prediction

In [53]:
# Fit the final SMOTE + LightGBM model, calibrate it on held-out original rows
# and score the test set with the threshold chosen on the OOF predictions.
print("="*90)
print("STEP 4: TRAIN FINAL CALIBRATED MODEL WITH SMOTE & PREDICT")
print("="*90)

# Split: 85% train, 15% calibration
X_train_final, X_cal_final, y_train_final, y_cal_final = train_test_split(
    X_all, y_all,
    test_size=0.15,
    random_state=RANDOM_STATE,
    stratify=y_all
)

print(f"\nFinal data split:")
print(f"  Training: {X_train_final.shape[0]} samples")
print(f"  Calibration: {X_cal_final.shape[0]} samples")

# Apply SMOTE to training data
# Only the training part is oversampled; the calibration rows stay original.
print(f"\nApplying SMOTE to final training data...")
print(f"  Before: {len(y_train_final)} samples ({(y_train_final==1).sum()} pos, {(y_train_final==0).sum()} neg)")

smote_final = SMOTE(random_state=RANDOM_STATE, sampling_strategy=1.0)
X_tr_resampled_final, y_tr_resampled_final = smote_final.fit_resample(X_train_final, y_train_final)

print(f"  After:  {len(y_tr_resampled_final)} samples ({(y_tr_resampled_final==1).sum()} pos, {(y_tr_resampled_final==0).sum()} neg)")
print(f"  Ratio increase: {len(y_tr_resampled_final)/len(y_train_final):.2f}x")

# Train on SMOTE data
# n_estimators is reset to the value Optuna selected, undoing the 2000 that the
# OOF cell wrote into best_lgbm_params_smote.
# NOTE(review): this fit uses no eval_set / early stopping, so the model grows
# the full tuned number of trees, whereas the OOF models that the threshold was
# tuned on were early-stopped (patience 100). The final probabilities may
# therefore be distributed differently from the OOF ones — confirm this is
# intended.
print(f"\nTraining final LightGBM on SMOTE data...")
final_lgbm_params = best_lgbm_params_smote.copy()
final_lgbm_params['n_estimators'] = study_smote.best_params.get('n_estimators', 1000)

final_lgbm_base = lgb.LGBMClassifier(**final_lgbm_params)
final_lgbm_base.fit(X_tr_resampled_final, y_tr_resampled_final)
print(f"‚úì Base model trained")

# Calibrate on original calibration data
# cv='prefit' hands the already-fitted base model to the calibrator, which is
# then fitted on the calibration split.
print(f"Calibrating on original (non-SMOTE) calibration data...")
final_lgbm = CalibratedClassifierCV(final_lgbm_base, method='sigmoid', cv='prefit')
final_lgbm.fit(X_cal_final, y_cal_final)
print(f"‚úì Model calibrated")

# Generate test predictions
print(f"\n{'='*90}")
print(f"GENERATING TEST PREDICTIONS")
print(f"{'='*90}")

print(f"\nGenerating calibrated probabilities for {X_test_all.shape[0]} test samples...")
test_probabilities = final_lgbm.predict_proba(X_test_all)[:, 1]

print(f"\nTest Probability Statistics:")
print(f"  Mean:   {test_probabilities.mean():.4f}")
print(f"  Std:    {test_probabilities.std():.4f}")
print(f"  Median: {np.median(test_probabilities):.4f}")
print(f"  Min:    {test_probabilities.min():.4f}")
print(f"  Max:    {test_probabilities.max():.4f}")

# Apply threshold
# final_lgbm_threshold is the value left by the threshold-selection cell.
test_predictions = (test_probabilities >= final_lgbm_threshold).astype(int)

print(f"\n‚úì Applied threshold: {final_lgbm_threshold:.4f}")
print(f"\nFinal Prediction Distribution:")
print(f"  Class 0 (No subrogation): {(test_predictions == 0).sum():5d} ({(test_predictions == 0).sum() / len(test_predictions) * 100:5.1f}%)")
print(f"  Class 1 (Subrogation):    {(test_predictions == 1).sum():5d} ({(test_predictions == 1).sum() / len(test_predictions) * 100:5.1f}%)")

# Create submission
# One row per test claim: its identifier and the thresholded 0/1 prediction.
submission = pd.DataFrame({
    'claim_number': test_fe['claim_number'],
    'subrogation': test_predictions
})

# The file name carries the threshold as an integer number of 1e-4 units.
# Round to nearest instead of truncating with int(): a threshold that the
# ':.4f' log lines report as 0.2349 would otherwise be labelled 2348 whenever
# threshold * 10000 lands just below the next integer, so the file name would
# disagree with the logged threshold.
threshold_tag = int(round(final_lgbm_threshold * 10000))
output_filename = f'TriGuard_SMOTE_CalOOF_thresh_{threshold_tag}.csv'
submission.to_csv(output_filename, index=False)

# Confirm where the submission was written.
print(f"\n{'='*90}")
print(f"‚úì SUBMISSION FILE SAVED: {output_filename}")
print(f"{'='*90}")

# When running in Google Colab, trigger a browser download of the file; in any
# other environment the import fails and the file simply stays on disk.
try:
    from google.colab import files
    files.download(output_filename)
    print("‚úì File downloaded!")
except ImportError:
    print("(Not in Colab - file saved locally)")

# Closing summary. best_f1_overall, best_method and final_lgbm_threshold come
# from the threshold-selection cell.
# NOTE(review): the non-SMOTE baseline F1 of 0.59 is hard-coded in two of the
# lines below; keep both in sync if the baseline is re-measured.
print(f"\n{'='*90}")
print("SMOTE PIPELINE COMPLETE!")
print(f"{'='*90}")
print(f"\nüéØ Expected Performance: F1 ‚âà {best_f1_overall:.4f} (based on SMOTE calibrated OOF)")
print(f"üìä Method: {best_method}")
print(f"üîß Threshold: {final_lgbm_threshold:.4f}")
print(f"\n‚ö†Ô∏è  Remember to COMPARE with your non-SMOTE baseline!")
print(f"   Non-SMOTE baseline F1: ~0.59")
print(f"   SMOTE expected F1: {best_f1_overall:.4f}")
print(f"   Improvement: {best_f1_overall - 0.59:+.4f}\n")

STEP 4: TRAIN FINAL CALIBRATED MODEL WITH SMOTE & PREDICT

Final data split:
  Training: 15299 samples
  Calibration: 2700 samples

Applying SMOTE to final training data...
  Before: 15299 samples (3498 pos, 11801 neg)
  After:  23602 samples (11801 pos, 11801 neg)
  Ratio increase: 1.54x

Training final LightGBM on SMOTE data...
‚úì Base model trained
Calibrating on original (non-SMOTE) calibration data...
‚úì Model calibrated

GENERATING TEST PREDICTIONS

Generating calibrated probabilities for 12000 test samples...

Test Probability Statistics:
  Mean:   0.2290
  Std:    0.2094
  Median: 0.1237
  Min:    0.0734
  Max:    0.8834

‚úì Applied threshold: 0.2349

Final Prediction Distribution:
  Class 0 (No subrogation):  8304 ( 69.2%)
  Class 1 (Subrogation):     3696 ( 30.8%)

‚úì SUBMISSION FILE SAVED: TriGuard_SMOTE_CalOOF_thresh_2348.csv


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

‚úì File downloaded!

SMOTE PIPELINE COMPLETE!

üéØ Expected Performance: F1 ‚âà 0.5909 (based on SMOTE calibrated OOF)
üìä Method: PR Curve
üîß Threshold: 0.2349

‚ö†Ô∏è  Remember to COMPARE with your non-SMOTE baseline!
   Non-SMOTE baseline F1: ~0.59
   SMOTE expected F1: 0.5909
   Improvement: +0.0009



# ==============


# New Section