# setup and configuration

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import os
import pickle

# --- Setup ---
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 200)
pd.set_option('display.float_format', lambda x: '%.3f' % x)
plt.style.use('seaborn-v0_8-darkgrid')

# --- Random Seed ---
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)
os.environ["PYTHONHASHSEED"] = str(RANDOM_STATE)

# --- Model/Metrics ---
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.metrics import f1_score, classification_report, confusion_matrix, roc_auc_score
from sklearn.metrics import precision_recall_curve, average_precision_score, precision_score, recall_score
from sklearn.linear_model import LogisticRegression
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

# --- ML Models ---
import lightgbm as lgb
import xgboost as xgb

# --- Tuning ---
try:
    import optuna
except ImportError:
    print("Installing Optuna...")
    !pip install optuna -q
    import optuna

optuna.logging.set_verbosity(optuna.logging.WARNING)

# --- Feature Engineering Helpers ---
try:
    from pyzipcode import ZipCodeDatabase
except ImportError:
    print("Installing pyzipcode...")
    !pip install pyzipcode -q
    from pyzipcode import ZipCodeDatabase

print("\n‚úì Setup complete. All libraries loaded.")

Installing Optuna...
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m400.9/400.9 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling pyzipcode...
[2K     [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m1.9/1.9 MB[0m [31m6.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for pyzipcode (setup.py) ... [?25l[?25hdone

‚úì Setup complete. All libraries loaded.


# data loading


In [2]:
# --- Helper ---
zcdb = ZipCodeDatabase()
def get_state(zip_code):
    """Converts a zip code to a 2-letter state, handling errors."""
    try:
        if pd.isna(zip_code):
            return 'Unknown'
        zip_code_str = str(int(float(zip_code))).zfill(5)
        return zcdb[zip_code_str].state
    except (ValueError, KeyError, AttributeError, TypeError):
        return 'Unknown'

print("‚úì State feature helper created.")

# --- Load Data ---
# Check if running in Google Colab
try:
    import google.colab
    IN_COLAB = True
except ImportError:
    IN_COLAB = False

if IN_COLAB:
    from google.colab import files
    try:
        train_df = pd.read_csv('Training_TriGuard.csv')
        test_df = pd.read_csv('Testing_TriGuard.csv')
        print("‚úì Files loaded from local environment.")
    except FileNotFoundError:
        print("Please upload Training_TriGuard.csv:")
        uploaded_train = files.upload()
        train_file = list(uploaded_train.keys())[0]
        train_df = pd.read_csv(train_file)

        print("\nPlease upload Testing_TriGuard.csv:")
        uploaded_test = files.upload()
        test_file = list(uploaded_test.keys())[0]
        test_df = pd.read_csv(test_file)
        print("‚úì Files uploaded successfully.")
else:
    # Local environment
    possible_paths = [
        'Training_TriGuard.csv',
        'data/Training_TriGuard.csv',
        '../Training_TriGuard.csv',
        './Training_TriGuard.csv'
    ]

    train_path = None
    test_path = None

    for path in possible_paths:
        if os.path.exists(path):
            train_path = path
            test_path = path.replace('Training', 'Testing')
            if os.path.exists(test_path):
                break

    if train_path and test_path:
        train_df = pd.read_csv(train_path)
        test_df = pd.read_csv(test_path)
        print(f"‚úì Files loaded from: {train_path} and {test_path}")
    else:
        train_path = input("Enter path to Training_TriGuard.csv: ").strip()
        test_path = input("Enter path to Testing_TriGuard.csv: ").strip()
        train_df = pd.read_csv(train_path)
        test_df = pd.read_csv(test_path)
        print("‚úì Files loaded successfully.")

# --- Critical Cleaning ---
initial_train_count = len(train_df)
train_df = train_df.dropna(subset=['subrogation'])
print(f"\nCleaned training data: Removed {initial_train_count - len(train_df)} rows with NaN target.")

train_df['subrogation'] = train_df['subrogation'].astype(int)

print(f"\n‚úì Train shape: {train_df.shape}")
print(f"‚úì Test shape: {test_df.shape}")
print(f"\nTarget distribution (after cleaning):")
print(train_df['subrogation'].value_counts(normalize=True).to_string())

test_ids = test_df['claim_number'].copy()

‚úì State feature helper created.
Please upload Training_TriGuard.csv:


Saving Training_TriGuard.csv to Training_TriGuard.csv

Please upload Testing_TriGuard.csv:


Saving Testing_TriGuard.csv to Testing_TriGuard.csv
‚úì Files uploaded successfully.

Cleaned training data: Removed 2 rows with NaN target.

‚úì Train shape: (17999, 29)
‚úì Test shape: (12000, 28)

Target distribution (after cleaning):
subrogation
0   0.771
1   0.229


# feature engineering


In [3]:
def feature_engineer(df):
    """Feature engineering WITHOUT vehicle_made_year/vehicle_age (data quality issues)"""
    df_fe = df.copy()

    # ========================================================================
    # TEMPORAL FEATURES
    # ========================================================================
    df_fe['claim_date'] = pd.to_datetime(df_fe['claim_date'], errors='coerce')
    df_fe['claim_year'] = df_fe['claim_date'].dt.year
    df_fe['claim_month'] = df_fe['claim_date'].dt.month
    df_fe['claim_day'] = df_fe['claim_date'].dt.day
    df_fe['claim_quarter'] = df_fe['claim_date'].dt.quarter
    df_fe['claim_dayofweek'] = df_fe['claim_date'].dt.dayofweek
    df_fe['is_weekend'] = (df_fe['claim_dayofweek'] >= 5).astype(int)
    df_fe['is_monday'] = (df_fe['claim_dayofweek'] == 0).astype(int)
    df_fe['is_friday'] = (df_fe['claim_dayofweek'] == 4).astype(int)
    df_fe['is_q4'] = (df_fe['claim_quarter'] == 4).astype(int)

    season_map = {
        3: 'Spring', 4: 'Spring', 5: 'Spring',
        6: 'Summer', 7: 'Summer', 8: 'Summer',
        9: 'Fall', 10: 'Fall', 11: 'Fall',
        12: 'Winter', 1: 'Winter', 2: 'Winter'
    }
    df_fe['season'] = df_fe['claim_month'].map(season_map).fillna('Unknown')

    # ========================================================================
    # DATA CLEANING
    # ========================================================================
    df_fe.loc[(df_fe['year_of_born'] < 1900) | (df_fe['year_of_born'] > 2025), 'year_of_born'] = np.nan

    # ========================================================================
    # BINARY CONVERSIONS (for interactions)
    # ========================================================================
    df_fe['witness_binary'] = (df_fe['witness_present_ind'] == 'Y').astype(int)
    df_fe['police_binary'] = df_fe['policy_report_filed_ind']
    df_fe['multicar_binary'] = df_fe['accident_type'].isin(['multi_vehicle_clear', 'multi_vehicle_unclear']).astype(int)
    df_fe['highrisk_site_binary'] = df_fe['accident_site'].isin(['Highway/Intersection', 'Local']).astype(int)

    # ========================================================================
    # CRITICAL INTERACTION FEATURES (2-way)
    # ========================================================================
    df_fe['liab_x_witness'] = df_fe['liab_prct'] * df_fe['witness_binary']
    df_fe['liab_x_police'] = df_fe['liab_prct'] * df_fe['police_binary']
    df_fe['liab_x_multicar'] = df_fe['liab_prct'] * df_fe['multicar_binary']
    df_fe['liab_x_highrisk_site'] = df_fe['liab_prct'] * df_fe['highrisk_site_binary']
    df_fe['liab_x_evidence'] = df_fe['liab_prct'] * (df_fe['witness_binary'] + df_fe['police_binary'])
    df_fe['liab_x_payout'] = df_fe['liab_prct'] * df_fe['claim_est_payout']
    df_fe['liab_x_mileage'] = df_fe['liab_prct'] * df_fe['vehicle_mileage']

    df_fe['witness_x_police'] = df_fe['witness_binary'] * df_fe['police_binary']
    df_fe['witness_x_multicar'] = df_fe['witness_binary'] * df_fe['multicar_binary']
    df_fe['police_x_multicar'] = df_fe['police_binary'] * df_fe['multicar_binary']
    df_fe['multicar_x_highrisk'] = df_fe['multicar_binary'] * df_fe['highrisk_site_binary']
    df_fe['weekend_highway'] = (df_fe['claim_dayofweek'] >= 5).astype(int) * (df_fe['accident_site'] == 'Highway/Intersection').astype(int)

    # 3-way interaction
    df_fe['witness_police_multicar'] = df_fe['witness_binary'] * df_fe['police_binary'] * df_fe['multicar_binary']

    # ========================================================================
    # POLYNOMIAL FEATURES (liability & key variables)
    # ========================================================================
    df_fe['liab_prct_squared'] = df_fe['liab_prct'] ** 2
    df_fe['liab_prct_cubed'] = df_fe['liab_prct'] ** 3
    df_fe['liab_prct_sqrt'] = np.sqrt(df_fe['liab_prct'])
    df_fe['liab_prct_log'] = np.log1p(df_fe['liab_prct'])
    df_fe['liab_inverse'] = 100 - df_fe['liab_prct']
    df_fe['liab_inverse_squared'] = (100 - df_fe['liab_prct']) ** 2

    df_fe['log_claim_est_payout'] = np.log1p(df_fe['claim_est_payout'])
    df_fe['log_vehicle_mileage'] = np.log1p(df_fe['vehicle_mileage'])
    df_fe['log_vehicle_price'] = np.log1p(df_fe['vehicle_price'])
    df_fe['log_annual_income'] = np.log1p(df_fe['annual_income'])
    df_fe['sqrt_vehicle_mileage'] = np.sqrt(df_fe['vehicle_mileage'])

    # ========================================================================
    # ACCIDENT TYPE FEATURES
    # ========================================================================
    df_fe['is_multi_vehicle_clear'] = (df_fe['accident_type'] == 'multi_vehicle_clear').astype(int)
    df_fe['is_multi_vehicle_unclear'] = (df_fe['accident_type'] == 'multi_vehicle_unclear').astype(int)
    df_fe['is_single_car'] = (df_fe['accident_type'] == 'single_car').astype(int)
    df_fe['has_recovery_target'] = df_fe['multicar_binary']

    df_fe['recovery_case_clarity'] = 0
    df_fe.loc[df_fe['is_multi_vehicle_clear'] == 1, 'recovery_case_clarity'] = 3
    df_fe.loc[df_fe['is_multi_vehicle_unclear'] == 1, 'recovery_case_clarity'] = 1

    # ========================================================================
    # LIABILITY BUCKETS (fine-grained)
    # ========================================================================
    df_fe['liab_under_10'] = (df_fe['liab_prct'] < 10).astype(int)
    df_fe['liab_10_to_15'] = ((df_fe['liab_prct'] >= 10) & (df_fe['liab_prct'] < 15)).astype(int)
    df_fe['liab_15_to_20'] = ((df_fe['liab_prct'] >= 15) & (df_fe['liab_prct'] < 20)).astype(int)
    df_fe['liab_20_to_25'] = ((df_fe['liab_prct'] >= 20) & (df_fe['liab_prct'] < 25)).astype(int)
    df_fe['liab_25_to_30'] = ((df_fe['liab_prct'] >= 25) & (df_fe['liab_prct'] < 30)).astype(int)
    df_fe['liab_30_to_35'] = ((df_fe['liab_prct'] >= 30) & (df_fe['liab_prct'] < 35)).astype(int)
    df_fe['liab_35_to_40'] = ((df_fe['liab_prct'] >= 35) & (df_fe['liab_prct'] < 40)).astype(int)
    df_fe['liab_40_to_50'] = ((df_fe['liab_prct'] >= 40) & (df_fe['liab_prct'] < 50)).astype(int)
    df_fe['liab_over_50'] = (df_fe['liab_prct'] >= 50).astype(int)

    df_fe['not_at_fault'] = df_fe['liab_under_10']
    df_fe['minimal_fault'] = (df_fe['liab_prct'] < 25).astype(int)
    df_fe['low_fault'] = (df_fe['liab_prct'] < 35).astype(int)
    df_fe['shared_fault'] = ((df_fe['liab_prct'] >= 35) & (df_fe['liab_prct'] < 50)).astype(int)
    df_fe['high_fault'] = (df_fe['liab_prct'] >= 50).astype(int)

    # ========================================================================
    # EVIDENCE QUALITY FEATURES
    # ========================================================================
    df_fe['witness_present'] = df_fe['witness_binary']
    df_fe['police_report'] = df_fe['police_binary']

    df_fe['evidence_none'] = ((df_fe['witness_present'] == 0) & (df_fe['police_report'] == 0)).astype(int)
    df_fe['evidence_weak'] = (((df_fe['witness_present'] == 1) & (df_fe['police_report'] == 0)) |
                              ((df_fe['witness_present'] == 0) & (df_fe['police_report'] == 1))).astype(int)
    df_fe['evidence_strong'] = ((df_fe['witness_present'] == 1) & (df_fe['police_report'] == 1)).astype(int)
    df_fe['evidence_very_strong'] = ((df_fe['witness_present'] == 1) & (df_fe['police_report'] == 1) &
                                      (df_fe['liab_prct'] < 20)).astype(int)
    df_fe['evidence_score'] = df_fe['witness_present'] + df_fe['police_report']

    # ========================================================================
    # ACCIDENT SITE FEATURES
    # ========================================================================
    df_fe['high_risk_site'] = df_fe['highrisk_site_binary']
    df_fe['parking_accident'] = (df_fe['accident_site'] == 'Parking Area').astype(int)
    df_fe['unknown_site'] = (df_fe['accident_site'] == 'Unknown').astype(int)
    df_fe['highway_accident'] = (df_fe['accident_site'] == 'Highway/Intersection').astype(int)
    df_fe['local_accident'] = (df_fe['accident_site'] == 'Local').astype(int)

    # ========================================================================
    # DRIVER AGE & EXPERIENCE
    # ========================================================================
    df_fe['driver_age'] = df_fe['claim_year'] - df_fe['year_of_born']
    df_fe.loc[(df_fe['driver_age'] < 16) | (df_fe['driver_age'] > 100), 'driver_age'] = np.nan

    df_fe['young_driver'] = ((df_fe['driver_age'] >= 16) & (df_fe['driver_age'] <= 25)).astype(int)
    df_fe['prime_driver'] = ((df_fe['driver_age'] > 25) & (df_fe['driver_age'] <= 45)).astype(int)
    df_fe['middle_age_driver'] = ((df_fe['driver_age'] > 45) & (df_fe['driver_age'] <= 65)).astype(int)
    df_fe['senior_driver'] = (df_fe['driver_age'] > 65).astype(int)

    df_fe['driving_experience'] = (df_fe['driver_age'] - df_fe['age_of_DL']).clip(lower=0)
    df_fe.loc[df_fe['driving_experience'] < 0, 'driving_experience'] = np.nan

    df_fe['novice_driver'] = (df_fe['driving_experience'] < 3).astype(int)
    df_fe['experienced_driver'] = ((df_fe['driving_experience'] >= 3) & (df_fe['driving_experience'] <= 10)).astype(int)
    df_fe['veteran_driver'] = (df_fe['driving_experience'] > 10).astype(int)

    df_fe['experience_x_safety'] = df_fe['driving_experience'] * df_fe['safety_rating']
    df_fe['driver_age_x_safety'] = df_fe['driver_age'] * df_fe['safety_rating']

    # ========================================================================
    # VEHICLE FEATURES (without vehicle_age)
    # ========================================================================
    df_fe['luxury_vehicle'] = (df_fe['vehicle_price'] > 50000).astype(int)
    df_fe['mid_price_vehicle'] = ((df_fe['vehicle_price'] >= 20000) & (df_fe['vehicle_price'] <= 50000)).astype(int)
    df_fe['economy_vehicle'] = (df_fe['vehicle_price'] < 20000).astype(int)

    df_fe['heavy_vehicle'] = (df_fe['vehicle_weight'] > 30000).astype(int)
    df_fe['light_vehicle'] = (df_fe['vehicle_weight'] < 15000).astype(int)
    df_fe['medium_weight'] = ((df_fe['vehicle_weight'] >= 15000) & (df_fe['vehicle_weight'] <= 30000)).astype(int)

    df_fe['is_large_vehicle'] = (df_fe['vehicle_category'] == 'Large').astype(int)
    df_fe['is_compact_vehicle'] = (df_fe['vehicle_category'] == 'Compact').astype(int)
    df_fe['is_medium_vehicle'] = (df_fe['vehicle_category'] == 'Medium').astype(int)

    # ========================================================================
    # CLAIM CHARACTERISTICS
    # ========================================================================
    df_fe['high_mileage'] = (df_fe['vehicle_mileage'] > 100000).astype(int)
    df_fe['low_mileage'] = (df_fe['vehicle_mileage'] < 50000).astype(int)
    df_fe['very_high_mileage'] = (df_fe['vehicle_mileage'] > 150000).astype(int)
    df_fe['medium_mileage'] = ((df_fe['vehicle_mileage'] >= 50000) & (df_fe['vehicle_mileage'] <= 100000)).astype(int)

    df_fe['frequent_claimer'] = (df_fe['past_num_of_claims'] > 5).astype(int)
    df_fe['moderate_claimer'] = ((df_fe['past_num_of_claims'] >= 1) & (df_fe['past_num_of_claims'] <= 5)).astype(int)
    df_fe['first_time_claimer'] = (df_fe['past_num_of_claims'] == 0).astype(int)
    df_fe['very_frequent_claimer'] = (df_fe['past_num_of_claims'] > 10).astype(int)

    df_fe['large_payout'] = (df_fe['claim_est_payout'] > 5000).astype(int)
    df_fe['medium_payout'] = ((df_fe['claim_est_payout'] >= 2000) & (df_fe['claim_est_payout'] <= 5000)).astype(int)
    df_fe['small_payout'] = (df_fe['claim_est_payout'] < 2000).astype(int)
    df_fe['very_large_payout'] = (df_fe['claim_est_payout'] > 8000).astype(int)

    df_fe['safety_x_prior_claims'] = df_fe['safety_rating'] / (1 + df_fe['past_num_of_claims'])
    df_fe['mileage_x_claims'] = df_fe['vehicle_mileage'] * df_fe['past_num_of_claims']

    # ========================================================================
    # RATIO FEATURES
    # ========================================================================
    df_fe['payout_to_price_ratio'] = df_fe['claim_est_payout'] / (df_fe['vehicle_price'] + 1)
    df_fe['severe_damage'] = (df_fe['payout_to_price_ratio'] > 0.3).astype(int)
    df_fe['moderate_damage'] = ((df_fe['payout_to_price_ratio'] >= 0.1) & (df_fe['payout_to_price_ratio'] <= 0.3)).astype(int)
    df_fe['minor_damage'] = (df_fe['payout_to_price_ratio'] < 0.1).astype(int)

    df_fe['income_to_vehicle_price'] = df_fe['annual_income'] / (df_fe['vehicle_price'] + 1)
    df_fe['can_afford_vehicle'] = (df_fe['income_to_vehicle_price'] >= 0.5).astype(int)
    df_fe['expensive_for_income'] = (df_fe['income_to_vehicle_price'] < 0.3).astype(int)

    df_fe['claims_per_year_driving'] = df_fe['past_num_of_claims'] / (df_fe['driving_experience'] + 1)
    df_fe['claim_frequency_high'] = (df_fe['claims_per_year_driving'] > 0.5).astype(int)

    df_fe['safety_to_liability'] = df_fe['safety_rating'] / (df_fe['liab_prct'] + 1)
    df_fe['payout_to_income'] = df_fe['claim_est_payout'] / (df_fe['annual_income'] + 1)
    df_fe['mileage_to_price'] = df_fe['vehicle_mileage'] / (df_fe['vehicle_price'] + 1)
    df_fe['weight_to_price'] = df_fe['vehicle_weight'] / (df_fe['vehicle_price'] + 1)

    # ========================================================================
    # POLICYHOLDER CHARACTERISTICS
    # ========================================================================
    df_fe['high_income'] = (df_fe['annual_income'] > 70000).astype(int)
    df_fe['mid_income'] = ((df_fe['annual_income'] >= 40000) & (df_fe['annual_income'] <= 70000)).astype(int)
    df_fe['low_income'] = (df_fe['annual_income'] < 40000).astype(int)
    df_fe['very_high_income'] = (df_fe['annual_income'] > 100000).astype(int)

    df_fe['high_safety_rating'] = (df_fe['safety_rating'] > 80).astype(int)
    df_fe['low_safety_rating'] = (df_fe['safety_rating'] < 60).astype(int)
    df_fe['very_high_safety'] = (df_fe['safety_rating'] > 90).astype(int)
    df_fe['medium_safety'] = ((df_fe['safety_rating'] >= 60) & (df_fe['safety_rating'] <= 80)).astype(int)

    df_fe['contact_available'] = df_fe['email_or_tel_available']
    df_fe['has_education'] = df_fe['high_education_ind']
    df_fe['recent_move'] = df_fe['address_change_ind']
    df_fe['home_owner'] = (df_fe['living_status'] == 'Own').astype(int)
    df_fe['renter'] = (df_fe['living_status'] == 'Rent').astype(int)
    df_fe['female'] = (df_fe['gender'] == 'F').astype(int)

    # ========================================================================
    # CHANNEL FEATURES
    # ========================================================================
    df_fe['via_broker'] = (df_fe['channel'] == 'Broker').astype(int)
    df_fe['via_online'] = (df_fe['channel'] == 'Online').astype(int)
    df_fe['via_phone'] = (df_fe['channel'] == 'Phone').astype(int)
    df_fe['in_network_repair'] = (df_fe['in_network_bodyshop'] == 'yes').astype(int)
    df_fe['out_network_repair'] = (df_fe['in_network_bodyshop'] == 'no').astype(int)

    # ========================================================================
    # COMPOSITE RECOVERY SCORE
    # ========================================================================
    liability_score = np.sqrt((100 - df_fe['liab_prct']) / 100.0)
    evidence_score = (df_fe['evidence_none'] * 0.0 + df_fe['evidence_weak'] * 0.4 +
                      df_fe['evidence_strong'] * 0.7 + df_fe['evidence_very_strong'] * 1.0)
    clarity_score = df_fe['recovery_case_clarity'] / 3.0
    site_score = df_fe['high_risk_site'] * 0.7 + (1 - df_fe['unknown_site']) * 0.3

    df_fe['recovery_feasibility_score'] = (0.35 * liability_score + 0.30 * df_fe['has_recovery_target'] +
                                           0.20 * evidence_score + 0.10 * clarity_score + 0.05 * site_score)

    # ========================================================================
    # DOMAIN LOGIC FLAGS (CRITICAL FOR F1)
    # ========================================================================
    df_fe['perfect_case'] = ((df_fe['liab_prct'] < 15) & (df_fe['witness_present'] == 1) &
                             (df_fe['police_report'] == 1) & (df_fe['has_recovery_target'] == 1)).astype(int)

    df_fe['strong_case'] = ((df_fe['liab_prct'] < 25) & (df_fe['evidence_strong'] == 1) &
                            (df_fe['has_recovery_target'] == 1)).astype(int)

    df_fe['good_case'] = ((df_fe['liab_prct'] < 35) & (df_fe['evidence_score'] >= 1) &
                          (df_fe['has_recovery_target'] == 1)).astype(int)

    df_fe['weak_case'] = ((df_fe['liab_prct'] > 40) | (df_fe['is_single_car'] == 1) |
                          (df_fe['evidence_none'] == 1)).astype(int)

    df_fe['no_case'] = ((df_fe['liab_prct'] > 60) | ((df_fe['is_single_car'] == 1) & (df_fe['evidence_none'] == 1))).astype(int)

    df_fe['high_value_opportunity'] = ((df_fe['claim_est_payout'] > 3000) & (df_fe['liab_prct'] < 30) &
                                       (df_fe['has_recovery_target'] == 1)).astype(int)

    df_fe['slam_dunk_case'] = ((df_fe['liab_prct'] < 10) & (df_fe['witness_present'] == 1) &
                               (df_fe['police_report'] == 1) & (df_fe['multicar_binary'] == 1) &
                               (df_fe['high_risk_site'] == 1)).astype(int)

    df_fe['low_liab_high_payout'] = ((df_fe['liab_prct'] < 20) & (df_fe['claim_est_payout'] > 5000)).astype(int)
    df_fe['clear_fault_case'] = ((df_fe['liab_prct'] < 15) & (df_fe['multicar_binary'] == 1)).astype(int)
    df_fe['high_mileage_low_fault'] = ((df_fe['vehicle_mileage'] > 100000) & (df_fe['liab_prct'] < 30)).astype(int)

    # --- Temporal & Behavior Dynamics ---
    df_fe['claim_early_in_year'] = (df_fe['claim_month'] <= 3).astype(int)
    df_fe['claim_end_of_year'] = (df_fe['claim_month'] >= 10).astype(int)
    df_fe['weekend_parking'] = df_fe['is_weekend'] * (df_fe['accident_site'] == 'Parking Area').astype(int)
    df_fe['winter_claim_high_payout'] = ((df_fe['season'] == 'Winter') & (df_fe['claim_est_payout'] > 5000)).astype(int)

    # --- Vehicle Utilization Proxies (without vehicle_age) ---
    df_fe['mileage_x_weight'] = df_fe['vehicle_mileage'] * df_fe['vehicle_weight']
    df_fe['mileage_per_dollar'] = df_fe['vehicle_mileage'] / (df_fe['vehicle_price'] + 1)
    df_fe['payout_to_weight'] = df_fe['claim_est_payout'] / (df_fe['vehicle_weight'] + 1)

    # --- Policyholder Risk Profile ---
    df_fe['unstable_policyholder'] = ((df_fe['recent_move'] == 1) & (df_fe['renter'] == 1)).astype(int)
    df_fe['financial_stress_risk'] = ((df_fe['expensive_for_income'] == 1) & (df_fe['large_payout'] == 1)).astype(int)
    df_fe['young_driver_highway'] = df_fe['young_driver'] * df_fe['highway_accident']
    df_fe['senior_driver_parking'] = df_fe['senior_driver'] * df_fe['parking_accident']

    # --- Liability & Evidence Interaction Insights ---
    df_fe['low_liab_weak_evidence'] = ((df_fe['liab_prct'] < 20) & (df_fe['evidence_weak'] == 1)).astype(int)
    df_fe['high_liab_strong_evidence'] = ((df_fe['liab_prct'] > 50) & (df_fe['evidence_strong'] == 1)).astype(int)

    # Composite confidence / case quality index
    df_fe['case_confidence_score'] = (
        0.4 * (100 - df_fe['liab_prct']) / 100 +
        0.4 * df_fe['evidence_score'] / 2 +
        0.2 * df_fe['recovery_case_clarity'] / 3
    )

    # --- Statistical Normalization & Percentile Features ---
    for col in ['claim_est_payout', 'vehicle_mileage', 'annual_income']:
        df_fe[f'{col}_z'] = (df_fe[col] - df_fe[col].mean()) / (df_fe[col].std() + 1e-9)

    try:
        df_fe['liab_percentile'] = pd.qcut(df_fe['liab_prct'], 10, labels=False, duplicates='drop')
        df_fe['payout_percentile'] = pd.qcut(df_fe['claim_est_payout'], 10, labels=False, duplicates='drop')
    except Exception:
        # If there aren't enough unique values to bin
        df_fe['liab_percentile'] = np.nan
        df_fe['payout_percentile'] = np.nan

    # --- Aggregate / Hybrid Indices ---
    df_fe['case_strength_index'] = df_fe['evidence_score'] * (1 - df_fe['liab_prct'] / 100)
    df_fe['financial_exposure_index'] = (
        (df_fe['claim_est_payout'] / (df_fe['annual_income'] + 1)) * (1 + df_fe['liab_prct'] / 100)
    )
    df_fe['behavioral_risk_index'] = (
        df_fe['claims_per_year_driving'] * (100 - df_fe['safety_rating']) / 100
    )

    return df_fe

print("‚úì Feature engineering function defined")

‚úì Feature engineering function defined


# pre-modeling with target encoding


In [4]:
print("="*80)
print("Running Feature Engineering on train and test data...")

train_fe = feature_engineer(train_df)
test_fe = feature_engineer(test_df)
print("‚úì Feature engineering complete.")

# Define Categorical Feature Lists - FIXED
features_to_target_encode = [
    'gender', 'living_status', 'accident_site',
    'channel', 'vehicle_category', 'vehicle_color', 'accident_type',
    'in_network_bodyshop', 'season', 'zip_code'  # Removed 'state' and 'claim_day_of_week'
]

# Apply Target Encoding
print(f"\nApplying Smoothed Target Encoding to {len(features_to_target_encode)} features...")
global_mean = train_fe['subrogation'].mean()
categorical_features_for_lgbm = []

for col in features_to_target_encode:
    target_mean = train_fe.groupby(col)['subrogation'].mean()
    category_counts = train_fe.groupby(col).size()
    smoothing = 20

    smoothed_mean = (target_mean * category_counts + global_mean * smoothing) / (category_counts + smoothing)

    new_col_name = f'{col}_target_enc'
    train_fe[new_col_name] = train_fe[col].map(smoothed_mean)
    test_fe[new_col_name] = test_fe[col].map(smoothed_mean)

    test_fe[new_col_name] = test_fe[new_col_name].fillna(global_mean)

    categorical_features_for_lgbm.append(new_col_name)

print("‚úì Target encoding complete.")

# Create Final X, y, and X_test
y_all = train_fe['subrogation'].copy()

drop_cols = [
    'subrogation', 'claim_number', 'claim_date', 'year_of_born',
    'witness_present_ind', 'policy_report_filed_ind',
    'vehicle_made_year'  # Bad data quality
]
drop_cols.extend(features_to_target_encode)

feature_cols = [col for col in train_fe.columns if col not in drop_cols]
X_all = train_fe[feature_cols].copy()
X_test_all = test_fe[feature_cols].copy()

# Apply Label Encoding (if any object columns remain)
other_cat_cols = X_all.select_dtypes(include='object').columns.tolist()
if other_cat_cols:
    print(f"\nApplying Label Encoding to {len(other_cat_cols)} remaining features...")
    for col in other_cat_cols:
        le = LabelEncoder()
        all_values = pd.concat([X_all[col].astype(str), X_test_all[col].astype(str)]).unique()
        le.fit(all_values)
        X_all[col] = le.transform(X_all[col].astype(str))
        X_test_all[col] = le.transform(X_test_all[col].astype(str))
    print("‚úì Label encoding complete.")

# Impute NaN values with median
print("\nImputing NaN values with the median from the training data...")
X_all_median = X_all.median()
X_all = X_all.fillna(X_all_median)
X_test_all = X_test_all.fillna(X_all_median)
print("‚úì NaN values imputed.")

# Calculate scale_pos_weight
scale_pos_weight = (y_all == 0).sum() / (y_all == 1).sum()

print("\n" + "="*80)
print("PRE-MODELING COMPLETE")
print(f"‚úì X_all shape: {X_all.shape}")
print(f"‚úì y_all shape: {y_all.shape}")
print(f"‚úì X_test_all shape: {X_test_all.shape}")
print(f"‚úì Total features: {len(feature_cols)}")
print(f"‚úì scale_pos_weight (for F1 score): {scale_pos_weight:.4f}")

Running Feature Engineering on train and test data...
‚úì Feature engineering complete.

Applying Smoothed Target Encoding to 10 features...
‚úì Target encoding complete.

Applying Label Encoding to 1 remaining features...
‚úì Label encoding complete.

Imputing NaN values with the median from the training data...
‚úì NaN values imputed.

PRE-MODELING COMPLETE
‚úì X_all shape: (17999, 190)
‚úì y_all shape: (17999,)
‚úì X_test_all shape: (12000, 190)
‚úì Total features: 190
‚úì scale_pos_weight (for F1 score): 3.3740


# training lightGBM model



In [5]:
print("="*80)
print("STEP 1: OPTUNA HYPERPARAMETER OPTIMIZATION (70/30 SPLIT)")
print("="*80)

# Create a stable 70/30 validation split
X_train, X_val, y_train, y_val = train_test_split(
    X_all, y_all,
    test_size=0.30,
    random_state=RANDOM_STATE,
    stratify=y_all
)

print(f"\nTrain: {X_train.shape}, Validation: {X_val.shape}")
print(f"Train target distribution: {y_train.value_counts().to_dict()}")
print(f"Val target distribution: {y_val.value_counts().to_dict()}")

# Define objective function with THRESHOLD OPTIMIZATION
def objective(trial):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 500, 2000),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.15, log=True),
        'num_leaves': trial.suggest_int('num_leaves', 20, 150),
        'max_depth': trial.suggest_int('max_depth', 4, 20),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 0.0, 2.0),
        'reg_lambda': trial.suggest_float('reg_lambda', 0.0, 2.0),
        'scale_pos_weight': scale_pos_weight,
        'random_state': RANDOM_STATE,
        'verbose': -1
    }

    model = lgb.LGBMClassifier(**params)
    model.fit(X_train, y_train)

    pred_proba = model.predict_proba(X_val)[:, 1]

    # Test multiple thresholds to find optimal F1
    best_f1 = 0
    for thresh in np.arange(0.25, 0.76, 0.02):
        preds = (pred_proba >= thresh).astype(int)
        f1 = f1_score(y_val, preds)
        if f1 > best_f1:
            best_f1 = f1

    return best_f1

# Run optimization
print("\nRunning Optuna optimization (100 trials)...")
print("This will test different parameter combinations with threshold optimization...\n")

study = optuna.create_study(direction='maximize', study_name='lgb_f1_optimization')
study.optimize(objective, n_trials=100, show_progress_bar=True)

print("\n" + "="*80)
print("OPTUNA TUNING COMPLETE")
print("="*80)
print(f"\nüèÜ Best F1 Score: {study.best_value:.4f}")
print(f"\nüìä Best Parameters:")
for key, value in study.best_params.items():
    print(f"  {key}: {value}")

best_lgbm_params = study.best_params.copy()
best_lgbm_params['scale_pos_weight'] = scale_pos_weight
best_lgbm_params['random_state'] = RANDOM_STATE
best_lgbm_params['verbose'] = -1

print("\n‚úì Best parameters saved to best_lgbm_params")

STEP 1: OPTUNA HYPERPARAMETER OPTIMIZATION (70/30 SPLIT)

Train: (12599, 190), Validation: (5400, 190)
Train target distribution: {0: 9719, 1: 2880}
Val target distribution: {0: 4165, 1: 1235}

Running Optuna optimization (100 trials)...
This will test different parameter combinations with threshold optimization...



  0%|          | 0/100 [00:00<?, ?it/s]


OPTUNA TUNING COMPLETE

üèÜ Best F1 Score: 0.5782

üìä Best Parameters:
  n_estimators: 507
  learning_rate: 0.011203676525861687
  num_leaves: 23
  max_depth: 7
  min_child_samples: 80
  subsample: 0.5609478583630461
  colsample_bytree: 0.7751922173938319
  reg_alpha: 1.601863804687629
  reg_lambda: 1.3202000908833544

‚úì Best parameters saved to best_lgbm_params


# k-fold cv

In [6]:
print("="*90)
print("STEP 2: 5-FOLD CROSS-VALIDATION WITH BEST LIGHTGBM PARAMETERS")
print("="*90)

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import (
    f1_score, roc_auc_score, average_precision_score,
    precision_score, recall_score, precision_recall_curve
)
import numpy as np
import pandas as pd

print(f"\nRunning 5-Fold Stratified Cross-Validation using tuned parameters...")
print(f"Each fold finds its optimal threshold from PR curve to maximize F1\n")

# ============================================================================
# SETUP
# ============================================================================
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)

fold_results = []
fold_thresholds = []
fold_f1_scores = []

for fold, (train_idx, val_idx) in enumerate(skf.split(X_all, y_all), 1):
    print(f"\n{'-'*40}")
    print(f"Fold {fold}/5")
    print(f"{'-'*40}")

    X_train_fold, X_val_fold = X_all.iloc[train_idx], X_all.iloc[val_idx]
    y_train_fold, y_val_fold = y_all.iloc[train_idx], y_all.iloc[val_idx]

    # Train model on this fold
    model_fold = lgb.LGBMClassifier(**best_lgbm_params)
    model_fold.fit(X_train_fold, y_train_fold)

    # Predicted probabilities
    y_prob = model_fold.predict_proba(X_val_fold)[:, 1]

    # ========================================================================
    # OPTIMAL THRESHOLD VIA PRECISION-RECALL CURVE
    # ========================================================================
    precision, recall, thresholds = precision_recall_curve(y_val_fold, y_prob)
    f1_scores = 2 * (precision * recall) / (precision + recall + 1e-8)
    best_idx = np.nanargmax(f1_scores)
    best_thresh = thresholds[max(0, best_idx - 1)]
    best_f1 = f1_scores[best_idx]

    fold_thresholds.append(best_thresh)

    # Final predictions at optimal threshold
    y_pred = (y_prob >= best_thresh).astype(int)

    # Metrics
    f1 = f1_score(y_val_fold, y_pred)
    roc = roc_auc_score(y_val_fold, y_prob)
    pr_auc = average_precision_score(y_val_fold, y_prob)
    precision_val = precision_score(y_val_fold, y_pred)
    recall_val = recall_score(y_val_fold, y_pred)
    acc_val = (y_pred == y_val_fold).mean()

    fold_f1_scores.append(f1)
    fold_results.append({
        'Fold': fold,
        'F1': f1,
        'ROC_AUC': roc,
        'PR_AUC': pr_auc,
        'Precision': precision_val,
        'Recall': recall_val,
        'Accuracy': acc_val,
        'Optimal_Threshold': best_thresh
    })

    print(f"  ‚úÖ F1: {f1:.4f} | ROC-AUC: {roc:.4f} | PR-AUC: {pr_auc:.4f}")
    print(f"  Precision: {precision_val:.4f} | Recall: {recall_val:.4f} | Threshold: {best_thresh:.3f}")

# ============================================================================
# AGGREGATE RESULTS
# ============================================================================
results_df = pd.DataFrame(fold_results)
mean_f1 = np.mean(fold_f1_scores)
std_f1 = np.std(fold_f1_scores)
avg_optimal_threshold = np.mean(fold_thresholds)

print(f"\n" + "="*90)
print("CROSS-VALIDATION RESULTS SUMMARY")
print("="*90)
print(results_df.to_string(index=False))

print(f"\n" + "="*90)
print("SUMMARY STATISTICS")
print("="*90)
print(f"Mean F1:        {mean_f1:.4f}")
print(f"Std F1:         {std_f1:.4f}")
print(f"95% CI (approx): [{mean_f1 - 1.96*std_f1:.4f}, {mean_f1 + 1.96*std_f1:.4f}]")
print(f"\n‚úÖ Average Optimal Threshold from CV: {avg_optimal_threshold:.3f}")

# ============================================================================
# FINAL THRESHOLD & EXPECTED PERFORMANCE
# ============================================================================
final_threshold = avg_optimal_threshold
print(f"\nüéØ Final Threshold to Use for Predictions: {final_threshold:.3f}")
print(f"üéØ Expected Leaderboard F1: {mean_f1:.4f} ¬± {std_f1:.4f}")
print(f"{'='*90}")


STEP 2: 5-FOLD CROSS-VALIDATION WITH BEST LIGHTGBM PARAMETERS

Running 5-Fold Stratified Cross-Validation using tuned parameters...
Each fold finds its optimal threshold from PR curve to maximize F1


----------------------------------------
Fold 1/5
----------------------------------------
  ‚úÖ F1: 0.6128 | ROC-AUC: 0.8504 | PR-AUC: 0.6113
  Precision: 0.5693 | Recall: 0.6634 | Threshold: 0.631

----------------------------------------
Fold 2/5
----------------------------------------
  ‚úÖ F1: 0.5809 | ROC-AUC: 0.8278 | PR-AUC: 0.5963
  Precision: 0.4996 | Recall: 0.6938 | Threshold: 0.591

----------------------------------------
Fold 3/5
----------------------------------------
  ‚úÖ F1: 0.6022 | ROC-AUC: 0.8456 | PR-AUC: 0.6000
  Precision: 0.5071 | Recall: 0.7412 | Threshold: 0.546

----------------------------------------
Fold 4/5
----------------------------------------
  ‚úÖ F1: 0.6149 | ROC-AUC: 0.8442 | PR-AUC: 0.6112
  Precision: 0.5529 | Recall: 0.6926 | Threshold: 0.604


# global OOF threshold optimization?

In [7]:
# ============================================================================
# STEP 2B: GLOBAL OOF THRESHOLD OPTIMIZATION (FIND SINGLE BEST THRESHOLD)
# ============================================================================

print("\n" + "="*90)
print("GLOBAL OOF THRESHOLD OPTIMIZATION")
print("="*90)

# To compute global threshold, we need to collect OOF predictions and true labels
oof_probs = np.zeros_like(y_all, dtype=float)  # placeholder for out-of-fold probabilities
oof_true = np.array(y_all)

# Second pass through folds to generate OOF probabilities
for fold, (train_idx, val_idx) in enumerate(skf.split(X_all, y_all), 1):
    X_train_fold, X_val_fold = X_all.iloc[train_idx], X_all.iloc[val_idx]
    y_train_fold = y_all.iloc[train_idx]

    model_fold = lgb.LGBMClassifier(**best_lgbm_params)
    model_fold.fit(X_train_fold, y_train_fold)
    oof_probs[val_idx] = model_fold.predict_proba(X_val_fold)[:, 1]

# Now, search globally for the threshold that maximizes F1 on all OOF predictions
thresholds = np.arange(0.25, 0.76, 0.01)
best_thresh_global = 0.5
best_f1_global = 0

for t in thresholds:
    preds = (oof_probs >= t).astype(int)
    f1 = f1_score(oof_true, preds)
    if f1 > best_f1_global:
        best_f1_global = f1
        best_thresh_global = t

print(f"‚úÖ Best Global Threshold: {best_thresh_global:.3f}")
print(f"‚úÖ Best Global F1 (OOF): {best_f1_global:.4f}")

# For reference, compare it to your average fold threshold
print(f"\nAverage Fold Threshold: {avg_optimal_threshold:.3f}")
print(f"Difference: {best_thresh_global - avg_optimal_threshold:+.3f}")

# Decide final threshold (global one is more robust)
final_threshold = best_thresh_global

print(f"\nüéØ Using Final Threshold = {final_threshold:.3f} (based on global OOF optimization)")
print(f"{'='*90}")



GLOBAL OOF THRESHOLD OPTIMIZATION
‚úÖ Best Global Threshold: 0.560
‚úÖ Best Global F1 (OOF): 0.5947

Average Fold Threshold: 0.582
Difference: -0.022

üéØ Using Final Threshold = 0.560 (based on global OOF optimization)


# predicting


In [8]:
# ============================================================================
# STEP 3: FINAL MODEL - TRAINING ON 100% DATA
# ============================================================================

print("\n" + "="*90)
print("STEP 3: FINAL MODEL - TRAINING ON 100% DATA")
print("="*90)

print(f"\nTraining final model on full dataset: {X_all.shape[0]} samples...")
print("Using best parameters from Optuna (via CV tuning)")
print(f"Using final threshold from GLOBAL OOF optimization: {final_threshold:.3f}")

# Ensure columns are strings for LightGBM consistency
X_all.columns = X_all.columns.astype(str)
X_test_all.columns = X_test_all.columns.astype(str)

# ============================================================================
# Train final LightGBM model on all available data
# ============================================================================
lgb_final_model = lgb.LGBMClassifier(**best_lgbm_params)
lgb_final_model.fit(X_all, y_all)

print("‚úì Final model successfully trained on 100% of data.")

# ============================================================================
# Generate predictions on test data
# ============================================================================
print("\nGenerating final predictions on test set...")

# Make sure test columns align with training
X_test_all = X_test_all[X_all.columns]

# Get predicted probabilities
test_probas = lgb_final_model.predict_proba(X_test_all)[:, 1]

# Apply the globally optimized final threshold
test_predictions_final = (test_probas >= final_threshold).astype(int)

print(f"‚úì Test predictions generated using final threshold = {final_threshold:.3f}")

# ============================================================================
# Create and save submission file
# ============================================================================
submission_df = pd.DataFrame({
    'claim_number': test_ids,
    'subrogation': test_predictions_final
})

submission_filename = 'submission_final_global_thresh.csv'
submission_df.to_csv(submission_filename, index=False)

print("\n" + "="*90)
print("SUBMISSION FILE CREATED")
print("="*90)
print(f"‚úì File saved as: {submission_filename}")

# ============================================================================
# Distribution and sanity check
# ============================================================================
print("\nFinal prediction distribution:")
print(submission_df['subrogation'].value_counts(normalize=True).to_string())
print(f"\nTotal positive predictions: {submission_df['subrogation'].sum()} / {len(submission_df)}")

# Optional: Auto-download in Colab environment
try:
    import google.colab
    from google.colab import files
    files.download(submission_filename)
    print(f"\n‚úì Downloading {submission_filename}...")
except ImportError:
    print(f"\n‚úì Script complete. Find your file at: {submission_filename}")



STEP 3: FINAL MODEL - TRAINING ON 100% DATA

Training final model on full dataset: 17999 samples...
Using best parameters from Optuna (via CV tuning)
Using final threshold from GLOBAL OOF optimization: 0.560
‚úì Final model successfully trained on 100% of data.

Generating final predictions on test set...
‚úì Test predictions generated using final threshold = 0.560

SUBMISSION FILE CREATED
‚úì File saved as: submission_final_global_thresh.csv

Final prediction distribution:
subrogation
0   0.674
1   0.326

Total positive predictions: 3908 / 12000


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>


‚úì Downloading submission_final_global_thresh.csv...
