# Import Libraries

In [None]:
import os, random

# Single seed shared by every random number generator seeded in this notebook.
SEED = 123
# NOTE(review): Python reads PYTHONHASHSEED at interpreter start-up, so setting it
# here presumably only affects child processes, not hashing in this kernel — confirm.
os.environ["PYTHONHASHSEED"] = str(SEED)
random.seed(SEED)

import numpy as np
np.random.seed(SEED)

import pandas as pd
pd.set_option('display.max_columns', None)

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectFromModel

In [None]:
# Single place to point the notebook at the data folder (Google Drive mounted in Colab).
DATA_DIR = '/content/drive/MyDrive/Colab Notebooks/2025-travelers-umc'

train_set = pd.read_csv(os.path.join(DATA_DIR, 'Training_TriGuard.csv'))
# Drop rows whose target label (subrogation) is missing.
train_set = train_set.dropna(subset=['subrogation'])
test_set = pd.read_csv(os.path.join(DATA_DIR, 'Testing_TriGuard.csv'))

# EDA

### Training Set

In [None]:
# Summary statistics for every training column, numeric and categorical alike.
train_set.describe(include='all')

Unnamed: 0,subrogation,claim_number,year_of_born,gender,email_or_tel_available,safety_rating,annual_income,high_education_ind,address_change_ind,living_status,zip_code,claim_date,claim_day_of_week,accident_site,past_num_of_claims,witness_present_ind,liab_prct,channel,policy_report_filed_ind,claim_est_payout,vehicle_made_year,vehicle_category,vehicle_price,vehicle_color,vehicle_weight,age_of_DL,accident_type,in_network_bodyshop,vehicle_mileage
count,17999.0,18001.0,18000.0,18000,18000.0,18000.0,18000.0,18000.0,18000.0,18000,18000.0,18000,18000,18000,18000.0,18000,18000.0,18000,18000.0,18000.0,18000.0,18000,18000.0,18000,18000.0,18000.0,18000,18000,18000.0
unique,,,,2,,,,,,2,,731,7,4,,2,,3,,,,3,,7,,,3,2,
top,,,,M,,,,,,Own,,11/30/2015,Monday,Local,,N,,Broker,,,,Large,,black,,,multi_vehicle_unclear,yes,
freq,,,,9273,,,,,,9879,,49,2657,4745,,10621,,9573,,,,6090,,2702,,,6555,13039,
mean,0.228624,5506903.0,1980.7705,,0.705833,74.641833,54587.451778,0.693444,0.592056,,49767.263556,,,,3.634611,,38.278278,,0.604444,3602.507022,2022.435667,,30657.923288,,22842.168655,22.051333,,,80056.858111
std,0.419958,2580766.0,12.198355,,0.45568,14.566564,26287.598879,0.461076,0.491466,,29099.666613,,,,5.603418,,9.802537,,0.488983,3609.644983,3.903423,,20636.599225,,11857.504523,3.359,,,30064.396369
min,0.0,0.0,1774.0,,0.0,2.0,30015.0,0.0,0.0,,0.0,,,,0.0,,0.0,,0.0,65.75,2003.0,,15000.0,,2450.138635,16.0,,,500.0
25%,0.0,3303928.0,1973.0,,0.0,65.0,32193.5,0.0,0.0,,20111.0,,,,0.0,,32.0,,0.0,1321.8075,2021.0,,15000.0,,14028.319375,20.0,,,59587.0
50%,0.0,5522119.0,1982.0,,1.0,75.0,46383.5,1.0,1.0,,50027.0,,,,0.0,,38.0,,1.0,2414.37,2023.0,,19637.23188,,20672.207015,22.0,,,79765.0
75%,0.0,7707404.0,1990.0,,1.0,85.0,67397.0,1.0,1.0,,80040.0,,,,6.0,,45.0,,1.0,4519.3275,2025.0,,42609.41713,,29321.49859,24.0,,,100325.75


In [None]:
# Number of missing values in each training column.
train_set.isna().sum()

Unnamed: 0,0
subrogation,2
claim_number,0
year_of_born,1
gender,1
email_or_tel_available,1
safety_rating,1
annual_income,1
high_education_ind,1
address_change_ind,1
living_status,1


In [None]:
# Keep only rows that carry a target label. The load cell already applies the
# same filter, so this is a no-op safety net when the notebook runs top to bottom.
has_label = train_set['subrogation'].notna()
train_set = train_set[has_label]
# Sanity check: this selection should display zero rows.
train_set.loc[train_set['subrogation'].isna()]

Unnamed: 0,subrogation,claim_number,year_of_born,gender,email_or_tel_available,safety_rating,annual_income,high_education_ind,address_change_ind,living_status,zip_code,claim_date,claim_day_of_week,accident_site,past_num_of_claims,witness_present_ind,liab_prct,channel,policy_report_filed_ind,claim_est_payout,vehicle_made_year,vehicle_category,vehicle_price,vehicle_color,vehicle_weight,age_of_DL,accident_type,in_network_bodyshop,vehicle_mileage


In [None]:
# Show every training row that still has at least one missing value.
train_set[train_set.isna().any(axis=1)]

Unnamed: 0,subrogation,claim_number,year_of_born,gender,email_or_tel_available,safety_rating,annual_income,high_education_ind,address_change_ind,living_status,zip_code,claim_date,claim_day_of_week,accident_site,past_num_of_claims,witness_present_ind,liab_prct,channel,policy_report_filed_ind,claim_est_payout,vehicle_made_year,vehicle_category,vehicle_price,vehicle_color,vehicle_weight,age_of_DL,accident_type,in_network_bodyshop,vehicle_mileage


In [None]:
# EDA scratch frame: a copy of the training data with the claim month appended,
# so exploratory columns never end up on train_set itself.
train_set_eda = train_set.assign(
    month=pd.to_datetime(train_set['claim_date']).dt.month
)
# Target counts per calendar month.
pd.crosstab(index=train_set_eda['subrogation'], columns=train_set_eda['month'])

month,1,2,3,4,5,6,7,8,9,10,11,12
subrogation,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0.0,1209,1131,1274,1105,1124,1160,1210,1245,1097,1118,1091,1120
1.0,371,350,372,322,329,335,327,390,304,312,338,365


In [None]:
# Add the claim year to the EDA frame and show target counts per year.
train_set_eda['year'] = pd.to_datetime(train_set_eda['claim_date']).dt.year
pd.crosstab(train_set_eda['subrogation'], train_set_eda['year'])

year,2015,2016
subrogation,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0,7101,6783
1.0,2102,2013


In [None]:
pip install pyzipcode

Collecting pyzipcode
  Downloading pyzipcode-3.0.1.tar.gz (1.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m23.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyzipcode
  Building wheel for pyzipcode (setup.py) ... [?25l[?25hdone
  Created wheel for pyzipcode: filename=pyzipcode-3.0.1-py3-none-any.whl size=1932159 sha256=0890aafda131945157302f2acf6ae9c8db53d1511d6a36f0544c40a7dea6cdf0
  Stored in directory: /root/.cache/pip/wheels/5b/97/ba/42f05e3b732741d430f2c1c27e15f44b6b784aa3903efb4260
Successfully built pyzipcode
Installing collected packages: pyzipcode
Successfully installed pyzipcode-3.0.1


In [None]:
from pyzipcode import ZipCodeDatabase
zcdb = ZipCodeDatabase()

def get_state(zip_code):
  """Return the state for ``zip_code`` from ``zcdb``, or ``None`` if unavailable.

  The value is converted with ``int`` and left-padded with zeros to five
  characters before the lookup, so a numeric ZIP that lost its leading zeros
  (e.g. ``2134``) is looked up as ``'02134'``.
  """
  try:
    zip_code = str(int(zip_code)).zfill(5)
    return zcdb[zip_code].state
  except Exception:
    # Best-effort lookup: values that cannot be converted (NaN, None, text) and
    # failed database lookups map to None. Catching Exception instead of using
    # a bare ``except`` lets KeyboardInterrupt/SystemExit propagate.
    return None

# Look up a state for every ZIP code (None when get_state cannot resolve it),
# then show target counts per state.
# NOTE(review): the saved output of this cell is a NameError for train_set_eda,
# so it was last executed before the cell that creates train_set_eda.
train_set_eda['state'] = train_set_eda['zip_code'].apply(get_state)
pd.crosstab(train_set_eda['subrogation'], train_set_eda['state'])

NameError: name 'train_set_eda' is not defined

In [None]:
# Target counts per vehicle category.
pd.crosstab(train_set_eda['subrogation'], train_set_eda['vehicle_category'])

vehicle_category,Compact,Large,Medium
subrogation,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0.0,4570,4764,4550
1.0,1413,1326,1376


In [None]:
# Count repeated claim numbers; 0 means claim_number is unique per training row.
train_set_eda['claim_number'].duplicated().sum()

np.int64(0)

In [None]:
# Show every training row that still has at least one missing value.
train_set[train_set.isna().any(axis=1)]

Unnamed: 0,subrogation,claim_number,year_of_born,gender,email_or_tel_available,safety_rating,annual_income,high_education_ind,address_change_ind,living_status,zip_code,claim_date,claim_day_of_week,accident_site,past_num_of_claims,witness_present_ind,liab_prct,channel,policy_report_filed_ind,claim_est_payout,vehicle_made_year,vehicle_category,vehicle_price,vehicle_color,vehicle_weight,age_of_DL,accident_type,in_network_bodyshop,vehicle_mileage


### Testing Set

In [None]:
# Summary statistics for every test column, numeric and categorical alike.
test_set.describe(include='all')

Unnamed: 0,claim_number,year_of_born,gender,email_or_tel_available,safety_rating,annual_income,high_education_ind,address_change_ind,living_status,zip_code,claim_date,claim_day_of_week,accident_site,past_num_of_claims,witness_present_ind,liab_prct,channel,policy_report_filed_ind,claim_est_payout,vehicle_made_year,vehicle_category,vehicle_price,vehicle_color,vehicle_weight,age_of_DL,accident_type,in_network_bodyshop,vehicle_mileage
count,12000.0,12000.0,12000,12000.0,12000.0,12000.0,12000.0,12000.0,12000,12000.0,12000,12000,12000,12000.0,12000,12000.0,12000,12000.0,12000.0,12000.0,12000,12000.0,12000,12000.0,12000.0,12000,12000,12000.0
unique,,,2,,,,,,2,,731,7,4,,2,,3,,,,3,,7,,,3,2,
top,,,M,,,,,,Own,,4/17/2015,Sunday,Highway/Intersection,,N,,Broker,,,,Compact,,white,,,multi_vehicle_unclear,yes,
freq,,,6264,,,,,,6623,,36,1767,3134,,7101,,6501,,,,4046,,1794,,,4434,8633,
mean,5508947.0,1980.932,,0.707583,74.615917,55112.173333,0.694167,0.584083,,50118.309833,,,,3.661083,,38.3315,,0.607583,3636.771014,2022.427833,,30422.497617,,22986.40628,22.020333,,,80203.440833
std,2576067.0,11.830499,,0.454892,14.330958,26243.687262,0.460779,0.4929,,29147.763315,,,,5.60812,,9.926034,,0.488309,3658.682182,3.935003,,20688.71254,,11946.348839,3.358377,,,29576.866211
min,1000385.0,1929.0,,0.0,18.0,30015.0,0.0,0.0,,0.0,,,,0.0,,7.0,,0.0,58.29,2003.0,,15000.0,,2591.881965,16.0,,,500.0
25%,3320402.0,1973.0,,0.0,65.0,32606.75,0.0,0.0,,20111.0,,,,0.0,,31.0,,0.0,1337.94,2021.0,,15000.0,,14090.72966,20.0,,,60280.75
50%,5541186.0,1982.0,,1.0,75.0,47436.5,1.0,1.0,,50029.0,,,,0.0,,38.0,,1.0,2415.825,2023.0,,18793.764395,,20861.55842,22.0,,,80116.0
75%,7710995.0,1990.0,,1.0,85.0,68431.25,1.0,1.0,,80040.0,,,,6.0,,45.0,,1.0,4502.47,2025.0,,41928.38379,,29287.666148,24.0,,,100170.25


In [None]:
# Number of missing values in each test column.
test_set.isna().sum()

Unnamed: 0,0
claim_number,0
year_of_born,0
gender,0
email_or_tel_available,0
safety_rating,0
annual_income,0
high_education_ind,0
address_change_ind,0
living_status,0
zip_code,0


In [None]:
# Count repeated claim numbers; 0 means claim_number is unique per test row.
test_set['claim_number'].duplicated().sum()

np.int64(0)

# Feature Engineering

## Cecillia's version 1

In [None]:
def feature_engineer(df):
    """Feature engineering WITHOUT vehicle_made_year/vehicle_age (data quality issues).

    Works on a copy of ``df`` and appends temporal, interaction, polynomial,
    bucket, ratio, composite-score and rule-based flag columns derived from
    the raw claim fields.

    Args:
        df: Claims frame containing the raw columns referenced below
            (``claim_date``, ``liab_prct``, ``witness_present_ind``, ...).

    Returns:
        A new DataFrame with every original column plus the engineered ones.
        ``claim_date`` is converted to datetime (unparseable values become NaT).

    Note:
        The ``*_z`` and ``*_percentile`` columns are computed from the
        statistics of the frame passed in, so calling this separately on the
        training and test frames scales each with its own mean/std and
        quantile edges.
    """
    df_fe = df.copy()

    # ========================================================================
    # TEMPORAL FEATURES
    # ========================================================================
    df_fe['claim_date'] = pd.to_datetime(df_fe['claim_date'], errors='coerce')
    df_fe['claim_year'] = df_fe['claim_date'].dt.year
    df_fe['claim_month'] = df_fe['claim_date'].dt.month
    df_fe['claim_day'] = df_fe['claim_date'].dt.day
    df_fe['claim_quarter'] = df_fe['claim_date'].dt.quarter
    df_fe['is_weekend'] = df_fe['claim_day_of_week'].isin(['Saturday', 'Sunday']).astype(int)

    season_map = {
        3: 'Spring', 4: 'Spring', 5: 'Spring',
        6: 'Summer', 7: 'Summer', 8: 'Summer',
        9: 'Fall', 10: 'Fall', 11: 'Fall',
        12: 'Winter', 1: 'Winter', 2: 'Winter'
    }
    # Months that are missing (unparseable dates) fall back to 'Unknown'.
    df_fe['season'] = df_fe['claim_month'].map(season_map).fillna('Unknown')

    # ========================================================================
    # BINARY CONVERSIONS (for interactions)
    # ========================================================================
    df_fe['witness_binary'] = (df_fe['witness_present_ind'] == 'Y').astype(int)
    # policy_report_filed_ind is used as-is (no conversion applied here).
    df_fe['police_binary'] = df_fe['policy_report_filed_ind']
    df_fe['multicar_binary'] = df_fe['accident_type'].isin(['multi_vehicle_clear', 'multi_vehicle_unclear']).astype(int)
    df_fe['highrisk_site_binary'] = df_fe['accident_site'].isin(['Highway/Intersection', 'Local']).astype(int)

    # ========================================================================
    # CRITICAL INTERACTION FEATURES (2-way)
    # ========================================================================
    df_fe['liab_x_witness'] = df_fe['liab_prct'] * df_fe['witness_binary']
    df_fe['liab_x_police'] = df_fe['liab_prct'] * df_fe['police_binary']
    df_fe['liab_x_multicar'] = df_fe['liab_prct'] * df_fe['multicar_binary']
    df_fe['liab_x_highrisk_site'] = df_fe['liab_prct'] * df_fe['highrisk_site_binary']
    df_fe['liab_x_evidence'] = df_fe['liab_prct'] * (df_fe['witness_binary'] + df_fe['police_binary'])
    # Despite the "_x_" name this is a ratio: liability divided by (1 + payout).
    df_fe['liab_x_payout'] = df_fe['liab_prct'] / (1 + df_fe['claim_est_payout'])
    df_fe['liab_x_mileage'] = df_fe['liab_prct'] * df_fe['vehicle_mileage']

    df_fe['witness_x_police'] = df_fe['witness_binary'] * df_fe['police_binary']
    df_fe['witness_x_multicar'] = df_fe['witness_binary'] * df_fe['multicar_binary']
    df_fe['police_x_multicar'] = df_fe['police_binary'] * df_fe['multicar_binary']
    df_fe['multicar_x_highrisk'] = df_fe['multicar_binary'] * df_fe['highrisk_site_binary']
    df_fe['weekend_highway'] = (df_fe['is_weekend']) * (df_fe['highrisk_site_binary'])

    # 3-way interaction
    df_fe['witness_police_multicar'] = df_fe['witness_binary'] * df_fe['police_binary'] * df_fe['multicar_binary']

    # ========================================================================
    # POLYNOMIAL FEATURES (liability & key variables)
    # ========================================================================
    df_fe['liab_prct_squared'] = df_fe['liab_prct'] ** 2
    df_fe['liab_prct_cubed'] = df_fe['liab_prct'] ** 3
    df_fe['liab_prct_sqrt'] = np.sqrt(df_fe['liab_prct'])
    df_fe['liab_prct_log'] = np.log1p(df_fe['liab_prct'])
    df_fe['liab_inverse'] = 100 - df_fe['liab_prct']
    df_fe['liab_inverse_squared'] = (100 - df_fe['liab_prct']) ** 2

    df_fe['log_claim_est_payout'] = np.log1p(df_fe['claim_est_payout'])
    df_fe['log_vehicle_mileage'] = np.log1p(df_fe['vehicle_mileage'])
    df_fe['log_vehicle_price'] = np.log1p(df_fe['vehicle_price'])
    df_fe['log_annual_income'] = np.log1p(df_fe['annual_income'])
    df_fe['sqrt_vehicle_mileage'] = np.sqrt(df_fe['vehicle_mileage'])

    # ========================================================================
    # ACCIDENT TYPE FEATURES
    # ========================================================================
    df_fe['is_multi_vehicle_clear'] = (df_fe['accident_type'] == 'multi_vehicle_clear').astype(int)
    df_fe['is_multi_vehicle_unclear'] = (df_fe['accident_type'] == 'multi_vehicle_unclear').astype(int)
    df_fe['is_single_car'] = (df_fe['accident_type'] == 'single_car').astype(int)
    df_fe['has_recovery_target'] = df_fe['multicar_binary']

    # Ordinal clarity: 3 = multi-vehicle clear, 1 = multi-vehicle unclear, 0 = otherwise.
    df_fe['recovery_case_clarity'] = 0
    df_fe.loc[df_fe['is_multi_vehicle_clear'] == 1, 'recovery_case_clarity'] = 3
    df_fe.loc[df_fe['is_multi_vehicle_unclear'] == 1, 'recovery_case_clarity'] = 1

    # ========================================================================
    # LIABILITY BUCKETS (fine-grained)
    # ========================================================================
    df_fe['liab_under_10'] = (df_fe['liab_prct'] < 10).astype(int)
    df_fe['liab_10_to_15'] = ((df_fe['liab_prct'] >= 10) & (df_fe['liab_prct'] < 15)).astype(int)
    df_fe['liab_15_to_20'] = ((df_fe['liab_prct'] >= 15) & (df_fe['liab_prct'] < 20)).astype(int)
    df_fe['liab_20_to_25'] = ((df_fe['liab_prct'] >= 20) & (df_fe['liab_prct'] < 25)).astype(int)
    df_fe['liab_25_to_30'] = ((df_fe['liab_prct'] >= 25) & (df_fe['liab_prct'] < 30)).astype(int)
    df_fe['liab_30_to_35'] = ((df_fe['liab_prct'] >= 30) & (df_fe['liab_prct'] < 35)).astype(int)
    df_fe['liab_35_to_40'] = ((df_fe['liab_prct'] >= 35) & (df_fe['liab_prct'] < 40)).astype(int)
    df_fe['liab_40_to_50'] = ((df_fe['liab_prct'] >= 40) & (df_fe['liab_prct'] < 50)).astype(int)
    df_fe['liab_over_50'] = (df_fe['liab_prct'] >= 50).astype(int)

    df_fe['not_at_fault'] = df_fe['liab_under_10']
    df_fe['minimal_fault'] = (df_fe['liab_prct'] < 25).astype(int)
    df_fe['low_fault'] = (df_fe['liab_prct'] < 35).astype(int)
    df_fe['shared_fault'] = ((df_fe['liab_prct'] >= 35) & (df_fe['liab_prct'] < 50)).astype(int)
    df_fe['high_fault'] = (df_fe['liab_prct'] >= 50).astype(int)

    # ========================================================================
    # EVIDENCE QUALITY FEATURES
    # ========================================================================
    df_fe['witness_present'] = df_fe['witness_binary']
    df_fe['police_report'] = df_fe['police_binary']

    df_fe['evidence_none'] = ((df_fe['witness_present'] == 0) & (df_fe['police_report'] == 0)).astype(int)
    # Exactly one of the two evidence sources is present.
    df_fe['evidence_weak'] = (((df_fe['witness_present'] == 1) & (df_fe['police_report'] == 0)) |
                              ((df_fe['witness_present'] == 0) & (df_fe['police_report'] == 1))).astype(int)
    df_fe['evidence_strong'] = ((df_fe['witness_present'] == 1) & (df_fe['police_report'] == 1)).astype(int)
    df_fe['evidence_very_strong'] = ((df_fe['witness_present'] == 1) & (df_fe['police_report'] == 1) &
                                      (df_fe['liab_prct'] < 20)).astype(int)
    df_fe['evidence_score'] = df_fe['witness_present'] + df_fe['police_report']

    # ========================================================================
    # ACCIDENT SITE FEATURES
    # ========================================================================
    df_fe['high_risk_site'] = df_fe['highrisk_site_binary']
    df_fe['parking_accident'] = (df_fe['accident_site'] == 'Parking Area').astype(int)
    df_fe['unknown_site'] = (df_fe['accident_site'] == 'Unknown').astype(int)
    df_fe['highway_accident'] = (df_fe['accident_site'] == 'Highway/Intersection').astype(int)
    df_fe['local_accident'] = (df_fe['accident_site'] == 'Local').astype(int)

    # ========================================================================
    # DRIVER AGE & EXPERIENCE
    # ========================================================================
    df_fe['driver_age'] = df_fe['claim_year'] - df_fe['year_of_born']

    df_fe['young_driver'] = (df_fe['driver_age'] <= 25).astype(int)
    df_fe['prime_driver'] = ((df_fe['driver_age'] > 25) & (df_fe['driver_age'] <= 45)).astype(int)
    df_fe['middle_age_driver'] = ((df_fe['driver_age'] > 45) & (df_fe['driver_age'] <= 65)).astype(int)
    df_fe['senior_driver'] = (df_fe['driver_age'] > 65).astype(int)

    # clip(lower=0) floors negative experience at 0, so no further handling of
    # negative values is needed afterwards.
    df_fe['driving_experience'] = (df_fe['driver_age'] - df_fe['age_of_DL']).clip(lower=0)

    df_fe['novice_driver'] = (df_fe['driving_experience'] < 3).astype(int)
    df_fe['experienced_driver'] = ((df_fe['driving_experience'] >= 3) & (df_fe['driving_experience'] <= 10)).astype(int)
    df_fe['veteran_driver'] = (df_fe['driving_experience'] > 10).astype(int)

    df_fe['experience_x_safety'] = df_fe['driving_experience'] * df_fe['safety_rating']
    df_fe['driver_age_x_safety'] = df_fe['driver_age'] * df_fe['safety_rating']

    # ========================================================================
    # VEHICLE FEATURES (without vehicle_age)
    # ========================================================================
    df_fe['luxury_vehicle'] = (df_fe['vehicle_price'] > 50000).astype(int)
    df_fe['mid_price_vehicle'] = ((df_fe['vehicle_price'] >= 20000) & (df_fe['vehicle_price'] <= 50000)).astype(int)
    df_fe['economy_vehicle'] = (df_fe['vehicle_price'] < 20000).astype(int)

    df_fe['heavy_vehicle'] = (df_fe['vehicle_weight'] > 30000).astype(int)
    df_fe['light_vehicle'] = (df_fe['vehicle_weight'] < 15000).astype(int)
    df_fe['medium_weight'] = ((df_fe['vehicle_weight'] >= 15000) & (df_fe['vehicle_weight'] <= 30000)).astype(int)

    df_fe['is_large_vehicle'] = (df_fe['vehicle_category'] == 'Large').astype(int)
    df_fe['is_compact_vehicle'] = (df_fe['vehicle_category'] == 'Compact').astype(int)
    df_fe['is_medium_vehicle'] = (df_fe['vehicle_category'] == 'Medium').astype(int)

    # ========================================================================
    # CLAIM CHARACTERISTICS
    # ========================================================================
    df_fe['high_mileage'] = (df_fe['vehicle_mileage'] > 100000).astype(int)
    df_fe['low_mileage'] = (df_fe['vehicle_mileage'] < 50000).astype(int)
    df_fe['very_high_mileage'] = (df_fe['vehicle_mileage'] > 150000).astype(int)
    df_fe['medium_mileage'] = ((df_fe['vehicle_mileage'] >= 50000) & (df_fe['vehicle_mileage'] <= 100000)).astype(int)

    df_fe['frequent_claimer'] = (df_fe['past_num_of_claims'] > 5).astype(int)
    df_fe['moderate_claimer'] = ((df_fe['past_num_of_claims'] >= 1) & (df_fe['past_num_of_claims'] <= 5)).astype(int)
    df_fe['first_time_claimer'] = (df_fe['past_num_of_claims'] == 0).astype(int)
    df_fe['very_frequent_claimer'] = (df_fe['past_num_of_claims'] > 10).astype(int)

    df_fe['large_payout'] = (df_fe['claim_est_payout'] > 5000).astype(int)
    df_fe['medium_payout'] = ((df_fe['claim_est_payout'] >= 2000) & (df_fe['claim_est_payout'] <= 5000)).astype(int)
    df_fe['small_payout'] = (df_fe['claim_est_payout'] < 2000).astype(int)
    df_fe['very_large_payout'] = (df_fe['claim_est_payout'] > 8000).astype(int)

    df_fe['safety_x_prior_claims'] = df_fe['safety_rating'] / (1 + df_fe['past_num_of_claims'])
    df_fe['mileage_x_claims'] = df_fe['vehicle_mileage'] * df_fe['past_num_of_claims']

    # ========================================================================
    # RATIO FEATURES
    # ========================================================================
    # The "+ 1" in each denominator guards against division by zero.
    df_fe['payout_to_price_ratio'] = df_fe['claim_est_payout'] / (df_fe['vehicle_price'] + 1)
    df_fe['severe_damage'] = (df_fe['payout_to_price_ratio'] > 0.3).astype(int)
    df_fe['moderate_damage'] = ((df_fe['payout_to_price_ratio'] >= 0.1) & (df_fe['payout_to_price_ratio'] <= 0.3)).astype(int)
    df_fe['minor_damage'] = (df_fe['payout_to_price_ratio'] < 0.1).astype(int)

    df_fe['income_to_vehicle_price'] = df_fe['annual_income'] / (df_fe['vehicle_price'] + 1)
    df_fe['can_afford_vehicle'] = (df_fe['income_to_vehicle_price'] >= 0.5).astype(int)
    df_fe['expensive_for_income'] = (df_fe['income_to_vehicle_price'] < 0.3).astype(int)

    df_fe['claims_per_year_driving'] = df_fe['past_num_of_claims'] / (df_fe['driving_experience'] + 1)
    df_fe['claim_frequency_high'] = (df_fe['claims_per_year_driving'] > 0.5).astype(int)

    df_fe['safety_to_liability'] = df_fe['safety_rating'] / (df_fe['liab_prct'] + 1)
    df_fe['payout_to_income'] = df_fe['claim_est_payout'] / (df_fe['annual_income'] + 1)
    df_fe['mileage_to_price'] = df_fe['vehicle_mileage'] / (df_fe['vehicle_price'] + 1)
    df_fe['weight_to_price'] = df_fe['vehicle_weight'] / (df_fe['vehicle_price'] + 1)

    # ========================================================================
    # POLICYHOLDER CHARACTERISTICS
    # ========================================================================
    df_fe['high_income'] = (df_fe['annual_income'] > 70000).astype(int)
    df_fe['mid_income'] = ((df_fe['annual_income'] >= 40000) & (df_fe['annual_income'] <= 70000)).astype(int)
    df_fe['low_income'] = (df_fe['annual_income'] < 40000).astype(int)
    df_fe['very_high_income'] = (df_fe['annual_income'] > 100000).astype(int)

    df_fe['high_safety_rating'] = (df_fe['safety_rating'] > 80).astype(int)
    df_fe['low_safety_rating'] = (df_fe['safety_rating'] < 60).astype(int)
    df_fe['very_high_safety'] = (df_fe['safety_rating'] > 90).astype(int)
    df_fe['medium_safety'] = ((df_fe['safety_rating'] >= 60) & (df_fe['safety_rating'] <= 80)).astype(int)

    df_fe['contact_available'] = df_fe['email_or_tel_available']
    df_fe['has_education'] = df_fe['high_education_ind']
    df_fe['recent_move'] = df_fe['address_change_ind']
    df_fe['home_owner'] = (df_fe['living_status'] == 'Own').astype(int)
    df_fe['renter'] = (df_fe['living_status'] == 'Rent').astype(int)
    df_fe['female'] = (df_fe['gender'] == 'F').astype(int)

    # ========================================================================
    # CHANNEL FEATURES
    # ========================================================================
    df_fe['via_broker'] = (df_fe['channel'] == 'Broker').astype(int)
    df_fe['via_online'] = (df_fe['channel'] == 'Online').astype(int)
    df_fe['via_phone'] = (df_fe['channel'] == 'Phone').astype(int)
    df_fe['in_network_repair'] = (df_fe['in_network_bodyshop'] == 'yes').astype(int)
    df_fe['out_network_repair'] = (df_fe['in_network_bodyshop'] == 'no').astype(int)

    # ========================================================================
    # COMPOSITE RECOVERY SCORE
    # ========================================================================
    # Each component is a local Series; only the weighted sum is stored as a column.
    liability_score = np.sqrt((100 - df_fe['liab_prct']) / 100.0)
    evidence_score = (df_fe['evidence_none'] * 0.0 + df_fe['evidence_weak'] * 0.4 +
                      df_fe['evidence_strong'] * 0.7 + df_fe['evidence_very_strong'] * 1.0)
    clarity_score = df_fe['recovery_case_clarity'] / 3.0
    site_score = df_fe['high_risk_site'] * 0.7 + (1 - df_fe['unknown_site']) * 0.3

    df_fe['recovery_feasibility_score'] = (0.35 * liability_score + 0.30 * df_fe['has_recovery_target'] +
                                           0.20 * evidence_score + 0.10 * clarity_score + 0.05 * site_score)

    # ========================================================================
    # DOMAIN LOGIC FLAGS (CRITICAL FOR F1)
    # ========================================================================
    df_fe['perfect_case'] = ((df_fe['liab_prct'] < 15) & (df_fe['witness_present'] == 1) &
                             (df_fe['police_report'] == 1) & (df_fe['has_recovery_target'] == 1)).astype(int)

    df_fe['strong_case'] = ((df_fe['liab_prct'] < 25) & (df_fe['evidence_strong'] == 1) &
                            (df_fe['has_recovery_target'] == 1)).astype(int)

    df_fe['good_case'] = ((df_fe['liab_prct'] < 35) & (df_fe['evidence_score'] >= 1) &
                          (df_fe['has_recovery_target'] == 1)).astype(int)

    df_fe['weak_case'] = ((df_fe['liab_prct'] > 40) | (df_fe['is_single_car'] == 1) |
                          (df_fe['evidence_none'] == 1)).astype(int)

    df_fe['no_case'] = ((df_fe['liab_prct'] > 60) | ((df_fe['is_single_car'] == 1) & (df_fe['evidence_none'] == 1))).astype(int)

    df_fe['high_value_opportunity'] = ((df_fe['claim_est_payout'] > 3000) & (df_fe['liab_prct'] < 30) &
                                       (df_fe['has_recovery_target'] == 1)).astype(int)

    df_fe['slam_dunk_case'] = ((df_fe['liab_prct'] < 10) & (df_fe['witness_present'] == 1) &
                               (df_fe['police_report'] == 1) & (df_fe['multicar_binary'] == 1) &
                               (df_fe['high_risk_site'] == 1)).astype(int)

    df_fe['low_liab_high_payout'] = ((df_fe['liab_prct'] < 20) & (df_fe['claim_est_payout'] > 5000)).astype(int)
    df_fe['clear_fault_case'] = ((df_fe['liab_prct'] < 15) & (df_fe['multicar_binary'] == 1)).astype(int)
    df_fe['high_mileage_low_fault'] = ((df_fe['vehicle_mileage'] > 100000) & (df_fe['liab_prct'] < 30)).astype(int)

    # --- Temporal & Behavior Dynamics ---
    df_fe['claim_early_in_year'] = (df_fe['claim_month'] <= 3).astype(int)
    df_fe['claim_end_of_year'] = (df_fe['claim_month'] >= 10).astype(int)
    df_fe['weekend_parking'] = df_fe['is_weekend'] * (df_fe['accident_site'] == 'Parking Area').astype(int)
    df_fe['winter_claim_high_payout'] = ((df_fe['season'] == 'Winter') & (df_fe['claim_est_payout'] > 5000)).astype(int)

    # --- Vehicle Utilization Proxies (without vehicle_age) ---
    df_fe['mileage_x_weight'] = df_fe['vehicle_mileage'] * df_fe['vehicle_weight']
    # Same formula as mileage_to_price above; kept so both column names exist.
    df_fe['mileage_per_dollar'] = df_fe['vehicle_mileage'] / (df_fe['vehicle_price'] + 1)
    df_fe['payout_to_weight'] = df_fe['claim_est_payout'] / (df_fe['vehicle_weight'] + 1)

    # --- Policyholder Risk Profile ---
    df_fe['unstable_policyholder'] = ((df_fe['recent_move'] == 1) & (df_fe['renter'] == 1)).astype(int)
    df_fe['financial_stress_risk'] = ((df_fe['expensive_for_income'] == 1) & (df_fe['large_payout'] == 1)).astype(int)
    df_fe['young_driver_highway'] = df_fe['young_driver'] * df_fe['highway_accident']
    df_fe['senior_driver_parking'] = df_fe['senior_driver'] * df_fe['parking_accident']

    # --- Liability & Evidence Interaction Insights ---
    df_fe['low_liab_weak_evidence'] = ((df_fe['liab_prct'] < 20) & (df_fe['evidence_weak'] == 1)).astype(int)
    df_fe['high_liab_strong_evidence'] = ((df_fe['liab_prct'] > 50) & (df_fe['evidence_strong'] == 1)).astype(int)

    # Composite confidence / case quality index
    df_fe['case_confidence_score'] = (
        0.4 * (100 - df_fe['liab_prct']) / 100 +
        0.4 * df_fe['evidence_score'] / 2 +
        0.2 * df_fe['recovery_case_clarity'] / 3
    )

    # --- Statistical Normalization & Percentile Features ---
    # Mean/std come from the frame being processed; 1e-9 avoids division by zero.
    for col in ['claim_est_payout', 'vehicle_mileage', 'annual_income']:
        df_fe[f'{col}_z'] = (df_fe[col] - df_fe[col].mean()) / (df_fe[col].std() + 1e-9)

    try:
        df_fe['liab_percentile'] = pd.qcut(df_fe['liab_prct'], 10, labels=False, duplicates='drop')
        df_fe['payout_percentile'] = pd.qcut(df_fe['claim_est_payout'], 10, labels=False, duplicates='drop')
    except Exception:
        # If there aren't enough unique values to bin
        df_fe['liab_percentile'] = np.nan
        df_fe['payout_percentile'] = np.nan

    # --- Aggregate / Hybrid Indices ---
    df_fe['case_strength_index'] = df_fe['evidence_score'] * (1 - df_fe['liab_prct'] / 100)
    df_fe['financial_exposure_index'] = (
        (df_fe['claim_est_payout'] / (df_fe['annual_income'] + 1)) * (1 + df_fe['liab_prct'] / 100)
    )
    df_fe['behavioral_risk_index'] = (
        df_fe['claims_per_year_driving'] * (100 - df_fe['safety_rating']) / 100
    )

    return df_fe

print("✓ Feature engineering function defined")

✓ Feature engineering function defined


In [None]:
# Run the feature-engineering function on the training split, then the test split.
train_fe, test_fe = (feature_engineer(split) for split in (train_set, test_set))

  df_fe['small_payout'] = (df_fe['claim_est_payout'] < 2000).astype(int)
  df_fe['very_large_payout'] = (df_fe['claim_est_payout'] > 8000).astype(int)
  df_fe['safety_x_prior_claims'] = df_fe['safety_rating'] / (1 + df_fe['past_num_of_claims'])
  df_fe['mileage_x_claims'] = df_fe['vehicle_mileage'] * df_fe['past_num_of_claims']
  df_fe['payout_to_price_ratio'] = df_fe['claim_est_payout'] / (df_fe['vehicle_price'] + 1)
  df_fe['severe_damage'] = (df_fe['payout_to_price_ratio'] > 0.3).astype(int)
  df_fe['moderate_damage'] = ((df_fe['payout_to_price_ratio'] >= 0.1) & (df_fe['payout_to_price_ratio'] <= 0.3)).astype(int)
  df_fe['minor_damage'] = (df_fe['payout_to_price_ratio'] < 0.1).astype(int)
  df_fe['income_to_vehicle_price'] = df_fe['annual_income'] / (df_fe['vehicle_price'] + 1)
  df_fe['can_afford_vehicle'] = (df_fe['income_to_vehicle_price'] >= 0.5).astype(int)
  df_fe['expensive_for_income'] = (df_fe['income_to_vehicle_price'] < 0.3).astype(int)
  df_fe['claims_per_year_driving']

## Carol's version

In [None]:
def carol_feature(df):
  """Return a copy of ``df`` with date, driver, log-scale and interaction features added.

  The input frame is left untouched; ``claim_date`` itself is not converted,
  only read to derive ``year`` and ``month``.
  """
  month_to_season = {
      12: 'Winter', 1: 'Winter', 2: 'Winter',
      3: 'Spring', 4: 'Spring', 5: 'Spring',
      6: 'Summer', 7: 'Summer', 8: 'Summer',
      9: 'Fall', 10: 'Fall', 11: 'Fall',
  }

  out = df.copy()

  # Calendar features: parse the claim date once and reuse it.
  claim_dates = pd.to_datetime(out['claim_date'])
  out['year'] = claim_dates.dt.year
  out['month'] = claim_dates.dt.month
  out['is_weekend'] = out['claim_day_of_week'].isin(['Saturday', 'Sunday']).astype(int)

  # Driver age at claim time and years since licensing (floored at 0).
  out['driver_age'] = out['year'] - out['year_of_born']
  years_licensed = out['driver_age'] - out['age_of_DL']
  out['driving_experience'] = years_licensed.clip(lower=0)

  out['season'] = out['month'].map(month_to_season)

  # Natural-log versions of the monetary / mileage columns.
  for raw_col in ('claim_est_payout', 'vehicle_mileage', 'vehicle_price'):
    out[f'log_{raw_col}'] = np.log(out[raw_col])

  # Ratio and interaction features.
  out['income_to_vehicle_price'] = out['annual_income'] / (1 + out['vehicle_price'])
  out['safety_x_prior_claims'] = out['safety_rating'] / (1 + out['past_num_of_claims'])
  out['experience_x_safety'] = out['driving_experience'] * out['safety_rating']
  witness_flag = out['witness_present_ind'].map({'Y':1, 'N':0})
  out['liab_x_witness'] = out['liab_prct'] * witness_flag

  return out

# Build Carol's feature set for the training split, then the test split.
# NOTE(review): this rebinds train_fe / test_fe, replacing the frames produced
# by feature_engineer in the earlier section.
train_fe, test_fe = (carol_feature(split) for split in (train_set, test_set))

## Cecillia's version 5

In [None]:
def feature_engineer(df):
    """Feature engineering WITHOUT vehicle_made_year/vehicle_age/state (data quality issues).

    Parameters
    ----------
    df : pandas.DataFrame
        Claims frame. The function reads these columns: ``claim_date``,
        ``year_of_born``, ``age_of_DL``, ``witness_present_ind``,
        ``policy_report_filed_ind``, ``accident_type``, ``accident_site``,
        ``liab_prct``, ``claim_est_payout``, ``vehicle_mileage``,
        ``vehicle_price``, ``vehicle_weight``, ``vehicle_category``,
        ``annual_income``, ``safety_rating``, ``past_num_of_claims``,
        ``email_or_tel_available``, ``high_education_ind``,
        ``address_change_ind``, ``living_status``, ``gender``, ``channel``
        and ``in_network_bodyshop``.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` (the input is not modified) in which ``claim_date``
        is converted to datetime, out-of-range ``year_of_born`` values are
        set to NaN, and the engineered columns are appended.

    Notes
    -----
    * The ``*_z`` columns and the ``*_percentile`` deciles are computed from
      the frame passed in, so separate calls on train and test use different
      statistics / bin edges.
    * Bucket flags built from comparisons evaluate to 0 when the underlying
      value is NaN (comparisons with NaN are False).
    """
    df_fe = df.copy()

    # ========================================================================
    # TEMPORAL FEATURES
    # ========================================================================
    # Unparseable dates become NaT (errors='coerce').
    df_fe['claim_date'] = pd.to_datetime(df_fe['claim_date'], errors='coerce')
    df_fe['claim_year'] = df_fe['claim_date'].dt.year
    df_fe['claim_month'] = df_fe['claim_date'].dt.month
    df_fe['claim_day'] = df_fe['claim_date'].dt.day
    df_fe['claim_quarter'] = df_fe['claim_date'].dt.quarter
    df_fe['claim_dayofweek'] = df_fe['claim_date'].dt.dayofweek  # Monday=0 ... Sunday=6
    df_fe['is_weekend'] = (df_fe['claim_dayofweek'] >= 5).astype(int)
    df_fe['is_monday'] = (df_fe['claim_dayofweek'] == 0).astype(int)
    df_fe['is_friday'] = (df_fe['claim_dayofweek'] == 4).astype(int)
    df_fe['is_q4'] = (df_fe['claim_quarter'] == 4).astype(int)

    # NEW: Time-of-day features from Doc 8
    # NOTE(review): if claim_date carries no time component (the sample value
    # shown by describe() is a bare date), claim_hour is 0 for every parsed
    # row, making late_night constant 1 and rush_hour constant 0 -- confirm.
    df_fe['claim_hour'] = df_fe['claim_date'].dt.hour
    df_fe['rush_hour'] = df_fe['claim_hour'].isin([7, 8, 9, 16, 17, 18]).astype(int)
    df_fe['late_night'] = df_fe['claim_hour'].isin([0, 1, 2, 3, 4, 5]).astype(int)

    season_map = {
        3: 'Spring', 4: 'Spring', 5: 'Spring',
        6: 'Summer', 7: 'Summer', 8: 'Summer',
        9: 'Fall', 10: 'Fall', 11: 'Fall',
        12: 'Winter', 1: 'Winter', 2: 'Winter'
    }
    df_fe['season'] = df_fe['claim_month'].map(season_map).fillna('Unknown')

    # ========================================================================
    # DATA CLEANING
    # ========================================================================
    # Birth years outside 1900-2025 are treated as missing.
    df_fe.loc[(df_fe['year_of_born'] < 1900) | (df_fe['year_of_born'] > 2025), 'year_of_born'] = np.nan

    # ========================================================================
    # BINARY CONVERSIONS (for interactions)
    # ========================================================================
    df_fe['witness_binary'] = (df_fe['witness_present_ind'] == 'Y').astype(int)
    # Used as-is in the products below, i.e. treated as already numeric.
    df_fe['police_binary'] = df_fe['policy_report_filed_ind']
    df_fe['multicar_binary'] = df_fe['accident_type'].isin(['multi_vehicle_clear', 'multi_vehicle_unclear']).astype(int)
    df_fe['highrisk_site_binary'] = df_fe['accident_site'].isin(['Highway/Intersection', 'Local']).astype(int)

    # ========================================================================
    # CRITICAL INTERACTION FEATURES (2-way)
    # ========================================================================
    df_fe['liab_x_witness'] = df_fe['liab_prct'] * df_fe['witness_binary']
    df_fe['liab_x_police'] = df_fe['liab_prct'] * df_fe['police_binary']
    df_fe['liab_x_multicar'] = df_fe['liab_prct'] * df_fe['multicar_binary']
    df_fe['liab_x_highrisk_site'] = df_fe['liab_prct'] * df_fe['highrisk_site_binary']
    df_fe['liab_x_evidence'] = df_fe['liab_prct'] * (df_fe['witness_binary'] + df_fe['police_binary'])
    df_fe['liab_x_payout'] = df_fe['liab_prct'] * df_fe['claim_est_payout']
    df_fe['liab_x_mileage'] = df_fe['liab_prct'] * df_fe['vehicle_mileage']

    df_fe['witness_x_police'] = df_fe['witness_binary'] * df_fe['police_binary']
    df_fe['witness_x_multicar'] = df_fe['witness_binary'] * df_fe['multicar_binary']
    df_fe['police_x_multicar'] = df_fe['police_binary'] * df_fe['multicar_binary']
    df_fe['multicar_x_highrisk'] = df_fe['multicar_binary'] * df_fe['highrisk_site_binary']
    # Reuses is_weekend (same `claim_dayofweek >= 5` test) instead of recomputing it.
    df_fe['weekend_highway'] = df_fe['is_weekend'] * (df_fe['accident_site'] == 'Highway/Intersection').astype(int)

    # 3-way interaction
    df_fe['witness_police_multicar'] = df_fe['witness_binary'] * df_fe['police_binary'] * df_fe['multicar_binary']

    # ========================================================================
    # POLYNOMIAL FEATURES (liability & key variables)
    # ========================================================================
    df_fe['liab_prct_squared'] = df_fe['liab_prct'] ** 2
    df_fe['liab_prct_cubed'] = df_fe['liab_prct'] ** 3
    df_fe['liab_prct_sqrt'] = np.sqrt(df_fe['liab_prct'])
    df_fe['liab_prct_log'] = np.log1p(df_fe['liab_prct'])
    df_fe['liab_inverse'] = 100 - df_fe['liab_prct']
    df_fe['liab_inverse_squared'] = (100 - df_fe['liab_prct']) ** 2

    # log1p keeps zero inputs finite.
    df_fe['log_claim_est_payout'] = np.log1p(df_fe['claim_est_payout'])
    df_fe['log_vehicle_mileage'] = np.log1p(df_fe['vehicle_mileage'])
    df_fe['log_vehicle_price'] = np.log1p(df_fe['vehicle_price'])
    df_fe['log_annual_income'] = np.log1p(df_fe['annual_income'])
    df_fe['sqrt_vehicle_mileage'] = np.sqrt(df_fe['vehicle_mileage'])

    # ========================================================================
    # ACCIDENT TYPE FEATURES
    # ========================================================================
    df_fe['is_multi_vehicle_clear'] = (df_fe['accident_type'] == 'multi_vehicle_clear').astype(int)
    df_fe['is_multi_vehicle_unclear'] = (df_fe['accident_type'] == 'multi_vehicle_unclear').astype(int)
    df_fe['is_single_car'] = (df_fe['accident_type'] == 'single_car').astype(int)
    df_fe['has_recovery_target'] = df_fe['multicar_binary']

    # Ordinal clarity: 3 = multi-vehicle clear, 1 = multi-vehicle unclear, 0 = otherwise.
    df_fe['recovery_case_clarity'] = 0
    df_fe.loc[df_fe['is_multi_vehicle_clear'] == 1, 'recovery_case_clarity'] = 3
    df_fe.loc[df_fe['is_multi_vehicle_unclear'] == 1, 'recovery_case_clarity'] = 1

    # ========================================================================
    # LIABILITY BUCKETS (fine-grained)
    # ========================================================================
    df_fe['liab_under_10'] = (df_fe['liab_prct'] < 10).astype(int)
    df_fe['liab_10_to_15'] = ((df_fe['liab_prct'] >= 10) & (df_fe['liab_prct'] < 15)).astype(int)
    df_fe['liab_15_to_20'] = ((df_fe['liab_prct'] >= 15) & (df_fe['liab_prct'] < 20)).astype(int)
    df_fe['liab_20_to_25'] = ((df_fe['liab_prct'] >= 20) & (df_fe['liab_prct'] < 25)).astype(int)
    df_fe['liab_25_to_30'] = ((df_fe['liab_prct'] >= 25) & (df_fe['liab_prct'] < 30)).astype(int)
    df_fe['liab_30_to_35'] = ((df_fe['liab_prct'] >= 30) & (df_fe['liab_prct'] < 35)).astype(int)
    df_fe['liab_35_to_40'] = ((df_fe['liab_prct'] >= 35) & (df_fe['liab_prct'] < 40)).astype(int)
    df_fe['liab_40_to_50'] = ((df_fe['liab_prct'] >= 40) & (df_fe['liab_prct'] < 50)).astype(int)
    df_fe['liab_over_50'] = (df_fe['liab_prct'] >= 50).astype(int)

    # Coarser, overlapping fault bands (minimal_fault and low_fault are cumulative).
    df_fe['not_at_fault'] = df_fe['liab_under_10']
    df_fe['minimal_fault'] = (df_fe['liab_prct'] < 25).astype(int)
    df_fe['low_fault'] = (df_fe['liab_prct'] < 35).astype(int)
    df_fe['shared_fault'] = ((df_fe['liab_prct'] >= 35) & (df_fe['liab_prct'] < 50)).astype(int)
    df_fe['high_fault'] = (df_fe['liab_prct'] >= 50).astype(int)

    # ========================================================================
    # EVIDENCE QUALITY FEATURES
    # ========================================================================
    df_fe['witness_present'] = df_fe['witness_binary']
    df_fe['police_report'] = df_fe['police_binary']

    df_fe['evidence_none'] = ((df_fe['witness_present'] == 0) & (df_fe['police_report'] == 0)).astype(int)
    # Exactly one of witness / police report present.
    df_fe['evidence_weak'] = (((df_fe['witness_present'] == 1) & (df_fe['police_report'] == 0)) |
                              ((df_fe['witness_present'] == 0) & (df_fe['police_report'] == 1))).astype(int)
    df_fe['evidence_strong'] = ((df_fe['witness_present'] == 1) & (df_fe['police_report'] == 1)).astype(int)
    # Subset of evidence_strong (both flags are 1 for these rows).
    df_fe['evidence_very_strong'] = ((df_fe['witness_present'] == 1) & (df_fe['police_report'] == 1) &
                                      (df_fe['liab_prct'] < 20)).astype(int)
    df_fe['evidence_score'] = df_fe['witness_present'] + df_fe['police_report']

    # ========================================================================
    # ACCIDENT SITE FEATURES
    # ========================================================================
    df_fe['high_risk_site'] = df_fe['highrisk_site_binary']
    df_fe['parking_accident'] = (df_fe['accident_site'] == 'Parking Area').astype(int)
    df_fe['unknown_site'] = (df_fe['accident_site'] == 'Unknown').astype(int)
    df_fe['highway_accident'] = (df_fe['accident_site'] == 'Highway/Intersection').astype(int)
    df_fe['local_accident'] = (df_fe['accident_site'] == 'Local').astype(int)

    # ========================================================================
    # DRIVER AGE & EXPERIENCE
    # ========================================================================
    df_fe['driver_age'] = df_fe['claim_year'] - df_fe['year_of_born']
    # Ages outside 16-100 are treated as missing.
    df_fe.loc[(df_fe['driver_age'] < 16) | (df_fe['driver_age'] > 100), 'driver_age'] = np.nan

    df_fe['young_driver'] = ((df_fe['driver_age'] >= 16) & (df_fe['driver_age'] <= 25)).astype(int)
    df_fe['prime_driver'] = ((df_fe['driver_age'] > 25) & (df_fe['driver_age'] <= 45)).astype(int)
    df_fe['middle_age_driver'] = ((df_fe['driver_age'] > 45) & (df_fe['driver_age'] <= 65)).astype(int)
    df_fe['senior_driver'] = (df_fe['driver_age'] > 65).astype(int)

    # Negative differences are floored at 0 by the clip, so no separate
    # "negative -> NaN" pass is needed (it could never match after clipping).
    df_fe['driving_experience'] = (df_fe['driver_age'] - df_fe['age_of_DL']).clip(lower=0)

    df_fe['novice_driver'] = (df_fe['driving_experience'] < 3).astype(int)
    df_fe['experienced_driver'] = ((df_fe['driving_experience'] >= 3) & (df_fe['driving_experience'] <= 10)).astype(int)
    df_fe['veteran_driver'] = (df_fe['driving_experience'] > 10).astype(int)

    df_fe['experience_x_safety'] = df_fe['driving_experience'] * df_fe['safety_rating']
    df_fe['driver_age_x_safety'] = df_fe['driver_age'] * df_fe['safety_rating']

    # NEW: Driver risk interactions from Doc 8
    df_fe['young_novice'] = df_fe['young_driver'] * df_fe['novice_driver']

    # ========================================================================
    # VEHICLE FEATURES (without vehicle_age)
    # ========================================================================
    df_fe['luxury_vehicle'] = (df_fe['vehicle_price'] > 50000).astype(int)
    df_fe['mid_price_vehicle'] = ((df_fe['vehicle_price'] >= 20000) & (df_fe['vehicle_price'] <= 50000)).astype(int)
    df_fe['economy_vehicle'] = (df_fe['vehicle_price'] < 20000).astype(int)

    df_fe['heavy_vehicle'] = (df_fe['vehicle_weight'] > 30000).astype(int)
    df_fe['light_vehicle'] = (df_fe['vehicle_weight'] < 15000).astype(int)
    df_fe['medium_weight'] = ((df_fe['vehicle_weight'] >= 15000) & (df_fe['vehicle_weight'] <= 30000)).astype(int)

    df_fe['is_large_vehicle'] = (df_fe['vehicle_category'] == 'Large').astype(int)
    df_fe['is_compact_vehicle'] = (df_fe['vehicle_category'] == 'Compact').astype(int)
    df_fe['is_medium_vehicle'] = (df_fe['vehicle_category'] == 'Medium').astype(int)

    # ========================================================================
    # CLAIM CHARACTERISTICS
    # ========================================================================
    df_fe['high_mileage'] = (df_fe['vehicle_mileage'] > 100000).astype(int)
    df_fe['low_mileage'] = (df_fe['vehicle_mileage'] < 50000).astype(int)
    df_fe['very_high_mileage'] = (df_fe['vehicle_mileage'] > 150000).astype(int)
    df_fe['medium_mileage'] = ((df_fe['vehicle_mileage'] >= 50000) & (df_fe['vehicle_mileage'] <= 100000)).astype(int)

    df_fe['frequent_claimer'] = (df_fe['past_num_of_claims'] > 5).astype(int)
    df_fe['moderate_claimer'] = ((df_fe['past_num_of_claims'] >= 1) & (df_fe['past_num_of_claims'] <= 5)).astype(int)
    df_fe['first_time_claimer'] = (df_fe['past_num_of_claims'] == 0).astype(int)
    df_fe['very_frequent_claimer'] = (df_fe['past_num_of_claims'] > 10).astype(int)

    df_fe['large_payout'] = (df_fe['claim_est_payout'] > 5000).astype(int)
    df_fe['medium_payout'] = ((df_fe['claim_est_payout'] >= 2000) & (df_fe['claim_est_payout'] <= 5000)).astype(int)
    df_fe['small_payout'] = (df_fe['claim_est_payout'] < 2000).astype(int)
    df_fe['very_large_payout'] = (df_fe['claim_est_payout'] > 8000).astype(int)

    # Despite the "_x_" name this is a ratio: safety rating per (1 + prior claims).
    df_fe['safety_x_prior_claims'] = df_fe['safety_rating'] / (1 + df_fe['past_num_of_claims'])
    df_fe['mileage_x_claims'] = df_fe['vehicle_mileage'] * df_fe['past_num_of_claims']

    # NEW: Claims risk interactions from Doc 8
    df_fe['senior_frequent_claimer'] = df_fe['senior_driver'] * df_fe['frequent_claimer']
    df_fe['low_safety_high_claims'] = ((df_fe['safety_rating'] < 60) & (df_fe['past_num_of_claims'] > 3)).astype(int)

    # ========================================================================
    # RATIO FEATURES
    # ========================================================================
    # All ratios add 1 to the denominator to avoid division by zero.
    df_fe['payout_to_price_ratio'] = df_fe['claim_est_payout'] / (df_fe['vehicle_price'] + 1)
    df_fe['severe_damage'] = (df_fe['payout_to_price_ratio'] > 0.3).astype(int)
    df_fe['moderate_damage'] = ((df_fe['payout_to_price_ratio'] >= 0.1) & (df_fe['payout_to_price_ratio'] <= 0.3)).astype(int)
    df_fe['minor_damage'] = (df_fe['payout_to_price_ratio'] < 0.1).astype(int)

    df_fe['income_to_vehicle_price'] = df_fe['annual_income'] / (df_fe['vehicle_price'] + 1)
    df_fe['can_afford_vehicle'] = (df_fe['income_to_vehicle_price'] >= 0.5).astype(int)
    df_fe['expensive_for_income'] = (df_fe['income_to_vehicle_price'] < 0.3).astype(int)

    df_fe['claims_per_year_driving'] = df_fe['past_num_of_claims'] / (df_fe['driving_experience'] + 1)
    df_fe['claim_frequency_high'] = (df_fe['claims_per_year_driving'] > 0.5).astype(int)

    df_fe['safety_to_liability'] = df_fe['safety_rating'] / (df_fe['liab_prct'] + 1)
    df_fe['payout_to_income'] = df_fe['claim_est_payout'] / (df_fe['annual_income'] + 1)
    df_fe['mileage_to_price'] = df_fe['vehicle_mileage'] / (df_fe['vehicle_price'] + 1)
    df_fe['weight_to_price'] = df_fe['vehicle_weight'] / (df_fe['vehicle_price'] + 1)

    # ========================================================================
    # POLICYHOLDER CHARACTERISTICS
    # ========================================================================
    df_fe['high_income'] = (df_fe['annual_income'] > 70000).astype(int)
    df_fe['mid_income'] = ((df_fe['annual_income'] >= 40000) & (df_fe['annual_income'] <= 70000)).astype(int)
    df_fe['low_income'] = (df_fe['annual_income'] < 40000).astype(int)
    df_fe['very_high_income'] = (df_fe['annual_income'] > 100000).astype(int)

    df_fe['high_safety_rating'] = (df_fe['safety_rating'] > 80).astype(int)
    df_fe['low_safety_rating'] = (df_fe['safety_rating'] < 60).astype(int)
    df_fe['very_high_safety'] = (df_fe['safety_rating'] > 90).astype(int)
    df_fe['medium_safety'] = ((df_fe['safety_rating'] >= 60) & (df_fe['safety_rating'] <= 80)).astype(int)

    # Straight aliases of existing indicator columns under friendlier names.
    df_fe['contact_available'] = df_fe['email_or_tel_available']
    df_fe['has_education'] = df_fe['high_education_ind']
    df_fe['recent_move'] = df_fe['address_change_ind']
    df_fe['home_owner'] = (df_fe['living_status'] == 'Own').astype(int)
    df_fe['renter'] = (df_fe['living_status'] == 'Rent').astype(int)
    df_fe['female'] = (df_fe['gender'] == 'F').astype(int)

    # ========================================================================
    # CHANNEL FEATURES
    # ========================================================================
    df_fe['via_broker'] = (df_fe['channel'] == 'Broker').astype(int)
    df_fe['via_online'] = (df_fe['channel'] == 'Online').astype(int)
    df_fe['via_phone'] = (df_fe['channel'] == 'Phone').astype(int)
    df_fe['in_network_repair'] = (df_fe['in_network_bodyshop'] == 'yes').astype(int)
    df_fe['out_network_repair'] = (df_fe['in_network_bodyshop'] == 'no').astype(int)

    # ========================================================================
    # COMPOSITE RECOVERY SCORES
    # ========================================================================
    liability_score = np.sqrt((100 - df_fe['liab_prct']) / 100.0)
    # evidence_very_strong rows also have evidence_strong == 1, so they
    # contribute 0.7 + 1.0 to this composite.
    evidence_score_composite = (df_fe['evidence_none'] * 0.0 + df_fe['evidence_weak'] * 0.4 +
                      df_fe['evidence_strong'] * 0.7 + df_fe['evidence_very_strong'] * 1.0)
    clarity_score = df_fe['recovery_case_clarity'] / 3.0
    site_score = df_fe['high_risk_site'] * 0.7 + (1 - df_fe['unknown_site']) * 0.3

    df_fe['recovery_feasibility_score'] = (0.35 * liability_score + 0.30 * df_fe['has_recovery_target'] +
                                           0.20 * evidence_score_composite + 0.10 * clarity_score + 0.05 * site_score)

    # NEW: Alternative recovery potential score from Doc 8
    df_fe['recovery_potential'] = (
        (100 - df_fe['liab_prct']) * 0.4 +
        df_fe['evidence_score'] * 20 * 0.3 +
        df_fe['multicar_binary'] * 30 * 0.2 +
        (df_fe['claim_est_payout'] / 100) * 0.1
    )

    # ========================================================================
    # DOMAIN LOGIC FLAGS (CRITICAL FOR F1)
    # ========================================================================
    df_fe['perfect_case'] = ((df_fe['liab_prct'] < 15) & (df_fe['witness_present'] == 1) &
                             (df_fe['police_report'] == 1) & (df_fe['has_recovery_target'] == 1)).astype(int)

    df_fe['strong_case'] = ((df_fe['liab_prct'] < 25) & (df_fe['evidence_strong'] == 1) &
                            (df_fe['has_recovery_target'] == 1)).astype(int)

    df_fe['good_case'] = ((df_fe['liab_prct'] < 35) & (df_fe['evidence_score'] >= 1) &
                          (df_fe['has_recovery_target'] == 1)).astype(int)

    df_fe['weak_case'] = ((df_fe['liab_prct'] > 40) | (df_fe['is_single_car'] == 1) |
                          (df_fe['evidence_none'] == 1)).astype(int)

    df_fe['no_case'] = ((df_fe['liab_prct'] > 60) | ((df_fe['is_single_car'] == 1) & (df_fe['evidence_none'] == 1))).astype(int)

    df_fe['high_value_opportunity'] = ((df_fe['claim_est_payout'] > 3000) & (df_fe['liab_prct'] < 30) &
                                       (df_fe['has_recovery_target'] == 1)).astype(int)

    df_fe['slam_dunk_case'] = ((df_fe['liab_prct'] < 10) & (df_fe['witness_present'] == 1) &
                               (df_fe['police_report'] == 1) & (df_fe['multicar_binary'] == 1) &
                               (df_fe['high_risk_site'] == 1)).astype(int)

    df_fe['low_liab_high_payout'] = ((df_fe['liab_prct'] < 20) & (df_fe['claim_est_payout'] > 5000)).astype(int)
    df_fe['clear_fault_case'] = ((df_fe['liab_prct'] < 15) & (df_fe['multicar_binary'] == 1)).astype(int)
    df_fe['high_mileage_low_fault'] = ((df_fe['vehicle_mileage'] > 100000) & (df_fe['liab_prct'] < 30)).astype(int)

    # NEW: More interaction flags from Doc 8
    df_fe['low_liab_witness_police'] = ((df_fe['liab_prct'] < 20) & (df_fe['witness_binary'] == 1) &
                                         (df_fe['police_binary'] == 1)).astype(int)
    df_fe['multicar_low_liab'] = ((df_fe['multicar_binary'] == 1) & (df_fe['liab_prct'] < 25)).astype(int)
    df_fe['high_payout_evidence'] = ((df_fe['claim_est_payout'] > 5000) & (df_fe['evidence_score'] >= 1)).astype(int)
    df_fe['severe_damage_low_fault'] = ((df_fe['payout_to_price_ratio'] > 0.3) & (df_fe['liab_prct'] < 30)).astype(int)
    df_fe['minor_damage_high_fault'] = ((df_fe['payout_to_price_ratio'] < 0.1) & (df_fe['liab_prct'] > 50)).astype(int)

    # --- Temporal & Behavior Dynamics ---
    df_fe['claim_early_in_year'] = (df_fe['claim_month'] <= 3).astype(int)
    df_fe['claim_end_of_year'] = (df_fe['claim_month'] >= 10).astype(int)
    df_fe['weekend_parking'] = df_fe['is_weekend'] * (df_fe['accident_site'] == 'Parking Area').astype(int)
    df_fe['winter_claim_high_payout'] = ((df_fe['season'] == 'Winter') & (df_fe['claim_est_payout'] > 5000)).astype(int)

    # --- Vehicle Utilization Proxies (without vehicle_age) ---
    df_fe['mileage_x_weight'] = df_fe['vehicle_mileage'] * df_fe['vehicle_weight']
    # Same formula as mileage_to_price above; kept under both names.
    df_fe['mileage_per_dollar'] = df_fe['vehicle_mileage'] / (df_fe['vehicle_price'] + 1)
    df_fe['payout_to_weight'] = df_fe['claim_est_payout'] / (df_fe['vehicle_weight'] + 1)

    # --- Policyholder Risk Profile ---
    df_fe['unstable_policyholder'] = ((df_fe['recent_move'] == 1) & (df_fe['renter'] == 1)).astype(int)
    df_fe['financial_stress_risk'] = ((df_fe['expensive_for_income'] == 1) & (df_fe['large_payout'] == 1)).astype(int)
    df_fe['young_driver_highway'] = df_fe['young_driver'] * df_fe['highway_accident']
    df_fe['senior_driver_parking'] = df_fe['senior_driver'] * df_fe['parking_accident']

    # --- Liability & Evidence Interaction Insights ---
    df_fe['low_liab_weak_evidence'] = ((df_fe['liab_prct'] < 20) & (df_fe['evidence_weak'] == 1)).astype(int)
    df_fe['high_liab_strong_evidence'] = ((df_fe['liab_prct'] > 50) & (df_fe['evidence_strong'] == 1)).astype(int)

    # Composite confidence / case quality index
    df_fe['case_confidence_score'] = (
        0.4 * (100 - df_fe['liab_prct']) / 100 +
        0.4 * df_fe['evidence_score'] / 2 +
        0.2 * df_fe['recovery_case_clarity'] / 3
    )

    # --- Statistical Normalization & Percentile Features ---
    # Mean/std come from this frame only; the epsilon guards a zero std.
    for col in ['claim_est_payout', 'vehicle_mileage', 'annual_income']:
        df_fe[f'{col}_z'] = (df_fe[col] - df_fe[col].mean()) / (df_fe[col].std() + 1e-9)

    # Decile bins (edges derived from this frame). Best-effort: if either
    # qcut call fails, both columns fall back to NaN.
    try:
        df_fe['liab_percentile'] = pd.qcut(df_fe['liab_prct'], 10, labels=False, duplicates='drop')
        df_fe['payout_percentile'] = pd.qcut(df_fe['claim_est_payout'], 10, labels=False, duplicates='drop')
    except Exception:
        df_fe['liab_percentile'] = np.nan
        df_fe['payout_percentile'] = np.nan

    # --- Aggregate / Hybrid Indices ---
    df_fe['case_strength_index'] = df_fe['evidence_score'] * (1 - df_fe['liab_prct'] / 100)
    df_fe['financial_exposure_index'] = (
        (df_fe['claim_est_payout'] / (df_fe['annual_income'] + 1)) * (1 + df_fe['liab_prct'] / 100)
    )
    df_fe['behavioral_risk_index'] = (
        df_fe['claims_per_year_driving'] * (100 - df_fe['safety_rating']) / 100
    )

    return df_fe

# Confirmation message only: the "190+" count is hard-coded in the string,
# not computed from the columns the function actually produces.
print("✓ Feature engineering function defined (190+ features)")

✓ Feature engineering function defined (190+ features)


In [None]:
# Build the engineered frames for both splits. feature_engineer() derives its
# z-scores and decile bins from whichever frame it is given, so the two calls
# use independent statistics.
train_fe, test_fe = feature_engineer(train_set), feature_engineer(test_set)

  df_fe['very_high_mileage'] = (df_fe['vehicle_mileage'] > 150000).astype(int)
  df_fe['medium_mileage'] = ((df_fe['vehicle_mileage'] >= 50000) & (df_fe['vehicle_mileage'] <= 100000)).astype(int)
  df_fe['frequent_claimer'] = (df_fe['past_num_of_claims'] > 5).astype(int)
  df_fe['moderate_claimer'] = ((df_fe['past_num_of_claims'] >= 1) & (df_fe['past_num_of_claims'] <= 5)).astype(int)
  df_fe['first_time_claimer'] = (df_fe['past_num_of_claims'] == 0).astype(int)
  df_fe['very_frequent_claimer'] = (df_fe['past_num_of_claims'] > 10).astype(int)
  df_fe['large_payout'] = (df_fe['claim_est_payout'] > 5000).astype(int)
  df_fe['medium_payout'] = ((df_fe['claim_est_payout'] >= 2000) & (df_fe['claim_est_payout'] <= 5000)).astype(int)
  df_fe['small_payout'] = (df_fe['claim_est_payout'] < 2000).astype(int)
  df_fe['very_large_payout'] = (df_fe['claim_est_payout'] > 8000).astype(int)
  df_fe['safety_x_prior_claims'] = df_fe['safety_rating'] / (1 + df_fe['past_num_of_claims'])
  df_fe['mileage_x

# Logistic Regression

## First Try

In [None]:
# Features are every column after the first; the first column is the target.
# Take an explicit copy: the next cell assigns into X_train, and writing into
# a bare slice of train_set triggers SettingWithCopyWarning and may or may not
# leak the change back into train_set.
X_train = train_set.iloc[:, 1:].copy()
y_train = train_set.iloc[:, 0]

In [None]:
# Collapse the two-level string columns to 0/1 indicators (1 = 'F' / 'Own';
# anything else, including missing values, becomes 0).
# Not idempotent: re-running this cell on already-encoded columns yields all zeros.
X_train['gender'] = (X_train['gender'] == 'F').astype(int)
X_train['living_status'] = (X_train['living_status'] == 'Own').astype(int)

In [None]:
# Partition the columns by dtype: string-like columns are one-hot encoded,
# everything else is standardised.
cat_cols = X_train.select_dtypes(include=['object', 'category']).columns
num_cols = X_train.select_dtypes(exclude=['object', 'category']).columns

# Preprocessing: categories unseen at fit time are encoded as all zeros.
preprocess = ColumnTransformer(transformers=[
    ('cat', OneHotEncoder(handle_unknown='ignore'), cat_cols),
    ('num', StandardScaler(), num_cols),
])

# Baseline classifier; class_weight='balanced' re-weights the minority class.
model = Pipeline(steps=[
    ('preprocess', preprocess),
    ('logreg', LogisticRegression(
        class_weight='balanced',
        solver='lbfgs',
        max_iter=5000,
        random_state=SEED,
    )),
])

model.fit(X_train, y_train)

In [None]:
# The model was fit with 'gender' and 'living_status' already converted to 0/1
# numeric columns, so the raw test frame (where they are still strings) cannot
# be scaled. Apply the same encoding to a copy before predicting.
X_test_first_try = test_set.copy()
X_test_first_try['gender'] = np.where(X_test_first_try['gender'] == 'F', 1, 0)
X_test_first_try['living_status'] = np.where(X_test_first_try['living_status'] == 'Own', 1, 0)

y_pred = model.predict(X_test_first_try)

In [None]:
import pickle

# Persist the fitted First Try pipeline to Drive.
filename = '/content/drive/MyDrive/Colab Notebooks/2025-travelers-umc/logit_model.pkl'
# Context manager guarantees the handle is flushed and closed even if dump() fails.
with open(filename, 'wb') as model_file:
    pickle.dump(model, model_file)

# To reload (only unpickle files you trust -- pickle can execute arbitrary code):
# with open(filename, 'rb') as model_file:
#     loaded_model = pickle.load(model_file)

## Second Try

In [None]:
# Second Try uses Carol's feature set: among the feature builders in this
# notebook section, 'year' (dropped below) is produced by carol_feature().
# Build those frames explicitly so this cell does not depend on which feature
# function last bound train_fe/test_fe (on a top-to-bottom run that is
# feature_engineer(), whose output has no 'year' column, so the drop raised).
SECOND_TRY_DROP_COLS = ['claim_number', 'vehicle_made_year', 'claim_date', 'year']

train_fe_carol = carol_feature(train_set)
test_fe_carol = carol_feature(test_set)

# First column is the target; the rest (minus identifiers / raw dates) are features.
X_train = train_fe_carol.iloc[:, 1:].drop(columns=SECOND_TRY_DROP_COLS)
y_train = train_fe_carol.iloc[:, 0]

# 70/30 hold-out split of the labelled data for model comparison.
X_train_train, X_train_test, y_train_train, y_train_test = train_test_split(X_train, y_train, test_size=0.3, random_state=SEED)

# Unlabelled test features with the same columns removed.
X_test = test_fe_carol.drop(columns=SECOND_TRY_DROP_COLS)

### Original

In [None]:
# Column groups by dtype for the engineered training split.
cat_cols = X_train_train.select_dtypes(include=['object', 'category']).columns
num_cols = X_train_train.select_dtypes(exclude=['object', 'category']).columns

# Preprocessing without imputation: one-hot for string columns, z-scaling for the rest.
preprocess = ColumnTransformer(transformers=[
    ('cat', OneHotEncoder(handle_unknown='ignore'), cat_cols),
    ('num', StandardScaler(), num_cols),
])

# Same baseline logistic regression as the First Try, now on engineered features.
model = Pipeline(steps=[
    ('preprocess', preprocess),
    ('logreg', LogisticRegression(
        class_weight='balanced',
        solver='lbfgs',
        max_iter=5000,
        random_state=SEED,
    )),
])

model.fit(X_train_train, y_train_train)

In [None]:
# Hard class predictions on the hold-out part of the labelled data.
y_pred = model.predict(X_train_test)

In [None]:
# Hold-out accuracy only; with class imbalance this says little about the
# positive class on its own.
accuracy = accuracy_score(y_train_test, y_pred)
print(f"Model Accuracy: {accuracy}")

Model Accuracy: 0.7487037037037036


### Feature Selection

In [None]:
# Column groups by dtype.
cat_cols = X_train_train.select_dtypes(include=['object', 'category']).columns
num_cols = X_train_train.select_dtypes(exclude=['object', 'category']).columns

# Preprocessing with imputation: mode + one-hot for string columns,
# median + z-scaling for the numeric ones.
preprocess = ColumnTransformer(transformers=[
    ('cat', Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore')),
    ]), cat_cols),
    ('num', Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
    ]), num_cols),
])

# L1-penalised logistic regression used purely as a feature filter.
# C=0.1: lower C means stronger regularisation, i.e. more features eliminated.
lasso_selector = SelectFromModel(
    LogisticRegression(
        penalty='l1',
        C=0.1,
        solver='saga',
        class_weight='balanced',
        max_iter=5000,
        random_state=SEED,
    )
)

# Classifier refit on the surviving features.
final_model = LogisticRegression(
    class_weight='balanced',
    solver='lbfgs',
    max_iter=5000,
    random_state=SEED,
)

pipeline_lasso = Pipeline(steps=[
    ('preprocess', preprocess),
    ('lasso_select', lasso_selector),
    ('model', final_model),
])

pipeline_lasso.fit(X_train_train, y_train_train)

In [None]:
# Names of the preprocessed columns that the L1 selector kept.
feature_names = pipeline_lasso.named_steps['preprocess'].get_feature_names_out()
feature_mask = pipeline_lasso.named_steps['lasso_select'].get_support()
selected_features = np.array(feature_names)[feature_mask]

print(selected_features)

['cat__gender_F' 'cat__gender_M' 'cat__living_status_Own'
 'cat__living_status_Rent' 'cat__claim_day_of_week_Friday'
 'cat__claim_day_of_week_Thursday'
 'cat__accident_site_Highway/Intersection' 'cat__accident_site_Local'
 'cat__accident_site_Parking Area' 'cat__accident_site_Unknown'
 'cat__witness_present_ind_N' 'cat__witness_present_ind_Y'
 'cat__channel_Online' 'cat__channel_Phone'
 'cat__vehicle_category_Compact' 'cat__vehicle_category_Large'
 'cat__vehicle_color_black' 'cat__vehicle_color_blue'
 'cat__vehicle_color_gray' 'cat__vehicle_color_other'
 'cat__vehicle_color_silver' 'cat__vehicle_color_white'
 'cat__accident_type_multi_vehicle_unclear'
 'cat__accident_type_single_car' 'cat__in_network_bodyshop_no'
 'cat__in_network_bodyshop_yes' 'cat__season_Winter'
 'num__email_or_tel_available' 'num__safety_rating'
 'num__high_education_ind' 'num__address_change_ind' 'num__zip_code'
 'num__past_num_of_claims' 'num__liab_prct' 'num__policy_report_filed_ind'
 'num__vehicle_price' 'num__

In [None]:
# Score the L1-selected pipeline on the hold-out split, then report.
y_pred = pipeline_lasso.predict(X_train_test)

accuracy = accuracy_score(y_train_test, y_pred)
precision = precision_score(y_train_test, y_pred)
recall = recall_score(y_train_test, y_pred)
f1 = f1_score(y_train_test, y_pred)

print(f"Model Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print("F1 Score:", f1)

Model Accuracy: 0.7485185185185185
Precision: 0.4712368168744008
Recall: 0.7940226171243941
F1 Score: 0.5914560770156438


In [None]:
# Class predictions for the unlabelled test features, using the pipeline's
# default decision rule (no custom threshold).
y_pred = pipeline_lasso.predict(X_test)

### Select Threshold

In [None]:
# Positive-class probabilities on the hold-out split.
y_proba = pipeline_lasso.predict_proba(X_train_test)[:, 1]

# Grid-search the decision threshold that maximises hold-out F1.
# np.arange accumulates floating-point error (e.g. 0.5700000000000003), so the
# grid is rounded to two decimals before use.
# NOTE: the threshold is tuned on the same split it is later scored on, so the
# reported F1 is optimistic.
best_threshold = 0.5
best_f1 = 0

for threshold in np.round(np.arange(0.3, 0.7, 0.01), 2):
    y_pred_threshold = (y_proba >= threshold).astype(int)
    threshold_f1 = f1_score(y_train_test, y_pred_threshold)
    # Strict '>' keeps the lowest threshold among ties.
    if threshold_f1 > best_f1:
        best_f1 = threshold_f1
        best_threshold = float(threshold)

print(f"Best threshold: {best_threshold}, F1: {best_f1}")

# Use this threshold for predictions
y_pred_optimized = (y_proba >= best_threshold).astype(int)

Best threshold: 0.5700000000000003, F1: 0.5995989304812834


In [None]:
# Hold-out accuracy of the threshold-adjusted predictions.
accuracy = accuracy_score(y_train_test, y_pred_optimized)
print(f"Accuracy: {accuracy:.4f}")

Accuracy: 0.7781


In [None]:
# Features for the submission. The fitted pipeline expects the engineered
# columns, so reuse X_test (engineered test frame with the identifier / date
# columns already dropped) rather than the raw test_set, which is not expected
# to contain the engineered 'year' column.
# NOTE: this holds test FEATURES, not labels; the `y_train` name is kept only
# because the following cells read it.
y_train = X_test.copy()

In [None]:
# Score the engineered test features (X_test) and apply the tuned threshold.
# Previously this read the features through a variable named `y_train`.
y_proba_test = pipeline_lasso.predict_proba(X_test)[:, 1]
y_pred_test = (y_proba_test >= best_threshold).astype(int)

In [None]:
# Two-column submission: claim id and predicted label.
df = pd.DataFrame({"claim_number": list(test_set['claim_number']), "subrogation": list(y_pred_test)})
# index=False keeps the file to exactly these two columns; without it pandas
# writes the row index as an extra unnamed first column.
df.to_csv('/content/drive/MyDrive/Colab Notebooks/2025-travelers-umc/output4.csv', index=False)

### Handle imbalanced data better

In [None]:
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline

# Column groups by dtype.
cat_cols = X_train_train.select_dtypes(include=['object', 'category']).columns
num_cols = X_train_train.select_dtypes(exclude=['object', 'category']).columns

# Same imputing preprocessor as the feature-selection pipeline.
preprocess = ColumnTransformer(transformers=[
    ('cat', Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore')),
    ]), cat_cols),
    ('num', Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
    ]), num_cols),
])

# L1 feature filter; C=0.1 (lower C = stronger regularisation, more features eliminated).
lasso_selector = SelectFromModel(
    LogisticRegression(
        penalty='l1',
        C=0.1,
        solver='saga',
        class_weight='balanced',
        max_iter=5000,
        random_state=SEED,
    )
)

final_model = LogisticRegression(
    class_weight='balanced',
    solver='lbfgs',
    max_iter=5000,
    random_state=SEED,
)

# SMOTE sits after preprocessing so it oversamples the encoded, scaled matrix.
# The imblearn pipeline is required for a resampling step.
pipeline_smote = ImbPipeline(steps=[
    ('preprocess', preprocess),
    ('smote', SMOTE(random_state=SEED)),
    ('lasso_select', lasso_selector),
    ('model', final_model),
])

pipeline_smote.fit(X_train_train, y_train_train)

In [None]:
# Hold-out evaluation of the SMOTE pipeline.
y_pred = pipeline_smote.predict(X_train_test)

accuracy = accuracy_score(y_train_test, y_pred)
f1 = f1_score(y_train_test, y_pred)

print(f"Model Accuracy: {accuracy}")
print("F1 Score:", f1)

Model Accuracy: 0.7553703703703704
F1 Score: 0.591401175378905


In [None]:
# Test-set predictions for this section's model. The original line used
# pipeline_lasso and read the features through a variable named `y_train`;
# both look like copy-paste slips, so predict with the SMOTE pipeline on X_test.
y_pred = pipeline_smote.predict(X_test)

### Ensemble

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, f1_score

# Depth- and split-limited forest with balanced class weights.
rf_model = RandomForestClassifier(
    n_estimators=200,
    max_depth=10,
    min_samples_split=20,
    class_weight='balanced',
    n_jobs=-1,
    random_state=SEED,
)

# Reuses the `preprocess` transformer defined in an earlier cell.
pipeline_rf = Pipeline(steps=[
    ('preprocess', preprocess),
    ('model', rf_model),
])
pipeline_rf.fit(X_train_train, y_train_train)

# Hold-out evaluation.
y_pred_rf = pipeline_rf.predict(X_train_test)
rf_acc = accuracy_score(y_train_test, y_pred_rf)
rf_f1 = f1_score(y_train_test, y_pred_rf)

print(f"Random Forest Accuracy: {rf_acc:.4f}")
print(f"Random Forest F1 Score: {rf_f1:.4f}")

Random Forest Accuracy: 0.7767
Random Forest F1 Score: 0.5904


In [None]:
from xgboost import XGBClassifier

# Gradient-boosted trees. scale_pos_weight is the ratio of label-0 to label-1
# counts in the training split, so the rarer positive class is up-weighted.
xgb_model = XGBClassifier(
    random_state=SEED,
    n_jobs=-1,
    eval_metric='logloss',
    scale_pos_weight=(y_train_train.value_counts()[0] / y_train_train.value_counts()[1]),
    n_estimators=300,
    learning_rate=0.05,
    max_depth=5,
    subsample=0.8,
    colsample_bytree=0.8,
)

# Shared preprocessing followed by the boosted model.
pipeline_xgb = Pipeline(
    steps=[
        ('preprocess', preprocess),
        ('model', xgb_model),
    ]
)
pipeline_xgb.fit(X_train_train, y_train_train)

# Evaluate on the held-out split.
y_pred_xgb = pipeline_xgb.predict(X_train_test)
xgb_acc, xgb_f1 = (
    metric(y_train_test, y_pred_xgb) for metric in (accuracy_score, f1_score)
)

print(f"XGBoost Accuracy: {xgb_acc:.4f}")
print(f"XGBoost F1 Score: {xgb_f1:.4f}")

XGBoost Accuracy: 0.7659
XGBoost F1 Score: 0.5923


In [None]:
from sklearn.ensemble import VotingClassifier

from sklearn.metrics import accuracy_score, f1_score

# Soft-voting ensemble over the three pipelines: predicted probabilities are
# averaged, with the XGBoost member counted twice via `weights`.
ensemble_soft = VotingClassifier(
    voting='soft',     # average probabilities
    weights=[1, 1, 2], # optional: give XGBoost slightly more weight
    n_jobs=-1,
    estimators=[
        ('lasso', pipeline_lasso),
        ('rf', pipeline_rf),
        ('xgb', pipeline_xgb),
    ],
)
ensemble_soft.fit(X_train_train, y_train_train)

# Evaluate on the held-out split.
y_pred_ens = ensemble_soft.predict(X_train_test)
acc_ens, f1_ens = (
    metric(y_train_test, y_pred_ens) for metric in (accuracy_score, f1_score)
)

print(f"Ensemble Accuracy: {acc_ens:.4f}")
print(f"Ensemble F1: {f1_ens:.4f}")

Ensemble Accuracy: 0.7691
Ensemble F1: 0.6015


In [None]:
# Predict labels for the test features with the fitted soft-voting ensemble.
# Note: this rebinds `y_pred`, which earlier cells used for hold-out predictions.
y_pred = ensemble_soft.predict(X_test)

## Third Try

In [None]:
# Work on a copy of the engineered training frame.
train_set_filtered = train_fe.copy()

# Column 0 is taken as the target and every later column as a feature
# (assumes `train_fe` keeps the target first - TODO confirm).
y_train = train_set_filtered.iloc[:, 0]
X_train = train_set_filtered.iloc[:, 1:].drop(
    columns=['claim_number', 'vehicle_made_year']
)

# 70/30 split of the labeled data, seeded for repeatability.
X_train_train, X_train_test, y_train_train, y_train_test = train_test_split(
    X_train,
    y_train,
    test_size=0.3,
    random_state=SEED,
)

In [None]:
# Test features: same two columns removed as for the training features.
X_test = test_fe.copy().drop(columns=['claim_number', 'vehicle_made_year'])

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin

class FeatureEngineer(BaseEstimator, TransformerMixin):
    """Pipeline step that delegates to the notebook's ``feature_engineer`` function.

    The step stores no parameters and learns nothing in ``fit``; all work
    happens in ``transform``.
    """

    def __init__(self):
        """No hyper-parameters to store."""

    def fit(self, X, y=None):
        """Return ``self`` unchanged; ``X`` and ``y`` are accepted but not used."""
        return self

    def transform(self, X):
        """Return the result of ``feature_engineer(X)``."""
        engineered = feature_engineer(X)
        return engineered

# Fresh (unfitted) L1 feature selector for this experiment: an L1-penalised
# logistic regression wrapped in SelectFromModel.
lasso_selector = SelectFromModel(
    estimator=LogisticRegression(
        C=0.1,  # strength of regularization (lower = stronger feature elimination)
        penalty='l1',
        solver='saga',
        class_weight='balanced',
        max_iter=5000,
        random_state=SEED,
    )
)

import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer

def build_preprocessor(df_sample):
    """Build an unfitted ColumnTransformer from the column dtypes of ``df_sample``.

    Numeric columns (``np.number``) are median-imputed and standardised;
    ``object``/``category`` columns are imputed with the most frequent value
    and one-hot encoded (unknown categories are ignored at transform time).
    Only these two dtype groups are listed in the transformer.

    Parameters
    ----------
    df_sample : DataFrame whose dtypes decide which columns go to which branch.

    Returns
    -------
    ColumnTransformer with a ``"num"`` and a ``"cat"`` branch.
    """
    num_names = list(df_sample.select_dtypes(include=[np.number]).columns)
    cat_names = list(df_sample.select_dtypes(include=["object", "category"]).columns)

    numeric_branch = Pipeline(steps=[
        ("impute", SimpleImputer(strategy="median")),
        ("scale", StandardScaler()),
    ])
    categorical_branch = Pipeline(steps=[
        ("impute", SimpleImputer(strategy="most_frequent")),
        ("onehot", OneHotEncoder(handle_unknown="ignore")),
    ])

    return ColumnTransformer([
        ("num", numeric_branch, num_names),
        ("cat", categorical_branch, cat_names),
    ])

In [None]:
from sklearn.pipeline import Pipeline

# Final classifier, trained on the features kept by the L1 selector.
final_model = LogisticRegression(
    solver="lbfgs",
    max_iter=5000,
    class_weight="balanced",
    random_state=SEED  # use the notebook-wide seed instead of a separate literal 123
)

# Step order: feature engineering -> dtype-based preprocessing ->
# L1 feature selection -> logistic regression.
# build_preprocessor only reads column dtypes, so no defensive copy is needed.
# NOTE(review): the column lists come from X_train_train *before* the
# "feat_eng" step runs; any column that feature_engineer adds at that step is
# not listed and would be dropped by ColumnTransformer's default remainder -
# confirm this is intended.
pipeline_lasso = Pipeline([
    ("feat_eng", FeatureEngineer()),
    ("preprocess", build_preprocessor(X_train_train)),
    ("lasso_select", lasso_selector),
    ("model", final_model)
])

pipeline_lasso.fit(X_train_train, y_train_train)

In [None]:
# Hold-out predictions from the feature-engineered lasso pipeline.
y_pred = pipeline_lasso.predict(X_train_test)

# Compute all four metrics first, then report them in the usual order.
accuracy = accuracy_score(y_true=y_train_test, y_pred=y_pred)
precision = precision_score(y_true=y_train_test, y_pred=y_pred)
recall = recall_score(y_true=y_train_test, y_pred=y_pred)
f1 = f1_score(y_true=y_train_test, y_pred=y_pred)

print(f"Model Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print("F1 Score:", f1)

Model Accuracy: 0.7485185185185185
Precision: 0.4712368168744008
Recall: 0.7940226171243941
F1 Score: 0.5914560770156438


In [None]:
# Predict labels for the test features with the fitted lasso pipeline.
# Stored under its own name, so the hold-out predictions in `y_pred` are kept.
y_test_pred = pipeline_lasso.predict(X_test)

## Export prediction CSV and fitted models

In [None]:
# Build the submission from the TEST-set predictions (`y_test_pred`).
# `y_pred` holds predictions for the hold-out split X_train_test, which has a
# different number of rows than the test file and refers to different claims.
# Assumes X_test rows are in the same order as `test_set` - TODO confirm.
assert len(y_test_pred) == len(test_set), (
    "prediction count does not match the number of test claims"
)

df = pd.DataFrame({
    "claim_number": test_set['claim_number'].to_numpy(),
    "subrogation": y_test_pred,
})
df.to_csv('/content/drive/MyDrive/Colab Notebooks/2025-travelers-umc/lasso_logreg_v2_features.csv', index=False)

In [None]:
import pickle

# Persist the fitted lasso pipeline. The `with` block closes (and flushes) the
# file handle deterministically instead of leaving it to garbage collection.
filename = '/content/drive/MyDrive/Colab Notebooks/2025-travelers-umc/lasso_logreg.pkl'
with open(filename, 'wb') as model_file:
    pickle.dump(pipeline_lasso, model_file)

In [None]:
# Persist the remaining fitted models. Each file is opened in a `with` block
# so its handle is closed (and flushed) as soon as the dump finishes.
# `filename` is the loop variable, so after the loop it still holds the
# ensemble path, exactly as before.
for filename, fitted_model in [
    ('/content/drive/MyDrive/Colab Notebooks/2025-travelers-umc/rf.pkl', pipeline_rf),
    ('/content/drive/MyDrive/Colab Notebooks/2025-travelers-umc/xgb.pkl', pipeline_xgb),
    ('/content/drive/MyDrive/Colab Notebooks/2025-travelers-umc/ensemble.pkl', ensemble_soft),
]:
    with open(filename, 'wb') as model_file:
        pickle.dump(fitted_model, model_file)

# loaded_model = pickle.load(open(filename, 'rb'))

# Extra: Compare Two Prediction Files

In [None]:
import pandas as pd

# Load two previously exported prediction files.
output1, output2 = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/content/drive/MyDrive/Colab Notebooks/2025-travelers-umc/output.csv',
        '/content/drive/MyDrive/Colab Notebooks/2025-travelers-umc/output2.csv',
    )
)

# Row-by-row flag: True where both files hold the same `subrogation` value.
output2['are_equal'] = output2['subrogation'] == output1['subrogation']

np.int64(11713)

In [None]:
# Number of rows where the two prediction files disagree.
(~output2['are_equal']).sum()

np.int64(287)