# In [None]:
import xgboost as xgb
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder

# Read the training and testing CSV files from the local data directory.
train = pd.read_csv("./data/Training_TriGuard.csv")
test = pd.read_csv("./data/Testing_TriGuard.csv")

# Split the "subrogation" target column away from the training features.
y_train = train["subrogation"].copy()
X_train = train.drop(columns=["subrogation"]).copy()
X_test = test.copy()

# NOTE: Preprocessor is defined further down in this file, so it must already
# exist in the session by the time these statements run.
pre = Preprocessor()
pre.fit(X_train.copy())

# Run both feature frames through the fitted preprocessor (train first).
X_train_proc, X_test_proc = (
    pre.transform(frame.copy()) for frame in (X_train, X_test)
)

from xgboost import XGBClassifier

class Preprocessor:
    """Impute missing values, derive subrogation features and label-encode text columns.

    Usage follows the fit/transform pattern: ``fit`` learns per-column
    statistics and label encoders from one frame, and ``transform`` applies
    them to a copy of any frame with the same raw columns.
    """

    def __init__(self):
        # Column name -> LabelEncoder fitted on that object-dtype column.
        self.label_encoders = {}
        # Column name -> dict of fitted statistics: 'median' for numeric
        # columns, 'mode' for object columns, plus 'q25'/'q75' for
        # 'annual_income'.
        self.feature_stats = {}
        # Identifier columns: never imputed and no median is stored for them.
        self.id_column = ['claim_number']

    def fit(self, df):
        """Learn imputation statistics and label encoders from ``df``.

        Args:
            df: Raw feature frame. It is only read, never modified.

        Returns:
            ``self``, so calls can be chained.
        """
        # Numeric columns: remember the median for imputation.
        numeric_cols = df.select_dtypes(include=[np.number]).columns
        for col in numeric_cols:
            if col in self.id_column:
                continue
            self.feature_stats[col] = {'median': df[col].median()}

        # Income bucket thresholds are learned here so that every frame passed
        # to transform() is bucketed with the same cut-offs. They are computed
        # on the median-filled column because transform() imputes before it
        # buckets.
        income_stats = self.feature_stats.get('annual_income')
        if income_stats is not None:
            income = df['annual_income'].fillna(income_stats['median'])
            income_stats['q25'] = income.quantile(0.25)
            income_stats['q75'] = income.quantile(0.75)

        # Object columns: remember the mode and fit a label encoder.
        categorical_cols = df.select_dtypes(include=['object']).columns
        for col in categorical_cols:
            modes = df[col].mode()
            # mode() is empty for an all-NaN column; store None so that
            # imputation is skipped instead of failing here.
            self.feature_stats[col] = {
                'mode': modes.iloc[0] if not modes.empty else None
            }
            le = LabelEncoder()
            le.fit(df[col].astype(str))
            self.label_encoders[col] = le
        return self

    def _fill_missing_values(self, df):
        """Fill NaNs in ``df`` with the fitted median (numeric) or mode (other).

        Identifier columns and columns without a usable fitted statistic are
        left untouched. ``df`` is modified and returned.
        """
        for col in df.columns:
            if col in self.id_column:
                continue

            stat_name = 'median' if pd.api.types.is_numeric_dtype(df[col]) else 'mode'
            fill_value = self.feature_stats.get(col, {}).get(stat_name)
            # fillna(None) raises, and filling with NaN is a no-op.
            if fill_value is None or pd.isna(fill_value):
                continue

            # Assign the result back: an in-place fillna on the selected
            # column is not guaranteed to reach the parent frame.
            df[col] = df[col].fillna(fill_value)
        return df

    def _create_subrogation_features(self, df):
        """Add the engineered subrogation features to ``df`` and return it.

        Columns read unconditionally: ``liab_prct``, ``witness_present_ind``,
        ``policy_report_filed_ind``, ``high_education_ind``,
        ``address_change_ind``, ``living_status`` and
        ``email_or_tel_available``. All other feature groups are added only
        when their source columns are present. Existing raw columns are never
        overwritten.
        """
        # Subrogation only happens when our side bears no or very little liability.
        df['not_at_fault'] = (df['liab_prct'] < 10).astype(int)
        df['minimal_fault'] = ((df['liab_prct'] >= 10) & (df['liab_prct'] <= 20)).astype(int)
        df['shared_fault'] = (df['liab_prct'] > 20).astype(int)

        if 'accident_type' in df.columns:
            df['is_multi_vehicle_clear'] = (df['accident_type'] == 'multi_vehicle_clear').astype(int)
            df['is_multi_vehicle_unclear'] = (df['accident_type'] == 'multi_vehicle_unclear').astype(int)
            df['is_single_car'] = (df['accident_type'] == 'single_car').astype(int)
        else:
            # The flags are read unconditionally below; default any missing
            # one to 0 rather than failing with a KeyError.
            for flag in ('is_multi_vehicle_clear', 'is_multi_vehicle_unclear', 'is_single_car'):
                if flag not in df.columns:
                    df[flag] = 0

        df['has_recovery_target'] = (
            df['is_multi_vehicle_clear'] |
            df['is_multi_vehicle_unclear']
        ).astype(int)

        df['recovery_case_clarity'] = 0
        df.loc[df['is_multi_vehicle_clear'] == 1, 'recovery_case_clarity'] = 3  # highly clear
        df.loc[df['is_multi_vehicle_unclear'] == 1, 'recovery_case_clarity'] = 1  # unclear
        df.loc[df['is_single_car'] == 1, 'recovery_case_clarity'] = 0  # no recovery target

        df['witness_present'] = (df['witness_present_ind'] == 'Y').astype(int)

        has_witness = df['witness_present'] == 1
        has_report = df['policy_report_filed_ind'] == 1
        no_witness = df['witness_present'] == 0
        no_report = df['policy_report_filed_ind'] == 0

        df['evidence_none'] = (no_witness & no_report).astype(int)
        df['evidence_weak'] = ((has_witness & no_report) | (no_witness & has_report)).astype(int)
        df['evidence_strong'] = (has_witness & has_report).astype(int)
        df['evidence_very_strong'] = (has_witness & has_report & (df['liab_prct'] < 20)).astype(int)

        if 'claim_day_of_week' in df.columns:
            df['is_weekend'] = df['claim_day_of_week'].isin(['Saturday', 'Sunday']).astype(int)
            df['is_monday'] = (df['claim_day_of_week'] == 'Monday').astype(int)
            df['is_friday'] = (df['claim_day_of_week'] == 'Friday').astype(int)

        # Parse the claim date into a local Series. The raw column is left
        # untouched so that the label encoder fitted on it still sees the
        # same values at transform time.
        claim_dates = None
        if 'claim_date' in df.columns:
            claim_dates = pd.to_datetime(df['claim_date'], errors='coerce')

        if 'year_of_born' in df.columns and claim_dates is not None:
            df['driver_age'] = claim_dates.dt.year - df['year_of_born']
            df['young_driver_18_25'] = ((df['driver_age'] >= 18) & (df['driver_age'] <= 25)).astype(int)
            df['adult_driver_26_45'] = ((df['driver_age'] >= 26) & (df['driver_age'] <= 45)).astype(int)
            df['middle_age_driver_46_65'] = ((df['driver_age'] >= 46) & (df['driver_age'] <= 65)).astype(int)
            df['senior_driver_65plus'] = (df['driver_age'] > 65).astype(int)

        if 'age_of_DL' in df.columns and 'driver_age' in df.columns:
            df['driving_experience'] = df['driver_age'] - df['age_of_DL']
            df['novice_driver'] = (df['driving_experience'] < 2).astype(int)
            df['experienced_2_5y'] = ((df['driving_experience'] >= 2) & (df['driving_experience'] <= 5)).astype(int)
            df['experienced_5_10y'] = ((df['driving_experience'] > 5) & (df['driving_experience'] <= 10)).astype(int)
            df['veteran_driver'] = (df['driving_experience'] > 10).astype(int)

        if 'safety_rating' in df.columns:
            df['safety_very_low'] = (df['safety_rating'] < 30).astype(int)
            df['safety_low'] = ((df['safety_rating'] >= 30) & (df['safety_rating'] < 50)).astype(int)
            df['safety_medium'] = ((df['safety_rating'] >= 50) & (df['safety_rating'] < 70)).astype(int)
            df['safety_high'] = ((df['safety_rating'] >= 70) & (df['safety_rating'] < 90)).astype(int)
            df['safety_very_high'] = (df['safety_rating'] >= 90).astype(int)

        if 'past_num_of_claims' in df.columns:
            df['first_time_claimer'] = (df['past_num_of_claims'] == 0).astype(int)
            df['occasional_claimer'] = ((df['past_num_of_claims'] >= 1) & (df['past_num_of_claims'] <= 3)).astype(int)
            df['frequent_claimer'] = ((df['past_num_of_claims'] > 3) & (df['past_num_of_claims'] <= 7)).astype(int)
            df['heavy_claimer'] = (df['past_num_of_claims'] > 7).astype(int)

        if 'vehicle_made_year' in df.columns and claim_dates is not None:
            df['claim_year'] = claim_dates.dt.year
            df['vehicle_age'] = df['claim_year'] - df['vehicle_made_year']
            df['brand_new_vehicle'] = (df['vehicle_age'] <= 1).astype(int)
            df['new_vehicle_2_3y'] = ((df['vehicle_age'] >= 2) & (df['vehicle_age'] <= 3)).astype(int)
            df['mid_age_vehicle_4_7y'] = ((df['vehicle_age'] >= 4) & (df['vehicle_age'] <= 7)).astype(int)
            df['old_vehicle_8_12y'] = ((df['vehicle_age'] >= 8) & (df['vehicle_age'] <= 12)).astype(int)
            df['very_old_vehicle'] = (df['vehicle_age'] > 12).astype(int)

        if 'vehicle_category' in df.columns:
            df['is_compact_vehicle'] = (df['vehicle_category'] == 'Compact').astype(int)
            df['is_medium_vehicle'] = (df['vehicle_category'] == 'Medium').astype(int)
            df['is_large_vehicle'] = (df['vehicle_category'] == 'Large').astype(int)

        if 'vehicle_mileage' in df.columns:
            df['very_low_mileage'] = (df['vehicle_mileage'] < 10000).astype(int)
            df['low_mileage'] = ((df['vehicle_mileage'] >= 10000) & (df['vehicle_mileage'] < 50000)).astype(int)
            df['medium_mileage'] = ((df['vehicle_mileage'] >= 50000) & (df['vehicle_mileage'] < 100000)).astype(int)
            df['high_mileage'] = ((df['vehicle_mileage'] >= 100000) & (df['vehicle_mileage'] < 150000)).astype(int)
            df['very_high_mileage'] = (df['vehicle_mileage'] >= 150000).astype(int)

        if 'channel' in df.columns:
            df['via_broker'] = (df['channel'] == 'Broker').astype(int)
            df['via_online'] = (df['channel'] == 'Online').astype(int)
            df['via_phone'] = (df['channel'] == 'Phone').astype(int)
            df['channel_good_documentation'] = df['channel'].isin(['Broker', 'Online']).astype(int)

        if 'annual_income' in df.columns:
            # Prefer the thresholds learned in fit() so that train and test
            # frames are bucketed identically. Fall back to this frame's own
            # quartiles when no fitted thresholds are available.
            income_stats = self.feature_stats.get('annual_income', {})
            income_25 = income_stats.get('q25')
            income_75 = income_stats.get('q75')
            if income_25 is None or income_75 is None:
                income_25 = df['annual_income'].quantile(0.25)
                income_75 = df['annual_income'].quantile(0.75)
            df['low_income'] = (df['annual_income'] <= income_25).astype(int)
            df['middle_income'] = ((df['annual_income'] > income_25) & (df['annual_income'] <= income_75)).astype(int)
            df['high_income'] = (df['annual_income'] > income_75).astype(int)

        df['has_high_education'] = df['high_education_ind']
        df['recent_address_change'] = df['address_change_ind']
        df['home_owner'] = (df['living_status'] == 'Own').astype(int)
        df['renter'] = (df['living_status'] == 'Rent').astype(int)

        df['contact_info_available'] = df['email_or_tel_available']

        if 'in_network_bodyshop' in df.columns:
            df['in_network_repair'] = (df['in_network_bodyshop'] == 'yes').astype(int)
            df['out_of_network_repair'] = (df['in_network_bodyshop'] == 'no').astype(int)

        # 1. Liability factor (highest weight): liability is capped at 100.
        liability_score = np.sqrt((100 - np.minimum(df['liab_prct'], 100)) / 100)

        # 3. Evidence strength factor.
        # NOTE(review): evidence_strong and evidence_very_strong are both 1
        # when liab_prct < 20, so this factor can reach 1.8 -- confirm that
        # the overlap is intended.
        evidence_score = (
            df['evidence_none'] * 0.0 +
            df['evidence_weak'] * 0.5 +
            df['evidence_strong'] * 0.8 +
            df['evidence_very_strong'] * 1.0
        )

        # 4. Case clarity factor, scaled from 0..3 to 0..1.
        clarity_score = df['recovery_case_clarity'] / 3.0

        # 5. Information completeness factor.
        info_score = (
            df.get('channel_good_documentation', 0) * 0.7 +
            df.get('contact_info_available', 0) * 0.3
        )

        # (factor, weight) pairs; 2. is the accident type factor.
        weighted_factors = [
            (liability_score, 0.35),
            (df['has_recovery_target'], 0.25),
            (evidence_score, 0.20),
            (clarity_score, 0.15),
            (info_score, 0.05),
        ]
        total_weight = sum(weight for _, weight in weighted_factors)
        df['recovery_feasibility_score'] = sum(
            factor * (weight / total_weight) for factor, weight in weighted_factors
        )

        # Subrogation eligibility flag.
        df['high_subrogation_potential'] = (
            (df['liab_prct'] < 20) &
            (df['has_recovery_target'] == 1) &
            (df['evidence_strong'] == 1) &
            (df['recovery_feasibility_score'] > 0.7)
        ).astype(int)

        # Cases in which subrogation is very unlikely.
        df['likely_no_subrogation'] = (
            (df['liab_prct'] > 50) |  # our side carries most of the liability
            (df['is_single_car'] == 1) |  # single-car accident with no special cause
            (df['evidence_none'] == 1)
        ).astype(int)

        df['potential_subrogation_case'] = (df['high_subrogation_potential'] == 1).astype(int)

        return df

    def transform(self, df):
        """Return a processed copy of ``df``; the input frame is not modified.

        Steps: impute missing values, add the engineered features, then
        replace every column that has a fitted label encoder with its integer
        codes. Labels that were not seen during ``fit`` are encoded as -1.
        """
        df_processed = self._fill_missing_values(df.copy())
        df_processed = self._create_subrogation_features(df_processed)

        for col, le in self.label_encoders.items():
            if col not in df_processed.columns:
                continue
            # A label's code is its position in classes_, which is what
            # LabelEncoder.transform returns. Mapping through a dict gives the
            # same codes without raising on unseen labels.
            codes = {str(label): code for code, label in enumerate(le.classes_)}
            encoded = df_processed[col].astype(str).map(codes)
            df_processed[col] = encoded.fillna(-1).astype('int64')
        return df_processed

    def fit_transform(self, df):
        """Fit on ``df`` and return its transformed copy."""
        return self.fit(df).transform(df)