# Imports

In [4]:
import os
import sys

# Put the parent of the current working directory at the front of the module
# search path so that the `src` package can be imported from this notebook.
project_root = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
sys.path.insert(0, project_root)

In [5]:
import os
import joblib
import pandas as pd
import numpy as np

# Feature Engineering
import category_encoders as ce
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import (
    MinMaxScaler,
    OneHotEncoder,
    OrdinalEncoder,
    RobustScaler,
    StandardScaler,
    TargetEncoder
)

# Machine Learning
from sklearn import model_selection as ms

# Pipeline
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer

# Machine Learning - Algorithms
from xgboost import XGBClassifier

from src.sanitizer import sanitizer

# Load Train Data

In [6]:
# Read the raw training set; the path is relative to the notebook's working directory.
df_train = pd.read_csv(filepath_or_buffer='../data/train.csv')

# Data Sanitization

In [7]:
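# NOTE: disabled reference copy. The active `sanitizer` is imported from
# `src.sanitizer` in the imports cell; presumably it mirrors the code below.
#def sanitizer(df):
#    df = df.copy()
#    df.columns = [c.lower() for c in df.columns]
#    for col in ['policy_sales_channel', 'region_code']:
#        if col in df.columns:
#            df[col] = df[col].astype('Int64')
#    return df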

In [85]:
# Disabled: sanitizing runs as the first step of the pipeline built later in this notebook.
#df_train = sanitizer(df_train)

# Preprocessing Pipeline

In [86]:
# Wrap the imported `sanitizer` function so it can be used as a pipeline step.
# The name `sanitizer` is rebound from the plain function to the transformer,
# so the wrap is guarded: re-running this cell without re-importing would
# otherwise produce FunctionTransformer(FunctionTransformer(...)), whose inner
# object is not callable and makes `pipe.fit` fail with a TypeError.
if not isinstance(sanitizer, FunctionTransformer):
    sanitizer = FunctionTransformer(sanitizer)

# ------------------------- #
# --- Column Groups ------- #
# ------------------------- #
# Each list names the columns routed to one preprocessing branch.

# Numerical columns, grouped by the scaling applied to them.
num_std = ['vintage']                  # scaled with StandardScaler
num_minmax = ['age']                   # scaled with MinMaxScaler
num_log_robust = ['annual_premium']    # log1p, then RobustScaler

# Categorical columns, grouped by the encoder applied to them.
cat_vehicle_age = ['vehicle_age']      # ordered categories -> OrdinalEncoder
cat_damage = ['vehicle_damage']        # ordered categories -> OrdinalEncoder
cat_te = [                             # TargetEncoder
    'region_code',
    'policy_sales_channel',
]


# ------------------------- #
# --- Transformers -------- #
# ------------------------- #
# Explicit category orders: the position of a label in its list is the value
# the ordinal encoder assigns to it.
damage_levels = ['No', 'Yes']
vehicle_age_levels = ['< 1 Year', '1-2 Year', '> 2 Years']

veh_age_ord = OrdinalEncoder(categories=[vehicle_age_levels])
damage_ord = OrdinalEncoder(categories=[damage_levels])

# Target encoding with a fixed smoothing strength and a fixed seed.
te = TargetEncoder(random_state=42, smooth=20.0)

# Element-wise log(1 + x); validate=False skips input validation.
log1p = FunctionTransformer(func=np.log1p, validate=False)


# ------------------------- #
# --- ColumnTransformer --- #
# ------------------------- #
# Numerical branches: median imputation followed by a per-group scaler.
vintage_pipe = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])
age_pipe = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', MinMaxScaler()),
])
premium_pipe = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('log1p', log1p),
    ('robust', RobustScaler()),
])

# Categorical branches: most-frequent imputation followed by an encoder.
vehicle_age_pipe = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', veh_age_ord),
])
damage_pipe = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', damage_ord),
])
target_enc_pipe = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', te),
])

# Columns that belong to no branch are dropped, and output feature names are
# not prefixed with the branch name.
preprocess = ColumnTransformer(
    transformers=[
        ('std_vintage', vintage_pipe, num_std),
        ('minmax_age', age_pipe, num_minmax),
        ('log_robust_premium', premium_pipe, num_log_robust),
        ('ord_vehicle_age', vehicle_age_pipe, cat_vehicle_age),
        ('ord_damage', damage_pipe, cat_damage),
        ('cat_te', target_enc_pipe, cat_te),
    ],
    remainder='drop',
    verbose_feature_names_out=False,
)

# Model Training

## Features/Label Separation

In [87]:
# 'Response' is the label; every other column stays in the feature frame.
y_train = df_train['Response']
X_train = df_train.drop(columns='Response')

## Model Definition

In [88]:
# Hyperparameters taken from the best GridSearchCV result.
xgb_params = {
    'objective': 'binary:logistic',   # binary classification with logistic loss
    'eval_metric': 'logloss',         # metric reported during training
    'tree_method': 'hist',            # histogram-based tree construction
    'n_estimators': 200,              # boosting rounds
    'max_depth': 3,                   # depth limit per tree
    'learning_rate': 0.1,             # shrinkage applied to each round
    'subsample': 0.8,                 # fraction of rows sampled per tree
    'colsample_bytree': 1.0,          # fraction of features sampled per tree
    'random_state': 42,               # fixed seed for reproducibility
    'n_jobs': -1,                     # use every available CPU core
}
model = XGBClassifier(**xgb_params)

In [89]:
# End-to-end pipeline: sanitize the raw frame, preprocess the features, then classify.
pipeline_steps = [
    ('sanitizer', sanitizer),
    ('preprocess', preprocess),
    ('model', model),
]
pipe = Pipeline(steps=pipeline_steps)

In [90]:
# Fit all steps on the training data; the returned pipeline is shown as the cell output.
pipe.fit(X_train, y=y_train)

0,1,2
,steps,"[('sanitizer', ...), ('preprocess', ...), ...]"
,transform_input,
,memory,
,verbose,False

0,1,2
,func,<function san...001F8F7D9EA20>
,inverse_func,
,validate,False
,accept_sparse,False
,check_inverse,True
,feature_names_out,
,kw_args,
,inv_kw_args,

0,1,2
,transformers,"[('std_vintage', ...), ('minmax_age', ...), ...]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,False
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,feature_range,"(0, ...)"
,copy,True
,clip,False

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,func,<ufunc 'log1p'>
,inverse_func,
,validate,False
,accept_sparse,False
,check_inverse,True
,feature_names_out,
,kw_args,
,inv_kw_args,

0,1,2
,with_centering,True
,with_scaling,True
,quantile_range,"(25.0, ...)"
,copy,True
,unit_variance,False

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,"[['< 1 Year', '1-2 Year', ...]]"
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,unknown_value,
,encoded_missing_value,
,min_frequency,
,max_categories,

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,"[['No', 'Yes']]"
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,unknown_value,
,encoded_missing_value,
,min_frequency,
,max_categories,

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,target_type,'auto'
,smooth,20.0
,cv,5
,shuffle,True
,random_state,42

0,1,2
,objective,'binary:logistic'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,1.0
,device,
,early_stopping_rounds,
,enable_categorical,False


In [92]:
# Persist the fitted pipeline. The target directory is created first so the
# cell also succeeds on a fresh checkout where it does not exist yet.
# NOTE(review): the pipeline wraps the `sanitizer` function imported from
# `src.sanitizer`; presumably that module must be importable wherever this
# artifact is loaded — confirm before deploying.
model_path = "../src/models/full_pipeline.joblib"
os.makedirs(os.path.dirname(model_path), exist_ok=True)
joblib.dump(pipe, model_path)

['../src/models/full_pipeline.joblib']