In [1]:
import optuna
import yaml
import json
from sklearn.metrics import f1_score
from typing import Any, Dict, Tuple
import pandas as pd
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import (
    OneHotEncoder, StandardScaler, OrdinalEncoder, PowerTransformer, RobustScaler, MinMaxScaler,
    FunctionTransformer)
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.metrics import  accuracy_score, precision_score, recall_score, roc_auc_score, classification_report
import numpy as np
from utils_machine_learning import rename_columns_to_snake_case

from preprocessor_transformers import (
    DropColumns,
    DomainFeatureCreator,
    LogTransformer,
    PowerTransformerWrapper,
    ReplaceValueTransformer,
    ModeImputer,
    MedianImputer,
    MeanImputer,
    CustomLabelEncoder
)
from optuna.samplers import TPESampler
from sklearn.decomposition import PCA

import joblib
from typing import Union

from imblearn.over_sampling import (
    RandomOverSampler,
    ADASYN,

)
from imblearn.under_sampling import (
    RandomUnderSampler,
    NearMiss,
)
from imblearn.combine import (
    SMOTEENN,
    SMOTETomek
)
# Notebook-wide pandas setting: turn on the 'future.no_silent_downcasting' option.
pd.set_option('future.no_silent_downcasting', True)

In [2]:
# load the dataset
def load_dataset(
    data_path: str = 'https://github.com/donadviser/datasets/raw/master/data-don/auto_insurance_claim_fraud.csv',
    sep: str = ",",
) -> pd.DataFrame:
    """
    Load a CSV dataset and return it as a pandas DataFrame.

    The raw frame is read with ``pd.read_csv`` and then piped through
    ``rename_columns_to_snake_case``. No rows are dropped here, so missing
    values are left in place for later steps to handle.

    Args:
        data_path (str): Path or URL of the CSV file. Defaults to the auto
            insurance claim fraud dataset hosted on GitHub, so calling
            ``load_dataset()`` with no arguments behaves as before.
        sep (str): Field delimiter passed to ``pd.read_csv``. Defaults to ",".

    Returns:
        pd.DataFrame: The dataset loaded from the CSV file, after
        ``rename_columns_to_snake_case`` has been applied.
    """
    data = pd.read_csv(data_path, sep=sep)
    return data.pipe(rename_columns_to_snake_case)

In [3]:
# Load the raw dataset through load_dataset() and preview its first rows.
data_raw = load_dataset()
data_raw.head()

Unnamed: 0,months_as_customer,age,policy_number,policy_bind_date,policy_state,policy_csl,policy_deductable,policy_annual_premium,umbrella_limit,insured_zip,...,witnesses,police_report_available,total_claim_amount,injury_claim,property_claim,vehicle_claim,auto_make,auto_model,auto_year,fraud_reported
0,328,48,521585,2014-10-17,OH,250/500,1000,1406.91,0,466132,...,2,YES,71610,6510,13020,52080,Saab,92x,2004,Y
1,228,42,342868,2006-06-27,IN,250/500,2000,1197.22,5000000,468176,...,0,?,5070,780,780,3510,Mercedes,E400,2007,Y
2,134,29,687698,2000-09-06,OH,100/300,2000,1413.14,5000000,430632,...,3,NO,34650,7700,3850,23100,Dodge,RAM,2007,N
3,256,41,227811,1990-05-25,IL,250/500,2000,1415.74,6000000,608117,...,2,NO,63400,6340,6340,50720,Chevrolet,Tahoe,2014,Y
4,228,44,367455,2014-06-06,IL,500/1000,1000,1583.91,6000000,610706,...,1,NO,6500,1300,650,4550,Accura,RSX,2009,N


In [4]:
# Categorical columns; the preprocessing pipeline passes this list to its
# mode-imputation and label-encoding steps.
onehot_features = [
    'policy_state',
    'collision_type',
    'property_damage',
    'police_report_available',
    'insured_sex',
    'insured_education_level',
    'insured_relationship',
    'incident_type',
    'incident_severity',
    'authorities_contacted',
    'incident_state',
    'incident_city',
    'policy_deductable',
    'number_of_vehicles_involved',
    'bodily_injuries',
    'witnesses',
    'incident_period_of_day',
]

# Numeric columns.
numerical_features = [
    'months_as_customer',
    'age',
    'policy_annual_premium',
    'injury_claim',
    'property_claim',
    'vehicle_claim',
    'vehicle_age',
    'total_claim_amount',
]

# Columns earmarked (by name) for ordinal encoding.
ordinal_features = [
    'insured_occupation',
    'insured_hobbies',
    'auto_make',
]

# Columns earmarked (by name) for a distribution transform.
transform_features = [
    'umbrella_limit',
    'capital_gains',
    'capital_loss',
]

# Columns removed by the pipeline's drop step.
drop_columns = [
    'policy_number',
    'policy_bind_date',
    'policy_csl',
    'insured_zip',
    'incident_date',
    'incident_location',
    'auto_model',
    'auto_year',
    'incident_hour_of_the_day',
]

# Hour-of-day bin edges and the label for each resulting period of the day.
bins_hour = [0, 6, 11, 16, 21, 24]
names_period = [
    "early_morning",
    "morning",
    "afternoon",
    "evening",
    "night",
]

# Name of the label column.
target_col = 'fraud_reported'

In [5]:
# Separate the predictors from the label column of the loaded frame.
X = data_raw.drop(columns=[target_col])

# Encode the label as integers: 'Y' -> 1, 'N' -> 0.
y = data_raw[target_col].map({'Y': 1, 'N': 0})

# Series.map turns any value missing from the dict into NaN without warning.
# Fail loudly here instead of letting NaN labels reach the stratified split.
unmapped = y.isna()
if unmapped.any():
    unexpected = sorted(data_raw.loc[unmapped, target_col].astype(str).unique())
    raise ValueError(
        f"Unexpected values in '{target_col}' (expected 'Y' or 'N'): {unexpected}"
    )

# 80/20 split, stratified on the label, with a fixed seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [6]:
# Feature-engineering / cleaning pipeline; the steps run in the order listed.
# NOTE(review): the transformers come from preprocessor_transformers, so the
# per-step descriptions below are inferred from names and arguments - confirm there.
preprocessor = ImbPipeline(
    steps=[
        # Domain-specific feature creation, configured with the hour bins and period labels.
        (
            'create_new_features',
            DomainFeatureCreator(bins_hour=bins_hour, names_period=names_period),
        ),
        # Remove the columns listed in drop_columns.
        (
            'drop_cols',
            DropColumns(columns_to_drop=drop_columns),
        ),
        # Swap the "?" placeholder for NaN.
        (
            'replace_class',
            ReplaceValueTransformer(old_value="?", new_value=np.nan),
        ),
        # Presumably fills missing values in the categorical columns with the mode.
        (
            'category_imputer',
            ModeImputer(columns=onehot_features),
        ),
        # Presumably label-encodes the same categorical columns as integers.
        (
            'cat_label_encoder',
            CustomLabelEncoder(columns=onehot_features),
        ),
    ]
)

In [7]:
# Bare expression: show the notebook's representation of the pipeline object.
preprocessor

In [8]:
# Fit the preprocessing pipeline on the training split and transform it in one call
# (X_test is not involved here), then preview the first transformed rows.
X_transformed = preprocessor.fit_transform(X_train)
X_transformed.head()

Unnamed: 0,months_as_customer,age,policy_state,policy_deductable,policy_annual_premium,umbrella_limit,insured_sex,insured_education_level,insured_occupation,insured_hobbies,...,bodily_injuries,witnesses,police_report_available,total_claim_amount,injury_claim,property_claim,vehicle_claim,auto_make,vehicle_age,incident_period_of_day
887,441,55,0,2,1270.29,4000000,0,0,armed-forces,exercise,...,2,1,1,6400,640,640,5120,Honda,22,4
317,275,45,0,0,1447.77,0,1,4,adm-clerical,camping,...,0,3,1,64320,5360,10720,48240,Accura,26,1
796,421,56,0,2,1935.85,4000000,0,3,machine-op-inspct,reading,...,1,0,1,92730,16860,8430,67440,Mercedes,20,1
425,4,34,2,2,1282.93,0,1,0,exec-managerial,basketball,...,2,1,1,66880,6080,12160,48640,Chevrolet,28,4
991,257,44,2,1,1280.88,0,0,2,other-service,basketball,...,2,3,1,46980,0,5220,41760,Accura,22,2
