In [184]:
from abc import ABC, abstractmethod
import optuna
import yaml
from sklearn.metrics import f1_score
from typing import Any, Dict, List, Optional
import copy
import pandas as pd
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split, RandomizedSearchCV, cross_val_score, StratifiedKFold, GridSearchCV
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import (
    OneHotEncoder, StandardScaler, OrdinalEncoder, PowerTransformer, RobustScaler, MinMaxScaler,
    FunctionTransformer,MinMaxScaler)
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from imblearn.combine import SMOTEENN
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score, roc_auc_score
import numpy as np
from utils_machine_learning import rename_columns_to_snake_case
import matplotlib.pyplot as plt
import seaborn as sns
from custom_transformers import (
    DropRedundantColumns,
    CreateNewFeature,
    ReplaceClassTransformer,
)
from optuna.samplers import TPESampler
from sklearn.decomposition import PCA
import yaml
import joblib
from typing import Dict, Any, Union, Callable, List
import copy
from imblearn.over_sampling import (
    RandomOverSampler,
    SMOTE,
    ADASYN,
    BorderlineSMOTE,
    SVMSMOTE,
    KMeansSMOTE,
)
from imblearn.under_sampling import (
    RandomUnderSampler,
    TomekLinks,
    NearMiss,
)
from imblearn.combine import (
    SMOTEENN,
    SMOTETomek
)

In [185]:
# load the dataset
def load_dataset() -> pd.DataFrame:
    """
    Load the dataset from the CSV file and return it as a pandas DataFrame.

    Args:
        None

    Returns:
        pd.DataFrame: The dataset loaded from the CSV file.
    """
    
    data_path = 'https://github.com/donadviser/datasets/raw/master/data-don/auto_insurance_claim_fraud.csv'
    data = pd.read_csv(data_path, sep=",")
    return (data
            .pipe(rename_columns_to_snake_case)
            #.dropna()
            )

In [186]:
data_raw = load_dataset()
data_raw.head()

Unnamed: 0,months_as_customer,age,policy_number,policy_bind_date,policy_state,policy_csl,policy_deductable,policy_annual_premium,umbrella_limit,insured_zip,...,witnesses,police_report_available,total_claim_amount,injury_claim,property_claim,vehicle_claim,auto_make,auto_model,auto_year,fraud_reported
0,328,48,521585,2014-10-17,OH,250/500,1000,1406.91,0,466132,...,2,YES,71610,6510,13020,52080,Saab,92x,2004,Y
1,228,42,342868,2006-06-27,IN,250/500,2000,1197.22,5000000,468176,...,0,?,5070,780,780,3510,Mercedes,E400,2007,Y
2,134,29,687698,2000-09-06,OH,100/300,2000,1413.14,5000000,430632,...,3,NO,34650,7700,3850,23100,Dodge,RAM,2007,N
3,256,41,227811,1990-05-25,IL,250/500,2000,1415.74,6000000,608117,...,2,NO,63400,6340,6340,50720,Chevrolet,Tahoe,2014,Y
4,228,44,367455,2014-06-06,IL,500/1000,1000,1583.91,6000000,610706,...,1,NO,6500,1300,650,4550,Accura,RSX,2009,N


In [187]:
categorical_columns = ['policy_state', 'policy_csl', 'insured_sex', 'insured_education_level', 
                       'insured_occupation', 'insured_hobbies', 'insured_relationship', 
                       'incident_type', 'collision_type', 'incident_severity', 
                       'authorities_contacted', 'incident_state', 'incident_city', 
                       'property_damage', 'police_report_available', 'auto_make', 
                       'policy_deductable', 'number_of_vehicles_involved', 'bodily_injuries', 
                       'witnesses', 'incident_hour_of_the_day']

onehot_features = ['policy_state', 'collision_type', 'property_damage', 'police_report_available', 
                  'insured_sex', 'insured_education_level', 'insured_relationship', 'incident_type', 
                  'incident_severity', 'authorities_contacted', 'incident_state', 'incident_city', 
                  'policy_deductable', 'number_of_vehicles_involved', 'bodily_injuries', 'witnesses', 
                  'incident_period_of_day']

numerical_features = ['months_as_customer',  'age', 'policy_annual_premium', 'injury_claim', 
                      'property_claim', 'vehicle_claim', 'vehicle_age',]

ordinal_features = ['insured_occupation', 'insured_hobbies', 'auto_make']

transform_features = ['umbrella_limit', 'capital_gains', 'capital_loss']

drop_columns = ['policy_number','policy_bind_date','policy_csl', 'insured_zip','incident_date',
                'incident_location','auto_model','auto_year', 'incident_hour_of_the_day',
                'total_claim_amount']

bins_hour = [-1, 5, 11, 17, 20, 24]  # Time bins for different periods of the day
names_period = ["early_morning", "morning", "afternoon", "evening", "night"] 

target_col = 'fraud_reported'

In [188]:
# Define the SMOTENN for resampling
smoteenn = SMOTEENN(random_state=42,sampling_strategy='minority' )

# Define the final pipeline with all transformations and resampling
pipeline_imbalance = ImbPipeline(
    steps = [  ]
    )

# Insert the transformations and resampling steps before the model in the pipeline
pipeline_imbalance.steps.insert(0, ['create_new_features', CreateNewFeature(bins_hour=bins_hour, names_period=names_period)])
pipeline_imbalance.steps.insert(1, ['replace_class', ReplaceClassTransformer(target_value="?", replacement_value='unknown')])
pipeline_imbalance.steps.insert(2,['drop_cols', DropRedundantColumns(redundant_cols=drop_columns)])
pipeline_imbalance.steps.insert(3, 
                                ['column_transformer', ColumnTransformer(
                                    transformers=[
                                        ('numerical', Pipeline([
                                            ('imputer', SimpleImputer(strategy='mean')),  # Mean imputation for numerical features
                                            #('scaler', StandardScaler())
                                            ]), numerical_features),
                                        
                                        ('categorical', Pipeline([
                                            ('imputer', SimpleImputer(strategy='most_frequent')),  # Mode imputation for categorical features
                                            ('onehot', OneHotEncoder(handle_unknown='ignore', drop='first'))
                                            ]), onehot_features),
                                        
                                        ('ordinal', Pipeline([
                                            ('imputer', SimpleImputer(strategy='most_frequent')),  # Mode imputation for ordinal features
                                            ('ordinal', OrdinalEncoder())
                                            ]), ordinal_features),
                                        
                                        ('power_transform', Pipeline([
                                            ('imputer', SimpleImputer(strategy='mean')),  # Mean imputation for features needing power transformation
                                            ('power_transformer', PowerTransformer(method='yeo-johnson'))
                                            ]), transform_features),
                                        ],
                                    remainder='passthrough'
                                    )
                                 ]
                                )

#pipeline_imbalance.steps.append( ['resample', smoteenn])
#pipeline_imbalance.steps.append( ['scaler', StandardScaler()])

In [190]:
# Assuming dataset is loaded in a pandas dataframe
X, y = data_raw.drop(columns=[target_col]), data_raw[target_col]

# Splitting the dataset
y = y.map({'Y': 0, 'N': 1})  # Map target labels
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [191]:
# Load YAML configuration file
def load_yaml_config(config_path: str) -> Dict[str, Any]:
    """
    Load the YAML configuration file containing model and hyperparameter definitions.

    Args:
        config_path (str): Path to the YAML configuration file.

    Returns:
        Dict[str, Any]: The loaded configuration as a dictionary.
    """
    with open(config_path, "r") as file:
        return yaml.safe_load(file)

In [208]:
# Objective function with dynamic classifier selection
def objective(trial: optuna.Trial, classifier_name: str, scoring='f1') -> float:
    """
    Objective function to optimize classifiers dynamically using Optuna.

    Args:
        trial (optuna.Trial): Optuna trial object for suggesting hyperparameters.
        model_config (Dict[str, Any]): Model configuration loaded from YAML file.

    Returns:
        float: The mean F1 score from cross-validation.
    """
    # Hyperparameters for each classifier (optimized for clarity)
    if classifier_name == "RandomForest":
        params = {
            "n_estimators": trial.suggest_int("n_estimators", 50, 300),
            "max_depth": trial.suggest_int("max_depth", 2, 30),
            "min_samples_split": trial.suggest_int("min_samples_split", 2, 20),
            "min_samples_leaf": trial.suggest_int("min_samples_leaf", 1, 20),
        }
    elif classifier_name == "DecisionTree":
        params = {
            "max_depth": trial.suggest_int("max_depth", 2, 30),
            "min_samples_split": trial.suggest_int("min_samples_split", 2, 20),
            "min_samples_leaf": trial.suggest_int("min_samples_leaf", 1, 20),
        }
    elif classifier_name == "LGBM":
        params = {
            "objective": "binary",
            "metric": "binary_logloss",
            "verbosity": -1,
            "boosting_type": "gbdt",
            "lambda_l1": trial.suggest_float("lambda_l1", 1e-8, 10.0, log=True),
            "lambda_l2": trial.suggest_float("lambda_l2", 1e-8, 10.0, log=True),
            "num_leaves": trial.suggest_int("num_leaves", 2, 256),
            "feature_fraction": trial.suggest_float("feature_fraction", 0.4, 1.0),
            "bagging_fraction": trial.suggest_float("bagging_fraction", 0.4, 1.0),
            "bagging_freq": trial.suggest_int("bagging_freq", 1, 7),
            "min_child_samples": trial.suggest_int("min_child_samples", 5, 100),
        }
    elif classifier_name == "XGBoost":
        params = {
            "verbosity": 0,
            "objective": "binary:logistic",
            "eval_metric": "auc",
            "booster": trial.suggest_categorical("booster", ["gbtree", "gblinear", "dart"]),
            "lambda": trial.suggest_float("lambda", 1e-8, 1.0, log=True),
            "alpha": trial.suggest_float("alpha", 1e-8, 1.0, log=True),
            # sampling ratio for training data.
            "subsample": trial.suggest_float("subsample", 0.2, 1.0),
            # sampling according to each tree.
            "colsample_bytree": trial.suggest_float("colsample_bytree", 0.2, 1.0),
        }
    elif classifier_name == "CatBoost":
        # Hyperparameters for CatBoostClassifier
        params = {
            "objective": trial.suggest_categorical("objective", ["Logloss", "CrossEntropy"]),
            "colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.01, 0.1),
            "depth": trial.suggest_int("depth", 1, 12),
            "boosting_type": trial.suggest_categorical("boosting_type", ["Ordered", "Plain"]),
            "bootstrap_type": trial.suggest_categorical(
                "bootstrap_type", ["Bayesian", "Bernoulli", "MVS"]
        ),
        }
    elif classifier_name == 'LogisticRegression':
        params = {
            "C": trial.suggest_float('C', 1e-10, 1000, log=True),
            "max_iter": trial.suggest_int('max_iter', 1, 1000, log=False),
            "l1_ratio": trial.suggest_float('l1_ratio', 0, 1, log=False),
            "solver": trial.suggest_categorical('solver', ['lbfgs', 'liblinear', 'sag', 'saga']),
        }
        if params['solver'] == 'lbfgs':
            params['penalty'] = trial.suggest_categorical('lbfgs', ['l2', None])
        elif params['solver'] == 'liblinear':
            params['penalty'] = trial.suggest_categorical('liblinear', ['l1', 'l2'])
        elif params['solver'] == 'sag':
            params['penalty'] = trial.suggest_categorical('sag', ['l2', None])
        else:
            params['penalty'] = trial.suggest_categorical('saga', ['elasticnet', 'l1', 'l2', None])
        
        
    else:
        raise ValueError(f"Invalid parameter")
    

    # Create the appropriate model instance
    if classifier_name in ["CatBoost"]:
        model = {
            "LGBM": LGBMClassifier,
            "XGBoost": XGBClassifier,
            "CatBoost": CatBoostClassifier,
            "RandomForest": RandomForestClassifier,
            "DecisionTree": DecisionTreeClassifier,
            "LogisticRegression": LogisticRegression
        }[classifier_name](**params, verbose=0)
    else:
        model = {
            "LGBM": LGBMClassifier,
            "XGBoost": XGBClassifier,
            "CatBoost": CatBoostClassifier,
            "RandomForest": RandomForestClassifier,
            "DecisionTree": DecisionTreeClassifier,
            "LogisticRegression": LogisticRegression
        }[classifier_name](**params)
    
    
    # Scaler and dimensionality reduction (optional)
    scalers = trial.suggest_categorical("scaler", ['minmax', 'standard', 'robust'])
    scaler = {
        'minmax': MinMaxScaler(),
        'standard': StandardScaler(),
        'robust': RobustScaler()
    }[scalers]

    dim_red = trial.suggest_categorical("dim_red", ["PCA", None])
    if dim_red == "PCA":
        pca_n_components = trial.suggest_int("pca_n_components", 2, 30)
        dimen_red_algorithm = PCA(n_components=pca_n_components)
    else:
        dimen_red_algorithm = 'passthrough'
        
    resampler = trial.suggest_categorical('resampler', ['RandomOverSampler', 'ADASYN', 
                                                        'RandomUnderSampler', 'NearMiss',
                                                        'SMOTEENN', 'SMOTETomek'])    
    if resampler == 'RandomOverSampler':
        resampler_obj = RandomOverSampler(random_state=48)
    elif resampler == 'ADASYN':
        resampler_obj = ADASYN(random_state=48)
    elif resampler == 'RandomUnderSampler':
        resampler_obj = RandomUnderSampler(random_state=48)
    elif resampler == 'NearMiss':
        resampler_obj = NearMiss()
    elif resampler == 'SMOTEENN':
        resampler_obj = SMOTEENN(random_state=48)
    else:
        resampler_obj = SMOTETomek(random_state=48)

    # Create and modify the pipeline with the selected steps
    pipe = copy.deepcopy(pipeline_imbalance)
    pipe.steps.insert(4, ['resampler', resampler_obj])
    pipe.steps.insert(5, ['scaler', scaler])
    pipe.steps.insert(6, ['dimen_red_algorithm', dimen_red_algorithm])
    pipe.steps.insert(7, ['model', model])

    # Cross-validation
    kfold = StratifiedKFold(n_splits=10)
    score = cross_val_score(pipe, X_train, y_train, scoring=scoring, n_jobs=-1, cv=kfold, verbose=0)


    # Return the mean scoring score
    return score

In [214]:

def run_optimization(config_path: str, n_trials: int = 100, scoring: str ='f1') -> None:
    """
    Run Optuna study for hyperparameter tuning and model selection.

    Args:
        config_path (str): Path to the YAML configuration file.
        n_trials (int): Number of optimization trials to run. Defaults to 100.
    
    Returns:
        None: Outputs the best model, parameters, and results.
    """
    # Load the configuration from YAML
    model_configs = load_yaml_config(config_path)
    
    best_f1 = 0
    best_model = None
    best_params = None
    results = []
    
    all_models = ["RandomForest", "DecisionTree", "XGBoost", "LogisticRegression", "LGBM", "CatBoost"]

    # Loop through models in YAML config and perform optimization
    for model_config in all_models:
        print(f"\nOptimizing model: {model_config}")
        study = optuna.create_study(direction="maximize", sampler=TPESampler())
        study.optimize(lambda trial: objective(trial, model_config, scoring), n_trials=n_trials)

        best_trial = study.best_trial
        results.append({
            "model": model_config,
            "params": best_trial.params,
            "model_score_params": best_trial.params,
            "model_score_trial_number": best_trial.number,
            "model_score_datetime": best_trial.datetime_start,
            "model_score_duration": best_trial.duration,
            "model_score_status": best_trial.state,
            "model_score_key": scoring,
            "model_score_value": best_trial.value
            
        })

        current_score = best_trial.value
        print(f"Model: {model_config}, F1 Score: {current_score}")
        
        if current_score and current_score > best_f1:
            best_f1 = best_trial.value
            best_model = model_config
            best_params = best_trial.params

    # Display all results and the best model
    for result in results:
        print(result)
    
    print("Best model:", best_model)
    print("Best parameters:", best_params)
    
    # Save the best model to a pickle file
    pipe_predict = copy.deepcopy(pipeline_imbalance)
    pipe_predict.steps.insert(4, ['resampler', smoteenn])
    
    print(f"best_params.type: {type(best_params)}")
    
    # Check if scalar exists in best_params before trying to extract it
        # Access the scalar without popping it out of the dictionary
    scalar_key = best_params['scaler']  # No need to use .pop()

    # Map the scalar key to the correct scaler
    scaler = {
        'minmax': MinMaxScaler(),
        'standard': StandardScaler(),
        'robust': RobustScaler()
    }[scalar_key]

    # Insert the scaler into the pipeline
    pipe_predict.steps.insert(5, ['scaler', StandardScaler()])
    
    best_params.pop('scaler')
    best_params.pop('resampler')
    best_params.pop('dim_red')
    print(f"best_params: {best_params}")
    
    # Create the appropriate model instance
    if best_model in ["CatBoost"]:
        best_model_instance = {
            "LGBM": LGBMClassifier,
            "XGBoost": XGBClassifier,
            "CatBoost": CatBoostClassifier,
            "RandomForest": RandomForestClassifier,
            "DecisionTree": DecisionTreeClassifier,
            "LogisticRegression": LogisticRegression
        }[best_model](**best_params, verbose=0)
    else:
        best_model_instance = {
            "LGBM": LGBMClassifier,
            "XGBoost": XGBClassifier,
            "CatBoost": CatBoostClassifier,
            "RandomForest": RandomForestClassifier,
            "DecisionTree": DecisionTreeClassifier,
            "LogisticRegression": LogisticRegression
        }[best_model](**best_params)
        
    
    
    
    
    # Save the best model to a pickle file
    pipe_predict.steps.insert(6, ['model', best_model_instance])
    
    pipe_predict.fit(X_train, y_train)  # Ensure both X and y are passed
    joblib.dump(pipe_predict, "best_model.pkl")

In [215]:
if __name__ == "__main__":
    # Example usage
    config_path = "model_config.yaml"
    scoring = 'roc_auc'
    run_optimization(config_path, n_trials=100, scoring=scoring)

[I 2024-10-16 08:02:05,689] A new study created in memory with name: no-name-5903abd7-38c3-4bba-925d-c6de6ab44699
[I 2024-10-16 08:02:05,852] Trial 0 finished with value: 0.7246376811594203 and parameters: {'n_estimators': 60, 'max_depth': 13, 'min_samples_split': 20, 'min_samples_leaf': 17, 'scaler': 'minmax', 'dim_red': 'PCA', 'pca_n_components': 12, 'resampler': 'SMOTETomek'}. Best is trial 0 with value: 0.7246376811594203.



Optimizing model: RandomForest


[I 2024-10-16 08:02:06,082] Trial 1 finished with value: 0.514018691588785 and parameters: {'n_estimators': 178, 'max_depth': 28, 'min_samples_split': 16, 'min_samples_leaf': 2, 'scaler': 'robust', 'dim_red': 'PCA', 'pca_n_components': 10, 'resampler': 'NearMiss'}. Best is trial 0 with value: 0.7246376811594203.
[I 2024-10-16 08:02:07,191] Trial 2 finished with value: 0.7763157894736842 and parameters: {'n_estimators': 283, 'max_depth': 24, 'min_samples_split': 8, 'min_samples_leaf': 7, 'scaler': 'minmax', 'dim_red': 'PCA', 'pca_n_components': 26, 'resampler': 'SMOTETomek'}. Best is trial 2 with value: 0.7763157894736842.
[I 2024-10-16 08:02:07,406] Trial 3 finished with value: 0.44660194174757284 and parameters: {'n_estimators': 113, 'max_depth': 21, 'min_samples_split': 4, 'min_samples_leaf': 9, 'scaler': 'robust', 'dim_red': 'PCA', 'pca_n_components': 18, 'resampler': 'NearMiss'}. Best is trial 2 with value: 0.7763157894736842.
[I 2024-10-16 08:02:07,820] Trial 4 finished with value

Model: RandomForest, F1 Score: 0.8881578947368421

Optimizing model: DecisionTree


[I 2024-10-16 08:02:33,310] Trial 4 finished with value: 0.4411764705882353 and parameters: {'max_depth': 5, 'min_samples_split': 6, 'min_samples_leaf': 3, 'scaler': 'minmax', 'dim_red': 'PCA', 'pca_n_components': 14, 'resampler': 'NearMiss'}. Best is trial 0 with value: 0.9047619047619048.
[I 2024-10-16 08:02:33,384] Trial 5 finished with value: 0.7896678966789668 and parameters: {'max_depth': 22, 'min_samples_split': 2, 'min_samples_leaf': 5, 'scaler': 'robust', 'dim_red': None, 'resampler': 'RandomUnderSampler'}. Best is trial 0 with value: 0.9047619047619048.
[I 2024-10-16 08:02:33,439] Trial 6 finished with value: 0.9047619047619048 and parameters: {'max_depth': 14, 'min_samples_split': 5, 'min_samples_leaf': 15, 'scaler': 'standard', 'dim_red': None, 'resampler': 'RandomUnderSampler'}. Best is trial 0 with value: 0.9047619047619048.
[I 2024-10-16 08:02:33,511] Trial 7 finished with value: 0.7317073170731707 and parameters: {'max_depth': 30, 'min_samples_split': 20, 'min_samples_l

Model: DecisionTree, F1 Score: 0.9054054054054054

Optimizing model: XGBoost


[I 2024-10-16 08:02:38,817] Trial 0 finished with value: 0.8451612903225807 and parameters: {'booster': 'gbtree', 'lambda': 1.5265272445197526e-06, 'alpha': 3.994215448919692e-07, 'subsample': 0.2066530240557377, 'colsample_bytree': 0.8728165356665254, 'scaler': 'robust', 'dim_red': None, 'resampler': 'RandomOverSampler'}. Best is trial 0 with value: 0.8451612903225807.
[I 2024-10-16 08:02:40,005] Trial 1 finished with value: 0.6441947565543071 and parameters: {'booster': 'dart', 'lambda': 0.0010007015067251554, 'alpha': 0.004270884134130908, 'subsample': 0.5234657160831733, 'colsample_bytree': 0.5349581521606062, 'scaler': 'minmax', 'dim_red': 'PCA', 'pca_n_components': 13, 'resampler': 'SMOTEENN'}. Best is trial 0 with value: 0.8451612903225807.
[I 2024-10-16 08:02:40,284] Trial 2 finished with value: 0.5493562231759657 and parameters: {'booster': 'gbtree', 'lambda': 0.1548126962411679, 'alpha': 0.023258896694330494, 'subsample': 0.950304337461491, 'colsample_bytree': 0.7349020500290

Model: XGBoost, F1 Score: 0.8930817610062893

Optimizing model: LogisticRegression


[I 2024-10-16 08:03:58,556] Trial 3 finished with value: 0.5589519650655022 and parameters: {'C': 0.01023695152693874, 'max_iter': 313, 'l1_ratio': 0.011180647504524854, 'solver': 'liblinear', 'liblinear': 'l1', 'scaler': 'robust', 'dim_red': 'PCA', 'pca_n_components': 13, 'resampler': 'RandomOverSampler'}. Best is trial 1 with value: 0.8316151202749141.
[I 2024-10-16 08:03:58,633] Trial 4 finished with value: 0.43781094527363185 and parameters: {'C': 0.0009438116241585153, 'max_iter': 652, 'l1_ratio': 0.35003852856548734, 'solver': 'lbfgs', 'lbfgs': 'l2', 'scaler': 'robust', 'dim_red': 'PCA', 'pca_n_components': 25, 'resampler': 'NearMiss'}. Best is trial 1 with value: 0.8316151202749141.
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/l

Model: LogisticRegression, F1 Score: 0.8603988603988604

Optimizing model: LGBM


[I 2024-10-16 08:04:07,720] Trial 1 finished with value: 0.7101449275362319 and parameters: {'lambda_l1': 1.3968449917343459e-05, 'lambda_l2': 3.313971601565267e-08, 'num_leaves': 8, 'feature_fraction': 0.7935119393586056, 'bagging_fraction': 0.7942770533655346, 'bagging_freq': 7, 'min_child_samples': 63, 'scaler': 'minmax', 'dim_red': 'PCA', 'pca_n_components': 17, 'resampler': 'RandomUnderSampler'}. Best is trial 1 with value: 0.7101449275362319.
[I 2024-10-16 08:04:07,960] Trial 2 finished with value: 0.4930232558139535 and parameters: {'lambda_l1': 0.32217853385197126, 'lambda_l2': 2.083801902054794e-08, 'num_leaves': 23, 'feature_fraction': 0.7510762714750823, 'bagging_fraction': 0.945574798744002, 'bagging_freq': 2, 'min_child_samples': 39, 'scaler': 'standard', 'dim_red': 'PCA', 'pca_n_components': 2, 'resampler': 'NearMiss'}. Best is trial 1 with value: 0.7101449275362319.
[I 2024-10-16 08:04:08,785] Trial 3 finished with value: 0.8587896253602305 and parameters: {'lambda_l1': 

Model: LGBM, F1 Score: 0.9054054054054054

Optimizing model: CatBoost


[I 2024-10-16 08:04:29,268] Trial 0 finished with value: 0.8676923076923077 and parameters: {'objective': 'CrossEntropy', 'colsample_bylevel': 0.043653803429542205, 'depth': 3, 'boosting_type': 'Plain', 'bootstrap_type': 'Bernoulli', 'scaler': 'minmax', 'dim_red': None, 'resampler': 'SMOTETomek'}. Best is trial 0 with value: 0.8676923076923077.
[I 2024-10-16 08:04:34,896] Trial 1 finished with value: 0.8454258675078864 and parameters: {'objective': 'Logloss', 'colsample_bylevel': 0.09551613603149922, 'depth': 9, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bernoulli', 'scaler': 'standard', 'dim_red': None, 'resampler': 'SMOTEENN'}. Best is trial 0 with value: 0.8676923076923077.
[I 2024-10-16 08:04:36,642] Trial 2 finished with value: 0.8695652173913043 and parameters: {'objective': 'CrossEntropy', 'colsample_bylevel': 0.07125941148044403, 'depth': 4, 'boosting_type': 'Ordered', 'bootstrap_type': 'MVS', 'scaler': 'standard', 'dim_red': None, 'resampler': 'SMOTETomek'}. Best is trial 

Model: CatBoost, F1 Score: 0.9016949152542373
{'model': 'RandomForest', 'params': {'n_estimators': 255, 'max_depth': 11, 'min_samples_split': 11, 'min_samples_leaf': 14, 'scaler': 'standard', 'dim_red': None, 'resampler': 'RandomOverSampler'}, 'model_score_params': {'n_estimators': 255, 'max_depth': 11, 'min_samples_split': 11, 'min_samples_leaf': 14, 'scaler': 'standard', 'dim_red': None, 'resampler': 'RandomOverSampler'}, 'model_score_trial_number': 19, 'model_score_datetime': datetime.datetime(2024, 10, 16, 8, 2, 11, 355240), 'model_score_duration': datetime.timedelta(microseconds=286362), 'model_score_status': 1, 'model_score_key': 'roc_auc', 'model_score_value': 0.8881578947368421}
{'model': 'DecisionTree', 'params': {'max_depth': 7, 'min_samples_split': 7, 'min_samples_leaf': 9, 'scaler': 'robust', 'dim_red': None, 'resampler': 'RandomUnderSampler'}, 'model_score_params': {'max_depth': 7, 'min_samples_split': 7, 'min_samples_leaf': 9, 'scaler': 'robust', 'dim_red': None, 'resampl

In [211]:
model = joblib.load('best_model.pkl')
model

In [216]:
ModelFactory
PipelineBuilder
PipelineModifier


NameError: name 'ModelFactory' is not defined

In [212]:

model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print(y_pred.tolist())
print(y_test.iloc[19:32].to_list())

[1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1]
[1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1]


In [213]:
accuracy = accuracy_score(y_test.iloc[19:32], y_pred)
accuracy

0.8461538461538461