In [1]:
import optuna
import yaml
from sklearn.metrics import f1_score
from typing import Any, Dict
import pandas as pd
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import (
    OneHotEncoder, StandardScaler, OrdinalEncoder, PowerTransformer, RobustScaler, MinMaxScaler,
    FunctionTransformer)
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.metrics import  accuracy_score, precision_score, recall_score
import numpy as np
from utils_machine_learning import rename_columns_to_snake_case

from custom_transformers import (
    DropRedundantColumns,
    CreateNewFeature,

    LogTransforms,
    ReplaceValueTransformer,

)
from optuna.samplers import TPESampler
from sklearn.decomposition import PCA

import joblib
from typing import Union

from imblearn.over_sampling import (
    RandomOverSampler,
    ADASYN,
    
)
from imblearn.under_sampling import (
    RandomUnderSampler,
    NearMiss,
)
from imblearn.combine import (
    SMOTEENN,
    SMOTETomek
)

In [2]:
def log_transformer(X):
    """
    Compress large monetary magnitudes with ``log1p(|x|)``.

    The previous implementation replaced the input with the ndarray returned by
    ``np.where`` and then indexed it by column label, which always raised.
    This version accepts both input shapes the notebook can produce:

    * ``pd.DataFrame`` - only ``capital_gains``, ``capital_loss`` and
      ``umbrella_limit`` are transformed; every other column is returned
      unchanged.
    * anything else array-like (for example the ndarray handed over by a
      preceding imputer step) - every column is transformed, because column
      labels are no longer available.

    The absolute value is taken first so that losses stored as negative
    numbers are mapped to the log of their magnitude, and ``log1p`` keeps
    zeros at exactly zero without an epsilon.

    Args:
        X: DataFrame containing the three columns above, or a numeric
           array-like holding only the columns to transform.

    Returns:
        A new object of the same kind as the input (DataFrame or ndarray);
        the input itself is never modified.

    Raises:
        KeyError: If ``X`` is a DataFrame that lacks one of the three columns.
    """
    if isinstance(X, pd.DataFrame):
        X_transformed = X.copy()
        for column in ('capital_gains', 'capital_loss', 'umbrella_limit'):
            # Cast to float first so integer columns do not truncate the result.
            magnitude = X_transformed[column].abs().astype(float)
            X_transformed[column] = np.log1p(magnitude)
        return X_transformed

    # Label-free input: treat every column as one of the skewed features.
    return np.log1p(np.abs(np.asarray(X, dtype=float)))

In [3]:
# load the dataset
def load_dataset(
    data_path: str = 'https://github.com/donadviser/datasets/raw/master/data-don/auto_insurance_claim_fraud.csv',
) -> pd.DataFrame:
    """
    Read the insurance-claim CSV and return it with snake_case column names.

    Args:
        data_path (str): Location of the comma-separated file, passed straight
            to ``pd.read_csv``. Defaults to the hosted copy of the dataset, so
            calling ``load_dataset()`` with no arguments behaves as before.

    Returns:
        pd.DataFrame: The loaded rows after ``rename_columns_to_snake_case``
        has been applied. Missing values are kept (no rows are dropped here).
    """
    data = pd.read_csv(data_path, sep=",")
    return data.pipe(rename_columns_to_snake_case)

In [4]:
# Load the raw claims table once; the split cell further down derives X / y from it.
data_raw = load_dataset()
# Preview the first rows as the cell output.
data_raw.head()

Unnamed: 0,months_as_customer,age,policy_number,policy_bind_date,policy_state,policy_csl,policy_deductable,policy_annual_premium,umbrella_limit,insured_zip,...,witnesses,police_report_available,total_claim_amount,injury_claim,property_claim,vehicle_claim,auto_make,auto_model,auto_year,fraud_reported
0,328,48,521585,2014-10-17,OH,250/500,1000,1406.91,0,466132,...,2,YES,71610,6510,13020,52080,Saab,92x,2004,Y
1,228,42,342868,2006-06-27,IN,250/500,2000,1197.22,5000000,468176,...,0,?,5070,780,780,3510,Mercedes,E400,2007,Y
2,134,29,687698,2000-09-06,OH,100/300,2000,1413.14,5000000,430632,...,3,NO,34650,7700,3850,23100,Dodge,RAM,2007,N
3,256,41,227811,1990-05-25,IL,250/500,2000,1415.74,6000000,608117,...,2,NO,63400,6340,6340,50720,Chevrolet,Tahoe,2014,Y
4,228,44,367455,2014-06-06,IL,500/1000,1000,1583.91,6000000,610706,...,1,NO,6500,1300,650,4550,Accura,RSX,2009,N


In [29]:
# Inspect the three columns later listed in `transform_features`.
# In the displayed rows, capital_loss is stored as zero or a negative number.
data_raw[['capital_gains', 'capital_loss', 'umbrella_limit']]

Unnamed: 0,capital_gains,capital_loss,umbrella_limit
0,53300,0,0
1,0,0,5000000
2,35100,0,5000000
3,48900,-62400,6000000
4,66000,-46000,6000000
...,...,...,...
995,0,0,0
996,70900,0,0
997,35100,0,3000000
998,0,0,5000000


In [5]:
# Column groups consumed by PreprocessingPipeline (one transformer branch each).

# Routed through the impute -> one-hot branch.
onehot_features = [
    'policy_state',
    'collision_type',
    'property_damage',
    'police_report_available',
    'insured_sex',
    'insured_education_level',
    'insured_relationship',
    'incident_type',
    'incident_severity',
    'authorities_contacted',
    'incident_state',
    'incident_city',
    'policy_deductable',
    'number_of_vehicles_involved',
    'bodily_injuries',
    'witnesses',
    'incident_period_of_day',
]

# Routed through the numerical imputation branch.
numerical_features = [
    'months_as_customer',
    'age',
    'policy_annual_premium',
    'injury_claim',
    'property_claim',
    'vehicle_claim',
    'vehicle_age',
]

# Routed through the impute -> ordinal-encode branch.
ordinal_features = [
    'insured_occupation',
    'insured_hobbies',
    'auto_make',
]

# Routed through the impute -> outlier-transform branch.
transform_features = [
    'umbrella_limit',
    'capital_gains',
    'capital_loss',
]

# Handed to DropRedundantColumns before the column transformer runs.
drop_columns = [
    'policy_number',
    'policy_bind_date',
    'policy_csl',
    'insured_zip',
    'incident_date',
    'incident_location',
    'auto_model',
    'auto_year',
    'incident_hour_of_the_day',
    'total_claim_amount',
]

# Bin edges (hour of day) and the matching period labels given to CreateNewFeature.
bins_hour = [-1, 5, 11, 17, 20, 24]
names_period = [
    "early_morning",
    "morning",
    "afternoon",
    "evening",
    "night",
]

# Name of the label column.
target_col = 'fraud_reported'

In [101]:
# Requires `data_raw` (loaded above) and `target_col` from the configuration cell.
# Predictors: every column except the label.
X = data_raw.drop(columns=[target_col])
# Label: 'Y' / 'N' strings encoded as 1 / 0.
y = data_raw[target_col].map({'Y': 1, 'N': 0})

# Hold out 20% of the rows, stratified on the label, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [7]:
# Load YAML configuration file
def load_yaml_config(config_path: str) -> Dict[str, Any]:
    """
    Load the YAML configuration file containing model and hyperparameter definitions.

    The file is opened explicitly as UTF-8 so the result does not depend on the
    platform's default encoding. An empty document, for which
    ``yaml.safe_load`` yields ``None``, is returned as an empty dictionary so
    the declared return type holds.

    Args:
        config_path (str): Path to the YAML configuration file.

    Returns:
        Dict[str, Any]: The parsed configuration, or ``{}`` when the file
        contains no document.

    Raises:
        OSError: If the file cannot be opened.
    """
    with open(config_path, "r", encoding="utf-8") as file:
        config = yaml.safe_load(file)
    return {} if config is None else config

In [8]:
class HyperparameterTuner:
    """
    Supplies an Optuna search space for each supported classifier.

    Every ``_*_space`` helper asks the trial for its values in a fixed order
    and returns them as a keyword dictionary for the matching estimator.
    """

    def get_params(self, trial: optuna.Trial, classifier_name: str):
        """
        Sample hyperparameters for ``classifier_name`` from ``trial``.

        Args:
            trial (optuna.Trial): Trial used for every ``suggest_*`` call.
            classifier_name (str): One of "RandomForest", "DecisionTree",
                "LGBM", "XGBoost", "CatBoost", "LogisticRegression",
                "GradientBoosting" or "KNeighbors".

        Returns:
            dict: Parameter names mapped to the sampled (or fixed) values.

        Raises:
            ValueError: If ``classifier_name`` is not one of the names above.
        """
        search_spaces = (
            ("RandomForest", self._random_forest_space),
            ("DecisionTree", self._decision_tree_space),
            ("LGBM", self._lgbm_space),
            ("XGBoost", self._xgboost_space),
            ("CatBoost", self._catboost_space),
            ("LogisticRegression", self._logistic_regression_space),
            ("GradientBoosting", self._gradient_boosting_space),
            ("KNeighbors", self._kneighbors_space),
        )
        for known_name, build_space in search_spaces:
            if classifier_name == known_name:
                return build_space(trial)
        raise ValueError(f"Invalid classifier name: {classifier_name}")

    def _decision_tree_space(self, trial):
        """Depth and split/leaf size limits for a single decision tree."""
        return {
            "max_depth": trial.suggest_int("max_depth", 2, 30),
            "min_samples_split": trial.suggest_int("min_samples_split", 2, 20),
            "min_samples_leaf": trial.suggest_int("min_samples_leaf", 1, 20),
        }

    def _random_forest_space(self, trial):
        """Number of trees followed by the same limits as a single tree."""
        space = {"n_estimators": trial.suggest_int("n_estimators", 50, 300)}
        space.update(self._decision_tree_space(trial))
        return space

    def _lgbm_space(self, trial):
        """Fixed binary-objective settings plus sampled regularisation and sampling knobs."""
        space = {
            "objective": "binary",
            "metric": "binary_logloss",
            "verbosity": -1,
            "boosting_type": "gbdt",
        }
        space["lambda_l1"] = trial.suggest_float("lambda_l1", 1e-8, 10.0, log=True)
        space["lambda_l2"] = trial.suggest_float("lambda_l2", 1e-8, 10.0, log=True)
        space["num_leaves"] = trial.suggest_int("num_leaves", 2, 256)
        space["feature_fraction"] = trial.suggest_float("feature_fraction", 0.4, 1.0)
        space["bagging_fraction"] = trial.suggest_float("bagging_fraction", 0.4, 1.0)
        space["bagging_freq"] = trial.suggest_int("bagging_freq", 1, 7)
        space["min_child_samples"] = trial.suggest_int("min_child_samples", 5, 100)
        return space

    def _xgboost_space(self, trial):
        """Fixed logistic-objective settings plus sampled booster, penalties and subsampling."""
        space = {
            "verbosity": 0,
            "objective": "binary:logistic",
            "eval_metric": "auc",
        }
        space["booster"] = trial.suggest_categorical("booster", ["gbtree", "gblinear", "dart"])
        space["lambda"] = trial.suggest_float("lambda", 1e-8, 1.0, log=True)
        space["alpha"] = trial.suggest_float("alpha", 1e-8, 1.0, log=True)
        space["subsample"] = trial.suggest_float("subsample", 0.2, 1.0)
        space["colsample_bytree"] = trial.suggest_float("colsample_bytree", 0.2, 1.0)
        return space

    def _catboost_space(self, trial):
        """Sampled objective, column sampling rate, depth, boosting and bootstrap types."""
        objective_name = trial.suggest_categorical("objective", ["Logloss", "CrossEntropy"])
        colsample = trial.suggest_float("colsample_bylevel", 0.01, 0.1)
        depth = trial.suggest_int("depth", 1, 12)
        boosting = trial.suggest_categorical("boosting_type", ["Ordered", "Plain"])
        bootstrap = trial.suggest_categorical("bootstrap_type", ["Bayesian", "Bernoulli", "MVS"])
        return {
            "objective": objective_name,
            "colsample_bylevel": colsample,
            "depth": depth,
            "boosting_type": boosting,
            "bootstrap_type": bootstrap,
        }

    def _logistic_regression_space(self, trial):
        """
        Solver first, then a penalty drawn from a solver-specific choice list.

        Each solver has its own trial parameter name for the penalty. ``C`` is
        sampled only when a penalty is used and ``l1_ratio`` only for
        'elasticnet'.
        """
        solver = trial.suggest_categorical('solver', ['lbfgs', 'liblinear', 'sag', 'saga'])
        # Wide max_iter range to give the solvers room to converge.
        max_iter = trial.suggest_int('max_iter', 1000, 10000)
        space = {"solver": solver, "max_iter": max_iter}

        # (trial parameter name, allowed penalties) per solver; any solver not
        # listed falls back to the 'saga' entry, which also offers 'elasticnet'.
        penalty_options = {
            'lbfgs': ('penalty_lbfgs', ['l2', None]),
            'liblinear': ('penalty_liblinear', ['l1', 'l2']),
            'sag': ('penalty_sag', ['l2', None]),
        }
        saga_options = ('penalty_saga', ['elasticnet', 'l1', 'l2', None])
        penalty_name, penalty_choices = penalty_options.get(solver, saga_options)
        penalty = trial.suggest_categorical(penalty_name, penalty_choices)
        space['penalty'] = penalty

        if penalty is None:
            return space

        space["C"] = trial.suggest_float('C', 1e-10, 1000, log=True)
        if penalty == 'elasticnet':
            space['l1_ratio'] = trial.suggest_float('l1_ratio', 0, 1)
        return space

    def _gradient_boosting_space(self, trial):
        """Learning rate, ensemble size, tree limits and feature subsampling rule."""
        learning_rate = trial.suggest_float('learning_rate', 0.001, 0.3, log=True)
        n_estimators = trial.suggest_int('n_estimators', 100, 1000)
        max_depth = trial.suggest_int('max_depth', 3, 10)
        min_samples_split = trial.suggest_int('min_samples_split', 2, 20)
        min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 20)
        max_features = trial.suggest_categorical('max_features', ['sqrt', 'log2', None])
        return {
            "learning_rate": learning_rate,
            "n_estimators": n_estimators,
            "max_depth": max_depth,
            "min_samples_split": min_samples_split,
            "min_samples_leaf": min_samples_leaf,
            "max_features": max_features,
        }

    def _kneighbors_space(self, trial):
        """Neighbour count, weighting, Minkowski power (1 or 2), leaf size and metric."""
        n_neighbors = trial.suggest_int('n_neighbors', 1, 50)
        weights = trial.suggest_categorical('weights', ['uniform', 'distance'])
        power = trial.suggest_int('p', 1, 2)  # 1: Manhattan, 2: Euclidean
        leaf_size = trial.suggest_int('leaf_size', 10, 100)
        metric = trial.suggest_categorical('metric', ['euclidean', 'manhattan', 'minkowski', 'chebyshev'])
        return {
            "n_neighbors": n_neighbors,
            "weights": weights,
            "p": power,
            "leaf_size": leaf_size,
            "metric": metric,
        }

In [9]:
class ModelFactory:
    """
    Creates a classifier instance from a model name and a hyperparameter dictionary.

    Attributes:
        model_name (str): The name of the model to be instantiated.
        best_params (dict): Hyperparameters forwarded to the model constructor.
    """

    def __init__(self, model_name: str, best_params: dict):
        """
        Store the model name and its hyperparameters.

        Args:
            model_name (str): The name of the model.
            best_params (dict): Hyperparameters for the model. ``None`` is
                treated as "no extra hyperparameters".
        """
        self.model_name = model_name
        self.best_params = best_params

    def get_model_instance(self):
        """
        Build the model, adding the per-classifier default keywords.

        Defaults are merged with ``best_params`` instead of being passed as
        separate keyword arguments, so a key that appears in both (for
        example ``random_state`` or ``verbose``) no longer raises
        ``TypeError``; the value from ``best_params`` takes precedence.
        ``best_params`` itself is not modified.

        Returns:
            A model instance with the appropriate parameters.

        Raises:
            ValueError: If ``model_name`` is not one of the supported models.
        """
        # Dictionary of model classes
        model_dict = {
            "LGBM": LGBMClassifier,
            "XGBoost": XGBClassifier,
            "CatBoost": CatBoostClassifier,
            "RandomForest": RandomForestClassifier,
            "DecisionTree": DecisionTreeClassifier,
            "LogisticRegression": LogisticRegression,
            "SVC": SVC,
            "GradientBoosting": GradientBoostingClassifier,
            "KNeighbors": KNeighborsClassifier
        }

        # Check if the model exists in the model_dict
        if self.model_name not in model_dict:
            raise ValueError(f"Model {self.model_name} is not supported.")

        # Keywords injected on top of the tuned hyperparameters.
        extra_defaults = {
            "LGBM": {"random_state": 42, "verbose": -1},          # silence LightGBM logging
            "RandomForest": {"random_state": 42, "n_jobs": -1},   # fit trees on all cores
            "SVC": {"random_state": 42, "probability": True},     # enable probability estimates
            "CatBoost": {"random_state": 42, "verbose": 0},       # silence CatBoost logging
            "KNeighbors": {},                                     # no random_state is passed to KNeighbors
        }
        defaults = extra_defaults.get(self.model_name, {"random_state": 42})

        # Later keys win, so explicit hyperparameters override the defaults.
        params = {**defaults, **(self.best_params or {})}
        return model_dict[self.model_name](**params)

In [10]:
class PipelineManager:
    """
    Thin wrapper around a pipeline object that edits its ``steps`` list.

    The wrapped object is either imbalanced-learn's pipeline or scikit-learn's
    pipeline, created empty; steps are then appended, inserted, removed or
    swapped by name.
    """

    def __init__(self, pipeline_type='ImbPipeline'):
        """
        Create an empty pipeline of the requested flavour.

        Args:
            pipeline_type (str): 'ImbPipeline' or 'Pipeline'.

        Raises:
            ValueError: For any other ``pipeline_type``.
        """
        if pipeline_type != 'ImbPipeline' and pipeline_type != 'Pipeline':
            raise ValueError("Unsupported pipeline type. Choose 'ImbPipeline' or 'Pipeline'.")
        self.pipeline = ImbPipeline(steps=[]) if pipeline_type == 'ImbPipeline' else Pipeline(steps=[])

    def add_step(self, step_name, step_object, position=None):
        """
        Put a ``(step_name, step_object)`` pair into the pipeline.

        Args:
            step_name (str): Name of the new step.
            step_object (object): Transformer or estimator for that step.
            position (int or None): Index handed to ``list.insert``; when
                None the step goes to the end.
        """
        entry = (step_name, step_object)
        steps = self.pipeline.steps
        if position is not None:
            steps.insert(position, entry)
            return
        steps.append(entry)

    def remove_step(self, step_name):
        """
        Drop every step called ``step_name``; other steps keep their order.

        Args:
            step_name (str): The name of the step to remove.
        """
        kept = []
        for entry in self.pipeline.steps:
            if entry[0] != step_name:
                kept.append((entry[0], entry[1]))
        # A fresh list is assigned rather than editing the old one in place.
        self.pipeline.steps = kept

    def replace_step(self, step_name, new_step_object):
        """
        Swap the object of the first step called ``step_name``.

        Nothing happens when no step has that name.

        Args:
            step_name (str): The name of the step to replace.
            new_step_object (object): The new transformer or estimator object.
        """
        steps = self.pipeline.steps
        index = 0
        while index < len(steps):
            if steps[index][0] == step_name:
                steps[index] = (step_name, new_step_object)
                return
            index += 1

    def get_pipeline(self):
        """
        Hand back the wrapped pipeline object.

        Returns:
            Pipeline: The pipeline in its current state.
        """
        return self.pipeline

In [54]:
class PreprocessingPipeline:
    """
    Factory for the individual preprocessing steps: feature creation, value
    replacement, column dropping and the column transformer (imputation,
    encoding and outlier transformation). Each step is requested by name
    through ``build`` and can then be inserted into the overall pipeline
    before the model fitting step.
    """
    def __init__(self, bins_hour, names_period, drop_columns, numerical_features,
                 onehot_features, ordinal_features, transform_features, trial: optuna.Trial=None):
        """
        Initialize the PreprocessingPipeline with necessary parameters.

        Args:
            bins_hour: Bin edges passed to CreateNewFeature.
            names_period: Period names passed to CreateNewFeature.
            drop_columns: Columns passed to DropRedundantColumns.
            numerical_features: Columns for the numerical imputation branch.
            onehot_features: Columns for the impute + OneHot encoding branch.
            ordinal_features: Columns for the impute + Ordinal encoding branch.
            transform_features: Columns for the impute + outlier transform branch.
            trial: Optional Optuna trial. It is kept and used by ``build``
                whenever no trial is passed to ``build`` directly.
        """
        self.bins_hour = bins_hour
        self.names_period = names_period
        self.drop_columns = drop_columns
        self.numerical_features = numerical_features
        self.onehot_features = onehot_features
        self.ordinal_features = ordinal_features
        self.transform_features = transform_features
        # Previously accepted but silently dropped.
        self.trial = trial

    def instantiate_numerical_simple_imputer(self, trial: optuna.Trial=None, strategy: str='mean', fill_value: int=-1) -> SimpleImputer:
        """
        Create the imputer for numeric columns.

        Args:
            trial: Optuna trial; consulted only when ``strategy`` is None.
            strategy: Imputation strategy. Pass None together with a trial to
                let the trial choose among 'mean', 'median', 'most_frequent'
                and 'constant'.
            fill_value: Forwarded to ``SimpleImputer`` as ``fill_value``.

        Returns:
            SimpleImputer: Configured with the chosen strategy and fill value.
        """
        if strategy is None and trial:
            strategy = trial.suggest_categorical(
                'numerical_strategy', ['mean', 'median', 'most_frequent', 'constant']
            )
        return SimpleImputer(strategy=strategy, fill_value=fill_value)

    def instantiate_categorical_simple_imputer(self, trial: optuna.Trial=None, strategy: str='most_frequent', fill_value: str='missing') -> SimpleImputer:
        """
        Create the imputer for categorical columns.

        Args:
            trial: Optuna trial; consulted only when ``strategy`` is None.
            strategy: Imputation strategy. Pass None together with a trial to
                let the trial choose between 'most_frequent' and 'constant'.
            fill_value: Forwarded to ``SimpleImputer`` as ``fill_value``.

        Returns:
            SimpleImputer: Configured with the chosen strategy and fill value.
        """
        if strategy is None and trial:
            strategy = trial.suggest_categorical('categorical_strategy', ['most_frequent', 'constant'])
        return SimpleImputer(strategy=strategy, fill_value=fill_value)

    def instantiate_outliers(self, trial: optuna.Trial=None, strategy: str='power_transform') -> Union[PowerTransformer, FunctionTransformer, str]:
        """
        Create the transformer applied to the outlier-prone columns.

        Args:
            trial: Optuna trial; consulted only when ``strategy`` is None.
            strategy: 'power_transform' for a Yeo-Johnson ``PowerTransformer``,
                'log_transform' for a ``FunctionTransformer`` wrapping
                ``log_transformer``. Pass None together with a trial to let
                the trial choose between the two.

        Returns:
            The selected transformer, or the string "passthrough" for any
            other strategy value.
        """
        if strategy is None and trial:
            strategy = trial.suggest_categorical(
                'outlier_strategy', ['power_transform', 'log_transform']
            )
        if strategy == 'power_transform':
            return PowerTransformer(method='yeo-johnson')
        elif strategy == 'log_transform':
            return FunctionTransformer(log_transformer, validate=False)
        else:
            return "passthrough"

    def build(self, step_name=None, trial: optuna.Trial=None):
        """
        Build one preprocessing step by name.

        Args:
            step_name: "create_new_features", "replace_class", "drop_cols" or
                "column_transformer".
            trial: Optuna trial forwarded to the imputer factories of the
                column transformer. When None, the trial given to the
                constructor (if any) is used instead.

        Returns:
            Transformer: The appropriate transformer for the given step.

        Raises:
            ValueError: If ``step_name`` is not one of the four names above
                (previously ``None`` was returned silently).
        """
        if trial is None:
            trial = self.trial

        if step_name == "create_new_features":
            return CreateNewFeature(bins_hour=self.bins_hour, names_period=self.names_period)

        if step_name == "replace_class":
            # "?" placeholders become NaN so the imputers below can handle them.
            return ReplaceValueTransformer(old_value="?", new_value=np.nan)

        if step_name == "drop_cols":
            return DropRedundantColumns(redundant_cols=self.drop_columns)

        if step_name == 'column_transformer':
            return ColumnTransformer(
                transformers=[
                    ('numerical', Pipeline([
                        ('imputer', self.instantiate_numerical_simple_imputer(trial=trial, strategy='mean')),
                    ]), self.numerical_features),

                    ('categorical', Pipeline([
                        ('imputer', self.instantiate_categorical_simple_imputer(trial=trial, strategy='most_frequent', fill_value='missing')),
                        ('onehot', OneHotEncoder(handle_unknown='ignore', drop='first'))
                    ]), self.onehot_features),

                    ('ordinal', Pipeline([
                        ('imputer', self.instantiate_categorical_simple_imputer(trial=trial, strategy='most_frequent', fill_value='missing')),
                        # NOTE(review): no handle_unknown is set here, unlike the one-hot
                        # branch - confirm unseen categories cannot occur in CV folds.
                        ('ordinal', OrdinalEncoder())
                    ]), self.ordinal_features),

                    ('outlier_transform', Pipeline([
                        ('imputer', self.instantiate_numerical_simple_imputer(trial=trial)),
                        # The outlier strategy is fixed here; the trial is deliberately not passed.
                        ('outlier_transformer', self.instantiate_outliers(trial=None, strategy='power_transform'))
                    ]), self.transform_features),
                ],
                # Columns not listed in any branch are kept unchanged.
                remainder='passthrough'
            )

        raise ValueError(
            f"Unknown preprocessing step: {step_name!r}. Expected one of "
            "'create_new_features', 'replace_class', 'drop_cols', 'column_transformer'."
        )


In [12]:
class ResamplerSelector:
    """
    Picks a resampling algorithm either from an explicit name or, when no
    name is given, from an Optuna trial suggestion.

    Attributes:
        trial (optuna.trial, optional): Trial used to suggest the resampler name.
        random_state (int): Seed handed to the resamplers that accept one.
    """

    def __init__(self, trial=None, random_state=42):
        """
        Remember the optional trial and the seed.

        Args:
            trial (optuna.trial, optional): Trial object for suggesting resampling strategies.
            random_state (int): Random seed for reproducibility. Default is 42.
        """
        self.trial = trial
        self.random_state = random_state

    def get_resampler(self, resampler=None):
        """
        Instantiate the resampler called ``resampler``.

        Args:
            resampler (str, optional): 'RandomOverSampler', 'ADASYN',
                'RandomUnderSampler', 'NearMiss', 'SMOTEENN' or 'SMOTETomek'.
                When omitted and a trial is available, the trial suggests
                one of these names.

        Returns:
            resampler_obj (object): A new instance of the selected resampler.

        Raises:
            ValueError: If the (given or suggested) name is none of the above,
                including when no name is given and no trial is available.
        """
        if resampler is None and self.trial:
            candidate_names = ['RandomOverSampler', 'ADASYN', 'RandomUnderSampler', 'NearMiss',
                               'SMOTEENN', 'SMOTETomek']
            resampler = self.trial.suggest_categorical('resampler', candidate_names)

        seed = self.random_state
        # Lazy constructors, so only the selected resampler is ever built.
        constructors = (
            ('RandomOverSampler', lambda: RandomOverSampler(random_state=seed)),
            ('ADASYN', lambda: ADASYN(random_state=seed)),
            ('RandomUnderSampler', lambda: RandomUnderSampler(random_state=seed)),
            ('NearMiss', lambda: NearMiss()),  # built without a seed
            ('SMOTEENN', lambda: SMOTEENN(random_state=seed, sampling_strategy='minority' )),
            ('SMOTETomek', lambda: SMOTETomek(random_state=seed)),
        )
        for known_name, construct in constructors:
            if resampler == known_name:
                return construct()
        raise ValueError(f"Unknown resampler: {resampler}")

In [13]:
class ScalerSelector:
    """
    Picks a scaling algorithm either from an explicit name or, when no name
    is given, from an Optuna trial suggestion.

    Attributes:
        trial (optuna.trial, optional): Trial used to suggest the scaler name.
    """

    def __init__(self, trial=None):
        """
        Remember the optional trial.

        Args:
            trial (optuna.trial, optional): Trial object for suggesting a scaler.
        """
        self.trial = trial

    def get_scaler(self, scaler_name=None):
        """
        Instantiate the scaler called ``scaler_name``.

        Args:
            scaler_name (str, optional): 'minmax', 'standard' or 'robust'.
                When omitted and a trial is available, the trial suggests
                one of these names.

        Returns:
            scaler_obj (object): A new MinMaxScaler, StandardScaler or RobustScaler.

        Raises:
            ValueError: If the (given or suggested) name is none of the above,
                including when no name is given and no trial is available.
        """
        trial = self.trial
        if scaler_name is None and trial:
            scaler_name = trial.suggest_categorical("scaler", ['minmax', 'standard', 'robust'])

        # Lazy constructors, so only the selected scaler is ever built.
        constructors = (
            ("minmax", lambda: MinMaxScaler()),
            ("standard", lambda: StandardScaler()),
            ("robust", lambda: RobustScaler()),
        )
        for known_name, construct in constructors:
            if scaler_name == known_name:
                return construct()
        raise ValueError(f"Unknown scaler: {scaler_name}")

In [14]:
class DimensionalityReductionSelector:
    """
    Chooses between PCA and no dimensionality reduction, either from an
    explicit argument or from an Optuna trial suggestion.

    Attributes:
        trial (optuna.trial, optional): Trial used to suggest the method and
            the number of PCA components.
    """

    def __init__(self, trial=None):
        """
        Remember the optional trial.

        Args:
            trial (optuna.trial, optional): Trial object for suggesting dimensionality reduction strategies.
        """
        self.trial = trial

    def get_dimensionality_reduction(self, dim_red=None):
        """
        Return a PCA instance or the string 'passthrough'.

        Args:
            dim_red (str, optional): "PCA" to request PCA. When omitted and a
                trial is available, the trial chooses between "PCA" and None.
                Every value other than "PCA" results in 'passthrough'.

        Returns:
            dimen_red_algorithm (object or str): ``PCA`` whose ``n_components``
            is suggested by the trial (2 to 30) or 5 when there is no trial,
            otherwise 'passthrough'.
        """
        trial = self.trial
        if dim_red is None and trial:
            dim_red = trial.suggest_categorical("dim_red", ["PCA", None])

        if not dim_red == "PCA":
            return 'passthrough'

        # Component count: tuned when a trial exists, fixed fallback otherwise.
        n_components = trial.suggest_int("pca_n_components", 2, 30) if trial else 5
        return PCA(n_components=n_components)


In [55]:
# Define objective function for Optuna
def objective(trial: optuna.Trial, classifier_name: str, scoring='f1') -> float:
    """
    Optuna objective: assemble one candidate pipeline and cross-validate it.

    The pipeline is built in this order: feature creation, "?" replacement,
    column dropping, column transformer, resampler, dimensionality reduction,
    model. Hyperparameters, the resampler and the dimensionality reduction
    are all drawn from ``trial``. The module-level ``X_train`` / ``y_train``
    and the feature-list configuration are read directly.

    Args:
        trial (optuna.Trial): Optuna trial object for suggesting hyperparameters.
        classifier_name (str): Classifier to optimize.
        scoring (str): Scoring metric for cross-validation.

    Returns:
        float: The mean score over the 10 stratified folds.
    """
    # Classifier hyperparameters are sampled first, before any pipeline choice.
    params = HyperparameterTuner().get_params(trial, classifier_name)

    preprocessing = PreprocessingPipeline(
        bins_hour=bins_hour,
        names_period=names_period,
        drop_columns=drop_columns,
        numerical_features=numerical_features,
        onehot_features=onehot_features,
        ordinal_features=ordinal_features,
        transform_features=transform_features
    )

    # imbalanced-learn pipeline, because a resampler step is included.
    manager = PipelineManager(pipeline_type='ImbPipeline')

    # Data-cleaning steps are built without the trial.
    for cleaning_step in ('create_new_features', 'replace_class', 'drop_cols'):
        manager.add_step(cleaning_step, preprocessing.build(step_name=cleaning_step, trial=None))

    manager.add_step(
        'column_transformer',
        preprocessing.build(step_name='column_transformer', trial=trial),
    )

    # Resampler chosen by the trial.
    manager.add_step('resampler', ResamplerSelector(trial=trial).get_resampler())

    # No scaler step is added here; ScalerSelector is not wired into this pipeline.

    # Dimensionality reduction (PCA or passthrough) chosen by the trial.
    reducer = DimensionalityReductionSelector(trial=trial).get_dimensionality_reduction(dim_red=None)
    manager.add_step('dim_reduction', reducer)

    # Final estimator built from the sampled hyperparameters.
    manager.add_step('model', ModelFactory(classifier_name, params).get_model_instance())

    # Any failing fold raises instead of being scored as NaN.
    fold_scores = cross_val_score(
        manager.get_pipeline(),
        X_train,
        y_train,
        scoring=scoring,
        n_jobs=-1,
        cv=StratifiedKFold(n_splits=10),
        verbose=0,
        error_score='raise',
    )
    return fold_scores.mean()

In [105]:
# Run the Optuna study
def run_optimization(config_path: str, n_trials: int = 100, scoring: str = 'f1') -> Any:
    """
    Run one Optuna study per candidate model and build the best-scoring classifier.

    Each model name is tuned in its own study (TPE sampler, seed 42) by calling
    ``objective(trial, model_name, scoring)``. The winning model name and its
    full parameter dict (including the pipeline-level keys) are written to
    ``best_model_and_params.pkl``; the classifier itself is then created from
    the model-only parameters via ``ModelFactory``.

    Args:
        config_path (str): Path to the YAML configuration file. Currently not
            read anywhere in this function; kept for interface compatibility.
        n_trials (int): Number of trials per model study. Defaults to 100.
        scoring (str): Scoring metric forwarded to ``objective``. Defaults to 'f1'.

    Returns:
        The (unfitted) model instance produced by ``ModelFactory`` for the
        best model and its cleaned parameters.

    Raises:
        RuntimeError: If no model produced a score, so there is nothing to
            save or build.
    """
    # Trial parameters that configure pipeline steps rather than the classifier.
    pipeline_keys = ("resampler", "scaler", "dim_red", "pca_n_components")

    # Start below any real score so zero-valued and negative metrics can still win.
    best_model_score = float("-inf")
    best_model = None
    best_params = None
    results = []

    all_models = ["RandomForest", "DecisionTree",
                  "XGBoost", "LGBM", "GradientBoosting",
                  "LogisticRegression", "KNeighbors", "CatBoost"]

    for model_name in all_models:
        print(f"\nOptimizing model: {model_name}")
        study = optuna.create_study(direction="maximize", sampler=TPESampler(seed=42))
        study.optimize(lambda trial: objective(trial, model_name, scoring), n_trials=n_trials)

        # NOTE(review): the figure returned here is discarded, so nothing is
        # displayed from inside this function — confirm whether it should be shown.
        optuna.visualization.plot_optimization_history(study)

        best_trial = study.best_trial
        # Copy so later edits never touch the dict owned by the trial.
        trial_params = dict(best_trial.params)
        results.append({
            "model": model_name,
            "params": trial_params,
            "model_score_params": trial_params,
            "model_score_trial_number": best_trial.number,
            "model_score_datetime": best_trial.datetime_start,
            "model_score_duration": best_trial.duration,
            "model_score_status": best_trial.state,
            "model_score_key": scoring,
            "model_score_value": best_trial.value
        })

        current_score = best_trial.value

        if current_score is not None and current_score > best_model_score:
            best_model_score = current_score
            best_model = model_name
            best_params = dict(trial_params)

        # Per-model progress line (reported for every model, not only the last one).
        print(f"Model: {model_name}, Current Score: {current_score} | Best Model: {best_model}, Best Score: {best_model_score}")

    # Display all results and the best model
    for result in results:
        print(result)

    print("Best model:", best_model)
    print("Best parameters:", best_params)

    if best_model is None:
        # Fail with a clear message before overwriting a previously saved result.
        raise RuntimeError("No model produced a usable score; nothing to save or build.")

    # Persist the model name together with the FULL parameter dict: the
    # pipeline-level keys are needed later to rebuild the same pipeline.
    with open("best_model_and_params.pkl", "wb") as f:
        joblib.dump((best_model, best_params), f)

    # Keep only the classifier hyperparameters for ModelFactory, without
    # mutating the dict that was just saved.
    model_params = {key: value for key, value in best_params.items() if key not in pipeline_keys}
    print(f'cleaned best params: {model_params}')

    # Create an instance of the ModelFactory class with best_model and its parameters
    model_factory = ModelFactory(best_model, model_params)
    model_instance = model_factory.get_model_instance()
    print(f"model_instance: {model_instance}")

    # The returned estimator is not fitted here; fitting and persisting the
    # full pipeline is left to the caller.
    return model_instance

In [106]:
if __name__ == "__main__":
    # Tune every candidate model (100 trials each) and keep the best estimator.
    config_path = "model_config.yaml"
    model_instance = run_optimization(
        config_path=config_path,
        n_trials=100,
        scoring="f1",
    )

[I 2024-10-17 21:25:59,366] A new study created in memory with name: no-name-b98f1576-814f-4fb6-8ba3-34f7e851b4cd



Optimizing model: RandomForest


[I 2024-10-17 21:26:01,614] Trial 0 finished with value: 0.4297011130908766 and parameters: {'n_estimators': 144, 'max_depth': 29, 'min_samples_split': 15, 'min_samples_leaf': 12, 'resampler': 'NearMiss', 'scaler': 'standard', 'dim_red': 'PCA', 'pca_n_components': 7}. Best is trial 0 with value: 0.4297011130908766.
[I 2024-10-17 21:26:01,995] Trial 1 finished with value: 0.29096108621915073 and parameters: {'n_estimators': 126, 'max_depth': 17, 'min_samples_split': 10, 'min_samples_leaf': 6, 'resampler': 'SMOTETomek', 'scaler': 'robust', 'dim_red': None}. Best is trial 0 with value: 0.4297011130908766.
[I 2024-10-17 21:26:02,208] Trial 2 finished with value: 0.6047328038293134 and parameters: {'n_estimators': 92, 'max_depth': 3, 'min_samples_split': 20, 'min_samples_leaf': 20, 'resampler': 'RandomOverSampler', 'scaler': 'robust', 'dim_red': None}. Best is trial 2 with value: 0.6047328038293134.
[I 2024-10-17 21:26:02,568] Trial 3 finished with value: 0.5654834179776972 and parameters: 


Optimizing model: DecisionTree


[I 2024-10-17 21:26:43,671] Trial 2 finished with value: 0.40231035343925897 and parameters: {'max_depth': 3, 'min_samples_split': 13, 'min_samples_leaf': 4, 'resampler': 'RandomUnderSampler', 'scaler': 'minmax', 'dim_red': 'PCA', 'pca_n_components': 28}. Best is trial 1 with value: 0.6986991130941006.
[I 2024-10-17 21:26:43,752] Trial 3 finished with value: 0.5479663888762495 and parameters: {'max_depth': 9, 'min_samples_split': 14, 'min_samples_leaf': 7, 'resampler': 'NearMiss', 'scaler': 'robust', 'dim_red': None}. Best is trial 1 with value: 0.6986991130941006.
[I 2024-10-17 21:26:43,835] Trial 4 finished with value: 0.28207866052838737 and parameters: {'max_depth': 3, 'min_samples_split': 8, 'min_samples_leaf': 8, 'resampler': 'ADASYN', 'scaler': 'robust', 'dim_red': 'PCA', 'pca_n_components': 2}. Best is trial 1 with value: 0.6986991130941006.
[I 2024-10-17 21:26:43,917] Trial 5 finished with value: 0.6208210161129416 and parameters: {'max_depth': 25, 'min_samples_split': 15, 'mi


Optimizing model: XGBoost


[I 2024-10-17 21:26:52,678] Trial 1 finished with value: 0.19057345781876306 and parameters: {'booster': 'dart', 'lambda': 1.3060231803531604e-07, 'alpha': 2.1734877073417355e-06, 'subsample': 0.4930894746349534, 'colsample_bytree': 0.5648559873736287, 'resampler': 'RandomOverSampler', 'scaler': 'robust', 'dim_red': 'PCA', 'pca_n_components': 10}. Best is trial 0 with value: 0.5178882944055309.
[I 2024-10-17 21:26:52,763] Trial 2 finished with value: 0.44722641042190514 and parameters: {'booster': 'gblinear', 'lambda': 9.469038421774442e-08, 'alpha': 9.149877525022172e-05, 'subsample': 0.22751081689217473, 'colsample_bytree': 0.9274563216630256, 'resampler': 'ADASYN', 'scaler': 'minmax', 'dim_red': 'PCA', 'pca_n_components': 28}. Best is trial 0 with value: 0.5178882944055309.
[I 2024-10-17 21:26:52,865] Trial 3 finished with value: 0.5119755521055896 and parameters: {'booster': 'gblinear', 'lambda': 4.005370050283172e-06, 'alpha': 1.2865252594826764e-05, 'subsample': 0.417079225419116


Optimizing model: LGBM


[I 2024-10-17 21:27:36,258] Trial 0 finished with value: 0.5199071225541814 and parameters: {'lambda_l1': 2.348881295853308e-05, 'lambda_l2': 3.6010467344475403, 'num_leaves': 188, 'feature_fraction': 0.759195090518222, 'bagging_fraction': 0.4936111842654619, 'bagging_freq': 2, 'min_child_samples': 10, 'resampler': 'SMOTEENN', 'scaler': 'minmax', 'dim_red': None}. Best is trial 0 with value: 0.5199071225541814.
[I 2024-10-17 21:27:37,589] Trial 1 finished with value: 0.21133904239167398 and parameters: {'lambda_l1': 7.71800699380605e-05, 'lambda_l2': 4.17890272377219e-06, 'num_leaves': 158, 'feature_fraction': 0.4836963163912251, 'bagging_fraction': 0.5752867891211308, 'bagging_freq': 3, 'min_child_samples': 48, 'resampler': 'RandomOverSampler', 'scaler': 'robust', 'dim_red': 'PCA', 'pca_n_components': 10}. Best is trial 0 with value: 0.5199071225541814.
[I 2024-10-17 21:27:38,655] Trial 2 finished with value: 0.4145679116392841 and parameters: {'lambda_l1': 7.569183361880229e-08, 'lam


Optimizing model: GradientBoosting


[I 2024-10-17 21:31:01,853] Trial 0 finished with value: 0.4224223683448125 and parameters: {'learning_rate': 0.008468008575248327, 'n_estimators': 956, 'max_depth': 8, 'min_samples_split': 13, 'min_samples_leaf': 4, 'max_features': None, 'resampler': 'NearMiss', 'scaler': 'robust', 'dim_red': 'PCA', 'pca_n_components': 10}. Best is trial 0 with value: 0.4224223683448125.
[I 2024-10-17 21:31:12,886] Trial 1 finished with value: 0.42386661292138655 and parameters: {'learning_rate': 0.032781876533976156, 'n_estimators': 225, 'max_depth': 5, 'min_samples_split': 8, 'min_samples_leaf': 10, 'max_features': 'sqrt', 'resampler': 'SMOTETomek', 'scaler': 'minmax', 'dim_red': None}. Best is trial 1 with value: 0.42386661292138655.
[I 2024-10-17 21:31:13,194] Trial 2 finished with value: 0.4534302946664706 and parameters: {'learning_rate': 0.012311503632415646, 'n_estimators': 209, 'max_depth': 6, 'min_samples_split': 2, 'min_samples_leaf': 19, 'max_features': 'log2', 'resampler': 'NearMiss', 'sc


Optimizing model: LogisticRegression


[I 2024-10-17 22:04:09,538] Trial 2 finished with value: 0.39650490338127514 and parameters: {'solver': 'sag', 'max_iter': 9184, 'penalty_sag': None, 'resampler': 'SMOTEENN', 'scaler': 'minmax', 'dim_red': 'PCA', 'pca_n_components': 7}. Best is trial 1 with value: 0.4868761261689416.
[I 2024-10-17 22:04:09,618] Trial 3 finished with value: 0.43413585444875125 and parameters: {'solver': 'sag', 'max_iter': 8459, 'penalty_sag': 'l2', 'C': 0.0011351390942507622, 'resampler': 'NearMiss', 'scaler': 'standard', 'dim_red': None}. Best is trial 1 with value: 0.4868761261689416.
[I 2024-10-17 22:04:09,689] Trial 4 finished with value: 0.3851135298062499 and parameters: {'solver': 'saga', 'max_iter': 6610, 'penalty_saga': 'elasticnet', 'C': 0.3054079362964481, 'l1_ratio': 0.6375574713552131, 'resampler': 'RandomOverSampler', 'scaler': 'minmax', 'dim_red': 'PCA', 'pca_n_components': 5}. Best is trial 1 with value: 0.4868761261689416.
[I 2024-10-17 22:04:09,769] Trial 5 finished with value: 0.50551


Optimizing model: KNeighbors


[I 2024-10-17 22:05:31,515] Trial 2 finished with value: 0.25460731153825783 and parameters: {'n_neighbors': 2, 'weights': 'uniform', 'p': 2, 'leaf_size': 38, 'metric': 'chebyshev', 'resampler': 'ADASYN', 'scaler': 'robust', 'dim_red': 'PCA', 'pca_n_components': 26}. Best is trial 0 with value: 0.40563035606217657.
[I 2024-10-17 22:05:31,604] Trial 3 finished with value: 0.2829612727368057 and parameters: {'n_neighbors': 18, 'weights': 'distance', 'p': 1, 'leaf_size': 82, 'metric': 'manhattan', 'resampler': 'ADASYN', 'scaler': 'robust', 'dim_red': 'PCA', 'pca_n_components': 3}. Best is trial 0 with value: 0.40563035606217657.
[I 2024-10-17 22:05:31,696] Trial 4 finished with value: 0.3919798739730108 and parameters: {'n_neighbors': 16, 'weights': 'distance', 'p': 2, 'leaf_size': 90, 'metric': 'chebyshev', 'resampler': 'ADASYN', 'scaler': 'robust', 'dim_red': None}. Best is trial 0 with value: 0.40563035606217657.
[I 2024-10-17 22:05:31,796] Trial 5 finished with value: 0.36928514525003


Optimizing model: CatBoost


[I 2024-10-17 22:05:46,546] Trial 0 finished with value: 0.4010802540654147 and parameters: {'objective': 'CrossEntropy', 'colsample_bylevel': 0.07587945476302646, 'depth': 8, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bernoulli', 'resampler': 'RandomUnderSampler', 'scaler': 'robust', 'dim_red': 'PCA', 'pca_n_components': 19}. Best is trial 0 with value: 0.4010802540654147.

A worker stopped while some jobs were given to the executor. This can be caused by a too short worker timeout or by a memory leak.

[I 2024-10-17 22:05:51,230] Trial 1 finished with value: 0.3949436850524827 and parameters: {'objective': 'CrossEntropy', 'colsample_bylevel': 0.04297256589643226, 'depth': 6, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bernoulli', 'resampler': 'SMOTEENN', 'scaler': 'robust', 'dim_red': 'PCA', 'pca_n_components': 16}. Best is trial 0 with value: 0.4010802540654147.
[I 2024-10-17 22:05:51,870] Trial 2 finished with value: 0.3783052197748823 and parameters: {'objective': 'CrossEnt

Model: CatBoost, Current Score: 0.6877691395443251 | Best Model: GradientBoosting, Best Score: 0.7469467718739711
{'model': 'RandomForest', 'params': {'n_estimators': 173, 'max_depth': 22, 'min_samples_split': 3, 'min_samples_leaf': 20, 'resampler': 'RandomOverSampler', 'scaler': 'minmax', 'dim_red': None}, 'model_score_params': {'n_estimators': 173, 'max_depth': 22, 'min_samples_split': 3, 'min_samples_leaf': 20, 'resampler': 'RandomOverSampler', 'scaler': 'minmax', 'dim_red': None}, 'model_score_trial_number': 77, 'model_score_datetime': datetime.datetime(2024, 10, 17, 21, 26, 33, 911315), 'model_score_duration': datetime.timedelta(microseconds=413122), 'model_score_status': 1, 'model_score_key': 'f1', 'model_score_value': 0.6326421518204143}
{'model': 'DecisionTree', 'params': {'max_depth': 4, 'min_samples_split': 4, 'min_samples_leaf': 20, 'resampler': 'RandomOverSampler', 'scaler': 'standard', 'dim_red': None}, 'model_score_params': {'max_depth': 4, 'min_samples_split': 4, 'min_sa

In [91]:
# Restore the (model name, parameter dict) pair saved by run_optimization.
# joblib opens and closes the file itself when given a path.
best_model, best_params = joblib.load("best_model_and_params.pkl")



In [92]:
# Echo what was restored from disk.
print(f"Best model: {best_model}")
print(f"Best parameters: {best_params}")

Best model: DecisionTree
Best parameters: {'max_depth': 6, 'min_samples_split': 11, 'min_samples_leaf': 17, 'resampler': 'ADASYN', 'scaler': 'robust', 'dim_red': None}


In [93]:
# Build the preprocessing steps (data cleaning and column transformation).
preprocessing_pipeline = PreprocessingPipeline(
    bins_hour=bins_hour,
    names_period=names_period,
    drop_columns=drop_columns,
    numerical_features=numerical_features,
    onehot_features=onehot_features,
    ordinal_features=ordinal_features,
    transform_features=transform_features
)


# Initialize the manager with the preferred pipeline type ('ImbPipeline' or 'Pipeline')
pipeline_manager = PipelineManager(pipeline_type='ImbPipeline')

# Add the fixed transformation steps (no trial: nothing is being tuned here).
pipeline_manager.add_step('create_new_features', preprocessing_pipeline.build(step_name='create_new_features', trial=None), position=0)
pipeline_manager.add_step('replace_class', preprocessing_pipeline.build(step_name='replace_class', trial=None), position=1)
pipeline_manager.add_step('drop_cols', preprocessing_pipeline.build(step_name='drop_cols', trial=None), position=2)
pipeline_manager.add_step('column_transformer', preprocessing_pipeline.build(step_name='column_transformer', trial=None), position=3)

# Add the resampler chosen during tuning. `best_params` is only read, never
# mutated, so this cell can be re-run without reloading the pickle.
resample_selector = ResamplerSelector(trial=None)
resampler = best_params['resampler']
resampler_obj = resample_selector.get_resampler(resampler)
pipeline_manager.add_step('resampler', resampler_obj, position=4)


# Add the scaler chosen during tuning. Parameters tuned without a scaler step
# carry no 'scaler' key; in that case the step is skipped instead of raising.
scaler_selector = ScalerSelector(trial=None)
scaler = best_params.get('scaler')
if scaler is not None:
    scaler_obj = scaler_selector.get_scaler(scaler)
    pipeline_manager.add_step('scaler', scaler_obj, position=5)


# Optional dimensionality-reduction step.
dim_red_selector = DimensionalityReductionSelector(trial=None)

# NOTE(review): dimensionality reduction is disabled here regardless of the
# tuned 'dim_red' value in best_params — confirm this is intended. Set
# `dim_red` to a method name to enable the step.
dim_red = None
if dim_red is not None:
    dim_reduction = dim_red_selector.get_dimensionality_reduction(dim_red=dim_red)
    pipeline_manager.add_step('dim_reduction', dim_reduction, position=6)

# Pipeline-level keys must not be forwarded to the classifier constructor.
keys_to_remove = ["resampler", "scaler", "dim_red", "pca_n_components"]
model_params = {name: value for name, value in best_params.items() if name not in keys_to_remove}

# Create the classifier from the model-only parameters and append it last.
model_factory = ModelFactory(best_model, model_params)
model_instance = model_factory.get_model_instance()
pipeline_manager.add_step('model', model_instance)

pipeline = pipeline_manager.get_pipeline()

In [94]:
# Display the assembled pipeline as this cell's output.
pipeline

In [95]:
# Fit on the training split, then score predictions on the held-out split.
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)

# Each metric is called as metric(y_true, y_pred); results are unpacked into
# the same module-level names the rest of the notebook uses.
f1, accuracy, precision, recall = (
    metric(y_test, y_pred)
    for metric in (f1_score, accuracy_score, precision_score, recall_score)
)

test_results = dict(
    zip(
        ('f1_score', 'accuracy', 'precision', 'recall'),
        (f1, accuracy, precision, recall),
    )
)

In [96]:
# Show the held-out metrics computed in the previous cell.
test_results

{'f1_score': 0.9055374592833876,
 'accuracy': 0.855,
 'precision': 0.8910256410256411,
 'recall': 0.9205298013245033}

In [70]:
# NOTE(review): repeated display cell with an earlier execution count (In [70]);
# its saved output differs from the In [96] output above — presumably from a
# previous run. Consider removing it.
test_results

{'f1_score': 0.9047619047619048,
 'accuracy': 0.86,
 'precision': 0.9300699300699301,
 'recall': 0.8807947019867549}

In [97]:
# Side-by-side look at a 20-item window: true labels first, predictions second.
print(
    y_test[15:35].tolist(),
    y_pred[15:35].tolist(),
    sep="\n",
)

[1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0]
[1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1]


In [28]:
# NOTE(review): repeated display cell with an earlier execution count (In [28]);
# its saved output differs from the In [96] output — presumably from a
# previous run. Consider removing it.
test_results

{'f1_score': 0.8888888888888888,
 'accuracy': 0.825,
 'precision': 0.8536585365853658,
 'recall': 0.9271523178807947}