# MAIN LIBRARIES

In [46]:
# STANDARD LIBRARIES
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display
import os
import warnings
warnings.filterwarnings('ignore')

# MISC
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
import shap

# CLASSIFICATION MODELS
from sklearn.ensemble import VotingClassifier
from mlxtend.classifier import StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC

# REGRESSION MODELS
from mlxtend.regressor import StackingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor

# EDA

In [47]:
from dataprep.eda import create_report, plot, plot_correlation, plot_missing , plot_diff
def interactive_eda(df):
    """
    Purpose:
    Build a Dataprep EDA report for a DataFrame and display it.

    Parameters:
        - df (pd.DataFrame): Data to profile

    Returns:
        None
    """
    # Build the report and render it in a single chained call
    create_report(df).show()

# BINARYFLOW

In [None]:
# Define your custom model and parameter dictionaries Example
#custom_model_dict = {'rf': RandomForestClassifier(random_state=42), 
#                     'custom_model': MyCustomClassifier()
#                     }

#custom_params_dict = {'rf': {'n_estimators': [100, 150]}, 
#                      'custom_model': {'some_param': [1, 2, 3]}
#                      }

def BinaryFlow(X_train, X_test, y_train, y_test, models=('xgb', 'lgbm', 'rf', 'gbm', 'svc'), custom_model_dict=None, custom_params_dict=None):
    """
    Purpose:
    Fit, tune and evaluate a set of binary classifiers, build voting and stacking
    ensembles from the tuned models, and plot feature importances and SHAP values.

    Parameters:
        - X_train, X_test (pd.DataFrame): Training and testing feature sets
        - y_train, y_test (pd.Series): Training and testing labels
        - models (sequence of str): Keys of the models to tune. Names that are not in the
          model dictionary are skipped with a message.
        - custom_model_dict (dict): Optional {name: estimator} mapping that replaces the default models
        - custom_params_dict (dict): Optional {name: param_grid} mapping that replaces the default grids.
          A model without a grid entry is fitted with its own settings.

    Returns:
        None. Results are reported through evaluate_binary_model, plot_feature_importance
        and plot_shap_values.
    """
    # Always fit and evaluate basic models: Logistic Regression and Naive Bayes
    lr_model = LogisticRegression(random_state=42)
    lr_model.fit(X_train, y_train)

    nb_model = GaussianNB()
    nb_model.fit(X_train, y_train)

    evaluate_binary_model(lr_model, 'LOGISTIC REGRESSION CLASSIFIER', X_train, X_test, y_train, y_test)
    evaluate_binary_model(nb_model, 'NAIVE BAYES CLASSIFIER', X_train, X_test, y_train, y_test)

    # Default model and parameter dictionaries
    model_dict = {
        'xgb': XGBClassifier(random_state=42),
        'lgbm': LGBMClassifier(random_state=42),
        'rf': RandomForestClassifier(random_state=42),
        'gbm': GradientBoostingClassifier(random_state=42),
        'svc': SVC(random_state=42, probability=True)
    }

    params_dict = {
        'xgb': {'n_estimators': [100, 150, 300], 'subsample': [None, 0.3, 0.5, 0.8, 0.9], 'colsample_bytree': [0.3, 0.5, 0.8, 1.0]},
        'lgbm': {'n_estimators': [100, 150, 300], 'subsample': [None, 0.3, 0.5, 0.8, 0.9], 'colsample_bytree': [0.3, 0.5, 0.8, 1.0]},
        'rf': {'n_estimators': [100, 150, 300], 'min_samples_split': [2, 3, 4], 'min_samples_leaf': [1, 2, 3]},
        'gbm': {'n_estimators': [100, 150, 300], 'min_samples_split': [2, 3, 4], 'min_samples_leaf': [1, 2, 3]},
        'svc': {'C': [0.1, 1, 10], 'kernel': ['rbf', 'poly', 'sigmoid']}
    }

    # Use custom dictionaries if provided
    if custom_model_dict is not None:
        model_dict = custom_model_dict

    if custom_params_dict is not None:
        params_dict = custom_params_dict

    best_estimators = []
    selected_models = {}

    # Fit and evaluate models specified in the 'models' sequence
    print("\n[CREATING BASE MODELS]")
    for model_name in models:
        if model_name not in model_dict:
            print(f"\n- Skipping {model_name}: not found in the model dictionary")
            continue

        print(f"\n- Optimizing {model_name.upper()}")
        # An empty grid makes GridSearchCV evaluate the estimator with its own settings,
        # so a custom model without a grid entry no longer raises KeyError.
        param_grid = params_dict.get(model_name, {})
        grid = GridSearchCV(model_dict[model_name], param_grid, cv=5, verbose=0, n_jobs=-1)
        grid.fit(X_train, y_train)
        best_estimators.append((model_name, grid.best_estimator_))
        selected_models[model_name.upper()] = grid.best_estimator_
        print(f'{model_name.upper()} Best Parameters:', grid.best_params_)
        evaluate_binary_model(grid.best_estimator_, f'{model_name.upper()} CLASSIFIER', X_train, X_test, y_train, y_test)

    # Ensembles only make sense when more than one model was actually tuned
    # (count the tuned estimators, not the requested names).
    if len(best_estimators) > 1:
        print("\n[CREATING VOTING & STACKING ENSEMBLES]")

        # Voting Classifier
        print("\n- Creating Voting Classifier")
        voting_clf = VotingClassifier(estimators=best_estimators, voting='soft')
        voting_clf.fit(X_train, y_train)
        evaluate_binary_model(voting_clf, 'VOTING CLASSIFIER', X_train, X_test, y_train, y_test)

        # Stacking Classifier
        print("\n- Creating Stacking Classifier")
        stacking_clf = StackingClassifier(classifiers=[est[1] for est in best_estimators], meta_classifier=LogisticRegression())
        stacking_clf.fit(X_train, y_train)
        evaluate_binary_model(stacking_clf, 'STACKING CLASSIFIER', X_train, X_test, y_train, y_test)

    # Feature importances
    if selected_models:
        importances = [model.feature_importances_ for model in selected_models.values() if hasattr(model, 'feature_importances_')]

        # Skip the mean plot when no tuned model exposes importances (e.g. only SVC was
        # requested); averaging an empty list would yield NaN and break the bar plot.
        if importances:
            mean_importances = np.mean(importances, axis=0)

            plt.figure(figsize=(12, 8))
            sns.barplot(x=mean_importances, y=X_train.columns)
            plt.title('Mean Feature Importance Plot')
            plt.xlabel('Mean Feature Importance')
            plt.ylabel('Features')
            plt.show()

        for model_name, model in selected_models.items():
            plot_feature_importance(model, model_name, X_train)

        print("\n NOTE: Feature importances are not available for SVC models due to how they work.")

    # Shapley values
    for model_name, model in selected_models.items():
        plot_shap_values(model, model_name, X_train, multi_class=False)

    print("\n NOTE: Shap values are not available for SVC models due to how they work.")
    print("\n NOTE: Shap values are only available for GBM for binary classification at this time.")

# MULTICLASSFLOW

In [None]:
# Define your custom model and parameter dictionaries Example
#custom_model_dict = {'rf': RandomForestClassifier(random_state=42), 
#                     'custom_model': MyCustomClassifier()
#                     }

#custom_params_dict = {'rf': {'n_estimators': [100, 150]}, 
#                      'custom_model': {'some_param': [1, 2, 3]}
#                      }

def MultiClassFlow(X_train, X_test, y_train, y_test, models=('xgb', 'lgbm', 'rf', 'gbm', 'svc'), custom_model_dict=None, custom_params_dict=None):
    """
    Purpose:
    Fit, tune and evaluate a set of multiclass classifiers, build voting and stacking
    ensembles from the tuned models, and plot feature importances and SHAP values.

    Parameters:
        - X_train, X_test (pd.DataFrame): Training and testing feature sets
        - y_train, y_test (pd.Series): Training and testing labels
        - models (sequence of str): Keys of the models to tune. Names that are not in the
          model dictionary are skipped with a message.
        - custom_model_dict (dict): Optional {name: estimator} mapping that replaces the default models
        - custom_params_dict (dict): Optional {name: param_grid} mapping that replaces the default grids.
          A model without a grid entry is fitted with its own settings.

    Returns:
        None. Results are reported through evaluate_multiclass_model, plot_feature_importance
        and plot_shap_values.
    """
    # Always fit and evaluate basic models: Logistic Regression and Naive Bayes.
    # `multi_class='auto'` was the library default and the argument is deprecated in
    # recent scikit-learn releases, so it is simply left out.
    lr_model = LogisticRegression(random_state=42)
    lr_model.fit(X_train, y_train)

    nb_model = GaussianNB()
    nb_model.fit(X_train, y_train)

    evaluate_multiclass_model(lr_model, 'LOGISTIC REGRESSION CLASSIFIER', X_train, X_test, y_train, y_test)
    evaluate_multiclass_model(nb_model, 'NAIVE BAYES CLASSIFIER', X_train, X_test, y_train, y_test)

    # Default model and parameter dictionaries
    model_dict = {
        'xgb': XGBClassifier(random_state=42),
        'lgbm': LGBMClassifier(random_state=42),
        'rf': RandomForestClassifier(random_state=42),
        'gbm': GradientBoostingClassifier(random_state=42),
        'svc': SVC(random_state=42, probability=True)
    }

    params_dict = {
        'xgb': {'n_estimators': [100, 150, 300], 'subsample': [None, 0.3, 0.5, 0.8, 0.9], 'colsample_bytree': [0.3, 0.5, 0.8, 1.0]},
        'lgbm': {'n_estimators': [100, 150, 300], 'subsample': [None, 0.3, 0.5, 0.8, 0.9], 'colsample_bytree': [0.3, 0.5, 0.8, 1.0]},
        'rf': {'n_estimators': [100, 150, 300], 'min_samples_split': [2, 3, 4], 'min_samples_leaf': [1, 2, 3]},
        'gbm': {'n_estimators': [100, 150, 300], 'min_samples_split': [2, 3, 4], 'min_samples_leaf': [1, 2, 3]},
        'svc': {'C': [0.1, 1, 10], 'kernel': ['rbf', 'poly', 'sigmoid']}
    }

    # Use custom dictionaries if provided
    if custom_model_dict is not None:
        model_dict = custom_model_dict

    if custom_params_dict is not None:
        params_dict = custom_params_dict

    best_estimators = []
    selected_models = {}

    # Fit and evaluate models specified in the 'models' sequence
    print("\n[CREATING BASE MODELS]")
    for model_name in models:
        if model_name not in model_dict:
            print(f"\n- Skipping {model_name}: not found in the model dictionary")
            continue

        print(f"\n- Optimizing {model_name.upper()}")
        # An empty grid makes GridSearchCV evaluate the estimator with its own settings,
        # so a custom model without a grid entry no longer raises KeyError.
        param_grid = params_dict.get(model_name, {})
        grid = GridSearchCV(model_dict[model_name], param_grid, cv=5, verbose=0, n_jobs=-1)
        grid.fit(X_train, y_train)
        best_estimators.append((model_name, grid.best_estimator_))
        selected_models[model_name.upper()] = grid.best_estimator_
        print(f'{model_name.upper()} Best Parameters:', grid.best_params_)
        evaluate_multiclass_model(grid.best_estimator_, f'{model_name.upper()} CLASSIFIER', X_train, X_test, y_train, y_test)

    # Ensembles only make sense when more than one model was actually tuned
    # (count the tuned estimators, not the requested names).
    if len(best_estimators) > 1:
        print("\n[CREATING VOTING & STACKING ENSEMBLES]")

        print("\n- Creating Voting Classifier")
        voting_clf = VotingClassifier(estimators=best_estimators, voting='soft')
        voting_clf.fit(X_train, y_train)
        evaluate_multiclass_model(voting_clf, 'VOTING CLASSIFIER', X_train, X_test, y_train, y_test)

        print("\n- Creating Stacking Classifier")
        stacking_clf = StackingClassifier(classifiers=[est[1] for est in best_estimators], meta_classifier=LogisticRegression())
        stacking_clf.fit(X_train, y_train)
        evaluate_multiclass_model(stacking_clf, 'STACKING CLASSIFIER', X_train, X_test, y_train, y_test)

    # Feature Importances
    if selected_models:
        importances = [model.feature_importances_ for model in selected_models.values() if hasattr(model, 'feature_importances_')]

        # Skip the mean plot when no tuned model exposes importances (e.g. only SVC was
        # requested); averaging an empty list would yield NaN and break the bar plot.
        if importances:
            mean_importances = np.mean(importances, axis=0)

            plt.figure(figsize=(12, 8))
            sns.barplot(x=mean_importances, y=X_train.columns)
            plt.title('Mean Feature Importance Plot')
            plt.xlabel('Mean Feature Importance')
            plt.ylabel('Features')
            plt.show()

        for model_name, model in selected_models.items():
            plot_feature_importance(model, model_name, X_train)

    # Shapley Values
    for model_name, model in selected_models.items():
        # Gradient boosting machine is skipped (not available for multiclass).
        # Keys of selected_models are stored upper-cased.
        if model_name.upper() == 'GBM':
            continue

        plot_shap_values(model, model_name, X_train, multi_class=True)

    print("\n NOTE: Shap values are not available for SVC models due to how they work.")
    print("\n NOTE: Shap values are only available for GBM for binary classification at this time.")

# REGRESSFLOW

In [None]:
# Define your custom model and parameter dictionaries Example
#custom_model_dict = {'rf': RandomForestRegressor(random_state=42), 
#                     'custom_model': MyCustomRegressor()
#                     }

#custom_params_dict = {'rf': {'n_estimators': [100, 150]}, 
#                      'custom_model': {'some_param': [1, 2, 3]}
#                      }

def RegressFlow(X_train, X_test, y_train, y_test, models=('xgb', 'lgbm', 'rf', 'gbm'), custom_model_dict=None, custom_params_dict=None):
    """
    Purpose:
    Fit, tune and evaluate a set of regressors, build a stacking ensemble from the tuned
    models, and plot feature importances and SHAP values.

    Parameters:
        - X_train, X_test (pd.DataFrame): Training and testing feature sets
        - y_train, y_test (pd.Series): Training and testing targets
        - models (sequence of str): Keys of the models to tune. Names that are not in the
          model dictionary are skipped with a message.
        - custom_model_dict (dict): Optional {name: estimator} mapping that replaces the default models
        - custom_params_dict (dict): Optional {name: param_grid} mapping that replaces the default grids.
          A model without a grid entry is fitted with its own settings.

    Returns:
        None. Results are reported through evaluate_regression_model, plot_feature_importance
        and plot_shap_values.
    """
    # Always fit and evaluate basic model: Linear Regression
    lr_model = LinearRegression()
    lr_model.fit(X_train, y_train)
    evaluate_regression_model(lr_model, 'LINEAR REGRESSION', X_train, X_test, y_train, y_test)

    # Default model and parameter dictionaries
    model_dict = {
        'xgb': XGBRegressor(random_state=42),
        'lgbm': LGBMRegressor(random_state=42),
        'rf': RandomForestRegressor(random_state=42),
        'gbm': GradientBoostingRegressor(random_state=42)
    }

    params_dict = {
        'xgb': {'n_estimators': [100, 150, 300], 'subsample': [None, 0.3, 0.5, 0.8, 0.9], 'colsample_bytree': [0.3, 0.5, 0.8, 1.0]},
        'lgbm': {'n_estimators': [100, 150, 300], 'subsample': [None, 0.3, 0.5, 0.8, 0.9], 'colsample_bytree': [0.3, 0.5, 0.8, 1.0]},
        'rf': {'n_estimators': [100, 150, 300], 'min_samples_split': [2, 3, 4], 'min_samples_leaf': [1, 2, 3]},
        'gbm': {'n_estimators': [100, 150, 300], 'min_samples_split': [2, 3, 4], 'min_samples_leaf': [1, 2, 3]}
    }

    # Use custom dictionaries if provided
    if custom_model_dict is not None:
        model_dict = custom_model_dict

    if custom_params_dict is not None:
        params_dict = custom_params_dict

    best_estimators = []
    selected_models = {}

    # Fit and evaluate models specified in the 'models' sequence
    print("\n[CREATING BASE MODELS]")
    for model_name in models:
        if model_name not in model_dict:
            print(f"\n- Skipping {model_name}: not found in the model dictionary")
            continue

        print(f"\n- Optimizing {model_name.upper()}")
        # An empty grid makes GridSearchCV evaluate the estimator with its own settings,
        # so a custom model without a grid entry no longer raises KeyError.
        param_grid = params_dict.get(model_name, {})
        grid = GridSearchCV(model_dict[model_name], param_grid, cv=5, verbose=0, n_jobs=-1)
        grid.fit(X_train, y_train)
        best_estimators.append((model_name, grid.best_estimator_))
        selected_models[model_name.upper()] = grid.best_estimator_
        print(f'{model_name.upper()} Best Parameters:', grid.best_params_)
        evaluate_regression_model(grid.best_estimator_, f'{model_name.upper()} REGRESSOR', X_train, X_test, y_train, y_test)

    # If more than one model was actually tuned, create a stacking regressor
    # (count the tuned estimators, not the requested names).
    if len(best_estimators) > 1:
        print("\n[CREATING STACKING ENSEMBLE]")

        print("\n- Creating Stacking Regressor")
        stacking_reg = StackingRegressor(regressors=[est[1] for est in best_estimators], meta_regressor=LinearRegression())
        stacking_reg.fit(X_train, y_train)
        evaluate_regression_model(stacking_reg, 'STACKING REGRESSOR', X_train, X_test, y_train, y_test)

    # Feature Importances
    if selected_models:
        importances = [model.feature_importances_ for model in selected_models.values() if hasattr(model, 'feature_importances_')]

        # Skip the mean plot when no tuned model exposes importances (e.g. custom models
        # without `feature_importances_`); averaging an empty list would yield NaN.
        if importances:
            mean_importances = np.mean(importances, axis=0)

            plt.figure(figsize=(12, 8))
            sns.barplot(x=mean_importances, y=X_train.columns)
            plt.title('Mean Feature Importance Plot')
            plt.xlabel('Mean Feature Importance')
            plt.ylabel('Features')
            plt.show()

        for model_name, model in selected_models.items():
            plot_feature_importance(model, model_name, X_train)

    # Shapley Values
    for model_name, model in selected_models.items():
        plot_shap_values(model, model_name, X_train, multi_class=False)

# BASICS

In [52]:
# BASICS

from sklearn.datasets import make_classification, make_regression

def make_sample_data(task='classification', n_samples=1000, n_features=10,
                     n_informative=7, n_redundant=3, random_state=42, test_size=0.20):
    """
    Create a sample dataset and split it into training and testing sets.
    
    Parameters:
        task (str): The type of problem to generate data for ('classification' or 'regression').
        n_samples (int): The total number of samples.
        n_features (int): The total number of features.
        n_informative (int): The number of informative features.
        n_redundant (int): The number of redundant features (only used for classification).
        random_state (int): The seed for random number generator.
        test_size (float): The proportion of the dataset to include in the test split.
    
    Returns:
        tuple: X_train, X_test, y_train, y_test (as Pandas DataFrame and Series)

    Raises:
        ValueError: If task is neither 'classification' nor 'regression'.
    """
    # Imported locally: the file-level import of train_test_split only appears in a later
    # cell, so this function would otherwise fail if called before that cell has run.
    from sklearn.model_selection import train_test_split

    if task == 'classification':
        # Create a classification dataset
        X, y = make_classification(n_samples=n_samples, n_features=n_features,
                                   n_informative=n_informative, n_redundant=n_redundant,
                                   random_state=random_state)
    elif task == 'regression':
        # Create a regression dataset (fixed noise level of 0.1)
        X, y = make_regression(n_samples=n_samples, n_features=n_features,
                               n_informative=n_informative, noise=0.1,
                               random_state=random_state)
    else:
        raise ValueError("Invalid task. Choose either 'classification' or 'regression'.")

    # Split the dataset into training and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)

    # Convert to Pandas DataFrame and Series
    feature_names = [f'feature_{i}' for i in range(1, n_features+1)]
    X_train = pd.DataFrame(X_train, columns=feature_names)
    X_test = pd.DataFrame(X_test, columns=feature_names)
    y_train = pd.Series(y_train, name='target')
    y_test = pd.Series(y_test, name='target')

    return X_train, X_test, y_train, y_test

def manual_dropper(dataframe, columns_to_manually_drop):
    """
    Purpose:
    Remove the requested columns from a DataFrame, reporting each outcome on stdout.

    Parameters:
        - dataframe (pd.DataFrame): DataFrame from which columns will be dropped
        - columns_to_manually_drop (list): List of column names to drop

    Returns:
        - pd.DataFrame: DataFrame without the requested columns. The input object itself
          is returned when nothing was dropped.
    """
    remaining = dataframe
    for name in columns_to_manually_drop:
        # Guard clause: report names that are absent and move on
        if name not in remaining.columns:
            print(f'Warning: {name} column not found in DataFrame, unable to drop it')
            continue

        remaining = remaining.drop(columns=name)
        print(f'Dropped {name} column')

    return remaining

def ensure_data_types(X_train, X_test, y_train, y_test):
    """
    Purpose:
    Wrap the feature sets in DataFrames and the labels in Series.

    Parameters:
        - X_train, X_test: Training and testing feature sets
        - y_train, y_test: Training and testing labels

    Returns:
        - (pd.DataFrame, pd.DataFrame, pd.Series, pd.Series): 
          X_train, X_test, y_train, y_test in DataFrame or Series format
    """
    # Features become DataFrames, labels become Series; order matches the inputs
    return (
        pd.DataFrame(X_train),
        pd.DataFrame(X_test),
        pd.Series(y_train),
        pd.Series(y_test),
    )

def ensure_data_types_and_show(X_train, X_test, y_train, y_test):
    """
    Purpose:
    Wrap the feature sets in DataFrames and the labels in Series, then display the
    first 5 rows of each feature DataFrame.

    Parameters:
        - X_train, X_test: Training and testing feature sets
        - y_train, y_test: Training and testing labels

    Returns:
        - (pd.DataFrame, pd.DataFrame, pd.Series, pd.Series): 
          X_train, X_test, y_train, y_test in DataFrame or Series format
    """
    train_features, test_features = pd.DataFrame(X_train), pd.DataFrame(X_test)
    train_labels, test_labels = pd.Series(y_train), pd.Series(y_test)

    # Preview each feature set under its own heading
    previews = (
        ("\nFirst 5 rows of the training dataframe:", train_features),
        ("\nFirst 5 rows of the testing dataframe:", test_features),
    )
    for heading, frame in previews:
        print(heading)
        display(frame.head())

    return train_features, test_features, train_labels, test_labels


def optimal_bins(dataframe, list_of_columns_to_check, method='auto',
                 return_ordinal_bins=False, plot_bins=True, print_edges=False):
    """
    Purpose:
    Optimize and plot histogram bins for specified numerical columns in a DataFrame. Optionally prints the bin edges.

    Parameters:
        - dataframe (pd.DataFrame): The DataFrame containing the data to be binned.
        - list_of_columns_to_check (list): List of column names in the DataFrame for which bins should be optimized.
        - method (str): The method used for bin optimization (default='auto'). Refer to numpy.histogram_bin_edges for available methods.
        - return_ordinal_bins (bool): If True, the binned columns of `dataframe` are overwritten with ordinal bin indices and the DataFrame is returned (default=False).
        - plot_bins (bool): If True, plots the histograms for each column specified (default=True).
        - print_edges (bool): If True, prints the bin edges and their labels after each plot (default=False).

    Returns:
        - pd.DataFrame: The input DataFrame, updated in place with ordinal bins, if return_ordinal_bins is True. Otherwise, None.

    Note:
    The function will issue a warning and skip the column if it is not found in the DataFrame.
    Only columns of dtype int64 or float64 are binned; other columns are left untouched.
    Bins follow the numpy histogram convention: every bin is half-open [left, right) except
    the last one, which also contains the column maximum.
    """
    # Copy only the columns that exist; selecting a missing name would raise KeyError
    # before the loop below gets a chance to warn about it and skip it.
    present_columns = [column for column in list_of_columns_to_check if column in dataframe.columns]
    df_ordinal_bins = dataframe[present_columns].copy()

    # The global seaborn style is only touched when something is actually plotted
    if plot_bins:
        sns.set(style="whitegrid")

    for column in list_of_columns_to_check:
        if column not in dataframe.columns:
            print(f"Warning: Column '{column}' not found in DataFrame. Skipping...")
            continue

        if dataframe[column].dtype in ['int64', 'float64']:
            data = dataframe[column].dropna()
            bin_edges = np.histogram_bin_edges(data, bins=method)

            # Re-derive equal-width edges from the bin count chosen by `method`
            num_bins = len(bin_edges) - 1
            bin_edges = np.histogram_bin_edges(data, bins=num_bins)

            if plot_bins:
                plt.figure(figsize=(12, 6))
                colors = sns.color_palette("husl", num_bins)
                legend_labels = []
                # Each bin is drawn separately so it gets its own colour and legend entry
                for i in range(num_bins):
                    sns.histplot(data, bins=[bin_edges[i], bin_edges[i + 1]], kde=False, color=colors[i])
                    legend_labels.append(f'Bin {i+1} [{bin_edges[i]:.2f} to {bin_edges[i+1]:.2f}]')

                plt.legend(title='Bins', labels=legend_labels, loc='upper left', bbox_to_anchor=(1, 1))
                plt.title(f'{column} - {str(method).capitalize()} Bins')
                plt.tight_layout(rect=[0, 0, 0.85, 1])
                plt.show()

            if print_edges:
                print(f"Column: {column}")
                for i in range(num_bins):
                    print(f'Bin {i+1} [{bin_edges[i]:.2f} to {bin_edges[i+1]:.2f}]')

            if return_ordinal_bins:
                values = dataframe[column]
                codes = pd.cut(values, bins=bin_edges, labels=False, right=False)
                # With right=False the maximum (equal to the last edge) falls outside every
                # interval and would come back as NaN; assign it to the last bin instead.
                codes = codes.mask(values == bin_edges[-1], num_bins - 1)
                df_ordinal_bins[column] = codes.astype(pd.Int64Dtype(), errors='ignore')

    if return_ordinal_bins:
        # update() ignores missing values, so rows that were NaN in the input stay NaN
        dataframe.update(df_ordinal_bins)
        return dataframe

# TRAIN TEST SPLITS

In [53]:
# TRAIN TEST SPLITS

from sklearn.model_selection import train_test_split, StratifiedShuffleSplit

def split_dataframe(df, test_size=0.2, random_state=42, target_column='target', stratified=False):
    """
    Purpose:
    Separate a DataFrame into features and labels and split both into train and test parts.

    Parameters:
        - df (pd.DataFrame): DataFrame to be split
        - test_size (float): Proportion of the dataset to include in the test split (default=0.2)
        - random_state (int): The seed used by the random number generator (default=42)
        - target_column (str): The name of the target (label) column (default='target')
        - stratified (bool): Whether to perform stratified sampling (default=False)

    Returns:
        - (pd.DataFrame, pd.DataFrame, pd.Series, pd.Series): 
          X_train, X_test, y_train, y_test as DataFrame or Series
    """
    features = df.drop([target_column], axis=1)
    labels = df[target_column]

    if not stratified:
        X_train, X_test, y_train, y_test = train_test_split(
            features, labels, test_size=test_size, random_state=random_state
        )
        return X_train, X_test, y_train, y_test

    # A single stratified shuffle yields exactly one (train, test) pair of positional indices
    splitter = StratifiedShuffleSplit(n_splits=1, test_size=test_size, random_state=random_state)
    train_positions, test_positions = next(splitter.split(features, labels))

    return (
        features.iloc[train_positions],
        features.iloc[test_positions],
        labels.iloc[train_positions],
        labels.iloc[test_positions],
    )

# IMPUTATION

In [54]:
# IMPUTATION

from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer, SimpleImputer

def validate_columns(columns, X_train, y_train, imputation_name):
    """
    Purpose:
    Validate that the specified columns for imputation exist in the training data.

    Parameters:
        - columns (list): List of columns to be imputed. Labels may be of any type
          (e.g. integer positions); None is treated as an empty list.
        - X_train (pd.DataFrame): Training feature set
        - y_train (pd.Series): Training labels
        - imputation_name (str): Name of the imputation method being used

    Returns:
        - bool: True if all columns are valid (including when none were given), False otherwise
    """
    columns = [] if columns is None else list(columns)

    if not columns:
        print(f"No columns were specified for {imputation_name} imputation.")
    else:
        # Labels are stringified first: str.join raises TypeError on non-string labels
        print(f"{', '.join(map(str, columns))} were specified for {imputation_name} imputation.")

    # A column is valid when it is a feature column or the name of the label Series
    invalid_columns = [column for column in columns if column not in X_train.columns and column != y_train.name]
    if invalid_columns:
        print(f"{imputation_name} ran into invalid column names: {', '.join(map(str, invalid_columns))}")
        return False
    return True

def mean_imputation(X_train, X_test, y_train, y_test, mean_imputation_columns):
    """
    Purpose:
    Perform mean imputation on specified columns of training and testing sets.
    The mean is learned on the training data only and applied to both splits.

    Parameters:
        - X_train, X_test (pd.DataFrame): Training and testing feature sets (modified in place)
        - y_train, y_test (pd.Series): Training and testing labels
        - mean_imputation_columns (list): List of columns on which to perform mean imputation

    Returns:
        - (pd.DataFrame, pd.DataFrame, pd.Series, pd.Series): 
          X_train, X_test, y_train, y_test with mean imputation applied. The inputs are
          returned unchanged when the column list fails validation.
    """
    if not validate_columns(mean_imputation_columns, X_train, y_train, "Mean"):
        return X_train, X_test, y_train, y_test

    imputer = SimpleImputer(strategy='mean')
    for column in mean_imputation_columns:
        if column == y_train.name:
            imputer.fit(y_train.to_frame())
            # Keep the original index so the labels stay aligned with the feature rows
            y_train = pd.Series(np.ravel(imputer.transform(y_train.to_frame())),
                                index=y_train.index, name=y_train.name)
            y_test = pd.Series(np.ravel(imputer.transform(y_test.to_frame())),
                               index=y_test.index, name=y_test.name)
        else:
            imputer.fit(X_train[[column]])
            X_train[column] = np.ravel(imputer.transform(X_train[[column]]))
            X_test[column] = np.ravel(imputer.transform(X_test[[column]]))
    return X_train, X_test, y_train, y_test

def trimmed_mean_imputation(X_train, X_test, y_train, y_test, trimmed_mean_imputation_columns):
    """
    Purpose:
    Perform trimmed mean imputation on specified columns of training and testing sets.
    The fill value is the mean of the training values after discarding the lowest and
    highest 10% of them; it is learned on the training data only.

    Parameters:
        - X_train, X_test (pd.DataFrame): Training and testing feature sets (modified in place)
        - y_train, y_test (pd.Series): Training and testing labels
        - trimmed_mean_imputation_columns (list): List of columns on which to perform trimmed mean imputation

    Returns:
        - (pd.DataFrame, pd.DataFrame, pd.Series, pd.Series): 
          X_train, X_test, y_train, y_test with trimmed mean imputation applied. The inputs
          are returned unchanged when the column list fails validation.
    """
    if not validate_columns(trimmed_mean_imputation_columns, X_train, y_train, "Trimmed Mean"):
        return X_train, X_test, y_train, y_test

    def trimmed_mean(values):
        # Mean of the non-null values after cutting 10% from each end of the sorted data
        non_null_values = sorted(value for value in values if pd.notnull(value))
        trim_percentage = 0.1
        trim_size = int(trim_percentage * len(non_null_values))
        # Explicit end index: `[k:-k]` would be an empty slice when k == 0
        # (fewer than 10 values), which made the fill value NaN.
        trimmed_values = non_null_values[trim_size:len(non_null_values) - trim_size]
        return np.mean(trimmed_values)

    for column in trimmed_mean_imputation_columns:
        # NOTE(review): passing a callable as `strategy` needs a scikit-learn release
        # that supports it -- confirm against the installed version.
        imputer = SimpleImputer(strategy=trimmed_mean)
        if column == y_train.name:
            imputer.fit(y_train.to_frame())
            # Keep the original index so the labels stay aligned with the feature rows
            y_train = pd.Series(np.ravel(imputer.transform(y_train.to_frame())),
                                index=y_train.index, name=y_train.name)
            y_test = pd.Series(np.ravel(imputer.transform(y_test.to_frame())),
                               index=y_test.index, name=y_test.name)
        else:
            imputer.fit(X_train[[column]])
            X_train[column] = np.ravel(imputer.transform(X_train[[column]]))
            X_test[column] = np.ravel(imputer.transform(X_test[[column]]))
    return X_train, X_test, y_train, y_test

def median_imputation(X_train, X_test, y_train, y_test, median_imputation_columns):
    """
    Purpose:
    Perform median imputation on specified columns of training and testing sets.
    The median is learned on the training data only and applied to both splits.

    Parameters:
        - X_train, X_test (pd.DataFrame): Training and testing feature sets (modified in place)
        - y_train, y_test (pd.Series): Training and testing labels
        - median_imputation_columns (list): List of columns on which to perform median imputation

    Returns:
        - (pd.DataFrame, pd.DataFrame, pd.Series, pd.Series): 
          X_train, X_test, y_train, y_test with median imputation applied. The inputs are
          returned unchanged when the column list fails validation.
    """
    if not validate_columns(median_imputation_columns, X_train, y_train, "Median"):
        return X_train, X_test, y_train, y_test

    imputer = SimpleImputer(strategy='median')
    for column in median_imputation_columns:
        if column == y_train.name:
            imputer.fit(y_train.to_frame())
            # Keep the original index so the labels stay aligned with the feature rows
            y_train = pd.Series(np.ravel(imputer.transform(y_train.to_frame())),
                                index=y_train.index, name=y_train.name)
            y_test = pd.Series(np.ravel(imputer.transform(y_test.to_frame())),
                               index=y_test.index, name=y_test.name)
        else:
            imputer.fit(X_train[[column]])
            X_train[column] = np.ravel(imputer.transform(X_train[[column]]))
            X_test[column] = np.ravel(imputer.transform(X_test[[column]]))
    return X_train, X_test, y_train, y_test

def mode_imputation(X_train, X_test, y_train, y_test, mode_imputation_columns):
    """
    Purpose:
    Perform mode imputation on specified columns of training and testing sets.
    The most frequent value is learned on the training data only and applied to both splits.

    Parameters:
        - X_train, X_test (pd.DataFrame): Training and testing feature sets (modified in place)
        - y_train, y_test (pd.Series): Training and testing labels
        - mode_imputation_columns (list): List of columns on which to perform mode imputation

    Returns:
        - (pd.DataFrame, pd.DataFrame, pd.Series, pd.Series): 
          X_train, X_test, y_train, y_test with mode imputation applied. The inputs are
          returned unchanged when the column list fails validation.
    """
    if not validate_columns(mode_imputation_columns, X_train, y_train, "Mode"):
        return X_train, X_test, y_train, y_test

    imputer = SimpleImputer(strategy='most_frequent')
    for column in mode_imputation_columns:
        if column == y_train.name:
            imputer.fit(y_train.to_frame())
            # Keep the original index so the labels stay aligned with the feature rows
            y_train = pd.Series(np.ravel(imputer.transform(y_train.to_frame())),
                                index=y_train.index, name=y_train.name)
            y_test = pd.Series(np.ravel(imputer.transform(y_test.to_frame())),
                               index=y_test.index, name=y_test.name)
        else:
            imputer.fit(X_train[[column]])
            X_train[column] = np.ravel(imputer.transform(X_train[[column]]))
            X_test[column] = np.ravel(imputer.transform(X_test[[column]]))
    return X_train, X_test, y_train, y_test

def iterative_imputation(X_train, X_test, y_train, y_test, iterative_imputation_columns):
    """
    Purpose:
    Perform iterative imputation on specified columns of training and testing sets.

    Feature columns are imputed jointly: the imputer is fitted on the numeric feature
    columns of X_train so each requested column is estimated from the other features,
    and only the requested columns are written back. (Fitting the imputer on one column
    at a time gives it nothing to regress on, which reduces it to a plain mean fill.)
    The label column, when requested, is a single column and is therefore filled with
    the imputer's initial estimate.

    Parameters:
        - X_train, X_test (pd.DataFrame): Training and testing feature sets (modified in place)
        - y_train, y_test (pd.Series): Training and testing labels
        - iterative_imputation_columns (list): List of columns on which to perform iterative imputation

    Returns:
        - (pd.DataFrame, pd.DataFrame, pd.Series, pd.Series): 
          X_train, X_test, y_train, y_test with iterative imputation applied. The inputs
          are returned unchanged when the column list fails validation.
    """
    if not validate_columns(iterative_imputation_columns, X_train, y_train, "Iterative"):
        return X_train, X_test, y_train, y_test

    target_requested = any(column == y_train.name for column in iterative_imputation_columns)
    feature_columns = [column for column in iterative_imputation_columns if column != y_train.name]

    if target_requested:
        imputer = IterativeImputer(estimator=LinearRegression())
        imputer.fit(y_train.to_frame())
        # Keep the original index so the labels stay aligned with the feature rows
        y_train = pd.Series(np.ravel(imputer.transform(y_train.to_frame())),
                            index=y_train.index, name=y_train.name)
        y_test = pd.Series(np.ravel(imputer.transform(y_test.to_frame())),
                           index=y_test.index, name=y_test.name)

    if feature_columns:
        # Predictors: numeric training columns that have at least one observed value
        # (the imputer drops all-missing columns) and that also exist in the test set.
        predictors = [
            column for column in X_train.select_dtypes(include='number').columns
            if column in X_test.columns and X_train[column].notna().any()
        ]
        # Requested columns always take part, so any problem with them still surfaces
        # as an error from the imputer rather than being silently ignored.
        for column in feature_columns:
            if column not in predictors:
                predictors.append(column)

        imputer = IterativeImputer(estimator=LinearRegression())
        imputer.fit(X_train[predictors])
        train_filled = pd.DataFrame(np.asarray(imputer.transform(X_train[predictors])),
                                    columns=predictors, index=X_train.index)
        test_filled = pd.DataFrame(np.asarray(imputer.transform(X_test[predictors])),
                                   columns=predictors, index=X_test.index)

        # Only the requested columns are overwritten; other features keep their values
        for column in feature_columns:
            X_train[column] = train_filled[column]
            X_test[column] = test_filled[column]

    return X_train, X_test, y_train, y_test

def missing_value_checker(X_train, X_test, y_train, y_test):
    """
    Purpose:
    Check for missing values in the training and testing datasets and print a summary.

    Parameters:
        - X_train, X_test (pd.DataFrame): Training and testing feature sets
        - y_train, y_test (pd.Series or array-like): Training and testing labels

    Returns:
        None

    Note:
    DataFrames are reported per column. A Series is reported under its name, or under the
    dataset name (e.g. "y_train") when it is unnamed. Any other array-like is reported under
    the dataset name as a whole.
    """
    datasets = {"X_train": X_train, "X_test": X_test, "y_train": y_train, "y_test": y_test}
    columns_with_missings = {}

    for name, dataset in datasets.items():
        if isinstance(dataset, pd.DataFrame):
            missing_columns = dataset.columns[dataset.isnull().any()].tolist()
        elif isinstance(dataset, pd.Series):
            label = dataset.name if dataset.name is not None else name
            missing_columns = [label] if dataset.isnull().any() else []
        else:
            # Array-likes carry no labels, so the dataset name stands in for the column.
            missing_columns = [name] if bool(np.any(pd.isnull(dataset))) else []

        if missing_columns:
            columns_with_missings[name] = missing_columns

    if columns_with_missings:
        print("Missing values found in the following columns:")
        for name, missing_columns in columns_with_missings.items():
            # Column labels are not necessarily strings (e.g. integer labels).
            print(f"{name}: {', '.join(map(str, missing_columns))}")
    else:
        print("No missing values found in the datasets.")


# ENCODING

In [55]:
# ENCODING

from category_encoders import TargetEncoder
from sklearn.preprocessing import OneHotEncoder

def target_encode(X_train, X_test, y_train, y_test, list_of_columns_to_target_encode):
    """
    Purpose:
    Target-encode the requested columns, learning the encoding from the training split
    and applying that same fitted encoder to the testing split.

    Parameters:
        - X_train, X_test (pd.DataFrame): Training and testing feature sets
        - y_train, y_test (pd.Series): Training and testing labels (only y_train is used)
        - list_of_columns_to_target_encode (list): Names of the columns to target encode

    Returns:
        - (pd.DataFrame, pd.DataFrame): Encoded training features, encoded testing features
    """
    print("[TARGET ENCODING CHOSEN]\n")
    print("Columns being target encoded: ", list_of_columns_to_target_encode)

    target_encoder = TargetEncoder(cols=list_of_columns_to_target_encode)
    # The encoder only ever sees training labels; the test split is transformed afterwards.
    encoded_train = target_encoder.fit_transform(X_train, y_train)
    encoded_test = target_encoder.transform(X_test)
    return encoded_train, encoded_test

def one_hot_encode(X_train, X_test, list_of_columns_to_one_hot_encode, drops=0):
    """
    Purpose:
    Perform one-hot encoding on specified columns of training and testing feature sets.
    Each column gets its own encoder, fitted on the training split only.

    Parameters:
        - X_train, X_test (pd.DataFrame): Training and testing feature sets
        - list_of_columns_to_one_hot_encode (list): List of columns to be one-hot encoded
        - drops (int): Number of one-hot encoded columns to drop to avoid multicollinearity (default=0).
          The first `drops` indicator columns of every encoded feature are discarded.

    Returns:
        - (pd.DataFrame, pd.DataFrame): X_train_encoded, X_test_encoded with one-hot encoding applied

    Note:
    New columns are named "<column>_<k>" with k counted from 0 after the dropped indicators.
    Categories that appear only in X_test are encoded as all zeros instead of raising an error.
    The inputs are not modified; encoded copies are returned.
    """
    print("[ONE HOT ENCODING CHOSEN]\n")
    print("Columns being one-hot encoded: ", list_of_columns_to_one_hot_encode)
    X_train_encoded = X_train.copy()
    X_test_encoded = X_test.copy()

    for col in list_of_columns_to_one_hot_encode:
        # handle_unknown='ignore' keeps unseen test categories from aborting the pipeline.
        encoder = OneHotEncoder(handle_unknown='ignore')
        # Densify once per feature rather than once per indicator column.
        train_indicators = encoder.fit_transform(X_train[[col]]).toarray().astype(int)
        test_indicators = encoder.transform(X_test[[col]]).toarray().astype(int)

        for i in range(drops, train_indicators.shape[1]):
            indicator_name = f"{col}_{i - drops}"
            X_train_encoded[indicator_name] = train_indicators[:, i]
            X_test_encoded[indicator_name] = test_indicators[:, i]

        X_train_encoded.drop(columns=[col], inplace=True)
        X_test_encoded.drop(columns=[col], inplace=True)

    return X_train_encoded, X_test_encoded

def woe_encode(X_train, X_test, y_train, y_test, list_of_columns_to_woe_encode):
    """
    Purpose:
    Perform Weight of Evidence (WoE) encoding on specified columns of training and testing feature sets.
    The encoder is fitted on the training split and then applied to the testing split.

    Parameters:
        - X_train, X_test (pd.DataFrame): Training and testing feature sets
        - y_train, y_test (pd.Series): Training and testing labels (only y_train is used)
        - list_of_columns_to_woe_encode (list): List of columns to be WoE encoded

    Returns:
        - (pd.DataFrame, pd.DataFrame): X_train_encoded, X_test_encoded with WoE encoding applied

    Note:
    A warning is printed (encoding still proceeds) when y_train has more than two unique values.
    """
    # Imported here because the file-level import only brings in TargetEncoder.
    from category_encoders import WOEEncoder

    unique_values = y_train.nunique()
    if unique_values > 2:
        print("WARNING: WoE encoding is traditionally used for binary classification. "
              "Your y_train has more than two unique values. A different encoding method might be more suitable.")

    print("[WOE ENCODING CHOSEN]\n")
    print("Columns being WoE encoded: ", list_of_columns_to_woe_encode)

    encoder = WOEEncoder(cols=list_of_columns_to_woe_encode)
    X_train_encoded = encoder.fit_transform(X_train, y_train)
    X_test_encoded = encoder.transform(X_test)

    return X_train_encoded, X_test_encoded

# DATA TRANSFORMS

In [56]:
# DATA TRANSFORMS

from scipy import stats

def _plot_and_report_normality(values, col, split_label, stage_label):
    """Draw a histogram and QQ plot for `values` and print Shapiro-Wilk / Anderson-Darling statistics."""
    fig, axes = plt.subplots(1, 2, figsize=(15, 4))
    sns.histplot(values, kde=True, ax=axes[0])
    stats.probplot(values, plot=axes[1])
    axes[0].set_title(f'{col} Distribution in {split_label} Data ({stage_label} Transformation)')
    axes[1].set_title(f'{col} QQ Plot in {split_label} Data ({stage_label} Transformation)')
    plt.show()

    shapiro_test_stat, shapiro_p_value = stats.shapiro(values)
    anderson_result = stats.anderson(values)
    print(f"{split_label} Data '{col}' ({stage_label} Transformation):\nShapiro-Wilk: Statistic = {shapiro_test_stat}, p-value = {shapiro_p_value}\nAnderson-Darling: Statistic = {anderson_result.statistic}, Critical Values = {anderson_result.critical_values}")


def data_transformations(X_train, X_test, y_train, y_test, list_of_columns_to_transform, transformation_type="boxcox"):
    """
    Purpose:
    Apply various transformations to specified columns of training and testing datasets 
    to make the data more closely follow a normal distribution. 

    Parameters:
        - X_train, X_test (pd.DataFrame): Training and testing feature sets
        - y_train, y_test (pd.Series or array-like): Training and testing labels
        - list_of_columns_to_transform (list): List of column names to apply transformation.
          A name that is not a column of X_train is treated as referring to the labels.
        - transformation_type (str): The type of transformation to apply ("boxcox", "log", "sqrt")

    Returns:
        - (pd.DataFrame, pd.DataFrame, pd.Series, pd.Series): Transformed X_train, X_test, y_train, y_test

    Raises:
        ValueError: If transformation_type is not one of "boxcox", "log", "sqrt". The check runs
        before any plotting or transformation.

    Note:
    The function also plots histograms and QQ plots before and after the transformation, 
    as well as printing out Shapiro-Wilk and Anderson-Darling test statistics.
    The inputs are not modified; transformed copies are returned. For Box-Cox the lambda is
    estimated on the training data and reused for the test data.
    """
    if transformation_type not in ("boxcox", "log", "sqrt"):
        raise ValueError("Invalid transformation_type. Choose from 'boxcox', 'log', 'sqrt'.")

    transformed_X_train = X_train.copy()
    transformed_X_test = X_test.copy()
    transformed_y_train = pd.Series(y_train) if not isinstance(y_train, pd.Series) else y_train.copy()
    transformed_y_test = pd.Series(y_test) if not isinstance(y_test, pd.Series) else y_test.copy()

    for col in list_of_columns_to_transform:
        train_is_feature = col in transformed_X_train.columns
        train_values = transformed_X_train[col] if train_is_feature else transformed_y_train
        test_values = transformed_X_test[col] if col in transformed_X_test.columns else transformed_y_test

        # Box-Cox and log need strictly positive input, so both splits are shifted by the same amount.
        # NOTE(review): the shift is applied even when the data is already positive - confirm intended.
        if transformation_type in ("boxcox", "log"):
            min_val = min(train_values.min(), test_values.min())
            shift_val = abs(min_val) + 1e-8
            train_values = train_values + shift_val
            test_values = test_values + shift_val

        _plot_and_report_normality(train_values, col, "Training", "Before")
        _plot_and_report_normality(test_values, col, "Test", "Before")

        if transformation_type == "boxcox":
            train_array, optimal_lambda = stats.boxcox(train_values)
            test_array = stats.boxcox(test_values, lmbda=optimal_lambda)
            # scipy returns bare arrays; wrap them so index and name survive the transformation.
            train_values = pd.Series(train_array, index=train_values.index, name=train_values.name)
            test_values = pd.Series(test_array, index=test_values.index, name=test_values.name)
        elif transformation_type == "log":
            train_values = np.log(train_values)
            test_values = np.log(test_values)
        else:
            train_values = np.sqrt(train_values)
            test_values = np.sqrt(test_values)

        if train_is_feature:
            transformed_X_train[col] = train_values
            transformed_X_test[col] = test_values
        else:
            transformed_y_train = train_values
            transformed_y_test = test_values

        _plot_and_report_normality(train_values, col, "Training", "After")
        _plot_and_report_normality(test_values, col, "Test", "After")

    return transformed_X_train, transformed_X_test, transformed_y_train, transformed_y_test

# OUTLIER REMOVAL

In [57]:
# OUTLIER REMOVAL

from scipy.stats import iqr, zscore
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor

def remove_outliers(X_train, X_test, y_train, y_test, method=None, apply_to_test_data=False,
                    lower_scalar=1.5, upper_scalar=1.5, zscore_threshold=3,
                    contamination=0.05, n_neighbors=20, lof_contamination='auto',
                    random_state=None):
    """
    Purpose:
    Remove outliers from the training and testing datasets using different methods.

    Parameters:
        - X_train, X_test (pd.DataFrame): Training and testing feature sets
        - y_train, y_test (pd.Series or array-like): Training and testing labels
        - method (str): Method to use for outlier removal. Options are:
            - 'iqr': Interquartile Range
            - 'zscore': Z-Score
            - 'isolation': Isolation Forest
            - 'lof': Local Outlier Factor
          Any other value leaves the data untouched.
        - apply_to_test_data (bool): Whether to apply outlier removal to test data (default=False)
        - lower_scalar, upper_scalar (float): IQR multipliers for the lower / upper bound (default=1.5)
        - zscore_threshold (float): Absolute z-score at or above which a value is an outlier (default=3)
        - contamination (float): Expected outlier fraction for Isolation Forest (default=0.05)
        - n_neighbors (int): Number of neighbors for Local Outlier Factor (default=20)
        - lof_contamination (str or float): Contamination setting for Local Outlier Factor (default='auto')
        - random_state (int or None): Seed for Isolation Forest (default=None)

    Returns:
        - (pd.DataFrame, pd.DataFrame, pd.Series, pd.Series): X_train, X_test, y_train, y_test with outliers removed

    Note:
    The function also prints the method chosen and the parameters used for outlier removal.
    For 'iqr', 'zscore' and 'isolation' the test data is judged with statistics / a model learned
    from the training data. Local Outlier Factor is fitted separately on the test data.
    Labels are filtered by row position, so they must be in the same row order as the features.
    """
    def _keep_rows(features, labels, keep_mask):
        # One positional boolean mask for both objects keeps X and y in lockstep.
        keep = np.asarray(keep_mask, dtype=bool)
        return features[keep], labels[keep]

    print("[OUTLIER REMOVAL]\n")

    if method == 'iqr':
        print("Outlier Removal Method Chosen: IQR")

        Q1 = X_train.quantile(0.25)
        Q3 = X_train.quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - lower_scalar * IQR
        upper_bound = Q3 + upper_scalar * IQR
        print(f"Outlier Parameters Used: Lower Scalar = {lower_scalar}, Upper Scalar = {upper_scalar}")

        def _within_bounds(features):
            # A row survives only if every column lies inside the training bounds.
            return ((features >= lower_bound) & (features <= upper_bound)).all(axis=1)

        X_train, y_train = _keep_rows(X_train, y_train, _within_bounds(X_train))

        if apply_to_test_data:
            X_test, y_test = _keep_rows(X_test, y_test, _within_bounds(X_test))

    elif method == 'zscore':
        print("Outlier Removal Method Chosen: Z-Score")

        threshold = zscore_threshold
        print(f"Outlier Parameters Used: Threshold = {threshold}")

        # Population statistics (ddof=0) of the training data, captured before filtering.
        center = X_train.mean()
        spread = X_train.std(ddof=0)

        def _below_threshold(features):
            return (((features - center) / spread).abs() < threshold).all(axis=1)

        X_train, y_train = _keep_rows(X_train, y_train, _below_threshold(X_train))

        if apply_to_test_data:
            # Test rows are scored against the training distribution, not their own.
            X_test, y_test = _keep_rows(X_test, y_test, _below_threshold(X_test))

    elif method == 'isolation':
        print("Outlier Removal Method Chosen: Isolation Forest")

        print(f"Outlier Parameters Used: Contamination = {contamination}")

        model = IsolationForest(contamination=contamination, random_state=random_state)
        model.fit(X_train)
        X_train, y_train = _keep_rows(X_train, y_train, model.predict(X_train) != -1)

        if apply_to_test_data:
            X_test, y_test = _keep_rows(X_test, y_test, model.predict(X_test) != -1)

    elif method == 'lof':
        print("Outlier Removal Method Chosen: Local Outlier Factor")

        print(f"Outlier Parameters Used: n_neighbors = {n_neighbors}, contamination = {lof_contamination}")

        lof = LocalOutlierFactor(n_neighbors=n_neighbors, contamination=lof_contamination)
        X_train, y_train = _keep_rows(X_train, y_train, lof.fit_predict(X_train) == 1)

        if apply_to_test_data:
            # The detector is refitted on the test data itself here.
            X_test, y_test = _keep_rows(X_test, y_test, lof.fit_predict(X_test) == 1)

    else:
        print("No outlier removal method chosen.")

    return X_train, X_test, y_train, y_test

# CHECK MULTICOLLINEARITY

In [58]:
# CHECK MULTICOLLINEARITY

from statsmodels.stats.outliers_influence import variance_inflation_factor

def check_multicol(X_train, X_test, y_train, y_test):
    """
    Purpose:
    Check for multicollinearity in the training dataset using correlation and Variance Inflation Factor (VIF).

    Parameters:
        - X_train, X_test (pd.DataFrame): Training and testing feature sets (only X_train is analysed)
        - y_train, y_test (pd.Series or array-like): Training and testing labels (unused)

    Returns:
        None (prints out the results)

    Note:
    The function prints out the correlation and VIF results based on predefined thresholds.
    - High Correlation: > 0.8 (absolute value)
    - High VIF: > 10

    Results are keyed by feature pair for highly correlated pairs. A feature with a high VIF is
    appended to every correlated pair it belongs to; if it belongs to none, it is reported on its
    own, since collinearity among three or more features need not show up pairwise.
    """

    print("\n[CHECK MULTICOLLINEARITY]\n")

    # Thresholds
    corr_threshold = 0.8
    vif_threshold = 10
    print("Thresholds used for analysis:")
    print(f"High Correlation: > {corr_threshold}")
    print(f"High VIF: > {vif_threshold}\n")
    print("Results:")

    results = {}

    # 1. Correlation analysis: only the strict upper triangle, so each pair appears once
    #    and the diagonal (self-correlation) is ignored.
    correlation_matrix = X_train.corr()
    high_corr_mask = np.triu(np.abs(correlation_matrix.to_numpy()) > corr_threshold, k=1)
    corr_labels = correlation_matrix.columns
    high_correlation_pairs = [
        (corr_labels[row], corr_labels[col]) for row, col in zip(*np.nonzero(high_corr_mask))
    ]
    for pair in high_correlation_pairs:
        results[pair] = "High Correlation"

    # 2. VIF
    # NOTE(review): VIF is computed on the raw design matrix without an added constant
    # column - confirm that is the intended convention.
    vif_values = [variance_inflation_factor(X_train.values, i) for i in range(X_train.shape[1])]
    high_vif_features = [
        feature for feature, vif in zip(X_train.columns, vif_values) if vif > vif_threshold
    ]
    for feature in high_vif_features:
        reported_with_pair = False
        for pair in high_correlation_pairs:
            if feature in pair:
                results[pair] += f", High VIF for {feature} column"
                reported_with_pair = True
        if not reported_with_pair:
            results[feature] = "High VIF"

    # Print results
    if not results:
        print("No issues detected.")
    else:
        for key, value in results.items():
            print(f"{key}: {value}")

# SCALING

In [59]:
# SCALING

from sklearn.preprocessing import StandardScaler, MinMaxScaler

def scale_data(X_train, X_test, scaler_type='standard', scaler_exclude_columns=None):
    """
    Purpose:
    Scale the features in the training and testing datasets. The scaler is fitted on the
    training data and then applied to both splits.

    Parameters:
        - X_train, X_test (pd.DataFrame): Training and testing feature sets
        - scaler_type (str): Type of scaler to use. Options are 'standard', 'minmax', or None.
          With None the inputs are returned unchanged.
        - scaler_exclude_columns (list): List of columns to exclude from scaling

    Returns:
        - (pd.DataFrame, pd.DataFrame): Scaled X_train and X_test, with the row index and
          column order of the inputs preserved

    Raises:
        ValueError: If scaler_type is not 'standard', 'minmax' or None.

    Note:
    The function prints out the scaling method chosen and the columns being skipped from scaling, if any.
    """
    print("[SCALING]\n")

    # Show chosen method here
    print("Chosen Scaling Method: {}".format(scaler_type))

    # Check if scaling should be skipped
    if scaler_type is None:
        return X_train, X_test

    if scaler_type == 'standard':
        scaler = StandardScaler()
    elif scaler_type == 'minmax':
        scaler = MinMaxScaler()
    else:
        raise ValueError("Invalid scaler_type. Use 'standard', 'minmax', or 'None'.")

    if scaler_exclude_columns:
        # Scale only the columns not in scaler_exclude_columns
        scale_columns = [col for col in X_train.columns if col not in scaler_exclude_columns]
        X_train_scaled = X_train.copy()
        X_test_scaled = X_test.copy()
        if scale_columns:
            # Skipped when every column is excluded: a scaler cannot be fitted on zero features.
            X_train_scaled[scale_columns] = scaler.fit_transform(X_train[scale_columns])
            X_test_scaled[scale_columns] = scaler.transform(X_test[scale_columns])

        # Print the columns being skipped from scaling
        print("Columns being skipped from scaling: {}".format(scaler_exclude_columns))
    else:
        # Scale all columns. The index is carried over so rows stay aligned with the labels.
        X_train_scaled = pd.DataFrame(
            scaler.fit_transform(X_train), columns=X_train.columns, index=X_train.index
        )
        X_test_scaled = pd.DataFrame(
            scaler.transform(X_test), columns=X_test.columns, index=X_test.index
        )

    return X_train_scaled, X_test_scaled

# TSNE PREVIEW

In [60]:
# TSNE PREVIEW

import plotly.express as px
from sklearn.manifold import TSNE

def tsne_preview(X_train, X_test, y_train, y_test, n_components=3, random_state=42):
    """
    Purpose:
    Create a t-SNE preview of the training and testing datasets.

    Parameters:
        - X_train, X_test (pd.DataFrame): Training and testing feature sets
        - y_train, y_test (pd.Series or array-like): Training and testing labels, in the same
          row order as the corresponding feature set
        - n_components (int): Number of components for t-SNE (2 gives a 2D plot, otherwise 3D)
        - random_state (int): Random state for reproducibility

    Returns:
        None (shows plotly figures)

    Raises:
        ValueError: If n_components is smaller than 2.

    Note:
    The function generates scatter plots for both the training and testing datasets.
    Each dataset is embedded with its own fit, so the two plots do not share a coordinate system.
    """
    if n_components < 2:
        raise ValueError("n_components must be at least 2 to draw a scatter plot.")

    tsne = TSNE(n_components=n_components, random_state=random_state)
    component_names = [f'TSNE{i}' for i in range(1, n_components + 1)]

    def _embed(features, target):
        frame = pd.DataFrame(tsne.fit_transform(features), columns=component_names)
        # Assign by position: the embedding has a fresh 0..n-1 index, while the labels may
        # still carry the shuffled index of the original split.
        frame['target'] = np.asarray(target)
        return frame

    def _scatter(frame):
        if n_components == 2:
            return px.scatter(frame, x='TSNE1', y='TSNE2', color='target')
        return px.scatter_3d(frame, x='TSNE1', y='TSNE2', z='TSNE3', color='target')

    train_tsne_df = _embed(X_train, y_train)
    test_tsne_df = _embed(X_test, y_test)

    train_fig = _scatter(train_tsne_df)
    test_fig = _scatter(test_tsne_df)

    print("[T-SNE PREVIEW TRAIN SET]")
    train_fig.show()
    print("[T-SNE PREVIEW TEST SET]")
    test_fig.show()

# UMAP PREVIEW

In [61]:
# UMAP PREVIEW

import plotly.express as px
from umap.umap_ import UMAP

def umap_preview(X_train, X_test, y_train, y_test, n_components=3, random_state=42):
    """
    Purpose:
    Create a UMAP preview of the training and testing datasets.

    Parameters:
        - X_train, X_test (pd.DataFrame): Training and testing feature sets
        - y_train, y_test (pd.Series or array-like): Training and testing labels, in the same
          row order as the corresponding feature set
        - n_components (int): Number of components for UMAP (2 gives a 2D plot; with 3 or more
          the first three components are plotted in 3D)
        - random_state (int): Random state for reproducibility

    Returns:
        None (shows plotly figures)

    Raises:
        ValueError: If n_components is smaller than 2.

    Note:
    The function generates scatter plots for both the training and testing datasets.
    Each dataset is embedded with its own fit, so the two plots do not share a coordinate system.
    """
    if n_components < 2:
        raise ValueError("n_components must be at least 2 to draw a scatter plot.")

    umap_model = UMAP(n_components=n_components, random_state=random_state)
    component_names = [f'UMAP{i}' for i in range(1, n_components + 1)]

    def _embed(features, target):
        frame = pd.DataFrame(umap_model.fit_transform(features), columns=component_names)
        # Assign by position: the embedding has a fresh 0..n-1 index, while the labels may
        # still carry the shuffled index of the original split.
        frame['target'] = np.asarray(target)
        return frame

    def _scatter(frame):
        if n_components == 2:
            return px.scatter(frame, x='UMAP1', y='UMAP2', color='target')
        return px.scatter_3d(frame, x='UMAP1', y='UMAP2', z='UMAP3', color='target')

    train_umap_df = _embed(X_train, y_train)
    test_umap_df = _embed(X_test, y_test)

    train_fig = _scatter(train_umap_df)
    test_fig = _scatter(test_umap_df)

    print("[UMAP PREVIEW TRAIN SET]")
    train_fig.show()
    print("[UMAP PREVIEW TEST SET]")
    test_fig.show()

# PCA PREVIEW

In [62]:
# PCA PREVIEW

from sklearn.decomposition import PCA

def pca_preview(X_train, X_test):
    """
    Purpose:
    Preview the explained and cumulative variance of features in the training and test datasets using PCA.

    Parameters:
        - X_train, X_test (pd.DataFrame): Training and testing feature sets

    Returns:
        None (shows plots and prints explained variance)

    Note:
    This function generates variance plots and prints out the optimal number of components for PCA based on the Kaiser criterion.
    The Kaiser criterion counts components whose eigenvalue (absolute explained variance) exceeds 1,
    which is only meaningful when the features have been standardized beforehand.
    Kaiser criterion is just a recommended starting point, always visualize and experiment.
    A separate PCA is fitted for each of the two datasets.
    """
    print("[PCA PREVIEW]\n")
    for data_name, data in zip(['Train', 'Test'], [X_train, X_test]):
        print(f"\n{data_name} Data:")

        # Perform PCA
        pca = PCA()
        pca.fit(data)
        explained_variance = pca.explained_variance_ratio_
        cumulative_variance = np.cumsum(explained_variance)

        # Kaiser criterion works on eigenvalues, not on ratios (a ratio can never exceed 1).
        optimal_components = np.sum(pca.explained_variance_ > 1)

        # Plot explained variance and cumulative variance
        plt.figure(figsize=(10, 5))
        plt.plot(explained_variance, marker='o', label='Explained Variance', color='lightgreen')
        plt.plot(cumulative_variance, marker='o', linestyle='--', label='Cumulative Variance', color='darkgreen')
        plt.xlabel('Component')
        plt.ylabel('Variance')
        plt.title(f'{data_name} Data: Explained and Cumulative Variance per Component')
        plt.legend()
        plt.grid(True)
        plt.show()

        # Display Kaiser criterion result
        print(f"\n[OPTIMAL PCA COMPONENTS BY KAISER CRITERION ({data_name} Data)]:\n {optimal_components}")
        if optimal_components == 0:
            print("According to Kaiser criterion, PCA might not be necessary.")

        # Display explained variance per component
        print(f"\n[EXPLAINED VARIANCE PER COMPONENT ({data_name} Data)]:")
        for idx, variance in enumerate(explained_variance, start=1):
            print(f"Component {idx}: {variance:.4f}")

        # Display cumulative variance per component
        print(f"\n[CUMULATIVE VARIANCE PER COMPONENT ({data_name} Data)]:")
        for idx, variance in enumerate(cumulative_variance, start=1):
            print(f"Components {idx}: {variance:.4f}")


# FEATURE EXTRACTION / DIMENSIONALITY REDUCTION

In [63]:
# FEATURE EXTRACTION / DIMENSIONALITY REDUCTION

from sklearn.decomposition import PCA, FastICA, TruncatedSVD
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

def extract_features_dim_reduce(X_train, X_test, y_train, y_test, n_components=3, random_state=42, method=None):
    """
    Purpose:
    Perform feature extraction or dimensionality reduction on the feature sets.
    The reducer is fitted on the training data and then applied to the test data.

    Parameters:
        - X_train, X_test (pd.DataFrame): Training and testing feature sets
        - y_train, y_test (pd.Series or array-like): Training and testing labels (y_train is used by 'lda' only)
        - n_components (int): Number of components for the chosen method
        - random_state (int): Random state for reproducibility (used by 'pca', 'ica' and 'svd')
        - method (str): Feature extraction or reduction method ('pca', 'ica', 'svd', 'lda', or None)

    Returns:
        - (pd.DataFrame, pd.DataFrame, pd.Series, pd.Series): Transformed X_train, X_test, y_train, and y_test.
          With method=None the inputs are returned unchanged. Otherwise the feature sets are new
          DataFrames with columns named "<METHOD>1", "<METHOD>2", ... that keep the row index of the inputs.

    Raises:
        ValueError: If method is not one of the supported options.
    """
    print(f"[FEATURE EXTRACTION / DIMENSIONALITY REDUCTION METHOD CHOSEN]\n{method}")

    if method is None:
        return X_train, X_test, y_train, y_test

    if method == 'pca':
        reducer = PCA(n_components=n_components, random_state=random_state)
    elif method == 'ica':
        reducer = FastICA(n_components=n_components, random_state=random_state)
    elif method == 'svd':
        reducer = TruncatedSVD(n_components=n_components, random_state=random_state)
    elif method == 'lda':
        reducer = LinearDiscriminantAnalysis(n_components=n_components)
    else:
        raise ValueError(f"Invalid method '{method}'. Please choose one of the following: 'pca', 'ica', 'svd', 'lda', 'None'.")

    if method == 'lda':
        # LDA is supervised and needs the training labels to fit.
        train_reduced = reducer.fit_transform(X_train, y_train)
    else:
        train_reduced = reducer.fit_transform(X_train)
    test_reduced = reducer.transform(X_test)

    # Name the columns after the components actually produced, and carry the original row
    # index over so the features stay aligned with y_train / y_test.
    column_names = [f'{method.upper()}{i}' for i in range(1, train_reduced.shape[1] + 1)]
    X_train = pd.DataFrame(train_reduced, columns=column_names, index=getattr(X_train, 'index', None))
    X_test = pd.DataFrame(test_reduced, columns=column_names, index=getattr(X_test, 'index', None))

    return X_train, X_test, y_train, y_test

# OVERSAMPLING & UNDERSAMPLING

In [64]:
# OVERSAMPLING & UNDERSAMPLING

from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.combine import SMOTEENN, SMOTETomek

def balance_classes(X_train, y_train, method='oversample', random_state=42, k_neighbors=5):
    """
    Purpose:
    Balance class distribution in the training set via oversampling, undersampling, or combined methods.

    Parameters:
        - X_train (pd.DataFrame): Training feature set
        - y_train (pd.Series or array-like): Training labels
        - method (str): Method to balance classes ('oversample', 'undersample', 'mixedenn', 'mixedtomek')
        - random_state (int): Random state for reproducibility
        - k_neighbors (int): Number of nearest neighbors for SMOTE (ignored by 'undersample')

    Returns:
        - (pd.DataFrame, pd.Series): Resampled X_train and y_train

    Raises:
        ValueError: If method is not one of the supported options.

    Note:
    The function prints out the new class distribution in the training set.
    """
    if method == 'oversample':
        sampler = SMOTE(random_state=random_state, k_neighbors=k_neighbors)
    elif method == 'undersample':
        sampler = RandomUnderSampler(random_state=random_state)
    elif method == 'mixedenn':
        # The inner SMOTE needs its own seed: a user-supplied SMOTE instance does not
        # inherit the random_state given to the combined sampler.
        sampler = SMOTEENN(random_state=random_state,
                           smote=SMOTE(random_state=random_state, k_neighbors=k_neighbors))
    elif method == 'mixedtomek':
        sampler = SMOTETomek(random_state=random_state,
                             smote=SMOTE(random_state=random_state, k_neighbors=k_neighbors))
    else:
        raise ValueError("Invalid method. Use 'oversample', 'undersample', 'mixedenn', or 'mixedtomek'.")

    X_train, y_train = sampler.fit_resample(X_train, y_train)

    # Wrap in a Series so array-like labels can be summarised as well.
    print('Training Set Class Balance: \n', pd.Series(y_train).value_counts())

    return X_train, y_train

# FEATURE SELECTION

In [65]:
# FEATURE SELECTION
from sklearn.feature_selection import RFECV
from sklearn.inspection import permutation_importance
from sklearn.model_selection import KFold
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.base import clone
from mlxtend.feature_selection import ExhaustiveFeatureSelector

def select_features_rfecv(X_train, X_test, y_train, y_test, chosen_rfecv_base_model, step_value=1, minimum_features_to_keep=5, cv=5):
    """
    Purpose:
    Select features with Recursive Feature Elimination with Cross-Validation (RFECV),
    fitted on the training data, and keep the same columns in the test data.

    Parameters:
        - X_train, X_test (pd.DataFrame): Training and testing feature sets
        - y_train, y_test (pd.Series or array-like): Training and testing labels (returned unchanged)
        - chosen_rfecv_base_model (sklearn estimator): The base model to use in RFECV
        - step_value (int): Number of features to remove at each iteration
        - minimum_features_to_keep (int): Minimum number of features to keep
        - cv (int): Number of folds in cross-validation

    Returns:
        - (pd.DataFrame, pd.DataFrame, pd.Series, pd.Series): Transformed X_train, X_test, y_train, and y_test

    Note:
    The function prints the list of features that were dropped.
    """
    rfecv = RFECV(
        estimator=chosen_rfecv_base_model,
        step=step_value,
        min_features_to_select=minimum_features_to_keep,
        cv=cv,
    )
    rfecv.fit(X_train, y_train)

    # support_ is a boolean mask over the training columns marking the survivors.
    selected_features = X_train.columns[rfecv.support_]
    kept_names = set(selected_features)
    dropped_features = [name for name in X_train.columns if name not in kept_names]

    print("[RFECV] Dropped features:", dropped_features)

    # Apply the same column selection to both splits.
    return X_train[selected_features], X_test[selected_features], y_train, y_test

def select_features_permutation(X_train, X_test, y_train, y_test, permutation_chosen_model, n_features_to_keep=5, cv=5):
    """
    Purpose:
    Perform feature selection using Permutation Importance with cross-validation.

    Parameters:
        - X_train, X_test (pd.DataFrame): Training and testing feature sets
        - y_train, y_test (pd.Series or array-like): Training and testing labels
        - permutation_chosen_model (sklearn estimator): The model to use for calculating permutation importance.
          NOTE: it is fitted in place on every fold, so after the call it is left fitted on the
          last fold's training split (using all original features).
        - n_features_to_keep (int): Number of top features to keep
        - cv (int): Number of folds in cross-validation

    Returns:
        - (pd.DataFrame, pd.DataFrame, pd.Series, pd.Series): Transformed X_train, X_test, y_train, and y_test
    """
    # Remember the full column list: it is needed for the dropped-feature report
    # after X_train has been narrowed down.
    original_features = X_train.columns

    # Running sum of the per-fold mean permutation importances
    perm_importances = np.zeros(len(original_features))

    # Positional row accessor that works for both pandas objects and plain array-likes
    y_rows = y_train.iloc if hasattr(y_train, 'iloc') else np.asarray(y_train)

    # Perform cross-validation
    kf = KFold(n_splits=cv, shuffle=True, random_state=42)

    for train_index, val_index in kf.split(X_train):
        X_train_cv, X_val_cv = X_train.iloc[train_index], X_train.iloc[val_index]
        y_train_cv, y_val_cv = y_rows[train_index], y_rows[val_index]

        # Train the model on the CV training set
        permutation_chosen_model.fit(X_train_cv, y_train_cv)

        # Score the features on the held-out fold; the shuffles are seeded so that
        # repeated runs select the same features (the folds are seeded as well)
        fold_result = permutation_importance(permutation_chosen_model, X_val_cv, y_val_cv,
                                             n_repeats=30, random_state=42)

        # Accumulate feature importances
        perm_importances += fold_result.importances_mean

    # Average the feature importances over all CV folds
    perm_importances /= cv

    # Rank the features from most to least important and keep the top ones
    sorted_idx = perm_importances.argsort()[::-1]
    selected_features = original_features[sorted_idx][:n_features_to_keep]

    # Work out the dropped features BEFORE narrowing X_train; comparing against the
    # already-reduced frame would always yield an empty list
    dropped_features = [feature for feature in original_features if feature not in selected_features]

    # Modify the train and test data accordingly
    X_train = X_train[selected_features]
    X_test = X_test[selected_features]

    # Show the dropped features
    print("[PERMUTATION CV] Dropped features:", dropped_features)

    return X_train, X_test, y_train, y_test

def select_features_basic_filter(X_train, X_test, tolerance=0.01):
    """
    Purpose:
    Performs a basic filter based feature selection by removing constant, quasi-constant, and duplicate features.
    All decisions are made on X_train only and then mirrored onto X_test.

    Parameters:
        - X_train, X_test (pd.DataFrame): Training and testing feature sets
        - tolerance (float): Variance threshold; columns whose training variance is not
          strictly greater than this value are dropped

    Returns:
        - (pd.DataFrame, pd.DataFrame): Transformed X_train and X_test (column dtypes are preserved)
    """
    # Drop constant and quasi-constant features, judged by the training variance
    variance_mask = X_train.var() > tolerance
    X_train_filtered = X_train.loc[:, variance_mask]

    # Apply the same filter to X_test
    X_test_filtered = X_test.loc[:, variance_mask]

    # Flag columns whose values repeat an earlier column. Only the boolean mask is taken
    # from the transposed frame; selecting columns from the untransposed frame keeps the
    # original dtypes (a transpose round-trip would upcast mixed dtypes).
    duplicate_mask = X_train_filtered.T.duplicated().to_numpy()
    X_train_deduplicated = X_train_filtered.loc[:, ~duplicate_mask]
    remaining_columns = X_train_deduplicated.columns
    X_test_deduplicated = X_test_filtered[remaining_columns]

    # Find the dropped features
    dropped_features = set(X_train.columns) - set(remaining_columns)

    # Show the dropped features
    print("[BASIC FILTER] Dropped features:", dropped_features)

    return X_train_deduplicated, X_test_deduplicated

def select_features_sequential(estimator, X_train, X_test, y_train, y_test,
                               forward=False, cv=5,
                               n_features_to_select='auto', tol=None, scoring='accuracy'):
    """
    Purpose:
    Run scikit-learn's SequentialFeatureSelector (cross-validated) on the training data and
    keep the chosen columns in both the training and the testing feature sets.

    Parameters:
        - estimator (sklearn estimator): Model the selector evaluates candidate feature subsets with.
        - X_train, X_test (pd.DataFrame): Training and testing feature sets.
        - y_train, y_test (pd.Series or array-like): Training and testing labels (y_test is not used).
        - forward (bool): True for forward selection, False (default) for backward selection.
        - cv (int): Passed to the selector as `cv`. Default is 5.
        - n_features_to_select (int or 'auto'): Passed to the selector unchanged. Default is 'auto'.
        - tol (float or None): Passed to the selector unchanged. Default is None.
        - scoring (str): Passed to the selector as `scoring`. Default is 'accuracy'.

    Returns:
        - (pd.DataFrame, pd.DataFrame): X_train and X_test restricted to the selected columns.
    """
    # Snapshot the column names so the rejected ones can be reported by name
    column_names = np.array(X_train.columns.tolist())

    # Translate the boolean flag into the selector's direction keyword
    search_direction = 'forward' if forward else 'backward'

    selector = SequentialFeatureSelector(estimator,
                                         direction=search_direction,
                                         n_features_to_select=n_features_to_select,
                                         tol=tol,
                                         scoring=scoring,
                                         cv=cv)
    selector.fit(X_train, y_train)

    # Boolean mask over the columns: True means the feature was selected
    keep_mask = selector.get_support()

    # Report the rejected features
    print("[SEQUENTIAL FEATURE SELECTOR WITH CV] Dropped features:", column_names[~keep_mask])

    # Apply the mask to both splits
    return X_train.loc[:, keep_mask], X_test.loc[:, keep_mask]

def select_features_exhaustive(estimator, X_train, X_test, y_train, y_test,
                               cv=5, min_features=5, scoring='accuracy'):
    """
    Purpose:
    Run mlxtend's ExhaustiveFeatureSelector (cross-validated) on the training data and keep
    the best-scoring feature subset in both the training and the testing feature sets.

    Parameters:
        - estimator (sklearn estimator): Model the selector evaluates candidate feature subsets with
        - X_train, X_test (pd.DataFrame or array-like): Training and testing feature sets
        - y_train, y_test (pd.Series or array-like): Training and testing labels (y_test is not used)
        - cv (int): Passed to the selector as `cv`
        - min_features (int): Smallest subset size the selector considers; the largest is always
          the full number of columns in X_train
        - scoring (str): Passed to the selector as `scoring`

    Returns:
        - (pd.DataFrame or array-like, pd.DataFrame or array-like): X_train and X_test restricted to the best features
    """
    # DataFrames contribute their column names; anything else gets positional names "0", "1", ...
    n_columns = X_train.shape[1]
    all_names = X_train.columns.tolist() if hasattr(X_train, 'columns') else [str(position) for position in range(n_columns)]

    # Search every subset size from min_features up to all columns
    selector = ExhaustiveFeatureSelector(estimator,
                                         min_features=min_features,
                                         max_features=n_columns,
                                         scoring=scoring,
                                         cv=cv)
    selector = selector.fit(X_train, y_train)

    # Positions and names of the winning subset
    chosen_positions = np.array(selector.best_idx_)
    chosen_names = np.array(all_names)[chosen_positions]

    # Report everything that did not make it into the winning subset
    rejected_names = [name for name in all_names if name not in chosen_names]
    print("\n\n[EXHAUSTIVE FEATURE SELECTOR] Dropped features:", rejected_names)

    # Plain arrays are sliced by position; label-indexed objects are sliced by name
    if not hasattr(X_train, 'loc'):
        return X_train[:, chosen_positions], X_test[:, chosen_positions]

    return X_train.loc[:, chosen_names], X_test.loc[:, chosen_names]

# MODEL EVALUATION (BINARY CLASSIFICATION)

In [66]:
# MODEL EVALUATION (BINARY CLASSIFICATION)

from sklearn.metrics import classification_report, roc_auc_score, roc_curve, auc
from sklearn.model_selection import cross_val_score, StratifiedKFold

def evaluate_binary_model(model, model_name, X_train, X_test, y_train, y_test, stratify=True):
    """
    Evaluate a binary classification model.

    The cross-validation score is computed by cross_val_score on the training data; the test
    metrics and the ROC curve use `model` as passed in, so it must already be fitted.

    Parameters:
        model : The (already fitted) classification model
        model_name : Name of the model (for printing)
        X_train, y_train : Training data
        X_test, y_test : Test data
        stratify : Whether to use Stratified K-Fold (default True)

    Returns:
        None (prints and plots evaluation metrics)
    """
    # Choose the appropriate KFold method based on stratify argument
    splitter_class = StratifiedKFold if stratify else KFold
    cv_method = splitter_class(n_splits=5, shuffle=True, random_state=42)

    # Print mean 5-fold cross validation score
    cv_score = np.mean(cross_val_score(model, X_train, y_train, cv=cv_method, scoring='accuracy', verbose=0)) * 100
    print('[{}] Training Mean 5-Fold CV Score: {:.2f}%'.format(model_name, cv_score))

    # Print test set accuracy score
    test_accuracy = model.score(X_test, y_test) * 100
    print('[{}] Test Set Accuracy Score: {:.2f}%'.format(model_name, test_accuracy))

    # Print classification report
    y_pred = model.predict(X_test)
    print("[{}] Classification Report:\n".format(model_name), classification_report(y_test, y_pred))

    # Obtain a continuous score for the positive class. Not every classifier exposes
    # predict_proba, so fall back to decision_function (roc_curve only needs a ranking
    # score) and skip the plot if neither is available.
    if hasattr(model, 'predict_proba'):
        y_score = model.predict_proba(X_test)[:, 1]
    elif hasattr(model, 'decision_function'):
        y_score = model.decision_function(X_test)
    else:
        print("\n NOTE: ROC curve is not available for {}. \n".format(model_name))
        return

    # Plot ROC AUC curve
    fpr, tpr, thresholds = roc_curve(y_test, y_score)
    roc_auc = auc(fpr, tpr)
    plt.figure(figsize=(8, 6))
    plt.plot(fpr, tpr, label='{} (AUC = {:.2f})'.format(model_name, roc_auc))
    plt.plot([0, 1], [0, 1], 'k--')  # chance-level diagonal for reference
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC AUC Curve')
    plt.legend(loc='lower right')
    plt.show()

# MODEL EVALUATION (MULTI CLASS CLASSIFICATION)

In [67]:
# MODEL EVALUATION (MULTI CLASS CLASSIFICATION)

from sklearn.metrics import classification_report, roc_auc_score, roc_curve, auc
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.preprocessing import label_binarize

def evaluate_multiclass_model(model, model_name, X_train, X_test, y_train, y_test, stratify=True):
    """
    Evaluate a multiclass classification model.

    The cross-validation score is computed by cross_val_score on the training data; the test
    metrics and the one-vs-rest ROC curves use `model` as passed in, so it must already be fitted.

    Parameters:
        model : The (already fitted) classification model
        model_name : Name of the model (for printing)
        X_train, y_train : Training data
        X_test, y_test : Test data
        stratify : Whether to use Stratified K Fold (default is True)

    Returns:
        None (prints and plots evaluation metrics)
    """

    # Create a KFold or StratifiedKFold instance based on the 'stratify' flag
    if stratify:
        cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    else:
        cv = KFold(n_splits=5, shuffle=True, random_state=42)

    # Print mean 5-fold cross-validation score
    cv_score = np.mean(cross_val_score(model, X_train, y_train, cv=cv, scoring='accuracy', verbose=0)) * 100
    print('[{}] Training Mean 5-Fold CV Score: {:.2f}%'.format(model_name, cv_score))

    # Print test set accuracy score
    test_accuracy = model.score(X_test, y_test) * 100
    print('[{}] Test Set Accuracy Score: {:.2f}%'.format(model_name, test_accuracy))

    # Print classification report
    y_pred = model.predict(X_test)
    print("[{}] Classification Report:\n".format(model_name), classification_report(y_test, y_pred))

    # Use the model's own class order so the binarized targets line up with the
    # predict_proba columns. Assuming labels are exactly 0..n-1 and all present in y_test
    # breaks for string labels, 1-based labels, or a class missing from the test split.
    class_labels = getattr(model, 'classes_', None)
    if class_labels is None:
        # presumably predict_proba columns follow the sorted training labels - verify for custom models
        class_labels = np.unique(y_train)
    class_labels = np.asarray(class_labels)

    y_test_bin = label_binarize(y_test, classes=class_labels)
    if y_test_bin.shape[1] == 1:
        # label_binarize collapses the two-class case into one column; expand it to one column per class
        y_test_bin = np.hstack([1 - y_test_bin, y_test_bin])
    y_prob = model.predict_proba(X_test)

    # Plot one-vs-rest ROC AUC curves
    plt.figure(figsize=(8, 6))
    for i, class_label in enumerate(class_labels):
        # A ROC curve is undefined when the test split has no positives or no negatives for the class
        if len(np.unique(y_test_bin[:, i])) < 2:
            continue
        fpr, tpr, _ = roc_curve(y_test_bin[:, i], y_prob[:, i])
        plt.plot(fpr, tpr, label='Class {} (AUC = {:.2f})'.format(class_label, auc(fpr, tpr)))
    plt.plot([0, 1], [0, 1], 'k--')  # chance-level diagonal for reference
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC AUC Curve')
    plt.legend(loc='lower right')
    plt.show()

# MODEL EVALUATION (REGRESSION)

In [68]:
# MODEL EVALUATION (REGRESSION)

from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error, mean_absolute_error

def evaluate_regression_model(model, model_name, X_train, X_test, y_train, y_test):
    """
    Evaluate a regression model.

    The cross-validation errors are computed on the training data; the test errors use
    `model` as passed in, so it must already be fitted.

    Parameters:
        model : The (already fitted) regression model
        model_name : Name of the model (for printing)
        X_train, y_train : Training data
        X_test, y_test : Test data

    Returns:
        None (prints evaluation metrics)
    """
    # Imported locally because the surrounding cell only imports cross_val_score
    from sklearn.model_selection import cross_validate

    # Score both metrics in a single 5-fold pass instead of refitting the model
    # on every fold once per metric
    cv_results = cross_validate(model, X_train, y_train, cv=5,
                                scoring=('neg_mean_squared_error', 'neg_mean_absolute_error'))

    # The scorers return negated errors, so flip the sign before averaging
    cv_mse_score = np.mean(-cv_results['test_neg_mean_squared_error'])
    print('\n[{}] Training Mean 5-Fold CV MSE: {:.2f}'.format(model_name, cv_mse_score))

    cv_mae_score = np.mean(-cv_results['test_neg_mean_absolute_error'])
    print('[{}] Training Mean 5-Fold CV MAE: {:.2f}'.format(model_name, cv_mae_score))

    # Predict once and reuse the predictions for both test metrics
    y_pred = model.predict(X_test)

    # Print test set MSE score
    test_mse = mean_squared_error(y_test, y_pred)
    print('[{}] Test Set MSE: {:.2f}'.format(model_name, test_mse))

    # Print test set MAE score
    test_mae = mean_absolute_error(y_test, y_pred)
    print('[{}] Test Set MAE: {:.2f}'.format(model_name, test_mae))

# PLOT FEATURE IMPORTANCES

In [69]:
# PLOT FEATURE IMPORTANCES

def plot_feature_importance(model, model_name, X_train):
    """
    Draw a horizontal bar chart of a model's feature importances, when it exposes them.

    Parameters:
        model : The trained model; only its `feature_importances_` attribute is read
        model_name : Name of the model (used in the title, axis label and the fallback note)
        X_train : Training feature data; its columns label the bars

    Returns:
        None (displays a plot, or prints a note when importances are unavailable)
    """
    # Nothing to plot for models without importances: tell the user and stop
    if not hasattr(model, 'feature_importances_'):
        print("\n NOTE: Feature importances are not available for {}. \n".format(model_name))
        return

    importances = model.feature_importances_

    # One bar per feature, importance on the x axis
    plt.figure(figsize=(12, 8))
    sns.barplot(x=importances, y=X_train.columns)
    plt.title('{} Feature Importance Plot'.format(model_name))
    plt.xlabel('{} Feature Importance'.format(model_name))
    plt.ylabel('Features')
    plt.show()

# SHOW SHAP VALUES

In [70]:
# SHOW SHAP VALUES

import shap

def plot_shap_values(model, model_name, X_train, multi_class=False):
    """
    Plot the SHAP values for a given model if available.

    Parameters:
        model : The trained model; it is handed to shap.TreeExplainer when it exposes
                `feature_importances_`
        model_name : Name of the model (for printing)
        X_train : Training feature data
        multi_class : Whether the model is multi-class or not (default is False). When True,
                      per-class SHAP values are collapsed into one importance per sample and
                      feature by averaging their magnitudes across classes.

    Returns:
        None (displays a plot, or prints a note when SHAP values are unavailable)
    """
    # Check if the model is suitable for calculating shap values
    if hasattr(model, 'feature_importances_'):
        # Create an explainer object for the model
        explainer = shap.TreeExplainer(model)

        # Calculate the Shapley values for the model
        shap_values = explainer.shap_values(X_train)

        # Collapse the class dimension for multi-class models. Average the MAGNITUDES:
        # signed contributions to different classes offset each other (and sum to zero
        # when the explained outputs are probabilities), so a signed mean would hide
        # important features.
        if multi_class:
            if isinstance(shap_values, list):
                # One (samples, features) array per class
                shap_values = np.mean(np.abs(shap_values), axis=0)
            elif getattr(shap_values, 'ndim', 0) == 3:
                # NOTE(review): newer SHAP releases appear to return a single
                # (samples, features, classes) array - confirm against the installed version
                shap_values = np.mean(np.abs(shap_values), axis=-1)

        # Plot the Shapley values for each feature
        shap.summary_plot(shap_values, X_train, plot_type='bar', show=False)
        plt.title('{} Shap Value Plot'.format(model_name))
        plt.xlabel('Shapley Value')
        plt.ylabel('Features')
        plt.show()
    else:
        # If the model is not suitable for calculating shap values, print a note
        print("\n NOTE: Shap values are not available for {}. \n".format(model_name))