In [None]:
import logging

import numpy as np
import pandas as pd
from catboost import CatBoostClassifier
from joblib import dump
from sklearn.calibration import CalibratedClassifierCV
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, StratifiedKFold, cross_val_score
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MinMaxScaler, StandardScaler, KBinsDiscretizer
from sklearn.svm import SVC
from xgboost import XGBClassifier

# Configure logging for the notebook: INFO level and above, with each record
# formatted as "<timestamp> - <level name> - <message>".
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')


In [None]:
def determine_best_bins(X, y, column, bins_range=(2, 10)):
    """
    Determine the best number of bins for KBinsDiscretizer based on cross-validated model performance.

    Every bin count in ``bins_range`` (inclusive on both ends) is evaluated by
    quantile-binning ``column`` with ordinal encoding and scoring a
    ``LogisticRegression`` on the binned feature with 3-fold cross-validation
    (accuracy). The discretizer and the classifier are wrapped in one pipeline
    so the bin edges are learned from the training part of each fold only and
    the held-out part never influences them.

    Parameters
    ----------
    X : object supporting ``X[[column]]`` selection (e.g. a pandas DataFrame).
    y : target values forwarded to ``cross_val_score``.
    column : label of the single column to discretize.
    bins_range : sequence whose first two items are the lowest and highest
        bin counts to try.

    Returns
    -------
    The bin count with the highest mean cross-validated accuracy (the smallest
    such count on ties). If no candidate produces a comparable score, the
    lower bound ``bins_range[0]`` is returned.
    """
    # Fall back to the lower bound of the requested range instead of a
    # hard-coded 2, so the result always lies inside what the caller asked for.
    best_bins = bins_range[0]
    best_score = -np.inf
    feature = X[[column]]

    for bins in range(bins_range[0], bins_range[1] + 1):
        # Pipeline => the discretizer is re-fit inside every CV split.
        candidate = make_pipeline(
            KBinsDiscretizer(n_bins=bins, encode='ordinal', strategy='quantile'),
            LogisticRegression(max_iter=1000),
        )
        score = cross_val_score(candidate, feature, y, cv=3, scoring="accuracy").mean()
        # Strict comparison keeps the first (smallest) bin count on ties and
        # ignores NaN scores.
        if score > best_score:
            best_score = score
            best_bins = bins

    return best_bins


def preprocess_column(data, col_name, y=None, train=True, kbins=None, scaler=None):
    """
    Preprocess individual columns based on certain criteria.

    Training mode (``train=True``)
        * chooses a bin count with ``determine_best_bins`` and fits a quantile
          ``KBinsDiscretizer`` (ordinal encoding) on the column;
        * then picks a scaler from the statistics of the *binned* column:
          ``StandardScaler`` when variance > 1 and mean < median,
          ``MinMaxScaler`` when variance > 1 otherwise, and no scaler when
          variance <= 1.
        Any ``kbins`` / ``scaler`` passed in is ignored: training always
        produces freshly fitted transformers.

    Inference mode (``train=False``)
        applies the supplied ``kbins`` and ``scaler`` in that order, skipping
        whichever one is ``None``.

    Parameters
    ----------
    data : pandas DataFrame containing ``col_name``; it is copied, never mutated.
    col_name : label of the column to transform.
    y : target values; required when ``train`` is true.
    train : whether to fit new transformers or apply the given ones.
    kbins, scaler : previously fitted transformers (inference mode only).

    Returns
    -------
    tuple ``(data_copy, kbins, scaler)`` where ``data_copy`` is the transformed
    copy and ``scaler`` is ``None`` when no scaling was applied.

    Raises
    ------
    ValueError
        If ``train`` is true and ``y`` is ``None``.
    """
    data_copy = data.copy()
    if train:
        if y is None:
            raise ValueError("Target variable y must be provided during training phase.")

        # KBins discretization
        best_bins = determine_best_bins(data_copy, y, col_name)
        kbins = KBinsDiscretizer(n_bins=best_bins, encode='ordinal', strategy='quantile')
        # Transformers return a single-column 2-D result; flatten it before
        # storing it back as a column.
        data_copy[col_name] = np.asarray(kbins.fit_transform(data_copy[[col_name]])).ravel()

        # Scaling: decided on the binned values. Start from "no scaler" so a
        # scaler handed in by the caller is never silently refit and returned.
        scaler = None
        binned = data_copy[col_name]
        if binned.var() > 1:
            scaler = StandardScaler() if binned.mean() < binned.median() else MinMaxScaler()
        if scaler is not None:
            data_copy[col_name] = np.asarray(scaler.fit_transform(data_copy[[col_name]])).ravel()
    else:
        # Explicit None checks: whether a transformer is present must not
        # depend on the object's truthiness.
        if kbins is not None:
            data_copy[col_name] = np.asarray(kbins.transform(data_copy[[col_name]])).ravel()
        if scaler is not None:
            data_copy[col_name] = np.asarray(scaler.transform(data_copy[[col_name]])).ravel()

    return data_copy, kbins, scaler


In [None]:
def preprocess_data(X_train, X_val, X_test, y_train):
    """
    Preprocess datasets using column preprocessing logic.

    Each column of ``X_train`` is processed in turn: the transformers are
    fitted on the training frame (using ``y_train``) and the same fitted
    transformers are then applied to the matching column of ``X_val`` and
    ``X_test``.

    Returns the transformed ``(X_train, X_val, X_test)``; the frames passed in
    are not modified because ``preprocess_column`` works on copies.
    """
    fitted_kbins = {}
    fitted_scalers = {}

    for column in X_train.columns:
        # Fit on the training split only.
        X_train, fitted_kbins[column], fitted_scalers[column] = preprocess_column(
            X_train, column, y_train, train=True
        )

        # Re-use the fitted transformers for the held-out splits.
        apply_kwargs = dict(train=False, kbins=fitted_kbins[column], scaler=fitted_scalers[column])
        X_val = preprocess_column(X_val, column, **apply_kwargs)[0]
        X_test = preprocess_column(X_test, column, **apply_kwargs)[0]

    return X_train, X_val, X_test


In [None]:
def train_model(model, param_grid, X, y, search_type="grid", n_iter=10):
    """
    Train model using GridSearch or RandomizedSearch.

    ``search_type`` selects the tuner: ``"grid"`` uses ``GridSearchCV`` and
    ``"random"`` uses ``RandomizedSearchCV`` (drawing ``n_iter`` candidates).
    Both use a 3-split ``StratifiedKFold``. ``n_iter`` is ignored for grid
    search.

    Returns the ``best_estimator_`` of the fitted search.
    Raises ``ValueError`` for any other ``search_type``.
    """
    # Guard clause: reject unsupported modes before building anything.
    if search_type not in ("grid", "random"):
        raise ValueError(f"Unknown search_type: {search_type}")

    folds = StratifiedKFold(n_splits=3)
    if search_type == "grid":
        tuner = GridSearchCV(model, param_grid, cv=folds)
    else:
        tuner = RandomizedSearchCV(model, param_grid, cv=folds, n_iter=n_iter)

    tuner.fit(X, y)
    return tuner.best_estimator_


def evaluate_model(model, X, y):
    """
    Evaluate model performance.

    Accuracy, precision, recall and F1 are computed from ``model.predict(X)``.
    ROC AUC is a ranking metric, so it is computed from continuous scores when
    the model exposes them:

    1. ``predict_proba(X)[:, 1]`` (probability of the second class), else
    2. ``decision_function(X)``, else
    3. the hard predictions, for models that offer neither (for example a
       hard-voting ensemble).

    Precision/recall/F1 are called with their defaults, so the target is
    expected to be binary.

    Parameters
    ----------
    model : fitted estimator with a ``predict`` method.
    X : features to score.
    y : true labels.

    Returns
    -------
    dict with keys ``'accuracy'``, ``'precision'``, ``'recall'``, ``'f1'`` and
    ``'roc_auc'``.
    """
    y_pred = model.predict(X)

    # Prefer continuous scores for ROC AUC; thresholded labels collapse the
    # curve to a single operating point.
    if hasattr(model, "predict_proba"):
        y_score = np.asarray(model.predict_proba(X))[:, 1]
    elif hasattr(model, "decision_function"):
        y_score = model.decision_function(X)
    else:
        y_score = y_pred

    metrics = {
        'accuracy': accuracy_score(y, y_pred),
        'precision': precision_score(y, y_pred),
        'recall': recall_score(y, y_pred),
        'f1': f1_score(y, y_pred),
        'roc_auc': roc_auc_score(y, y_score)
    }
    return metrics


In [None]:
def _default_models_config():
    """Return the candidate models with their hyper-parameter search spaces.

    Maps a display name to a dict with keys ``'model'`` (unfitted estimator),
    ``'params'`` (search space), ``'search_type'`` (``"grid"`` or ``"random"``)
    and ``'n_iter'`` (number of sampled candidates; ``None`` for grid search).
    """
    return {
        'Logistic Regression': {
            'model': LogisticRegression(max_iter=1000),
            'params': {'C': [0.001, 0.01, 0.1, 1, 10, 100]},
            'search_type': "grid",
            'n_iter': None
        },
        'Random Forest': {
            'model': RandomForestClassifier(),
            'params': {
                'n_estimators': [100, 300, 500],
                'max_depth': [None, 10, 20, 30],
                'min_samples_split': [2, 5, 10],
                'min_samples_leaf': [1, 2, 4],
                # 'auto' is rejected by recent scikit-learn releases; for
                # classifiers it meant the same as 'sqrt', so search over two
                # distinct, still-supported options instead.
                'max_features': ['sqrt', 'log2']
            },
            'search_type': "random",
            'n_iter': 10
        },
        'Gradient Boosting': {
            'model': GradientBoostingClassifier(),
            'params': {
                'n_estimators': [100, 300, 500],
                'learning_rate': [0.001, 0.01, 0.1],
                'max_depth': [3, 5, 8],
                'min_samples_split': [2, 5, 10],
                'min_samples_leaf': [1, 2, 4]
            },
            'search_type': "random",
            'n_iter': 10
        },
        'SVM': {
            # probability=True so the model can take part in soft voting.
            'model': SVC(probability=True),
            'params': {
                'C': [0.1, 1, 10],
                'gamma': [1, 0.1, 0.01],
                'kernel': ['linear', 'rbf']
            },
            'search_type': "random",
            'n_iter': 6
        },
        'MLP': {
            'model': MLPClassifier(max_iter=1000),
            'params': {
                'hidden_layer_sizes': [(50, 50, 50), (50, 100, 50), (100,)],
                'activation': ['tanh', 'relu'],
                'solver': ['sgd', 'adam'],
                'alpha': [0.0001, 0.05],
                'learning_rate': ['constant', 'adaptive'],
            },
            'search_type': "random",
            'n_iter': 10
        },
        'XGBoost': {
            'model': XGBClassifier(),
            'params': {
                'n_estimators': [100, 300, 500],
                'learning_rate': [0.001, 0.01, 0.1],
                'max_depth': [3, 5, 8],
                'min_child_weight': [1, 5, 10],
                'subsample': [0.5, 0.7, 1.0],
                'colsample_bytree': [0.5, 0.7, 1.0]
            },
            'search_type': "random",
            'n_iter': 10
        },
        'CatBoost': {
            'model': CatBoostClassifier(verbose=0),
            'params': {
                'iterations': [100, 300, 500],
                'learning_rate': [0.001, 0.01, 0.1],
                'depth': [4, 6, 8],
                'l2_leaf_reg': [1, 3, 5, 7, 9]
            },
            'search_type': "random",
            'n_iter': 10
        }
    }


def main(X_train, y_train, X_val, y_val, X_test, y_test):
    """
    Run the full model-selection workflow and save the best single model.

    Steps:
      1. preprocess the three splits with ``preprocess_data``;
      2. tune every configured model on the training split and score it on the
         validation split;
      3. pick the single model with the highest validation F1;
      4. build hard- and soft-voting ensembles of all tuned models (soft voting
         is weighted by validation F1) and a sigmoid-calibrated version of the
         soft-voting ensemble;
      5. log test metrics for the best model and the three ensembles;
      6. dump the best single model to ``"<model name>_best_model.pkl"`` in the
         current working directory.

    Only the estimator is saved; the fitted preprocessing transformers are not
    persisted, so the pickle expects already-preprocessed features.

    Returns ``None``; results are reported through ``logging``.
    """
    # Preprocess data
    X_train, X_val, X_test = preprocess_data(X_train, X_val, X_test, y_train)

    # Model configurations
    models_config = _default_models_config()

    best_models = {}
    model_scores = {}

    for model_name, config in models_config.items():
        logging.info(f"Training {model_name}")
        best_model = train_model(
            config['model'],
            config['params'],
            X_train, y_train,
            search_type=config['search_type'],
            n_iter=config['n_iter']
        )
        best_models[model_name] = best_model

        val_metrics = evaluate_model(best_model, X_val, y_val)
        model_scores[model_name] = val_metrics['f1']
        logging.info(f"{model_name} Validation Metrics: {val_metrics}")

    # Select best model based on F1-Score
    best_model_name = max(model_scores, key=model_scores.get)
    logging.info(f"The best model is {best_model_name} with a Validation F1-Score of {model_scores[best_model_name]}")

    # Ensemble: Voting Classifiers
    # best_models and model_scores are filled in the same loop, so estimators
    # and weights line up by position.
    estimators = list(best_models.items())
    model_weights = list(model_scores.values())
    # Weighted averaging is undefined when every weight is zero; fall back to
    # uniform weights in that degenerate case.
    soft_weights = model_weights if any(weight > 0 for weight in model_weights) else None

    hard_voting_clf = VotingClassifier(estimators=estimators, voting='hard')
    hard_voting_clf.fit(X_train, y_train)
    logging.info(f"Hard Voting Classifier Validation ROC AUC: {evaluate_model(hard_voting_clf, X_val, y_val)['roc_auc']}")

    soft_voting_clf = VotingClassifier(estimators=estimators, voting='soft', weights=soft_weights)
    soft_voting_clf.fit(X_train, y_train)
    logging.info(f"Soft Voting Classifier Validation ROC AUC: {evaluate_model(soft_voting_clf, X_val, y_val)['roc_auc']}")

    # Calibrated Classifier for improved soft voting
    # The calibrator is fitted on the validation split, so the validation
    # figure logged just below is measured on its own fitting data; the test
    # metrics further down are the unbiased ones.
    # NOTE(review): cv='prefit' may be deprecated in newer scikit-learn
    # releases - confirm against the installed version.
    calibrated_clf = CalibratedClassifierCV(soft_voting_clf, method='sigmoid', cv='prefit')
    calibrated_clf.fit(X_val, y_val)
    logging.info(f"Calibrated Classifier Validation ROC AUC: {evaluate_model(calibrated_clf, X_val, y_val)['roc_auc']}")

    # Evaluate best model and ensemble models on test set
    logging.info(f"{best_model_name} Test Metrics: {evaluate_model(best_models[best_model_name], X_test, y_test)}")
    logging.info(f"Hard Voting Classifier Test Metrics: {evaluate_model(hard_voting_clf, X_test, y_test)}")
    logging.info(f"Soft Voting Classifier Test Metrics: {evaluate_model(soft_voting_clf, X_test, y_test)}")
    logging.info(f"Calibrated Classifier Test Metrics: {evaluate_model(calibrated_clf, X_test, y_test)}")

    # Save the best model
    dump(best_models[best_model_name], f"{best_model_name}_best_model.pkl")
    logging.info(f"{best_model_name} model saved to disk.")


In [None]:
if __name__ == "__main__":
    # Assuming X_train, y_train, X_val, y_val, X_test, y_test are already defined.
    # None of the visible cells creates these names, so on a fresh kernel this
    # call raises NameError unless an earlier cell loads and splits the data.
    # TODO: add a data-loading / train-validation-test split cell above this one.
    main(X_train, y_train, X_val, y_val, X_test, y_test)
