In [None]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler, KBinsDiscretizer
import numpy as np
import pandas as pd

def preprocess_column(data, col_name, train=True, kbins=None, scaler=None):
    """Choose a preprocessing method for one column and apply it.

    The column is overwritten inside ``data`` (the frame is modified in place
    and also returned).

    Training mode (``train=True``) picks the method from the column's own
    statistics:

    * more than 20 distinct values -> 5 ordinal quantile bins
      (``KBinsDiscretizer``);
    * otherwise, mean below 5 and variance above 1 -> ``StandardScaler``;
    * otherwise, variance above 1 -> ``MinMaxScaler``;
    * otherwise the column is left unchanged.

    Note that the mean test is ``mean < 5``, not an absolute-value test, so
    columns with strongly negative means also take the ``StandardScaler`` path.

    Inference mode (``train=False``) never looks at the column's statistics.
    It applies whichever fitted transformer is supplied, so a validation/test
    column always receives the same treatment as the training column it
    corresponds to.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the column; modified in place.
    col_name : str
        Name of the column to preprocess.
    train : bool, default True
        Fit new transformers when true; reuse ``kbins`` / ``scaler`` when false.
    kbins : object with ``transform``, optional
        Fitted discretizer to reuse in inference mode. Takes precedence over
        ``scaler`` when both are given.
    scaler : object with ``transform``, optional
        Fitted scaler to reuse in inference mode.

    Returns
    -------
    tuple
        ``(data, kbins, scaler)``. In training mode the transformer that was
        fitted (if any) replaces the corresponding incoming value; the other
        one is returned as passed in.
    """
    if not train:
        # Dispatch on what was fitted during training rather than on this
        # split's statistics: a small validation split can easily have fewer
        # than 20 distinct values or a variance below 1 even though the
        # training column did not.
        fitted = kbins if kbins is not None else scaler
        if fitted is not None:
            data[col_name] = np.asarray(fitted.transform(data[[col_name]])).ravel()
        return data, kbins, scaler

    values = data[col_name]
    fitted = None

    if len(values.unique()) > 20:
        # High cardinality: bucket into quantile bins.
        kbins = KBinsDiscretizer(n_bins=5, encode='ordinal', strategy='quantile')
        fitted = kbins
    elif values.var() > 1:
        # High variance: standardise when the mean is below 5, else min-max.
        scaler = StandardScaler() if values.mean() < 5 else MinMaxScaler()
        fitted = scaler

    if fitted is not None:
        # Transformers return an (n, 1) array; flatten it so the assignment
        # does not depend on pandas accepting a 2-D value for one column.
        data[col_name] = np.asarray(fitted.fit_transform(data[[col_name]])).ravel()

    return data, kbins, scaler

def preprocess_data(X_train, X_val, X_test):
    """Preprocess the train, validation and test frames column by column.

    For every column of ``X_train`` a transformer is fitted on the training
    frame via ``preprocess_column(..., train=True)``, and the fitted objects
    are then passed to ``preprocess_column(..., train=False)`` for the
    validation and test frames.

    Parameters
    ----------
    X_train, X_val, X_test : pandas.DataFrame
        Feature frames. ``X_val`` and ``X_test`` must contain every column of
        ``X_train``. The inputs are not modified.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(X_train, X_val, X_test)`` as preprocessed copies.
    """
    # preprocess_column assigns into the frame it is given. Work on copies so
    # the caller's frames stay intact and re-running a cell does not apply the
    # transforms to already-transformed data.
    X_train, X_val, X_test = X_train.copy(), X_val.copy(), X_test.copy()

    for col in X_train.columns:
        # Fit on the training split only, then reuse the fitted objects.
        X_train, kbins, scaler = preprocess_column(X_train, col, train=True)
        X_val, _, _ = preprocess_column(X_val, col, train=False, kbins=kbins, scaler=scaler)
        X_test, _, _ = preprocess_column(X_test, col, train=False, kbins=kbins, scaler=scaler)

    return X_train, X_val, X_test

# Toy frames standing in for the real splits so the cell runs without errors
X_train = pd.DataFrame({'a': list(range(1, 11)), 'b': list(range(-5, 5))})
X_val = pd.DataFrame({'a': list(range(1, 6)), 'b': list(range(-2, 3))})
X_test = pd.DataFrame({'a': list(range(6, 11)), 'b': list(range(3, 8))})

# Run the toy frames through the pipeline and display all three results
X_train, X_val, X_test = preprocess_data(X_train, X_val, X_test)
X_train, X_val, X_test


In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.calibration import CalibratedClassifierCV


In [None]:
# Seed shared by every stochastic estimator and search below, so that a fresh
# Restart & Run All selects the same "best" models each time.
RANDOM_STATE = 42

# Tuned estimators, keyed by the short name used when building the ensemble
best_models = {}

# Logistic Regression (liblinear supports both the l1 and l2 penalties)
param_grid_lr = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100],
    'penalty': ['l1', 'l2']
}
clf_lr = GridSearchCV(
    LogisticRegression(solver='liblinear', random_state=RANDOM_STATE),
    param_grid_lr,
    cv=3,
)
clf_lr.fit(X_train, y_train)
best_models['lr'] = clf_lr.best_estimator_

# Random Forest
param_grid_rf = {
    'n_estimators': [10, 50, 100],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10]
}
clf_rf = GridSearchCV(RandomForestClassifier(random_state=RANDOM_STATE), param_grid_rf, cv=3)
clf_rf.fit(X_train, y_train)
best_models['rf'] = clf_rf.best_estimator_

# Gradient Boosting (using XGBoost)
param_grid_xgb = {
    'n_estimators': [10, 50, 100],
    'learning_rate': [0.01, 0.1, 0.5],
    'max_depth': [3, 5, 7]
}
clf_xgb = GridSearchCV(XGBClassifier(random_state=RANDOM_STATE), param_grid_xgb, cv=3)
clf_xgb.fit(X_train, y_train)
best_models['xgb'] = clf_xgb.best_estimator_

# SVM (probability=True is needed later for soft voting; its internal
# cross-validation is randomised, hence the seed)
param_grid_svm = {
    'C': [0.1, 1, 10],
    'kernel': ['linear', 'rbf']
}
clf_svm = GridSearchCV(SVC(probability=True, random_state=RANDOM_STATE), param_grid_svm, cv=3)
clf_svm.fit(X_train, y_train)
best_models['svm'] = clf_svm.best_estimator_

# Neural Network
param_grid_mlp = {
    'hidden_layer_sizes': [(50,), (100,)],
    'activation': ['tanh', 'relu'],
    'solver': ['sgd', 'adam']
}
clf_mlp = GridSearchCV(
    MLPClassifier(max_iter=1000, random_state=RANDOM_STATE),
    param_grid_mlp,
    cv=3,
)
clf_mlp.fit(X_train, y_train)
best_models['mlp'] = clf_mlp.best_estimator_

# CatBoost (using randomized search because of the large hyperparameter space).
# Both the estimator and the parameter sampling are seeded.
param_dist_cat = {
    'iterations': [50, 100, 150],
    'depth': [3, 5, 7, 10],
    'learning_rate': [0.01, 0.1, 0.5],
    'loss_function': ['Logloss']
}
clf_cat = RandomizedSearchCV(
    CatBoostClassifier(verbose=0, random_seed=RANDOM_STATE),
    param_dist_cat,
    cv=3,
    n_iter=10,
    random_state=RANDOM_STATE,
)
clf_cat.fit(X_train, y_train)
best_models['cat'] = clf_cat.best_estimator_


In [None]:
# Soft-voting ensemble over the tuned estimators, in the order they were searched
voting_clf = VotingClassifier(
    estimators=[(key, best_models[key]) for key in ('lr', 'rf', 'xgb', 'svm', 'mlp', 'cat')],
    voting='soft',
)
voting_clf.fit(X_train, y_train)


In [None]:
def evaluate_model(model, X, y):
    """Score ``model`` on ``(X, y)``.

    Predictions come from ``model.predict(X)``; each metric is called with its
    default arguments.

    Returns
    -------
    tuple
        ``(accuracy, precision, recall, f1)``.
    """
    predictions = model.predict(X)
    scorers = (accuracy_score, precision_score, recall_score, f1_score)
    return tuple(scorer(y, predictions) for scorer in scorers)

# Score the ensemble on each split, in train / val / test order
train_metrics, val_metrics, test_metrics = (
    evaluate_model(voting_clf, features, labels)
    for features, labels in ((X_train, y_train), (X_val, y_val), (X_test, y_test))
)


In [None]:
# Fit a sigmoid calibrator on the validation split on top of the already
# fitted ensemble (cv='prefit' means voting_clf itself is not refit).
# NOTE(review): newer scikit-learn releases appear to deprecate cv='prefit';
# confirm against the installed version.
calibrated_clf = CalibratedClassifierCV(voting_clf, method='sigmoid', cv='prefit').fit(X_val, y_val)

# Score the calibrated ensemble on the held-out test split
test_metrics_calibrated = evaluate_model(calibrated_clf, X_test, y_test)
