In [42]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, cross_val_score, train_test_split
from skopt import BayesSearchCV
from sklearn.metrics import accuracy_score, mean_absolute_error, precision_score, roc_auc_score
from sklearn.base import BaseEstimator
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
import numpy as np

class ModelOptimization(BaseEstimator):
    def __init__(self, models, param_grids, scoring, cv=5, top_n=3, n_iter=10, search_method='grid'):
        self.models = models
        self.param_grids = param_grids
        self.scoring = scoring
        self.cv = cv
        self.top_n = top_n
        self.best_models = []
        self.search_method = search_method
        self.n_iter = n_iter
        self.best_overall_score = float('-inf')
        self.best_overall_model = None
        self.initial_scores = {}
        self.best_hyperparameters = {}
        self.saved_models = []

    def _fit_initial_models(self, X_train, y_train, X_test, y_test):
        all_scores = {}
        
        for i, model in enumerate(self.models):
            print(f"Fitting model {i+1}/{len(self.models)}: {type(model).__name__}")
            model.fit(X_train, y_train)
            y_pred = model.predict(X_test)

            # Calculate and store the initial score
            model_name = type(model).__name__
            score = self._calculate_score(self.scoring, y_test, y_pred)
            all_scores[model_name] = score
            self.initial_scores[model_name] = score
            
            if score > self.best_overall_score:
                self.best_overall_score = score
                self.best_overall_model = model

    def _select_top_models(self):
            # Selecting and printing the top_n models
            top_models = sorted(self.initial_scores.items(), key=lambda x: x[1], reverse=True)[:self.top_n]
            self.best_model_names = [model[0] for model in top_models]

            print("\nModel ranking based on scores:")
            for rank, (model_name, score) in enumerate(top_models, 1):
                print(f"Rank {rank}: {model_name} - Score: {score:.4f}")

            print('\n')

    def _optimize_and_cv_top_models(self, X_train, y_train):
        for model_name in self.best_model_names:
            # Get the index of the current model in the loop
            model_index = [i for i, model in enumerate(self.models) if type(model).__name__ == model_name][0]

            # Retrieve the actual model using the index
            model = self.models[model_index]

            # Doing CV without hyperparameters first:
            print(f"\n\t\t\t-----{model_name}-----")
            print(f"Cross-validating model without hyperparameters: {model_name}")
            scores_cv = cross_val_score(model, X_train, y_train, scoring=self.scoring, cv=self.cv, n_jobs=-1)
            avg_score_no_opt = np.mean(scores_cv)
            print(f"Avg CV score for {model_name} without hyperparameters: {avg_score_no_opt:.4f}")

            print(f"Optimizing hyperparameters and cross-validating for model {model_name}")
            search = self._create_search(model, self.param_grids[model_index])
            search.fit(X_train, y_train)

            best_model = search.best_estimator_
            best_score = search.best_score_
            best_params = search.best_params_

            print(f"\nBest score for {model_name}: {best_score:.4f}")
            print(f"Best hyperparameters for {model_name}: {best_params}")

            self.best_models.append(best_model)
            self.best_hyperparameters[model_name] = best_params

            # Compare models with hyperparameters using cross-validation on training data
            scores_cv_opt = cross_val_score(best_model, X_train, y_train, scoring=self.scoring, cv=self.cv, n_jobs=-1)
            avg_score_opt = np.mean(scores_cv_opt)

            if best_score > avg_score_opt:
                relative_increase = ((best_score - avg_score_opt) / avg_score_opt) * 100
                print(f"Relative increase in score after optimization: {relative_increase:.2f}%")
            else:
                print(f"The score with optimization is worse than without any hyperparameters")
        
    def _calculate_score(self, scoring, y_true, y_pred):
        if scoring == 'accuracy':
            return accuracy_score(y_true, y_pred)
        elif scoring == 'roc_auc':
            if len(set(y_true)) > 2:
                return roc_auc_score(y_true, y_pred, average='macro', multi_class='ovr')
            else:
                return roc_auc_score(y_true, y_pred)
        elif scoring == 'precision':
            return precision_score(y_true, y_pred, average='macro')
        elif scoring == 'mae':
            return -mean_absolute_error(y_true, y_pred)
        else:
            raise ValueError(f"Invalid scoring metric: {scoring}")

    def _create_search(self, model, param_grid):
        if self.search_method == 'grid':
            return GridSearchCV(model, param_grid, scoring=self.scoring, cv=self.cv, n_jobs=-1)
        elif self.search_method == 'random':
            return RandomizedSearchCV(model, param_grid, scoring=self.scoring, cv=self.cv, n_jobs=-1, n_iter=self.n_iter)
        elif self.search_method == 'bayesian':
            return BayesSearchCV(model, param_grid, scoring=self.scoring, cv=self.cv, n_iter=self.n_iter, n_jobs=-1)
        else:
            raise ValueError("Invalid search method. Supported options: 'grid', 'random', 'bayesian'")
    
    def _list_saved_models(self):
        print("Saved models:")
        for i, saved_model in enumerate(self.saved_models, start=1):
            print(f"{i}: {saved_model['model_name']} - Score: {saved_model['score']:.4f}")
            
            
    def fit(self, X, y):
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

        self._fit_initial_models(X_train, y_train, X_test, y_test)  # Pass X_test and y_test here
        self._select_top_models()
        self._optimize_and_cv_top_models(X_train, y_train)

        # Calculate the best overall score after optimization
        best_score = float('-inf')
        best_model = None
        for model in self.best_models:
            model_name = type(model).__name__
            scores_cv = cross_val_score(model, X_train, y_train, scoring=self.scoring, cv=self.cv, n_jobs=-1)
            avg_score = np.mean(scores_cv)
            if avg_score > best_score:
                best_score = avg_score
                best_model = model_name

        print(f"\n\t\t\t-----RESULTS-----")
        print(f"Best overall score: {best_score:.4f} for model {best_model} with hyperparameters: {self.best_hyperparameters[best_model]}")
        return self
    
    def predict(self, X):
        predictions = {}
        for model in self.best_models:
            model_name = type(model).__name__
            y_pred = model.predict(X)
            predictions[model_name] = y_pred

        return predictions
    
    def get_best_model(self):
        best_hyperparameters = self.best_hyperparameters.get(type(self.best_overall_model).__name__, None)
        return self.best_overall_model, best_hyperparameters

    def choose_model_to_load(self):
        self._list_saved_models()
        choice = int(input("Enter the index of the model you want to load: ")) - 1
        if 0 <= choice < len(self.saved_models):
            return self.saved_models[choice]['model']
        else:
            print("Invalid choice.")
            return None

    def save_best_models(self, num_models_to_save):
        if num_models_to_save > 0:
            top_saved_models = sorted(self.saved_models, key=lambda x: x['score'], reverse=True)[:num_models_to_save]
            self.saved_models = top_saved_models


In [43]:
import time
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier

# Generate a small dataset
X, y = make_classification(n_samples=100, n_features=10, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

models = [LogisticRegression(), RandomForestClassifier(), GradientBoostingClassifier(), KNeighborsClassifier(),
          XGBClassifier()]

param_grids = [
    {'C': [0.1, 1, 10]},  # Parameter grid for LogisticRegression
    {'max_depth': [None, 5, 10], 'n_estimators': [50, 100, 150]},  # Parameter grid for RandomForestClassifier
    {'learning_rate': [0.1, 0.01, 0.001], 'n_estimators': [50, 100, 150]},  # Parameter grid for GradientBoostingClassifier
    {'n_neighbors': [3, 5, 7], 'weights': ['uniform', 'distance']},  # Parameter grid for KNeighborsClassifier
    {'learning_rate': [0.1, 0.01, 0.001], 'n_estimators': [50, 100, 150], 'max_depth': [3, 5, 7]},  # Parameter grid for XGBClassifier
]

model_comparison = ModelOptimization(models=models, param_grids=param_grids,
                                     scoring='accuracy', top_n=3, search_method='bayesian')

ti = time.time()
model_comparison.fit(X_train, y_train)
print(time.time() - ti)


Fitting model 1/5: LogisticRegression
Fitting model 2/5: RandomForestClassifier
Fitting model 3/5: GradientBoostingClassifier
Fitting model 4/5: KNeighborsClassifier
Fitting model 5/5: XGBClassifier

Model ranking based on scores:
Rank 1: LogisticRegression - Score: 0.8750
Rank 2: RandomForestClassifier - Score: 0.8125
Rank 3: GradientBoostingClassifier - Score: 0.8125



			-----LogisticRegression-----
Cross-validating model without hyperparameters: LogisticRegression
Avg CV score for LogisticRegression without hyperparameters: 0.9526
Optimizing hyperparameters and cross-validating for model LogisticRegression

Best score for LogisticRegression: 0.9692
Best hyperparameters for LogisticRegression: OrderedDict([('C', 0.1)])
The score with optimization is worse than without any hyperparameters

			-----RandomForestClassifier-----
Cross-validating model without hyperparameters: RandomForestClassifier
Avg CV score for RandomForestClassifier without hyperparameters: 0.9692
Optimizing hyperp