In [7]:
%run Model_And_Optimization.ipynb

In [22]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from skopt import BayesSearchCV
from sklearn.metrics import accuracy_score, mean_absolute_error
from sklearn.base import BaseEstimator
from sklearn.metrics import accuracy_score, roc_auc_score, precision_score, mean_absolute_error
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier

class ModelOptimization(BaseEstimator):
    def __init__(self, models, param_grids, scoring, cv=5, top_n=3, n_iter=10, search_method='grid'):
        self.models = models
        self.param_grids = param_grids
        self.scoring = scoring
        self.cv = cv
        self.top_n = top_n
        self.best_models = []
        self.search_method = search_method
        self.n_iter = n_iter
        self.best_overall_score = float('-inf')
        self.best_overall_model = None
        self.initial_scores = {}
        self.best_hyperparameters = {}

    def fit(self, X, y):
        all_scores = {}
        for i, model in enumerate(self.models):
            print(f"Fitting model {i+1}/{len(self.models)}: {type(model).__name__}")
            model.fit(X, y)
            y_pred = model.predict(X)

            if self.scoring == 'accuracy':
                score = accuracy_score(y, y_pred)
            elif self.scoring == 'roc_auc':
                if len(set(y)) > 2:
                    score = roc_auc_score(y, model.predict_proba(X), multi_class='ovr', average='macro')
                else:
                    score = roc_auc_score(y, y_pred)
            elif self.scoring == 'precision':
                score = precision_score(y, y_pred, average='macro')
            elif self.scoring == 'mae':
                score = -mean_absolute_error(y, y_pred)
            else:
                raise ValueError(f"Invalid scoring metric: {self.scoring}")

            model_name = type(model).__name__
            all_scores[model_name] = score
            self.initial_scores[model_name] = score
            
            if all_scores[model_name] > self.best_overall_score:
                self.best_overall_score = all_scores[model_name]
                self.best_overall_model = model

        # Select the "top N" models based on the scores
        top_models = sorted(all_scores.items(), key=lambda x: x[1], reverse=True)[:self.top_n]
        top_model_names = [model[0] for model in top_models]

        print("\nModel ranking based on scores:")
        for rank, (model_name, score) in enumerate(top_models, 1):
            print(f"Rank {rank}: {model_name} - Score: {score:.4f}")
        print('\n\n')

        # Perform GridSearchCV only on the top N models
        best_scores = {}
        for model_name in top_model_names:
            model_index = [i for i, model in enumerate(self.models) if type(model).__name__ == model_name][0]
            model = self.models[model_index]
            print(f"Optimizing hyperparameters for model {model_name}")

            if self.search_method == 'grid':
                search = GridSearchCV(model, self.param_grids[model_index], scoring=self.scoring, cv=self.cv, n_jobs=-1)
            elif self.search_method == 'random':
                search = RandomizedSearchCV(model, self.param_grids[model_index], scoring=self.scoring, cv=self.cv, n_jobs=-1, n_iter=self.n_iter)
            elif self.search_method == 'bayesian':
                search = BayesSearchCV(model, self.param_grids[model_index], scoring=self.scoring, cv=self.cv, n_iter=self.n_iter, n_jobs=-1)
            else:
                raise ValueError("Invalid search method. Supported options: 'grid', 'random', 'bayesian'")

            search.fit(X, y)
            
            
            best_model = search.best_estimator_
            best_score = search.best_score_

            print(f"Best score for {model_name}: {best_score:.4f}")
            print(f"Params: {search.best_params_}\n\n")

            best_scores[model_name] = best_score
            self.best_models.append(best_model)
            
            if model_name in best_scores and best_scores[model_name] > all_scores[model_name]:
                self.best_hyperparameters[model_name] = search.best_params_
        
        print(f"Best overall score: {self.best_overall_score} for model {self.best_overall_model} with hyperparameters:{self.best_hyperparameters}")
        return self

    def predict(self, X):
        predictions = {}
        for model in self.best_models:
            model_name = type(model).__name__
            y_pred = model.predict(X)
            predictions[model_name] = y_pred

        return predictions
    
    def get_best_model(self):
        return self.best_overall_model

In [23]:
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier

X, y = make_classification(n_samples=1000, n_features=10, random_state=42, n_informative=5, n_redundant=5)


# Define models and parameter grids
models = [LogisticRegression(), RandomForestClassifier(), GradientBoostingClassifier(), KNeighborsClassifier(),
         XGBClassifier()]

param_grids = [
    {'C': [0.1, 1, 10]},  # Parameter grid for LogisticRegression
    {'max_depth': [None, 5, 10], 'n_estimators': [50, 100, 150]},  # Parameter grid for RandomForestClassifier
    {'learning_rate': [0.1, 0.01, 0.001], 'n_estimators': [50, 100, 150]},  # Parameter grid for GradientBoostingClassifier
    {'n_neighbors': [3, 5, 7], 'weights': ['uniform', 'distance']},  # Parameter grid for KNeighborsClassifier
    {'learning_rate': [0.1, 0.01, 0.001], 'n_estimators': [50, 100, 150], 'max_depth': [3, 5, 7]},  # Parameter grid for XGBClassifier
]

# Example: Use the ModelOptimization class to optimize models with roc_auc scoring
scoring = 'accuracy'
model_comparison = ModelOptimization(models = models, param_grids = param_grids,
                                     scoring = 'accuracy', top_n = 3, search_method ='bayesian')
model_comparison.fit(X, y)




# # Make predictions using the best models
# X_test, y_test = make_classification(n_samples=100, n_features=10, random_state=42)
# predictions = model_comparison.predict(X_test)

# # Print the predictions for each model
# for model_name, y_pred in predictions.items():
#     print(f"Predictions for {model_name}:\n{y_pred}\n")
    

Fitting model 1/5: LogisticRegression
Fitting model 2/5: RandomForestClassifier
Fitting model 3/5: GradientBoostingClassifier


  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)



Fitting model 4/5: KNeighborsClassifier
Fitting model 5/5: XGBClassifier

Model ranking based on scores:
Rank 1: RandomForestClassifier - Score: 1.0000
Rank 2: XGBClassifier - Score: 1.0000
Rank 3: GradientBoostingClassifier - Score: 0.9860



Optimizing hyperparameters for model RandomForestClassifier
Best score for RandomForestClassifier: 0.9410
Params: OrderedDict([('max_depth', None), ('n_estimators', 50)])


Optimizing hyperparameters for model XGBClassifier
Best score for XGBClassifier: 0.9340
Params: OrderedDict([('learning_rate', 0.1), ('max_depth', 5), ('n_estimators', 100)])


Optimizing hyperparameters for model GradientBoostingClassifier
Best score for GradientBoostingClassifier: 0.9250
Params: OrderedDict([('learning_rate', 0.1), ('n_estimators', 150)])


Best overall score: 1.0 for model RandomForestClassifier() with hyperparameters:{}


ModelOptimization(models=[LogisticRegression(), RandomForestClassifier(),
                          GradientBoostingClassifier(), KNeighborsClassifier(),
                          XGBClassifier(base_score=None, booster=None,
                                        callbacks=None, colsample_bylevel=None,
                                        colsample_bynode=None,
                                        colsample_bytree=None,
                                        early_stopping_rounds=None,
                                        enable_categorical=False,
                                        eval_metric=None, feature_types=None,
                                        gamma=None, gp...
                                        random_state=None, ...)],
                  param_grids=[{'C': [0.1, 1, 10]},
                               {'max_depth': [None, 5, 10],
                                'n_estimators': [50, 100, 150]},
                               {'learning_rate': [0.1, 0

In [24]:
modelo = model_comparison.get_best_model()

In [25]:
modelo

RandomForestClassifier()