In [1]:
from time import time

import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVC

In [2]:
import optuna

class Optuna:
    """Tune a scikit-learn classifier with an Optuna study.

    Every trial samples a set of hyper-parameters for the selected model,
    scores the resulting classifier with 5-fold ``cross_val_score`` and
    reports the mean; the study maximizes that value.

    Parameters
    ----------
    X, y
        Features and targets, forwarded unchanged to ``cross_val_score``.
    n_trials : int
        Number of trials executed by :meth:`optimize`.
    model : str
        Which estimator to tune: ``'RandomForest'`` or ``'SVC'``.
    seed : int or None
        Optional seed. When given, it seeds the study's TPE sampler and the
        random forest so repeated runs are reproducible. ``None`` keeps
        Optuna's default (unseeded) study.

    Raises
    ------
    ValueError
        If ``model`` is not one of the supported names.
    """

    SUPPORTED_MODELS = ('RandomForest', 'SVC')

    def __init__(self, X, y, n_trials=100, model: str = 'RandomForest', seed=None):
        # Fail fast instead of discovering the typo inside the first trial.
        if model not in self.SUPPORTED_MODELS:
            raise ValueError(
                f"Unsupported model {model!r}; expected one of {self.SUPPORTED_MODELS}"
            )
        self.X = X
        self.y = y
        self.n_trials = n_trials
        self.seed = seed
        if seed is None:
            self.study = optuna.create_study(direction='maximize')
        else:
            self.study = optuna.create_study(
                direction='maximize',
                sampler=optuna.samplers.TPESampler(seed=seed),
            )
        self.model = model

    def _svc_params(self, trial):
        """Sample the SVC search space (same names and bounds as before)."""
        # NOTE(review): C spans 20 orders of magnitude; very large values make
        # SVC fits extremely slow -- consider narrowing the range.
        return {
            # suggest_float(..., log=True) replaces the deprecated suggest_loguniform.
            'C': trial.suggest_float('C', 1e-10, 1e10, log=True),
            'kernel': trial.suggest_categorical('kernel', ['linear', 'poly', 'rbf', 'sigmoid']),
            'degree': trial.suggest_int('degree', 1, 5),
            'gamma': trial.suggest_categorical('gamma', ['scale', 'auto']),
            # suggest_float replaces the deprecated suggest_uniform.
            'coef0': trial.suggest_float('coef0', 0, 10),
            'shrinking': trial.suggest_categorical('shrinking', [True, False]),
            'probability': trial.suggest_categorical('probability', [True, False]),
            'tol': trial.suggest_float('tol', 1e-5, 1e-1, log=True),
            'decision_function_shape': trial.suggest_categorical('decision_function_shape', ['ovo', 'ovr']),
        }

    def _random_forest_params(self, trial):
        """Sample a search space made of RandomForestClassifier arguments."""
        return {
            'n_estimators': trial.suggest_int('n_estimators', 50, 300),
            'criterion': trial.suggest_categorical('criterion', ['gini', 'entropy']),
            'max_depth': trial.suggest_int('max_depth', 2, 50),
            'min_samples_split': trial.suggest_int('min_samples_split', 2, 20),
            'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 10),
            'max_features': trial.suggest_categorical('max_features', ['sqrt', 'log2']),
        }

    def objective(self, trial):
        """Build the classifier for ``trial`` and return its mean 5-fold CV score.

        The classifier of the most recent trial is kept on ``self.clf``.

        Raises
        ------
        ValueError
            If ``self.model`` was changed to an unsupported name after construction.
        """
        # Each estimator gets its own search space: passing SVC arguments such
        # as ``C`` to RandomForestClassifier raises TypeError.
        if self.model == 'RandomForest':
            self.clf = RandomForestClassifier(
                random_state=self.seed, **self._random_forest_params(trial)
            )
        elif self.model == 'SVC':
            self.clf = SVC(**self._svc_params(trial))
        else:
            raise ValueError(
                f"Unsupported model {self.model!r}; expected one of {self.SUPPORTED_MODELS}"
            )
        return cross_val_score(self.clf, self.X, self.y, cv=5).mean()

    def optimize(self):
        """Run ``n_trials`` trials and return the best parameters found."""
        self.study.optimize(self.objective, n_trials=self.n_trials)
        return self.study.best_params

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Read the encoded dataset from CSV into a DataFrame.
df = pd.read_csv('../data/encoded/physicochemical_WOLS870103.csv')
# Last expression of the cell: display the column labels.
df.columns

Index(['p_0', 'p_1', 'p_2', 'p_3', 'p_4', 'p_5', 'p_6', 'p_7', 'p_8', 'p_9',
       ...
       'p_2003', 'p_2004', 'p_2005', 'p_2006', 'p_2007', 'p_2008', 'p_2009',
       'p_2010', 'p_2011', 'response'],
      dtype='object', length=2013)

In [4]:
# Target vector: the 'response' column as a NumPy array.
y = df['response'].values
# Feature matrix: every other column of the frame, as a NumPy array.
X = df.drop(columns=['response']).values

In [None]:
# Tune an SVC with a 5-trial Optuna study on the full feature matrix.
opt = Optuna(X, y, n_trials=5, model='SVC')
# optimize() runs the study and returns study.best_params, shown as the cell output.
opt.optimize()

[I 2024-12-04 17:42:00,888] A new study created in memory with name: no-name-cc41220c-dcc9-46ab-86f3-831377b75723
  'C': trial.suggest_loguniform('C', 1e-10, 1e10),
  'coef0': trial.suggest_uniform('coef0', 0, 10),
  'tol': trial.suggest_loguniform('tol', 1e-5, 1e-1),
[I 2024-12-04 17:51:19,431] Trial 0 finished with value: 0.630708114046701 and parameters: {'C': 4785889.9780677445, 'kernel': 'rbf', 'degree': 5, 'gamma': 'scale', 'coef0': 7.429097448132248, 'shrinking': False, 'probability': True, 'tol': 0.0043663652111854325, 'decision_function_shape': 'ovr'}. Best is trial 0 with value: 0.630708114046701.
  'C': trial.suggest_loguniform('C', 1e-10, 1e10),
  'coef0': trial.suggest_uniform('coef0', 0, 10),
  'tol': trial.suggest_loguniform('tol', 1e-5, 1e-1),
[I 2024-12-04 17:53:12,179] Trial 1 finished with value: 0.6203306342269552 and parameters: {'C': 233280.68389436475, 'kernel': 'linear', 'degree': 2, 'gamma': 'auto', 'coef0': 8.09629002237622, 'shrinking': True, 'probability': F

{'C': 4785889.9780677445,
 'kernel': 'rbf',
 'degree': 5,
 'gamma': 'scale',
 'coef0': 7.429097448132248,
 'shrinking': False,
 'probability': True,
 'tol': 0.0043663652111854325,
 'decision_function_shape': 'ovr'}

## HalvingSearch

In [8]:
from sklearn.model_selection import train_test_split
from sklearn.utils import resample
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, HalvingGridSearchCV, HalvingRandomSearchCV
from sklearn.metrics import make_scorer, f1_score, accuracy_score, precision_score, recall_score, roc_auc_score, roc_curve, auc
import numpy as np

In [9]:
class Models():
    """Catalogue of classifiers with hyper-parameter grids, plus search helpers.

    ``self.algorithms`` maps a short key ("knn", "dt", "svm", "rf", "ada") to
    an ``(estimator, param_grid)`` tuple. The remaining methods wrap those
    tuples in scikit-learn search objects or summarise their ``cv_results_``.

    Parameters
    ----------
    linspace_size : int
        Number of points requested for each ``np.linspace`` grid that is
        parameterised by it.
    n_iter_search : int
        ``n_iter`` used by :meth:`random_search`.
    """

    @staticmethod
    def _int_grid(low, high, num):
        """Integer grid between ``low`` and ``high`` without repeated values.

        Truncating ``np.linspace`` to int yields duplicates once ``num``
        exceeds the number of integers in the range; duplicates only add
        redundant candidates to a search.
        """
        return np.unique(np.linspace(low, high, num=num, dtype=int))

    def __init__(self, linspace_size: int = 3, n_iter_search: int = 15):
        # Store the caller's value (it used to be overwritten with a literal 3).
        self.linspace_size = linspace_size
        int_grid = self._int_grid
        self.algorithms = {
            "knn": (
                KNeighborsClassifier(),
                {
                    "n_neighbors": int_grid(3, 10, linspace_size),
                    "weights": ["uniform", "distance"],
                    # Fixed three-point grid, independent of linspace_size.
                    "p": np.linspace(1, 10, num=3, dtype=int),
                    "algorithm": ["auto", "ball_tree", "kd_tree", "brute"],
                    "leaf_size": int_grid(10, 100, linspace_size),
                    "metric": ['euclidean', 'manhattan', 'chebyshev', 'minkowski'],
                },
            ),
            "dt": (
                DecisionTreeClassifier(),
                {
                    "criterion": ["gini", "entropy", "log_loss"],
                    "splitter": ["best", "random"],
                    "max_depth": int_grid(1, 100, linspace_size),
                    "min_samples_split": int_grid(2, 100, linspace_size),
                    "min_samples_leaf": int_grid(1, 100, linspace_size),
                    "max_features": ["sqrt", "log2"],
                    # Fixed three-point grid, independent of linspace_size.
                    "ccp_alpha": np.linspace(0, 0.1, num=3),
                },
            ),
            "svm": (
                SVC(),
                {
                    "C": np.linspace(0.1, 10, num=linspace_size),
                    "kernel": ["linear", "poly", "rbf", "sigmoid"],
                    "degree": int_grid(1, 10, linspace_size),
                    "gamma": ["scale", "auto"],
                    "coef0": np.linspace(0, 10, num=linspace_size),
                    "shrinking": [True, False],
                    "probability": [True, False],
                    "tol": np.linspace(0.0001, 0.01, num=linspace_size),
                },
            ),
            "rf": (
                RandomForestClassifier(),
                {
                    "n_estimators": int_grid(100, 200, linspace_size),  # ~30 seconds
                    "criterion": ["gini", "entropy", "log_loss"],
                    "max_depth": int_grid(1, 100, linspace_size),
                    #"min_samples_split": np.linspace(2, 10, num=linspace_size, dtype=int), ~20 seconds
                    #"min_samples_leaf": np.linspace(1, 10, num=linspace_size, dtype=int),
                    #"max_leaf_nodes": np.linspace(10, 90, num=9, dtype=int), # a bit over 1 minute; hurts runtime too much
                    #"min_impurity_decrease": np.linspace(0, 0.1, num=linspace_size),
                    "max_features": ["sqrt", "log2"],
                    "ccp_alpha": np.linspace(0, 0.1, num=linspace_size),
                    "class_weight": ["balanced", "balanced_subsample"],
                },
            ),
            "ada": (
                AdaBoostClassifier(),
                {
                    "n_estimators": int_grid(10, 100, linspace_size),
                    "learning_rate": np.linspace(0.1, 1, num=linspace_size),
                    # NOTE(review): "SAMME.R" appears to be deprecated/removed in
                    # recent scikit-learn releases -- confirm against the installed version.
                    "algorithm": ["SAMME", "SAMME.R"],
                },
            ),
        }
        self.n_iter_search = n_iter_search

    def report(self, results, n_top: int = 3):
        """Return the candidates ranked 1..``n_top`` from a ``cv_results_`` mapping.

        Ties are kept, so the frame may hold more than ``n_top`` rows. Columns:
        ``rank``, ``mean_test_score``, ``std_test_score``, ``params``.
        """
        rows = []
        for rank in range(1, n_top + 1):
            for idx in np.flatnonzero(results["rank_test_score"] == rank):
                rows.append({
                    "rank": rank,
                    "mean_test_score": results["mean_test_score"][idx],
                    "std_test_score": results["std_test_score"][idx],
                    "params": results["params"][idx],
                })

        return pd.DataFrame(rows, columns=["rank", "mean_test_score", "std_test_score", "params"])

    def random_search(self, algorithm: str, seed: int = 42):
        """Unfitted ``RandomizedSearchCV`` for ``algorithm`` (``n_iter_search`` draws)."""
        estimator, grid = self.algorithms[algorithm]
        return RandomizedSearchCV(estimator, grid, n_iter=self.n_iter_search, random_state=seed, n_jobs=-1)

    def grid_search(self, algorithm: str):
        """Unfitted exhaustive ``GridSearchCV`` for ``algorithm``."""
        estimator, grid = self.algorithms[algorithm]
        return GridSearchCV(estimator, grid, n_jobs=-1)

    def halving_search(self, algorithm: str):
        """Unfitted ``HalvingGridSearchCV`` for ``algorithm`` with default scoring."""
        estimator, grid = self.algorithms[algorithm]
        return HalvingGridSearchCV(estimator, grid, n_jobs=-1)

    def halving_search_f1(self, algorithm: str):
        """Unfitted ``HalvingGridSearchCV`` for ``algorithm`` scored by macro-averaged F1."""
        estimator, grid = self.algorithms[algorithm]
        scoring = make_scorer(f1_score, average='macro')
        return HalvingGridSearchCV(
            estimator=estimator,
            param_grid=grid,
            n_jobs=-1,
            scoring=scoring,
        )

    def _timed_best(self, search, X, y, label: str):
        """Fit ``search``, print its wall-clock time and return (seconds, best-row dict)."""
        started = time()
        search.fit(X, y)
        time_spent = time() - started

        print("%s took %.2f seconds" % (label, time_spent))

        cv = search.cv_results_
        # First candidate with rank 1 (ties resolved by position).
        best = np.flatnonzero(cv["rank_test_score"] == 1)[0]
        return time_spent, {
            "mean_test_score": cv["mean_test_score"][best],
            "std_test_score": cv["std_test_score"][best],
            "params": cv["params"][best],
        }

    def compare_search(self, dataset: str, X: np.ndarray, y: np.ndarray, algorithm: str, seed: int = 42):
        """Run grid, random and halving searches and tabulate time and best score.

        Parameters
        ----------
        dataset : str
            Label copied into the ``dataset`` column of the output.
        X, y
            Data passed to each search's ``fit``.
        algorithm : str
            Key into ``self.algorithms``.
        seed : int
            ``random_state`` for the randomized search.

        Returns
        -------
        pandas.DataFrame
            One row per strategy ("grid", "random", "halving") with columns
            ``dataset``, ``type``, ``time``, ``algorithm``, ``mean_test_score``,
            ``std_test_score`` and ``params``.
        """
        # (type tag, label used in the timing message, start message, search factory)
        strategies = [
            ("grid", "Grid Search",
             f"Running Grid Search with {dataset} and {algorithm}...",
             lambda: self.grid_search(algorithm)),
            ("random", "Random Search",
             "Running Random Search...",
             lambda: self.random_search(algorithm, seed)),
            ("halving", "Halving Grid Search",
             "Running Halving Grid Search...",
             lambda: self.halving_search(algorithm)),
        ]

        export_list = []
        for kind, label, start_message, build in strategies:
            print(start_message)
            time_spent, best = self._timed_best(build(), X, y, label)
            export_list.append({
                "dataset": dataset,
                "type": kind,
                "time": time_spent,
                "algorithm": algorithm,
                **best,
            })

        return pd.DataFrame(export_list, columns=["dataset", "type", "time", "algorithm", "mean_test_score", "std_test_score", "params"])

In [10]:
# Same feature/target extraction as the In [4] cell, repeated for this section.
# Target vector: the 'response' column as a NumPy array.
y = df['response'].values
# Feature matrix: every other column of the frame, as a NumPy array.
X = df.drop(columns=['response']).values

In [None]:
# Macro-F1 halving grid search for the SVM grid on a train split of the data.
models_instance = Models(linspace_size=10, n_iter_search=50)
seed = 42
dataset = "physicochemical_WOLS870103"  # stem of the CSV loaded into `df`
algorithm = "svm"
result_columns = ["dataset", "algorithm", "rank", "mean_test_score", "std_test_score", "params", "best_params", "best_score"]

# Hold out 10% as a test set, then 20% of the remainder as a validation set.
X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.1, random_state=seed)
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.2, random_state=seed)

halving_search = models_instance.halving_search_f1(algorithm)
halving_search.fit(X_train, np.squeeze(y_train))

result = models_instance.report(halving_search.cv_results_)
# Repeat the dict once per row: a bare dict passed to insert() is treated as a
# key-indexed column (not a per-row value), so the column ends up NaN or raises.
result.insert(0, "best_params", [halving_search.best_params_] * len(result))
result.insert(1, "best_score", halving_search.best_score_)

# Fill the identifying columns and fix the column order directly, instead of
# concatenating onto an empty frame (deprecated in pandas, and it left
# "dataset"/"algorithm" as NaN).
results = result.assign(dataset=dataset, algorithm=algorithm)[result_columns]

results