In [2]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

In [11]:
import optuna

class Optuna:
    """Tune a classifier's hyperparameters with an Optuna study.

    Each trial samples a set of hyperparameters, builds the estimator named
    by ``model`` and scores it with k-fold cross-validation on ``(X, y)``.
    The study maximizes the mean cross-validation score.

    Parameters
    ----------
    X, y :
        Feature matrix and target vector passed straight to
        ``cross_val_score``.
    n_trials : int, default 100
        Number of trials run by :meth:`optimize`.
    model : str, default 'RandomForest'
        Name of the estimator to tune. Must be one of ``SUPPORTED_MODELS``.
    cv : int, default 5
        Value forwarded to ``cross_val_score(cv=...)``.
    random_state : int or None, default None
        When given, seeds both the Optuna sampler and the estimator so a
        search can be repeated. ``None`` keeps the library defaults
        (unseeded), which is the historical behaviour of this class.

    Raises
    ------
    ValueError
        If ``model`` is not a supported model name.
    """

    # Model names that `_build_estimator` knows how to construct.
    SUPPORTED_MODELS = ('RandomForest',)

    def __init__(self, X, y, n_trials=100, model: str = 'RandomForest',
                 cv: int = 5, random_state=None):
        # Fail fast: without this check an unknown model name only surfaced
        # inside the first trial as an AttributeError on `self.clf`.
        if model not in self.SUPPORTED_MODELS:
            raise ValueError(
                f"Unsupported model {model!r}; expected one of {self.SUPPORTED_MODELS}"
            )
        self.X = X
        self.y = y
        self.n_trials = n_trials
        self.model = model
        self.cv = cv
        self.random_state = random_state
        # Only override the sampler when a seed was requested; passing
        # sampler=None lets Optuna pick its default sampler.
        sampler = None
        if random_state is not None:
            sampler = optuna.samplers.TPESampler(seed=random_state)
        self.study = optuna.create_study(direction='maximize', sampler=sampler)

    def _build_estimator(self, params):
        """Return a fresh, unfitted estimator for ``self.model`` using ``params``."""
        if self.model == 'RandomForest':
            return RandomForestClassifier(
                **params, random_state=getattr(self, 'random_state', None)
            )
        # `model` may have been reassigned after construction, so guard here too.
        raise ValueError(
            f"Unsupported model {self.model!r}; expected one of {self.SUPPORTED_MODELS}"
        )

    def objective(self, trial):
        """Sample hyperparameters from ``trial`` and return the mean CV score.

        The estimator built for the trial is also stored on ``self.clf``.

        Raises
        ------
        ValueError
            If ``self.model`` is not a supported model name.
        """
        params = {
            'n_estimators': trial.suggest_int('n_estimators', 2, 50),
            'criterion': trial.suggest_categorical('criterion', ['gini', 'entropy']),
            'max_depth': trial.suggest_int('max_depth', 2, 32),
            'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 10),
            'max_features': trial.suggest_categorical('max_features', ['sqrt', 'log2']),
            'n_jobs': -1
        }
        self.clf = self._build_estimator(params)
        # Instances created without __init__ fall back to the original 5 folds.
        folds = getattr(self, 'cv', 5)
        return cross_val_score(self.clf, self.X, self.y, cv=folds).mean()

    def optimize(self):
        """Run ``n_trials`` trials and return the best hyperparameters found."""
        self.study.optimize(self.objective, n_trials=self.n_trials)
        return self.study.best_params

In [None]:
# Load the encoded dataset from the relative data directory, then display
# its column labels as the cell output.
df = pd.read_csv(
    filepath_or_buffer='../data/encoded/Mistral-Prot-v1-15M.csv',
)
df.columns

Index(['p_1', 'p_2', 'p_3', 'p_4', 'p_5', 'p_6', 'p_7', 'p_8', 'p_9', 'p_10',
       ...
       'p_248', 'p_249', 'p_250', 'p_251', 'p_252', 'p_253', 'p_254', 'p_255',
       'p_256', 'target'],
      dtype='object', length=257)

In [10]:
# Split the frame into a feature matrix (every column except 'target')
# and the label vector, both as NumPy arrays.
X = df.drop(columns=['target']).to_numpy()
y = df['target'].to_numpy()

In [None]:
# Run a 100-trial random-forest hyperparameter search; the cell output is
# the best parameter set returned by optimize().
opt = Optuna(
    X=X,
    y=y,
    n_trials=100,
    model='RandomForest',
)
opt.optimize()

[I 2024-11-29 17:10:43,672] A new study created in memory with name: no-name-48eae289-ee9c-476f-beb0-f5daa519885a
[I 2024-11-29 17:10:44,519] Trial 0 finished with value: 0.4030010847818226 and parameters: {'n_estimators': 25, 'criterion': 'gini', 'max_depth': 27, 'min_samples_leaf': 5, 'max_features': 'log2'}. Best is trial 0 with value: 0.4030010847818226.
[I 2024-11-29 17:10:45,536] Trial 1 finished with value: 0.6230514567259975 and parameters: {'n_estimators': 50, 'criterion': 'gini', 'max_depth': 4, 'min_samples_leaf': 5, 'max_features': 'sqrt'}. Best is trial 1 with value: 0.6230514567259975.
[I 2024-11-29 17:10:46,525] Trial 2 finished with value: 0.40466987584308517 and parameters: {'n_estimators': 25, 'criterion': 'gini', 'max_depth': 27, 'min_samples_leaf': 4, 'max_features': 'sqrt'}. Best is trial 1 with value: 0.6230514567259975.
[I 2024-11-29 17:10:47,233] Trial 3 finished with value: 0.5208808544341634 and parameters: {'n_estimators': 12, 'criterion': 'gini', 'max_depth'

{'n_estimators': 47,
 'criterion': 'gini',
 'max_depth': 2,
 'min_samples_leaf': 9,
 'max_features': 'sqrt'}