## ML-7. Оптимизация гиперпараметров модели

In [79]:
import numpy as np
import pandas as pd
import hyperopt
import optuna

from sklearn import linear_model
from sklearn import ensemble
from sklearn import metrics
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, \
train_test_split, cross_val_score
from hyperopt import hp, fmin, tpe, Trials

### Разведывательный анализ

In [80]:
# Read the training table from the zipped CSV stored in the data/ folder.
data = pd.read_csv(filepath_or_buffer='data/_train_sem09__1_.zip')

In [81]:
# Peek at the first three rows to see the column layout.
data.head(n=3)

Unnamed: 0,Activity,D1,D2,D3,D4,D5,D6,D7,D8,D9,...,D1767,D1768,D1769,D1770,D1771,D1772,D1773,D1774,D1775,D1776
0,1,0.0,0.497009,0.1,0.0,0.132956,0.678031,0.273166,0.585445,0.743663,...,0,0,0,0,0,0,0,0,0,0
1,1,0.366667,0.606291,0.05,0.0,0.111209,0.803455,0.106105,0.411754,0.836582,...,1,1,1,1,0,1,0,0,1,0
2,1,0.0333,0.480124,0.0,0.0,0.209791,0.61035,0.356453,0.51772,0.679051,...,0,0,0,0,0,0,0,0,0,0


In [82]:
# Print a summary of the frame: index range, number of columns, dtypes and memory usage.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3751 entries, 0 to 3750
Columns: 1777 entries, Activity to D1776
dtypes: float64(942), int64(835)
memory usage: 50.9 MB


In [83]:
# Descriptive statistics for every column (include='all' also keeps non-numeric columns, if any).
data.describe(include='all')

Unnamed: 0,Activity,D1,D2,D3,D4,D5,D6,D7,D8,D9,...,D1767,D1768,D1769,D1770,D1771,D1772,D1773,D1774,D1775,D1776
count,3751.0,3751.0,3751.0,3751.0,3751.0,3751.0,3751.0,3751.0,3751.0,3751.0,...,3751.0,3751.0,3751.0,3751.0,3751.0,3751.0,3751.0,3751.0,3751.0,3751.0
mean,0.542255,0.076948,0.592436,0.068142,0.03899,0.212112,0.686653,0.274713,0.455133,0.749517,...,0.026926,0.014663,0.013863,0.021861,0.015196,0.016796,0.012263,0.01173,0.020261,0.011197
std,0.498278,0.079989,0.10586,0.078414,0.115885,0.102592,0.078702,0.090017,0.162731,0.071702,...,0.161889,0.120215,0.116938,0.146249,0.122348,0.128522,0.110074,0.107683,0.140911,0.105236
min,0.0,0.0,0.282128,0.0,0.0,0.00263,0.137873,0.00613,0.0,0.27559,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0333,0.517811,0.0,0.0,0.138118,0.625627,0.207374,0.378062,0.707339,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,1.0,0.0667,0.585989,0.05,0.0,0.190926,0.674037,0.277845,0.499942,0.738961,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,1.0,0.1,0.668395,0.1,0.0,0.261726,0.740663,0.335816,0.569962,0.788177,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,1.0,1.0,0.964381,0.95,1.0,1.0,0.994735,0.790831,0.98987,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [84]:
# Share of each class in the target column, to check the class balance.
data.Activity.value_counts(normalize=True)

Activity
1    0.542255
0    0.457745
Name: proportion, dtype: float64

**Вывод:** В целом данные почти сбалансированы.

### Разбиение на выборки

In [85]:
# Separate the target column from the feature matrix.
y = data['Activity']
X = data.drop(columns='Activity')

In [86]:
# Hold out 20% of the rows for the final test; stratify=y keeps the class ratio
# the same in both parts, random_state fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

### Обучение базовых моделей

<center><i>Логистическая регрессия</i></center>

In [87]:

# Baseline: logistic regression with default hyperparameters
# (only the iteration limit is raised to 1000).
log_simpl = linear_model.LogisticRegression(
    random_state=42,
    max_iter=1000,
).fit(X_train, y_train)

# Score the baseline on the held-out test part.
y_test_pred = log_simpl.predict(X_test)
print(f'f1_score на тестовом наборе: {metrics.f1_score(y_test, y_test_pred):.2f}')

f1_score на тестовом наборе: 0.78


<center><i>Случайный лес</i></center>

In [88]:
# Baseline: random forest with default hyperparameters.
rf_simpl = ensemble.RandomForestClassifier(random_state=42)
rf_simpl.fit(X_train, y_train)

# Only the test-set prediction is reported, so no training-set prediction is computed.
y_test_pred = rf_simpl.predict(X_test)
print(f'f1_score на тестовом наборе: {metrics.f1_score(y_test, y_test_pred):.2f}')

f1_score на тестовом наборе: 0.80


### GridSearchCV

<center><i>Логистическая регрессия</i></center>

In [89]:
param_grid = {'penalty': [None, 'l2'] ,
              'solver': ['lbfgs', 'newton-cg', 'saga'],
              'C': [0.001, 0.01, 0.1, 0.3, 0.5, 0.7, 0.9, 1, 5]
              }
grid_search = GridSearchCV(
    estimator=linear_model.LogisticRegression(
        random_state=42,
        max_iter=1000
    ), 
    param_grid=param_grid, 
    cv=10, 
    n_jobs = -1
)  
%time grid_search.fit(X_train, y_train) 

y_test_pred = grid_search.predict(X_test)
print(f'f1_score на тестовом наборе: {metrics.f1_score(y_test, y_test_pred):.2f}')
print(f"Наилучшие значения гиперпараметров: {grid_search.best_params_}")

CPU times: total: 17.9 s
Wall time: 38min 21s
f1_score на тестовом наборе: 0.78
Наилучшие значения гиперпараметров: {'C': 0.1, 'penalty': 'l2', 'solver': 'newton-cg'}


<center><i>Случайный лес</i></center>

In [90]:
param_grid = {'n_estimators': [100, 200, 300, 400],
              'min_samples_leaf': [2, 5, 10, 20],
              'max_depth': [3, 5, 10, 15, 20]
              }

grid_search = GridSearchCV(
    estimator=ensemble.RandomForestClassifier(random_state=42), 
    param_grid=param_grid, 
    cv=10, 
    n_jobs = -1
)  
%time grid_search.fit(X_train, y_train) 

y_test_pred = grid_search.predict(X_test)
print(f'f1_score на тестовом наборе: {metrics.f1_score(y_test, y_test_pred):.2f}')
print(f"Наилучшие значения гиперпараметров: {grid_search.best_params_}")

CPU times: total: 9.3 s
Wall time: 7min 32s
f1_score на тестовом наборе: 0.81
Наилучшие значения гиперпараметров: {'max_depth': 15, 'min_samples_leaf': 2, 'n_estimators': 200}


**Результаты по этапу GridSearchCV:**
1. Логистическая регрессия (f1=0.78, время=38min 21s)
2. Случайный лес (f1=0.81, время=7min 32s)

*Лучший результат: метод GridSearchCV, алгоритм случайный лес (f1=0.81, время=7min 32s)*

### RandomizedSearchCV

<center><i>Логистическая регрессия</i></center>

In [109]:
param_distributions = {'penalty': [None, 'l2'] ,
              'solver': ['lbfgs', 'newton-cg', 'sag', 'saga'],
               'C': list(np.linspace(0.01, 10, 20, dtype=float))}
            
random_search = RandomizedSearchCV(
    estimator=linear_model.LogisticRegression(
        random_state=42,
        max_iter=1000
    ), 
    param_distributions=param_distributions, 
    cv=10, 
    n_iter = 10, 
    n_jobs = -1
)  
%time random_search.fit(X_train, y_train) 

y_test_pred = random_search.predict(X_test)

print(f'f1_score на тестовом наборе: {metrics.f1_score(y_test, y_test_pred):.2f}')
print(f"Наилучшие значения гиперпараметров: {random_search.best_params_}")

CPU times: total: 4.78 s
Wall time: 8min 18s
f1_score на тестовом наборе: 0.77
Наилучшие значения гиперпараметров: {'solver': 'lbfgs', 'penalty': 'l2', 'C': 0.5357894736842106}


<center><i>Случайный лес</i></center>

In [92]:
param_distributions = {'n_estimators': list(np.linspace(100, 500, 5, dtype=int)),
              'min_samples_leaf': list(np.linspace(2, 20, 5, dtype=int)),
              'max_depth': list(np.linspace(2, 20, 5, dtype=int))}
            
random_search = RandomizedSearchCV(
    estimator=ensemble.RandomForestClassifier(random_state=42),
    param_distributions=param_distributions, 
    cv=10, 
    n_iter = 10, 
    n_jobs = -1
)  
%time random_search.fit(X_train, y_train) 

y_test_pred = random_search.predict(X_test)

print(f'f1_score на тестовом наборе: {metrics.f1_score(y_test, y_test_pred):.2f}')
print(f"Наилучшие значения гиперпараметров: {random_search.best_params_}")

CPU times: total: 2.61 s
Wall time: 1min 19s
f1_score на тестовом наборе: 0.79
Наилучшие значения гиперпараметров: {'n_estimators': 300, 'min_samples_leaf': 11, 'max_depth': 20}


**Результаты по этапу RandomizedSearchCV:**
1. Логистическая регрессия (f1=0.77, время=8min 18s)
2. Случайный лес (f1=0.79, время=1min 19s)

*Лучший результат на текущий момент (с учётом предыдущих этапов): метод GridSearchCV, алгоритм «случайный лес» (f1=0.81, время=7min 32s)*

### Hyperopt

<center><i>Логистическая регрессия</i></center>

In [93]:
def hyperopt_lr(params, cv=10, X=X_train, y=y_train):
    """Hyperopt objective for logistic regression.

    Args:
        params: sampled point of the search space with the keys
            'penalty', 'solver' and 'C'.
        cv: number of cross-validation folds.
        X: feature matrix used for cross-validation.
        y: target values used for cross-validation.

    Returns:
        The negated mean cross-validated F1 score (hyperopt minimises the
        objective, so the sign is flipped).
    """
    estimator = linear_model.LogisticRegression(
        random_state=42,
        max_iter=1000,
        penalty=params['penalty'],
        solver=params['solver'],
        C=float(params['C']),
    )

    fold_scores = cross_val_score(estimator, X, y, cv=cv, scoring="f1", n_jobs=-1)

    return -fold_scores.mean()

In [94]:
# Search space for hyperopt_lr: penalty and solver are categorical choices,
# C is drawn uniformly from [0.001, 10].
# NOTE(review): the label passed to hp.uniform is a Cyrillic 'С', not the Latin 'C'
# used as the dict key. fmin reports its result under the label, so the result dict
# has to be indexed with that same Cyrillic character.
space={'penalty': hp.choice('penalty', [None, 'l2']),
       'solver': hp.choice('solver', ['lbfgs', 'newton-cg', 'sag', 'saga']),
       'C': hp.uniform('С', 0.001, 10)}

In [95]:
%%time

trials = Trials() # используется для логирования результатов

best=fmin(hyperopt_lr, # наша функция 
          space=space, # пространство гиперпараметров
          algo=tpe.suggest, # алгоритм оптимизации, установлен по умолчанию, задавать необязательно
          max_evals=20, # максимальное количество итераций
          trials=trials, # логирование результатов
          rstate=np.random.default_rng(42)# фиксируем для повторяемости результата
         )

print(f"Наилучшие значения гиперпараметров {best}")

100%|██████████| 20/20 [19:37<00:00, 58.87s/trial, best loss: -0.7784555979100054] 
Наилучшие значения гиперпараметров {'penalty': 1, 'solver': 3, 'С': 0.643804346232183}
CPU times: total: 6 s
Wall time: 19min 37s


In [96]:
# fmin returns hp.choice parameters as option indices keyed by their hyperopt labels.
# space_eval converts that result back into the values of the search space, keyed by
# the space's own dict keys ('penalty', 'solver', 'C'), so no option list has to be
# repeated here. `space` must still be the logistic-regression search space at this point.
best_params = hyperopt.space_eval(space, best)

model = linear_model.LogisticRegression(random_state=42,
                                        max_iter=1000,
                                        **best_params)

model.fit(X_train, y_train)

y_test_pred = model.predict(X_test)
print(f'f1_score на тестовом наборе: {metrics.f1_score(y_test, y_test_pred):.2f}')

f1_score на тестовом наборе: 0.77


<center><i>Случайный лес</i></center>

In [97]:
def hyperopt_rf(params, cv=10, X=X_train, y=y_train):
    """Hyperopt objective for a random forest.

    Args:
        params: sampled point of the search space with the keys
            'n_estimators', 'max_depth' and 'min_samples_leaf'; each value
            is cast to int before it is passed to the estimator.
        cv: number of cross-validation folds.
        X: feature matrix used for cross-validation.
        y: target values used for cross-validation.

    Returns:
        The negated mean cross-validated F1 score (hyperopt minimises the
        objective, so the sign is flipped).
    """
    forest = ensemble.RandomForestClassifier(
        n_estimators=int(params['n_estimators']),
        max_depth=int(params['max_depth']),
        min_samples_leaf=int(params['min_samples_leaf']),
        random_state=42,
    )

    fold_scores = cross_val_score(forest, X, y, cv=cv, scoring="f1", n_jobs=-1)

    return -fold_scores.mean()

In [98]:
# Search space for hyperopt_rf: every parameter is a quantised uniform draw
# (step 10 for the number of trees, step 1 for depth and leaf size).
space = dict(
    n_estimators=hp.quniform('n_estimators', 100, 500, 10),
    max_depth=hp.quniform('max_depth', 2, 20, 1),
    min_samples_leaf=hp.quniform('min_samples_leaf', 2, 20, 1),
)

In [99]:
%%time

trials = Trials() # используется для логирования результатов

best=fmin(hyperopt_rf, # наша функция 
          space=space, # пространство гиперпараметров
          algo=tpe.suggest, # алгоритм оптимизации, установлен по умолчанию, задавать необязательно
          max_evals=20, # максимальное количество итераций
          trials=trials, # логирование результатов
          rstate=np.random.default_rng(42)# фиксируем для повторяемости результата
         )

print(f"Наилучшие значения гиперпараметров {best}")

100%|██████████| 20/20 [02:30<00:00,  7.52s/trial, best loss: -0.8197052715279278]
Наилучшие значения гиперпараметров {'max_depth': 13.0, 'min_samples_leaf': 3.0, 'n_estimators': 160.0}
CPU times: total: 766 ms
Wall time: 2min 30s


In [100]:
# Refit the forest with the best configuration found by fmin.
# The search returns the values as floats, so each one is cast to int.
forest_params = {
    name: int(best[name])
    for name in ('n_estimators', 'max_depth', 'min_samples_leaf')
}
model = ensemble.RandomForestClassifier(random_state=42, **forest_params)

model.fit(X_train, y_train)

y_test_pred = model.predict(X_test)
print(f'f1_score на тестовом наборе: {metrics.f1_score(y_test, y_test_pred):.2f}')

f1_score на тестовом наборе: 0.78


**Результаты по этапу Hyperopt:**
1. Логистическая регрессия (f1=0.77, время=19min 37s)
2. Случайный лес (f1=0.78, время=2min 30s)

*Лучший результат на текущий момент (с учётом предыдущих этапов): метод GridSearchCV, алгоритм «случайный лес» (f1=0.81, время=7min 32s)*

### Optuna

<center><i>Логистическая регрессия</i></center>

In [101]:
def optuna_lr(trial):
    """Optuna objective for logistic regression.

    Samples 'penalty', 'solver' and 'C' from the trial and returns the mean
    F1 score of a 10-fold cross-validation on X_train / y_train.
    """
    # Search space: two categorical parameters and C on a 0.5-step grid in [0.01, 9.51].
    sampled = {
        'penalty': trial.suggest_categorical('penalty', [None, 'l2']),
        'solver': trial.suggest_categorical('solver', ['lbfgs', 'newton-cg', 'sag', 'saga']),
        'C': trial.suggest_float('C', 0.01, 9.51, step=0.5, log=False),
    }

    estimator = linear_model.LogisticRegression(random_state=42, max_iter=1000, **sampled)

    # Evaluate the configuration with cross-validation.
    fold_scores = cross_val_score(estimator, X_train, y_train, cv=10, scoring="f1", n_jobs=-1)

    return fold_scores.mean()

In [102]:
%%time
# cоздаем объект исследования
# можем напрямую указать, что нам необходимо максимизировать метрику direction="maximize"
study = optuna.create_study(study_name="LogisticRegression", direction="maximize")
# ищем лучшую комбинацию гиперпараметров n_trials раз
study.optimize(optuna_lr, n_trials=20)

[I 2024-09-12 14:31:16,510] A new study created in memory with name: LogisticRegression
[I 2024-09-12 14:31:24,652] Trial 0 finished with value: 0.7661796382476469 and parameters: {'penalty': 'l2', 'solver': 'lbfgs', 'C': 9.01}. Best is trial 0 with value: 0.7661796382476469.
[I 2024-09-12 14:34:13,994] Trial 1 finished with value: 0.7739884788543067 and parameters: {'penalty': 'l2', 'solver': 'saga', 'C': 4.51}. Best is trial 1 with value: 0.7739884788543067.
[I 2024-09-12 14:34:20,240] Trial 2 finished with value: 0.772455070828275 and parameters: {'penalty': 'l2', 'solver': 'newton-cg', 'C': 5.01}. Best is trial 1 with value: 0.7739884788543067.
[I 2024-09-12 14:36:59,010] Trial 3 finished with value: 0.7739233542671953 and parameters: {'penalty': 'l2', 'solver': 'saga', 'C': 7.01}. Best is trial 1 with value: 0.7739884788543067.
[I 2024-09-12 14:39:18,018] Trial 4 finished with value: 0.7659153272207446 and parameters: {'penalty': None, 'solver': 'sag', 'C': 5.01}. Best is trial 1 

CPU times: total: 4.55 s
Wall time: 21min 3s


In [103]:
# Report the best trial. study.best_value is what the objective returned, i.e. the mean
# cross-validated F1 score, not a score measured on the whole training set.
print(f"Наилучшие значения гиперпараметров {study.best_params}")
print(f"Средний f1_score на кросс-валидации: {study.best_value:.2f}")

Наилучшие значения гиперпараметров {'penalty': 'l2', 'solver': 'sag', 'C': 0.01}
f1_score на обучающем наборе: 0.79


In [104]:
# Refit logistic regression on the whole training part with the best trial's parameters.
model = linear_model.LogisticRegression(
    random_state=42,
    max_iter=1000,
    **study.best_params,
).fit(X_train, y_train)

y_test_pred = model.predict(X_test)
print(f'f1_score на тестовом наборе: {metrics.f1_score(y_test, y_test_pred):.2f}')

f1_score на тестовом наборе: 0.78


<center><i>Случайный лес</i></center>

In [105]:
def optuna_rf(trial):
    """Optuna objective for a random forest.

    Samples 'n_estimators', 'max_depth' and 'min_samples_leaf' from the trial
    and returns the mean F1 score of a 10-fold cross-validation on
    X_train / y_train.
    """
    # Search space: integer ranges (step 10 for the number of trees, step 1 otherwise).
    forest_params = dict(
        n_estimators=trial.suggest_int('n_estimators', 100, 500, step=10),
        max_depth=trial.suggest_int('max_depth', 2, 20),
        min_samples_leaf=trial.suggest_int('min_samples_leaf', 2, 20),
    )

    forest = ensemble.RandomForestClassifier(random_state=42, **forest_params)

    # Evaluate the configuration with cross-validation.
    fold_scores = cross_val_score(forest, X_train, y_train, cv=10, scoring="f1", n_jobs=-1)

    return fold_scores.mean()

In [106]:
%%time
# cоздаем объект исследования
# можем напрямую указать, что нам необходимо максимизировать метрику direction="maximize"
study = optuna.create_study(study_name="RandomForestClassifier", direction="maximize")
# ищем лучшую комбинацию гиперпараметров n_trials раз
study.optimize(optuna_rf, n_trials=20)

[I 2024-09-12 14:52:22,764] A new study created in memory with name: RandomForestClassifier
[I 2024-09-12 14:52:37,617] Trial 0 finished with value: 0.8068744943887489 and parameters: {'n_estimators': 460, 'max_depth': 9, 'min_samples_leaf': 3}. Best is trial 0 with value: 0.8068744943887489.
[I 2024-09-12 14:52:50,402] Trial 1 finished with value: 0.809525662734025 and parameters: {'n_estimators': 410, 'max_depth': 9, 'min_samples_leaf': 2}. Best is trial 1 with value: 0.809525662734025.
[I 2024-09-12 14:52:55,408] Trial 2 finished with value: 0.7518623865535313 and parameters: {'n_estimators': 390, 'max_depth': 3, 'min_samples_leaf': 7}. Best is trial 1 with value: 0.809525662734025.
[I 2024-09-12 14:53:02,905] Trial 3 finished with value: 0.8147106559202737 and parameters: {'n_estimators': 210, 'max_depth': 17, 'min_samples_leaf': 8}. Best is trial 3 with value: 0.8147106559202737.
[I 2024-09-12 14:53:10,991] Trial 4 finished with value: 0.7954341671389766 and parameters: {'n_estima

CPU times: total: 1.28 s
Wall time: 3min 48s


In [107]:
# Report the best trial. study.best_value is what the objective returned, i.e. the mean
# cross-validated F1 score, not a score measured on the whole training set.
print(f"Наилучшие значения гиперпараметров {study.best_params}")
print(f"Средний f1_score на кросс-валидации: {study.best_value:.2f}")

Наилучшие значения гиперпараметров {'n_estimators': 480, 'max_depth': 19, 'min_samples_leaf': 2}
f1_score на обучающем наборе: 0.82


In [108]:
# Refit the forest on the whole training part with the best trial's parameters.
model = ensemble.RandomForestClassifier(
    random_state=42,
    **study.best_params,
).fit(X_train, y_train)

y_test_pred = model.predict(X_test)
print(f'f1_score на тестовом наборе: {metrics.f1_score(y_test, y_test_pred):.2f}')

f1_score на тестовом наборе: 0.80


**Результаты по этапу Optuna:**
1. Логистическая регрессия (f1=0.78, время=21min 3s)
2. Случайный лес (f1=0.80, время=3min 48s)

*Лучший результат на текущий момент (с учётом предыдущих этапов): метод GridSearchCV, алгоритм «случайный лес» (f1=0.81, время=7min 32s)*

### Общий вывод

***Лучший результат показал метод GridSearchCV, алгоритм случайный лес (f1=0.81, время=7min 32s)***