In [12]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns 

from sklearn import linear_model 
from sklearn import tree
from sklearn import ensemble
from sklearn import metrics
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

from sklearn.model_selection import cross_val_score
import hyperopt
from hyperopt import hp, fmin, tpe, Trials

import optuna

%matplotlib inline
plt.style.use('seaborn')

import warnings
warnings.filterwarnings('ignore')

In [13]:
data = pd.read_csv('_train_sem09 (1).csv')
data.head()

Unnamed: 0,Activity,D1,D2,D3,D4,D5,D6,D7,D8,D9,...,D1767,D1768,D1769,D1770,D1771,D1772,D1773,D1774,D1775,D1776
0,1,0.0,0.497009,0.1,0.0,0.132956,0.678031,0.273166,0.585445,0.743663,...,0,0,0,0,0,0,0,0,0,0
1,1,0.366667,0.606291,0.05,0.0,0.111209,0.803455,0.106105,0.411754,0.836582,...,1,1,1,1,0,1,0,0,1,0
2,1,0.0333,0.480124,0.0,0.0,0.209791,0.61035,0.356453,0.51772,0.679051,...,0,0,0,0,0,0,0,0,0,0
3,1,0.0,0.538825,0.0,0.5,0.196344,0.72423,0.235606,0.288764,0.80511,...,0,0,0,0,0,0,0,0,0,0
4,0,0.1,0.517794,0.0,0.0,0.494734,0.781422,0.154361,0.303809,0.812646,...,0,0,0,0,0,0,0,0,0,0


In [14]:
TARGET = 'Activity'

X = data.drop(TARGET, axis=1)
y = data[TARGET]

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state = 42, test_size = 0.2)

In [16]:
scaler = preprocessing.MinMaxScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [17]:
log_reg = linear_model.LogisticRegression(max_iter = 1000, random_state=42)
log_reg.fit(X_train_scaled, y_train)

y_train_pred = log_reg.predict(X_train_scaled)
y_test_pred = log_reg.predict(X_test_scaled)

print('f1_score for Train with DecisionTreeClassifier: {:.2f}'.format(metrics.f1_score(y_train, y_train_pred)))
print('f1_score for Train with LogisticRegression: {:.2f}'.format(metrics.f1_score(y_test, y_test_pred)))

f1_score for Train with DecisionTreeClassifier: 0.90
f1_score for Train with LogisticRegression: 0.78


In [18]:
rf = ensemble.RandomForestClassifier(random_state=42)

rf.fit(X_train_scaled, y_train)

y_train_pred = rf.predict(X_train_scaled)
y_test_pred = rf.predict(X_test_scaled)

print('f1_score for Train with RandomForestClassifier: {:.2f}'.format(metrics.f1_score(y_train, y_train_pred)))
print('f1_score for Test with RandomForestClassifier: {:.2f}'.format(metrics.f1_score(y_test, y_test_pred)))

f1_score for Train with RandomForestClassifier: 1.00
f1_score for Test with RandomForestClassifier: 0.80


GridSearchCV

In [20]:
param_grid = {'penalty': ['l2', 'none'],
              'solver': ['lbfgs', 'sag'],
              'C': list(np.linspace(0.01, 1, 3, dtype=float))}

grid_search = GridSearchCV(
    estimator=linear_model.LogisticRegression(
        random_state=42, 
        max_iter=1000 
    ), 
    param_grid=param_grid, 
    cv=5, 
    n_jobs = -1
)  

%time grid_search.fit(X_train_scaled, y_train) 
y_test_pred = grid_search.predict(X_test_scaled)

print('f1_score на тестовом наборе: {:.2f}'.format(metrics.f1_score(y_test, y_test_pred)))
print("Наилучшие значения гиперпараметров: {}".format(grid_search.best_params_))

print("Наилучшая модель:\n{}".format(grid_search.best_estimator_))
print("Наилучшее значение точности при кросс-валидаци: {:.2f}".format(grid_search.best_score_))

CPU times: total: 1.75 s
Wall time: 14min 45s
f1_score на тестовом наборе: 0.78
Наилучшие значения гиперпараметров: {'C': 0.01, 'penalty': 'l2', 'solver': 'lbfgs'}
Наилучшая модель:
LogisticRegression(C=0.01, max_iter=1000, random_state=42)
Наилучшее значение точности при кросс-валидаци: 0.76


In [21]:
param_grid = {'n_estimators': list(range(100, 500, 100)),
              'min_samples_leaf': [5],
              'max_depth': list(np.linspace(20, 40, 5, dtype=int)),
              'criterion': ['gini', 'entropy']
              }
            
grid_search_forest = GridSearchCV(
    estimator=rf, 
    param_grid=param_grid, 
    cv=5, 
    n_jobs = -1
)  
%time grid_search_forest.fit(X_train_scaled, y_train) 
y_train_pred = grid_search_forest.predict(X_train_scaled)
print('f1_score на обучающем наборе: {:.2f}'.format(metrics.f1_score(y_train, y_train_pred)))

y_test_pred = grid_search_forest.predict(X_test_scaled)
print('f1_score на тестовом наборе: {:.2f}'.format(metrics.f1_score(y_test, y_test_pred)))
print("Наилучшие значения гиперпараметров: {}".format(grid_search_forest.best_params_))


CPU times: total: 8.09 s
Wall time: 9min 5s
f1_score на обучающем наборе: 0.95
f1_score на тестовом наборе: 0.81
Наилучшие значения гиперпараметров: {'criterion': 'entropy', 'max_depth': 25, 'min_samples_leaf': 5, 'n_estimators': 300}


RandomizedSearchCV

In [22]:

param_distributions = {'penalty': ['l2', 'none'],
                       'solver': ['lbfgs', 'sag'],
                       'C': list(np.linspace(0.01, 1, 4, dtype=float))}
            
random_search = RandomizedSearchCV(
    estimator=log_reg, 
    param_distributions=param_distributions, 
    cv=5, 
    n_iter = 10, 
    n_jobs = -1
)  
%time random_search.fit(X_train_scaled, y_train) 
y_test_pred = random_search.predict(X_test_scaled)
print('f1_score на тестовом наборе: {:.2f}'.format(metrics.f1_score(y_test, y_test_pred)))
print("Наилучшие значения гиперпараметров: {}".format(random_search.best_params_))

CPU times: total: 6.3 s
Wall time: 9min 32s
f1_score на тестовом наборе: 0.78
Наилучшие значения гиперпараметров: {'solver': 'sag', 'penalty': 'l2', 'C': 0.01}


In [23]:
param_distributions = {'n_estimators': list(range(80, 200, 30)),
                       'min_samples_leaf': [5],
                       'max_depth': list(np.linspace(20, 40, 10, dtype=int)),
                       'criterion': ['gini', 'entropy']
}
            
random_search_forest = RandomizedSearchCV(
    estimator=rf, 
    param_distributions=param_distributions, 
    cv=5,
    n_iter = 10, 
    n_jobs = -1
)  
%time random_search_forest.fit(X_train_scaled, y_train) 
y_train_pred = random_search_forest.predict(X_train_scaled)
print('f1_score на обучающем наборе: {:.2f}'.format(metrics.f1_score(y_train, y_train_pred)))

y_test_pred = random_search_forest.predict(X_test_scaled)
print('f1_score на тестовом наборе: {:.2f}'.format(metrics.f1_score(y_test, y_test_pred)))
print("Наилучшие значения гиперпараметров: {}".format(random_search_forest.best_params_))

CPU times: total: 3.55 s
Wall time: 1min 1s
f1_score на обучающем наборе: 0.95
f1_score на тестовом наборе: 0.79
Наилучшие значения гиперпараметров: {'n_estimators': 140, 'min_samples_leaf': 5, 'max_depth': 22, 'criterion': 'entropy'}


In [24]:
print("Наилучшая модель:\n{}".format(random_search_forest.best_estimator_))
print("Наилучшее значение точности при кросс-валидаци: {:.2f}".format(random_search_forest.best_score_))

Наилучшая модель:
RandomForestClassifier(criterion='entropy', max_depth=22, min_samples_leaf=5,
                       n_estimators=140, random_state=42)
Наилучшее значение точности при кросс-валидаци: 0.80


Hyperopt x LogisticRegression

In [31]:
space={'penalty': hp.choice('penalty', ('l2','none' )),
       'solver' : hp.choice('solver', ('lbfgs', 'sag')),
       'C': hp.uniform('C', 0.01, 1)
    }


random_state = 42
def hyperopt_lg(params, cv=5, X=X_train_scaled, y=y_train, random_state=random_state):
    params = {'penalty': params['penalty'], 
              'solver': params['solver'], 
             'C': float(params['C'])
              }
  
    model = linear_model.LogisticRegression(**params, random_state=random_state)

    model.fit(X, y)
    score = cross_val_score(model, X, y, cv=cv, scoring="f1", n_jobs=-1).mean()

    # метрику необходимо минимизировать, поэтому ставим знак минус
    return -score


%%time 
trials = Trials() # используется для логирования результатов

best=fmin(hyperopt_lg, # наша функция 
          space=space, # пространство гиперпараметров
          algo=tpe.suggest, # алгоритм оптимизации, установлен по умолчанию, задавать необязательно
          max_evals=20, # максимальное количество итераций
          trials=trials, # логирование результатов
          rstate=np.random.default_rng(random_state)
         )

print("Наилучшие значения гиперпараметров {}".format(best))

CPU times: total: 0 ns
Wall time: 0 ns
100%|██████████| 20/20 [06:46<00:00, 20.33s/trial, best loss: -0.7910017999615568]
Наилучшие значения гиперпараметров {'C': 0.05084775379720359, 'penalty': 0, 'solver': 0}


In [32]:

# рассчитаем точность для тестовой выборки
model = linear_model.LogisticRegression(
    random_state=random_state, 
    penalty=hyperopt.space_eval(space, best)['penalty'],
    solver=hyperopt.space_eval(space, best)['solver'],
    C=float(best['C'])
)
model.fit(X_train_scaled, y_train)
y_train_pred = model.predict(X_train_scaled)
print('f1_score на обучающем наборе: {:.2f}'.format(metrics.f1_score(y_train, y_train_pred)))
y_test_pred = model.predict(X_test_scaled)
print('f1_score на тестовом наборе: {:.2f}'.format(metrics.f1_score(y_test, y_test_pred)))

f1_score на обучающем наборе: 0.85
f1_score на тестовом наборе: 0.78


Hyperopt x RandomForestClassifier

In [37]:
space={'n_estimators': hp.quniform('n_estimators', 100, 200, 1),
       'max_depth' : hp.quniform('max_depth', 15, 26, 1),
       'min_samples_leaf': hp.quniform('min_samples_leaf', 2, 10, 1)
      }

random_state = 42
def hyperopt_rf(params, cv=5, X=X_train_scaled, y=y_train, random_state=random_state):
    # функция получает комбинацию гиперпараметров в "params"
    params = {'n_estimators': int(params['n_estimators']), 
              'max_depth': int(params['max_depth']), 
             'min_samples_leaf': int(params['min_samples_leaf'])
              }
  
    # используем эту комбинацию для построения модели
    model = ensemble.RandomForestClassifier(**params, random_state=random_state)

    # обучаем модель
    model.fit(X, y)
    score = cross_val_score(model, X, y, cv=cv, scoring="f1", n_jobs=-1).mean()

    # метрику необходимо минимизировать, поэтому ставим знак минус
    return -score

In [38]:
%%time

trials = Trials() # используется для логирования результатов

best=fmin(hyperopt_rf, # наша функция 
          space=space, # пространство гиперпараметров
          algo=tpe.suggest, # алгоритм оптимизации, установлен по умолчанию, задавать необязательно
          max_evals=20, # максимальное количество итераций
          trials=trials, # логирование результатов
          rstate=np.random.default_rng(random_state)
        #   rstate=np.random.RandomState(random_state)# фиксируем для повторяемости результата
         )
print("Наилучшие значения гиперпараметров {}".format(best))

CPU times: total: 0 ns
Wall time: 0 ns
100%|██████████| 20/20 [06:13<00:00, 18.69s/trial, best loss: -0.8151059500953352]
Наилучшие значения гиперпараметров {'max_depth': 18.0, 'min_samples_leaf': 2.0, 'n_estimators': 103.0}


In [None]:
# рассчитаем точность для тестовой выборки
model = ensemble.RandomForestClassifier(
    random_state=random_state, 
    n_estimators=int(best['n_estimators']),
    max_depth=int(best['max_depth']),
    min_samples_leaf=int(best['min_samples_leaf']),
    criterion=hyperopt.space_eval['criterion']
)
model.fit(X_train_scaled, y_train)
y_train_pred = model.predict(X_train_scaled)
print('f1_score на обучающем наборе: {:.2f}'.format(metrics.f1_score(y_train, y_train_pred)))
y_test_pred = model.predict(X_test_scaled)
print('f1_score на тестовом наборе: {:.2f}'.format(metrics.f1_score(y_test, y_test_pred)))

f1_score на обучающем наборе: 0.99
f1_score на тестовом наборе: 0.81


Optuna x LogisticRegression

In [None]:
def optuna_lg(trial):

  penalty = trial.suggest_categorical('penalty', ['l2', 'none'])
  solver = trial.suggest_categorical('solver',  ['lbfgs', 'sag'])
  C_ = trial.suggest_float('C', 0.01, 1)

  # создаем модель
  model = linear_model.LogisticRegression(penalty=penalty,
                                          solver=solver,
                                          C=C_,
                                          random_state=random_state)
  # обучаем модель
  model.fit(X_train_scaled, y_train)
  score = metrics.f1_score(y_train, model.predict(X_train_scaled))

  return score
    

In [None]:
%%time
# cоздаем объект исследования
# можем напрямую указать, что нам необходимо максимизировать метрику direction="maximize"
study = optuna.create_study(study_name="LogisticRegression", direction="maximize")
# ищем лучшую комбинацию гиперпараметров n_trials раз
study.optimize(optuna_lg, n_trials=20)

[32m[I 2023-02-06 13:26:29,555][0m A new study created in memory with name: LogisticRegression[0m
[32m[I 2023-02-06 13:26:40,516][0m Trial 0 finished with value: 0.8946889226100152 and parameters: {'penalty': 'none', 'solver': 'sag', 'C': 0.58188923811319}. Best is trial 0 with value: 0.8946889226100152.[0m
[32m[I 2023-02-06 13:26:49,453][0m Trial 1 finished with value: 0.8861911987860395 and parameters: {'penalty': 'l2', 'solver': 'sag', 'C': 0.8065264240693678}. Best is trial 0 with value: 0.8946889226100152.[0m
[32m[I 2023-02-06 13:26:50,741][0m Trial 2 finished with value: 0.9169741697416974 and parameters: {'penalty': 'none', 'solver': 'lbfgs', 'C': 0.6427549112472186}. Best is trial 2 with value: 0.9169741697416974.[0m
[32m[I 2023-02-06 13:26:52,510][0m Trial 3 finished with value: 0.891317547055252 and parameters: {'penalty': 'l2', 'solver': 'lbfgs', 'C': 0.786388341871086}. Best is trial 2 with value: 0.9169741697416974.[0m
[32m[I 2023-02-06 13:26:53,662][0m Tr

CPU times: total: 1min 20s
Wall time: 1min 20s


In [None]:
# выводим результаты на обучающей выборке
print("Наилучшие значения гиперпараметров {}".format(study.best_params))
print("f1_score на обучающем наборе: {:.2f}".format(study.best_value))

Наилучшие значения гиперпараметров {'penalty': 'none', 'solver': 'lbfgs', 'C': 0.6427549112472186}
f1_score на обучающем наборе: 0.92


In [None]:
# рассчитаем точность для тестовой выборки

model = linear_model.LogisticRegression(**study.best_params,random_state=random_state, )
model.fit(X_train_scaled, y_train)
y_train_pred = model.predict(X_train_scaled)
y_test_pred = model.predict(X_test_scaled)
print('f1_score на тестовом наборе: {:.2f}'.format(metrics.f1_score(y_test, y_test_pred)))

f1_score на тестовом наборе: 0.75


Optuna x RandomForestClassifier

In [None]:
def optuna_rf(trial):
  # задаем пространства поиска гиперпараметров
  n_estimators = trial.suggest_int('n_estimators', 100, 200, 1)
  max_depth = trial.suggest_int('max_depth', 10, 30, 1)
  min_samples_leaf = trial.suggest_int('min_samples_leaf', 2, 10, 1)
  

  # создаем модель
  model = ensemble.RandomForestClassifier(n_estimators=n_estimators,
                                          max_depth=max_depth,
                                          min_samples_leaf=min_samples_leaf,
                                          random_state=random_state)
  # обучаем модель
  model.fit(X_train_scaled, y_train)
  score = metrics.f1_score(y_train, model.predict(X_train_scaled))

  return score
    

In [None]:
%%time
# cоздаем объект исследования
# можем напрямую указать, что нам необходимо максимизировать метрику direction="maximize"
study = optuna.create_study(study_name="RandomForestClassifier", direction="maximize")
# ищем лучшую комбинацию гиперпараметров n_trials раз
study.optimize(optuna_rf, n_trials=20)

[32m[I 2023-02-06 13:48:38,582][0m A new study created in memory with name: RandomForestClassifier[0m
[32m[I 2023-02-06 13:48:44,689][0m Trial 0 finished with value: 0.9087591240875912 and parameters: {'n_estimators': 113, 'max_depth': 24, 'min_samples_leaf': 8}. Best is trial 0 with value: 0.9087591240875912.[0m
[32m[I 2023-02-06 13:48:51,636][0m Trial 1 finished with value: 0.9448338921060652 and parameters: {'n_estimators': 154, 'max_depth': 23, 'min_samples_leaf': 5}. Best is trial 1 with value: 0.9448338921060652.[0m
[32m[I 2023-02-06 13:48:57,198][0m Trial 2 finished with value: 0.9731051344743277 and parameters: {'n_estimators': 194, 'max_depth': 20, 'min_samples_leaf': 3}. Best is trial 2 with value: 0.9731051344743277.[0m
[32m[I 2023-02-06 13:49:04,316][0m Trial 3 finished with value: 0.9172245891661593 and parameters: {'n_estimators': 194, 'max_depth': 16, 'min_samples_leaf': 7}. Best is trial 2 with value: 0.9731051344743277.[0m
[32m[I 2023-02-06 13:49:10,018

CPU times: total: 1min 26s
Wall time: 1min 40s


In [None]:
# выводим результаты на обучающей выборке
print("Наилучшие значения гиперпараметров {}".format(study.best_params))
print("f1_score на обучающем наборе: {:.2f}".format(study.best_value))

Наилучшие значения гиперпараметров {'n_estimators': 169, 'max_depth': 30, 'min_samples_leaf': 2}
f1_score на обучающем наборе: 0.99


In [None]:
# рассчитаем точность для тестовой выборки
model = ensemble.RandomForestClassifier(**study.best_params,random_state=random_state, )
model.fit(X_train_scaled, y_train)
y_train_pred = model.predict(X_train_scaled)
y_test_pred = model.predict(X_test_scaled)
print('f1_score на тестовом наборе: {:.2f}'.format(metrics.f1_score(y_test, y_test_pred)))

f1_score на тестовом наборе: 0.80
