## ML-7. Оптимизация гиперпараметров модели

In [26]:
import numpy as np
import pandas as pd

from sklearn import linear_model
from sklearn import ensemble
from sklearn import metrics
from sklearn.model_selection import train_test_split

from sklearn.model_selection import GridSearchCV

### Разведывательный анализ

In [3]:
# Load the training set from a zipped CSV; the path is relative to the notebook's working directory.
data = pd.read_csv('data/_train_sem09__1_.zip')

In [10]:
# Preview the first three rows to check the column layout.
data.head(3)

Unnamed: 0,Activity,D1,D2,D3,D4,D5,D6,D7,D8,D9,...,D1767,D1768,D1769,D1770,D1771,D1772,D1773,D1774,D1775,D1776
0,1,0.0,0.497009,0.1,0.0,0.132956,0.678031,0.273166,0.585445,0.743663,...,0,0,0,0,0,0,0,0,0,0
1,1,0.366667,0.606291,0.05,0.0,0.111209,0.803455,0.106105,0.411754,0.836582,...,1,1,1,1,0,1,0,0,1,0
2,1,0.0333,0.480124,0.0,0.0,0.209791,0.61035,0.356453,0.51772,0.679051,...,0,0,0,0,0,0,0,0,0,0


In [11]:
# Print the frame summary: number of rows and columns, dtypes and memory usage.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3751 entries, 0 to 3750
Columns: 1777 entries, Activity to D1776
dtypes: float64(942), int64(835)
memory usage: 50.9 MB


In [12]:
# Descriptive statistics for every column (include='all' does not filter columns by dtype).
data.describe(include='all')

Unnamed: 0,Activity,D1,D2,D3,D4,D5,D6,D7,D8,D9,...,D1767,D1768,D1769,D1770,D1771,D1772,D1773,D1774,D1775,D1776
count,3751.0,3751.0,3751.0,3751.0,3751.0,3751.0,3751.0,3751.0,3751.0,3751.0,...,3751.0,3751.0,3751.0,3751.0,3751.0,3751.0,3751.0,3751.0,3751.0,3751.0
mean,0.542255,0.076948,0.592436,0.068142,0.03899,0.212112,0.686653,0.274713,0.455133,0.749517,...,0.026926,0.014663,0.013863,0.021861,0.015196,0.016796,0.012263,0.01173,0.020261,0.011197
std,0.498278,0.079989,0.10586,0.078414,0.115885,0.102592,0.078702,0.090017,0.162731,0.071702,...,0.161889,0.120215,0.116938,0.146249,0.122348,0.128522,0.110074,0.107683,0.140911,0.105236
min,0.0,0.0,0.282128,0.0,0.0,0.00263,0.137873,0.00613,0.0,0.27559,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0333,0.517811,0.0,0.0,0.138118,0.625627,0.207374,0.378062,0.707339,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,1.0,0.0667,0.585989,0.05,0.0,0.190926,0.674037,0.277845,0.499942,0.738961,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,1.0,0.1,0.668395,0.1,0.0,0.261726,0.740663,0.335816,0.569962,0.788177,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,1.0,1.0,0.964381,0.95,1.0,1.0,0.994735,0.790831,0.98987,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [13]:
# Share of each target class (normalize=True returns proportions rather than raw counts).
data['Activity'].value_counts(normalize=True)

Activity
1    0.542255
0    0.457745
Name: proportion, dtype: float64

**Вывод:** классы целевого признака `Activity` почти сбалансированы (≈54 % против ≈46 %).

### Разбиение на выборки

In [14]:
# Separate the target column from the feature matrix.
y = data['Activity']
X = data.drop(columns=['Activity'])

In [17]:
# Hold out 20% of the rows for testing; stratify on the target so both splits
# keep the same class proportions, and fix the seed for a reproducible split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

### Обучение базовых моделей

*Логистическая регрессия*

In [23]:

# Baseline logistic regression: only the iteration cap and the seed are set explicitly.
log_simpl = linear_model.LogisticRegression(
    random_state=42,
    max_iter=1000,
).fit(X_train, y_train)

# Evaluate the baseline on the held-out split.
y_test_pred = log_simpl.predict(X_test)
print(f'f1_score на тестовом наборе: {metrics.f1_score(y_test, y_test_pred):.2f}')

f1_score на тестовом наборе: 0.78


*Случайный лес*

In [24]:
# Baseline random forest: only the seed is set explicitly.
rf_simpl = ensemble.RandomForestClassifier(random_state=42).fit(X_train, y_train)

# Training-split predictions; no cell visible in this section reads them.
y_train_pred = rf_simpl.predict(X_train)

# Evaluate the baseline on the held-out split.
y_test_pred = rf_simpl.predict(X_test)
print(f'f1_score на тестовом наборе: {metrics.f1_score(y_test, y_test_pred):.2f}')

f1_score на тестовом наборе: 0.80


### GridSearchCV

*Логистическая регрессия*

In [29]:
param_grid = {'penalty': ['l1', 'l2'] ,
              'solver': ['lbfgs', 'liblinear', 'sag', 'saga'],
              'C': [0.001, 0.01, 0.1, 0.3, 0.5, 0.7, 0.9, 1, 5]
              }
grid_search = GridSearchCV(
    estimator=linear_model.LogisticRegression(
        random_state=42,
        max_iter=1000
    ), 
    param_grid=param_grid, 
    cv=10, 
    n_jobs = -1
)  
%time 
grid_search.fit(X_train, y_train) 

y_test_pred = grid_search.predict(X_test)
print(f'f1_score на тестовом наборе: {metrics.f1_score(y_test, y_test_pred):.2f}')
print(f"Наилучшие значения гиперпараметров: {grid_search.best_params_}")

CPU times: total: 0 ns
Wall time: 0 ns


KeyboardInterrupt: 

In [28]:
param_grid = {'n_estimators': [100, 200, 300, 400],
              'min_samples_leaf': [2, 5, 10, 20],
              'max_depth': [3, 5, 10, 15, 20]
              }

grid_search = GridSearchCV(
    estimator=ensemble.RandomForestClassifier(random_state=42), 
    param_grid=param_grid, 
    cv=10, 
    n_jobs = -1
)  
%time 
grid_search.fit(X_train, y_train) 

y_test_pred = grid_search.predict(X_test)
print(f'f1_score на тестовом наборе: {metrics.f1_score(y_test, y_test_pred):.2f}')
print(f"Наилучшие значения гиперпараметров: {grid_search.best_params_}")

CPU times: total: 0 ns
Wall time: 0 ns
f1_score на тестовом наборе: 0.81
Наилучшие значения гиперпараметров: {'max_depth': 15, 'min_samples_leaf': 2, 'n_estimators': 200}
