### Домашнее задание №13 по теме «Улучшение качества модели»

In [1]:
import pandas as pd
import numpy as np
import pickle
import time
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report, f1_score, mean_squared_error
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_validate, GridSearchCV, RandomizedSearchCV, KFold
import optuna
from optuna.visualization import plot_optimization_history
from optuna.visualization import plot_param_importances

import warnings
# Silence every warning for the rest of the notebook (this also hides library deprecation notices).
warnings.filterwarnings('ignore')
# Render floats with three decimals in all DataFrame output below.
pd.options.display.float_format = '{:.3f}'.format

#### Задание
_________
__Цель:__  
Применить на практике алгоритмы по автоматической оптимизации параметров моделей машинного обучения.
Описание задания:
В домашнем задании нужно решить задачу классификации наличия болезни сердца у пациентов наиболее эффективно. Данные для обучения моделей необходимо загрузить самостоятельно с сайта. Целевая переменная - наличие болезни сердца (`HeartDisease`). Она принимает значения 0 или 1 в зависимости от отсутствия или наличия болезни соответственно. Подробное описание признаков можно прочесть в описании датасета на сайте. Для выполнения работы не обязательно вникать в медицинские показатели.
__Этапы работы:__

1. Получите данные и загрузите их в рабочую среду.  
2. Подготовьте датасет к обучению моделей:  
    a. Категориальные переменные переведите в цифровые значения. Можно использовать `pd.get_dummies, preprocessing.LabelEncoder.`
    Старайтесь не использовать для этой задачи циклы.
3. Разделите выборку на обучающее и тестовое подмножество. 80% данных оставить на обучающее множество, 20% на тестовое.  
4. Обучите модель логистической регрессии с параметрами по умолчанию.  
5. Подсчитайте основные метрики модели. Используйте следующие метрики и функцию:  
   `cross_validate(…, cv=10, scoring=['accuracy', 'recall', 'precision', 'f1'])`
6. Оптимизируйте 3-4 параметра модели:  
    a. Используйте `GridSearchCV.`  
    b. Используйте `RandomizedSearchCV.`  
    c. \*Добавьте в п. 6b 2-5 моделей классификации и вариации их параметров.  
    d. Повторите п. 5 после каждого итогового изменения параметров.  
7. Сформулируйте выводы по проделанной работе:  
    a. Сравните метрики построенных моделей.  
    b. \*Сравните с полученными результатами в домашнем задании по теме __«Ансамблирование».__
____
Для получения зачета по этому домашнему заданию минимально необходимо:
1. обучить одну модель классификации;
2. оптимизировать параметры, используя метод из п. 6a; 
3. вывести значения метрик. 

#### 1. Загружаем данные

In [2]:
# Load the raw heart-disease dataset; the path is relative to the notebook's working directory.
heart_raw = pd.read_csv('Downloads/Heart.csv')

In [3]:
# Work on a copy so the raw frame stays untouched by the preprocessing below.
heart = heart_raw.copy()

#### 2. Предобработка

In [4]:
# Encode the two binary categorical columns as 0/1 by hand.
# Values that are not keys of the mapping dict would become NaN.
heart['Sex'] = heart['Sex'].map({'M': 0, 'F': 1})
heart['ExerciseAngina'] = heart['ExerciseAngina'].map({'N': 0, 'Y': 1})

In [5]:
# One-hot encode every remaining object-dtype column
# (per the preview below: ChestPainType, RestingECG and ST_Slope).
heart = pd.get_dummies(data=heart, columns=heart.select_dtypes('object').columns).copy()

In [6]:
heart.head()

Unnamed: 0,Age,Sex,RestingBP,Cholesterol,FastingBS,MaxHR,ExerciseAngina,Oldpeak,HeartDisease,ChestPainType_ASY,ChestPainType_ATA,ChestPainType_NAP,ChestPainType_TA,RestingECG_LVH,RestingECG_Normal,RestingECG_ST,ST_Slope_Down,ST_Slope_Flat,ST_Slope_Up
0,40,0,140,289,0,172,0,0.0,0,0,1,0,0,0,1,0,0,0,1
1,49,1,160,180,0,156,0,1.0,1,0,0,1,0,0,1,0,0,1,0
2,37,0,130,283,0,98,0,0.0,0,0,1,0,0,0,0,1,0,0,1
3,48,1,138,214,0,108,1,1.5,1,1,0,0,0,0,1,0,0,1,0
4,54,0,150,195,0,122,0,0.0,0,0,0,1,0,0,1,0,0,0,1


#### 3. Разделение выборки

In [7]:
# Separate the feature columns from the target column.
X, y = heart.drop(columns='HeartDisease', axis=1),heart['HeartDisease']

# Hold out 20% of the rows as a test set; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train.shape, X_test.shape

((734, 18), (184, 18))

#### 4&5 Обучение логистической регрессии с параметрами по умолчанию и подсчет метрик

In [8]:
# Accumulator: one column of metrics is appended per evaluated model.
res = pd.DataFrame()
# Metric names that get_metrics passes to cross_validate (read there as a global).
scoring=['accuracy', 'recall', 'precision','f1']

In [9]:
def get_metrics(df, model=None, model_name='LR_default'):
    """Cross-validate ``model`` on the training split of ``df`` and return the mean scores.

    The frame is split 80/20 with a fixed seed; only the training part is
    used. The scores come from a shuffled 5-fold ``cross_validate`` run over
    the metrics listed in the module-level ``scoring`` list, with train
    scores included.

    Parameters
    ----------
    df : pandas.DataFrame
        Data containing the feature columns and a ``'HeartDisease'`` target column.
    model : estimator, optional
        Estimator to evaluate. A fresh ``LogisticRegression()`` is created
        when omitted, so no estimator instance is shared between calls.
    model_name : str
        Label stored under the ``'model'`` key of the result.

    Returns
    -------
    pandas.DataFrame
        Single-column frame whose index is ``'model'`` followed by the
        ``cross_validate`` result keys; the values are the label and the
        per-key means over the folds.
    """
    if model is None:
        model = LogisticRegression()

    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    X, y = df.drop(columns='HeartDisease'), df['HeartDisease']
    # Same split as the notebook-level one (test_size=0.2, random_state=42);
    # the held-out part is intentionally not used here.
    X_train, _, y_train, _ = train_test_split(X, y, test_size=0.2, random_state=42)

    # Kept from the original so the estimator passed in by the caller ends up
    # fitted on the training split; the cross-validation below does not rely on it.
    model.fit(X_train, y_train)

    cv_scores = pd.DataFrame(
        cross_validate(model, X_train, y_train, cv=kf, scoring=scoring,
                       return_train_score=True)
    )
    # pd.concat replaces Series.append, which was removed in pandas 2.0.
    summary = pd.concat([pd.Series({'model': model_name}), cv_scores.mean()])
    return pd.DataFrame(summary)

In [10]:
# Baseline: logistic regression with default parameters.
res_0 = get_metrics(heart)
# Each model is stored as a column; transpose for display so that models become rows.
res = pd.concat([res, res_0], axis=1)
res.T.set_index('model')

Unnamed: 0_level_0,fit_time,score_time,test_accuracy,train_accuracy,test_recall,train_recall,test_precision,train_precision,test_f1,train_f1
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
LR_default,0.013,0.001,0.861,0.874,0.886,0.901,0.863,0.873,0.874,0.887


#### 6. Оптимизируйте 3-4 параметра модели

Если в поиск (`GridSearchCV` / `RandomizedSearchCV`) передать не одну метрику, а список метрик и при этом указать `refit=False`, атрибуты `best_params_`, `best_estimator_`, `best_score_` и т.п. будут недоступны. Соберем функцию, которая достанет нужные данные из словаря `cv_results_` и выведет их в виде датафрейма

In [11]:
def bestparams(df, target, model, params, search='G', rs=10):
    """Run a multi-metric hyper-parameter search and return the best row.

    Because several metrics are scored with ``refit=False``, the search
    object exposes no ``best_params_``; the winner is therefore picked by
    hand from ``cv_results_``.

    Parameters
    ----------
    df : pandas.DataFrame
        Data containing the feature columns and the ``target`` column.
    target : str
        Name of the target column.
    model : estimator
        Estimator handed to the search.
    params : dict
        Parameter grid / distributions handed to the search.
    search : str
        ``'G'`` selects ``GridSearchCV``; any other value selects
        ``RandomizedSearchCV``.
    rs : int
        Seed of the 80/20 train/test split (the search itself is not seeded).

    Returns
    -------
    pandas.DataFrame
        One row indexed by the winning parameter dict, with the ``mean_*``
        columns of ``cv_results_`` renamed to drop the ``mean_`` prefix.
        The winner has the highest mean test accuracy; ties are broken by
        lower fit time, then lower score time.
    """
    scoring = ['accuracy', 'recall', 'precision', 'f1']
    # Fix: split the frame that was passed in, not the notebook-level `heart`.
    X, y = df.drop(columns=target), df[target]
    X_train, _, y_train, _ = train_test_split(X, y, test_size=0.2, random_state=rs)

    search_cls = GridSearchCV if search == 'G' else RandomizedSearchCV
    searcher = search_cls(model, params, cv=10, scoring=scoring, refit=False)
    searcher.fit(X_train, y_train)

    # Rows containing NaN are dropped (presumably parameter combinations
    # whose fit failed and therefore have no score).
    report = pd.DataFrame(searcher.cv_results_).set_index('params').dropna()
    report = report[[c for c in report.columns if 'mean' in c]]
    # 'mean_test_accuracy' -> 'test_accuracy', 'mean_fit_time' -> 'fit_time', ...
    report.columns = ['_'.join(c.split('_')[1:]) for c in report.columns]
    return report.sort_values(
        by=['test_accuracy', 'fit_time', 'score_time'],
        ascending=[False, True, True],
    ).head(1)

##### a. GridSearchCV

In [12]:
# Search space for LogisticRegression: 2 penalties x 3 solvers x 10 l1_ratio values = 60 combinations.
# NOTE(review): per the scikit-learn docs l1_ratio only applies to penalty='elasticnet', and
# 'newton-cg'/'lbfgs' do not support 'l1' - so many combinations presumably fail or duplicate
# each other (the tuned rows further down match LR_default exactly). Confirm and consider tuning C.
param_grid_lr = {'penalty': ['l1', 'l2'], 
                 'solver' : ['newton-cg', 'lbfgs', 'liblinear'],
                 'l1_ratio': np.around(np.linspace(0, 1, 10), 5)}

In [13]:
# Exhaustive grid search ('G') over param_grid_lr; yields the single best row by mean test accuracy.
res_GS = bestparams(heart, 'HeartDisease', LogisticRegression(), param_grid_lr, 'G')
res_GS

Unnamed: 0_level_0,fit_time,score_time,test_accuracy,test_recall,test_precision,test_f1
params,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
"{'l1_ratio': 1.0, 'penalty': 'l2', 'solver': 'lbfgs'}",0.013,0.001,0.867,0.9,0.869,0.883


##### b. RandomizedSearchCV

In [14]:
# Randomized search over the same space (any search flag other than 'G' selects RandomizedSearchCV).
res_RS = bestparams(heart, 'HeartDisease', LogisticRegression(), param_grid_lr, 'R')
res_RS

Unnamed: 0_level_0,fit_time,score_time,test_accuracy,test_recall,test_precision,test_f1
params,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
"{'solver': 'lbfgs', 'penalty': 'l2', 'l1_ratio': 1.0}",0.013,0.001,0.867,0.9,0.869,0.883


In [15]:
# The index of res_GS holds the winning parameter dict, so it is unpacked straight into the estimator.
res_1 = get_metrics(heart, LogisticRegression(**res_GS.index[0]), model_name='LR_GS' )
res = pd.concat([res, res_1], axis=1)
res.T.set_index('model')

Unnamed: 0_level_0,fit_time,score_time,test_accuracy,train_accuracy,test_recall,train_recall,test_precision,train_precision,test_f1,train_f1
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
LR_default,0.013,0.001,0.861,0.874,0.886,0.901,0.863,0.873,0.874,0.887
LR_GS,0.013,0.001,0.861,0.874,0.886,0.901,0.863,0.873,0.874,0.887


In [16]:
# Same re-scoring for the parameters chosen by the randomized search.
res_2 = get_metrics(heart, LogisticRegression(**res_RS.index[0]), model_name='LR_RS' )
res = pd.concat([res, res_2], axis=1)
res.T.set_index('model')

Unnamed: 0_level_0,fit_time,score_time,test_accuracy,train_accuracy,test_recall,train_recall,test_precision,train_precision,test_f1,train_f1
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
LR_default,0.013,0.001,0.861,0.874,0.886,0.901,0.863,0.873,0.874,0.887
LR_GS,0.013,0.001,0.861,0.874,0.886,0.901,0.863,0.873,0.874,0.887
LR_RS,0.013,0.001,0.861,0.874,0.886,0.901,0.863,0.873,0.874,0.887


##### c. *Добавьте 2-5 моделей классификации и вариации их параметров.

In [17]:
%%time
# Candidate estimators with their randomized-search spaces:
# short display name, estimator instance and the value lists to sample from.
models=[
    {'name':'RF',
     'model': RandomForestClassifier(), 'params':{
         'n_estimators':[10, 25, 50, 100, 150, 200], 
         'criterion':['gini', 'entropy'], 
         'max_depth':[3, 5, 7, 9, 11], 
         # NOTE(review): this grid starts at 0.0, which is presumably not a valid max_samples fraction - confirm.
         'max_samples': np.linspace(0, 1, 10),
         'random_state': [1, 10, 42]}},
    {'name':'DT',
     'model': DecisionTreeClassifier(), 'params':{
         'criterion':['gini', 'entropy'], 
         'max_depth':[3, 5, 7, 9, 11], 
         'random_state': [1, 10, 42]}},
    {'name':'GB',
     'model': GradientBoostingClassifier(), 'params':{
         # X is the notebook-level feature frame, so every feature count from 1 to all columns is a candidate.
         'max_features':list(range(1, X.shape[1] + 1)),
#          'learning_rate':np.linspace(1e-4, 0.3, 30),
         'max_depth': list(range(3, 15)),
         'n_estimators':[10, 25, 50, 100, 150, 200, 500], 
         'min_samples_leaf':[1, 2 ,3, 5, 7, 9, 11], 
         'subsample': np.linspace(0.5, 1., 6),
         'loss': ['deviance', 'exponential'],
         'criterion': ['friedman_mse', 'squared_error', 'mse', 'mae'],
         'random_state': [1, 10, 42]}},

]
# Collects the best row of every model, indexed by the model's short name.
res_multilpex = pd.DataFrame()
for v in models:
    # 'R' selects RandomizedSearchCV inside bestparams; rs=1 seeds its train/test split.
    res_ = bestparams(heart, 'HeartDisease', v['model'], v['params'], 'R', rs=1)
    # Move the params dict out of the index into a regular column.
    res_.reset_index(inplace=True)
    res_.set_index([[v['name']] * len(res_)], inplace=True) # the vector of index labels must be as long as the frame
    res_multilpex = pd.concat([res_multilpex, res_], axis=0)

CPU times: user 23 s, sys: 101 ms, total: 23.1 s
Wall time: 23.1 s


In [18]:
res_multilpex

Unnamed: 0,params,fit_time,score_time,test_accuracy,test_recall,test_precision,test_f1
RF,"{'random_state': 10, 'n_estimators': 100, 'max...",0.051,0.004,0.864,0.909,0.853,0.879
DT,"{'random_state': 42, 'max_depth': 5, 'criterio...",0.001,0.001,0.818,0.819,0.843,0.829
GB,"{'subsample': 0.8, 'random_state': 10, 'n_esti...",0.048,0.002,0.864,0.887,0.868,0.876


Словарь с параметрами получаем через `loc`

In [19]:
res_multilpex.loc['RF', 'params']

{'random_state': 10,
 'n_estimators': 100,
 'max_samples': 0.8888888888888888,
 'max_depth': 5,
 'criterion': 'gini'}

In [20]:
# Re-score the random forest with the parameters picked by the randomized search.
res_3 = get_metrics(
    heart, 
    RandomForestClassifier(**res_multilpex.loc['RF', 'params']), 
    model_name='RF_RS')
res = pd.concat([res, res_3], axis=1)
res.T.set_index('model')

Unnamed: 0_level_0,fit_time,score_time,test_accuracy,train_accuracy,test_recall,train_recall,test_precision,train_precision,test_f1,train_f1
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
LR_default,0.013,0.001,0.861,0.874,0.886,0.901,0.863,0.873,0.874,0.887
LR_GS,0.013,0.001,0.861,0.874,0.886,0.901,0.863,0.873,0.874,0.887
LR_RS,0.013,0.001,0.861,0.874,0.886,0.901,0.863,0.873,0.874,0.887
RF_RS,0.05,0.004,0.856,0.911,0.896,0.948,0.848,0.896,0.871,0.921


In [21]:
# Re-score the decision tree with the parameters picked by the randomized search.
res_4 = get_metrics(
    heart, 
    DecisionTreeClassifier(**res_multilpex.loc['DT', 'params']), 
    model_name='DTC_RS')
res = pd.concat([res, res_4], axis=1)
res.T.set_index('model')

Unnamed: 0_level_0,fit_time,score_time,test_accuracy,train_accuracy,test_recall,train_recall,test_precision,train_precision,test_f1,train_f1
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
LR_default,0.013,0.001,0.861,0.874,0.886,0.901,0.863,0.873,0.874,0.887
LR_GS,0.013,0.001,0.861,0.874,0.886,0.901,0.863,0.873,0.874,0.887
LR_RS,0.013,0.001,0.861,0.874,0.886,0.901,0.863,0.873,0.874,0.887
RF_RS,0.05,0.004,0.856,0.911,0.896,0.948,0.848,0.896,0.871,0.921
DTC_RS,0.001,0.001,0.811,0.901,0.816,0.909,0.833,0.91,0.824,0.909


In [22]:
# Re-score gradient boosting with the parameters picked by the randomized search.
res_5 = get_metrics(
    heart, 
    GradientBoostingClassifier(**res_multilpex.loc['GB', 'params']), 
    model_name='GBC_RS')
res = pd.concat([res, res_5], axis=1)
res.T.set_index('model')

Unnamed: 0_level_0,fit_time,score_time,test_accuracy,train_accuracy,test_recall,train_recall,test_precision,train_precision,test_f1,train_f1
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
LR_default,0.013,0.001,0.861,0.874,0.886,0.901,0.863,0.873,0.874,0.887
LR_GS,0.013,0.001,0.861,0.874,0.886,0.901,0.863,0.873,0.874,0.887
LR_RS,0.013,0.001,0.861,0.874,0.886,0.901,0.863,0.873,0.874,0.887
RF_RS,0.05,0.004,0.856,0.911,0.896,0.948,0.848,0.896,0.871,0.921
DTC_RS,0.001,0.001,0.811,0.901,0.816,0.909,0.833,0.91,0.824,0.909
GBC_RS,0.041,0.002,0.858,0.985,0.899,0.996,0.851,0.977,0.874,0.986


#### 7. Выводы:
_____
В целом с помощью `GridSearchCV` и `RandomizedSearchCV` можно упорядочить поиск гиперпараметров, но перебрать все возможные варианты всё равно не получится: это перегружает вычислительные мощности.

__\*Сравнение с полученными результатами в домашнем задании по теме «Ансамблирование».__

\*Для проверяющих: этот словарь был выгружен в папку /Downloads в дз 8.09 и загружен сюда из неё.

In [23]:
# Load the metrics dictionary exported by the earlier ensembling homework.
# NOTE: pickle.load can execute arbitrary code - only open pickle files you produced yourself.
with open('Downloads/dz_8.09_result.pkl', 'rb') as f:
    dz_09_result = pickle.load(f)

In [24]:
# Wrap the loaded object in a DataFrame (one column per model, judging by the transposed view below).
dz_09_result = pd.DataFrame(dz_09_result)

In [25]:
dz_09_result.T

Unnamed: 0,duration,accuracy_train,accuracy_test,f1_train,f1_test
dtc_min_leaf=6,0.0051 sec,0.91,0.837,0.917,0.853
rfc_max_feat=6_max_sampl=0.6,0.0632 sec,0.973,0.891,0.975,0.906
rfc_default,0.1295 sec,1.0,0.897,1.0,0.91
baggcl_max_feat=6_max_sampl=0.6,0.0621 sec,0.896,0.886,0.908,0.902
stack_svc,2.2529 sec,0.963,0.897,0.967,0.912
stack_baggcl,0.9188 sec,0.936,0.88,0.942,0.896
gbc_max_feat=10_min_leaf=5,0.0224 sec,0.891,0.891,0.903,0.905


In [26]:
res_multilpex.loc['GB', 'params']

{'subsample': 0.8,
 'random_state': 10,
 'n_estimators': 50,
 'min_samples_leaf': 11,
 'max_features': 9,
 'max_depth': 14,
 'loss': 'exponential',
 'criterion': 'mse'}

В __дз№9__ лучшей моделью оказалась `GradientBoostingClassifier` в таблице `gbc_max_feat=10_min_leaf=5` , где параметры были подобраны вручную и без кроссвалидации: 
* random_state=42,  
* n_estimators=25, 
* min_samples_leaf=5, 
* max_features=10, 
 
Модель показывала высокие метрики `accuracy` и `f1` на тренировочной и тестовой выборках.  
____   
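В этом дз для модели `GradientBoostingClassifier` `RandomizedSearchCV` предложил параметры (перечисленные ниже значения не совпадают с выводом ячейки выше: поиск запускается без `random_state`, поэтому набор меняется от запуска к запуску):  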
* random_state=1,  
* n_estimators=50,
* min_samples_leaf=9, 
* max_features=6, 

Метрики `accuracy` и `f1` на тренировочной выборке ощутимо ниже.  
___Это странно!___

## Бонус "OPTUNA"  
Попробуем подобрать параметры `GradientBoostingClassifier` с помощью `OPTUNA`

In [27]:
def objective(trial, X, y, cv, scoring):
    """
    Optuna objective: mean cross-validated score of a GradientBoostingClassifier.

    suggest_int / suggest_float are given a start, an end and a step;
    suggest_categorical is used wherever a plain list of choices is passed.

    Parameters
    ----------
    trial : optuna trial object used to draw the hyper-parameters.
    X, y : features and target that are cross-validated.
    cv : number of shuffled KFold splits (the fold seed is fixed to 1).
    scoring : scorer forwarded to cross_validate; the result is read back
        from the "test_score" key.

    Returns
    -------
    Mean of the per-fold "test_score" values.
    """
    folds = KFold(n_splits=cv, shuffle=True, random_state=1)
    # A max_features value larger than the number of columns would raise an error,
    # so the choices stop at X.shape[1].
    feature_counts = list(range(1, X.shape[1] + 1))

    # The suggestions are drawn in the same order as before.
    # learning_rate is not part of the search space.
    gbc_params = dict(
        n_estimators=trial.suggest_categorical("n_estimators", [10, 25, 50, 100, 150, 200, 500]),
        max_depth=trial.suggest_int("max_depth", 3, 15, step=1),
        min_samples_leaf=trial.suggest_categorical("min_samples_leaf", [1, 2, 3, 5, 7, 9, 11]),
        subsample=trial.suggest_float("subsample", 0.5, 1., step=0.1),
        max_features=trial.suggest_categorical("max_features", feature_counts),
        random_state=trial.suggest_categorical("random_state", [1, 10, 42]),
        loss=trial.suggest_categorical("loss", ['deviance', 'exponential']),
        criterion=trial.suggest_categorical("criterion", ['friedman_mse', 'squared_error', 'mse', 'mae']),
    )

    cv_result = cross_validate(
        GradientBoostingClassifier(**gbc_params), X, y, cv=folds, scoring=scoring
    )
    return cv_result["test_score"].mean()

In [28]:
# Create an in-memory study that maximises the value returned by the objective.
study = optuna.create_study(
    direction = "maximize", 
    # The sampler is created without a seed, so repeated runs can propose different trials.
    sampler=optuna.samplers.TPESampler(),
    # NOTE(review): objective() never calls trial.report()/trial.should_prune(),
    # so this pruner presumably has no effect - confirm.
    pruner=optuna.pruners.HyperbandPruner(),
    study_name='GradientBoostingClassifier-Hyperparameter-Tuning')
# Bind the training split, 5 folds and the accuracy scorer, leaving `trial` as the only argument.
func = lambda trial: objective(trial, X_train, y_train, 5, scoring = "accuracy")

[32m[I 2023-03-11 17:25:20,828][0m A new study created in memory with name: GradientBoostingClassifier-Hyperparameter-Tuning[0m


In [29]:
%%time
# Run 130 trials of the bound objective with a progress bar;
# gc_after_trial=True requests garbage collection after every trial.
study.optimize(
    func, 
    n_trials = 130, 
    show_progress_bar=True, 
    gc_after_trial=True)

  0%|          | 0/130 [00:00<?, ?it/s]

[32m[I 2023-03-11 17:25:20,968][0m Trial 0 finished with value: 0.8569285248345914 and parameters: {'n_estimators': 25, 'max_depth': 11, 'min_samples_leaf': 11, 'subsample': 0.9, 'max_features': 8, 'random_state': 42, 'loss': 'deviance', 'criterion': 'friedman_mse'}. Best is trial 0 with value: 0.8569285248345914.[0m
[32m[I 2023-03-11 17:25:21,705][0m Trial 1 finished with value: 0.8610381138756873 and parameters: {'n_estimators': 150, 'max_depth': 15, 'min_samples_leaf': 9, 'subsample': 0.9, 'max_features': 7, 'random_state': 10, 'loss': 'deviance', 'criterion': 'mse'}. Best is trial 1 with value: 0.8610381138756873.[0m
[32m[I 2023-03-11 17:25:22,170][0m Trial 2 finished with value: 0.856937843630603 and parameters: {'n_estimators': 100, 'max_depth': 12, 'min_samples_leaf': 7, 'subsample': 0.6, 'max_features': 9, 'random_state': 1, 'loss': 'exponential', 'criterion': 'squared_error'}. Best is trial 1 with value: 0.8610381138756873.[0m
[32m[I 2023-03-11 17:25:23,719][0m Tria

[32m[I 2023-03-11 17:26:05,977][0m Trial 27 finished with value: 0.8556145745969619 and parameters: {'n_estimators': 150, 'max_depth': 6, 'min_samples_leaf': 1, 'subsample': 0.7, 'max_features': 16, 'random_state': 10, 'loss': 'exponential', 'criterion': 'mse'}. Best is trial 23 with value: 0.8773926008759668.[0m
[32m[I 2023-03-11 17:26:06,171][0m Trial 28 finished with value: 0.8705805609915199 and parameters: {'n_estimators': 25, 'max_depth': 7, 'min_samples_leaf': 1, 'subsample': 0.8, 'max_features': 6, 'random_state': 1, 'loss': 'deviance', 'criterion': 'mse'}. Best is trial 23 with value: 0.8773926008759668.[0m
[32m[I 2023-03-11 17:26:06,372][0m Trial 29 finished with value: 0.8501351225421676 and parameters: {'n_estimators': 25, 'max_depth': 8, 'min_samples_leaf': 2, 'subsample': 0.8, 'max_features': 6, 'random_state': 42, 'loss': 'deviance', 'criterion': 'friedman_mse'}. Best is trial 23 with value: 0.8773926008759668.[0m
[32m[I 2023-03-11 17:26:06,510][0m Trial 30 fi

[32m[I 2023-03-11 17:26:14,901][0m Trial 53 finished with value: 0.8760413754542913 and parameters: {'n_estimators': 100, 'max_depth': 4, 'min_samples_leaf': 2, 'subsample': 0.9, 'max_features': 1, 'random_state': 10, 'loss': 'exponential', 'criterion': 'friedman_mse'}. Best is trial 23 with value: 0.8773926008759668.[0m
[32m[I 2023-03-11 17:26:15,243][0m Trial 54 finished with value: 0.8555772994129158 and parameters: {'n_estimators': 100, 'max_depth': 4, 'min_samples_leaf': 2, 'subsample': 0.9, 'max_features': 11, 'random_state': 10, 'loss': 'exponential', 'criterion': 'friedman_mse'}. Best is trial 23 with value: 0.8773926008759668.[0m
[32m[I 2023-03-11 17:26:15,450][0m Trial 55 finished with value: 0.8664989283384585 and parameters: {'n_estimators': 100, 'max_depth': 4, 'min_samples_leaf': 2, 'subsample': 1.0, 'max_features': 1, 'random_state': 10, 'loss': 'exponential', 'criterion': 'friedman_mse'}. Best is trial 23 with value: 0.8773926008759668.[0m
[32m[I 2023-03-11 17

[32m[I 2023-03-11 17:26:25,839][0m Trial 79 finished with value: 0.8501258037461561 and parameters: {'n_estimators': 150, 'max_depth': 6, 'min_samples_leaf': 3, 'subsample': 1.0, 'max_features': 1, 'random_state': 42, 'loss': 'exponential', 'criterion': 'friedman_mse'}. Best is trial 23 with value: 0.8773926008759668.[0m
[32m[I 2023-03-11 17:26:26,931][0m Trial 80 finished with value: 0.856937843630603 and parameters: {'n_estimators': 100, 'max_depth': 15, 'min_samples_leaf': 2, 'subsample': 0.7, 'max_features': 9, 'random_state': 42, 'loss': 'deviance', 'criterion': 'squared_error'}. Best is trial 23 with value: 0.8773926008759668.[0m
[32m[I 2023-03-11 17:26:27,084][0m Trial 81 finished with value: 0.8732830118348709 and parameters: {'n_estimators': 100, 'max_depth': 3, 'min_samples_leaf': 2, 'subsample': 1.0, 'max_features': 1, 'random_state': 42, 'loss': 'exponential', 'criterion': 'friedman_mse'}. Best is trial 23 with value: 0.8773926008759668.[0m
[32m[I 2023-03-11 17:26

[32m[I 2023-03-11 17:26:38,574][0m Trial 105 finished with value: 0.8583170254403131 and parameters: {'n_estimators': 150, 'max_depth': 5, 'min_samples_leaf': 7, 'subsample': 0.7, 'max_features': 1, 'random_state': 42, 'loss': 'deviance', 'criterion': 'mse'}. Best is trial 23 with value: 0.8773926008759668.[0m
[32m[I 2023-03-11 17:26:38,830][0m Trial 106 finished with value: 0.8637498835150497 and parameters: {'n_estimators': 150, 'max_depth': 4, 'min_samples_leaf': 7, 'subsample': 0.7, 'max_features': 1, 'random_state': 42, 'loss': 'deviance', 'criterion': 'mse'}. Best is trial 23 with value: 0.8773926008759668.[0m
[32m[I 2023-03-11 17:26:39,049][0m Trial 107 finished with value: 0.8732923306308825 and parameters: {'n_estimators': 150, 'max_depth': 3, 'min_samples_leaf': 7, 'subsample': 0.6, 'max_features': 1, 'random_state': 1, 'loss': 'deviance', 'criterion': 'mse'}. Best is trial 23 with value: 0.8773926008759668.[0m
[32m[I 2023-03-11 17:26:39,389][0m Trial 108 finished 

In [30]:
study.best_value

0.8773926008759668

In [31]:
study.best_params

{'n_estimators': 50,
 'max_depth': 5,
 'min_samples_leaf': 1,
 'subsample': 0.7,
 'max_features': 4,
 'random_state': 1,
 'loss': 'deviance',
 'criterion': 'mse'}

In [32]:
plot_param_importances(study)

In [33]:
# Re-score gradient boosting with the best parameters found by the Optuna study.
res_6 = get_metrics(
    heart, 
    GradientBoostingClassifier(**study.best_params), 
    model_name='GBC_Optuna')
res = pd.concat([res, res_6], axis=1)
res.T.set_index('model')

Unnamed: 0_level_0,fit_time,score_time,test_accuracy,train_accuracy,test_recall,train_recall,test_precision,train_precision,test_f1,train_f1
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
LR_default,0.013,0.001,0.861,0.874,0.886,0.901,0.863,0.873,0.874,0.887
LR_GS,0.013,0.001,0.861,0.874,0.886,0.901,0.863,0.873,0.874,0.887
LR_RS,0.013,0.001,0.861,0.874,0.886,0.901,0.863,0.873,0.874,0.887
RF_RS,0.05,0.004,0.856,0.911,0.896,0.948,0.848,0.896,0.871,0.921
DTC_RS,0.001,0.001,0.811,0.901,0.816,0.909,0.833,0.91,0.824,0.909
GBC_RS,0.041,0.002,0.858,0.985,0.899,0.996,0.851,0.977,0.874,0.986
GBC_Optuna,0.027,0.002,0.875,0.973,0.902,0.983,0.874,0.968,0.886,0.976


In [34]:
def get_score_df(df, target, random_seed=1, model=None, title='dtc_default'):
    """Fit ``model`` on an 80/20 hold-out split and report accuracy, F1 and duration.

    Parameters
    ----------
    df : pandas.DataFrame
        Data containing the feature columns and the ``target`` column.
    target : str
        Name of the target column.
    random_seed : int
        Seed of the train/test split and of the default estimator.
    model : estimator, optional
        Estimator to fit; a ``DecisionTreeClassifier(random_state=random_seed)``
        is used when omitted.
    title : str
        Name of the single result column.

    Returns
    -------
    pandas.DataFrame
        One column named ``title`` with the rows ``duration``,
        ``accuracy_train``, ``accuracy_test``, ``f1_train`` and ``f1_test``.
        ``duration`` is a string such as ``'0.0319 sec'`` covering the split,
        the fit and both predictions.
    """
    # perf_counter is monotonic, so the measured duration cannot be skewed
    # by system clock adjustments the way time.time() can.
    started = time.perf_counter()
    if model is None:
        model = DecisionTreeClassifier(random_state=random_seed)

    X = df.drop(columns=[target])
    y = df[target]
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=random_seed)

    # Plain arrays are passed so the estimator is fitted without feature names.
    model.fit(X_train.values, y_train)
    y_pred_train = model.predict(X_train.values)
    y_pred_test = model.predict(X_test.values)
    duration = f"{round(time.perf_counter() - started, 4)} sec"

    metrix = {
        'duration': duration,
        'accuracy_train': model.score(X_train.values, y_train),
        'accuracy_test': model.score(X_test.values, y_test),
        # f1_score expects (y_true, y_pred); the original passed them swapped.
        'f1_train': f1_score(y_train, y_pred_train),
        'f1_test': f1_score(y_test, y_pred_test),
    }
    return pd.DataFrame.from_dict(metrix, orient='index', columns=[title])

In [35]:
%%time
# Score the randomized-search winner on a single hold-out split so that it can be
# placed next to the rows loaded from the ensembling homework.
# NOTE: this rebinds res_6, which earlier held the GBC_Optuna cross-validation
# metrics (those were already merged into `res`).
res_6 = get_score_df(
    heart, 
    'HeartDisease', 
    model=GradientBoostingClassifier(**res_multilpex.loc['GB', 'params']), 
    title='GBC_RS')
dz_09_result = pd.concat([dz_09_result, res_6], axis=1)

CPU times: user 60.1 ms, sys: 987 µs, total: 61.1 ms
Wall time: 60.5 ms


In [36]:
%%time
# Same hold-out scoring for the parameters found by the Optuna study.
res_7 = get_score_df(
    heart, 
    'HeartDisease', 
    model=GradientBoostingClassifier(**study.best_params), 
    title='GBC_Optuna')
dz_09_result = pd.concat([dz_09_result, res_7], axis=1)

CPU times: user 34.4 ms, sys: 915 µs, total: 35.3 ms
Wall time: 34.5 ms


In [37]:
dz_09_result.T

Unnamed: 0,duration,accuracy_train,accuracy_test,f1_train,f1_test
dtc_min_leaf=6,0.0051 sec,0.91,0.837,0.917,0.853
rfc_max_feat=6_max_sampl=0.6,0.0632 sec,0.973,0.891,0.975,0.906
rfc_default,0.1295 sec,1.0,0.897,1.0,0.91
baggcl_max_feat=6_max_sampl=0.6,0.0621 sec,0.896,0.886,0.908,0.902
stack_svc,2.2529 sec,0.963,0.897,0.967,0.912
stack_baggcl,0.9188 sec,0.936,0.88,0.942,0.896
gbc_max_feat=10_min_leaf=5,0.0224 sec,0.891,0.891,0.903,0.905
GBC_RS,0.0571 sec,0.981,0.902,0.982,0.919
GBC_Optuna,0.0319 sec,0.958,0.908,0.961,0.923


В итоге все методы очень долгие и дают разные наборы гиперпараметров. Безусловно, метрики на тесте высокие, и модели не особо переучиваются. Однако если надо перебрать очень много параметров, выбирать стоит между `OPTUNA` и `RandomizedSearchCV`, ибо `GridSearchCV`, скорее всего, перегрузит процессор. Особенность `OPTUNA` в том, что он выполняет не более `n_trials` испытаний и выбирает лучшее из них. Не факт, что при перезапуске кода лучший результат будет таким же. Чтобы найти действительно самый лучший результат, надо ставить примерно 10000 `n_trials` на сутки и сохранять словарь с параметрами в pickle.