### Домашнее задание №13 по теме «Улучшение качества модели»

In [1]:
import pandas as pd
import numpy as np
import pickle
import time
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_validate, GridSearchCV, RandomizedSearchCV, KFold

import optuna
from optuna.visualization import plot_optimization_history
from optuna.visualization import plot_param_importances
from optuna.visualization import plot_parallel_coordinate
from optuna.visualization import plot_contour

import warnings
warnings.filterwarnings('ignore')
pd.options.display.float_format = '{:.3f}'.format

#### Задание
_________
__Цель:__  
Применить на практике алгоритмы по автоматической оптимизации параметров моделей машинного обучения.
Описание задания:
В домашнем задании нужно решить задачу классификации наличия болезни сердца у пациентов наиболее эффективно. Данные для обучения моделей необходимо загрузить самостоятельно с сайта. Целевая переменная - наличие болезни сердца (`HeartDisease`). Она принимает значения 0 или 1 в зависимости от отсутствия или наличия болезни соответственно. Подробное описание признаков можно прочесть в описании датасета на сайте. Для выполнения работы не обязательно вникать в медицинские показатели.
__Этапы работы:__. 

1. Получите данные и загрузите их в рабочую среду.  
2. Подготовьте датасет к обучению моделей:  
    a. Категориальные переменные переведите в цифровые значения. Можно использовать `pd.get_dummies, preprocessing.LabelEncoder.`
    Старайтесь не использовать для этой задачи циклы.
3. Разделите выборку на обучающее и тестовое подмножество. 80% данных оставить на обучающее множество, 20% на тестовое.  
4. Обучите модель логистической регрессии с параметрами по умолчанию.  
5. Подсчитайте основные метрики модели. Используйте следующие метрики и функцию:  
   `cross_validate(…, cv=10, scoring=[‘accuracy’,‘recall’,‘precision’,‘f1’])`
6. Оптимизируйте 3-4 параметра модели:  
    a. Используйте `GridSearchCV.`  
    b. Используйте `RandomizedSearchCV.`  
    c. \*Добавьте в п. 6b 2-5 моделей классификации и вариации их параметров.  
    d. Повторите п. 5 после каждого итогового изменения параметров.  
7. Сформулируйте выводы по проделанной работе:  
    a. Сравните метрики построенных моделей.  
    b. \*Сравните с полученными результатами в домашнем задании по теме __«Ансамблирование».__
____
Для получения зачета по этому домашнему заданию минимально необходимо:
1. обучить одну модель классификации;
2. оптимизировать параметры, используя метод из п. 6a; 
3. вывести значения метрик. 

#### 1. Загружаем данные

In [2]:
heart_raw = pd.read_csv('Downloads/Heart.csv')

In [3]:
heart = heart_raw.copy()

#### 2. Предобработка

In [4]:
heart['Sex'] = heart['Sex'].map({'M': 0, 'F': 1})
heart['ExerciseAngina'] = heart['ExerciseAngina'].map({'N': 0, 'Y': 1})

In [5]:
heart = pd.get_dummies(data=heart, columns=heart.select_dtypes('object').columns).copy()

In [6]:
heart.head()

Unnamed: 0,Age,Sex,RestingBP,Cholesterol,FastingBS,MaxHR,ExerciseAngina,Oldpeak,HeartDisease,ChestPainType_ASY,ChestPainType_ATA,ChestPainType_NAP,ChestPainType_TA,RestingECG_LVH,RestingECG_Normal,RestingECG_ST,ST_Slope_Down,ST_Slope_Flat,ST_Slope_Up
0,40,0,140,289,0,172,0,0.0,0,0,1,0,0,0,1,0,0,0,1
1,49,1,160,180,0,156,0,1.0,1,0,0,1,0,0,1,0,0,1,0
2,37,0,130,283,0,98,0,0.0,0,0,1,0,0,0,0,1,0,0,1
3,48,1,138,214,0,108,1,1.5,1,1,0,0,0,0,1,0,0,1,0
4,54,0,150,195,0,122,0,0.0,0,0,0,1,0,0,1,0,0,0,1


#### 3. Разделение выборки

In [7]:
X, y = heart.drop(columns='HeartDisease', axis=1),heart['HeartDisease']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train.shape, X_test.shape

((734, 18), (184, 18))

#### 4&5 Обучение логистической регрессии с параметрами по умолчанию и подсчет метрик

In [8]:
res = pd.DataFrame()
scoring=['accuracy', 'recall', 'precision','f1']

In [9]:
def get_metrics(df, model = LogisticRegression(), model_name='LogisticRegression_default'):
    kf = KFold(n_splits = 5, shuffle = True, random_state = 42)
    X,y = df.drop(columns='HeartDisease', axis=1),df['HeartDisease']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    model = model
    model.fit(X_train, y_train)
    y_pred_train = model.predict(X_train)
    y_pred_test = model.predict(X_test)
    
    rep_train = cross_validate(model, X_train, y_train, cv=kf, scoring=scoring, return_train_score=True)
    rep_train = pd.DataFrame(rep_train)
    s = pd.Series()
    s['model'] = model_name
    s = s.append(rep_train.mean())
    res = pd.DataFrame(s)
    return res

In [10]:
res_0 = get_metrics(heart)
res = pd.concat([res, res_0], axis=1)
res.T.set_index('model')

Unnamed: 0_level_0,fit_time,score_time,test_accuracy,train_accuracy,test_recall,train_recall,test_precision,train_precision,test_f1,train_f1
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
LogisticRegression_default,0.022,0.002,0.861,0.874,0.886,0.901,0.863,0.873,0.874,0.887


#### 6. Оптимизируйте 3-4 параметра модели

Если в сёрч подать список метрик `best_params_` и т.п. не получить. Соберем функцию, которая будет выдергивать нужные данные из словаря `cv_results` и выводить их в виде датафрейма

In [11]:
def bestparams(df, target,  model, params, search='G', rs=10):
    scoring=['accuracy', 'recall', 'precision', 'f1']
    X,y = heart.drop(columns=target, axis=1),heart[target]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=rs)
    m = model
    if search == 'G':
        grid = GridSearchCV(m, params, cv=10, scoring=scoring, refit=False)
    elif search != 'G':
        grid = RandomizedSearchCV(m, params, cv=10, scoring=scoring, refit=False)    
    grid.fit(X_train, y_train)
    report = pd.DataFrame(grid.cv_results_).set_index('params').dropna()
    report = report[[c for c in report.columns if c.__contains__('mean')]]
    report.columns = ['_'.join(c.split('_')[1:]) for c in report.columns]
    report = report.sort_values(by=['test_accuracy', 'fit_time', 'score_time'], 
                  ascending=[False, True, True]).head(1)
    return report

##### a. GridSearchCV

In [12]:
param_grid_lr = {'penalty': ['l1', 'l2'], 
                 'solver' : ['newton-cg', 'lbfgs', 'liblinear'],
                 'l1_ratio': np.linspace(0, 1, 10)}

In [13]:
res_GS = bestparams(heart, 'HeartDisease', LogisticRegression(), param_grid_lr, 'G')
res_GS

Unnamed: 0_level_0,fit_time,score_time,test_accuracy,test_recall,test_precision,test_f1
params,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
"{'l1_ratio': 0.3333333333333333, 'penalty': 'l2', 'solver': 'lbfgs'}",0.021,0.002,0.867,0.9,0.869,0.883


##### b. RandomizedSearchCV

In [14]:
res_RS = bestparams(heart, 'HeartDisease', LogisticRegression(), param_grid_lr, 'R')
res_RS

Unnamed: 0_level_0,fit_time,score_time,test_accuracy,test_recall,test_precision,test_f1
params,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
"{'solver': 'lbfgs', 'penalty': 'l2', 'l1_ratio': 0.6666666666666666}",0.021,0.002,0.867,0.9,0.869,0.883


In [15]:
res_1 = get_metrics(heart, LogisticRegression(**res_GS.index[0]), model_name='LogisticRegression_GS_opt' )
res = pd.concat([res, res_1], axis=1)
res.T.set_index('model')

Unnamed: 0_level_0,fit_time,score_time,test_accuracy,train_accuracy,test_recall,train_recall,test_precision,train_precision,test_f1,train_f1
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
LogisticRegression_default,0.022,0.002,0.861,0.874,0.886,0.901,0.863,0.873,0.874,0.887
LogisticRegression_GS_opt,0.021,0.002,0.861,0.874,0.886,0.901,0.863,0.873,0.874,0.887


In [16]:
res_2 = get_metrics(heart, LogisticRegression(**res_RS.index[0]), model_name='LogisticRegression_RS_opt' )
res = pd.concat([res, res_2], axis=1)
res.T.set_index('model')

Unnamed: 0_level_0,fit_time,score_time,test_accuracy,train_accuracy,test_recall,train_recall,test_precision,train_precision,test_f1,train_f1
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
LogisticRegression_default,0.022,0.002,0.861,0.874,0.886,0.901,0.863,0.873,0.874,0.887
LogisticRegression_GS_opt,0.021,0.002,0.861,0.874,0.886,0.901,0.863,0.873,0.874,0.887
LogisticRegression_RS_opt,0.021,0.002,0.861,0.874,0.886,0.901,0.863,0.873,0.874,0.887


##### c. *Добавьте 2-5 моделей классификации и вариации их параметров.

In [17]:
models=[
    {'name':'RF',
     'model': RandomForestClassifier(), 'params':{
         'n_estimators':[10, 25, 50, 100, 150, 200], 
         'criterion':['gini', 'entropy'], 
         'max_depth':[3, 5, 7, 9, 11], 
         'max_samples': np.linspace(0, 1, 10),
         'random_state': [1, 10, 42]}},
    {'name':'DT',
     'model': DecisionTreeClassifier(), 'params':{
         'criterion':['gini', 'entropy'], 
         'max_depth':[3, 5, 7, 9, 11], 
         'random_state': [1, 10, 42]}},
    {'name':'GB',
     'model': GradientBoostingClassifier(), 'params':{
         'max_features':list(range(1, 30)),
         'n_estimators':[10, 25, 50, 100, 150, 200], 
         'min_samples_leaf':[1, 2 ,3, 5], 
         'random_state': [1, 10, 42]}},

]
res_multilpex = pd.DataFrame()
for v in models:
    res_ = bestparams(heart, 'HeartDisease', v['model'], v['params'], 'R', rs=42)
    res_.reset_index(inplace=True)
    res_.set_index([[v['name']] * len(res_)], inplace=True) # чтобы задать индекс, вектор с индексами должен быть в длину df
    res_multilpex = pd.concat([res_multilpex, res_], axis=0)

In [18]:
res_multilpex

Unnamed: 0,params,fit_time,score_time,test_accuracy,test_recall,test_precision,test_f1
RF,"{'random_state': 42, 'n_estimators': 100, 'max...",0.101,0.008,0.873,0.91,0.867,0.887
DT,"{'random_state': 42, 'max_depth': 3, 'criterio...",0.002,0.002,0.843,0.885,0.839,0.861
GB,"{'random_state': 10, 'n_estimators': 50, 'min_...",0.053,0.003,0.87,0.898,0.871,0.884


Словарь с параметрами получаем через `loc`

In [19]:
res_multilpex.loc['RF', 'params']

{'random_state': 42,
 'n_estimators': 100,
 'max_samples': 0.6666666666666666,
 'max_depth': 9,
 'criterion': 'entropy'}

In [20]:
res_3 = get_metrics(
    heart, 
    RandomForestClassifier(**res_multilpex.loc['RF', 'params']), 
    model_name='RandomForestClassifier_RS_opt')
res = pd.concat([res, res_3], axis=1)
res.T.set_index('model')

Unnamed: 0_level_0,fit_time,score_time,test_accuracy,train_accuracy,test_recall,train_recall,test_precision,train_precision,test_f1,train_f1
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
LogisticRegression_default,0.022,0.002,0.861,0.874,0.886,0.901,0.863,0.873,0.874,0.887
LogisticRegression_GS_opt,0.021,0.002,0.861,0.874,0.886,0.901,0.863,0.873,0.874,0.887
LogisticRegression_RS_opt,0.021,0.002,0.861,0.874,0.886,0.901,0.863,0.873,0.874,0.887
RandomForestClassifier_RS_opt,0.109,0.009,0.851,0.966,0.887,0.981,0.849,0.957,0.866,0.969


In [21]:
res_4 = get_metrics(
    heart, 
    DecisionTreeClassifier(**res_multilpex.loc['DT', 'params']), 
    model_name='DecisionTreeClassifier_RS_opt')
res = pd.concat([res, res_4], axis=1)
res.T.set_index('model')

Unnamed: 0_level_0,fit_time,score_time,test_accuracy,train_accuracy,test_recall,train_recall,test_precision,train_precision,test_f1,train_f1
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
LogisticRegression_default,0.022,0.002,0.861,0.874,0.886,0.901,0.863,0.873,0.874,0.887
LogisticRegression_GS_opt,0.021,0.002,0.861,0.874,0.886,0.901,0.863,0.873,0.874,0.887
LogisticRegression_RS_opt,0.021,0.002,0.861,0.874,0.886,0.901,0.863,0.873,0.874,0.887
RandomForestClassifier_RS_opt,0.109,0.009,0.851,0.966,0.887,0.981,0.849,0.957,0.866,0.969
DecisionTreeClassifier_RS_opt,0.002,0.002,0.837,0.858,0.853,0.884,0.849,0.861,0.848,0.871


In [22]:
res_5 = get_metrics(
    heart, 
    GradientBoostingClassifier(**res_multilpex.loc['GB', 'params']), 
    model_name='GradientBoostingClassifier_RS_opt')
res = pd.concat([res, res_5], axis=1)
res.T.set_index('model')

Unnamed: 0_level_0,fit_time,score_time,test_accuracy,train_accuracy,test_recall,train_recall,test_precision,train_precision,test_f1,train_f1
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
LogisticRegression_default,0.022,0.002,0.861,0.874,0.886,0.901,0.863,0.873,0.874,0.887
LogisticRegression_GS_opt,0.021,0.002,0.861,0.874,0.886,0.901,0.863,0.873,0.874,0.887
LogisticRegression_RS_opt,0.021,0.002,0.861,0.874,0.886,0.901,0.863,0.873,0.874,0.887
RandomForestClassifier_RS_opt,0.109,0.009,0.851,0.966,0.887,0.981,0.849,0.957,0.866,0.969
DecisionTreeClassifier_RS_opt,0.002,0.002,0.837,0.858,0.853,0.884,0.849,0.861,0.848,0.871
GradientBoostingClassifier_RS_opt,0.038,0.002,0.857,0.928,0.896,0.95,0.85,0.92,0.872,0.935


#### 7. Выводы:
_____
В целом с помощью `GridSearchCV` и `RandomizedSearchCV` можно упорядочить поиск гиперпараметров, но все равно возможные варианты перебрать нет возможности и в связи с перегрузкой вычислительных мощностей

__\*Сравнение с полученными результатами в домашнем задании по теме «Ансамблирование».__

\*кто будет смотреть, этот словарь выгружен в папку /Downloads в дз 8.09 и из нее залит сюда

In [23]:
with open('Downloads/dz_8.09_result.pkl', 'rb') as f:
    dz_09_result = pickle.load(f)

In [24]:
dz_09_result = pd.DataFrame(dz_09_result)

In [25]:
dz_09_result.T

Unnamed: 0,duration,accuracy_train,accuracy_test,f1_train,f1_test
dtc_min_leaf=6,0.0051 sec,0.91,0.837,0.917,0.853
rfc_max_feat=6_max_sampl=0.6,0.0632 sec,0.973,0.891,0.975,0.906
rfc_default,0.1295 sec,1.0,0.897,1.0,0.91
baggcl_max_feat=6_max_sampl=0.6,0.0621 sec,0.896,0.886,0.908,0.902
stack_svc,2.2529 sec,0.963,0.897,0.967,0.912
stack_baggcl,0.9188 sec,0.936,0.88,0.942,0.896
gbc_max_feat=10_min_leaf=5,0.0224 sec,0.891,0.891,0.903,0.905


In [26]:
res_multilpex.loc['GB', 'params']

{'random_state': 10,
 'n_estimators': 50,
 'min_samples_leaf': 2,
 'max_features': 13}

В __дз№9__ лучшей моделью оказалась `GradientBoostingClassifier` в таблице `gbc_max_feat=10_min_leaf=5` , где параметры были подобраны вручную и без кроссвалидации: 
* random_state=42,  
* n_estimators=25, 
* min_samples_leaf=5, 
* max_features=10, 
 
Модель показывала высокие метрики `accuracy` и `f1` на тренировочной и тестовой выборках.  
____   
В этой дз для модели `GradientBoostingClassifier` `RandomizedSearchCV` предложил параметры:  
* random_state=1,  
* n_estimators=200,
* min_samples_leaf=5, 
* max_features=1, 

Метрики `accuracy` и `f1` на тренировочной выборке ощутимо ниже.  
___Это странно!___

## Бонус "OPTUNA"  
Попробуем подобрать параметры с `GradientBoostingClassifier` помощью `OPTUNA`

In [27]:
def objective(trial, X, y, cv, scoring):
    kf = KFold(n_splits = cv, shuffle = True, random_state = 1)
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 10, 200, step = 10),
        "learning_rate": trial.suggest_float("learning_rate", 1e-4, 0.3, log = True),
        "max_depth": trial.suggest_int("max_depth", 3, 15, step = 1),
        'min_samples_leaf': trial.suggest_int("min_samples_leaf", 2, 10, step = 1),
        "subsample": trial.suggest_float("subsample", 0.5, 0.9, step = 0.1),
        "max_features": trial.suggest_int("max_features", 2, 30, step = 1),
        "random_state": trial.suggest_int("random_state", 1, 42, step = 1),
    }
    model = GradientBoostingClassifier(**params)
    scores = cross_validate(model, X, y, cv = kf, scoring = scoring)
    accuracy = scores["test_score"].mean()
    return accuracy

In [28]:
study = optuna.create_study(direction = "maximize")
func = lambda trial: objective(trial, X, y, 10, scoring = "accuracy")

[32m[I 2022-12-26 12:48:20,015][0m A new study created in memory with name: no-name-b2732672-a1d9-4df8-b6e2-e2ca01477c08[0m


In [29]:
%%time
study.optimize(func, n_trials = 100)

[32m[I 2022-12-26 12:48:21,194][0m Trial 0 finished with value: 0.8735785953177257 and parameters: {'n_estimators': 140, 'learning_rate': 0.03808571152030792, 'max_depth': 4, 'min_samples_leaf': 6, 'subsample': 0.9, 'max_features': 6, 'random_state': 12}. Best is trial 0 with value: 0.8735785953177257.[0m
[33m[W 2022-12-26 12:48:21,213][0m Trial 1 failed, because the objective function returned nan.[0m
[32m[I 2022-12-26 12:48:23,106][0m Trial 2 finished with value: 0.8496536072623029 and parameters: {'n_estimators': 70, 'learning_rate': 0.005428252596667924, 'max_depth': 14, 'min_samples_leaf': 10, 'subsample': 0.9, 'max_features': 15, 'random_state': 15}. Best is trial 0 with value: 0.8735785953177257.[0m
[32m[I 2022-12-26 12:48:23,530][0m Trial 3 finished with value: 0.8387482083134257 and parameters: {'n_estimators': 20, 'learning_rate': 0.012705778872097352, 'max_depth': 7, 'min_samples_leaf': 6, 'subsample': 0.6, 'max_features': 18, 'random_state': 36}. Best is trial 0 

[33m[W 2022-12-26 12:48:42,077][0m Trial 60 failed, because the objective function returned nan.[0m
[33m[W 2022-12-26 12:48:42,099][0m Trial 61 failed, because the objective function returned nan.[0m
[33m[W 2022-12-26 12:48:42,120][0m Trial 62 failed, because the objective function returned nan.[0m
[33m[W 2022-12-26 12:48:42,142][0m Trial 63 failed, because the objective function returned nan.[0m
[33m[W 2022-12-26 12:48:42,163][0m Trial 64 failed, because the objective function returned nan.[0m
[33m[W 2022-12-26 12:48:42,185][0m Trial 65 failed, because the objective function returned nan.[0m
[33m[W 2022-12-26 12:48:42,206][0m Trial 66 failed, because the objective function returned nan.[0m
[33m[W 2022-12-26 12:48:42,227][0m Trial 67 failed, because the objective function returned nan.[0m
[33m[W 2022-12-26 12:48:42,248][0m Trial 68 failed, because the objective function returned nan.[0m
[33m[W 2022-12-26 12:48:42,270][0m Trial 69 failed, because the objecti

CPU times: user 22.6 s, sys: 208 ms, total: 22.8 s
Wall time: 22.9 s


__!!!Некоторые комбинации параметров привели к падению, ну и ладно!!!__

In [30]:
study.best_value

0.8735785953177257

In [31]:
study.best_params

{'n_estimators': 140,
 'learning_rate': 0.03808571152030792,
 'max_depth': 4,
 'min_samples_leaf': 6,
 'subsample': 0.9,
 'max_features': 6,
 'random_state': 12}

In [32]:
res_6 = get_metrics(
    heart, 
    GradientBoostingClassifier(**study.best_params), 
    model_name='GradientBoostingClassifier_Optuna')
res = pd.concat([res, res_6], axis=1)
res.T.set_index('model')

Unnamed: 0_level_0,fit_time,score_time,test_accuracy,train_accuracy,test_recall,train_recall,test_precision,train_precision,test_f1,train_f1
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
LogisticRegression_default,0.022,0.002,0.861,0.874,0.886,0.901,0.863,0.873,0.874,0.887
LogisticRegression_GS_opt,0.021,0.002,0.861,0.874,0.886,0.901,0.863,0.873,0.874,0.887
LogisticRegression_RS_opt,0.021,0.002,0.861,0.874,0.886,0.901,0.863,0.873,0.874,0.887
RandomForestClassifier_RS_opt,0.109,0.009,0.851,0.966,0.887,0.981,0.849,0.957,0.866,0.969
DecisionTreeClassifier_RS_opt,0.002,0.002,0.837,0.858,0.853,0.884,0.849,0.861,0.848,0.871
GradientBoostingClassifier_RS_opt,0.038,0.002,0.857,0.928,0.896,0.95,0.85,0.92,0.872,0.935
GradientBoostingClassifier_Optuna,0.096,0.003,0.861,0.949,0.906,0.964,0.85,0.943,0.877,0.953


In [33]:
def get_score_df(df, target, random_seed=1, model=None, title='dtc_default'):
    start_time = time.time()
    if model is None:
        model = DecisionTreeClassifier(random_state=random_seed)
    
    X = df.drop([target], axis = 1)
    y = df[target]
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=random_seed) 


    model.fit(X_train.values, y_train)
    y_pred_train = model.predict(X_train.values)
    y_pred_test = model.predict(X_test.values)
    duration = str(round((time.time() - start_time), 4)) + ' sec'
    

    accuracy_train = model.score(X_train.values, y_train)
    accuracy_test = model.score(X_test.values, y_test)
    f1_train = f1_score(y_pred_train, y_train)
    f1_test = f1_score(y_pred_test, y_test)

    metrix = {}
    metrix['duration'] = duration
    metrix['accuracy_train'] = accuracy_train
    metrix['accuracy_test'] = accuracy_test
    metrix['f1_train'] = f1_train
    metrix['f1_test'] = f1_test
    res = pd.DataFrame.from_dict(metrix, orient='index', columns=[title])
    
    return res

In [34]:
%%time
res_6 = get_score_df(
    heart, 
    'HeartDisease', 
    model=GradientBoostingClassifier(**res_multilpex.loc['GB', 'params']), 
    title='GradientBoostingClassifier_RS_opt')
dz_09_result = pd.concat([dz_09_result, res_6], axis=1)

CPU times: user 49.5 ms, sys: 983 µs, total: 50.4 ms
Wall time: 49.8 ms


In [35]:
%%time
res_7 = get_score_df(
    heart, 
    'HeartDisease', 
    model=GradientBoostingClassifier(**study.best_params), 
    title='GradientBoostingClassifier_Optuna')
dz_09_result = pd.concat([dz_09_result, res_7], axis=1)

CPU times: user 122 ms, sys: 2.28 ms, total: 125 ms
Wall time: 124 ms


In [36]:
dz_09_result.T

Unnamed: 0,duration,accuracy_train,accuracy_test,f1_train,f1_test
dtc_min_leaf=6,0.0051 sec,0.91,0.837,0.917,0.853
rfc_max_feat=6_max_sampl=0.6,0.0632 sec,0.973,0.891,0.975,0.906
rfc_default,0.1295 sec,1.0,0.897,1.0,0.91
baggcl_max_feat=6_max_sampl=0.6,0.0621 sec,0.896,0.886,0.908,0.902
stack_svc,2.2529 sec,0.963,0.897,0.967,0.912
stack_baggcl,0.9188 sec,0.936,0.88,0.942,0.896
gbc_max_feat=10_min_leaf=5,0.0224 sec,0.891,0.891,0.903,0.905
GradientBoostingClassifier_RS_opt,0.0465 sec,0.913,0.897,0.921,0.916
GradientBoostingClassifier_Optuna,0.1169 sec,0.936,0.891,0.942,0.908


В итоге все методы дают разные наборы гиперпараметров. Метрики высокие, но модель все равно переучивается. 