# Прогнозируем задержки самолетов

In [1]:
!pip install lightgbm optuna -q

In [2]:
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import roc_auc_score
import pandas as pd

from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier

In [3]:
RANDOM_STATE = 111
DATASET_PATH = 'https://raw.githubusercontent.com/evgpat/edu_stepik_practical_ml/main/datasets/flight_delays_train.csv'

In [49]:
data = pd.read_csv(DATASET_PATH)

X = data.drop('dep_delayed_15min', axis=1)
y = data['dep_delayed_15min'] == 'Y'

X.head()

Unnamed: 0,Month,DayofMonth,DayOfWeek,DepTime,UniqueCarrier,Origin,Dest,Distance
0,c-8,c-21,c-7,1934,AA,ATL,DFW,732
1,c-4,c-20,c-3,1548,US,PIT,MCO,834
2,c-9,c-2,c-5,1422,XE,RDU,CLE,416
3,c-11,c-25,c-6,1015,OO,DEN,MEM,872
4,c-10,c-7,c-6,1828,WN,MDW,OMA,423


Создайте список номеров колонок с категориальными признаками для бустингов

## Quiz
Какой длины получился список?

(подсказка: колонка `DepTime` числовая)

In [50]:
import numpy as np
cat_features = X.select_dtypes(include=object).columns.to_list()
cat_features

['Month', 'DayofMonth', 'DayOfWeek', 'UniqueCarrier', 'Origin', 'Dest']

Разобъем данные на обучение и контроль

In [63]:
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=0.25, random_state=RANDOM_STATE)

In [67]:
optuna --version

NameError: name 'optuna' is not defined

## Модели с параметрами по умолчанию

Обучите CatBoost с гиперпараметрами по умолчанию.

## Quiz
Чему равен ROC-AUC на тестовых данных? Ответ округлите до сотых.

In [52]:
model = CatBoostClassifier(logging_level='Silent')
model.fit(Xtrain, ytrain,
         cat_features=cat_features,
         plot=True)
pred = model.predict_proba(Xtest)[:,1]
roc_auc = roc_auc_score(ytest, pred)
roc_auc

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

0.7679447117365583

Обучите LightGBM с гиперпараметрами по умолчанию.

## Quiz
Чему равен ROC-AUC на тестовых данных? Ответ округлите до сотых.

In [64]:
# for c in X.columns:
#     col_type = X[c].dtype
#     if col_type == 'object' or col_type.name == 'category':
#         Xtrain[c] = Xtrain[c].astype('category')
#         Xtest[c] = Xtest[c].astype('category')
for cols in X.columns:
    if X[cols].dtypes == object:
        Xtrain[cols] = Xtrain[cols].astype('category')
        Xtest[cols] = Xtest[cols].astype('category')
Xtrain.dtypes

Month            category
DayofMonth       category
DayOfWeek        category
DepTime             int64
UniqueCarrier    category
Origin           category
Dest             category
Distance            int64
dtype: object

In [72]:
model_lgmn = LGBMClassifier(
)
model_lgmn.fit(Xtrain, ytrain)
pred = model_lgmn.predict_proba(Xtest)[:,1]
roc_auc_score(ytest, pred)

[LightGBM] [Info] Number of positive: 14346, number of negative: 60654
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001691 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1095
[LightGBM] [Info] Number of data points in the train set: 75000, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.191280 -> initscore=-1.441714
[LightGBM] [Info] Start training from score -1.441714


0.7341149074685321

## Optuna

Выделим дополнительную валидационную выборку.

In [66]:
Xtrain_new, Xval, ytrain_new, yval = train_test_split(Xtrain, ytrain, test_size=0.25, random_state=RANDOM_STATE)

Создайте функцию objective_lgbm, в которой среди гиперпараметров

* num_leaves = trial.suggest_int("num_leaves", 10, 100)
* n_estimators = trial.suggest_int("n_estimators", 10, 1000)

подберите оптимальные, обучая LGBM на Xtrain_new, ytrain_new и проверяя качество (ROC-AUC) на Xval.

Используйте 30 эпох обучения Optuna.


In [73]:
import optuna
from sklearn.model_selection import cross_val_score
def objective_lgbm(trial):
    num_leaves = trial.suggest_int('num_leaves', 10, 100)
    n_estimators = trial.suggest_int('n_estimators', 10, 1000)
    model = LGBMClassifier(num_leaves=num_leaves, n_estimators=n_estimators, n_jobs=-1)
    model.fit(Xtrain_new, ytrain_new)
    pred = model.predict_proba(Xval)[:,1]
    roc_auc = roc_auc_score(yval, pred)
    return roc_auc
study = optuna.create_study(direction='maximize')
study.optimize(objective_lgbm, n_trials=30)
    

[I 2024-09-06 01:23:58,354] A new study created in memory with name: no-name-1c80541c-c05e-4f68-82b5-3a52bfa41d9b


[LightGBM] [Info] Number of positive: 10730, number of negative: 45520
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001410 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1095
[LightGBM] [Info] Number of data points in the train set: 56250, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.190756 -> initscore=-1.445108
[LightGBM] [Info] Start training from score -1.445108


[I 2024-09-06 01:24:25,449] Trial 0 finished with value: 0.716458523619676 and parameters: {'num_leaves': 19, 'n_estimators': 717}. Best is trial 0 with value: 0.716458523619676.


[LightGBM] [Info] Number of positive: 10730, number of negative: 45520
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.004361 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1095
[LightGBM] [Info] Number of data points in the train set: 56250, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.190756 -> initscore=-1.445108
[LightGBM] [Info] Start training from score -1.445108


[I 2024-09-06 01:24:30,150] Trial 1 finished with value: 0.723209607740176 and parameters: {'num_leaves': 27, 'n_estimators': 132}. Best is trial 1 with value: 0.723209607740176.


[LightGBM] [Info] Number of positive: 10730, number of negative: 45520
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.004735 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1095
[LightGBM] [Info] Number of data points in the train set: 56250, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.190756 -> initscore=-1.445108
[LightGBM] [Info] Start training from score -1.445108


[I 2024-09-06 01:25:07,617] Trial 2 finished with value: 0.7156014675974276 and parameters: {'num_leaves': 26, 'n_estimators': 867}. Best is trial 1 with value: 0.723209607740176.


[LightGBM] [Info] Number of positive: 10730, number of negative: 45520
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001210 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1095
[LightGBM] [Info] Number of data points in the train set: 56250, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.190756 -> initscore=-1.445108
[LightGBM] [Info] Start training from score -1.445108


[I 2024-09-06 01:25:09,819] Trial 3 finished with value: 0.7247959343434639 and parameters: {'num_leaves': 84, 'n_estimators': 20}. Best is trial 3 with value: 0.7247959343434639.


[LightGBM] [Info] Number of positive: 10730, number of negative: 45520
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.005712 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1095
[LightGBM] [Info] Number of data points in the train set: 56250, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.190756 -> initscore=-1.445108
[LightGBM] [Info] Start training from score -1.445108


[I 2024-09-06 01:25:14,837] Trial 4 finished with value: 0.7260980009262389 and parameters: {'num_leaves': 22, 'n_estimators': 121}. Best is trial 4 with value: 0.7260980009262389.


[LightGBM] [Info] Number of positive: 10730, number of negative: 45520
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000413 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1095
[LightGBM] [Info] Number of data points in the train set: 56250, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.190756 -> initscore=-1.445108
[LightGBM] [Info] Start training from score -1.445108


[I 2024-09-06 01:26:18,746] Trial 5 finished with value: 0.719910959879355 and parameters: {'num_leaves': 90, 'n_estimators': 625}. Best is trial 4 with value: 0.7260980009262389.


[LightGBM] [Info] Number of positive: 10730, number of negative: 45520
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002223 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1095
[LightGBM] [Info] Number of data points in the train set: 56250, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.190756 -> initscore=-1.445108
[LightGBM] [Info] Start training from score -1.445108


[I 2024-09-06 01:26:29,580] Trial 6 finished with value: 0.7205640489210837 and parameters: {'num_leaves': 11, 'n_estimators': 384}. Best is trial 4 with value: 0.7260980009262389.


[LightGBM] [Info] Number of positive: 10730, number of negative: 45520
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001506 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1095
[LightGBM] [Info] Number of data points in the train set: 56250, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.190756 -> initscore=-1.445108
[LightGBM] [Info] Start training from score -1.445108


[I 2024-09-06 01:26:38,310] Trial 7 finished with value: 0.7201246482748216 and parameters: {'num_leaves': 62, 'n_estimators': 86}. Best is trial 4 with value: 0.7260980009262389.


[LightGBM] [Info] Number of positive: 10730, number of negative: 45520
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001006 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1095
[LightGBM] [Info] Number of data points in the train set: 56250, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.190756 -> initscore=-1.445108
[LightGBM] [Info] Start training from score -1.445108


[I 2024-09-06 01:27:58,180] Trial 8 finished with value: 0.7181275918900302 and parameters: {'num_leaves': 96, 'n_estimators': 643}. Best is trial 4 with value: 0.7260980009262389.


[LightGBM] [Info] Number of positive: 10730, number of negative: 45520
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001233 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1095
[LightGBM] [Info] Number of data points in the train set: 56250, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.190756 -> initscore=-1.445108
[LightGBM] [Info] Start training from score -1.445108


[I 2024-09-06 01:28:46,463] Trial 9 finished with value: 0.7192386838344418 and parameters: {'num_leaves': 70, 'n_estimators': 477}. Best is trial 4 with value: 0.7260980009262389.


[LightGBM] [Info] Number of positive: 10730, number of negative: 45520
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000899 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1095
[LightGBM] [Info] Number of data points in the train set: 56250, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.190756 -> initscore=-1.445108
[LightGBM] [Info] Start training from score -1.445108


[I 2024-09-06 01:28:59,995] Trial 10 finished with value: 0.7180163072715597 and parameters: {'num_leaves': 45, 'n_estimators': 290}. Best is trial 4 with value: 0.7260980009262389.


[LightGBM] [Info] Number of positive: 10730, number of negative: 45520
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.007083 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1095
[LightGBM] [Info] Number of data points in the train set: 56250, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.190756 -> initscore=-1.445108
[LightGBM] [Info] Start training from score -1.445108


[I 2024-09-06 01:29:01,501] Trial 11 finished with value: 0.7242981960708527 and parameters: {'num_leaves': 45, 'n_estimators': 29}. Best is trial 4 with value: 0.7260980009262389.


[LightGBM] [Info] Number of positive: 10730, number of negative: 45520
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000363 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1095
[LightGBM] [Info] Number of data points in the train set: 56250, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.190756 -> initscore=-1.445108
[LightGBM] [Info] Start training from score -1.445108


[I 2024-09-06 01:29:21,742] Trial 12 finished with value: 0.7200950089232356 and parameters: {'num_leaves': 79, 'n_estimators': 239}. Best is trial 4 with value: 0.7260980009262389.


[LightGBM] [Info] Number of positive: 10730, number of negative: 45520
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.004115 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1095
[LightGBM] [Info] Number of data points in the train set: 56250, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.190756 -> initscore=-1.445108
[LightGBM] [Info] Start training from score -1.445108


[I 2024-09-06 01:29:32,385] Trial 13 finished with value: 0.7225063401167857 and parameters: {'num_leaves': 50, 'n_estimators': 209}. Best is trial 4 with value: 0.7260980009262389.


[LightGBM] [Info] Number of positive: 10730, number of negative: 45520
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000991 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1095
[LightGBM] [Info] Number of data points in the train set: 56250, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.190756 -> initscore=-1.445108
[LightGBM] [Info] Start training from score -1.445108


[I 2024-09-06 01:29:41,613] Trial 14 finished with value: 0.7242664278755799 and parameters: {'num_leaves': 83, 'n_estimators': 56}. Best is trial 4 with value: 0.7260980009262389.


[LightGBM] [Info] Number of positive: 10730, number of negative: 45520
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.004630 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1095
[LightGBM] [Info] Number of data points in the train set: 56250, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.190756 -> initscore=-1.445108
[LightGBM] [Info] Start training from score -1.445108


[I 2024-09-06 01:30:00,204] Trial 15 finished with value: 0.7202377236802558 and parameters: {'num_leaves': 35, 'n_estimators': 363}. Best is trial 4 with value: 0.7260980009262389.


[LightGBM] [Info] Number of positive: 10730, number of negative: 45520
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001595 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1095
[LightGBM] [Info] Number of data points in the train set: 56250, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.190756 -> initscore=-1.445108
[LightGBM] [Info] Start training from score -1.445108


[I 2024-09-06 01:30:13,344] Trial 16 finished with value: 0.7205256018213693 and parameters: {'num_leaves': 65, 'n_estimators': 164}. Best is trial 4 with value: 0.7260980009262389.


[LightGBM] [Info] Number of positive: 10730, number of negative: 45520
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.004234 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1095
[LightGBM] [Info] Number of data points in the train set: 56250, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.190756 -> initscore=-1.445108
[LightGBM] [Info] Start training from score -1.445108


[I 2024-09-06 01:30:50,642] Trial 17 finished with value: 0.7185589157216185 and parameters: {'num_leaves': 57, 'n_estimators': 452}. Best is trial 4 with value: 0.7260980009262389.


[LightGBM] [Info] Number of positive: 10730, number of negative: 45520
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000380 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1095
[LightGBM] [Info] Number of data points in the train set: 56250, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.190756 -> initscore=-1.445108
[LightGBM] [Info] Start training from score -1.445108


[I 2024-09-06 01:30:54,930] Trial 18 finished with value: 0.7263426352168416 and parameters: {'num_leaves': 78, 'n_estimators': 28}. Best is trial 18 with value: 0.7263426352168416.


[LightGBM] [Info] Number of positive: 10730, number of negative: 45520
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001870 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1095
[LightGBM] [Info] Number of data points in the train set: 56250, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.190756 -> initscore=-1.445108
[LightGBM] [Info] Start training from score -1.445108


[I 2024-09-06 01:31:36,290] Trial 19 finished with value: 0.7208265088513117 and parameters: {'num_leaves': 73, 'n_estimators': 326}. Best is trial 18 with value: 0.7263426352168416.


[LightGBM] [Info] Number of positive: 10730, number of negative: 45520
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.005156 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1095
[LightGBM] [Info] Number of data points in the train set: 56250, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.190756 -> initscore=-1.445108
[LightGBM] [Info] Start training from score -1.445108


[I 2024-09-06 01:31:42,382] Trial 20 finished with value: 0.7252067920383221 and parameters: {'num_leaves': 11, 'n_estimators': 191}. Best is trial 18 with value: 0.7263426352168416.


[LightGBM] [Info] Number of positive: 10730, number of negative: 45520
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002102 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1095
[LightGBM] [Info] Number of data points in the train set: 56250, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.190756 -> initscore=-1.445108
[LightGBM] [Info] Start training from score -1.445108


[I 2024-09-06 01:31:47,016] Trial 21 finished with value: 0.725965281684211 and parameters: {'num_leaves': 11, 'n_estimators': 148}. Best is trial 18 with value: 0.7263426352168416.


[LightGBM] [Info] Number of positive: 10730, number of negative: 45520
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.004946 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1095
[LightGBM] [Info] Number of data points in the train set: 56250, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.190756 -> initscore=-1.445108
[LightGBM] [Info] Start training from score -1.445108


[I 2024-09-06 01:31:56,096] Trial 22 finished with value: 0.722780494982288 and parameters: {'num_leaves': 36, 'n_estimators': 127}. Best is trial 18 with value: 0.7263426352168416.


[LightGBM] [Info] Number of positive: 10730, number of negative: 45520
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.005532 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1095
[LightGBM] [Info] Number of data points in the train set: 56250, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.190756 -> initscore=-1.445108
[LightGBM] [Info] Start training from score -1.445108


[I 2024-09-06 01:32:07,229] Trial 23 finished with value: 0.7230363765114243 and parameters: {'num_leaves': 21, 'n_estimators': 253}. Best is trial 18 with value: 0.7263426352168416.


[LightGBM] [Info] Number of positive: 10730, number of negative: 45520
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.007625 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1095
[LightGBM] [Info] Number of data points in the train set: 56250, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.190756 -> initscore=-1.445108
[LightGBM] [Info] Start training from score -1.445108


[I 2024-09-06 01:32:13,633] Trial 24 finished with value: 0.724052264373368 and parameters: {'num_leaves': 34, 'n_estimators': 109}. Best is trial 18 with value: 0.7263426352168416.


[LightGBM] [Info] Number of positive: 10730, number of negative: 45520
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001412 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1095
[LightGBM] [Info] Number of data points in the train set: 56250, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.190756 -> initscore=-1.445108
[LightGBM] [Info] Start training from score -1.445108


[I 2024-09-06 01:32:14,066] Trial 25 finished with value: 0.7202484227186982 and parameters: {'num_leaves': 17, 'n_estimators': 18}. Best is trial 18 with value: 0.7263426352168416.


[LightGBM] [Info] Number of positive: 10730, number of negative: 45520
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.005218 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1095
[LightGBM] [Info] Number of data points in the train set: 56250, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.190756 -> initscore=-1.445108
[LightGBM] [Info] Start training from score -1.445108


[I 2024-09-06 01:32:38,779] Trial 26 finished with value: 0.7170677932007985 and parameters: {'num_leaves': 10, 'n_estimators': 928}. Best is trial 18 with value: 0.7263426352168416.


[LightGBM] [Info] Number of positive: 10730, number of negative: 45520
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000385 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1095
[LightGBM] [Info] Number of data points in the train set: 56250, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.190756 -> initscore=-1.445108
[LightGBM] [Info] Start training from score -1.445108


[I 2024-09-06 01:33:05,995] Trial 27 finished with value: 0.7189736839104589 and parameters: {'num_leaves': 26, 'n_estimators': 570}. Best is trial 18 with value: 0.7263426352168416.


[LightGBM] [Info] Number of positive: 10730, number of negative: 45520
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.004580 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1095
[LightGBM] [Info] Number of data points in the train set: 56250, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.190756 -> initscore=-1.445108
[LightGBM] [Info] Start training from score -1.445108


[I 2024-09-06 01:33:55,249] Trial 28 finished with value: 0.7182010141555495 and parameters: {'num_leaves': 96, 'n_estimators': 414}. Best is trial 18 with value: 0.7263426352168416.


[LightGBM] [Info] Number of positive: 10730, number of negative: 45520
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.005714 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1095
[LightGBM] [Info] Number of data points in the train set: 56250, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.190756 -> initscore=-1.445108
[LightGBM] [Info] Start training from score -1.445108


[I 2024-09-06 01:34:24,512] Trial 29 finished with value: 0.7160521794389004 and parameters: {'num_leaves': 21, 'n_estimators': 740}. Best is trial 18 with value: 0.7263426352168416.


In [74]:
study.best_params

{'num_leaves': 78, 'n_estimators': 28}

Обучите модель с найденными гиперпараметрами на Xtrain, ytrain и оцените ROC-AUC на тестовых данных.

In [75]:
model = LGBMClassifier(n_estimators=28, num_leaves=78)
model.fit(Xtrain, ytrain)
pred = model.predict_proba(Xtest)[:,1]
roc_auc_score(ytest, pred)

[LightGBM] [Info] Number of positive: 14346, number of negative: 60654
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001257 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1095
[LightGBM] [Info] Number of data points in the train set: 75000, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.191280 -> initscore=-1.441714
[LightGBM] [Info] Start training from score -1.441714


0.7333910096747289

## Quiz

Чему равно количество деревьев в LGBM после подбора гиперпараметров?

## Работа над улучшением модели

* Попробуйте при помощи Optuna подобрать и другие гиперпарамтеры
* Также подберите гиперпараметры у CatBoost (а не только у LightGBM)

In [None]:
# your code here

## Quiz

Поделитесь своими результатами!