# Прогнозируем задержки самолетов 

In [90]:
!pip install catboost lightgbm optuna -q

In [91]:
import optuna

In [92]:
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import roc_auc_score
import pandas as pd
import numpy as np

from catboost import CatBoostClassifier 
from lightgbm import LGBMClassifier

In [93]:
RANDOM_STATE = 111
DATASET_PATH = 'https://raw.githubusercontent.com/evgpat/edu_stepik_practical_ml/main/datasets/flight_delays_train.csv'

In [94]:
data = pd.read_csv(DATASET_PATH)

X = data.drop('dep_delayed_15min', axis=1)
y = data.dep_delayed_15min.map({'Y': 1, 'N': 0})

X.head()

Unnamed: 0,Month,DayofMonth,DayOfWeek,DepTime,UniqueCarrier,Origin,Dest,Distance
0,c-8,c-21,c-7,1934,AA,ATL,DFW,732
1,c-4,c-20,c-3,1548,US,PIT,MCO,834
2,c-9,c-2,c-5,1422,XE,RDU,CLE,416
3,c-11,c-25,c-6,1015,OO,DEN,MEM,872
4,c-10,c-7,c-6,1828,WN,MDW,OMA,423


In [45]:
y.value_counts()

0    80956
1    19044
Name: dep_delayed_15min, dtype: int64

In [11]:
null_value_states = X.isnull().sum(axis=1)
null_value_states[null_value_states != 0]

Series([], dtype: int64)

In [None]:
y.value_counts()

False    80956
True     19044
Name: dep_delayed_15min, dtype: int64

Создайте список номеров колонок с категориальными признаками для бустингов

## Quiz 
Какой длины получился список?

(подсказка: колонка `DepTime` числовая)

In [17]:
cols = list(X.select_dtypes(exclude='number').columns)

In [18]:
cols

['Month', 'DayofMonth', 'DayOfWeek', 'UniqueCarrier', 'Origin', 'Dest']

In [19]:
cat_features = [X.columns.get_loc(c) for c in cols if c in X]

In [20]:
cat_features

[0, 1, 2, 4, 5, 6]

In [21]:
len(cat_features)

6

Разобъем данные на обучение и контроль

In [47]:
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=0.25, random_state=RANDOM_STATE)

In [23]:
Xtrain.head()

Unnamed: 0,Month,DayofMonth,DayOfWeek,DepTime,UniqueCarrier,Origin,Dest,Distance
41207,c-4,c-18,c-1,1457,CO,EWR,TPA,998
28283,c-11,c-1,c-2,1225,UA,DEN,BOS,1754
34619,c-6,c-16,c-5,1650,YV,IAD,CAE,401
8789,c-5,c-18,c-4,923,AA,SLC,DFW,988
38315,c-2,c-14,c-2,1839,AA,STL,SAN,1558


## Модели с параметрами по умолчанию

Обучите CatBoost с гиперпараметрами по умолчанию.

## Quiz
Чему равен ROC-AUC на тестовых данных? Ответ округлите до сотых.

In [100]:
model = CatBoostClassifier(logging_level='Silent')

In [101]:
model.fit(Xtrain, ytrain, cat_features=cat_features)

<catboost.core.CatBoostClassifier at 0x7faf9fbaca30>

In [50]:
y_pred = model.predict(Xtest)

In [102]:
y_pred = model.predict_proba(Xtest)[:,1]

In [103]:
model.predict_proba(Xtest)

array([[0.82681981, 0.17318019],
       [0.91179584, 0.08820416],
       [0.61403625, 0.38596375],
       ...,
       [0.84966433, 0.15033567],
       [0.93269303, 0.06730697],
       [0.92162589, 0.07837411]])

In [98]:
y_pred

array([0.23396821, 0.10670627, 0.50069913, ..., 0.18205545, 0.14017226,
       0.0988326 ])

In [104]:
roc_auc_score(ytest, y_pred)

0.7670858940177856

In [73]:
model.score(Xtest, ytest)

0.83188

Обучите LightGBM с гиперпараметрами по умолчанию.

## Quiz
Чему равен ROC-AUC на тестовых данных? Ответ округлите до сотых.

In [62]:
for c in X.columns:
    col_type = X[c].dtype
    if col_type == 'object' or col_type.name == 'category':
        Xtrain[c] = Xtrain[c].astype('category')
        Xtest[c] = Xtest[c].astype('category')

In [None]:
Xtrain.dtypes

Month            category
DayofMonth       category
DayOfWeek        category
DepTime             int64
UniqueCarrier    category
Origin           category
Dest             category
Distance            int64
dtype: object

In [None]:
Xtest.dtypes

Month            category
DayofMonth       category
DayOfWeek        category
DepTime             int64
UniqueCarrier    category
Origin           category
Dest             category
Distance            int64
dtype: object

In [64]:
modelLGBM = LGBMClassifier().fit(Xtrain, ytrain)

In [107]:
predLGB = modelLGBM.predict_proba(Xtest)[:,1]

In [108]:
predLGB

array([0.23534746, 0.08481192, 0.56496314, ..., 0.13353553, 0.08805128,
       0.14096668])

In [109]:
roc_auc_score(ytest, predLGB)

0.7365867881158827

In [68]:
modelLGBM.score(Xtest, ytest)

0.82088

## Optuna

Выделим дополнительную валидационную выборку.

In [74]:
Xtrain_new, Xval, ytrain_new, yval = train_test_split(Xtrain, ytrain, test_size=0.25, random_state=RANDOM_STATE)

Создайте функцию objective_lgbm, в которой среди гиперпараметров

* num_leaves = trial.suggest_int("num_leaves", 10, 100)
* n_estimators = trial.suggest_int("n_estimators", 10, 1000)

подберите оптимальные, обучая LGBM на Xtrain_new, ytrain_new и проверяя качество (ROC-AUC) на Xval.

Используйте 30 эпох обучения Optuna.


In [75]:
def objective_lgbm(trial):    
    num_leaves = trial.suggest_int("num_leaves", 10, 100)
    n_estimators = trial.suggest_int("n_estimators", 10, 1000)

    score = cross_val_score(LGBMClassifier(num_leaves=num_leaves, n_estimators=n_estimators),
                            Xtrain_new, ytrain_new, cv=3, scoring='roc_auc', n_jobs=-1).mean()
    return score

In [78]:
study = optuna.create_study(direction="maximize")
study.optimize(objective_lgbm, n_trials=30)

[32m[I 2023-03-07 17:14:51,520][0m A new study created in memory with name: no-name-a05b0a27-fc42-41ee-8bff-747ede5c32aa[0m
[32m[I 2023-03-07 17:14:58,045][0m Trial 0 finished with value: 0.7074400688971987 and parameters: {'num_leaves': 35, 'n_estimators': 404}. Best is trial 0 with value: 0.7074400688971987.[0m
[32m[I 2023-03-07 17:15:03,174][0m Trial 1 finished with value: 0.7060849528612234 and parameters: {'num_leaves': 45, 'n_estimators': 280}. Best is trial 0 with value: 0.7074400688971987.[0m
[32m[I 2023-03-07 17:15:05,727][0m Trial 2 finished with value: 0.7106995695905619 and parameters: {'num_leaves': 24, 'n_estimators': 225}. Best is trial 2 with value: 0.7106995695905619.[0m
[32m[I 2023-03-07 17:15:13,386][0m Trial 3 finished with value: 0.7068980489849034 and parameters: {'num_leaves': 86, 'n_estimators': 361}. Best is trial 2 with value: 0.7106995695905619.[0m
[32m[I 2023-03-07 17:15:25,401][0m Trial 4 finished with value: 0.7061137676070491 and paramete

Обучите модель с найденными гиперпараметрами на Xtrain, ytrain и оцените ROC-AUC на тестовых данных.

In [79]:
study.best_params

{'num_leaves': 16, 'n_estimators': 35}

In [112]:
model = LGBMClassifier(**study.best_params)
model.fit(Xtrain, ytrain)

pred = model.predict_proba(Xtest)[:,1]

In [113]:
roc_auc_score(ytest, pred)

0.7319564612662965

In [82]:
model.score(Xtest, ytest)

0.81996

## Quiz

Чему равно количество листьев в LGBM после подбора гиперпараметров? 

In [95]:
model.get_params()['num_leaves']

16

## Работа над улучшением модели

* Попробуйте при помощи Optuna подобрать и другие гиперпарамтеры
* Также подберите гиперпараметры у CatBoost (а не только у LightGBM)

### LightGBM optimization

In [114]:
def new_objective_lgbm(trial):    
    max_depth = trial.suggest_int("max_depth", 2, 20)
    learning_rate = trial.suggest_float("learning_rate", 1e-5, 1, log=True)
    n_estimators = trial.suggest_int("n_estimators", 10, 1000)

    score = cross_val_score(LGBMClassifier(max_depth=max_depth, learning_rate=learning_rate, n_estimators=n_estimators),
                            Xtrain_new, ytrain_new, cv=3, scoring='roc_auc', n_jobs=-1).mean()
    return score


study_2 = optuna.create_study(direction="maximize")
study_2.optimize(new_objective_lgbm, n_trials=30)

[32m[I 2023-03-07 20:35:10,158][0m A new study created in memory with name: no-name-52fc0ab4-71bd-4aef-8833-945aa70f3f62[0m
[32m[I 2023-03-07 20:35:18,493][0m Trial 0 finished with value: 0.6981032998107729 and parameters: {'max_depth': 8, 'learning_rate': 0.0001034956561545059, 'n_estimators': 436}. Best is trial 0 with value: 0.6981032998107729.[0m
[32m[I 2023-03-07 20:35:20,672][0m Trial 1 finished with value: 0.6983135965315833 and parameters: {'max_depth': 13, 'learning_rate': 0.0004136739049266499, 'n_estimators': 131}. Best is trial 1 with value: 0.6983135965315833.[0m
[32m[I 2023-03-07 20:35:27,929][0m Trial 2 finished with value: 0.6989013979527784 and parameters: {'max_depth': 15, 'learning_rate': 0.00015192664886816055, 'n_estimators': 518}. Best is trial 2 with value: 0.6989013979527784.[0m
[32m[I 2023-03-07 20:35:31,150][0m Trial 3 finished with value: 0.6923739200008613 and parameters: {'max_depth': 4, 'learning_rate': 7.390069385331066e-05, 'n_estimators': 

In [115]:
study_2.best_params

{'max_depth': 6, 'learning_rate': 0.011177008656679055, 'n_estimators': 314}

In [118]:
my_params = {'num_leaves': 16, 'n_estimators': 35, 'max_depth': 6, 'learning_rate': 0.011177008656679055, 'n_estimators': 314}

In [120]:
model_best_LGBM = LGBMClassifier(**my_params).fit(Xtrain, ytrain)

In [121]:
pred_best_LGBM = modelLGBM.predict_proba(Xtest)[:,1]

In [122]:
roc_auc_score(ytest, pred_best_LGBM)

0.7365867881158827

### CatBoost optimization

In [124]:
import catboost
train_dataset = catboost.Pool(Xtrain,ytrain, 
                        cat_features=cat_features)                                                      
test_dataset = catboost.Pool(Xtest,ytest,           
                       cat_features=cat_features)

In [139]:
best_model_cb = catboost.CatBoostClassifier(loss_function='Logloss')

In [133]:
grid = {'learning_rate': [0.03, 0.1],
        'depth': [4, 6, 10],
        'l2_leaf_reg': [1, 3, 5,],
        'iterations': [50, 100, 150]}

In [134]:
best_model_cb.grid_search(grid,train_dataset)

[1;30;43mВыходные данные были обрезаны до нескольких последних строк (5000).[0m
21:	learn: 0.4508087	test: 0.4564342	best: 0.4564342 (21)	total: 685ms	remaining: 3.98s
22:	learn: 0.4499286	test: 0.4556512	best: 0.4556512 (22)	total: 710ms	remaining: 3.92s
23:	learn: 0.4492051	test: 0.4550185	best: 0.4550185 (23)	total: 740ms	remaining: 3.89s
24:	learn: 0.4486142	test: 0.4543849	best: 0.4543849 (24)	total: 770ms	remaining: 3.85s
25:	learn: 0.4477397	test: 0.4535039	best: 0.4535039 (25)	total: 800ms	remaining: 3.82s
26:	learn: 0.4472187	test: 0.4530662	best: 0.4530662 (26)	total: 836ms	remaining: 3.81s
27:	learn: 0.4467662	test: 0.4526833	best: 0.4526833 (27)	total: 861ms	remaining: 3.75s
28:	learn: 0.4462683	test: 0.4521363	best: 0.4521363 (28)	total: 886ms	remaining: 3.7s
29:	learn: 0.4458809	test: 0.4517019	best: 0.4517019 (29)	total: 914ms	remaining: 3.65s
30:	learn: 0.4456175	test: 0.4514264	best: 0.4514264 (30)	total: 943ms	remaining: 3.62s
31:	learn: 0.4452997	test: 0.4511806	be

{'params': {'depth': 10,
  'l2_leaf_reg': 5,
  'iterations': 150,
  'learning_rate': 0.1},
 'cv_results': defaultdict(list,
             {'iterations': [0,
               1,
               2,
               3,
               4,
               5,
               6,
               7,
               8,
               9,
               10,
               11,
               12,
               13,
               14,
               15,
               16,
               17,
               18,
               19,
               20,
               21,
               22,
               23,
               24,
               25,
               26,
               27,
               28,
               29,
               30,
               31,
               32,
               33,
               34,
               35,
               36,
               37,
               38,
               39,
               40,
               41,
               42,
               43,
               44,
               45

In [137]:
pred_best_cb = best_model_cb.predict_proba(Xtest)[:, 1]

In [138]:
roc_auc_score(ytest, pred_best_cb)

0.7310888575276207

In [140]:
myparams = {'depth': 10,
  'l2_leaf_reg': 5,
  'iterations': 150,
  'learning_rate': 0.1}

In [142]:
modell = CatBoostClassifier(**myparams, cat_features=cat_features).fit(Xtrain, ytrain)

0:	learn: 0.6534158	total: 94.4ms	remaining: 14.1s
1:	learn: 0.6205455	total: 133ms	remaining: 9.85s
2:	learn: 0.5937637	total: 179ms	remaining: 8.75s
3:	learn: 0.5713573	total: 212ms	remaining: 7.75s
4:	learn: 0.5528302	total: 257ms	remaining: 7.46s
5:	learn: 0.5371012	total: 293ms	remaining: 7.03s
6:	learn: 0.5239874	total: 333ms	remaining: 6.81s
7:	learn: 0.5128473	total: 371ms	remaining: 6.59s
8:	learn: 0.5036077	total: 393ms	remaining: 6.16s
9:	learn: 0.4958072	total: 406ms	remaining: 5.69s
10:	learn: 0.4890004	total: 422ms	remaining: 5.33s
11:	learn: 0.4829043	total: 443ms	remaining: 5.1s
12:	learn: 0.4775670	total: 485ms	remaining: 5.11s
13:	learn: 0.4726805	total: 526ms	remaining: 5.11s
14:	learn: 0.4691036	total: 545ms	remaining: 4.91s
15:	learn: 0.4657419	total: 570ms	remaining: 4.77s
16:	learn: 0.4627295	total: 610ms	remaining: 4.77s
17:	learn: 0.4606288	total: 626ms	remaining: 4.59s
18:	learn: 0.4593753	total: 642ms	remaining: 4.43s
19:	learn: 0.4575932	total: 664ms	remaini

In [143]:
predd = modell.predict_proba(Xtest)[:, 1]

In [144]:
roc_auc_score(ytest, predd)

0.7310888575276207

In [151]:
best_model_cb = catboost.CatBoostClassifier(loss_function='CrossEntropy')

In [152]:
best_model_cb.grid_search(grid,train_dataset)

[1;30;43mВыходные данные были обрезаны до нескольких последних строк (5000).[0m
21:	learn: 0.4508087	test: 0.4564342	best: 0.4564342 (21)	total: 1.07s	remaining: 6.23s
22:	learn: 0.4499286	test: 0.4556512	best: 0.4556512 (22)	total: 1.12s	remaining: 6.16s
23:	learn: 0.4492051	test: 0.4550185	best: 0.4550185 (23)	total: 1.17s	remaining: 6.12s
24:	learn: 0.4486142	test: 0.4543849	best: 0.4543849 (24)	total: 1.21s	remaining: 6.06s
25:	learn: 0.4477397	test: 0.4535039	best: 0.4535039 (25)	total: 1.27s	remaining: 6.05s
26:	learn: 0.4472187	test: 0.4530662	best: 0.4530662 (26)	total: 1.33s	remaining: 6.06s
27:	learn: 0.4467662	test: 0.4526833	best: 0.4526833 (27)	total: 1.38s	remaining: 6s
28:	learn: 0.4462683	test: 0.4521363	best: 0.4521363 (28)	total: 1.4s	remaining: 5.85s
29:	learn: 0.4458809	test: 0.4517019	best: 0.4517019 (29)	total: 1.43s	remaining: 5.71s
30:	learn: 0.4456175	test: 0.4514264	best: 0.4514264 (30)	total: 1.46s	remaining: 5.59s
31:	learn: 0.4452997	test: 0.4511806	best:

{'params': {'depth': 10,
  'l2_leaf_reg': 5,
  'iterations': 150,
  'learning_rate': 0.1},
 'cv_results': defaultdict(list,
             {'iterations': [0,
               1,
               2,
               3,
               4,
               5,
               6,
               7,
               8,
               9,
               10,
               11,
               12,
               13,
               14,
               15,
               16,
               17,
               18,
               19,
               20,
               21,
               22,
               23,
               24,
               25,
               26,
               27,
               28,
               29,
               30,
               31,
               32,
               33,
               34,
               35,
               36,
               37,
               38,
               39,
               40,
               41,
               42,
               43,
               44,
               45

In [154]:
answer = best_model_cb.predict_proba(Xtest)[:, 1]

In [155]:
roc_auc_score(ytest, answer)

0.7310888575276207

## Quiz

Поделитесь своими результатами!