In [1]:
import warnings
import numpy as np
import pandas as pd
import seaborn as sns
import plotly_express as px
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import roc_auc_score
from catboost import CatBoostClassifier
from catboost import cv
from catboost import Pool
from matplotlib import pyplot as plt
from sklearn.preprocessing import scale


from plotly.offline import init_notebook_mode, iplot
init_notebook_mode(connected=True)
import plotly.graph_objs as go


%matplotlib inline


The model is built from scratch.

---
What has NOT been done:
- DepTime_cat  24 категорий, удалены хвосты
- DepTime_cat - 24 категорий, удалены хвосты
- Флаги на месяцах
- Dep_minute
- DepTimeLog
- DepTimepairs - как категориальный фич
- DepTimesqrt
---

## Ход тестирования


1. Стандартный baseline: 0.7612620434
2. Отскейленые DepTime и Distance: 0.7612620434
3. Отскейленые DepTime, Distance, DepTimeLog: 0.7605898869
4. Отскейленые DepTime, Distance и дропнутый хвост ['DepTime'] >= 330: 0.7655446758
5. Отскейленые DepTime, Distance и дропнутый хвост ['DepTime'] >= 500: 0.7612620434
6. Отскейленые DepTime, Distance и дропнутый хвост ['DepTime'] >= 450: 0.7605665271
7. Отскейленые DepTime, Distance и дропнутый хвост ['DepTime'] >= 450 & ['DepTime'] <= 80: 0.7563055661
8. Отскейленые DepTime, Distance и дропнутый хвост ['DepTime'] >= 350: 0.7603290604
8. Отскейленые DepTime, Distance и дропнутый хвост ['DepTime'] >= 300: 0.7615391716

***Дропнутый хвост ['DepTime'] >= 330 с лучшим рез. на отложенной выборке приводит к падению на 0.0925 на kaggle по
сравнению со стандартным baseline, не говоря уже о других порогах (500, 450, 350, 300)***
9. Отскейленые DepTime, Distance, DepTimesq: 0.7605898869
10. Отскейленые DepTime, Distance, DepTimesqrt: 0.7605898869
11. DepTime, Distance, DepTimesqrt: 0.7605898869
12. Отскейленые DepTime, Distance; DepTime_cat (категориальный): 0.7769286607; CV_3_folds: 0.7763139 (999)
13. Отскейленые DepTime, Distance; DepTime_cat (категориальный), дропнутые хвосты DepTime_cat: 0.7840374983. На kaggle
ПАДЕНИЕ на 0.0728 по сравнению с п. 12
14. Отскейленые DepTime, Distance, DepTimeLog; DepTime_cat (категориальный): 0.7769629497. На kaggle, по
сравнению с п. 12, ПАДЕНИЕ на 0.00048
15. Отскейленые DepTime, Distance; DepTime_cat (категор.), distance_cat (категор.): 0.7757000176; kaggle: 
16. Отскейленые DepTime, Distance; distance_cat DepTime_cat (категор.): 0.7741272155
17. Отскейленые DepTime, Distance; DepTime_cat (категор.), distance_cat (категор.), дропнутые хвосты distance_cat: 
0.778180006; CV_3_folds: 0.7746258 (999)
18. Отскейленые DepTime, Distance; DepTime_cat (категор.), DepTimepairs (категор.): 0.7772042563
; CV_3_folds: 0.7762758 (999)
19. Отскейленые DepTime, Distance, 'Dep_minute'; DepTime_cat (категор.): ***0.8055365684; CV_3_folds: 0.7865618; 
kaggle: 0.75972***
20. DepTime, Distance, 'Dep_minute'; DepTime_cat (категор.): CV_3_folds: 0.7865618 (999)
21. Отскейленые DepTime, Distance, 'Dep_minute'; DepTime_cat (категор.), флаг 'Months [6,7,12]': kaggle: 0.75727




In [2]:
# Load the raw train/test tables.
train_df_fd = pd.read_csv('flight_delays_train.csv')
test_df_fd = pd.read_csv('flight_delays_test.csv')

# Route feature: origin and destination joined into one string.
for frame in (train_df_fd, test_df_fd):
    frame['flight'] = frame['Origin'] + '-->' + frame['Dest']

# Binary target: 'Y' -> 1, 'N' -> 0.
train_df_fd['dep_delayed_15min'] = train_df_fd['dep_delayed_15min'].map({'Y' : 1, 'N' : 0})

# Positions of the string-typed feature columns (target excluded).
# This lookup is done BEFORE the calendar columns are converted to integers
# below, so Month, DayofMonth and DayOfWeek remain in the categorical index list.
categ_feat_idx = np.where(train_df_fd.drop('dep_delayed_15min', axis=1).dtypes == 'object')[0]

# Convert the 'c-<n>' labels of the calendar columns to the integer n.
# The result is assigned back to the column explicitly: calling
# `frame.Month.replace(..., inplace=True)` mutates a temporary Series, which
# pandas deprecates and which has no effect on the frame under Copy-on-Write.
calendar_sizes = {'Month': 12, 'DayofMonth': 31, 'DayOfWeek': 7}
for col, n_values in calendar_sizes.items():
    label_to_int = {'c-%d' % i: i for i in range(1, n_values + 1)}
    for frame in (train_df_fd, test_df_fd):
        converted = frame[col].map(label_to_int)
        # Fail loudly instead of leaving a mix of strings and numbers behind.
        if converted.isna().any():
            raise ValueError('Unexpected label in column %r' % col)
        frame[col] = converted.astype('int64')

In [360]:
# Drop the tails (currently disabled)

# train_df_fd  = train_df_fd[train_df_fd['DepTime'] >= 330]

In [10]:
# Departure-time features, added to the train and test frames alike.
# DepTime_cat is floor(DepTime / 100) and Dep_minute is DepTime % 100
# (presumably the hour and minute of an HHMM-encoded time -- TODO confirm).
for frame in (train_df_fd, test_df_fd):
    frame['DepTime_cat'] = np.floor(frame.DepTime / 100).astype('int')
    frame['Dep_minute'] = frame['DepTime'] % 100

# --- Disabled experiments: distance buckets and tail removal ---
# train_df_fd = train_df_fd[(train_df_fd['DepTime_cat'] > 4) & (train_df_fd['DepTime_cat'] < 24)]
# train_df_fd['distance_cat'] = np.floor(train_df_fd.Distance / 200).astype('int')
# test_df_fd['distance_cat'] = np.floor(test_df_fd.Distance / 200).astype('int')
# train_df_fd = train_df_fd[train_df_fd['distance_cat'] < 15]

# --- Disabled experiments: month flags ---
# train_df_fd['Months [6,7,12]'] = train_df_fd['Month'].apply(lambda x: 1 if x >= 6 \
#                                                          and x <= 7 or x == 12 else 0)
# test_df_fd['Months [6,7,12]'] = test_df_fd['Month'].apply(lambda x: 1 if x >= 6 \
#                                                        and x <= 7  or x == 12 else 0)
# train_df_fd['Months [2,3]'] = train_df_fd['Month'].apply(lambda x: 1 if x >= 2 \
#                                                          and x <= 3 else 0)
# test_df_fd['Months [2,3]'] = test_df_fd['Month'].apply(lambda x: 1 if x >= 2 \
#                                                        and x <= 3 else 0)

# --- Disabled experiments: other DepTime transforms ---
# train_df_fd['DepTimeLog'] = train_df_fd['DepTime'].apply(lambda x: np.log(x))
# test_df_fd['DepTimeLog'] = test_df_fd['DepTime'].apply(lambda x: np.log(x))
# train_df_fd['DepTimepairs'] = train_df_fd['DepTime'] * np.append(train_df_fd['DepTime'][1:], 1)
# test_df_fd['DepTimepairs'] = test_df_fd['DepTime'] * np.append(test_df_fd['DepTime'][1:], 1)
# train_df_fd['DepTimesqrt'] = train_df_fd['DepTime'].apply(lambda x: np.sqrt(x))
# test_df_fd['DepTimesqrt'] = test_df_fd['DepTime'].apply(lambda x: np.sqrt(x))

# train_df_fd['DepTimesq'] = train_df_fd['DepTime'].apply(lambda x: x**2)
# test_df_fd['DepTimesq'] = test_df_fd['DepTime'].apply(lambda x: x**2)

In [362]:
# Standardise the real-valued features.
#
# The mean and standard deviation are estimated on the TRAINING frame only and
# then applied to both frames. Scaling the test frame with its own statistics
# maps the same raw value to different scaled values in train and test, so the
# thresholds the model learns on train do not line up with the test features.
#
# Other engineered columns (distance_cat, DepTimesqrt, DepTimesq, DepTimeLog)
# can be added to the tuple below when the corresponding experiment is enabled.
for col in ('DepTime', 'Distance', 'Dep_minute'):
    # Take the statistics before the training column is overwritten.
    center = train_df_fd[col].mean()
    spread = train_df_fd[col].std(ddof=0)  # population std, as sklearn's scale() uses
    if spread == 0:
        spread = 1.0  # constant column: centre only, avoid division by zero
    train_df_fd[col] = (train_df_fd[col] - center) / spread
    test_df_fd[col] = (test_df_fd[col] - center) / spread

In [4]:
# Separate the label column from the feature frame.
target_name = 'dep_delayed_15min'
y_train = train_df_fd[target_name]
train_df_fd = train_df_fd.drop(columns=[target_name])

In [5]:
# DepTime_cat holds integers, so a dtype == 'object' lookup does not pick it up;
# register it as a categorical feature explicitly. The position is looked up by
# column name rather than hard-coded, and np.union1d keeps the index list
# sorted and duplicate-free, so re-running this cell does not append it twice.
dep_time_cat_idx = train_df_fd.columns.get_loc('DepTime_cat')
categ_feat_idx = np.union1d(categ_feat_idx, [dep_time_cat_idx])

# 70/30 hold-out split with a fixed seed.
X_train_part, X_valid, y_train_part, y_valid = train_test_split(train_df_fd, y_train, 
                                                                test_size=0.3, 
                                                                random_state=17)
len(X_train_part.columns), X_train_part.columns, categ_feat_idx

(12, Index(['Month', 'DayofMonth', 'DayOfWeek', 'DepTime', 'UniqueCarrier',
        'Origin', 'Dest', 'Distance', 'flight', 'DepTime_cat',
        'Months [6,7,12]', 'Dep_minute'],
       dtype='object'), array([0, 1, 2, 4, 5, 6, 8, 9], dtype=int64))

In [365]:
# Fit CatBoost on the training part and track AUC on the hold-out part.
# (Put a `%%time` cell magic on the first line of the cell to time the fit.)
ctb = CatBoostClassifier(eval_metric='AUC', random_seed=17)
holdout = (X_valid, y_valid)
ctb.fit(
    X_train_part,
    y_train_part,
    cat_features=categ_feat_idx,
    eval_set=holdout,
    use_best_model=True,
    early_stopping_rounds=500,
    verbose=200,
    plot=False,
);

Learning rate set to 0.121895
0:	test: 0.6622868	best: 0.6622868 (0)	total: 68.4ms	remaining: 1m 8s
200:	test: 0.7801575	best: 0.7801715 (199)	total: 33.9s	remaining: 2m 14s
400:	test: 0.7923921	best: 0.7923921 (400)	total: 1m 6s	remaining: 1m 39s
600:	test: 0.7975710	best: 0.7975710 (600)	total: 1m 37s	remaining: 1m 5s
800:	test: 0.8032851	best: 0.8032851 (800)	total: 2m 14s	remaining: 33.5s
999:	test: 0.8054953	best: 0.8055366 (995)	total: 2m 50s	remaining: 0us

bestTest = 0.8055365684
bestIteration = 995

Shrink model to first 996 iterations.


In [34]:
# 1. Standard baseline

Learning rate set to 0.121895
0:	test: 0.6604672	best: 0.6604672 (0)	total: 174ms	remaining: 2m 53s
200:	test: 0.7504506	best: 0.7504506 (200)	total: 25.1s	remaining: 1m 39s
400:	test: 0.7555314	best: 0.7555350 (397)	total: 50.7s	remaining: 1m 15s
600:	test: 0.7589069	best: 0.7589069 (600)	total: 1m 16s	remaining: 51s
800:	test: 0.7602943	best: 0.7603248 (799)	total: 1m 43s	remaining: 25.7s
999:	test: 0.7611861	best: 0.7612620 (975)	total: 2m 12s	remaining: 0us

bestTest = 0.7612620434
bestIteration = 975

Shrink model to first 976 iterations.


In [44]:
# 2. Scaled DepTime and Distance

Learning rate set to 0.121895
0:	test: 0.6604672	best: 0.6604672 (0)	total: 173ms	remaining: 2m 52s
200:	test: 0.7504506	best: 0.7504506 (200)	total: 26.6s	remaining: 1m 45s
400:	test: 0.7555314	best: 0.7555350 (397)	total: 52.6s	remaining: 1m 18s
600:	test: 0.7589069	best: 0.7589069 (600)	total: 1m 18s	remaining: 52.3s
800:	test: 0.7602943	best: 0.7603248 (799)	total: 1m 45s	remaining: 26.2s
999:	test: 0.7611861	best: 0.7612620 (975)	total: 2m 16s	remaining: 0us

bestTest = 0.7612620434
bestIteration = 975

Shrink model to first 976 iterations.


In [53]:
# 3. Scaled DepTime, Distance, DepTimeLog

ctb.get_feature_importance(prettified=True)

Unnamed: 0,Feature Id,Importances
0,DepTime,20.245069
1,UniqueCarrier,11.787557
2,DepTimeLog,10.888502
3,Origin,10.359236
4,Dest,9.921822
5,Distance,8.918329
6,flight,7.832368
7,Month,7.466267
8,DayofMonth,7.064207
9,DayOfWeek,5.516644


In [61]:
# 4. Scaled DepTime, Distance, and dropped tail ['DepTime'] >= 330

Unnamed: 0,Feature Id,Importances
0,DepTime,29.590615
1,UniqueCarrier,12.272846
2,Origin,10.010156
3,Distance,9.495261
4,Dest,9.304766
5,flight,8.188251
6,Month,8.01714
7,DayofMonth,6.939387
8,DayOfWeek,6.181577


In [308]:
# 15. Scaled DepTime, Distance, DepTimeLog, DepTime_cat (categorical), distance_cat (categorical)
# NOTE(review): item 15 of the experiment log does not list DepTimeLog -- confirm which description is right.


ctb.get_feature_importance(prettified=True)

Unnamed: 0,Feature Id,Importances
0,DepTime_cat,15.956945
1,DepTime,14.943196
2,UniqueCarrier,11.338192
3,Origin,10.596559
4,flight,9.676715
5,Dest,9.416725
6,Month,6.260126
7,Distance,6.20033
8,DayofMonth,6.079388
9,DayOfWeek,5.6567


In [374]:
# 20. Scaled DepTime, Distance, 'Dep_minute'; DepTime_cat (categorical)
# NOTE(review): in the experiment log the scaled variant is item 19 and item 20 is unscaled -- confirm the number.

ctb.get_feature_importance(prettified=True)

Unnamed: 0,Feature Id,Importances
0,DepTime_cat,22.206985
1,Dep_minute,16.942413
2,flight,13.964117
3,UniqueCarrier,10.553029
4,DepTime,7.680481
5,Origin,7.023748
6,Dest,5.386369
7,Distance,4.425941
8,Month,3.982001
9,DayOfWeek,3.979191


In [375]:
# 3-fold stratified cross-validation on the full training set, scored by AUC.
params = dict(
    loss_function='Logloss',
    eval_metric='AUC',
    verbose=200,
    random_seed=17,
)
all_train_data = Pool(data=train_df_fd, label=y_train, cat_features=categ_feat_idx)

# stratified=True keeps the class proportions the same in every fold.
cv_options = dict(fold_count=3, seed=17, shuffle=True, stratified=True, plot=False)
scores = cv(pool=all_train_data, params=params, **cv_options)

0:	test: 0.6808951	best: 0.6808951 (0)	total: 1.2s	remaining: 20m 1s
200:	test: 0.7658020	best: 0.7658020 (200)	total: 3m 11s	remaining: 12m 42s
400:	test: 0.7752498	best: 0.7752498 (400)	total: 6m 4s	remaining: 9m 4s
600:	test: 0.7813799	best: 0.7813799 (600)	total: 9m 19s	remaining: 6m 11s
800:	test: 0.7841923	best: 0.7841923 (800)	total: 12m 29s	remaining: 3m 6s
999:	test: 0.7865618	best: 0.7865618 (999)	total: 15m 35s	remaining: 0us


In [8]:
# Refit the classifier on the whole training set before predicting test labels.
# NOTE(review): no eval_set is passed here -- confirm whether
# early_stopping_rounds has any effect in this call.
final_fit_options = {
    'cat_features': categ_feat_idx,
    'early_stopping_rounds': 200,
    'verbose': 200,
}
ctb.fit(train_df_fd, y_train, **final_fit_options);


Learning rate set to 0.061677
0:	total: 217ms	remaining: 3m 36s
200:	total: 34.1s	remaining: 2m 15s
400:	total: 1m 11s	remaining: 1m 47s
600:	total: 1m 52s	remaining: 1m 14s
800:	total: 2m 33s	remaining: 38.1s
999:	total: 3m 16s	remaining: 0us


In [9]:
# Probability of the positive class (second column of predict_proba) per test row.
ctb_test_pred = ctb.predict_proba(test_df_fd)[:, 1]

# Put the predictions into the sample-submission layout and write the file.
with warnings.catch_warnings():
    warnings.simplefilter("ignore")

    sample_sub = (
        pd.read_csv('sample_submission.csv', index_col='id')
        .assign(dep_delayed_15min=ctb_test_pred)
    )
    sample_sub.to_csv('ctb_pred.csv')


array([0, 1, 2, 4, 5, 6, 8, 9], dtype=int64)