In [1]:
import os
import numpy as np
import pandas as pd
from tqdm import tqdm
from catboost import CatBoostRegressor
import matplotlib.pyplot as plt

In [2]:
# Airport code for this run; its lower-cased form selects the master parquet file to load.
airport = 'KMEM'

In [3]:
# Master model-input table for the selected airport (file name uses the lower-cased code).
master_path = f'../data/05_model_input/master_{airport.lower()}.pq'
x = pd.read_parquet(master_path)

# Name of the target column.
target_name = 'minutes_until_pushback'

# Split boundary: rows with timestamp < end_train are used for training,
# rows with timestamp >= end_train for validation.
end_train = "2022-09-01"

In [None]:
# ---------------------------------------------------------------------------
# Grid search over (eta, depth, rsm) for a CatBoost MAE regressor.
# The regression target is the difference between `target_name` and the
# `etd_time_till_est_dep` feature, i.e. the model predicts a correction on top
# of that feature rather than the raw target.
# ---------------------------------------------------------------------------

# Keep only rows after the 2020-11-02 cutoff whose target is not exactly 0.
after_cutoff = x['timestamp'] > '2020-11-02'
target_nonzero = x[target_name] != 0
x = x[after_cutoff & target_nonzero]

# Every column except the identifier, the timestamp and the target is a feature.
# list.remove raises ValueError if one of these columns is unexpectedly absent.
feat_names = list(x.columns)
for non_feature in ("gufi", "timestamp", target_name):
    feat_names.remove(non_feature)

# Positional indices (within feat_names) of the columns flagged as categorical
# by the '_cat_' naming convention.
cat_features = [idx for idx, name in enumerate(feat_names) if '_cat_' in name]

# Time-based split: strictly before end_train -> train, otherwise -> validation.
in_train = x["timestamp"] < end_train
in_val = x["timestamp"] >= end_train

x_train = x.loc[in_train, feat_names]
y_train = x.loc[in_train, target_name] - x_train['etd_time_till_est_dep']
x_val = x.loc[in_val, feat_names]
y_val = x.loc[in_val, target_name] - x_val['etd_time_till_est_dep']

banner = "#" * 20
pad = ' ' * 5
print(banner + pad + "training with ", x_train.shape, pad + banner)
print(banner + pad + "validating with ", x_val.shape, pad + banner)


# Hyper-parameters shared by every point of the grid.
# NOTE(review): has_time=True makes CatBoost rely on the row order of the
# training data -- confirm the master table is sorted by timestamp.
fixed_params = dict(max_leaves=21,
                    l2_leaf_reg=5,
                    min_data_in_leaf=5000,
                    n_estimators=20000,
                    task_type='CPU',
                    thread_count=-1,
                    grow_policy='Lossguide',
                    has_time=True,
                    random_seed=4,
                    loss_function='MAE',
                    boosting_type='Plain',
                    max_ctr_complexity=12,
                    bootstrap_type='Bernoulli',
                    subsample=0.8)

# Flattened grid, enumerated in the same order as three nested loops
# (eta outermost, rsm innermost).
param_grid = [(eta, depth, rsm)
              for eta in [0.005, 0.01, 0.03]
              for depth in [7, 9, 11]
              for rsm in [0.7, 0.9, 1]]

best_params = None
best_score = 1e9

# NOTE(review): the validation set drives both early stopping and the choice of
# the best grid point, so best_score is an optimistic estimate.
for eta, depth, rsm in param_grid:
    model = CatBoostRegressor(eta=eta, depth=depth, rsm=rsm, **fixed_params)

    model.fit(x_train, y_train,
              eval_set=(x_val, y_val),
              use_best_model=True,
              verbose=50,
              cat_features=cat_features,
              early_stopping_rounds=60)

    # Best validation MAE reached by this configuration.
    val_mae = model.best_score_['validation']['MAE']
    if val_mae < best_score:
        best_score = val_mae
        best_params = {'eta': eta, 'depth': depth, 'rsm': rsm}
        # Report every new incumbent as soon as it is found.
        print(best_score)
        print(best_params)

####################     training with  (1275303, 294)      ####################
####################     validating with  (80802, 294)      ####################
0:	learn: 20.8236633	test: 14.8924156	best: 14.8924156 (0)	total: 1.82s	remaining: 10h 6m 12s
50:	learn: 19.8125660	test: 14.1333130	best: 14.1333130 (50)	total: 1m 12s	remaining: 7h 51m 34s
100:	learn: 19.1505934	test: 13.6444781	best: 13.6444781 (100)	total: 2m 23s	remaining: 7h 50m 46s
150:	learn: 18.7093672	test: 13.3446349	best: 13.3446349 (150)	total: 3m 34s	remaining: 7h 49m 52s
200:	learn: 18.3906932	test: 13.1205332	best: 13.1205332 (200)	total: 4m 50s	remaining: 7h 56m 39s
250:	learn: 18.1473011	test: 12.9468761	best: 12.9468761 (250)	total: 6m 8s	remaining: 8h 3m 34s
300:	learn: 17.9559431	test: 12.7855669	best: 12.7855669 (300)	total: 7m 27s	remaining: 8h 8m 26s
350:	learn: 17.8032214	test: 12.6742701	best: 12.6742701 (350)	total: 8m 58s	remaining: 8h 22m 14s
400:	learn: 17.6719252	test: 12.5831149	best: 12.5831149