In [1]:
import os
import numpy as np
import pandas as pd
from tqdm import tqdm
from catboost import CatBoostRegressor
import matplotlib.pyplot as plt

# Prediction analysis

In [2]:
airports = ['KATL', 'KCLT', 'KDEN', 'KDFW', 'KJFK', 'KMEM', 'KMIA', 'KORD', 'KPHX', 'KSEA']

# Per-airport model version -> whether `etd_time_till_est_dep` is added to the
# raw model output (done for v0 and v2, not for v1).
version_adds_etd = {0: True, 1: False, 2: True}

# Error column -> prediction column it scores. err_3 scores the plain average
# of the three per-airport models, err_4 scores the global model.
error_sources = {
    'err_0': 'prediction_0',
    'err_1': 'prediction_1',
    'err_2': 'prediction_2',
    'err_3': 'prediction_3',
    'err_4': 'global_prediction',
}

# Predictions are truncated to int and clipped to this range before scoring.
PRED_MIN, PRED_MAX = 1, 299

# Rows with a timestamp on/after this date are labelled "test", the rest "train".
TEST_START = "2022-09-01"

prediction_columns = ['airport', 'timestamp',
                      'split', 'minutes_until_pushback',
                      'prediction_0',
                      'prediction_1',
                      'prediction_2',
                      'prediction_3',
                      'global_prediction']

# The global model does not depend on the airport, so load it once instead of
# re-reading it from disk on every loop iteration.
global_model = CatBoostRegressor()
global_model.load_model('../data/06_models/global_model')

# Collect per-airport pieces and concatenate once after the loop; growing a
# DataFrame with pd.concat inside the loop re-copies all accumulated rows each
# time and starts from an empty frame, which newer pandas warns about.
results_parts = []
predictions_parts = []

for airport in tqdm(airports):

    master = pd.read_parquet(f'../data/05_model_input/master_{airport}.pq')

    # Per-airport models v0..v2.
    for version, adds_etd in version_adds_etd.items():
        airport_model = CatBoostRegressor()
        airport_model.load_model(f'../../submission/models/{airport}_model_v{version}')
        raw_prediction = airport_model.predict(master[airport_model.feature_names_])
        if adds_etd:
            master[f'prediction_{version}'] = raw_prediction + master['etd_time_till_est_dep']
        else:
            master[f'prediction_{version}'] = raw_prediction

    # Simple average of the three per-airport models.
    master['prediction_3'] = (master['prediction_0'] +
                              master['prediction_1'] +
                              master['prediction_2']) / 3

    # The global model is fed the airport code through `feat_cat_airport`.
    master['feat_cat_airport'] = airport
    master['global_prediction'] = global_model.predict(master[global_model.feature_names_])

    master["split"] = "train"
    master.loc[master["timestamp"] >= TEST_START, "split"] = "test"

    # Absolute error of the truncated, clipped prediction against the target.
    for err_col, pred_col in error_sources.items():
        clipped = master[pred_col].astype(int).clip(PRED_MIN, PRED_MAX)
        master[err_col] = abs(clipped - master['minutes_until_pushback'])

    # Mean absolute error per split for every prediction variant.
    res = master.groupby("split")[list(error_sources)].mean().reset_index()
    res.insert(0, 'airport', airport)
    results_parts.append(res)

    master['airport'] = airport
    predictions_parts.append(master[prediction_columns])

# `ignore_index=True` gives a clean 0..n-1 index up front, so `best` is
# assigned against unique labels rather than the repeated per-airport
# group indices.
results = pd.concat(results_parts, ignore_index=True)
predictions = pd.concat(predictions_parts)

# Position (0-3) of the lowest-error per-airport variant; the global model
# (err_4) is intentionally not part of this comparison, as in the original.
results['best'] = results[['err_0',
                           'err_1',
                           'err_2',
                           'err_3']].apply(lambda x: np.argmin(x), axis=1)

# Keep `model` bound to the global model, which is what the name referred to
# once the original loop finished, in case later cells use it.
model = global_model

results

100%|███████████████████████████████████████████████████████████████████████████████| 10/10 [1:02:31<00:00, 375.11s/it]


Unnamed: 0,airport,split,err_0,err_1,err_2,err_3,err_4,best
0,KATL,test,8.022737,8.01091,8.022737,7.972963,21.594849,3
1,KATL,train,8.317463,8.538818,8.317463,8.337018,22.80626,0
2,KCLT,test,9.372993,9.33603,9.372993,9.283685,22.288479,3
3,KCLT,train,9.342829,9.980115,9.342829,9.465398,23.562944,0
4,KDEN,test,10.169215,10.288024,10.169215,10.128234,23.51798,3
5,KDEN,train,10.39901,10.847324,10.39901,10.469739,25.871643,0
6,KDFW,test,11.416071,11.467684,11.416071,11.361962,26.023106,3
7,KDFW,train,11.412534,11.708426,11.412534,11.455167,25.899617,0
8,KJFK,test,11.865756,11.516387,11.761775,11.548746,23.493132,1
9,KJFK,train,12.596222,13.181846,12.402834,12.57065,26.41634,2
