In [None]:
from datetime import timedelta
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from tqdm import tqdm
from tqdm.contrib.concurrent import process_map

from sklearn.ensemble import RandomForestRegressor

from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
import xgboost as xgb

In [None]:
# Airport codes handled by this notebook. Each code is the filename prefix of
# the per-airport CSV files read from DATA_DIRECTORY.
airports = ["KATL", "KCLT", "KDEN", "KDFW", "KJFK", "KMEM", "KMIA", "KORD", "KPHX", "KSEA"]

# Folder containing every input CSV (currently the working directory).
DATA_DIRECTORY = Path("./")

In [None]:
# Load the submission template; the `timestamp` column is parsed into
# datetimes while reading.
submission_format_path = DATA_DIRECTORY / "submission_format.csv"
submission_format = pd.read_csv(submission_format_path, parse_dates=["timestamp"])
submission_format

In [None]:
# Hold the target column as float32 while predictions are being filled in.
submission_format["minutes_until_pushback"] = submission_format[
    "minutes_until_pushback"
].astype("float32")
submission_format

In [None]:
# Print the column dtypes and non-null counts of the submission frame.
submission_format.info()

In [None]:
# Columns that identify one row of the submission file.
KEY_COLUMNS = ["gufi", "timestamp", "airport"]


def _minutes_to_estimated_departure(etd: pd.DataFrame) -> pd.Series:
    """Return the minutes between each row's timestamp and its estimated departure.

    Parameters
    ----------
    etd : pd.DataFrame
        Frame with `timestamp` and `departure_runway_estimated_time` columns.

    Returns
    -------
    pd.Series
        float32 minutes, clipped at 0 so an estimate that lies before the
        timestamp counts as "now". Values that cannot be parsed as datetimes
        are coerced to NaT, which leaves NaN in the result for those rows.
    """
    timestamp = pd.to_datetime(etd["timestamp"], errors="coerce")
    estimated_departure = pd.to_datetime(
        etd["departure_runway_estimated_time"], errors="coerce"
    )
    minutes = (estimated_departure - timestamp).dt.total_seconds() / 60
    return minutes.clip(lower=0).astype(np.float32)


# Fit one single-feature XGBoost model per airport and write its predictions
# into the matching rows of `submission_format`.
for airport in airports:
    print(airport)

    # --- data prep ---
    pushback = pd.read_csv(DATA_DIRECTORY / f"{airport}_etd.csv")
    pushback["etd_minus_timestamp"] = _minutes_to_estimated_departure(pushback)

    # --- train ---
    X = pushback["etd_minus_timestamp"].to_numpy().reshape(-1, 1)
    y = pushback["minutes_until_pushback"].to_numpy().astype(np.float32)

    xg_boost = xgb.XGBRegressor(
        objective="reg:squarederror",
        n_estimators=1000,
        max_depth=7,
        eta=0.1,
        subsample=0.7,
        colsample_bytree=0.8,
        random_state=42,
    )
    xg_boost.fit(X, y)

    # --- predict ---
    val_pushback = pd.read_csv(DATA_DIRECTORY / f"{airport}_val_etd.csv")
    # The merge below joins on `timestamp`, so it must be a datetime here too
    # (the submission frame was read with parse_dates).
    val_pushback["timestamp"] = pd.to_datetime(val_pushback["timestamp"], errors="coerce")
    val_features = _minutes_to_estimated_departure(val_pushback).to_numpy().reshape(-1, 1)

    # Build the prediction frame with an explicit column. Attribute-style
    # assignment would not create the column if the CSV does not already
    # have one.
    predictions = val_pushback[KEY_COLUMNS].assign(
        predicted_minutes=xg_boost.predict(val_features)
    )
    # A repeated key would make the left merge return more rows than
    # `submission_format` and misalign the write-back; keep the last
    # prediction for each key.
    predictions = predictions.drop_duplicates(subset=KEY_COLUMNS, keep="last")

    # --- write back into the submission ---
    merged_df = submission_format.merge(
        predictions, on=KEY_COLUMNS, how="left", validate="many_to_one"
    )
    # Rows belonging to other airports have no prediction and keep their
    # current value. The left merge preserves row order and (after validation)
    # row count, so a positional write-back is safe for any index.
    submission_format["minutes_until_pushback"] = (
        merged_df["predicted_minutes"]
        .fillna(merged_df["minutes_until_pushback"])
        .to_numpy()
    )

In [None]:
# Summary statistics of the submission frame after the per-airport loop.
submission_format.describe()

In [None]:
# Round to whole minutes and store the column as integers.
rounded_minutes = submission_format["minutes_until_pushback"].round()
submission_format["minutes_until_pushback"] = rounded_minutes.astype(int)
submission_format

In [None]:
# Replace negative predictions with 0, then show summary statistics.
non_negative_minutes = submission_format["minutes_until_pushback"].clip(lower=0)
submission_format["minutes_until_pushback"] = non_negative_minutes.astype(int)
submission_format.describe()

In [None]:
# Histogram of the predictions with one bin per minute. Everything above
# MAX_MINUTES_SHOWN is clipped so that it piles up in a final overflow bin.
MAX_MINUTES_SHOWN = 200

fig, ax = plt.subplots(figsize=(6, 4), dpi=150)
clipped_minutes = submission_format["minutes_until_pushback"].clip(
    lower=0, upper=MAX_MINUTES_SHOWN
)
# The bin edges must reach one past the clip limit. With edges ending at 199,
# every value clipped to 200 fell outside the last bin and was dropped.
clipped_minutes.hist(bins=np.arange(0, MAX_MINUTES_SHOWN + 2), ax=ax)
ax.set_title("Distribution of predicted minutes to pushback")
ax.set_ylabel("Number of predictions")
ax.set_xlabel("Minutes to pushback")
plt.show()

In [None]:
# Display the submission frame once more before it is written to disk.
submission_format

In [None]:
# Write the submission as a zip archive. The member name is given explicitly:
# when compression is only inferred from the ".zip" suffix, pandas derives the
# inner file name from the archive name, so it does not end in ".csv".
submission_format.to_csv(
    "xgbr_200.zip",
    index=False,
    compression={"method": "zip", "archive_name": "xgbr_200.csv"},
)