In [5]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split, GroupShuffleSplit
import csv
import pickle
import os
import time
from collections import deque
from sklearn.metrics import roc_auc_score
from xgboost import XGBClassifier

In [10]:
# ---------------------------------------------------------------------------
# Configuration cell: thresholds, airport groups, input/output locations, and
# creation of a fresh numbered run directory for this training run.
# ---------------------------------------------------------------------------

# Thresholds and airport lists.
# mae_thresh_bad / mae_thresh_good are absolute-error cut-offs applied to airports
# in bad_airports / all other airports respectively by the training loop.
threshold_ml_classifier_model = 0.50
mae_thresh_bad = 30
mae_thresh_good = 20
bad_airports = ["KDFW", "KJFK", "KMEM", "KMIA"]
good_airports = ["KATL", "KCLT", "KDEN", "KORD", "KPHX", "KSEA"]

# Free-text tags that end up in the saved model file names.
comment = "submission etd airlinecode taxitime"
model_type = "xgb classifier"

# Directories for loading labels and extracted training features.
# NOTE(review): the recorded cell output shows "Current Features" (with a space)
# while this path uses "Current_Features" — confirm which directory name is right.
raw_label_load_dir = "Data/"
indiv_features_load_dir = "Training_Extracted_Features/Current_Features/"

# Feature files are named "<root>_<airport>_<feature>.csv"; one root per key type.
unique_timepoint_features = ["taxitime_to_gate"]
timepoint_root = f"{indiv_features_load_dir}timepoint"

unique_gufi_features = ["airlinecode"]
gufi_root = f"{indiv_features_load_dir}gufi"

unique_timepointgufi_features = ["etd"]
timepointgufi_root = f"{indiv_features_load_dir}timepointgufi"

# Parent directory under which each run gets its own numbered sub-directory.
save_dir = "Models/"

# File path roots for the submission feature files.
indiv_features_load_dir_submission = "Indiv Engineered Features/ToCombineSubmission/"
timepoint_root_submission = f"{indiv_features_load_dir_submission}timepoint"
gufi_root_submission = f"{indiv_features_load_dir_submission}gufi"
timepointgufi_root_submission = f"{indiv_features_load_dir_submission}timepointgufi"

# Column headers of the prediction output file.
header = ["gufi", "timestamp", "airport", "minutes_until_pushback"]

# Create the first unused "<save_dir><run_id>/" directory.
# os.makedirs also creates save_dir itself when it does not exist yet (os.mkdir would
# raise FileNotFoundError on a fresh checkout), and attempting the creation directly
# instead of testing os.path.isdir first avoids a check-then-create race.
run_id = 0
while True:
    model_save_dir = f"{save_dir}{run_id}/"
    try:
        os.makedirs(model_save_dir)
        break
    except FileExistsError:
        run_id = run_id + 1
print(f"Model data will be saved at: {model_save_dir}")

# Initialize the overall prediction output file with only the header row.
overall_prediction_file_name_submission = f"{model_save_dir}overall_submission.csv"
with open(overall_prediction_file_name_submission, mode="w", newline="") as file:
    writer = csv.writer(file)
    writer.writerow(header)

# Function to split the 'gufi' field into separate columns
def split_gufi(curr_df):
    """Add columns derived from the dot-separated ``gufi`` string, in place.

    Adds ``plane_id``, ``departing_airport_code`` and ``arriving_airport_code``
    from the first three dot-separated segments of ``gufi``, and ``airline_code``
    from the first three characters of ``gufi``.

    Args:
        curr_df: DataFrame with a string column ``gufi``; it is modified in place.

    Returns:
        None.
    """
    # reindex guarantees that columns 0..2 exist, so a frame whose identifiers all
    # have fewer than three segments gets missing values instead of a KeyError.
    segments = curr_df['gufi'].str.split('.', expand=True).reindex(columns=[0, 1, 2])
    curr_df['plane_id'] = segments[0]
    curr_df['departing_airport_code'] = segments[1]
    curr_df['arriving_airport_code'] = segments[2]
    curr_df['airline_code'] = curr_df['gufi'].str[:3]

Model data will be saved at: Models/6/


In [11]:
list_airports = ["KATL", "KCLT", "KDEN", "KDFW", "KJFK", "KMEM", "KMIA", "KORD", "KPHX", "KSEA"]

# For every airport: load labels and features, fit an internal regressor to flag
# high-error rows, fit a second regressor without those rows, then fit a classifier
# that predicts whether the second regressor under-estimates. Both models and the
# median correction values are pickled into model_save_dir.
for airport in list_airports:
    print('-----------------------------')
    print(f'Doing airport: {airport}')
    feature_cols = ['unix_time']

    # ------------------------------------------------------------------
    # 1. Load training labels and feature files, merging them into df_data
    # ------------------------------------------------------------------
    raw_labels_load_file = f"{raw_label_load_dir}{airport}/train_labels_{airport}.csv"
    df_data = pd.read_csv(raw_labels_load_file, parse_dates=["timestamp"])
    print(f"Loaded training file from : {raw_labels_load_file}")

    ### ETD — keyed by (gufi, timestamp); default inner join drops unmatched label rows
    etd_file_path = f"{timepointgufi_root}_{airport}_etd.csv"
    df_etd = pd.read_csv(etd_file_path, parse_dates=["timestamp"])
    print(f"Loaded etd features from from : {etd_file_path}")
    df_data = pd.merge(df_data, df_etd, on=['gufi', 'timestamp'])
    etd_features = ['minutes_until_departure_from_timepoint', 'minutes_until_departure_from_timestamp', 'mean_departure_from_timepoint', 'std_departure_from_timepoint']
    feature_cols.extend(etd_features)

    ### airlinecode — keyed by gufi
    airlinecode_file_path = f"{gufi_root}_{airport}_airlinecode.csv"
    df_airlinecode = pd.read_csv(airlinecode_file_path)
    print(f"Loaded airlinecode features from from : {airlinecode_file_path}")
    # The airlinecode file carries its own 'airport' column; drop the existing one
    # first so the merge does not produce suffixed duplicates.
    df_data = df_data.drop(columns=['airport'])
    df_data = pd.merge(df_data, df_airlinecode, on=['gufi'], how="left")
    airlinecode_features = list(df_airlinecode.columns)
    airlinecode_features.remove('gufi')
    airlinecode_features.remove('airport')
    feature_cols.extend(airlinecode_features)

    ### taxitimetogate — keyed by timestamp only
    taxitime_to_gate_file_path = f"{timepoint_root}_{airport}_taxitime_to_gate.csv"
    df_taxitime_to_gate = pd.read_csv(taxitime_to_gate_file_path, parse_dates=["timestamp"])
    print(f"Loaded taxitime to gate features from from : {taxitime_to_gate_file_path}")
    df_data = pd.merge(df_data, df_taxitime_to_gate, on=['timestamp'], how="left")
    taxitime_to_gate_features = list(df_taxitime_to_gate.columns)
    if "Unnamed: 0" in list(taxitime_to_gate_features):
        taxitime_to_gate_features.remove("Unnamed: 0")
    taxitime_to_gate_features.remove('found_counts_taxitime_to_gate')
    taxitime_to_gate_features.remove('timestamp')
    feature_cols.extend(taxitime_to_gate_features)

    # ------------------------------------------------------------------
    # 2. Preprocess: drop the stray index column, derive unix_time, drop NaN rows
    # ------------------------------------------------------------------
    if "Unnamed: 0" in list(df_data):
        df_data.drop(columns=['Unnamed: 0'], inplace=True)

    # Integer cast of the timestamp floor-divided by 10**9.
    # NOTE(review): this yields epoch seconds only if the column has nanosecond
    # resolution — confirm for the pandas version in use.
    df_data['unix_time'] = df_data['timestamp'].astype(np.int64)  // 10**9
    df_data.dropna(inplace=True)
    df_data.reset_index(drop=True, inplace=True)

    # De-duplicate the feature names while keeping first-seen order. A set would give
    # an iteration order that depends on string hash randomisation, i.e. a different
    # column order (and feature order inside the pickled models) per kernel start.
    model_features = list(dict.fromkeys(feature_cols))

    label_col = ['minutes_until_pushback']
    to_save_dataset = ['gufi','timestamp','airport']
    to_save_dataset.extend(model_features)
    df_data_used = df_data[to_save_dataset]

    # ------------------------------------------------------------------
    # 3. Internal regressor: grouped 40/60 split by gufi, fit, report MAE and MSE
    # ------------------------------------------------------------------
    gss = GroupShuffleSplit(n_splits=1, train_size=0.4, test_size=0.6, random_state=42)
    train_index, test_index = next(gss.split(df_data, groups=df_data['gufi']))
    df_internal_train = df_data.iloc[train_index].copy()
    df_internal_test = df_data.iloc[test_index].copy()

    X_internal_train = df_internal_train[model_features]
    y_internal_train = df_internal_train[label_col]

    X_internal_test = df_internal_test[model_features]
    y_internal_test = df_internal_test[label_col]
    internal_regressor = xgb.XGBRegressor()
    print(f"Internal Regressor Len of Training: {len(X_internal_train)}")
    internal_regressor.fit(X_internal_train, y_internal_train)
    print(f"Internal Regressor Len of Prediction: {len(X_internal_test)}")
    # Predictions are rounded to whole numbers before scoring.
    y_internal_pred = np.int32(np.around(internal_regressor.predict(X_internal_test),decimals=0))
    internal_mae = mean_absolute_error(y_internal_test, y_internal_pred)
    internal_mse = mean_squared_error(y_internal_test, y_internal_pred)
    print(f"MAE: {internal_mae} & MSE: {internal_mse}")

    # ------------------------------------------------------------------
    # 4. Flag held-out rows whose absolute error exceeds the airport's threshold
    #    and train regressor_lower on the remaining rows only
    # ------------------------------------------------------------------
    df_internal_test['y_pred'] = y_internal_pred
    df_internal_test['mae'] = np.abs(df_internal_test['y_pred'] - df_internal_test['minutes_until_pushback'])
    df_internal_test['ml_model_used'] = 0
    if airport in bad_airports:
        mae_thresh = mae_thresh_bad
    else:
        mae_thresh = mae_thresh_good
    bool_mask = (df_internal_test['mae']  > mae_thresh)
    # Vectorised count; the built-in sum() would iterate the Series element by element.
    num_above_thresh = int(bool_mask.sum())
    print(f'Only {num_above_thresh} out of {len(df_internal_test)} were above the threshold of {mae_thresh} and were removed from training.')
    df_internal_test.loc[bool_mask,'ml_model_used'] = 1
    df_internal_test_lower = df_internal_test[df_internal_test.ml_model_used == 0].copy()
    df_internal_test_higher = df_internal_test[df_internal_test.ml_model_used == 1].copy()

    X_lower = df_internal_test_lower[model_features]
    y_lower = df_internal_test_lower[label_col]

    regressor_lower = xgb.XGBRegressor()
    print(f'Trained on {len(X_lower)} many datapoints.')
    regressor_lower.fit(X_lower, y_lower)

    # ------------------------------------------------------------------
    # 5. Save the outlier-filtered regressor
    # ------------------------------------------------------------------
    model_file_name = f"{model_save_dir}{airport}_{model_type}_NOOUTLIER_{comment}.pkl"
    # Context manager so the handle is flushed and closed before the next airport.
    with open(model_file_name, "wb") as model_file:
        pickle.dump(regressor_lower, model_file)

    # ------------------------------------------------------------------
    # 6. Over/under-estimation classifier on regressor_lower's predictions
    # ------------------------------------------------------------------
    classifier_upper_threshold = 0.50
    classifier_lower_threshold = 0.50
    # Predict on every held-out row, including those excluded from regressor_lower's fit.
    y_pred = np.int32(np.around(regressor_lower.predict(df_internal_test[model_features]),decimals=0))

    X_test = df_internal_test.copy(deep=True)
    X_test['pred_minutes_until_pushback'] = y_pred
    X_test['actual_minutes_until_pushback'] = X_test['minutes_until_pushback']
    # label_estimation: 1 when the prediction is below the actual value (under-estimate),
    # 0 otherwise (over-estimates and exact hits).
    X_test['label_estimation'] = 0
    X_test.loc[X_test['pred_minutes_until_pushback'] < X_test['actual_minutes_until_pushback'], 'label_estimation'] = 1
    num_over_samples = len(X_test[X_test['label_estimation'] == 0])
    num_under_samples = len(X_test[X_test['label_estimation'] == 1])
    print(f'Number of Overestimated Samples: {num_over_samples} & Number of Underestimated Samples: {num_under_samples}')

    # The classifier sees the regressor's features plus the regressor's own prediction.
    classifier_features = model_features + ['pred_minutes_until_pushback']
    estimate_classifier = xgb.XGBClassifier()
    estimate_classifier.fit(X_test[classifier_features], X_test['label_estimation'])
    # NOTE(review): probabilities below are computed on the same rows the classifier
    # was fitted on, so the medians are in-sample statistics.
    y_prob_estimate = estimate_classifier.predict_proba(X_test[classifier_features])[:, 1]
    y_pred_estimate = np.int32(np.around(estimate_classifier.predict(X_test[classifier_features]),decimals=0))
    test_y_binary = X_test['label_estimation']

    # Median gap (actual - predicted) over rows classified as under-estimates, and
    # median gap (predicted - actual) over rows classified as over-estimates.
    df_test_estimate_underestimate = X_test[y_prob_estimate > classifier_upper_threshold]
    median_underestimation = (df_test_estimate_underestimate['actual_minutes_until_pushback'] - df_test_estimate_underestimate['pred_minutes_until_pushback']).median()
    df_test_estimate_overestimate = X_test[y_prob_estimate < classifier_lower_threshold]
    median_overestimation = (df_test_estimate_overestimate['pred_minutes_until_pushback'] - df_test_estimate_overestimate['actual_minutes_until_pushback']).median()

    # Save the classifier and its correction parameters.
    model_file_name = f"{model_save_dir}{airport}_estimation_classifier.pkl"
    with open(model_file_name, "wb") as classifier_file:
        pickle.dump(estimate_classifier, classifier_file)
    airport_estimation_param_dict = {}
    airport_estimation_param_dict['median_underestimation'] = median_underestimation
    airport_estimation_param_dict['median_overestimation'] = median_overestimation
    params_file_name = f"{model_save_dir}{airport}_estimation_parameters.pkl"
    with open(params_file_name, "wb") as params_file:
        pickle.dump(airport_estimation_param_dict, params_file)



-----------------------------
Doing airport: KATL
Loaded training file from : Data/KATL/train_labels_KATL.csv
Loaded etd features from from : Training_Extracted_Features/Current Features/timepointgufi_KATL_etd.csv
Loaded airlinecode features from from : Training_Extracted_Features/Current Features/gufi_KATL_airlinecode.csv
Loaded taxitime to gate features from from : Training_Extracted_Features/Current Features/timepoint_KATL_taxitime_to_gate.csv
Internal Regressor Len of Training: 1252252
Internal Regressor Len of Prediction: 1881193
MAE: 9.841572874234595 & MSE: 330.58379336942033
Only 185511 out of 1881193 were above the threshold of 20 and were removed from training.
Trained on 1695682 many datapoints.
Number of Overestimated Samples: 1106941 & Number of Underestimated Samples: 774252
-----------------------------
Doing airport: KCLT
Loaded training file from : Data/KCLT/train_labels_KCLT.csv
Loaded etd features from from : Training_Extracted_Features/Current Features/timepointgufi