In [11]:
import pandas as pd  # Data manipulation and analysis library
import numpy as np  # Numerical computing library
import xgboost as xgb  # Gradient boosting library
from sklearn.metrics import mean_absolute_error, mean_squared_error  # Evaluation metrics
from sklearn.model_selection import train_test_split, GroupShuffleSplit  # Data splitting functions
from sklearn.tree import DecisionTreeRegressor  # Decision Tree Regressor model
from sklearn.linear_model import LinearRegression  # Linear Regression model
from sklearn.ensemble import RandomForestRegressor  # Random Forest Regressor model
from sklearn.impute import SimpleImputer  # Imputation of missing values
from pathlib import Path  # Object-oriented filesystem paths
from typing import Any  # Type hinting
from loguru import logger  # Logging library
import pandas as pd  # Data manipulation and analysis library (imported twice)
import pickle  # Object serialization and deserialization
import csv  # Reading and writing CSV files
import os  # Operating system interfaces
import re  # Regular expression operations
import math  # Mathematical functions
import time  # Time access and conversions
from tensorflow import keras  # Deep learning library
from collections import deque  # Double-ended queue data structure
from sklearn.metrics import roc_auc_score  # Evaluation metric for binary classification
from xgboost import XGBClassifier  # XGBoost Classifier model

In [12]:
# Model and threshold parameters
model_dir_path = "Models/chosen"  # directory the pickled per-airport models are loaded from
# NOTE(review): the threshold values and the good/bad airport groupings below
# are not read by any other statement in this cell — confirm they are still needed.
threshold_ml_classifier_model = .50
mae_thresh_bad = 30
mae_thresh_good = 20
bad_airports = ["KDFW", "KJFK", "KMEM", "KMIA"]
good_airports = ["KATL", "KCLT", "KDEN", "KORD", "KPHX", "KSEA"]

# Debug and model type
debug = True  # enables the extra logger.info diagnostics guarded by `if debug:`
comment = "submission etd airlinecode taxitime"
model_type = "xgb classifier"

# Directory paths for loading data
raw_label_load_dir = "Data/"
indiv_features_load_dir = "Inference_Extracted_Features/"

# Feature categories
unique_timepoint_features = ["taxitime_to_gate"]
unique_gufi_features = ["airlinecode"]
unique_timepointgufi_features = ["etd"]

# Root path prefixes for feature categories (file names are appended to these)
timepoint_root = f"{indiv_features_load_dir}timepoint"
gufi_root = f"{indiv_features_load_dir}gufi"
timepointgufi_root = f"{indiv_features_load_dir}timepointgufi"

# Create inference save directory.
# Pick the first numbered sub-directory under the root that does not exist yet,
# so the predictions of an earlier run are never written into again.
inference_save_dir_root = "Inference_Predictions/"
run_id = 0
inference_save_dir = f"{inference_save_dir_root}{run_id}/"
while os.path.isdir(inference_save_dir):
    run_id += 1
    inference_save_dir = f"{inference_save_dir_root}{run_id}/"
# makedirs (rather than mkdir) also creates the root directory on a fresh checkout.
os.makedirs(inference_save_dir)
print(f"Model data will be saved at: {inference_save_dir}")

# Filename for overall submission
overall_prediction_file_name_submission = f"{inference_save_dir}overall_submission.csv"

# Path prefixes for loading submission feature files
indiv_features_load_dir_submission = "Inference_Extracted_Features/Current_Features/"
timepoint_root_submission = f"{indiv_features_load_dir_submission}timepoint"
gufi_root_submission = f"{indiv_features_load_dir_submission}gufi"
timepointgufi_root_submission = f"{indiv_features_load_dir_submission}timepointgufi"

# CSV header
header = ["gufi", "timestamp", "airport", "minutes_until_pushback"]

# Function to split GUFI and add airline_code, plane_id, departing_airport_code, and arriving_airport_code columns
def split_gufi(curr_df):
    """
    Derive identifier columns from the dot-separated ``gufi`` string.

    The first three dot-separated fields of ``gufi`` become ``plane_id``,
    ``departing_airport_code`` and ``arriving_airport_code``; the first three
    characters of ``gufi`` become ``airline_code``.

    Args:
        curr_df (pd.DataFrame): Frame with a string ``gufi`` column. It is
            modified in place.

    Returns:
        pd.DataFrame: The same ``curr_df`` object. If the split fails, the
        failure is logged and the frame is returned with whatever columns had
        been added before the error (best-effort, no exception is raised).
    """
    try:
        gufi_parts = curr_df.gufi.str.split('.', expand=True)
        curr_df[['plane_id', 'departing_airport_code', 'arriving_airport_code']] = gufi_parts[[0, 1, 2]]
        curr_df['airline_code'] = curr_df.gufi.str[:3]
    except Exception:
        # Catch Exception rather than everything, so Ctrl-C / kernel interrupt
        # still stops the run; logger.exception keeps the traceback in the log.
        logger.exception("ERRROR IN SPLIT GUFI")
        logger.info(f"{list(curr_df.gufi)}")
    return curr_df

Model data will be saved at: Inference_Predictions//0


In [13]:
def load_model(solution_directory):
    """
    Load any model assets from disk.

    For each of the ten hard-coded airports, three pickles are read from
    ``solution_directory``: the first-stage model (stored under 'regressor'),
    the estimation classifier ('classifier') and its parameters
    ('classifier_params').

    Args:
        solution_directory (str): Path to the directory containing the saved models.

    Returns:
        dict: Maps airport code -> dict with the keys 'regressor',
        'classifier' and 'classifier_params'.

    Raises:
        Any error raised by ``open`` or ``pickle.load`` (for example a missing
        file) propagates to the caller.
    """
    # List of airports
    airports = ['KATL', 'KCLT', 'KDEN', 'KDFW', 'KJFK', 'KMEM', 'KMIA', 'KORD', 'KPHX', 'KSEA']

    def _unpickle(path):
        # NOTE(security): unpickling executes code embedded in the file; only
        # point solution_directory at model files you produced yourself.
        # The context manager closes the handle even if unpickling fails.
        with open(path, 'rb') as handle:
            return pickle.load(handle)

    # Initialize an empty dictionary to store the loaded models
    model = {}

    for curr_airport in airports:
        regressor_path = f'{solution_directory}/{curr_airport}_xgb classifier_NOOUTLIER_submission etd airlinecode taxitime.pkl'
        logger.info(f"Trying to load model from: {regressor_path}")

        # Store the loaded models for the current airport; entries are
        # evaluated (and therefore loaded) in the order written.
        model[curr_airport] = {
            'regressor': _unpickle(regressor_path),
            'classifier': _unpickle(f'{solution_directory}/{curr_airport}_estimation_classifier.pkl'),
            'classifier_params': _unpickle(f'{solution_directory}/{curr_airport}_estimation_parameters.pkl'),
        }
        logger.info(f"Model loaded successfully")

    return model

# Load the models from the specified directory.
# `model` is a dict keyed by airport code; each value is a dict holding the
# 'regressor', 'classifier' and 'classifier_params' objects for that airport.
model = load_model(model_dir_path)

2023-05-02 09:26:28.123 | INFO     | __main__:load_model:23 - Trying to load model from: Models/chosen/KATL_xgb classifier_NOOUTLIER_submission etd airlinecode taxitime.pkl
2023-05-02 09:26:28.129 | INFO     | __main__:load_model:36 - Model loaded successfully
2023-05-02 09:26:28.130 | INFO     | __main__:load_model:23 - Trying to load model from: Models/chosen/KCLT_xgb classifier_NOOUTLIER_submission etd airlinecode taxitime.pkl
2023-05-02 09:26:28.136 | INFO     | __main__:load_model:36 - Model loaded successfully
2023-05-02 09:26:28.136 | INFO     | __main__:load_model:23 - Trying to load model from: Models/chosen/KDEN_xgb classifier_NOOUTLIER_submission etd airlinecode taxitime.pkl
2023-05-02 09:26:28.141 | INFO     | __main__:load_model:36 - Model loaded successfully
2023-05-02 09:26:28.142 | INFO     | __main__:load_model:23 - Trying to load model from: Models/chosen/KDFW_xgb classifier_NOOUTLIER_submission etd airlinecode taxitime.pkl
2023-05-02 09:26:28.145 | INFO     | __main_

In [16]:
def grab_airlinecodes(df_predict, airlinecode_features):
    """
    Grab airline codes from the given dataframe and perform one-hot encoding.

    The airline code of each unique ``gufi`` is taken from ``split_gufi``.
    Codes that are not listed in ``airlinecode_features`` are collapsed into
    the single category "Other". The one-hot columns are then left-merged back
    onto ``df_predict`` by ``gufi``.

    Args:
        df_predict (pd.DataFrame): DataFrame containing the prediction data;
            must have a ``gufi`` column.
        airlinecode_features (list): List of airline codes that get their own
            column. The list is not modified.

    Returns:
        pd.DataFrame: A new frame with the rows of ``df_predict`` plus the
        columns added by ``split_gufi`` and one column per code in
        ``airlinecode_features`` and for "Other". Codes that never occur in
        the data get an all-zero column.
    """
    # Private list: the caller's list must not be changed by this function.
    airlines_to_keep = [code for code in airlinecode_features if code != "Other"]

    # One row per gufi, so the merge below cannot multiply rows of df_predict.
    gufi_codes = df_predict[['gufi']].copy(deep=True).drop_duplicates(subset="gufi")
    gufi_codes = split_gufi(gufi_codes)
    gufi_codes['airline_code'] = np.where(
        gufi_codes['airline_code'].isin(airlines_to_keep),
        gufi_codes['airline_code'],
        'Other',
    )
    one_hot_encoded = pd.get_dummies(gufi_codes['airline_code'])
    gufi_codes = pd.concat([gufi_codes, one_hot_encoded], axis=1)

    df_predict = pd.merge(df_predict, gufi_codes, on=['gufi'], how="left")

    # get_dummies only creates columns for codes present in the data; add the
    # remaining expected columns as zeros.
    for curr_airline in [*airlines_to_keep, "Other"]:
        if curr_airline not in df_predict.columns:
            df_predict[curr_airline] = 0
    return df_predict


# Load the list of airports
list_airports = ["KATL", "KCLT", "KDEN", "KDFW", "KJFK", "KMEM", "KMIA", "KORD", "KPHX", "KSEA"]

# The raw submission rows are identical for every airport, so read the file and
# parse its timestamps once instead of once per loop iteration.
df_submission_all = pd.read_csv(f"{raw_label_load_dir}submission_data.csv")
df_submission_all['timestamp'] = pd.to_datetime(df_submission_all['timestamp'])

# Iterate through each airport to make predictions
for airport in list_airports:
    print(f'-----------------------------')
    print(f'Doing airport: {airport}')
    curr_model_regression_lower = model[airport]['regressor']
    estimate_classifier = model[airport]['classifier']
    estimate_classifier_params = model[airport]['classifier_params']

    # Feature names the first-stage model was trained on. Names that are exactly
    # three characters long are treated as airline-code columns.
    all_trained_features = list(curr_model_regression_lower.get_booster().feature_names)
    airlinecode_features = [curr_feat for curr_feat in all_trained_features if len(curr_feat) == 3]

    ####SUBMISSION
    # Keep only the rows of the current airport
    df_predict = df_submission_all[df_submission_all.airport == airport]

    ###ETD
    etd_file_path = f"{timepointgufi_root_submission}_{airport}_etd.csv"
    df_etd = pd.read_csv(etd_file_path, parse_dates=["timestamp"])
    print(f"Loaded etd features from from : {etd_file_path}")
    df_predict = pd.merge(df_predict, df_etd, on=['gufi', 'timestamp'], how="left")

    # Merge airline code features
    df_predict = grab_airlinecodes(df_predict, airlinecode_features)

    ###taxitimetogate
    taxitime_to_gate_file_path = f"{timepoint_root_submission}_{airport}_taxitime_to_gate.csv"
    df_taxitime_to_gate = pd.read_csv(taxitime_to_gate_file_path, parse_dates=["timestamp"])
    print(f"Loaded taxitime to gate features from from : {taxitime_to_gate_file_path}")
    df_predict = pd.merge(df_predict, df_taxitime_to_gate, on=['timestamp'], how="left")

    # Drop a leftover index column if one of the feature files carried it in
    if "Unnamed: 0" in list(df_predict):
        df_predict = df_predict.drop(columns=['Unnamed: 0'])

    # Timestamp as whole seconds since the epoch
    # (assumes nanosecond-resolution datetimes — TODO confirm on pandas >= 2)
    df_predict['unix_time'] = df_predict['timestamp'].astype(np.int64) // 10**9

    missing_feats = set(all_trained_features) - set(list(df_predict.columns))
    if debug:
        logger.info("Trying now to make the df to feed into predict")
    # Always report when something is missing; in debug mode report even an empty set.
    if debug or len(missing_feats) != 0:
        logger.info(f'Missing these columns: {missing_feats}')

    # First stage: predict from the trained feature columns
    y_pred_lower = curr_model_regression_lower.predict(df_predict[all_trained_features])

    # Second stage: the estimation classifier sees the same features plus the
    # first-stage prediction, and its class-1 probability decides which
    # correction is applied.
    median_underestimation = estimate_classifier_params['median_underestimation']
    median_overestimation = estimate_classifier_params['median_overestimation']
    X_test_lower = df_predict.copy(deep=True)
    X_test_lower['pred_minutes_until_pushback'] = y_pred_lower
    y_prob_estimate = estimate_classifier.predict_proba(
        X_test_lower[all_trained_features + ['pred_minutes_until_pushback']]
    )[:, 1]
    X_test_lower = X_test_lower.reset_index(drop=True)

    # probability > 0.5 -> add median_underestimation
    # probability < 0.5 -> subtract median_overestimation
    # probability == 0.5 -> keep the first-stage prediction unchanged
    # NOTE(review): threshold_ml_classifier_model in the config cell holds the
    # same value (0.5); confirm whether it was meant to drive this cut-off.
    base_pred = X_test_lower['pred_minutes_until_pushback']
    X_test_lower['final_pred_minutes_until_pushback'] = np.where(
        y_prob_estimate > 0.5,
        base_pred + median_underestimation,
        np.where(y_prob_estimate < 0.5, base_pred - median_overestimation, base_pred),
    )
    y_pred = X_test_lower['final_pred_minutes_until_pushback'].values

    # Round to whole minutes for the submission
    df_predict['minutes_until_pushback'] = np.around(y_pred, decimals=0).astype(np.int32)

    if debug:
        logger.info(f"This len df is returned from the predict method: {len(df_predict)}")

    df_predict = df_predict.reset_index(drop=True)

    df_submission_rows = df_predict[["gufi", "timestamp", "airport", "minutes_until_pushback"]]

    # One file per airport (with header row) ...
    prediction_file_name_submission = f"{inference_save_dir}{airport}_submission.csv"
    df_submission_rows.to_csv(prediction_file_name_submission, index=False)

    # ... and the same rows appended to the combined file.
    # NOTE(review): every append is written without a header row, so this cell
    # never gives the combined file one — confirm that is what its consumer expects.
    df_submission_rows.to_csv(overall_prediction_file_name_submission, index=False, mode='a', header=False)


-----------------------------
Doing airport: KATL
Loaded etd features from from : Inference_Extracted_Features/Current_Features/timepointgufi_KATL_etd.csv


2023-05-02 09:27:17.824 | INFO     | __main__:<module>:87 - Trying now to make the df to feed into predict
2023-05-02 09:27:17.825 | INFO     | __main__:<module>:88 - Missing these columns: set()


Loaded taxitime to gate features from from : Inference_Extracted_Features/Current_Features/timepoint_KATL_taxitime_to_gate.csv


2023-05-02 09:27:18.038 | INFO     | __main__:<module>:109 - This len df is returned from the predict method: 303836


-----------------------------
Doing airport: KCLT
Loaded etd features from from : Inference_Extracted_Features/Current_Features/timepointgufi_KCLT_etd.csv


2023-05-02 09:27:20.663 | INFO     | __main__:<module>:87 - Trying now to make the df to feed into predict
2023-05-02 09:27:20.663 | INFO     | __main__:<module>:88 - Missing these columns: set()
2023-05-02 09:27:20.818 | INFO     | __main__:<module>:109 - This len df is returned from the predict method: 198963


Loaded taxitime to gate features from from : Inference_Extracted_Features/Current_Features/timepoint_KCLT_taxitime_to_gate.csv
-----------------------------
Doing airport: KDEN
Loaded etd features from from : Inference_Extracted_Features/Current_Features/timepointgufi_KDEN_etd.csv


2023-05-02 09:27:23.056 | INFO     | __main__:<module>:87 - Trying now to make the df to feed into predict
2023-05-02 09:27:23.057 | INFO     | __main__:<module>:88 - Missing these columns: set()


Loaded taxitime to gate features from from : Inference_Extracted_Features/Current_Features/timepoint_KDEN_taxitime_to_gate.csv


2023-05-02 09:27:23.263 | INFO     | __main__:<module>:109 - This len df is returned from the predict method: 281311


-----------------------------
Doing airport: KDFW
Loaded etd features from from : Inference_Extracted_Features/Current_Features/timepointgufi_KDFW_etd.csv


2023-05-02 09:27:25.893 | INFO     | __main__:<module>:87 - Trying now to make the df to feed into predict
2023-05-02 09:27:25.893 | INFO     | __main__:<module>:88 - Missing these columns: set()


Loaded taxitime to gate features from from : Inference_Extracted_Features/Current_Features/timepoint_KDFW_taxitime_to_gate.csv


2023-05-02 09:27:26.142 | INFO     | __main__:<module>:109 - This len df is returned from the predict method: 297171


-----------------------------
Doing airport: KJFK


2023-05-02 09:27:28.628 | INFO     | __main__:<module>:87 - Trying now to make the df to feed into predict
2023-05-02 09:27:28.629 | INFO     | __main__:<module>:88 - Missing these columns: set()


Loaded etd features from from : Inference_Extracted_Features/Current_Features/timepointgufi_KJFK_etd.csv
Loaded taxitime to gate features from from : Inference_Extracted_Features/Current_Features/timepoint_KJFK_taxitime_to_gate.csv


2023-05-02 09:27:28.720 | INFO     | __main__:<module>:109 - This len df is returned from the predict method: 99604


-----------------------------
Doing airport: KMEM


KeyboardInterrupt: 