In [1]:
import pandas as pd
import numpy as np
import time
from tqdm import tqdm
import warnings
import os
warnings.filterwarnings("ignore", category=DeprecationWarning) 
warnings.simplefilter(action='ignore', category=FutureWarning)
pd.options.mode.chained_assignment = None
from helper import data_loader, data_loader_train_labels, split_gufi, extract_taxi_to_gate_time,  data_loader_submission_train_labels

In [2]:
# Mode switch: truthy -> extract features for the submission (inference) labels,
# falsy -> extract features for the training labels.
bool_submission_prep = 1

# Define the directory path for loading data
load_dir = "Data/"

# Set the save directory based on whether we are preparing for submission or not.
# The time.time() suffix gives every run its own output folder, so earlier
# extractions are never overwritten.
if bool_submission_prep:
    sav_dir = f"Inference_Extracted_Features/taxitime_to_gate_{time.time()}/"
else:
    sav_dir = f"Training_Extracted_Features/taxitime_to_gate_{time.time()}/"

# Create the save directory. makedirs (instead of mkdir) also creates the parent
# "*_Extracted_Features/" folder when it is missing, e.g. on a fresh checkout.
os.makedirs(sav_dir, exist_ok=True)

# Print the created directory path
print(f'Created the following dir: {sav_dir}')

Created the following dir: Inference_Extracted_Features/features_1682574327.6379874/


In [3]:
# Define the list of airport codes to process
list_airports = ["KATL", "KCLT", "KDEN", "KDFW", "KJFK", "KMEM", "KMIA", "KORD", "KPHX", "KSEA"]

# Output columns that hold the values returned by extract_taxi_to_gate_time.
# NOTE(review): assumes the helper returns exactly these three values in this order
# (count, mean, std) -- confirm against helper.extract_taxi_to_gate_time.
stat_columns = ['found_counts_taxitime_to_gate', 'taxitime_to_gate_mean', 'taxitime_to_gate_std']

# Process each airport in the list
for airport in list_airports:
    # Last three characters of the code; compared against `arriving_airport_code` below
    airport_short = airport[-3:]
    print('-----------------------------')
    print(f'Doing airport: {airport}')

    # Load data for the current airport. Only the runway-arrival and standtimes frames are
    # used by this feature; the other frames are unpacked because data_loader returns them all.
    print(f'Loading in dataframes for: {airport}')
    df_config, df_etd, df_first_pos, df_lamp, df_mfs, df_runway_arrival, df_runway_departure, df_standtimes = data_loader(load_dir, airport)
    if bool_submission_prep:
        df_train_labels = data_loader_submission_train_labels(load_dir, airport)
    else:
        df_train_labels = data_loader_train_labels(load_dir, airport)

    # Runway arrivals: parse the actual arrival time, order by observation time and give the
    # timestamp column a source-specific name so it stays distinguishable after the merge.
    # assign/sort_values/rename all return new frames, so the loaded frame is left untouched.
    df_runway_arrival_copy = (
        df_runway_arrival
        .assign(arrival_runway_actual_time=pd.to_datetime(df_runway_arrival['arrival_runway_actual_time']))
        .sort_values(by='timestamp')
        .rename(columns={"timestamp": 'timestamp_runway_arrival'})
    )

    # Stand times: split_gufi adds the columns used below (at least `arriving_airport_code`)
    df_standtimes = split_gufi(df_standtimes)
    df_standtimes_copy = (
        df_standtimes
        .sort_values(by='timestamp')
        .rename(columns={"timestamp": 'timestamp_standtimes'})
    )

    # Filter standtimes to keep only the arriving planes at the current airport
    df_standtimes_copy = df_standtimes_copy[df_standtimes_copy.arriving_airport_code == airport_short]

    # Merge on gufi, taking from standtimes only the columns the runway frame does not already
    # have (plus the join key) so the merge creates no _x/_y duplicate columns.
    cols_to_use = ['gufi'] + list(df_standtimes_copy.columns.difference(df_runway_arrival_copy.columns))
    df_arrival_standtimes = df_standtimes_copy[cols_to_use].merge(df_runway_arrival_copy, on=['gufi'], how='left')

    # Taxi time to gate in minutes: stand (gate) arrival minus runway arrival
    df_arrival_standtimes['taxitime_to_gate'] = (
        df_arrival_standtimes.arrival_stand_actual_time - df_arrival_standtimes.arrival_runway_actual_time
    ).dt.total_seconds() / 60
    # Drops every row that has a missing value in ANY column, not just the two time columns
    df_arrival_standtimes = df_arrival_standtimes.dropna()

    # Iterate through unique timestamps (chronological order) and extract taxi time to gate data
    unique_list_timestamps = df_train_labels.sort_values(by="timestamp").timestamp.unique()
    timestamps = [pd.Timestamp(ts) for ts in unique_list_timestamps]
    results = [
        np.array(extract_taxi_to_gate_time(ts, df_arrival_standtimes))
        for ts in tqdm(timestamps)
    ]

    # Create a dataframe with taxi time to gate statistics. The reshape keeps the array
    # two-dimensional even when there are no timestamps, so an empty label set produces an
    # empty frame (header-only CSV) instead of raising.
    taxitime_to_gate_df = pd.DataFrame(
        np.array(results).reshape(-1, len(stat_columns)),
        columns=stat_columns,
    )
    taxitime_to_gate_df.insert(0, 'timestamp', timestamps)

    # Filter out rows with no taxi time to gate data
    taxitime_to_gate_df = taxitime_to_gate_df[taxitime_to_gate_df.found_counts_taxitime_to_gate != 0]

    # Save the extracted taxi time to gate data to a CSV file
    taxitime_sav_path = os.path.join(sav_dir, f"timepoint_{airport}_taxitime_to_gate.csv")
    taxitime_to_gate_df.to_csv(taxitime_sav_path, index=False)

-----------------------------
Doing airport: KATL
Loading in dataframes for: KATL
LOading from: Data/submission_data.csv


100%|██████████| 4817/4817 [00:07<00:00, 616.30it/s]


-----------------------------
Doing airport: KCLT
Loading in dataframes for: KCLT
LOading from: Data/submission_data.csv


100%|██████████| 4462/4462 [00:05<00:00, 788.41it/s]


-----------------------------
Doing airport: KDEN
Loading in dataframes for: KDEN
LOading from: Data/submission_data.csv


100%|██████████| 5013/5013 [00:07<00:00, 712.90it/s]


-----------------------------
Doing airport: KDFW
Loading in dataframes for: KDFW


  df_mfs = pd.read_csv(f"{directory}{airport}/{airport}/{airport}_mfs.csv")


LOading from: Data/submission_data.csv


100%|██████████| 4916/4916 [00:08<00:00, 601.58it/s]


-----------------------------
Doing airport: KJFK
Loading in dataframes for: KJFK
LOading from: Data/submission_data.csv


100%|██████████| 4638/4638 [00:03<00:00, 1174.74it/s]


-----------------------------
Doing airport: KMEM
Loading in dataframes for: KMEM
LOading from: Data/submission_data.csv


100%|██████████| 4906/4906 [00:04<00:00, 1138.17it/s]


-----------------------------
Doing airport: KMIA
Loading in dataframes for: KMIA
LOading from: Data/submission_data.csv


100%|██████████| 4765/4765 [00:04<00:00, 1063.60it/s]


-----------------------------
Doing airport: KORD
Loading in dataframes for: KORD
LOading from: Data/submission_data.csv


100%|██████████| 4670/4670 [00:06<00:00, 694.33it/s]


-----------------------------
Doing airport: KPHX
Loading in dataframes for: KPHX
LOading from: Data/submission_data.csv


100%|██████████| 4963/4963 [00:05<00:00, 909.18it/s]


-----------------------------
Doing airport: KSEA
Loading in dataframes for: KSEA


  df_mfs = pd.read_csv(f"{directory}{airport}/{airport}/{airport}_mfs.csv")


LOading from: Data/submission_data.csv


100%|██████████| 4880/4880 [00:05<00:00, 935.13it/s]
