In [2]:
import pandas as pd
import numpy as np
import time
from tqdm import tqdm
import warnings
import os
warnings.filterwarnings("ignore", category=DeprecationWarning) 
warnings.simplefilter(action='ignore', category=FutureWarning)
pd.set_option('display.max_rows', 100)
pd.options.mode.chained_assignment = None
from helper import extract_etdv3, data_loader_etd, data_loader_submission_etd

In [3]:
bool_submission_prep = 1

# Set the save directory based on whether we are preparing for submission or not
if bool_submission_prep:
    sav_dir = f"Inference_Extracted_Features/etd_{time.time()}/"    
else:
    sav_dir = f"Training_Extracted_Features/etd_{time.time()}/"

# Define the directory path for loading data
load_dir = "Data/"

# Create the save directory
os.mkdir(f"{sav_dir}")

# Print the created directory path
print(f'Created the following dir: {sav_dir}')

Created the following dir: Inference_Extracted_Features/etd_1682865453.7603784/


In [4]:
# List of airports to process
list_airports = ["KATL", "KCLT", "KDEN", "KDFW", "KJFK", "KMEM", "KMIA", "KORD", "KPHX", "KSEA"]

# Process each airport in the list
for airport in list_airports:
    print(f'-----------------------------')
    print(f'Doing airport: {airport}')

    # Load data for the current airport
    print(f'Loading in dataframes for: {airport}')
    if bool_submission_prep:
        df_train_labels, df_etd = data_loader_submission_etd(load_dir, airport)
    else:
        df_train_labels, df_etd = data_loader_etd(load_dir, airport)

    # Feature Engineering for unique timestamp
    print(f'Feature Engineering for: {airport} doing unique timestamp')
    
    # Prepare dataframes for processing
    df_final_labels_etd = df_train_labels.copy(deep=True)
    df_final_labels_etd = df_final_labels_etd.sort_values(by=['gufi', 'timestamp']).reset_index(drop=True)
    df_etd_limited = df_etd[df_etd.gufi.isin(df_final_labels_etd.gufi.unique())].sort_values(by=['gufi', 'timestamp']).reset_index(drop=True)
    df_final_labels_etd = df_final_labels_etd[df_final_labels_etd.gufi.isin(df_etd.gufi.unique())].sort_values(by=['gufi', 'timestamp']).reset_index(drop=True)

    # Initialize necessary variables and dataframes
    unique_planes = list(df_final_labels_etd.gufi.unique())
    df_etd_limited_no_dup = df_etd_limited.drop_duplicates(subset=['gufi'])
    new_planes_indexes_etd = list(df_etd_limited_no_dup.index)
    dropped_fin_labels = len(df_train_labels) - len(df_final_labels_etd)
    print(f'There were {dropped_fin_labels} gufis from train labels because they werent in etd')

    # Create a dataframe to store the extracted features
    df_final_labels_etd_exp = df_final_labels_etd.copy(deep=True)
    df_final_labels_etd_exp = df_final_labels_etd_exp.drop_duplicates(subset=['gufi'])
    new_planes_indexes_labels = list(df_final_labels_etd_exp.index)

    # Process each unique plane and extract the estimated time of departure (ETD) features
    curr_plane_counter = 1
    list_dicts_etd = []
    for index_counter, curr_train_labels_indx in tqdm(enumerate(new_planes_indexes_labels), total=len(new_planes_indexes_labels)):
        if len(new_planes_indexes_labels) - 1 == index_counter:
            break
        df_etd_limited_plane = df_etd_limited.iloc[new_planes_indexes_etd[curr_plane_counter - 1]:new_planes_indexes_etd[curr_plane_counter]]
        curr_gufi = df_final_labels_etd.gufi.iloc[curr_train_labels_indx]
        list_timepoints = list(df_final_labels_etd.timestamp.iloc[curr_train_labels_indx:new_planes_indexes_labels[index_counter + 1]])
        dep_times_dicts = extract_etdv3(curr_gufi, list_timepoints, df_etd_limited_plane)
        list_dicts_etd.extend(dep_times_dicts)
        curr_plane_counter += 1

    # Save the extracted ETD features to a CSV file
    time_etd_df = pd.DataFrame(list_dicts_etd)
    etd_sav_path = f"{sav_dir}timepointgufi_{airport}_etd.csv"
    print(f'Saving the following file: {etd_sav_path}')
    time_etd_df.to_csv(etd_sav_path, index = False)

-----------------------------
Doing airport: KATL
Loading in dataframes for: KATL
LOading from: Data/submission_data.csv
Feature Engineering for: KATL doing unique timestamp
There were 0 gufis from train labels because they werent in etd


100%|█████████▉| 50164/50165 [04:40<00:00, 178.52it/s]


Saving the following file: Inference_Extracted_Features/etd_1682865453.7603784/timepointgufi_KATL_etd.csv
-----------------------------
Doing airport: KCLT
Loading in dataframes for: KCLT
LOading from: Data/submission_data.csv
Feature Engineering for: KCLT doing unique timestamp
There were 0 gufis from train labels because they werent in etd


100%|█████████▉| 35707/35708 [03:11<00:00, 186.83it/s]


Saving the following file: Inference_Extracted_Features/etd_1682865453.7603784/timepointgufi_KCLT_etd.csv
-----------------------------
Doing airport: KDEN
Loading in dataframes for: KDEN
LOading from: Data/submission_data.csv
Feature Engineering for: KDEN doing unique timestamp
There were 0 gufis from train labels because they werent in etd


100%|█████████▉| 42521/42522 [04:23<00:00, 161.65it/s]


Saving the following file: Inference_Extracted_Features/etd_1682865453.7603784/timepointgufi_KDEN_etd.csv
-----------------------------
Doing airport: KDFW
Loading in dataframes for: KDFW
LOading from: Data/submission_data.csv
Feature Engineering for: KDFW doing unique timestamp
There were 0 gufis from train labels because they werent in etd


100%|█████████▉| 46278/46279 [04:40<00:00, 165.25it/s]


Saving the following file: Inference_Extracted_Features/etd_1682865453.7603784/timepointgufi_KDFW_etd.csv
-----------------------------
Doing airport: KJFK
Loading in dataframes for: KJFK
LOading from: Data/submission_data.csv
Feature Engineering for: KJFK doing unique timestamp
There were 0 gufis from train labels because they werent in etd


100%|█████████▉| 16608/16609 [01:33<00:00, 176.91it/s]


Saving the following file: Inference_Extracted_Features/etd_1682865453.7603784/timepointgufi_KJFK_etd.csv
-----------------------------
Doing airport: KMEM
Loading in dataframes for: KMEM
LOading from: Data/submission_data.csv
Feature Engineering for: KMEM doing unique timestamp
There were 0 gufis from train labels because they werent in etd


100%|█████████▉| 13859/13860 [01:50<00:00, 125.12it/s]


Saving the following file: Inference_Extracted_Features/etd_1682865453.7603784/timepointgufi_KMEM_etd.csv
-----------------------------
Doing airport: KMIA
Loading in dataframes for: KMIA
LOading from: Data/submission_data.csv
Feature Engineering for: KMIA doing unique timestamp
There were 0 gufis from train labels because they werent in etd


100%|█████████▉| 20922/20923 [01:59<00:00, 174.76it/s]


Saving the following file: Inference_Extracted_Features/etd_1682865453.7603784/timepointgufi_KMIA_etd.csv
-----------------------------
Doing airport: KORD
Loading in dataframes for: KORD
LOading from: Data/submission_data.csv
Feature Engineering for: KORD doing unique timestamp
There were 0 gufis from train labels because they werent in etd


100%|█████████▉| 46707/46708 [04:40<00:00, 166.80it/s]


Saving the following file: Inference_Extracted_Features/etd_1682865453.7603784/timepointgufi_KORD_etd.csv
-----------------------------
Doing airport: KPHX
Loading in dataframes for: KPHX
LOading from: Data/submission_data.csv
Feature Engineering for: KPHX doing unique timestamp
There were 0 gufis from train labels because they werent in etd


100%|█████████▉| 26023/26024 [02:31<00:00, 171.46it/s]


Saving the following file: Inference_Extracted_Features/etd_1682865453.7603784/timepointgufi_KPHX_etd.csv
-----------------------------
Doing airport: KSEA
Loading in dataframes for: KSEA
LOading from: Data/submission_data.csv
Feature Engineering for: KSEA doing unique timestamp
There were 0 gufis from train labels because they werent in etd


100%|█████████▉| 26139/26140 [02:29<00:00, 175.07it/s]


Saving the following file: Inference_Extracted_Features/etd_1682865453.7603784/timepointgufi_KSEA_etd.csv
