In [2]:
# Third-party and standard-library imports used by the cells below
import pandas as pd
import numpy as np
import time
import warnings
import os
# Keep DeprecationWarning / FutureWarning messages out of the cell output.
# These filters are installed before the project-local import further down.
warnings.filterwarnings("ignore", category=DeprecationWarning) 
warnings.simplefilter(action='ignore', category=FutureWarning)
# Turn off pandas' chained-assignment (SettingWithCopy) warning for the whole session.
# NOTE(review): this also hides genuine writes to temporary copies - confirm it is still needed.
pd.options.mode.chained_assignment = None
# Project-local `helper` module: data loaders and the gufi splitter
from helper import data_loader_train_labels, split_gufi, data_loader_submission_train_labels

In [3]:
# Run mode flag: truthy -> extract features for the submission/inference data,
# falsy -> extract features for the training data.
bool_submission_prep = 1

# Pick the output folder from the run mode. The time.time() suffix gives every
# run its own folder, so an earlier extraction is never overwritten.
if bool_submission_prep:
    sav_dir = f"Inference_Extracted_Features/airplane_code_{time.time()}/"
else:
    sav_dir = f"Training_Extracted_Features/airplane_code_{time.time()}/"

# Set the directory from which data will be loaded
load_dir = "Data/"

# Create the save directory. os.makedirs (unlike os.mkdir) also creates the
# parent "*_Extracted_Features" folder when it is missing, so this cell no
# longer raises FileNotFoundError on a fresh checkout.
os.makedirs(sav_dir, exist_ok=True)

# Print the created directory's name for confirmation
print(f'Created the following dir: {sav_dir}')

Created the following dir: Inference_Extracted_Features/airplane_code_1683005181.9964302/


In [3]:
# List of airports to process
list_airports = ["KATL", "KCLT", "KDEN", "KDFW", "KJFK", "KMEM", "KMIA", "KORD", "KPHX", "KSEA"]

# Number of most frequent airline codes that get their own one-hot column;
# every other code is pooled into the 'Other' column.
TOP_N_AIRLINES = 25

# For each airport: load the labels, derive the airline code per flight (gufi),
# one-hot encode the most frequent codes and write one CSV per airport.
for airport in list_airports:
    print(f'-----------------------------')
    print(f'Doing airport: {airport}')

    # Load data for the current airport (loader depends on the run mode)
    print(f'Loading in dataframes for: {airport}')
    if bool_submission_prep:
        df_train_labels = data_loader_submission_train_labels(load_dir, airport)
    else:
        df_train_labels = data_loader_train_labels(load_dir, airport)

    # Keep one row per flight. drop_duplicates returns a new frame, so the
    # loaded frame is left untouched without needing a separate deep copy.
    print(f'Splitting airline code for: {airport}')
    df_train_labels_copy = df_train_labels.drop_duplicates(subset="gufi")

    # NOTE(review): the lines below read 'airline_code' from the frame passed
    # in, i.e. they rely on split_gufi adding that column to its argument;
    # the return value is only kept under its original name. Confirm against
    # helper.split_gufi.
    df_train_labels_copy_no_dup = split_gufi(df_train_labels_copy)
    airline_counts = df_train_labels_copy['airline_code'].value_counts()

    # Keep only the most frequent airline codes, and label the rest as 'Other'.
    # NOTE(review): the kept codes are derived from whichever dataset is loaded,
    # so training and inference CSVs for the same airport can end up with
    # different columns - confirm that downstream code aligns them.
    airlines_to_keep = airline_counts.index[:TOP_N_AIRLINES]
    df_train_labels_copy['airline_code'] = np.where(
        df_train_labels_copy['airline_code'].isin(airlines_to_keep),
        df_train_labels_copy['airline_code'],
        'Other',
    )

    # One-hot encode the airline codes. dtype is pinned so the CSV holds 0/1
    # regardless of the pandas version's default dummy dtype.
    one_hot_encoded = pd.get_dummies(df_train_labels_copy['airline_code'], dtype=np.uint8)
    # get_dummies only emits columns for values that actually occur. With at
    # most TOP_N_AIRLINES distinct codes nothing is relabelled, so 'Other'
    # would be missing and the column selection below would raise KeyError.
    # Reindexing guarantees every expected column exists (all zeros if unused).
    one_hot_encoded = one_hot_encoded.reindex(columns=['Other', *airlines_to_keep], fill_value=0)
    df_train_labels_copy = pd.concat([df_train_labels_copy, one_hot_encoded], axis=1)

    # Save the processed data to a CSV file: identifiers first, then 'Other',
    # then the kept airline codes in descending frequency order.
    cols_to_save = ['gufi', 'airport', 'Other']
    cols_to_save.extend(airlines_to_keep)
    df_train_labels_copy[cols_to_save].to_csv(f"{sav_dir}gufi_{airport}_airlinecode.csv", index = False)

-----------------------------
Doing airport: KATL
Loading in dataframes for: KATL
LOading from: Data/submission_data.csv
Splitting airline code for: KATL
-----------------------------
Doing airport: KCLT
Loading in dataframes for: KCLT
LOading from: Data/submission_data.csv
Splitting airline code for: KCLT
-----------------------------
Doing airport: KDEN
Loading in dataframes for: KDEN
LOading from: Data/submission_data.csv
Splitting airline code for: KDEN
-----------------------------
Doing airport: KDFW
Loading in dataframes for: KDFW
LOading from: Data/submission_data.csv
Splitting airline code for: KDFW
-----------------------------
Doing airport: KJFK
Loading in dataframes for: KJFK
LOading from: Data/submission_data.csv
Splitting airline code for: KJFK
-----------------------------
Doing airport: KMEM
Loading in dataframes for: KMEM
LOading from: Data/submission_data.csv
Splitting airline code for: KMEM
-----------------------------
Doing airport: KMIA
Loading in dataframes for: