In [10]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re

from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from scipy.stats import f_oneway
from scipy.stats import chi2_contingency

In [2]:
pd.set_option('display.max_columns', None) 

In [3]:
df = pd.read_csv("2023_TEST.csv")
df.head()

Unnamed: 0,YEAR,QUARTER,MONTH,DAY_OF_MONTH,DAY_OF_WEEK,MKT_CARRIER_AIRLINE_ID,ORIGIN_AIRPORT_ID,ORIGIN_AIRPORT_SEQ_ID,ORIGIN_CITY_NAME,DEST_AIRPORT_ID,DEST_AIRPORT_SEQ_ID,DEST_CITY_NAME,CRS_DEP_TIME,DEP_TIME,DEP_DELAY_NEW,DEP_DEL15,DEP_DELAY_GROUP,ARR_TIME,ARR_DELAY_NEW,ARR_DEL15,ARR_DELAY_GROUP,CANCELLED,CANCELLATION_CODE,CRS_ELAPSED_TIME,ACTUAL_ELAPSED_TIME,DISTANCE,CARRIER_DELAY,WEATHER_DELAY,NAS_DELAY,SECURITY_DELAY,LATE_AIRCRAFT_DELAY
0,2023,4,10,1,7,19393,10140,1014005,"Albuquerque, NM",10423,1042302,"Austin, TX",600,621.0,21.0,1.0,1.0,856.0,16.0,1.0,1.0,0.0,,100.0,95.0,619.0,16.0,0.0,0.0,0.0,0.0
1,2023,4,10,1,7,19393,10140,1014005,"Albuquerque, NM",10423,1042302,"Austin, TX",1140,1149.0,9.0,0.0,0.0,1423.0,3.0,0.0,0.0,0.0,,100.0,94.0,619.0,,,,,
2,2023,4,10,1,7,19393,10140,1014005,"Albuquerque, NM",10423,1042302,"Austin, TX",1720,1813.0,53.0,1.0,3.0,2056.0,56.0,1.0,3.0,0.0,,100.0,103.0,619.0,1.0,0.0,3.0,0.0,52.0
3,2023,4,10,1,7,19393,10140,1014005,"Albuquerque, NM",10800,1080003,"Burbank, CA",1830,1916.0,46.0,1.0,3.0,2010.0,45.0,1.0,3.0,0.0,,115.0,114.0,672.0,1.0,0.0,0.0,0.0,44.0
4,2023,4,10,1,7,19393,10140,1014005,"Albuquerque, NM",10821,1082106,"Baltimore, MD",1510,1557.0,47.0,1.0,3.0,2135.0,40.0,1.0,2.0,0.0,,225.0,218.0,1670.0,16.0,0.0,0.0,0.0,24.0


In [5]:
df.shape

(1867296, 31)

In [4]:
df.isnull().sum()

YEAR                            0
QUARTER                         0
MONTH                           0
DAY_OF_MONTH                    0
DAY_OF_WEEK                     0
MKT_CARRIER_AIRLINE_ID          0
ORIGIN_AIRPORT_ID               0
ORIGIN_AIRPORT_SEQ_ID           0
ORIGIN_CITY_NAME                0
DEST_AIRPORT_ID                 0
DEST_AIRPORT_SEQ_ID             0
DEST_CITY_NAME                  0
CRS_DEP_TIME                    0
DEP_TIME                     7097
DEP_DELAY_NEW                7144
DEP_DEL15                    7144
DEP_DELAY_GROUP              7144
ARR_TIME                     8003
ARR_DELAY_NEW               10828
ARR_DEL15                   10828
ARR_DELAY_GROUP             10828
CANCELLED                       0
CANCELLATION_CODE         1859701
CRS_ELAPSED_TIME                0
ACTUAL_ELAPSED_TIME         10828
DISTANCE                        0
CARRIER_DELAY             1556068
WEATHER_DELAY             1556068
NAS_DELAY                 1556068
SECURITY_DELAY

In [8]:
cols_to_filter = [
    "DEP_TIME",
    "DEP_DELAY_NEW",
    "DEP_DEL15",
    "DEP_DELAY_GROUP",
    "ARR_TIME",
    "ARR_DELAY_NEW",
    "ARR_DEL15",
    "ARR_DELAY_GROUP",
    "ACTUAL_ELAPSED_TIME"
]

flight_data = df.dropna(subset=cols_to_filter)
df.shape, flight_data.shape

((1867296, 31), (1856468, 31))

In [9]:
# This is Chloe's code for feature engineering

# ---------------------------------------------------------
# Create FL_DATE (needed for daily counts)
# ---------------------------------------------------------
df = flight_data.rename(columns={'DAY_OF_MONTH': 'DAY'}) # for some reason was complaining about day_of_month
df['FL_DATE'] = pd.to_datetime(df[['YEAR', 'MONTH', 'DAY']])

# ---------------------------------------------------------
# 1) ORIGIN BUCKET (quartiles of flights per day per origin)
# ---------------------------------------------------------
origin_counts = (
    df.groupby(['FL_DATE', 'ORIGIN_AIRPORT_ID'])
      .size()
      .reset_index(name='origin_flights_day')
)

df = df.merge(origin_counts, on=['FL_DATE', 'ORIGIN_AIRPORT_ID'], how='left')

df['origin_bucket'] = pd.qcut(df['origin_flights_day'], q=4, labels=[1, 2, 3, 4]) #bottom quartile is 1, top quartile is 4

# ---------------------------------------------------------
# 2) DESTINATION BUCKET (quartiles of flights per day per destination)
# ---------------------------------------------------------
dest_counts = (
    df.groupby(['FL_DATE', 'DEST_AIRPORT_ID'])
      .size()
      .reset_index(name='dest_flights_day')
)

df = df.merge(dest_counts, on=['FL_DATE', 'DEST_AIRPORT_ID'], how='left')

df['destination_bucket'] = pd.qcut(df['dest_flights_day'], q=4, labels=[1, 2, 3, 4]) #bottom quartile is 1, top quartile is 4

# ---------------------------------------------------------
# 3) DISTANCE BUCKET (quartiles of distance)
# ---------------------------------------------------------
df['distance_bucket'] = pd.qcut(df['DISTANCE'], q=4, labels=[1, 2, 3, 4]) #bottom quartile is 1, top quartile is 4

# ---------------------------------------------------------
# 4) AIRLINE BUCKET
# bottom 6 = bucket 0
# top 4    = bucket 1
# ---------------------------------------------------------
airline_counts = df['MKT_CARRIER_AIRLINE_ID'].value_counts()

bottom_6 = airline_counts.sort_values().index[:6]
top_4 = airline_counts.sort_values(ascending=False).index[:4]

df['airline_bucket'] = None
df.loc[df['MKT_CARRIER_AIRLINE_ID'].isin(bottom_6), 'airline_bucket'] = 0
df.loc[df['MKT_CARRIER_AIRLINE_ID'].isin(top_4), 'airline_bucket'] = 1

# ---------------------------------------------------------
# Final preview
# ---------------------------------------------------------
df[['origin_bucket', 'destination_bucket', 'distance_bucket', 'airline_bucket']].head()

Unnamed: 0,origin_bucket,destination_bucket,distance_bucket,airline_bucket
0,1,2,2,1
1,1,2,2,1
2,1,2,2,1
3,1,1,3,1
4,1,2,4,1


In [13]:
df_completed = df[df['CANCELLED'] == 0] # Use non cancelled flights + base off of departure delay not arrival delay
df_completed.shape

(1856468, 38)

In [14]:
cluster_features = [
    "airline_bucket", 
    "origin_bucket", 
    "destination_bucket",
    "lagged_delay_flag", 
    "prev_real_delay"
]

In [15]:
""" One Hot Encode categorical features and standarize continuous"""

# cat_features = ["MKT_CARRIER_AIRLINE_ID", "ORIGIN_AIRPORT_ID", "DEST_AIRPORT_ID", "region"]
num_features = ['airline_bucket', 'origin_bucket', 'destination_bucket', 'lagged_delay_flag', 'prev_real_delay']

preprocess = ColumnTransformer(
    transformers=[
        # ("cat", OneHotEncoder(handle_unknown="ignore"), cat_features),
        ("num", StandardScaler(), num_features)
    ]
)

In [25]:
import pandas as pd

df = df_completed.copy()

possible_aircraft_cols = ["TAIL_NUM", "AIRCRAFT_ID", "ACFT_ID", "N_NUMBER"]
aircraft_candidates = [c for c in possible_aircraft_cols if c in df.columns]

if aircraft_candidates:
    aircraft_col = aircraft_candidates[0]
else:
    aircraft_col = "MKT_CARRIER_AIRLINE_ID"

dep_time_str = df["CRS_DEP_TIME"].astype(int).astype(str).str.zfill(4)
dep_hour = dep_time_str.str[:2].astype(int)
dep_min = dep_time_str.str[2:].astype(int)

# dep_date = pd.to_datetime(
#     {
#         "year": df["YEAR"],
#         "month": df["MONTH"],
#         "day": df["DAY_OF_MONTH"],
#     }
# )

# df["CRS_DEP_DT"] = dep_date + pd.to_timedelta(dep_hour, unit="h") + pd.to_timedelta(dep_min, unit="m")

df = df.sort_values([aircraft_col, "FL_DATE"])

g = df.groupby(aircraft_col, group_keys=False)

df["prev_dest_airport"] = g["DEST_AIRPORT_ID"].shift(1)
df["prev_dep_dt"] = g["FL_DATE"].shift(1)

df["prev_real_delay"] = (
    g["CARRIER_DELAY"].shift(1).fillna(0) +
    g["WEATHER_DELAY"].shift(1).fillna(0) +
    g["NAS_DELAY"].shift(1).fillna(0) +
    g["SECURITY_DELAY"].shift(1).fillna(0) +
    g["LATE_AIRCRAFT_DELAY"].shift(1).fillna(0)
)

df["hours_since_prev"] = (df["FL_DATE"] - df["prev_dep_dt"]).dt.total_seconds() / 3600

cond_same_route = df["prev_dest_airport"].eq(df["ORIGIN_AIRPORT_ID"])
cond_within_1h = df["hours_since_prev"].between(0, 1)
cond_prev_delayed = df["prev_real_delay"] > 0

df["lagged_delay_flag"] = (cond_same_route & cond_within_1h & cond_prev_delayed).astype(int)
df["lag_delay_minutes"] = df["prev_real_delay"].where(df["lagged_delay_flag"] == 1, 0)

df = df.sort_index()

flight_data["FL_DATE"] = df["FL_DATE"]
flight_data["lagged_delay_flag"] = df["lagged_delay_flag"]
flight_data["lag_delay_minutes"] = df["lag_delay_minutes"]

sample_lagged = flight_data[flight_data["lagged_delay_flag"] == 1].head(20)

if not sample_lagged.empty:
    idx = sample_lagged.index[0]
    carrier = flight_data.loc[idx, "MKT_CARRIER_AIRLINE_ID"]
    y = flight_data.loc[idx, "YEAR"]
    m = flight_data.loc[idx, "MONTH"]
    d = flight_data.loc[idx, "DAY_OF_MONTH"]

    same_group = flight_data[
        (flight_data["MKT_CARRIER_AIRLINE_ID"] == carrier) &
        (flight_data["YEAR"] == y) &
        (flight_data["MONTH"] == m) &
        (flight_data["DAY_OF_MONTH"] == d)
    ].sort_values("FL_DATE")

    same_group[[
        "ORIGIN_AIRPORT_ID",
        "DEST_AIRPORT_ID",
        "FL_DATE",
        "ARR_DELAY_NEW",
        "CARRIER_DELAY",
        "WEATHER_DELAY",
        "NAS_DELAY",
        "SECURITY_DELAY",
        "LATE_AIRCRAFT_DELAY",
        "lagged_delay_flag",
        "lag_delay_minutes"
    ]]
df["prev_real_delay"] = df["prev_real_delay"] 
df["lagged_delay_flag"] = df["lagged_delay_flag"]

In [26]:
df.head()

Unnamed: 0,YEAR,QUARTER,MONTH,DAY,DAY_OF_WEEK,MKT_CARRIER_AIRLINE_ID,ORIGIN_AIRPORT_ID,ORIGIN_AIRPORT_SEQ_ID,ORIGIN_CITY_NAME,DEST_AIRPORT_ID,DEST_AIRPORT_SEQ_ID,DEST_CITY_NAME,CRS_DEP_TIME,DEP_TIME,DEP_DELAY_NEW,DEP_DEL15,DEP_DELAY_GROUP,ARR_TIME,ARR_DELAY_NEW,ARR_DEL15,ARR_DELAY_GROUP,CANCELLED,CANCELLATION_CODE,CRS_ELAPSED_TIME,ACTUAL_ELAPSED_TIME,DISTANCE,CARRIER_DELAY,WEATHER_DELAY,NAS_DELAY,SECURITY_DELAY,LATE_AIRCRAFT_DELAY,FL_DATE,origin_flights_day,origin_bucket,dest_flights_day,destination_bucket,distance_bucket,airline_bucket,prev_dest_airport,prev_dep_dt,prev_real_delay,hours_since_prev,lagged_delay_flag,lag_delay_minutes
0,2023,4,10,1,7,19393,10140,1014005,"Albuquerque, NM",10423,1042302,"Austin, TX",600,621.0,21.0,1.0,1.0,856.0,16.0,1.0,1.0,0.0,,100.0,95.0,619.0,16.0,0.0,0.0,0.0,0.0,2023-10-01,78,1,273,2,2,1,,NaT,0.0,,0,0.0
1,2023,4,10,1,7,19393,10140,1014005,"Albuquerque, NM",10423,1042302,"Austin, TX",1140,1149.0,9.0,0.0,0.0,1423.0,3.0,0.0,0.0,0.0,,100.0,94.0,619.0,,,,,,2023-10-01,78,1,273,2,2,1,10423.0,2023-10-01,16.0,0.0,0,0.0
2,2023,4,10,1,7,19393,10140,1014005,"Albuquerque, NM",10423,1042302,"Austin, TX",1720,1813.0,53.0,1.0,3.0,2056.0,56.0,1.0,3.0,0.0,,100.0,103.0,619.0,1.0,0.0,3.0,0.0,52.0,2023-10-01,78,1,273,2,2,1,10423.0,2023-10-01,0.0,0.0,0,0.0
3,2023,4,10,1,7,19393,10140,1014005,"Albuquerque, NM",10800,1080003,"Burbank, CA",1830,1916.0,46.0,1.0,3.0,2010.0,45.0,1.0,3.0,0.0,,115.0,114.0,672.0,1.0,0.0,0.0,0.0,44.0,2023-10-01,78,1,85,1,3,1,10423.0,2023-10-01,56.0,0.0,0,0.0
4,2023,4,10,1,7,19393,10140,1014005,"Albuquerque, NM",10821,1082106,"Baltimore, MD",1510,1557.0,47.0,1.0,3.0,2135.0,40.0,1.0,2.0,0.0,,225.0,218.0,1670.0,16.0,0.0,0.0,0.0,24.0,2023-10-01,78,1,286,2,4,1,10800.0,2023-10-01,45.0,0.0,0,0.0


In [27]:
cluster_features = [
    "airline_bucket", 
    "origin_bucket", 
    "destination_bucket",
    "lagged_delay_flag", 
    "prev_real_delay"
]

In [28]:
""" One Hot Encode categorical features and standarize continuous"""

# cat_features = ["MKT_CARRIER_AIRLINE_ID", "ORIGIN_AIRPORT_ID", "DEST_AIRPORT_ID", "region"]
num_features = ['airline_bucket', 'origin_bucket', 'destination_bucket', 'lagged_delay_flag', 'prev_real_delay']

preprocess = ColumnTransformer(
    transformers=[
        # ("cat", OneHotEncoder(handle_unknown="ignore"), cat_features),
        ("num", StandardScaler(), num_features)
    ]
)

In [30]:
k = 3  # Start with this but should do elbow method??
kmeans = Pipeline([
    ("prep", preprocess),
    ("cluster", KMeans(n_clusters=k, random_state=42)) # Should I set the n_init value??
])

kmeans.fit(df[cluster_features])
df["cluster"] = kmeans.predict(df[cluster_features])

In [31]:
df.columns

Index(['YEAR', 'QUARTER', 'MONTH', 'DAY', 'DAY_OF_WEEK',
       'MKT_CARRIER_AIRLINE_ID', 'ORIGIN_AIRPORT_ID', 'ORIGIN_AIRPORT_SEQ_ID',
       'ORIGIN_CITY_NAME', 'DEST_AIRPORT_ID', 'DEST_AIRPORT_SEQ_ID',
       'DEST_CITY_NAME', 'CRS_DEP_TIME', 'DEP_TIME', 'DEP_DELAY_NEW',
       'DEP_DEL15', 'DEP_DELAY_GROUP', 'ARR_TIME', 'ARR_DELAY_NEW',
       'ARR_DEL15', 'ARR_DELAY_GROUP', 'CANCELLED', 'CANCELLATION_CODE',
       'CRS_ELAPSED_TIME', 'ACTUAL_ELAPSED_TIME', 'DISTANCE', 'CARRIER_DELAY',
       'WEATHER_DELAY', 'NAS_DELAY', 'SECURITY_DELAY', 'LATE_AIRCRAFT_DELAY',
       'FL_DATE', 'origin_flights_day', 'origin_bucket', 'dest_flights_day',
       'destination_bucket', 'distance_bucket', 'airline_bucket',
       'prev_dest_airport', 'prev_dep_dt', 'prev_real_delay',
       'hours_since_prev', 'lagged_delay_flag', 'lag_delay_minutes',
       'cluster'],
      dtype='object')

In [33]:
df.to_csv("2023_TEST.csv", index=False)