In [1]:
from pathlib import Path

import gdown
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.stats as stats
import seaborn as sns
from sklearn.impute import SimpleImputer as Imputer

In [2]:
# Google Drive id of the flight CSV (roughly 1.3 GB) and where it is cached locally.
file_id = "13elVDJJ6JUH2gsj36CxPjkLryE2rnl-r"
url = f"https://drive.google.com/uc?id={file_id}"
csv_path = Path("flight_data.csv")

# Download only when the local copy is missing, so a Restart & Run All does not
# pull the whole file again. Delete the file to force a fresh download.
if not csv_path.exists():
    gdown.download(url, str(csv_path), quiet=False)

flight_data = pd.read_csv(csv_path)

Downloading...
From (original): https://drive.google.com/uc?id=13elVDJJ6JUH2gsj36CxPjkLryE2rnl-r
From (redirected): https://drive.google.com/uc?id=13elVDJJ6JUH2gsj36CxPjkLryE2rnl-r&confirm=t&uuid=de179424-95f4-4f92-ac6f-18b25fa7bcc1
To: /content/flight_data.csv
100%|██████████| 1.32G/1.32G [00:05<00:00, 227MB/s]


In [3]:
# Lift pandas' column cap globally so wide frames render every column,
# then preview the first five rows of the loaded data.
pd.options.display.max_columns = None
flight_data.head(n=5)

Unnamed: 0,YEAR,QUARTER,MONTH,DAY_OF_MONTH,DAY_OF_WEEK,MKT_CARRIER_AIRLINE_ID,ORIGIN_AIRPORT_ID,ORIGIN_AIRPORT_SEQ_ID,ORIGIN_CITY_NAME,DEST_AIRPORT_ID,DEST_AIRPORT_SEQ_ID,DEST_CITY_NAME,CRS_DEP_TIME,DEP_TIME,DEP_DELAY_NEW,DEP_DEL15,DEP_DELAY_GROUP,ARR_TIME,ARR_DELAY_NEW,ARR_DEL15,ARR_DELAY_GROUP,CANCELLED,CANCELLATION_CODE,CRS_ELAPSED_TIME,ACTUAL_ELAPSED_TIME,DISTANCE,CARRIER_DELAY,WEATHER_DELAY,NAS_DELAY,SECURITY_DELAY,LATE_AIRCRAFT_DELAY,index
0,2024,1,1,1,1,19393,10140,1014005,"Albuquerque, NM",10423,1042302,"Austin, TX",700,707.0,7.0,0.0,0.0,950.0,15.0,1.0,1.0,0.0,,95.0,103.0,619.0,7.0,0.0,8.0,0.0,0.0,0
1,2024,1,1,1,1,19393,10140,1014005,"Albuquerque, NM",10423,1042302,"Austin, TX",1830,1826.0,0.0,0.0,-1.0,2112.0,2.0,0.0,0.0,0.0,,100.0,106.0,619.0,0.0,0.0,0.0,0.0,0.0,1
2,2024,1,1,1,1,19393,10140,1014005,"Albuquerque, NM",10800,1080003,"Burbank, CA",1420,1426.0,6.0,0.0,0.0,1516.0,0.0,0.0,-1.0,0.0,,130.0,110.0,672.0,0.0,0.0,0.0,0.0,0.0,2
3,2024,1,1,1,1,19393,10140,1014005,"Albuquerque, NM",10821,1082106,"Baltimore, MD",1500,1514.0,14.0,0.0,0.0,2050.0,15.0,1.0,1.0,0.0,,215.0,216.0,1670.0,14.0,0.0,1.0,0.0,0.0,3
4,2024,1,1,1,1,19393,10140,1014005,"Albuquerque, NM",11259,1125904,"Dallas, TX",530,527.0,0.0,0.0,-1.0,805.0,0.0,0.0,-1.0,0.0,,105.0,98.0,580.0,0.0,0.0,0.0,0.0,0.0,4


In [4]:
# Print the column labels of the loaded frame to check which fields are available.
print(flight_data.columns)

Index(['YEAR', 'QUARTER', 'MONTH', 'DAY_OF_MONTH', 'DAY_OF_WEEK',
       'MKT_CARRIER_AIRLINE_ID', 'ORIGIN_AIRPORT_ID', 'ORIGIN_AIRPORT_SEQ_ID',
       'ORIGIN_CITY_NAME', 'DEST_AIRPORT_ID', 'DEST_AIRPORT_SEQ_ID',
       'DEST_CITY_NAME', 'CRS_DEP_TIME', 'DEP_TIME', 'DEP_DELAY_NEW',
       'DEP_DEL15', 'DEP_DELAY_GROUP', 'ARR_TIME', 'ARR_DELAY_NEW',
       'ARR_DEL15', 'ARR_DELAY_GROUP', 'CANCELLED', 'CANCELLATION_CODE',
       'CRS_ELAPSED_TIME', 'ACTUAL_ELAPSED_TIME', 'DISTANCE', 'CARRIER_DELAY',
       'WEATHER_DELAY', 'NAS_DELAY', 'SECURITY_DELAY', 'LATE_AIRCRAFT_DELAY',
       'index'],
      dtype='object')


In [4]:
# Columns that would identify an individual aircraft, in order of preference,
# and the carrier column used as a coarse stand-in when none of them exists.
AIRCRAFT_ID_CANDIDATES = ("TAIL_NUM", "AIRCRAFT_ID", "ACFT_ID", "N_NUMBER")
CARRIER_FALLBACK_COL = "MKT_CARRIER_AIRLINE_ID"

# Largest gap (hours) between two scheduled departures for the earlier flight
# to still count as the "previous leg" of the later one.
MAX_LAG_GAP_HOURS = 24


def pick_aircraft_column(columns, candidates=AIRCRAFT_ID_CANDIDATES, fallback=CARRIER_FALLBACK_COL):
    """Return the first name in ``candidates`` found in ``columns``, else ``fallback``.

    NOTE(review): with the carrier fallback, the "previous flight" is simply the
    carrier's previous scheduled departure anywhere in the network, not the same
    aircraft, and it may not have landed before the current flight departs.
    Treat the resulting features as a rough proxy until a real aircraft id is
    available.
    """
    for name in candidates:
        if name in columns:
            return name
    return fallback


def add_lagged_delay_features(flights, aircraft_col, max_gap_hours=MAX_LAG_GAP_HOURS):
    """Derive "previous leg arrived late" features for every flight.

    Flights are ordered by scheduled departure within each ``aircraft_col``
    group. A flight is flagged when the preceding flight in its group
      * was destined for this flight's origin airport,
      * was scheduled to depart at most ``max_gap_hours`` earlier, and
      * has a positive ``ARR_DELAY_NEW``.

    Parameters
    ----------
    flights : pandas.DataFrame
        Must contain YEAR, MONTH, DAY_OF_MONTH, CRS_DEP_TIME (hhmm number),
        ORIGIN_AIRPORT_ID, DEST_AIRPORT_ID, ARR_DELAY_NEW and ``aircraft_col``.
        The input is not modified.
    aircraft_col : str
        Column whose values define the groups searched for a previous flight.
    max_gap_hours : float, default 24
        Maximum gap between the two scheduled departures, in hours.

    Returns
    -------
    pandas.DataFrame
        Copy of ``flights`` in its original index order with the added columns
        CRS_DEP_DT, prev_dest_airport, prev_arr_delay, prev_dep_dt,
        hours_since_prev, lagged_delay_flag (0/1) and lag_delay_minutes
        (previous arrival delay when flagged, otherwise 0).

    Raises
    ------
    KeyError
        If a required column is missing from ``flights``.
    """
    required = {
        "YEAR", "MONTH", "DAY_OF_MONTH", "CRS_DEP_TIME",
        "ORIGIN_AIRPORT_ID", "DEST_AIRPORT_ID", "ARR_DELAY_NEW", aircraft_col,
    }
    missing = sorted(required - set(flights.columns))
    if missing:
        raise KeyError(f"flights is missing required columns: {missing}")

    legs = flights.copy()

    # CRS_DEP_TIME is an hhmm number (700 -> 07:00, 1830 -> 18:30). Splitting it
    # arithmetically avoids per-row string work, and a missing value becomes NaT
    # instead of raising.
    hhmm = pd.to_numeric(legs["CRS_DEP_TIME"], errors="coerce")
    minutes_after_midnight = (hhmm // 100) * 60 + hhmm % 100
    dep_date = pd.to_datetime(
        {
            "year": legs["YEAR"],
            "month": legs["MONTH"],
            "day": legs["DAY_OF_MONTH"],
        }
    )
    legs["CRS_DEP_DT"] = dep_date + pd.to_timedelta(minutes_after_midnight, unit="m")

    # Order each group chronologically so shift(1) yields the preceding flight.
    legs = legs.sort_values([aircraft_col, "CRS_DEP_DT"])
    by_aircraft = legs.groupby(aircraft_col, group_keys=False)

    legs["prev_dest_airport"] = by_aircraft["DEST_AIRPORT_ID"].shift(1)
    legs["prev_arr_delay"] = by_aircraft["ARR_DELAY_NEW"].shift(1)
    legs["prev_dep_dt"] = by_aircraft["CRS_DEP_DT"].shift(1)

    # Gap between the two scheduled departures, in hours (NaN for a group's first flight).
    legs["hours_since_prev"] = (legs["CRS_DEP_DT"] - legs["prev_dep_dt"]).dt.total_seconds() / 3600

    continues_from_prev = legs["prev_dest_airport"].eq(legs["ORIGIN_AIRPORT_ID"])
    within_window = legs["hours_since_prev"].le(max_gap_hours)
    # A missing previous delay (no previous flight, or no recorded arrival) counts as "not late".
    prev_arrived_late = legs["prev_arr_delay"].fillna(0).gt(0)
    is_lagged = continues_from_prev & within_window & prev_arrived_late

    legs["lagged_delay_flag"] = is_lagged.astype(int)
    # Carry the previous arrival delay only on flagged rows; everything else is 0.
    legs["lag_delay_minutes"] = legs["prev_arr_delay"].where(is_lagged, other=0).fillna(0)

    # Restore the caller's row order.
    return legs.sort_index()


aircraft_col = pick_aircraft_column(flight_data.columns)
print(f"Using '{aircraft_col}' as the aircraft grouping column.")

# `df` keeps every intermediate column for inspection; only the two final
# features are copied onto flight_data (assignment aligns on the index).
df = add_lagged_delay_features(flight_data, aircraft_col)

flight_data["lagged_delay_flag"] = df["lagged_delay_flag"]
flight_data["lag_delay_minutes"] = df["lag_delay_minutes"]


Using 'MKT_CARRIER_AIRLINE_ID' as the aircraft grouping column.


In [6]:
# Preview the first ten flights that were flagged as following a late inbound leg.
flight_data.loc[flight_data["lagged_delay_flag"].eq(1)].head(10)

Unnamed: 0,YEAR,QUARTER,MONTH,DAY_OF_MONTH,DAY_OF_WEEK,MKT_CARRIER_AIRLINE_ID,ORIGIN_AIRPORT_ID,ORIGIN_AIRPORT_SEQ_ID,ORIGIN_CITY_NAME,DEST_AIRPORT_ID,DEST_AIRPORT_SEQ_ID,DEST_CITY_NAME,CRS_DEP_TIME,DEP_TIME,DEP_DELAY_NEW,DEP_DEL15,DEP_DELAY_GROUP,ARR_TIME,ARR_DELAY_NEW,ARR_DEL15,ARR_DELAY_GROUP,CANCELLED,CANCELLATION_CODE,CRS_ELAPSED_TIME,ACTUAL_ELAPSED_TIME,DISTANCE,CARRIER_DELAY,WEATHER_DELAY,NAS_DELAY,SECURITY_DELAY,LATE_AIRCRAFT_DELAY,index,lagged_delay_flag,lag_delay_minutes
142,2024,1,1,1,1,19393,10397,1039707,"Atlanta, GA",14107,1410702,"Phoenix, AZ",620,623.0,3.0,0.0,0.0,837.0,0.0,0.0,-1.0,0.0,,260.0,254.0,1587.0,0.0,0.0,0.0,0.0,0.0,142,1,6.0
245,2024,1,1,1,1,19393,10423,1042302,"Austin, TX",13232,1323202,"Chicago, IL",1830,1833.0,3.0,0.0,0.0,2103.0,3.0,0.0,0.0,0.0,,150.0,150.0,972.0,0.0,0.0,0.0,0.0,0.0,245,1,2.0
385,2024,1,1,1,1,19393,10693,1069302,"Nashville, TN",12954,1295407,"Long Beach, CA",1310,1308.0,0.0,0.0,-1.0,1530.0,0.0,0.0,-2.0,0.0,,280.0,262.0,1785.0,0.0,0.0,0.0,0.0,0.0,385,1,2.0
648,2024,1,1,1,1,19393,10821,1082106,"Baltimore, MD",12451,1245102,"Jacksonville, FL",1100,1055.0,0.0,0.0,-1.0,1250.0,0.0,0.0,-1.0,0.0,,125.0,115.0,663.0,0.0,0.0,0.0,0.0,0.0,648,1,11.0
665,2024,1,1,1,1,19393,10821,1082106,"Baltimore, MD",13204,1320402,"Orlando, FL",1555,1554.0,0.0,0.0,-1.0,1818.0,0.0,0.0,-1.0,0.0,,155.0,144.0,787.0,0.0,0.0,0.0,0.0,0.0,665,1,3.0
924,2024,1,1,1,1,19393,11259,1125904,"Dallas, TX",11697,1169706,"Fort Lauderdale, FL",630,626.0,0.0,0.0,-1.0,950.0,0.0,0.0,-1.0,0.0,,155.0,144.0,1108.0,0.0,0.0,0.0,0.0,0.0,924,1,6.0
972,2024,1,1,1,1,19393,11259,1125904,"Dallas, TX",12992,1299206,"Little Rock, AR",2050,2050.0,0.0,0.0,0.0,2146.0,0.0,0.0,-1.0,0.0,,65.0,56.0,296.0,0.0,0.0,0.0,0.0,0.0,972,1,18.0
1041,2024,1,1,1,1,19393,11259,1125904,"Dallas, TX",14683,1468305,"San Antonio, TX",1810,1807.0,0.0,0.0,-1.0,1907.0,0.0,0.0,-1.0,0.0,,70.0,60.0,247.0,0.0,0.0,0.0,0.0,0.0,1041,1,21.0
1286,2024,1,1,1,1,19393,11292,1129202,"Denver, CO",13502,1350202,"Montrose/Delta, CO",850,844.0,0.0,0.0,-1.0,937.0,0.0,0.0,-1.0,0.0,,60.0,53.0,196.0,0.0,0.0,0.0,0.0,0.0,1286,1,24.0
1627,2024,1,1,1,1,19393,12191,1219103,"Houston, TX",11697,1169706,"Fort Lauderdale, FL",1920,1916.0,0.0,0.0,-1.0,2218.0,0.0,0.0,-2.0,0.0,,135.0,122.0,957.0,0.0,0.0,0.0,0.0,0.0,1627,1,17.0


In [5]:
# The flag is 0/1, so its sum is the number of flagged flights.
n_lagged_flights = flight_data["lagged_delay_flag"].sum()
print("Total lagged flights:", n_lagged_flights)

Total lagged flights: 96603
