In [5]:
# Basic libraries
import pandas as pd
import numpy as np

# ML & preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder

# PyTorch & TabTransformer
import torch
from tab_transformer_pytorch import TabTransformer

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# SHAP for feature importance
import shap

# Device setup
device = "cuda" if torch.cuda.is_available() else "cpu"
print("Device:", device)


Device: cpu


In [6]:
# Load master dataset
df = pd.read_csv(r"C:\Users\ASUS\OneDrive\Desktop\UA\master_df.csv")

# Quick look
print(df.shape)
df.head()


(8099, 33)


Unnamed: 0,company_id,flight_number,scheduled_departure_date_local,scheduled_departure_station_code,scheduled_arrival_station_code,scheduled_departure_datetime_local,scheduled_arrival_datetime_local,actual_departure_datetime_local,actual_arrival_datetime_local,total_seats,...,high_passenger_load,Delay_norm,ssr_norm,bag_ratio_norm,passenger_load_norm,tight_ground_time_flag,high_passenger_load_flag,Flight_Difficulty_Score,daily_rank,difficulty_category
0,OO,4792,2025-08-04,ORD,ROA,2025-08-04 17:57:00+00:00,2025-08-04T21:04:00Z,2025-08-04 18:04:00+00:00,2025-08-04T20:52:00Z,76,...,False,0.023211,0.0,0.057143,0.117647,0,0,0.041921,474.0,Easy
1,UA,920,2025-08-03,ORD,LHR,2025-08-03 18:05:00+00:00,2025-08-04T08:20:00Z,2025-08-03 18:27:00+00:00,2025-08-04T08:06:00Z,167,...,False,0.037718,0.111111,0.105714,0.318786,0,0,0.118437,157.0,Medium
2,UA,1776,2025-08-10,ORD,PHL,2025-08-10 18:20:00+00:00,2025-08-10T21:35:00Z,2025-08-10 20:11:00+00:00,2025-08-10T23:26:00Z,166,...,False,0.123791,0.111111,0.108571,0.335863,0,0,0.148247,91.0,Difficult
3,OO,5790,2025-08-06,ORD,CRW,2025-08-06 18:20:00+00:00,2025-08-06T21:04:00Z,2025-08-06 20:05:00+00:00,2025-08-06T22:42:00Z,50,...,False,0.117988,0.0,0.114286,0.098672,0,0,0.077988,265.0,Medium
4,UA,1398,2025-08-05,ORD,ATL,2025-08-05 18:20:00+00:00,2025-08-05T21:29:00Z,2025-08-05 18:16:00+00:00,2025-08-05T21:49:00Z,166,...,False,0.012573,0.0,0.145714,0.252372,0,0,0.083389,263.0,Medium


In [7]:
# Continuous features
cont_features = ['Delay_minutes', 'total_passengers', 'transfer_to_checked_ratio', 'tight_ground_time_flag', 'ssr_count']

# Categorical features (example)
cat_features = ['scheduled_departure_station_code', 'scheduled_arrival_station_code', 'fleet_type', 'carrier']

# Target
target = 'Flight_Difficulty_Score'


In [8]:
# Encode categorical features
encoders = {}
for col in cat_features:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    encoders[col] = le


In [9]:
X_cat = df[cat_features].values.astype(np.int64)
X_cont = df[cont_features].values.astype(np.float32)
y = df[target].values.astype(np.float32)

# Split
X_cat_train, X_cat_test, X_cont_train, X_cont_test, y_train, y_test = train_test_split(
    X_cat, X_cont, y, test_size=0.2, random_state=42
)

# Convert to PyTorch tensors
X_cat_train = torch.tensor(X_cat_train, dtype=torch.long).to(device)
X_cat_test = torch.tensor(X_cat_test, dtype=torch.long).to(device)
X_cont_train = torch.tensor(X_cont_train, dtype=torch.float32).to(device)
X_cont_test = torch.tensor(X_cont_test, dtype=torch.float32).to(device)
y_train = torch.tensor(y_train, dtype=torch.float32).view(-1,1).to(device)
y_test = torch.tensor(y_test, dtype=torch.float32).view(-1,1).to(device)


In [10]:
# Number of unique values for each categorical feature
categories = [len(np.unique(df[col])) for col in cat_features]

model = TabTransformer(
    categories=categories,
    num_continuous=len(cont_features),
    dim=64,
    depth=4,
    heads=8,
    attn_dropout=0.1,
    ff_dropout=0.1,
    mlp_hidden_mults=(4,2)
).to(device)

# Loss and optimizer
criterion = torch.nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)


In [11]:
# Training parameters
epochs = 50
batch_size = 64
n_samples = X_cont_train.shape[0]

for epoch in range(epochs):
    permutation = torch.randperm(n_samples)
    epoch_loss = 0

    for i in range(0, n_samples, batch_size):
        indices = permutation[i:i+batch_size]
        x_cat_batch = X_cat_train[indices]
        x_cont_batch = X_cont_train[indices]
        y_batch = y_train[indices]

        optimizer.zero_grad()
        output = model(x_categ=x_cat_batch, x_cont=x_cont_batch)
        loss = criterion(output, y_batch)
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item() * len(y_batch)

    print(f"Epoch {epoch+1}/{epochs}, Loss: {epoch_loss/n_samples:.4f}")


Epoch 1/50, Loss: 0.5300
Epoch 2/50, Loss: 0.0020
Epoch 3/50, Loss: 0.0019
Epoch 4/50, Loss: 0.0018
Epoch 5/50, Loss: 0.0018
Epoch 6/50, Loss: 0.0017
Epoch 7/50, Loss: 0.0016
Epoch 8/50, Loss: 0.0015
Epoch 9/50, Loss: 0.0015
Epoch 10/50, Loss: 0.0014
Epoch 11/50, Loss: 0.0014
Epoch 12/50, Loss: 0.0014
Epoch 13/50, Loss: 0.0014
Epoch 14/50, Loss: 0.0013
Epoch 15/50, Loss: 0.0013
Epoch 16/50, Loss: 0.0013
Epoch 17/50, Loss: 0.0013
Epoch 18/50, Loss: 0.0013
Epoch 19/50, Loss: 0.0013
Epoch 20/50, Loss: 0.0013
Epoch 21/50, Loss: 0.0013
Epoch 22/50, Loss: 0.0013
Epoch 23/50, Loss: 0.0013
Epoch 24/50, Loss: 0.0013
Epoch 25/50, Loss: 0.0012
Epoch 26/50, Loss: 0.0012
Epoch 27/50, Loss: 0.0012
Epoch 28/50, Loss: 0.0012
Epoch 29/50, Loss: 0.0012
Epoch 30/50, Loss: 0.0012
Epoch 31/50, Loss: 0.0012
Epoch 32/50, Loss: 0.0012
Epoch 33/50, Loss: 0.0012
Epoch 34/50, Loss: 0.0012
Epoch 35/50, Loss: 0.0012
Epoch 36/50, Loss: 0.0012
Epoch 37/50, Loss: 0.0012
Epoch 38/50, Loss: 0.0012
Epoch 39/50, Loss: 0.

In [14]:
model.eval()
with torch.no_grad():
    y_pred = model(x_categ=X_cat_test, x_cont=X_cont_test)
    mse = criterion(y_pred, y_test).item()
    print("Test MSE:", mse)

# Optional: Convert predictions to numpy
y_pred_np = y_pred.cpu().numpy()
y_test_np = y_test.cpu().numpy()


Test MSE: 0.0013817859580740333


In [23]:
# Define which columns are categorical and which are continuous
categorical_features = ['scheduled_departure_station_code', 'scheduled_arrival_station_code', 'fleet_type', 'carrier']
continuous_features = [col for col in df.columns if col not in categorical_features + ['Flight_Difficulty_Score']]

# Also make sure your encoders dictionary exists
# Example: encoders[col] = LabelEncoder() fitted on master_df[col]


In [24]:
# Example new flight data (replace with actual features)
new_flight_cont = np.array([[15, 180, 0.3, 0, 2]], dtype=np.float32)  # continuous features

# Categorical: use safe transform
new_flight_cat = []
for i, col in enumerate(categorical_features):
    le = encoders[col]
    val = 'B737'  # Example, replace with real
    if val in le.classes_:
        new_flight_cat.append(le.transform([val])[0])
    else:
        new_flight_cat.append(-1)  # unknown category

new_flight_cat = torch.tensor([new_flight_cat], dtype=torch.long).to(device)
new_flight_cont = torch.tensor(new_flight_cont, dtype=torch.float32).to(device)

# Predict
model.eval()
with torch.no_grad():
    difficulty_pred = model(x_categ=new_flight_cat, x_cont=new_flight_cont).item()

print("Predicted Flight Difficulty Score:", round(difficulty_pred, 3))


Predicted Flight Difficulty Score: 0.044


In [25]:
# Example 2: new flight data (continuous features)
new_flight_cont_2 = np.array([[25, 220, 0.5, 1, 3]], dtype=np.float32)  # replace with real continuous values

# Categorical: use safe transform with unknown handling
new_flight_cat_2 = []
for i, col in enumerate(categorical_features):
    le = encoders[col]
    val = 'A320'  # Example aircraft/fleet, replace with real
    if val in le.classes_:
        new_flight_cat_2.append(le.transform([val])[0])
    else:
        new_flight_cat_2.append(-1)  # unknown category

new_flight_cat_2 = torch.tensor([new_flight_cat_2], dtype=torch.long).to(device)
new_flight_cont_2 = torch.tensor(new_flight_cont_2, dtype=torch.float32).to(device)

# Predict
model.eval()
with torch.no_grad():
    difficulty_pred_2 = model(x_categ=new_flight_cat_2, x_cont=new_flight_cont_2).item()

print("Predicted Flight Difficulty Score (Sample 2):", round(difficulty_pred_2, 3))


Predicted Flight Difficulty Score (Sample 2): 0.045


In [43]:
# --- Continuous features for new flight ---
# Make sure the number of values = number of continuous features your model expects
new_flight_cont = np.array([[12, 200, 0.5, 1, 3]], dtype=np.float32)  # replace with realistic values

# --- Categorical features ---
# Provide actual categories used in your model
new_flight_cat = []
for i, col in enumerate(categorical_features):
    le = encoders[col]
    val = None
    if col == 'scheduled_departure_station_code':
        val = 'JFK'
    elif col == 'scheduled_arrival_station_code':
        val = 'SFO'
    elif col == 'fleet_type':
        val = 'A320'
    elif col == 'carrier':
        val = 'UA'
    
    # Safe transform
    if val in le.classes_:
        new_flight_cat.append(le.transform([val])[0])
    else:
        new_flight_cat.append(-1)  # unknown category

# Convert to torch tensors
new_flight_cat = torch.tensor([new_flight_cat], dtype=torch.long).to(device)
new_flight_cont = torch.tensor(new_flight_cont, dtype=torch.float32).to(device)

# --- Predict ---
model.eval()
with torch.no_grad():
    difficulty_pred = model(x_categ=new_flight_cat, x_cont=new_flight_cont).item()

print("Predicted Flight Difficulty Score:", round(difficulty_pred, 3))


Predicted Flight Difficulty Score: 0.05


In [45]:
# --- Continuous features for new flight ---
# Make sure the number of values = number of continuous features your model expects
new_flight_cont = np.array([[12, 200, 0.5, 1, 3]], dtype=np.float32)  # replace with realistic values

# --- Categorical features ---
# Provide actual categories used in your model
new_flight_cat = []
for i, col in enumerate(categorical_features):
    le = encoders[col]
    val = None
    if col == 'scheduled_departure_station_code':
        val = 'JFK'
    elif col == 'scheduled_arrival_station_code':
        val = 'SFO'
    elif col == 'fleet_type':
        val = 'A320'
    elif col == 'carrier':
        val = 'UA'
    
    # Safe transform
    if val in le.classes_:
        new_flight_cat.append(le.transform([val])[0])
    else:
        new_flight_cat.append(-1)  # unknown category

# Convert to torch tensors
new_flight_cat = torch.tensor([new_flight_cat], dtype=torch.long).to(device)
new_flight_cont = torch.tensor(new_flight_cont, dtype=torch.float32).to(device)

# --- Predict ---
model.eval()
with torch.no_grad():
    difficulty_pred = model(x_categ=new_flight_cat, x_cont=new_flight_cont).item()

print("Predicted Flight Difficulty Score:", round(difficulty_pred, 3))


Predicted Flight Difficulty Score: 0.05
