# Multi-tower neural network

## Imports

In [0]:
# Standard library
import copy
import os
from typing import Dict, List, Tuple

# Data processing
import numpy as np
import pandas as pd

# PySpark
from pyspark.sql import Window
import pyspark.sql.functions as sf  # Alias to avoid conflict with torch.nn.functional

# Scikit-learn
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import fbeta_score, roc_auc_score, mean_squared_error, mean_absolute_error

# PyTorch
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

# MLflow for experiment tracking
import mlflow
import mlflow.pytorch

# Record library versions and GPU availability in the cell output so a run
# can be tied back to the environment that produced it.
print("PyTorch version:", torch.__version__)
print("MLflow version:", mlflow.__version__)
print("CUDA available:", torch.cuda.is_available())



2.21.3
Created new experiment with ID: 1920497510759002


## Configuration

In [0]:
# =============================================================================
# Training Configuration
# =============================================================================
# NOTE(review): no seeding call (torch / numpy) that consumes RANDOM_SEED is
# visible in this part of the notebook -- confirm it is applied before training.
RANDOM_SEED = 42
NUM_EPOCHS = 20
BATCH_SIZE = 2048
LEARNING_RATE = 1e-3
PATIENCE = 5  # Early stopping patience

# Loss weighting: alpha * regression_loss + (1-alpha) * classification_loss
LOSS_ALPHA = 0.5

# Classification positive weight (for class imbalance)
# NOTE(review): presumably passed as pos_weight to the BCE loss -- confirm at the call site.
POS_WEIGHT = 4.0

# Delay threshold for binary classification (minutes)
# A flight is labelled "delayed" when its target value is >= this threshold.
DELAY_THRESHOLD = 15.0

# =============================================================================
# Data Paths
# =============================================================================
# All datasets live under one DBFS root; train/val/test are separate parquet
# directories and the CV folds are stored together under CV_DATA_PATH.
BASE_PATH = "dbfs:/student-groups/Group_2_2"
TRAIN_PATH = f"{BASE_PATH}/1_year_custom_joined/fe_graph_and_holiday_nnfeat/training_splits/train.parquet/"
VAL_PATH = f"{BASE_PATH}/1_year_custom_joined/fe_graph_and_holiday_nnfeat/training_splits/val.parquet/"
TEST_PATH = f"{BASE_PATH}/1_year_custom_joined/fe_graph_and_holiday_nnfeat/training_splits/test.parquet/"
CV_DATA_PATH = f"{BASE_PATH}/1_year_custom_joined/fe_graph_and_holiday_nnfeat/cv_splits"
PREDICTIONS_PATH = f"{BASE_PATH}/1_year_custom_joined/nn_predictions"

# =============================================================================
# MLflow Configuration
# =============================================================================
EXPERIMENT_NAME = "/Shared/team_2_2/mlflow-nn-tower-hyperparameters"

# =============================================================================
# Feature Definitions
# =============================================================================

# Categorical features (for learned embeddings)
# Order matters: it fixes the column order of the categorical tensor and
# therefore which embedding table each column is looked up in.
CATEGORICAL_COLS = [
    "OP_UNIQUE_CARRIER",       # Airline carrier code
    "ORIGIN_AIRPORT_SEQ_ID",   # Origin airport
    "DEST_AIRPORT_SEQ_ID",     # Destination airport
    "route",                   # Route string (origin-dest pair)
    "AIRPORT_HUB_CLASS",       # Hub classification
    "AIRLINE_CATEGORY",        # Airline type
]

# Numerical features (to be standardized)
# NOTE: dep_hour_sin/cos and the *_3h_change weather deltas produced by the
# feature-engineering functions in this notebook are not listed here, so they
# are not fed to the model.
NUMERICAL_COLS = [
    # Flight characteristics
    "DISTANCE",
    "CRS_ELAPSED_TIME",
    
    # Historical delay features (Phase 2 Feature Engineering)
    "prev_flight_delay_in_minutes",
    "origin_delays_4h",
    "delay_origin_7d",
    "delay_origin_carrier_7d",
    "delay_route_7d",
    "flight_count_24h",
    "AVG_TAXI_OUT_ORIGIN",
    "AVG_ARR_DELAY_ORIGIN",
    
    # Graph-based airport features
    "in_degree",
    "out_degree",
    "weighted_in_degree",
    "weighted_out_degree",
    "betweenness",
    "closeness",
    "N_RUNWAYS",
    
    # Weather features
    "HourlyVisibility",
    "HourlyStationPressure",
    "HourlyWindSpeed",
    "HourlyDryBulbTemperature",
    "HourlyDewPointTemperature",
    "HourlyRelativeHumidity",
    "HourlyAltimeterSetting",
    "HourlyWetBulbTemperature",
    "HourlyPrecipitation",
    "HourlyCloudCoverage",
    "HourlyCloudElevation",
    
    # Congestion features (engineered)
    # Produced by add_congestion_features / add_dest_congestion_features.
    "ground_flights_last_hour",
    "arrivals_last_hour",
    
    # Cyclic time features
    # Produced by add_time_features.
    "dow_sin", "dow_cos",  # Day of week
    "doy_sin", "doy_cos",  # Day of year
]

# Time feature for Time2Vec encoding
TIME_COL = "CRS_DEP_MINUTES"

# Target variable
TARGET_COL = "DEP_DELAY_NEW"

# Device configuration
# Falls back to CPU when no CUDA device is available.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {DEVICE}")

## Datasets (Custom Join)

In [0]:
# Read the train / validation / test parquet splits (features already engineered upstream).
train_df, val_df, test_df = (
    spark.read.parquet(path) for path in (TRAIN_PATH, VAL_PATH, TEST_PATH)
)

# Each count() triggers a Spark job; the totals are printed as a sanity check.
print("Training set:   {:,} records".format(train_df.count()))
print("Validation set: {:,} records".format(val_df.count()))
print("Test set:       {:,} records".format(test_df.count()))

path,name,size,modificationTime
dbfs:/student-groups/Group_2_2/3_month_custom_joined/fe_graph_and_holiday/cv_splits/,cv_splits/,0,1765328209514
dbfs:/student-groups/Group_2_2/3_month_custom_joined/fe_graph_and_holiday/stacked_input_optimized/,stacked_input_optimized/,0,1765328209514
dbfs:/student-groups/Group_2_2/3_month_custom_joined/fe_graph_and_holiday/training_splits/,training_splits/,0,1765328209514


In [0]:
# Load the cross-validation dataset and list the distinct fold ids it contains.
cv_df = spark.read.parquet(CV_DATA_PATH)
# Only "fold_id" is selected, so position 0 of each collected Row is the fold id.
folds = sorted(row[0] for row in cv_df.select("fold_id").distinct().collect())
print("Cross-validation folds available:", folds)

## Feature Engineering

In [0]:
def add_time_features(df):
    """
    Append sine/cosine encodings of departure time, weekday and day of year.

    A cyclic encoding maps a periodic quantity onto the unit circle so that the
    two ends of the cycle (e.g. hour 23 and hour 0) end up close together.

    Columns added, in this order:
        - dep_hour: CRS_DEP_MINUTES / 60.0
        - day_of_year: dayofyear(utc_timestamp) cast to double
        - dep_hour_sin / dep_hour_cos: 24-unit cycle over dep_hour
        - dow_sin / dow_cos: 7-unit cycle over DAY_OF_WEEK
        - doy_sin / doy_cos: 365-unit cycle over day_of_year
          (day 366 of a leap year therefore wraps slightly past one full turn)

    Args:
        df: Spark DataFrame with CRS_DEP_MINUTES, DAY_OF_WEEK and utc_timestamp
            columns. Assumes CRS_DEP_MINUTES is minutes after midnight -- TODO confirm.

    Returns:
        The DataFrame with the columns above appended.
    """
    df = (df
          .withColumn("dep_hour", sf.col("CRS_DEP_MINUTES") / 60.0)
          .withColumn("day_of_year", sf.dayofyear("utc_timestamp").cast("double")))

    # (output prefix, source column, cycle length)
    cycles = (
        ("dep_hour", "dep_hour", 24),
        ("dow", "DAY_OF_WEEK", 7),
        ("doy", "day_of_year", 365),
    )
    for prefix, source, period in cycles:
        # angle = 2*pi * x / period
        angle = 2 * sf.lit(np.pi) * sf.col(source) / period
        df = df.withColumn(f"{prefix}_sin", sf.sin(angle))
        df = df.withColumn(f"{prefix}_cos", sf.cos(angle))

    return df

In [0]:
def add_weather_deltas(df, lag_hours: int = 3, tolerance_hours: int = 1):
    """
    Add weather change features measured against conditions `lag_hours` earlier.

    For every row, looks up the most recent non-null reading of each weather
    metric at the same origin airport whose `utc_timestamp` lies between
    `lag_hours + tolerance_hours` and `lag_hours` hours before the row's own
    `utc_timestamp`, and stores `current - earlier`.

    The lookup is time-based (a range window over epoch seconds). A row-based
    `lag(col, 3)` compares against the third-previous *row* at the airport,
    which is not three hours earlier whenever rows are not spaced exactly
    one hour apart.

    Args:
        df: Spark DataFrame containing ORIGIN_AIRPORT_SEQ_ID, utc_timestamp and
            the weather columns listed in WEATHER_COLS below.
        lag_hours: How far back to look, in hours (default 3).
        tolerance_hours: Width of the acceptance band behind the lag point, in
            hours; readings older than `lag_hours + tolerance_hours` are ignored.

    Returns:
        The DataFrame with one extra column per weather metric.

    Features added (for each weather metric):
        - {metric}_{lag_hours}h_change (i.e. {metric}_3h_change by default).
          Null when the current value is null or no earlier reading exists
          inside the acceptance band.
    """
    WEATHER_COLS = [
        "HourlyVisibility",
        "HourlyStationPressure",
        "HourlyDryBulbTemperature",
        "HourlyWindSpeed",
        "HourlyPrecipitation"
    ]

    # rangeBetween needs integer offsets in the units of the ordering column (seconds).
    lag_sec = int(lag_hours * 3600)
    band_sec = int(tolerance_hours * 3600)

    # Frame = rows at the same airport with timestamp in [t - lag - band, t - lag].
    window = (Window
              .partitionBy("ORIGIN_AIRPORT_SEQ_ID")
              .orderBy(sf.col("utc_timestamp").cast("long"))
              .rangeBetween(-(lag_sec + band_sec), -lag_sec))

    for col_name in WEATHER_COLS:
        # The frame is ordered by time, so the last non-null value is the one closest to t - lag.
        earlier = sf.last(sf.col(col_name), ignorenulls=True).over(window)
        # Subtraction with a null operand yields null, so missing history propagates as null.
        df = df.withColumn(f"{col_name}_{lag_hours}h_change", sf.col(col_name) - earlier)

    return df

In [0]:
def add_congestion_features(df):
    """
    Add a trailing-hour flight count for the origin airport.

    For each row, counts the rows that share its ORIGIN_AIRPORT_SEQ_ID and whose
    `utc_timestamp` falls within the 3600 seconds up to and including the row's
    own timestamp, then subtracts one for the row itself. Rows with an identical
    timestamp are inside the window and are therefore counted.

    Columns added:
        - utc_ts_sec: utc_timestamp cast to long (epoch seconds)
        - ground_flights_last_hour: trailing-hour row count minus one

    Args:
        df: Spark DataFrame with ORIGIN_AIRPORT_SEQ_ID and utc_timestamp columns.

    Returns:
        The DataFrame with the two columns above appended.
    """
    with_epoch = df.withColumn("utc_ts_sec", sf.col("utc_timestamp").cast("long"))

    # Range frame over epoch seconds: [t - 3600, t]  (1 hour = 3600 seconds)
    trailing_hour = (
        Window.partitionBy("ORIGIN_AIRPORT_SEQ_ID")
        .orderBy("utc_ts_sec")
        .rangeBetween(-3600, 0)
    )
    rows_in_window = sf.count("utc_ts_sec").over(trailing_hour)

    # The current row is always inside its own frame; remove it from the count.
    return with_epoch.withColumn("ground_flights_last_hour", rows_in_window - 1)

In [0]:
def add_dest_congestion_features(df):
    """
    Add a trailing-hour flight count for the destination airport.

    For each row, counts the rows that share its DEST_AIRPORT_SEQ_ID and whose
    `utc_timestamp` falls within the 3600 seconds up to and including the row's
    own timestamp, then subtracts one for the row itself.

    NOTE(review): the window is ordered by `utc_timestamp`, the same column used
    for the origin-side count. If that column is the departure time, this
    measures flights *bound for* the destination that departed in the last
    hour, not flights that actually arrived there -- confirm the intended meaning.

    Columns added:
        - utc_ts_sec: utc_timestamp cast to long (epoch seconds)
        - arrivals_last_hour: trailing-hour row count minus one

    Args:
        df: Spark DataFrame with DEST_AIRPORT_SEQ_ID and utc_timestamp columns.

    Returns:
        The DataFrame with the two columns above appended.
    """
    with_epoch = df.withColumn("utc_ts_sec", sf.col("utc_timestamp").cast("long"))

    # Range frame over epoch seconds: [t - 3600, t]
    trailing_hour = (
        Window.partitionBy("DEST_AIRPORT_SEQ_ID")
        .orderBy("utc_ts_sec")
        .rangeBetween(-3600, 0)
    )
    rows_in_window = sf.count("utc_ts_sec").over(trailing_hour)

    # Exclude the current row from its own count.
    return with_epoch.withColumn("arrivals_last_hour", rows_in_window - 1)


def apply_feature_engineering(df):
    """
    Run every feature-engineering step on a Spark DataFrame.

    Steps are applied in a fixed order: time features, weather deltas,
    origin congestion, destination congestion.

    Args:
        df: Input Spark DataFrame.

    Returns:
        The DataFrame after all four transformations.
    """
    pipeline = (
        add_time_features,
        add_weather_deltas,
        add_congestion_features,
        add_dest_congestion_features,
    )
    for step in pipeline:
        df = df.transform(step)
    return df

In [0]:
# Derive the engineered train / validation / test frames; the raw frames are left untouched.
train_df_fe, val_df_fe, test_df_fe = (
    apply_feature_engineering(split) for split in (train_df, val_df, test_df)
)

print("Feature engineering complete.")


In [0]:
<a name="preprocessing"></a>
## 4. Data Preprocessing

### 4.1 Categorical Encoding

Build category-to-index mappings from training data only, then apply to all datasets.


Checkpointed 1_year_custom_joined/fe_graph_and_holiday_nnfeat/training_splits/train
Checkpointed 1_year_custom_joined/fe_graph_and_holiday_nnfeat/training_splits/val
Checkpointed 1_year_custom_joined/fe_graph_and_holiday_nnfeat/training_splits/test


## Start here when running experiment

In [0]:
def build_category_maps(df: pd.DataFrame, categorical_cols: List[str]) -> Dict[str, Dict]:
    """
    Build category-to-integer mappings from training data.

    Values are compared as strings (each column is cast with `astype(str)`),
    and indices are assigned in sorted string order starting at 1.

    Args:
        df: Training DataFrame (Pandas)
        categorical_cols: List of categorical column names

    Returns:
        Dictionary mapping column names to {category: index} dictionaries.
        Index 0 is reserved for unknown (UNK) categories, and the indices of
        every mapping are exactly 0 .. len(mapping) - 1, so len(mapping) is a
        valid embedding-table size.
    """
    cat_maps = {}
    for col in categorical_cols:
        # A literal "UNK" in the data must share the reserved slot. Otherwise it
        # would overwrite key "UNK" with a non-zero index, leaving index 0
        # unused and making the largest index equal to len(mapping), which is
        # out of range for an embedding table of that size.
        observed = sorted(set(df[col].astype(str).unique().tolist()) - {"UNK"})
        mapping = {"UNK": 0}
        mapping.update((value, idx) for idx, value in enumerate(observed, start=1))
        cat_maps[col] = mapping
    return cat_maps


def apply_category_maps(df: pd.DataFrame, cat_maps: Dict, categorical_cols: List[str]) -> pd.DataFrame:
    """
    Encode categorical columns using pre-built mappings.

    Each column is cast to string and looked up in its mapping with a
    vectorized `Series.map`; values absent from the mapping become 0.

    Args:
        df: DataFrame to encode (not modified; a copy is returned)
        cat_maps: Category mappings from build_category_maps()
        categorical_cols: List of categorical column names

    Returns:
        DataFrame with encoded categorical columns as int64 (unseen values -> 0)
    """
    df = df.copy()
    for col in categorical_cols:
        # map() leaves NaN for keys missing from the dict; those are the
        # unseen categories, which go to the reserved index 0.
        codes = df[col].astype(str).map(cat_maps[col])
        df[col] = codes.fillna(0).astype("int64")
    return df


def compute_embedding_dims(cat_maps: Dict, categorical_cols: List[str]) -> Tuple[List[int], List[int]]:
    """
    Derive embedding-table sizes and widths for each categorical column.

    The table size is the number of entries in the column's mapping. The width
    follows the rule of thumb min(64, floor(cardinality ** 0.3)).

    Args:
        cat_maps: Per-column {category: index} mappings
        categorical_cols: Columns to size, in the order the model expects

    Returns:
        Tuple of (category_dimensions, embedding_dimensions), each a list
        aligned with categorical_cols
    """
    cat_dims: List[int] = []
    emb_dims: List[int] = []
    for name in categorical_cols:
        cardinality = len(cat_maps[name])
        cat_dims.append(cardinality)
        # int() truncates toward zero, i.e. floors the positive power.
        emb_dims.append(min(64, int(cardinality ** 0.3)))
    return cat_dims, emb_dims

## Vectorization for Torch

In [0]:
class FlightDataset(Dataset):
    """
    In-memory PyTorch Dataset over a preprocessed flights DataFrame.

    The whole frame is converted to tensors once at construction:
        - cat:  categorical columns as a long tensor (embedding indices)
        - num:  numerical columns as a float32 tensor
        - time: the time column as a float32 column vector
        - y:    the target column as a float32 column vector
    """

    def __init__(self, df: pd.DataFrame, categorical_cols: List[str],
                 numerical_cols: List[str], time_col: str, target_col: str):
        """
        Args:
            df: Preprocessed Pandas DataFrame
            categorical_cols: Names of the already integer-encoded categorical columns
            numerical_cols: Names of the numerical columns
            time_col: Name of the time column
            target_col: Name of the target column
        """
        def as_tensor(columns, dtype):
            # Copies the selected column(s) out of the frame into a new tensor.
            return torch.tensor(df[columns].values, dtype=dtype)

        self.cat = as_tensor(categorical_cols, torch.long)
        self.num = as_tensor(numerical_cols, torch.float32)
        # Single columns come back 1-D; add a trailing axis so they are [N, 1].
        self.time = as_tensor(time_col, torch.float32)[:, None]
        self.y = as_tensor(target_col, torch.float32)[:, None]

    def __len__(self) -> int:
        return self.y.shape[0]

    def __getitem__(self, idx: int) -> Tuple[torch.Tensor, ...]:
        # Order matches what the training / evaluation loops unpack: cat, num, time, y.
        return (self.cat[idx], self.num[idx], self.time[idx], self.y[idx])

In [0]:
def prepare_data_for_training(
    train_spark_df, 
    val_spark_df,
    categorical_cols: List[str],
    numerical_cols: List[str],
    time_col: str,
    target_col: str
) -> Tuple[pd.DataFrame, pd.DataFrame, Dict, StandardScaler, List[int], List[int]]:
    """
    Turn Spark train/validation frames into encoded, scaled Pandas frames.

    Pipeline: select the model columns -> collect to Pandas -> integer-encode
    categoricals -> standardize numericals -> size the embedding tables.
    The category mappings and the scaler are fitted on the training frame only
    and then applied to both frames. The time and target columns are passed
    through unchanged.

    Args:
        train_spark_df: Spark DataFrame with the training rows
        val_spark_df: Spark DataFrame with the validation rows
        categorical_cols: Categorical column names
        numerical_cols: Numerical column names
        time_col: Time column name
        target_col: Target column name

    Returns:
        Tuple of (train_pd, val_pd, cat_maps, scaler, cat_dims, emb_dims)
    """
    # Only the columns the model consumes are collected to the driver.
    selected = [*categorical_cols, *numerical_cols, time_col, target_col]
    train_pd, val_pd = (
        sdf.select(selected).toPandas() for sdf in (train_spark_df, val_spark_df)
    )

    # Categorical encoding: vocabulary comes from the training frame alone.
    cat_maps = build_category_maps(train_pd, categorical_cols)
    train_pd, val_pd = (
        apply_category_maps(frame, cat_maps, categorical_cols)
        for frame in (train_pd, val_pd)
    )

    # Standardization: statistics come from the training frame alone.
    scaler = StandardScaler().fit(train_pd[numerical_cols])
    for frame in (train_pd, val_pd):
        frame[numerical_cols] = scaler.transform(frame[numerical_cols])

    cat_dims, emb_dims = compute_embedding_dims(cat_maps, categorical_cols)

    return train_pd, val_pd, cat_maps, scaler, cat_dims, emb_dims


## Model building blocks (ResBlock, Time2Vec)

In [0]:
class ResBlock(nn.Module):
    """
    Pre-norm residual MLP block.

    Data flow: LayerNorm -> Linear -> GELU -> Dropout -> Linear, with the block
    input added back to the result. Input and output share the same width, so
    blocks can be stacked freely.
    """

    def __init__(self, dim: int, dropout: float = 0.1):
        """
        Args:
            dim: Feature width of the block (input and output)
            dropout: Dropout probability applied after the activation
        """
        super().__init__()
        self.ln = nn.LayerNorm(dim)
        self.fc1 = nn.Linear(in_features=dim, out_features=dim)
        self.fc2 = nn.Linear(in_features=dim, out_features=dim)
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        normed = self.ln(x)
        activated = self.dropout(F.gelu(self.fc1(normed)))
        # Skip connection: the block learns a correction on top of its input.
        return x + self.fc2(activated)

In [0]:
class Time2Vec(nn.Module):
    """
    Time2Vec: learnable encoding of a scalar time value.

    Produces one linear component plus k sinusoidal components whose
    frequencies and phases are learned:

        output = [w0 * t + b0, sin(W * t + b)]

    Reference: Kazemi et al., "Time2Vec: Learning a Vector Representation of Time" (2019)
    """

    def __init__(self, k: int):
        """
        Args:
            k: Number of periodic components (output dim = k + 1)
        """
        super().__init__()
        self.wb = nn.Linear(1, 1)  # linear (trend) component
        self.ws = nn.Linear(1, k)  # frequencies and phases of the periodic components

    def forward(self, t: torch.Tensor) -> torch.Tensor:
        trend = self.wb(t)
        waves = self.ws(t).sin()
        # Linear component first, then the k periodic ones.
        return torch.cat((trend, waves), dim=-1)

In [0]:
# -----------------------------
# Apply to train/val/test
# -----------------------------
# NOTE(review): `train_pd`, `val_pd`, `test_pd` and lowercase `categorical_cols`
# are not defined by any cell visible above (the config cell defines
# CATEGORICAL_COLS), so this cell appears to rely on state from another cell
# or an earlier kernel session -- confirm it survives Restart & Run All.
# NOTE(review): the frames are overwritten with their encoded versions, so
# running this cell twice re-encodes already-encoded integers.
# Build mapping dicts from training data only
cat_maps = build_category_maps(train_pd, categorical_cols)

# Apply safely to all datasets
# (values not present in the training vocabulary are encoded as 0)
train_pd = apply_category_maps(train_pd, cat_maps, categorical_cols)
val_pd   = apply_category_maps(val_pd, cat_maps, categorical_cols)
test_pd  = apply_category_maps(test_pd, cat_maps, categorical_cols)

In [0]:
class ResFiLMMLP(nn.Module):
    """
    Multi-tower network with Feature-wise Linear Modulation (FiLM) and two heads.

    Data flow (as implemented in forward):

        x_cat ──► Embedding tower ──► emb ───────────┐
                                                     ▼
        x_num ──► Linear + ResBlocks ──► h ──► FiLM: γ * emb + β ──► emb_mod
                                         │
        x_time ──► Time2Vec ──► t_feat   │
           │                             │
           └──── raw time ───────────────┤
                                         ▼
                  z = concat[emb_mod, h, t_feat, x_time]
                                 │
                 ┌───────────────┴───────────────┐
                 ▼                               ▼
          Regression head                Classification head
          → delay estimate               → raw logit

    The numeric tower both feeds the fused vector directly and produces the
    scale (γ) and shift (β) that modulate the concatenated embeddings.
    """

    def __init__(
        self,
        cat_dims: List[int],
        emb_dims: List[int],
        num_numerical: int,
        time_dim: int = 8,
        hidden_dim: int = 256,
        num_res_blocks: int = 4,
        emb_dropout: float = 0.05,
        num_dropout: float = 0.1,
        film_dropout: float = 0.1,
        final_dropout: float = 0.2
    ):
        """
        Args:
            cat_dims: Number of categories for each categorical feature
            emb_dims: Embedding dimension for each categorical feature
                (paired positionally with cat_dims)
            num_numerical: Number of numerical features
            time_dim: Number of periodic Time2Vec components
            hidden_dim: Width of the numeric tower
            num_res_blocks: Number of residual blocks in the numeric tower
            emb_dropout: Dropout rate applied to the concatenated embeddings
            num_dropout: Dropout rate inside each residual block
            film_dropout: Dropout rate applied to the FiLM scale and shift
            final_dropout: Dropout rate inside both prediction heads
        """
        super().__init__()

        # --- Embedding tower: one table per categorical feature ---
        embedding_tables = [
            nn.Embedding(n_categories, width)
            for n_categories, width in zip(cat_dims, emb_dims)
        ]
        self.embeddings = nn.ModuleList(embedding_tables)
        self.emb_total = sum(emb_dims)
        self.emb_dropout = nn.Dropout(emb_dropout)

        # --- Numeric tower: projection followed by a stack of residual blocks ---
        self.fc_num = nn.Linear(num_numerical, hidden_dim)
        residual_stack = [
            ResBlock(hidden_dim, dropout=num_dropout) for _ in range(num_res_blocks)
        ]
        self.res_blocks = nn.ModuleList(residual_stack)

        # --- FiLM: one linear layer emits both γ and β, each emb_total wide ---
        self.film = nn.Linear(hidden_dim, 2 * self.emb_total)
        self.film_dropout = nn.Dropout(film_dropout)

        # --- Time encoder ---
        self.t2v = Time2Vec(time_dim)

        # Fused width = modulated embeddings + numeric hidden
        #               + Time2Vec output (time_dim periodic + 1 linear) + raw time
        fused_dim = self.emb_total + hidden_dim + (time_dim + 1) + 1

        # --- Prediction heads (identical layout, separate weights) ---
        self.reg_head = self._build_head(fused_dim, final_dropout)  # delay estimate
        self.clf_head = self._build_head(fused_dim, final_dropout)  # raw logit; sigmoid gives a probability

    @staticmethod
    def _build_head(in_dim: int, dropout: float) -> nn.Sequential:
        """Three-layer MLP (in_dim -> 256 -> 128 -> 1) with GELU and dropout between layers."""
        return nn.Sequential(
            nn.Linear(in_dim, 256),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(256, 128),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(128, 1)
        )

    def forward(
        self, 
        x_cat: torch.Tensor, 
        x_num: torch.Tensor, 
        x_time: torch.Tensor
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """
        Forward pass.

        Args:
            x_cat: Categorical features [batch, num_cat_features]
            x_num: Numerical features [batch, num_numerical]
            x_time: Time feature [batch, 1]

        Returns:
            Tuple of (regression_output, classification_logit)
        """
        # Column i of x_cat is looked up in embedding table i.
        looked_up = [table(x_cat[:, idx]) for idx, table in enumerate(self.embeddings)]
        emb = self.emb_dropout(torch.cat(looked_up, dim=-1))

        # Numeric tower
        h = F.gelu(self.fc_num(x_num))
        for res_block in self.res_blocks:
            h = res_block(h)

        # FiLM: split the projection into scale and shift, each with its own dropout mask.
        gamma, beta = self.film(h).chunk(2, dim=-1)
        gamma = self.film_dropout(gamma)
        beta = self.film_dropout(beta)
        emb_mod = gamma * emb + beta

        # Learned time encoding; the raw time value is also kept in the fused vector.
        t_feat = self.t2v(x_time)
        z = torch.cat([emb_mod, h, t_feat, x_time], dim=-1)

        # Both heads read the same fused representation.
        reg_out = self.reg_head(z)
        clf_out = self.clf_head(z)
        return reg_out, clf_out

OP_UNIQUE_CARRIER: 18 categories -> embedding dim 2
ORIGIN_AIRPORT_SEQ_ID: 373 categories -> embedding dim 5
DEST_AIRPORT_SEQ_ID: 388 categories -> embedding dim 5
route: 6252 categories -> embedding dim 13
AIRPORT_HUB_CLASS: 7 categories -> embedding dim 1
AIRLINE_CATEGORY: 4 categories -> embedding dim 1


## Training loop

In [0]:
def train_one_epoch(
    model: nn.Module,
    dataloader: DataLoader,
    optimizer: torch.optim.Optimizer,
    criterion_reg: nn.Module,
    criterion_clf: nn.Module,
    device: torch.device,
    alpha: float = 0.5,
    delay_threshold: float = 15.0
) -> float:
    """
    Run one optimization pass over the training data.

    Each batch yields (cat, num, time, y). The model returns a regression
    output and a classification logit; the binary label is derived on the fly
    as y >= delay_threshold. The optimized loss is
    alpha * regression_loss + (1 - alpha) * classification_loss.

    Args:
        model: The neural network model
        dataloader: Training data loader
        optimizer: Optimizer instance
        criterion_reg: Regression loss function (e.g., L1Loss)
        criterion_clf: Classification loss on raw logits (e.g., BCEWithLogitsLoss)
        device: Device to train on
        alpha: Weight for regression loss (1-alpha for classification)
        delay_threshold: Threshold in minutes for binary classification

    Returns:
        Mean of the per-batch losses (each batch counts equally, whatever its size)
    """
    model.train()
    batch_losses = []

    for batch in dataloader:
        cat, num, time, y = (part.to(device) for part in batch)

        optimizer.zero_grad()
        reg_out, clf_out = model(cat, num, time)

        # Binary target: delayed if >= threshold
        y_bin = (y >= delay_threshold).float()

        # Weighted sum of the two task losses
        loss = alpha * criterion_reg(reg_out, y) + (1 - alpha) * criterion_clf(clf_out, y_bin)

        loss.backward()
        optimizer.step()
        batch_losses.append(loss.item())

    return sum(batch_losses) / len(dataloader)

## Evaluation and fold preparation

In [0]:
def evaluate_model(
    model: nn.Module,
    dataloader: DataLoader,
    device: torch.device,
    delay_threshold: float = 15.0
) -> Dict[str, float]:
    """
    Score the model on a dataset with regression and classification metrics.

    Runs the model in eval mode without gradients, gathers predictions for the
    whole dataloader on the CPU, and computes:
        - mae / rmse between the regression output and the raw target
        - auc of the sigmoid of the classification logit against the binary
          label (target >= delay_threshold); reported as 0.5 when sklearn
          raises ValueError (e.g. only one class present)
        - f2_score of the probabilities cut at 0.5

    Args:
        model: The neural network model
        dataloader: Evaluation data loader yielding (cat, num, time, y)
        device: Device to evaluate on
        delay_threshold: Threshold in minutes for binary classification

    Returns:
        Dictionary with keys "mae", "rmse", "auc" and "f2_score"
    """
    model.eval()
    reg_batches, prob_batches, y_batches, label_batches = [], [], [], []

    with torch.no_grad():
        for batch in dataloader:
            cat, num, time, y = (part.to(device) for part in batch)
            reg_out, clf_out = model(cat, num, time)

            reg_batches.append(reg_out.cpu())
            prob_batches.append(torch.sigmoid(clf_out).cpu())
            y_batches.append(y.cpu())
            label_batches.append((y >= delay_threshold).float().cpu())

    # Stitch the per-batch tensors together and hand NumPy arrays to sklearn.
    y_pred_reg, y_pred_clf, y_true_reg, y_true_clf = (
        torch.cat(chunks).numpy()
        for chunks in (reg_batches, prob_batches, y_batches, label_batches)
    )

    metrics = {
        "mae": mean_absolute_error(y_true_reg, y_pred_reg),
        "rmse": np.sqrt(mean_squared_error(y_true_reg, y_pred_reg)),
    }

    try:
        metrics["auc"] = roc_auc_score(y_true_clf, y_pred_clf)
    except ValueError:
        metrics["auc"] = 0.5  # Handle edge case with single class

    # F2 score at threshold 0.5
    predicted_labels = (y_pred_clf > 0.5).astype(int)
    metrics["f2_score"] = fbeta_score(y_true_clf, predicted_labels, beta=2)

    return metrics

In [0]:
def prepare_fold_data(
    fold_df,
    categorical_cols: List[str],
    numerical_cols: List[str],
    time_col: str,
    target_col: str
) -> Tuple[pd.DataFrame, pd.DataFrame, List[int], List[int]]:
    """
    Prepare data for a single cross-validation fold.

    Splits the fold by its 'split_type' column, applies feature engineering to
    each split independently, then delegates column selection, Pandas
    conversion, categorical encoding and scaling to
    prepare_data_for_training() so that CV folds and the final train/val run
    share one preprocessing implementation.

    Args:
        fold_df: Spark DataFrame containing the fold data with 'split_type' column
        categorical_cols: List of categorical column names
        numerical_cols: List of numerical column names
        time_col: Name of time column
        target_col: Name of target column

    Returns:
        Tuple of (train_pd, val_pd, cat_dims, emb_dims)
    """
    # Keep only the 'train' and 'validation' rows; any other split_type
    # (e.g. 'gap') is dropped.
    train_spark = fold_df.filter(sf.col("split_type") == "train")
    val_spark = fold_df.filter(sf.col("split_type") == "validation")

    # Encoders and scaler are fitted on the training split only inside the helper.
    # The fitted mappings and scaler are not part of this function's return value.
    train_pd, val_pd, _cat_maps, _scaler, cat_dims, emb_dims = prepare_data_for_training(
        apply_feature_engineering(train_spark),
        apply_feature_engineering(val_spark),
        categorical_cols,
        numerical_cols,
        time_col,
        target_col,
    )

    return train_pd, val_pd, cat_dims, emb_dims


## Cross-validation

In [0]:
def _train_fold_with_early_stopping(
    model,
    train_dl,
    val_dl,
    optimizer,
    criterion_reg,
    criterion_clf,
    config: Dict
) -> Tuple:
    """
    Train one fold's model with early stopping on validation F2.

    Per-epoch metrics are logged with mlflow.log_metrics, so this must be
    called while the fold's MLflow run is active.

    Args:
        model: Model to train (already moved to config["device"])
        train_dl: DataLoader over the fold's training data
        val_dl: DataLoader over the fold's validation data
        optimizer: Optimizer bound to the model's parameters
        criterion_reg: Regression loss
        criterion_clf: Classification loss
        config: Training configuration dict; reads 'num_epochs', 'patience',
            'alpha', 'delay_threshold' and 'device'

    Returns:
        Tuple of (best_f2, best_mae, best_model_state). best_model_state is a
        deep copy of the state dict from the epoch with the highest validation
        F2, or None if no epoch beat the initial -1.0 sentinel. best_mae is the
        lowest validation MAE seen in any epoch and is tracked independently,
        so it need not come from the same epoch as best_f2.
    """
    device = config["device"]
    delay_threshold = config["delay_threshold"]

    best_f2 = -1.0
    best_mae = float('inf')
    best_model_state = None
    early_stop_counter = 0

    for epoch in range(config["num_epochs"]):
        # Train
        train_loss = train_one_epoch(
            model, train_dl, optimizer, criterion_reg, criterion_clf,
            device, config["alpha"], delay_threshold
        )

        # Evaluate on both splits so train/validation F2 can be compared per epoch
        train_metrics = evaluate_model(model, train_dl, device, delay_threshold)
        val_metrics = evaluate_model(model, val_dl, device, delay_threshold)

        print(f"  Epoch {epoch:2d} | "
              f"Train F2: {train_metrics['f2_score']:.3f} | "
              f"Val F2: {val_metrics['f2_score']:.3f} | "
              f"Val MAE: {val_metrics['mae']:.2f}")

        # Log metrics
        mlflow.log_metrics({
            "train_loss": train_loss,
            "train_f2": train_metrics['f2_score'],
            "val_f2": val_metrics['f2_score'],
            "val_mae": val_metrics['mae'],
            "val_rmse": val_metrics['rmse'],
            "val_auc": val_metrics['auc']
        }, step=epoch)

        # Track best F2 model; only a strict improvement resets the patience counter
        if val_metrics['f2_score'] > best_f2:
            best_f2 = val_metrics['f2_score']
            # Deep copy: state_dict() returns references that later epochs overwrite
            best_model_state = copy.deepcopy(model.state_dict())
            early_stop_counter = 0
        else:
            early_stop_counter += 1

        # Track best MAE (independent of the best-F2 checkpoint)
        if val_metrics['mae'] < best_mae:
            best_mae = val_metrics['mae']

        # Early stopping
        if early_stop_counter >= config["patience"]:
            print(f"  Early stopping at epoch {epoch}")
            break

    return best_f2, best_mae, best_model_state


def run_cross_validation(
    cv_df,
    folds: List[int],
    categorical_cols: List[str],
    numerical_cols: List[str],
    time_col: str,
    target_col: str,
    config: Dict
) -> Dict[str, List[float]]:
    """
    Run k-fold cross-validation with MLflow tracking.

    A parent MLflow run holds the shared hyperparameters and the averaged
    summary metrics; each fold trains inside its own nested run, which
    receives the per-epoch metrics and the best-F2 model artifact.

    Args:
        cv_df: Spark DataFrame with all CV folds (filtered on 'fold_id')
        folds: List of fold IDs to process
        categorical_cols: Categorical feature columns
        numerical_cols: Numerical feature columns
        time_col: Time column name
        target_col: Target column name
        config: Training configuration dict; reads 'num_epochs', 'batch_size',
            'learning_rate', 'patience', 'alpha', 'pos_weight',
            'delay_threshold' and 'device'

    Returns:
        Dictionary with keys 'best_val_f2' and 'best_val_mae', each a list
        holding one value per processed fold (empty lists if `folds` is empty)
    """
    cv_summary = {"best_val_f2": [], "best_val_mae": []}

    mlflow.end_run()  # End any existing run

    with mlflow.start_run(run_name="CV_Orchestrator_ResFiLM_MLP"):
        mlflow.log_params({
            "num_epochs": config["num_epochs"],
            "batch_size": config["batch_size"],
            "learning_rate": config["learning_rate"],
            "patience": config["patience"],
            "loss_alpha": config["alpha"],
            # pos_weight shapes the classification loss, so record it with the
            # other hyperparameters to keep runs reproducible
            "pos_weight": config["pos_weight"],
            "delay_threshold": config["delay_threshold"]
        })

        for fold in folds:
            print(f"\n{'='*50}")
            print(f"Starting Fold {fold}")
            print(f"{'='*50}")

            with mlflow.start_run(run_name=f"Fold_{fold}", nested=True):
                device = config["device"]

                # Prepare fold data
                fold_data = cv_df.filter(sf.col("fold_id") == fold)
                train_pd, val_pd, cat_dims, emb_dims = prepare_fold_data(
                    fold_data, categorical_cols, numerical_cols, time_col, target_col
                )

                # Create data loaders
                train_ds = FlightDataset(train_pd, categorical_cols, numerical_cols, time_col, target_col)
                val_ds = FlightDataset(val_pd, categorical_cols, numerical_cols, time_col, target_col)
                train_dl = DataLoader(train_ds, batch_size=config["batch_size"], shuffle=True)
                val_dl = DataLoader(val_ds, batch_size=config["batch_size"])

                # Initialize a fresh model for every fold
                model = ResFiLMMLP(
                    cat_dims=cat_dims,
                    emb_dims=emb_dims,
                    num_numerical=len(numerical_cols),
                    time_dim=8
                ).to(device)

                optimizer = torch.optim.AdamW(model.parameters(), lr=config["learning_rate"])
                criterion_reg = nn.L1Loss()
                # Force a float tensor so an integer pos_weight in the config
                # does not produce an integer-typed weight
                criterion_clf = nn.BCEWithLogitsLoss(
                    pos_weight=torch.tensor(
                        [config["pos_weight"]], dtype=torch.float32
                    ).to(device)
                )

                # Training loop with early stopping
                best_f2, best_mae, best_model_state = _train_fold_with_early_stopping(
                    model, train_dl, val_dl, optimizer,
                    criterion_reg, criterion_clf, config
                )

                # Log best model (explicit None check rather than dict truthiness)
                if best_model_state is not None:
                    model.load_state_dict(best_model_state)
                    mlflow.pytorch.log_model(model, f"model_fold_{fold}_best_f2")

                print(f"  >> Fold {fold} Best Val F2: {best_f2:.4f}, Best MAE: {best_mae:.2f}")

                cv_summary["best_val_f2"].append(best_f2)
                cv_summary["best_val_mae"].append(best_mae)

        # With no folds there is nothing to average: np.mean([]) would return
        # NaN (with a RuntimeWarning) and that NaN would be logged to MLflow.
        if not cv_summary["best_val_f2"]:
            print("No folds were processed; skipping CV summary.")
            return cv_summary

        # Log CV summary
        avg_f2 = np.mean(cv_summary['best_val_f2'])
        avg_mae = np.mean(cv_summary['best_val_mae'])
        mlflow.log_metrics({"cv_avg_best_f2": avg_f2, "cv_avg_best_mae": avg_mae})

        print(f"\n{'='*50}")
        print("Cross-Validation Complete")
        print(f"{'='*50}")
        print(f"Average Best F2: {avg_f2:.4f}")
        print(f"Average Best MAE: {avg_mae:.2f}")

    return cv_summary

## MLflow Setup and Training Configuration

In [0]:
# Point MLflow at the project experiment before any run is started.
# NOTE(review): this Spark flag presumably toggles Databricks' automatic MLlib
# tracking; the PyTorch runs in this notebook are logged explicitly - confirm it is needed.
spark.conf.set("spark.databricks.mlflow.trackMLlib.enabled", "true")

try:
    experiment = mlflow.get_experiment_by_name(EXPERIMENT_NAME)
    if experiment is not None:
        print(f"Using existing experiment: {experiment.name}")
    else:
        # First use of this name: create it and report the new ID
        experiment_id = mlflow.create_experiment(EXPERIMENT_NAME)
        print(f"Created new experiment with ID: {experiment_id}")
    mlflow.set_experiment(EXPERIMENT_NAME)
except Exception as setup_error:
    # Best effort: report the problem and let the notebook continue
    print(f"Error with experiment setup: {setup_error}")

# Bundle the hyperparameter constants into the dict consumed by run_cross_validation
training_config = dict(
    num_epochs=NUM_EPOCHS,
    batch_size=BATCH_SIZE,
    learning_rate=LEARNING_RATE,
    patience=PATIENCE,
    alpha=LOSS_ALPHA,
    pos_weight=POS_WEIGHT,
    delay_threshold=DELAY_THRESHOLD,
    device=DEVICE,
)

# Echo the effective settings so they appear in the cell output
print("Training configuration:")
for k, v in training_config.items():
    print(f"  {k}: {v}")

In [0]:
# Launch cross-validation over every fold; the returned summary
# (per-fold best validation F2 / MAE) is kept in cv_results.
cv_results = run_cross_validation(
    cv_df=cv_df, folds=folds,
    categorical_cols=CATEGORICAL_COLS, numerical_cols=NUMERICAL_COLS,
    time_col=TIME_COL, target_col=TARGET_COL,
    config=training_config,
)

In [0]:
<a name="results"></a>
## 8. Results Summary

In [0]:
# Tabulate the per-fold best scores; fold labels are positional (1..N)
results_df = pd.DataFrame(
    data={
        "Fold": [idx + 1 for idx in range(len(cv_results["best_val_f2"]))],
        "Best F2 Score": cv_results["best_val_f2"],
        "Best MAE (min)": cv_results["best_val_mae"],
    }
)

# Banner, table, then mean ± standard deviation across folds
print(
    "\n" + "=" * 60,
    "Cross-Validation Results by Fold",
    "=" * 60,
    results_df.to_string(index=False),
    "\n" + "-" * 60,
    f"Average F2 Score: {np.mean(cv_results['best_val_f2']):.4f} ± {np.std(cv_results['best_val_f2']):.4f}",
    f"Average MAE:      {np.mean(cv_results['best_val_mae']):.2f} ± {np.std(cv_results['best_val_mae']):.2f} minutes",
    "=" * 60,
    sep="\n",
)

In [0]:
### Key Findings
# NOTE(review): the triple-quoted text below is a bare string expression and is
# not the last statement of the cell, so running the cell does not render it;
# only the final print() produces output. The figures quoted in it are
# hard-coded rather than computed from cv_results - re-check them after a re-run.

"""
The ResFiLM-MLP model demonstrates strong performance on flight delay prediction:

1. **Classification (F2 Score)**
   - Average F2: 0.735 across 10 folds
   - Best fold: 0.779 (Fold 4)
   - The F2 metric prioritizes recall, appropriate for delay prediction where
     missing a delay (false negative) is more costly than a false alarm.

2. **Regression (MAE)**
   - Average MAE: ~12-14 minutes across most folds
   - Notable variation between folds due to seasonal patterns in delay data
   - Folds 9-10 show lower MAE but also lower F2, suggesting less delay variation

3. **Architecture Benefits**
   - FiLM modulation allows categorical embeddings to be conditioned on 
     numerical features (weather, congestion)
   - Time2Vec captures both linear trends and periodic patterns in departure times
   - Multi-task learning improves feature representations for both tasks

4. **Training Observations**
   - Early stopping typically triggers at epochs 6-15
   - Classification loss converges faster than regression loss
   - Positive class weighting (4.0) effectively handles class imbalance
"""

# Completion marker; the only output this cell emits
print("Notebook execution complete. Models logged to MLflow.")

Numerical columns count after deduplication: 34
Starting Cross-Validation on Folds: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]

--- Starting Fold 1 ---
Fold 1 | Ep 0 | Train F2: 0.712 | Val F2: 0.722 | Train MAE: 11.385 | Val MAE: 12.040
Fold 1 | Ep 1 | Train F2: 0.723 | Val F2: 0.699 | Train MAE: 11.307 | Val MAE: 12.026
Fold 1 | Ep 2 | Train F2: 0.734 | Val F2: 0.745 | Train MAE: 11.254 | Val MAE: 11.970
Fold 1 | Ep 3 | Train F2: 0.724 | Val F2: 0.711 | Train MAE: 11.227 | Val MAE: 11.955
Fold 1 | Ep 4 | Train F2: 0.728 | Val F2: 0.724 | Train MAE: 11.134 | Val MAE: 11.910
Fold 1 | Ep 5 | Train F2: 0.729 | Val F2: 0.734 | Train MAE: 11.085 | Val MAE: 11.937
Fold 1 | Ep 6 | Train F2: 0.730 | Val F2: 0.728 | Train MAE: 11.083 | Val MAE: 11.910
Fold 1 | Ep 7 | Train F2: 0.730 | Val F2: 0.742 | Train MAE: 11.046 | Val MAE: 11.928
  Early stopping triggered.




Uploading artifacts:   0%|          | 0/10 [00:00<?, ?it/s]

  >> Fold 1 Best Val F2: 0.7450

--- Starting Fold 2 ---
Fold 2 | Ep 0 | Train F2: 0.716 | Val F2: 0.726 | Train MAE: 11.799 | Val MAE: 13.205
Fold 2 | Ep 1 | Train F2: 0.731 | Val F2: 0.745 | Train MAE: 11.625 | Val MAE: 12.925
Fold 2 | Ep 2 | Train F2: 0.729 | Val F2: 0.743 | Train MAE: 11.684 | Val MAE: 12.980
Fold 2 | Ep 3 | Train F2: 0.729 | Val F2: 0.735 | Train MAE: 11.557 | Val MAE: 13.000
Fold 2 | Ep 4 | Train F2: 0.735 | Val F2: 0.747 | Train MAE: 11.462 | Val MAE: 12.906
Fold 2 | Ep 5 | Train F2: 0.730 | Val F2: 0.743 | Train MAE: 11.391 | Val MAE: 12.941
Fold 2 | Ep 6 | Train F2: 0.739 | Val F2: 0.753 | Train MAE: 11.355 | Val MAE: 12.897
Fold 2 | Ep 7 | Train F2: 0.735 | Val F2: 0.747 | Train MAE: 11.444 | Val MAE: 13.037
Fold 2 | Ep 8 | Train F2: 0.739 | Val F2: 0.751 | Train MAE: 11.285 | Val MAE: 12.942
Fold 2 | Ep 9 | Train F2: 0.741 | Val F2: 0.755 | Train MAE: 11.346 | Val MAE: 13.006
Fold 2 | Ep 10 | Train F2: 0.733 | Val F2: 0.745 | Train MAE: 11.244 | Val MAE: 12.



Uploading artifacts:   0%|          | 0/10 [00:00<?, ?it/s]

  >> Fold 2 Best Val F2: 0.7547

--- Starting Fold 3 ---
Fold 3 | Ep 0 | Train F2: 0.737 | Val F2: 0.777 | Train MAE: 11.715 | Val MAE: 14.357
Fold 3 | Ep 1 | Train F2: 0.735 | Val F2: 0.777 | Train MAE: 11.685 | Val MAE: 14.309
Fold 3 | Ep 2 | Train F2: 0.731 | Val F2: 0.755 | Train MAE: 11.608 | Val MAE: 14.448
Fold 3 | Ep 3 | Train F2: 0.736 | Val F2: 0.752 | Train MAE: 11.519 | Val MAE: 14.477
Fold 3 | Ep 4 | Train F2: 0.738 | Val F2: 0.769 | Train MAE: 11.443 | Val MAE: 14.282
Fold 3 | Ep 5 | Train F2: 0.738 | Val F2: 0.755 | Train MAE: 11.405 | Val MAE: 14.182
  Early stopping triggered.




Uploading artifacts:   0%|          | 0/10 [00:00<?, ?it/s]

  >> Fold 3 Best Val F2: 0.7774

--- Starting Fold 4 ---
Fold 4 | Ep 0 | Train F2: 0.718 | Val F2: 0.771 | Train MAE: 12.035 | Val MAE: 15.477
Fold 4 | Ep 1 | Train F2: 0.712 | Val F2: 0.769 | Train MAE: 11.832 | Val MAE: 15.049
Fold 4 | Ep 2 | Train F2: 0.712 | Val F2: 0.757 | Train MAE: 11.984 | Val MAE: 15.446
Fold 4 | Ep 3 | Train F2: 0.735 | Val F2: 0.777 | Train MAE: 12.124 | Val MAE: 15.572
Fold 4 | Ep 4 | Train F2: 0.735 | Val F2: 0.779 | Train MAE: 11.562 | Val MAE: 14.980
Fold 4 | Ep 5 | Train F2: 0.725 | Val F2: 0.775 | Train MAE: 11.616 | Val MAE: 14.973
Fold 4 | Ep 6 | Train F2: 0.740 | Val F2: 0.777 | Train MAE: 11.932 | Val MAE: 15.307
Fold 4 | Ep 7 | Train F2: 0.737 | Val F2: 0.775 | Train MAE: 11.701 | Val MAE: 15.158
Fold 4 | Ep 8 | Train F2: 0.740 | Val F2: 0.778 | Train MAE: 11.423 | Val MAE: 14.905
Fold 4 | Ep 9 | Train F2: 0.737 | Val F2: 0.779 | Train MAE: 11.458 | Val MAE: 14.947
  Early stopping triggered.




Uploading artifacts:   0%|          | 0/10 [00:00<?, ?it/s]

  >> Fold 4 Best Val F2: 0.7791

--- Starting Fold 5 ---
Fold 5 | Ep 0 | Train F2: 0.740 | Val F2: 0.756 | Train MAE: 12.019 | Val MAE: 14.078
Fold 5 | Ep 1 | Train F2: 0.735 | Val F2: 0.757 | Train MAE: 11.947 | Val MAE: 14.134
Fold 5 | Ep 2 | Train F2: 0.728 | Val F2: 0.753 | Train MAE: 11.910 | Val MAE: 14.053
Fold 5 | Ep 3 | Train F2: 0.740 | Val F2: 0.758 | Train MAE: 11.856 | Val MAE: 13.997
Fold 5 | Ep 4 | Train F2: 0.745 | Val F2: 0.758 | Train MAE: 11.789 | Val MAE: 13.971
Fold 5 | Ep 5 | Train F2: 0.743 | Val F2: 0.758 | Train MAE: 11.729 | Val MAE: 13.967
Fold 5 | Ep 6 | Train F2: 0.742 | Val F2: 0.758 | Train MAE: 11.745 | Val MAE: 14.103
Fold 5 | Ep 7 | Train F2: 0.739 | Val F2: 0.754 | Train MAE: 11.662 | Val MAE: 13.889
Fold 5 | Ep 8 | Train F2: 0.738 | Val F2: 0.758 | Train MAE: 11.654 | Val MAE: 13.892
Fold 5 | Ep 9 | Train F2: 0.748 | Val F2: 0.758 | Train MAE: 11.604 | Val MAE: 13.942
Fold 5 | Ep 10 | Train F2: 0.741 | Val F2: 0.757 | Train MAE: 11.607 | Val MAE: 13.



Uploading artifacts:   0%|          | 0/10 [00:00<?, ?it/s]

  >> Fold 5 Best Val F2: 0.7586

--- Starting Fold 6 ---
Fold 6 | Ep 0 | Train F2: 0.732 | Val F2: 0.733 | Train MAE: 12.137 | Val MAE: 13.982
Fold 6 | Ep 1 | Train F2: 0.735 | Val F2: 0.756 | Train MAE: 12.083 | Val MAE: 13.915
Fold 6 | Ep 2 | Train F2: 0.729 | Val F2: 0.745 | Train MAE: 12.117 | Val MAE: 14.104
Fold 6 | Ep 3 | Train F2: 0.742 | Val F2: 0.755 | Train MAE: 11.984 | Val MAE: 13.886
Fold 6 | Ep 4 | Train F2: 0.746 | Val F2: 0.758 | Train MAE: 11.903 | Val MAE: 13.992
Fold 6 | Ep 5 | Train F2: 0.740 | Val F2: 0.749 | Train MAE: 11.885 | Val MAE: 14.067
Fold 6 | Ep 6 | Train F2: 0.747 | Val F2: 0.759 | Train MAE: 11.852 | Val MAE: 13.967
Fold 6 | Ep 7 | Train F2: 0.748 | Val F2: 0.760 | Train MAE: 11.783 | Val MAE: 13.927
Fold 6 | Ep 8 | Train F2: 0.747 | Val F2: 0.758 | Train MAE: 11.827 | Val MAE: 14.057
Fold 6 | Ep 9 | Train F2: 0.748 | Val F2: 0.757 | Train MAE: 11.734 | Val MAE: 13.979
Fold 6 | Ep 10 | Train F2: 0.742 | Val F2: 0.753 | Train MAE: 11.674 | Val MAE: 13.



Uploading artifacts:   0%|          | 0/10 [00:00<?, ?it/s]

  >> Fold 6 Best Val F2: 0.7599

--- Starting Fold 7 ---
Fold 7 | Ep 0 | Train F2: 0.737 | Val F2: 0.736 | Train MAE: 12.902 | Val MAE: 13.798
Fold 7 | Ep 1 | Train F2: 0.741 | Val F2: 0.729 | Train MAE: 12.819 | Val MAE: 13.736
Fold 7 | Ep 2 | Train F2: 0.744 | Val F2: 0.737 | Train MAE: 12.756 | Val MAE: 13.714
Fold 7 | Ep 3 | Train F2: 0.754 | Val F2: 0.751 | Train MAE: 12.705 | Val MAE: 13.682
Fold 7 | Ep 4 | Train F2: 0.752 | Val F2: 0.750 | Train MAE: 12.629 | Val MAE: 13.626
Fold 7 | Ep 5 | Train F2: 0.738 | Val F2: 0.733 | Train MAE: 12.648 | Val MAE: 13.669
Fold 7 | Ep 6 | Train F2: 0.748 | Val F2: 0.740 | Train MAE: 12.591 | Val MAE: 13.654
Fold 7 | Ep 7 | Train F2: 0.756 | Val F2: 0.751 | Train MAE: 12.517 | Val MAE: 13.616
Fold 7 | Ep 8 | Train F2: 0.751 | Val F2: 0.744 | Train MAE: 12.544 | Val MAE: 13.644
Fold 7 | Ep 9 | Train F2: 0.755 | Val F2: 0.750 | Train MAE: 12.465 | Val MAE: 13.597
Fold 7 | Ep 10 | Train F2: 0.745 | Val F2: 0.731 | Train MAE: 12.578 | Val MAE: 13.



Uploading artifacts:   0%|          | 0/10 [00:00<?, ?it/s]

  >> Fold 7 Best Val F2: 0.7537

--- Starting Fold 8 ---
Fold 8 | Ep 0 | Train F2: 0.752 | Val F2: 0.696 | Train MAE: 13.112 | Val MAE: 11.134
Fold 8 | Ep 1 | Train F2: 0.750 | Val F2: 0.690 | Train MAE: 12.960 | Val MAE: 11.082
Fold 8 | Ep 2 | Train F2: 0.758 | Val F2: 0.703 | Train MAE: 12.902 | Val MAE: 11.054
Fold 8 | Ep 3 | Train F2: 0.752 | Val F2: 0.692 | Train MAE: 12.868 | Val MAE: 11.031
Fold 8 | Ep 4 | Train F2: 0.755 | Val F2: 0.700 | Train MAE: 12.797 | Val MAE: 11.099
Fold 8 | Ep 5 | Train F2: 0.756 | Val F2: 0.701 | Train MAE: 12.767 | Val MAE: 11.073
Fold 8 | Ep 6 | Train F2: 0.757 | Val F2: 0.701 | Train MAE: 12.706 | Val MAE: 11.038
Fold 8 | Ep 7 | Train F2: 0.757 | Val F2: 0.703 | Train MAE: 12.724 | Val MAE: 11.208
Fold 8 | Ep 8 | Train F2: 0.757 | Val F2: 0.700 | Train MAE: 12.642 | Val MAE: 11.123
Fold 8 | Ep 9 | Train F2: 0.756 | Val F2: 0.700 | Train MAE: 12.590 | Val MAE: 11.059
Fold 8 | Ep 10 | Train F2: 0.755 | Val F2: 0.695 | Train MAE: 12.589 | Val MAE: 11.



Uploading artifacts:   0%|          | 0/10 [00:00<?, ?it/s]

  >> Fold 8 Best Val F2: 0.7039

--- Starting Fold 9 ---
Fold 9 | Ep 0 | Train F2: 0.754 | Val F2: 0.644 | Train MAE: 13.414 | Val MAE: 8.722
Fold 9 | Ep 1 | Train F2: 0.758 | Val F2: 0.662 | Train MAE: 13.489 | Val MAE: 8.848
Fold 9 | Ep 2 | Train F2: 0.755 | Val F2: 0.637 | Train MAE: 13.218 | Val MAE: 8.670
Fold 9 | Ep 3 | Train F2: 0.757 | Val F2: 0.639 | Train MAE: 13.179 | Val MAE: 8.755
Fold 9 | Ep 4 | Train F2: 0.758 | Val F2: 0.638 | Train MAE: 13.159 | Val MAE: 8.802
Fold 9 | Ep 5 | Train F2: 0.761 | Val F2: 0.662 | Train MAE: 13.348 | Val MAE: 8.708
Fold 9 | Ep 6 | Train F2: 0.752 | Val F2: 0.620 | Train MAE: 13.086 | Val MAE: 8.719
Fold 9 | Ep 7 | Train F2: 0.758 | Val F2: 0.630 | Train MAE: 12.949 | Val MAE: 8.716
Fold 9 | Ep 8 | Train F2: 0.757 | Val F2: 0.619 | Train MAE: 12.936 | Val MAE: 8.636
Fold 9 | Ep 9 | Train F2: 0.760 | Val F2: 0.640 | Train MAE: 12.887 | Val MAE: 8.711
Fold 9 | Ep 10 | Train F2: 0.762 | Val F2: 0.662 | Train MAE: 12.855 | Val MAE: 8.646
  Early



Uploading artifacts:   0%|          | 0/10 [00:00<?, ?it/s]

  >> Fold 9 Best Val F2: 0.6624

--- Starting Fold 10 ---
Fold 10 | Ep 0 | Train F2: 0.743 | Val F2: 0.652 | Train MAE: 13.126 | Val MAE: 8.689
Fold 10 | Ep 1 | Train F2: 0.751 | Val F2: 0.657 | Train MAE: 13.017 | Val MAE: 8.782
Fold 10 | Ep 2 | Train F2: 0.748 | Val F2: 0.617 | Train MAE: 12.916 | Val MAE: 8.591
Fold 10 | Ep 3 | Train F2: 0.748 | Val F2: 0.624 | Train MAE: 12.857 | Val MAE: 8.661
Fold 10 | Ep 4 | Train F2: 0.750 | Val F2: 0.635 | Train MAE: 12.849 | Val MAE: 8.708
Fold 10 | Ep 5 | Train F2: 0.752 | Val F2: 0.655 | Train MAE: 12.761 | Val MAE: 8.569
Fold 10 | Ep 6 | Train F2: 0.745 | Val F2: 0.641 | Train MAE: 12.818 | Val MAE: 8.712
  Early stopping triggered.




Uploading artifacts:   0%|          | 0/10 [00:00<?, ?it/s]

  >> Fold 10 Best Val F2: 0.6572

=== CV Complete ===
Avg Best F2: 0.7352
