# Multi-tower neural network

## Imports

In [0]:
import os
import random

import mlflow
import numpy as np
import pandas as pd
import pyspark.sql.functions as F
import pyspark.sql.functions as sf  # collision-free alias: `F` is rebound to torch.nn.functional further down
from mlflow.models import infer_signature
from pyspark.ml import Pipeline
from pyspark.ml.classification import MultilayerPerceptronClassifier
from pyspark.ml.evaluation import RegressionEvaluator, BinaryClassificationEvaluator, MulticlassClassificationEvaluator
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler
from pyspark.ml.regression import LinearRegression, RandomForestRegressor
from pyspark.sql import Window
from pyspark.sql.functions import col
from xgboost.spark import SparkXGBRegressor

print(mlflow.__version__)

# Sets the Databricks Spark conf flag `spark.databricks.mlflow.trackMLlib.enabled`
# (presumably turns on automatic MLflow tracking of MLlib runs — confirm).
spark.conf.set("spark.databricks.mlflow.trackMLlib.enabled", "true")

# NOTE(review): RANDOM_SEED is defined here but no RNG (random / numpy / torch)
# is seeded with it in the cells visible in this section — confirm it is applied
# before training if reproducible runs are expected.
RANDOM_SEED = 0
# Define experiment name with proper Databricks path
EXPERIMENT_NAME = "/Shared/team_2_2/mlflow-nn-tower-hyperparameters"
# Create the experiment if it doesn't exist, then make it the active experiment
try:
    experiment = mlflow.get_experiment_by_name(EXPERIMENT_NAME)
    if experiment is None:
        experiment_id = mlflow.create_experiment(EXPERIMENT_NAME)
        print(f"Created new experiment with ID: {experiment_id}")
    else:
        print(f"Using existing experiment: {experiment.name}")
    mlflow.set_experiment(EXPERIMENT_NAME)
except Exception as e:
    # Any failure above (lookup, creation or activation) is reported and the
    # notebook falls back to a per-user experiment instead of stopping.
    print(f"Error with experiment setup: {e}")
    # Fallback to default experiment in workspace
    mlflow.set_experiment(f"/Users/{dbutils.notebook.entry_point.getDbutils().notebook().getContext().userName().get()}/default")



2.21.3
Created new experiment with ID: 1920497510759002


## Helper functions

In [0]:
def checkpoint_dataset(dataset, file_path):
    """Write a Spark DataFrame as parquet under the group's DBFS folder.

    Args:
        dataset: object exposing ``.write.mode(...).parquet(...)`` (a Spark
            DataFrame in this notebook).
        file_path: destination relative to the group folder, without the
            ``.parquet`` suffix; may contain sub-directories.

    Any existing data at the destination is overwritten. A confirmation line
    is printed after the write call returns.
    """
    section, number = "2", "2"
    group_root = f"dbfs:/student-groups/Group_{section}_{number}"
    # The group folder is created first, exactly once per call.
    dbutils.fs.mkdirs(group_root)

    target = f"{group_root}/{file_path}.parquet"
    # Parent directory of the parquet target (everything before the last "/").
    dbutils.fs.mkdirs(target.rsplit("/", 1)[0])

    dataset.write.mode("overwrite").parquet(target)
    print(f"Checkpointed {file_path}")

## Datasets (Custom Join)

In [0]:
# Show what is stored under the feature-engineered folder.
# NOTE(review): this lists the `3_month_custom_joined` folder, whereas the
# following cells read their splits from `1_year_custom_joined` — confirm
# which dataset this listing is meant to inspect.
display(dbutils.fs.ls("dbfs:/student-groups/Group_2_2/3_month_custom_joined/fe_graph_and_holiday"))

path,name,size,modificationTime
dbfs:/student-groups/Group_2_2/3_month_custom_joined/fe_graph_and_holiday/cv_splits/,cv_splits/,0,1765328209514
dbfs:/student-groups/Group_2_2/3_month_custom_joined/fe_graph_and_holiday/stacked_input_optimized/,stacked_input_optimized/,0,1765328209514
dbfs:/student-groups/Group_2_2/3_month_custom_joined/fe_graph_and_holiday/training_splits/,training_splits/,0,1765328209514


In [0]:
# Load the three pre-built splits (train / validation / test) of the 1-year
# custom-joined dataset. These are the inputs to the feature-engineering
# functions defined in the next section.
train_df = spark.read.parquet(
    "dbfs:/student-groups/Group_2_2/1_year_custom_joined/fe_graph_and_holiday/training_splits/train.parquet/"
)
val_df = spark.read.parquet(
    "dbfs:/student-groups/Group_2_2/1_year_custom_joined/fe_graph_and_holiday/training_splits/validation.parquet/"
)
test_df = spark.read.parquet(
    "dbfs:/student-groups/Group_2_2/1_year_custom_joined/fe_graph_and_holiday/training_splits/test.parquet/"
)

## Feature Engineering

In [0]:
# -----------------------------
# Time-derived cyclic features
# -----------------------------
def add_time_features(df):
    """Add departure-hour, day-of-year and cyclic (sin/cos) calendar columns.

    Uses the `sf` alias for `pyspark.sql.functions` rather than `F`, because
    `F` is rebound to `torch.nn.functional` by a later cell; with `F` this
    function stops working as soon as that cell has run.

    Args:
        df: Spark DataFrame with `CRS_DEP_MINUTES`, `utc_timestamp` and
            `DAY_OF_WEEK` columns.

    Returns:
        The DataFrame with `dep_hour`, `day_of_year`, `dep_hour_sin/cos`,
        `dow_sin/cos` and `doy_sin/cos` appended.
    """
    # Hour of departure as float
    df = df.withColumn("dep_hour", sf.col("CRS_DEP_MINUTES") / 60.0)

    # Day of year
    df = df.withColumn("day_of_year", sf.dayofyear("utc_timestamp").cast("double"))

    # Cyclic transforms (double precision): angle = 2*pi * value / period
    df = df.withColumn("dep_hour_sin", sf.sin(2 * sf.lit(np.pi) * sf.col("dep_hour") / 24))
    df = df.withColumn("dep_hour_cos", sf.cos(2 * sf.lit(np.pi) * sf.col("dep_hour") / 24))

    df = df.withColumn("dow_sin", sf.sin(2 * sf.lit(np.pi) * sf.col("DAY_OF_WEEK") / 7))
    df = df.withColumn("dow_cos", sf.cos(2 * sf.lit(np.pi) * sf.col("DAY_OF_WEEK") / 7))

    # Period is fixed at 365, so day 366 of a leap year wraps slightly past one cycle.
    df = df.withColumn("doy_sin", sf.sin(2 * sf.lit(np.pi) * sf.col("day_of_year") / 365))
    df = df.withColumn("doy_cos", sf.cos(2 * sf.lit(np.pi) * sf.col("day_of_year") / 365))
    return df

In [0]:
# -----------------------------
# Weather delta features (3-hour changes)
# -----------------------------
def add_weather_deltas(df):
    """Add `<column>_3h_change` deltas for five hourly weather columns.

    For every row the delta is the current value minus the value found three
    rows earlier within the same origin airport, ordered by `utc_timestamp`.

    NOTE(review): the lag is counted in rows, not in clock time, so the
    "3h" in the column name only holds if each origin has exactly one row
    per hour — confirm against the data.

    Uses the `sf` alias for `pyspark.sql.functions` (the `F` alias is rebound
    to `torch.nn.functional` by a later cell) and a loop variable that does
    not shadow the imported `col` function.

    Args:
        df: Spark DataFrame with `ORIGIN_AIRPORT_SEQ_ID`, `utc_timestamp` and
            the five weather columns listed below.

    Returns:
        The DataFrame with one `<column>_3h_change` column per weather column.
    """
    w = Window.partitionBy("ORIGIN_AIRPORT_SEQ_ID").orderBy("utc_timestamp")

    weather_cols = [
        "HourlyVisibility", "HourlyStationPressure",
        "HourlyDryBulbTemperature", "HourlyWindSpeed",
        "HourlyPrecipitation"
    ]
    for weather_col in weather_cols:
        lag_col = sf.lag(weather_col, 3).over(w)
        delta_col = sf.col(weather_col) - lag_col
        # When there is no row three steps back, the change is left NULL
        # instead of being filled with 0, so no artificial "no change" is introduced.
        df = df.withColumn(
            f"{weather_col}_3h_change",
            sf.when(lag_col.isNull(), sf.lit(None)).otherwise(delta_col)
        )
    return df

In [0]:
# -----------------------------
# Origin congestion features
# -----------------------------
def add_congestion_features(df):
    """Count other rows at the same origin airport in the preceding hour.

    Adds `utc_ts_sec` (the timestamp cast to long) and
    `ground_flights_last_hour`: the number of rows with the same
    `ORIGIN_AIRPORT_SEQ_ID` whose `utc_ts_sec` lies in [t - 3600, t], minus
    one for the current row itself.

    Uses the `sf` alias for `pyspark.sql.functions` because the `F` alias is
    rebound to `torch.nn.functional` by a later cell.

    Args:
        df: Spark DataFrame with `utc_timestamp` and `ORIGIN_AIRPORT_SEQ_ID`.

    Returns:
        The DataFrame with the two columns above appended.
    """
    # Range windows need a numeric ordering column, hence the cast to long.
    df = df.withColumn("utc_ts_sec", sf.col("utc_timestamp").cast("long"))

    # Rolling window: 1 hour (3600 in the units of utc_ts_sec) up to and including the current row
    w = Window.partitionBy("ORIGIN_AIRPORT_SEQ_ID").orderBy("utc_ts_sec").rangeBetween(-3600, 0)

    df = df.withColumn(
        "ground_flights_last_hour",
        sf.count("utc_ts_sec").over(w) - 1  # exclude current row
    )

    return df

In [0]:
# -----------------------------
# Destination congestion features
# -----------------------------
def add_dest_congestion_features(df):
    """Count other rows bound for the same destination in the preceding hour.

    Adds `utc_ts_sec` (the timestamp cast to long) and `arrivals_last_hour`:
    the number of rows with the same `DEST_AIRPORT_SEQ_ID` whose `utc_ts_sec`
    lies in [t - 3600, t], minus one for the current row itself.

    NOTE(review): the window is keyed on `utc_timestamp` of the row, which
    other cells use alongside departure-time features — confirm that this is
    the intended clock for an "arrivals" count.

    Uses the `sf` alias for `pyspark.sql.functions` because the `F` alias is
    rebound to `torch.nn.functional` by a later cell.

    Args:
        df: Spark DataFrame with `utc_timestamp` and `DEST_AIRPORT_SEQ_ID`.

    Returns:
        The DataFrame with the two columns above appended.
    """
    # Convert timestamp to a numeric column so it can order a range window
    df = df.withColumn("utc_ts_sec", sf.col("utc_timestamp").cast("long"))

    # Rolling window: 1 hour (3600) up to and including the current row
    w = Window.partitionBy("DEST_AIRPORT_SEQ_ID") \
              .orderBy("utc_ts_sec") \
              .rangeBetween(-3600, 0)

    df = df.withColumn(
        "arrivals_last_hour",
        sf.count("utc_ts_sec").over(w) - 1  # exclude current row
    )
    return df

In [0]:
### Apply additional feature engineering
def _with_nn_features(split_df):
    """Run the four feature-engineering transforms, in order, on one split."""
    return (split_df
            .transform(add_time_features)
            .transform(add_weather_deltas)
            .transform(add_congestion_features)
            .transform(add_dest_congestion_features))


# Each split is processed independently with the same chain of transforms.
train_df_fe = _with_nn_features(train_df)
val_df_fe   = _with_nn_features(val_df)
test_df_fe  = _with_nn_features(test_df)


In [0]:
## Checkpoint updated data

# Persist the feature-engineered splits under `fe_graph_and_holiday_nnfeat`
# so later cells can reload them instead of recomputing the window features.
# Note that the validation split is written under the name `val` (the source
# split was read from `validation.parquet`).
checkpoint_dataset(train_df_fe, "1_year_custom_joined/fe_graph_and_holiday_nnfeat/training_splits/train")
checkpoint_dataset(val_df_fe, "1_year_custom_joined/fe_graph_and_holiday_nnfeat/training_splits/val")
checkpoint_dataset(test_df_fe, "1_year_custom_joined/fe_graph_and_holiday_nnfeat/training_splits/test")


Checkpointed 1_year_custom_joined/fe_graph_and_holiday_nnfeat/training_splits/train
Checkpointed 1_year_custom_joined/fe_graph_and_holiday_nnfeat/training_splits/val
Checkpointed 1_year_custom_joined/fe_graph_and_holiday_nnfeat/training_splits/test


## Start here when running experiment

In [0]:
## Test if it can be read
# Reload the checkpointed, feature-engineered splits. This cell is the entry
# point when the feature-engineering cells above are skipped, so it must bind
# every `*_df_fe` name the downstream cells read.
train_df_fe = spark.read.parquet(
    "dbfs:/student-groups/Group_2_2/1_year_custom_joined/fe_graph_and_holiday_nnfeat/training_splits/train.parquet/"
)
val_df_fe = spark.read.parquet(
    "dbfs:/student-groups/Group_2_2/1_year_custom_joined/fe_graph_and_holiday_nnfeat/training_splits/val.parquet/"
)
# The test split is bound to `test_df_fe`, the name the pandas-conversion cell
# selects from; binding it only to `test_df` leaves `test_df_fe` undefined on a
# fresh kernel (or pointing at the un-checkpointed plan from the cells above).
test_df_fe = spark.read.parquet(
    "dbfs:/student-groups/Group_2_2/1_year_custom_joined/fe_graph_and_holiday_nnfeat/training_splits/test.parquet/"
)
# `test_df` keeps referring to the same reloaded data for any cell that still reads it.
test_df = test_df_fe

## Vectorization for Torch

In [0]:
### Column organization / feature selection

# -----------------------------
# Categorical features (for embeddings)
# -----------------------------
categorical_cols = [
    "OP_UNIQUE_CARRIER",       # Airline carrier code
    "ORIGIN_AIRPORT_SEQ_ID",   # Origin airport
    "DEST_AIRPORT_SEQ_ID",     # Destination airport
    "route",                   # Route string
    "AIRPORT_HUB_CLASS",       # Hub classification
    "AIRLINE_CATEGORY",        # Airline type
]

# -----------------------------
# Numerical features (to normalize)
# -----------------------------
# Every name must appear exactly once: this list is used both to select
# columns from Spark and to index/assign pandas columns, and a repeated name
# produces duplicate columns in the selected frame.
numerical_cols = [
    ### baseline features
    "DISTANCE",
    "CRS_ELAPSED_TIME",
    "prev_flight_delay_in_minutes",   # Phase 2 Feature Eng
    "origin_delays_4h",               # Phase 2 Feature Eng
    "delay_origin_7d",                # Phase 2 Feature Eng
    "delay_origin_carrier_7d",        # Phase 2 Feature Eng
    "delay_route_7d",                 # Phase 2 Feature Eng
    "flight_count_24h",               # Phase 2 Feature Eng
    "AVG_TAXI_OUT_ORIGIN",            # Phase 2 Feature Eng
    "AVG_ARR_DELAY_ORIGIN",           # Phase 2 Feature Eng

    ### graph features
    # "page_rank",
    "in_degree",
    "out_degree",
    "weighted_in_degree",
    "weighted_out_degree",
    "betweenness",
    "closeness",
    "N_RUNWAYS",

    ### weather (raw only)
    "HourlyVisibility",
    "HourlyStationPressure",
    "HourlyWindSpeed",
    "HourlyDryBulbTemperature",
    "HourlyDewPointTemperature",
    "HourlyRelativeHumidity",
    "HourlyAltimeterSetting",
    "HourlyWetBulbTemperature",
    "HourlyPrecipitation",
    "HourlyCloudCoverage",
    "HourlyCloudElevation",

    ### congestion
    "ground_flights_last_hour",     # New feature engineered
    "arrivals_last_hour",           # New feature engineered

    ### time features               # New feature engineered start
    # "dep_hour_sin",
    # "dep_hour_cos",
    "dow_sin",
    "dow_cos",
    "doy_sin",
    "doy_cos",                      # New feature engineered end
]

# -----------------------------
# Time feature (optional numerical input)
# -----------------------------
time_col = "CRS_DEP_MINUTES"  # Exact departure minute, optional as input

# -----------------------------
# Target
# -----------------------------
target_col = "DEP_DELAY_NEW"

In [0]:
## Convert PySpark, Pandas to Torch-ready dicts
# Columns collected to the driver: categorical inputs, numeric inputs,
# the raw time column and the target, in that order.
model_input_cols = categorical_cols + numerical_cols + [time_col, target_col]

train_pd = train_df_fe.select(model_input_cols).toPandas()
val_pd   = val_df_fe.select(model_input_cols).toPandas()
test_pd  = test_df_fe.select(model_input_cols).toPandas()


## Encode categoricals; build encoder dictionary for train data only

In [0]:
# -----------------------------
# Build category maps from training data
# -----------------------------
def build_category_maps(train_df, categorical_cols):
    """
    Build a value -> integer-id lookup for each categorical column.

    Values are stringified, de-duplicated and sorted; ids start at 1 in that
    sorted order. Id 0 is reserved under the key "UNK" for unknown values.

    Returns:
        dict: {column_name: {"UNK": 0, category_value: id, ...}, ...}
    """
    def _index_column(column):
        labels = sorted(train_df[column].astype(str).unique().tolist())
        lookup = {"UNK": 0}
        # ids 1..len(labels), assigned in sorted label order
        lookup.update(zip(labels, range(1, len(labels) + 1)))
        return lookup

    return {column: _index_column(column) for column in categorical_cols}

In [0]:
# -----------------------------
# Apply category maps to any dataset
# -----------------------------
def apply_category_maps(df, cat_maps, categorical_cols):
    """
    Replace categorical values with the integer ids from `cat_maps`.

    Each listed column is stringified and looked up in its map; values that
    are not present in the map become 0 (the UNK id). The input frame is left
    untouched.

    Returns:
        pd.DataFrame: a copy of `df` whose categorical columns hold ids
    """
    encoded = df.copy()

    for column in categorical_cols:
        lookup = cat_maps[column]
        as_text = encoded[column].astype(str)
        # dict.get supplies the UNK fallback for unseen values
        encoded[column] = as_text.apply(lambda raw: lookup.get(raw, 0))

    return encoded

In [0]:
# -----------------------------
# Apply to train/val/test
# -----------------------------
# Build mapping dicts from training data only
# Fit the value -> id lookups on the training frame only.
cat_maps = build_category_maps(train_pd, categorical_cols)

# Encode every split with the training lookups; values unseen in training map to 0.
train_pd, val_pd, test_pd = (
    apply_category_maps(split_pd, cat_maps, categorical_cols)
    for split_pd in (train_pd, val_pd, test_pd)
)

In [0]:
# -----------------------------
# Compute embedding sizes
# -----------------------------
# Number of unique categories for each column (for embedding layers)
# Vocabulary size per categorical column (includes the reserved UNK slot)
cat_dims = [len(cat_maps[name]) for name in categorical_cols]

# Embedding width per column: int(cardinality ** 0.3), capped at 64
emb_dims = [min(64, int(size ** 0.3)) for size in cat_dims]

# Sanity check
for c, dim, emb in zip(categorical_cols, cat_dims, emb_dims):
    print(f"{c}: {dim} categories -> embedding dim {emb}")

OP_UNIQUE_CARRIER: 18 categories -> embedding dim 2
ORIGIN_AIRPORT_SEQ_ID: 373 categories -> embedding dim 5
DEST_AIRPORT_SEQ_ID: 388 categories -> embedding dim 5
route: 6252 categories -> embedding dim 13
AIRPORT_HUB_CLASS: 7 categories -> embedding dim 1
AIRLINE_CATEGORY: 4 categories -> embedding dim 1


## Normalize numerical features

In [0]:
from sklearn.preprocessing import StandardScaler

# Standardize the numerical columns. The scaler is fitted on the training
# frame only and then applied unchanged to validation and test, so no
# statistics from those splits leak into training.
# NOTE(review): no missing-value handling is visible before this point;
# if any numerical column contains NaN it will reach the tensors built
# later — confirm the upstream data is already imputed.
# NOTE(review): this cell overwrites the columns in place, so re-running it
# scales already-scaled data.
scaler = StandardScaler()
train_pd[numerical_cols] = scaler.fit_transform(train_pd[numerical_cols])
val_pd[numerical_cols]   = scaler.transform(val_pd[numerical_cols])
test_pd[numerical_cols]  = scaler.transform(test_pd[numerical_cols])

## Load Torch

In [0]:
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.nn.functional as F

In [0]:
from torch.utils.data import Dataset, DataLoader
import torch

class FlightDataset(Dataset):
    """Tensor-backed dataset built once from a pandas DataFrame.

    The column groups come from the module-level `categorical_cols`,
    `numerical_cols`, `time_col` and `target_col`. Indexing yields the tuple
    `(cat, num, time, y)` for the requested position.
    """

    def __init__(self, df):
        def _float_column(name):
            # single column -> float32 tensor with a trailing dimension of size 1
            return torch.tensor(df[name].values, dtype=torch.float32).unsqueeze(1)

        # Integer ids for the embedding layers
        self.cat = torch.tensor(df[categorical_cols].values, dtype=torch.long)
        # Numerical feature matrix
        self.num = torch.tensor(df[numerical_cols].values, dtype=torch.float32)
        # Raw time input and regression target
        self.time = _float_column(time_col)
        self.y = _float_column(target_col)

    def __len__(self):
        return self.y.shape[0]

    def __getitem__(self, idx):
        sample = (self.cat[idx], self.num[idx], self.time[idx], self.y[idx])
        return sample

# Create datasets
train_ds, val_ds, test_ds = (
    FlightDataset(split_pd) for split_pd in (train_pd, val_pd, test_pd)
)

# Create data loaders; all share batch size and pinned memory, only training shuffles
loader_opts = {"batch_size": 2048, "pin_memory": True}
train_dl = DataLoader(train_ds, shuffle=True, **loader_opts)
val_dl   = DataLoader(val_ds, **loader_opts)
test_dl  = DataLoader(test_ds, **loader_opts)


## Model Definition (ResFiLM-MLP)

In [0]:
# -----------------------------
# Residual Block
# -----------------------------
class ResBlock(nn.Module):
    """Residual MLP block: x + fc2(dropout(gelu(fc1(layer_norm(x)))))."""

    def __init__(self, dim, dropout=0.1):
        super().__init__()
        # Normalization is applied to the input of the branch, not to its output.
        self.ln = nn.LayerNorm(dim)
        self.fc1 = nn.Linear(dim, dim)
        self.fc2 = nn.Linear(dim, dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        normed = self.ln(x)
        hidden = self.dropout(F.gelu(self.fc1(normed)))
        return x + self.fc2(hidden)


# -----------------------------
# Time2Vec
# -----------------------------
class Time2Vec(nn.Module):
    """Map a scalar time input to 1 linear feature plus `k` sine features."""

    def __init__(self, k):
        super().__init__()
        self.wb = nn.Linear(1, 1)   # linear (non-periodic) component
        self.ws = nn.Linear(1, k)   # frequencies/phases of the periodic components

    def forward(self, t):
        linear_part = self.wb(t)
        periodic_part = self.ws(t).sin()
        # Output width along the last dimension is 1 + k
        return torch.cat((linear_part, periodic_part), dim=-1)


# -----------------------------
# ResFiLM MLP (F2-optimized)
# -----------------------------
class ResFiLMMLPft(nn.Module):
    """Multi-tower network with a regression head and a classification head.

    Towers:
      * one embedding per categorical column, concatenated;
      * a numeric tower (linear projection to 256 followed by 4 ResBlocks);
      * a Time2Vec encoding of the time input.

    The numeric tower's output produces a scale (gamma) and shift (beta) that
    modulate the concatenated embeddings (FiLM). The modulated embeddings,
    the numeric tower output, the Time2Vec features and the raw time input
    are concatenated and fed to both heads.

    Args:
        cat_dims: number of rows of each embedding table, one per categorical column.
        emb_dims: embedding width for each categorical column (same order as `cat_dims`).
        num_numerical: number of numerical input features.
        time_dim: number of sine components in Time2Vec (its output width is `time_dim + 1`).
        emb_dropout: dropout applied to the concatenated embeddings.
        num_dropout: dropout used inside each ResBlock.
        film_dropout: dropout applied to the FiLM gamma and beta.
        final_dropout: dropout used inside both heads.
    """
    def __init__(
        self,
        cat_dims,
        emb_dims,
        num_numerical,
        time_dim=8,
        emb_dropout=0.05,
        num_dropout=0.1,
        film_dropout=0.1,
        final_dropout=0.2
    ):
        super().__init__()

        # --- Embedding tower ---
        self.embeddings = nn.ModuleList([
            nn.Embedding(cat_dim, emb_dim)
            for cat_dim, emb_dim in zip(cat_dims, emb_dims)
        ])
        self.emb_total = sum(emb_dims)
        self.emb_dropout = nn.Dropout(emb_dropout)

        # --- Numeric tower ---
        self.fc_num = nn.Linear(num_numerical, 256)
        self.res_blocks = nn.ModuleList([
            ResBlock(256, dropout=num_dropout)
            for _ in range(4)
        ])

        # --- FiLM for embeddings ---
        # Produces gamma and beta (each of width emb_total) from the numeric tower output.
        self.film = nn.Linear(256, 2 * self.emb_total)
        self.film_dropout = nn.Dropout(film_dropout)

        # --- Time2Vec ---
        self.t2v = Time2Vec(time_dim)

        # --- Optional: classification-specific FiLM ---
        # NOTE(review): clf_film and clf_film_dropout are created here but are
        # not referenced in forward(); they only add parameters to the module.
        self.clf_film = nn.Linear(256, 2 * self.emb_total)
        self.clf_film_dropout = nn.Dropout(film_dropout)

        # --- Final fusion dimension ---
        # 256 (numeric tower) + emb_total (modulated embeddings)
        # + (time_dim + 1) (Time2Vec output) + 1 (raw time input)
        fused_dim = 256 + self.emb_total + (time_dim + 1) + 1

        # -------------------------------
        # Multi-task heads
        # -------------------------------

        # Regression head (delay minutes)
        self.reg_head = nn.Sequential(
            nn.Linear(fused_dim, 256),
            nn.GELU(),
            nn.Dropout(final_dropout),
            nn.Linear(256, 128),
            nn.GELU(),
            nn.Dropout(final_dropout),
            nn.Linear(128, 1)  # raw regression output
        )

        # Classification head (delay yes/no); same layer layout as the regression head
        self.clf_head = nn.Sequential(
            nn.Linear(fused_dim, 256),
            nn.GELU(),
            nn.Dropout(final_dropout),
            nn.Linear(256, 128),
            nn.GELU(),
            nn.Dropout(final_dropout),
            nn.Linear(128, 1)  # raw logit, no Sigmoid
        )

    def forward(self, x_cat, x_num, x_time):
        """Run all towers and both heads.

        Args:
            x_cat: integer ids; column i is looked up in embedding table i.
            x_num: numerical features fed to the numeric tower.
            x_time: time input, used both by Time2Vec and as a raw feature.

        Returns:
            (reg_out, clf_out): the regression output and the classification
            logit (no sigmoid applied), each produced by a head ending in
            `Linear(128, 1)`.
        """

        # --- Embeddings ---
        emb = [emb_layer(x_cat[:, i]) for i, emb_layer in enumerate(self.embeddings)]
        emb = torch.cat(emb, dim=-1)
        emb = self.emb_dropout(emb)

        # --- Numeric tower ---
        h = F.gelu(self.fc_num(x_num))
        for block in self.res_blocks:
            h = block(h)

        # --- FiLM modulation ---
        # Split the FiLM output in half: first half scales, second half shifts.
        gamma, beta = torch.chunk(self.film(h), 2, dim=-1)
        # Dropout is applied to gamma and beta separately (independent masks).
        gamma = self.film_dropout(gamma)
        beta = self.film_dropout(beta)
        emb_mod = gamma * emb + beta

        # --- Time2Vec ---
        t_feat = self.t2v(x_time)

        # --- Fuse towers ---
        # Concatenation order must match the fused_dim computed in __init__.
        z = torch.cat([emb_mod, h, t_feat, x_time], dim=-1)

        # --- Output tasks ---
        reg_out = self.reg_head(z)       # regression output
        clf_out = self.clf_head(z)       # classification logit (no sigmoid)

        return reg_out, clf_out


# -----------------------------
# Example F2-focused loss function
# -----------------------------
def f2_loss(logits, targets, pos_weight=4.0):
    """Binary cross-entropy on raw logits with extra weight on positives.

    Args:
        logits: raw classifier outputs (no sigmoid applied).
        targets: binary labels (0/1) with the same shape as `logits`.
        pos_weight: multiplier applied to the loss of positive targets.

    Returns:
        The mean-reduced weighted loss as a scalar tensor.
    """
    positive_weight = torch.tensor([pos_weight], device=logits.device)
    return nn.functional.binary_cross_entropy_with_logits(
        logits, targets, pos_weight=positive_weight
    )

## Prepare fold logic

In [0]:
from sklearn.metrics import fbeta_score, roc_auc_score, mean_squared_error, mean_absolute_error
import copy
import numpy as np
import torch
import mlflow

def get_device():
    """Return the CUDA device when one is available, otherwise the CPU device."""
    if torch.cuda.is_available():
        return torch.device("cuda")
    return torch.device("cpu")

def train_epoch(model, dataloader, optimizer, criterion_reg, criterion_clf, device, alpha=0.5):
    """
    Run one optimisation pass over `dataloader`.

    Each batch is `(cat, num, time, y)`. The binary target is `y > 0`.
    The batch loss is `alpha * criterion_reg + (1 - alpha) * criterion_clf`.

    Returns:
        The mean of the per-batch combined losses.
    """
    model.train()
    running_loss = 0

    for batch in dataloader:
        cat, num, time, y = (tensor.to(device) for tensor in batch)

        optimizer.zero_grad()
        reg_out, clf_out = model(cat, num, time)

        # Binary label: any positive delay counts as delayed
        y_bin = (y > 0).float()

        reg_term = alpha * criterion_reg(reg_out, y)
        clf_term = (1 - alpha) * criterion_clf(clf_out, y_bin)
        loss = reg_term + clf_term

        loss.backward()
        optimizer.step()
        running_loss += loss.item()

    return running_loss / len(dataloader)

def evaluate(model, dataloader, device):
    """
    Evaluates on MAE, RMSE, AUC, and F2.

    Runs the model in eval mode without gradients over every batch of
    `dataloader` (batches are `(cat, num, time, y)`), then scores the
    collected predictions. The binary target is `y > 0`; the classification
    probability is the sigmoid of the model's logit.

    Returns:
        dict with keys "mae", "rmse", "auc" and "f2_score".
    """
    model.eval()
    preds_reg, preds_clf = [], []
    targets_reg, targets_clf = [], []

    with torch.no_grad():
        for cat, num, time, y in dataloader:
            cat, num, time, y = cat.to(device), num.to(device), time.to(device), y.to(device)
            reg_out, clf_out = model(cat, num, time)

            preds_reg.append(reg_out.cpu())
            preds_clf.append(torch.sigmoid(clf_out).cpu())
            targets_reg.append(y.cpu())
            targets_clf.append((y > 0).float().cpu())

    # Concatenate
    y_pred_reg = torch.cat(preds_reg).numpy()
    y_pred_clf = torch.cat(preds_clf).numpy()
    y_true_reg = torch.cat(targets_reg).numpy()
    y_true_clf = torch.cat(targets_clf).numpy()

    # --- Metrics ---
    # 1. Regression Metrics
    mae = mean_absolute_error(y_true_reg, y_pred_reg)
    rmse = np.sqrt(mean_squared_error(y_true_reg, y_pred_reg))

    # 2. Classification Metrics
    try:
        auc = roc_auc_score(y_true_clf, y_pred_clf)
    except ValueError:
        # AUC is undefined when the evaluated targets contain a single class;
        # report chance level in that case. Only ValueError is caught so that
        # unrelated failures (and interrupts) are not silently hidden.
        auc = 0.5

    # F2 Score (Threshold 0.5, weights recall higher than precision)
    y_pred_bin = (y_pred_clf > 0.5).astype(int)
    f2 = fbeta_score(y_true_clf, y_pred_bin, beta=2)

    return {"mae": mae, "rmse": rmse, "auc": auc, "f2_score": f2}

In [0]:
def prepare_fold_data(fold_df, categorical_cols, numerical_cols, time_col, target_col):
    """
    Takes the raw Spark DF for a specific fold, splits into train/val,
    applies FE, fits scalers/encoders on Train, transforms Val.

    Spark column expressions use the `sf` alias for `pyspark.sql.functions`:
    by the time this function is defined, `F` refers to `torch.nn.functional`,
    which has no `col`.

    Args:
        fold_df: Spark DataFrame for one fold, with a `split_type` column.
        categorical_cols: categorical column names (list).
        numerical_cols: numerical column names (list).
        time_col: name of the time column.
        target_col: name of the target column.

    Returns:
        (train_pd, val_pd, cat_dims, emb_dims): the encoded and scaled pandas
        frames, the vocabulary size per categorical column, and the embedding
        width per categorical column.
    """
    # 1. Split by split_type
    # Rows whose split_type is neither 'train' nor 'validation' (e.g. 'gap') are dropped.
    train_df_spark = fold_df.filter(sf.col("split_type") == "train")
    val_df_spark = fold_df.filter(sf.col("split_type") == "validation")

    # 2. Apply Feature Engineering (Redundant if already in parquet, but ensures consistency)
    train_fe = (train_df_spark
                .transform(add_time_features)
                .transform(add_weather_deltas)
                .transform(add_congestion_features)
                .transform(add_dest_congestion_features))

    val_fe = (val_df_spark
              .transform(add_time_features)
              .transform(add_weather_deltas)
              .transform(add_congestion_features)
              .transform(add_dest_congestion_features))

    # 3. Convert to Pandas for Torch
    train_pd = train_fe.select(categorical_cols + numerical_cols + [time_col, target_col]).toPandas()
    val_pd = val_fe.select(categorical_cols + numerical_cols + [time_col, target_col]).toPandas()

    # 4. Fit Encoders (Train Only)
    cat_maps = build_category_maps(train_pd, categorical_cols)
    train_pd = apply_category_maps(train_pd, cat_maps, categorical_cols)
    val_pd = apply_category_maps(val_pd, cat_maps, categorical_cols)

    # 5. Fit Scaler (Train Only)
    scaler = StandardScaler()
    train_pd[numerical_cols] = scaler.fit_transform(train_pd[numerical_cols])
    val_pd[numerical_cols] = scaler.transform(val_pd[numerical_cols])

    # 6. Calc Dimensions for Model
    cat_dims = [len(cat_maps[c]) for c in categorical_cols]
    emb_dims = [min(64, int(n**0.3)) for n in cat_dims]

    return train_pd, val_pd, cat_dims, emb_dims

In [0]:
import torch.nn.functional as F
import pyspark.sql.functions as sf

In [0]:
import pyspark.sql.functions as sf  # Safe alias for Spark
import torch.nn.functional as F     # Safe alias for Torch
import numpy as np
import pandas as pd
from pyspark.sql import Window
import torch
from torch.utils.data import DataLoader, Dataset
import torch.nn as nn
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import fbeta_score, roc_auc_score, mean_squared_error, mean_absolute_error
import copy
import mlflow

# ==========================================
# 1. FIX: Deduplicate Numerical Columns
# ==========================================
# dict.fromkeys keeps the first occurrence of each name and preserves order,
# so this removes repeated column names without reshuffling the feature list.
numerical_cols = list(dict.fromkeys(numerical_cols))
print(f"Numerical columns count after deduplication: {len(numerical_cols)}")

# ==========================================
# 2. Redefine FE Functions (Namespace Safe)
# ==========================================
def add_time_features(df):
    """Append dep_hour, day_of_year and sin/cos encodings of hour, weekday and day-of-year."""
    two_pi = 2 * sf.lit(np.pi)

    df = df.withColumn("dep_hour", sf.col("CRS_DEP_MINUTES") / 60.0)
    df = df.withColumn("day_of_year", sf.dayofyear("utc_timestamp").cast("double"))

    # (output prefix, source column, cycle length)
    cycles = (
        ("dep_hour", "dep_hour", 24),
        ("dow", "DAY_OF_WEEK", 7),
        ("doy", "day_of_year", 365),
    )
    for prefix, source, period in cycles:
        angle = two_pi * sf.col(source) / period
        df = df.withColumn(f"{prefix}_sin", sf.sin(angle))
        df = df.withColumn(f"{prefix}_cos", sf.cos(angle))
    return df

def add_weather_deltas(df):
    """Append `<column>_3h_change`: current value minus the value 3 rows earlier per origin airport."""
    per_origin = Window.partitionBy("ORIGIN_AIRPORT_SEQ_ID").orderBy("utc_timestamp")
    weather_columns = (
        "HourlyVisibility",
        "HourlyStationPressure",
        "HourlyDryBulbTemperature",
        "HourlyWindSpeed",
        "HourlyPrecipitation",
    )
    for col_name in weather_columns:
        earlier = sf.lag(col_name, 3).over(per_origin)
        change = sf.col(col_name) - earlier
        # Rows without a value three steps back get NULL for the change.
        guarded = sf.when(earlier.isNull(), sf.lit(None)).otherwise(change)
        df = df.withColumn(f"{col_name}_3h_change", guarded)
    return df

def add_congestion_features(df):
    """Append utc_ts_sec and ground_flights_last_hour (other rows at the same origin within the prior 3600)."""
    with_epoch = df.withColumn("utc_ts_sec", sf.col("utc_timestamp").cast("long"))
    last_hour = (
        Window.partitionBy("ORIGIN_AIRPORT_SEQ_ID")
        .orderBy("utc_ts_sec")
        .rangeBetween(-3600, 0)
    )
    # The window includes the current row, hence the "- 1".
    others_in_window = sf.count("utc_ts_sec").over(last_hour) - 1
    return with_epoch.withColumn("ground_flights_last_hour", others_in_window)

def add_dest_congestion_features(df):
    """Append utc_ts_sec and arrivals_last_hour (other rows with the same destination within the prior 3600)."""
    with_epoch = df.withColumn("utc_ts_sec", sf.col("utc_timestamp").cast("long"))
    last_hour = (
        Window.partitionBy("DEST_AIRPORT_SEQ_ID")
        .orderBy("utc_ts_sec")
        .rangeBetween(-3600, 0)
    )
    # The window includes the current row, hence the "- 1".
    others_in_window = sf.count("utc_ts_sec").over(last_hour) - 1
    return with_epoch.withColumn("arrivals_last_hour", others_in_window)

# ==========================================
# 3. Fold Prep & Prediction Helper
# ==========================================
def prepare_fold_data(fold_df, categorical_cols, numerical_cols, time_col, target_col):
    """Turn one CV fold into encoded, scaled pandas frames plus embedding sizes.

    The fold is split on `split_type` ("train" / "validation"), each part gets
    the four feature-engineering transforms and is collected to pandas.
    Category maps and the StandardScaler are fitted on the training part only.

    Returns:
        (train_pd, val_pd, cat_dims, emb_dims)
    """
    wanted = categorical_cols + numerical_cols + [time_col, target_col]
    fe_steps = (
        add_time_features,
        add_weather_deltas,
        add_congestion_features,
        add_dest_congestion_features,
    )

    def _collect(split_name):
        part = fold_df.filter(sf.col("split_type") == split_name)
        for step in fe_steps:
            part = part.transform(step)
        return part.select(wanted).toPandas()

    train_pd = _collect("train")
    val_pd = _collect("validation")

    # Encoders: fitted on train, applied to both
    cat_maps = build_category_maps(train_pd, categorical_cols)
    train_pd = apply_category_maps(train_pd, cat_maps, categorical_cols)
    val_pd = apply_category_maps(val_pd, cat_maps, categorical_cols)

    # Scaler: fitted on train, applied to both
    scaler = StandardScaler()
    train_pd[numerical_cols] = scaler.fit_transform(train_pd[numerical_cols])
    val_pd[numerical_cols] = scaler.transform(val_pd[numerical_cols])

    cat_dims = [len(cat_maps[name]) for name in categorical_cols]
    emb_dims = [min(64, int(size ** 0.3)) for size in cat_dims]

    return train_pd, val_pd, cat_dims, emb_dims

def save_predictions_to_parquet(model, dataloader, device, save_path, fold_id):
    """
    Score every batch of `dataloader` with `model` and write the results,
    tagged with `fold_id`, to `save_path` as parquet (overwriting).

    Written columns: fold_id, target_delay_minutes, pred_delay_minutes,
    target_is_delayed (1.0 when the target is >= 15.0) and
    pred_prob_delayed (sigmoid of the classification logit).
    """
    model.eval()
    # Per-column list of per-batch numpy arrays, in output column order.
    chunks = {
        "target_delay_minutes": [],
        "pred_delay_minutes": [],
        "target_is_delayed": [],
        "pred_prob_delayed": [],
    }

    with torch.no_grad():
        for batch in dataloader:
            cat, num, time, y = (tensor.to(device) for tensor in batch)
            reg_out, clf_out = model(cat, num, time)

            chunks["pred_delay_minutes"].append(reg_out.cpu().numpy())
            chunks["pred_prob_delayed"].append(torch.sigmoid(clf_out).cpu().numpy())
            chunks["target_delay_minutes"].append(y.cpu().numpy())
            chunks["target_is_delayed"].append((y >= 15.0).float().cpu().numpy())

    # One flat array per column
    columns = {"fold_id": fold_id}
    for name, parts in chunks.items():
        columns[name] = np.concatenate(parts).flatten()
    pdf = pd.DataFrame(columns)

    # Save via Spark
    print(f"  >> Saving {len(pdf)} predictions to {save_path}...")
    spark.createDataFrame(pdf).write.mode("overwrite").parquet(save_path)

# ==========================================
# 4. Main Execution Loop
# ==========================================
def get_device():
    """Return the CUDA device when one is available, otherwise the CPU device."""
    backend = "cuda" if torch.cuda.is_available() else "cpu"
    return torch.device(backend)

def train_epoch(model, dataloader, optimizer, criterion_reg, criterion_clf, device, alpha=0.5):
    """Run one optimisation pass and return the mean per-batch combined loss.

    Each batch is `(cat, num, time, y)`. The binary target is `y >= 15.0`.
    The batch loss is `alpha * criterion_reg + (1 - alpha) * criterion_clf`.
    """
    model.train()
    running_loss = 0
    for batch in dataloader:
        cat, num, time, y = (tensor.to(device) for tensor in batch)
        optimizer.zero_grad()
        reg_out, clf_out = model(cat, num, time)

        is_delayed = (y >= 15.0).float()
        reg_term = alpha * criterion_reg(reg_out, y)
        clf_term = (1 - alpha) * criterion_clf(clf_out, is_delayed)
        loss = reg_term + clf_term

        loss.backward()
        optimizer.step()
        running_loss += loss.item()
    return running_loss / len(dataloader)

def evaluate(model, dataloader, device, delay_threshold=15.0, prob_threshold=0.5):
    """Score ``model`` on ``dataloader`` and return regression and classification metrics.

    Each batch is a ``(cat, num, time, y)`` tuple of tensors. The model is called as
    ``model(cat, num, time)`` under ``torch.no_grad()`` and must return
    ``(reg_out, clf_out)``; ``clf_out`` is passed through a sigmoid to obtain scores.

    Args:
        model: Module producing ``(reg_out, clf_out)``; put into eval mode here.
        dataloader: Iterable of ``(cat, num, time, y)`` batches.
        device: Device every batch tensor is moved to before the forward pass.
        delay_threshold: Target value at or above which a sample counts as positive.
            Defaults to 15.0, the value that was previously hard-coded.
        prob_threshold: Sigmoid score strictly above which a sample is predicted
            positive for the F2 score. Defaults to 0.5, as before.

    Returns:
        dict with keys ``"mae"``, ``"rmse"``, ``"auc"`` and ``"f2_score"``.
        ``"auc"`` falls back to 0.5 when ``roc_auc_score`` cannot compute it.
    """
    model.eval()
    preds_reg, preds_clf, targets_reg, targets_clf = [], [], [], []
    with torch.no_grad():
        for cat, num, time, y in dataloader:
            cat, num, time, y = cat.to(device), num.to(device), time.to(device), y.to(device)
            reg_out, clf_out = model(cat, num, time)
            preds_reg.append(reg_out.cpu())
            preds_clf.append(torch.sigmoid(clf_out).cpu())
            targets_reg.append(y.cpu())
            targets_clf.append((y >= delay_threshold).float().cpu())

    y_pred_reg, y_pred_clf = torch.cat(preds_reg).numpy(), torch.cat(preds_clf).numpy()
    y_true_reg, y_true_clf = torch.cat(targets_reg).numpy(), torch.cat(targets_clf).numpy()

    mae = mean_absolute_error(y_true_reg, y_pred_reg)
    rmse = np.sqrt(mean_squared_error(y_true_reg, y_pred_reg))

    # Only swallow the metric's own ValueError (e.g. a single class in y_true); a bare
    # `except:` would also hide KeyboardInterrupt and genuine programming errors.
    try:
        auc = roc_auc_score(y_true_clf, y_pred_clf)
    except ValueError:
        auc = 0.5
    # NOTE(review): some scikit-learn releases may return NaN with a warning instead of
    # raising when AUC is undefined; map that to the same 0.5 fallback.
    if np.isnan(auc):
        auc = 0.5

    y_hat_clf = (y_pred_clf > prob_threshold).astype(int)
    # zero_division=0 keeps the score at 0 when F2 is undefined, without the warning.
    f2 = fbeta_score(y_true_clf, y_hat_clf, beta=2, zero_division=0)
    return {"mae": mae, "rmse": rmse, "auc": auc, "f2_score": f2}

# --- Config ---
# Parquet dataset holding every CV fold (rows carry a `fold_id` column).
CV_DATA_PATH = "dbfs:/student-groups/Group_2_2/1_year_custom_joined/fe_graph_and_holiday_nnfeat/cv_splits"
# Root folder under which per-fold validation predictions are written.
PREDS_SAVE_PATH = "dbfs:/student-groups/Group_2_2/1_year_custom_joined/nn_predictions"

# Training hyperparameters.
NUM_EPOCHS = 20
BATCH_SIZE = 2048
LR = 1e-3
PATIENCE = 5  # epochs without a validation-F2 improvement before stopping early
DEVICE = get_device()

# Load all folds once and discover the distinct fold ids, in ascending order.
cv_full_df = spark.read.parquet(CV_DATA_PATH)
folds = sorted(
    row["fold_id"]
    for row in cv_full_df.select("fold_id").distinct().collect()
)
print(f"Starting Cross-Validation on Folds: {folds}")

# End whatever MLflow run is currently active so the parent run below starts cleanly.
mlflow.end_run()
with mlflow.start_run(run_name="CV_Orchestrator_MAE_F2") as parent_run:
    # Per-fold bests, aggregated into the CV summary at the end.
    cv_summary = {"best_val_f2": [], "best_val_mae": []}

    for fold in folds:
        print(f"\n--- Starting Fold {fold} ---")
        with mlflow.start_run(run_name=f"Fold_{fold}", nested=True):
            # Reset torch's RNG so weight initialisation and DataLoader shuffling start
            # from the same state on every execution of this fold.
            torch.manual_seed(RANDOM_SEED)

            fold_data = cv_full_df.filter(sf.col("fold_id") == fold)

            # --- PREPARE DATA ---
            train_pd, val_pd, cat_dims, emb_dims = prepare_fold_data(
                fold_data, categorical_cols, numerical_cols, time_col, target_col
            )

            train_dl = DataLoader(FlightDataset(train_pd), batch_size=BATCH_SIZE, shuffle=True)
            val_dl = DataLoader(FlightDataset(val_pd), batch_size=BATCH_SIZE)

            # --- MODEL ---
            model = ResFiLMMLP(cat_dims, emb_dims, len(numerical_cols), time_dim=8).to(DEVICE)
            optimizer = torch.optim.AdamW(model.parameters(), lr=LR)
            crit_reg = nn.L1Loss()
            crit_clf = nn.BCEWithLogitsLoss(pos_weight=torch.tensor([4.0]).to(DEVICE))

            # --- TRAINING ---
            best_f2 = -1.0
            best_mae = float('inf')
            best_model_f2_state = None
            best_model_mae_state = None
            early_stop_counter = 0

            for epoch in range(NUM_EPOCHS):
                train_loss = train_epoch(model, train_dl, optimizer, crit_reg, crit_clf, DEVICE)
                train_metrics = evaluate(model, train_dl, DEVICE)
                val_metrics = evaluate(model, val_dl, DEVICE)

                print(f"Fold {fold} | Ep {epoch} | Train F2: {train_metrics['f2_score']:.3f} | Val F2: {val_metrics['f2_score']:.3f}")

                # Log everything that was computed this epoch; the train metrics and the
                # validation AUC were previously evaluated but then discarded.
                mlflow.log_metrics({
                    "train_loss": train_loss,
                    "train_f2": train_metrics['f2_score'],
                    "train_mae": train_metrics['mae'],
                    "val_f2": val_metrics['f2_score'],
                    "val_mae": val_metrics['mae'],
                    "val_rmse": val_metrics['rmse'],
                    "val_auc": val_metrics['auc'],
                }, step=epoch)

                # Track Best F2 (this is the metric that drives early stopping)
                if val_metrics['f2_score'] > best_f2:
                    best_f2 = val_metrics['f2_score']
                    best_model_f2_state = copy.deepcopy(model.state_dict())
                    early_stop_counter = 0
                    mlflow.log_metrics({"best_val_f2_so_far": best_f2}, step=epoch)
                else:
                    early_stop_counter += 1

                # Track Best MAE (independent of early stopping)
                if val_metrics['mae'] < best_mae:
                    best_mae = val_metrics['mae']
                    best_model_mae_state = copy.deepcopy(model.state_dict())

                if early_stop_counter >= PATIENCE:
                    print("  Early stopping triggered.")
                    break

            # --- SAVE PREDICTIONS & MODELS ---

            # 1. Load Best F2 Model & Save Predictions
            # Compare against None explicitly instead of relying on dict truthiness.
            if best_model_f2_state is not None:
                model.load_state_dict(best_model_f2_state)

                # Log Model
                # NOTE(review): input_example is a tuple of tensors on DEVICE; confirm this
                # is a format MLflow accepts for input examples.
                example_input = next(iter(val_dl))
                ex_in = (example_input[0].to(DEVICE), example_input[1].to(DEVICE), example_input[2].to(DEVICE))
                mlflow.pytorch.log_model(model, f"model_fold_{fold}_best_f2", input_example=ex_in)

                # Save validation predictions of the best-F2 weights for this fold
                save_path = f"{PREDS_SAVE_PATH}/fold_{fold}"
                save_predictions_to_parquet(model, val_dl, DEVICE, save_path, fold)

            # 2. Log Best MAE Model (Backup)
            if best_model_mae_state is not None:
                model.load_state_dict(best_model_mae_state)
                mlflow.pytorch.log_model(model, f"model_fold_{fold}_best_mae")

            cv_summary["best_val_f2"].append(best_f2)
            cv_summary["best_val_mae"].append(best_mae)

    # --- CV SUMMARY (logged on the parent run) ---
    avg_best_f2 = np.mean(cv_summary['best_val_f2'])
    avg_best_mae = np.mean(cv_summary['best_val_mae'])
    print("\n=== CV Complete ===")
    print(f"Avg Best F2: {avg_best_f2:.4f}")
    # best_val_mae was collected per fold but never reported; surface it as well.
    print(f"Avg Best MAE: {avg_best_mae:.4f}")
    mlflow.log_metric("cv_avg_best_f2", avg_best_f2)
    mlflow.log_metric("cv_avg_best_mae", avg_best_mae)

In [0]:
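# NOTE: This cell is an earlier, fully commented-out version of the cross-validation cell above,
# kept for reference only. It labels a flight as delayed with `y > 0` (the active cell uses a
# 15-minute threshold), and the "Train MAE / Val MAE" output below appears to come from this version.
# import pyspark.sql.functions as sf  # Safe alias for Spark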
# import torch.nn.functional as F     # Safe alias for Torch
# import numpy as np
# from pyspark.sql import Window
# import torch
# from torch.utils.data import DataLoader, Dataset
# import torch.nn as nn
# from sklearn.preprocessing import StandardScaler
# from sklearn.metrics import fbeta_score, roc_auc_score, mean_squared_error, mean_absolute_error
# import copy
# import mlflow

# # ==========================================
# # 1. FIX: Deduplicate Numerical Columns
# # ==========================================
# numerical_cols = list(dict.fromkeys(numerical_cols))
# print(f"Numerical columns count after deduplication: {len(numerical_cols)}")

# # ==========================================
# # 2. Redefine FE Functions (Namespace Safe)
# # ==========================================
# def add_time_features(df):
#     df = df.withColumn("dep_hour", sf.col("CRS_DEP_MINUTES") / 60.0)
#     df = df.withColumn("day_of_year", sf.dayofyear("utc_timestamp").cast("double"))
#     df = df.withColumn("dep_hour_sin", sf.sin(2 * sf.lit(np.pi) * sf.col("dep_hour") / 24))
#     df = df.withColumn("dep_hour_cos", sf.cos(2 * sf.lit(np.pi) * sf.col("dep_hour") / 24))
#     df = df.withColumn("dow_sin", sf.sin(2 * sf.lit(np.pi) * sf.col("DAY_OF_WEEK") / 7))
#     df = df.withColumn("dow_cos", sf.cos(2 * sf.lit(np.pi) * sf.col("DAY_OF_WEEK") / 7))
#     df = df.withColumn("doy_sin", sf.sin(2 * sf.lit(np.pi) * sf.col("day_of_year") / 365))
#     df = df.withColumn("doy_cos", sf.cos(2 * sf.lit(np.pi) * sf.col("day_of_year") / 365))
#     return df

# def add_weather_deltas(df):
#     w = Window.partitionBy("ORIGIN_AIRPORT_SEQ_ID").orderBy("utc_timestamp")
#     for col_name in ["HourlyVisibility", "HourlyStationPressure", "HourlyDryBulbTemperature", "HourlyWindSpeed", "HourlyPrecipitation"]:
#         lag_col = sf.lag(col_name, 3).over(w)
#         delta_col = sf.col(col_name) - lag_col
#         df = df.withColumn(f"{col_name}_3h_change", sf.when(lag_col.isNull(), sf.lit(None)).otherwise(delta_col))
#     return df

# def add_congestion_features(df):
#     df = df.withColumn("utc_ts_sec", sf.col("utc_timestamp").cast("long"))
#     w = Window.partitionBy("ORIGIN_AIRPORT_SEQ_ID").orderBy("utc_ts_sec").rangeBetween(-3600, 0)
#     df = df.withColumn("ground_flights_last_hour", sf.count("utc_ts_sec").over(w) - 1)
#     return df

# def add_dest_congestion_features(df):
#     df = df.withColumn("utc_ts_sec", sf.col("utc_timestamp").cast("long"))
#     w = Window.partitionBy("DEST_AIRPORT_SEQ_ID").orderBy("utc_ts_sec").rangeBetween(-3600, 0)
#     df = df.withColumn("arrivals_last_hour", sf.count("utc_ts_sec").over(w) - 1)
#     return df

# # ==========================================
# # 3. Fold Prep (Strict Filtering)
# # ==========================================
# def prepare_fold_data(fold_df, categorical_cols, numerical_cols, time_col, target_col):
#     train_df_spark = fold_df.filter(sf.col("split_type") == "train")
#     val_df_spark   = fold_df.filter(sf.col("split_type") == "validation")

#     train_fe = (train_df_spark.transform(add_time_features).transform(add_weather_deltas)
#                 .transform(add_congestion_features).transform(add_dest_congestion_features))
#     val_fe   = (val_df_spark.transform(add_time_features).transform(add_weather_deltas)
#                 .transform(add_congestion_features).transform(add_dest_congestion_features))

#     train_pd = train_fe.select(categorical_cols + numerical_cols + [time_col, target_col]).toPandas()
#     val_pd   = val_fe.select(categorical_cols + numerical_cols + [time_col, target_col]).toPandas()

#     cat_maps = build_category_maps(train_pd, categorical_cols)
#     train_pd = apply_category_maps(train_pd, cat_maps, categorical_cols)
#     val_pd   = apply_category_maps(val_pd, cat_maps, categorical_cols)

#     scaler = StandardScaler()
#     train_pd[numerical_cols] = scaler.fit_transform(train_pd[numerical_cols])
#     val_pd[numerical_cols]   = scaler.transform(val_pd[numerical_cols])

#     cat_dims = [len(cat_maps[c]) for c in categorical_cols]
#     emb_dims = [min(64, int(n**0.3)) for n in cat_dims]

#     return train_pd, val_pd, cat_dims, emb_dims

# # ==========================================
# # 4. Main Execution Loop
# # ==========================================
# def get_device(): return torch.device("cuda" if torch.cuda.is_available() else "cpu")

# def train_epoch(model, dataloader, optimizer, criterion_reg, criterion_clf, device, alpha=0.5):
#     model.train()
#     total_loss = 0
#     for cat, num, time, y in dataloader:
#         cat, num, time, y = cat.to(device), num.to(device), time.to(device), y.to(device)
#         optimizer.zero_grad()
#         reg_out, clf_out = model(cat, num, time)
#         y_bin = (y > 0).float()
#         loss = (alpha * criterion_reg(reg_out, y)) + ((1 - alpha) * criterion_clf(clf_out, y_bin))
#         loss.backward()
#         optimizer.step()
#         total_loss += loss.item()
#     return total_loss / len(dataloader)

# def evaluate(model, dataloader, device):
#     model.eval()
#     preds_reg, preds_clf, targets_reg, targets_clf = [], [], [], []
#     with torch.no_grad():
#         for cat, num, time, y in dataloader:
#             cat, num, time, y = cat.to(device), num.to(device), time.to(device), y.to(device)
#             reg_out, clf_out = model(cat, num, time)
#             preds_reg.append(reg_out.cpu()); preds_clf.append(torch.sigmoid(clf_out).cpu())
#             targets_reg.append(y.cpu()); targets_clf.append((y > 0).float().cpu())
            
#     y_pred_reg, y_pred_clf = torch.cat(preds_reg).numpy(), torch.cat(preds_clf).numpy()
#     y_true_reg, y_true_clf = torch.cat(targets_reg).numpy(), torch.cat(targets_clf).numpy()
    
#     mae = mean_absolute_error(y_true_reg, y_pred_reg)
#     rmse = np.sqrt(mean_squared_error(y_true_reg, y_pred_reg))
#     try: auc = roc_auc_score(y_true_clf, y_pred_clf)
#     except: auc = 0.5
#     f2 = fbeta_score(y_true_clf, (y_pred_clf > 0.5).astype(int), beta=2)
#     return {"mae": mae, "rmse": rmse, "auc": auc, "f2_score": f2}

# # --- Config ---
# CV_DATA_PATH = "dbfs:/student-groups/Group_2_2/1_year_custom_joined/fe_graph_and_holiday_nnfeat/cv_splits"
# NUM_EPOCHS, BATCH_SIZE, LR, PATIENCE = 20, 2048, 1e-3, 5
# DEVICE = get_device()

# cv_full_df = spark.read.parquet(CV_DATA_PATH)
# folds = sorted([row['fold_id'] for row in cv_full_df.select("fold_id").distinct().collect()])
# print(f"Starting Cross-Validation on Folds: {folds}")

# mlflow.end_run()
# with mlflow.start_run(run_name="CV_Orchestrator_MAE_F2") as parent_run:
#     cv_summary = {"best_val_f2": [], "final_val_mae": []}
    
#     for fold in folds:
#         print(f"\n--- Starting Fold {fold} ---")
#         with mlflow.start_run(run_name=f"Fold_{fold}", nested=True):
#             fold_data = cv_full_df.filter(sf.col("fold_id") == fold)
            
#             # --- PREPARE DATA ---
#             train_pd, val_pd, cat_dims, emb_dims = prepare_fold_data(
#                 fold_data, categorical_cols, numerical_cols, time_col, target_col
#             )
            
#             train_dl = DataLoader(FlightDataset(train_pd), batch_size=BATCH_SIZE, shuffle=True)
#             val_dl = DataLoader(FlightDataset(val_pd), batch_size=BATCH_SIZE)
            
#             # --- MODEL ---
#             model = ResFiLMMLP(cat_dims, emb_dims, len(numerical_cols), time_dim=8).to(DEVICE)
#             optimizer = torch.optim.AdamW(model.parameters(), lr=LR)
#             crit_reg = nn.L1Loss()
#             crit_clf = nn.BCEWithLogitsLoss(pos_weight=torch.tensor([4.0]).to(DEVICE))

#             # --- TRAINING ---
#             best_f2 = -1.0; best_model_state = None; early_stop_counter = 0
            
#             for epoch in range(NUM_EPOCHS):
#                 train_loss = train_epoch(model, train_dl, optimizer, crit_reg, crit_clf, DEVICE)
                
#                 # --- ADDED: Evaluate on Train and Val ---
#                 train_metrics = evaluate(model, train_dl, DEVICE)
#                 val_metrics = evaluate(model, val_dl, DEVICE)
                
#                 # --- UPDATED PRINT ---
#                 print(f"Fold {fold} | Ep {epoch} | "
#                       f"Train F2: {train_metrics['f2_score']:.3f} | Val F2: {val_metrics['f2_score']:.3f} | "
#                       f"Train MAE: {train_metrics['mae']:.3f} | Val MAE: {val_metrics['mae']:.3f}")
                
#                 # --- UPDATED LOGGING ---
#                 mlflow.log_metrics({
#                     "train_loss": train_loss,
#                     "train_f2": train_metrics['f2_score'],
#                     "train_mae": train_metrics['mae'],
#                     "val_f2": val_metrics['f2_score'], 
#                     "val_mae": val_metrics['mae'],
#                     "val_rmse": val_metrics['rmse'], 
#                     "val_auc": val_metrics['auc']
#                 }, step=epoch)
                
#                 if val_metrics['f2_score'] > best_f2:
#                     best_f2 = val_metrics['f2_score']
#                     best_model_state = copy.deepcopy(model.state_dict())
#                     early_stop_counter = 0
#                     mlflow.log_metrics({"best_val_f2_so_far": best_f2}, step=epoch)
#                 else:
#                     early_stop_counter += 1
#                     if early_stop_counter >= PATIENCE:
#                         print("  Early stopping triggered."); break

#             if best_model_state:
#                 model.load_state_dict(best_model_state)
#                 mlflow.pytorch.log_model(model, f"model_fold_{fold}_best")
#                 print(f"  >> Fold {fold} Best Val F2: {best_f2:.4f}")
            
#             cv_summary["best_val_f2"].append(best_f2)
#             cv_summary["final_val_mae"].append(val_metrics['mae'])

#     print(f"\n=== CV Complete ===\nAvg Best F2: {np.mean(cv_summary['best_val_f2']):.4f}")
#     mlflow.log_metric("cv_avg_best_f2", np.mean(cv_summary['best_val_f2']))

Numerical columns count after deduplication: 34
Starting Cross-Validation on Folds: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]

--- Starting Fold 1 ---
Fold 1 | Ep 0 | Train F2: 0.712 | Val F2: 0.722 | Train MAE: 11.385 | Val MAE: 12.040
Fold 1 | Ep 1 | Train F2: 0.723 | Val F2: 0.699 | Train MAE: 11.307 | Val MAE: 12.026
Fold 1 | Ep 2 | Train F2: 0.734 | Val F2: 0.745 | Train MAE: 11.254 | Val MAE: 11.970
Fold 1 | Ep 3 | Train F2: 0.724 | Val F2: 0.711 | Train MAE: 11.227 | Val MAE: 11.955
Fold 1 | Ep 4 | Train F2: 0.728 | Val F2: 0.724 | Train MAE: 11.134 | Val MAE: 11.910
Fold 1 | Ep 5 | Train F2: 0.729 | Val F2: 0.734 | Train MAE: 11.085 | Val MAE: 11.937
Fold 1 | Ep 6 | Train F2: 0.730 | Val F2: 0.728 | Train MAE: 11.083 | Val MAE: 11.910
Fold 1 | Ep 7 | Train F2: 0.730 | Val F2: 0.742 | Train MAE: 11.046 | Val MAE: 11.928
  Early stopping triggered.




Uploading artifacts:   0%|          | 0/10 [00:00<?, ?it/s]

  >> Fold 1 Best Val F2: 0.7450

--- Starting Fold 2 ---
Fold 2 | Ep 0 | Train F2: 0.716 | Val F2: 0.726 | Train MAE: 11.799 | Val MAE: 13.205
Fold 2 | Ep 1 | Train F2: 0.731 | Val F2: 0.745 | Train MAE: 11.625 | Val MAE: 12.925
Fold 2 | Ep 2 | Train F2: 0.729 | Val F2: 0.743 | Train MAE: 11.684 | Val MAE: 12.980
Fold 2 | Ep 3 | Train F2: 0.729 | Val F2: 0.735 | Train MAE: 11.557 | Val MAE: 13.000
Fold 2 | Ep 4 | Train F2: 0.735 | Val F2: 0.747 | Train MAE: 11.462 | Val MAE: 12.906
Fold 2 | Ep 5 | Train F2: 0.730 | Val F2: 0.743 | Train MAE: 11.391 | Val MAE: 12.941
Fold 2 | Ep 6 | Train F2: 0.739 | Val F2: 0.753 | Train MAE: 11.355 | Val MAE: 12.897
Fold 2 | Ep 7 | Train F2: 0.735 | Val F2: 0.747 | Train MAE: 11.444 | Val MAE: 13.037
Fold 2 | Ep 8 | Train F2: 0.739 | Val F2: 0.751 | Train MAE: 11.285 | Val MAE: 12.942
Fold 2 | Ep 9 | Train F2: 0.741 | Val F2: 0.755 | Train MAE: 11.346 | Val MAE: 13.006
Fold 2 | Ep 10 | Train F2: 0.733 | Val F2: 0.745 | Train MAE: 11.244 | Val MAE: 12.



Uploading artifacts:   0%|          | 0/10 [00:00<?, ?it/s]

  >> Fold 2 Best Val F2: 0.7547

--- Starting Fold 3 ---
Fold 3 | Ep 0 | Train F2: 0.737 | Val F2: 0.777 | Train MAE: 11.715 | Val MAE: 14.357
Fold 3 | Ep 1 | Train F2: 0.735 | Val F2: 0.777 | Train MAE: 11.685 | Val MAE: 14.309
Fold 3 | Ep 2 | Train F2: 0.731 | Val F2: 0.755 | Train MAE: 11.608 | Val MAE: 14.448
Fold 3 | Ep 3 | Train F2: 0.736 | Val F2: 0.752 | Train MAE: 11.519 | Val MAE: 14.477
Fold 3 | Ep 4 | Train F2: 0.738 | Val F2: 0.769 | Train MAE: 11.443 | Val MAE: 14.282
Fold 3 | Ep 5 | Train F2: 0.738 | Val F2: 0.755 | Train MAE: 11.405 | Val MAE: 14.182
  Early stopping triggered.




Uploading artifacts:   0%|          | 0/10 [00:00<?, ?it/s]

  >> Fold 3 Best Val F2: 0.7774

--- Starting Fold 4 ---
Fold 4 | Ep 0 | Train F2: 0.718 | Val F2: 0.771 | Train MAE: 12.035 | Val MAE: 15.477
Fold 4 | Ep 1 | Train F2: 0.712 | Val F2: 0.769 | Train MAE: 11.832 | Val MAE: 15.049
Fold 4 | Ep 2 | Train F2: 0.712 | Val F2: 0.757 | Train MAE: 11.984 | Val MAE: 15.446
Fold 4 | Ep 3 | Train F2: 0.735 | Val F2: 0.777 | Train MAE: 12.124 | Val MAE: 15.572
Fold 4 | Ep 4 | Train F2: 0.735 | Val F2: 0.779 | Train MAE: 11.562 | Val MAE: 14.980
Fold 4 | Ep 5 | Train F2: 0.725 | Val F2: 0.775 | Train MAE: 11.616 | Val MAE: 14.973
Fold 4 | Ep 6 | Train F2: 0.740 | Val F2: 0.777 | Train MAE: 11.932 | Val MAE: 15.307
Fold 4 | Ep 7 | Train F2: 0.737 | Val F2: 0.775 | Train MAE: 11.701 | Val MAE: 15.158
Fold 4 | Ep 8 | Train F2: 0.740 | Val F2: 0.778 | Train MAE: 11.423 | Val MAE: 14.905
Fold 4 | Ep 9 | Train F2: 0.737 | Val F2: 0.779 | Train MAE: 11.458 | Val MAE: 14.947
  Early stopping triggered.




Uploading artifacts:   0%|          | 0/10 [00:00<?, ?it/s]

  >> Fold 4 Best Val F2: 0.7791

--- Starting Fold 5 ---
Fold 5 | Ep 0 | Train F2: 0.740 | Val F2: 0.756 | Train MAE: 12.019 | Val MAE: 14.078
Fold 5 | Ep 1 | Train F2: 0.735 | Val F2: 0.757 | Train MAE: 11.947 | Val MAE: 14.134
Fold 5 | Ep 2 | Train F2: 0.728 | Val F2: 0.753 | Train MAE: 11.910 | Val MAE: 14.053
Fold 5 | Ep 3 | Train F2: 0.740 | Val F2: 0.758 | Train MAE: 11.856 | Val MAE: 13.997
Fold 5 | Ep 4 | Train F2: 0.745 | Val F2: 0.758 | Train MAE: 11.789 | Val MAE: 13.971
Fold 5 | Ep 5 | Train F2: 0.743 | Val F2: 0.758 | Train MAE: 11.729 | Val MAE: 13.967
Fold 5 | Ep 6 | Train F2: 0.742 | Val F2: 0.758 | Train MAE: 11.745 | Val MAE: 14.103
Fold 5 | Ep 7 | Train F2: 0.739 | Val F2: 0.754 | Train MAE: 11.662 | Val MAE: 13.889
Fold 5 | Ep 8 | Train F2: 0.738 | Val F2: 0.758 | Train MAE: 11.654 | Val MAE: 13.892
Fold 5 | Ep 9 | Train F2: 0.748 | Val F2: 0.758 | Train MAE: 11.604 | Val MAE: 13.942
Fold 5 | Ep 10 | Train F2: 0.741 | Val F2: 0.757 | Train MAE: 11.607 | Val MAE: 13.



Uploading artifacts:   0%|          | 0/10 [00:00<?, ?it/s]

  >> Fold 5 Best Val F2: 0.7586

--- Starting Fold 6 ---
Fold 6 | Ep 0 | Train F2: 0.732 | Val F2: 0.733 | Train MAE: 12.137 | Val MAE: 13.982
Fold 6 | Ep 1 | Train F2: 0.735 | Val F2: 0.756 | Train MAE: 12.083 | Val MAE: 13.915
Fold 6 | Ep 2 | Train F2: 0.729 | Val F2: 0.745 | Train MAE: 12.117 | Val MAE: 14.104
Fold 6 | Ep 3 | Train F2: 0.742 | Val F2: 0.755 | Train MAE: 11.984 | Val MAE: 13.886
Fold 6 | Ep 4 | Train F2: 0.746 | Val F2: 0.758 | Train MAE: 11.903 | Val MAE: 13.992
Fold 6 | Ep 5 | Train F2: 0.740 | Val F2: 0.749 | Train MAE: 11.885 | Val MAE: 14.067
Fold 6 | Ep 6 | Train F2: 0.747 | Val F2: 0.759 | Train MAE: 11.852 | Val MAE: 13.967
Fold 6 | Ep 7 | Train F2: 0.748 | Val F2: 0.760 | Train MAE: 11.783 | Val MAE: 13.927
Fold 6 | Ep 8 | Train F2: 0.747 | Val F2: 0.758 | Train MAE: 11.827 | Val MAE: 14.057
Fold 6 | Ep 9 | Train F2: 0.748 | Val F2: 0.757 | Train MAE: 11.734 | Val MAE: 13.979
Fold 6 | Ep 10 | Train F2: 0.742 | Val F2: 0.753 | Train MAE: 11.674 | Val MAE: 13.



Uploading artifacts:   0%|          | 0/10 [00:00<?, ?it/s]

  >> Fold 6 Best Val F2: 0.7599

--- Starting Fold 7 ---
Fold 7 | Ep 0 | Train F2: 0.737 | Val F2: 0.736 | Train MAE: 12.902 | Val MAE: 13.798
Fold 7 | Ep 1 | Train F2: 0.741 | Val F2: 0.729 | Train MAE: 12.819 | Val MAE: 13.736
Fold 7 | Ep 2 | Train F2: 0.744 | Val F2: 0.737 | Train MAE: 12.756 | Val MAE: 13.714
Fold 7 | Ep 3 | Train F2: 0.754 | Val F2: 0.751 | Train MAE: 12.705 | Val MAE: 13.682
Fold 7 | Ep 4 | Train F2: 0.752 | Val F2: 0.750 | Train MAE: 12.629 | Val MAE: 13.626
Fold 7 | Ep 5 | Train F2: 0.738 | Val F2: 0.733 | Train MAE: 12.648 | Val MAE: 13.669
Fold 7 | Ep 6 | Train F2: 0.748 | Val F2: 0.740 | Train MAE: 12.591 | Val MAE: 13.654
Fold 7 | Ep 7 | Train F2: 0.756 | Val F2: 0.751 | Train MAE: 12.517 | Val MAE: 13.616
Fold 7 | Ep 8 | Train F2: 0.751 | Val F2: 0.744 | Train MAE: 12.544 | Val MAE: 13.644
Fold 7 | Ep 9 | Train F2: 0.755 | Val F2: 0.750 | Train MAE: 12.465 | Val MAE: 13.597
Fold 7 | Ep 10 | Train F2: 0.745 | Val F2: 0.731 | Train MAE: 12.578 | Val MAE: 13.



Uploading artifacts:   0%|          | 0/10 [00:00<?, ?it/s]

  >> Fold 7 Best Val F2: 0.7537

--- Starting Fold 8 ---
Fold 8 | Ep 0 | Train F2: 0.752 | Val F2: 0.696 | Train MAE: 13.112 | Val MAE: 11.134
Fold 8 | Ep 1 | Train F2: 0.750 | Val F2: 0.690 | Train MAE: 12.960 | Val MAE: 11.082
Fold 8 | Ep 2 | Train F2: 0.758 | Val F2: 0.703 | Train MAE: 12.902 | Val MAE: 11.054
Fold 8 | Ep 3 | Train F2: 0.752 | Val F2: 0.692 | Train MAE: 12.868 | Val MAE: 11.031
Fold 8 | Ep 4 | Train F2: 0.755 | Val F2: 0.700 | Train MAE: 12.797 | Val MAE: 11.099
Fold 8 | Ep 5 | Train F2: 0.756 | Val F2: 0.701 | Train MAE: 12.767 | Val MAE: 11.073
Fold 8 | Ep 6 | Train F2: 0.757 | Val F2: 0.701 | Train MAE: 12.706 | Val MAE: 11.038
Fold 8 | Ep 7 | Train F2: 0.757 | Val F2: 0.703 | Train MAE: 12.724 | Val MAE: 11.208
Fold 8 | Ep 8 | Train F2: 0.757 | Val F2: 0.700 | Train MAE: 12.642 | Val MAE: 11.123
Fold 8 | Ep 9 | Train F2: 0.756 | Val F2: 0.700 | Train MAE: 12.590 | Val MAE: 11.059
Fold 8 | Ep 10 | Train F2: 0.755 | Val F2: 0.695 | Train MAE: 12.589 | Val MAE: 11.



Uploading artifacts:   0%|          | 0/10 [00:00<?, ?it/s]

  >> Fold 8 Best Val F2: 0.7039

--- Starting Fold 9 ---
Fold 9 | Ep 0 | Train F2: 0.754 | Val F2: 0.644 | Train MAE: 13.414 | Val MAE: 8.722
Fold 9 | Ep 1 | Train F2: 0.758 | Val F2: 0.662 | Train MAE: 13.489 | Val MAE: 8.848
Fold 9 | Ep 2 | Train F2: 0.755 | Val F2: 0.637 | Train MAE: 13.218 | Val MAE: 8.670
Fold 9 | Ep 3 | Train F2: 0.757 | Val F2: 0.639 | Train MAE: 13.179 | Val MAE: 8.755
Fold 9 | Ep 4 | Train F2: 0.758 | Val F2: 0.638 | Train MAE: 13.159 | Val MAE: 8.802
Fold 9 | Ep 5 | Train F2: 0.761 | Val F2: 0.662 | Train MAE: 13.348 | Val MAE: 8.708
Fold 9 | Ep 6 | Train F2: 0.752 | Val F2: 0.620 | Train MAE: 13.086 | Val MAE: 8.719
Fold 9 | Ep 7 | Train F2: 0.758 | Val F2: 0.630 | Train MAE: 12.949 | Val MAE: 8.716
Fold 9 | Ep 8 | Train F2: 0.757 | Val F2: 0.619 | Train MAE: 12.936 | Val MAE: 8.636
Fold 9 | Ep 9 | Train F2: 0.760 | Val F2: 0.640 | Train MAE: 12.887 | Val MAE: 8.711
Fold 9 | Ep 10 | Train F2: 0.762 | Val F2: 0.662 | Train MAE: 12.855 | Val MAE: 8.646
  Early



Uploading artifacts:   0%|          | 0/10 [00:00<?, ?it/s]

  >> Fold 9 Best Val F2: 0.6624

--- Starting Fold 10 ---
Fold 10 | Ep 0 | Train F2: 0.743 | Val F2: 0.652 | Train MAE: 13.126 | Val MAE: 8.689
Fold 10 | Ep 1 | Train F2: 0.751 | Val F2: 0.657 | Train MAE: 13.017 | Val MAE: 8.782
Fold 10 | Ep 2 | Train F2: 0.748 | Val F2: 0.617 | Train MAE: 12.916 | Val MAE: 8.591
Fold 10 | Ep 3 | Train F2: 0.748 | Val F2: 0.624 | Train MAE: 12.857 | Val MAE: 8.661
Fold 10 | Ep 4 | Train F2: 0.750 | Val F2: 0.635 | Train MAE: 12.849 | Val MAE: 8.708
Fold 10 | Ep 5 | Train F2: 0.752 | Val F2: 0.655 | Train MAE: 12.761 | Val MAE: 8.569
Fold 10 | Ep 6 | Train F2: 0.745 | Val F2: 0.641 | Train MAE: 12.818 | Val MAE: 8.712
  Early stopping triggered.




Uploading artifacts:   0%|          | 0/10 [00:00<?, ?it/s]

  >> Fold 10 Best Val F2: 0.6572

=== CV Complete ===
Avg Best F2: 0.7352
