In [None]:
import os
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import mean_squared_error, mean_absolute_error
from tqdm import tqdm
import copy
import torch.nn.functional as F
from google.colab import drive
import shutil
import time
import numpy as np

def set_seed(seed=42):
    """Seed every random number generator this notebook can draw from.

    Args:
        seed: Integer seed applied to Python's ``random`` module, NumPy's
            legacy global generator, and PyTorch (CPU and all CUDA devices).
    """
    # Local import: ``random`` is not part of the notebook's import cell.
    import random

    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Safe to call on a CPU-only runtime; the call is ignored without CUDA.
    torch.cuda.manual_seed_all(seed)

# Seed the random number generators before any model or loader is created.
set_seed(42)

# Use the GPU when the runtime exposes one, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Using device: {device}")
if device.type == "cuda":
    print(torch.cuda.get_device_name(0))

# Hyper-parameters read by the model, the data loaders and the training loop.
CONFIG = dict(
    # Per-modality input feature sizes fed to the LSTM encoders.
    visual_dim=768,
    audio_dim=768,
    semantic_dim=1024,
    # Encoder shape.
    lstm_hidden_dim=128,
    lstm_layers=1,
    dropout=0.5,
    # Optimisation settings.
    batch_size=64,
    learning_rate=1e-4,
    epochs=50,
    patience=10,
    weight_decay=1e-3,
    output_dim=1,
)

# Google Drive holds the features, the label files and all outputs.
drive.mount('/content/drive')

BASE_PROJECT_DIR = '/content/drive/MyDrive/VEA'

# Every location is expressed relative to the project root on Drive.
PATHS = {
    key: os.path.join(BASE_PROJECT_DIR, relative)
    for key, relative in (
        ("semantic_dir", "features_semantic"),
        ("visual_dir", "features_visual"),
        ("audio_dir", "features_audio"),
        ("train_labels", "labels/16_train_labels.csv"),
        ("test_labels", "labels/16_test_labels.csv"),
        ("model_save_dir", "models"),
        ("analysis_save_dir", "analysis"),
    )
}

def accelerate_io(paths_config, local_base='/content/temp_data'):
    """Copy the feature directories to local disk and repoint the config at them.

    For each of the ``semantic_dir``, ``visual_dir`` and ``audio_dir`` entries
    the directory is copied to ``<local_base>/<folder name>`` unless that
    target already exists, and the entry is rewritten to the local path.
    Entries whose source directory does not exist are left untouched.

    The copy is staged in a sibling ``<folder name>.partial`` directory and
    renamed only after it has finished, so an interrupted copy is never
    mistaken for a complete one on the next run.

    Args:
        paths_config: Dict of paths. It is updated in place.
        local_base: Destination root on the local runtime.

    Returns:
        The same ``paths_config`` object, with the copied entries pointing at
        their local directories.
    """
    print("Initiating data transfer to local runtime for I/O acceleration...")
    start_total = time.time()
    feature_keys = ["semantic_dir", "visual_dir", "audio_dir"]

    for key in feature_keys:
        drive_path = paths_config[key]
        # normpath drops a trailing separator, which would otherwise make
        # basename return an empty folder name.
        folder_name = os.path.basename(os.path.normpath(drive_path))
        local_path = os.path.join(local_base, folder_name)

        if not os.path.exists(local_path):
            print(f" -> Copying {folder_name}...")
            staging_path = local_path + ".partial"
            # Discard whatever an earlier, interrupted attempt left behind.
            shutil.rmtree(staging_path, ignore_errors=True)
            try:
                shutil.copytree(drive_path, staging_path)
            except FileNotFoundError:
                shutil.rmtree(staging_path, ignore_errors=True)
                print(f" [WARNING] Drive path not found: {drive_path}. Skipping.")
                continue
            except BaseException:
                # Includes KeyboardInterrupt: never leave a half-copied tree.
                shutil.rmtree(staging_path, ignore_errors=True)
                raise
            # Publish the finished copy under its final name.
            os.rename(staging_path, local_path)
        else:
            print(f" -> {folder_name} already exists locally. Skipping copy.")

        paths_config[key] = local_path

    print(f"Data preparation completed. Time elapsed: {time.time() - start_total:.2f} seconds")
    return paths_config

# Redirect the three feature directories to their local copies; the label
# files and output directories keep their original locations.
PATHS = accelerate_io(PATHS)

# Report every redirected feature directory, including the semantic one.
print("\nUpdated feature paths:")
print(f"Semantic: {PATHS['semantic_dir']}")
print(f"Visual: {PATHS['visual_dir']}")
print(f"Audio:  {PATHS['audio_dir']}")
print("-" * 30)

class MultimodalDataset(Dataset):
    """In-memory dataset of visual, audio and semantic features with labels.

    The label CSV must provide ``video_id``, ``valence`` and ``arousal``
    columns. For every video id, ``<id>.npy`` is loaded from each of the three
    feature directories during construction and kept in RAM.

    A video is kept only if all three feature files exist. Videos with a
    missing file are reported and dropped from the features, the labels,
    ``video_ids`` and ``df`` together, so index ``i`` always refers to the same
    video in every attribute.

    Args:
        labels_path: Path to the label CSV.
        paths_config: Dict with ``visual_dir``, ``audio_dir`` and
            ``semantic_dir`` entries.
    """

    def __init__(self, labels_path, paths_config):
        self.df = pd.read_csv(labels_path)
        candidate_ids = self.df['video_id'].astype(str).str.strip().values

        self.visual_feats = []
        self.audio_feats = []
        self.semantic_feats = []
        kept_rows = []

        print(f"Loading data from: {os.path.basename(labels_path)}...")

        for row, vid in enumerate(tqdm(candidate_ids, desc="Loading Features to RAM")):
            try:
                v_path = os.path.join(paths_config["visual_dir"], f"{vid}.npy")
                a_path = os.path.join(paths_config["audio_dir"], f"{vid}.npy")
                s_path = os.path.join(paths_config["semantic_dir"], f"{vid}.npy")

                # Load all three before appending any, so one missing file
                # cannot leave the modality lists with different lengths.
                visual = torch.from_numpy(np.load(v_path)).float()
                audio = torch.from_numpy(np.load(a_path)).float()
                semantic = torch.from_numpy(np.load(s_path)).float()
            except FileNotFoundError as e:
                print(f"File missing: {vid} - {e}")
                continue

            self.visual_feats.append(visual)
            self.audio_feats.append(audio)
            self.semantic_feats.append(semantic)
            kept_rows.append(row)

        # Restrict labels and ids to the videos that were actually loaded.
        kept = np.asarray(kept_rows, dtype=int)
        self.df = self.df.iloc[kept].reset_index(drop=True)
        self.video_ids = candidate_ids[kept]
        self.valence_labels = torch.tensor(self.df['valence'].values, dtype=torch.float)
        self.arousal_labels = torch.tensor(self.df['arousal'].values, dtype=torch.float)

    def __len__(self):
        """Return the number of videos whose features were all loaded."""
        return len(self.video_ids)

    def __getitem__(self, idx):
        """Return the feature tensors and both labels of sample ``idx`` as a dict."""
        return {
            'visual': self.visual_feats[idx],
            'audio': self.audio_feats[idx],
            'semantic': self.semantic_feats[idx],
            'valence': self.valence_labels[idx],
            'arousal': self.arousal_labels[idx]
        }

# Overrides the batch size set in CONFIG above for the loaders built below.
CONFIG["batch_size"] = 32

print("\nInitializing Train Loader...")
try:
    train_dataset = MultimodalDataset(PATHS["train_labels"], PATHS)
    train_loader = DataLoader(train_dataset, batch_size=CONFIG["batch_size"], shuffle=True, num_workers=2)

    print("Initializing Test Loader...")
    test_dataset = MultimodalDataset(PATHS["test_labels"], PATHS)
    test_loader = DataLoader(test_dataset, batch_size=CONFIG["batch_size"], shuffle=False, num_workers=2)

    print("\nDataLoaders successfully initialized.")

except Exception as e:
    # Report and re-raise: the training and inference cells need both loaders,
    # so swallowing the error would only defer the failure to a NameError.
    print(f"Data loading error: {e}")
    raise

In [None]:
class LateFusionBaseline(nn.Module):
    """Late-fusion regressor with one bidirectional LSTM per modality.

    Each modality (visual, audio, semantic) is summarised by the final
    forward and backward hidden states of its own LSTM. The three summaries
    are concatenated, passed through dropout and fed to two separate MLP
    heads that predict valence and arousal.

    Args:
        config: Dict providing ``visual_dim``, ``audio_dim``, ``semantic_dim``,
            ``lstm_hidden_dim``, ``lstm_layers`` and ``dropout``.
    """

    def __init__(self, config):
        super().__init__()

        hidden_size = config["lstm_hidden_dim"]
        drop_prob = config["dropout"]

        def build_encoder(input_key):
            # Batch-first bidirectional LSTM for a single modality.
            return nn.LSTM(
                input_size=config[input_key],
                hidden_size=hidden_size,
                num_layers=config["lstm_layers"],
                batch_first=True,
                bidirectional=True,
            )

        def build_head(in_features):
            # Two-layer MLP producing one scalar per sample.
            return nn.Sequential(
                nn.Linear(in_features, 128),
                nn.ReLU(),
                nn.Dropout(drop_prob),
                nn.Linear(128, 1),
            )

        # Sub-modules are created in a fixed order so that seeded weight
        # initialisation yields the same parameters every time.
        self.lstm_v = build_encoder("visual_dim")
        self.lstm_a = build_encoder("audio_dim")
        self.lstm_s = build_encoder("semantic_dim")

        # Two directions per encoder, three encoders.
        fusion_dim = hidden_size * 2 * 3
        self.dropout = nn.Dropout(drop_prob)

        self.regressor_valence = build_head(fusion_dim)
        self.regressor_arousal = build_head(fusion_dim)

    @staticmethod
    def _summarise(encoder, sequence):
        """Return the last layer's final forward and backward hidden states, concatenated."""
        encoder.flatten_parameters()
        _, (hidden, _) = encoder(sequence)
        return torch.cat((hidden[-2], hidden[-1]), dim=1)

    def forward(self, x_v, x_a, x_s):
        """Predict valence and arousal from the three modality sequences.

        Args:
            x_v: Visual input passed to the batch-first visual LSTM.
            x_a: Audio input passed to the batch-first audio LSTM.
            x_s: Semantic input passed to the batch-first semantic LSTM.

        Returns:
            Tuple ``(valence, arousal)`` with the trailing singleton dimension
            of each head's output removed.
        """
        streams = ((self.lstm_v, x_v), (self.lstm_a, x_a), (self.lstm_s, x_s))
        summaries = [self._summarise(encoder, sequence) for encoder, sequence in streams]

        fused = self.dropout(torch.cat(summaries, dim=1))

        valence = self.regressor_valence(fused).squeeze(-1)
        arousal = self.regressor_arousal(fused).squeeze(-1)
        return valence, arousal

def calculate_regression_metrics(y_true, y_pred):
    """Compute mean squared error and mean absolute error.

    Args:
        y_true: Ground-truth values; converted with ``np.asarray``.
        y_pred: Predicted values; converted with ``np.asarray``.

    Returns:
        Tuple ``(mse, mae)``.
    """
    truth, estimate = np.asarray(y_true), np.asarray(y_pred)
    return (
        mean_squared_error(truth, estimate),
        mean_absolute_error(truth, estimate),
    )

def train_one_epoch(model, loader, criterion, optimizer, device):
    """Run one optimisation pass over ``loader``.

    Every batch is a dict with ``visual``, ``audio``, ``semantic``,
    ``valence`` and ``arousal`` tensors. The batch loss is the sum of the
    valence loss and the arousal loss under ``criterion``.

    Args:
        model: Network called as ``model(visual, audio, semantic)`` and
            returning ``(valence_pred, arousal_pred)``.
        loader: Iterable of batches that also supports ``len``.
        criterion: Loss callable applied to each target separately.
        optimizer: Optimizer stepped once per batch.
        device: Device the batch tensors are moved to.

    Returns:
        Sum of the per-batch losses divided by the number of batches.
    """
    model.train()
    batch_losses = []

    for batch in tqdm(loader, desc="Training"):
        features = [batch[key].to(device) for key in ('visual', 'audio', 'semantic')]
        target_v = batch['valence'].to(device)
        target_a = batch['arousal'].to(device)

        optimizer.zero_grad()
        out_v, out_a = model(*features)

        # Both targets contribute with equal weight.
        total = criterion(out_v, target_v) + criterion(out_a, target_a)
        total.backward()
        optimizer.step()

        batch_losses.append(total.item())

    return sum(batch_losses) / len(loader)

def evaluate(model, data_loader, criterion, device):
    """Score ``model`` on ``data_loader`` without updating it.

    Args:
        model: Network called as ``model(visual, audio, semantic)`` and
            returning ``(valence_pred, arousal_pred)``.
        data_loader: Iterable of batch dicts exposing a ``dataset`` attribute
            whose length is used to normalise the loss.
        criterion: Loss callable applied to each target separately.
        device: Device the batch tensors are moved to.

    Returns:
        Tuple ``(val_loss, mse_v, mae_v, pcc_v, mse_a, mae_a, pcc_a)``.
        ``val_loss`` is the batch-size-weighted sum of the batch losses
        divided by ``len(data_loader.dataset)``. The remaining values are
        MSE, MAE and Pearson correlation over all collected predictions, for
        valence and arousal respectively.
    """

    def pearson(preds, targets):
        # Correlation of the centred vectors; the epsilon avoids a zero divisor.
        dev_p = preds - preds.mean()
        dev_t = targets - targets.mean()
        numerator = (dev_p * dev_t).sum()
        norm = dev_p.pow(2).sum().sqrt() * dev_t.pow(2).sum().sqrt()
        return (numerator / (norm + 1e-8)).item()

    def summarise(preds, targets):
        return (
            F.mse_loss(preds, targets).item(),
            F.l1_loss(preds, targets).item(),
            pearson(preds, targets),
        )

    model.eval()
    weighted_loss = 0.0
    collected = {"v_pred": [], "v_true": [], "a_pred": [], "a_true": []}

    with torch.no_grad():
        for batch in data_loader:
            visual = batch['visual'].to(device)
            audio = batch['audio'].to(device)
            semantic = batch['semantic'].to(device)
            true_v = batch['valence'].to(device)
            true_a = batch['arousal'].to(device)

            pred_v, pred_a = model(visual, audio, semantic)

            batch_loss = criterion(pred_v, true_v) + criterion(pred_a, true_a)
            # Weight by batch size so a smaller batch does not count as a full one.
            weighted_loss += batch_loss.item() * true_v.size(0)

            for key, tensor in (
                ("v_pred", pred_v),
                ("v_true", true_v),
                ("a_pred", pred_a),
                ("a_true", true_a),
            ):
                collected[key].append(tensor.cpu())

    v_pred, v_true, a_pred, a_true = (
        torch.cat(collected[key]).squeeze()
        for key in ("v_pred", "v_true", "a_pred", "a_true")
    )

    val_loss = weighted_loss / len(data_loader.dataset)

    mse_v, mae_v, pcc_v = summarise(v_pred, v_true)
    mse_a, mae_a, pcc_a = summarise(a_pred, a_true)

    return val_loss, mse_v, mae_v, pcc_v, mse_a, mae_a, pcc_a

class CombinedLoss(nn.Module):
    """Weighted sum of mean squared error and a concordance-correlation loss.

    The loss is ``alpha * MSE + beta * (1 - CCC)``, where CCC is computed over
    the elements passed to one ``forward`` call.

    Args:
        alpha: Weight of the MSE term.
        beta: Weight of the ``1 - CCC`` term.
    """

    def __init__(self, alpha=1.0, beta=1.0):
        super().__init__()
        self.alpha, self.beta = alpha, beta

    def forward(self, x, y):
        """Return the combined loss between predictions ``x`` and targets ``y``.

        Singleton dimensions are squeezed out of both inputs first.
        """
        pred, target = x.squeeze(), y.squeeze()

        mse_term = (pred - target).pow(2).mean()

        pred_mu, target_mu = pred.mean(), target.mean()
        pred_dev, target_dev = pred - pred_mu, target - target_mu

        # Population (biased) moments over the given elements.
        cov = (pred_dev * target_dev).mean()
        pred_var = pred_dev.pow(2).mean()
        target_var = target_dev.pow(2).mean()

        # The epsilon keeps the ratio finite when both inputs are constant and equal.
        ccc = (2 * cov) / (pred_var + target_var + (pred_mu - target_mu) ** 2 + 1e-8)
        ccc_term = 1.0 - ccc

        return self.alpha * mse_term + self.beta * ccc_term

# Build the model on the selected device and show its layout.
model = LateFusionBaseline(CONFIG).to(device)
print(model)

criterion = CombinedLoss(alpha=1.0, beta=1.0)
optimizer = optim.Adam(model.parameters(), lr=CONFIG["learning_rate"], weight_decay=CONFIG["weight_decay"])
# Plateau scheduler (factor 0.5, patience 5) driven by the loss returned by evaluate().
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=5)

# torch.save does not create missing parent directories, so make sure the
# checkpoint folder exists before the first improvement is written.
os.makedirs(PATHS["model_save_dir"], exist_ok=True)
save_path = os.path.join(PATHS["model_save_dir"], "late_fusion_lstm_combined.pth")

best_val_loss = float('inf')
patience_counter = 0

print("Starting Training with Combined Loss (MSE + CCC)...")
print("-" * 115)
print(f"{'Epoch':<6} | {'Train Loss':<10} | {'Val Loss':<10} | {'V-MSE':<8} {'V-PCC':<8} | {'A-MSE':<8} {'A-PCC':<8} | {'LR'}")
print("-" * 115)

for epoch in range(CONFIG["epochs"]):
    train_loss = train_one_epoch(model, train_loader, criterion, optimizer, device)
    # NOTE(review): the test loader doubles as the validation set here. It
    # drives the LR schedule, early stopping and checkpoint selection, so
    # metrics later reported on the same split are not an unbiased estimate.
    # Consider holding out part of the training data instead.
    val_loss, mse_v, mae_v, pcc_v, mse_a, mae_a, pcc_a = evaluate(model, test_loader, criterion, device)

    scheduler.step(val_loss)
    current_lr = optimizer.param_groups[0]['lr']

    print(f"{epoch+1:<6} | {train_loss:.4f}     | {val_loss:.4f}     | {mse_v:.4f}   {pcc_v:.4f}   | {mse_a:.4f}   {pcc_a:.4f}   | {current_lr:.1e}")

    if val_loss < best_val_loss:
        # New best: reset the patience counter and checkpoint the weights.
        best_val_loss = val_loss
        patience_counter = 0
        torch.save(model.state_dict(), save_path)
    else:
        patience_counter += 1

    if patience_counter >= CONFIG["patience"]:
        print(f"\nEarly stopping triggered at epoch {epoch+1}")
        break

print("-" * 115)
print(f"Training finished. Best Val Combined Loss: {best_val_loss:.4f}")

In [None]:
model_name = "late_fusion_lstm_combined.pth"
load_path = os.path.join(PATHS["model_save_dir"], model_name)

print(f"Loading best model from {load_path}...")
# map_location lets a checkpoint written on a GPU runtime load on a CPU-only one.
model.load_state_dict(torch.load(load_path, map_location=device))
model.eval()

all_preds_v, all_labels_v = [], []
all_preds_a, all_labels_a = [], []

print("Running inference on Test Set...")
with torch.no_grad():
    for batch in tqdm(test_loader, desc="Inference"):
        x_v = batch['visual'].to(device)
        x_a = batch['audio'].to(device)
        x_s = batch['semantic'].to(device)

        pred_v, pred_a = model(x_v, x_a, x_s)

        all_preds_v.extend(pred_v.cpu().numpy())
        all_labels_v.extend(batch['valence'].cpu().numpy())
        all_preds_a.extend(pred_a.cpu().numpy())
        all_labels_a.extend(batch['arousal'].cpu().numpy())

# The test loader is built with shuffle=False, so predictions follow dataset order.
video_ids = test_loader.dataset.video_ids

# Fail early with a clear message if ids and predictions cannot be paired.
if len(video_ids) != len(all_preds_v):
    raise ValueError(
        f"Got {len(all_preds_v)} predictions for {len(video_ids)} video ids; "
        "the test dataset and its video_ids are out of sync."
    )

# Flat arrays shared by the per-sample table and the summary metrics.
np_labels_v = np.array(all_labels_v).flatten()
np_preds_v = np.array(all_preds_v).flatten()
np_labels_a = np.array(all_labels_a).flatten()
np_preds_a = np.array(all_preds_a).flatten()

df_results = pd.DataFrame({
    "video_id": video_ids,
    "valence_true": np_labels_v,
    "valence_pred": np_preds_v,
    "arousal_true": np_labels_a,
    "arousal_pred": np_preds_a
})

df_results["valence_error"] = (df_results["valence_true"] - df_results["valence_pred"]).abs()
df_results["arousal_error"] = (df_results["arousal_true"] - df_results["arousal_pred"]).abs()

# to_csv does not create missing directories.
os.makedirs(PATHS["analysis_save_dir"], exist_ok=True)
output_file = os.path.join(PATHS["analysis_save_dir"], "late_fusion_lstm_regression_predictions.csv")
df_results.to_csv(output_file, index=False)
print(f"\nPredictions saved to: {output_file}")

print("\n" + "="*30)
print("FINAL REGRESSION REPORT")
print("="*30)

mse_v = mean_squared_error(np_labels_v, np_preds_v)
mae_v = mean_absolute_error(np_labels_v, np_preds_v)
pcc_v = np.corrcoef(np_labels_v, np_preds_v)[0, 1]

print(f"\n--- VALENCE ---")
print(f"MSE: {mse_v:.4f}")
print(f"MAE: {mae_v:.4f}")
print(f"PCC: {pcc_v:.4f}")

mse_a = mean_squared_error(np_labels_a, np_preds_a)
mae_a = mean_absolute_error(np_labels_a, np_preds_a)
pcc_a = np.corrcoef(np_labels_a, np_preds_a)[0, 1]

print(f"\n--- AROUSAL ---")
print(f"MSE: {mse_a:.4f}")
print(f"MAE: {mae_a:.4f}")
print(f"PCC: {pcc_a:.4f}")