In [None]:
import os
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import accuracy_score, f1_score
from tqdm import tqdm
from collections import Counter
from google.colab import drive
import shutil
import time
import subprocess

def set_seed(seed=42):
    """Seed every random number generator this script can draw from.

    Seeds Python's ``random`` module, NumPy, and PyTorch (CPU and all CUDA
    devices), then configures cuDNN for repeatable kernel selection.

    Args:
        seed: Integer seed applied to all generators. Defaults to 42.
    """
    # Imported locally because the file-level import block does not bring
    # in `random`.
    import random

    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

    # `deterministic` restricts cuDNN to deterministic algorithms, but the
    # autotuner (`benchmark`) may still select different kernels from run to
    # run, so it is switched off as well.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

# Fix all RNG seeds before anything random is created.
set_seed(42)

# Prefer the GPU when one is visible to PyTorch.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Using device: {device}")

# Google Drive holds the features, labels and output folders.
drive.mount('/content/drive')

BASE_PROJECT_DIR = '/content/drive/MyDrive/VEA'
LOCAL_ROOT = '/content/temp_data'

# Every project location, expressed relative to BASE_PROJECT_DIR.
PATHS = {
    name: os.path.join(BASE_PROJECT_DIR, relative)
    for name, relative in (
        ("semantic_dir", "features_semantic"),
        ("visual_dir", "features_visual"),
        ("audio_dir", "features_audio"),
        ("train_labels", "labels/15_train_labels.csv"),
        ("test_labels", "labels/15_test_labels.csv"),
        ("model_save_dir", "models"),
        ("analysis_save_dir", "analysis"),
    )
}

# Output folders must exist before checkpoints / reports are written.
os.makedirs(PATHS["model_save_dir"], exist_ok=True)
os.makedirs(PATHS["analysis_save_dir"], exist_ok=True)

def _copy_tree_via_staging(src_dir, dst_dir):
    """Copy ``src_dir`` to ``dst_dir`` through a staging folder.

    The copy is written to ``dst_dir + ".partial"`` and renamed only once it
    has finished, so ``dst_dir`` never exists in a half-copied state.

    Returns:
        True when ``dst_dir`` was created, False when ``src_dir`` is missing.
    """
    staging = dst_dir + ".partial"
    # Drop leftovers from a previously interrupted attempt.
    shutil.rmtree(staging, ignore_errors=True)
    try:
        subprocess.run(["cp", "-r", src_dir, staging], check=True)
    except (subprocess.CalledProcessError, OSError):
        # OSError covers the case where the `cp` executable is unavailable.
        folder_name = os.path.basename(dst_dir)
        print(f" [WARNING] Failed to copy {folder_name} using cp. Trying shutil.copytree.")
        # copytree refuses to write into an existing directory, so any
        # partial output from `cp` has to be removed first.
        shutil.rmtree(staging, ignore_errors=True)
        try:
            shutil.copytree(src_dir, staging)
        except FileNotFoundError:
            return False
    os.rename(staging, dst_dir)
    return True


def accelerate_io(paths_config):
    """Mirror feature folders and label files under ``LOCAL_ROOT``.

    Each of the three feature directories and the two label CSVs named in
    ``paths_config`` is copied below the module-level ``LOCAL_ROOT`` unless a
    local copy already exists, and the corresponding entry is repointed to
    the local copy. A feature directory whose source cannot be found keeps
    its original path.

    Args:
        paths_config: Dict with the keys ``semantic_dir``, ``visual_dir``,
            ``audio_dir``, ``train_labels`` and ``test_labels``. It is
            updated in place.

    Returns:
        The same ``paths_config`` dict, with relocated entries.
    """
    print("Initiating data transfer to local runtime for I/O acceleration...")
    start_total = time.time()
    os.makedirs(LOCAL_ROOT, exist_ok=True)

    for key in ("semantic_dir", "visual_dir", "audio_dir"):
        drive_path = paths_config[key]
        folder_name = os.path.basename(drive_path)
        local_path = os.path.join(LOCAL_ROOT, folder_name)

        if os.path.exists(local_path):
            print(f" -> {folder_name} already exists locally. Skipping copy.")
        else:
            print(f" -> Copying {folder_name}...")
            # The destination is deliberately NOT created up front: doing so
            # made the copytree fallback fail and let an empty folder pass
            # for a finished copy on the next run.
            if not _copy_tree_via_staging(drive_path, local_path):
                print(f" [WARNING] Drive path not found: {drive_path}. Skipping.")
                continue

        paths_config[key] = local_path

    local_labels_dir = os.path.join(LOCAL_ROOT, "labels")
    os.makedirs(local_labels_dir, exist_ok=True)

    for key in ("train_labels", "test_labels"):
        drive_path = paths_config[key]
        file_name = os.path.basename(drive_path)
        local_path = os.path.join(local_labels_dir, file_name)

        if os.path.exists(local_path):
            print(f" -> {file_name} already exists locally. Skipping copy.")
        else:
            print(f" -> Copying {file_name}...")
            shutil.copy2(drive_path, local_path)

        paths_config[key] = local_path

    print(f"\nData preparation completed. Time elapsed: {time.time() - start_total:.2f} seconds")
    return paths_config

# Repoint the feature / label entries of PATHS at the copies made by
# accelerate_io; everything below reads through the updated PATHS.
PATHS = accelerate_io(PATHS)

# Training labels, loaded here for the class-weight computation that follows.
df_train = pd.read_csv(PATHS["train_labels"])

def compute_class_weights(df, target_col, num_classes=3, classes=None):
    """Compute inverse-frequency class weights for one label column.

    Each class receives ``total_samples / (num_classes * count)``, and the
    weights are returned in the order of ``classes``.

    Args:
        df: DataFrame holding the labels.
        target_col: Name of the label column.
        num_classes: Normalising constant in the weight formula. Defaults
            to 3, the value previously hard-coded here.
        classes: Optional ordered sequence of class values to produce
            weights for. When omitted, the sorted set of values observed in
            ``target_col`` is used. Passing it explicitly keeps the output
            length and ordering fixed even if a class is absent from the
            data.

    Returns:
        A 1-D ``torch.float`` tensor with one weight per entry of
        ``classes``. A class with no samples gets the neutral weight 1.0
        instead of triggering a division by zero.

    Raises:
        KeyError: If ``target_col`` is not a column of ``df``.
    """
    labels = df[target_col].values
    class_counts = Counter(labels)
    total_samples = len(labels)

    if classes is None:
        classes = sorted(class_counts)

    weights = []
    for cls_val in classes:
        count = class_counts.get(cls_val, 0)
        weights.append(total_samples / (num_classes * count) if count else 1.0)

    return torch.tensor(weights, dtype=torch.float)

# Per-task class weights for the focal loss. If either label column is
# missing from the training CSV, both tasks fall back to uniform weights.
try:
    valence_weights, arousal_weights = (
        compute_class_weights(df_train, label_column)
        for label_column in ('valenceClass', 'arousalClass')
    )
except KeyError:
    print("[WARNING] Could not compute class weights using 'valenceClass' or 'arousalClass'. Using uniform weights.")
    valence_weights = torch.ones(3, dtype=torch.float)
    arousal_weights = torch.ones(3, dtype=torch.float)

# Model and optimisation hyper-parameters shared by all experiments below.
CONFIG = dict(
    visual_dim=768,
    audio_dim=768,
    semantic_dim=1024,
    lstm_hidden_dim=128,
    lstm_layers=1,
    dropout=0.5,
    batch_size=32,
    learning_rate=1e-4,
    epochs=50,
    patience=10,
    weight_decay=1e-3,
    num_classes=3,
    valence_weights=valence_weights,
    arousal_weights=arousal_weights,
)

class MultimodalDataset(Dataset):
    """In-memory dataset of per-video visual, audio and semantic features.

    The label CSV must provide the columns ``video_id``, ``valenceClass``
    and ``arousalClass``. For every video id, ``<video_id>.npy`` is loaded
    from each of the three feature directories. A video is kept only when
    all three files are present.

    Stored labels are the CSV values plus one (presumably mapping -1/0/1 to
    0/1/2 class indices; confirm against the label files).
    """

    def __init__(self, labels_path, paths_config):
        """Load labels and all feature arrays into memory.

        Args:
            labels_path: Path of the label CSV.
            paths_config: Dict providing ``visual_dir``, ``audio_dir`` and
                ``semantic_dir``.
        """
        self.df = pd.read_csv(labels_path)
        self.video_ids = self.df['video_id'].astype(str).str.strip().values

        self.valence_labels = torch.tensor(self.df['valenceClass'].values + 1, dtype=torch.long)
        self.arousal_labels = torch.tensor(self.df['arousalClass'].values + 1, dtype=torch.long)

        self.visual_feats = []
        self.audio_feats = []
        self.semantic_feats = []

        print(f"Loading data from: {os.path.basename(labels_path)}...")

        modality_dirs = (
            paths_config["visual_dir"],
            paths_config["audio_dir"],
            paths_config["semantic_dir"],
        )

        loaded_vids = []
        for i, vid in enumerate(tqdm(self.video_ids, desc="Loading Features to RAM")):
            try:
                # Load all three modalities BEFORE storing any of them.
                # Appending one at a time would leave an orphan tensor in an
                # earlier list when a later file is missing, shifting every
                # following sample against its labels.
                visual, audio, semantic = [
                    torch.from_numpy(np.load(os.path.join(folder, f"{vid}.npy"))).float()
                    for folder in modality_dirs
                ]
            except FileNotFoundError as e:
                print(f" [ERROR] File missing for video {vid}: {e}. Skipping this sample.")
                continue

            self.visual_feats.append(visual)
            self.audio_feats.append(audio)
            self.semantic_feats.append(semantic)
            loaded_vids.append(i)

        # Keep labels / ids only for the rows whose features were loaded.
        # Explicit integer index objects behave predictably when no sample
        # could be loaded at all.
        kept = torch.tensor(loaded_vids, dtype=torch.long)
        self.valence_labels = self.valence_labels[kept]
        self.arousal_labels = self.arousal_labels[kept]
        self.video_ids = self.video_ids[np.asarray(loaded_vids, dtype=np.intp)]

    def __len__(self):
        """Return the number of fully loaded samples."""
        return len(self.visual_feats)

    def __getitem__(self, idx):
        """Return the feature tensors and both labels of sample ``idx``."""
        return {
            'visual': self.visual_feats[idx],
            'audio': self.audio_feats[idx],
            'semantic': self.semantic_feats[idx],
            'valence': self.valence_labels[idx],
            'arousal': self.arousal_labels[idx]
        }

# Training split: reshuffled every epoch.
print("\nInitializing Train Loader...")
train_dataset = MultimodalDataset(labels_path=PATHS["train_labels"], paths_config=PATHS)
train_loader = DataLoader(
    dataset=train_dataset,
    batch_size=CONFIG["batch_size"],
    shuffle=True,
    num_workers=2,
)

# Test split: fixed order so predictions line up with dataset.video_ids.
print("\nInitializing Test Loader...")
test_dataset = MultimodalDataset(labels_path=PATHS["test_labels"], paths_config=PATHS)
test_loader = DataLoader(
    dataset=test_dataset,
    batch_size=CONFIG["batch_size"],
    shuffle=False,
    num_workers=2,
)

print("\nDataLoaders successfully initialized.")

In [None]:
class LateFusionBaseline(nn.Module):
    """Three bidirectional LSTM encoders fused by concatenation.

    Each modality (visual, audio, semantic) is summarised by the final
    forward and backward hidden states of its own LSTM. The three summaries
    are concatenated, passed through dropout, and fed to two independent
    MLP heads that output valence and arousal logits.
    """

    def __init__(self, config):
        """Build encoders and heads from ``config``.

        Reads ``visual_dim``, ``audio_dim``, ``semantic_dim``,
        ``lstm_hidden_dim``, ``lstm_layers``, ``dropout`` and
        ``num_classes``.
        """
        super().__init__()
        hidden = config["lstm_hidden_dim"]

        def build_encoder(feature_dim):
            return nn.LSTM(
                input_size=feature_dim,
                hidden_size=hidden,
                num_layers=config["lstm_layers"],
                batch_first=True,
                bidirectional=True,
            )

        # Modules are created in the order visual, audio, semantic so that
        # seeded parameter initialisation and state_dict layout stay fixed.
        self.lstm_v = build_encoder(config["visual_dim"])
        self.lstm_a = build_encoder(config["audio_dim"])
        self.lstm_s = build_encoder(config["semantic_dim"])

        # Two directions per encoder, three encoders.
        fusion_dim = hidden * 2 * 3
        self.dropout = nn.Dropout(config["dropout"])

        def build_head():
            return nn.Sequential(
                nn.Linear(fusion_dim, 128),
                nn.ReLU(),
                nn.Dropout(config["dropout"]),
                nn.Linear(128, config["num_classes"]),
            )

        self.classifier_valence = build_head()
        self.classifier_arousal = build_head()

    @staticmethod
    def _summarise(encoder, sequence):
        """Run ``encoder`` and join its last two hidden-state slices."""
        encoder.flatten_parameters()
        _, (hidden_states, _) = encoder(sequence)
        # The last two slices are the final layer's forward / backward states.
        return torch.cat((hidden_states[-2], hidden_states[-1]), dim=1)

    def forward(self, x_v, x_a, x_s):
        """Return ``(valence_logits, arousal_logits)`` for one batch.

        Inputs are batch-first sequences, one per modality.
        """
        summaries = (
            self._summarise(self.lstm_v, x_v),
            self._summarise(self.lstm_a, x_a),
            self._summarise(self.lstm_s, x_s),
        )
        fused = self.dropout(torch.cat(summaries, dim=1))
        return self.classifier_valence(fused), self.classifier_arousal(fused)

class FocalLoss(nn.Module):
    """Focal loss built on top of per-sample cross-entropy.

    Each sample's cross-entropy is scaled by ``(1 - p_t) ** gamma`` where
    ``p_t = exp(-cross_entropy)``, and optionally by a per-class weight
    looked up from ``alpha`` with the target index.
    """

    def __init__(self, alpha=None, gamma=2.0, reduction='mean'):
        """Store the configuration.

        Args:
            alpha: Optional tensor of per-class weights indexed by target.
            gamma: Exponent of the ``(1 - p_t)`` modulating factor.
            reduction: ``'mean'`` or ``'sum'``; any other value returns the
                unreduced per-sample loss.
        """
        super().__init__()
        self.alpha, self.gamma, self.reduction = alpha, gamma, reduction

    def forward(self, logits, targets):
        """Return the focal loss of ``logits`` against ``targets``."""
        per_sample_ce = F.cross_entropy(logits, targets, reduction='none')
        prob_true = torch.exp(-per_sample_ce)
        modulator = (1 - prob_true) ** self.gamma

        if self.alpha is None:
            loss = modulator * per_sample_ce
        else:
            # Move the class weights next to the logits once; the moved
            # tensor is kept on the module for later calls.
            if self.alpha.device != logits.device:
                self.alpha = self.alpha.to(logits.device)
            loss = self.alpha[targets] * modulator * per_sample_ce

        reducer = {'mean': torch.mean, 'sum': torch.sum}.get(self.reduction)
        return loss if reducer is None else reducer(loss)

def calculate_metrics(y_true, y_pred):
    """Return ``(accuracy, weighted F1)`` for the given labels and predictions."""
    return (
        accuracy_score(y_true, y_pred),
        f1_score(y_true, y_pred, average='weighted'),
    )

def train_epoch(model, loader, crit_v, crit_a, optimizer, device):
    """Run one optimisation pass over ``loader``.

    For every batch the valence and arousal losses are summed and
    back-propagated, followed by one optimizer step.

    Args:
        model: Module called as ``model(visual, audio, semantic)`` and
            returning ``(valence_logits, arousal_logits)``.
        loader: Iterable of dict batches with the keys ``visual``,
            ``audio``, ``semantic``, ``valence`` and ``arousal``.
        crit_v: Loss applied to the valence logits.
        crit_a: Loss applied to the arousal logits.
        optimizer: Optimizer updating ``model``'s parameters.
        device: Device every batch tensor is moved to.

    Returns:
        Sum of the per-batch losses divided by ``len(loader)``.
    """
    model.train()
    fields = ('visual', 'audio', 'semantic', 'valence', 'arousal')
    epoch_loss = 0.0

    for batch in loader:
        visual, audio, semantic, valence_target, arousal_target = (
            batch[name].to(device) for name in fields
        )

        optimizer.zero_grad()
        valence_logits, arousal_logits = model(visual, audio, semantic)
        combined = crit_v(valence_logits, valence_target) + crit_a(arousal_logits, arousal_target)
        combined.backward()
        optimizer.step()

        epoch_loss += combined.item()

    return epoch_loss / len(loader)

def evaluate_epoch(model, loader, crit_v, crit_a, device):
    """Evaluate ``model`` on ``loader`` without gradient tracking.

    Args:
        model: Module called as ``model(visual, audio, semantic)`` and
            returning ``(valence_logits, arousal_logits)``.
        loader: Iterable of dict batches with the keys ``visual``,
            ``audio``, ``semantic``, ``valence`` and ``arousal``.
        crit_v: Loss applied to the valence logits.
        crit_a: Loss applied to the arousal logits.
        device: Device every batch tensor is moved to.

    Returns:
        ``(mean_loss, valence_acc, valence_f1, arousal_acc, arousal_f1)``
        where ``mean_loss`` is the summed per-batch loss divided by
        ``len(loader)`` and the metrics come from ``calculate_metrics``.
    """
    model.eval()
    fields = ('visual', 'audio', 'semantic', 'valence', 'arousal')
    loss_total = 0.0
    predicted = {'valence': [], 'arousal': []}
    expected = {'valence': [], 'arousal': []}

    with torch.no_grad():
        for batch in loader:
            visual, audio, semantic, valence_target, arousal_target = (
                batch[name].to(device) for name in fields
            )
            valence_logits, arousal_logits = model(visual, audio, semantic)

            batch_loss = crit_v(valence_logits, valence_target) + crit_a(arousal_logits, arousal_target)
            loss_total += batch_loss.item()

            per_task = (
                ('valence', valence_logits, valence_target),
                ('arousal', arousal_logits, arousal_target),
            )
            for task, logits, target in per_task:
                predicted[task].extend(torch.argmax(logits, dim=1).cpu().numpy())
                expected[task].extend(target.cpu().numpy())

    valence_acc, valence_f1 = calculate_metrics(expected['valence'], predicted['valence'])
    arousal_acc, arousal_f1 = calculate_metrics(expected['arousal'], predicted['arousal'])
    return loss_total / len(loader), valence_acc, valence_f1, arousal_acc, arousal_f1

# --- Training driver for the late-fusion baseline ---------------------------
# Builds the model, one focal loss per task (weighted by the class weights
# stored in CONFIG), an Adam optimizer, and a plateau scheduler that halves
# the learning rate when the monitored score stops improving.
model = LateFusionBaseline(CONFIG).to(device)
criterion_v = FocalLoss(alpha=CONFIG["valence_weights"], gamma=2.0)
criterion_a = FocalLoss(alpha=CONFIG["arousal_weights"], gamma=2.0)
optimizer = optim.Adam(model.parameters(), lr=CONFIG["learning_rate"], weight_decay=CONFIG["weight_decay"])
# mode='max' because the scheduler is stepped with an F1 score (higher is better).
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='max', factor=0.5, patience=5)

# Early-stopping state: best score seen so far and epochs since it improved.
best_f1 = -1.0
patience_counter = 0
save_path = os.path.join(PATHS["model_save_dir"], "late_fusion_lstm_focal.pth")

print("-" * 95)
print(f"{'Epoch':<6} | {'Train Loss':<10} | {'Val Loss':<10} | {'V-Acc':<6} {'V-F1':<6} | {'A-Acc':<6} {'A-F1':<6} | {'LR'}")
print("-" * 95)

for epoch in range(CONFIG["epochs"]):
    train_loss = train_epoch(model, train_loader, criterion_v, criterion_a, optimizer, device)
    # NOTE(review): the per-epoch evaluation runs on test_loader, and its
    # score drives LR scheduling, checkpoint selection and early stopping
    # below. Confirm whether a separate validation split is intended.
    val_loss, v_acc_v, v_f1_v, v_acc_a, v_f1_a = evaluate_epoch(model, test_loader, criterion_v, criterion_a, device)

    # Model-selection score: mean of the valence and arousal weighted F1.
    avg_f1 = (v_f1_v + v_f1_a) / 2
    scheduler.step(avg_f1)
    current_lr = optimizer.param_groups[0]['lr']

    print(f"{epoch+1:<6} | {train_loss:.4f}     | {val_loss:.4f}     | {v_acc_v:.2f}   {v_f1_v:.2f}   | {v_acc_a:.2f}   {v_f1_a:.2f}   | {current_lr:.1e}")

    # Checkpoint only on strict improvement; otherwise count towards patience.
    if avg_f1 > best_f1:
        best_f1 = avg_f1
        patience_counter = 0
        torch.save(model.state_dict(), save_path)
    else:
        patience_counter += 1

    if patience_counter >= CONFIG["patience"]:
        print(f"\nEarly stopping triggered at epoch {epoch+1}")
        break

print("-" * 95)
print(f"Training finished. Best Average F1: {best_f1:.4f}")

In [None]:
from sklearn.metrics import classification_report

# --- Final evaluation of the best checkpoint --------------------------------
model_name = "late_fusion_lstm_focal.pth"
load_path = os.path.join(PATHS["model_save_dir"], model_name)
print(f"Loading best model from {load_path}...")

# map_location makes the checkpoint loadable on the current device even if
# it was written from a different one (e.g. saved on GPU, reloaded on CPU).
model.load_state_dict(torch.load(load_path, map_location=device))
model.eval()

all_preds_v, all_labels_v = [], []
all_preds_a, all_labels_a = [], []

with torch.no_grad():
    for batch in tqdm(test_loader, desc="Running Inference"):
        x_v = batch['visual'].to(device)
        x_a = batch['audio'].to(device)
        x_s = batch['semantic'].to(device)
        y_v = batch['valence'].to(device)
        y_a = batch['arousal'].to(device)

        logits_v, logits_a = model(x_v, x_a, x_s)

        preds_v = torch.argmax(logits_v, dim=1).cpu().numpy()
        preds_a = torch.argmax(logits_a, dim=1).cpu().numpy()

        all_preds_v.extend(preds_v)
        all_labels_v.extend(y_v.cpu().numpy())
        all_preds_a.extend(preds_a)
        all_labels_a.extend(y_a.cpu().numpy())

# Undo the +1 shift applied to the labels when the dataset was built, so the
# exported CSV uses the original label values.
decoded_preds_v = [p - 1 for p in all_preds_v]
decoded_true_v  = [l - 1 for l in all_labels_v]
decoded_preds_a = [p - 1 for p in all_preds_a]
decoded_true_a  = [l - 1 for l in all_labels_a]

# The test loader does not shuffle, so predictions are in dataset order.
video_ids = test_loader.dataset.video_ids

df_results = pd.DataFrame({
    "video_id": video_ids,
    "valence_true": decoded_true_v,
    "valence_pred": decoded_preds_v,
    "arousal_true": decoded_true_a,
    "arousal_pred": decoded_preds_a
})

df_results["valence_correct"] = df_results["valence_true"] == df_results["valence_pred"]
df_results["arousal_correct"] = df_results["arousal_true"] == df_results["arousal_pred"]

output_file = os.path.join(PATHS["analysis_save_dir"], "late_fusion_lstm_test_predictions.csv")
df_results.to_csv(output_file, index=False)
print(f"\nPredictions saved to: {output_file}")

print("\n" + "="*30)
print("FINAL EVALUATION REPORT")
print("="*30)

target_names = ['Negative (-1)', 'Neutral (0)', 'Positive (1)']
# Explicit class indices keep the report aligned with target_names. Without
# them, classification_report infers the classes from the data and raises
# when one class is missing from both the labels and the predictions.
report_labels = list(range(len(target_names)))

print("\n--- VALENCE ---")
acc_v = accuracy_score(all_labels_v, all_preds_v)
print(f"Accuracy: {acc_v:.4f}")
print(classification_report(all_labels_v, all_preds_v, labels=report_labels, target_names=target_names))

print("\n--- AROUSAL ---")
acc_a = accuracy_score(all_labels_a, all_preds_a)
print(f"Accuracy: {acc_a:.4f}")
print(classification_report(all_labels_a, all_preds_a, labels=report_labels, target_names=target_names))

In [None]:
def _mask_inactive_modalities(active_modality, x_v, x_a, x_s):
    """Return the three inputs with every non-active modality zeroed out."""
    if active_modality != 'visual':
        x_v = torch.zeros_like(x_v)
    if active_modality != 'audio':
        x_a = torch.zeros_like(x_a)
    if active_modality != 'semantic':
        x_s = torch.zeros_like(x_s)
    return x_v, x_a, x_s


def run_unimodal_experiment(active_modality, config, train_loader, test_loader, device, epochs=20):
    """Train and evaluate a fresh model that sees a single modality.

    A new ``LateFusionBaseline`` is trained for ``epochs`` epochs with the
    two inactive modalities replaced by zero tensors, then evaluated on
    ``test_loader`` under the same masking.

    Args:
        active_modality: ``'visual'``, ``'audio'`` or ``'semantic'``.
        config: Configuration dict; supplies the model settings plus
            ``valence_weights``, ``arousal_weights``, ``learning_rate`` and
            ``weight_decay``.
        train_loader: Loader yielding training batches.
        test_loader: Loader yielding evaluation batches.
        device: Device used for the model and the batches.
        epochs: Number of training epochs. Defaults to 20, the value that
            was previously hard-coded.

    Returns:
        ``(valence_acc, valence_f1, arousal_acc, arousal_f1)`` on
        ``test_loader``.

    Raises:
        ValueError: If ``active_modality`` is not a known modality. An
            unknown name used to zero all three inputs silently.
    """
    if active_modality not in ('visual', 'audio', 'semantic'):
        raise ValueError(
            f"active_modality must be 'visual', 'audio' or 'semantic', got {active_modality!r}"
        )

    print(f"\n" + "="*40)
    print(f"STARTING EXPERIMENT: {active_modality.upper()} ONLY")
    print("="*40)

    model_ablation = LateFusionBaseline(config).to(device)
    crit_v = FocalLoss(alpha=config["valence_weights"], gamma=2.0)
    crit_a = FocalLoss(alpha=config["arousal_weights"], gamma=2.0)
    optimizer = optim.Adam(model_ablation.parameters(), lr=config["learning_rate"], weight_decay=config["weight_decay"])

    for epoch in range(epochs):
        model_ablation.train()
        for batch in train_loader:
            x_v, x_a, x_s = _mask_inactive_modalities(
                active_modality,
                batch['visual'].to(device),
                batch['audio'].to(device),
                batch['semantic'].to(device),
            )
            y_v = batch['valence'].to(device)
            y_a = batch['arousal'].to(device)

            optimizer.zero_grad()
            logits_v, logits_a = model_ablation(x_v, x_a, x_s)

            loss = crit_v(logits_v, y_v) + crit_a(logits_a, y_a)
            loss.backward()
            optimizer.step()

    model_ablation.eval()
    all_preds_v, all_labels_v = [], []
    all_preds_a, all_labels_a = [], []

    with torch.no_grad():
        for batch in test_loader:
            # Evaluation uses the same masking as training.
            x_v, x_a, x_s = _mask_inactive_modalities(
                active_modality,
                batch['visual'].to(device),
                batch['audio'].to(device),
                batch['semantic'].to(device),
            )
            y_v = batch['valence'].to(device)
            y_a = batch['arousal'].to(device)

            logits_v, logits_a = model_ablation(x_v, x_a, x_s)

            all_preds_v.extend(torch.argmax(logits_v, dim=1).cpu().numpy())
            all_labels_v.extend(y_v.cpu().numpy())
            all_preds_a.extend(torch.argmax(logits_a, dim=1).cpu().numpy())
            all_labels_a.extend(y_a.cpu().numpy())

    acc_v, f1_v = calculate_metrics(all_labels_v, all_preds_v)
    acc_a, f1_a = calculate_metrics(all_labels_a, all_preds_a)
    return acc_v, f1_v, acc_a, f1_a

# Train / evaluate one single-modality model per modality, in this order.
modalities = ['visual', 'audio', 'semantic']
results = {}

for mod in modalities:
    results[mod] = run_unimodal_experiment(
        active_modality=mod,
        config=CONFIG,
        train_loader=train_loader,
        test_loader=test_loader,
        device=device,
    )

# Summary table: one row per modality, metrics in the order they are returned.
print(
    "\n" + "="*65,
    "MODALITY CONTRIBUTION ANALYSIS (Unimodal Performance)",
    "="*65,
    f"{'Modality':<15} | {'V-Acc':<8} {'V-F1':<8} | {'A-Acc':<8} {'A-F1':<8} | {'Avg F1'}",
    "-" * 65,
    sep="\n",
)

for mod, (acc_v, f1_v, acc_a, f1_a) in results.items():
    avg_f1 = (f1_v + f1_a) / 2
    print(f"{mod.capitalize():<15} | {acc_v:.4f}   {f1_v:.4f}   | {acc_a:.4f}   {f1_a:.4f}   | {avg_f1:.4f}")
print("-" * 65)