In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchaudio
from torch.utils.data import DataLoader
import torchvision.transforms as transforms
import torchaudio.transforms as T
from tqdm import tqdm
import numpy as np
from sklearn.metrics import classification_report

from utility.data_loader import EmotionDataset
from utility.fusion import LateFusion
from utility.models import AudioCNN, VideoCNN
import sys

In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


In [3]:
image_transform = transforms.Compose([
    transforms.Grayscale(num_output_channels=1),
    transforms.Resize((112, 112)),
    transforms.ToTensor(),
    transforms.Normalize((0.5,), (0.5,))
])

audio_transform = transforms.Compose([
    T.MelSpectrogram(
        sample_rate=16000,
        n_mels=64,
        n_fft=1024,
        hop_length=512
    ),
    transforms.Resize((64, 64)) 
])

## Load the data

In [4]:
import os

# Check if the first audio file exists
audio_path = os.path.normpath("../data/train/Audio/eric_excitement1/eric_excitement1.wav")
print(f"File exists: {os.path.exists(audio_path)}")
print(f"Absolute path: {os.path.abspath(audio_path)}")

File exists: True
Absolute path: C:\Users\ahmad\PycharmProjects\cmpt419-project\data\train\Audio\eric_excitement1\eric_excitement1.wav


In [5]:
emotion_classes = ['Rage', 'Excitement', 'Fear', 'Frustration']

train_dataset = EmotionDataset(
    root_dir="../data/train",
    transform=image_transform,
    audio_transform=audio_transform,
)

test_dataset = EmotionDataset(
    root_dir="../data/test",
    transform=image_transform,
    audio_transform=audio_transform,
)

print(f"Train dataset size: {len(train_dataset)}")
print(f"Test dataset size: {len(test_dataset)}")

train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True, num_workers=4)
test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False, num_workers=4)


Train dataset size: 142
Test dataset size: 61


## Initialize the models

In [6]:
video_model = VideoCNN(num_classes=len(emotion_classes)).to(device)
audio_model = AudioCNN(num_classes=len(emotion_classes)).to(device)
fusion_model = LateFusion(num_classes=len(emotion_classes), fusion_type='mlp').to(device)



## Define loss and optimizers

In [7]:
criterion = nn.CrossEntropyLoss()
video_optimizer = optim.Adam(video_model.parameters(), lr=0.001)
audio_optimizer = optim.Adam(audio_model.parameters(), lr=0.001)
fusion_optimizer = optim.Adam(fusion_model.parameters(), lr=0.001)

## Training functions

In [8]:
def train_video_model(epochs=10):
    video_model.train()
    for epoch in range(epochs):
        running_loss = 0.0
        correct = 0
        total = 0
        
        for batch in tqdm(train_loader, desc=f"Video Epoch {epoch+1}"):
            audio, frames, labels = batch
            frames = frames.to(device)
            labels = labels.to(device)
            
            video_optimizer.zero_grad()
            outputs = video_model(frames)
            loss = criterion(outputs, labels)
            loss.backward()
            video_optimizer.step()
            
            running_loss += loss.item()
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
        
        epoch_loss = running_loss / len(train_loader)
        epoch_acc = 100 * correct / total
        print(f"Video Model - Epoch {epoch+1}: Loss: {epoch_loss:.4f}, Acc: {epoch_acc:.2f}%")

def train_audio_model(epochs=10):
    audio_model.train()
    for epoch in range(epochs):
        running_loss = 0.0
        correct = 0
        total = 0
        
        for batch in tqdm(train_loader, desc=f"Audio Epoch {epoch+1}"):
            audio, frames, labels = batch
            audio = audio.to(device)
            labels = labels.to(device)
            
            audio_optimizer.zero_grad()
            outputs = audio_model(audio)
            loss = criterion(outputs, labels)
            loss.backward()
            audio_optimizer.step()
            
            running_loss += loss.item()
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
        
        epoch_loss = running_loss / len(train_loader)
        epoch_acc = 100 * correct / total
        print(f"Audio Model - Epoch {epoch+1}: Loss: {epoch_loss:.4f}, Acc: {epoch_acc:.2f}%")

## Train the models

In [10]:
train_video_model(epochs=5)
train_audio_model(epochs=5)

Video Epoch 1: 100%|██████████| 18/18 [00:12<00:00,  1.42it/s]


Video Model - Epoch 1: Loss: 2.0101, Acc: 29.58%


Video Epoch 2: 100%|██████████| 18/18 [00:11<00:00,  1.55it/s]


Video Model - Epoch 2: Loss: 1.2690, Acc: 44.37%


Video Epoch 3: 100%|██████████| 18/18 [00:11<00:00,  1.51it/s]


Video Model - Epoch 3: Loss: 1.1772, Acc: 46.48%


Video Epoch 4: 100%|██████████| 18/18 [00:11<00:00,  1.51it/s]


Video Model - Epoch 4: Loss: 1.1611, Acc: 48.59%


Video Epoch 5: 100%|██████████| 18/18 [00:11<00:00,  1.55it/s]


Video Model - Epoch 5: Loss: 1.2428, Acc: 44.37%


Audio Epoch 1: 100%|██████████| 18/18 [00:11<00:00,  1.62it/s]


Audio Model - Epoch 1: Loss: 1.5497, Acc: 50.00%


Audio Epoch 2: 100%|██████████| 18/18 [00:11<00:00,  1.57it/s]


Audio Model - Epoch 2: Loss: 1.1537, Acc: 57.04%


Audio Epoch 3: 100%|██████████| 18/18 [00:11<00:00,  1.57it/s]


Audio Model - Epoch 3: Loss: 1.0464, Acc: 57.04%


Audio Epoch 4: 100%|██████████| 18/18 [00:11<00:00,  1.55it/s]


Audio Model - Epoch 4: Loss: 0.9149, Acc: 72.54%


Audio Epoch 5: 100%|██████████| 18/18 [00:11<00:00,  1.52it/s]

Audio Model - Epoch 5: Loss: 0.7733, Acc: 69.72%





## Evaluate the models

In [14]:
print(f"Test dataset size: {len(test_dataset)}")

Test dataset size: 61


In [18]:
from sklearn.metrics import classification_report
import torch
from tqdm import tqdm

def evaluate_models():
    video_model.eval()
    audio_model.eval()
    fusion_model.eval()
    
    all_video_preds = []
    all_audio_preds = []
    all_fusion_preds = []
    all_labels = []
    
    with torch.no_grad():
        for batch_idx, batch in enumerate(tqdm(test_loader, desc="Evaluating")):
            try:
                # Unpack batch
                audio, frames, labels = batch

                # Sanity check
                print(f"Batch {batch_idx}: audio {audio.shape}, frames {frames.shape}, labels {labels.shape}")

                # Move to device
                audio = audio.to(device)
                frames = frames.to(device)
                labels = labels.to(device)

                # Forward pass
                video_outputs = video_model(frames)
                audio_outputs = audio_model(audio)
                fusion_outputs = fusion_model(video_outputs, audio_outputs)

                # Predictions
                _, video_preds = torch.max(video_outputs, dim=1)
                _, audio_preds = torch.max(audio_outputs, dim=1)
                _, fusion_preds = torch.max(fusion_outputs, dim=1)

                # Collect results
                all_video_preds.extend(video_preds.cpu().numpy())
                all_audio_preds.extend(audio_preds.cpu().numpy())
                all_fusion_preds.extend(fusion_preds.cpu().numpy())
                all_labels.extend(labels.cpu().numpy())

            except Exception as e:
                print(f"❌ Exception during evaluation (batch {batch_idx}): {e}")
                continue

    # Print results only if we collected predictions
    if all_labels:
        print("\n🎬 Video Model Results:")
        print(classification_report(all_labels, all_video_preds, target_names=emotion_classes))

        print("\n🔊 Audio Model Results:")
        print(classification_report(all_labels, all_audio_preds, target_names=emotion_classes))

        print("\n🤝 Fusion Model Results:")
        print(classification_report(all_labels, all_fusion_preds, target_names=emotion_classes))
    else:
        print("⚠️ No predictions collected. Check your DataLoader or model output.")


In [19]:
evaluate_models()

Evaluating:   0%|          | 0/8 [00:00<?, ?it/s]

Batch 0: audio torch.Size([8, 1, 64, 64]), frames torch.Size([8, 20, 1, 112, 112]), labels torch.Size([8])


Evaluating:  12%|█▎        | 1/8 [00:07<00:54,  7.82s/it]

Batch 1: audio torch.Size([8, 1, 64, 64]), frames torch.Size([8, 20, 1, 112, 112]), labels torch.Size([8])


Evaluating:  38%|███▊      | 3/8 [00:08<00:10,  2.15s/it]

Batch 2: audio torch.Size([8, 1, 64, 64]), frames torch.Size([8, 20, 1, 112, 112]), labels torch.Size([8])
Batch 3: audio torch.Size([8, 1, 64, 64]), frames torch.Size([8, 20, 1, 112, 112]), labels torch.Size([8])


Evaluating:  62%|██████▎   | 5/8 [00:08<00:03,  1.10s/it]

Batch 4: audio torch.Size([8, 1, 64, 64]), frames torch.Size([8, 20, 1, 112, 112]), labels torch.Size([8])


Evaluating:  75%|███████▌  | 6/8 [00:09<00:02,  1.22s/it]

Batch 5: audio torch.Size([8, 1, 64, 64]), frames torch.Size([8, 20, 1, 112, 112]), labels torch.Size([8])


Evaluating:  88%|████████▊ | 7/8 [00:10<00:00,  1.05it/s]

Batch 6: audio torch.Size([8, 1, 64, 64]), frames torch.Size([8, 20, 1, 112, 112]), labels torch.Size([8])
Batch 7: audio torch.Size([5, 1, 64, 64]), frames torch.Size([5, 20, 1, 112, 112]), labels torch.Size([5])


Evaluating: 100%|██████████| 8/8 [00:11<00:00,  1.38s/it]


🎬 Video Model Results:
              precision    recall  f1-score   support

        Rage       0.54      0.45      0.49        29
  Excitement       0.31      0.69      0.43        16
        Fear       0.00      0.00      0.00         9
 Frustration       1.00      0.14      0.25         7

    accuracy                           0.41        61
   macro avg       0.46      0.32      0.29        61
weighted avg       0.45      0.41      0.38        61


🔊 Audio Model Results:
              precision    recall  f1-score   support

        Rage       0.54      0.69      0.61        29
  Excitement       0.47      0.50      0.48        16
        Fear       0.29      0.22      0.25         9
 Frustration       0.00      0.00      0.00         7

    accuracy                           0.49        61
   macro avg       0.32      0.35      0.34        61
weighted avg       0.42      0.49      0.45        61


🤝 Fusion Model Results:
              precision    recall  f1-score   support

  


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
