In [12]:
import torch
from torch import nn
from torch.optim import Adam
from torch.utils.data import DataLoader
import numpy as np
from torch.utils.data import Dataset
import csv
import json
import pytorch_lightning as pl
import os

In [14]:
# Configure the CUDA caching allocator. The value must use the
# "<option>:<value>" syntax; the previous 'max_split_size_mb=xxx' was a
# placeholder that is not a valid setting. The variable is read when the
# allocator initialises, so this cell has to run before anything touches CUDA.
# NOTE(review): in the recorded OOM, reserved memory (3.49 GiB) is about equal
# to allocated memory (3.48 GiB), so fragmentation is not the cause and this
# setting alone will not make the model fit.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:128"

In [15]:
class RavdessRawExample:
    """One row of example metadata: an id, two feature-file paths and a label.

    The constructor only stores the values it is given; nothing is loaded
    from disk or validated here.
    """

    def __init__(self, example_id, video_feature_path, audio_feature_path, label):
        # Single tuple assignment; attribute order matches the parameter order.
        (
            self.example_id,
            self.video_feature_path,
            self.audio_feature_path,
            self.label,
        ) = (example_id, video_feature_path, audio_feature_path, label)


class RavdessDataset(Dataset):
    """Map-style dataset that loads per-example feature arrays from ``.npy`` files.

    Args:
        examples: Sequence of objects exposing ``video_feature_path``,
            ``audio_feature_path`` and ``label`` attributes.
        vocab: Mapping that contains a ``"label2idx"`` dict from label string
            to integer class id.
    """

    def __init__(self, examples, vocab):
        super().__init__()
        self.examples = examples
        self.vocab = vocab

    def __len__(self):
        """Return the number of examples."""
        return len(self.examples)

    def __getitem__(self, index):
        """Load the features of one example and resolve its label id.

        Returns:
            dict with keys ``"video"`` and ``"audio"`` (arrays as returned by
            ``np.load``) and ``"label_id"`` (the value found in ``label2idx``).

        Raises:
            ValueError: If the label is not in ``vocab["label2idx"]``, neither
                as written nor lower-cased.
        """
        example = self.examples[index]
        video_features = np.load(example.video_feature_path)
        audio_features = np.load(example.audio_feature_path)

        label2idx = self.vocab["label2idx"]
        # Try the label exactly as stored first so vocabularies with
        # mixed-case keys keep working, then fall back to the lower-cased
        # form. ``.get`` is used so an unknown label reaches the ValueError
        # below instead of escaping as a bare KeyError.
        label_id = label2idx.get(example.label)
        if label_id is None:
            label_id = label2idx.get(example.label.lower())
        if label_id is None:
            raise ValueError(f"Unknown label: {example.label}")
        return {"video": video_features, "audio": audio_features, "label_id": label_id}




In [16]:
def load_vocab(vocab_filepath):
    """Parse the JSON file at ``vocab_filepath`` and return the decoded object."""
    with open(vocab_filepath, 'r') as vocab_file:
        raw_text = vocab_file.read()
    return json.loads(raw_text)

In [17]:

def load_examples(examples_path):
    """Read the examples CSV and build one ``RavdessRawExample`` per data row.

    The first row is treated as a header and skipped. Each remaining row must
    provide, in order: example id, video feature path, audio feature path and
    label. Extra columns are ignored; completely blank lines are skipped.

    Args:
        examples_path: Path to the CSV file.

    Returns:
        list of ``RavdessRawExample`` in file order.

    Raises:
        ValueError: If a non-blank row has fewer than four columns.
    """
    examples = []
    # The csv module asks for files to be opened with newline='' so that it
    # can handle line endings and embedded newlines itself.
    with open(examples_path, 'r', newline='') as file:
        reader = csv.reader(file)
        next(reader, None)  # skip the header row (no-op on an empty file)
        for row in reader:
            if not row:
                # csv.reader yields [] for blank lines, e.g. a trailing one.
                continue
            if len(row) < 4:
                raise ValueError(
                    f"{examples_path}, line {reader.line_num}: expected at least "
                    f"4 columns (id, video path, audio path, label), got {len(row)}"
                )
            example_id, video_feature_path, audio_feature_path, label = row[:4]
            examples.append(
                RavdessRawExample(example_id, video_feature_path, audio_feature_path, label)
            )
    return examples





In [18]:
class EmotionRecognitionModel(pl.LightningModule):
    """Late-fusion audio/video classifier.

    Each modality is projected by its own linear layer (audio -> 256,
    video -> 512); the projections are concatenated and mapped to
    ``num_classes`` logits by a final linear layer. Training uses
    cross-entropy loss and Adam with a learning rate of 0.001.

    Args:
        audio_input_size: Number of values per flattened audio example.
        video_input_size: Number of values per flattened video example.
        num_classes: Number of output classes.
        video_feature_dim: Optional width of a single video feature vector.
            With the default ``None`` the whole flattened video vector is fed
            to the video projection, which then holds
            ``video_input_size * 512`` weights; for very large inputs that one
            layer dominates memory use. When set, each video example is
            reshaped to ``(video_input_size // video_feature_dim,
            video_feature_dim)`` and averaged over the first axis before the
            projection, so the layer only holds ``video_feature_dim * 512``
            weights. This assumes the flattened example is laid out row-major
            as (vectors, video_feature_dim); confirm against the feature
            extractor before enabling it.

    Raises:
        ValueError: If ``video_feature_dim`` is given but is not a positive
            divisor of ``video_input_size``.
    """

    AUDIO_HIDDEN_SIZE = 256
    VIDEO_HIDDEN_SIZE = 512

    def __init__(self, audio_input_size, video_input_size, num_classes, video_feature_dim=None):
        super().__init__()
        if video_feature_dim is not None and (
            video_feature_dim <= 0 or video_input_size % video_feature_dim != 0
        ):
            raise ValueError(
                f"video_feature_dim ({video_feature_dim}) must be a positive "
                f"divisor of video_input_size ({video_input_size})"
            )

        self.audio_input_size = audio_input_size
        self.video_input_size = video_input_size
        self.num_classes = num_classes
        self.video_feature_dim = video_feature_dim

        video_fc_in = video_input_size if video_feature_dim is None else video_feature_dim

        self.audio_fc = nn.Linear(self.audio_input_size, self.AUDIO_HIDDEN_SIZE)
        self.video_fc = nn.Linear(video_fc_in, self.VIDEO_HIDDEN_SIZE)
        # The fusion layer consumes the concatenation of both projections.
        self.fc = nn.Linear(self.AUDIO_HIDDEN_SIZE + self.VIDEO_HIDDEN_SIZE, self.num_classes)

        self.loss_fn = nn.CrossEntropyLoss()

    def forward(self, audio, video):
        """Return class logits of shape ``(batch, num_classes)``.

        Both inputs are flattened to ``(-1, <modality>_input_size)`` first, so
        any leading/trailing layout is accepted as long as the element count
        per example matches.
        """
        # reshape (rather than view) also accepts non-contiguous inputs.
        audio = audio.reshape(-1, self.audio_input_size)
        audio_features = self.audio_fc(audio)

        video = video.reshape(-1, self.video_input_size)
        if self.video_feature_dim is not None:
            # Mean-pool the per-example feature vectors before projecting.
            video = video.reshape(video.size(0), -1, self.video_feature_dim).mean(dim=1)
        video_features = self.video_fc(video)

        combined_features = torch.cat((audio_features, video_features), dim=1)
        return self.fc(combined_features)

    def _shared_step(self, batch, log_name):
        """Run the forward pass on ``batch``, log the loss as ``log_name`` and return it."""
        logits = self(batch['audio'], batch['video'])
        loss = self.loss_fn(logits, batch['label_id'])
        self.log(log_name, loss)
        return loss

    def training_step(self, batch, batch_idx):
        """Compute, log (``train_loss``) and return the training loss."""
        return self._shared_step(batch, 'train_loss')

    def validation_step(self, batch, batch_idx):
        """Compute and log the validation loss (``val_loss``)."""
        self._shared_step(batch, 'val_loss')

    def test_step(self, batch, batch_idx):
        """Compute and log the test loss (``test_loss``)."""
        self._shared_step(batch, 'test_loss')

    def configure_optimizers(self):
        """Return an Adam optimizer over all parameters (lr=0.001)."""
        return Adam(self.parameters(), lr=0.001)




In [19]:
class EmotionRecognitionDataModule(pl.LightningDataModule):
    """Builds train/validation/test datasets and loaders from a CSV and a vocab file.

    The examples are split by position in the CSV: the first ``train_size``
    rows are used for training, the next ``val_size`` rows for validation and
    everything after that for testing. The defaults (31 and 7) reproduce the
    previously hard-coded boundaries ``[:31]``, ``[31:38]`` and ``[38:]``.

    Args:
        examples_path: Path to the examples CSV.
        vocab_filepath: Path to the vocabulary JSON.
        train_size: Number of leading examples used for training.
        val_size: Number of examples after the training slice used for validation.
        batch_size: Batch size for all three loaders (1 matches the
            ``DataLoader`` default used before).
        num_workers: Worker processes for the train and validation loaders.
        test_num_workers: Worker processes for the test loader.
        shuffle_train: Whether the training loader reshuffles every epoch.
            Defaults to True; pass False to restore the fixed file-order
            iteration used before.

    Raises:
        ValueError: If ``train_size`` or ``val_size`` is negative.
    """

    def __init__(
        self,
        examples_path,
        vocab_filepath,
        train_size=31,
        val_size=7,
        batch_size=1,
        num_workers=0,
        test_num_workers=12,
        shuffle_train=True,
    ):
        super().__init__()
        if train_size < 0 or val_size < 0:
            # Negative sizes would silently turn into "count from the end" slices.
            raise ValueError("train_size and val_size must be non-negative")
        self.examples_path = examples_path
        self.vocab_filepath = vocab_filepath
        self.train_size = train_size
        self.val_size = val_size
        self.batch_size = batch_size
        self.num_workers = num_workers
        self.test_num_workers = test_num_workers
        self.shuffle_train = shuffle_train

    def prepare_data(self):
        """Nothing to download or preprocess."""
        pass

    def setup(self, stage=None):
        """Load the vocab and examples and create the three datasets.

        ``stage`` is accepted for the Lightning hook signature but not used:
        all three splits are always built.
        """
        vocab = load_vocab(self.vocab_filepath)
        examples = load_examples(self.examples_path)

        val_end = self.train_size + self.val_size
        self.train_dataset = RavdessDataset(examples[:self.train_size], vocab)
        self.val_dataset = RavdessDataset(examples[self.train_size:val_end], vocab)
        self.test_dataset = RavdessDataset(examples[val_end:], vocab)

    def train_dataloader(self):
        """Return the training loader (shuffled when ``shuffle_train`` is True)."""
        return DataLoader(
            self.train_dataset,
            batch_size=self.batch_size,
            shuffle=self.shuffle_train,
            num_workers=self.num_workers,
        )

    def val_dataloader(self):
        """Return the validation loader (never shuffled)."""
        return DataLoader(
            self.val_dataset,
            batch_size=self.batch_size,
            num_workers=self.num_workers,
        )

    def test_dataloader(self):
        """Return the test loader (never shuffled)."""
        return DataLoader(
            self.test_dataset,
            batch_size=self.batch_size,
            num_workers=self.test_num_workers,
        )



In [20]:
def main(
    examples_path="/mnt/c/users/admin/desktop/github/bimodal_emotion_recognition_with_ravdess_dataset/examples.csv",
    vocab_filepath="/mnt/c/users/admin/desktop/github/bimodal_emotion_recognition_with_ravdess_dataset/vocab.json",
):
    """Train the model for two epochs, evaluate on the test split and print its size.

    Args:
        examples_path: Path to the examples CSV. Defaults to the location that
            was previously hard-coded.
        vocab_filepath: Path to the vocabulary JSON. Defaults to the location
            that was previously hard-coded.
    """
    audio_input_size = 512
    video_input_size = 1568 * 768
    num_classes = 8

    # NOTE(review): with these sizes the video projection alone has
    # 1568 * 768 * 512 (about 616M) weights, roughly 2.4 GB in fp32 before
    # gradients and optimizer state. The recorded run on a 4 GiB GPU ends in
    # a CUDA out-of-memory error. Reduce the video input (e.g. pool the
    # features before the linear layer) or train on a device with more memory.
    data_module = EmotionRecognitionDataModule(examples_path, vocab_filepath)
    model = EmotionRecognitionModel(audio_input_size, video_input_size, num_classes)
    trainer = pl.Trainer(accelerator="gpu", precision=16, max_epochs=2)
    trainer.fit(model, data_module)
    # Hand the datamodule to the trainer so Lightning runs its setup hook and
    # builds the test loader itself, rather than relying on the state left
    # behind by fit().
    trainer.test(model=model, datamodule=data_module)
    test_dataset_length = len(data_module.test_dataset)
    print(test_dataset_length)

if __name__ == "__main__":
    main()

Using 16bit Automatic Mixed Precision (AMP)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name     | Type             | Params
----------------------------------------------
0 | audio_fc | Linear           | 131 K 
1 | video_fc | Linear           | 616 M 
2 | fc       | Linear           | 6.2 K 
3 | loss_fn  | CrossEntropyLoss | 0     
----------------------------------------------
616 M     Trainable params
0         Non-trainable params
616 M     Total params
2,466.803 Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

OutOfMemoryError: CUDA out of memory. Tried to allocate 2.30 GiB (GPU 0; 4.00 GiB total capacity; 3.48 GiB already allocated; 0 bytes free; 3.49 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [21]:
!nvidia-smi

Tue Aug 29 18:53:21 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.104      Driver Version: 528.79       CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA GeForce ...  On   | 00000000:01:00.0 Off |                  N/A |
| N/A   45C    P5    N/A /  N/A |    129MiB /  4096MiB |      1%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces