In [1]:
import torch
import torchaudio
import torchvision.transforms as transforms
from torch.utils.data import Dataset, DataLoader
from PIL import Image
import os

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

In [2]:
class EmotionDataset(Dataset):
    def __init__(self, image_dir, audio_dir, transform=None):
        self.image_dir = image_dir
        self.audio_dir = audio_dir
        self.transform = transform
        
        self.image_files = sorted(os.listdir(image_dir))
        self.audio_files = sorted(os.listdir(audio_dir))
    
    def __getitem__(self, idx):
        img_path = os.path.join(self.image_dir, self.image_files[idx])
        image = Image.open(img_path).convert("RGB")
        if self.transform:
            image = self.transform(image)

        audio_path = os.path.join(self.audio_dir, self.audio_files[idx])
        waveform, sample_rate = torchaudio.load(audio_path)
        mel_spectrogram = torchaudio.transforms.MelSpectrogram(sample_rate=sample_rate, n_mels=64)(waveform)
        
        return image, mel_spectrogram

In [3]:
image_transform = transforms.Compose([
    transforms.Grayscale(num_output_channels=1),
    transforms.Resize((112, 112)),
    transforms.ToTensor(),
    transforms.Normalize((0.5,), (0.5,))
])

trainRage_dataset = EmotionDataset("data/train/trainRage", "data/train/trainAudio", transform=image_transform)
testRage_dataset = EmotionDataset("data/test/testRage", "data/test/testAudio", transform=image_transform)

trainRage_loader = DataLoader(trainRage_dataset, batch_size=32, shuffle=True, num_workers=4)
testRage_loader = DataLoader(testRage_dataset, batch_size=32, shuffle=False, num_workers=4)

for img, audio in train_loader:
    print("image shape:", img.shape)  
    print("audio shape:", audio.shape)  
    break

TypeError: object of type 'EmotionDataset' has no len()