In [2]:
import pandas as pd

import torch
import torchaudio
import torch.nn as nn

import torchaudio.functional as F
import torchaudio.transforms as T

from torch.utils.data.dataloader import Dataset, T_co
from sklearn import preprocessing
import librosa

In [3]:
# Audio is handled at 44.1 kHz; every clip is cut or padded to exactly
# five seconds' worth of frames (5 * 44100 = 220500).
SAMPLE_RATE = 44100
NUM_FRAMES = 5 * SAMPLE_RATE

# NOTE: despite its name, ANNOTATIONS_DIR is the path of the reference CSV
# file (it is passed to pd.read_csv), not a directory.
ANNOTATIONS_DIR = "/home/michael/Documents/fontys/semester_7/depmAInd/empathic_art/empathic-art/data/reference_df.csv"
# Directory that the filenames listed in the reference CSV are resolved against.
AUDIO_DIR = "/home/michael/Documents/fontys/semester_7/depmAInd/empathic_art/empathic-art/data/utoronto/data"

In [None]:
# TODO: trim leading/trailing silence ("whitespace") from all sound files.
# The original cell stopped at a dangling `librosa.` attribute access, which
# is a syntax error and broke Restart & Run All. `librosa.effects.trim` is a
# likely candidate for the implementation -- confirm before wiring it in.

In [4]:
class SpecgramDataset(Dataset):
    """Map-style dataset yielding ``(spectrogram, label)`` pairs for one corpus.

    The reference CSV is read once at construction time. Only the rows that
    belong to ``dataset`` are kept, and their ``emotion`` column is encoded to
    integer class ids. Each item loads the audio file named in the row's
    ``filename`` column, resamples it to ``sample_rate`` if needed, cuts or
    right-pads it to exactly ``num_frames`` frames and finally applies
    ``to_specgram``.
    """

    def __init__(self, annotations: str = ANNOTATIONS_DIR, audio_dir: str = AUDIO_DIR,
                 to_specgram = None, sample_rate: int = SAMPLE_RATE, num_frames: int = NUM_FRAMES,
                 dataset: str = "tess") -> None:
        """Load the annotations and fit the label encoder.

        Args:
            annotations: Path of the reference CSV file. It must contain the
                columns ``emotion`` and ``filename``.
            audio_dir: Directory the ``filename`` values are relative to.
            to_specgram: Callable applied to the fixed-length waveform. When
                ``None`` the waveform is returned untransformed.
            sample_rate: Sample rate every waveform is resampled to.
            num_frames: Exact number of frames every waveform is cut/padded to.
            dataset: Corpus tag; a row is kept when any of its columns equals
                this value.

        Raises:
            ValueError: If no row of the CSV matches ``dataset``.
        """
        super().__init__()
        reference = pd.read_csv(annotations)

        # NOTE(review): the original compared the whole frame against "tess",
        # which masks individual cells (turning them into NaN) instead of
        # selecting rows. The name of the column holding the corpus tag is not
        # visible here, so a row is kept when *any* column matches -- narrow
        # this to the real column once it is confirmed.
        in_dataset = (reference == dataset).any(axis=1)
        if not in_dataset.any():
            raise ValueError(f"no rows in {annotations!r} match dataset {dataset!r}")

        # Store the filtered frame with a fresh 0..n-1 index so positional
        # lookups in __getitem__ line up with the encoded labels.
        self.annotations = reference[in_dataset].reset_index(drop=True)

        # The encoder is kept on the instance so callers can map predicted
        # class ids back to emotion names via `inverse_transform`.
        self.label_encoder = preprocessing.LabelEncoder()
        self.labels = self.label_encoder.fit_transform(self.annotations["emotion"])

        self.audio_dir = audio_dir

        self.to_specgram = to_specgram

        self.sample_rate = sample_rate
        self.num_frames = num_frames

    def __len__(self) -> int:
        """Return the number of samples that belong to the selected corpus."""
        return len(self.labels)

    def __getitem__(self, index: int) -> T_co:
        """Return ``(specgram, label)`` for the sample at position ``index``.

        ``specgram`` is the output of ``to_specgram`` applied to the
        fixed-length waveform, or that waveform itself when no transform was
        configured. ``label`` is the encoded emotion id.
        """
        label = self.labels[index]
        filename = self.annotations.iloc[index]["filename"]
        waveform, sr = torchaudio.load(f"{self.audio_dir}/{filename}")

        waveform = self._resample_if_necessary(waveform, sr)
        waveform = self._cut_if_necessary(waveform)
        waveform = self._right_pad_if_necessary(waveform)

        # `to_specgram` defaults to None; calling it unconditionally would
        # raise TypeError, so fall back to the raw fixed-length waveform.
        if self.to_specgram is None:
            return waveform, label

        specgram = self.to_specgram(waveform)

        return specgram, label

    def _cut_if_necessary(self, waveform: torch.Tensor) -> torch.Tensor:
        """Truncate ``waveform`` along dim 1 to at most ``self.num_frames``."""
        num_frames = waveform.shape[1]
        if num_frames > self.num_frames:
            waveform = waveform[:, :self.num_frames]
        return waveform

    def _right_pad_if_necessary(self, waveform: torch.Tensor) -> torch.Tensor:
        """Zero-pad the end of ``waveform`` up to ``self.num_frames`` frames."""
        num_frames = waveform.shape[1]
        if num_frames < self.num_frames:
            missing_frames = self.num_frames - num_frames
            # (left, right) padding of the last dimension only.
            last_dim_padding = (0, missing_frames)
            waveform = nn.functional.pad(waveform, last_dim_padding)
        return waveform

    def _resample_if_necessary(self, waveform: torch.Tensor, sr: int) -> torch.Tensor:
        """Resample ``waveform`` from ``sr`` to ``self.sample_rate`` when they differ."""
        if self.sample_rate != sr:
            resampler = torchaudio.transforms.Resample(sr, self.sample_rate)
            waveform = resampler(waveform)
        return waveform