# Extracting Mel Spectrograms


In [11]:
from torch.utils.data import Dataset
import torchaudio
import pandas as pd
import torch
import os

We can create a custom dataset by subclassing PyTorch's `Dataset` class and overriding:
- \_\_len\_\_(): what `len(data)` should return
- \_\_getitem\_\_(): what `data[i]` should return

In [36]:
class UrbanSoundDataset(Dataset):
    """Map-style dataset yielding ``(transformed_signal, label)`` pairs.

    Each row of the annotations CSV describes one audio file. The file is
    loaded, resampled to ``target_sample_rate``, mixed down to one channel
    and finally passed through ``transformation``.

    Args:
        annotations_path: Path of the annotations CSV read with ``pd.read_csv``.
        audio_dir: Root directory that contains the ``fold<N>`` sub-folders.
        transformation: Callable applied to the single-channel signal
            (e.g. a torchaudio transform).
        target_sample_rate: Sample rate every signal is converted to.
    """

    # Positional columns of the annotations CSV that this dataset reads.
    _FILENAME_COLUMN = 0  # audio file name
    _FOLD_COLUMN = 5      # fold number; files live under ``fold<N>``
    _LABEL_COLUMN = 6     # value returned as the label

    def __init__(self, annotations_path, audio_dir, transformation, target_sample_rate):
        self.annotations = pd.read_csv(annotations_path)
        self.audio_dir = audio_dir
        self.transformation = transformation
        self.target_sample_rate = target_sample_rate
        # Resample transforms keyed by (source rate, target rate). Building a
        # Resample object is the expensive part, so each one is created once
        # and reused for every file recorded at the same rate.
        self._resamplers = {}

    def __len__(self):
        """Return the number of annotated audio files."""
        return len(self.annotations)

    def __getitem__(self, index):
        """Load the file at row ``index`` and return ``(signal, label)``."""
        audio_sample_path = self._get_audio_sample_path(index)
        label = self._get_audio_sample_label(index)
        signal, sr = torchaudio.load(audio_sample_path)
        signal = self._resample(signal, sr)
        signal = self._mix_down(signal)
        signal = self.transformation(signal)
        return signal, label

    def _get_audio_sample_path(self, index):
        """Build ``<audio_dir>/fold<N>/<filename>`` for row ``index``."""
        folder = f'fold{self.annotations.iloc[index, self._FOLD_COLUMN]}'
        filename = self.annotations.iloc[index, self._FILENAME_COLUMN]
        return os.path.join(self.audio_dir, folder, filename)

    def _get_audio_sample_label(self, index):
        """Return the label stored in the annotations for row ``index``."""
        return self.annotations.iloc[index, self._LABEL_COLUMN]

    def _resample(self, signal, sr):
        """Bring ``signal`` to ``target_sample_rate``.

        The signal is returned untouched when ``sr`` already equals the
        target rate. Otherwise a cached ``Resample`` transform for this pair
        of rates is applied (and created first if it does not exist yet).
        """
        if sr == self.target_sample_rate:
            return signal

        # The target rate is part of the key so that changing
        # ``target_sample_rate`` later never reuses a stale transform.
        cache_key = (sr, self.target_sample_rate)
        resampler = self._resamplers.get(cache_key)
        if resampler is None:
            resampler = torchaudio.transforms.Resample(orig_freq=sr, new_freq=self.target_sample_rate)
            self._resamplers[cache_key] = resampler
        return resampler(signal)

    def _mix_down(self, signal):
        """Average all channels into one, keeping the channel dimension.

        Ex: signal.shape = (2, 16000) -> (1, 16000). A signal that already
        has a single channel is returned unchanged.
        """
        if signal.shape[0] > 1:
            signal = torch.mean(signal, dim=0, keepdim=True)
        return signal
        


Testing the implementation on the Urban Sound Dataset that I downloaded from [Kaggle](https://www.kaggle.com/datasets/chrisfilo/urbansound8k).

In [37]:
# Every clip is resampled to this rate, and the mel transform is built for it.
SAMPLE_RATE = 16000

# Settings for the mel-spectrogram transform, gathered in one place.
mel_params = dict(
    sample_rate=SAMPLE_RATE,
    n_fft=1024,
    hop_length=512,
    n_mels=64,
)
mel_spectogram = torchaudio.transforms.MelSpectrogram(**mel_params)

# Dataset backed by the local copy of the UrbanSound8K download.
usd = UrbanSoundDataset(
    annotations_path='../../../Downloads/archive(4)/UrbanSound8K.csv',
    audio_dir='../../../Downloads/archive(4)',
    transformation=mel_spectogram,
    target_sample_rate=SAMPLE_RATE,
)

In [41]:
# Indexing runs the full __getitem__ pipeline for row 0:
# load -> resample -> mix down -> transformation.
signal, label = usd[0]

In [44]:
# Shape is (channels, n_mels, time frames):
# 1  = single channel left after the mix-down
# 64 = number of mel bands (n_mels)
# 10 = number of time frames in the spectrogram
signal.shape

torch.Size([1, 64, 10])

In [46]:
# Value read from column index 6 of the annotations CSV for row 0.
label

3