In [1]:
# Mount Google Drive at /content/drive so files stored on Drive can be read
# through the local filesystem (Colab-only helper).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install torchaudio

In [5]:
import os

import torch
from torch.utils.data import Dataset
import pandas as pd
import torchaudio


class UrbanSoundDataset(Dataset):
    """Map-style dataset that yields fixed-length, preprocessed audio clips.

    Every item is built by loading one audio file listed in the annotations
    CSV, resampling it to ``target_sample_rate``, averaging its channels down
    to a single channel, cutting or zero-padding it to exactly ``num_samples``
    samples and finally applying ``transformation``.

    The annotations CSV is read by column *position*: column 0 must hold the
    audio file name, column 5 the fold number (files are looked up in
    ``<audio_dir>/fold<N>/``) and column 6 the label.

    Args:
        annotations_file: Path or file-like object accepted by
            ``pandas.read_csv``.
        audio_dir: Directory that contains the ``fold<N>`` sub-directories.
        transformation: Callable applied to the fixed-length waveform, or
            ``None`` to return the waveform itself.
        target_sample_rate: Sample rate every clip is resampled to.
        num_samples: Exact number of samples in every returned waveform
            (before ``transformation`` is applied).
    """

    # Positional layout of the annotations CSV.
    # NOTE(review): presumably matches the UrbanSound8K metadata file --
    # verify before pointing the class at a different CSV.
    _FILENAME_COLUMN = 0
    _FOLD_COLUMN = 5
    _LABEL_COLUMN = 6

    def __init__(self,
                 annotations_file,
                 audio_dir,
                 transformation,
                 target_sample_rate,
                 num_samples):
        self.annotations = pd.read_csv(annotations_file)
        self.audio_dir = audio_dir
        self.transformation = transformation
        self.target_sample_rate = target_sample_rate
        self.num_samples = num_samples

    def __len__(self):
        """Return the number of rows in the annotations table."""
        return len(self.annotations)

    def __getitem__(self, index):
        """Load the clip for row ``index`` and return ``(signal, label)``."""
        audio_sample_path = self._get_audio_sample_path(index)
        label = self._get_audio_sample_label(index)
        signal, sr = torchaudio.load(audio_sample_path)
        return self._prepare_signal(signal, sr), label

    def _prepare_signal(self, signal, sr):
        """Run the preprocessing chain on an already-loaded waveform.

        Kept separate from ``__getitem__`` so the same chain can be applied
        to a waveform that did not come from the annotations table.

        Args:
            signal: Tensor indexed as ``(channels, samples)``.
            sr: Sample rate of ``signal``.

        Returns:
            The resampled, single-channel, ``num_samples``-long waveform,
            passed through ``transformation`` when one was supplied.
        """
        signal = self._resample_if_necessary(signal, sr)
        signal = self._mix_down_if_necessary(signal)
        signal = self._cut_if_necessary(signal)
        signal = self._right_pad_if_necessary(signal)
        # A missing transformation means "return the raw waveform".
        if self.transformation is not None:
            signal = self.transformation(signal)
        return signal

    def _cut_if_necessary(self, signal):
        """Keep only the first ``num_samples`` samples of a longer signal."""
        if signal.shape[1] > self.num_samples:
            signal = signal[:, :self.num_samples]
        return signal

    def _right_pad_if_necessary(self, signal):
        """Zero-pad the end of a signal shorter than ``num_samples``."""
        length_signal = signal.shape[1]
        if length_signal < self.num_samples:
            num_missing_samples = self.num_samples - length_signal
            # (left, right) padding amounts for the last dimension only.
            last_dim_padding = (0, num_missing_samples)
            signal = torch.nn.functional.pad(signal, last_dim_padding)
        return signal

    def _resample_if_necessary(self, signal, sr):
        """Resample to ``target_sample_rate`` unless ``sr`` already matches."""
        if sr != self.target_sample_rate:
            resampler = torchaudio.transforms.Resample(sr, self.target_sample_rate)
            signal = resampler(signal)
        return signal

    def _mix_down_if_necessary(self, signal):
        """Average a multi-channel signal into one channel (dim 0 is kept)."""
        if signal.shape[0] > 1:
            signal = torch.mean(signal, dim=0, keepdim=True)
        return signal

    def _get_audio_sample_path(self, index):
        """Build ``<audio_dir>/fold<N>/<file name>`` for row ``index``."""
        fold = f"fold{self.annotations.iloc[index, self._FOLD_COLUMN]}"
        file_name = self.annotations.iloc[index, self._FILENAME_COLUMN]
        return os.path.join(self.audio_dir, fold, file_name)

    def _get_audio_sample_label(self, index):
        """Return the label stored in the annotations row ``index``."""
        return self.annotations.iloc[index, self._LABEL_COLUMN]


if __name__ == "__main__":
    # Where the metadata CSV and the fold<N> audio folders live under
    # /content/drive.
    AUDIO_DIR = '/content/drive/My Drive/UrbanSound8K/audio'
    ANNOTATIONS_FILE = '/content/drive/My Drive/UrbanSound8K/metadata/UrbanSound8K.csv'

    # Every clip is resampled to SAMPLE_RATE and fixed to NUM_SAMPLES samples;
    # with both set to 22050 each clip covers one second of audio.
    SAMPLE_RATE = 22050
    NUM_SAMPLES = 22050

    # Transformation applied to each fixed-length waveform by the dataset.
    mel_spectrogram = torchaudio.transforms.MelSpectrogram(
        n_mels=64,
        hop_length=512,
        n_fft=1024,
        sample_rate=SAMPLE_RATE,
    )

    usd = UrbanSoundDataset(
        annotations_file=ANNOTATIONS_FILE,
        audio_dir=AUDIO_DIR,
        transformation=mel_spectrogram,
        target_sample_rate=SAMPLE_RATE,
        num_samples=NUM_SAMPLES,
    )
    print(f"There are {len(usd)} samples in the dataset.")

    # Smoke check: fetch one item and inspect the transformed signal.
    signal, label = usd[1]
    print(signal.shape)
    print(signal)

There are 8732 samples in the dataset.
torch.Size([1, 64, 44])
tensor([[[8.9202e-02, 6.2095e-01, 1.0333e-01,  ..., 3.6100e-02,
          1.7762e-01, 3.1767e-01],
         [1.3724e-01, 5.2129e-01, 3.5723e-02,  ..., 2.5212e-01,
          2.2663e-01, 8.1124e-01],
         [1.4066e-02, 3.7927e-01, 5.5285e-02,  ..., 8.5007e-01,
          4.2893e-01, 1.2757e+00],
         ...,
         [1.1717e-04, 1.9552e-04, 2.5877e-04,  ..., 1.9466e-04,
          9.3174e-05, 3.0051e-04],
         [2.5131e-04, 1.7274e-04, 3.0851e-04,  ..., 1.8992e-04,
          7.4250e-05, 1.6916e-04],
         [2.4768e-04, 1.8604e-04, 2.7004e-04,  ..., 6.9692e-05,
          8.3570e-05, 1.3193e-04]]])
