In [1]:
# Mount Google Drive inside the Colab runtime at /content/drive so that files
# stored on Drive can be opened through ordinary filesystem paths.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Use the %pip magic instead of `!pip`: it installs into the environment of the
# running kernel rather than whatever `pip` the subshell happens to resolve.
%pip install torchaudio

Collecting torchaudio
  Downloading torchaudio-0.9.0-cp37-cp37m-manylinux1_x86_64.whl (1.9 MB)
[K     |████████████████████████████████| 1.9 MB 8.1 MB/s 
Installing collected packages: torchaudio
Successfully installed torchaudio-0.9.0


In [None]:
import os

import torch
from torch.utils.data import Dataset
import pandas as pd
import torchaudio


class UrbanSoundDataset(Dataset):
    """Dataset of fixed-length, transformed audio clips listed in an annotations CSV.

    Every item goes through the same pipeline: load the file with
    ``torchaudio.load``, move the waveform to ``device``, resample it to
    ``target_sample_rate``, mix it down to a single channel, cut or right-pad
    it to exactly ``num_samples`` samples and finally apply ``transformation``.

    The annotations file is read positionally: column 0 holds the audio file
    name, column 5 the fold number and column 6 the label.
    """

    # Positional columns of the annotations CSV used by this class.
    _FILE_NAME_COLUMN = 0
    _FOLD_COLUMN = 5
    _LABEL_COLUMN = 6

    def __init__(self,
                 annotations_file,
                 audio_dir,
                 transformation,
                 target_sample_rate,
                 num_samples,
                 device):
        """Read the annotations and prepare the processing pipeline.

        Args:
            annotations_file: Path of the CSV file describing the clips.
            audio_dir: Directory containing the ``fold<N>`` sub-directories.
            transformation: Module applied to the prepared waveform; it is
                moved to ``device`` here.
            target_sample_rate: Sample rate every clip is resampled to.
            num_samples: Exact number of samples every clip is cut/padded to.
            device: Device on which waveforms and transforms live.
        """
        self.annotations = pd.read_csv(annotations_file)
        self.audio_dir = audio_dir
        self.device = device
        self.transformation = transformation.to(self.device)
        self.target_sample_rate = target_sample_rate
        self.num_samples = num_samples
        # One resampler per source sample rate, built lazily on first use so
        # the module is not rebuilt for every item.
        self._resamplers = {}

    def __len__(self):
        """Return the number of rows in the annotations table."""
        return len(self.annotations)

    def __getitem__(self, index):
        """Return ``(transformed_signal, label)`` for the clip at ``index``."""
        audio_sample_path = self._get_audio_sample_path(index)
        label = self._get_audio_sample_label(index)
        signal, sr = torchaudio.load(audio_sample_path)
        signal = signal.to(self.device)
        signal = self._resample_if_necessary(signal, sr)
        signal = self._mix_down_if_necessary(signal)
        signal = self._cut_if_necessary(signal)
        signal = self._right_pad_if_necessary(signal)
        signal = self.transformation(signal)
        return signal, label

    def _cut_if_necessary(self, signal):
        """Keep only the first ``num_samples`` samples of a longer signal."""
        if signal.shape[1] > self.num_samples:
            signal = signal[:, :self.num_samples]
        return signal

    def _right_pad_if_necessary(self, signal):
        """Pad a shorter signal on the right of its last dimension up to ``num_samples``."""
        length_signal = signal.shape[1]
        if length_signal < self.num_samples:
            num_missing_samples = self.num_samples - length_signal
            # (left, right) padding of the last dimension only.
            last_dim_padding = (0, num_missing_samples)
            signal = torch.nn.functional.pad(signal, last_dim_padding)
        return signal

    def _resample_if_necessary(self, signal, sr):
        """Resample ``signal`` from ``sr`` to ``target_sample_rate`` when they differ.

        The resampler is placed on ``self.device`` (the device ``__getitem__``
        moves the waveform to) instead of unconditionally on CUDA, so the
        dataset also works on CPU-only runtimes.
        """
        if sr == self.target_sample_rate:
            return signal
        resampler = self._resamplers.get(sr)
        if resampler is None:
            resampler = torchaudio.transforms.Resample(
                sr, self.target_sample_rate).to(self.device)
            self._resamplers[sr] = resampler
        return resampler(signal)

    def _mix_down_if_necessary(self, signal):
        """Average all channels into one when the signal has more than one."""
        if signal.shape[0] > 1:
            signal = torch.mean(signal, dim=0, keepdim=True)
        return signal

    def _get_audio_sample_path(self, index):
        """Build ``<audio_dir>/fold<N>/<file name>`` for the row at ``index``."""
        fold = f"fold{self.annotations.iloc[index, self._FOLD_COLUMN]}"
        file_name = self.annotations.iloc[index, self._FILE_NAME_COLUMN]
        return os.path.join(self.audio_dir, fold, file_name)

    def _get_audio_sample_label(self, index):
        """Return the label stored in the row at ``index``."""
        return self.annotations.iloc[index, self._LABEL_COLUMN]


if __name__ == "__main__":
    # Dataset location on the mounted Drive.
    ANNOTATIONS_FILE = '/content/drive/My Drive/UrbanSound8K/metadata/UrbanSound8K.csv'
    AUDIO_DIR = '/content/drive/My Drive/UrbanSound8K/audio'
    # Every clip is brought to this sample rate and to exactly this many samples.
    SAMPLE_RATE = 22050
    NUM_SAMPLES = 22050

    # Prefer the GPU when one is visible, otherwise fall back to the CPU.
    device = "cuda" if torch.cuda.is_available() else "cpu"
    print(f"Using device {device}")

    # Transformation applied to every prepared waveform.
    mel_spectrogram = torchaudio.transforms.MelSpectrogram(
        sample_rate=SAMPLE_RATE, n_fft=1024, hop_length=512, n_mels=64)

    usd = UrbanSoundDataset(
        annotations_file=ANNOTATIONS_FILE,
        audio_dir=AUDIO_DIR,
        transformation=mel_spectrogram,
        target_sample_rate=SAMPLE_RATE,
        num_samples=NUM_SAMPLES,
        device=device,
    )
    print(f"There are {len(usd)} samples in the dataset.")

    # Smoke test: fetch one item and inspect what the pipeline produced.
    signal, label = usd[666]
    print(signal.shape, label)
    print(signal)

Using device cuda
There are 8732 samples in the dataset.
torch.Size([1, 64, 44]) 8
tensor([[[2.5823e-02, 1.8627e-03, 2.6361e-03,  ..., 7.2002e-03,
          8.9268e-03, 7.1241e-03],
         [7.1336e-02, 4.0598e-04, 1.0193e-03,  ..., 8.3895e-04,
          8.9792e-04, 1.1271e-05],
         [7.1337e-02, 5.3633e-04, 1.1970e-03,  ..., 4.2821e-04,
          2.4050e-04, 1.7659e-04],
         ...,
         [1.0344e-02, 1.3025e-03, 2.0636e-03,  ..., 3.3728e-03,
          2.1758e-03, 3.1069e-03],
         [1.3325e-02, 1.4365e-03, 2.0099e-03,  ..., 1.1798e-03,
          9.3222e-04, 6.9491e-04],
         [1.6390e-02, 5.2715e-04, 8.2463e-04,  ..., 8.6571e-04,
          5.7349e-04, 7.5222e-04]]], device='cuda:0')
