In [None]:
# Colab-only: import the Google Drive helper and mount Drive at
# /content/drive so files stored there can be read through ordinary paths.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Use the %pip magic rather than !pip so the package is installed into the
# environment of the running kernel, not whatever `pip` is first on PATH.
%pip install torchaudio

In [6]:
from torch.utils.data import Dataset
import pandas as pd
import torchaudio
import os
import torch

class DCASE_Dataset(Dataset):
  """Dataset that loads audio clips listed in an annotations CSV.

  The CSV is read positionally: column 0 holds the clip's file stem
  (".wav" is appended), column 1 holds the sub-folder of ``audio_dir``
  that contains the clip, and column 2 holds the label.

  Each item is loaded, resampled to ``target_sample_rate`` when its native
  rate differs, averaged down to a single channel when it has more than
  one, and finally passed through ``transformation``.
  """

  def __init__(self, annotations_file, audio_dir, transformation,
               target_sample_rate):
    """Read the annotations table and store the loading configuration.

    Args:
      annotations_file: Path of the CSV file passed to ``pd.read_csv``.
      audio_dir: Root directory that contains the per-fold sub-folders.
      transformation: Callable applied to the (resampled, mono) signal.
      target_sample_rate: Sample rate every clip is resampled to.
    """
    self.annotations = pd.read_csv(annotations_file)
    self.audio_dir = audio_dir
    self.transformation = transformation
    self.target_sample_rate = target_sample_rate
    # Resample transforms keyed by (source rate, target rate). Building a
    # Resample recomputes its kernel, so one instance per rate pair is
    # reused across items instead of being rebuilt on every __getitem__.
    self._resamplers = {}

  def __len__(self):
    """Return the number of rows in the annotations table."""
    return len(self.annotations)

  def __getitem__(self, index):
    """Load and preprocess the clip at positional row ``index``.

    Returns:
      A ``(signal, label, filename)`` tuple where ``signal`` is the output
      of ``transformation``, ``label`` is the value of column 2 and
      ``filename`` is the clip's file name including the ".wav" suffix.
    """
    audio_sample_path = self._get_audio_sample_path(index)
    label = self._get_audio_sample_label(index)
    filename = self._get_audio_sample_filename(index)
    signal, sr = torchaudio.load(audio_sample_path)
    signal = self._resample_if_necessary(signal, sr)
    signal = self._mix_down_if_necessary(signal)
    signal = self.transformation(signal)
    return signal, label, filename

  def _resample_if_necessary(self, signal, sr):
    """Return ``signal`` at ``target_sample_rate``.

    The input is returned untouched when ``sr`` already matches the target.
    Otherwise a cached ``torchaudio.transforms.Resample`` for this rate pair
    is applied (and created first if it does not exist yet).
    """
    if sr == self.target_sample_rate:
      return signal
    # setdefault keeps this working for instances that were created without
    # running __init__ (e.g. objects unpickled from an older class version).
    cache = self.__dict__.setdefault("_resamplers", {})
    key = (sr, self.target_sample_rate)
    resampler = cache.get(key)
    if resampler is None:
      resampler = torchaudio.transforms.Resample(sr, self.target_sample_rate)
      cache[key] = resampler
    return resampler(signal)

  def _mix_down_if_necessary(self, signal):
    """Average over dim 0 when it has more than one entry.

    Dim 0 is treated as the channel axis; ``keepdim=True`` preserves it so
    the result still has a leading dimension of size 1.
    """
    if signal.shape[0] > 1:
      signal = torch.mean(signal, dim=0, keepdim=True)
    return signal

  def _get_audio_sample_path(self, index):
    """Build ``<audio_dir>/<column 1>/<column 0>.wav`` for row ``index``."""
    fold = f"{self.annotations.iloc[index, 1]}"
    path = os.path.join(self.audio_dir, fold, f"{self.annotations.iloc[index, 0]}.wav")
    return path

  def _get_audio_sample_label(self, index):
    """Return the label stored in column 2 of row ``index``."""
    return self.annotations.iloc[index, 2]

  def _get_audio_sample_filename(self, index):
    """Return the clip's file name (column 0 plus ".wav") for row ``index``."""
    return f"{self.annotations.iloc[index, 0]}.wav"

if __name__ == "__main__":
  # Smoke test: build the dataset from the Drive-hosted files and inspect
  # the first item.

  # Locations of the annotations table and the audio root on the mounted Drive.
  ANNOTATIONS_FILE = '/content/drive/My Drive/DCASE_Datasets/labels/combined_metadata.csv'
  AUDIO_DIR = '/content/drive/My Drive/DCASE_Datasets/audio/'
  # Rate every clip is resampled to; also passed to the mel transform below.
  SAMPLE_RATE = 22050

  # Transform applied to each resampled, single-channel signal.
  mel_spectrogram = torchaudio.transforms.MelSpectrogram(
      sample_rate=SAMPLE_RATE, n_fft=1024, hop_length=512, n_mels=64)

  dcase_data = DCASE_Dataset(
      annotations_file=ANNOTATIONS_FILE,
      audio_dir=AUDIO_DIR,
      transformation=mel_spectrogram,
      target_sample_rate=SAMPLE_RATE)

  print(f"There are {len(dcase_data)} samples in the dataset.")

  # Fetch the first item to check the shape, label and file name.
  signal, label, filename = dcase_data[0]
  print(signal.shape, label, filename)

  print(signal)



There are 35690 samples in the dataset.
torch.Size([1, 64, 431]) 0 64486.wav
tensor([[[1.5975e-01, 2.7099e-01, 1.3740e-01,  ..., 1.6348e-02,
          3.1244e-02, 4.7086e-02],
         [3.7911e-01, 4.1791e+00, 1.7681e+00,  ..., 1.2492e+00,
          2.9435e-01, 1.5889e+00],
         [8.4830e+00, 3.1850e+00, 2.4497e+00,  ..., 3.5935e+00,
          1.6346e+00, 2.1412e+00],
         ...,
         [7.5682e-02, 6.5243e-02, 3.7237e-02,  ..., 2.8266e-04,
          6.4843e-04, 4.0049e-04],
         [5.4711e-02, 2.5257e-02, 2.2485e-02,  ..., 3.4433e-04,
          4.1962e-04, 3.1364e-04],
         [4.4925e-02, 1.6226e-03, 2.2030e-03,  ..., 2.1046e-04,
          1.6134e-04, 2.3366e-04]]])
