In [1]:
# Expose Google Drive to this Colab runtime; the dataset paths used later
# in the notebook live under this mount point.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [None]:
!pip install torchaudio

In [6]:
from torch.utils.data import Dataset
import pandas as pd
import torchaudio
import os
import torch

class DCASE_Dataset(Dataset):
  """Map-style dataset yielding fixed-length, transformed audio clips.

  The annotations CSV is read positionally:
    * column 0 -- file stem (".wav" is appended to build the file name),
    * column 1 -- sub-directory of ``audio_dir`` containing the file,
    * column 2 -- label returned with the sample.

  Every clip is moved to ``device``, resampled to ``target_sample_rate`` if
  needed, mixed down to one channel, cut or zero-padded on the right to
  exactly ``num_samples`` samples, and finally passed through
  ``transformation``.
  """

  def __init__(self,
               annotations_file,
               audio_dir,
               transformation,
               target_sample_rate,
               num_samples,
               device):
    """Read the annotations and remember the preprocessing settings.

    Args:
      annotations_file: Path of the CSV file passed to ``pandas.read_csv``.
      audio_dir: Root directory under which the audio sub-directories live.
      transformation: Object exposing ``.to(device)`` and callable on a
        signal tensor (presumably a ``torch.nn.Module`` such as a
        torchaudio transform -- confirm against callers).
      target_sample_rate: Sample rate every clip is resampled to.
      num_samples: Exact number of samples per clip after cut/pad.
      device: Device on which signals and transforms are placed.
    """
    self.annotations = pd.read_csv(annotations_file)
    self.audio_dir = audio_dir
    self.device = device
    self.transformation = transformation.to(self.device)
    self.target_sample_rate = target_sample_rate
    self.num_samples = num_samples

  def __len__(self):
    """Return the number of rows in the annotations table."""
    return len(self.annotations)

  def __getitem__(self, index):
    """Load, condition and transform the clip described by row ``index``.

    Returns:
      A ``(signal, label, filename)`` tuple, where ``signal`` is whatever
      ``self.transformation`` returns for the conditioned waveform.
    """
    audio_sample_path = self._get_audio_sample_path(index)
    label = self._get_audio_sample_label(index)
    filename = self._get_audio_sample_filename(index)
    signal, sr = torchaudio.load(audio_sample_path)
    # Move to the target device first so all later steps run there.
    signal = signal.to(self.device)
    signal = self._resample_if_necessary(signal, sr)
    signal = self._mix_down_if_necessary(signal)
    signal = self._cut_if_necessary(signal)
    signal = self._right_pad_if_necessary(signal)
    signal = self.transformation(signal)
    return signal, label, filename

  def _cut_if_necessary(self, signal):
    """Truncate dimension 1 of ``signal`` to at most ``num_samples``."""
    if signal.shape[1] > self.num_samples:
      signal = signal[:, :self.num_samples]
    return signal

  def _right_pad_if_necessary(self, signal):
    """Zero-pad the end of the last dimension up to ``num_samples``."""
    length_signal = signal.shape[1]
    if length_signal < self.num_samples:
      num_missing_samples = self.num_samples - length_signal
      # (left, right) padding of the last dimension: append only.
      last_dim_padding = (0, num_missing_samples)
      signal = torch.nn.functional.pad(signal, last_dim_padding)
    return signal

  def _resample_if_necessary(self, signal, sr):
    """Resample ``signal`` from ``sr`` to ``target_sample_rate`` if they differ."""
    if sr != self.target_sample_rate:
      # Place the resampler on the configured device instead of forcing
      # CUDA, so the dataset also works on CPU-only runtimes.
      resampler = torchaudio.transforms.Resample(
          sr, self.target_sample_rate).to(self.device)
      signal = resampler(signal)
    return signal

  def _mix_down_if_necessary(self, signal):
    """Average over dimension 0 when it holds more than one entry."""
    if signal.shape[0] > 1:
      # keepdim=True preserves the leading dimension with size 1.
      signal = torch.mean(signal, dim=0, keepdim=True)
    return signal

  def _get_audio_sample_path(self, index):
    """Build ``audio_dir/<column 1>/<column 0>.wav`` for row ``index``."""
    fold = f"{self.annotations.iloc[index, 1]}"
    path = os.path.join(self.audio_dir, fold, f"{self.annotations.iloc[index, 0]}.wav")
    return path

  def _get_audio_sample_label(self, index):
    """Return the value stored in column 2 of row ``index``."""
    return self.annotations.iloc[index, 2]

  def _get_audio_sample_filename(self, index):
    """Return ``<column 0>.wav`` for row ``index``."""
    return f"{self.annotations.iloc[index, 0]}.wav"

# Smoke test: build the dataset from the Drive copy of the data and inspect
# one transformed sample.
if __name__ == "__main__":

  ANNOTATIONS_FILE = '/content/drive/My Drive/DCASE_Datasets/labels/combined_metadata.csv'
  AUDIO_DIR = '/content/drive/My Drive/DCASE_Datasets/audio/'
  SAMPLE_RATE = 22050
  DURATION = 10  # clip length in seconds
  # Derive the clip length from SAMPLE_RATE so the two can never disagree.
  NUM_SAMPLES = SAMPLE_RATE * DURATION

  device = "cuda" if torch.cuda.is_available() else "cpu"
  print(f"Using device {device}")

  # Transformation applied to every conditioned waveform by the dataset.
  mel_spectrogram = torchaudio.transforms.MelSpectrogram(
      sample_rate=SAMPLE_RATE,
      n_fft=1024,
      hop_length=512,
      n_mels=64
      )

  dcase_data = DCASE_Dataset(ANNOTATIONS_FILE,
                             AUDIO_DIR,
                             mel_spectrogram,
                             SAMPLE_RATE,
                             NUM_SAMPLES,
                             device)

  print(f"There are {len(dcase_data)} samples in the dataset.")

  # Pull an arbitrary sample to check the output shape, label and filename.
  signal, label, filename = dcase_data[555]

  print(signal.shape, label, filename)

  print(signal)





Using device cuda
There are 35690 samples in the dataset.
/content/drive/My Drive/DCASE_Datasets/audio/ff1010bird/184463.wav
torch.Size([1, 64, 431]) 1 184463.wav
tensor([[[1.6176e+01, 1.0194e+01, 3.0178e+00,  ..., 1.4385e+01,
          4.8207e+00, 1.4695e+01],
         [2.6568e+00, 2.3177e+00, 2.3524e+00,  ..., 8.7684e+00,
          2.5412e+00, 1.0991e+01],
         [5.0469e-01, 9.2547e-01, 1.0446e+00,  ..., 9.0136e-01,
          1.6648e+00, 1.6646e+00],
         ...,
         [1.1813e-02, 1.2692e-02, 2.3908e-02,  ..., 1.4758e-02,
          1.8534e-02, 1.6635e-02],
         [8.7183e-03, 1.2198e-02, 1.2751e-02,  ..., 1.3182e-02,
          1.5341e-02, 1.3538e-02],
         [4.9677e-03, 1.0508e-02, 1.0361e-02,  ..., 1.1760e-02,
          9.1403e-03, 9.3836e-03]]], device='cuda:0')
