In [5]:
!wget https://goo.gl/8hY5ER

--2021-07-22 07:51:15--  https://goo.gl/8hY5ER
Resolving goo.gl (goo.gl)... 108.177.127.113, 108.177.127.102, 108.177.127.100, ...
Connecting to goo.gl (goo.gl)|108.177.127.113|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://zenodo.org/record/1203745/files/UrbanSound8K.tar.gz [following]
--2021-07-22 07:51:15--  https://zenodo.org/record/1203745/files/UrbanSound8K.tar.gz
Resolving zenodo.org (zenodo.org)... 137.138.76.77
Connecting to zenodo.org (zenodo.org)|137.138.76.77|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 6023741708 (5.6G) [application/octet-stream]
Saving to: ‘8hY5ER’


2021-07-22 07:52:19 (91.5 MB/s) - ‘8hY5ER’ saved [6023741708/6023741708]



In [None]:
!pip install torchaudio

In [8]:
import shutil

# wget stored the download under the short-link name '8hY5ER' (no extension),
# so the archive format cannot be inferred from the file name and must be
# given explicitly: the payload is UrbanSound8K.tar.gz, i.e. a gzipped tarball.
shutil.unpack_archive("8hY5ER", "./urban", format="gztar")

In [31]:
import os
from torch.utils.data import Dataset
import pandas as pd
import torchaudio

class UrbanSoundDataset(Dataset):
    """UrbanSound8K clips served as ``(transformed_signal, label)`` pairs.

    Each item is loaded from disk with ``torchaudio.load``, resampled to
    ``target_sample_rate`` when the file's own rate differs, averaged down to
    a single channel when it has more than one, and finally passed through
    ``transformation``.

    Clips are neither padded nor truncated here, so items may differ in the
    size of their last dimension.

    Args:
        annotations_file: Path to the metadata CSV. Columns are read by
            position: 0 -> audio file name, 5 -> fold number, 6 -> label.
        audio_dir: Directory that contains the ``fold<N>`` sub-folders.
        transformation: Callable applied to the mono waveform (for example a
            ``torchaudio.transforms.MelSpectrogram``).
        target_sample_rate: Sample rate every clip is resampled to.
    """

    def __init__(self, annotations_file, audio_dir, transformation, target_sample_rate):
        self.annotations = pd.read_csv(annotations_file)
        self.audio_dir = audio_dir
        self.transformation = transformation
        self.target_sample_rate = target_sample_rate

    def __len__(self):
        """Return the number of rows in the annotations table."""
        return len(self.annotations)

    def __getitem__(self, index):
        """Load, normalise and transform the clip at row ``index``.

        Returns:
            A ``(signal, label)`` tuple, where ``signal`` is the output of
            ``transformation`` and ``label`` is the value in column 6.
        """
        audio_sample_path = self._get_audio_sample_path(index)
        label = self._get_audio_sample_label(index)
        signal, sr = torchaudio.load(audio_sample_path)
        signal = self._resample_if_necessary(signal, sr)
        signal = self._mix_down_if_necessary(signal)  # multi-channel -> mono
        signal = self.transformation(signal)
        return signal, label

    def _resample_if_necessary(self, signal, sr):
        """Resample ``signal`` from ``sr`` to the target rate if they differ."""
        if sr != self.target_sample_rate:
            resampler = torchaudio.transforms.Resample(sr, self.target_sample_rate)
            signal = resampler(signal)
        return signal

    def _mix_down_if_necessary(self, signal):
        """Average all channels into one, keeping the channel dimension.

        A signal that already has a single channel is returned unchanged.
        """
        if signal.shape[0] > 1:  # e.g. (2, 1000) -> (1, 1000)
            # Use the tensor's own method so this class does not rely on a
            # module-level ``torch`` name that this cell never imports.
            signal = signal.mean(dim=0, keepdim=True)
        return signal

    def _get_audio_sample_path(self, index):
        """Build ``<audio_dir>/fold<N>/<file name>`` for row ``index``."""
        fold = f"fold{self.annotations.iloc[index, 5]}"  # column index 5: fold number
        path = os.path.join(self.audio_dir, fold, self.annotations.iloc[index, 0])
        return path

    def _get_audio_sample_label(self, index):
        """Return the label stored in column index 6 of row ``index``."""
        return self.annotations.iloc[index, 6]

In [32]:
# These absolute paths assume the notebook's working directory is /content,
# because the archive is unpacked to the relative directory ./urban.
ANNOTATIONS_FILE = '/content/urban/UrbanSound8K/metadata/UrbanSound8K.csv'
AUDIO_DIR = '/content/urban/UrbanSound8K/audio'
SAMPLE_RATE = 16000  # every clip is resampled to this rate by the dataset

# Transformation applied to each mono waveform.
mel_spectrogram = torchaudio.transforms.MelSpectrogram(
    sample_rate=SAMPLE_RATE,
    n_fft=1024,
    hop_length=512,
    n_mels=64
)

usd = UrbanSoundDataset(ANNOTATIONS_FILE, AUDIO_DIR, mel_spectrogram,
                        SAMPLE_RATE)

print(f"There are {len(usd)} samples in the dataset.")

# Pull the first item as a smoke test of the whole loading pipeline.
signal, label = usd[0]

There are 8732 samples in the dataset.


In [34]:
# Shape of the transformed first sample (usd[0]). The middle size matches
# n_mels=64; presumably the layout is (channel, n_mels, time frames) - confirm
# against the torchaudio MelSpectrogram documentation.
signal.shape

torch.Size([1, 64, 10])

In [19]:
# Scratch example: a random 4x4 tensor used in the next cell to show what
# torch.mean does along dim 0. Values differ on every run (no seed is set).
import torch
a = torch.randn(4, 4)
a

tensor([[-0.1545,  0.9309, -1.6141, -1.4686],
        [ 0.3969, -0.4319,  0.2086, -0.6068],
        [ 0.2030, -2.0247,  1.2353,  0.2695],
        [-0.5049,  0.7329,  1.6494,  2.6579]])

In [29]:
# Averaging over dim 0 collapses the rows, leaving one value per column.
# Without keepdim=True the reduced dimension is dropped: (4, 4) -> (4,).
torch.mean(a, 0)

tensor([-0.0149, -0.1982,  0.3698,  0.2130])