# Truncating Audio Durations

In [1]:
from torch.utils.data import Dataset
import torchaudio
import pandas as pd
import torch
import os

  from .autonotebook import tqdm as notebook_tqdm


In [30]:
class UrbanSoundDataset(Dataset):
    """Map-style dataset yielding fixed-length, transformed audio clips and labels.

    Each item is loaded from disk with ``torchaudio.load``, resampled to
    ``target_sample_rate``, averaged down to a single channel, cut or
    right-padded with zeros to exactly ``sample_number`` samples, and finally
    passed through ``transformation``.

    Args:
        annotations_path: Path to the annotations CSV. Columns are read by
            position (see the ``_*_COL`` constants below); presumably this is
            the UrbanSound8K metadata layout -- verify against the CSV in use.
        audio_dir: Directory that contains the ``fold<N>`` sub-directories.
        transformation: Callable applied to the fixed-length waveform, or
            ``None`` to return the waveform unchanged.
        target_sample_rate: Sample rate every clip is resampled to.
        sample_number: Number of samples each clip is cut or padded to.
        verbose: When true, print the waveform shape immediately before and
            after the cut/pad step. Defaults to ``True`` so existing callers
            keep seeing those two lines; pass ``False`` to silence them, e.g.
            when iterating over the whole dataset.

    Raises:
        ValueError: If ``sample_number`` is not positive.
    """

    # Positional columns of the annotations CSV used by this class.
    _FILENAME_COL = 0
    _FOLD_COL = 5
    _LABEL_COL = 6

    def __init__(self, annotations_path, audio_dir, transformation, target_sample_rate, sample_number,
                 verbose=True):
        # Fail fast: a non-positive length cannot be satisfied by slicing or padding.
        if sample_number <= 0:
            raise ValueError(f'sample_number must be positive, got {sample_number}')
        self.annotations = pd.read_csv(annotations_path)
        self.audio_dir = audio_dir
        self.transformation = transformation
        self.target_sample_rate = target_sample_rate
        self.sample_number = sample_number
        self.verbose = verbose
        # Resamplers keyed by (source rate, target rate) so each one is built only once.
        self._resamplers = {}

    def __len__(self):
        """Return the number of annotation rows."""
        return len(self.annotations)

    def __getitem__(self, index):
        """Return ``(signal, label)`` for the annotation row at ``index``."""
        audio_sample_path = self._get_audio_sample_path(index)
        label = self._get_audio_sample_label(index)
        signal, sr = torchaudio.load(audio_sample_path)
        signal = self._resample(signal, sr)
        signal = self._mix_down(signal)
        if self.verbose:
            print(signal.shape)
        signal = self._truncate_signal_size(signal, self.sample_number)
        if self.verbose:
            print(signal.shape)
        if self.transformation is not None:
            signal = self.transformation(signal)
        return signal, label

    def _get_audio_sample_path(self, index):
        """Build ``<audio_dir>/fold<fold>/<filename>`` for the row at ``index``."""
        folder = f'fold{self.annotations.iloc[index, self._FOLD_COL]}'
        filename = self.annotations.iloc[index, self._FILENAME_COL]
        return os.path.join(self.audio_dir, folder, filename)

    def _get_audio_sample_label(self, index):
        """Return the label stored in the label column of the row at ``index``."""
        return self.annotations.iloc[index, self._LABEL_COL]

    def _resample(self, signal, sr):
        """Resample ``signal`` from ``sr`` to ``target_sample_rate`` when they differ."""
        if sr == self.target_sample_rate:
            return signal
        key = (sr, self.target_sample_rate)
        resampler = self._resamplers.get(key)
        if resampler is None:
            # Build the transform once per rate pair instead of on every item.
            resampler = torchaudio.transforms.Resample(orig_freq=sr, new_freq=self.target_sample_rate)
            self._resamplers[key] = resampler
        return resampler(signal)

    def _mix_down(self, signal):
        """Average all channels (dimension 0) into one; single-channel input is returned as is."""
        if signal.shape[0] > 1:
            signal = torch.mean(signal, dim=0, keepdim=True)
        return signal

    def _truncate_signal_size(self, signal, sample_number):
        """Force the last dimension of ``signal`` to exactly ``sample_number`` entries.

        Longer signals keep only their first ``sample_number`` samples; shorter
        signals are zero-padded on the right. A signal that already has the
        requested length is returned unchanged.
        """
        length = signal.shape[-1]
        if length > sample_number:
            signal = signal[..., :sample_number]
        elif length < sample_number:
            pad_size = sample_number - length
            # pad=(left, right) applies to the last dimension only.
            signal = torch.nn.functional.pad(signal, pad=(0, pad_size), value=0)
        return signal
        


The cells below test the implementation on the UrbanSound8K dataset, which I downloaded from [Kaggle](https://www.kaggle.com/datasets/chrisfilo/urbansound8k).

In [35]:
# Every clip is resampled to SAMPLE_RATE and fixed to SAMPLE_NUMBER samples;
# 22050 samples at a rate of 22050 per second is one second of audio per clip.
SAMPLE_RATE = 22050
SAMPLE_NUMBER = 22050

# Mel-spectrogram settings, named so they can be tuned in one place.
N_FFT = 1024
HOP_LENGTH = 512
N_MELS = 64

# Location of the downloaded dataset; the annotations CSV sits directly inside it.
DATA_DIR = '../../../Downloads/archive(4)'
ANNOTATIONS_PATH = os.path.join(DATA_DIR, 'UrbanSound8K.csv')

mel_spectogram = torchaudio.transforms.MelSpectrogram(
    sample_rate=SAMPLE_RATE,
    n_fft=N_FFT,
    hop_length=HOP_LENGTH,
    n_mels=N_MELS,
)

usd = UrbanSoundDataset(
    annotations_path=ANNOTATIONS_PATH,
    audio_dir=DATA_DIR,
    transformation=mel_spectogram,
    target_sample_rate=SAMPLE_RATE,
    sample_number=SAMPLE_NUMBER,
)

In [36]:
# Fetch the sample at index 1. Indexing runs the dataset's __getitem__ pipeline
# (load, resample, mix down, cut/pad, transform) and returns the signal with its label.
signal, label = usd[1]

torch.Size([1, 88200])
torch.Size([1, 22050])


In [25]:
# Reading the shape shown below, axis by axis:
# 1  = single channel left after the mix-down step
# 64 = number of Mel bands (n_mels)
# 44 = number of time frames: 22050 samples with hop_length 512 gives
#      22050 // 512 + 1 = 44, assuming torchaudio's default centred STFT
signal.shape

torch.Size([1, 64, 44])