In [23]:
from torch.utils.data import Dataset
import pandas as pd
import torchaudio
import torch

In [None]:
class SoundDataset(Dataset):
    def __init__(self,audio_dir,transformation,target_sample_rate,num_samples):
        self.df = pd.read_csv('UrbanSound8K/metadata/UrbanSound8K.csv')
        self.audio_dir = audio_dir
        self.transformation = transformation
        self.target_sample_rate = target_sample_rate
        self.num_samples = num_samples

    def __len__(self):
        return len(self.df)


    def __get_path(self,index):
        sample = self.df.iloc[index]
        path = self.audio_dir + f'fold{sample['fold']}/' + sample['slice_file_name'] 
        print ( path)
        return path

    def __get_label(self,index):
        sample = self.df.iloc[index]
        return sample['classID']
    

    def __resample(self,signal,sr):
        if sr != self.target_sample_rate:
            resampler = torchaudio.transforms.Resample(sr , self.target_sample_rate)
            signal = resampler(signal)

        return signal
    
    def __mix_down(self,signal):
        if signal.dim() > 1 and signal.size(0) > 1: # (2,1000) , if it isn't mono
            signal = torch.mean(signal, dim = 0 ,keepdim = True)
        
        return signal

    def __cut(self,signal):
        if signal.shape[1] > self.num_samples :
            signal[:, :self.num_samples]

        return signal

    def __right_pad(self,signal):
        signal_lenght = signal.shape[1]
        if signal_lenght < self.num_samples:
            missing_samples_num = self.num_samples - signal_lenght
            padding = (0,missing_samples_num)
            signal = torch.nn.functional.pad(signal,padding)

        return signal  


    def __getitem__(self, index):
        audio_sample_path = self.__get_path(index)
        label = self.__get_label(index)
        signal , sr = torchaudio.load(audio_sample_path,format="wav")
        print('Signal : ', len(signal))
        print('Sample rate : ',sr)
        # signal -> (num_channels , samples) -> (2 , 16000) 
        signal = self.__resample(signal,sr)
        signal = self.__mix_down(signal)
        signal = self.__cut(signal)
        signal = self.__right_pad(signal)
        signal = self.transformation(signal)
        print('Signal : ', len(signal))
        return signal , label

In [None]:
AUDIO_DIR = '/home/furkan/AudioDeepLearning/UrbanSound8K/audio/'
SAMPLE_RATE = 16000
NUM_SAMPLES = 22050

transform = torchaudio.transforms.MelSpectrogram(
    sample_rate = SAMPLE_RATE,
    n_fft = 1024,
    hop_length = 512,
    n_mels = 64
)


dataset = SoundDataset(AUDIO_DIR,transform,SAMPLE_RATE,NUM_SAMPLES)



In [26]:
len(dataset)

8732

In [27]:
signal , label = dataset[0]

/home/furkan/AudioDeepLearning/UrbanSound8K/audio/fold5/100032-3-0-0.wav
Signal :  2
Sample rate :  44100
Signal :  1
