In [1]:
from torch.utils.data import Dataset
import pandas as pd
import torchaudio
import torch
from torch import nn

In [2]:
class SoundDataset(Dataset):
    def __init__(self,audio_dir,transformation,target_sample_rate,num_samples,device):
        self.df = pd.read_csv('UrbanSound8K/metadata/UrbanSound8K.csv')
        self.audio_dir = audio_dir
        self.device = device
        self.transformation = transformation.to(self.device)
        self.target_sample_rate = target_sample_rate
        self.num_samples = num_samples
        

    def __len__(self):
        return len(self.df)


    def __get_path(self,index):
        sample = self.df.iloc[index]
        path = self.audio_dir + f'fold{sample['fold']}/' + sample['slice_file_name'] 
        print ( path)
        return path

    def __get_label(self,index):
        sample = self.df.iloc[index]
        return sample['classID']
    

    def __resample(self,signal,sr):
        if sr != self.target_sample_rate:
            resampler = torchaudio.transforms.Resample(sr , self.target_sample_rate)
            signal = resampler(signal)

        return signal
    
    def __mix_down(self,signal):
        if signal.dim() > 1 and signal.size(0) > 1: # (2,1000) , if it isn't mono
            signal = torch.mean(signal, dim = 0 ,keepdim = True)
        
        return signal

    def __cut(self,signal):
        if signal.shape[1] > self.num_samples :
            signal[:, :self.num_samples]

        return signal

    def __right_pad(self,signal):
        signal_lenght = signal.shape[1]
        if signal_lenght < self.num_samples:
            missing_samples_num = self.num_samples - signal_lenght
            padding = (0,missing_samples_num)
            signal = torch.nn.functional.pad(signal,padding)

        return signal  


    def __getitem__(self, index):
        audio_sample_path = self.__get_path(index)
        label = self.__get_label(index)
        signal , sr = torchaudio.load(audio_sample_path,format="wav")
        signal = signal.to(self.device)
        print('Signal : ', len(signal))
        print('Sample rate : ',sr)
        # signal -> (num_channels , samples) -> (2 , 16000) 
        signal = self.__resample(signal,sr)
        signal = self.__mix_down(signal)
        signal = self.__cut(signal)
        signal = self.__right_pad(signal)
        signal = self.transformation(signal)
        print('Signal : ', len(signal))
        return signal , label

In [3]:
AUDIO_DIR = '/home/furkan/AudioDeepLearning/UrbanSound8K/audio/'
SAMPLE_RATE = 22050
NUM_SAMPLES = 22050

transform = torchaudio.transforms.MelSpectrogram(
    sample_rate = SAMPLE_RATE,
    n_fft = 1024,
    hop_length = 512,
    n_mels = 64
)

if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

print("Using device",device)


dataset = SoundDataset(AUDIO_DIR,transform,SAMPLE_RATE,NUM_SAMPLES,device)



Using device cpu


In [4]:
len(dataset)

8732

In [5]:
signal , label = dataset[0]

/home/furkan/AudioDeepLearning/UrbanSound8K/audio/fold5/100032-3-0-0.wav
Signal :  2
Sample rate :  44100
Signal :  1


In [11]:
class SoundNeauralNetwork(nn.Module):
    def __init__(self):
        super().__init__()
        # 4 conv , flatten , linear , softmax
        self.conv1 = nn.Sequential(
            nn.Conv2d(
                in_channels=1,
                out_channels=16,
                kernel_size=3,
                stride=1,
                padding=2
            ),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2)
        )

        self.conv2 = nn.Sequential(
            nn.Conv2d(
                in_channels=16,
                out_channels=32,
                kernel_size=3,
                stride=1,
                padding=2
            ),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2)
        )

        self.conv3 = nn.Sequential(
            nn.Conv2d(
                in_channels=32,
                out_channels=64,
                kernel_size=3,
                stride=1,
                padding=2
            ),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2)
        )

        self.conv4 = nn.Sequential(
            nn.Conv2d(
                in_channels=64,
                out_channels=128,
                kernel_size=3,
                stride=1,
                padding=2
            ),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2)
        )


        self.flatten = nn.Flatten()

        self.linear = nn.Linear(128 * 5 * 4 , 10)
        
        self.softmax = nn.Softmax(dim = 1)


    def forward(self,input):
        x = self.conv1(input)
        x = self.conv2(x)
        x = self.conv3(x)
        x = self.conv4(x)

        x = self.flatten(x)

        logits = self.linear(x)

        predictions = self.softmax(logits)

        return predictions

In [12]:
model = SoundNeauralNetwork()
model

SoundNeauralNetwork(
  (conv1): Sequential(
    (0): Conv2d(1, 16, kernel_size=(3, 3), stride=(1, 1), padding=(2, 2))
    (1): ReLU()
    (2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (conv2): Sequential(
    (0): Conv2d(16, 32, kernel_size=(3, 3), stride=(1, 1), padding=(2, 2))
    (1): ReLU()
    (2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (conv3): Sequential(
    (0): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=(2, 2))
    (1): ReLU()
    (2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (conv4): Sequential(
    (0): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(2, 2))
    (1): ReLU()
    (2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (flatten): Flatten(start_dim=1, end_dim=-1)
  (linear): Linear(in_features=2560, out_features=10, bias=True)
  (softmax): Softmax(dim=1)
)