In [31]:
from torch.utils.data import Dataset,DataLoader
import pandas as pd
import torchaudio
import torch
from torch import nn
import os
from tqdm import tqdm

In [32]:
class SoundDataset(Dataset):
    def __init__(self,audio_dir,transformation,target_sample_rate,num_samples,device):
        self.df = pd.read_csv('UrbanSound8K/metadata/UrbanSound8K.csv')
        self.audio_dir = audio_dir
        self.device = device
        self.transformation = transformation
        self.target_sample_rate = target_sample_rate
        self.num_samples = num_samples
        

    def __len__(self):
        return len(self.df)


    def __get_path(self,index):
        sample = self.df.iloc[index]
        path = os.path.join(self.audio_dir, f"fold{sample['fold']}", sample['slice_file_name'])
        return path

    def __get_label(self,index):
        sample = self.df.iloc[index]
        return sample['classID']
    

    def __resample(self,signal,sr):
        if sr != self.target_sample_rate:
            signal = signal.cpu()
            resampler = torchaudio.transforms.Resample(sr , self.target_sample_rate)
            signal = resampler(signal)

        return signal
    
    def __mix_down(self,signal):
        if signal.dim() > 1 and signal.size(0) > 1: # (2,1000) , if it isn't mono
            signal = torch.mean(signal, dim = 0 ,keepdim = True)
        
        return signal

    def __cut(self,signal):
        if signal.shape[1] > self.num_samples :
            signal = signal[:, :self.num_samples]

        return signal

    def __right_pad(self,signal):
        signal_lenght = signal.shape[1]
        if signal_lenght < self.num_samples:
            missing_samples_num = self.num_samples - signal_lenght
            padding = (0,missing_samples_num)
            signal = torch.nn.functional.pad(signal,padding)

        return signal  


    def __getitem__(self, index):
        try:
            audio_sample_path = self.__get_path(index)
            label = self.__get_label(index)
            signal , sr = torchaudio.load(audio_sample_path, format="wav")
            #signal = signal.to(self.device)

            signal = self.__resample(signal, sr)
            signal = self.__mix_down(signal)
            signal = self.__cut(signal)
            signal = self.__right_pad(signal)
            signal = self.transformation(signal)

            return signal, label
        except Exception as e:
            print(f"Hata: {e} -> Index: {index}")
            return self.__getitem__((index + 1) % len(self.df))  # bir sonraki veriyle devam et


In [33]:
AUDIO_DIR = '/home/furkan/AudioDeepLearning/UrbanSound8K/audio/'
SAMPLE_RATE = 22050
NUM_SAMPLES = 22050
LEARNING_RATE = .001
BATCH_SIZE = 1024
EPOCHS = 10

transform = torchaudio.transforms.MelSpectrogram(
    sample_rate = SAMPLE_RATE,
    n_fft = 1024,
    hop_length = 512,
    n_mels = 64
)

if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

print("Using device",device)


dataset = SoundDataset(AUDIO_DIR,transform,SAMPLE_RATE,NUM_SAMPLES,device)



Using device cuda


In [34]:
len(dataset)

8732

In [35]:
signal , label = dataset[0]

In [36]:
class SoundNeauralNetwork(nn.Module):
    def __init__(self):
        super().__init__()
        # 4 conv , flatten , linear , softmax
        self.conv1 = nn.Sequential(
            nn.Conv2d(
                in_channels=1,
                out_channels=16,
                kernel_size=3,
                stride=1,
                padding=2
            ),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2)
        )

        self.conv2 = nn.Sequential(
            nn.Conv2d(
                in_channels=16,
                out_channels=32,
                kernel_size=3,
                stride=1,
                padding=2
            ),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2)
        )

        self.conv3 = nn.Sequential(
            nn.Conv2d(
                in_channels=32,
                out_channels=64,
                kernel_size=3,
                stride=1,
                padding=2
            ),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2)
        )

        self.conv4 = nn.Sequential(
            nn.Conv2d(
                in_channels=64,
                out_channels=128,
                kernel_size=3,
                stride=1,
                padding=2
            ),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2)
        )


        self.flatten = nn.Flatten()

        self.linear = nn.Linear(128 * 5 * 4 , 10)
        
        self.softmax = nn.Softmax(dim = 1)


    def forward(self,input):
        x = self.conv1(input)
        x = self.conv2(x)
        x = self.conv3(x)
        x = self.conv4(x)

        x = self.flatten(x)

        logits = self.linear(x)

        predictions = self.softmax(logits)

        return predictions

In [37]:
model = SoundNeauralNetwork().to(device)
model

SoundNeauralNetwork(
  (conv1): Sequential(
    (0): Conv2d(1, 16, kernel_size=(3, 3), stride=(1, 1), padding=(2, 2))
    (1): ReLU()
    (2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (conv2): Sequential(
    (0): Conv2d(16, 32, kernel_size=(3, 3), stride=(1, 1), padding=(2, 2))
    (1): ReLU()
    (2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (conv3): Sequential(
    (0): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=(2, 2))
    (1): ReLU()
    (2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (conv4): Sequential(
    (0): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(2, 2))
    (1): ReLU()
    (2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (flatten): Flatten(start_dim=1, end_dim=-1)
  (linear): Linear(in_features=2560, out_features=10, bias=True)
  (softmax): Softmax(dim=1)
)

In [38]:
data_loader = DataLoader(dataset,batch_size=BATCH_SIZE)

In [39]:
loss_f = nn.CrossEntropyLoss()
optimiser = torch.optim.Adam(model.parameters(),lr = LEARNING_RATE)

In [40]:
for i in range(EPOCHS):
    print(f"Epoch {i+1}")
    for inputs , targets in tqdm(data_loader):
        inputs , targets = inputs.to(device) , targets.to(device)

        predictions = model(inputs)

        loss = loss_f(predictions,targets)

        optimiser.zero_grad()

        loss.backward()

        optimiser.step()
    print(f"Loss : {loss}")

torch.save(model.state_dict(),"model.pth")
print("model trained and stored")

Epoch 1


100%|██████████| 9/9 [01:14<00:00,  8.33s/it]


Loss : 2.211721420288086
Epoch 2


100%|██████████| 9/9 [01:13<00:00,  8.21s/it]


Loss : 2.2201600074768066
Epoch 3


100%|██████████| 9/9 [01:15<00:00,  8.34s/it]


Loss : 2.25758695602417
Epoch 4


100%|██████████| 9/9 [01:14<00:00,  8.25s/it]


Loss : 2.0946662425994873
Epoch 5


100%|██████████| 9/9 [01:11<00:00,  7.98s/it]


Loss : 2.058579206466675
Epoch 6


100%|██████████| 9/9 [01:14<00:00,  8.29s/it]


Loss : 2.0330209732055664
Epoch 7


100%|██████████| 9/9 [01:11<00:00,  7.97s/it]


Loss : 2.0200397968292236
Epoch 8


100%|██████████| 9/9 [01:16<00:00,  8.46s/it]


Loss : 2.06611967086792
Epoch 9


100%|██████████| 9/9 [01:06<00:00,  7.39s/it]


Loss : 1.9976367950439453
Epoch 10


100%|██████████| 9/9 [01:15<00:00,  8.39s/it]

Loss : 2.065526247024536
model trained and stored





In [41]:
class_mapping = [
    "air_conditioner",
    "car_horn",
    "children_playing",
    "dog_bark",
    "drilling",
    "engine_idling",
    "gun_shot",
    "jackhammer",
    "siren", 
    "street_music"
]

In [42]:
# (batch_size , num_channels , fr ,time)
input , target = dataset[0][0] , dataset[0][1]
input.unsqueeze_(0)

tensor([[[[8.1443e-04, 2.1588e-04, 9.1436e-04,  ..., 0.0000e+00,
           0.0000e+00, 0.0000e+00],
          [2.4870e-03, 1.5724e-03, 4.2855e-04,  ..., 0.0000e+00,
           0.0000e+00, 0.0000e+00],
          [2.9685e-03, 6.0449e-03, 4.1659e-03,  ..., 0.0000e+00,
           0.0000e+00, 0.0000e+00],
          ...,
          [1.7061e-04, 5.7996e-02, 7.9969e-01,  ..., 0.0000e+00,
           0.0000e+00, 0.0000e+00],
          [2.1515e-04, 1.5781e-02, 3.7936e-01,  ..., 0.0000e+00,
           0.0000e+00, 0.0000e+00],
          [3.0015e-04, 1.4416e-02, 3.1233e-01,  ..., 0.0000e+00,
           0.0000e+00, 0.0000e+00]]]])

In [None]:
model = model.to("cpu")
model.eval()
with torch.no_grad():
    predictions = model(input)
    
    prediction_index = predictions[0].argmax(0)
    predicted = class_mapping[prediction_index]
    
expected = class_mapping[target]

print(f"Predicted : {predicted} , Expected : {expected}")

Predicted : dog_bark , Expected : dog_bark
