In [None]:
import torch 
from torch import nn
from torch.utils.data import Dataset, DataLoader
from torchsummary import summary 
import librosa
import numpy as np
import os
import glob


In [None]:
class CNNNetwork(nn.Module):
    """Four-stage 2-D CNN classifier that returns per-class probabilities.

    Every stage is ``Conv2d(kernel_size=3, stride=1, padding=2) -> ReLU ->
    MaxPool2d(kernel_size=2)``; the channel count goes 1 -> 16 -> 32 -> 64 ->
    128.  The resulting feature maps are flattened, projected by one linear
    layer and normalised with a softmax over dimension 1.

    Args:
        num_classes: Width of the output layer.  Defaults to 10, the value
            that used to be hard-coded.

    Note:
        The linear layer expects ``128 * 5 * 4`` flattened features, which is
        what the four stages produce for a ``(1, 64, 44)`` input (the shape
        this notebook passes to ``summary``).  Other input sizes will make the
        linear layer fail with a shape error.

        NOTE(review): ``forward`` returns softmax probabilities, not logits.
        ``nn.CrossEntropyLoss`` applies log-softmax itself, so if that loss is
        used for training the normalisation would be applied twice - confirm
        against the training code.
    """

    # Flattened feature count after the four stages for a (1, 64, 44) input.
    _FLATTENED_FEATURES = 128 * 5 * 4

    def __init__(self, num_classes=10):
        super().__init__()
        # 4 convolutional blocks / flatten the results / linear layer / softmax
        self.conv1 = self._conv_block(1, 16)
        self.conv2 = self._conv_block(16, 32)
        self.conv3 = self._conv_block(32, 64)
        self.conv4 = self._conv_block(64, 128)
        self.flatten = nn.Flatten()
        self.linear = nn.Linear(self._FLATTENED_FEATURES, num_classes)
        self.softmax = nn.Softmax(dim=1)

    @staticmethod
    def _conv_block(in_channels, out_channels):
        """Build one Conv2d -> ReLU -> MaxPool2d stage."""
        return nn.Sequential(
            nn.Conv2d(
                in_channels=in_channels,
                out_channels=out_channels,
                kernel_size=3,
                stride=1,
                padding=2,
            ),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2),
        )

    def forward(self, input_data):
        """Run the network on a batch.

        Args:
            input_data: Batch tensor fed to the first ``Conv2d`` (one input
                channel).

        Returns:
            Tensor of shape ``(batch, num_classes)`` whose rows sum to 1.
        """
        x = self.conv1(input_data)
        x = self.conv2(x)
        x = self.conv3(x)
        x = self.conv4(x)
        x = self.flatten(x)
        logits = self.linear(x)
        return self.softmax(logits)



In [26]:
if __name__ == "__main__":
    # Build the model on the CPU and print a layer-by-layer summary for a
    # single-channel 64 x 44 input.
    cnn = CNNNetwork()
    cnn = cnn.to('cpu')
    # torchsummary.summary defaults to device="cuda" and creates its dummy
    # input on the GPU whenever CUDA is available.  The model was just moved
    # to the CPU, so pin the device explicitly to keep model and input on the
    # same device on every machine.
    summary(cnn, (1, 64, 44), device='cpu')

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1           [-1, 16, 66, 46]             160
              ReLU-2           [-1, 16, 66, 46]               0
         MaxPool2d-3           [-1, 16, 33, 23]               0
            Conv2d-4           [-1, 32, 35, 25]           4,640
              ReLU-5           [-1, 32, 35, 25]               0
         MaxPool2d-6           [-1, 32, 17, 12]               0
            Conv2d-7           [-1, 64, 19, 14]          18,496
              ReLU-8           [-1, 64, 19, 14]               0
         MaxPool2d-9             [-1, 64, 9, 7]               0
           Conv2d-10           [-1, 128, 11, 9]          73,856
             ReLU-11           [-1, 128, 11, 9]               0
        MaxPool2d-12            [-1, 128, 5, 4]               0
          Flatten-13                 [-1, 2560]               0
           Linear-14                   

In [None]:
def spectrogram(file_path, n_mfcc=13):
    """Load an audio file and return its time-averaged MFCC vector.

    Despite the name, the result is not a 2-D spectrogram: the MFCC matrix is
    averaged over axis 1, leaving one value per coefficient.

    Args:
        file_path: Path of the audio file, passed to ``librosa.load``.
        n_mfcc: Number of MFCC coefficients to compute.  Defaults to 13, the
            value that used to be hard-coded.

    Returns:
        ``numpy.ndarray`` holding the mean of each MFCC coefficient.
    """
    # sr=None keeps the file's native sampling rate instead of resampling.
    audio, sr = librosa.load(file_path, sr=None)

    # librosa.feature has no `vfcc`; the MFCC extractor is `mfcc`.
    mfcc = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=n_mfcc)

    # Average across axis 1 so clips of any length give a fixed-size vector.
    return np.mean(mfcc, axis=1)

In [None]:
# Prepend two singleton axes to the feature array, then convert it to a
# float32 tensor and show the resulting shape.
# NOTE(review): `mfcc` is not assigned at notebook level in any cell visible
# here (it is only a local inside `spectrogram`) - presumably it should hold
# the result of a `spectrogram(...)` call; confirm.  The cell also rebinds
# `mfcc`, so every re-run adds two more leading axes.
mfcc = np.expand_dims(np.expand_dims(mfcc, axis=0), axis=0)
mfcc_tensor = torch.tensor(mfcc, dtype=torch.float32)
print(mfcc_tensor.shape)

In [None]:
class AudioDataset(Dataset):
    """Map-style dataset pairing audio file paths with labels.

    Each item is ``(features, label)``, where ``features`` is the output of
    ``spectrogram(file_path)`` converted to a ``float32`` tensor with two
    singleton dimensions prepended.

    Args:
        file_paths: Sequence of audio file paths.
        labels: Sequence of labels aligned index-for-index with
            ``file_paths``.
        transform: Stored as ``self.transform``; ``__getitem__`` does not
            apply it.

    Raises:
        ValueError: If ``file_paths`` and ``labels`` differ in length.

    Note:
        NOTE(review): ``CNNNetwork`` in this notebook has a linear layer sized
        for ``(1, 64, 44)`` inputs; samples built from a short feature vector
        will not match it - confirm the intended feature shape.
    """

    def __init__(self, file_paths, labels, transform=None):
        # Fail at construction rather than with an IndexError in the middle
        # of an epoch when the two sequences do not line up.
        if len(file_paths) != len(labels):
            raise ValueError(
                f"file_paths and labels must have the same length, "
                f"got {len(file_paths)} and {len(labels)}"
            )
        self.file_paths = file_paths
        self.labels = labels
        self.transform = transform

    def __len__(self):
        """Return the number of audio files in the dataset."""
        return len(self.file_paths)

    def __getitem__(self, index):
        """Return ``(features, label)`` for the file at ``index``."""
        file_path = self.file_paths[index]
        label = self.labels[index]

        # `extract_features` is not defined anywhere before this dataset is
        # iterated; `spectrogram` is the feature extractor this notebook
        # defines.
        features = spectrogram(file_path)

        # `torch.float23` does not exist; the intended dtype is float32.
        features = torch.tensor(features, dtype=torch.float32)

        # Prepend two singleton dimensions, e.g. (n,) -> (1, 1, n).
        features = features.unsqueeze(0).unsqueeze(0)

        return features, label
    
# Glob pattern matching every clip in every UrbanSound8K fold directory.
# NOTE(review): absolute, machine-specific path - consider making it
# configurable.
AUDIO_GLOB = '/Users/gabrielasimon/Desktop/ac2/UrbanSound8K/audio/*/*.wav'

# `glob` is imported as a module, so the function is `glob.glob`; calling the
# module itself raises TypeError.  Sorting makes the file order deterministic.
file_paths = sorted(glob.glob(AUDIO_GLOB))
if not file_paths:
    raise FileNotFoundError(f"No .wav files matched {AUDIO_GLOB!r}")

# UrbanSound8K clips are named "<fsID>-<classID>-<occurrenceID>-<sliceID>.wav".
# Use the class ID as the label so there is exactly one label per file; the
# previous three-element placeholder list could not line up with the files.
labels = [int(os.path.basename(path).split('-')[1]) for path in file_paths]

dataset = AudioDataset(file_paths=file_paths, labels=labels)

data_loader = DataLoader(dataset, batch_size=32, shuffle=True)

for input, targets, in data_loader: