In [6]:
import sys
import os
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
#import panns_inference

TUT_CSV = 'Datasets/TUT18_train.csv'
SCAPPER_CSV = 'Datasets/scrapper_train_dataset.csv'
TUT_AUD_DIR = '../audioData/TUTUrban2018/developmentDataset/TUT-urban-acoustic-scenes-2018-development/'
SCAPPER_AUD_DIR = '../audioData/sythenticSoundscenes/train/'

In [11]:
!python3 Datasets/datasets.py

In [54]:
scapper_df = pd.read_csv(SCAPPER_CSV)
scapper_scene_labels = (list)(scapper_df['acoustic_scene_label'].unique())

def label_to_one_hot(labels, label_array=scapper_scene_labels):
    """
    Convert string labels to one-hot encoded labels based on the provided array of labels.

    Args:
    - labels (list of str): List of string labels to convert.
    - label_array (numpy array): Array containing all possible labels.

    Returns:
    - one_hot_encoded (numpy array): One-hot encoded labels corresponding to the input labels.
    """
    label_dict = {label: i for i, label in enumerate(label_array)}
    one_hot_encoded = np.zeros((len(labels), len(label_array)), dtype=int)
    for i, label in enumerate(labels):
        if label in label_dict:
            one_hot_encoded[i, label_dict[label]] = 1
    return torch.tensor(one_hot_encoded)


In [4]:
sys.path.append('Datasets/')
sys.path.append('utils/')
import datasets
import audio_utils

scapper_dataset = datasets.scraperDataset(SCAPPER_CSV, SCAPPER_AUD_DIR, only_scene=False, transforms=audio_utils.get_log_melSpectrogram)

In [9]:
tut_dataset = datasets.TUT18_Dataset(TUT_CSV, TUT_AUD_DIR, transforms=None)

for i in range(3):
    print(tut_dataset[i]['data'].shape)

torch.Size([1, 160000])
torch.Size([1, 160000])
torch.Size([1, 160000])


In [10]:
for i in range(3):
    print(scapper_dataset[i]['data'].shape)

torch.Size([1, 40, 1501])
torch.Size([1, 40, 1501])
torch.Size([1, 40, 1501])


In [6]:
!python3 utils/audio_utils.py Datasets/datasets.py

In [56]:
from torch.utils.data import DataLoader

train_loader = DataLoader(scapper_dataset, batch_size=32, shuffle=True)

for i, batch in enumerate(train_loader):
    if i == 0:
        print(label_to_one_hot(batch['scene_label']))
    else:
        break

tensor([[0, 0, 0, 0, 0, 0, 1, 0, 0, 0],
        [0, 0, 1, 0, 0, 0, 0, 0, 0, 0],
        [1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 1, 0, 0, 0],
        [1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 1, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 1, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 1, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 1],
        [0, 0, 0, 0, 0, 0, 0, 0, 1, 0],
        [0, 1, 0, 0, 0, 0, 0, 0, 0, 0],
        [1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 1, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 1, 0],
        [0, 0, 0, 0, 0, 0, 0, 1, 0, 0],
        [0, 0, 0, 1, 0, 0, 0, 0, 0, 0],
        [0, 1, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 1, 0, 0, 0],
        [0, 0, 0, 1, 0, 0, 0, 0, 0, 0],
        [0, 0, 1, 0, 0, 0, 0, 0, 0, 0],
        [1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 1, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 1, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 1, 0, 0, 0, 0],


In [90]:
import torch.nn.functional as F

class ASC_Model00(nn.Module):
    def __init__(self):
        super(ASC_Model00, self).__init__()
        
        # Convolutional layers
        self.conv1 = nn.Conv2d(in_channels=1, out_channels=32, kernel_size=3, stride=2, padding=1)
        self.conv2 = nn.Conv2d(in_channels=32, out_channels=64, kernel_size=3, stride=2, padding=1)
        self.conv3 = nn.Conv2d(in_channels=64, out_channels=128, kernel_size=3, stride=2, padding=1)
        
        # Pooling layer
        self.pool = nn.AdaptiveAvgPool2d(output_size=1)
        
        # Fully connected layers
        self.fc1 = nn.Linear(128, 256)
        self.fc2 = nn.Linear(256, 1024)
        self.fc3 = nn.Linear(1024, 2048)
        self.relu = nn.ReLU()
        # Output layers
        self.output_layer = nn.Linear(2048, 10)
        self.softmax = nn.Softmax()
        #self.second_last_layer = nn.Linear(2048, 2048)
        
    def forward(self, x):
        # Convolutional layers
        x = self.relu(self.conv1(x))
        x = self.relu(self.conv2(x))
        x = self.relu(self.conv3(x))
        
        # Pooling layer
        x = self.pool(x)
        
        # Flatten tensor
        x = x.flatten(start_dim=1)
        # Fully connected layers
        x = self.relu(self.fc1(x))
        x = self.relu(self.fc2(x))
        second_last = self.relu(self.fc3(x))
        
        # Output layers
        output = self.output_layer(second_last)
        #second_last = self.second_last_layer(x)
        
        return (output), second_last

In [91]:
model = ASC_Model00()
loss_fn = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [92]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = model.to(device)

epochs = 10

for epoch in range(epochs):
    running_loss = 0
    model.train()
    for i, batch in enumerate(train_loader):

        optimizer.zero_grad()
        inputs, labels = batch['spec'], label_to_one_hot(batch['scene_label'])
        inputs = inputs.to(device)
        labels = labels.to(device = device, dtype=torch.float)

        outputs, second_last = model(inputs)
        loss = loss_fn(outputs, labels)
        running_loss += loss.item()
        loss.backward()
        optimizer.step()
    print(f'Epoch {epoch + 1}, Loss: {running_loss / len(train_loader)}')

Epoch 1, Loss: 2.286044120788574
Epoch 2, Loss: 1.987437736480794
Epoch 3, Loss: 1.8101707544732601
Epoch 4, Loss: 1.6402381341508094
Epoch 5, Loss: 1.5014876596471096
Epoch 6, Loss: 1.2825214583823021
Epoch 7, Loss: 1.0308161117929093
Epoch 8, Loss: 0.8724342448280212
Epoch 9, Loss: 0.7371335457614128
Epoch 10, Loss: 0.5589560324207266


In [103]:
i = 1963
sample = os.path.join(SCAPPER_AUD_DIR, scapper_df['audio_fileNames'][i])
audio = audio_utils.load_audio_from_file(sample)
spec = audio_utils.get_log_melSpectrogram(audio)

model = model.cpu()
with torch.inference_mode():
    model.eval()
    output, second_last = model(spec.unsqueeze(0))
    print(torch.sigmoid(output))
    print(output)

print(scapper_df['acoustic_scene_label'][i])


tensor([[0.0246, 0.0101, 0.1303, 0.7482, 0.0101, 0.0275, 0.9643, 0.7399, 0.1653,
         0.0674]])
tensor([[-3.6797, -4.5863, -1.8986,  1.0888, -4.5824, -3.5652,  3.2959,  1.0453,
         -1.6197, -2.6277]])
restaurant


In [86]:
scapper_scene_labels

['bus',
 'busystreet',
 'office',
 'openairmarket',
 'park',
 'quietstreet',
 'restaurant',
 'supermarket',
 'tube',
 'tubestation']

In [20]:
for i in range(0,5):
    print(label_to_one_hot([scapper_dataset[i]['scene_label']]))

[[1 0 0 0 0 0 0 0 0 0]]
[[1 0 0 0 0 0 0 0 0 0]]
[[1 0 0 0 0 0 0 0 0 0]]
[[1 0 0 0 0 0 0 0 0 0]]
[[1 0 0 0 0 0 0 0 0 0]]
