In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory
dir_path = ''
kaggle = '/kaggle/input/'
if os.path.isdir(kaggle):
    dir_path = kaggle
# Read metadata file
data_paths = {'appliences':dir_path + 'appliences/appliences/',
              'urban':dir_path + 'urbansound8k/',
              'raw' :dir_path + 'appliances-raw/'}
meta1 = data_paths['appliences'] + 'metadata/appliences.csv'
meta2 = data_paths['urban'] + 'UrbanSound8K.csv'
df_UrbanSound = pd.read_csv(meta2)
df_Appliences = pd.read_csv(meta1)
smpl = df_UrbanSound.classID.value_counts().max() / df_Appliences.classID.value_counts().min()
#df_Appliences = df_Appliences.sample(frac=upsmpl, replace=True, random_state=1)
df_UrbanSound = df_UrbanSound.sample(frac=(1/smpl), replace=True, random_state=1)

df_UrbanSound['relative_path'] = '/fold' + df_UrbanSound['fold'].astype(str) + '/' + df_UrbanSound['slice_file_name'].astype(str)
df_Appliences['relative_path'] = '/' + df_Appliences['fold'].astype(str) + '/' + df_Appliences['slice_file_name'].astype(str)

df = pd.concat([df_UrbanSound,df_Appliences])
df = df.reset_index(drop=True)
df.head()

#Take relative columns
df = df[['relative_path', 'classID']]
df.head()
#print((df.classID==1).sum())


Unnamed: 0,relative_path,classID
0,/fold9/103249-5-0-8.wav,5
1,/fold2/201652-5-3-0.wav,5
2,/fold1/118279-8-0-5.wav,8
3,/fold6/74726-8-0-0.wav,8
4,/fold8/162103-0-0-5.wav,0


In [2]:
print(df.classID.value_counts())

11    20
5     16
10    16
8     15
0     11
2     11
12    11
9     10
4      9
3      9
7      8
1      4
6      3
Name: classID, dtype: int64


In [3]:
import math, random
import torch
import torchaudio
from torchaudio import transforms
from IPython.display import Audio
import matplotlib
import matplotlib.pyplot as plt

class AudioUtil():
    #---------------
    # Load an audio file. Return the signal as a tensor and the sample rate
    #---------------
    @staticmethod
    def open(audio_file):
        sig, sr = torchaudio.load(audio_file)
        return (sig, sr)
    
    #---------------
    # Convert soundfile to desired number of channels
    #---------------
    @staticmethod
    def rechannel(aud, new_channel):
        
        sig, sr = aud
        
        if sig.shape[0] == new_channel:
            #Nothing todo
            return aud
        
        if (new_channel == 1):
            #Convert stereo to mono by selecting only the first channel
            resig = sig[:1, :]
        else:
            #Convert from mono to sterio by duplicating the first channel
            resig = torch.cat([sig,sig])
        return ((resig, sr))
    
    #---------------
    #Resample to make sure samplerate is the same for all files - resample applies to one channel at a time
    #---------------
    @staticmethod
    def resample(aud, newsr):
        
        sig, sr = aud
        
        if (sr == newsr):
            #do nothing
            return aud
        
        num_channels = sig.shape[0]
        
        #resample first channel
        resig = torchaudio.transforms.Resample(sr, newsr)(sig[:1,:])
        
        if (num_channels > 1):
            #Resample the second channel and merge both
            retwo = torchaudio.transforms.Resample(sr, newsr)(sig[1:,:])
            resig = torch.cat([resig, retwo])
            
        return ((resig, newsr))
    
    
    #-----------------
    #Pad or turncate the signal to be off a standard length in milliseconds
    #-----------------
    @staticmethod
    def pad_trunc(aud, max_ms):

        sig, sr = aud
        num_rows, sig_len = sig.shape
        max_len = sr//1000 * max_ms
        
        if (sig_len > max_len):
            #Turncate the signal to the given length
            sig = sig[:,:max_len]
        elif (sig_len < max_len):
            pad_begin_len = random.randint(0, max_len - sig_len)
            pad_end_len = max_len - sig_len - pad_begin_len
            
            #pad with zeroes
            pad_begin = torch.zeros((num_rows, pad_begin_len))
            pad_end = torch.zeros((num_rows, pad_end_len))
            
            sig = torch.cat((pad_begin, sig, pad_end), 1)
        
        return (sig, sr)

    #--------------------
    #Shift the signal by a random bit, end of signal is wrapped around 
    #to beginning
    #--------------------
    @staticmethod
    def time_shift(aud, shift_limit):
        sig, sr = aud
        
        _, sig_len = sig.shape
        shift_amt = int(random.random() * shift_limit * sig_len)
        return (sig.roll(shift_amt), sr)
    
    #----------------------------------
    #Genetate spectrogram
    #----------------------------------
    @staticmethod
    def spectro_gram(aud, n_mels=64, n_fft=1024, hop_len=None):
        sig,sr = aud
        top_db = 80
        
        #spec has shape [channel, n_mels, time]
        spec = transforms.MelSpectrogram(sr, n_fft=n_fft, hop_length=hop_len, n_mels=n_mels)(sig)
        
        #convert to db
        spec = transforms.AmplitudeToDB(top_db=top_db)(spec)
        
        return spec
    
    
    #Augment the spectrogram by masking out some sections of it in both the frequency
    #dimencion (Horizontal) and the time dimension (vertical bars)
    
    @staticmethod
    def spectro_augment(spec, max_mask_pct=0.1, n_freq_masks=1, n_time_masks=1):
        _, n_mels, n_steps = spec.shape
        mask_value = spec.mean()
        aug_spec = spec
        
        freq_mask_param = max_mask_pct * n_mels
        for _ in range(n_freq_masks):
            aug_spec = transforms.FrequencyMasking(freq_mask_param)(aug_spec, mask_value)
        
        time_mask_param = max_mask_pct * n_steps
        for _ in range(n_time_masks):
            aug_spec = transforms.TimeMasking(time_mask_param)(aug_spec, mask_value)
        
        return np.absolute(aug_spec) # Jonas added np.absolute
    
        

In [4]:
from torch.utils.data import DataLoader, Dataset, random_split
import torchaudio

#----------------
#Sound dataset
#----------------
    
class SoundDS(Dataset):
    def __init__(self, df, data_paths):
        self.df = df
        self.data_paths = data_paths
        self.duration = 4000
        self.sr = 44100
        self.channel = 2
        self.shift_pct = 0.4
    
    #------------------
    #Number of items in dataset
    #------------------
    def __len__(self):
        return len(self.df)

    #------------------
    #Get i'th item in dataset
    #------------------
    def __getitem__(self, idx):

        # Get the class ID  
        class_id = self.df.loc[idx, 'classID']
        # Absolute file path of the audio file - concatenate the audio direcory
        # with the relative path
        if class_id < 0:
            audio_file = self.data_paths['raw'] + self.df.loc[idx, 'relative_path']
        elif class_id <= df_UrbanSound['classID'].max():  # UrbanSound data set (classIDs 0 - 10)
            audio_file = self.data_paths['urban'] + self.df.loc[idx, 'relative_path']
        else: # Appliences (Class IDs => 10)
            audio_file = self.data_paths['appliences'] + self.df.loc[idx, 'relative_path']
              
        aud = AudioUtil.open(audio_file)
        #Make all the sounds have the same number of channels and same sample rate
        #Then make all samples the same length
        reaud = AudioUtil.resample(aud, self.sr)
        rechan = AudioUtil.rechannel(reaud, self.channel)
        
        dur_aud = AudioUtil.pad_trunc(rechan, self.duration)
        shift_aud = AudioUtil.time_shift(dur_aud, self.shift_pct)
        sgram = AudioUtil.spectro_gram(shift_aud, n_mels=64, n_fft=1024, hop_len=None)
        #aug_sgram = AudioUtil.spectro_augment(sgram, max_mask_pct=0.1, n_freq_masks=2, n_time_masks=2)
        aug_sgram = AudioUtil.spectro_augment(sgram, max_mask_pct=0.1, n_freq_masks=2, n_time_masks=2)
        return aug_sgram, class_id


In [5]:
# Splitt data for training and validation

myds = SoundDS(df, data_paths) #returns spectrogram and classID
#Random split between training and validation data

num_items = len(myds)
num_train = round(num_items*0.8)
num_val = num_items - num_train
train_ds, val_ds = random_split(myds, [num_train, num_val])

# Create training and validation data loaders
train_dl = torch.utils.data.DataLoader(train_ds, batch_size=16, shuffle=True)
val_dl = torch.utils.data.DataLoader(val_ds, batch_size=16, shuffle=False)


In [6]:
import torch.nn.functional as F
from torch.nn import init
from torch import nn

#---------------------------
# Audio Classification Model
#---------------------------
class AudioClassifier (nn.Module):
    #---------------------------
    #Build the model architecture
    #---------------------------
    
    def __init__(self):
        super().__init__()
        conv_layers = []
        
        #First convolution block with Relu and Batch Norm. Use Kaiming Initialisation
        self.conv1 = nn.Conv2d(2, 8, kernel_size=(5, 5), stride=(2, 2), padding=(2, 2))
        self.relu1 = nn.ReLU()
        self.bn1 = nn.BatchNorm2d(8)
        init.kaiming_normal_(self.conv1.weight, a=0.1)
        self.conv1.bias.data.zero_()
        conv_layers += [self.conv1, self.relu1, self.bn1]

        #Second convolution block
        self.conv2 = nn.Conv2d(8, 16, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
        self.relu2 = nn.ReLU()
        self.bn2 = nn.BatchNorm2d(16)
        init.kaiming_normal_(self.conv2.weight, a=0.1)
        self.conv2.bias.data.zero_()
        conv_layers += [self.conv2, self.relu2, self.bn2]
        
        #Third convolution block
        self.conv3 = nn.Conv2d(16, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
        self.relu3 = nn.ReLU()
        self.bn3 = nn.BatchNorm2d(32)
        init.kaiming_normal_(self.conv3.weight, a=0.1)
        self.conv3.bias.data.zero_()
        conv_layers += [self.conv3, self.relu3, self.bn3]

        #Fourth convolution block
        self.conv4 = nn.Conv2d(32, 64, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
        self.relu4 = nn.ReLU()
        self.bn4 = nn.BatchNorm2d(64)
        init.kaiming_normal_(self.conv4.weight, a=0.1)
        self.conv4.bias.data.zero_()
        conv_layers += [self.conv4, self.relu4, self.bn4] 
        
        # Linear classifier
        self.ap = nn.AdaptiveAvgPool2d(output_size=1)
        self.lin = nn.Linear(in_features=64, out_features=df.classID.max()+1) #Changed Jonas out_features
        
        # Wrap the convolutional blocks
        self.conv = nn.Sequential(*conv_layers)
    
    #-------------------------
    #Forward pass computations
    #-------------------------
    def forward(self, x):
        # Run convolutional blocks
        x = self.conv(x)
        
        # Adaptive pool and flatten for input to linear layer
        x = self.ap(x)
        x = x.view(x.shape[0], -1)
        
        # Linear layer
        x = self.lin(x)
        
        # Final output
        
        return x

# Create the model and put it on the GPU if available
myModel =  AudioClassifier()
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
myModel = myModel.to(device)
#Check that it is on Cuda
next(myModel.parameters()).device

device(type='cpu')

In [7]:
import numpy as np


#-------------------
#Training loop
#-------------------
def training(model, train_dl, num_epochs):
    #Loss function optimiser and Scheduler
    criterion = nn.CrossEntropyLoss()
    optimiser = torch.optim.Adam(model.parameters(), lr=0.001)
    scheduler = torch.optim.lr_scheduler.OneCycleLR(optimiser, max_lr=0.001,
                                                    steps_per_epoch=int(len(train_dl)),
                                                    epochs=num_epochs,
                                                    anneal_strategy='linear')
    
    #Repeat for each epoch
    for epoch in range(num_epochs):
        running_loss = 0.0
        correct_prediction = 0
        total_prediction = 0
        
        #repeat for each batch in the training set
        for i, data in enumerate(train_dl):
            # Get the input features and target labels and put them on the GPU
            inputs, labels = data[0].to(device), data[1].to(device)
            
            #Normalise the inputs
            inputs_m, inputs_s = inputs.mean(), inputs.std()
            inputs = (inputs - inputs_m) / inputs_s
            
            #Zero prameter gradients
            optimiser.zero_grad()
            
            #Forward + backward + optimise
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimiser.step()
            scheduler.step()
            
            #Keep stats for Loss and Accuracy
            running_loss += loss.item()
            
            
            #Get predicted class with highest score
            _, prediction = torch.max(outputs, 1)
            # Count the predictions that matched the target label
            correct_prediction += (prediction == labels).sum().item()
            total_prediction += prediction.shape[0]
            
            if i % 10 == 0: #
                print('[%d, %5d] loss: %.3f' % (epoch + 1, i + 1, running_loss / 10))
        
        #Print stats at the end of each epoch
        num_batches = len(train_dl)
        avg_loss = running_loss / num_batches
        acc = correct_prediction / total_prediction
        print(f'Epoch: {epoch}, Loss: {avg_loss:.2f}, Accuracy: {acc:.2f}')
    
    print('Finished Training')
    
num_epochs = 2
training(myModel, train_dl, num_epochs)
    

[1,     1] loss: 0.260
Epoch: 0, Loss: 2.58, Accuracy: 0.10
[2,     1] loss: 0.243
Epoch: 1, Loss: 2.43, Accuracy: 0.31
Finished Training


In [8]:
import numpy as np

#-------------------
#Inferrence
#-------------------

def inferrence (model, v_dl):
    correct_prediction = 0
    total_prediction = 0
    
    #Disable gradient updates
    with torch.no_grad():
        for data in v_dl:
            # Get the input features and target labels and put them on the GPU
            inputs, labels = data[0].to(device), data[1].to(device)
            #Normalise the inputs
            inputs_m, inputs_s = inputs.mean(), inputs.std()
            inputs = (inputs - inputs_m) / inputs_s
            
            #Get predictions
            outputs = model(inputs)
            
            #Get predicted putputs with highest score
            i, prediction = torch.max(outputs,1)
            for n in range(len(outputs)):
                if labels[n].item() == -1:
                    print("Predicted class:{}, score:{}, actual class:{}".format(prediction[n].item(), outputs[n].max().item(), labels[n].item()))
                else:
                    if prediction[n].item() == labels[n].item():
                        df_metr['correct_pred'].loc[prediction[n].item()] += 1
                    else:
                        df_metr['false_pred'].loc[prediction[n].item()] += 1
                #print("Predicted class:{}, score:{}, actual class:{}".format(prediction[n].item(), outputs[n].max().item(), labels[n].item()))
            #print(outputs[15].max(),prediction[15], labels[15])
            #Count of predictions that matched target label
            correct_prediction += (prediction == labels).sum().item()
            total_prediction += prediction.shape[0]
            
    acc = correct_prediction / total_prediction
    df_metr['accuracy'] = df_metr['correct_pred'] / (df_metr['correct_pred'] + df_metr['false_pred'])
    print(f'Accuracy: {acc:.2f}, Total items: {total_prediction}')

    
classIDs = sorted(list(df.classID.unique()))
df_metr = pd.DataFrame({'classIDs': classIDs,
                        'correct_pred': np.zeros(len(classIDs)),
                        'false_pred': np.zeros(len(classIDs)),
                        'accuracy': np.zeros(len(classIDs))})
df_metr = df_metr.set_index('classIDs')

#Run inferrence on trained model with validation set
inferrence(myModel, val_dl)


Accuracy: 0.17, Total items: 29


In [9]:
print(df_metr)

          correct_pred  false_pred  accuracy
classIDs                                    
0                  1.0         2.0  0.333333
1                  0.0         0.0       NaN
2                  0.0         0.0       NaN
3                  0.0         1.0  0.000000
4                  0.0         0.0       NaN
5                  0.0         6.0  0.000000
6                  0.0         0.0       NaN
7                  0.0         0.0       NaN
8                  0.0         4.0  0.000000
9                  0.0         2.0  0.000000
10                 4.0         0.0  1.000000
11                 0.0         8.0  0.000000
12                 0.0         1.0  0.000000


In [10]:
# Test of inferrence with unlabeled recording of dehumifier

test_rec = {'relative_path' : ['Recording_11.wav'],
            'classID' : [-1]}

df_test = pd.DataFrame(test_rec)

testds = SoundDS(df_test, data_paths)
test_dl = torch.utils.data.DataLoader(testds, batch_size=16, shuffle=False)
inferrence(myModel, test_dl)


Predicted class:5, score:0.12656208872795105, actual class:-1
Accuracy: 0.00, Total items: 1


In [11]:
from pathlib import Path
if str(Path.cwd()) == '/kaggle/working':
    print('kaggle')

print(os.listdir())

kaggle
['__notebook__.ipynb']
