# Hearbet Sound Anomaly Detector - Training

In [147]:
import pandas as pd
from pathlib import Path
import torchaudio
import torchaudio.transforms as T 
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
import numpy as np

## Creating Pandas dataframe

In [148]:
AUDIO_DIR = Path('../unzipped_data/')

def list_files():
    for file in AUDIO_DIR.glob('**/*.wav'):
        yield file.as_posix()

def create_dataframe():     
    data_files = []
    for filename in list_files():
        if filename.find('artifact')>-1:
            data_files.append((filename, 'artifact'))
        if filename.find('extrahls')>-1:
            data_files.append((filename, 'extrahls'))
        if filename.find('extrastole')>-1:
            data_files.append((filename, 'extrastole'))
        if filename.find('murmur')>-1:
            data_files.append((filename, 'murmur'))
        if filename.find('normal')>-1:
            data_files.append((filename, 'normal'))
    
    return pd.DataFrame(data_files, columns=('fname', 'label'))

In [149]:
dataframe = create_dataframe()

## Custom Dataset and DataLoaders

In [214]:
labels = dataframe.label.unique()
dict(zip(labels, range(len(labels))))

{'artifact': 0, 'extrahls': 1, 'murmur': 2, 'normal': 3, 'extrastole': 4}

In [215]:
class AudioDataset(Dataset):
    def __init__(self, dataframe: pd.DataFrame, audio_length: float, target_sample_rate: int) -> None:
        self.dataframe = dataframe
        self.audio_length = audio_length
        self.target_sample_rate = target_sample_rate
        self.num_samples = target_sample_rate * audio_length
        self.labels = dataframe['label'].values
        self.filenames = dataframe['fname'].values
        self.class_indices = {'artifact': 0, 'extrahls': 1, 'murmur': 2, 'normal': 3, 'extrastole': 4}
        
    def __len__(self):
        return len(self.dataframe)
    
    def __getitem__(self, index) -> dict:
        waveform, sample_rate = torchaudio.load(self.filenames[index])
        waveform = torch.mean(waveform, axis=0)
        
        if sample_rate!=self.target_sample_rate:
            resampler = T.Resample(sample_rate, self.target_sample_rate)
            waveform = resampler(waveform)
        
        if waveform.shape[0] > self.num_samples:
            waveform = waveform[:self.num_samples]
        else:
            waveform = F.pad(waveform, (0, self.num_samples - waveform.shape[0]))
        melspectrgoram = T.MelSpectrogram(n_fft=128, n_mels=128, hop_length=128)
        melspec = melspectrgoram(waveform)
        
        class_idx = self.class_indices[self.labels[index]]
        
        return torch.stack([melspec]), class_idx

In [216]:
SEED = 42
AUDIO_LENGTH: int = 15
TARGET_SAMPLE_RATE: int = 4000

audio_dataset = AudioDataset(dataframe, AUDIO_LENGTH, TARGET_SAMPLE_RATE)
generator = torch.Generator().manual_seed(SEED)
train_dataset, test_dataset = torch.utils.data.random_split(audio_dataset, [.7, .3], generator)

In [217]:
print('Train:', len(train_dataset.indices), '-- Test:', len(test_dataset.indices))

Train: 410 -- Test: 175


In [218]:
BATCH_SIZE: int = 16
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(test_dataset)

## Training

In [189]:
NUM_CLASSES: int = 5
INPUT_SHAPE = (128, 469, 1)

class HearbetDetectorModel(nn.Module):
    def __init__(self, num_classes: int) -> None:
        super().__init__()
        self.conv_block1 = nn.Sequential(
            nn.Conv2d(1, 16, kernel_size=(3, 4)),
            nn.MaxPool2d(kernel_size=(2, 3), stride=(2, 3)),
            nn.ReLU()
        )
        self.conv_block2 = nn.Sequential(
            nn.Conv2d(16, 32, kernel_size=(3, 4)),
            nn.MaxPool2d(kernel_size=(2, 3), stride=(2, 3)),
            nn.ReLU()
        )
        self.conv_block3 = nn.Sequential(
            nn.Conv2d(32, 64, kernel_size=(3, 4)),
            nn.ReLU()
        )
        self.dense_layers = nn.Sequential(
            nn.Linear(64*28*47, 100),
            nn.ReLU(),
            nn.Linear(100, num_classes)
        )
    
    def forward(self, X):
        X = self.conv_block1(X)
        X = self.conv_block2(X)
        X = self.conv_block3(X)
        X = X.view(-1, 64*28*47)
        X = self.dense_layers(X)
        return X

In [190]:
model = HearbetDetectorModel(NUM_CLASSES)
model

HearbetDetectorModel(
  (conv_block1): Sequential(
    (0): Conv2d(1, 16, kernel_size=(3, 4), stride=(1, 1))
    (1): MaxPool2d(kernel_size=(2, 3), stride=(2, 3), padding=0, dilation=1, ceil_mode=False)
    (2): ReLU()
  )
  (conv_block2): Sequential(
    (0): Conv2d(16, 32, kernel_size=(3, 4), stride=(1, 1))
    (1): MaxPool2d(kernel_size=(2, 3), stride=(2, 3), padding=0, dilation=1, ceil_mode=False)
    (2): ReLU()
  )
  (conv_block3): Sequential(
    (0): Conv2d(32, 64, kernel_size=(3, 4), stride=(1, 1))
    (1): ReLU()
  )
  (dense_layers): Sequential(
    (0): Linear(in_features=84224, out_features=100, bias=True)
    (1): ReLU()
    (2): Linear(in_features=100, out_features=5, bias=True)
  )
)

In [191]:
loss_fn = nn.CrossEntropyLoss()
lr = 0.001
optimizer = torch.optim.SGD(params=model.parameters(), lr=lr)

In [226]:
NUM_EPOCHS = 5

for epoch in range(NUM_EPOCHS):
    train_loss = []
    for specs, labels in train_loader:
        outputs = model(specs)
        loss = loss_fn(outputs, labels) 
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        train_loss.append(loss.item())
    print(f'Epoch: {epoch} | Training loss: {np.mean(train_loss)}')
        

Epoch: 0 | Training loss: 0.5849769539557971
Epoch: 1 | Training loss: 0.5971282754953091
Epoch: 2 | Training loss: 0.5902307572273108
Epoch: 3 | Training loss: 0.5602060728348218
Epoch: 4 | Training loss: 0.5484531338398273
