# Hearbet Sound Anomaly Detector - Training

In [1]:
import pandas as pd
from pathlib import Path
import torchaudio
import torchaudio.transforms as T
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
from torchmetrics import Precision

In [2]:
torch.__version__

'2.1.2+cu121'

In [3]:
torch.cuda.is_available()

True

## Creating Pandas dataframe

In [4]:
AUDIO_DIR = Path('../unzipped_data/')

def list_files():
    for file in AUDIO_DIR.glob('**/*.wav'):
        yield file.as_posix()

def create_dataframe():
    data_files = []
    for filename in list_files():
        if filename.find('artifact')>-1:
            data_files.append((filename, 'artifact'))
        if filename.find('extrahls')>-1:
            data_files.append((filename, 'extrahls'))
        if filename.find('extrastole')>-1:
            data_files.append((filename, 'extrastole'))
        if filename.find('murmur')>-1:
            data_files.append((filename, 'murmur'))
        if filename.find('normal')>-1:
            data_files.append((filename, 'normal'))

    return pd.DataFrame(data_files, columns=('fname', 'label'))

In [5]:
dataframe = create_dataframe()

## Custom Dataset and DataLoaders

In [6]:
labels = dataframe.label.unique()
dict(zip(labels, range(len(labels))))

{'artifact': 0, 'extrahls': 1, 'murmur': 2, 'normal': 3, 'extrastole': 4}

In [7]:
class AudioDataset(Dataset):
    def __init__(self, dataframe: pd.DataFrame, audio_length: float, target_sample_rate: int) -> None:
        self.dataframe = dataframe
        self.audio_length = audio_length
        self.target_sample_rate = target_sample_rate
        self.num_samples = target_sample_rate * audio_length
        self.labels = dataframe['label'].values
        self.filenames = dataframe['fname'].values
        self.class_indices = {'artifact': 0, 'extrahls': 1, 'murmur': 2, 'normal': 3, 'extrastole': 4}
        
    def __len__(self):
        return len(self.dataframe)
    
    def __getitem__(self, index) -> dict:
        waveform, sample_rate = torchaudio.load(self.filenames[index])
        waveform = torch.mean(waveform, axis=0)
        
        if sample_rate!=self.target_sample_rate:
            resampler = T.Resample(sample_rate, self.target_sample_rate)
            waveform = resampler(waveform)
        
        if waveform.shape[0] > self.num_samples:
            waveform = waveform[:self.num_samples]
        else:
            waveform = F.pad(waveform, (0, self.num_samples - waveform.shape[0]))
        melspectrgoram = T.MelSpectrogram(n_fft=128, n_mels=128, hop_length=128)
        melspec = melspectrgoram(waveform)
        
        class_idx = self.class_indices[self.labels[index]]
        
        return torch.stack([melspec]), class_idx

In [8]:
SEED = 42
AUDIO_LENGTH: int = 15
TARGET_SAMPLE_RATE: int = 4000

audio_dataset = AudioDataset(dataframe, AUDIO_LENGTH, TARGET_SAMPLE_RATE)
generator = torch.Generator().manual_seed(SEED)
train_dataset, test_dataset = torch.utils.data.random_split(audio_dataset, [.7, .3], generator)

In [9]:
print('Train:', len(train_dataset.indices), '-- Test:', len(test_dataset.indices))

Train: 410 -- Test: 175


In [10]:
del dataframe, audio_dataset

## Training

### Architecture

In [11]:
# NUM_CLASSES: 5
# INPUT_SHAPE: (128, 469, 1)

class HearbetDetectorModel(nn.Module):
    def __init__(self) -> None:
        super().__init__()
        
        self.conv_block1 = nn.Sequential(
            nn.Conv2d(1, 16, kernel_size=(3, 4)),
            nn.MaxPool2d(kernel_size=(2, 3), stride=(2, 3)),
            nn.ReLU()
        )
        self.conv_block2 = nn.Sequential(
            nn.Conv2d(16, 32, kernel_size=(3, 4)),
            nn.MaxPool2d(kernel_size=(2, 3), stride=(2, 3)),
            nn.ReLU()
        )
        self.conv_block3 = nn.Sequential(
            nn.Conv2d(32, 64, kernel_size=(3, 4)),
            nn.ReLU()
        )
        self.dense_layers = nn.Sequential(
            nn.Linear(64*28*47, 100),
            nn.ReLU(),
            nn.Linear(100, 5)
        )
    
    def forward(self, X):
        X = self.conv_block1(X)
        X = self.conv_block2(X)
        X = self.conv_block3(X)
        X = X.view(-1, 64*28*47)
        X = self.dense_layers(X)
        return X

In [12]:
print(HearbetDetectorModel())

HearbetDetectorModel(
  (conv_block1): Sequential(
    (0): Conv2d(1, 16, kernel_size=(3, 4), stride=(1, 1))
    (1): MaxPool2d(kernel_size=(2, 3), stride=(2, 3), padding=0, dilation=1, ceil_mode=False)
    (2): ReLU()
  )
  (conv_block2): Sequential(
    (0): Conv2d(16, 32, kernel_size=(3, 4), stride=(1, 1))
    (1): MaxPool2d(kernel_size=(2, 3), stride=(2, 3), padding=0, dilation=1, ceil_mode=False)
    (2): ReLU()
  )
  (conv_block3): Sequential(
    (0): Conv2d(32, 64, kernel_size=(3, 4), stride=(1, 1))
    (1): ReLU()
  )
  (dense_layers): Sequential(
    (0): Linear(in_features=84224, out_features=100, bias=True)
    (1): ReLU()
    (2): Linear(in_features=100, out_features=5, bias=True)
  )
)


### Training model

In [30]:
from typing import Callable

class HearbetDetectorNetwork:
    def __init__(self, model, lr: float) -> None:
        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
        self.model = model.to(self.device)
        self.__optimizer = torch.optim.SGD(params=model.parameters(), lr=lr)
        self.__precision_fn: Callable = Precision(task='multiclass', num_classes=5).to(self.device)

    def __init_weights(self, module):
        if isinstance(module, (nn.Linear, nn.Conv2d)):
            torch.nn.init.xavier_uniform_(module.weight)
            module.bias.data.fill_(0.01)

    def __get_report_by_epoch(self, *args) -> None:
        print('Epoch: {} | Train loss: {:.4f} - Train precision: {:.4f} --- Val loss: {:.4f} - Val precision: {:.4f}'.format(*args))

    def train(self, loss_function, epochs: int, batch_size: int, training_set, validation_set):
        self.model.apply(self.__init_weights)

        train_loader = DataLoader(training_set, batch_size=batch_size, shuffle=True)
        val_loader = DataLoader(validation_set, batch_size=batch_size)

        print('Training on', self.device)

        for epoch in range(epochs):
            train_loss = 0
            for specs, labels in train_loader:
                specs, labels = specs.to(self.device), labels.to(self.device)
                outputs = self.model(specs)
                loss = loss_function(outputs, labels)
                train_loss += loss.item()

                self.__optimizer.zero_grad()
                loss.backward()
                self.__optimizer.step()

            with torch.inference_mode():
                self.model.eval()
                train_precision = 0
                for specs, labels in train_loader:
                    specs, labels = specs.to(self.device), labels.to(self.device)
                    outputs = self.model(specs)
                    train_precision += self.__precision_fn(outputs, labels)

            with torch.inference_mode():
                self.model.eval()
                val_precision, val_loss = 0, 0
                for specs, labels in val_loader:
                    specs, labels = specs.to(self.device), labels.to(self.device)
                    outputs = self.model(specs)
                    loss = loss_function(outputs, labels)
                    val_loss += loss.item()
                    val_precision += self.__precision_fn(outputs, labels)

            self.__get_report_by_epoch(epoch + 1,
                                       train_loss/len(train_loader),
                                       train_precision/len(train_loader),
                                       val_loss/len(val_loader), 
                                       val_precision/len(val_loader))

    def predict(self, X):
        return self.model(X)

In [32]:
hearbetDetectorNetwork = HearbetDetectorNetwork(HearbetDetectorModel(), lr=0.001)

hearbetDetectorNetwork.train(nn.CrossEntropyLoss(), epochs=15, batch_size=15, training_set=train_dataset, validation_set=test_dataset)

Training on cuda
Epoch: 1 | Train loss: 1.4824 - Train precision: 0.5762 --- Val loss: 1.4547 - Val precision: 0.4583
Epoch: 2 | Train loss: 1.0412 - Train precision: 0.6738 --- Val loss: 1.3538 - Val precision: 0.5806
Epoch: 3 | Train loss: 0.9300 - Train precision: 0.7071 --- Val loss: 1.4102 - Val precision: 0.6167
Epoch: 4 | Train loss: 0.8640 - Train precision: 0.7214 --- Val loss: 1.4957 - Val precision: 0.5917
Epoch: 5 | Train loss: 0.8170 - Train precision: 0.7333 --- Val loss: 1.4561 - Val precision: 0.5917
Epoch: 6 | Train loss: 0.7857 - Train precision: 0.7333 --- Val loss: 1.5030 - Val precision: 0.5833
Epoch: 7 | Train loss: 0.7409 - Train precision: 0.7381 --- Val loss: 1.4173 - Val precision: 0.6000
Epoch: 8 | Train loss: 0.7133 - Train precision: 0.7667 --- Val loss: 1.5883 - Val precision: 0.5778
Epoch: 9 | Train loss: 0.6644 - Train precision: 0.8000 --- Val loss: 1.5146 - Val precision: 0.6083
Epoch: 10 | Train loss: 0.6489 - Train precision: 0.7976 --- Val loss: 1.6