<a href="https://colab.research.google.com/github/eikegermann/audio_classifier_test/blob/main/Firearm_classifier_pretrained_test.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install pyroomacoustics

In [12]:
import os
import librosa
import torch
import random

import numpy as np
import torch.nn as nn
import torch.optim as optim
import pyroomacoustics as pra


from pathlib import Path
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from sklearn.metrics import accuracy_score

In [13]:
class AudioDataset(Dataset):
    def __init__(self, root_dir, augment=False):
        self.root_dir = root_dir
        self.labels = sorted(os.listdir(self.root_dir))
        self.filepaths = list(Path(root_dir).rglob("*.wav"))
        self.augment = augment

    def __len__(self):
        return len(self.filepaths)

    def __getitem__(self, index):
        audio_file_path = self.filepaths[index]
        label = self.labels.index(Path(audio_file_path).parent.name)

        audio, sr = librosa.load(audio_file_path, sr=16000)  # Load audio at 16kHz sample rate

        if self.augment:
            audio = self.apply_augmentation(audio, sr)

        # Pad or trim audio to a fixed length of 0.96 seconds (15360 samples)
        max_length = 15360
        audio_padded = np.zeros(max_length)
        audio_padded[:min(max_length, len(audio))] = audio[:max_length]

        # Compute mel-spectrogram
        mel_spectrogram = librosa.feature.melspectrogram(audio_padded, sr=sr, n_fft=400, hop_length=160, n_mels=64)
        mel_spectrogram = librosa.power_to_db(mel_spectrogram, ref=np.max)

        audio_tensor = torch.tensor(mel_spectrogram).float()
        label_tensor = torch.tensor(label, dtype=torch.long)

        return audio_tensor.unsqueeze(0), label_tensor

    def add_reverb(audio, sr=16000, max_order=10, absorption=0.5):
        # Create a shoebox room with the given dimensions and absorption
        room_dim = np.array([10, 7, 3])
        room = pra.ShoeBox(room_dim, absorption=absorption, fs=sr, max_order=max_order)

        # Place the source and microphone in the room
        room.add_source([2, 3, 2], signal=audio)
        room.add_microphone_array(pra.MicrophoneArray(np.array([[4, 5, 2]]).T, room.fs))

        # Compute the RIR (room impulse response)
        room.compute_rir()

        # Simulate the reverberant audio
        room.simulate()
        reverberant_audio = room.mic_array.signals[0, :]

        # Normalize the audio to avoid clipping or distortion
        reverberant_audio = reverberant_audio / np.max(np.abs(reverberant_audio))

        return reverberant_audio

    def apply_frequency_mask(audio, sr, mask_factor=0.5):
        # Convert audio to the frequency domain using the short-time Fourier transform (STFT)
        stft = librosa.stft(audio)

        # Generate a random frequency mask
        mask_shape = stft.shape
        mask = np.random.uniform(low=1-mask_factor, high=1, size=mask_shape)

        # Apply the mask to the STFT
        masked_stft = np.multiply(stft, mask)

        # Convert the masked STFT back to the time domain
        masked_audio = librosa.istft(masked_stft)

        return masked_audio        


    def apply_augmentation(self, audio, sr):
        num_augmentations = random.choice([0, 1, 2])

        augmentation_types = ['pitch_shift', 'add_noise', 'time_stretch', 'reverb', 'masking']
        random.shuffle(augmentation_types)

        for i in range(num_augmentations):
            augmentation_type = augmentation_types[i]

            if augmentation_type == 'pitch_shift':
                pitch_shift = round(random.uniform(-2, 2), 2)
                audio = librosa.effects.pitch_shift(audio, sr, pitch_shift)
            
            elif augmentation_type == 'add_noise':
                noise = np.random.normal(0, 0.005, len(audio))
                audio = audio + noise
                
            elif augmentation_type == 'time_stretch':
                rate = random.uniform(0.9, 1.1)
                audio = librosa.effects.time_stretch(audio, rate)
            
            elif augmentation_type == 'reverb':
                audio = add_reverb(audio, sr)

            elif augmentation_type == 'masking':
                audio = apply_frequency_mask(audio, sr)

        return audio




In [14]:
!pwd

/content


In [15]:
data_path = "drive/MyDrive/audio_ml_data/firearm_samples/"
train_dataset = AudioDataset(data_path + "train/")
test_dataset = AudioDataset(data_path + "test/")

In [16]:
import torch.nn as nn

class CustomVGGishClassifier(nn.Module):
    def __init__(self, num_classes=3):
        super(CustomVGGishClassifier, self).__init__()
        self.vggish = torch.hub.load('harritaylor/torchvggish', 'vggish', preprocess=False)
        self.fc = nn.Linear(128, num_classes)  # 128 is the output size of VGGish

    def forward(self, x):
        x = self.vggish(x)
        x = self.fc(x)
        return x

In [18]:
import torch
import torch.optim as optim
from torch.utils.data import DataLoader
from sklearn.metrics import accuracy_score, f1_score

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Initialize the custom VGGish model and send it to the device
model = CustomVGGishClassifier(num_classes=3)
model.to(device)

# Initialize the loss function and optimizer
criterion = torch.nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Set up the data loaders
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

# Train the model
num_epochs = 20
best_f1 = 0.0
best_checkpoint = None

for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    train_preds, train_labels = [], []

    for inputs, labels in train_loader:
        inputs, labels = inputs.to(device), labels.to(device)
        
        optimizer.zero_grad()
        
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        train_preds.extend(outputs.argmax(dim=1).cpu().numpy())
        train_labels.extend(labels.cpu().numpy())
    
    # Print the average loss for this epoch
    epoch_loss = running_loss / len(train_loader)
    
    # Calculate train accuracy and F1 score
    train_accuracy = accuracy_score(train_labels, train_preds)
    train_f1 = f1_score(train_labels, train_preds, average='macro')

    # Evaluate on test data
    model.eval()
    test_preds, test_labels = [], []
    with torch.no_grad():
        for inputs, labels in test_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = model(inputs)
            test_preds.extend(outputs.argmax(dim=1).cpu().numpy())
            test_labels.extend(labels.cpu().numpy())

    test_accuracy = accuracy_score(test_labels, test_preds)
    test_f1 = f1_score(test_labels, test_preds, average='macro')

    print(f"Epoch [{epoch + 1}/{num_epochs}], Loss: {epoch_loss:.4f}, Train Acc: {train_accuracy:.4f}, Train F1: {train_f1:.4f}, Test Acc: {test_accuracy:.4f}, Test F1: {test_f1:.4f}")

    # Save the best checkpoint based on the test F1 score
    if test_f1 > best_f1:
        best_f1 = test_f1
        best_checkpoint = {
            'epoch': epoch + 1,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'loss': epoch_loss,
            'train_accuracy': train_accuracy,
            'train_f1': train_f1,
            'test_accuracy': test_accuracy,
            'test_f1': test_f1
        }
        torch.save(best_checkpoint, 'best_checkpoint.pth')

print("Finished Training")


Using cache found in /root/.cache/torch/hub/harritaylor_torchvggish_master


Epoch [1/20], Loss: 75.4350, Train Acc: 0.3750, Train F1: 0.1846, Test Acc: 0.3143, Test F1: 0.1977
Epoch [2/20], Loss: 49.3735, Train Acc: 0.3542, Train F1: 0.2387, Test Acc: 0.2857, Test F1: 0.2527
Epoch [3/20], Loss: 30.3287, Train Acc: 0.3333, Train F1: 0.2489, Test Acc: 0.4000, Test F1: 0.3202
Epoch [4/20], Loss: 25.8293, Train Acc: 0.3333, Train F1: 0.2699, Test Acc: 0.4286, Test F1: 0.3348
Epoch [5/20], Loss: 28.2136, Train Acc: 0.3125, Train F1: 0.2355, Test Acc: 0.5143, Test F1: 0.3928
Epoch [6/20], Loss: 22.6187, Train Acc: 0.5208, Train F1: 0.3728, Test Acc: 0.6000, Test F1: 0.4294
Epoch [7/20], Loss: 19.3827, Train Acc: 0.6458, Train F1: 0.5143, Test Acc: 0.5429, Test F1: 0.4528
Epoch [8/20], Loss: 17.7788, Train Acc: 0.5000, Train F1: 0.4211, Test Acc: 0.6000, Test F1: 0.5206
Epoch [9/20], Loss: 16.2748, Train Acc: 0.5208, Train F1: 0.4030, Test Acc: 0.5429, Test F1: 0.4553
Epoch [10/20], Loss: 13.1785, Train Acc: 0.5625, Train F1: 0.4885, Test Acc: 0.6000, Test F1: 0.4998