<a href="https://colab.research.google.com/github/eikegermann/audio_classifier_test/blob/main/audio_classifier_cnn_2_classes.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import os
import math
import random
import librosa
import librosa.display
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from IPython.display import Audio, display


In [2]:
class AudioDataset(Dataset):
    """Dataset that serves fixed-length, standardised log-mel spectrograms.

    Items are produced on demand: the audio file is loaded (resampled to
    ``SAMPLE_RATE``), zero-padded or truncated to ``max_length_s`` seconds,
    converted to a dB-scaled mel spectrogram and standardised.

    Args:
        file_paths: Sequence of audio file paths readable by ``librosa.load``.
        labels: Sequence of labels, index-aligned with ``file_paths``.
        n_mels: Number of mel bands in each spectrogram.
        max_length_s: Duration in seconds every clip is padded or cut to.
    """

    SAMPLE_RATE = 44100  # Hz; every file is resampled to this rate on load
    N_FFT = 2048         # STFT window size in samples
    HOP_LENGTH = 512     # STFT hop size in samples
    EPS = 1e-8           # standard deviations below this count as "constant input"

    def __init__(self, file_paths, labels, n_mels, max_length_s=3):
        self.file_paths = file_paths
        self.labels = labels
        self.n_mels = n_mels
        self.max_length_s = max_length_s

    def __len__(self):
        """Return the number of audio files in the dataset."""
        return len(self.file_paths)

    def __getitem__(self, idx):
        """Load sample ``idx`` and return ``(mel_spectrogram, label)``.

        The spectrogram is a NumPy array with a leading channel axis,
        i.e. shaped ``(1, n_mels, frames)``.
        """
        audio, sample_rate = librosa.load(self.file_paths[idx], sr=self.SAMPLE_RATE)
        audio = self.adjust_audio_length(audio, sample_rate)
        mel_spectrogram = self.preprocess_data(audio, sample_rate, self.n_mels)
        label = self.labels[idx]
        return mel_spectrogram, label

    def adjust_audio_length(self, audio, sample_rate):
        """Zero-pad (at the end) or truncate ``audio`` to ``max_length_s`` seconds."""
        target_length = math.ceil(self.max_length_s * sample_rate)
        if len(audio) < target_length:
            padding = target_length - len(audio)
            audio = np.pad(audio, (0, padding), mode='constant')
        elif len(audio) > target_length:
            audio = audio[:target_length]
        return audio

    def preprocess_data(self, audio, sample_rate, n_mels):
        """Convert a waveform into a standardised dB-scaled mel spectrogram.

        Args:
            audio: 1-D waveform.
            sample_rate: Sampling rate of ``audio`` in Hz.
            n_mels: Number of mel bands.

        Returns:
            Array of shape ``(1, n_mels, frames)`` with zero mean and, unless
            the input is constant, unit standard deviation.
        """
        # The signal must be passed by keyword: recent librosa releases
        # reject a positional ``y``.
        mel_spec = librosa.feature.melspectrogram(y=audio,
                                                  sr=sample_rate,
                                                  n_fft=self.N_FFT,
                                                  hop_length=self.HOP_LENGTH,
                                                  n_mels=n_mels)
        mel_spec = librosa.power_to_db(mel_spec, ref=np.max)
        mel_spec = np.expand_dims(mel_spec, axis=0)  # Add channel dimension (C, H, W)

        # Normalize the spectrogram
        mean = np.mean(mel_spec)
        std = np.std(mel_spec)
        if std < self.EPS:
            # A constant spectrogram (e.g. a silent clip) has zero spread;
            # dividing by it would fill the sample with NaN.
            std = 1.0
        normalized_spec = (mel_spec - mean) / std

        return normalized_spec


In [3]:
class AudioClassifier(nn.Module):
    """Three-block CNN followed by a four-layer MLP head with two output logits.

    Args:
        n_mels: Height of the input spectrograms (number of mel bands).
        max_length_s: Clip duration in seconds; together with ``SAMPLE_RATE``
            and ``HOP_LENGTH`` it fixes the expected spectrogram width and
            therefore the input size of the first fully connected layer.
    """

    SAMPLE_RATE = 44100  # Hz; sample rate the spectrograms are computed at
    HOP_LENGTH = 512     # STFT hop size in samples

    def __init__(self, n_mels, max_length_s):
        super().__init__()

        self.conv1 = nn.Conv2d(in_channels=1, out_channels=16, kernel_size=3, stride=1, padding=1)
        self.bn1 = nn.BatchNorm2d(16)
        self.conv2 = nn.Conv2d(in_channels=16, out_channels=32, kernel_size=3, stride=1, padding=1)
        self.bn2 = nn.BatchNorm2d(32)
        self.conv3 = nn.Conv2d(in_channels=32, out_channels=64, kernel_size=3, stride=1, padding=1)
        self.bn3 = nn.BatchNorm2d(64)

        self.pool = nn.MaxPool2d(2, 2)
        self.dropout = nn.Dropout(p=0.4)

        # fc1 depends on the flattened feature-map size, which is only known
        # once the convolutional stack exists; it is created below.
        self.fc1 = None
        self.fc2 = nn.Linear(256, 128)
        self.fc3 = nn.Linear(128, 64)
        self.fc4 = nn.Linear(64, 2)

        self._initialize_fc1(n_mels, max_length_s)

    def forward(self, x):
        """Map a batch shaped ``(batch, 1, n_mels, frames)`` to ``(batch, 2)`` logits."""
        x = self.pool(F.relu(self.bn1(self.conv1(x))))
        x = self.pool(F.relu(self.bn2(self.conv2(x))))
        x = self.pool(F.relu(self.bn3(self.conv3(x))))

        x = self.dropout(x)
        x = x.view(x.size(0), -1)

        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = F.relu(self.fc3(x))
        x = self.fc4(x)

        return x

    def _initialize_fc1(self, n_mels, max_length_s):
        """Create ``fc1`` sized to the flattened output of the conv stack.

        The size is found by sending a dummy input through the convolution
        and pooling layers only. Batch norm and ReLU never change a tensor's
        shape, and skipping batch norm keeps its running statistics untouched
        by the dummy data.
        """
        # Same rounding as the clip-length adjustment (ceil of seconds * rate),
        # then the frame count of a centred STFT: 1 + samples // hop.
        n_samples = math.ceil(max_length_s * self.SAMPLE_RATE)
        n_frames = n_samples // self.HOP_LENGTH + 1

        with torch.no_grad():
            probe = torch.zeros(1, 1, n_mels, n_frames)
            for conv in (self.conv1, self.conv2, self.conv3):
                probe = self.pool(conv(probe))

        self.fc1 = nn.Linear(probe[0].numel(), 256)




In [4]:
!pwd

/content


In [5]:
# Dataset and preprocessing configuration used to build the training and test sets.
# Relative path: it is resolved against the current working directory and is
# expected to contain one sub-folder per class.
data_path = "drive/MyDrive/audio_ml_data/samples/"
# Number of mel bands per spectrogram (input height of the network).
n_mels = 156
# Clip duration in seconds; every sample is zero-padded or truncated to this length.
max_length_s = 1
# Fraction of all samples held out for the test split.
test_size = 0.2


def load_file_paths_and_labels(data_path):
    """Collect every sample file under ``data_path`` with an integer class label.

    ``data_path`` must contain one sub-folder per class. Class folders are
    visited in alphabetical order, so label ``i`` always denotes the i-th
    folder name in sorted order regardless of the order in which the
    filesystem lists entries. Hidden entries (names starting with ``.``),
    stray files next to the class folders and nested directories inside a
    class folder are skipped.

    Args:
        data_path: Root directory holding the class sub-folders.

    Returns:
        ``(file_paths, labels)``: two equally long lists, where ``labels[i]``
        is the class index of ``file_paths[i]``.
    """
    class_folders = sorted(
        entry
        for entry in os.listdir(data_path)
        if not entry.startswith(".") and os.path.isdir(os.path.join(data_path, entry))
    )

    file_paths = []
    labels = []
    for label, class_folder in enumerate(class_folders):
        class_path = os.path.join(data_path, class_folder)
        # Sorting the files keeps the seeded train/test split reproducible.
        for audio_file in sorted(os.listdir(class_path)):
            file_path = os.path.join(class_path, audio_file)
            if audio_file.startswith(".") or not os.path.isfile(file_path):
                continue
            file_paths.append(file_path)
            labels.append(label)
    return file_paths, labels


# Discover every sample on disk together with its integer class label.
file_paths, labels = load_file_paths_and_labels(data_path)

# Stratified hold-out split with a fixed seed, so both splits keep the class
# proportions and the partition is repeatable.
(train_file_paths,
 test_file_paths,
 train_labels,
 test_labels) = train_test_split(
    file_paths,
    labels,
    test_size=test_size,
    stratify=labels,
    random_state=42,
)

# Datasets compute spectrograms lazily, one file per item.
train_dataset = AudioDataset(train_file_paths, train_labels, n_mels, max_length_s)
test_dataset = AudioDataset(test_file_paths, test_labels, n_mels, max_length_s)

# Only the training batches are shuffled; evaluation order stays fixed.
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=16)
test_loader = DataLoader(test_dataset, shuffle=False, batch_size=16)

# Build the network and place it on the GPU when one is available.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = AudioClassifier(n_mels, max_length_s).to(device)


In [None]:
# Optimisation hyper-parameters
epochs = 20
learning_rate = 3e-4
weight_decay = 0.0025

# Highest test accuracy (in percent) seen so far; decides when a checkpoint is saved.
best_test_accuracy = 0.0

# Cross-entropy on the raw logits, optimised with Adam plus L2 weight decay.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), weight_decay=weight_decay, lr=learning_rate)

def count_samples_and_calculate_splits(data_path, test_size):
    """Count the samples under ``data_path`` and derive the train/test sizes.

    The test share is rounded up and the training share is the remainder.
    This is the rounding scikit-learn's ``train_test_split`` applies to a
    fractional ``test_size``, so the reported numbers match the real split.
    Hidden entries and anything that is not a class directory (or not a
    regular file inside one) are ignored.

    Args:
        data_path: Root directory holding one sub-folder per class.
        test_size: Fraction of samples assigned to the test split.

    Returns:
        ``(total_samples, train_samples, test_samples)``.
    """
    total_samples = 0

    for class_folder in os.listdir(data_path):
        class_path = os.path.join(data_path, class_folder)
        if class_folder.startswith(".") or not os.path.isdir(class_path):
            continue
        total_samples += sum(
            1
            for name in os.listdir(class_path)
            if not name.startswith(".") and os.path.isfile(os.path.join(class_path, name))
        )

    # int() would truncate (7 * 0.2 -> 1), whereas the actual split holds out
    # ceil(7 * 0.2) = 2 samples.
    test_samples = math.ceil(total_samples * test_size)
    train_samples = total_samples - test_samples

    return total_samples, train_samples, test_samples

# Report the dataset size and how it divides into training and test samples.
total_samples, train_samples, test_samples = count_samples_and_calculate_splits(
    data_path, test_size
)
print(
    f"Total samples: {total_samples}",
    f"Training samples: {train_samples}",
    f"Test samples: {test_samples}",
    sep="\n",
)

def calculate_max_accuracy_change(train_samples, test_samples):
    """Return the accuracy step, in percent, caused by a single sample.

    With ``n`` samples, one prediction flipping between right and wrong moves
    the accuracy by ``1 / n * 100`` percentage points.

    Args:
        train_samples: Number of training samples.
        test_samples: Number of test samples.

    Returns:
        ``(max_train_accuracy_change, max_test_accuracy_change)``.
    """
    return tuple(1 / count * 100 for count in (train_samples, test_samples))

# Show how coarse the accuracy figures are: the step one sample makes in each split.
max_train_accuracy_change, max_test_accuracy_change = calculate_max_accuracy_change(
    train_samples, test_samples
)
print(
    f"Maximum change in training accuracy: {max_train_accuracy_change:.4f}%",
    f"Maximum change in test accuracy: {max_test_accuracy_change:.4f}%",
    sep="\n",
)

# Training loop: one optimisation pass over the training set per epoch, then an
# evaluation pass over the test set; the best-scoring weights are checkpointed.
for epoch in range(epochs):
    model.train()
    train_loss = 0.0
    correct = 0
    total = 0

    for mel_spec, label in train_loader:
        mel_spec, label = mel_spec.to(device), label.to(device)
        optimizer.zero_grad()

        output = model(mel_spec)
        loss = criterion(output, label)
        loss.backward()
        optimizer.step()

        # The criterion returns the mean over the batch. Weighting it by the
        # batch size turns the running sum into a per-sample total, so the
        # division by `total` below yields the true average loss (also when
        # the last batch is smaller than the others).
        train_loss += loss.item() * label.size(0)
        _, predicted = output.max(1)
        total += label.size(0)
        correct += predicted.eq(label).sum().item()

    train_accuracy = 100.0 * correct / total
    print(f"Epoch {epoch+1}/{epochs}, Loss: {train_loss / total:.6f}, Training accuracy: {train_accuracy:.2f}%")

    # Evaluation on the test set
    model.eval()
    test_loss = 0.0
    correct = 0
    total = 0

    with torch.no_grad():
        for mel_spec, label in test_loader:
            mel_spec, label = mel_spec.to(device), label.to(device)
            output = model(mel_spec)
            loss = criterion(output, label)

            # Same per-sample weighting as in the training pass.
            test_loss += loss.item() * label.size(0)
            _, predicted = output.max(1)
            total += label.size(0)
            correct += predicted.eq(label).sum().item()

        test_accuracy = 100.0 * correct / total
        print(f"Test Loss: {test_loss / total:.6f}, Test accuracy: {test_accuracy:.2f}%")

    # Save the best checkpoint; the strict comparison keeps the earliest
    # epoch when several epochs tie.
    if test_accuracy > best_test_accuracy:
        best_test_accuracy = test_accuracy
        torch.save(model.state_dict(), "best_checkpoint.pth")
        print(f"New best checkpoint saved with accuracy: {best_test_accuracy:.2f}%")


Total samples: 300
Training samples: 240
Test samples: 60
Maximum change in training accuracy: 0.4167%
Maximum change in test accuracy: 1.6667%


In [None]:
def predict_sample(model_path, eval_data_path, n_mels=156, max_length_s=3, class_names=None):
    """Classify one randomly chosen evaluation clip and visualise it.

    A class folder and a file inside it are picked at random from
    ``eval_data_path``. The clip is preprocessed exactly like the training
    data, classified, played back, and shown as waveform and mel spectrogram.

    Args:
        model_path: Path to a ``state_dict`` checkpoint of ``AudioClassifier``.
        eval_data_path: Directory holding one sub-folder per class.
        n_mels: Number of mel bands; must equal the value the model was
            trained with.
        max_length_s: Clip length in seconds; must equal the value the model
            was trained with.
        class_names: Optional sequence mapping the predicted index to a class
            name. When omitted, the alphabetically sorted sub-folders of
            ``eval_data_path`` are used; this is only correct if the training
            labels were assigned in that same order.
    """
    # Load the model
    model = AudioClassifier(n_mels, max_length_s)
    # map_location lets a checkpoint saved from a GPU model load on a CPU-only machine.
    model.load_state_dict(torch.load(model_path, map_location="cpu"))
    model.eval()

    # Select a random class and sample. Entries are sorted so the index/name
    # mapping does not depend on filesystem listing order; hidden entries and
    # stray files are ignored.
    available_classes = sorted(
        entry
        for entry in os.listdir(eval_data_path)
        if not entry.startswith(".") and os.path.isdir(os.path.join(eval_data_path, entry))
    )
    if class_names is None:
        class_names = available_classes
    class_folder = random.choice(available_classes)
    class_path = os.path.join(eval_data_path, class_folder)
    sample_files = sorted(
        name for name in os.listdir(class_path) if not name.startswith(".")
    )
    sample_path = os.path.join(class_path, random.choice(sample_files))

    # Load the sample and create mel-spectrogram
    audio, sample_rate = librosa.load(sample_path, sr=44100)
    audio = adjust_audio_length(audio, sample_rate, max_length_s)
    mel_spectrogram = preprocess_data(audio, sample_rate, n_mels)

    # Add the batch dimension; float32 matches the dtype of the model weights.
    input_tensor = torch.as_tensor(mel_spectrogram, dtype=torch.float32).unsqueeze(0)

    # Make the prediction
    with torch.no_grad():
        output = model(input_tensor)
        prediction = torch.argmax(output, dim=1).item()

    # Play the audio sample
    display(Audio(audio, rate=sample_rate))

    if prediction < len(class_names):
        predicted_name = class_names[prediction]
    else:
        # Fewer known class names than model outputs: report the raw index.
        predicted_name = f"unknown (index {prediction})"
    print(f"Predicted class: {predicted_name}")

    # Display the sample waveform
    plt.figure(figsize=(10, 4))
    librosa.display.waveshow(audio, sr=sample_rate)
    plt.title(f"Waveform of the audio sample (Class: {class_folder})")
    plt.xlabel("Time (s)")
    plt.ylabel("Amplitude")
    plt.show()

    # Display the mel-spectrogram. The network input is already dB-scaled and
    # standardised, so a second power_to_db on it would be meaningless; the
    # dB spectrogram is recomputed from the waveform with the same STFT settings.
    plt.figure(figsize=(10, 4))
    mel_power = librosa.feature.melspectrogram(y=audio,
                                               sr=sample_rate,
                                               n_fft=2048,
                                               hop_length=512,
                                               n_mels=n_mels)
    mel_spectrogram_db = librosa.power_to_db(mel_power, ref=np.max)
    librosa.display.specshow(mel_spectrogram_db, sr=sample_rate, hop_length=512, x_axis='time', y_axis='mel')
    plt.title(f"Mel-spectrogram of the audio sample (Class: {class_folder})")
    plt.colorbar(format='%+2.0f dB')
    plt.show()


def adjust_audio_length(audio, sample_rate, max_length_s):
    """Force ``audio`` to last exactly ``max_length_s`` seconds.

    Shorter clips get trailing zeros, longer clips are cut at the end, and a
    clip that already has the target length is returned unchanged.

    Args:
        audio: 1-D waveform.
        sample_rate: Sampling rate of ``audio`` in Hz.
        max_length_s: Target duration in seconds.

    Returns:
        Waveform with ``ceil(max_length_s * sample_rate)`` samples.
    """
    wanted = math.ceil(max_length_s * sample_rate)
    missing = wanted - len(audio)

    if missing > 0:
        return np.pad(audio, (0, missing), mode='constant')
    if missing < 0:
        return audio[:wanted]
    return audio

def preprocess_data(audio, sample_rate, n_mels):
    """Convert a waveform into a standardised dB-scaled mel spectrogram.

    Args:
        audio: 1-D waveform.
        sample_rate: Sampling rate of ``audio`` in Hz.
        n_mels: Number of mel bands.

    Returns:
        Array of shape ``(1, n_mels, frames)`` with zero mean and, unless the
        input is constant, unit standard deviation.
    """
    hop_length = 512
    # The signal must be passed by keyword: recent librosa releases reject a
    # positional ``y``.
    mel_spec = librosa.feature.melspectrogram(y=audio,
                                              sr=sample_rate,
                                              n_fft=2048,
                                              hop_length=hop_length,
                                              n_mels=n_mels)
    mel_spec = librosa.power_to_db(mel_spec, ref=np.max)
    mel_spec = np.expand_dims(mel_spec, axis=0)  # Add channel dimension (C, H, W)

    # Normalize the spectrogram
    mean = np.mean(mel_spec)
    std = np.std(mel_spec)
    if std < 1e-8:
        # A constant spectrogram (e.g. a silent clip) has zero spread;
        # dividing by it would fill the result with NaN.
        std = 1.0
    normalized_spec = (mel_spec - mean) / std

    return normalized_spec

# Checkpoint file name; the training loop saves its best state_dict under this same name.
best_checkpoint = "best_checkpoint.pth"
# Root folder of the evaluation clips, resolved against the current working
# directory; expected to hold one sub-folder per class.
eval_samples_path = "drive/MyDrive/audio_ml_data/eval_samples"



In [None]:
# Classify one random evaluation clip with the best checkpoint, rebuilding the
# network with the same spectrogram settings that were used for training.
predict_sample(
    best_checkpoint,
    eval_samples_path,
    max_length_s=max_length_s,
    n_mels=n_mels,
)

In [None]:
# n_mels is passed explicitly so the rebuilt network always matches the
# configured value instead of relying on the function's default.
predict_sample(best_checkpoint, eval_samples_path, n_mels=n_mels, max_length_s=max_length_s)

In [None]:
# n_mels is passed explicitly so the rebuilt network always matches the
# configured value instead of relying on the function's default.
predict_sample(best_checkpoint, eval_samples_path, n_mels=n_mels, max_length_s=max_length_s)