In [None]:

import csv
import datetime
from glob import glob
from pathlib import Path


import matplotlib.pyplot as plt
import numpy as np
import os
import shutil
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torchaudio
from IPython.display import Audio
from torch.utils.data import DataLoader, Dataset, ConcatDataset
from torchaudio.transforms import AmplitudeToDB, MelSpectrogram, Resample
from tqdm.notebook import tqdm

# Sanity check: prints True when PyTorch can see a CUDA-capable GPU.
print(torch.cuda.is_available())

In [None]:
# Dataset locations. BASE_DIR is a relative path, so it resolves against the
# kernel's current working directory.
BASE_DIR = Path("../../ml/urbansound8k")
SOURCE_DIR = BASE_DIR  # raw audio is read from fold<N>/ sub-directories under this directory
DEST_DIR = BASE_DIR/"processed"  # saved spectrograms are written here, one sub-directory per fold
INDEX_PATH = BASE_DIR/"UrbanSound8K.csv"  # metadata CSV that is loaded into `index`
# Audio / spectrogram parameters shared by the preprocessing functions below.
RATE = 22050  # target sample rate in Hz; preprocess() resamples to this
N_FFT = 256  # FFT size passed to MelSpectrogram
HOP_LENGTH = N_FFT // 2  # hop between consecutive frames, in samples
# NOTE(review): 100 mel bands on a 256-point FFT is a dense filterbank for so few
# linear frequency bins; some mel filters may come out empty — confirm.
N_MELS = 100
SPEC_TIMESTEPS = 1000  # number of time frames per chunk passed to split_spectrogram()

In [None]:
# files = glob("*.wav", root_dir=SOURCE_DIR)
# filepaths = [SOURCE_DIR/file for file in files]
# Load the metadata index as a list with one dict per CSV row (keys are the
# column headers). newline='' is what the csv module asks for so that quoted
# fields containing line breaks are parsed correctly.
with open(INDEX_PATH, encoding='UTF-8', newline='') as index_file:
    index = list(csv.DictReader(index_file))
print(index[:3])

In [None]:
def preprocess(filepath):
    """
    Load an audio file as a mono waveform sampled at RATE.

    Multi-channel audio is averaged across channels (the channel dimension is
    kept), and audio whose native sample rate differs from RATE is resampled.

    Parameters:
    filepath: Path of the audio file, passed straight to torchaudio.load.

    Returns:
    tuple: (audio, RATE, num_samples, total_duration) where num_samples is the
           length of the last dimension of audio and total_duration is
           num_samples / RATE.
    """
    waveform, native_rate = torchaudio.load(filepath)

    # Downmix only when there is more than one channel.
    has_multiple_channels = waveform.shape[0] > 1
    if has_multiple_channels:
        waveform = waveform.mean(dim=0, keepdim=True)

    if native_rate != RATE:
        waveform = Resample(native_rate, RATE)(waveform)

    sample_count = waveform.shape[-1]
    return waveform, RATE, sample_count, sample_count / RATE

In [None]:
def make_mel_spectrogram(audio):
    """
    Convert a waveform into a dB-scaled mel spectrogram.

    The mel spectrogram is computed with RATE, N_FFT, HOP_LENGTH and N_MELS,
    dimension 0 is squeezed away if it has size 1, and the result is passed
    through AmplitudeToDB with its default settings.

    Parameters:
    audio: Waveform tensor to transform.  # assumes shape (1, num_samples) — TODO confirm

    Returns:
    torch.Tensor: The dB-scaled mel spectrogram.
    """
    to_mel = MelSpectrogram(RATE, N_FFT, hop_length=HOP_LENGTH, n_mels=N_MELS)
    to_db = AmplitudeToDB()
    return to_db(to_mel(audio).squeeze(0))

In [None]:
def frame_timings(spec):
    """
    Compute the time offset of every frame of a spectrogram.

    Parameters:
    spec: Tensor whose last dimension is the frame (time) axis.

    Returns:
    tuple: (num_frames, time_per_frame, time_values) where time_per_frame is
           HOP_LENGTH / RATE and time_values is a NumPy array holding
           frame_index * time_per_frame for each frame.
    """
    seconds_per_frame = HOP_LENGTH / RATE
    frame_count = spec.shape[-1]
    frame_offsets = torch.arange(0, frame_count) * seconds_per_frame
    return frame_count, seconds_per_frame, frame_offsets.numpy()

In [None]:
def preprocess_all_folds():
    """
    Save one unsplit dB mel spectrogram per record of `index`.

    For every record the audio file SOURCE_DIR/fold<fold>/<slice_file_name> is
    preprocessed, converted to a mel spectrogram and written with torch.save to
    DEST_DIR/fold<fold>/<slice_file_name>.spec. Destination directories are
    created as needed.
    """
    for record in tqdm(index, total=len(index)):
        fold_dir = Path(f"fold{record['fold']}")
        file_name = record['slice_file_name']
        source = SOURCE_DIR/fold_dir/file_name
        dest_dir = DEST_DIR/fold_dir
        dest_dir.mkdir(exist_ok=True, parents=True)
        dest_file = dest_dir/f"{file_name}.spec"

        # preprocess() returns four values (audio, rate, num_samples, duration);
        # unpacking only two raised "too many values to unpack".
        audio, _, _, _ = preprocess(source)
        mel_spec_db = make_mel_spectrogram(audio)

        torch.save(mel_spec_db, dest_file)

In [None]:



def plot_saved_spec(path):
    """Load a spectrogram previously written with torch.save and plot it with plot_spec."""
    plot_spec(torch.load(path))

def plot_spec(spec):
    """
    Show a spectrogram as an image with the x axis labelled in seconds.

    Parameters:
    spec (torch.Tensor): 2D spectrogram whose last dimension is the time axis.
    """
    _, _, time_values = frame_timings(spec)
    _, ax = plt.subplots(1, 1, figsize=(4, 4))

    # About five ticks across the time axis. The step is clamped to 1 because a
    # spectrogram with fewer than five frames would otherwise give a step of 0,
    # which neither np.arange nor slicing accepts.
    tick_step = max(1, len(time_values) // 5)
    ax.set_xticks(
        np.arange(0, len(time_values), step=tick_step),
        np.round(time_values[::tick_step], 2),
    )
    ax.set_xlabel('Time (s)')
    ax.set_ylabel('Mel bin')

    ax.imshow(spec.numpy(), origin='lower')
    plt.show()

# Build the example paths from the configured directories instead of a
# hard-coded home directory, so the cell runs wherever BASE_DIR is valid.
# NOTE(review): the plotted spectrogram is slice 203356-3-0-3 while the audio
# player loads slice 203356-3-0-0 — confirm the mismatch is intentional.
plot_saved_spec(DEST_DIR/"fold1"/"203356-3-0-3.wav-0.spec")
Audio(str(SOURCE_DIR/"fold1"/"203356-3-0-0.wav"))

In [None]:

def plot_audio(audio):
    """
    Plot a waveform and its dB mel spectrogram, then display an audio player.

    Parameters:
    audio (torch.Tensor): Waveform sampled at RATE.  # assumes shape (1, num_samples) — TODO confirm
    """
    from IPython.display import display

    mel_spec_db = make_mel_spectrogram(audio)

    _, _, time_values = frame_timings(mel_spec_db)

    # Select the style before the figure exists so it applies to this figure too.
    # plt.style.use changes matplotlib's global settings, so later plots are
    # affected as well (as they were originally).
    plt.style.use('dark_background')
    _, axs = plt.subplots(2, 1, figsize=(8, 4))

    axs[0].set_xlabel('Time')
    axs[0].set_ylabel('Amplitude')
    axs[0].plot(audio.t().numpy())

    # Clamp the tick step to 1: fewer than five frames would otherwise give a
    # step of 0, which neither np.arange nor slicing accepts.
    tick_step = max(1, len(time_values) // 5)
    axs[1].set_xticks(
        np.arange(0, len(time_values), step=tick_step),
        np.round(time_values[::tick_step], 2),
    )
    # origin='lower' matches plot_spec: mel bin 0 is drawn at the bottom.
    axs[1].imshow(mel_spec_db.numpy(), origin='lower')
    plt.show()

    # A bare Audio(...) expression inside a function is discarded; it has to be
    # displayed explicitly for the player to appear.
    display(Audio(audio, rate=RATE))

In [None]:
def split_spectrogram(spec: torch.Tensor, chunk_size: int, pad_value: float = 0.0) -> torch.Tensor:
    """
    Splits a spectrogram tensor into equal-sized chunks along the time axis.

    This function divides a 2D spectrogram tensor into smaller chunks of a specified size. If the spectrogram
    cannot be evenly divided, the remaining part is padded at the end with pad_value to form a complete chunk.
    The output is a 3D tensor where the first dimension corresponds to the chunk index. The result never shares
    memory with the input.

    Parameters:
    spec (torch.Tensor): A 2D tensor representing the spectrogram with shape (frequency_bins, time_steps).
    chunk_size (int): The desired number of time steps in each chunk. Must be positive.
    pad_value (float): Value used to fill the last chunk. Defaults to 0.0 (the original behaviour); for a
                       dB-scaled spectrogram a low value such as the spectrogram minimum may be preferable,
                       because 0 dB is not silence.

    Returns:
    torch.Tensor: A 3D tensor with shape (num_chunks, frequency_bins, chunk_size), where num_chunks is
                  ceil(time_steps / chunk_size). A spectrogram with zero time steps yields zero chunks.

    Raises:
    ValueError: If spec is not 2D or chunk_size is not positive.
    """
    if spec.dim() != 2:
        raise ValueError(f"spec must be 2D (frequency_bins, time_steps), got shape {tuple(spec.shape)}")
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")

    num_bins, num_steps = spec.shape
    if num_steps == 0:
        # unfold() rejects a window larger than the dimension, so handle the empty case explicitly.
        return spec.new_empty((0, num_bins, chunk_size))

    remainder = num_steps % chunk_size
    if remainder != 0:
        # Pad the tail so that the time axis becomes a multiple of chunk_size.
        padding = torch.full(
            (num_bins, chunk_size - remainder),
            pad_value,
            dtype=spec.dtype,
            device=spec.device,
        )
        padded = torch.cat([spec, padding], dim=1)
    else:
        # No padding needed; clone so the returned chunks can never alias the caller's tensor.
        padded = spec.clone()

    # unfold gives (frequency_bins, num_chunks, chunk_size); move the chunk index to the front.
    chunks = padded.unfold(dimension=1, size=chunk_size, step=chunk_size).transpose(0, 1)
    return chunks.contiguous()

# Example clip, located through SOURCE_DIR rather than a hard-coded home
# directory so the cell runs wherever BASE_DIR is valid.
audio, sr, _, _ = preprocess(SOURCE_DIR/"fold1"/"203356-3-0-3.wav")
mel_spec_db = make_mel_spectrogram(audio)
print(mel_spec_db.shape)
split_spec = split_spectrogram(mel_spec_db, SPEC_TIMESTEPS)
print(split_spec.shape)

# minispec = torch.tensor([[5,6,7,8,9],[5,6,7,8,9],[5,6,7,8,9],[5,6,7,8,9],[5,6,7,8,9]])
# print(f"{minispec=}")
# print(f"{minispec.shape=}")
# split = split_spectrogram(minispec, 2)
# print(f"{split=}")
# print(f"{split.shape=}")


In [None]:
# Show the full spectrogram first, then each fixed-length chunk in order.
plot_spec(mel_spec_db)
for chunk in split_spec:
    plot_spec(chunk)

In [None]:
def save_split_specs():
    """
    Rebuild DEST_DIR with fixed-length spectrogram chunks for every record of `index`.

    Any existing DEST_DIR is deleted first. Each audio file
    SOURCE_DIR/fold<fold>/<slice_file_name> is preprocessed, converted to a dB
    mel spectrogram and split into chunks of SPEC_TIMESTEPS frames; chunk i is
    written with torch.save to DEST_DIR/fold<fold>/<slice_file_name>-<i>.spec.
    The number of saved chunks is printed at the end.
    """
    # rmtree raises if the directory is missing, e.g. on the very first run.
    if DEST_DIR.exists():
        shutil.rmtree(DEST_DIR)

    count = 0
    for record in tqdm(index, total=len(index)):
        fold_dir_name = f"fold{record['fold']}"
        file_name = record['slice_file_name']
        source = SOURCE_DIR/fold_dir_name/file_name
        fold_dir = DEST_DIR/fold_dir_name
        fold_dir.mkdir(exist_ok=True, parents=True)

        audio, _, _, _ = preprocess(source)
        mel_spec_db = make_mel_spectrogram(audio)
        chunks = split_spectrogram(mel_spec_db, SPEC_TIMESTEPS)

        for i, chunk in enumerate(chunks):
            dest_file = fold_dir/f"{file_name}-{i}.spec"
            # Each chunk is a view into the storage shared by all chunks, and
            # torch.save serializes the whole underlying storage of a view.
            # Cloning keeps every file down to the size of a single chunk.
            torch.save(chunk.clone(), dest_file)
            count += 1
    print(f"{count} chunk specs saved")


In [None]:
# save_split_specs()

In [None]:
class SpectrogramDataset(Dataset):
    def __init__(self, spec_dir, transform=None, target_transform=None):
        self.spec_dir = spec_dir
        self.transform = transform
        self.target_transform = target_transform
        self.spec_paths = [os.path.join(self.spec_dir, file) for file in glob("*.spec", root_dir=self.spec_dir)]

    def __len__(self):
        return len(self.spec_paths)

    def __getitem__(self, idx):
        file_path = self.spec_paths[idx]
        file_name = os.path.basename(file_path)
        spec = torch.load(self.spec_paths[idx])
        parts = file_name.split('-')
        label = int(parts[1])
        return spec, label

In [None]:
class BasicCNN(nn.Module):
    """
    One convolution + max-pool stage followed by a linear classifier.

    Parameters:
    num_classes (int): Number of output logits.
    input_shape (tuple): (height, width) of the single-channel input the classifier
                         layer is sized for. Defaults to (100, 1000), the size that
                         was previously hard-coded.
    """
    def __init__(self, num_classes=10, input_shape=(100, 1000)):
        super(BasicCNN, self).__init__()
        self.conv1 = nn.Conv2d(1, 64, kernel_size=3, stride=1, padding=1)
        self.pool1 = nn.MaxPool2d(kernel_size=2, stride=2)
        self.relu = nn.ReLU()

        # Push a dummy batch of one single-channel input through the feature
        # layers to find out how many features reach the linear layer.
        with torch.no_grad():
            dummy_input = torch.zeros(1, 1, *input_shape)
            dummy_output = self.pool1(self.conv1(dummy_input))
            self.flat_features = dummy_output[0].numel()

        self.fc1 = nn.Linear(self.flat_features, num_classes)

    def forward(self, x):
        """Map a batch of shape (batch, 1, height, width) to raw logits of shape (batch, num_classes)."""
        x = self.relu(self.conv1(x))
        x = self.pool1(x)
        # Flatten per sample. Unlike view(-1, flat_features), a wrongly sized
        # input now fails in fc1 instead of silently changing the batch size.
        x = torch.flatten(x, 1)
        # Return raw logits: nn.CrossEntropyLoss applies log-softmax itself, and a
        # ReLU here would clamp every negative logit to zero.
        return self.fc1(x)

In [None]:
class BasicCNN_2(nn.Module):
    """
    Two stacked 3x3 convolutions + max-pool followed by a linear classifier.

    Parameters:
    num_classes (int): Number of output logits.
    input_shape (tuple): (height, width) of the single-channel input the classifier
                         layer is sized for. Defaults to (100, 1000) to match
                         (N_MELS, SPEC_TIMESTEPS) chunks; the previously hard-coded
                         width of 500 did not match them. Pass (100, 500) to get
                         the old sizing.
    """
    def __init__(self, num_classes=10, input_shape=(100, 1000)):
        super(BasicCNN_2, self).__init__()
        self.conv1 = nn.Conv2d(1, 64, kernel_size=3, stride=1, padding=1)
        self.conv1a = nn.Conv2d(64, 64, kernel_size=3, stride=1, padding=1)
        self.pool1 = nn.MaxPool2d(kernel_size=2, stride=2)
        self.relu = nn.ReLU()

        # Push a dummy batch of one single-channel input through the feature
        # layers to find out how many features reach the linear layer.
        with torch.no_grad():
            dummy_input = torch.zeros(1, 1, *input_shape)
            dummy_output = self._features(dummy_input)
            self.flat_features = dummy_output[0].numel()

        self.fc1 = nn.Linear(self.flat_features, num_classes)

    def _features(self, x):
        """Convolution/pooling stack applied before flattening."""
        # ReLU between the convolutions: two convolutions with nothing in
        # between are still a single linear operation.
        x = self.relu(self.conv1(x))
        x = self.relu(self.conv1a(x))
        return self.pool1(x)

    def forward(self, x):
        """Map a batch of shape (batch, 1, height, width) to raw logits of shape (batch, num_classes)."""
        x = self._features(x)
        # Flatten per sample. Unlike view(-1, flat_features), a wrongly sized
        # input now fails in fc1 instead of silently changing the batch size.
        x = torch.flatten(x, 1)
        # Return raw logits: nn.CrossEntropyLoss applies log-softmax itself, and a
        # ReLU here would clamp every negative logit to zero.
        return self.fc1(x)

In [None]:
class BasicCNN_3(nn.Module):
    """
    Three stages of (two 5x5 convolutions + max-pool) followed by a three-layer classifier.

    The convolutions use kernel_size=5 with padding=1, so each one shrinks the
    height and width by 2; every pooling stage halves them.

    Parameters:
    num_classes (int): Number of output logits.
    input_shape (tuple): (height, width) of the single-channel input the classifier
                         is sized for. Defaults to (100, 1000), the size that was
                         previously hard-coded.
    """
    def __init__(self, num_classes=10, input_shape=(100, 1000)):
        super(BasicCNN_3, self).__init__()
        self.conv1 = nn.Conv2d(1, 64, kernel_size=5, stride=1, padding=1)
        self.conv1a = nn.Conv2d(64, 64, kernel_size=5, stride=1, padding=1)
        self.pool1 = nn.MaxPool2d(kernel_size=2, stride=2)
        self.conv2 = nn.Conv2d(64, 128, kernel_size=5, stride=1, padding=1)
        self.conv2a = nn.Conv2d(128, 128, kernel_size=5, stride=1, padding=1)
        self.pool2 = nn.MaxPool2d(kernel_size=2, stride=2)
        self.conv3 = nn.Conv2d(128, 256, kernel_size=5, stride=1, padding=1)
        self.conv3a = nn.Conv2d(256, 256, kernel_size=5, stride=1, padding=1)
        self.pool3 = nn.MaxPool2d(kernel_size=2, stride=2)
        self.relu = nn.ReLU()

        # Push a dummy batch of one single-channel input through the feature
        # layers to find out how many features reach the first linear layer.
        with torch.no_grad():
            dummy_input = torch.zeros(1, 1, *input_shape)
            dummy_output = self.pre_flatten(dummy_input)
            self.flat_features = dummy_output[0].numel()

        self.fc1 = nn.Linear(self.flat_features, 512)
        self.fc2 = nn.Linear(512, 256)
        self.fc3 = nn.Linear(256, num_classes)

    def pre_flatten(self, x):
        """Apply the convolution/pooling stack and return the 4D feature map."""
        # ReLU after every convolution: two convolutions with nothing in
        # between are still a single linear operation.
        x = self.relu(self.conv1(x))
        x = self.relu(self.conv1a(x))
        x = self.pool1(x)
        x = self.relu(self.conv2(x))
        x = self.relu(self.conv2a(x))
        x = self.pool2(x)
        x = self.relu(self.conv3(x))
        x = self.relu(self.conv3a(x))
        x = self.pool3(x)
        return x

    def forward(self, x):
        """Map a batch of shape (batch, 1, height, width) to raw logits of shape (batch, num_classes)."""
        x = self.pre_flatten(x)
        # Flatten per sample. Unlike view(-1, flat_features), a wrongly sized
        # input now fails in fc1 instead of silently changing the batch size.
        x = torch.flatten(x, 1)
        x = self.relu(self.fc1(x))
        x = self.relu(self.fc2(x))
        # Return raw logits: nn.CrossEntropyLoss applies log-softmax itself, and a
        # ReLU here would clamp every negative logit to zero.
        return self.fc3(x)

In [None]:
def train(model, dataloader, optimizer, loss_function):
    """
    Train the model for one pass over dataloader.

    Each batch is moved to the device the model's parameters live on, given a
    channel dimension, L2-normalised along dimension 2 and used for one
    optimizer step.

    Parameters:
    model (nn.Module): Model to train; it is put into training mode.
    dataloader: Iterable of (data, target) batches.  # assumes data is (batch, n_mels, time) — TODO confirm
    optimizer: Optimizer that updates the model's parameters.
    loss_function: Callable taking (output, target) and returning a scalar loss tensor.

    Returns:
    tuple: (avg_loss, avg_acc) where avg_loss is the mean of the per-batch losses
           and avg_acc is the percentage of correctly predicted samples.

    Raises:
    ValueError: If dataloader yields no samples.
    """
    model.train()

    # Follow the model's device instead of assuming "cuda", so the same code
    # also runs on a CPU-only machine.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cpu")

    epoch_loss = 0.0
    epoch_correct = 0
    epoch_total = 0
    num_batches = 0
    for data, target in dataloader:
        # non_blocking only has an effect for pinned host memory; harmless otherwise.
        data = data.to(device, non_blocking=True)
        data = data.unsqueeze(1)
        data = F.normalize(data, dim=2)
        target = target.to(device, non_blocking=True)

        optimizer.zero_grad()
        output = model(data)
        loss = loss_function(output, target)
        loss.backward()
        optimizer.step()

        # Bookkeeping uses detached values so no autograd history is kept.
        epoch_loss += loss.item()
        predicted = output.detach().argmax(dim=1)
        epoch_total += target.size(0)
        epoch_correct += (predicted == target).sum().item()
        num_batches += 1

    if num_batches == 0 or epoch_total == 0:
        raise ValueError("dataloader yielded no samples; nothing to train on")

    avg_loss = epoch_loss / num_batches
    avg_acc = 100.0 * epoch_correct / epoch_total
    return avg_loss, avg_acc


In [None]:
# Dataset over the *.spec files saved for fold 1 only.
spectrogram_dataset = SpectrogramDataset(spec_dir=DEST_DIR/"fold1")
# One dataset per fold (fold1 .. fold10), concatenated into a single dataset.
all_folds = [SpectrogramDataset(spec_dir=DEST_DIR/f"fold{i}") for i in range(1,11)]
spectrogram_datasets = ConcatDataset(all_folds)
# NOTE(review): the loader wraps `spectrogram_dataset` (fold 1 only); the concatenated
# `spectrogram_datasets` built above is not used here — confirm this is intentional.
spectrogram_dataloader = DataLoader(spectrogram_dataset, batch_size=32, num_workers=8, shuffle=True, pin_memory=True)

In [None]:
# Use the GPU when one is available and fall back to the CPU otherwise, so
# this cell does not fail on a machine without CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = BasicCNN_3()
model = model.to(device)
optimizer = optim.Adam(model.parameters(), lr=0.0001)
loss_function = nn.CrossEntropyLoss()
loss_function = loss_function.to(device)

In [None]:
# for param_group in optimizer.param_groups:
    # param_group['lr'] = 5e-6

In [None]:
# Train for a fixed number of epochs, printing one timestamped summary line per epoch.
num_epochs = 100
for epoch in range(num_epochs):
    train_loss, train_acc = train(model, spectrogram_dataloader, optimizer, loss_function)
    timestamp = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    print(
        f"{timestamp}: Epoch {epoch+1}/{num_epochs}, "
        f"Train Loss: {train_loss:.5f}, Train Accuracy: {train_acc:.2f}%"
    )
