# Urbansed

In [None]:
# Enable IPython's autoreload extension in mode 2, so that imported modules
# are reloaded automatically before each cell typed at the prompt is executed
# (edits to imported source files take effect without restarting the kernel).
%load_ext autoreload
%autoreload 2

In [None]:
# Import used libraries
import pandas as pd
import matplotlib
from matplotlib import pyplot as plt
import librosa
import torch

# Seed PyTorch's global random number generator.
torch.manual_seed(0)

# Report the version of each library imported above, one line per library.
for library in (pd, matplotlib, librosa, torch):
    print(f"{library.__name__} version: {library.__version__}")

In [33]:
from yoho.utils import AudioFile

def plot_melspectrogram(
    audio: AudioFile,
    n_mels: int = 40,
    win_len: float = 0.04,
    hop_len: float = 0.01,
):
    """
    Plot the Mel spectrogram of an audio clip.

    Args:
        audio: Clip to display. It must expose a sampling rate ``sr`` and a
            ``mel_spectrogram(n_mels, win_len, hop_len)`` method; the value
            returned by that method is handed to ``librosa.display.specshow``
            unchanged.
        n_mels: Number of Mel bands requested from ``audio.mel_spectrogram``.
        win_len: Window length forwarded to ``audio.mel_spectrogram``
            (presumably in seconds, like ``hop_len`` -- confirm against
            ``AudioFile``).
        hop_len: Hop between frames in seconds; it is multiplied by
            ``audio.sr`` to obtain the hop in samples for the time axis.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    # `import librosa` alone does not expose the `display` submodule on every
    # librosa release, so import it explicitly where it is used.
    import librosa.display

    # specshow documents `hop_length` as an integer number of samples, while
    # `hop_len * audio.sr` is a float that may carry rounding error.
    hop_samples = max(1, int(round(hop_len * audio.sr)))

    fig, ax = plt.subplots(figsize=(10, 4))
    ax.set_title("Mel spectrogram")
    mesh = librosa.display.specshow(
        data=audio.mel_spectrogram(
            n_mels=n_mels, win_len=win_len, hop_len=hop_len
        ),
        sr=audio.sr,
        hop_length=hop_samples,
        x_axis="s",
        y_axis="mel",
        ax=ax,
    )
    # The colorbar is labelled in dB; this assumes `mel_spectrogram` returns
    # dB-scaled values -- confirm against `AudioFile`.
    fig.colorbar(mesh, ax=ax, format="%+2.0f dB")
    fig.tight_layout()
    plt.show()

## UrbanSED Dataset

In [None]:
from yoho.train import load_dataset
from yoho.utils import UrbanSEDDataset

# Load the training and validation partitions.
urbansed_train: UrbanSEDDataset = load_dataset(partition="train")
urbansed_val: UrbanSEDDataset = load_dataset(partition="validate")

# Partition sizes.
n_train_clips = len(urbansed_train)
n_val_clips = len(urbansed_val)
print(f"Number of audio clips in the train dataset: {n_train_clips}")
print(f"Number of audio clips in the validation dataset: {n_val_clips}")

# The first training clip is reported as representative of every clip.
first_clip = urbansed_train.audios[0]
print(f"Duration of each audio clips: {first_clip.duration} seconds")
print(f"Sampling rate of each audio clips: {first_clip.sr} Hz")

## Data generator

In [None]:
from yoho.utils import YOHODataGenerator

# Validation batches are drawn without shuffling: later cells compare item
# `k` of `val_features` with entry `k` of `urbansed_val`, which only lines up
# if the generator yields clips in dataset order.
# NOTE(review): assumes `shuffle=False` makes YOHODataGenerator preserve the
# dataset order -- confirm against its implementation.
val_dataloader = YOHODataGenerator(
    urbansed_val, batch_size=32, shuffle=False
)

# Pull a single batch to sanity-check the shapes fed to the model.
val_features, val_labels = next(iter(val_dataloader))

# These tensors come from the validation partition, so label them as such.
print(f"Validation features shape: {val_features.shape}")
print(f"Validation labels shape: {val_labels.shape}")

## YOHO model

In [None]:
from torchsummary import summary

from yoho.models import YOHO
from yoho.train import get_device

# Pick the compute device (cuda, mps or cpu)
device = get_device()

# Shape of a single model input, shared by the model and its summary
model_input_shape = (1, 40, 257)

# Build the network, with one output class per dataset label, and move it
# onto the selected device
urbansed_model = YOHO(
    name="UrbanSEDYOHO",
    input_shape=model_input_shape,
    n_classes=len(urbansed_train.labels),
)
urbansed_model = urbansed_model.to(device)

# Load the saved checkpoint into the model
checkpoint_path = f"./models/{urbansed_model.name}_checkpoint.pth.tar"
urbansed_model.load(checkpoint_path, device)

# Print the layer summary for a batch of 32 inputs
summary(urbansed_model, input_size=model_input_shape, batch_size=32)

## Evaluation

In [None]:
import json
import matplotlib.pyplot as plt

# Path to the losses.json file
losses_path = "./models/UrbanSEDYOHO_losses.json"

# Read the loss history: a mapping from epoch to its recorded losses
with open(losses_path, "r") as losses_file:
    losses = json.load(losses_file)

# One (epoch, train_loss, val_loss) row per epoch, sorted so the curves are
# drawn in chronological order
loss_rows = sorted(
    (int(epoch), metrics["train_loss"], metrics["val_loss"])
    for epoch, metrics in losses.items()
)
epochs, train_losses, val_losses = zip(*loss_rows)

# Draw both curves on a single set of axes
fig, ax = plt.subplots(figsize=(10, 4))
ax.plot(epochs, train_losses, label="Training Loss")
ax.plot(epochs, val_losses, label="Validation Loss", linestyle="--")
ax.set_xlabel("Epochs")
ax.set_ylabel("Loss")
ax.set_title("Training and Validation Losses")
ax.legend()
ax.grid(True)
plt.show()

In [38]:
# Run the model on one item of the validation batch. Gradients are not needed
# for inference, so autograd tracking is disabled.
# NOTE(review): item 11 of `val_features` only matches
# `urbansed_val._get_output(11)` if the batch was produced in dataset order
# (i.e. the generator did not shuffle) -- confirm.
with torch.no_grad():
    prediction = urbansed_model(val_features[11].unsqueeze(0).to(device))

# Ground truth for the same index. `_get_output` returns a NumPy array, so it
# is converted to a tensor and given a leading batch dimension.
target = torch.from_numpy(urbansed_val._get_output(11)).unsqueeze(0).to(device)

In [None]:
import numpy as np

# Class names, index-aligned with the row triplets of the model output.
labels_ = [
    "noise",
    "air_conditioner",
    "car_horn",
    "children_playing",
    "dog_bark",
    "drilling",
    "engine_idling",
    "gun_shot",
    "jackhammer",
    "siren",
    "street_music",
]


def process_output(
    output: "np.ndarray | torch.Tensor",
    *,
    window_duration: float = 2.56,
    n_steps: int = 9,
    threshold: float = 0.5,
    min_event_duration: float = 0.0,
    min_silence_duration: float = 1.0,
) -> list[tuple[str, float, float]]:
    """
    Decode one output grid into a list of ``(label, start, end)`` events.

    ``output`` is indexed as ``output[row, step]``. Rows come in triplets, one
    triplet per class in ``labels_`` order: presence score, start offset and
    end offset. Offsets are expressed as fractions of one time step, measured
    from the beginning of that step.

    Args:
        output: 2-D array or tensor for a single clip (no batch dimension).
            Elements must support ``.item()``.
        window_duration: Duration in seconds covered by the whole grid.
        n_steps: Number of time steps ``window_duration`` is divided into.
        threshold: Minimum presence score for a cell to count as an event.
        min_event_duration: Events shorter than this (seconds) are dropped.
        min_silence_duration: Two events of the same class separated by less
            than this (seconds) are merged into one.

    Returns:
        Events as ``(label, start, end)`` tuples with times rounded to two
        decimals, ordered by start time and then by class index.
    """
    step_duration = window_duration / n_steps
    n_rows, n_cols = output.shape[0], output.shape[1]

    # 1. One raw event per (step, class) cell whose score reaches the threshold
    detections = []
    for step in range(n_cols):
        step_start = step * step_duration
        for row in range(0, n_rows, 3):
            if output[row, step] >= threshold:
                onset = step_start + output[row + 1, step].item() * step_duration
                offset = step_start + output[row + 2, step].item() * step_duration
                detections.append(
                    (labels_[row // 3], round(onset, 2), round(offset, 2))
                )

    # 2. Group by class. The sort is stable, so events of one class stay in
    #    time-step order.
    detections.sort(key=lambda event: event[0])

    # 3. Merge same-class events separated by less than the minimum silence
    merged = []
    for label, start, end in detections:
        if (
            merged
            and merged[-1][0] == label
            and start - merged[-1][2] < min_silence_duration
        ):
            _, prev_start, prev_end = merged[-1]
            # Keep the furthest end: merging must never shrink an event
            merged[-1] = (label, prev_start, max(prev_end, end))
        else:
            merged.append((label, start, end))

    # 4. Drop events that are too short
    merged = [
        event for event in merged if event[2] - event[1] >= min_event_duration
    ]

    # 5. Order by start time; ties are broken by class index
    merged.sort(key=lambda event: (event[1], labels_.index(event[0])))

    return merged


# Decode the ground truth first, then the model prediction, one per line
decoded_target = process_output(target[0])
decoded_prediction = process_output(prediction[0])
print(decoded_target)
print(decoded_prediction)

In [None]:
# Display the first prediction of the batch rounded to two decimals
rounded_prediction = torch.round(prediction[0], decimals=2)
rounded_prediction

In [None]:
# Mel spectrogram of validation clip 11, with the default analysis settings
plot_melspectrogram(audio=urbansed_val.audios[11])

In [45]:
import os

def get_test_set(
    csv_path: str = "./data/raw/URBAN-SED/test.csv",
    win_len: float = 2.56,
    hop_len: float = 1.00,
) -> "UrbanSEDDataset":
    """
    Build the test dataset from a CSV index of audio files.

    Every CSV row must provide a ``filepath`` and an ``events`` column. Each
    row becomes an ``AudioFile`` that is cut with
    ``AudioFile.subdivide(win_len, hop_len)``; all resulting clips are
    gathered, in file order, into a single ``UrbanSEDDataset``.

    Args:
        csv_path: Path of the CSV index.
        win_len: Window length forwarded to ``AudioFile.subdivide``.
        hop_len: Hop length forwarded to ``AudioFile.subdivide``.

    Returns:
        The ``UrbanSEDDataset`` holding every clip.
    """
    index = pd.read_csv(csv_path)

    clips = []
    for row in index.itertuples(index=False):
        # NOTE(review): `eval` executes whatever expression the CSV contains.
        # If `events` always holds plain Python literals, prefer
        # `ast.literal_eval`; only use this on a trusted CSV.
        audio = AudioFile(filepath=row.filepath, labels=eval(row.events))
        clips.extend(audio.subdivide(win_len=win_len, hop_len=hop_len))

    return UrbanSEDDataset(audios=clips)

# Build the test set once; the following cells reuse `urbansed_test`.
urbansed_test = get_test_set()

In [81]:
# Feed the test set one clip per batch, without shuffling. The evaluation cell
# indexes `urbansed_test.audios` with the enumeration index, so it relies on
# the generator yielding clips in dataset order (presumably what
# `shuffle=False` guarantees -- confirm).
urbansed_test_dataloader = YOHODataGenerator(
    urbansed_test, batch_size=1, shuffle=False
)

In [164]:
import sed_eval
import dcase_util

# `labels_` (class names) and `process_output` come from the post-processing
# cell above; the class list is not repeated here so that it has a single
# definition.

# Ground-truth (reference) events and predicted (estimated) events.
# The existing variable names are kept; each container now holds what its
# name says.
all_thruth_data = dcase_util.containers.MetaDataContainer()
all_predictions = dcase_util.containers.MetaDataContainer()

# Names of the clips that were processed, in processing order.
evaluated_files = []


def _as_metadata(filename, events):
    """Wrap ``(label, onset, offset)`` tuples into a MetaDataContainer."""
    return dcase_util.containers.MetaDataContainer(
        [
            {
                "filename": filename,
                "onset": onset,
                "offset": offset,
                "event_label": label,
            }
            for label, onset, offset in events
        ]
    )


for idx, (features, _) in enumerate(urbansed_test_dataloader):

    # Get model predictions (inference only, so no gradient tracking).
    # NOTE(review): confirm the model is in evaluation mode at this point.
    with torch.no_grad():
        prediction = urbansed_model(features.to(device))
    labels = process_output(prediction[0])

    truth = urbansed_test.audios[idx].labels
    print(f"Truth for audio clip {idx}: {truth}")
    print(f"Prediction for audio clip {idx}: {labels}")

    # Accumulate both lists under the same synthetic file name so they can be
    # matched per clip during evaluation.
    filename = f"{idx}.wav"
    evaluated_files.append(filename)
    all_thruth_data += _as_metadata(filename, truth)
    all_predictions += _as_metadata(filename, labels)

    # NOTE(review): only the first clip is evaluated. Remove this `break` to
    # score the whole test set.
    break


segment_based_metrics = sed_eval.sound_event.SegmentBasedMetrics(
    event_label_list=labels_,
    time_resolution=1.0,
)

# Evaluate the predictions: one call per clip, reference list first, so that
# events of different clips are never mixed on the same timeline.
for filename in evaluated_files:
    segment_based_metrics.evaluate(
        reference_event_list=all_thruth_data.filter(filename=filename),
        estimated_event_list=all_predictions.filter(filename=filename),
    )

overall_segment_based_metrics = segment_based_metrics.results_overall_metrics()
curr_f1 = overall_segment_based_metrics['f_measure']['f_measure']
curr_error = overall_segment_based_metrics['error_rate']['error_rate']

print("F1: {:.3f}, Error: {:.3f}".format(curr_f1, curr_error))

Truth for audio clip 0: [('noise', 0, 2.56), ('siren', 1.896800669134819, 2.56)]
Prediction for audio clip 0: [('noise', 0.0, 2.56), ('street_music', 1.88, 2.56)]
F1: 0.600, Error: 0.400
