# Marine Datathon 2022: Exploratory Data Analysis


In [None]:
import os

import numpy as np
import pandas as pd
import torch
import torchaudio
import librosa
from matplotlib import pyplot as plt

## Config


In [None]:
# Relative locations of the annotations CSV and of the directory that holds
# the audio files (files are later opened as "<audio_dir>/<name>.wav").
annotations_path = "../dataset/labels.csv"
audio_dir = "../dataset/audios"

## Load Data

In [None]:
# Read the annotations table and preview its first rows.
annotations = pd.read_csv(annotations_path)
annotations.head()

## Labels Exploration

In [None]:
# Count the missing values in each column.
annotations.isna().sum()

In [None]:
# Number of annotated events per label.
annotations.label.value_counts()

Show the distribution of event durations

In [None]:
# Histogram (100 bins) of the annotated event durations.
plt.figure(figsize=(30, 10))
plt.xlabel("Duration (seconds)")
plt.ylabel("Count")
annotations.duration.plot.hist(bins=100)

Drop **click** labels with a very long duration (> 290 seconds). They are annotation errors.

In [None]:
# Flag the "click" events that last more than 290 seconds and keep the rest.
drop_mask = (annotations.duration > 290) & (annotations.label == "click")
annotations = annotations.loc[~drop_mask]

In [None]:
# Events per label, recomputed on the filtered annotations.
annotations.label.value_counts()

In [None]:
# Show the 15 rows with the largest duration (ascending sort, last rows).
annotations.sort_values("duration").tail(15)

Show the new distribution without wrong labels

In [None]:
# Same 100-bin duration histogram, drawn from the filtered annotations.
plt.figure(figsize=(30, 10))
plt.xlabel("Duration (seconds)")
plt.ylabel("Count")
annotations.duration.plot.hist(bins=100)

In [None]:
# Summary statistics of the timing columns.
annotations[["start", "duration", "end"]].describe()

Check the total duration of audio data by label (**in minutes**)

In [None]:
# Sum of event durations per label, divided by 60 to go from seconds to minutes.
annotations.groupby("label").duration.sum() / 60

Check the mean duration of each audio event by label

In [None]:
# Average event duration per label.
annotations.groupby("label").duration.mean()

### Test labels extraction for Sound Event Detection (SED)

Prepare a function to convert all the annotations from one audio file into a 2D matrix of shape (# labels, frames) that encodes which labels are active at each time step

In [None]:
# Labels in the order returned by value_counts(); a label's position in this
# index is also its row in the masks built below.
sorted_labels = annotations.label.value_counts().index
n_labels = len(sorted_labels)
labels2idx = {label: position for position, label in enumerate(sorted_labels)}


def labels_to_mask(
    audio_annot: pd.DataFrame,
    frame_size: int,
    hop_size: int,
) -> torch.Tensor:
    """Encode the annotations of one audio file as a binary (labels, frames) mask.

    The frame grid matches a spectrogram computed without padding: frame ``i``
    starts at sample ``i * hop_size`` and spans ``frame_size`` samples.

    Args:
        audio_annot: Annotations of a single audio file with ``start``, ``end``
            (in seconds) and ``label`` columns. Its ``name`` attribute must be
            the audio file name without the ``.wav`` extension (as set by
            ``groupby(...).apply``).
        frame_size: Analysis frame length, in samples.
        hop_size: Distance between the starts of consecutive frames, in samples.

    Returns:
        An integer tensor of shape ``(n_labels, n_frames)`` holding 1 where a
        label is active on a frame and 0 elsewhere.
    """
    # The audio is loaded only to learn its length and its sample rate
    signal, sample_rate = torchaudio.load(os.path.join(audio_dir, f"{audio_annot.name}.wav"))

    # Number of complete frames; zero when the audio is shorter than one frame
    n_samples = signal.shape[-1]
    n_frames = max((n_samples - frame_size) // hop_size + 1, 0)
    mask = torch.zeros((n_labels, n_frames), dtype=torch.int32)

    # Frames advance by hop_size samples, so the time-to-frame conversion must
    # use the hop duration (not the frame duration) to stay aligned with the
    # n_frames grid above.
    hop_time = hop_size / sample_rate
    for row in audio_annot.itertuples(index=False):
        # Clamp at zero so a negative time cannot wrap around as a slice index;
        # indices past the last frame are truncated by the slice itself.
        start_frame = max(int(row.start / hop_time), 0)
        end_frame = max(int(row.end / hop_time), 0)
        mask[labels2idx[row.label], start_frame:end_frame] = 1

    return mask

Apply the mask extraction (for each audio file) to the temporal annotations

In [None]:
sample_idx = 47  # Position of the audio (in the per-audio mask series) to inspect
# Framing parameters, in samples
FRAME_SIZE = 1024
HOP_SIZE = 512

# Sample rate used for the duration figures below, in Hz
# NOTE(review): presumably the rate of the dataset audios — confirm against the files
SOURCE_SAMPLE_RATE = 50000
# Durations in seconds of one sample, one frame and one hop
sample_duration = 1 / SOURCE_SAMPLE_RATE
frame_duration = sample_duration * FRAME_SIZE
hop_duration = sample_duration * HOP_SIZE
print(f"{sample_duration=}s")
print(f"{frame_duration=}s")
print(f"{hop_duration=}s")

In [None]:
# Build one label mask per audio: rows are grouped by the "path" column and
# each group is passed to labels_to_mask.
audio_masks = annotations.groupby("path").apply(lambda x: labels_to_mask(x, FRAME_SIZE, HOP_SIZE))

In [None]:
def plot_mask(mask, labels):
    """Draw a label mask as an image with one row per label.

    Args:
        mask: 2-D array-like shown with ``imshow``; rows are labelled with
            ``labels`` and the horizontal axis is the frame index.
        labels: Tick labels for the rows, one per row of ``mask``.
    """
    row_positions = range(len(labels))

    plt.figure(figsize=(30, 10))
    # interpolation="none" keeps the cell edges sharp
    plt.imshow(mask, cmap="jet", interpolation="none", aspect="auto")
    plt.xlabel("Frame")
    plt.yticks(row_positions, labels=labels)
    plt.colorbar()
    plt.show()

In [None]:
# Identifier (group key) of the audio chosen through sample_idx
selected_audio_id = audio_masks.index[sample_idx]

plot_mask(audio_masks.iloc[sample_idx], sorted_labels)

# Show the annotations of the same audio to compare them with the mask
display(annotations.loc[annotations.path == selected_audio_id])

## Audio visualization

In [None]:
# Load the waveform and the sample rate of the selected audio file.
signal, sample_rate = torchaudio.load(os.path.join(audio_dir, f"{selected_audio_id}.wav"))

In [None]:
def plot_waveform(waveform, sample_rate):
    """Plot the first row of a waveform tensor against time.

    Args:
        waveform: 2-D tensor; only row 0 is drawn (presumably laid out as
            (channels, samples) — confirm against the caller).
        sample_rate: Samples per second, used to turn sample indices into time.
    """
    samples = waveform.numpy()
    # Unpacking also rejects inputs that are not 2-D
    _, n_samples = samples.shape
    seconds = torch.arange(0, n_samples) / sample_rate

    plt.figure(figsize=(30, 10))
    plt.plot(seconds, samples[0], linewidth=1)
    plt.grid(True)
    plt.suptitle("waveform")
    plt.show(block=False)


def plot_spectrogram(specgram, title=None, ylabel="freq_bin", interpolation="antialiased"):
    """Display a spectrogram after converting it with ``AmplitudeToDB``.

    Args:
        specgram: 2-D tensor to display; the horizontal axis is labelled "frame".
        title: Figure title; falls back to "Spectrogram (db)" when empty or None.
        ylabel: Label of the vertical axis.
        interpolation: Interpolation mode forwarded to ``imshow``.
    """
    heading = title if title else "Spectrogram (db)"

    plt.figure(figsize=(30, 10))
    plt.title(heading)
    plt.xlabel("frame")
    plt.ylabel(ylabel)

    # origin="lower" puts the first row at the bottom of the image
    to_db = torchaudio.transforms.AmplitudeToDB()
    specgram_db = to_db(specgram)
    plt.imshow(specgram_db, origin="lower", aspect="auto", interpolation=interpolation)
    plt.colorbar()
    plt.show()

In [None]:
# Plot the waveform of the selected audio.
plot_waveform(signal, sample_rate)

In [None]:
# Spectrogram with the notebook's frame/hop sizes; center=False disables the
# padding around each frame. Only the first entry along dim 0 is plotted.
spec_transform = torchaudio.transforms.Spectrogram(n_fft=FRAME_SIZE, hop_length=HOP_SIZE, center=False)
spectrogram = spec_transform(signal)
plot_spectrogram(spectrogram[0])

In [None]:
# Mel spectrogram (64 mel bands) with the same framing as the linear one.
# The sample rate comes from the loaded file rather than a hard-coded value,
# so the mel filter bank always matches the actual audio.
melspec_transform = torchaudio.transforms.MelSpectrogram(
    sample_rate=sample_rate,
    n_fft=FRAME_SIZE,
    hop_length=HOP_SIZE,
    center=False,
    n_mels=64,
)
mel_spectrogram = melspec_transform(signal)
plot_spectrogram(mel_spectrogram[0])

## Chunks preprocessing

Experiment with the frame chunks extraction from the spectrograms and labels mask to create the training samples

In [None]:
# Cut the spectrogram and the label mask into chunks of n_frames frames,
# advancing by n_frames // 2 frames (50% overlap between consecutive chunks).
# NOTE(review): both inputs are torch tensors; presumably librosa converts
# them to NumPy arrays internally — confirm for the installed version.
n_frames = 512
spec_chunks = librosa.util.frame(spectrogram, frame_length=n_frames, hop_length=n_frames // 2)
mask_chunks = librosa.util.frame(audio_masks.iloc[sample_idx], frame_length=n_frames, hop_length=n_frames // 2)
print(f"{spec_chunks.shape=}")
print(f"{mask_chunks.shape=}")

In [None]:
# Select one chunk (the last axis indexes the chunks) from both arrays and
# plot the spectrogram chunk next to its label mask.
chunk_idx = 23
spec_chunk = spec_chunks[:, :, :, chunk_idx]
mask_chunk = mask_chunks[:, :, chunk_idx]
print(f"{spec_chunk.shape=}")
print(f"{mask_chunk.shape=}")
plot_spectrogram(torch.from_numpy(spec_chunk[0]))
plot_mask(torch.from_numpy(mask_chunk), sorted_labels)

Test the mask chunk combination to create the 1D one-hot vector

In [None]:
# Sum over axis 1 to count, per label, the active frames in the chunk
acc_mask_chunk = mask_chunk.sum(axis=1)  # Aggregate the frames
# Binary vector: 1 for every label active on at least one frame of the chunk
chunk_label = acc_mask_chunk.astype(bool).astype(np.uint8)
# Alternative kept for reference (fraction of active frames per label):
#chunk_label = acc_mask_chunk / mask_chunk.shape[1]
chunk_label