# Explore the preprocessed data

In [None]:
import os

import pandas as pd
import torch
import torchaudio
from matplotlib import pyplot as plt

## Config

In [None]:
# Root of one preprocessing run; the annotation table and the audio tensors
# both live underneath it, so it is spelled out only once.
preprocessed_dir = "../dataset/preprocessed_data/frame-0.04_hop-0.02"

# Tab-separated annotation table for that run.
annotations_path = f"{preprocessed_dir}/labels.tsv"
# Directory holding the serialized audio tensors referenced by the table.
audio_dir = f"{preprocessed_dir}/audio_tensors"

# Label columns inspected in the annotation table.
labels = [
    "click",
    "whistle",
    "cetaceans_allfreq",
    "allfreq",
    "volcano",
]

In [None]:
# Load the tab-separated annotation table (read_table defaults to sep="\t").
annotations = pd.read_table(annotations_path)
# Preview the first rows as the cell output.
annotations.head()

## Check label distributions

In [None]:
# Column-wise sum of every label column, computed once and reused below.
label_counts = annotations[labels].sum()
display(label_counts)

# Grand total over all label columns versus the number of annotation rows.
total_labels = label_counts.sum()
total_annot = len(annotations)

# The `=` specifier echoes each expression next to its value; the last line
# is the ratio of summed label values to rows.
for line in (
    f"{total_labels=}",
    f"{total_annot=}",
    f"{(total_labels/total_annot)=}",
):
    print(line)

## Check extracted spectrograms

In [None]:
def plot_spectrogram(specgram, title=None, ylabel="freq_bin", interpolation="antialiased"):
    """Show a spectrogram in decibels in a new 30x10 matplotlib figure.

    Args:
        specgram: Spectrogram tensor; it is passed through
            ``torchaudio.transforms.AmplitudeToDB`` (default settings) before
            being drawn. Presumably 2-D with frequency bins on the first axis
            and frames on the second, as the axis labels suggest -- confirm
            against the preprocessing step.
        title: Figure title; "Spectrogram (db)" is used when this is falsy.
        ylabel: Label of the vertical axis.
        interpolation: Forwarded unchanged to ``plt.imshow``.
    """
    plt.figure(figsize=(30, 10))

    # Convert to a dB scale first, then draw with the origin at the bottom so
    # the first row of the tensor ends up at the bottom of the image.
    to_db = torchaudio.transforms.AmplitudeToDB()
    specgram_db = to_db(specgram)
    plt.imshow(
        specgram_db,
        origin="lower",
        aspect="auto",
        interpolation=interpolation,
    )
    plt.colorbar()

    heading = title if title else "Spectrogram (db)"
    plt.title(heading)
    plt.xlabel("frame")
    plt.ylabel(ylabel)
    plt.show()

In [None]:
# Keep only the rows where all three of these label columns equal 1.
required_labels = ["whistle", "click", "cetaceans_allfreq"]
mask = annotations[required_labels].eq(1).all(axis=1)
selected_annot = annotations.loc[mask]
# Preview the first matching rows as the cell output.
selected_annot.head()

In [None]:
# Take the file referenced by the first selected annotation; its "path"
# column is joined onto audio_dir below.
tensor_path = selected_annot.iloc[0]["path"]
tensor_file = os.path.join(audio_dir, tensor_path)
# NOTE(review): recent PyTorch releases default torch.load to
# weights_only=True, which may reject non-tensor payloads -- confirm the
# stored object type if this call starts failing after an upgrade.
tensor_data = torch.load(tensor_file)

In [None]:
# torch.load may hand back either a NumPy array or a torch.Tensor depending on
# how the file was saved; torch.from_numpy raises TypeError for a Tensor,
# whereas torch.as_tensor accepts both (and still shares memory with an
# ndarray instead of copying it).
# The [0] picks the first entry along the leading dimension -- presumably the
# channel axis; confirm against the preprocessing step.
plot_spectrogram(torch.as_tensor(tensor_data)[0])