In [1]:
import math
import os
import re
import time
from datetime import datetime, timedelta

import matplotlib.dates as mdates
import matplotlib.patches as patches
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from soundbay.utils.metadata_processing import (
    bg_from_non_overlap_calls,
    correct_call_times_with_duration,
    non_overlap_df,
)

## Load data

In [2]:
# Locate the dataset folder relative to the directory the notebook runs in.
current_directory = os.getcwd()

# Walk three levels up from the working directory.
deepvoice_directory = os.path.normpath(
    os.path.join(current_directory, os.pardir, os.pardir, os.pardir)
)

nefsc_folder_path = os.path.join(
    deepvoice_directory,
    "datasets",
    "nefsc_sbnms_200903_nopp6_ch10",
)

In [3]:
# Folder holding the source audio; the trailing slash is kept on purpose.
audio_files_path = f"{nefsc_folder_path}/source-audio/"

# Read the detection log into a DataFrame.
data_path = f"{nefsc_folder_path}/detections/NEFSC_SBNMS_200903_NOPP6_CH10_allbaleen_detection_log.csv"
original_metadata = pd.read_csv(filepath_or_buffer=data_path)

## Utils

In [4]:
# Length of one audio file in seconds. The helpers below use it to turn a
# time of day into a file start time and an offset inside that file.
FILES_DELTA = 15 * 60  # 15-minute files


def get_sec(time_str):
    """Convert an ``H:M:S`` string into the number of seconds it represents.

    Args:
        time_str: Three integer fields separated by colons, e.g. ``"01:02:03"``.

    Returns:
        int: ``hours * 3600 + minutes * 60 + seconds``.

    Raises:
        ValueError: If the string does not contain exactly three integer fields.
    """
    hours, minutes, seconds = (int(field) for field in time_str.split(":"))
    return hours * 3600 + minutes * 60 + seconds


def filetime_from_time(time_str):
    """Return the start time of the file that contains ``time_str``.

    The time of day is rounded down to a multiple of ``FILES_DELTA`` seconds
    and formatted as ``HHMMSS``.

    Args:
        time_str: Time of day as ``H:M:S``.

    Returns:
        str: File start time as a six-digit ``HHMMSS`` string.
    """
    seconds_of_day = get_sec(time_str)
    file_start_sec = FILES_DELTA * math.floor(seconds_of_day / FILES_DELTA)
    return time.strftime("%H%M%S", time.gmtime(file_start_sec))


def get_time_and_date(iso_input):
    """Split an ISO-8601 timestamp into its date and time-of-day strings.

    Any UTC-offset suffix (``-04:00``, ``+02:00`` or ``Z``) is dropped. No
    time-zone conversion is performed: the returned time is the wall-clock
    time exactly as written in the input.

    Args:
        iso_input: Timestamp such as ``"2009-03-28T12:34:56-04:00"``.

    Returns:
        tuple[str, str]: ``(date, time_str)``, e.g.
        ``("2009-03-28", "12:34:56")``.

    Raises:
        ValueError: If ``iso_input`` does not contain exactly one ``"T"``.
    """
    date, time_str = iso_input.split("T")
    # Cut at the first offset marker. Splitting on "-" alone left "+hh:mm"
    # and "Z" suffixes attached, which later broke the seconds parsing.
    time_str = re.split(r"[-+Zz]", time_str, maxsplit=1)[0]
    return date, time_str


def iso_to_file_name(iso_input):
    """Build the recording name for the file that contains ``iso_input``.

    Args:
        iso_input: ISO-8601 timestamp (``<date>T<time>[offset]``).

    Returns:
        str: ``NOPP6_EST_<YYYYMMDD>_<HHMMSS>_CH10``, where the time is the
        timestamp rounded down to the start of its file.
    """
    day, clock = get_time_and_date(iso_input)
    filename_time = filetime_from_time(clock)
    # Drop the dashes of the ISO date: 2009-03-28 -> 20090328.
    filename_date = "".join(day.split("-"))
    return f"NOPP6_EST_{filename_date}_{filename_time}_CH10"


def get_time_in_file(iso_input, type="start"):
    """Return the offset, in seconds, of a timestamp inside its file.

    Args:
        iso_input: ISO-8601 timestamp (``<date>T<time>[offset]``).
        type: ``"end"`` marks the timestamp as the end of a call; any other
            value is treated as a start.

    Returns:
        int: Seconds since the start of the ``FILES_DELTA``-long file that
        contains the timestamp. An end timestamp lying exactly on a file
        boundary yields ``FILES_DELTA`` instead of 0.
    """
    _, clock = get_time_and_date(iso_input)
    offset = get_sec(clock) % FILES_DELTA
    # A call that ends on a boundary ends at the last instant of a file,
    # not at the first instant of the following one.
    if offset == 0 and type == "end":
        return FILES_DELTA
    return offset


def get_previous_filename(filename, files_delta=None):
    """Return the name of the file that immediately precedes ``filename``.

    The name is split on ``"_"``; the second-to-last part must be the file
    start time as ``HHMMSS`` and, when stepping back crosses midnight, the
    third-to-last part must be the date as ``YYYYMMDD`` (the layout produced
    by ``iso_to_file_name``).

    Args:
        filename: Name such as ``"NOPP6_EST_20090330_001500_CH10"``.
        files_delta: File length in seconds. Defaults to the module-level
            ``FILES_DELTA``.

    Returns:
        str: The same name with its time (and, across midnight, its date)
        moved back by one file length.

    Raises:
        AssertionError: If the start time is not a multiple of the file length.
        ValueError: If the time part is not three two-character fields, or if
            midnight is crossed and the date part is not ``YYYYMMDD``.
    """
    if files_delta is None:
        files_delta = FILES_DELTA

    file_parts = filename.split("_")
    time_part = file_parts[-2]
    h, m, s = re.findall("..", time_part)
    time_sec = int(h) * 3600 + int(m) * 60 + int(s)
    new_time = time_sec - files_delta
    assert new_time % files_delta == 0, "whyyyy"

    # Stepping back from 00:00:00 gives a negative time of day. Wrap it into
    # the previous day and move the date part back as well; previously only
    # the time wrapped, so the result named a file on the wrong day.
    day_shift, new_time = divmod(new_time, 24 * 3600)
    if day_shift:
        previous_day = datetime.strptime(file_parts[-3], "%Y%m%d") + timedelta(
            days=day_shift
        )
        file_parts[-3] = previous_day.strftime("%Y%m%d")

    file_parts[-2] = time.strftime("%H%M%S", time.gmtime(new_time))
    return "_".join(file_parts)

## Process

In [5]:
# Map every annotation onto a file name and onto begin/end offsets (seconds)
# inside that file. The file name is derived from the END timestamp.
original_metadata["filename"] = [
    iso_to_file_name(x) for x in original_metadata["End_DateTime_ISO8601"]
]
original_metadata["begin_time"] = [
    get_time_in_file(x) for x in original_metadata["Start_DateTime_ISO8601"]
]
original_metadata["end_time"] = [
    get_time_in_file(x, "end")
    for x in original_metadata["End_DateTime_ISO8601"]
]

# A call that ends exactly on a file boundary gets end_time == FILES_DELTA,
# i.e. it ends at the very end of the file it started in. Its end timestamp,
# however, names the file that *begins* at that boundary, so take the file
# name from the start timestamp for those rows.
ends_on_file_boundary = original_metadata["end_time"] == FILES_DELTA
original_metadata.loc[ends_on_file_boundary, "filename"] = [
    iso_to_file_name(x)
    for x in original_metadata.loc[
        ends_on_file_boundary, "Start_DateTime_ISO8601"
    ]
]

# Negative lengths mark calls whose begin and end offsets belong to two
# different files.
original_metadata["call_length"] = (
    original_metadata["end_time"] - original_metadata["begin_time"]
)

In [None]:
# Collapse every species tag that contains "HUWH" into the plain "HUWH" tag
# and show the class counts before and after.
print(original_metadata['Species'].value_counts())
# Vectorised literal substring match; na=False leaves rows with a missing
# Species untouched instead of raising TypeError on `'HUWH' in nan`.
contains_huwh = original_metadata['Species'].str.contains(
    'HUWH', regex=False, na=False
)
original_metadata.loc[contains_huwh, 'Species'] = 'HUWH'
print(original_metadata['Species'].value_counts())

In [7]:
# Split annotations that straddle two consecutive files into one row per file.
# A negative call_length means the begin offset belongs to the previous file
# and the end offset to the file named in the row.
problematic_samples_filter = original_metadata["call_length"] < 0

# Part that lies in the later file: starts at offset 0, keeps its end offset.
after_split_samples = original_metadata[problematic_samples_filter].copy()
after_split_samples["begin_time"] = 0

# Part that lies in the earlier file: keeps its begin offset and runs to the
# end of that file. Work on a copy so original_metadata stays untouched and
# re-running this cell does not shift the file names a second time.
before_split_metadata = original_metadata.copy()
before_split_metadata.loc[problematic_samples_filter, "end_time"] = FILES_DELTA
before_split_metadata.loc[problematic_samples_filter, "filename"] = [
    get_previous_filename(x)
    for x in original_metadata.loc[problematic_samples_filter, "filename"]
]

new_metadata = pd.concat(
    [before_split_metadata, after_split_samples], ignore_index=True
)
# Both halves inherited the negative call_length of the unsplit row;
# recompute it from the corrected offsets.
new_metadata["call_length"] = (
    new_metadata["end_time"] - new_metadata["begin_time"]
)

In [8]:
# Keep only annotations whose call_length is non-zero.
new_metadata = new_metadata.loc[new_metadata["call_length"].ne(0)]

In [9]:
# Adjust the annotation times using the audio files found under
# audio_files_path.
# NOTE(review): the exact correction lives in soundbay's
# correct_call_times_with_duration; presumably it reconciles call times with
# each file's real duration. Confirm against the helper.
new_metadata = correct_call_times_with_duration(
    new_metadata, audio_files_path=audio_files_path
)

In [None]:
# Number of annotations per Detection_Confidence value at this stage.
new_metadata["Detection_Confidence"].value_counts()

In [None]:
# (rows, columns) before overlapping calls are merged.
new_metadata.shape

In [12]:
# Run non_overlap_df on each species' rows separately (species taken in order
# of first appearance), then stack the results into a single frame again.
species_dfs = [
    non_overlap_df(new_metadata[new_metadata['Species'] == species_name])
    for species_name in new_metadata['Species'].unique()
]
new_metadata = pd.concat(species_dfs, ignore_index=True)

In [None]:
# (rows, columns) after the per-species non_overlap_df pass.
new_metadata.shape

In [None]:
# Detection_Confidence counts after the per-species non_overlap_df pass.
new_metadata["Detection_Confidence"].value_counts()

In [15]:
# Merge overlapping calls across the whole table.
# NOTE(review): this runs non_overlap_df on all species together, so calls of
# different species that overlap are presumably merged into one row. Confirm
# that this is intended and which Species value the merged row keeps.
new_metadata = non_overlap_df(new_metadata)

In [16]:
# Build a per-row identifier from the row index, the Selection value and the
# file name. The "_" separators keep the parts unambiguous: without them,
# index 1 + selection 23 and index 12 + selection 3 yield the same string.
new_metadata["unique_id"] = (
    new_metadata.index.astype(str)
    + "_"
    + new_metadata["Selection"].astype(str)
    + "_"
    + new_metadata["filename"]
)

In [17]:
# Run bg_from_non_overlap_calls on the merged calls and order the result by
# file and by begin offset inside each file.
with_bg_metadata_all = bg_from_non_overlap_calls(new_metadata).sort_values(
    by=["filename", "begin_time"]
)

In [18]:
# Give every distinct Species value an integer id, counting from 1 in order of
# first appearance.
unique_species = with_bg_metadata_all["Species"].unique()
species_to_label_all = dict(
    zip(unique_species, range(1, len(unique_species) + 1))
)

# Rows whose label is 0 keep it; every other row gets the id of its species.
is_labeled_row = with_bg_metadata_all["label"] != 0
with_bg_metadata_all.loc[is_labeled_row, "label"] = (
    with_bg_metadata_all["Species"].map(species_to_label_all)
)

In [19]:
# Store the labels as plain integers.
label_values = with_bg_metadata_all["label"].to_numpy()
with_bg_metadata_all["label"] = label_values.astype("int")

In [20]:
# Keep only the rows whose Detection_Confidence is exactly "Detected".
with_bg_metadata_all = with_bg_metadata_all.loc[
    with_bg_metadata_all["Detection_Confidence"].eq("Detected")
]

In [None]:
# Hold out every file whose name contains "20090330" for validation; all
# remaining rows form the training set.
val_filter = with_bg_metadata_all["filename"].str.contains("20090330")
val_metadata = with_bg_metadata_all.loc[val_filter]
train_metadata = with_bg_metadata_all.loc[~val_filter]
print(f"{len(train_metadata)=}, {len(val_metadata)=}")

In [22]:
# Species that occur in both splits.
common_species = set(train_metadata['Species']) & set(val_metadata['Species'])

# Drop rows of species that are missing from either split.
train_metadata = train_metadata.loc[
    train_metadata['Species'].isin(common_species)
]
val_metadata = val_metadata.loc[val_metadata['Species'].isin(common_species)]

# Sanity check: both splits must now contain the same number of species.
n_train_species = train_metadata['Species'].nunique()
n_val_species = val_metadata['Species'].nunique()
assert n_train_species == n_val_species, "Species counts still do not match!"

In [None]:
# Species -> label lookup, built from the training rows with a non-zero label.
train_labeled_rows = train_metadata[train_metadata.label != 0]
species_label_dict = dict(
    zip(train_labeled_rows['Species'], train_labeled_rows['label'])
)
species_label_dict

In [None]:
def get_time_in_hrs_format(time_secs):
    """Break a duration in seconds into hours, minutes and seconds.

    Args:
        time_secs: Duration in seconds.

    Returns:
        tuple: ``(hours, minutes, seconds)``, where ``hours`` is the number of
        whole hours, ``minutes`` the whole minutes left after removing those
        hours, and ``seconds`` is ``time_secs % 60``.
    """
    hours = time_secs // 3600
    left_after_hours = time_secs - hours * 3600
    minutes = left_after_hours // 60
    seconds = time_secs % 60
    return (hours, minutes, seconds)


# Print, for each split, the sample count, the per-label counts and the total
# duration of calls and of background.
for name, meta in {
    "train": train_metadata,
    "val": val_metadata,
    "with_bg_metadata_all": with_bg_metadata_all,
}.items():
    print(name)
    print(f"Number of samples: {len(meta)}")
    print(f"Labels breakdown: {meta['label'].value_counts()}")
    # Calls carry one label per species (1..K), so every non-zero label is a
    # call. Filtering on label == 1 counted a single species only.
    h, m, s = get_time_in_hrs_format(
        meta["call_length"][meta["label"] != 0].sum()
    )
    print(f"Calls length: {h}:{m}:{s}")
    # Label 0 marks background segments.
    h, m, s = get_time_in_hrs_format(
        meta["call_length"][meta["label"] == 0].sum()
    )
    print(f"Background length: {h}:{m}:{s}")
    print(
        "-----------------------------------------------------------------------------"
    )

In [25]:
# Write the multi-species splits to the working directory (row index omitted).
train_metadata.to_csv("train.csv", index=False)
val_metadata.to_csv("val.csv", index=False)

In [None]:
# Reload the splits from the CSV files in the working directory; this replaces
# the in-memory frames with whatever is stored on disk.
train_metadata = pd.read_csv("train.csv")
val_metadata = pd.read_csv("val.csv")

In [27]:
def _binarize_labels(metadata, positive_label):
    """Return a copy of ``metadata`` with ``label`` reduced to 0/1.

    Rows whose label equals ``positive_label`` get 1; all other rows get 0.
    The input frame is not modified.
    """
    binary = metadata.copy()
    binary.loc[binary.label != positive_label, "label"] = 0
    binary.loc[binary.label == positive_label, "label"] = 1
    return binary


# One-vs-rest training sets for HUWH and RIWH.
huwh_train = _binarize_labels(train_metadata, species_label_dict["HUWH"])
riwh_train = _binarize_labels(train_metadata, species_label_dict["RIWH"])

huwh_train.to_csv("train_huwh.csv", index=False)
riwh_train.to_csv("train_riwh_nefsc_sbnms_200903_nopp6_ch10.csv", index=False)

# One-vs-rest validation sets for HUWH and RIWH.
huwh_val = _binarize_labels(val_metadata, species_label_dict["HUWH"])
riwh_val = _binarize_labels(val_metadata, species_label_dict["RIWH"])

huwh_val.to_csv("val_huwh.csv", index=False)
riwh_val.to_csv("val_riwh_nefsc_sbnms_200903_nopp6_ch10.csv", index=False)