In [102]:
import os
import re
import math
import time
import numpy as np
import pandas as pd
from datetime import timedelta
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import matplotlib.patches as patches
from sklearn.model_selection import train_test_split

from soundbay.utils.metadata_processing import (
    bg_from_non_overlap_calls,
    correct_call_times_with_duration,
    non_overlap_df,
)

## Load data

In [None]:
# Input locations. Both are left empty here and must be filled in before running:
#   annotation_path - directory that is listed for tab-separated ``.txt`` selection tables
#                     (example: Deep_Voice_NARW_Detections_JWreviewed/Selection_Tables).
#   audio_path      - value stored in every row's ``audio_path`` column and later passed
#                     as ``audio_files_path`` when call times are corrected (example: data).
annotation_path = "" # Deep_Voice_NARW_Detections_JWreviewed/Selection_Tables
audio_path = "" # data

In [74]:
# Read every tab-separated ``.txt`` selection table found in ``annotation_path``.
# The listing is sorted because ``os.listdir`` returns entries in arbitrary order,
# which would make the row order differ between runs and machines.
file_paths = sorted(f for f in os.listdir(annotation_path) if f.endswith('.txt'))

per_file_tables = []
for file_path in file_paths:
    current_df = pd.read_csv(os.path.join(annotation_path, file_path), sep='\t')
    # Only rows the analyst scored "T" are marked as detections; all others keep "".
    current_df["Detection_Confidence"] = ""
    current_df.loc[current_df["Analyst Score"] == "T", "Detection_Confidence"] = "Detected"
    current_df['audio_path'] = audio_path
    # The recording name is the part of the table's file name before the first "-".
    current_df['filename'] = file_path.split("/")[-1].split("-")[0]
    per_file_tables.append(current_df)

# Concatenate once instead of growing a DataFrame inside the loop (which re-copies
# all previous rows on every iteration). An empty listing still yields an empty frame.
original_metadata = pd.concat(per_file_tables) if per_file_tables else pd.DataFrame()

# Every annotation in these tables is given the species code RIWH.
original_metadata["Species"] = "RIWH"

In [None]:
# Sanity check: (rows, columns) of the combined annotation table.
original_metadata.shape

## Utils

In [76]:
# Length of one recording file in seconds. The helpers below use it to snap a
# time of day down to the start of the file that contains it.
FILES_DELTA = 15 * 60  # 15 minutes files


def get_sec(time_str):
    """Convert an ``H:M:S`` string into a whole number of seconds.

    The string must split on ":" into exactly three integer fields;
    anything else raises ``ValueError``.
    """
    hours, minutes, seconds = time_str.split(":")
    weighted_fields = ((hours, 3600), (minutes, 60), (seconds, 1))
    return sum(int(field) * weight for field, weight in weighted_fields)


def filetime_from_time(time_str):
    """Return, as ``HHMMSS``, the start time of the file containing ``time_str``.

    ``time_str`` is an ``H:M:S`` time of day; it is rounded down to a
    multiple of ``FILES_DELTA`` seconds.
    """
    elapsed = get_sec(time_str)
    whole_files = math.floor(elapsed / FILES_DELTA)
    return time.strftime("%H%M%S", time.gmtime(whole_files * FILES_DELTA))


def get_time_and_date(iso_input):
    """Split an ISO-8601-style timestamp into its date and time-of-day strings.

    Args:
        iso_input: timestamp of the form ``<date>T<time>[<offset>]``,
            e.g. ``"2021-03-04T12:34:56-05:00"``.

    Returns:
        ``(date, time_str)``, where ``time_str`` has any trailing UTC
        designator removed (``-hh:mm``, ``+hh:mm`` or ``Z``).

    Raises:
        ValueError: if ``iso_input`` does not contain exactly one ``"T"``.
    """
    date, time_str = iso_input.split("T")
    # Cut at the first offset marker. Only "-" used to be handled, so "+hh:mm"
    # and "Z" suffixes were left inside the returned time string.
    time_str = re.split(r"[-+Z]", time_str, maxsplit=1)[0]
    return date, time_str


def iso_to_file_name(iso_input, format_str):
    """Build the name of the recording file that contains an ISO timestamp.

    Args:
        iso_input: ISO-style timestamp, parsed with ``get_time_and_date``.
        format_str: ``str.format`` template. ``{filename_time}`` receives the
            file's start time as ``HHMMSS``; ``{filename_date}`` receives the
            date with its dashes removed. A template may use either or both.

    Returns:
        The formatted file name.
    """
    date, time_str = get_time_and_date(iso_input)
    filename_time = filetime_from_time(time_str)
    # The date used to be parsed and then dropped (its formatting was commented
    # out). Offering it as an extra keyword is harmless for templates that do
    # not reference it, since str.format ignores unused keyword arguments.
    filename_date = date.replace("-", "")
    return format_str.format(filename_time=filename_time, filename_date=filename_date)


def get_time_in_file(iso_input, type="start"):
    """Return the offset, in seconds, of a timestamp inside its recording file.

    The offset is the time of day modulo ``FILES_DELTA``. A timestamp lying
    exactly on a file boundary yields ``FILES_DELTA`` when ``type`` is
    ``"end"`` and 0 otherwise.
    """
    _, clock = get_time_and_date(iso_input)
    offset = get_sec(clock) % FILES_DELTA
    if offset == 0 and type == "end":
        return FILES_DELTA
    return offset


def get_previous_filename(filename):
    """Return the name of the file that starts ``FILES_DELTA`` seconds earlier.

    The second-to-last ``_``-separated field of ``filename`` is read as an
    ``HHMMSS`` start time and replaced by that time minus ``FILES_DELTA``.

    NOTE: for a file starting at ``000000`` the time wraps around to the last
    slot of the day, but no other field is touched, so a date encoded elsewhere
    in the name is NOT moved back by one day.

    Raises:
        AssertionError: if the start time is not a multiple of ``FILES_DELTA``.
        ValueError: if the time field does not split into three two-character groups.
    """
    file_parts = filename.split("_")
    time_part = file_parts[-2]
    h, m, s = re.findall("..", time_part)
    time_sec = int(h) * 3600 + int(m) * 60 + int(s)
    new_time = time_sec - FILES_DELTA
    assert new_time % FILES_DELTA == 0, (
        f"start time {time_part!r} in {filename!r} is not aligned to "
        f"{FILES_DELTA}-second files"
    )
    # Wrap explicitly into [0, 24h). The first file of the day produces a
    # negative value, and time.gmtime() rejects negative timestamps on some
    # platforms (e.g. Windows); elsewhere the result is unchanged.
    seconds_per_day = 24 * 3600
    new_time_part = time.strftime("%H%M%S", time.gmtime(new_time % seconds_per_day))
    file_parts[-2] = new_time_part
    return "_".join(file_parts)

## Process

In [77]:
# Switch the begin/end time columns to the snake_case names used from here on.
original_metadata = original_metadata.rename(
    columns={"Begin Time (s)": "begin_time", "End Time (s)": "end_time"}
)

In [78]:
# Length of every annotation: end time minus begin time.
original_metadata["call_length"] = original_metadata["end_time"].sub(
    original_metadata["begin_time"]
)

In [80]:
# Keep only the RIWH rows. The explicit copy makes the result an independent
# frame, so the column assignments made in later cells do not act on a slice
# of the unfiltered table (which triggers pandas' SettingWithCopyWarning).
original_metadata = original_metadata[original_metadata.Species == "RIWH"].copy()

In [None]:
# Species counts before normalisation.
print(original_metadata['Species'].value_counts())
# Any label that contains 'HUWH' collapses to plain 'HUWH'; everything else is kept.
original_metadata['Species'] = original_metadata['Species'].map(
    lambda name: 'HUWH' if 'HUWH' in name else name
)
# Species counts after normalisation.
print(original_metadata['Species'].value_counts())

In [None]:
# Rows whose end time precedes their begin time are discarded.
problematic_samples_filter = original_metadata["call_length"].lt(0)
print(f"Dropping {problematic_samples_filter.sum()} samples with negative call length")
original_metadata = original_metadata.loc[~problematic_samples_filter]

# All further processing happens on a separate copy of the cleaned table.
new_metadata = original_metadata.copy()

In [83]:
# Discard calls whose length is exactly zero.
new_metadata = new_metadata.loc[new_metadata["call_length"].ne(0)]

In [85]:
# correct files duration
# Rows are processed one audio directory at a time, because
# ``correct_call_times_with_duration`` takes a single ``audio_files_path``.
# The loop variable is deliberately NOT called ``audio_path``: reusing that name
# overwrote the notebook-level ``audio_path`` setting defined in the config cell.
dfs = []
for files_dir in new_metadata.audio_path.unique():
    sub_df = new_metadata[new_metadata.audio_path == files_dir].copy()
    sub_df = correct_call_times_with_duration(
        sub_df, audio_files_path=files_dir
    )
    dfs.append(sub_df)
new_metadata = pd.concat(dfs, ignore_index=True)
# The directory column was only needed for the correction above.
new_metadata = new_metadata.drop(columns=["audio_path"])

In [None]:
# Detection-confidence distribution after the duration correction.
new_metadata["Detection_Confidence"].value_counts()

In [None]:
# (rows, columns) before overlapping calls are merged.
new_metadata.shape

In [88]:
# Run the overlap handling separately on the rows of each species.
species_dfs = [
    non_overlap_df(new_metadata[new_metadata['Species'] == species])
    for species in new_metadata['Species'].unique()
]

# Stack the per-species results into one table with a fresh 0..n-1 index.
new_metadata = pd.concat(species_dfs, ignore_index=True)

In [None]:
# (rows, columns) after the per-species overlap pass.
new_metadata.shape

In [None]:
# Detection-confidence distribution after the per-species overlap pass.
new_metadata["Detection_Confidence"].value_counts()

In [91]:
# merge overlapping calls
# This pass runs over the whole table with all species together; the earlier
# pass handled each species separately.
# NOTE(review): with a single species in the data this presumably repeats the
# per-species pass - confirm it is still needed.
new_metadata = non_overlap_df(new_metadata)

In [92]:
# Row identifier: "<row index><Selection><filename>", joined without separators.
index_text = new_metadata.index.astype(str)
selection_text = new_metadata["Selection"].astype(str)
new_metadata["unique_id"] = index_text + selection_text + new_metadata["filename"]

In [93]:
# Build the table that also holds background rows (presumably the gaps between
# calls - see ``bg_from_non_overlap_calls``), ordered by file and begin time.
with_bg_metadata_all = bg_from_non_overlap_calls(new_metadata).sort_values(
    by=["filename", "begin_time"]
)

In [94]:
# Number the species 1..N in order of first appearance.
species_to_label_all = {
    species: label
    for label, species in enumerate(with_bg_metadata_all["Species"].unique(), start=1)
}

# Rows whose label is 0 are never touched below.
is_call = with_bg_metadata_all["label"] != 0
is_not_detected = with_bg_metadata_all["Detection_Confidence"] != "Detected"

# Calls marked "Detected" receive the numeric label of their species ...
with_bg_metadata_all.loc[is_call & ~is_not_detected, "label"] = (
    with_bg_metadata_all["Species"].map(species_to_label_all)
)
# ... while calls without that mark are relabelled 0.
with_bg_metadata_all.loc[is_call & is_not_detected, "label"] = 0

In [95]:
# Store the labels as plain integers.
with_bg_metadata_all["label"] = with_bg_metadata_all["label"].to_numpy().astype("int")

In [None]:
# A cell renders only its last expression, so the first summary used to be
# computed and silently discarded; it is printed explicitly instead.
# Call-length distribution of the rows labelled 0:
print(with_bg_metadata_all[with_bg_metadata_all.label==0].call_length.value_counts())
# Detection-confidence distribution of the rows with a non-zero label:
with_bg_metadata_all[with_bg_metadata_all.label!=0].Detection_Confidence.value_counts()

In [97]:
# with_bg_metadata_all = with_bg_metadata_all[with_bg_metadata_all["Detection_Confidence"] == "Detected"]

In [None]:
# Display the table ordered by file and begin time. The result is not assigned,
# so ``with_bg_metadata_all`` itself is left unchanged by this cell.
with_bg_metadata_all.sort_values(by=["filename", "begin_time"])

In [None]:
# Total number of rows in the table that is about to be split.
print(f"{len(with_bg_metadata_all)=}")

In [None]:
# Species -> numeric label lookup, taken from the rows with a non-zero label.
labelled_rows = with_bg_metadata_all[with_bg_metadata_all.label != 0]
species_label_dict = dict(zip(labelled_rows['Species'], labelled_rows['label']))
species_label_dict

In [None]:
def create_train_val_split(df, val_size=0.2, random_state=42):
    """Split annotation rows into train and validation sets, file by file.

    Whole files are assigned to one side, so rows of the same file never end
    up in both sets. A split stratified on each file's first non-null label is
    tried first; if ``train_test_split`` rejects it with ``ValueError``, a
    plain random split of the files is used and a warning is printed.

    Args:
        df: DataFrame with at least ``filename`` and ``label`` columns.
        val_size: forwarded to ``train_test_split`` as ``test_size``.
        random_state: seed forwarded to ``train_test_split``.

    Returns:
        ``(train_df, val_df)``: independent copies of the rows of each side.
    """
    # Get unique filenames (in order of first appearance)
    unique_files = df['filename'].unique()

    try:
        # Try stratified split first.
        # ``stratify`` is matched to ``unique_files`` by position, so the labels
        # must be in the same order. A default groupby sorts by filename while
        # unique() keeps appearance order; sort=False plus reindex pins the
        # labels to ``unique_files`` even when ``df`` is not sorted by filename.
        # NOTE(review): only the first non-null label of each file is used; if
        # every file starts with a label-0 row all files share one stratum -
        # confirm this is the intended stratification key.
        file_labels = (
            df.groupby('filename', sort=False)['label']
            .first()
            .reindex(unique_files)
        )
        train_files, val_files = train_test_split(
            unique_files,
            test_size=val_size,
            random_state=random_state,
            stratify=file_labels
        )
    except ValueError:
        # Fall back to random split if stratification fails
        print("Warning: Not enough samples for stratification. Performing random split instead.")
        train_files, val_files = train_test_split(
            unique_files,
            test_size=val_size,
            random_state=random_state
        )

    # Create train and validation masks
    train_mask = df['filename'].isin(train_files)
    val_mask = df['filename'].isin(val_files)

    # Split the dataframe
    train_df = df[train_mask].copy()
    val_df = df[val_mask].copy()

    return train_df, val_df

# Apply the split (default 20% of the files for validation, seed 42)
train_data, val_data = create_train_val_split(with_bg_metadata_all)

# Print split statistics: row counts and their share of the full table.
# The split is made per file, so these row shares need not equal the file share.
print(f"Training set size: {len(train_data)} ({len(train_data)/len(with_bg_metadata_all)*100:.1f}%)")
print(f"Validation set size: {len(val_data)} ({len(val_data)/len(with_bg_metadata_all)*100:.1f}%)")

# Verify label distribution: relative frequency of each label on both sides.
print("\nLabel distribution in training set:")
print(train_data['label'].value_counts(normalize=True))
print("\nLabel distribution in validation set:")
print(val_data['label'].value_counts(normalize=True))

In [58]:
# Write both splits as CSV files, without the index column. The paths are
# relative, so the files land in the current working directory.
# (Plain string literals: the previous f-strings had no placeholders.)
train_data.to_csv("train_riwh_jwreviewed.csv", index=False)
val_data.to_csv("val_riwh_jwreviewed.csv", index=False)

In [None]:
# Show the distinct file names present in the final table
# (the array is displayed wrapped in a one-element list).
[with_bg_metadata_all.filename.unique()]