In [32]:
import os
import re
import math
import time
import numpy as np
import pandas as pd
from datetime import timedelta
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import matplotlib.patches as patches
from sklearn.model_selection import train_test_split

from soundbay.utils.metadata_processing import (
    bg_from_non_overlap_calls,
    correct_call_times_with_duration,
    non_overlap_df,
)

## Load data

In [None]:
import os
directory = '/DeepVoice/datasets/'
suffix = '_narw'

# Get all narw dataset folders and their csv paths and wav formats.
# os.listdir returns entries in arbitrary order, so every listing is sorted to make
# "the first csv / wav file" the same file on every run and on every machine.
narw_folders = sorted(f for f in os.listdir(directory) if f.endswith(suffix))
csv_paths = {}
wav_formats = {}

for folder in narw_folders:
    folder_path = os.path.join(directory, folder)
    data_path = os.path.join(folder_path, 'data')
    audio_path = os.path.join(folder_path, 'ancillary', 'source-audio')

    # Find the csv file in the data folder (the lexicographically first one if several exist)
    if os.path.isdir(data_path):
        csv_files = sorted(f for f in os.listdir(data_path) if f.endswith('.csv'))
        if csv_files:
            csv_paths[folder] = os.path.join(data_path, csv_files[0])

    # Derive the wav file-name template from the audio folder
    if os.path.isdir(audio_path):
        wav_files = sorted(f for f in os.listdir(audio_path) if f.endswith('.wav'))
        if wav_files:
            # Get first wav file as template
            wav_file = wav_files[0]
            # Split on underscore and replace the right-most all-digit, 6-character token
            # (the last part, which carries the extension, is not considered) with a
            # format placeholder. Folders with no such token get no template.
            # NOTE(review): every other token, including any date in the name, is kept
            # verbatim from this one file -- confirm all recordings in a folder share it.
            parts = wav_file.split('_')
            for i in range(len(parts) - 2, -1, -1):
                if parts[i].isdigit() and len(parts[i]) == 6:
                    parts[i] = '{filename_time}'
                    # [:-4] drops the trailing ".wav" from the template
                    wav_formats[folder] = '_'.join(parts)[:-4]
                    break

print("CSV paths:")
print(csv_paths)
print("\nWAV formats:")
print(wav_formats)

In [34]:
# Load every dataset's CSV, tag each row with the folder's wav file-name template and
# audio directory, and concatenate once at the end. Growing a DataFrame with pd.concat
# inside the loop re-copies all previously loaded rows on every iteration.
metadata_frames = []
for folder, csv_path in csv_paths.items():
    format_string = wav_formats.get(folder)
    # Folders without a wav template are skipped: no file names could be built for them
    if not format_string:
        continue
    df = pd.read_csv(csv_path)
    df['Format_String'] = format_string
    df["audio_path"] = os.path.join(directory, folder, 'ancillary', 'source-audio')
    metadata_frames.append(df)

# pd.concat raises on an empty list, so fall back to an empty frame when nothing was loaded
original_metadata = (
    pd.concat(metadata_frames, ignore_index=True) if metadata_frames else pd.DataFrame()
)

## Utils

In [5]:
FILES_DELTA = int(timedelta(minutes=15).total_seconds())  # 15 minutes files


def get_sec(time_str):
    """Convert an ``H:M:S`` string into a whole number of seconds.

    Raises ValueError when the string does not have exactly three
    colon-separated integer fields.
    """
    hours, minutes, seconds = map(int, time_str.split(":"))
    return hours * 3600 + minutes * 60 + seconds


def filetime_from_time(time_str):
    """Floor an ``H:M:S`` time to a multiple of FILES_DELTA seconds and format it as ``HHMMSS``."""
    seconds_since_midnight = get_sec(time_str)
    # Integer floor division rounds down to the start of the FILES_DELTA-second window
    window_start = (seconds_since_midnight // FILES_DELTA) * FILES_DELTA
    return time.strftime("%H%M%S", time.gmtime(window_start))


def get_time_and_date(iso_input):
    """Split an ISO 8601 timestamp into its date and clock-time strings.

    Any UTC designator is removed from the time part: a trailing ``Z`` as well
    as ``+hh:mm`` / ``-hh:mm`` offsets (previously only negative offsets were
    stripped, so the other forms leaked into the returned time string).

    Args:
        iso_input: timestamp such as ``"2009-03-28T12:34:56-05:00"``.

    Returns:
        ``(date, time_str)``, e.g. ``("2009-03-28", "12:34:56")``.

    Raises:
        ValueError: if ``iso_input`` does not contain exactly one ``"T"``.
    """
    date, time_str = iso_input.split("T")
    # Everything from the first zone marker onwards is the UTC offset, not the time
    for zone_marker in ("Z", "+", "-"):
        time_str = time_str.split(zone_marker)[0]
    return date, time_str


def iso_to_file_name(iso_input, format_str):
    """Build a recording file name (without extension) from an ISO 8601 timestamp.

    Args:
        iso_input: ISO 8601 timestamp of the event.
        format_str: ``str.format`` template. ``{filename_time}`` is replaced by the
            result of ``filetime_from_time`` for the timestamp's clock time, and the
            optional ``{filename_date}`` by the timestamp's date with the dashes
            removed. Templates that use only ``{filename_time}`` behave as before,
            because ``str.format`` ignores unused keyword arguments.

    Returns:
        The formatted file name.
    """
    date, time_str = get_time_and_date(iso_input)
    return format_str.format(
        filename_time=filetime_from_time(time_str),
        filename_date=date.replace("-", ""),
    )


def get_time_in_file(iso_input, type="start"):
    """Return the offset, in seconds, of a timestamp within its FILES_DELTA-second window.

    Args:
        iso_input: ISO 8601 timestamp.
        type: ``"end"`` marks the timestamp as the end of an event; any other
            value is treated like ``"start"``.

    Returns:
        The clock time modulo FILES_DELTA, except that an ``"end"`` timestamp lying
        exactly on a window boundary yields FILES_DELTA instead of 0.
    """
    _, clock_time = get_time_and_date(iso_input)
    offset = get_sec(clock_time) % FILES_DELTA
    if offset == 0 and type == "end":
        return FILES_DELTA
    return offset

## Process

In [6]:
# File name for each call, built from the call's END timestamp and the dataset's
# file-name template. The two columns are iterated together instead of looking the
# template up with iloc[i] per row, which constructs a full row Series every time.
original_metadata["filename"] = [
    iso_to_file_name(end_iso, format_string)
    for end_iso, format_string in zip(
        original_metadata["End_DateTime_ISO8601"],
        original_metadata["Format_String"],
    )
]
# Offsets of the call start / end within their FILES_DELTA-second window
original_metadata["begin_time"] = [
    get_time_in_file(start_iso)
    for start_iso in original_metadata["Start_DateTime_ISO8601"]
]
original_metadata["end_time"] = [
    get_time_in_file(end_iso, "end")
    for end_iso in original_metadata["End_DateTime_ISO8601"]
]
# begin_time and end_time are each taken relative to their own window, so a call whose
# start and end fall in different windows gets a negative call_length here.
original_metadata["call_length"] = (
    original_metadata["end_time"] - original_metadata["begin_time"]
)

In [7]:
# original_metadata = original_metadata[original_metadata.audio_path == original_metadata.audio_path.unique()[2]].copy()

In [8]:
# The per-row file-name template is not needed past this point; remove the column
original_metadata = original_metadata.drop(columns=['Format_String'])

In [None]:
# Distinct audio directories referenced by the loaded metadata
original_metadata.audio_path.unique()

In [10]:
# Keep only rows whose Species is "RIWH". .copy() makes the result an independent frame,
# so later column assignments on it do not raise pandas' SettingWithCopyWarning.
original_metadata = original_metadata[original_metadata.Species == "RIWH"].copy()

In [11]:
# original_metadata = original_metadata[original_metadata.filename.str.contains('20090330')]

In [None]:
# Print the species counts, collapse every species string that contains 'HUWH'
# into plain 'HUWH', then print the counts again for comparison.
print(original_metadata['Species'].value_counts())
original_metadata['Species'] = original_metadata['Species'].map(
    lambda name: name if 'HUWH' not in name else 'HUWH'
)
print(original_metadata['Species'].value_counts())

In [None]:
# Discard rows whose computed call length is below zero, reporting how many are removed
problematic_samples_filter = original_metadata["call_length"].lt(0)
print(f"Dropping {problematic_samples_filter.sum()} samples with negative call length")
original_metadata = original_metadata.loc[~problematic_samples_filter]

# Continue on an independent copy so original_metadata stays as filtered above
new_metadata = original_metadata.copy()

In [14]:
# Keep only calls whose length is non-zero
has_duration = new_metadata["call_length"].ne(0)
new_metadata = new_metadata.loc[has_duration]

In [15]:
# Run correct_call_times_with_duration once per audio directory, handing it a copy of
# that directory's rows, then stack the results and drop the helper column.
# NOTE(review): presumably this adjusts call times against the real audio durations --
# confirm against soundbay.utils.metadata_processing.
dfs = [
    correct_call_times_with_duration(
        new_metadata[new_metadata.audio_path == folder_audio_path].copy(),
        audio_files_path=folder_audio_path,
    )
    for folder_audio_path in new_metadata.audio_path.unique()
]
new_metadata = pd.concat(dfs, ignore_index=True)
new_metadata = new_metadata.drop(columns=["audio_path"])

In [None]:
# Row count per Detection_Confidence value at this stage
new_metadata["Detection_Confidence"].value_counts()

In [None]:
# (rows, columns) at this stage
new_metadata.shape

In [18]:
# Apply non_overlap_df to each species' rows on their own, in order of the species'
# first appearance, and stack the per-species results into one frame again.
species_dfs = [
    non_overlap_df(new_metadata[new_metadata['Species'] == species_name])
    for species_name in new_metadata['Species'].unique()
]
new_metadata = pd.concat(species_dfs, ignore_index=True)

In [None]:
# (rows, columns) after the per-species non_overlap_df pass
new_metadata.shape

In [None]:
# Row count per Detection_Confidence value after the per-species non_overlap_df pass
new_metadata["Detection_Confidence"].value_counts()

In [21]:
# merge overlapping calls
# (a further non_overlap_df pass, this time over all rows together rather than per species)
new_metadata = non_overlap_df(new_metadata)

In [22]:
# Per-row identifier built from the row index, the Selection value and the file name.
# The parts are joined with "_" because bare concatenation is ambiguous: index 1 with
# Selection 23 and index 12 with Selection 3 would both produce "123<filename>".
new_metadata["unique_id"] = (
    new_metadata.index.astype(str)
    + "_"
    + new_metadata["Selection"].astype(str)
    + "_"
    + new_metadata["filename"]
)

In [23]:
# Run soundbay's bg_from_non_overlap_calls on the merged calls and order the result by
# file name, then by start time.
# NOTE(review): presumably the helper adds background rows with label 0 -- confirm.
with_bg_metadata_all = bg_from_non_overlap_calls(new_metadata).sort_values(
    by=["filename", "begin_time"]
)

In [24]:
# Number the distinct Species values 1..N in order of first appearance; 0 is never
# produced by this mapping.
# NOTE(review): the mapping is built from every row, so a missing Species value would
# also receive a number -- confirm which Species the label-0 rows carry.
species_to_label_all = {
    species: label
    for label, species in enumerate(with_bg_metadata_all["Species"].unique(), start=1)
}

has_nonzero_label = with_bg_metadata_all["label"] != 0
is_detected = with_bg_metadata_all["Detection_Confidence"] == "Detected"

# Rows with a non-zero label and confidence "Detected" take their species' number ...
with_bg_metadata_all.loc[has_nonzero_label & is_detected, "label"] = (
    with_bg_metadata_all["Species"].map(species_to_label_all)
)
# ... while non-zero-label rows with any other confidence are reset to 0
with_bg_metadata_all.loc[has_nonzero_label & ~is_detected, "label"] = 0

In [25]:
# Store the labels as a NumPy integer array
label_values = with_bg_metadata_all["label"].to_numpy()
with_bg_metadata_all["label"] = label_values.astype("int")

In [None]:
# Display the frame ordered by file name and start time (the sorted result is shown, not stored)
with_bg_metadata_all.sort_values(by=["filename", "begin_time"])

In [26]:
# with_bg_metadata_all = with_bg_metadata_all[with_bg_metadata_all["Detection_Confidence"] == "Detected"]

In [None]:
# Total row count; the "=" specifier prints the expression text alongside its value
print(f"{len(with_bg_metadata_all)=}")

In [None]:
# Species -> label lookup, taken from the rows whose label is non-zero
labelled_rows = with_bg_metadata_all[with_bg_metadata_all.label != 0]
species_label_dict = dict(zip(labelled_rows['Species'], labelled_rows['label']))
species_label_dict

In [None]:
# Split with_bg_metadata_all into train and validation sets by file name, so that all rows
# of a file land in the same set; the file-level split is stratified on one label per file.

def create_train_val_split(df, val_size=0.2, random_state=42):
    """Split ``df`` into train and validation frames without sharing files between them.

    File names are split with ``train_test_split``, stratified on one label per
    file, and every row follows its file.

    Args:
        df: DataFrame with ``filename`` and ``label`` columns.
        val_size: passed to ``train_test_split`` as ``test_size``.
        random_state: seed passed to ``train_test_split``.

    Returns:
        ``(train_df, val_df)``, independent copies of the matching rows of ``df``.
    """
    # One label per file: the first non-null label in row order.
    # NOTE(review): this is only representative if a file's rows share one label;
    # confirm, or choose another per-file summary.
    # The file names and their labels are taken from the SAME grouped result so the
    # stratify array always lines up with the array being split. Pairing
    # df['filename'].unique() (first-appearance order) with a default, key-sorted
    # groupby misaligned them whenever df was not sorted by filename. sort=False keeps
    # first-appearance order, so already-sorted input splits exactly as before.
    file_labels = df.groupby('filename', sort=False)['label'].first()
    unique_files = file_labels.index.to_numpy()

    # Split filenames while stratifying by their labels
    train_files, val_files = train_test_split(
        unique_files,
        test_size=val_size,
        random_state=random_state,
        stratify=file_labels.to_numpy(),
    )

    # Every row goes to the set that its file was assigned to
    train_df = df[df['filename'].isin(train_files)].copy()
    val_df = df[df['filename'].isin(val_files)].copy()

    return train_df, val_df

# Apply the split
train_data, val_data = create_train_val_split(with_bg_metadata_all)

# Print split statistics
print(f"Training set size: {len(train_data)} ({len(train_data)/len(with_bg_metadata_all)*100:.1f}%)")
print(f"Validation set size: {len(val_data)} ({len(val_data)/len(with_bg_metadata_all)*100:.1f}%)")

# Verify label distribution
print("\nLabel distribution in training set:")
print(train_data['label'].value_counts(normalize=True))
print("\nLabel distribution in validation set:")
print(val_data['label'].value_counts(normalize=True))

In [31]:
# Write both sets to CSV files in the working directory, without the index column
for split_df, csv_name in (
    (train_data, "train_riwh_narw.csv"),
    (val_data, "val_riwh_narw.csv"),
):
    split_df.to_csv(csv_name, index=False)
