In [1]:
import os
from pathlib import Path
import pandas as pd
import numpy as np
# from glob import glob
# from datetime import datetime
# from tqdm import tqdm

from IPython.display import Audio
import torchaudio

import matplotlib.pyplot as plt
import seaborn as sns

from soundbay.utils.metadata_processing import create_wav_info_df, convert_month_annotation_to_file_anotation

# Resolve the dataset directory relative to the directory the kernel was started in.
BASE_PATH = Path.cwd()
DATASET_PATH = BASE_PATH / "datasets" / "fannie_project"
print(DATASET_PATH)

/mnt/d/DeepVoice/soundbay/datasets/fannie_project


# Build the dataframe of all annotated calls

In [2]:
# Sort the recursive glob so the list of wav paths handed to create_wav_info_df
# is in a stable order; glob order otherwise depends on the filesystem, which
# makes the seeded sampling done on wav_info_df later hard to reproduce.
wav_info_df = create_wav_info_df(sorted(DATASET_PATH.rglob("*.wav")), wav_files_names_format="fannie")

In [9]:
def create_annotation_df_with_file_times(annotation_dir, wav_info_df):
    """
    Load every annotation table under a directory and merge them into one DataFrame.

    All ``*.txt`` files found recursively under ``annotation_dir`` are read as
    tab-separated tables, each one is passed through
    ``convert_month_annotation_to_file_anotation`` together with ``wav_info_df``,
    and the converted tables are concatenated with a fresh 0..n-1 index.

    Parameters
    ----------
    annotation_dir : pathlib.Path
        Directory that is searched recursively for ``*.txt`` annotation files.
    wav_info_df : pd.DataFrame
        Per-recording metadata forwarded unchanged to the conversion helper.
        NOTE(review): the helper presumably uses it to map month-relative times
        onto individual wav files -- confirm in soundbay.utils.metadata_processing.

    Returns
    -------
    pd.DataFrame
        The concatenation of all converted annotation tables.

    Raises
    ------
    ValueError
        If no ``*.txt`` file exists under ``annotation_dir``.
    """
    # Sort the glob result: directory listing order is filesystem dependent, and
    # the row order of the returned frame determines what the seeded
    # `.sample(random_state=...)` calls on it select.
    annotation_files = sorted(annotation_dir.rglob("*.txt"))
    if not annotation_files:
        # pd.concat([]) would also raise ValueError, but with a message that
        # gives no hint about which directory was empty.
        raise ValueError(f"No annotation files (*.txt) found under {annotation_dir}")

    # Convert month-based annotations to file-based annotations
    fixed_annotation_dfs = [
        convert_month_annotation_to_file_anotation(pd.read_csv(file, sep="\t"), wav_info_df)
        for file in annotation_files
    ]

    return pd.concat(fixed_annotation_dfs, ignore_index=True)

In [10]:
# Load the annotation tables of both call types; each lives in its own sub-directory.
ant_blue_annotation, mad_blue_annotation = (
    create_annotation_df_with_file_times(DATASET_PATH / species_dir, wav_info_df)
    for species_dir in ("ANT_BLUE", "MAD_BLUE")
)

ant_blue_annotation.head(), mad_blue_annotation.head()

(   Selection           View  Channel  High Freq (Hz)  Low Freq (Hz)  \
 0          1  Spectrogram 1        1          26.831         24.036   
 1          2  Spectrogram 1        1          26.551         23.757   
 2          3  Spectrogram 1        1          26.551         23.477   
 3          4  Spectrogram 1        1          26.272         23.757   
 4          5  Spectrogram 1        1          26.551         23.477   
 
    Peak Freq (Hz)  Delta Time (s)  Dur 90% (s)  SNR NIST Quick (dB)  \
 0          25.195          6.2761       5.3805                 6.96   
 1          24.609          6.7782       6.0674                 9.95   
 2          25.195          7.9079       5.9529                11.75   
 3          24.609         10.7950       8.4715                 9.56   
 4          25.195          7.1548       5.3805                12.53   
 
             wav_file  Begin Time (s)  End Time (s)  
 0  5756.210501212958      831.942378    838.218523  
 1  5756.210502002958   

In [14]:
# Row counts of the two annotation tables, reported as the number of calls.
num_ant_blue_calls, num_mad_blue_calls = len(ant_blue_annotation), len(mad_blue_annotation)
print(f"ANT_BLUE calls: {num_ant_blue_calls}, MAD_BLUE calls: {num_mad_blue_calls}")

ANT_BLUE calls: 5441, MAD_BLUE calls: 9607


In [12]:
# Mean and (sample) standard deviation of 'Delta Time (s)' in each annotation
# dataframe. The statistics are taken on 'Delta Time (s)', not 'Dur 90% (s)'.
ant_mean, ant_std = ant_blue_annotation['Delta Time (s)'].agg(['mean', 'std'])
mad_mean, mad_std = mad_blue_annotation['Delta Time (s)'].agg(['mean', 'std'])

print(f"ANT_BLUE: mean={ant_mean:.2f}, std={ant_std:.2f}")
print(f"MAD_BLUE: mean={mad_mean:.2f}, std={mad_std:.2f}")

ANT_BLUE: mean=6.39, std=1.38
MAD_BLUE: mean=14.65, std=2.12


In [15]:
# Keep only the recordings whose 'month' value is 11 (November).
november_wavs = wav_info_df.loc[wav_info_df['month'].eq(11)]

def generate_november_sampled_df(annotation_df, target_mean, target_std, november_wavs, random_state):
    """
    Generate synthetic annotation rows placed at random positions in November recordings.

    One row is produced for every two rows of ``annotation_df``. Each row gets a
    file drawn (with replacement) from ``november_wavs``, a length drawn from a
    normal distribution with the given mean/std (floored at 0.1), and a begin
    time drawn uniformly from ``[0, max(duration - length, 0.1))``. The frequency
    and SNR columns are filled with the corresponding column means of
    ``annotation_df``; 'Dur 90% (s)' is the drawn length scaled by the ratio of
    the mean 'Dur 90% (s)' to the mean 'Delta Time (s)' in ``annotation_df``.

    Parameters
    ----------
    annotation_df : pd.DataFrame
        Reference annotations. Must contain 'High Freq (Hz)', 'Low Freq (Hz)',
        'Peak Freq (Hz)', 'Delta Time (s)', 'Dur 90% (s)' and 'SNR NIST Quick (dB)'.
    target_mean, target_std : float
        Mean and standard deviation of the normal distribution the lengths are
        drawn from.
    november_wavs : pd.DataFrame
        Candidate recordings; must contain 'file_name' and 'duration'.
        NOTE(review): 'duration' is assumed to be in seconds -- confirm.
    random_state : int
        Seed used both for picking the recordings and for the random draws.

    Returns
    -------
    pd.DataFrame
        The generated rows, with columns in the order they appear in
        ``annotation_df``. Columns of ``annotation_df`` that this function does
        not generate (for example a 'class' label added afterwards) are left out
        instead of raising ``KeyError``, so the function can be re-run after the
        annotation frames have been extended.

    Raises
    ------
    ValueError
        If rows are requested but ``november_wavs`` is empty.
    """
    n_samples = annotation_df.shape[0] // 2
    if n_samples > 0 and len(november_wavs) == 0:
        raise ValueError("november_wavs is empty: there are no recordings to sample segments from")

    # Randomly sample wav files for November
    sampled_wavs = november_wavs.sample(n=n_samples, replace=True, random_state=random_state).reset_index(drop=True)

    # A private legacy generator instead of np.random.seed(): seeded identically,
    # but it leaves the global NumPy random state of the notebook untouched.
    rng = np.random.RandomState(random_state)

    # Generate random durations with the requested mean and std, floored at 0.1
    delta_times = rng.normal(loc=target_mean, scale=target_std, size=n_samples)
    delta_times = np.clip(delta_times, 0.1, None)

    # Generate random begin times within each wav file's duration. The upper
    # bound is floored at 0.1 so the uniform range never collapses or goes
    # negative when a recording is not longer than the drawn segment.
    durations = sampled_wavs['duration'].to_numpy(dtype=float)
    max_starts = np.maximum(durations - delta_times, 0.1)
    begin_times = rng.uniform(0, max_starts)
    end_times = begin_times + delta_times

    # Ratio used to derive a plausible 'Dur 90% (s)' from the drawn length
    dur90_ratio = annotation_df['Dur 90% (s)'].mean() / annotation_df['Delta Time (s)'].mean()

    # Build the DataFrame
    sampled_df = pd.DataFrame({
        'Selection': np.arange(1, n_samples + 1),
        'View': 'Spectrogram 1',
        'Channel': 1,
        'High Freq (Hz)': annotation_df['High Freq (Hz)'].mean(),
        'Low Freq (Hz)': annotation_df['Low Freq (Hz)'].mean(),
        'Peak Freq (Hz)': annotation_df['Peak Freq (Hz)'].mean(),
        'Delta Time (s)': delta_times,
        'Dur 90% (s)': delta_times * dur90_ratio,
        'SNR NIST Quick (dB)': annotation_df['SNR NIST Quick (dB)'].mean(),
        'wav_file': sampled_wavs['file_name'],
        'Begin Time (s)': begin_times,
        'End Time (s)': end_times
    })

    # Follow annotation_df's column order, restricted to what was generated here.
    output_columns = [col for col in annotation_df.columns if col in sampled_df.columns]
    return sampled_df[output_columns]

# One synthetic table per call type, each matched to that call type's length
# statistics and generated with its own seed.
november_sampled_ant_df = generate_november_sampled_df(
    annotation_df=ant_blue_annotation,
    target_mean=ant_mean,
    target_std=ant_std,
    november_wavs=november_wavs,
    random_state=42,
)
november_sampled_mad_df = generate_november_sampled_df(
    annotation_df=mad_blue_annotation,
    target_mean=mad_mean,
    target_std=mad_std,
    november_wavs=november_wavs,
    random_state=43,
)

november_sampled_ant_df.head(), november_sampled_mad_df.head()


(   Selection           View  Channel  High Freq (Hz)  Low Freq (Hz)  \
 0          1  Spectrogram 1        1       26.182238      23.720313   
 1          2  Spectrogram 1        1       26.182238      23.720313   
 2          3  Spectrogram 1        1       26.182238      23.720313   
 3          4  Spectrogram 1        1       26.182238      23.720313   
 4          5  Spectrogram 1        1       26.182238      23.720313   
 
    Peak Freq (Hz)  Delta Time (s)  Dur 90% (s)  SNR NIST Quick (dB)  \
 0       24.972167        7.070657     5.742328             8.914424   
 1       24.972167        6.195362     5.031470             8.914424   
 2       24.972167        7.278769     5.911344             8.914424   
 3       24.972167        8.485396     6.891287             8.914424   
 4       24.972167        6.063182     4.924123             8.914424   
 
             wav_file  Begin Time (s)  End Time (s)  
 0  5756.211105062958      378.782472    385.853129  
 1  5756.211119032958   

# train, val, test split

In [18]:
# Inspect the column names of the ANT_BLUE annotation table.
# NOTE(review): the saved output of this cell already lists a 'class' column,
# although that column is only assigned in the split cell further down -- the
# cell was evidently run against kernel state from an earlier execution.
ant_blue_annotation.columns

Index(['Selection', 'View', 'Channel', 'High Freq (Hz)', 'Low Freq (Hz)',
       'Peak Freq (Hz)', 'Delta Time (s)', 'Dur 90% (s)',
       'SNR NIST Quick (dB)', 'wav_file', 'Begin Time (s)', 'End Time (s)',
       'class'],
      dtype='object')

In [19]:
# add class to all dataframes:
ant_blue_annotation['class'] = 'ant_blue'
mad_blue_annotation['class'] = 'mad_blue'
november_sampled_ant_df['class'] = 'noise'
november_sampled_mad_df['class'] = 'noise'

# train, val, test split
train_ant_blue = ant_blue_annotation.sample(frac=0.8, random_state=42)
val_ant_blue = ant_blue_annotation.drop(train_ant_blue.index).sample(frac=0.5, random_state=42)
test_ant_blue = ant_blue_annotation.drop(train_ant_blue.index).drop(val_ant_blue.index)

train_mad_blue = mad_blue_annotation.sample(frac=0.8, random_state=42)
val_mad_blue = mad_blue_annotation.drop(train_mad_blue.index).sample(frac=0.5, random_state=42)
test_mad_blue = mad_blue_annotation.drop(train_mad_blue.index).drop(val_mad_blue.index)

train_nov_ant_noise = november_sampled_ant_df.sample(frac=0.8, random_state=42)
val_nov_ant_noise = november_sampled_ant_df.drop(train_nov_ant_noise.index).sample(frac=0.5, random_state=42)
test_nov_ant_noise = november_sampled_ant_df.drop(train_nov_ant_noise.index).drop(val_nov_ant_noise.index)

train_nov_mad_noise = november_sampled_mad_df.sample(frac=0.8, random_state=42)
val_nov_mad_noise = november_sampled_mad_df.drop(train_nov_mad_noise.index).sample(frac=0.5, random_state=42)
test_nov_mad_noise = november_sampled_mad_df.drop(train_nov_mad_noise.index).drop(val_nov_mad_noise.index)

# Combine all dataframes
train_df = pd.concat([train_ant_blue, train_mad_blue, train_nov_ant_noise, train_nov_mad_noise], ignore_index=True)
val_df = pd.concat([val_ant_blue, val_mad_blue, val_nov_ant_noise, val_nov_mad_noise], ignore_index=True)
test_df = pd.concat([test_ant_blue, test_mad_blue, test_nov_ant_noise, test_nov_mad_noise], ignore_index=True)

# choose relavent columns for the final dataframes
columns_to_keep = ['wav_file', 'Begin Time (s)', 'End Time (s)', 'class']
train_df = train_df[columns_to_keep]
val_df = val_df[columns_to_keep]
test_df = test_df[columns_to_keep]

# change column names to match the expected format
train_df.columns = ['file_name', 'begin_time', 'end_time', 'class']
val_df.columns = ['file_name', 'begin_time', 'end_time', 'class']
test_df.columns = ['file_name', 'begin_time', 'end_time', 'class']

# print the shapes of the final dataframes
print(f"Train set shape: {train_df.shape}")
print(f"Validation set shape: {val_df.shape}")
print(f"Test set shape: {test_df.shape}")

# Save the dataframes to CSV files
train_df.to_csv(DATASET_PATH / "train.csv", index=False)
val_df.to_csv(DATASET_PATH / "val.csv", index=False)
test_df.to_csv(DATASET_PATH / "test.csv", index=False)

Train set shape: (18057, 4)
Validation set shape: (2256, 4)
Test set shape: (2258, 4)
