In [1]:
import glob
import math
import re
import time
from typing import List, Tuple

import numpy as np
import pandas as pd
import soundfile as sf

from soundbay.utils.metadata_processing import (
    bg_from_non_overlap_calls,
    correct_call_times_with_duration,
    merge_calls,
    non_overlap_df,
    reorder_columns_to_default_view,
)

In [2]:
# --- Input locations ---------------------------------------------------------
# Glob for the 2021-2022 selection tables and the single 2022-2023 table.
annotations_path_2021_2022 = "selection_tables/MARION_ISLAND_KW_SELECTION_TABLES/*.txt"
annotations_path_2022_2023 = "selection_tables/5756.220509112958_220612142958.Table.1_social.selections.txt"
audio_files_path = "/danielle_mnt/audio/audio_source/"

# --- Parsing metadata out of selection-table file names ----------------------
# Captures the digits that follow "Batch", "Bach" (a typo variant) or "Batch_".
batch_pattern = r"Batch([0-9]+)|Bach([0-9]+)|Batch_([0-9]+)"
# Captures the run of letters that follows the batch number (optionally after "_").
call_type_pattern = r"Batch[0-9]+([a-zA-Z]+)|Batch[0-9]+_([a-zA-Z]+)|Bach[0-9]+_([a-zA-Z]+)|Batch_[0-9]+_([a-zA-Z]+)"

# Canonical call-type names; each maps to itself, and known misspellings /
# singular forms are folded onto the canonical plural spelling.
_canonical_call_types = ('Upsweeps', 'Downsweeps', 'Whistles', 'Tones', 'Squeaks', 'Clicks')
call_type_mapping = {
    **{call_type: call_type for call_type in _canonical_call_types},
    'Downseeps': 'Downsweeps',
    'Squeak': 'Squeaks',
    'Dowsnweeps': 'Downsweeps',
    'Upsweep': 'Upsweeps',
    'Tone': 'Tones',
    'Downsweep': 'Downsweeps',
}

# Integer class label per canonical call type.
call_type_label = dict(
    Upsweeps=1,
    Downsweeps=2,
    Whistles=3,
    Tones=4,
    Squeaks=5,
    Clicks=6,
)

# --- Timing and reproducibility ----------------------------------------------
file_sec_length = 14 * 60  # length of one recording file, in seconds
time_delta = 0.75          # seconds added on each side of a call's begin time
seed = 1234
random_rng = np.random.default_rng(seed)

In [3]:
def get_first_non_null_str_match(pattern: str, s: str) -> str: 
    """Return the first non-empty capture group of the first match of `pattern` in `s`.

    Intended for alternation patterns where only one of several capture groups
    participates in a match. Also works for single-group patterns, and for
    patterns without groups (the whole match is returned).

    Args:
        pattern: Regular expression to search for.
        s: String to search in.

    Returns:
        The first capture group that is neither unmatched nor empty.

    Raises:
        IndexError: If `pattern` does not match `s`, or every capture group of
            the match is empty.
    """
    match = re.search(pattern, s)
    if match is None:
        raise IndexError(f"pattern {pattern!r} does not match {s!r}")
    # Without capture groups there is nothing to choose from: use the whole match.
    candidates = match.groups() or (match.group(0),)
    for group in candidates:
        # Groups that did not participate are None; skip those and empty ones.
        if group:
            return group
    raise IndexError(f"pattern {pattern!r} matched {s!r} but every capture group is empty")


def get_date_and_hour_from_filename(filename: str) -> Tuple[str, str]:
    """Extract the recording date and time encoded in an audio file name.

    The timestamp is read from the second dot-separated field of `filename`,
    whose first 12 characters are taken as ``YYMMDDhhmmss``.

    Args:
        filename: File name such as ``"<prefix>.<YYMMDDhhmmss>.wav"``.

    Returns:
        A ``(date, time)`` pair of strings formatted as ``"20YYMMDD"`` and
        ``"hh:mm:ss"``. The century is hard-coded to "20".

    Raises:
        IndexError: If `filename` contains no ``.`` separator.
    """
    timestamp = filename.split('.')[1]
    year, month, day = '20' + timestamp[:2], timestamp[2:4], timestamp[4:6]
    hour, minute, sec = timestamp[6:8], timestamp[8:10], timestamp[10:12]
    return f'{year}{month}{day}', f'{hour}:{minute}:{sec}'


def get_time_in_file(sec_time, files_delta=file_sec_length, type='start') -> float:
    """Convert a running time in seconds to an offset within a single file.

    The offset is the remainder of `sec_time` after removing whole multiples
    of `files_delta`.

    Args:
        sec_time: Time in seconds.
        files_delta: Length of one file in seconds.
        type: ``'start'`` or ``'end'``. For ``'end'``, an offset of exactly 0
            is reported as `files_delta` (the end of the previous file)
            instead of the start of the next one.

    Returns:
        Offset in seconds, in ``[0, files_delta)`` for ``'start'`` and
        ``(0, files_delta]`` for ``'end'``.
    """
    time_in_file = sec_time - ((sec_time // files_delta) * files_delta)
    if type == 'end' and time_in_file == 0:
        # Was the undefined name FILES_DELTA, which raised NameError on this branch.
        time_in_file = files_delta
    return time_in_file


def set_time_bounds(
    time_in_file, 
    start=True, 
    delta = time_delta,
    files_delta=file_sec_length, 
    random_rng=random_rng) -> float:
    """Shift a time by `delta` and clip the result to ``[0, files_delta]``.

    Args:
        time_in_file: Time offset within a file, in seconds.
        start: Truthy (1/True) shifts the time earlier by `delta`; falsy
            (0/False) shifts it later by `delta`.
        delta: Size of the shift, in seconds.
        files_delta: Upper clipping bound (length of one file, in seconds).
        random_rng: Accepted but not used by this function.

    Returns:
        The shifted time, never below 0 and never above `files_delta`.
    """
    # -1 when start is 1/True, +1 when start is 0/False.
    direction = 1 - 2 * start
    shifted = time_in_file + direction * delta
    lower_clipped = max(0, shifted)
    return min(lower_clipped, files_delta)


def data_enrichment(data: pd.DataFrame) -> pd.DataFrame: 
    """Add the derived columns used by the rest of the notebook to a selection table.

    Reads the ``'Begin File'``, ``'Begin Time (s)'`` and ``'Channel'`` columns
    and returns a new frame with these columns added (the input is not modified):

    - ``date`` / ``time``: parsed from the ``'Begin File'`` name.
    - ``time_in_file``: ``'Begin Time (s)'`` folded into a single file's length.
    - ``begin_time`` / ``end_time``: a window of ``time_delta`` seconds on each
      side of ``time_in_file``, clipped to the file's bounds.
    - ``call_length``: ``end_time - begin_time``.
    - ``filename``: ``'Begin File'`` with the literal text ``.wav`` removed.
    - ``channel``: copy of ``'Channel'``.

    Args:
        data: Selection table containing the three source columns above.

    Returns:
        A copy of `data` with the derived columns added.
    """
    # Parse every file name once instead of once per derived column.
    date_and_time = data['Begin File'].apply(get_date_and_hour_from_filename)
    return data.assign(
        date=date_and_time.apply(lambda t: t[0]),
        time=date_and_time.apply(lambda t: t[1]),
        time_in_file=lambda df: df['Begin Time (s)'].apply(get_time_in_file),
        begin_time=lambda df: df.time_in_file.apply(set_time_bounds),
        end_time=lambda df: df.time_in_file.apply(set_time_bounds, start=0),
        call_length=lambda df: df.end_time - df.begin_time,
        # regex=False: '.wav' must be matched literally. As a regular expression
        # the leading '.' matches any character.
        filename=lambda df: df['Begin File'].str.replace('.wav', '', regex=False),
        channel=lambda df: df['Channel'],
    )


In [4]:
# Sanity check: start=True moves the bound `time_delta` seconds earlier.
set_time_bounds(time_in_file=760, start=True)

759.25

In [5]:
# Sanity check: start=False moves the bound `time_delta` seconds later.
set_time_bounds(time_in_file=760, start=False)

760.75

In [14]:
# Load every 2021-2022 selection table. Batch number and call type are parsed
# from the file path. glob returns paths in arbitrary order, so they are sorted
# to make the row order of the concatenated frame reproducible.
batch_tables = [
    pd.read_table(filename)
    .assign(
        batch=get_first_non_null_str_match(batch_pattern, filename),
        call_type=call_type_mapping[get_first_non_null_str_match(call_type_pattern, filename)],
    )
    .pipe(data_enrichment)
    for filename in sorted(glob.glob(annotations_path_2021_2022))
]

# NOTE(review): the 2022-2023 table gets no `batch` / `call_type` assigned here.
# Unless the file already has a `call_type` column, its rows are dropped by the
# query below - confirm this is intended.
annotations = pd.concat(
    batch_tables + [pd.read_table(annotations_path_2022_2023).pipe(data_enrichment)], axis=0
).query('call_type in ["Downsweeps", "Upsweeps"]')
# .query('call_type != "Clicks"')

# Quick summary of the loaded annotations and their time bounds.
print('\n',
    'recording length: ' + str(file_sec_length) + '\n',
    'shape: ' + str(annotations.shape) + '\n', 
    'min time in file: ' + str(annotations.time_in_file.min()) + '\n',
    'max time in file: ' + str(annotations.time_in_file.max()) + '\n',
    'min begin time: ' + str(annotations.begin_time.min()) + '\n',
    'max end time: ' + str(annotations.end_time.max()) + '\n',
    'min_call_length: ' + str(annotations.call_length.min().round(3)) + '\n',
    'max_call_length: ' + str(annotations.call_length.max().round(3)) + '\n',
     )




 recording length: 840
 shape: (8364, 19)
 min time in file: 3.033401154000103
 max time in file: 839.5595264080002
 min begin time: 2.283401154000103
 max end time: 840.0
 min_call_length: 1.19
 max_call_length: 1.5



In [16]:
# Keep only the annotations that have a call type.
annotations_with_call_type = annotations.dropna(subset=['call_type'])
# Disabled step, kept for reference (presumably corrects call times using the audio files - confirm).
# While it stays commented out, `corrected_annotations` is not defined by this cell.
# corrected_annotations = correct_call_times_with_duration(annotations_with_call_type, audio_files_path)

In [17]:
# `corrected_annotations` is only created by the disabled correction step, so
# referencing it raises NameError on a fresh kernel. Show the shape of the
# loaded annotations instead, for comparison with the filtered frame.
annotations.shape

(8364, 19)

In [18]:
# Shape after dropping the rows that have no call type.
annotations_with_call_type.shape

(8364, 20)

In [23]:
# Build the non-overlapping calls table from the annotations with the per-call
# type column removed (see soundbay.utils.metadata_processing.non_overlap_df).
merged_df = non_overlap_df(
    annotations_with_call_type.drop(columns=['call_type'])
)

In [24]:
# Background segments derived from the non-overlapping calls: keep only rows
# labelled 0 that have a non-zero length, then renumber the index.
bg = (
    bg_from_non_overlap_calls(merged_df)
    .loc[lambda df: df['call_length'] != 0]
    .loc[lambda df: df['label'] == 0]
    .reset_index(drop=True)
)

In [25]:
# Stack the labelled calls (call type mapped to its integer class) on top of
# the background segments, ordered by file and start time.
annotations_with_bg = (
    pd.concat(
        [
            annotations_with_call_type.assign(
                label=lambda df: df['call_type'].map(call_type_label)
            ),
            bg,
        ],
        ignore_index=True,
    )
    .sort_values(['filename', 'begin_time'])
    .reset_index(drop=True)
)

In [26]:
# Length statistics over the call rows only (label != 0, i.e. background excluded).
call_lengths = annotations_with_bg.query('label!=0').call_length
print(
    f"annotations with bg noise shape: {annotations_with_bg.shape} \n"
    f"call_length 99 percentile:  {call_lengths.quantile(0.99).round(3)} \n"
    f"min_call_length:  {call_lengths.min().round(3)} \n"
    f"max_call_length:  {call_lengths.max().round(3)} \n"
)

annotations with bg noise shape: (15106, 21) 
call_length 99 percentile:  1.5 
min_call_length:  1.158 
max_call_length:  1.5 



In [27]:
# Preview the first five segments in (filename, begin_time) order.
(
    annotations_with_bg
    .sort_values(['filename', 'begin_time'])
    .reset_index(drop=True)
    .head(5)
    [['begin_time', 'end_time', 'label']]
    .round(3)
)

Unnamed: 0,begin_time,end_time,label
0,677.885,679.385,2.0
1,679.385,691.081,0.0
2,691.081,692.581,2.0
3,692.581,700.111,0.0
4,700.111,701.611,2.0


In [38]:
# Chronological train/validation split by whole files, so that no file
# contributes rows to both sets.
train_pct = 0.85
val_pct = 1 - train_pct

annotations_with_bg = (
    annotations_with_bg
    .astype({'label': int})
    .sort_values(by=["date", "time", "filename"])
)

# The cut-off file is the one holding the last of the first `train_pct` share
# of label==1 rows, taken in the sorted order above.
label_1_rows = annotations_with_bg.query('label==1')
n = len(label_1_rows)
n_train = int(n * train_pct)
if n_train == 0:
    raise ValueError(
        f"cannot split: {n} rows with label==1 give an empty training prefix at train_pct={train_pct}"
    )
last_filename = label_1_rows.iloc[:n_train].filename.iloc[-1]

# Split on each file's position in the sorted frame rather than on a
# lexicographic filename comparison. The two only agree when every filename
# sorts the same way as its (date, time).
ordered_filenames = annotations_with_bg.filename.drop_duplicates().tolist()
train_filenames = ordered_filenames[:ordered_filenames.index(last_filename) + 1]
is_train = annotations_with_bg.filename.isin(train_filenames)
train = annotations_with_bg[is_train]
val = annotations_with_bg[~is_train]
assert len(train) + len(val) == len(annotations_with_bg)

train.to_csv(f'train_{time_delta}_by_files_3_class_per_frame.csv', index=False)
val.to_csv(f'val_{time_delta}_by_files_3_class_per_frame.csv', index=False)

In [34]:
# Number of rows per label in the validation split.
val['label'].value_counts()

0    1179
2     986
1     429
Name: label, dtype: int64

In [35]:
# Number of rows per label in the training split.
train['label'].value_counts()

0    5563
2    4359
1    2590
Name: label, dtype: int64

In [36]:
# Number of distinct labels present in the training split.
train['label'].nunique()

3

In [37]:
def get_time_in_hrs_format(time_secs):
    """Split a duration in seconds into hours, minutes and seconds.

    Args:
        time_secs: Non-negative duration in seconds (int or float).

    Returns:
        ``(hours, minutes, seconds)``: hours and minutes are ints, and seconds
        is the remainder rounded to 2 decimals.
    """
    hours, remainder = divmod(time_secs, 3600)
    minutes = remainder // 60
    # Floor division of a float yields a float (e.g. 2.0). Convert the whole
    # parts to int so they format as "2:53:..." rather than "2.0:53.0:...".
    return int(hours), int(minutes), round(time_secs % 60, 2)

# Per-split report: row count, label distribution, and the total duration of
# call rows (label != 0) versus background rows (label == 0).
for split_name, split_df in (('train', train), ('val', val)):
    is_background = split_df['label'] == 0
    print(split_name)
    print(f'Number of samples: {len(split_df)}')
    print(f"Labels breakdown:\n {split_df['label'].value_counts()}")
    for title, mask in (('Calls', ~is_background), ('Background', is_background)):
        h, m, s = get_time_in_hrs_format(split_df.loc[mask, 'call_length'].sum())
        print(f"{title} length: {h}:{m}:{s}")
    print('-----------------------------------------------------------------------------')

train
Number of samples: 12512
Labels breakdown:
 0    5563
2    4359
1    2590
Name: label, dtype: int64
Calls length: 2.0:53.0:42.95
Background length: 19.0:31.0:27.69
-----------------------------------------------------------------------------
val
Number of samples: 2594
Labels breakdown:
 0    1179
2     986
1     429
Name: label, dtype: int64
Calls length: 0.0:35.0:22.02
Background length: 6.0:23.0:39.03
-----------------------------------------------------------------------------
