In [None]:
from soundbay.utils.metadata_processing import (bg_from_non_overlap_calls, correct_call_times_with_duration, 
                                                non_overlap_df,reorder_columns_to_default_view)
import pandas as pd
import soundfile as sf
import math
import time
import re
import numpy as np

In [None]:
# Inputs: the folder holding the source recordings and the detection log that
# annotates them. Both paths are relative to the notebook's working directory.
audio_files_path = 'nefsc_sbnms_200903_nopp6_ch10/source-audio/'
original_metadata = pd.read_csv(
    'nefsc_sbnms_200903_nopp6_ch10/detections/NEFSC_SBNMS_200903_NOPP6_CH10_allbaleen_detection_log.csv'
)

In [None]:
# Length of one time chunk in seconds (15 minutes). The helpers below round
# timestamps to multiples of this value and use it as the in-file end offset.
FILES_DELTA = 15 * 60 # 15 minutes files

def get_sec(time_str):
    """Convert an ``H:M:S`` clock string into a number of seconds.

    Args:
        time_str: text with exactly three ':'-separated integer fields
            (hours, minutes, seconds).

    Returns:
        int: ``hours * 3600 + minutes * 60 + seconds``.

    Raises:
        ValueError: if there are not exactly three fields or a field is not
            an integer.
    """
    hours, minutes, seconds = map(int, time_str.split(':'))
    return 3600 * hours + 60 * minutes + seconds
    
def filetime_from_time(time_str):
    """Round a clock time UP to the next chunk boundary and format it as HHMMSS.

    Args:
        time_str: ``H:M:S`` clock string understood by ``get_sec``.

    Returns:
        str: the smallest multiple of ``FILES_DELTA`` seconds that is >= the
        given time, rendered with ``'%H%M%S'``.
    """
    chunks_elapsed = math.ceil(get_sec(time_str) / FILES_DELTA)
    boundary_sec = chunks_elapsed * FILES_DELTA
    # NOTE(review): a boundary of 24:00:00 is rendered as '000000' because
    # time.gmtime only keeps the time-of-day part here.
    return time.strftime('%H%M%S', time.gmtime(boundary_sec))

def get_time_and_date(iso_input):
    """Split an ISO-8601 timestamp into its date and clock-time strings.

    Any UTC-offset suffix on the time part is discarded: a negative offset
    (``-04:00``), a positive offset (``+01:00``) or the ``Z`` designator.

    Args:
        iso_input: timestamp such as ``'2009-03-28T00:07:00-0400'``; it must
            contain exactly one ``'T'`` separating date and time.

    Returns:
        tuple[str, str]: ``(date, time_str)``, e.g.
        ``('2009-03-28', '00:07:00')``.

    Raises:
        ValueError: if the input does not split into exactly two parts on 'T'.
    """
    date, time_str = iso_input.split('T')
    # Everything from the first offset marker onwards is the timezone suffix.
    for offset_marker in ('-', '+', 'Z'):
        time_str = time_str.split(offset_marker)[0]
    return date, time_str

def iso_to_file_name(iso_input):
    """Build the audio file name that corresponds to an ISO-8601 timestamp.

    The time is rounded up to the next ``FILES_DELTA`` boundary by
    ``filetime_from_time``. When that rounding crosses midnight the time part
    wraps to ``'000000'``; in that case the date part is advanced by one day so
    the name does not point at the chunk from ~24 hours earlier.

    Args:
        iso_input: timestamp such as ``'2009-03-28T23:50:00-0400'``.

    Returns:
        str: ``'NOPP6_EST_<YYYYMMDD>_<HHMMSS>_CH10'`` (no file extension).
    """
    from datetime import datetime, timedelta

    date, time_str = get_time_and_date(iso_input)
    filename_time = filetime_from_time(time_str)
    # '000000' for a time that is not itself midnight means the round-up
    # landed on 24:00:00, i.e. on the first boundary of the following day.
    if filename_time == '000000' and get_sec(time_str) > 0:
        next_day = datetime.strptime(date, '%Y-%m-%d') + timedelta(days=1)
        date = next_day.strftime('%Y-%m-%d')
    filename_date = date.replace('-', '')
    return f'NOPP6_EST_{filename_date}_{filename_time}_CH10'

def get_time_in_file(iso_input, type='start'):
    """Return the offset, in seconds, of a timestamp inside its time chunk.

    Args:
        iso_input: ISO-8601 timestamp accepted by ``get_time_and_date``.
        type: ``'end'`` marks the timestamp as the end of an annotation; any
            other value is treated like the default ``'start'``.

    Returns:
        int: seconds since the last ``FILES_DELTA`` boundary. An ``'end'``
        timestamp lying exactly on a boundary yields ``FILES_DELTA`` rather
        than 0, so it closes the chunk instead of opening the next one.
    """
    _, clock = get_time_and_date(iso_input)
    offset = get_sec(clock) % FILES_DELTA
    if offset == 0 and type == 'end':
        return FILES_DELTA
    return offset

def get_previous_filename(filename, files_delta=None):
    """Return the name of the file that precedes ``filename`` by one chunk.

    The name is expected to look like the output of ``iso_to_file_name``:
    '_'-separated, with the date (``YYYYMMDD``) third from last and the time
    (``HHMMSS``) second to last. Date and time are shifted together, so the
    file before ``..._20090329_000000_...`` is ``..._20090328_234500_...``.

    Args:
        filename: file name to step back from.
        files_delta: chunk length in seconds; defaults to the module-level
            ``FILES_DELTA``.

    Returns:
        str: the same name with its date/time parts moved back by one chunk.

    Raises:
        AssertionError: if the time part is not a multiple of the chunk length.
        ValueError: if the date/time parts cannot be parsed.
    """
    from datetime import datetime, timedelta

    if files_delta is None:
        files_delta = FILES_DELTA
    file_parts = filename.split('_')
    stamp = datetime.strptime(file_parts[-3] + file_parts[-2], '%Y%m%d%H%M%S')
    seconds_of_day = stamp.hour * 3600 + stamp.minute * 60 + stamp.second
    # File times are expected to sit exactly on chunk boundaries.
    assert seconds_of_day % files_delta == 0, 'whyyyy'
    previous = stamp - timedelta(seconds=files_delta)
    file_parts[-3] = previous.strftime('%Y%m%d')
    file_parts[-2] = previous.strftime('%H%M%S')
    return '_'.join(file_parts)

In [None]:
# Peek at the first rows of the raw detection log (head() shows 5 rows by default).
original_metadata.head()

In [None]:
# Keep only the rows whose species code is 'RIWH'. Filtering first and taking an
# explicit copy means the columns added here and in later cells are written to
# an independent frame rather than to a slice of the raw log, which would
# trigger pandas' SettingWithCopyWarning.
original_metadata = original_metadata[original_metadata['Species'] == 'RIWH'].copy()

# The file is chosen from the END timestamp; begin/end offsets are seconds
# relative to the start of a FILES_DELTA-long chunk.
original_metadata['filename'] = original_metadata['End_DateTime_ISO8601'].map(iso_to_file_name)
original_metadata['begin_time'] = original_metadata['Start_DateTime_ISO8601'].map(get_time_in_file)
original_metadata['end_time'] = original_metadata['End_DateTime_ISO8601'].map(
    lambda iso_end: get_time_in_file(iso_end, 'end')
)

In [None]:
# Sanity check: number of rows per species code currently in the table.
original_metadata['Species'].value_counts()

In [None]:
# Annotation duration in seconds, from the in-file offsets. A negative value
# means begin and end offsets belong to different chunks (handled in the next cell).
original_metadata['call_length'] = original_metadata['end_time'].sub(original_metadata['begin_time'])

In [None]:
# Split annotations that start in one file and end in the next into two rows,
# one per file. A negative call_length marks such rows: the end offset (taken in
# the later file) is smaller than the begin offset (taken in the earlier file).
problematic_samples_filter = original_metadata['call_length'] < 0

# First half: stays in the earlier file and runs to the end of that file.
# Work on a copy so original_metadata is left untouched and re-running this
# cell does not shift the file names back a second time.
before_split_samples = original_metadata.copy()
before_split_samples.loc[problematic_samples_filter, 'end_time'] = FILES_DELTA
before_split_samples.loc[problematic_samples_filter, 'filename'] = (
    before_split_samples.loc[problematic_samples_filter, 'filename'].map(get_previous_filename)
)

# Second half: lives in the later file and starts at its very beginning.
after_split_samples = original_metadata[problematic_samples_filter].copy()
after_split_samples['begin_time'] = 0

new_metadata = pd.concat([before_split_samples, after_split_samples], ignore_index=True)
# Refresh call_length so the split rows no longer carry their stale negative
# duration into the following steps.
new_metadata['call_length'] = new_metadata['end_time'] - new_metadata['begin_time']

In [None]:
# drop annotations whose duration is exactly zero
new_metadata = new_metadata.loc[new_metadata['call_length'].ne(0)]

In [None]:
# Run the project's non_overlap_df helper on the annotations.
# NOTE(review): presumably this resolves overlapping annotations within a file — confirm against the helper.
new_metadata = new_metadata.pipe(non_overlap_df)

In [None]:
# Adjust annotation times using the audio files found under audio_files_path.
# NOTE(review): presumably clips times to each file's real duration — confirm against the helper.
new_metadata = new_metadata.pipe(correct_call_times_with_duration, audio_files_path=audio_files_path)

In [None]:
# recompute call_length from the (possibly modified) begin/end times
new_metadata['call_length'] = new_metadata['end_time'].sub(new_metadata['begin_time'])

In [None]:
# save a clean version of the annotations
# NOTE(review): unlike the train/val exports at the end of the notebook, this
# call does not pass index=False, so the row index is written as an extra
# unnamed first column — confirm that is intended.
new_metadata.to_csv('RIWH_clean_annotations.csv')

In [None]:
# Build the table used from here on with the project's bg_from_non_overlap_calls helper.
# NOTE(review): presumably adds background rows (label 0) between calls — confirm against the helper.
with_bg_metadata = new_metadata.pipe(bg_from_non_overlap_calls)

In [None]:
# Collapse the labels to binary: every row whose label is not 0 becomes 1.
with_bg_metadata.loc[with_bg_metadata['label'].ne(0), 'label'] = 1

In [None]:
# Store the label column with an integer dtype.
with_bg_metadata['label'] = with_bg_metadata['label'].to_numpy().astype('int')

In [None]:
# split to train and val: files whose name contains '20090330' form the
# validation set, everything else is used for training
val_filter = with_bg_metadata['filename'].str.contains('20090330')
train_metadata, val_metadata = with_bg_metadata[~val_filter], with_bg_metadata[val_filter]
print(f'{len(train_metadata)=}, {len(val_metadata)=}')

In [None]:
# split to high prob detection and all detections
def _without_possible_calls(meta):
    """Drop rows that are calls (label 1) marked 'Possibly_Detected'; keep all other rows."""
    keep = (meta['Detection_Confidence'] != 'Possibly_Detected') | (meta['label'] != 1)
    return meta[keep]

high_prob_train_metadata = _without_possible_calls(train_metadata)
high_prob_val_metadata = _without_possible_calls(val_metadata)
print(f'{len(high_prob_train_metadata)=}, {len(high_prob_val_metadata)=}')

In [None]:
def _format_duration(total_seconds):
    """Render a number of seconds as HH:MM:SS; hours keep growing past 24 instead of wrapping."""
    minutes, seconds = divmod(int(total_seconds), 60)
    hours, minutes = divmod(minutes, 60)
    return f'{hours:02d}:{minutes:02d}:{seconds:02d}'

# Print a short summary of every split and remove the confidence column from it.
for name, meta in {'train': train_metadata, 'val': val_metadata,
                   'train_high_prob': high_prob_train_metadata,
                   'val_high_prob': high_prob_val_metadata}.items():
    print(name)
    # remove this label, it's already available by the csv split
    # errors='ignore' keeps the cell re-runnable once the column is gone.
    meta.drop(columns=['Detection_Confidence'], inplace=True, errors='ignore')
    print(f'Number of samples: {len(meta)}')
    print(f"Labels breakdown: {meta['label'].value_counts()}")
    # time.strftime/time.gmtime would wrap totals of 24 hours or more back to
    # 00:00:00, so the totals are formatted by hand.
    print(f"Calls length: {_format_duration(meta['call_length'][meta['label']==1].sum())}")
    print(f"Background length: {_format_duration(meta['call_length'][meta['label']==0].sum())}")
    print('-----------------------------------------------------------------------------')

In [None]:
# Write each split to its own CSV file, without the row index.
for csv_path, split_frame in {
    'train.csv': train_metadata,
    'val.csv': val_metadata,
    'train_high_prob.csv': high_prob_train_metadata,
    'val_high_prob.csv': high_prob_val_metadata,
}.items():
    split_frame.to_csv(csv_path, index=False)