### Adapt the orcasound annotations file to our format

In [None]:
import pandas as pd
import numpy as np
import soundfile as sf
from soundbay.utils.metadata_processing import (reorder_columns_to_default_view, 
                                       correct_call_times_with_duration,
                                       non_overlap_df, bg_from_non_overlap_calls)

#### setup arguments

In [None]:
# Where the wav files live and the stem used for the output CSV names.
wavs_path = 'train_data/wav'
output_filename = 'orcasound'

# The annotations are tab-separated, so the separator is given explicitly.
annotation_file = pd.read_csv(
    'train_data/annotations.tsv',
    sep='\t',
)

#### remove calls with a duration of 2.450 s (imprecise: they come from a DL model with low temporal resolution)

In [None]:
print('Initial amount of records in dataset:', len(annotation_file))

# Float-tolerant match on the 2.450 duration; the mask is computed once and
# reused for both the kept rows and the set-aside rows.
is_suspect = np.isclose(annotation_file['duration_s'], 2.450)
suspects = annotation_file[is_suspect]
annotation_file = annotation_file[~is_suspect]

print('number of records after filtering:', len(annotation_file))

In [None]:
# Map the original column names onto the names used in the rest of this notebook.
column_renames = {
    'wav_filename': 'filename',
    'start_time_s': 'begin_time',
    'duration_s': 'call_length',
}
annotation_file.rename(columns=column_renames, inplace=True)

In [None]:
# end_time is derived row by row as begin_time + call_length.
annotation_file['end_time'] = annotation_file['begin_time'].add(
    annotation_file['call_length']
)

In [None]:
# Every row starts out with label 1 (integer dtype).
row_count = len(annotation_file)
annotation_file['label'] = np.ones(row_count, dtype=int)

#### reorder the columns

In [None]:
# Rebind annotation_file to the frame returned by the soundbay helper.
# NOTE(review): presumably only the column order changes and no rows/values
# are touched — confirm against soundbay.utils.metadata_processing.
annotation_file = reorder_columns_to_default_view(annotation_file)

#### remove .wav extension from the file names

In [None]:
# Strip only a trailing '.wav'. A plain str.replace would also delete '.wav'
# occurring in the middle of a name, and later cells rebuild the path as
# f'{wavs_path}/{file}.wav', so the stem must stay intact.
name_series = [
    x[:-len('.wav')] if x.endswith('.wav') else x
    for x in annotation_file['filename']
]
annotation_file['filename'] = name_series

#### remove duplicate lines from the metadata

In [None]:
# Drop fully identical rows in place, keeping the first occurrence of each.
annotation_file.drop_duplicates(keep='first', inplace=True)
print('Removed duplicated, number of records after filtering:', annotation_file.shape[0])

#### filter out the datasets 'podcast_round1' (inaccurate tags) and 'podcast_round3' (highly correlated with the test set)

In [None]:
# Drop both excluded datasets in a single pass.
excluded_datasets = ['podcast_round1', 'podcast_round3']
annotation_file = annotation_file[~annotation_file['dataset'].isin(excluded_datasets)]

# Adjacent string literals are used instead of backslash continuations inside
# one literal: the latter embeds the continuation lines' leading indentation
# in the printed message.
print('Removed files from podcast_round1 (inaccurate tags) and '
      'podcast_round3 (similar distribution to the test), number of '
      'records after filtering:', len(annotation_file))

#### make sure end_time is not bigger than audio_len

In [None]:
# Rebind annotation_file to the helper's result; the helper is given the wav
# directory alongside the frame.
# NOTE(review): per the heading above, this is expected to clip end_time to
# the audio length — the helper's exact behaviour is not visible here; confirm
# in soundbay.utils.metadata_processing.
annotation_file = correct_call_times_with_duration(annotation_file, wavs_path)

#### verify we have only a single sample-rate in the dataset

In [None]:
# Collect the distinct sample rates by reading the header info of every
# distinct file referenced in the annotations.
all_files = set(annotation_file['filename'])
srs = {sf.info(f'{wavs_path}/{name}.wav').samplerate for name in all_files}

print('sample-rates existing in the data:', srs)
assert len(srs)==1, 'Should contain only one sample rate across the dataset!'

#### add label 0 to files with call_length==0

In [None]:
# Rows whose call_length is exactly 0 get label 0.
annotation_file.loc[annotation_file['call_length'].eq(0), 'label'] = 0

### label bg as call length==0, use part 3 for bg from calls

In [None]:
# Filenames of all rows currently labelled 0 (one entry per row, so a
# filename may appear more than once).
bg_files = annotation_file.loc[annotation_file['label'] == 0, 'filename'].tolist()
print(f'found {len(bg_files)} rows who are bg candidates')

#### filter out duplicates in bg and "bg" files that have calls in other rows of the annotation file

In [None]:
for candidate in bg_files:
    in_file = annotation_file['filename'] == candidate
    total_len = sum(annotation_file.loc[in_file, 'call_length'])
    # A positive total means the same file also has real calls, so its
    # zero-length rows are not genuine background and are dropped.
    if total_len > 0:
        zero_length = annotation_file['call_length'] == 0
        stale_rows = annotation_file[in_file & zero_length].index
        annotation_file.drop(index=stale_rows, inplace=True)

# Re-derive the candidate list from what is left after the drops.
bg_files = annotation_file.loc[annotation_file['label'] == 0, 'filename'].tolist()
print(f'left with {len(bg_files)} rows who are legit bg candidates, others included calls')

#### set end_time and call_length of each bg row to the duration of its wav file. We assume that the entire file is background noise

In [None]:
# For every background file, set both end_time and call_length to the wav
# duration. dict.fromkeys de-duplicates while keeping first-seen order, so each
# wav is opened once; the original loop opened it twice per listed row.
for bg_file in dict.fromkeys(bg_files):
    wav_duration = sf.info(f'{wavs_path}/{bg_file}.wav').duration
    bg_rows = annotation_file['filename'] == bg_file
    annotation_file.loc[bg_rows, ['end_time', 'call_length']] = wav_duration

#### save file

In [None]:
# Write the reformatted annotations next to the notebook, without the index column.
reformatted_csv = f'{output_filename}_reformated.csv'
annotation_file.to_csv(reformatted_csv, index=False)

### label bg , use part 3 for bg from calls

#### we are going to use only recordings with positive calls to get bg noise, so remove all records with label==0

In [None]:
# Keep only the rows labelled 1.
annotation_file = annotation_file.loc[annotation_file['label'].eq(1)]

In [None]:
# Rebind annotation_file to the result of the soundbay helper.
# NOTE(review): judging by its name, this presumably merges or removes
# overlapping call intervals — confirm in soundbay.utils.metadata_processing.
annotation_file = non_overlap_df(annotation_file)

In [None]:
# Rebind annotation_file to the result of the soundbay helper.
# NOTE(review): presumably this adds background segments taken from the gaps
# between the non-overlapping calls — confirm in
# soundbay.utils.metadata_processing.
annotation_file = bg_from_non_overlap_calls(annotation_file)

In [None]:
# Write the calls-plus-derived-background table, without the index column.
bg_from_calls_csv = f'{output_filename}_bg_from_calls.csv'
annotation_file.to_csv(bg_from_calls_csv, index=False)