### Adapt the orcasound annotations file to our format

In [256]:
import pandas as pd
import numpy as np
import soundfile as sf

In [257]:
# The Orcasound annotations are tab-separated; the example file is read so its
# column order can be reused further down.
annotation_file = pd.read_csv('train_data/annotations.tsv', delimiter='\t')
example_file = pd.read_csv(
    '../../../finding_willy/EDA/combined_annotations.csv',
)

#### Remove calls with a duration of 2.450 s (imprecise: they come from a deep-learning model with low temporal resolution)

In [258]:
# Rows whose duration is (numerically) 2.450 s are set aside in `suspects`
# and removed from the working frame.
is_suspect = np.isclose(annotation_file['duration_s'], 2.450)
suspects = annotation_file[is_suspect]
# .copy() makes the filtered frame independent of the original, so the
# in-place column edits made later do not trigger SettingWithCopyWarning.
annotation_file = annotation_file[~is_suspect].copy()

In [259]:
# Map the Orcasound column names onto the names used by our format.
column_renames = {
    'wav_filename': 'filename',
    'start_time_s': 'begin_time',
    'duration_s': 'call_length',
}
annotation_file.rename(columns=column_renames, inplace=True)

In [260]:
# A call ends call_length after it begins.
annotation_file['end_time'] = annotation_file['begin_time'].add(
    annotation_file['call_length']
)

In [261]:
# Every row starts out with label 1.
annotation_file['label'] = np.full(len(annotation_file), 1, dtype=int)

#### reorder the columns

In [262]:
# Target layout: the example file's columns first, then any extra columns
# from the annotation file.
cols = annotation_file.columns.tolist()
orig_cols = example_file.columns.tolist()
# Keep the extra columns in their existing order. A set difference would
# order them by string hash, which changes between interpreter runs and
# makes the written CSV's column order non-reproducible.
known_cols = set(orig_cols)
remaining = [col for col in cols if col not in known_cols]
new_cols = orig_cols + remaining

In [263]:
# Select every row with the columns arranged as listed in new_cols.
annotation_file = annotation_file.loc[:, new_cols]

#### remove .wav extension from the file names

In [264]:
# Strip only a trailing '.wav'. str.replace would also remove '.wav' from the
# middle of a name, and the later f'{file}.wav' lookups would then point at a
# file that does not exist.
wav_ext = '.wav'
name_series = [
    name[:-len(wav_ext)] if name.endswith(wav_ext) else name
    for name in annotation_file['filename']
]
annotation_file['filename'] = name_series

#### remove duplicate lines from the metadata

In [265]:
# Rows that are identical in every column are collapsed to their first occurrence.
annotation_file.drop_duplicates(keep='first', inplace=True)

#### filter out dataset 'podcast_round1'

In [266]:
# Keep only the rows whose dataset is something other than 'podcast_round1'.
annotation_file = annotation_file[annotation_file['dataset'] != 'podcast_round1']

#### make sure end_time is not bigger than audio_len

In [268]:
# Defined here as well: a later cell assigns the same value, but this cell
# needs it first when the notebook is run top to bottom.
wavs_path = 'train_data/wav'

# Read each recording's duration once and broadcast it to its rows.
audio_len_by_file = {
    name: sf.info(f'{wavs_path}/{name}.wav').duration
    for name in annotation_file['filename'].unique()
}
audio_len = annotation_file['filename'].map(audio_len_by_file)

# Drop every call that starts after the end of its recording, not just the
# one with the largest begin_time per file.
starts_after_end = annotation_file['begin_time'] > audio_len
annotation_file.drop(index=annotation_file.index[starts_after_end], inplace=True)
audio_len = audio_len[~starts_after_end]

# Clamp every call that runs past the end of its recording, not just the one
# with the largest end_time per file, and shorten call_length to match.
overruns = annotation_file['end_time'] > audio_len
annotation_file.loc[overruns, 'end_time'] = audio_len[overruns]
annotation_file.loc[overruns, 'call_length'] = (
    audio_len[overruns] - annotation_file.loc[overruns, 'begin_time']
)

blablabla 361


### Label background (bg) rows as bg, and solve the begin_time and call_length issues

In [269]:
# Rows with a call_length of exactly 0 get label 0.
has_zero_length = annotation_file['call_length'].eq(0)
annotation_file.loc[has_zero_length, 'label'] = 0

In [270]:
# One filename per row labelled 0 (a file appears once for each such row).
is_bg_row = annotation_file['label'] == 0
bg_files = annotation_file.loc[is_bg_row, 'filename'].tolist()

In [271]:
# Number of entries in bg_files (one per row labelled 0, so files may repeat).
len(bg_files)

591

#### Filter out duplicates in bg, and drop "bg" rows of files that have calls in other rows of the annotation file

In [272]:
# Total call_length per file, broadcast back onto every row of that file.
# One grouped pass replaces a full-frame rescan for each entry of bg_files,
# which lists a file once per zero-length row.
total_call_len = annotation_file.groupby('filename')['call_length'].transform('sum')

# Remove the zero-length rows of files that also have calls.
bg_rows_with_calls = (annotation_file['call_length'] == 0) & (total_call_len > 0)
annotation_file.drop(index=annotation_file.index[bg_rows_with_calls], inplace=True)

# Rebuild the list of filenames for rows labelled 0 and show how many remain.
bg_files = list(annotation_file[annotation_file['label'] == 0]['filename'])
len(bg_files)

519

In [273]:
import soundfile as sf

In [274]:
# Directory that is combined with '<filename>.wav' when audio files are opened via sf.info.
wavs_path = 'train_data/wav'

In [275]:
for bg_file in bg_files:
    annotation_file.loc[annotation_file['filename'] == bg_file, 'end_time'] = sf.info(f'{wavs_path}/{bg_file}.wav').duration
    annotation_file.loc[annotation_file['filename'] == bg_file, 'call_length'] = sf.info(f'{wavs_path}/{bg_file}.wav').duration

In [276]:
# Collect the sample rates of all referenced recordings and require that
# there is exactly one. Each file's header is read once, rather than once
# per annotation row.
files_3 = list(annotation_file['filename'])
srs = {sf.info(f'{wavs_path}/{file}.wav').samplerate for file in set(files_3)}
print(srs)
assert len(srs)==1, 'Should contain only one sample rate across the dataset!'

{20000}


In [277]:
# Write the reformatted annotations without the index column.
output_csv = 'orcasound_reformated.csv'
annotation_file.to_csv(output_csv, index=False)