In [1]:
import pandas as pd
from soundbay.utils.metadata_processing import (bg_from_non_overlap_calls, 
                                                correct_call_times_with_duration, 
                                                non_overlap_df, 
                                                reorder_columns_to_default_view,
                                                load_n_adapt_raven_annotation_table_to_dv_dataset_requirements)
import numpy as np
from pathlib import Path
import soundfile as sf

In [2]:
annotation_files = list(Path('annotations').glob('*.txt'))
include_uncertain = False # class 1 is high prob porpoise, class 3 is low prob porpoise, class 2 is high prob sonar, class 4 is low prob sonar

In [3]:
all_annotations = []
for file in annotation_files:
    all_annotations.append(load_n_adapt_raven_annotation_table_to_dv_dataset_requirements(file))
annotations_meta = pd.concat(all_annotations, ignore_index=True)

In [4]:
annotations_meta = annotations_meta[annotations_meta['View'] == 'Waveform 1']

In [5]:
min_freq = annotations_meta['Low Freq (Hz)'].min()
max_freq = annotations_meta['High Freq (Hz)'].max()

In [6]:
annotations_meta.loc[0]

Selection                                              1
View                                          Waveform 1
Channel                                                1
begin_time                                      1.956249
end_time                                         2.17905
Low Freq (Hz)                                  106494.37
High Freq (Hz)                                139586.967
Delta Time (s)                                    0.2228
Delta Freq (Hz)                                33092.597
Avg Power Density (dB FS/Hz)                         NaN
Annotation                                             3
filename                        738496579.150825043633-2
call_length                                     0.222801
Name: 0, dtype: object

In [7]:
annotations_meta['label'] = np.zeros(len(annotations_meta), dtype='int')
annotations_meta['label'][annotations_meta['Annotation'].str.contains('1')] = 1
if include_uncertain:
    annotations_meta['label'][annotations_meta['Annotation'].str.contains('3')] = 1

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  annotations_meta['label'][annotations_meta['Annotation'].str.contains('1')] = 1


In [8]:
min_call_length = annotations_meta['call_length'][annotations_meta['label']==1].min()
annotations_meta['call_length'][annotations_meta['label']==1].mean()

0.6407833957407408

In [9]:
annotations_meta = reorder_columns_to_default_view(annotations_meta)

In [10]:
annotations_meta = correct_call_times_with_duration(annotations_meta, 'splitted_data/')

In [11]:
annotations_meta = non_overlap_df(annotations_meta)

In [12]:
annotations_meta = bg_from_non_overlap_calls(annotations_meta)

In [13]:
annotations_meta = annotations_meta[annotations_meta['call_length'] >= min_call_length]

In [14]:
len(annotations_meta)

745

In [15]:
annotations_meta[annotations_meta['label']==1].call_length.sum()

103.80349226900002

In [16]:
testset_filename = '738496579.150825043633-0'
train_set = annotations_meta[annotations_meta['filename'] != testset_filename]
test_set = annotations_meta[annotations_meta['filename'] == testset_filename]

In [17]:
print(test_set[test_set['label'] == 1].call_length.sum(), 'total call len of testset')
print(train_set[train_set['label'] == 1].call_length.sum(), 'total call len of trainset')

17.21537349600003 total call len of testset
86.58811877299999 total call len of trainset


In [18]:
if include_uncertain:
    train_set.to_csv('train_with_low_prob.csv' ,index=False)
    test_set.to_csv('test_with_low_prob.csv' ,index=False)
else:
    train_set.to_csv('train_high_prob_only.csv' ,index=False)
    test_set.to_csv('test_high_prob_only.csv' ,index=False)