# imports

In [200]:
import pandas as pd
import numpy as np
import soundfile as sf
import os

from soundbay.utils.metadata_processing import (reorder_columns_to_default_view, 
                                       correct_call_times_with_duration,
                                       non_overlap_df, bg_from_non_overlap_calls,
                                                raven_to_df_annotations,
                                               )
# Raise pandas' display limits so the annotation tables below are not cut short.
pd.set_option("display.max_rows", 1000)
pd.set_option("display.max_columns", 1000)
pd.set_option("display.max_colwidth", None)  # None = do not truncate cell contents


# Show the value of every bare expression in a cell, not only the last one.
# Later cells rely on this to display several shapes / frames from a single cell.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# params

In [201]:
# Directory holding the taggers' Raven selection tables; outputs are written next to them.
annotation_dir_path = '/mnt/c/Mine/Code/DeepVoice/active_learning/taggers_results/'
# recording_path = '/mnt/c/Mine/heavy_stuff_dont_backup/DeepVoice/data/recordings_2018'

# Raven selection table to process (alternatives kept for quick switching).
# annotation_filename = '180913_081527_sec_60_to_360.Table.1.selections.txt'
# annotation_filename = '180913_081527_sec_236_to_536.Table.1.selections.txt'
annotation_filename = '180916_120609_sec_1200_to_1500.Table.1.selections.txt'
annotation_full_path = os.path.join(annotation_dir_path, annotation_filename)

# Derive the output name from the selected annotation file so the two cannot drift
# apart when a different annotation_filename is chosen above.
output_filename = annotation_filename.split('.')[0] + '_adapted.txt'
output_full_path = os.path.join(annotation_dir_path, output_filename)

In [202]:
# Existing training-set annotations that the newly adapted annotations are appended to
# further down; the path is relative to the notebook's working directory.
train_set_annotations_full_path = '../mozambiqe_2018/combined_annotations_filtered_train.csv'

# load & adapt 

## rename & add fields

In [203]:
def load_n_adapt_raven_annotation_table_to_dv_dataset_requirements(annotation_path: str,
                                                                   filename_suffix: str = ".Table.1.selections.txt"
                                                                   ) -> pd.DataFrame:
    """Load a Raven selection table and adapt its columns for the dataset annotations.

    The tab-separated table is read, the 'Begin Time (s)' / 'End Time (s)' columns are
    renamed to ``begin_time`` / ``end_time``, and two columns are appended:
    ``filename`` and ``call_length``.

    Args:
        annotation_path: Path to the tab-separated selection table.
        filename_suffix: Trailing part of the file name that is dropped when
            deriving the ``filename`` column.

    Returns:
        DataFrame with the table's columns (time columns renamed) followed by
        ``filename`` (base name of ``annotation_path`` without the suffix) and
        ``call_length`` (``end_time - begin_time``).

    Raises:
        KeyError: If no begin/end time column is available after renaming.
    """
    # todo: decide whether to add annotation treatment
    df_annotations = pd.read_csv(annotation_path, sep="\t")

    # Strip the suffix only from the end of the name; str.replace would also delete
    # any occurrence in the middle of it.
    filename = os.path.basename(annotation_path)
    if filename_suffix and filename.endswith(filename_suffix):
        filename = filename[:-len(filename_suffix)]
    df_annotations['filename'] = filename

    df_annotations = df_annotations.rename(columns={'Begin Time (s)': 'begin_time', 'End Time (s)': 'end_time'})

    # Fail with an explicit message instead of an opaque KeyError from the subtraction.
    missing = [col for col in ('begin_time', 'end_time') if col not in df_annotations.columns]
    if missing:
        raise KeyError(f"{annotation_path} has no column(s) for: {missing}")

    df_annotations['call_length'] = df_annotations['end_time'] - df_annotations['begin_time']
    return df_annotations

In [204]:
# Read the selected Raven table and bring its columns into the annotation layout.
df_adapted = load_n_adapt_raven_annotation_table_to_dv_dataset_requirements(
    annotation_path=annotation_full_path,
)

In [205]:
# df_adapted.to_csv(output_full_path, sep='\t', index=False)

## add label = 1

In [206]:
# Mark every row loaded from the selection table with label 1.
df_adapted = df_adapted.assign(label=1)

## reorder the columns

In [207]:
# Column order before and after reorder_columns_to_default_view (both are displayed).
df_adapted.columns
df_adapted = df_adapted.pipe(reorder_columns_to_default_view)
df_adapted.columns

Index(['Selection', 'View', 'Channel', 'begin_time', 'end_time',
       'Low Freq (Hz)', 'High Freq (Hz)', 'Annotation', 'filename',
       'call_length', 'label'],
      dtype='object')

Index(['begin_time', 'end_time', 'filename', 'call_length', 'label',
       'High Freq (Hz)', 'Channel', 'View', 'Annotation', 'Low Freq (Hz)',
       'Selection'],
      dtype='object')

## remove duplicate lines from the metadata

In [208]:
# Shape before de-duplication (displayed), for comparison with the count printed below.
df_adapted.shape
# Rebind instead of drop_duplicates(inplace=True): the frame came back from a helper,
# so mutating it in place can touch (or warn about) an object this notebook does not own.
df_adapted = df_adapted.drop_duplicates()
print('Removed duplicated, number of records after filtering:', len(df_adapted))

(296, 11)

Removed duplicated, number of records after filtering: 296


## clean filename

In [209]:
# Keep only the part of the name before the first '_sec' (drops the '_sec_<from>_to_<to>' tail).
# The vectorised .str[0] replaces .apply(lambda x: x[0]) and leaves missing values as NaN
# instead of raising; n=1 stops splitting after the first separator.
df_adapted['filename'] = df_adapted['filename'].str.split('_sec', n=1).str[0]

In [210]:
# Preview the first rows to check the cleaned filename column.
df_adapted.head()

Unnamed: 0,begin_time,end_time,filename,call_length,label,High Freq (Hz),Channel,View,Annotation,Low Freq (Hz),Selection
0,0.156927,0.292456,180916_120609,0.135528,1,752.865,1,Spectrogram 1,,191.406,2
1,0.570645,1.383814,180916_120609,0.813169,1,382.812,1,Spectrogram 1,,63.802,3
2,1.198355,1.704802,180916_120609,0.506448,1,816.667,1,Spectrogram 1,,216.927,4
3,2.525105,4.115778,180916_120609,1.590673,1,1033.594,1,Spectrogram 1,,689.062,5
4,2.189851,6.284229,180916_120609,4.094379,1,484.896,1,Spectrogram 1,,229.687,6


##  extract background

In [211]:
# Shapes before / after non_overlap_df are displayed to show how many rows it leaves.
# NOTE(review): judging by the recorded output it merges overlapping selections
# into single rows — confirm against the helper's implementation.
df_adapted.shape
df_adapted_no_overlap = df_adapted.pipe(non_overlap_df)
df_adapted_no_overlap.shape

(296, 11)

(107, 11)

In [212]:
# Side-by-side preview: original selections vs. the non-overlapping version.
df_adapted.head()
df_adapted_no_overlap.head()

Unnamed: 0,begin_time,end_time,filename,call_length,label,High Freq (Hz),Channel,View,Annotation,Low Freq (Hz),Selection
0,0.156927,0.292456,180916_120609,0.135528,1,752.865,1,Spectrogram 1,,191.406,2
1,0.570645,1.383814,180916_120609,0.813169,1,382.812,1,Spectrogram 1,,63.802,3
2,1.198355,1.704802,180916_120609,0.506448,1,816.667,1,Spectrogram 1,,216.927,4
3,2.525105,4.115778,180916_120609,1.590673,1,1033.594,1,Spectrogram 1,,689.062,5
4,2.189851,6.284229,180916_120609,4.094379,1,484.896,1,Spectrogram 1,,229.687,6


Unnamed: 0,begin_time,end_time,filename,call_length,label,High Freq (Hz),Channel,View,Annotation,Low Freq (Hz),Selection
0,0.156927,0.292456,180916_120609,0.135528,1,752.865,1,Spectrogram 1,,191.406,2
1,0.570645,1.704802,180916_120609,1.134157,1,382.812,1,Spectrogram 1,,63.802,3
4,2.189851,6.284229,180916_120609,4.094379,1,484.896,1,Spectrogram 1,,229.687,6
5,6.312761,8.338551,180916_120609,2.02579,1,791.146,1,Spectrogram 1,,242.448,7
9,8.645273,9.223051,180916_120609,0.577778,1,778.385,1,Spectrogram 1,,165.885,11


In [213]:
# Shapes before / after bg_from_non_overlap_calls are displayed for comparison.
# NOTE(review): the recorded output suggests it adds label-0 rows covering the gaps
# between consecutive calls — confirm against the helper's implementation.
df_adapted_no_overlap.shape
df_adapted_w_bg = df_adapted_no_overlap.pipe(bg_from_non_overlap_calls)
df_adapted_w_bg.shape


(107, 11)

(213, 11)

In [214]:
# Preview the calls alone, then calls + background rows ordered by start time.
df_adapted_no_overlap.head()
df_adapted_w_bg.sort_values(by='begin_time').head()

Unnamed: 0,begin_time,end_time,filename,call_length,label,High Freq (Hz),Channel,View,Annotation,Low Freq (Hz),Selection
0,0.156927,0.292456,180916_120609,0.135528,1,752.865,1,Spectrogram 1,,191.406,2
1,0.570645,1.704802,180916_120609,1.134157,1,382.812,1,Spectrogram 1,,63.802,3
4,2.189851,6.284229,180916_120609,4.094379,1,484.896,1,Spectrogram 1,,229.687,6
5,6.312761,8.338551,180916_120609,2.02579,1,791.146,1,Spectrogram 1,,242.448,7
9,8.645273,9.223051,180916_120609,0.577778,1,778.385,1,Spectrogram 1,,165.885,11


Unnamed: 0,begin_time,end_time,filename,call_length,label,High Freq (Hz),Channel,View,Annotation,Low Freq (Hz),Selection
106,0.156927,0.292456,180916_120609,0.135528,1,752.865,1,Spectrogram 1,,191.406,2
0,0.292456,0.570645,180916_120609,0.278189,0,382.812,1,Spectrogram 1,,63.802,3
107,0.570645,1.704802,180916_120609,1.134157,1,382.812,1,Spectrogram 1,,63.802,3
1,1.704802,2.189851,180916_120609,0.485048,0,484.896,1,Spectrogram 1,,229.687,6
108,2.189851,6.284229,180916_120609,4.094379,1,484.896,1,Spectrogram 1,,229.687,6


In [215]:
# Number of rows per label value.
df_adapted_w_bg['label'].value_counts()

1    107
0    106
Name: label, dtype: int64

## export

In [216]:
# Name the export after the part of the annotation file name before its first '.'.
output_filename = f"{annotation_filename.partition('.')[0]}_adapted_w_bg_from_calls.csv"
output_full_path = os.path.join(annotation_dir_path, output_filename)
output_full_path

'/mnt/c/Mine/Code/DeepVoice/active_learning/taggers_results/180916_120609_sec_1200_to_1500_adapted_w_bg_from_calls.csv'

In [217]:
# Write the calls + background annotations of this recording, without the index column.
# NOTE(review): the file is tab-separated although its name ends in '.csv' —
# confirm that whatever reads it passes sep='\t'.
df_adapted_w_bg.to_csv(output_full_path, sep='\t', index=False)

In [218]:
# Preview of the exported frame in its stored (unsorted) row order.
df_adapted_w_bg.head()

Unnamed: 0,begin_time,end_time,filename,call_length,label,High Freq (Hz),Channel,View,Annotation,Low Freq (Hz),Selection
0,0.292456,0.570645,180916_120609,0.278189,0,382.812,1,Spectrogram 1,,63.802,3
1,1.704802,2.189851,180916_120609,0.485048,0,484.896,1,Spectrogram 1,,229.687,6
2,6.284229,6.312761,180916_120609,0.028532,0,791.146,1,Spectrogram 1,,242.448,7
3,8.338551,8.645273,180916_120609,0.306722,0,778.385,1,Spectrogram 1,,165.885,11
4,9.223051,9.89356,180916_120609,0.670509,0,829.427,1,Spectrogram 1,,204.167,12


# concat to train set annotations

In [219]:
# Load the existing training-set annotations (read with read_csv's default comma separator).
df_train = pd.read_csv(train_set_annotations_full_path)

In [220]:
# Display the training-set annotations (pandas truncates the view for long frames).
df_train

Unnamed: 0,begin_time,end_time,filename,call_length,label
0,0.786270,0.807425,180912_073707,0.021155,0
1,1.241107,2.221300,180912_073707,0.980193,1
2,2.799544,2.824225,180912_073707,0.024681,0
3,4.040651,4.090013,180912_073707,0.049362,0
4,4.414394,4.516644,180912_073707,0.102250,0
...,...,...,...,...,...
2584,380.206581,380.743225,180916_112552,0.536645,1
2585,380.923992,381.358955,180916_112552,0.434962,1
2586,381.511475,385.556081,180916_112552,4.044606,1
2587,386.092719,391.425273,180916_112552,5.332554,1


In [221]:
# Append the new annotations to the training set. join='inner' keeps only the columns
# present in both frames, so the extra Raven columns of df_adapted_w_bg are dropped;
# ignore_index=True renumbers the rows from 0.
df_train_plus_new = pd.concat([df_train, df_adapted_w_bg], join='inner', ignore_index=True)

In [222]:
# Sanity check: the combined row count should equal the sum of the two inputs,
# and the combined frame should have only the columns shared by both.
df_train_plus_new.shape
df_train.shape
df_adapted_w_bg.shape

(2802, 5)

(2589, 5)

(213, 11)

In [223]:
# Spot-check random rows of the combined set; the fixed seed keeps the displayed
# sample identical across Restart & Run All.
df_train_plus_new.sample(20, random_state=0)

Unnamed: 0,begin_time,end_time,filename,call_length,label
397,599.68048,601.11697,180912_083838,1.436491,1
373,491.006842,491.06951,180912_083838,0.062669,1
858,746.461868,746.883143,180912_092922,0.421275,0
2244,920.962251,922.319954,180912_092922,1.357704,1
185,396.677489,397.37471,180912_075945,0.697221,0
2759,156.250081,158.839383,180916_120609,2.589302,1
1862,979.573415,981.331182,180912_083838,1.757767,1
1668,371.929546,372.276668,180912_083838,0.347122,1
1299,337.777788,338.161913,180916_112552,0.384125,0
729,446.661438,446.867019,180912_092922,0.205582,0


## export

In [224]:
# Still holds the name of the per-recording export written earlier; shown for reference.
output_filename

'180916_120609_sec_1200_to_1500_adapted_w_bg_from_calls.csv'

In [225]:
# Name of the combined file: fixed prefix + the annotation file name up to its first '.'.
output_filename_w_train = f"train_annotations_plus_{annotation_filename.partition('.')[0]}.csv"
output_full_path = os.path.join(annotation_dir_path, output_filename_w_train)
output_full_path

'/mnt/c/Mine/Code/DeepVoice/active_learning/taggers_results/train_annotations_plus_180916_120609_sec_1200_to_1500.csv'

In [226]:
# Write the extended training-set annotations, without the index column.
# NOTE(review): this is written tab-separated to a '.csv' file, while the training set
# it extends is read above with read_csv's default comma separator — confirm that
# downstream readers of this file pass sep='\t'.
df_train_plus_new.to_csv(output_full_path, sep='\t', index=False)

In [227]:
# Summary statistics of the combined annotations (time ranges, call lengths, label mean).
# NOTE(review): the recorded output shows a minimum call_length of 0.0, i.e. at least one
# zero-length annotation — confirm whether such rows should be filtered out.
df_train_plus_new.describe()

Unnamed: 0,begin_time,end_time,call_length,label
count,2802.0,2802.0,2802.0,2802.0
mean,463.722047,465.050143,1.328097,0.60207
std,488.719134,488.767041,1.905055,0.489558
min,0.0,0.292456,0.0,0.0
25%,140.323203,141.454128,0.307601,0.0
50%,340.62074,342.079567,0.711085,1.0
75%,702.216507,702.892669,1.719906,1.0
max,3238.116638,3239.837327,46.153593,1.0
