In [191]:
import pandas as pd
import os
from sklearn.model_selection import train_test_split

pd.options.display.max_rows = 1000
pd.options.display.max_columns = 1000
pd.options.display.max_colwidth = None

from IPython.core.interactiveshell import InteractiveShell

InteractiveShell.ast_node_interactivity = "all"

In [122]:
from soundbay.utils.metadata_processing import load_n_adapt_raven_annotation_table_to_dv_dataset_requirements

# params

In [123]:
annotations_dir = '../datasets/2021_annotations/'
cols2drop = ['View', 'Channel', 'Low Freq (Hz)', 'High Freq (Hz)', 'Delta Time (s)', 'Delta Freq (Hz)',
       'Avg Power Density (dB FS/Hz)']

# load annotations

In [124]:
df_list = []
for filename in os.listdir(annotations_dir):
    try:
        #         print(filename)
        annotation_file_path = os.path.join(annotations_dir, filename)
        small_df = load_n_adapt_raven_annotation_table_to_dv_dataset_requirements(annotation_file_path)
        df_list.append(small_df)
    except UnicodeDecodeError:
        continue

print(len(df_list))
df_all_annotations = pd.concat(df_list)
df_all_annotations = df_all_annotations.drop(cols2drop, axis=1)
# df_all_annotations.head()

33


In [125]:
df_all_annotations.shape
df_all_annotations.head()
df_all_annotations.tail()

(14898, 6)

Unnamed: 0,Selection,begin_time,end_time,Annotation,filename,call_length
0,1,59.532117,60.647912,song (s),180913_081527 (1),1.115795
1,2,59.652743,60.937416,s,180913_081527 (1),1.284672
2,3,62.13765,63.609293,s,180913_081527 (1),1.471644
3,4,62.559843,63.434385,s,180913_081527 (1),0.874542
4,5,66.978572,68.263244,s,180913_081527 (1),1.284672


Unnamed: 0,Selection,begin_time,end_time,Annotation,filename,call_length
244,249,579.833247,580.553433,sc ? un,25-115438_Tr2,0.720186
245,250,580.662828,581.683851,,25-115438_Tr2,1.021023
246,251,581.838828,582.759572,,25-115438_Tr2,0.920744
247,252,583.115107,584.281991,,25-115438_Tr2,1.166884
248,253,584.327572,585.175386,,25-115438_Tr2,0.847814


# Fix labels

## explore

In [126]:
df_all_annotations.Annotation.unique()

array(['song (s)', 's', 'un', nan, 'un/d', 'd', 'ד', 'sc', 'd?', 'sc ?',
       'SC', '?', 'few calls in this annotation', 'un ', 'un !!! What ?',
       'un ?', 'w & un', 'sc?', '!', 'baby whale?',
       'un- weird whale sound probably', 'un d/w', 'Dolphins?', 'dolphin',
       'd / cs ?', 'sc \\ d \\ un', 'cs ?', 'un \\ d ?', 'sc ? un'],
      dtype=object)

In [127]:
df_all_annotations.Annotation.value_counts(dropna=False)

NaN                               12975
sc                                  915
un                                  449
d                                   320
s                                   142
?                                    46
sc ?                                 14
un d/w                                7
d?                                    4
dolphin                               3
Dolphins?                             3
SC                                    2
un/d                                  2
un- weird whale sound probably        1
un \ d ?                              1
cs ?                                  1
sc \ d \ un                           1
d / cs ?                              1
song (s)                              1
un !!! What ?                         1
baby whale?                           1
!                                     1
sc?                                   1
w & un                                1
un ?                                  1


For labels that appear only once - find location

In [128]:
all_unique_labels = df_all_annotations.Annotation.value_counts()[
    df_all_annotations.Annotation.value_counts() == 1].index.values

In [129]:
all_unique_labels

array(['cs ?', 'sc \\ d \\ un', 'd / cs ?',
       'un- weird whale sound probably', 'un \\ d ?', 'song (s)', 'un ?',
       'baby whale?', '!', 'sc?', 'w & un', 'un !!! What ?', 'un ',
       'few calls in this annotation', 'ד', 'sc ? un'], dtype=object)

In [130]:
df_unique_labels = df_all_annotations[df_all_annotations.Annotation.isin(all_unique_labels)].copy()

In [131]:
df_unique_labels.columns

Index(['Selection', 'begin_time', 'end_time', 'Annotation', 'filename',
       'call_length'],
      dtype='object')

In [132]:
cols2keep = ['Selection', 'begin_time', 'end_time', 'Annotation', 'filename','call_length']

In [133]:
df_unique_labels = df_unique_labels[cols2keep]

In [134]:
df_unique_labels.to_csv('unique_labels_info.csv', index=False)

## standardize whale call tags

In [135]:
df_all_annotations.Annotation.unique()

array(['song (s)', 's', 'un', nan, 'un/d', 'd', 'ד', 'sc', 'd?', 'sc ?',
       'SC', '?', 'few calls in this annotation', 'un ', 'un !!! What ?',
       'un ?', 'w & un', 'sc?', '!', 'baby whale?',
       'un- weird whale sound probably', 'un d/w', 'Dolphins?', 'dolphin',
       'd / cs ?', 'sc \\ d \\ un', 'cs ?', 'un \\ d ?', 'sc ? un'],
      dtype=object)

In [136]:
def change_annotations(df_of_annotations: pd.DataFrame, annotations_to_change: list, target_value: str) -> None:
    """change specified annotations to target value. do this inplace."""
    df_of_annotations.replace(to_replace=annotations_to_change, value=target_value, inplace=True, limit=None, regex=False)
    return

In [148]:
whale_tags_to_change = ['sc', 'SC', 'sc ?', 'un- weird whale sound probably', 'cs ?', 'baby whale?', 'song (s)', 's', 'sc?']
target_value = 'w'

df_clean = df_all_annotations.replace(to_replace=whale_tags_to_change, value=target_value)

In [149]:
df_clean.head()

Unnamed: 0,Selection,begin_time,end_time,Annotation,filename,call_length
0,1,59.532117,60.647912,w,180913_081527 (1),1.115795
1,2,59.652743,60.937416,w,180913_081527 (1),1.284672
2,3,62.13765,63.609293,w,180913_081527 (1),1.471644
3,4,62.559843,63.434385,w,180913_081527 (1),0.874542
4,5,66.978572,68.263244,w,180913_081527 (1),1.284672


In [150]:
df_clean.shape
df_clean.Annotation.value_counts(dropna=False)

(14898, 6)

NaN                             12975
w                                1078
un                                449
d                                 320
?                                  46
un d/w                              7
d?                                  4
dolphin                             3
Dolphins?                           3
un/d                                2
un \ d ?                            1
sc \ d \ un                         1
d / cs ?                            1
un !!! What ?                       1
!                                   1
w & un                              1
un ?                                1
un                                  1
few calls in this annotation        1
ד                                   1
sc ? un                             1
Name: Annotation, dtype: int64

In [151]:
df_clean['Annotation'] = df_clean.Annotation.fillna('w')


In [152]:
df_clean.shape
df_clean.Annotation.value_counts(dropna=False)

(14898, 6)

w                               14053
un                                449
d                                 320
?                                  46
un d/w                              7
d?                                  4
dolphin                             3
Dolphins?                           3
un/d                                2
un \ d ?                            1
sc \ d \ un                         1
d / cs ?                            1
un ?                                1
!                                   1
w & un                              1
un !!! What ?                       1
un                                  1
few calls in this annotation        1
ד                                   1
sc ? un                             1
Name: Annotation, dtype: int64

## filter out non-whales calls

In [153]:
df_clean = df_clean[df_clean.Annotation=='w']

In [154]:
df_clean.shape
df_clean.Annotation.value_counts(dropna=False)

(14053, 6)

w    14053
Name: Annotation, dtype: int64

# label background

Methodology: all gaps between annotated segments are considered background

In [155]:
df_clean.shape
df_clean.filename.nunique()
df_clean.head()

(14053, 6)

33

Unnamed: 0,Selection,begin_time,end_time,Annotation,filename,call_length
0,1,59.532117,60.647912,w,180913_081527 (1),1.115795
1,2,59.652743,60.937416,w,180913_081527 (1),1.284672
2,3,62.13765,63.609293,w,180913_081527 (1),1.471644
3,4,62.559843,63.434385,w,180913_081527 (1),0.874542
4,5,66.978572,68.263244,w,180913_081527 (1),1.284672


In [156]:
df = df_clean


In [157]:
# for filename in df.filename.unique():
#     print(filename)

## merge overlapping calls

In [158]:

def merge_overlapping_calls(df: pd.DataFrame) -> pd.DataFrame:
    """
    Receives an annotation dataframe with (possibly) overlapping calls, and goes through merge-and-drop iterations until
    no more overlaps are found.
    :param df: Pandas DataFrame with the following columns: ['filename', 'begin_time', 'end_time']
    :return: pd.DataFrame
    """
    df = df.sort_values(['filename', 'begin_time']).reset_index(drop=True)
    df = reset_overlap_accessory_columns(df)
    df = mark_overlapping_rows(df)

    while 1 in df.overlap.unique():
        df = merge_overlapping_rows(df)
        df = reset_overlap_accessory_columns(df)
        df = mark_overlapping_rows(df)

    df = df.drop(['overlap', 'next_begin_time', 'next_end_time'], axis=1)
    return df


def merge_overlapping_rows(df) -> pd.DataFrame:
    """
    Merge (and drop) overlapping rows.
    """
    df.loc[df.overlap == 1, 'end_time'] = df[df.overlap == 1]['next_end_time']
    df = df.drop_duplicates(subset=['filename', 'end_time'], keep='first')
    return df


def reset_overlap_accessory_columns(df) -> pd.DataFrame:
    df['overlap'] = np.NaN
    df['next_begin_time'] = df.groupby('filename').begin_time.shift(-1)
    df['next_end_time'] = df.groupby('filename').end_time.shift(-1)
    return df


def mark_overlapping_rows(df) -> pd.DataFrame:
    df.loc[df.next_begin_time < df.end_time, 'overlap'] = 1
    return df

# def mark_non_overlapping_rows(df) -> pd.DataFrame:
#     """
#     Merely for sanity check.
#     """
#     df.loc[(df.begin_time > df.end_time.shift(1)) & (df.end_time < df.begin_time.shift(-1)), 'overlap'] = 0
#     return df


In [159]:
df_no_overlap = merge_overlapping_calls(df)

In [160]:
df_no_overlap.shape

(10573, 6)

In [161]:
df.shape
df_no_overlap.shape

(14053, 6)

(10573, 6)

In [162]:
df.head()

Unnamed: 0,Selection,begin_time,end_time,Annotation,filename,call_length
0,1,59.532117,60.647912,w,180913_081527 (1),1.115795
1,2,59.652743,60.937416,w,180913_081527 (1),1.284672
2,3,62.13765,63.609293,w,180913_081527 (1),1.471644
3,4,62.559843,63.434385,w,180913_081527 (1),0.874542
4,5,66.978572,68.263244,w,180913_081527 (1),1.284672


In [163]:
df_no_overlap.sort_values(['filename', 'begin_time']).head(20)

Unnamed: 0,Selection,begin_time,end_time,Annotation,filename,call_length
0,1,59.532117,60.937416,w,180913_081527 (1),1.115795
2,3,62.13765,63.434385,w,180913_081527 (1),1.471644
4,178,64.068321,65.576152,w,180913_081527 (1),1.507831
5,5,66.978572,68.263244,w,180913_081527 (1),1.284672
6,6,68.329589,69.511729,w,180913_081527 (1),1.18214
7,7,71.271167,72.754873,w,180913_081527 (1),1.887805
9,10,74.972095,76.172329,w,180913_081527 (1),1.109764
11,12,77.059941,78.061142,w,180913_081527 (1),1.151983
13,14,80.034604,81.650999,w,180913_081527 (1),1.616395
14,15,82.201279,83.57039,w,180913_081527 (1),1.369111


## mark gaps as background

In [164]:
df_bg = df_no_overlap[['filename', 'begin_time', 'end_time']].sort_values(['filename', 'begin_time']).reset_index(drop=True).copy()


In [165]:
df_bg.head()

Unnamed: 0,filename,begin_time,end_time
0,180913_081527 (1),59.532117,60.937416
1,180913_081527 (1),62.13765,63.434385
2,180913_081527 (1),64.068321,65.576152
3,180913_081527 (1),66.978572,68.263244
4,180913_081527 (1),68.329589,69.511729


In [166]:
df_bg['next_begin_time'] = df_bg.groupby('filename').begin_time.shift(-1)
# df_bg['next_end_time'] = df_bg.groupby('filename').end_time.shift(-1)

In [167]:

df_bg = df_bg.rename({'end_time': 'bg_begin_time', 'next_begin_time': 'bg_end_time'}, axis=1)

Manually look at shifts between different recordings to make sure there aren't mix-ups:

In [168]:
df_bg.filename.unique()
df_bg.filename.value_counts(dropna=False)

array(['180913_081527 (1)', '210824-095226_Tr1.txt',
       '210824-095226_Tr2.txt', '210824-100209_Tr1.txt',
       '210824-100209_Tr2.txt', '210824-104507_Tr1',
       '210824-104507_Tr1.txt', '210824-115331_Tr1.txt',
       '210824-125439_Tr1.txt', '210825-102141_Tr1.txt',
       '210825-112937_Tr1.txt', '210825-112937_Tr2.txt',
       '210825-132034_Tr1', '210825-132034_Tr2', '210825-135601_Tr1',
       '210825-135601_Tr1.Table.1.selections (1).txt',
       '210825-135601_Tr1.txt', '210826-083608_Tr1', '210827-081513_Tr1',
       '210827-081513_Tr2', '210827-090209_Tr1',
       '210827-133618_Tr1.Table.1.selections.donetxt.txt',
       '210827-133618_Tr2', '210828-080644_Tr1', '210903-095104_Tr1',
       '210903-095104_Tr2', '210903-110841_Tr1', '210903-110841_Tr2',
       '210904-074321_Tr1', '210904-111316_Tr2(first 4 minutes)(1).txt',
       '210904-111316_Tr2(first 4 minutes).txt', '210904-111316_Tr2.txt',
       '25-115438_Tr2'], dtype=object)

210824-115331_Tr1.txt                               737
210904-074321_Tr1                                   676
210824-104507_Tr1.txt                               633
210826-083608_Tr1                                   614
210824-125439_Tr1.txt                               594
210827-081513_Tr2                                   577
210828-080644_Tr1                                   538
210825-135601_Tr1.txt                               513
210825-135601_Tr1                                   479
210825-135601_Tr1.Table.1.selections (1).txt        479
210825-102141_Tr1.txt                               443
210827-090209_Tr1                                   405
210827-081513_Tr1                                   368
210904-111316_Tr2(first 4 minutes)(1).txt           360
210904-111316_Tr2(first 4 minutes).txt              360
210904-111316_Tr2.txt                               317
210825-132034_Tr2                                   302
210825-132034_Tr1                               

In [169]:
df_bg.loc[110:130]

Unnamed: 0,filename,begin_time,bg_begin_time,bg_end_time
110,180913_081527 (1),283.24307,284.198384,286.704075
111,180913_081527 (1),286.704075,287.42056,289.842962
112,180913_081527 (1),289.842962,290.542387,292.067182
113,180913_081527 (1),292.067182,292.800726,296.314915
114,180913_081527 (1),296.314915,298.208482,300.762484
115,180913_081527 (1),300.762484,301.683679,
116,210824-095226_Tr1.txt,18.700366,19.764583,20.169273
117,210824-095226_Tr1.txt,20.169273,20.650971,40.781279
118,210824-095226_Tr1.txt,40.781279,41.599046,44.016323
119,210824-095226_Tr1.txt,44.016323,44.73327,64.40487


In [170]:
df_bg.tail()

Unnamed: 0,filename,begin_time,bg_begin_time,bg_end_time
10568,25-115438_Tr2,579.313619,580.243479,580.662828
10569,25-115438_Tr2,580.662828,581.683851,581.838828
10570,25-115438_Tr2,581.838828,582.759572,583.115107
10571,25-115438_Tr2,583.115107,584.281991,584.327572
10572,25-115438_Tr2,584.327572,585.175386,


In [171]:
df_bg[df_bg.filename=='25-115438_Tr2'].index.min()

10386

In [172]:
df_bg.loc[10900:10917]

Unnamed: 0,filename,begin_time,bg_begin_time,bg_end_time


## Finalize bg annotation df

In [173]:
df_bg.filename.nunique()

33

In [174]:
df_bg.shape
df_bg = df_bg[~df_bg.bg_end_time.isna()]
df_bg.shape

(10573, 4)

(10540, 4)

In [175]:
df_bg = df_bg.drop('begin_time', axis=1)

In [176]:
df_bg['Annotation'] = 'bg'

In [182]:
df_bg['call_length'] = df_bg['end_time'] - df_bg['begin_time']

In [183]:
df_bg

Unnamed: 0,filename,begin_time,end_time,Annotation,call_length
0,180913_081527 (1),60.937416,62.137650,bg,1.200234
1,180913_081527 (1),63.434385,64.068321,bg,0.633936
2,180913_081527 (1),65.576152,66.978572,bg,1.402420
3,180913_081527 (1),68.263244,68.329589,bg,0.066345
4,180913_081527 (1),69.511729,71.271167,bg,1.759438
...,...,...,...,...,...
10567,25-115438_Tr2,577.526285,579.313619,bg,1.787334
10568,25-115438_Tr2,580.243479,580.662828,bg,0.419349
10569,25-115438_Tr2,581.683851,581.838828,bg,0.154977
10570,25-115438_Tr2,582.759572,583.115107,bg,0.355535


# Concat both dfs together

In [184]:
df_bg = df_bg.rename({'bg_begin_time': 'begin_time', 'bg_end_time': 'end_time'}, axis=1)
df_bg.head()

Unnamed: 0,filename,begin_time,end_time,Annotation,call_length
0,180913_081527 (1),60.937416,62.13765,bg,1.200234
1,180913_081527 (1),63.434385,64.068321,bg,0.633936
2,180913_081527 (1),65.576152,66.978572,bg,1.40242
3,180913_081527 (1),68.263244,68.329589,bg,0.066345
4,180913_081527 (1),69.511729,71.271167,bg,1.759438


In [185]:
df_concat = pd.concat([df_bg, df_no_overlap[['begin_time', 'end_time', 'filename', 'call_length', 'Annotation']]])

In [186]:
df_concat = df_concat.rename({'Annotation': 'label'}, axis=1)

In [187]:
df_concat.head()

Unnamed: 0,filename,begin_time,end_time,label,call_length
0,180913_081527 (1),60.937416,62.13765,bg,1.200234
1,180913_081527 (1),63.434385,64.068321,bg,0.633936
2,180913_081527 (1),65.576152,66.978572,bg,1.40242
3,180913_081527 (1),68.263244,68.329589,bg,0.066345
4,180913_081527 (1),69.511729,71.271167,bg,1.759438


## Sanity

Sanity: do we have the same number of 'w' annotations as 'bg' ones?

In [188]:
df_concat.label.value_counts()

w     10573
bg    10540
Name: label, dtype: int64

Is the difference due to 'bg' not coming between a 'w'and the start/end of a recording?

In [189]:
df_concat.filename.nunique()

33

Yes. Good.

## Save

In [190]:
df_concat.to_csv('../datasets/mozambique_2021/2021_data_prepped_with_bg.csv', index=False)

# Split to sets

## Different ratios

### By: 60% - 20% - 20%

In [192]:
df_train, df_test = train_test_split(df_concat, test_size=0.2, random_state=1)

In [193]:
 df_train, df_val = train_test_split(df_train, test_size=0.25, random_state=1) # 0.25 x 0.8 = 0.2

In [194]:
df_train.shape
df_test.shape
df_val.shape

(12667, 5)

(4223, 5)

(4223, 5)

### By: 80% - 10% - 10%

In [209]:
df_train, df_test = train_test_split(df_concat, test_size=0.1, random_state=1)
df_train, df_val = train_test_split(df_train, test_size=0.111, random_state=1) # 0.25 x 0.9 = 0.1

In [210]:
df_train.shape
df_test.shape
df_val.shape

(16891, 5)

(2112, 5)

(2110, 5)

## Save

In [213]:
df_train.to_csv('../datasets/mozambique_2021/2021_data_prepped_with_bg_train.csv', index=False)
df_test.to_csv('../datasets/mozambique_2021/2021_data_prepped_with_bg_test.csv', index=False)
df_val.to_csv('../datasets/mozambique_2021/2021_data_prepped_with_bg_val.csv', index=False)