In [2]:
import numpy as np
import pandas as pd
import os
from sklearn.model_selection import train_test_split

pd.options.display.max_rows = 1000
pd.options.display.max_columns = 1000
pd.options.display.max_colwidth = None

from IPython.core.interactiveshell import InteractiveShell

InteractiveShell.ast_node_interactivity = "all"

In [3]:
from soundbay.utils.metadata_processing import load_n_adapt_raven_annotation_table_to_dv_dataset_requirements

In [4]:
pwd

'/home/rafael/DeepVoice/soundbay/notebooks'

# params

In [5]:
annotations_dir = '../datasets/mozambique_2021/annotation_files/'
cols2drop = ['View', 'Channel', 'Low Freq (Hz)', 'High Freq (Hz)', 'Delta Time (s)', 'Delta Freq (Hz)',
             'Avg Power Density (dB FS/Hz)']

# load annotations

In [6]:
filenames = os.listdir(annotations_dir)

In [9]:
annotation_filename_dict = {'210825-135601_Tr1.Table.1.selections.txt' : '210825-135601_Tr1',
               '210825-135601_Tr1.Table.1.selections (1).txt': 'duplicate',
               '210904-111316_Tr2.txt': '210904-111316_Tr2', 
               '210904-111316_Tr2restofrecord.txt': 'corrupt',
               '210904-111316_Tr2(first 4 minutes)(1).txt': 'duplicate', 
               '210904-111316_Tr2(first 4 minutes).txt': 'duplicate',
               '210827-133618_Tr2.Table.1.selections.txt': '210827-133618_Tr2',
               '210904-093942_Tr2.Table.1.selections.txt': '210904-093942_Tr2',
               '210828-080644_Tr1.Table.1.selections.txt': '210828-080644_Tr1',
               '210827-081513_Tr1.Table.1.selections.txt': '210827-081513_Tr1',
               '210825-132034_Tr1.Table.1.selections.txt': '210825-132034_Tr1',
               '210824-104507_Tr1.Table.1.selections.txt': '210824-104507_Tr1',
               '210824-104507_Tr1.txt': '210824-104507_Tr1',
               '210825-112937_Tr1.txt' : '210825-112937_Tr1',
               '210904-074321_Tr1.Table.1.selections.txt': '210904-074321_Tr1',
               '25-115438_Tr2.Table.1.selections.txt': 'unknown',
               '210903-110841_Tr2.Table.1.selections.txt': '210903-110841_Tr2',
               '210825-102141_Tr1.txt': '210825-102141_Tr1',
               '210903-095104_Tr2.Table.1.selections.txt': '210903-095104_Tr2',
               '210903-095104_Tr1.Table.1.selections.txt': '210903-095104_Tr1',
               '210825-135601_Tr1.txt': '210825-135601_Tr1',
               '210824-125439_Tr1.txt': '210824-125439_Tr1',
               '180913_081527 (1).Table.1.selections.txt': 'unknown',
               '210824-115331_Tr1.txt': '210824-115331_Tr1',
               '210825-112937_Tr2.txt': '210825-112937_Tr2',
               '210825-132034_Tr2.Table.1.selections.txt': '210825-132034_Tr2',
               '210827-133618_Tr1.Table.1.selections.donetxt.txt': '210827-133618_Tr1',
               '210826-083608_Tr1.Table.1.selections.txt': '210826-083608_Tr1',
               '210827-081513_Tr2.Table.1.selections.txt': '210827-081513_Tr2',
               '210824-100209_Tr1.txt': '210824-100209_Tr1',
               '210824-095226_Tr2.txt': '210824-095226_Tr2',
               '210827-090209_Tr1.Table.1.selections.txt': '210827-090209_Tr2',
               '210903-110841_Tr1.Table.1.selections.txt': '210903-110841_Tr1',
               '210824-100209_Tr2.txt': '210824-100209_Tr2',
               '210824-095226_Tr1.txt' : '210824-095226_Tr1',
               '210904-093942_Tr1.Table.1.selections- Annotated.txt': '210904-093942_Tr1',
               '210903-095104_Tr1.Table.1.selections.txt': '210903-095104_Tr1'
                           }


In [12]:
len(annotation_filename_dict)

36

In [11]:
df_list = []
print('FILENAMES:')
for filename in sorted(filenames):
    print(filename)
    try:
        #         print(filename)
        annotation_file_path = os.path.join(annotations_dir, filename)
        small_df = load_n_adapt_raven_annotation_table_to_dv_dataset_requirements(annotation_file_path, annotation_filename_dict)
        df_list.append(small_df)
    except UnicodeDecodeError:
        continue
    except Exception as e:
        continue

print(f'\nlen(df_list): {len(df_list)}')
df_all_annotations = pd.concat(df_list)
df_all_annotations = df_all_annotations.drop(cols2drop, axis=1)
# df_all_annotations.head()

print(f'\nunique filenames:\n{df_all_annotations.filename.unique()}')

FILENAMES:
210824-095226_Tr1.txt
210824-095226_Tr2.txt
210824-100209_Tr1.txt
210824-100209_Tr2.txt
210824-104507_Tr1.Table.1.selections.txt
210824-104507_Tr1.txt
210824-115331_Tr1.txt
210824-125439_Tr1.txt
210825-102141_Tr1.txt
210825-112937_Tr1.txt
210825-112937_Tr2.txt
210825-132034_Tr1.Table.1.selections.txt
210825-132034_Tr2.Table.1.selections.txt
210825-135601_Tr1.Table.1.selections.txt
210825-135601_Tr1.txt
210826-083608_Tr1.Table.1.selections.txt
210827-081513_Tr1.Table.1.selections.txt
210827-081513_Tr2.Table.1.selections.txt
210827-090209_Tr1.Table.1.selections.txt
210827-133618_Tr1.Table.1.selections.txt
210827-133618_Tr2.Table.1.selections.txt
210828-080644_Tr1.Table.1.selections.txt
210903-095104_Tr1.Table.1.selections.txt
210903-095104_Tr2.Table.1.selections.txt
210903-110841_Tr1.Table.1.selections.txt
210903-110841_Tr2.Table.1.selections.txt
210904-074321_Tr1.Table.1.selections.txt
210904-093942_Tr2.Table.1.selections.txt
210904-111316_Tr2.txt
25-115438_Tr2.Table.1.select

In [13]:
df_all_annotations.shape
df_all_annotations.head()
df_all_annotations.tail()

(13010, 7)

Unnamed: 0,Selection,begin_time,end_time,Annotation,filename,call_length,git pullSelection
0,1.0,18.700366,19.764583,,210824-095226_Tr1,1.064218,
1,2.0,20.169273,20.650971,,210824-095226_Tr1,0.481698,
2,3.0,29.632165,30.752394,un,210824-095226_Tr1,1.120229,
3,4.0,40.781279,41.599046,,210824-095226_Tr1,0.817767,
4,5.0,44.016323,44.73327,,210824-095226_Tr1,0.716947,


Unnamed: 0,Selection,begin_time,end_time,Annotation,filename,call_length,git pullSelection
244,249.0,579.833247,580.553433,sc ? un,unknown,0.720186,
245,250.0,580.662828,581.683851,,unknown,1.021023,
246,251.0,581.838828,582.759572,,unknown,0.920744,
247,252.0,583.115107,584.281991,,unknown,1.166884,
248,253.0,584.327572,585.175386,,unknown,0.847814,


In [16]:
df_all_annotations = df_all_annotations[df_all_annotations.filename != 'unknown']
df_all_annotations = df_all_annotations[df_all_annotations.filename != 'duplicate']


In [17]:
df_all_annotations.shape
df_all_annotations.head()
df_all_annotations.tail()

(12761, 7)

Unnamed: 0,Selection,begin_time,end_time,Annotation,filename,call_length,git pullSelection
0,1.0,18.700366,19.764583,,210824-095226_Tr1,1.064218,
1,2.0,20.169273,20.650971,,210824-095226_Tr1,0.481698,
2,3.0,29.632165,30.752394,un,210824-095226_Tr1,1.120229,
3,4.0,40.781279,41.599046,,210824-095226_Tr1,0.817767,
4,5.0,44.016323,44.73327,,210824-095226_Tr1,0.716947,


Unnamed: 0,Selection,begin_time,end_time,Annotation,filename,call_length,git pullSelection
423,445.0,1117.207228,1118.345822,sc,210904-111316_Tr2,1.138594,
424,446.0,1118.909808,1120.023828,sc,210904-111316_Tr2,1.11402,
425,455.0,626.065093,627.235105,,210904-111316_Tr2,1.170012,
426,456.0,1120.34131,1120.800024,sc,210904-111316_Tr2,0.458714,
427,456.0,1120.34131,1120.800024,sc,210904-111316_Tr2,0.458714,


# Fix labels

## explore

In [18]:
df_all_annotations.Annotation.unique()

array([nan, 'un', 'd', 's', 'ד', 'sc', 'd?', 'sc ?', 'SC', '?',
       'few calls in this annotation', 'un ', 'un !!! What ?', 'un ?',
       'w & un', 'sc?', 'baby whale?', 'un- weird whale sound probably',
       'un d/w', 'Dolphins?'], dtype=object)

In [19]:
df_all_annotations.Annotation.value_counts(dropna=False)

NaN                               11660
un                                  366
sc                                  330
d                                   320
s                                    45
?                                    16
un d/w                                7
d?                                    4
SC                                    2
w & un                                1
un- weird whale sound probably        1
baby whale?                           1
sc?                                   1
few calls in this annotation          1
un ?                                  1
un !!! What ?                         1
un                                    1
sc ?                                  1
ד                                     1
Dolphins?                             1
Name: Annotation, dtype: int64

For labels that appear only once - find location

In [20]:
all_unique_labels = df_all_annotations.Annotation.value_counts()[
    df_all_annotations.Annotation.value_counts() == 1].index.values

In [21]:
all_unique_labels

array(['w & un', 'un- weird whale sound probably', 'baby whale?', 'sc?',
       'few calls in this annotation', 'un ?', 'un !!! What ?', 'un ',
       'sc ?', 'ד', 'Dolphins?'], dtype=object)

In [22]:
df_unique_labels = df_all_annotations[df_all_annotations.Annotation.isin(all_unique_labels)].copy()

In [23]:
df_unique_labels.columns

Index(['Selection', 'begin_time', 'end_time', 'Annotation', 'filename',
       'call_length', 'git pullSelection'],
      dtype='object')

In [24]:
cols2keep = ['Selection', 'begin_time', 'end_time', 'Annotation', 'filename', 'call_length']

In [25]:
df_unique_labels = df_unique_labels[cols2keep]

In [26]:
df_unique_labels.to_csv('unique_labels_info.csv', index=False)

## standardize whale call tags

In [27]:
df_all_annotations.Annotation.unique()

array([nan, 'un', 'd', 's', 'ד', 'sc', 'd?', 'sc ?', 'SC', '?',
       'few calls in this annotation', 'un ', 'un !!! What ?', 'un ?',
       'w & un', 'sc?', 'baby whale?', 'un- weird whale sound probably',
       'un d/w', 'Dolphins?'], dtype=object)

In [28]:
def change_annotations(df_of_annotations: pd.DataFrame, annotations_to_change: list, target_value: str) -> None:
    """change specified annotations to target value. do this inplace."""
    df_of_annotations.replace(to_replace=annotations_to_change, value=target_value, inplace=True, limit=None,
                              regex=False)
    return

In [29]:
whale_tags_to_change = ['sc', 'SC', 'sc ?', 'un- weird whale sound probably', 'cs ?', 'baby whale?', 'song (s)', 's',
                        'sc?']
target_value = 'w'

df_clean = df_all_annotations.replace(to_replace=whale_tags_to_change, value=target_value)

In [30]:
df_clean.head()

Unnamed: 0,Selection,begin_time,end_time,Annotation,filename,call_length,git pullSelection
0,1.0,18.700366,19.764583,,210824-095226_Tr1,1.064218,
1,2.0,20.169273,20.650971,,210824-095226_Tr1,0.481698,
2,3.0,29.632165,30.752394,un,210824-095226_Tr1,1.120229,
3,4.0,40.781279,41.599046,,210824-095226_Tr1,0.817767,
4,5.0,44.016323,44.73327,,210824-095226_Tr1,0.716947,


In [31]:
df_clean.shape
df_clean.Annotation.value_counts(dropna=False)

(12761, 7)

NaN                             11660
w                                 381
un                                366
d                                 320
?                                  16
un d/w                              7
d?                                  4
ד                                   1
few calls in this annotation        1
un                                  1
un !!! What ?                       1
un ?                                1
w & un                              1
Dolphins?                           1
Name: Annotation, dtype: int64

In [32]:
df_clean['Annotation'] = df_clean.Annotation.fillna('w')


In [33]:
df_clean.shape
df_clean.Annotation.value_counts(dropna=False)

(12761, 7)

w                               12041
un                                366
d                                 320
?                                  16
un d/w                              7
d?                                  4
ד                                   1
few calls in this annotation        1
un                                  1
un !!! What ?                       1
un ?                                1
w & un                              1
Dolphins?                           1
Name: Annotation, dtype: int64

## filter out non-whales calls

In [34]:
df_clean = df_clean[df_clean.Annotation == 'w']

In [35]:
df_clean.shape
df_clean.Annotation.value_counts(dropna=False)

(12041, 7)

w    12041
Name: Annotation, dtype: int64

# label background

Methodology: all gaps between annotated segments are considered background

In [36]:
df_clean.shape
df_clean.filename.nunique()
df_clean.head()

(12041, 7)

25

Unnamed: 0,Selection,begin_time,end_time,Annotation,filename,call_length,git pullSelection
0,1.0,18.700366,19.764583,w,210824-095226_Tr1,1.064218,
1,2.0,20.169273,20.650971,w,210824-095226_Tr1,0.481698,
3,4.0,40.781279,41.599046,w,210824-095226_Tr1,0.817767,
4,5.0,44.016323,44.73327,w,210824-095226_Tr1,0.716947,
14,15.0,64.40487,64.897771,w,210824-095226_Tr1,0.492901,


In [37]:
df = df_clean


In [38]:
# for filename in df.filename.unique():
#     print(filename)

## merge overlapping calls

In [39]:

def merge_overlapping_calls(df: pd.DataFrame) -> pd.DataFrame:
    """
    Receives an annotation dataframe with (possibly) overlapping calls, and goes through merge-and-drop iterations until
    no more overlaps are found.
    :param df: Pandas DataFrame with the following columns: ['filename', 'begin_time', 'end_time']
    :return: pd.DataFrame
    """
    df = df.sort_values(['filename', 'begin_time']).reset_index(drop=True)
    df = reset_overlap_accessory_columns(df)
    df = mark_overlapping_rows(df)

    while 1 in df.overlap.unique():
        df = merge_overlapping_rows(df)
        df = reset_overlap_accessory_columns(df)
        df = mark_overlapping_rows(df)

    df = df.drop(['overlap', 'next_begin_time', 'next_end_time'], axis=1)
    return df


def merge_overlapping_rows(df) -> pd.DataFrame:
    """
    Merge (and drop) overlapping rows.
    """
    df.loc[df.overlap == 1, 'end_time'] = df[df.overlap == 1]['next_end_time']
    df = df.drop_duplicates(subset=['filename', 'end_time'], keep='first')
    return df


def reset_overlap_accessory_columns(df) -> pd.DataFrame:
    df['overlap'] = np.NaN
    df['next_begin_time'] = df.groupby('filename').begin_time.shift(-1)
    df['next_end_time'] = df.groupby('filename').end_time.shift(-1)
    return df


def mark_overlapping_rows(df) -> pd.DataFrame:
    df.loc[df.next_begin_time < df.end_time, 'overlap'] = 1
    return df

# def mark_non_overlapping_rows(df) -> pd.DataFrame:
#     """
#     Merely for sanity check.
#     """
#     df.loc[(df.begin_time > df.end_time.shift(1)) & (df.end_time < df.begin_time.shift(-1)), 'overlap'] = 0
#     return df


In [40]:
df_no_overlap = merge_overlapping_calls(df)

In [41]:
df_no_overlap.shape

(8220, 7)

In [42]:
df.shape
df_no_overlap.shape

(12041, 7)

(8220, 7)

In [43]:
df.head()

Unnamed: 0,Selection,begin_time,end_time,Annotation,filename,call_length,git pullSelection
0,1.0,18.700366,19.764583,w,210824-095226_Tr1,1.064218,
1,2.0,20.169273,20.650971,w,210824-095226_Tr1,0.481698,
3,4.0,40.781279,41.599046,w,210824-095226_Tr1,0.817767,
4,5.0,44.016323,44.73327,w,210824-095226_Tr1,0.716947,
14,15.0,64.40487,64.897771,w,210824-095226_Tr1,0.492901,


In [44]:
df_no_overlap.sort_values(['filename', 'begin_time']).head(20)

Unnamed: 0,Selection,begin_time,end_time,Annotation,filename,call_length,git pullSelection
0,1.0,18.700366,19.764583,w,210824-095226_Tr1,1.064218,
1,2.0,20.169273,20.650971,w,210824-095226_Tr1,0.481698,
2,4.0,40.781279,41.599046,w,210824-095226_Tr1,0.817767,
3,5.0,44.016323,44.73327,w,210824-095226_Tr1,0.716947,
4,15.0,64.40487,64.897771,w,210824-095226_Tr1,0.492901,
5,16.0,72.60506,73.109163,w,210824-095226_Tr1,0.504103,
6,18.0,74.447204,75.410601,w,210824-095226_Tr1,0.963397,
7,20.0,76.689479,77.294402,w,210824-095226_Tr1,0.604924,
8,21.0,77.783556,78.086018,w,210824-095226_Tr1,0.302462,
9,22.0,78.086018,78.646132,w,210824-095226_Tr1,0.560114,


## mark gaps as background

In [45]:
df_bg = df_no_overlap[['filename', 'begin_time', 'end_time']].sort_values(['filename', 'begin_time']).reset_index(
    drop=True).copy()


In [46]:
df_bg.head()

Unnamed: 0,filename,begin_time,end_time
0,210824-095226_Tr1,18.700366,19.764583
1,210824-095226_Tr1,20.169273,20.650971
2,210824-095226_Tr1,40.781279,41.599046
3,210824-095226_Tr1,44.016323,44.73327
4,210824-095226_Tr1,64.40487,64.897771


In [47]:
df_bg['next_begin_time'] = df_bg.groupby('filename').begin_time.shift(-1)
# df_bg['next_end_time'] = df_bg.groupby('filename').end_time.shift(-1)

In [48]:

df_bg = df_bg.rename({'end_time': 'bg_begin_time', 'next_begin_time': 'bg_end_time'}, axis=1)

Manually look at shifts between different recordings to make sure there aren't mix-ups:

In [49]:
df_bg.filename.unique()
df_bg.filename.value_counts(dropna=False)

array(['210824-095226_Tr1', '210824-095226_Tr2', '210824-100209_Tr1',
       '210824-100209_Tr2', '210824-104507_Tr1', '210824-115331_Tr1',
       '210824-125439_Tr1', '210825-102141_Tr1', '210825-112937_Tr1',
       '210825-112937_Tr2', '210825-132034_Tr1', '210825-132034_Tr2',
       '210825-135601_Tr1', '210826-083608_Tr1', '210827-081513_Tr1',
       '210827-081513_Tr2', '210827-090209_Tr2', '210827-133618_Tr2',
       '210828-080644_Tr1', '210903-095104_Tr1', '210903-095104_Tr2',
       '210903-110841_Tr1', '210903-110841_Tr2', '210904-074321_Tr1',
       '210904-111316_Tr2'], dtype=object)

210824-115331_Tr1    737
210904-074321_Tr1    676
210824-104507_Tr1    636
210826-083608_Tr1    614
210824-125439_Tr1    594
210827-081513_Tr2    577
210825-135601_Tr1    551
210828-080644_Tr1    538
210825-102141_Tr1    443
210827-090209_Tr2    405
210827-081513_Tr1    368
210904-111316_Tr2    317
210825-132034_Tr2    302
210825-132034_Tr1    264
210903-110841_Tr2    223
210825-112937_Tr2    214
210825-112937_Tr1    212
210903-110841_Tr1    194
210824-100209_Tr1     81
210824-095226_Tr1     67
210827-133618_Tr2     64
210824-095226_Tr2     55
210824-100209_Tr2     44
210903-095104_Tr1     25
210903-095104_Tr2     19
Name: filename, dtype: int64

In [50]:
df_bg.loc[110:130]

Unnamed: 0,filename,begin_time,bg_begin_time,bg_end_time
110,210824-095226_Tr2,409.207281,409.8595,416.672204
111,210824-095226_Tr2,416.672204,417.152786,419.143771
112,210824-095226_Tr2,419.143771,419.693008,422.88701
113,210824-095226_Tr2,422.88701,423.459132,427.95485
114,210824-095226_Tr2,427.95485,428.33245,431.479121
115,210824-095226_Tr2,431.479121,431.925376,445.233505
116,210824-095226_Tr2,445.233505,445.828512,449.569814
117,210824-095226_Tr2,449.569814,450.050396,454.334522
118,210824-095226_Tr2,454.334522,454.860874,463.656413
119,210824-095226_Tr2,463.656413,464.285747,483.425449


In [51]:
df_bg.tail()

Unnamed: 0,filename,begin_time,bg_begin_time,bg_end_time
8215,210904-111316_Tr2,1115.282497,1115.683872,1116.437244
8216,210904-111316_Tr2,1116.437244,1116.928723,1117.207228
8217,210904-111316_Tr2,1117.207228,1118.345822,1118.909808
8218,210904-111316_Tr2,1118.909808,1120.023828,1120.34131
8219,210904-111316_Tr2,1120.34131,1120.800024,


In [52]:
df_bg[df_bg.filename == '25-115438_Tr2'].index.min()

nan

In [53]:
df_bg.loc[10900:10917]

Unnamed: 0,filename,begin_time,bg_begin_time,bg_end_time


## Finalize bg annotation df

In [54]:
df_bg.filename.nunique()

25

In [55]:
df_bg.shape
df_bg = df_bg[~df_bg.bg_end_time.isna()]
df_bg.shape

(8220, 4)

(8195, 4)

In [56]:
df_bg = df_bg.drop('begin_time', axis=1)

In [57]:
df_bg['Annotation'] = 'bg'

In [58]:
df_bg['call_length'] = df_bg['bg_end_time'] - df_bg['bg_begin_time']

# Concat both dfs together

In [59]:
df_bg = df_bg.rename({'bg_begin_time': 'begin_time', 'bg_end_time': 'end_time'}, axis=1)
df_bg.head()

Unnamed: 0,filename,begin_time,end_time,Annotation,call_length
0,210824-095226_Tr1,19.764583,20.169273,bg,0.40469
1,210824-095226_Tr1,20.650971,40.781279,bg,20.130308
2,210824-095226_Tr1,41.599046,44.016323,bg,2.417277
3,210824-095226_Tr1,44.73327,64.40487,bg,19.6716
4,210824-095226_Tr1,64.897771,72.60506,bg,7.70729


In [60]:
df_concat = pd.concat([df_bg, df_no_overlap[['begin_time', 'end_time', 'filename', 'call_length', 'Annotation']]])

In [61]:
df_concat = df_concat.rename({'Annotation': 'label'}, axis=1)

In [62]:
df_concat.head()

Unnamed: 0,filename,begin_time,end_time,label,call_length
0,210824-095226_Tr1,19.764583,20.169273,bg,0.40469
1,210824-095226_Tr1,20.650971,40.781279,bg,20.130308
2,210824-095226_Tr1,41.599046,44.016323,bg,2.417277
3,210824-095226_Tr1,44.73327,64.40487,bg,19.6716
4,210824-095226_Tr1,64.897771,72.60506,bg,7.70729


## Sanity

Sanity: do we have the same number of 'w' annotations as 'bg' ones?

In [63]:
df_concat.label.value_counts()

w     8220
bg    8195
Name: label, dtype: int64

Is the difference due to 'bg' not coming between a 'w'and the start/end of a recording?

In [64]:
df_concat.filename.nunique()

25

In [65]:
df_concat.label.value_counts()[0]-df_concat.label.value_counts()[1]

25

Yes. Good.

## Save

In [66]:
df_concat.to_csv('../datasets/mozambique_2021/2021_data_prepped_with_bg.csv', index=False)

# Split to sets

Preferred DV ratio: 70-20-10

## Explore different recordings

In [67]:
df_concat.head()

Unnamed: 0,filename,begin_time,end_time,label,call_length
0,210824-095226_Tr1,19.764583,20.169273,bg,0.40469
1,210824-095226_Tr1,20.650971,40.781279,bg,20.130308
2,210824-095226_Tr1,41.599046,44.016323,bg,2.417277
3,210824-095226_Tr1,44.73327,64.40487,bg,19.6716
4,210824-095226_Tr1,64.897771,72.60506,bg,7.70729


In [68]:
df_meta = df_concat[df_concat.label=='w'].groupby('filename').agg(
    count_calls = pd.NamedAgg('begin_time', 'count'),
    sum_call_length = pd.NamedAgg('call_length', 'sum'),
    avg_call_length = pd.NamedAgg('call_length', 'mean'),
).sort_values('filename')



In [69]:
df_meta['cum_call_length'] = df_meta.sum_call_length.cumsum()
df_meta['cum_perc'] = 100*df_meta['cum_call_length']/(df_meta['sum_call_length'].sum())

In [70]:
df_meta

Unnamed: 0_level_0,count_calls,sum_call_length,avg_call_length,cum_call_length,cum_perc
filename,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
210824-095226_Tr1,67,41.941372,0.625991,41.941372,0.654544
210824-095226_Tr2,55,37.918654,0.68943,79.860026,1.246308
210824-100209_Tr1,81,38.09241,0.470277,117.952436,1.840784
210824-100209_Tr2,44,18.947215,0.430619,136.899651,2.136477
210824-104507_Tr1,636,352.458521,0.55418,489.358172,7.637
210824-115331_Tr1,737,410.209489,0.556594,899.567661,14.038793
210824-125439_Tr1,594,466.242653,0.78492,1365.810314,21.315049
210825-102141_Tr1,443,359.170015,0.810768,1724.980329,26.920312
210825-112937_Tr1,212,182.875824,0.862622,1907.856154,29.774301
210825-112937_Tr2,214,185.013619,0.86455,2092.869773,32.661652


## Assign recordings to sets

Preferred DV ratio: 70-20-10

In [71]:
train_files = df_meta.loc[df_meta['cum_perc']<70].index.tolist()
# train_files.append('25-115438_Tr2') #to get closer to 70%
train_files

['210824-095226_Tr1',
 '210824-095226_Tr2',
 '210824-100209_Tr1',
 '210824-100209_Tr2',
 '210824-104507_Tr1',
 '210824-115331_Tr1',
 '210824-125439_Tr1',
 '210825-102141_Tr1',
 '210825-112937_Tr1',
 '210825-112937_Tr2',
 '210825-132034_Tr1',
 '210825-132034_Tr2',
 '210825-135601_Tr1',
 '210826-083608_Tr1',
 '210827-081513_Tr1',
 '210827-081513_Tr2',
 '210827-090209_Tr2']

In [72]:
val_files = df_meta.loc[((df_meta['cum_perc']>70) & (df_meta['cum_perc']<90))].index.tolist()
val_files

['210827-133618_Tr2',
 '210828-080644_Tr1',
 '210903-095104_Tr1',
 '210903-095104_Tr2',
 '210903-110841_Tr1',
 '210903-110841_Tr2']

In [73]:
test_files = df_meta.loc[(df_meta['cum_perc']>90)].index.tolist()
test_files

['210904-074321_Tr1', '210904-111316_Tr2']

### validate split

In [74]:
df_meta.loc[train_files]

Unnamed: 0_level_0,count_calls,sum_call_length,avg_call_length,cum_call_length,cum_perc
filename,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
210824-095226_Tr1,67,41.941372,0.625991,41.941372,0.654544
210824-095226_Tr2,55,37.918654,0.68943,79.860026,1.246308
210824-100209_Tr1,81,38.09241,0.470277,117.952436,1.840784
210824-100209_Tr2,44,18.947215,0.430619,136.899651,2.136477
210824-104507_Tr1,636,352.458521,0.55418,489.358172,7.637
210824-115331_Tr1,737,410.209489,0.556594,899.567661,14.038793
210824-125439_Tr1,594,466.242653,0.78492,1365.810314,21.315049
210825-102141_Tr1,443,359.170015,0.810768,1724.980329,26.920312
210825-112937_Tr1,212,182.875824,0.862622,1907.856154,29.774301
210825-112937_Tr2,214,185.013619,0.86455,2092.869773,32.661652


In [75]:
call_length_total = df_meta.sum_call_length.sum()

In [76]:
for files in [train_files, test_files, val_files]:
#     print(files)
    np.round(df_meta.loc[files].sum_call_length.sum() / call_length_total*100)

69.0

15.0

15.0

## Actually split

In [77]:
df_train = df_concat[df_concat.filename.isin(train_files)]
df_val = df_concat[df_concat.filename.isin(val_files)]
df_test = df_concat[df_concat.filename.isin(test_files)]

In [78]:
df_train.shape
df_val.shape
df_test.shape

(12311, 5)

(2120, 5)

(1984, 5)

## Save

In [79]:
df_train.to_csv('../datasets/mozambique_2021/2021_data_prepped_with_bg_train.csv', index=False)
df_test.to_csv('../datasets/mozambique_2021/2021_data_prepped_with_bg_test.csv', index=False)
df_val.to_csv('../datasets/mozambique_2021/2021_data_prepped_with_bg_val.csv', index=False)