In [30]:
import numpy as np
import pandas as pd
import os
from sklearn.model_selection import train_test_split

pd.options.display.max_rows = 1000
pd.options.display.max_columns = 1000
pd.options.display.max_colwidth = None

from IPython.core.interactiveshell import InteractiveShell

InteractiveShell.ast_node_interactivity = "all"

In [2]:
from soundbay.utils.metadata_processing import load_n_adapt_raven_annotation_table_to_dv_dataset_requirements

# params

In [3]:
annotations_dir = '../datasets/mozambique_2021/annotation_files/'
cols2drop = ['View', 'Channel', 'Low Freq (Hz)', 'High Freq (Hz)', 'Delta Time (s)', 'Delta Freq (Hz)',
             'Avg Power Density (dB FS/Hz)']

# load annotations

In [4]:
filenames = os.listdir(annotations_dir)

In [5]:
df_list = []
print('FILENAMES:')
for filename in sorted(filenames):
    print(filename)
    try:
        #         print(filename)
        annotation_file_path = os.path.join(annotations_dir, filename)
        small_df = load_n_adapt_raven_annotation_table_to_dv_dataset_requirements(annotation_file_path)
        df_list.append(small_df)
    except UnicodeDecodeError:
        continue

print(f'\nlen(df_list): {len(df_list)}')
df_all_annotations = pd.concat(df_list)
df_all_annotations = df_all_annotations.drop(cols2drop, axis=1)
# df_all_annotations.head()

print(f'\nunique filenames:\n{df_all_annotations.filename.unique()}')

FILENAMES:
210824-095226_Tr1.txt
210824-100209_Tr1.txt
210824-104507_Tr1.Table.1.selections.txt
210824-115331_Tr1.txt
210824-125439_Tr1.txt
210825-102141_Tr1.txt
210825-112937_Tr1.txt
210825-132034_Tr1.Table.1.selections.txt
210825-135601_Tr1.Table.1.selections.txt
210826-083608_Tr1.Table.1.selections.txt
210827-081513_Tr1.Table.1.selections.txt
210827-090209_Tr1.Table.1.selections.txt
210827-133618_Tr1.Table.1.selections.txt
210828-080644_Tr1.Table.1.selections.txt
210903-095104_Tr1.Table.1.selections.txt
210903-110841_Tr1.Table.1.selections.txt
210904-074321_Tr1.Table.1.selections.txt
210904-093942_Tr2.Table.1.selections.txt
210904-111316_Tr2.txt
25-115438_Tr2.Table.1.selections.txt

len(df_list): 19

unique filenames:
['210824-095226_Tr1.txt' '210824-100209_Tr1.txt' '210824-104507_Tr1'
 '210824-115331_Tr1.txt' '210824-125439_Tr1.txt' '210825-102141_Tr1.txt'
 '210825-112937_Tr1.txt' '210825-132034_Tr1' '210825-135601_Tr1'
 '210826-083608_Tr1' '210827-081513_Tr1' '210827-090209_Tr1'
 

In [6]:
df_all_annotations.shape
df_all_annotations.head()
df_all_annotations.tail()

(9902, 6)

Unnamed: 0,Selection,begin_time,end_time,Annotation,filename,call_length
0,1,18.700366,19.764583,,210824-095226_Tr1.txt,1.064218
1,2,20.169273,20.650971,,210824-095226_Tr1.txt,0.481698
2,3,29.632165,30.752394,un,210824-095226_Tr1.txt,1.120229
3,4,40.781279,41.599046,,210824-095226_Tr1.txt,0.817767
4,5,44.016323,44.73327,,210824-095226_Tr1.txt,0.716947


Unnamed: 0,Selection,begin_time,end_time,Annotation,filename,call_length
244,249,579.833247,580.553433,sc ? un,25-115438_Tr2,0.720186
245,250,580.662828,581.683851,,25-115438_Tr2,1.021023
246,251,581.838828,582.759572,,25-115438_Tr2,0.920744
247,252,583.115107,584.281991,,25-115438_Tr2,1.166884
248,253,584.327572,585.175386,,25-115438_Tr2,0.847814


# Fix labels

## explore

In [7]:
df_all_annotations.Annotation.unique()

array([nan, 'un', 'd', 's', 'ד', 'sc', 'd?', 'sc ?', 'SC',
       'few calls in this annotation', 'un ', 'un !!! What ?', 'sc?', '!',
       'un- weird whale sound probably', 'un d/w', '?', 'Dolphins?',
       'dolphin', 'd / cs ?', 'sc \\ d \\ un', 'cs ?', 'un \\ d ?',
       'sc ? un'], dtype=object)

In [8]:
df_all_annotations.Annotation.value_counts(dropna=False)

NaN                               8905
sc                                 387
un                                 332
d                                  175
s                                   45
?                                   15
sc ?                                14
un d/w                               7
d?                                   4
dolphin                              3
SC                                   2
un \ d ?                             1
cs ?                                 1
sc \ d \ un                          1
d / cs ?                             1
Dolphins?                            1
sc?                                  1
un- weird whale sound probably       1
!                                    1
un !!! What ?                        1
un                                   1
few calls in this annotation         1
ד                                    1
sc ? un                              1
Name: Annotation, dtype: int64

For labels that appear only once - find location

In [9]:
all_unique_labels = df_all_annotations.Annotation.value_counts()[
    df_all_annotations.Annotation.value_counts() == 1].index.values

In [10]:
all_unique_labels

array(['un \\ d ?', 'cs ?', 'sc \\ d \\ un', 'd / cs ?', 'Dolphins?',
       'sc?', 'un- weird whale sound probably', '!', 'un !!! What ?',
       'un ', 'few calls in this annotation', 'ד', 'sc ? un'],
      dtype=object)

In [11]:
df_unique_labels = df_all_annotations[df_all_annotations.Annotation.isin(all_unique_labels)].copy()

In [12]:
df_unique_labels.columns

Index(['Selection', 'begin_time', 'end_time', 'Annotation', 'filename',
       'call_length'],
      dtype='object')

In [13]:
cols2keep = ['Selection', 'begin_time', 'end_time', 'Annotation', 'filename', 'call_length']

In [14]:
df_unique_labels = df_unique_labels[cols2keep]

In [15]:
df_unique_labels.to_csv('unique_labels_info.csv', index=False)

## standardize whale call tags

In [16]:
df_all_annotations.Annotation.unique()

array([nan, 'un', 'd', 's', 'ד', 'sc', 'd?', 'sc ?', 'SC',
       'few calls in this annotation', 'un ', 'un !!! What ?', 'sc?', '!',
       'un- weird whale sound probably', 'un d/w', '?', 'Dolphins?',
       'dolphin', 'd / cs ?', 'sc \\ d \\ un', 'cs ?', 'un \\ d ?',
       'sc ? un'], dtype=object)

In [17]:
def change_annotations(df_of_annotations: pd.DataFrame, annotations_to_change: list, target_value: str) -> None:
    """change specified annotations to target value. do this inplace."""
    df_of_annotations.replace(to_replace=annotations_to_change, value=target_value, inplace=True, limit=None,
                              regex=False)
    return

In [18]:
whale_tags_to_change = ['sc', 'SC', 'sc ?', 'un- weird whale sound probably', 'cs ?', 'baby whale?', 'song (s)', 's',
                        'sc?']
target_value = 'w'

df_clean = df_all_annotations.replace(to_replace=whale_tags_to_change, value=target_value)

In [19]:
df_clean.head()

Unnamed: 0,Selection,begin_time,end_time,Annotation,filename,call_length
0,1,18.700366,19.764583,,210824-095226_Tr1.txt,1.064218
1,2,20.169273,20.650971,,210824-095226_Tr1.txt,0.481698
2,3,29.632165,30.752394,un,210824-095226_Tr1.txt,1.120229
3,4,40.781279,41.599046,,210824-095226_Tr1.txt,0.817767
4,5,44.016323,44.73327,,210824-095226_Tr1.txt,0.716947


In [20]:
df_clean.shape
df_clean.Annotation.value_counts(dropna=False)

(9902, 6)

NaN                             8905
w                                451
un                               332
d                                175
?                                 15
un d/w                             7
d?                                 4
dolphin                            3
Dolphins?                          1
un \ d ?                           1
sc \ d \ un                        1
d / cs ?                           1
!                                  1
un !!! What ?                      1
un                                 1
few calls in this annotation       1
ד                                  1
sc ? un                            1
Name: Annotation, dtype: int64

In [21]:
df_clean['Annotation'] = df_clean.Annotation.fillna('w')


In [22]:
df_clean.shape
df_clean.Annotation.value_counts(dropna=False)

(9902, 6)

w                               9356
un                               332
d                                175
?                                 15
un d/w                             7
d?                                 4
dolphin                            3
Dolphins?                          1
un \ d ?                           1
sc \ d \ un                        1
d / cs ?                           1
!                                  1
un !!! What ?                      1
un                                 1
few calls in this annotation       1
ד                                  1
sc ? un                            1
Name: Annotation, dtype: int64

## filter out non-whales calls

In [23]:
df_clean = df_clean[df_clean.Annotation == 'w']

In [24]:
df_clean.shape
df_clean.Annotation.value_counts(dropna=False)

(9356, 6)

w    9356
Name: Annotation, dtype: int64

# label background

Methodology: all gaps between annotated segments are considered background

In [25]:
df_clean.shape
df_clean.filename.nunique()
df_clean.head()

(9356, 6)

19

Unnamed: 0,Selection,begin_time,end_time,Annotation,filename,call_length
0,1,18.700366,19.764583,w,210824-095226_Tr1.txt,1.064218
1,2,20.169273,20.650971,w,210824-095226_Tr1.txt,0.481698
3,4,40.781279,41.599046,w,210824-095226_Tr1.txt,0.817767
4,5,44.016323,44.73327,w,210824-095226_Tr1.txt,0.716947
14,15,64.40487,64.897771,w,210824-095226_Tr1.txt,0.492901


In [26]:
df = df_clean


In [27]:
# for filename in df.filename.unique():
#     print(filename)

## merge overlapping calls

In [28]:

def merge_overlapping_calls(df: pd.DataFrame) -> pd.DataFrame:
    """
    Receives an annotation dataframe with (possibly) overlapping calls, and goes through merge-and-drop iterations until
    no more overlaps are found.
    :param df: Pandas DataFrame with the following columns: ['filename', 'begin_time', 'end_time']
    :return: pd.DataFrame
    """
    df = df.sort_values(['filename', 'begin_time']).reset_index(drop=True)
    df = reset_overlap_accessory_columns(df)
    df = mark_overlapping_rows(df)

    while 1 in df.overlap.unique():
        df = merge_overlapping_rows(df)
        df = reset_overlap_accessory_columns(df)
        df = mark_overlapping_rows(df)

    df = df.drop(['overlap', 'next_begin_time', 'next_end_time'], axis=1)
    return df


def merge_overlapping_rows(df) -> pd.DataFrame:
    """
    Merge (and drop) overlapping rows.
    """
    df.loc[df.overlap == 1, 'end_time'] = df[df.overlap == 1]['next_end_time']
    df = df.drop_duplicates(subset=['filename', 'end_time'], keep='first')
    return df


def reset_overlap_accessory_columns(df) -> pd.DataFrame:
    df['overlap'] = np.NaN
    df['next_begin_time'] = df.groupby('filename').begin_time.shift(-1)
    df['next_end_time'] = df.groupby('filename').end_time.shift(-1)
    return df


def mark_overlapping_rows(df) -> pd.DataFrame:
    df.loc[df.next_begin_time < df.end_time, 'overlap'] = 1
    return df

# def mark_non_overlapping_rows(df) -> pd.DataFrame:
#     """
#     Merely for sanity check.
#     """
#     df.loc[(df.begin_time > df.end_time.shift(1)) & (df.end_time < df.begin_time.shift(-1)), 'overlap'] = 0
#     return df


In [31]:
df_no_overlap = merge_overlapping_calls(df)

In [32]:
df_no_overlap.shape

(6657, 6)

In [33]:
df.shape
df_no_overlap.shape

(9356, 6)

(6657, 6)

In [34]:
df.head()

Unnamed: 0,Selection,begin_time,end_time,Annotation,filename,call_length
0,1,18.700366,19.764583,w,210824-095226_Tr1.txt,1.064218
1,2,20.169273,20.650971,w,210824-095226_Tr1.txt,0.481698
3,4,40.781279,41.599046,w,210824-095226_Tr1.txt,0.817767
4,5,44.016323,44.73327,w,210824-095226_Tr1.txt,0.716947
14,15,64.40487,64.897771,w,210824-095226_Tr1.txt,0.492901


In [35]:
df_no_overlap.sort_values(['filename', 'begin_time']).head(20)

Unnamed: 0,Selection,begin_time,end_time,Annotation,filename,call_length
0,1,18.700366,19.764583,w,210824-095226_Tr1.txt,1.064218
1,2,20.169273,20.650971,w,210824-095226_Tr1.txt,0.481698
2,4,40.781279,41.599046,w,210824-095226_Tr1.txt,0.817767
3,5,44.016323,44.73327,w,210824-095226_Tr1.txt,0.716947
4,15,64.40487,64.897771,w,210824-095226_Tr1.txt,0.492901
5,16,72.60506,73.109163,w,210824-095226_Tr1.txt,0.504103
6,18,74.447204,75.410601,w,210824-095226_Tr1.txt,0.963397
7,20,76.689479,77.294402,w,210824-095226_Tr1.txt,0.604924
8,21,77.783556,78.086018,w,210824-095226_Tr1.txt,0.302462
9,22,78.086018,78.646132,w,210824-095226_Tr1.txt,0.560114


## mark gaps as background

In [36]:
df_bg = df_no_overlap[['filename', 'begin_time', 'end_time']].sort_values(['filename', 'begin_time']).reset_index(
    drop=True).copy()


In [37]:
df_bg.head()

Unnamed: 0,filename,begin_time,end_time
0,210824-095226_Tr1.txt,18.700366,19.764583
1,210824-095226_Tr1.txt,20.169273,20.650971
2,210824-095226_Tr1.txt,40.781279,41.599046
3,210824-095226_Tr1.txt,44.016323,44.73327
4,210824-095226_Tr1.txt,64.40487,64.897771


In [38]:
df_bg['next_begin_time'] = df_bg.groupby('filename').begin_time.shift(-1)
# df_bg['next_end_time'] = df_bg.groupby('filename').end_time.shift(-1)

In [39]:

df_bg = df_bg.rename({'end_time': 'bg_begin_time', 'next_begin_time': 'bg_end_time'}, axis=1)

Manually look at shifts between different recordings to make sure there aren't mix-ups:

In [40]:
df_bg.filename.unique()
df_bg.filename.value_counts(dropna=False)

array(['210824-095226_Tr1.txt', '210824-100209_Tr1.txt',
       '210824-104507_Tr1', '210824-115331_Tr1.txt',
       '210824-125439_Tr1.txt', '210825-102141_Tr1.txt',
       '210825-112937_Tr1.txt', '210825-132034_Tr1', '210825-135601_Tr1',
       '210826-083608_Tr1', '210827-081513_Tr1', '210827-090209_Tr1',
       '210827-133618_Tr1', '210828-080644_Tr1', '210903-095104_Tr1',
       '210903-110841_Tr1', '210904-074321_Tr1', '210904-111316_Tr2.txt',
       '25-115438_Tr2'], dtype=object)

210824-115331_Tr1.txt    737
210904-074321_Tr1        676
210826-083608_Tr1        614
210824-125439_Tr1.txt    594
210828-080644_Tr1        538
210825-135601_Tr1        479
210825-102141_Tr1.txt    443
210827-090209_Tr1        405
210827-081513_Tr1        368
210904-111316_Tr2.txt    360
210825-132034_Tr1        264
210824-104507_Tr1        233
210825-112937_Tr1.txt    212
210903-110841_Tr1        194
25-115438_Tr2            187
210827-133618_Tr1        180
210824-100209_Tr1.txt     81
210824-095226_Tr1.txt     67
210903-095104_Tr1         25
Name: filename, dtype: int64

In [41]:
df_bg.loc[110:130]

Unnamed: 0,filename,begin_time,bg_begin_time,bg_end_time
110,210824-100209_Tr1.txt,164.754418,165.070299,166.166312
111,210824-100209_Tr1.txt,166.166312,166.740641,167.209677
112,210824-100209_Tr1.txt,167.209677,167.563847,170.582758
113,210824-100209_Tr1.txt,170.582758,170.908211,171.147515
114,210824-100209_Tr1.txt,171.147515,171.755348,172.940627
115,210824-100209_Tr1.txt,172.940627,173.266081,174.807685
116,210824-100209_Tr1.txt,174.807685,175.185785,176.515916
117,210824-100209_Tr1.txt,176.515916,176.989738,177.559281
118,210824-100209_Tr1.txt,177.559281,177.808157,178.253263
119,210824-100209_Tr1.txt,178.253263,178.765373,179.0239


In [42]:
df_bg.tail()

Unnamed: 0,filename,begin_time,bg_begin_time,bg_end_time
6652,25-115438_Tr2,579.313619,580.243479,580.662828
6653,25-115438_Tr2,580.662828,581.683851,581.838828
6654,25-115438_Tr2,581.838828,582.759572,583.115107
6655,25-115438_Tr2,583.115107,584.281991,584.327572
6656,25-115438_Tr2,584.327572,585.175386,


In [43]:
df_bg[df_bg.filename == '25-115438_Tr2'].index.min()

6470

In [44]:
df_bg.loc[10900:10917]

Unnamed: 0,filename,begin_time,bg_begin_time,bg_end_time


## Finalize bg annotation df

In [45]:
df_bg.filename.nunique()

19

In [46]:
df_bg.shape
df_bg = df_bg[~df_bg.bg_end_time.isna()]
df_bg.shape

(6657, 4)

(6638, 4)

In [47]:
df_bg = df_bg.drop('begin_time', axis=1)

In [50]:
df_bg['Annotation'] = 'bg'

In [51]:
df_bg['call_length'] = df_bg['bg_end_time'] - df_bg['bg_begin_time']

# Concat both dfs together

In [52]:
df_bg = df_bg.rename({'bg_begin_time': 'begin_time', 'bg_end_time': 'end_time'}, axis=1)
df_bg.head()

Unnamed: 0,filename,begin_time,end_time,Annotation,call_length
0,210824-095226_Tr1.txt,19.764583,20.169273,bg,0.40469
1,210824-095226_Tr1.txt,20.650971,40.781279,bg,20.130308
2,210824-095226_Tr1.txt,41.599046,44.016323,bg,2.417277
3,210824-095226_Tr1.txt,44.73327,64.40487,bg,19.6716
4,210824-095226_Tr1.txt,64.897771,72.60506,bg,7.70729


In [53]:
df_concat = pd.concat([df_bg, df_no_overlap[['begin_time', 'end_time', 'filename', 'call_length', 'Annotation']]])

In [54]:
df_concat = df_concat.rename({'Annotation': 'label'}, axis=1)

In [55]:
df_concat.head()

Unnamed: 0,filename,begin_time,end_time,label,call_length
0,210824-095226_Tr1.txt,19.764583,20.169273,bg,0.40469
1,210824-095226_Tr1.txt,20.650971,40.781279,bg,20.130308
2,210824-095226_Tr1.txt,41.599046,44.016323,bg,2.417277
3,210824-095226_Tr1.txt,44.73327,64.40487,bg,19.6716
4,210824-095226_Tr1.txt,64.897771,72.60506,bg,7.70729


## Sanity

Sanity: do we have the same number of 'w' annotations as 'bg' ones?

In [56]:
df_concat.label.value_counts()

w     6657
bg    6638
Name: label, dtype: int64

Is the difference due to 'bg' not coming between a 'w'and the start/end of a recording?

In [57]:
df_concat.filename.nunique()

19

In [58]:
df_concat.label.value_counts()[0]-df_concat.label.value_counts()[1]

19

Yes. Good.

## Save

In [59]:
df_concat.to_csv('../datasets/mozambique_2021/2021_data_prepped_with_bg.csv', index=False)

# Split to sets

Preferred DV ratio: 70-20-10

## Explore different recordings

In [60]:
df_concat.head()

Unnamed: 0,filename,begin_time,end_time,label,call_length
0,210824-095226_Tr1.txt,19.764583,20.169273,bg,0.40469
1,210824-095226_Tr1.txt,20.650971,40.781279,bg,20.130308
2,210824-095226_Tr1.txt,41.599046,44.016323,bg,2.417277
3,210824-095226_Tr1.txt,44.73327,64.40487,bg,19.6716
4,210824-095226_Tr1.txt,64.897771,72.60506,bg,7.70729


In [80]:
df_meta = df_concat[df_concat.label=='w'].groupby('filename').agg(
    count_calls = pd.NamedAgg('begin_time', 'count'),
    sum_call_length = pd.NamedAgg('call_length', 'sum'),
    avg_call_length = pd.NamedAgg('call_length', 'mean'),
).sort_values('filename')



In [81]:
df_meta['cum_call_length'] = df_meta.sum_call_length.cumsum()
df_meta['cum_perc'] = 100*df_meta['cum_call_length']/(df_meta['sum_call_length'].sum())

In [123]:
df_meta

Unnamed: 0_level_0,count_calls,sum_call_length,avg_call_length,cum_call_length,cum_perc
filename,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
210824-095226_Tr1.txt,67,41.941372,0.625991,41.941372,0.770581
210824-100209_Tr1.txt,81,38.09241,0.470277,80.033782,1.470447
210824-104507_Tr1,233,161.62449,0.693667,241.658272,4.439945
210824-115331_Tr1.txt,737,410.209489,0.556594,651.867761,11.976652
210824-125439_Tr1.txt,594,466.242653,0.78492,1118.110414,20.542846
210825-102141_Tr1.txt,443,359.170015,0.810768,1477.280429,27.141813
210825-112937_Tr1.txt,212,182.875824,0.862622,1660.156253,30.501758
210825-132034_Tr1,264,174.965576,0.662748,1835.12183,33.71637
210825-135601_Tr1,479,339.551953,0.708877,2174.673783,39.954898
210826-083608_Tr1,614,506.587613,0.825061,2681.261396,49.262343


## Assign recordings to sets

Preferred DV ratio: 70-20-10

In [124]:
train_files = df_meta.loc[df_meta['cum_perc']<70].index.tolist()
# train_files.append('25-115438_Tr2') #to get closer to 70%
train_files

['210824-095226_Tr1.txt',
 '210824-100209_Tr1.txt',
 '210824-104507_Tr1',
 '210824-115331_Tr1.txt',
 '210824-125439_Tr1.txt',
 '210825-102141_Tr1.txt',
 '210825-112937_Tr1.txt',
 '210825-132034_Tr1',
 '210825-135601_Tr1',
 '210826-083608_Tr1',
 '210827-081513_Tr1',
 '210827-090209_Tr1',
 '210827-133618_Tr1']

In [125]:
val_files = df_meta.loc[((df_meta['cum_perc']>70) & (df_meta['cum_perc']<90))].index.tolist()
val_files

['210828-080644_Tr1',
 '210903-095104_Tr1',
 '210903-110841_Tr1',
 '210904-074321_Tr1']

In [126]:
test_files = df_meta.loc[(df_meta['cum_perc']>90)].index.tolist()
test_files

['210904-111316_Tr2.txt', '25-115438_Tr2']

### validate split

In [127]:
df_meta.loc[train_files]

Unnamed: 0_level_0,count_calls,sum_call_length,avg_call_length,cum_call_length,cum_perc
filename,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
210824-095226_Tr1.txt,67,41.941372,0.625991,41.941372,0.770581
210824-100209_Tr1.txt,81,38.09241,0.470277,80.033782,1.470447
210824-104507_Tr1,233,161.62449,0.693667,241.658272,4.439945
210824-115331_Tr1.txt,737,410.209489,0.556594,651.867761,11.976652
210824-125439_Tr1.txt,594,466.242653,0.78492,1118.110414,20.542846
210825-102141_Tr1.txt,443,359.170015,0.810768,1477.280429,27.141813
210825-112937_Tr1.txt,212,182.875824,0.862622,1660.156253,30.501758
210825-132034_Tr1,264,174.965576,0.662748,1835.12183,33.71637
210825-135601_Tr1,479,339.551953,0.708877,2174.673783,39.954898
210826-083608_Tr1,614,506.587613,0.825061,2681.261396,49.262343


In [128]:
call_length_total = df_meta.sum_call_length.sum()

In [129]:
for files in [train_files, test_files, val_files]:
#     print(files)
    np.round(df_meta.loc[files].sum_call_length.sum() / call_length_total*100)

64.0

10.0

26.0

## Actually split

In [130]:
df_train = df_concat[df_concat.filename.isin(train_files)]
df_val = df_concat[df_concat.filename.isin(val_files)]
df_test = df_concat[df_concat.filename.isin(test_files)]

In [131]:
df_train.shape
df_val.shape
df_test.shape

(9341, 5)

(2862, 5)

(1092, 5)

## Save

In [133]:
df_train.to_csv('../datasets/mozambique_2021/2021_data_prepped_with_bg_train.csv', index=False)
df_test.to_csv('../datasets/mozambique_2021/2021_data_prepped_with_bg_test.csv', index=False)
df_val.to_csv('../datasets/mozambique_2021/2021_data_prepped_with_bg_val.csv', index=False)