In [None]:
import pandas as pd
import soundfile as sf
from pathlib import Path
import numpy as np
from soundbay.utils.metadata_processing import correct_call_times_with_duration, reorder_columns_to_default_view
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

In [None]:
# Folder name that is handed to correct_call_times_with_duration further down.
audio_folder = 'raw_data'
# Sort the matches: glob makes no ordering guarantee, so without sorting the
# row order of input_csv could differ between machines and runs.
csv_paths = sorted(Path('.').glob('*xlsx'))
if not csv_paths:
    # Fail early with a readable message instead of pd.concat's
    # "No objects to concatenate".
    raise FileNotFoundError("No annotation spreadsheets matching '*xlsx' found in the working directory")
# The spreadsheets spell the same headers differently; map every variant to a
# single column name so the frames line up when concatenated.
rename_dict = {"filename (CINEA)": "filename", "Filename ": "filename", "Filename": "filename", "duration (sec)": "call_length"}
all_csv = [pd.read_excel(csv_path).rename(columns=rename_dict) for csv_path in csv_paths]
input_csv = pd.concat(all_csv, ignore_index=True)

In [None]:
# Keep every row whose Species is anything other than "Delphinapterus leucas".
input_csv = input_csv[input_csv['Species'] != "Delphinapterus leucas"]

In [None]:
# Every call starts at 0; the end time depends on the annotated call length:
# rows with call_length == 10 end at 6, all other rows (including missing
# lengths) end at 15.
# NOTE(review): the 10 / 6 / 15 values are presumably seconds tied to the
# recording protocol - confirm and consider naming them as constants.
#
# Built with a single assign() instead of `input_csv['end_time'][mask] = ...`:
# that chained assignment writes into a temporary Series, which triggers
# SettingWithCopyWarning on older pandas and silently leaves end_time at 0
# under Copy-on-Write. Float literals keep the columns float64 as before.
input_csv = input_csv.assign(
    begin_time=np.zeros(len(input_csv)),
    end_time=np.where(input_csv["call_length"] == 10, 6.0, 15.0),
)

In [None]:
# Run the soundbay helper over the annotations and the audio folder.
# NOTE(review): presumably it adjusts begin/end times against the real audio
# durations - confirm in soundbay.utils.metadata_processing.
input_csv = correct_call_times_with_duration(input_csv, audio_folder)
# Recompute the call length from the (possibly adjusted) begin and end times.
input_csv['call_length'] = input_csv['end_time'].sub(input_csv['begin_time'])

In [None]:
# Learn the Species -> integer mapping first, then apply it; `le` is kept
# around because its classes_ are written to disk at the end of the notebook.
le = LabelEncoder().fit(input_csv['Species'])
input_csv['label'] = le.transform(input_csv['Species'])

In [None]:
# Hand the frame to the soundbay helper that arranges the columns in its
# default view.
input_csv = input_csv.pipe(reorder_columns_to_default_view)

In [None]:
# Fixed seed so the train/validation split is identical on every run; without
# it train_test_split shuffles differently each time and the saved CSVs are
# not reproducible.
SPLIT_SEED = 42

training_set = []
val_set = []
# Split each label separately so that 15% of every class lands in validation.
for label, group in input_csv.groupby('label'):
    train, val = train_test_split(group, test_size=0.15, random_state=SPLIT_SEED)
    training_set.append(train)
    val_set.append(val)
training_csv = pd.concat(training_set, ignore_index=True)
val_csv = pd.concat(val_set, ignore_index=True)
# Per-class row counts of both sets, as a quick sanity check of the split.
print(training_csv.label.value_counts())
print(val_csv.label.value_counts())

In [None]:
# Write both splits to CSV without the pandas index column.
training_csv.to_csv(path_or_buf='train.csv', index=False)
val_csv.to_csv(path_or_buf='val.csv', index=False)
# Store the encoder's class names so integer labels can be mapped back later.
# NOTE(review): if classes_ is an object-dtype array it is saved pickled and
# np.load will need allow_pickle=True - confirm with whatever reads this file.
np.save(file='classes.npy', arr=le.classes_)