In [1]:
import pandas as pd
from soundbay.utils.metadata_processing import (bg_from_non_overlap_calls, 
                                                correct_call_times_with_duration, 
                                                non_overlap_df, 
                                                reorder_columns_to_default_view,
                                                load_n_adapt_raven_annotation_table_to_dv_dataset_requirements)
import numpy as np
from pathlib import Path
import soundfile as sf
import math

In [2]:
main_folder = Path('Noise_wav')
audio_folders = [x for x in main_folder.iterdir() if x.is_dir() and x.name != 'extra']
labels = [x.name for x in audio_folders]
labels_dict = {x:y for (x,y) in zip(labels, range(len(labels)))}

In [3]:
labels_dict

{'honk': 0,
 'traffic': 1,
 'minihonk': 2,
 'creak': 3,
 'motorcycle': 4,
 'Sirena': 5}

In [4]:
BEGIN_TIME = 2.9
END_TIME = 4.1
CALL_LENGTH = 1.2
TRAIN_RATIO = 0.8

In [5]:
train_list = []
val_list = [] 

In [6]:
# ['begin_time', 'end_time', 'filename', 'call_length', 'label']

In [7]:
for folder in audio_folders:
    all_wavs = list(folder.glob('*.wav'))
    train_point = math.floor(len(all_wavs) * TRAIN_RATIO)
    train_list.extend(all_wavs[:train_point])
    val_list.extend(all_wavs[train_point:])

In [8]:
training_data = []
for file in train_list:
    entry = {'begin_time': BEGIN_TIME, 'end_time': END_TIME, 'filename': f'{file.parent.name}/{file.stem}', 
             'call_length': CALL_LENGTH, 'label': labels_dict[file.parent.name]}
    training_data.append(entry)

val_data = []
for file in val_list:
    entry = {'begin_time': BEGIN_TIME, 'end_time': END_TIME, 'filename': f'{file.parent.name}/{file.stem}', 
             'call_length': CALL_LENGTH, 'label': labels_dict[file.parent.name]}
    val_data.append(entry)

In [9]:
train_df = pd.DataFrame(training_data)
val_df = pd.DataFrame(val_data)

In [10]:
train_df['label'].value_counts(ascending=True)

label
5       3
4      79
2     312
1     488
3     566
0    1205
Name: count, dtype: int64

In [13]:
len(train_df)

2653

In [11]:
val_df['label'].value_counts(ascending=True)

label
5      1
4     20
2     78
1    123
3    142
0    302
Name: count, dtype: int64

In [12]:
train_df.to_csv('train.csv' ,index=False)
val_df.to_csv('val.csv' ,index=False)