# Setup

In [None]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [None]:
# data paths:
ORIG_SPLIT_METADATA_FILES = {
    # "unbalanced_train": "/Users/eugenekim/Emo-CLIM/dataset/AudioSet/metadata_unbalanced_train.csv",
    "balanced_train": "/Users/eugenekim/Emo-CLIM/dataset/AudioSet/metadata_balanced_train.csv",
    "eval": "/Users/eugenekim/Emo-CLIM/dataset/AudioSet/metadata_eval.csv"
}

In [None]:
# script options:
val_fract = 0.1
test_fract = 0.1
new_split_metadata_dir = "/Users/eugenekim/Emo-CLIM/dataset/AudioSet/new_split_metadata_files"
new_split_metadata_files = {
    "train": "/Users/eugenekim/Emo-CLIM/dataset/AudioSet/new_split_metadata_files/metadata_train.csv",
    "val": "/Users/eugenekim/Emo-CLIM/dataset/AudioSet/new_split_metadata_files/metadata_val.csv",
    "test": "/Users/eugenekim/Emo-CLIM/dataset/AudioSet/new_split_metadata_files/metadata_test.csv"
}

# Get Audio Files

In [4]:
# load original split metadata files:
orig_split_metadata_dfs = {}
for subset, file_path in ORIG_SPLIT_METADATA_FILES.items():
    print("Loading {} set labels...".format(subset))
    orig_split_metadata_dfs[subset] = pd.read_csv(file_path)

# concatenate original split labels into a single dataframe:
all_metadata = pd.concat(orig_split_metadata_dfs.values(), axis="index")
all_metadata = all_metadata.reset_index(drop=True)
print()
print(all_metadata.info())

Loading unbalanced_train set labels...
Loading balanced_train set labels...
Loading eval set labels...

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13713 entries, 0 to 13712
Data columns (total 4 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   orig_subset     13713 non-null  object
 1   file_name       13713 non-null  object
 2   length_samples  13713 non-null  int64 
 3   label           13713 non-null  object
dtypes: int64(1), object(3)
memory usage: 428.7+ KB
None


In [5]:
# print subset sizes:
train_set_size = int(np.around((1 - test_fract - val_fract) * all_metadata.shape[0]))
print("Training set size: {:.2f}% of dataset = {} samples.".format(100 * (1 - test_fract - val_fract), train_set_size))
val_set_size = int(np.around(val_fract * all_metadata.shape[0]))
print("Validation set size: {:.2f}% of dataset = {} samples.".format(100 * val_fract, val_set_size))
test_set_size = int(np.around(test_fract * all_metadata.shape[0]))
print("Test set size: {:.2f}% of dataset = {} samples.".format(100 * test_fract, test_set_size))

Training set size: 80.00% of dataset = 10970 samples.
Validation set size: 10.00% of dataset = 1371 samples.
Test set size: 10.00% of dataset = 1371 samples.


In [6]:
# get label counts:
label_counts = all_metadata["label"].value_counts()
for label in all_metadata["label"].value_counts().index:
    print("Number of {} clips: {}".format(label, label_counts[label]))

Number of Exciting music clips: 4576
Number of Tender music clips: 3358
Number of Scary music clips: 1401
Number of Sad music clips: 1385
Number of Happy music clips: 1152
Number of Angry music clips: 946
Number of Funny music clips: 895


# Split Dataset

In [7]:
# split into stratified training/val and test sets:
metadata_train_val, metadata_test = train_test_split(all_metadata, test_size=test_set_size, stratify=all_metadata["label"], random_state=42)
assert metadata_test.shape[0] == test_set_size, "Test set metadata has incorrect size."
print(metadata_test.info())

# print test set class distribution:
print()
label_counts = metadata_test["label"].value_counts()
for label in metadata_test["label"].value_counts().index:
    print("Number of {} clips: {}".format(label, label_counts[label]))

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1371 entries, 12030 to 9009
Data columns (total 4 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   orig_subset     1371 non-null   object
 1   file_name       1371 non-null   object
 2   length_samples  1371 non-null   int64 
 3   label           1371 non-null   object
dtypes: int64(1), object(3)
memory usage: 53.6+ KB
None

Number of Exciting music clips: 458
Number of Tender music clips: 336
Number of Scary music clips: 140
Number of Sad music clips: 138
Number of Happy music clips: 115
Number of Angry music clips: 95
Number of Funny music clips: 89


In [8]:
# split into stratified training and validation sets:
metadata_train, metadata_val = train_test_split(metadata_train_val, test_size=val_set_size, stratify=metadata_train_val["label"], random_state=42)
assert metadata_val.shape[0] == val_set_size, "Validation set metadata has incorrect size."
print(metadata_val.info())

# print validation set class distribution:
print()
label_counts = metadata_val["label"].value_counts()
for label in metadata_val["label"].value_counts().index:
    print("Number of {} clips: {}".format(label, label_counts[label]))

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1371 entries, 7707 to 3650
Data columns (total 4 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   orig_subset     1371 non-null   object
 1   file_name       1371 non-null   object
 2   length_samples  1371 non-null   int64 
 3   label           1371 non-null   object
dtypes: int64(1), object(3)
memory usage: 53.6+ KB
None

Number of Exciting music clips: 457
Number of Tender music clips: 336
Number of Scary music clips: 140
Number of Sad music clips: 138
Number of Happy music clips: 115
Number of Angry music clips: 95
Number of Funny music clips: 90


In [9]:
# check that all subsets are disjoint:
metadata_subsets = [metadata_train, metadata_val, metadata_test]
subset_names = list(new_split_metadata_files.keys())
for subset_1, name_1 in zip(metadata_subsets, subset_names):
    for subset_2, name_2 in zip(metadata_subsets, subset_names):
        if name_1 != name_2:
            assert set(subset_1.index).isdisjoint(set(subset_2.index)), "{} and {} are not disjoint".format(name_1, name_2)

In [10]:
# reset indices:
metadata_train = metadata_train.reset_index(drop=True)
metadata_val = metadata_val.reset_index(drop=True)
metadata_test = metadata_test.reset_index(drop=True)

# sanity checks:
assert all_metadata.shape[0] == metadata_train.shape[0] + metadata_val.shape[0] + metadata_test.shape[0], "Subset set sizes don't add up."
# check that all subsets are disjoint:
metadata_subsets = [metadata_train, metadata_val, metadata_test]
subset_names = list(new_split_metadata_files.keys())
for subset_1, name_1 in zip(metadata_subsets, subset_names):
    for subset_2, name_2 in zip(metadata_subsets, subset_names):
        if name_1 != name_2:
            assert set(subset_1["file_name"].tolist()).isdisjoint(set(subset_2["file_name"].tolist())), "{} and {} are not disjoint".format(name_1, name_2)
# more sanity checks:
class_counts_all = all_metadata["label"].value_counts()
class_counts_train = metadata_train["label"].value_counts()
class_counts_val = metadata_val["label"].value_counts()
class_counts_test = metadata_test["label"].value_counts()
for class_label in all_metadata["label"].unique().tolist():
    assert class_counts_all[class_label] == class_counts_train[class_label] + class_counts_val[class_label] + class_counts_test[class_label], "Error with splitting dataset."

# save to file:
metadata_train.to_csv(new_split_metadata_files["train"], index=False)
metadata_val.to_csv(new_split_metadata_files["val"], index=False)
metadata_test.to_csv(new_split_metadata_files["test"], index=False)