# Setup

In [1]:
import os
import pandas as pd
import json
import torchaudio

In [None]:
# data paths:
DATA_ROOT = "/Users/eugenekim/Emo-CLIM/dataset/AudioSet"
AUDIO_DIR = "audio_files"
ONTOLOGY_FILE = "/Users/eugenekim/Emo-CLIM/dataset/AudioSet/orig_metadata_files/ontology.json"
ORIG_METADATA_FILES = {
    "unbalanced_train": "/Users/eugenekim/Emo-CLIM/dataset/AudioSet/orig_metadata_files/unbalanced_train_segments.csv",
    "balanced_train": "/Users/eugenekim/Emo-CLIM/dataset/AudioSet/orig_metadata_files/balanced_train_segments.csv",
    "eval": "/Users/eugenekim/Emo-CLIM/dataset/AudioSet/orig_metadata_files/eval_segments.csv"
}
# AudioSet music mood subset label names:
MOOD_LABEL_NAMES = ["Happy music", "Funny music", "Sad music", "Tender music", "Exciting music", "Angry music", "Scary music"]

In [None]:
# script options:
data_subsets = ["balanced_train", "eval"]
new_metadata_files = {
    "unbalanced_train": "/Users/eugenekim/Emo-CLIM/dataset/AudioSet/metadata_unbalanced_train.csv",
    "balanced_train": "/Users/eugenekim/Emo-CLIM/dataset/AudioSet/metadata_balanced_train.csv",
    "eval": "/Users/eugenekim/Emo-CLIM/dataset/AudioSet/metadata_eval.csv"
}

# Get Label IDs

In [4]:
# load ontology dictionary:
with open(ONTOLOGY_FILE, "r") as json_file:
    ontology = json.load(json_file)

In [5]:
# get label IDs of music mood labels:
mood_label_ids = {}
for entry in ontology:
    if entry["name"] in MOOD_LABEL_NAMES:
        if entry["name"] in mood_label_ids.values():
            raise RuntimeError("'{}' already found in ontology.".format(entry["name"]))
        else:
            mood_label_ids[entry["id"]] = entry["name"]

print("Music mood label ids and names:\n")
for label_name, label_id in mood_label_ids.items():
    print("{}: {}".format(label_name, label_id))

Music mood label ids and names:

/t/dd00031: Happy music
/t/dd00032: Funny music
/t/dd00033: Sad music
/t/dd00034: Tender music
/t/dd00035: Exciting music
/t/dd00036: Angry music
/t/dd00037: Scary music


# Extract Labels

In [6]:
# load original metadata files:
orig_metadata_dfs = {}
col_names = ["youtube_id", "start_seconds", "end_seconds", "positive_labels"]
for subset in data_subsets:
    print("Loading {} set metadata...\n".format(subset))
    orig_metadata_dfs[subset] = pd.read_csv(ORIG_METADATA_FILES[subset], sep=", ", names=col_names, header=None, skiprows=[0, 1, 2])
    print(orig_metadata_dfs[subset].info())
    print("\n")

Loading balanced_train set metadata...



  orig_metadata_dfs[subset] = pd.read_csv(ORIG_METADATA_FILES[subset], sep=", ", names=col_names, header=None, skiprows=[0, 1, 2])


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22160 entries, 0 to 22159
Data columns (total 4 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   youtube_id       22160 non-null  object 
 1   start_seconds    22160 non-null  float64
 2   end_seconds      22160 non-null  float64
 3   positive_labels  22160 non-null  object 
dtypes: float64(2), object(2)
memory usage: 692.6+ KB
None


Loading eval set metadata...

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20371 entries, 0 to 20370
Data columns (total 4 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   youtube_id       20371 non-null  object 
 1   start_seconds    20371 non-null  float64
 2   end_seconds      20371 non-null  float64
 3   positive_labels  20371 non-null  object 
dtypes: float64(2), object(2)
memory usage: 636.7+ KB
None




  orig_metadata_dfs[subset] = pd.read_csv(ORIG_METADATA_FILES[subset], sep=", ", names=col_names, header=None, skiprows=[0, 1, 2])


In [7]:
# extract audio file names:
audio_file_names = {}
for subset in data_subsets:
    subset_root = os.path.join(DATA_ROOT, AUDIO_DIR, subset)
    audio_file_names[subset] = [name for name in os.listdir(subset_root) if os.path.isfile(os.path.join(subset_root, name))]
    print("Original size of {} set: {}".format(subset, len(audio_file_names[subset])))

Original size of balanced_train set: 325
Original size of eval set: 344


In [8]:
# extract labels of audio files:
new_metadata_dfs = {}
for subset in data_subsets:
    orig_subset_names = []
    file_names = []
    clip_length_samples = []
    mood_labels = []
    n_bad_files = 0     # number of audio files with not exactly 1 music mood label
    print("Extracting labels for {} set...".format(subset))
    for file_name in audio_file_names[subset]:
        # extract youtube ID from file name:
        youtube_id_start_time = file_name.replace(".wav", "")
        assert len(youtube_id_start_time) == len(file_name) - len(".wav"), "Error with removing '.wav'."
        start_time = "_" + youtube_id_start_time.split("_")[-1]
        youtube_id = youtube_id_start_time.replace(start_time, "")
        assert len(youtube_id) == len(youtube_id_start_time) - len(start_time), "Error with removing start_time."

        # get audio clip length (in samples):
        file_path = os.path.join(DATA_ROOT, AUDIO_DIR, subset, file_name)
        metadata = torchaudio.info(file_path)
        length_samples = metadata.num_frames

        # get label ids by querying metadata with youtube id:
        label_ids = orig_metadata_dfs[subset]["positive_labels"][orig_metadata_dfs[subset]["youtube_id"] == youtube_id]

        # convert format of labels ids:
        label_ids = label_ids.reset_index(drop=True)
        label_ids = label_ids[0]
        # remove leading/trailing quotation marks:
        orig_len = len(label_ids)
        label_ids = label_ids.strip('"')
        assert len(label_ids) == orig_len - 2, "Error with removing leading/trailing quotation marks from labels string."
        # convert to list:
        label_ids = label_ids.split(",")
        assert type(label_ids) == list, "Error with converting labels string to list."

        # extract music mood label (should be exactly 1):
        mood_label = None
        n_mood_labels = 0
        for label_id in label_ids:
            if label_id in mood_label_ids.keys():
                mood_label = mood_label_ids[label_id]
                n_mood_labels += 1
        # only keep audio files with exactly 1 music mood label:
        if n_mood_labels == 1:
            orig_subset_names.append(subset)
            file_names.append(file_name)
            clip_length_samples.append(length_samples)
            mood_labels.append(mood_label)
        else:
            n_bad_files += 1
    
    print("Number of audio files with not exactly 1 music mood label: {}".format(n_bad_files))
    
    # save as dataframe:
    new_metadata_dfs[subset] = pd.DataFrame(
        data={
            "orig_subset": orig_subset_names,
            "file_name": file_names,
            "length_samples": clip_length_samples,
            "label": mood_labels
        }
    )
    print()
    print(new_metadata_dfs[subset].info())
    print()
    print(new_metadata_dfs[subset].head())
    print("\n")
    # save to file:
    new_metadata_dfs[subset].to_csv(new_metadata_files[subset], index=False)

Extracting labels for balanced_train set...
Number of audio files with not exactly 1 music mood label: 7

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 318 entries, 0 to 317
Data columns (total 4 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   orig_subset     318 non-null    object
 1   file_name       318 non-null    object
 2   length_samples  318 non-null    int64 
 3   label           318 non-null    object
dtypes: int64(1), object(3)
memory usage: 10.1+ KB
None

      orig_subset              file_name  length_samples           label
0  balanced_train   Vo6eT8eMMfQ_30.0.wav          160000  Exciting music
1  balanced_train  BscoQHJrNm8_170.0.wav          160000     Angry music
2  balanced_train  Vk-V0EZ3UIY_150.0.wav          160000     Funny music
3  balanced_train   BtdzVnXZ0i4_30.0.wav          160000       Sad music
4  balanced_train  0khKvVDyYV4_240.0.wav          160000     Happy music


Extracting labels for eval se