In [24]:
import os
import numpy as np
import pandas as pd

import librosa
from tqdm.notebook import tqdm

# Root folder of the dataset, resolved relative to the notebook's working directory.
TS_DATASET_FOLDER = os.path.join("..", "dataset")

# One sub-folder per vocal channel, both located directly under the dataset root.
SONG_DATASET_FOLDER, SPEECH_DATASET_FOLDER = (
    os.path.join(TS_DATASET_FOLDER, subfolder)
    for subfolder in ("Audio_Song_Actors_01-24", "Audio_Speech_Actors_01-24")
)

def get_actor_files(vc, actor_n):
    """Return the paths of every ``.wav`` file recorded by one actor.

    Parameters
    ----------
    vc : str
        Vocal channel. ``"song"`` selects ``SONG_DATASET_FOLDER``; any other
        value falls back to ``SPEECH_DATASET_FOLDER``.
    actor_n : int or str
        Actor number. It is zero-padded to two digits to build the
        ``Actor_XX`` folder name.

    Returns
    -------
    list of str
        File paths (actor folder joined with the file name), sorted by file
        name so the result does not depend on the directory listing order.

    Raises
    ------
    OSError
        Propagated from ``os.listdir`` when the actor folder cannot be listed
        (for example, when it does not exist).
    """
    root = SONG_DATASET_FOLDER if vc == "song" else SPEECH_DATASET_FOLDER
    actor_dir = os.path.join(root, f"Actor_{str(actor_n).zfill(2)}")
    # os.listdir gives entries in arbitrary order; sorting keeps the loading
    # order (and therefore the row order downstream) reproducible.
    return [
        os.path.join(actor_dir, name)
        for name in sorted(os.listdir(actor_dir))
        if name.endswith(".wav")
    ]


# Maps each categorical feature to a {raw two-digit code: readable label} table.
# The key order matters: the loading loop zips these keys positionally against
# the dash-separated fields of each file name.
CATEGORICAL_FEATURES_NAMES = {
    "modality": {
        "01": "full-AV",
        "02": "video-only",
        "03": "audio-only",
    },
    "vocal_channel": {
        "01": "speech",
        "02": "song",
    },
    "emotion": {
        "01": "neutral",
        "02": "calm",
        "03": "happy",
        "04": "sad",
        "05": "angry",
        "06": "fearful",
        "07": "disgust",
        "08": "surprised",
    },
    "emotional_intensity": {
        "01": "normal",
        "02": "strong",
    },
    "statement": {
        "01": "Kids",
        "02": "Dogs",
    },
    "repetition": {
        "01": "1st",
        "02": "2nd",
    },
    # Actor codes "01".."24" map to themselves, so they are left unchanged.
    "actor": {code: code for code in (f"{number:02d}" for number in range(1, 25))},
}

# File import
Each recording is downsampled as it is loaded, before being kept in memory, because the raw traces are very large (about 200,000 floating-point samples each, roughly 0.5 MB).

In [25]:
import sys
from scipy.signal import decimate

# Factor by which every recording is downsampled before it is kept in memory.
DOWNSAMPLE_FACTOR = 8

# Column layout of the resulting frame: the categorical codes parsed from the
# file name, followed by the downsampled signal and the source path.
df_columns = list(CATEGORICAL_FEATURES_NAMES.keys()) + ["audio_trace", "path"]

# Rows are collected in a plain list and turned into a DataFrame once at the
# end; concatenating inside the loop copies the whole frame on every iteration.
records = []

for actor_n in tqdm(range(1, 5)):
    for f in get_actor_files("song", actor_n):
        # sr=None makes librosa keep the file's native sampling rate.
        segment, sr = librosa.load(f, sr=None)
        segment = decimate(segment, q=DOWNSAMPLE_FACTOR)

        # The file stem is a dash-separated list of codes. splitext removes
        # only the extension, whereas str.strip(".wav") strips any leading or
        # trailing '.', 'w', 'a' or 'v' characters.
        categ_values = os.path.splitext(os.path.basename(f))[0].split("-")

        row = dict(zip(CATEGORICAL_FEATURES_NAMES.keys(), categ_values))
        row["audio_trace"] = segment  # the whole array is stored in one cell
        row["path"] = f
        records.append(row)

# Passing `columns` fixes the column order and still yields an empty frame
# with the right columns when no file was found.
df = pd.DataFrame(records, columns=df_columns)

  0%|          | 0/4 [00:00<?, ?it/s]

In [26]:
# Translate the raw codes into readable labels. The nested mapping is keyed by
# column name, so a single call handles every categorical column; repeating it
# once per column (as a loop over df.columns would) gives the same result.
df = df.replace(CATEGORICAL_FEATURES_NAMES)

# Derive the sex label from the actor number: even -> "F", odd -> "M".
df["sex"] = ["F" if i % 2 == 0 else "M" for i in df["actor"].astype(int)]

# errors="ignore" keeps this cell re-runnable after "modality" has already
# been dropped by a previous execution.
df = df.drop(columns=["modality"], errors="ignore")

In [32]:
# Show the downsampled signal stored in the row labelled 0.
df.at[0, "audio_trace"]

array([-1.2550673e-06,  4.9670591e-05, -2.3272896e-05, ...,
        2.4392716e-10, -7.8054792e-11, -1.2493626e-10], dtype=float32)