In [1]:
import librosa
import librosa.display as librosa_display
import matplotlib.pyplot as plt
import numpy as np
from tqdm import tqdm
from pathlib import Path
import pandas as pd
import os
import time
from torchaudio.functional import compute_deltas
import torch
from natsort import natsorted

In [2]:

# Lookup from the single-letter emotion code taken from index 5 of a file name
# to its integer class id. Despite the name, this maps letter -> id.
# NOTE(review): the letters look like EMO-DB's German emotion initials -- confirm
# against the corpus documentation before relying on a specific id <-> emotion pairing.
label2name = {code: class_id for class_id, code in enumerate("LAEFTWN")}

def prepare_EMODB(data_root):
    """
    List the recordings under ``data_root`` and decode their labels from the file names.

    Entries are visited in natural sort order. Only ``.wav`` files are kept, so stray
    directory entries (hidden files, notes, ...) neither crash the label lookup nor
    end up in the list that is later handed to the audio loader.

    :param data_root:[str] Directory containing the wav files
    :return: three parallel lists ``(paths, emotions, actors)``: ``paths`` are the
        ``data_root``-joined file paths, ``emotions`` the integer ids looked up in
        ``label2name`` from the character at index 5 of the file name, and ``actors``
        the first two characters of the file name
    :raises KeyError: if a wav file carries an emotion letter missing from ``label2name``
    :raises IndexError: if a wav file name is shorter than six characters
    """
    paths, emotions, actors = [], [], []

    for name in natsorted(os.listdir(data_root)):
        # os.listdir returns everything in the folder; skip whatever is not a recording.
        if not name.lower().endswith('.wav'):
            continue

        paths.append(os.path.join(data_root, name))
        emotions.append(label2name[name[5]])
        actors.append(name[:2])

    return paths, emotions, actors

In [3]:
# Index the corpus folder (path is relative to the notebook's working directory):
# parallel lists of file paths, integer emotion ids and two-character actor codes.
wav_paths, emotions, actors = prepare_EMODB('EMO-DB')

In [4]:
# Rate (Hz) every recording is resampled to when it is loaded.
sample_rate = 16_000

# Analysis window and hop, both in seconds; get_features multiplies them by
# sample_rate to obtain n_fft and hop_length in samples.
frame_length, frame_stride = 0.05, 0.0125

In [5]:
def get_features(paths):
    """
    Compute a 39-coefficient MFCC matrix for every audio file in ``paths``.

    Each file is loaded at ``sample_rate``, turned into a 128-band mel power
    spectrogram (Hamming window; window and hop derived from ``frame_length`` and
    ``frame_stride``), converted to dB relative to that file's own maximum, and
    finally reduced to MFCCs.

    :param paths: iterable of audio file paths
    :return: ``(data, sz)`` where ``data[k]`` is the MFCC array of the k-th file and
        ``sz[k]`` is its number of frames (``data[k].shape[1]``)
    """
    # Window and hop in samples are identical for every file, so compute them once.
    input_nfft = int(round(sample_rate * frame_length))
    input_stride = int(round(sample_rate * frame_stride))

    data = []
    sz = []
    # Iterating the paths directly (no enumerate wrapper) lets tqdm pick up
    # len(paths) and show a real total instead of a bare counter.
    for path in tqdm(paths, desc='get features, size.....'):
        y, _ = librosa.load(path, sr=sample_rate)

        S = librosa.feature.melspectrogram(
            y=y,
            sr=sample_rate,
            n_mels=128,
            n_fft=input_nfft,
            window='hamming',
            hop_length=input_stride,
        )
        log_S = librosa.power_to_db(S, ref=np.max)

        mfcc = librosa.feature.mfcc(S=log_S, n_mfcc=39)

        data.append(mfcc)
        sz.append(mfcc.shape[1])

    return data, sz

In [6]:
# Extract MFCCs for the whole corpus; `sz` holds each clip's frame count.
features, sz = get_features(wav_paths)

get features, size.....: 535it [00:01, 355.01it/s]


In [7]:
def make_df(features, emotions, actors, sz):
    """
    Bundle fixed-width feature matrices with their labels into a DataFrame.

    Every feature matrix is brought to exactly ``sz`` columns: longer ones are
    truncated, shorter ones are zero-padded on the right. The row count and dtype
    of each matrix are preserved.

    :param features: sequence of 2-D arrays (coefficients x frames)
    :param emotions: emotion id per feature, indexed in parallel with ``features``
    :param actors: actor code per feature, indexed in parallel with ``features``
    :param sz:[int] Number of frames every stored matrix should have
    :return: DataFrame with one row per input and columns 'feature', 'emotion', 'actor'
    """
    data = []

    for i, feature in enumerate(tqdm(features, desc='Make dataframe.....')):
        # Fill a zeroed (rows, sz) buffer so every stored matrix has the same shape,
        # even when a clip has fewer than `sz` frames.
        kept = min(sz, feature.shape[1])
        mfcc_feature = np.zeros((feature.shape[0], sz), dtype=feature.dtype)
        mfcc_feature[:, :kept] = feature[:, :kept]

        data.append({
            'feature': mfcc_feature,
            'emotion': emotions[i],
            'actor': actors[i]
        })

    return pd.DataFrame(data)

In [8]:
# Use the shortest clip's frame count as the common width, so every feature matrix
# in the frame ends up with the same shape.
df = make_df(features, emotions, actors, min(sz))

Make dataframe.....: 535it [00:00, 534401.68it/s]


In [9]:
def get_fold(fold, df):
    """
    Extract the samples of one subject-wise fold as arrays.

    The data is partitioned into 10 folds with one actor per fold. This function
    only selects the rows belonging to the requested fold; combining folds into
    train and test sets is left to the caller.
    :param fold:[int] Fold to extract [ranging from 0 - 9]
    :param df:[DataFrame] Dataframe with 'feature', 'emotion' and 'actor' columns
    :return: tuple (X, y) with the fold's stacked feature arrays and its emotion ids
    :raises KeyError: if ``fold`` is not one of 0 - 9
    """

    actors_per_fold = {
        0: ['03'],
        1: ['08'],
        2: ['09'],
        3: ['10'],
        4: ['11'],
        5: ['12'],
        6: ['13'],
        7: ['14'],
        8: ['15'],
        9: ['16']
    }

    # Keep the caller's frame untouched; work on the selected rows under a new name.
    fold_rows = df.loc[df['actor'].isin(actors_per_fold[fold])]

    X = np.array(fold_rows['feature'].tolist())
    y = np.array(fold_rows['emotion'].tolist())

    return X, y

In [10]:
# Extract each of the 10 actor folds in turn; X_dataset[k] / y_dataset[k] hold fold k.
# Note: the loop leaves `X`, `y` and `fold` bound to the last fold's values.
X_dataset = []
y_dataset = []
for fold in range(10):
    X, y = get_fold(fold, df)
    X_dataset.append(X)
    y_dataset.append(y)

In [11]:
# Sanity check on fold 0 (actor '03'): (utterances, MFCC coefficients, frames).
print(X_dataset[0].shape)

(49, 39, 99)


In [12]:
# Merge the per-fold arrays into one dataset, ordered fold by fold.
# One concatenate over the whole list replaces the pairwise loop, which re-copied
# the ever-growing array on every iteration.
X = np.concatenate(X_dataset, axis=0)
y = np.concatenate(y_dataset, axis=0)

# Append a trailing singleton axis to the feature array.
X = np.expand_dims(X, -1)

# Both arrays are written back to back into the same file, so a reader has to keep
# one file handle open and call np.load twice (X first, then y).
os.makedirs('dataset', exist_ok=True)
with open('dataset/EMODB.npy', 'wb') as f:
    np.save(f, X)
    np.save(f, y)

In [13]:
# Final dataset shape: (utterances, MFCC coefficients, frames, 1).
print(X.shape)

(535, 39, 99, 1)
