In [1]:
import os
from natsort import natsorted
import tqdm
import tensorflow as tf
import numpy as np
import time
import pandas as pd

# Root directory of the segmented EMO-DB corpus. prepare_EMODB() lists this
# directory and treats each entry as one emotion sub-directory.
data_root = 'data/EMO-DB_segmented'

# Maps an emotion sub-directory name (as found under data_root) to the integer
# class label stored for every clip inside that sub-directory.
dict_emotions = {
    'anger': 0,
    'anxiety_fear': 1,
    'boredom': 2,
    'disgust': 3,
    'happiness': 4,
    'neutral': 5,
    'sadness': 6
}

def prepare_EMODB():
    """Index every audio clip found under ``data_root``.

    ``data_root`` is expected to contain one sub-directory per emotion, named
    after a key of ``dict_emotions``. Sub-directories and files are visited in
    natural sort order so the returned lists are deterministic.

    Entries of ``data_root`` that are not directories, or whose name is not a
    key of ``dict_emotions`` (for example ``.DS_Store`` or
    ``.ipynb_checkpoints``), are skipped, as are files that do not end in
    ``.wav``.

    :return: three parallel lists ``(paths, emotions, actors)`` holding, for
        each clip, its file path, its integer emotion label and the first two
        characters of its file name (used as the actor id).
    """
    paths, emotions, actors = [], [], []

    for emotion_name in natsorted(os.listdir(data_root)):
        emotion_dir = os.path.join(data_root, emotion_name)

        # Ignore stray files and unknown folders instead of failing on the
        # label lookup or on listing a non-directory.
        if emotion_name not in dict_emotions or not os.path.isdir(emotion_dir):
            continue

        label = dict_emotions[emotion_name]

        for file_name in natsorted(os.listdir(emotion_dir)):
            # Only WAV files can be decoded by the loading code downstream.
            if not file_name.lower().endswith('.wav'):
                continue
            paths.append(os.path.join(emotion_dir, file_name))
            emotions.append(label)
            actors.append(file_name[:2])

    return paths, emotions, actors

In [2]:
# Build the parallel lists of clip paths, integer emotion labels and actor ids.
paths, emotions, actors = prepare_EMODB()

In [3]:
# STFT framing parameters, all expressed in samples.
FRAME_LENGTH = 1024
FRAME_STEP = 256
FFT_LENGTH = 1024

# Number of cepstral coefficients kept per frame by get_mfcc().
N_MFCC = 40

# tf.signal.stft returns fft_length // 2 + 1 unique frequency bins; derive the
# value so it cannot drift out of sync with FFT_LENGTH.
NUM_SPECTROGRAM_BINS = FFT_LENGTH // 2 + 1
NUM_MEL_BINS = 128
LOWER_EDGE_HERTZ = 80.0
UPPER_EDGE_HERTZ = 7600.0

# Sample rate assumed when building the mel filter bank. The rate stored in
# each WAV file is discarded on decode, so it is not checked against this.
SAMPLE_RATE = 16000

# Weight matrix mapping linear-frequency spectrogram bins to mel bins;
# consumed by get_mel_spectrogram().
linear_to_mel_weight_matrix = tf.signal.linear_to_mel_weight_matrix(
    num_mel_bins=NUM_MEL_BINS,
    num_spectrogram_bins=NUM_SPECTROGRAM_BINS,
    sample_rate=SAMPLE_RATE,
    lower_edge_hertz=LOWER_EDGE_HERTZ,
    upper_edge_hertz=UPPER_EDGE_HERTZ,
)

In [4]:
def decode_audio(audio_binary):
    """Decode the raw bytes of a WAV file into a 1-D waveform tensor.

    :param audio_binary: scalar string tensor holding the WAV file contents.
    :return: tensor of shape ``[samples]``; the sample rate reported by the
        decoder is discarded.
    """
    # Request exactly one channel so the trailing axis always has size 1 and
    # the squeeze below cannot fail on multi-channel files (extra channels are
    # ignored by the decoder).
    audio, _ = tf.audio.decode_wav(audio_binary, desired_channels=1)
    return tf.squeeze(audio, axis=-1)

def get_waveform(file_path):
    """Read the WAV file at ``file_path`` and return its decoded waveform.

    :param file_path: path of the audio file to load.
    :return: whatever ``decode_audio`` produces for the file's contents.
    """
    return decode_audio(tf.io.read_file(file_path))

In [5]:
# Decode every indexed clip, keeping the same order as `paths`.
waveforms = [get_waveform(clip_path) for clip_path in paths]

In [6]:
# Convert the list of per-clip waveforms into a single NumPy array. The
# recorded output below shows a 2-D (clips, samples) result, which requires
# every clip to have the same number of samples.
waveforms = np.array(waveforms)
waveforms.shape

(535, 48000)

In [7]:
def get_spectrogram(waveform):
    """Compute the magnitude STFT of ``waveform``.

    The input is cast to float32 and framed with the module-level
    ``FRAME_LENGTH``, ``FRAME_STEP`` and ``FFT_LENGTH`` settings.

    :param waveform: audio samples (any numeric dtype castable to float32).
    :return: absolute value of the short-time Fourier transform.
    """
    stft = tf.signal.stft(
        tf.cast(waveform, tf.float32),
        frame_length=FRAME_LENGTH,
        frame_step=FRAME_STEP,
        fft_length=FFT_LENGTH,
    )
    return tf.abs(stft)

def get_mel_spectrogram(spectrogram):
    """Project a magnitude spectrogram onto the mel scale and take its log.

    :param spectrogram: magnitude spectrogram whose last axis matches the
        first axis of ``linear_to_mel_weight_matrix``.
    :return: log-magnitude mel spectrogram.
    """
    # Contract the frequency axis of the spectrogram with the filter bank.
    mel = tf.tensordot(spectrogram, linear_to_mel_weight_matrix, 1)

    # tensordot can lose static shape information; restore it explicitly.
    static_shape = spectrogram.shape[:-1].concatenate(
        linear_to_mel_weight_matrix.shape[-1:]
    )
    mel.set_shape(static_shape)

    # The small offset keeps the logarithm finite for zero-energy bins.
    return tf.math.log(mel + 1e-6)

# def get_mfcc(waveform, clip_value=10):
#     waveform = tf.cast(waveform, tf.float32)
#     spectrogram = tf.raw_ops.AudioSpectrogram(input=waveform,
#                                               window_size=FRAME_LENGTH,
#                                               stride=FRAME_STEP,
#                                               magnitude_squared=True,
#                                              )
    
#     mfcc = tf.raw_ops.Mfcc(spectrogram=spectrogram,
#                            sample_rate=SAMPLE_RATE,
#                            upper_frequency_limit=UPPER_EDGE_HERTZ,
#                            lower_frequency_limit=LOWER_EDGE_HERTZ,
#                            filterbank_channel_count=NUM_MEL_BINS,
#                            dct_coefficient_count=N_MFCC,
#                           )
#     return tf.clip_by_value(mfcc, -clip_value, clip_value)

def get_mfcc(log_mel_spectrograms, clip_value=10):
    """Turn log-mel spectrograms into clipped MFCCs.

    :param log_mel_spectrograms: log-magnitude mel spectrograms.
    :param clip_value: symmetric bound; coefficients are clipped to
        ``[-clip_value, clip_value]``.
    :return: the first ``N_MFCC`` coefficients along the last axis, clipped.
    """
    all_coefficients = tf.signal.mfccs_from_log_mel_spectrograms(log_mel_spectrograms)
    kept = all_coefficients[..., :N_MFCC]
    return tf.clip_by_value(
        kept,
        clip_value_min=-clip_value,
        clip_value_max=clip_value,
    )


In [8]:

def get_features(audio, input_type="mfcc", merge_tflite=False):
    """Extract the requested feature representation from raw audio.

    :param audio: waveform tensor/array passed to ``get_spectrogram``.
    :param input_type: one of ``"spectrogram"`` (magnitude STFT),
        ``"mel_spectrogram"`` (log-mel spectrogram) or ``"mfcc"``.
    :param merge_tflite: only used for ``input_type="mfcc"``. When true, the
        first element along the leading axis of the MFCC tensor is returned.
        NOTE(review): presumably meant for a single-clip batch -- confirm
        against the caller that sets this flag.
    :return: the feature tensor for the chosen ``input_type``.
    :raises ValueError: if ``input_type`` is not one of the supported values.
    """
    # Validate first so an invalid request fails before any computation.
    if input_type not in ("spectrogram", "mel_spectrogram", "mfcc"):
        raise ValueError('input_type not valid!')

    spectrogram = get_spectrogram(audio)
    if input_type == "spectrogram":
        return spectrogram

    log_mel_spectrogram = get_mel_spectrogram(spectrogram)
    if input_type == "mel_spectrogram":
        return log_mel_spectrogram

    # get_mfcc() expects log-mel spectrograms, so both the regular and the
    # merge_tflite paths must go through the spectrogram/log-mel stages;
    # feeding it the raw waveform would yield meaningless coefficients.
    mfcc = get_mfcc(log_mel_spectrogram)
    if merge_tflite:
        mfcc = mfcc[0]
    return mfcc

In [9]:
# Uses the defaults of get_features (input_type="mfcc", merge_tflite=False),
# so the whole waveform array is converted to MFCCs in one call.
mfccs = get_features(waveforms)
len(mfccs)

535

In [14]:
def make_df(features, emotions, actors):
    """Bundle per-clip features, emotion labels and actor ids into a DataFrame.

    The three inputs are indexed in parallel; one row is produced for every
    position ``0 .. len(features) - 1``.

    :param features: indexable collection of per-clip features.
    :param emotions: indexable collection of emotion labels.
    :param actors: indexable collection of actor ids.
    :return: DataFrame with the columns ``feature``, ``emotion`` and ``actor``.
    """
    records = [
        {
            'feature': features[idx],
            'emotion': emotions[idx],
            'actor': actors[idx],
        }
        for idx in range(len(features))
    ]
    return pd.DataFrame(records)

In [15]:
# One row per clip: its MFCC feature, integer emotion label and actor id.
df = make_df(mfccs, emotions, actors)

In [16]:
def generate_train_test(fold, df, save_path=""):
    """
    Split the data speaker-wise for one fold and save the resulting arrays.

    Every fold holds out the rows of a single actor id as the test set; all
    remaining rows form the training set (10 folds, one per actor id listed in
    ``actors_per_fold``). A new axis of size 1 is inserted at position 1 of
    both feature arrays.

    The arrays are written with four consecutive ``np.save`` calls, in the
    order X_train, y_train, X_test, y_test, into the single file
    ``save_path + '.npy'``. To read them back, open that file once and call
    ``np.load`` on the same handle four times. The four shapes are printed.

    :param fold:[int] Fold to create the train and test sets [ranging from 0 - 9]
    :param df:[DataFrame] Dataframe with the columns 'feature', 'emotion' and 'actor'
    :param save_path:[str] Path prefix of the output file; '.npy' is appended
    :raises KeyError: if ``fold`` is not a key of ``actors_per_fold``
    """

    actors_per_fold = {
        0: ['03'],
        1: ['08'],
        2: ['09'],
        3: ['10'],
        4: ['11'],
        5: ['12'],
        6: ['13'],
        7: ['14'],
        8: ['15'],
        9: ['16']
    }

    # Same exception type as a plain dict lookup, but with an actionable message.
    if fold not in actors_per_fold:
        raise KeyError(
            f"fold must be one of {sorted(actors_per_fold)}, got {fold!r}"
        )

    # Compute the held-out mask once and derive both partitions from it.
    is_test = df['actor'].isin(actors_per_fold[fold])
    test_df = df.loc[is_test].reset_index(drop=True)
    train_df = df.loc[~is_test].reset_index(drop=True)

    X_train = np.expand_dims(np.array(list(train_df['feature'])), 1)
    y_train = np.array(list(train_df['emotion']))
    X_test = np.expand_dims(np.array(list(test_df['feature'])), 1)
    y_test = np.array(list(test_df['emotion']))

    # All four arrays share one file; the write order defines the read order.
    with open(save_path+'.npy', 'wb') as f:
        for array in (X_train, y_train, X_test, y_test):
            np.save(f, array)

    print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

In [17]:
# The output directory does not depend on the fold, so create it once.
save_root = 'LIGHT-SERNET dataset'
os.makedirs(save_root, exist_ok=True)

# One iteration per held-out actor; generate_train_test defines folds 0-9.
for fold in range(10):
    save_path = os.path.join(save_root, "fold"+str(fold))

    # No pause between folds is needed: generate_train_test writes inside a
    # `with open(...)` block, so each file is closed before the call returns.
    generate_train_test(fold, df, save_path)

(486, 1, 184, 40) (486,) (49, 1, 184, 40) (49,)
(477, 1, 184, 40) (477,) (58, 1, 184, 40) (58,)
(492, 1, 184, 40) (492,) (43, 1, 184, 40) (43,)
(497, 1, 184, 40) (497,) (38, 1, 184, 40) (38,)
(480, 1, 184, 40) (480,) (55, 1, 184, 40) (55,)
(500, 1, 184, 40) (500,) (35, 1, 184, 40) (35,)
(474, 1, 184, 40) (474,) (61, 1, 184, 40) (61,)
(466, 1, 184, 40) (466,) (69, 1, 184, 40) (69,)
(479, 1, 184, 40) (479,) (56, 1, 184, 40) (56,)
(464, 1, 184, 40) (464,) (71, 1, 184, 40) (71,)
