In [1]:
import librosa
import librosa.display as librosa_display
import matplotlib.pyplot as plt
import numpy as np
from tqdm import tqdm
from pathlib import Path
import pandas as pd
import os
import time
from torchaudio.functional import compute_deltas
import torch

In [2]:
def prepare_TESS_DS(path_audios):
    wav_paths, emotions, actors = [], [], []
    
    for path in tqdm(Path(path_audios).glob("*/*.wav")):
        f = str(path)
        f = f.split('\\')[-2]
        f = f.split('_')
        
        emotion = None
        emo = f[1]
        if emo == 'angry':
            emotion = 0
        elif emo == 'disgust':
            emotion = 1
        elif emo == 'fear':
            emotion = 2
        elif emo == 'happy':
            emotion = 3
        elif emo == 'neutral':
            emotion = 4
        elif emo == 'pleasant':
            emotion = 5
        elif emo == 'sad':
            emotion = 6
                    
        wav_paths.append(path)
        emotions.append(emotion)
        actors.append(f[0])
    
    return wav_paths, emotions, actors        

In [3]:
wav_paths, emotions, actors = prepare_TESS_DS('TESS')

2800it [00:00, 231045.67it/s]


In [4]:
print(emotions)

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 

In [5]:
sample_rate = 16000

frame_length = 0.05
frame_stride = 0.0125

In [6]:
def get_features(paths):
    data = []
    sz = []
    for i, path, in tqdm(enumerate(paths), desc='get features, size.....'):
        input_nfft = int(round(sample_rate*frame_length))
        input_stride = int(round(sample_rate*frame_stride))
        
        y, _ = librosa.load(path, sr=sample_rate)
        
        S = librosa.feature.melspectrogram(y=y, sr=sample_rate, n_mels=128, n_fft=input_nfft, window='hamming', hop_length=input_stride)
        log_S = librosa.power_to_db(S, ref=np.max)
        
        mfcc = librosa.feature.mfcc(S=log_S, n_mfcc=39)
        
        data.append(mfcc)
        sz.append(mfcc.shape[1])
        
    return data, sz

In [7]:
features, sz = get_features(wav_paths)

get features, size.....: 2800it [01:24, 33.27it/s]


In [8]:
def make_df(features, emotions, actors, sz):
    data = []
    
    for i, feature in tqdm(enumerate(features), desc='Make dataframe.....'):
        mfcc_feature = np.zeros((39, sz))
        mfcc_feature = feature[:,:sz]
        
        data.append({
            'feature': mfcc_feature,
            'emotion': emotions[i],
            'actor': actors[i]
        })
    
    return pd.DataFrame(data)    

In [9]:
df = make_df(features, emotions, actors, min(sz))

Make dataframe.....: 2800it [00:00, 350025.37it/s]


In [10]:
def get_fold(fold, df):
    """
    Divide the data in train and test in a subject-wise 5-CV way. The division is generated before running the training
    of each fold.
    :param fold:[int] Fold to create the train and test sets [ranging from 0 - 4]
    :param df:[DataFrame] Dataframe with the complete list of files generated
    :param save_path:[str] Path to save the train.csv and test.csv per fold
    """
    
    actors_per_fold = {
        0: ["OAF"],
        1: ["YAF"]
    }

    df = df.loc[df['actor'].isin(actors_per_fold[fold])]
    
    X = np.array([data for data in df['feature']])
    y = np.array([data for data in df['emotion']])
    
    return np.array(X), np.array(y)

In [11]:
X_dataset = []
y_dataset = []
for fold in range(2):
    X, y = get_fold(fold, df)
    X_dataset.append(X)
    y_dataset.append(y)

In [12]:
X = X_dataset[0]
y = y_dataset[0]
for fold in range(1, 2):
    X = np.concatenate((X, X_dataset[fold]), axis=0)
    y = np.concatenate((y, y_dataset[fold]), axis=0)

X = np.expand_dims(X, -1)
print(X.shape, y.shape)

os.makedirs('dataset', exist_ok=True)
with open('dataset/TESS.npy', 'wb') as f:
    np.save(f, X)
    np.save(f, y)

(2800, 39, 101, 1) (2800,)
