In [1]:
import librosa
import librosa.display as librosa_display
import matplotlib.pyplot as plt
import numpy as np
from tqdm import tqdm
from pathlib import Path
import pandas as pd
import os
import time
from torchaudio.functional import compute_deltas
import torch

In [2]:
def prepare_CREMA_DS(path_audios):
    """Index the ``.wav`` files of a CREMA-D style folder.

    File names are split on ``_``: the first field is the actor id and the
    third field is the emotion code.  Both are parsed from the file *stem*
    only, so underscores in the directory part of the path cannot shift the
    field positions.

    :param path_audios: [str or Path] Directory that directly contains the ``.wav`` files
    :return: ``(wav_paths, emotions, actors)`` -- three parallel lists holding the
        ``Path`` of each file, its integer emotion label (``None`` when the code
        is not one of SAD/ANG/DIS/FEA/HAP/NEU) and its zero-based actor index
        (actor id minus 1001).
    """
    emotion_ids = {'SAD': 0, 'ANG': 1, 'DIS': 2, 'FEA': 3, 'HAP': 4, 'NEU': 5}

    wav_paths, emotions, actors = [], [], []

    # glob() yields files in filesystem order; sorting keeps the sample order
    # (and therefore everything derived from it) reproducible across machines.
    for path in tqdm(sorted(Path(path_audios).glob("*.wav"))):
        fields = path.stem.split('_')

        wav_paths.append(path)
        # Unknown emotion codes are kept as None rather than dropped, so the
        # three lists stay aligned.
        emotions.append(emotion_ids.get(fields[2]))
        actors.append(int(fields[0]) - 1001)

    return wav_paths, emotions, actors

In [3]:
# Index every .wav file found directly inside the 'CREMA-D' folder (path is
# relative to the notebook's working directory).
wav_paths, emotions, actors = prepare_CREMA_DS('CREMA-D')

7442it [00:00, 225515.20it/s]


In [4]:
# Sanity check on the zero-based actor indices (file-name actor id minus 1001).
max(actors), min(actors)

(90, 0)

In [5]:
# Number of audio files that were indexed.
len(wav_paths)

7442

In [6]:
# Sampling rate handed to librosa.load / melspectrogram in get_features.
sample_rate = 16000

# Analysis window and hop, in seconds. get_features multiplies them by
# sample_rate to get n_fft (0.05 * 16000 = 800 samples) and hop_length
# (0.0125 * 16000 = 200 samples).
frame_length = 0.05
frame_stride = 0.0125

In [7]:
def get_features(paths, n_mels=128, n_mfcc=39):
    """Compute an MFCC matrix for every audio file in ``paths``.

    Each file is loaded at the module-level ``sample_rate``, turned into a mel
    power spectrogram (Hamming window, window/hop sizes derived from the
    module-level ``frame_length`` / ``frame_stride``), converted to dB relative
    to that clip's own maximum, and finally reduced to MFCCs.

    :param paths: iterable of audio file paths accepted by ``librosa.load``
    :param n_mels: [int] number of mel bands of the intermediate spectrogram
    :param n_mfcc: [int] number of MFCC coefficients kept per frame
    :return: ``(data, sz)`` -- ``data`` is a list with one MFCC array per file and
        ``sz`` the matching list of frame counts (``mfcc.shape[1]``).
    """
    # Window and hop sizes in samples are the same for every file, so they are
    # computed once instead of once per iteration.
    input_nfft = int(round(sample_rate * frame_length))
    input_stride = int(round(sample_rate * frame_stride))

    data = []
    sz = []
    # Iterating the paths directly (no enumerate) lets tqdm see the length of
    # a list and display a real progress total.
    for path in tqdm(paths, desc='get features, size.....'):
        y, _ = librosa.load(path, sr=sample_rate)

        S = librosa.feature.melspectrogram(y=y, sr=sample_rate, n_mels=n_mels, n_fft=input_nfft,
                                           window='hamming', hop_length=input_stride)
        log_S = librosa.power_to_db(S, ref=np.max)

        mfcc = librosa.feature.mfcc(S=log_S, n_mfcc=n_mfcc)

        data.append(mfcc)
        sz.append(mfcc.shape[1])

    return data, sz

In [8]:
# Extract one MFCC matrix per file; sz holds the number of frames of each one.
features, sz = get_features(wav_paths)

get features, size.....: 7442it [00:22, 337.12it/s]


In [9]:
def make_df(features, emotions, actors, sz):
    """Bundle fixed-width feature matrices with their labels in a DataFrame.

    Every feature matrix is brought to exactly ``sz`` frames (columns): longer
    ones are truncated, shorter ones are zero-padded on the right, so all rows
    of the result can later be stacked into one array.

    :param features: sequence of 2-D arrays shaped ``(n_coefficients, n_frames)``
    :param emotions: sequence of emotion labels, aligned with ``features``
    :param actors: sequence of actor indices, aligned with ``features``
    :param sz: [int] number of frames every stored feature matrix must have
    :return: [DataFrame] one row per sample with columns ``feature``,
        ``emotion`` and ``actor``.
    """
    data = []

    for i, feature in enumerate(tqdm(features, desc='Make dataframe.....')):
        feature = np.asarray(feature)
        n_frames = min(sz, feature.shape[1])

        # Zero buffer of the target width; copying into it pads short inputs
        # and truncates long ones in one step.
        mfcc_feature = np.zeros((feature.shape[0], sz), dtype=feature.dtype)
        mfcc_feature[:, :n_frames] = feature[:, :n_frames]

        data.append({
            'feature': mfcc_feature,
            'emotion': emotions[i],
            'actor': actors[i]
        })

    return pd.DataFrame(data)

In [10]:
# Using min(sz) as the common width cuts every feature matrix down to the
# frame count of the shortest clip.
df = make_df(features, emotions, actors, min(sz))

Make dataframe.....: 7442it [00:00, 121999.78it/s]


In [11]:
def get_deltas(feature):
    """Stack features with their first- and second-order deltas.

    ``compute_deltas`` differentiates along the LAST axis of its input, so the
    deltas are computed while time is still the last axis and the new channel
    axis is appended afterwards.  (Adding the singleton channel axis first
    would leave a length-1 "time" axis and make every delta exactly zero.)

    :param feature: array-like shaped ``(..., n_coefficients, n_frames)``
    :return: [torch.Tensor] float32 tensor shaped
        ``(..., n_coefficients, n_frames, 3)`` whose last axis holds
        ``[feature, delta, delta-delta]``.
    """
    f = torch.as_tensor(np.asarray(feature), dtype=torch.float32)

    delta = compute_deltas(f, win_length=7)
    delta2 = compute_deltas(delta, win_length=7)

    # New trailing axis of size 3: static, delta and delta-delta channels.
    return torch.stack([f, delta, delta2], dim=-1)

In [12]:
def get_fold(fold, df):
    """
    Return the samples of a single fold of the subject-wise 5-fold split.

    Actors are assigned to folds by index: folds 0-3 hold 18 consecutive actors
    each and fold 4 holds the remaining 19 (actors 72-90).  Only the rows whose
    actor belongs to the requested fold are kept; their features are then
    extended with delta channels through get_deltas.
    :param fold:[int] Fold to extract [ranging from 0 - 4]
    :param df:[DataFrame] Dataframe with 'feature', 'emotion' and 'actor' columns
    :return: Tuple (X, y) of numpy arrays with the fold's features and emotion labels
    """
    # fold index -> (first actor, one past the last actor)
    fold_bounds = {
        0: (0, 18),
        1: (18, 36),
        2: (36, 54),
        3: (54, 72),
        4: (72, 91),
    }
    first_actor, stop_actor = fold_bounds[fold]
    fold_actors = list(range(first_actor, stop_actor))

    in_fold = df['actor'].isin(fold_actors)
    subset = df.loc[in_fold]

    stacked_features = np.array(list(subset['feature']))
    labels = np.array(list(subset['emotion']))

    with_deltas = get_deltas(stacked_features)

    return np.array(with_deltas), np.array(labels)

In [13]:
# Collect the feature and label arrays of each of the five actor-wise folds,
# in fold order.
X_dataset, y_dataset = [], []
for fold in range(5):
    X, y = get_fold(fold, df)
    X_dataset += [X]
    y_dataset += [y]

In [14]:
# Merge the per-fold arrays, in fold order, with a single concatenation
# instead of re-copying the accumulated array once per fold.
X = np.concatenate(X_dataset, axis=0)
y = np.concatenate(y_dataset, axis=0)

# Every sample must still have exactly one label after merging.
assert X.shape[0] == y.shape[0], "features and labels are misaligned"

print(X.shape, y.shape)

# Both arrays are written back to back into the same file: read them with two
# successive np.load calls on one open handle (X first, then y).
os.makedirs('dataset', exist_ok=True)
with open('dataset/CREMA-deltas.npy', 'wb') as f:
    np.save(f, X)
    np.save(f, y)

(7442, 39, 102, 3) (7442,)
