In [4]:
from pathlib import Path
from tqdm import tqdm
import csv
import os
import librosa
import librosa.display
import matplotlib.pyplot as plt
import numpy as np
import warnings
from natsort import natsorted
import pandas as pd

warnings.filterwarnings(action='ignore')

In [5]:
def prepare_RAVDESS_DS(path_audios):
    """
    Generation of the dataframe with the information of the dataset. The dataframe has the following structure:
     ______________________________________________________________________________________________________________________________
    |             name            |                     path                                   |     emotion      |     actor     |
    ______________________________________________________________________________________________________________________________
    |  01-01-01-01-01-01-01.wav   |    <RAVDESS_dir>/audios_16kHz/01-01-01-01-01-01-01.wav     |     Neutral      |     1         |
    ______________________________________________________________________________________________________________________________
    ...
    :param path_audios: Path to the folder that contains all the audios in .wav format, 16kHz and single-channel(mono)
    """
    dict_emotions = {
        0: 'Neutral',
        1: 'Calm',
        2: 'Happy',
        3: 'Sad',
        4: 'Angry',
        5: 'Fear',
        6: 'Disgust',
        7: 'Surprise'
    }
    
    wav_paths, emotions, names = [], [], []
    for path in tqdm(Path(path_audios).glob("*/*.wav")):
        name = str(path).split('/')[-1].split('.')[0]
        actor = int(name.split("-")[-1])
        label = int(name.split("-")[2]) - 1  # Start emotions in 0

        try:
            wav_paths.append(path)
            emotions.append(label)
            names.append(actor)
        except Exception as e:
            # print(str(path), e)
            pass
        
    return wav_paths, emotions, names

In [6]:
paths, emotions, actors = prepare_RAVDESS_DS('only_speech')

1440it [00:00, 43288.90it/s]


In [9]:
size = []

def save_melspectrogram(paths, emotions, actors):
    data = []
    for i, path in tqdm(enumerate(paths), desc='melspectrogram image generate.....'):
        print(path)
        y, sr = librosa.load(path, sr=16000)
        
        S = librosa.feature.melspectrogram(y=y, sr=sr, n_fft=1024, win_length=512, window='hamming', hop_length=256, n_mels=256, fmax=sr/2)
        melspectrogram = librosa.power_to_db(S, ref=np.max)
        
        size.append(melspectrogram.shape[1])
        
        temp = np.zeros((256, 562))
        temp[:,:melspectrogram.shape[1]] = melspectrogram
        
        data.append({
            'melspectrogram_feature': temp,
            'emotion': emotions[i],
            'actor': actors[i]
        })
        
    df = pd.DataFrame(data)
    return df

In [10]:
save_melspectrogram(paths, emotions, actors)
max(size)

melspectrogram image generate.....: 4it [00:00, 32.56it/s]

only_speech\Actor_01\03-01-01-01-01-01-01.wav
only_speech\Actor_01\03-01-01-01-01-02-01.wav
only_speech\Actor_01\03-01-01-01-02-01-01.wav
only_speech\Actor_01\03-01-01-01-02-02-01.wav
only_speech\Actor_01\03-01-02-01-01-01-01.wav
only_speech\Actor_01\03-01-02-01-01-02-01.wav
only_speech\Actor_01\03-01-02-01-02-01-01.wav


melspectrogram image generate.....: 12it [00:00, 29.82it/s]

only_speech\Actor_01\03-01-02-01-02-02-01.wav
only_speech\Actor_01\03-01-02-02-01-01-01.wav
only_speech\Actor_01\03-01-02-02-01-02-01.wav
only_speech\Actor_01\03-01-02-02-02-01-01.wav
only_speech\Actor_01\03-01-02-02-02-02-01.wav


melspectrogram image generate.....: 16it [00:00, 30.83it/s]

only_speech\Actor_01\03-01-03-01-01-01-01.wav
only_speech\Actor_01\03-01-03-01-01-02-01.wav
only_speech\Actor_01\03-01-03-01-02-01-01.wav
only_speech\Actor_01\03-01-03-01-02-02-01.wav
only_speech\Actor_01\03-01-03-02-01-01-01.wav
only_speech\Actor_01\03-01-03-02-01-02-01.wav
only_speech\Actor_01\03-01-03-02-02-01-01.wav


melspectrogram image generate.....: 24it [00:00, 31.46it/s]

only_speech\Actor_01\03-01-03-02-02-02-01.wav
only_speech\Actor_01\03-01-04-01-01-01-01.wav
only_speech\Actor_01\03-01-04-01-01-02-01.wav
only_speech\Actor_01\03-01-04-01-02-01-01.wav
only_speech\Actor_01\03-01-04-01-02-02-01.wav
only_speech\Actor_01\03-01-04-02-01-01-01.wav
only_speech\Actor_01\03-01-04-02-01-02-01.wav


melspectrogram image generate.....: 32it [00:01, 30.99it/s]

only_speech\Actor_01\03-01-04-02-02-01-01.wav
only_speech\Actor_01\03-01-04-02-02-02-01.wav
only_speech\Actor_01\03-01-05-01-01-01-01.wav
only_speech\Actor_01\03-01-05-01-01-02-01.wav
only_speech\Actor_01\03-01-05-01-02-01-01.wav
only_speech\Actor_01\03-01-05-01-02-02-01.wav
only_speech\Actor_01\03-01-05-02-01-01-01.wav
only_speech\Actor_01\03-01-05-02-01-02-01.wav
only_speech\Actor_01\03-01-05-02-02-01-01.wav
only_speech\Actor_01\03-01-05-02-02-02-01.wav


melspectrogram image generate.....: 40it [00:01, 27.65it/s]

only_speech\Actor_01\03-01-06-01-01-01-01.wav
only_speech\Actor_01\03-01-06-01-01-02-01.wav
only_speech\Actor_01\03-01-06-01-02-01-01.wav
only_speech\Actor_01\03-01-06-01-02-02-01.wav
only_speech\Actor_01\03-01-06-02-01-01-01.wav


melspectrogram image generate.....: 43it [00:01, 24.03it/s]

only_speech\Actor_01\03-01-06-02-01-02-01.wav
only_speech\Actor_01\03-01-06-02-02-01-01.wav
only_speech\Actor_01\03-01-06-02-02-02-01.wav
only_speech\Actor_01\03-01-07-01-01-01-01.wav
only_speech\Actor_01\03-01-07-01-01-02-01.wav


melspectrogram image generate.....: 49it [00:01, 23.82it/s]

only_speech\Actor_01\03-01-07-01-02-01-01.wav
only_speech\Actor_01\03-01-07-01-02-02-01.wav
only_speech\Actor_01\03-01-07-02-01-01-01.wav
only_speech\Actor_01\03-01-07-02-01-02-01.wav
only_speech\Actor_01\03-01-07-02-02-01-01.wav


melspectrogram image generate.....: 56it [00:02, 27.26it/s]

only_speech\Actor_01\03-01-07-02-02-02-01.wav
only_speech\Actor_01\03-01-08-01-01-01-01.wav
only_speech\Actor_01\03-01-08-01-01-02-01.wav
only_speech\Actor_01\03-01-08-01-02-01-01.wav
only_speech\Actor_01\03-01-08-01-02-02-01.wav
only_speech\Actor_01\03-01-08-02-01-01-01.wav
only_speech\Actor_01\03-01-08-02-01-02-01.wav
only_speech\Actor_01\03-01-08-02-02-01-01.wav


melspectrogram image generate.....: 64it [00:02, 31.35it/s]

only_speech\Actor_01\03-01-08-02-02-02-01.wav
only_speech\Actor_02\03-01-01-01-01-01-02.wav
only_speech\Actor_02\03-01-01-01-01-02-02.wav
only_speech\Actor_02\03-01-01-01-02-01-02.wav
only_speech\Actor_02\03-01-01-01-02-02-02.wav
only_speech\Actor_02\03-01-02-01-01-01-02.wav
only_speech\Actor_02\03-01-02-01-01-02-02.wav


melspectrogram image generate.....: 68it [00:02, 30.86it/s]

only_speech\Actor_02\03-01-02-01-02-01-02.wav
only_speech\Actor_02\03-01-02-01-02-02-02.wav
only_speech\Actor_02\03-01-02-02-01-01-02.wav
only_speech\Actor_02\03-01-02-02-01-02-02.wav
only_speech\Actor_02\03-01-02-02-02-01-02.wav
only_speech\Actor_02\03-01-02-02-02-02-02.wav


melspectrogram image generate.....: 76it [00:02, 29.84it/s]

only_speech\Actor_02\03-01-03-01-01-01-02.wav
only_speech\Actor_02\03-01-03-01-01-02-02.wav
only_speech\Actor_02\03-01-03-01-02-01-02.wav
only_speech\Actor_02\03-01-03-01-02-02-02.wav
only_speech\Actor_02\03-01-03-02-01-01-02.wav
only_speech\Actor_02\03-01-03-02-01-02-02.wav


melspectrogram image generate.....: 80it [00:02, 29.07it/s]

only_speech\Actor_02\03-01-03-02-02-01-02.wav
only_speech\Actor_02\03-01-03-02-02-02-02.wav
only_speech\Actor_02\03-01-04-01-01-01-02.wav
only_speech\Actor_02\03-01-04-01-01-02-02.wav
only_speech\Actor_02\03-01-04-01-02-01-02.wav
only_speech\Actor_02\03-01-04-01-02-02-02.wav


melspectrogram image generate.....: 88it [00:03, 30.77it/s]

only_speech\Actor_02\03-01-04-02-01-01-02.wav
only_speech\Actor_02\03-01-04-02-01-02-02.wav
only_speech\Actor_02\03-01-04-02-02-01-02.wav
only_speech\Actor_02\03-01-04-02-02-02-02.wav
only_speech\Actor_02\03-01-05-01-01-01-02.wav
only_speech\Actor_02\03-01-05-01-01-02-02.wav
only_speech\Actor_02\03-01-05-01-02-01-02.wav


melspectrogram image generate.....: 96it [00:03, 30.08it/s]

only_speech\Actor_02\03-01-05-01-02-02-02.wav
only_speech\Actor_02\03-01-05-02-01-01-02.wav
only_speech\Actor_02\03-01-05-02-01-02-02.wav
only_speech\Actor_02\03-01-05-02-02-01-02.wav
only_speech\Actor_02\03-01-05-02-02-02-02.wav
only_speech\Actor_02\03-01-06-01-01-01-02.wav


melspectrogram image generate.....: 100it [00:03, 28.72it/s]

only_speech\Actor_02\03-01-06-01-01-02-02.wav
only_speech\Actor_02\03-01-06-01-02-01-02.wav
only_speech\Actor_02\03-01-06-01-02-02-02.wav
only_speech\Actor_02\03-01-06-02-01-01-02.wav





ValueError: Input signal length=0 is too small to resample from 48000->16000

In [5]:
df = save_melspectrogram(paths, emotions, actors)

melspectrogram image generate.....: 535it [00:02, 236.00it/s]


In [6]:
from torchaudio.functional import compute_deltas
import torch

def get_deltas(feature):
    f = np.expand_dims(feature, 1)
    f = torch.Tensor(f)
    
    delta = compute_deltas(f)
    delta2 = compute_deltas(delta)
    
    ret = torch.cat([f, delta, delta2], dim=1)
    
    return ret

In [7]:
def generate_train_test(fold, df, save_path=""):
    """
    Divide the data in train and test in a subject-wise 5-CV way. The division is generated before running the training
    of each fold.
    :param fold:[int] Fold to create the train and test sets [ranging from 0 - 4]
    :param df:[DataFrame] Dataframe with the complete list of files generated
    :param save_path:[str] Path to save the train.csv and test.csv per fold
    """
    
    actors_per_fold = {
        0: [2,5,14,15,16],
        1: [3, 6, 7, 13, 18],
        2: [10, 11, 12, 19, 20],
        3: [8, 17, 21, 23, 24],
        4: [1, 4, 9, 22],
    }

    test_df = df.loc[df['actor'].isin(actors_per_fold[fold])]
    train_df = df.loc[~df['actor'].isin(actors_per_fold[fold])]

    train_df = train_df.reset_index(drop=True)
    test_df = test_df.reset_index(drop=True)
    
    X_train = np.array([data for data in train_df['melspectrogram_feature']])
    y_train = np.array([data for data in train_df['emotion']])
    X_test = np.array([data for data in test_df['melspectrogram_feature']])
    y_test = np.array([data for data in test_df['emotion']])
    
    X_train = get_deltas(X_train)
    X_test = get_deltas(X_test)
    
    with open(save_path+'.npy', 'wb') as f:
        np.save(f, X_train)
        np.save(f, y_train)
        np.save(f, X_test)
        np.save(f, y_test)
        
    print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

In [8]:
import time

for fold in range(5):
    
    save_path = os.path.join('5-CV-only-speech', "fold"+str(fold))
    os.makedirs(save_path, exist_ok=True)
    
    generate_train_test(fold, df, save_path)
    time.sleep(10)

torch.Size([428, 3, 256, 562]) (428,) torch.Size([107, 3, 256, 562]) (107,)
torch.Size([454, 3, 256, 562]) (454,) torch.Size([81, 3, 256, 562]) (81,)
torch.Size([445, 3, 256, 562]) (445,) torch.Size([90, 3, 256, 562]) (90,)
torch.Size([405, 3, 256, 562]) (405,) torch.Size([130, 3, 256, 562]) (130,)
torch.Size([408, 3, 256, 562]) (408,) torch.Size([127, 3, 256, 562]) (127,)
