In [1]:
from pathlib import Path
from tqdm import tqdm
import csv
import os
import librosa
import librosa.display
import matplotlib.pyplot as plt
import numpy as np
import warnings
from natsort import natsorted
import pandas as pd

# Silence every warning for the rest of the session: no category or module
# filter is given, so deprecation and numerical warnings are hidden as well.
warnings.filterwarnings(action='ignore')

In [2]:
def prepare_RAVDESS_DS(path_audios):
    """
    Collect the wav files found one folder level below ``path_audios`` together
    with the emotion label and actor id encoded in each file name.

    File names are read as dash-separated integer fields
    (e.g. ``03-01-05-01-01-01-12.wav``): the third field is the 1-based emotion
    code and the last field is the actor id.

    :param path_audios:[str or Path] Root folder; files are matched with ``*/*.wav``
    :return:[tuple] Three parallel lists ``(wav_paths, emotions, actors)`` holding
        the Path of every file, its zero-based emotion label and its actor id.
        Entries are ordered by sorted path so the result is the same on every
        file system.
    :raises ValueError, IndexError: if a file name does not follow the pattern
    """
    wav_paths, emotions, actors = [], [], []
    # glob() yields entries in file-system order; sort for a reproducible result.
    for path in tqdm(sorted(Path(path_audios).glob("*/*.wav"))):
        # Path.name is the last path component whatever the OS separator is,
        # so parent folders containing '-' or '\\' cannot corrupt the parsing.
        fields = path.name.split('.')[0].split("-")
        label = int(fields[2]) - 1  # Start emotions in 0
        actor = int(fields[-1])

        wav_paths.append(path)
        emotions.append(label)
        actors.append(actor)

    return wav_paths, emotions, actors

In [3]:
# Index every wav file under ./dataset: parallel lists of file path,
# zero-based emotion label and actor id.
paths, emotions, actors = prepare_RAVDESS_DS('dataset')

1440it [00:00, 72000.07it/s]


In [4]:
from audiomentations import Compose, AddGaussianNoise, TimeStretch, PitchShift, Shift

# Waveform augmentation pipeline used to create the extra training samples.
# Every transform is configured with p=0.5, so a given call may apply any
# subset of them (including none).
# NOTE(review): no random seed is set in the visible cells, so two calls on the
# same clip are not expected to return the same result — confirm if
# reproducibility is required.
augment = Compose([
    AddGaussianNoise(min_amplitude=0.001, max_amplitude=0.015, p=0.5),
    TimeStretch(min_rate=0.8, max_rate=1.25, p=0.5),
    PitchShift(min_semitones=-4, max_semitones=4, p=0.5),
    Shift(min_fraction=-0.5, max_fraction=0.5, p=0.5),
])

In [5]:
# Frame counts collected by size_check(); one entry per processed file.
sz = []
def size_check(paths):
    """
    Record, for every audio file, the larger mel-spectrogram width (in frames)
    of the clean clip and of one augmented copy of it.

    One integer per file is appended to the module-level list ``sz``; nothing is
    returned. The spectrograms themselves are not computed: with librosa's
    default centred framing a signal of ``n`` samples yields
    ``1 + n // hop_length`` frames, so the width follows from the sample count.

    :param paths:[iterable] Audio files to measure (anything librosa.load accepts)
    """
    hop_length = 256  # same hop as the melspectrogram calls in save_melspectrogram

    for path in tqdm(paths, desc='melspectrogram image generate.....'):
        y, _ = librosa.load(path, sr=16000)
        # The augmentation is still drawn so its effect on the clip length is
        # taken into account exactly as before.
        augmented_y = augment(samples=y, sample_rate=16000)

        longest = max(len(y), len(augmented_y))
        sz.append(1 + longest // hop_length)

In [6]:
# Fill the global `sz` list with one spectrogram width per audio file.
size_check(paths)

melspectrogram image generate.....: 1440it [02:46,  8.64it/s]


In [7]:
def _padded_log_mel(samples, sr, width, pad_value=0.0):
    """Return the dB-scaled mel spectrogram of ``samples`` padded or cropped to ``width`` frames."""
    power = librosa.feature.melspectrogram(y=samples, sr=sr, n_fft=1024, win_length=512, window='hamming', hop_length=256, n_mels=256, fmax=sr/2)
    log_mel = librosa.power_to_db(power, ref=np.max)

    # Row count is taken from the spectrogram itself instead of repeating n_mels.
    canvas = np.full((log_mel.shape[0], width), pad_value, dtype=np.float64)
    # Crop when the spectrogram is wider than the canvas; a plain slice
    # assignment would raise a broadcasting ValueError in that case.
    keep = min(log_mel.shape[1], width)
    canvas[:, :keep] = log_mel[:, :keep]
    return canvas


def save_melspectrogram(paths, emotions, actors, sz, pad_value=0.0):
    """
    Build the fixed-width log-mel feature tables for the clean and the
    augmented data. Despite the name, nothing is written to disk.

    Every file is loaded at 16 kHz, augmented once with ``augment``, and both
    versions are converted to dB-scaled mel spectrograms that are right-padded
    (or cropped) to ``sz`` frames.

    :param paths:[iterable] Audio files to process
    :param emotions:[sequence] Emotion label of each file, indexed like ``paths``
    :param actors:[sequence] Actor id of each file, indexed like ``paths``
    :param sz:[int] Number of frames (columns) of every stored spectrogram
    :param pad_value:[float] Fill value for the padded frames. The default 0.0
        keeps the historical behaviour. Note that with ``ref=np.max`` the loudest
        bin sits at 0 dB, so 0.0 pads with peak level; pass a floor such as
        -80.0 (librosa's default ``top_db``) to pad with silence instead.
    :return:[tuple] ``(df, df_aug)``: ``df`` has one row per file with the clean
        spectrogram; ``df_aug`` has two rows per file (clean, then augmented).
        Columns are 'melspectrogram_feature', 'emotion' and 'actor'.
    """
    data = []
    data_aug = []
    for i, path in tqdm(enumerate(paths), desc='melspectrogram image generate.....'):
        y, sr = librosa.load(path, sr=16000)
        # Drawn independently of any earlier augmentation pass, so the result
        # may be longer than the width measured beforehand; see the cropping
        # in _padded_log_mel.
        augmented_y = augment(samples=y, sample_rate=16000)

        origin = _padded_log_mel(y, sr, sz, pad_value)
        aug = _padded_log_mel(augmented_y, sr, sz, pad_value)

        labels = {'emotion': emotions[i], 'actor': actors[i]}

        data.append({'melspectrogram_feature': origin, **labels})

        # The augmented table keeps the clean sample and adds its augmented twin.
        data_aug.append({'melspectrogram_feature': origin, **labels})
        data_aug.append({'melspectrogram_feature': aug, **labels})

    df = pd.DataFrame(data)
    df_aug = pd.DataFrame(data_aug)
    return df, df_aug

In [8]:
# Build the clean and augmented feature tables; every spectrogram is padded to
# the largest width recorded in `sz`.
df, df_aug = save_melspectrogram(paths, emotions, actors, max(sz))

melspectrogram image generate.....: 1440it [02:42,  8.84it/s]


In [9]:
from torchaudio.functional import compute_deltas
import torch

def get_deltas(feature):
    """
    Stack a feature array with its first- and second-order deltas.

    A new axis is inserted at position 1, the result is converted to a torch
    tensor, and ``compute_deltas`` is applied once to the features and once more
    to its own output. The three tensors are concatenated along that new axis.

    :param feature:[array-like] Batch of features; the caller in this notebook
        passes an array of stacked spectrograms
    :return:[torch.Tensor] Tensor with size 3 on axis 1, ordered
        (features, delta, delta-delta)
    """
    base = torch.Tensor(np.expand_dims(feature, 1))

    first_order = compute_deltas(base)
    second_order = compute_deltas(first_order)

    return torch.cat((base, first_order, second_order), dim=1)

In [14]:
def generate_train_test(fold, df, df_aug, save_path=""):
    """
    Divide the data in train and test in a subject-wise 5-CV way and store the
    resulting arrays. The division is generated before running the training of
    each fold.

    The test set is taken from ``df`` (clean samples of the held-out actors) and
    the train set from ``df_aug`` (all samples of the remaining actors). Both
    feature sets are passed through ``get_deltas`` before being written.

    :param fold:[int] Fold to create the train and test sets [ranging from 0 - 4]
    :param df:[DataFrame] Clean samples with columns 'melspectrogram_feature',
        'emotion' and 'actor'
    :param df_aug:[DataFrame] Clean plus augmented samples, same columns as ``df``
    :param save_path:[str] Output path without extension; '.npy' is appended.
        The file holds four consecutive arrays (X_train, y_train, X_test,
        y_test) and must be read back with four successive ``np.load`` calls on
        the same open file handle. Missing parent folders are created.
    :raises KeyError: if ``fold`` is not one of 0-4
    """

    actors_per_fold = {
        0: [2, 5, 14, 15, 16],
        1: [3, 6, 7, 13, 18],
        2: [10, 11, 12, 19, 20],
        3: [8, 17, 21, 23, 24],
        4: [1, 4, 9, 22],
    }
    held_out = actors_per_fold[fold]

    test_df = df.loc[df['actor'].isin(held_out)].reset_index(drop=True)
    train_df = df_aug.loc[~df_aug['actor'].isin(held_out)].reset_index(drop=True)

    X_train = np.array(list(train_df['melspectrogram_feature']))
    y_train = np.array(list(train_df['emotion']))
    X_test = np.array(list(test_df['melspectrogram_feature']))
    y_test = np.array(list(test_df['emotion']))

    X_train = get_deltas(X_train)
    X_test = get_deltas(X_test)

    out_file = save_path + '.npy'
    # open() does not create folders; without this a fresh checkout fails with
    # FileNotFoundError on the first fold.
    out_dir = os.path.dirname(out_file)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    with open(out_file, 'wb') as f:
        np.save(f, X_train)
        np.save(f, y_train)
        np.save(f, X_test)
        np.save(f, y_test)

    print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

In [15]:
import time

# Write the train/test arrays for each of the five subject-wise folds.
for fold in range(5):
    
    # Output path without extension; generate_train_test appends '.npy'.
    save_path = os.path.join('5-CV-augment', "fold"+str(fold))
    
    generate_train_test(fold, df, df_aug, save_path)
    # NOTE(review): the reason for this 5 s pause is not evident from the
    # code — confirm it is still needed.
    time.sleep(5)

torch.Size([2280, 3, 256, 330]) (2280,) torch.Size([300, 3, 256, 330]) (300,)
torch.Size([2280, 3, 256, 330]) (2280,) torch.Size([300, 3, 256, 330]) (300,)
torch.Size([2280, 3, 256, 330]) (2280,) torch.Size([300, 3, 256, 330]) (300,)
torch.Size([2280, 3, 256, 330]) (2280,) torch.Size([300, 3, 256, 330]) (300,)
torch.Size([2400, 3, 256, 330]) (2400,) torch.Size([240, 3, 256, 330]) (240,)
