### Audio data augmentation using [audiomentations](https://github.com/iver56/audiomentations)

In [1]:
import os
import random
import time
from pathlib import Path

import librosa
import librosa.display as librosa_display
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
from audiomentations import Compose, AddGaussianNoise, TimeStretch, PitchShift, Shift
from torchaudio.functional import compute_deltas
from tqdm import tqdm

In [2]:
def prepare_RAVDESS_DS(path_audios):
    """Collect the ``.wav`` files located one directory level below ``path_audios``.

    The part of each file name before the first ``.`` is split on ``-``; the
    third field (minus one, so labels start at 0) is used as the emotion label
    and the last field as the actor id.

    Files whose name cannot be parsed that way are reported and skipped instead
    of aborting the whole scan.

    :param path_audios: [str | Path] Root directory; files are matched with ``*/*.wav``
    :return: ``(wav_paths, emotions, actors)`` — three parallel lists
    """
    wav_paths, emotions, actors = [], [], []

    # glob() yields entries in a filesystem-dependent order; sorting keeps the
    # dataset order (and everything derived from it) stable across machines.
    for path in tqdm(sorted(Path(path_audios).glob("*/*.wav"))):
        # path.name is separator-agnostic, unlike splitting str(path) on '/'.
        fields = path.name.split('.')[0].split("-")

        try:
            label = int(fields[2]) - 1  # Start emotions in 0
            actor = int(fields[-1])
        except (IndexError, ValueError) as e:
            print(f"Skipping {path}: unexpected file name ({e})")
            continue

        wav_paths.append(path)
        emotions.append(label)
        actors.append(actor)

    return wav_paths, emotions, actors

In [3]:
# Index every clip found under ./only_speech: file path, emotion label, actor id.
wav_paths, emotions, actors = prepare_RAVDESS_DS(path_audios='only_speech')

1440it [00:00, 57794.34it/s]


In [4]:
# Sampling rate (Hz) passed to librosa.load and to the spectrogram/augmentation calls.
sample_rate = 16000

# Seed the global RNGs so the random augmentations are repeatable under
# "Restart Kernel -> Run All".
# NOTE(review): assumes audiomentations draws from Python's `random` module and
# NumPy's global RNG — confirm for the installed version.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

# Each transform is applied independently with probability p=0.5.
augment = Compose([
    AddGaussianNoise(min_amplitude=0.001, max_amplitude=0.015, p=0.5),
    TimeStretch(min_rate=0.8, max_rate=1.25, p=0.5),
    PitchShift(min_semitones=-4, max_semitones=4, p=0.5),
    Shift(min_fraction=-0.5, max_fraction=0.5, p=0.5),
])

In [5]:
def get_size(paths, hop_length=128):
    """Return the largest mel-spectrogram frame count over all files in ``paths``.

    Every file is loaded at the notebook-level ``sample_rate`` and turned into a
    224-band mel spectrogram; only its number of columns (time frames) is kept.

    :param paths: iterable of audio file paths accepted by ``librosa.load``
    :param hop_length: [int] Hop size in samples; must match the value used when
        the features are later extracted, otherwise the returned width is wrong
    :return: [int] Maximum number of frames, or 0 when ``paths`` is empty
    """
    frame_counts = []

    for path in tqdm(paths, desc='get size.....'):
        y, _ = librosa.load(path, sr=sample_rate)

        S = librosa.feature.melspectrogram(
            y=y, sr=sample_rate, n_fft=512, win_length=400, window='hamming',
            hop_length=hop_length, n_mels=224, fmax=sample_rate/2,
        )
        # The dB conversion does not change the shape, so it is not needed here.
        frame_counts.append(S.shape[1])

    # default=0 keeps an empty input from raising ValueError.
    return max(frame_counts, default=0)

In [6]:
# Widest spectrogram (in frames) in the dataset; used as the common padded width.
sz = get_size(paths=wav_paths)

  return f(*args, **kwargs)
get size.....: 1440it [01:01, 23.48it/s]


In [7]:
def _padded_log_mel(y, sz, hop_length):
    """Return the dB-scaled 224-band mel spectrogram of ``y`` in a ``(224, sz)`` array."""
    S = librosa.feature.melspectrogram(
        y=y, sr=sample_rate, n_fft=512, win_length=400, window='hamming',
        hop_length=hop_length, n_mels=224, fmax=sample_rate/2,
    )
    log_mel = librosa.power_to_db(S, ref=np.max)

    # NOTE(review): with ref=np.max the dB values are presumably <= 0, so the
    # zero padding equals the peak level rather than silence — confirm intended.
    padded = np.zeros((224, sz))
    # Truncate instead of failing with a broadcast error if a signal yields
    # more frames than the pre-computed width.
    n_frames = min(log_mel.shape[1], sz)
    padded[:, :n_frames] = log_mel[:, :n_frames]
    return padded


def feature_augment(paths, emotions, actors, sz, hop_length=128):
    """Build a DataFrame of log-mel features for every clip and one augmented copy.

    For each path two rows are emitted, in this order: the feature of the loaded
    signal, then the feature of the signal after passing through the
    notebook-level ``augment`` pipeline. Both rows carry the clip's emotion and
    actor. Both spectrograms use the same ``hop_length`` so their time axes are
    comparable and agree with the width measured by ``get_size``.

    :param paths: sequence of audio file paths
    :param emotions: sequence of labels, indexed in parallel with ``paths``
    :param actors: sequence of actor ids, indexed in parallel with ``paths``
    :param sz: [int] Number of time frames every feature is padded/truncated to
    :param hop_length: [int] Hop size in samples for both spectrograms
    :return: [DataFrame] Columns ``feature`` ((224, sz) array), ``emotion``, ``actor``
    """
    records = []

    for i, path in enumerate(tqdm(paths, desc='audio augmentation.....')):
        y, _ = librosa.load(path, sr=sample_rate)

        # Extract the clean feature before augmenting, as in the original flow.
        records.append({
            'feature': _padded_log_mel(y, sz, hop_length),
            'emotion': emotions[i],
            'actor': actors[i]
        })

        augmented_samples = augment(samples=y, sample_rate=sample_rate)
        records.append({
            'feature': _padded_log_mel(augmented_samples, sz, hop_length),
            'emotion': emotions[i],
            'actor': actors[i]
        })

    return pd.DataFrame(records)

In [8]:
# One row per clip plus one row per augmented copy, all padded to `sz` frames.
df = feature_augment(paths=wav_paths, emotions=emotions, actors=actors, sz=sz)

audio augmentation.....: 1440it [01:22, 17.39it/s]


In [9]:
def get_deltas(feature):
    """Stack a feature batch with its first- and second-order deltas.

    A new axis is inserted at position 1, the data is converted to a
    ``torch.Tensor``, and ``compute_deltas`` is applied once to the features and
    once more to that result. The three tensors are concatenated along axis 1.

    :param feature: array-like batch of features (the notebook passes a 3-D NumPy array)
    :return: [torch.Tensor] Features, deltas and delta-deltas joined on axis 1
    """
    # Insert the axis that the three variants will later be concatenated on.
    base = torch.Tensor(np.asarray(feature)[:, np.newaxis])

    first_order = compute_deltas(base)
    second_order = compute_deltas(first_order)

    return torch.cat((base, first_order, second_order), dim=1)

In [10]:
def generate_train_test(fold, df, save_path=""):
    """
    Split the data subject-wise for one fold of a 5-fold cross-validation and store it.

    Rows whose actor belongs to the fold's held-out group form the test set; all
    other rows form the training set. Features are passed through ``get_deltas``.
    The four arrays are written one after another, in the order X_train, y_train,
    X_test, y_test, into the single file ``save_path + '.npy'``, and their shapes
    are printed.

    :param fold:[int] Fold to create the train and test sets [ranging from 0 - 4]
    :param df:[DataFrame] Dataframe with 'feature', 'emotion' and 'actor' columns
    :param save_path:[str] Output file path without the '.npy' extension
    """
    # Held-out actors per fold; the last group has four actors, the others five.
    held_out_actors = {
        0: [2, 5, 14, 15, 16],
        1: [3, 6, 7, 13, 18],
        2: [10, 11, 12, 19, 20],
        3: [8, 17, 21, 23, 24],
        4: [1, 4, 9, 22],
    }[fold]

    in_test = df['actor'].isin(held_out_actors)
    test_df = df.loc[in_test].reset_index(drop=True)
    train_df = df.loc[~in_test].reset_index(drop=True)

    def to_arrays(frame):
        # Stack the per-row feature arrays into one batch, then add the deltas.
        features = np.array(list(frame['feature']))
        labels = np.array(list(frame['emotion']))
        return get_deltas(features), labels

    X_train, y_train = to_arrays(train_df)
    X_test, y_test = to_arrays(test_df)

    # Several arrays are appended to the same file; read them back with
    # consecutive np.load calls on one open handle, in the same order.
    with open(save_path+'.npy', 'wb') as f:
        for array in (X_train, y_train, X_test, y_test):
            np.save(f, array)

    print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

In [11]:
# generate_train_test writes to save_path + '.npy', i.e. a file placed directly
# inside this directory, so only the directory itself has to exist. Creating
# save_path as a directory would just leave an empty foldN/ folder next to foldN.npy.
output_dir = '5-CV-augment'
os.makedirs(output_dir, exist_ok=True)

for fold in range(5):
    save_path = os.path.join(output_dir, "fold"+str(fold))

    generate_train_test(fold, df, save_path)
    # NOTE(review): the purpose of this pause is not evident from the notebook;
    # kept unchanged — confirm whether it is still needed.
    time.sleep(10)

torch.Size([2280, 3, 224, 614]) (2280,) torch.Size([600, 3, 224, 614]) (600,)
torch.Size([2280, 3, 224, 614]) (2280,) torch.Size([600, 3, 224, 614]) (600,)
torch.Size([2280, 3, 224, 614]) (2280,) torch.Size([600, 3, 224, 614]) (600,)
torch.Size([2280, 3, 224, 614]) (2280,) torch.Size([600, 3, 224, 614]) (600,)
torch.Size([2400, 3, 224, 614]) (2400,) torch.Size([480, 3, 224, 614]) (480,)


: 