In [1]:
import librosa
import librosa.display as librosa_display
import matplotlib.pyplot as plt
import numpy as np
from tqdm import tqdm
from pathlib import Path
import pandas as pd
import os
import time
from torchaudio.functional import compute_deltas
import torch

In [2]:
def prepare_RAVDESS_DS(path_audios):
    """Index the wav files of a RAVDESS-style directory tree.

    Every ``<path_audios>/<subdir>/*.wav`` file is collected. The file name
    (without extension) is split on ``-``; the third field is the 1-based
    emotion code and the last field is the actor id.

    :param path_audios: [str or Path] Root directory that contains one
        sub-directory per actor.
    :return: Three parallel lists ``(wav_paths, emotions, actors)`` where
        ``wav_paths`` holds ``Path`` objects, ``emotions`` holds 0-based
        integer labels and ``actors`` holds integer actor ids.
    :raises ValueError: If an emotion or actor field is not an integer.
    :raises IndexError: If a file name has fewer than three ``-`` fields.
    """
    wav_paths, emotions, actors = [], [], []

    # glob() yields files in arbitrary, filesystem-dependent order; sorting
    # makes the dataset order (and everything derived from it) reproducible.
    for path in tqdm(sorted(Path(path_audios).glob("*/*.wav"))):
        # Path.stem is separator-agnostic, unlike splitting str(path) on '/'.
        fields = path.stem.split("-")
        label = int(fields[2]) - 1  # Start emotions in 0
        actor = int(fields[-1])

        # Parse first, append afterwards, so the three lists never get out
        # of sync if a malformed name raises.
        wav_paths.append(path)
        emotions.append(label)
        actors.append(actor)

    return wav_paths, emotions, actors

In [3]:
# Index every wav file under ./dataset: three parallel lists holding the file
# path, the 0-based emotion label and the actor id of each clip.
wav_paths, emotions, actors = prepare_RAVDESS_DS('dataset')

1440it [00:00, 51601.03it/s]


In [4]:
# Sampling rate in Hz; passed as `sr` to librosa when loading clips and when
# computing spectrograms.
sample_rate = 16000

# STFT window length and hop, in seconds. They are multiplied by sample_rate
# in get_features: 0.064 * 16000 = 1024 samples, 0.016 * 16000 = 256 samples.
frame_length = 0.064
frame_stride = 0.016

def scaled(li, min_v=-1, max_v=1):
    """Wrap each value of ``li`` into a range starting at ``min_v``.

    Each element is reduced modulo ``max_v - min_v + 1`` and then shifted by
    ``min_v``. Note this is a modular wrap, not a linear min-max rescale.

    :param li: Iterable of numbers.
    :param min_v: Lower bound of the target range.
    :param max_v: Upper bound used to derive the modulus.
    :return: ``np.ndarray`` with one wrapped value per input element.
    """
    span = max_v - min_v + 1
    wrapped = []
    for value in li:
        wrapped.append(value % span + min_v)
    return np.array(wrapped)

In [5]:
def get_sample(paths, threshold=5e-4):
    """Load every audio file and zero out near-silent samples.

    Each file is decoded with ``librosa.load`` at the notebook-level
    ``sample_rate``. Samples whose absolute amplitude is at most
    ``threshold`` are replaced by zero (a simple noise gate); all other
    samples are kept unchanged.

    :param paths: Iterable of audio file paths.
    :param threshold: Amplitude at or below which a sample is zeroed.
        Defaults to the value that was previously hard-coded (5e-4).
    :return: List with one entry per file; each entry is a plain Python list
        of sample values.
    """
    data = []

    for path in tqdm(paths, desc='get features, size.....'):
        signal = librosa.load(path, sr=sample_rate)[0]
        # Vectorised gate: equivalent to testing -threshold <= x <= threshold
        # on every sample, without a Python-level loop over the waveform.
        gated = np.where(np.abs(signal) <= threshold, 0.0, signal)
        # Keep the original "list of lists" return shape for callers.
        data.append(gated.tolist())

    return data

In [6]:
# Decode and noise-gate every clip into memory (one list of samples per file).
# The recorded run below took about three minutes for 1440 files.
audio = get_sample(wav_paths)

get features, size.....: 1440it [03:08,  7.64it/s]


In [7]:
import soundfile

# Write the first noise-gated clip back to disk so it can be listened to.
# Use the shared sample_rate constant rather than repeating the literal, so
# the file header always matches the rate the audio was loaded at.
soundfile.write('sample.wav',
                audio[0],
                sample_rate,
                format='WAV')

In [8]:
def get_features(audio):
    """Compute a stacked log-mel / MFCC feature tensor for every waveform.

    For each waveform two feature maps are computed with a Hamming window
    whose length and hop come from the notebook-level ``frame_length`` /
    ``frame_stride`` (seconds) and ``sample_rate``:

    * channel 0: 40-band mel spectrogram converted to dB (``ref=np.max``);
    * channel 1: 40 MFCCs derived from a separate 128-band dB mel spectrogram.

    :param audio: Iterable of 1-D waveforms (lists or arrays of samples).
    :return: ``(data, sz)`` where ``data`` is a list of float tensors of
        shape ``(2, 40, n_frames)`` and ``sz`` is the list of ``n_frames``
        values, one per waveform.
    """
    # Window and hop sizes in samples are the same for every clip, so they
    # are computed once instead of on every iteration.
    n_fft = int(round(sample_rate * frame_length))
    hop = int(round(sample_rate * frame_stride))

    def _log_mel(signal, n_mels):
        # dB-scaled mel spectrogram; ref=np.max makes the scale relative to
        # the clip's own maximum.
        power = librosa.feature.melspectrogram(y=signal, sr=sample_rate, n_mels=n_mels, n_fft=n_fft, window='hamming', hop_length=hop)
        return librosa.power_to_db(power, ref=np.max)

    data = []
    sz = []
    for wav in tqdm(audio, desc='get features, size.....'):
        signal = np.array(wav)

        melspectrogram = _log_mel(signal, 40)
        mfcc = librosa.feature.mfcc(S=_log_mel(signal, 128), n_mfcc=40)

        # Stack both (40, n_frames) maps into one (2, 40, n_frames) array and
        # convert once; torch.Tensor(...) yields the default float dtype.
        stacked = torch.Tensor(np.stack([melspectrogram, mfcc]))

        data.append(stacked)
        sz.append(stacked.shape[-1])

    return data, sz

In [9]:
# features: one (2, 40, n_frames) tensor per clip (log-mel + MFCC channels);
# sz: the n_frames value of each clip, used below to pick a common length.
features, sz = get_features(audio)

get features, size.....: 1440it [00:15, 95.43it/s] 


In [10]:
# Sanity check: shortest clip length in frames, then the shape of the first
# feature tensor and of its first channel.
print(min(sz))
print(features[0].shape)
a=features[0]
# Channel 0 only: a (bands, frames) slice of the first clip.
a[0,:,:].shape

184
torch.Size([2, 40, 207])


torch.Size([40, 207])

In [11]:
def make_df(features, emotions, actors, sz):
    """Build a DataFrame of fixed-length features with their labels.

    Every feature tensor is converted to a NumPy array and truncated to its
    first ``sz`` entries along the last axis so that all rows share the same
    length.

    :param features: Sequence of 3-D torch tensors, one per clip.
    :param emotions: Sequence of emotion labels, indexed like ``features``.
    :param actors: Sequence of actor ids, indexed like ``features``.
    :param sz: [int] Number of trailing-axis entries to keep per feature.
    :return: ``pd.DataFrame`` with columns ``feature``, ``emotion`` and
        ``actor``, one row per clip.
    """
    data = []

    for i, feature in tqdm(enumerate(features), desc='Make dataframe.....'):
        # Slicing is sufficient; the zero buffer that used to be allocated
        # here was overwritten immediately and never used.
        mfcc_feature = feature.numpy()[:, :, :sz]

        data.append({
            'feature': mfcc_feature,
            'emotion': emotions[i],
            'actor': actors[i]
        })

    return pd.DataFrame(data)

In [12]:
# Truncate every clip to the shortest clip's frame count so all features
# share one shape, then confirm there is one row per clip.
df = make_df(features, emotions, actors, min(sz))
df['feature'].shape

Make dataframe.....: 1440it [00:00, 120408.24it/s]


(1440,)

In [13]:
def get_deltas(feature):
    """Stack a feature with its first- and second-order deltas.

    A new axis is inserted at position 1, the result is converted to a torch
    tensor, and ``compute_deltas`` (window length 7) is applied once to get
    the delta and a second time, on that delta, to get the delta-delta.

    :param feature: Array-like feature batch.
    :return: Tensor holding the input, its delta and its delta-delta
        concatenated along axis 1 (in that order).
    """
    base = torch.Tensor(np.expand_dims(feature, 1))

    # stages[0] is the raw feature; each later stage is the delta of the
    # previous one.
    stages = [base]
    for _ in range(2):
        stages.append(compute_deltas(stages[-1], win_length=7))

    return torch.cat(stages, dim=1)

In [14]:
def generate_train_test_deltas(fold, df, save_path=""):
    """
    Split the data into train and test sets in a subject-wise 5-fold way and
    save them to disk. Clips of the actors assigned to ``fold`` form the test
    set; clips of all other actors form the train set.

    Four arrays are written back-to-back with ``np.save`` into the single file
    ``save_path + '.npy'``, in this order: X_train, y_train, X_test, y_test.
    Read them back with four consecutive ``np.load`` calls on one open file.
    The shapes of the four arrays are printed.

    :param fold:[int] Fold to create the train and test sets [ranging from 0 - 4]
    :param df:[DataFrame] Dataframe with 'feature', 'emotion' and 'actor' columns
    :param save_path:[str] Output path without the '.npy' extension
    :raises KeyError: If ``fold`` is not one of the defined folds
    """

    actors_per_fold = {
        0: [2,5,14,15,16],
        1: [3, 6, 7, 13, 18],
        2: [10, 11, 12, 19, 20],
        3: [8, 17, 21, 23, 24],
        4: [1, 4, 9, 22],
    }

    if fold not in actors_per_fold:
        # Same exception type as the plain dict lookup, with a usable message.
        raise KeyError(f"fold must be one of {sorted(actors_per_fold)}, got {fold!r}")

    # Compute the membership mask once and use it for both halves.
    in_test = df['actor'].isin(actors_per_fold[fold])
    test_df = df.loc[in_test].reset_index(drop=True)
    train_df = df.loc[~in_test].reset_index(drop=True)

    X_train = np.array(train_df['feature'].tolist())
    y_train = np.array(train_df['emotion'].tolist())
    X_test = np.array(test_df['feature'].tolist())
    y_test = np.array(test_df['emotion'].tolist())

    # Order matters: readers must np.load the arrays in this same order.
    with open(save_path+'.npy', 'wb') as f:
        np.save(f, X_train)
        np.save(f, y_train)
        np.save(f, X_test)
        np.save(f, y_test)

    print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

In [15]:
# Write one .npy bundle per cross-validation fold.
# The output directory is the same for every fold, so create it once up front.
save_root = 'scaling-spectrogram-mfcc dataset'
os.makedirs(save_root, exist_ok=True)

for fold in range(5):
    save_path = os.path.join(save_root, "fold"+str(fold))

    # Each call closes its output file before returning, so no pause is
    # needed between folds.
    generate_train_test_deltas(fold, df, save_path)

(1140, 2, 40, 184) (1140,) (300, 2, 40, 184) (300,)
(1140, 2, 40, 184) (1140,) (300, 2, 40, 184) (300,)
(1140, 2, 40, 184) (1140,) (300, 2, 40, 184) (300,)
(1140, 2, 40, 184) (1140,) (300, 2, 40, 184) (300,)
(1200, 2, 40, 184) (1200,) (240, 2, 40, 184) (240,)
