In [None]:
import librosa
import numpy as np
import os
from tqdm.auto import tqdm
# matplotlib complains about the behaviour of librosa.display, so we'll ignore those warnings:
import warnings; warnings.filterwarnings('ignore')

In [None]:
def get_features(file, sample_rate=48000):
    """Extract a fixed-length acoustic feature vector from one audio file.

    The recording is loaded (and resampled) with librosa, five frame-level
    feature families are computed, and each one is averaged over time so the
    result does not depend on the clip duration.

    :param file: path (or file-like object) accepted by ``librosa.load``.
    :param sample_rate: sampling rate passed to ``librosa.load`` as ``sr``;
        defaults to 48000, the value this notebook has always used.
    :return: 1-D ``numpy`` array holding, in order, the time-averaged MFCCs
        (40), chroma, mel-spectrogram, spectral contrast and tonnetz
        features. With librosa's default settings that is
        40 + 12 + 128 + 7 + 6 = 193 values.
    """
    # `sr` and `y` are passed by keyword: librosa >= 0.10 made them
    # keyword-only, so the positional form raises TypeError there.
    X, sample_rate = librosa.load(file, sr=sample_rate)

    # Magnitude spectrogram shared by the chroma and spectral-contrast features.
    stft = np.abs(librosa.stft(X))

    # Every librosa feature is shaped (n_features, n_frames); averaging over
    # axis 1 collapses the time axis.
    mfccs = np.mean(librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=40), axis=1)
    chroma = np.mean(librosa.feature.chroma_stft(S=stft, sr=sample_rate), axis=1)
    mel = np.mean(librosa.feature.melspectrogram(y=X, sr=sample_rate), axis=1)
    contrast = np.mean(librosa.feature.spectral_contrast(S=stft, sr=sample_rate), axis=1)
    # Tonnetz is computed on the harmonic component of the signal only.
    tonnetz = np.mean(
        librosa.feature.tonnetz(y=librosa.effects.harmonic(X), sr=sample_rate), axis=1
    )

    return np.hstack([mfccs, chroma, mel, contrast, tonnetz])

In [None]:
def speech_file_to_array_fn(path):
    """Convert the recording at ``path`` into its feature vector.

    Thin adapter around ``get_features`` so the preprocessing code has a
    single, stable entry point for "audio file -> array".
    """
    features = get_features(path)
    return features

def label_to_id(label):
    """Map an emotion name to its zero-based class index.

    :param label: emotion name, e.g. ``'Happy'``.
    :return: position of ``label`` in the fixed emotion list (0 for
        ``'Neutral'`` ... 7 for ``'Surprise'``), or ``-1`` when the label is
        not one of the known emotions.
    """
    # The order defines the class ids used everywhere downstream; do not reorder.
    label_list = ['Neutral', 'Calm', 'Happy', 'Sad', 'Angry', 'Fear', 'Disgust', 'Surprise']

    # The list is never empty, so the former `len(label_list) > 0` guard and
    # its fall-through `return label` were unreachable and have been removed.
    return label_list.index(label) if label in label_list else -1

def preprocess_function(examples, input_column = "path", output_column = "emotion"):
    """
    Turn a split of the dataset into feature vectors and integer labels.

    :param examples: table-like object (this notebook passes ``datasets``
        splits) that yields the column values when indexed by column name.
    :param input_column:[str]  Column that contains the paths to the recordings
    :param output_column:[str]  Column that contains the emotion associated to each recording
    :return:[dict] ``'input_values'``: list with one feature vector per
        recording; ``'labels'``: list with the matching class ids as
        returned by ``label_to_id``.
    """
    # Feature extraction is the slow part, so it gets the progress bar.
    features = []
    for audio_path in tqdm(examples[input_column]):
        features.append(speech_file_to_array_fn(audio_path))

    label_ids = list(map(label_to_id, examples[output_column]))

    return {'input_values': features, 'labels': label_ids}

In [None]:
import numpy as np
from datasets import load_dataset

save_dir = 'via_bagustris'

# NOTE(review): `dataset_1d` is never read in the visible cells; kept in case
# another cell of the notebook relies on it.
dataset_1d = []

# The output directory does not depend on the fold, so create it once.
os.makedirs(save_dir, exist_ok=True)

for fold in range(5):
    # Each fold directory is expected to already contain train.csv / test.csv.
    save_path = os.path.join('audio_48k', "fold"+str(fold))

    data_files = {
        "train": os.path.join(save_path, "train.csv"),
        "validation": os.path.join(save_path, "test.csv"),
    }

    # Fail fast with a clear message instead of silently creating an empty
    # input directory (the previous `os.makedirs(save_path)`), which only hid
    # a missing-data problem until `load_dataset` failed.
    missing_csvs = [csv for csv in data_files.values() if not os.path.isfile(csv)]
    if missing_csvs:
        raise FileNotFoundError("Missing fold CSV file(s): " + ", ".join(missing_csvs))

    # Load data (tab-separated CSVs with the recording path and emotion columns)
    dataset = load_dataset("csv", data_files=data_files, delimiter="\t", )
    train_dataset = dataset["train"]
    eval_dataset = dataset["validation"]

    # Extract one feature vector and one class id per recording.
    train = preprocess_function(train_dataset)
    test = preprocess_function(eval_dataset)

    X_train = np.array(train["input_values"])
    y_train = np.array(train['labels'])
    X_test = np.array(test["input_values"])
    y_test = np.array(test['labels'])

    print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

    numpy_name = os.path.join(save_dir, str(fold) + '.npy')

    # The four arrays are appended to ONE file, in this order; the training
    # cell must read them back with four consecutive np.load calls on the
    # same file handle, in the same order.
    with open(numpy_name, 'wb') as f:
        np.save(f, X_train)
        np.save(f, y_train)
        np.save(f, X_test)
        np.save(f, y_test)

In [None]:
import numpy as np
from tensorflow.keras.models import Sequential  
from tensorflow.keras.layers import Dense, Activation, GRU, Flatten, LSTM, Flatten 
from tensorflow.keras.layers import Dropout, BatchNormalization, Bidirectional
from sklearn.model_selection import train_test_split  
from sklearn.metrics import confusion_matrix  
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

import random as rn
import tensorflow as tf
import os

# Seed NumPy, Python's built-in `random` module (imported as `rn`) and
# TensorFlow with the same fixed value so each run starts from the same
# random state.
np.random.seed(123)
rn.seed(123)
tf.random.set_seed(123)

In [None]:
def create_model(n_dim, n_classes, input_shape=(1, 193)):
    """Build and compile the stacked-LSTM emotion classifier.

    Architecture: batch normalisation over the feature axis, three LSTM
    layers (``n_dim``, ``2 * n_dim``, ``n_dim`` units, all returning full
    sequences), a flatten, and a softmax output layer.

    :param n_dim: number of units of the first and third LSTM layers; the
        middle layer uses twice as many.
    :param n_classes: number of output classes (size of the softmax layer).
    :param input_shape: ``(timesteps, features)`` of one sample. Defaults to
        ``(1, 193)``, the value that used to be hard-coded here, so existing
        two-argument calls behave exactly as before.
    :return: compiled ``Sequential`` model (categorical cross-entropy loss,
        Adam optimizer, accuracy metric); targets must be one-hot encoded.
    """
    model = Sequential()
    model.add(BatchNormalization(axis=-1, input_shape=input_shape))
    model.add(LSTM(n_dim, return_sequences=True, dropout=0.1, recurrent_dropout=0.2))
    model.add(LSTM(n_dim*2, dropout=0.1, recurrent_dropout=0.2, return_sequences=True))
    model.add(LSTM(n_dim, dropout=0.1, recurrent_dropout=0.2, return_sequences=True))
    # The last LSTM still returns a sequence, so flatten it before the classifier.
    model.add(Flatten())
    model.add(Dense(n_classes, activation='softmax'))

    # model compilation
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

In [None]:
def one_hot_encode(labels, n_classes=None):
    """One-hot encode zero-based integer class ids.

    The previous implementation allocated one extra column and then dropped
    column 0, which is only correct for 1-based labels. With the zero-based
    ids produced by ``label_to_id`` it turned every class-0 sample into an
    all-zero row and shifted every other class one column to the left.
    Callers that still hold 1-based labels should subtract 1 first.

    :param labels: 1-D sequence/array of integer class ids in
        ``[0, n_classes)``.
    :param n_classes: width of the encoding. Defaults to ``max(labels) + 1``;
        pass it explicitly when a split may not contain the highest class so
        that all splits get the same number of columns.
    :return: float array of shape ``(len(labels), n_classes)`` with a single
        1 per row, in the column given by that row's label.
    :raises ValueError: if a label is negative (e.g. the ``-1`` "unknown"
        marker) or not smaller than ``n_classes``.
    """
    label_ids = np.asarray(labels, dtype=int).ravel()
    n_labels = label_ids.size

    # A negative id would silently index from the end and mark the wrong class.
    if n_labels and label_ids.min() < 0:
        raise ValueError("labels must be non-negative class ids; got %d" % label_ids.min())

    if n_classes is None:
        n_classes = int(label_ids.max()) + 1 if n_labels else 0
    elif n_labels and label_ids.max() >= n_classes:
        raise ValueError(
            "label %d is out of range for n_classes=%d" % (label_ids.max(), n_classes)
        )

    encoded = np.zeros((n_labels, n_classes))
    encoded[np.arange(n_labels), label_ids] = 1
    return encoded

In [None]:
npy_path = 'via_bagustris'
# os.listdir returns entries in arbitrary order and may include non-data
# entries (e.g. hidden notebook-checkpoint folders); keep only the fold files
# and process them in a stable order.
all_npy = sorted(name for name in os.listdir(npy_path) if name.endswith('.npy'))

# The model is compiled with metrics=['accuracy'], so the validation metric is
# logged as 'val_accuracy' (the same key read from `hist.history` below).
# Monitoring the old name 'val_acc' made early stopping a no-op.
earlystop = EarlyStopping(monitor='val_accuracy', mode='max', patience=100, restore_best_weights=True)
# NOTE(review): this checkpoint callback is created but never passed to fit();
# left unchanged so no weights file is written unless that is wanted.
checkpointer = ModelCheckpoint(filepath='/tmp/weights.hdf5', verbose=1, save_best_only=True)

for fold in all_npy:
    npy = os.path.join(npy_path, fold)

    # Each fold file holds four arrays saved back-to-back; they must be read
    # in the order they were written.
    with open(npy, 'rb') as f:
        X_train = np.load(f)
        y_train = np.load(f)
        X_test = np.load(f)
        y_test = np.load(f)

    # Add a length-1 time axis: (samples, features) -> (samples, 1, features).
    X_train = np.expand_dims(X_train, axis=1)
    X_test = np.expand_dims(X_test, axis=1)
    y_train = one_hot_encode(y_train)
    y_test = one_hot_encode(y_test)

    print(X_train.shape)
    print(y_train.shape)
    # A fresh model per fold: units = feature width, outputs = number of classes.
    model = create_model(X_train.shape[2], y_train.shape[1])
    # summary() prints by itself and returns None, so it is not wrapped in print().
    model.summary()

    # validation_data is documented as a tuple (x_val, y_val), not a list.
    # NOTE(review): the fold's test split doubles as the early-stopping
    # validation set, so the reported val accuracy is optimistic.
    hist = model.fit(x=X_train, y=y_train, epochs=500, batch_size=64,
                     validation_data=(X_test, y_test), callbacks=[earlystop])

    # Best training / validation accuracy reached in any epoch of this fold.
    print(max(hist.history['accuracy']), max(hist.history['val_accuracy']))