# <center>Real-time emotion detection</center>

Authors : DOUET Marie, GRINDEL Brice, MARTIN Lucas, SOUVANNAVONG Elise

## Global imports

In [56]:
import numpy as np
import os
import soundfile
import librosa
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from tqdm.notebook import tqdm_notebook
import pandas as pd
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
import tensorflow as tf
from tensorflow.keras import layers, models
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical
from sklearn import svm
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

## Global functions

In [57]:
def extract_feature(file_name, mfcc=True, chroma=True, mel=True):
    with soundfile.SoundFile(file_name) as sound_file:
        X = sound_file.read(dtype="float32")

        sample_rate = sound_file.samplerate

        if chroma:
            stft = np.abs(librosa.stft(X))
        result=np.array([])

        # MFCC Criterias (short-term power spectrum of a sound)
        if mfcc:
            mfccs = np.mean(librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=40).T, axis=0)
            result = np.hstack((result, mfccs))

        # Pertains to the 12 diffrent pitch classes
        if chroma:
            chroma = np.mean(librosa.feature.chroma_stft(S=stft, sr=sample_rate).T,axis=0)
            result = np.hstack((result, chroma))
        
        # MEL Spectrogram Frequency
        if mel:
            mel = np.mean(librosa.feature.melspectrogram(y=X, sr=sample_rate).T,axis=0)
            result = np.hstack((result, mel))
    return result

## Benchmark of datasets

### RAVDESS

In [58]:
emotions = {
    '01': "neutral", 
    '02': "calm", 
    '03': "happy", 
    '04': "sad", 
    '05': "angry", 
    '06': "fearful", 
    '07': "disgust", 
    '08': "surprised"
}

# Removed 'calm' emotion
observed_emotions = ['neutral', 'happy', 'sad', 'angry', 'fearful', 'disgust', 'surprised']

In [59]:
def load_data(folder_path, emotions, excluded_files=[], test_size=0.2):
    x,y = [],[]
    files = os.listdir(folder_path)
    
    # iterations with the progress bar
    for i in tqdm_notebook(range(len(files)-len(excluded_files))):
        file = files[i]
        
        if file not in excluded_files:
            file_name = os.path.basename(file)
            emotion = emotions[file_name.split("-")[2]]

            if emotion not in observed_emotions:
                continue

            feature = extract_feature(folder_path + "/" + file)

            x.append(feature)
            y.append(emotion)
    return np.array(x), np.array(y)

## Preprocessing

In [60]:
ravdess_folder = "../assets/data_samples/RAVDESS"
excluded_files = ["README.md","03-01-06-01-01-02-20.wav", "03-01-08-01-02-02-01.wav", "03-01-03-01-02-01-20.wav"]

x,y = load_data(ravdess_folder, emotions=emotions, excluded_files=excluded_files)
x_train, x_test, y_train, y_test = train_test_split(x, y, shuffle=True, test_size=0.25, random_state=42)

  0%|          | 0/1437 [00:00<?, ?it/s]

## Benchmark of models

### 1 - Support Vector Machine (SVM)

In [None]:
# GRIDSEARCH CELL - Only execute if best hyperparametersy are not already known

param_grid = {
    'C': [0.1, 1, 10, 100, 1000],  
    'gamma': [1, 0.1, 0.01, 0.001, 0.0001], 
    'kernel': ['rbf','linear']
}

grid_svm = GridSearchCV(svm.SVC(), param_grid, refit = True, verbose=3)
grid_svm.fit(x_train, y_train) 

print(grid_svm.best_params_)

In [61]:
# Select best hyper-parameters for SVM model
#C_opti, gamma_opti, kernel_opti = grid_svm.best_params_['C'], grid_svm.best_params_['gamma'], grid_svm.best_params_['kernel']
C_opti, gamma_opti, kernel_opti = 100, 0.0001, "rbf"

svc = svm.SVC(C=C_opti, gamma=gamma_opti, kernel=kernel_opti)
svc.fit(x_train, y_train)

y_pred = svc.predict(x_test)

accuracy = round(accuracy_score(y_true=y_test,y_pred=y_pred)*100,2)
print("Accuracy : ", accuracy,"%")

Accuracy :  61.09 %


### 2 - Multi-layer Perceptron Classifer (MLPC)

In [None]:
# GRIDSEARCH CELL - Only execute if best hyperparametersy are not already known

param_grid = {
    'hidden_layer_sizes': [(50,50,50), (50,100,50), (100,)],
    'activation': ['tanh', 'relu'],
    'solver': ['sgd', 'adam'],
    'alpha': [0.0001, 0.05],
    'learning_rate': ['constant','adaptive'],
}

grid_mlpc = GridSearchCV(MLPClassifier(max_iter=100), param_grid, n_jobs=-1, cv=3, verbose=0)
grid_mlpc.fit(x_train, y_train)

print(grid_mlpc.best_params_)

In [62]:
# Select best hyper-parameters for MLPC model
# activation_opti, alpha_opti, hidden_layer_sizes_opti, learnin_rate_opti, solver_opti = grid_mlpc.best_params_['activation'], grid_mlpc.best_params_['alpha'], grid_mlpc.best_params_['hidden_layer_sizes'], grid_mlpc.best_params_['learning_rate'], grid_mlpc.best_params_['solver']
activation_opti, alpha_opti, hidden_layer_sizes_opti, learnin_rate_opti, solver_opti = "tanh", 0.05, (100,), 'adaptive', 'adam'

mlpc = MLPClassifier(activation=activation_opti, solver=solver_opti, alpha=alpha_opti, batch_size=256, epsilon=1e-08, hidden_layer_sizes=hidden_layer_sizes_opti, learning_rate=learnin_rate_opti, max_iter=500)
mlpc.fit(x_train,y_train)

y_pred = mlpc.predict(x_test)

accuracy = round(accuracy_score(y_true=y_test,y_pred=y_pred)*100,2)
print("Accuracy : ", accuracy,"%")

Accuracy :  59.49 %




### 3 - Random Forest

In [None]:
# GRIDSEARCH CELL - Only execute if best hyperparametersy are not already known

param_grid = {
    'max_depth':[3,5,10,None],
    'n_estimators':[10,100,200],
    'max_features':[1,3,5,7],
    'min_samples_leaf':[1,2,3],
    'min_samples_split':[1,2,3]
}

grid_rforest = GridSearchCV(RandomForestClassifier(), param_grid, n_jobs=-1, cv=3, verbose=0)
grid_rforest.fit(x_train, y_train)

print(grid_rforest.best_params_)

In [63]:
# Select best hyper-parameters for Random Forest model
# max_depth_opti, max_features_opti, min_samples_leaf_opti, min_samples_split_opti, n_estimators_opti = grid_rforest.best_params_['max_depth'], grid_rforest.best_params_['max_features'], grid_rforest.best_params_['min_samples_leaf'], grid_rforest.best_params_['min_samples_split'], grid_rforest.best_params_['n_estimators']
max_depth_opti, max_features_opti, min_samples_leaf_opti, min_samples_split_opti, n_estimators_opti = None, 7, 1, 2, 200


rforest = RandomForestClassifier(max_depth=max_depth_opti, max_features=max_features_opti, min_samples_leaf=min_samples_leaf_opti, min_samples_split=min_samples_split_opti, n_estimators=n_estimators_opti)
rforest.fit(x_train, y_train)

y_pred = rforest.predict(x_test)

accuracy = round(accuracy_score(y_true=y_test,y_pred=y_pred)*100,2)
print("Accuracy : ", accuracy,"%")

Accuracy :  52.73 %


### 4 - Convolutional Neural Network (CNN)

In [64]:
# Étiquetage des émotions
label_encoder = LabelEncoder()
y_cnn = label_encoder.fit_transform(y)
y_cnn = to_categorical(y_cnn, num_classes=len(label_encoder.classes_))

x_train, x_test, y_train, y_test = train_test_split(x, y_cnn, test_size=0.2, random_state=42)

# Création du modèle CNN
cnn = models.Sequential()
cnn.add(layers.Reshape((180, 1), input_shape=(180,)))  # Reshape pour ajouter la dimension du canal
cnn.add(layers.Conv1D(32, 3, activation='relu'))
cnn.add(layers.MaxPooling1D(2))
cnn.add(layers.Conv1D(64, 3, activation='relu'))
cnn.add(layers.MaxPooling1D(2))
cnn.add(layers.Conv1D(128, 3, activation='relu'))
cnn.add(layers.MaxPooling1D(2))
cnn.add(layers.Flatten())
cnn.add(layers.Dense(128, activation='relu'))
cnn.add(layers.Dense(len(label_encoder.classes_), activation='softmax'))

# Compilation du modèle
cnn.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Entraînement du modèle
cnn.fit(x_train, y_train, epochs=50, batch_size=32, validation_data=(x_test, y_test))

# Évaluation du modèle
test_loss, test_acc = cnn.evaluate(x_test, y_test)
print(f'Test Accuracy: {test_acc}')


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
Test Accuracy: 0.7108433842658997


### 5 - Linear Prediction Cepstral Coefficient (LPCC) + Support Vector Model (SVM)

In [65]:
def extract_feature_with_LPCC(file_name, mfcc=True, chroma=True, mel=True, lpcc=True):
    # Charger le fichier audio
    with soundfile.SoundFile(file_name) as sound_file:
        X = sound_file.read(dtype="float32")
        sample_rate = sound_file.samplerate

    # Initialiser le vecteur de résultats
    result = np.array([])

    # Calculer le spectrogramme
    stft = np.abs(librosa.stft(X))

    # Caractéristiques MFCC
    if mfcc:
        mfccs = np.mean(librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=40).T, axis=0)
        result = np.hstack((result, mfccs))

    # Caractéristiques Chroma
    if chroma:
        chroma = np.mean(librosa.feature.chroma_stft(S=stft, sr=sample_rate).T, axis=0)
        result = np.hstack((result, chroma))

    # Caractéristiques MEL
    if mel:
        mel = np.mean(librosa.feature.melspectrogram(y=X, sr=sample_rate).T, axis=0)
        result = np.hstack((result, mel))

    # Caractéristiques LPCC
    if lpcc:
        # Utiliser un ordre arbitraire (10 dans cet exemple, ajustez selon vos besoins)
        lpccs = librosa.lpc(X, order=10)
        result = np.hstack((result, lpccs))

    return result

In [66]:
def load_data_with_LPCC(folder_path, emotions, excluded_files=[], test_size=0.2):
    x,y = [],[]
    files = os.listdir(folder_path)
    
    # iterations with the progress bar
    for i in tqdm_notebook(range(len(files)-len(excluded_files))):
        file = files[i]
        
        if file not in excluded_files:
            file_name = os.path.basename(file)
            emotion = emotions[file_name.split("-")[2]]

            if emotion not in observed_emotions:
                continue

            feature = extract_feature_with_LPCC(folder_path + "/" + file)

            x.append(feature)
            y.append(emotion)
    return np.array(x), np.array(y)

#### LPCC

In [67]:
x_bis,y_bis = load_data_with_LPCC(ravdess_folder, emotions=emotions, excluded_files=excluded_files)

  0%|          | 0/1437 [00:00<?, ?it/s]

In [68]:
x_train_bis, x_test_bis, y_train_bis, y_test_bis = train_test_split(x_bis, y_bis, shuffle=True, test_size=0.25, random_state=42)

#### SVM

In [None]:
# GRIDSEARCH CELL - Only execute if best hyperparametersy are not already known

param_grid = {
    'C': [0.1, 1, 10, 100, 1000],  
    'gamma': [1, 0.1, 0.01, 0.001, 0.0001], 
    'kernel': ['rbf','linear']
}

grid_svm_bis = GridSearchCV(svm.SVC(), param_grid, refit = True, verbose=3)
grid_svm_bis.fit(x_train_bis, y_train_bis) 

print(grid_svm_bis.best_params_)

In [69]:
# Select best hyper-parameters for SVM model
# C_opti, gamma_opti, kernel_opti = grid_svm_bis.best_params_['C'], grid_svm_bis.best_params_['gamma'], grid_svm_bis.best_params_['kernel']
C_opti, gamma_opti, kernel_opti = 100, 0.0001, "rbf"

svc_lpcc = svm.SVC(C=C_opti, gamma=gamma_opti, kernel=kernel_opti)
svc_lpcc.fit(x_train_bis, y_train_bis)

y_pred_bis = svc_lpcc.predict(x_test_bis)

accuracy = round(accuracy_score(y_true=y_test_bis,y_pred=y_pred_bis)*100,2)
print("Accuracy : ", accuracy,"%")

Accuracy :  61.41 %


## Results of the benchmark

### Cross validation

In [70]:
from tensorflow.keras.datasets import cifar10
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten, Conv2D, MaxPooling2D
from tensorflow.keras.losses import sparse_categorical_crossentropy
from tensorflow.keras.optimizers import Adam
from sklearn.model_selection import KFold
import numpy as np

cnn_scores = []

label_encoder = LabelEncoder()
y_cnn = label_encoder.fit_transform(y)
y_cnn = to_categorical(y_cnn, num_classes=len(label_encoder.classes_))

x_train, x_test, y_train, y_test = train_test_split(x, y_cnn, test_size=0.2, random_state=42)

# Merge inputs and targets
inputs = np.concatenate((x_train, x_test), axis=0)
targets = np.concatenate((y_train, y_test), axis=0)

# Define the K-fold Cross Validator
kfold = KFold(n_splits=5, shuffle=True)

# K-fold Cross Validation model evaluation
fold_no = 1
for train, test in kfold.split(inputs, targets):

  # Define the model architecture
  cnn = models.Sequential()
  cnn.add(layers.Reshape((180, 1), input_shape=(180,)))  # Reshape pour ajouter la dimension du canal
  cnn.add(layers.Conv1D(32, 3, activation='relu'))
  cnn.add(layers.MaxPooling1D(2))
  cnn.add(layers.Conv1D(64, 3, activation='relu'))
  cnn.add(layers.MaxPooling1D(2))
  cnn.add(layers.Conv1D(128, 3, activation='relu'))
  cnn.add(layers.MaxPooling1D(2))
  cnn.add(layers.Flatten())
  cnn.add(layers.Dense(128, activation='relu'))
  cnn.add(layers.Dense(len(label_encoder.classes_), activation='softmax'))

  # Compile the model
  cnn.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])


  # Generate a print
  print('------------------------------------------------------------------------')
  print(f'Training for fold {fold_no} ...')

  # Fit data to model
  history = cnn.fit(inputs[train], targets[train],
              batch_size=32,
              epochs=50)

  # Generate generalization metrics
  scores = cnn.evaluate(inputs[test], targets[test], verbose=0)
  print(f'Score for fold {fold_no}: {cnn.metrics_names[0]} of {scores[0]}; {cnn.metrics_names[1]} of {scores[1]*100}%')
  cnn_scores.append(scores[1])

  # Increase fold number
  fold_no = fold_no + 1

------------------------------------------------------------------------
Training for fold 1 ...
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
Score for fold 1: loss of 2.0291247367858887; accuracy of 67.06827282905579%
------------------------------------------------------------------------
Training for fold 2 ...
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epo

In [71]:
from sklearn.model_selection import cross_val_score

svm_scores = cross_val_score(svc, x, y, cv=5)
mlpc_scores = cross_val_score(mlpc, x, y, cv=5)
rforest_scores = cross_val_score(rforest, x, y, cv=5)
svm_lpcc_scores = cross_val_score(svc_lpcc, x_bis, y_bis, cv=5)

names = ["SVM", "MLPC", "Random Forest", "CNN", "SVM + LPCC"]
means = [np.mean(svm_scores), np.mean(mlpc_scores), np.mean(rforest_scores), np.mean(cnn_scores), np.mean(svm_lpcc_scores)]
stds = [np.std(svm_scores), np.std(mlpc_scores), np.std(rforest_scores), np.std(cnn_scores), np.std(svm_lpcc_scores)]
mins = [min(svm_scores), min(mlpc_scores), min(rforest_scores), min(cnn_scores), min(svm_lpcc_scores)]
maxs = [max(svm_scores), max(mlpc_scores), max(rforest_scores), max(cnn_scores), max(svm_lpcc_scores)]

data = {
    'name': names,
    'min': mins,
    'mean': means,
    'max': maxs,
    'var': stds
}

df = pd.DataFrame(data=data)
df



Unnamed: 0,name,min,mean,max,var
0,SVM,0.582329,0.617564,0.645161,0.023393
1,MLPC,0.634538,0.651373,0.662651,0.009523
2,Random Forest,0.497992,0.560445,0.604839,0.036276
3,CNN,0.649194,0.66506,0.689516,0.014175
4,SVM + LPCC,0.598394,0.629641,0.657258,0.021532
