# Speech Emotion Recognition


### Starters Code

In [70]:
# To use google colab's GPU
%tensorflow_version 2.x
import tensorflow as tf
device_name = tf.test.gpu_device_name()
if device_name != '/device:GPU:0':
  raise SystemError('GPU device not found')
print('Found GPU at: {}'.format(device_name))

Found GPU at: /device:GPU:0


In [0]:
#!pip install soundfile
import librosa
import soundfile
import os, glob, pickle
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score

*We need to extract mfcc, chroma, and mel features from the soundfile. Let's define a function to do that. This function will take in 4 parameters (the filename and three boolean parameters for the three features)*

In [0]:
def extract_features(file_name, mfcc, chroma, mel):
    with soundfile.SoundFile(file_name) as sound_file:
        X = sound_file.read(dtype = "float32")
        sample_rate = sound_file.samplerate
        if chroma:
            stft = np.abs(librosa.stft(X))
        result = np.array([])
        if mfcc:
            mfcc = np.mean(librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc = 40).T,axis=0)
            result = np.hstack((result, mfcc))
        if chroma:
            chroma = np.mean(librosa.feature.chroma_stft(S=stft, sr=sample_rate).T, axis=0)
            result = np.hstack((result, chroma))
        if mel:
            mel = np.mean(librosa.feature.melspectrogram(X, sr=sample_rate).T, axis = 0)
            result = np.hstack((result, mel))
        return result

In [0]:
# Define a dictionary to hold emotions and numbers in the RAVDESS dataset
emotions = {
    '01':'neutral',
    '02':'calm',
    '03':'happy',
    '04':'sad',
    '05':'angry',
    '06':'fearful',
    '07':'disgust',
    '08':'surprised'
}

# Define a list to hold the emotions we want
observed_emotions = ['calm','happy','fearful','disgust']

## Load the Data

In [0]:
# Define your file path
file_path = "/content/drive/My Drive/Colab Notebooks/Personal Projects/Speech Emotion Recognition/Data/speech-emotion-recognition-ravdess-data/Actor_*/*.wav"

In [0]:
#define a load_data function using glob() function in the glob module to get all the pathnames for the files in our dataset


def load_data(test_size=0.2):
    x,y=[], []
    for file in glob.glob(file_path):
        file_name = os.path.basename(file)
        emotion = emotions[file_name.split('-')[2]]
        if emotion not in observed_emotions:
            continue
        feature = extract_features(file, mfcc=True, chroma=True, mel=True)
        x.append(feature)
        y.append(emotion)
    return train_test_split(np.array(x), y, test_size = test_size, random_state = 42)

## Train Test Split

In [0]:
X_train, X_test, y_train, y_test = load_data(test_size=0.25)

In [77]:
print((X_train.shape[0], X_test.shape[0]))

(576, 192)


In [78]:
# get the number of features extracted
print(f'Features extracted: {X_train.shape[1]}')

Features extracted: 180


## The Model Using MLP (Multi Layer Perceptron)

In [0]:
model = MLPClassifier(alpha=0.001, batch_size=16, hidden_layer_sizes=(350,),learning_rate_init=0.0001, learning_rate='adaptive', max_iter=500)

In [80]:
# Train the model
model.fit(X_train, y_train)

MLPClassifier(activation='relu', alpha=0.001, batch_size=16, beta_1=0.9,
              beta_2=0.999, early_stopping=False, epsilon=1e-08,
              hidden_layer_sizes=(350,), learning_rate='adaptive',
              learning_rate_init=0.0001, max_fun=15000, max_iter=500,
              momentum=0.9, n_iter_no_change=10, nesterovs_momentum=True,
              power_t=0.5, random_state=None, shuffle=True, solver='adam',
              tol=0.0001, validation_fraction=0.1, verbose=False,
              warm_start=False)

In [0]:
# Prediction
y_pred = model.predict(X_test)

In [82]:
#Accuracy of the model
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)

print(f"Accuracy: {round(accuracy*100,2)}%")

Accuracy: 77.08%
