In [2]:
import os
import librosa
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

DATASET_PATH = "ravdess-emotional-speech-audio/Audio_Speech_Actors_01-24"

emotion_map = {
    "01": "neutral",
    "03": "happy",
    "04": "sad",
    "05": "angry"
}

X = []
y = []

for actor in os.listdir(DATASET_PATH):
    actor_path = os.path.join(DATASET_PATH, actor)

    for file in os.listdir(actor_path):
        emotion_code = file.split("-")[2]

        if emotion_code in emotion_map:
            file_path = os.path.join(actor_path, file)

            audio, sr = librosa.load(file_path, duration=3)
            mfcc = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=40)
            mfcc = np.mean(mfcc.T, axis=0)

            X.append(mfcc)
            y.append(emotion_map[emotion_code])

X = np.array(X)
y = np.array(y)

encoder = LabelEncoder()
y_encoded = encoder.fit_transform(y)

X_train, X_test, y_train, y_test = train_test_split(
    X, y_encoded, test_size=0.2, random_state=42
)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

model = SVC(kernel='rbf', probability=True)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("Emotions:", encoder.classes_)

print("Emotion Recognition from Speech (ML Model) COMPLETED")

Accuracy: 0.6518518518518519

Classification Report:
               precision    recall  f1-score   support

           0       0.85      0.83      0.84        47
           1       0.58      0.59      0.58        32
           2       0.50      0.14      0.22        21
           3       0.54      0.77      0.64        35

    accuracy                           0.65       135
   macro avg       0.62      0.58      0.57       135
weighted avg       0.65      0.65      0.63       135

Emotions: ['angry' 'happy' 'neutral' 'sad']
Emotion Recognition from Speech (ML Model) COMPLETED
