In [8]:
import os
import librosa
import numpy as np
import pandas as pd
import glob

emotion_map = {
    '01': 'neutral',
    '02': 'calm',
    '03': 'happy',
    '04': 'sad',
    '05': 'angry',
    '06': 'fear',
    '07': 'disgust',
    '08': 'surprise'
}

def extract_mfcc(path):
    audio, sr = librosa.load(path, duration=3, offset=0.5)
    mfcc = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=40)
    return np.mean(mfcc.T, axis=0)

rows = []

folder = "audio/" 

files = glob.glob(f"{folder}/**/*.wav", recursive=True)
print("Total wav files found:", len(files))

for file_path in files:
    file = os.path.basename(file_path)

    parts = file.replace("_", "-").split('-')

    emotion_code = parts[2]
    intensity = parts[3]
    statement = parts[4]
    repetition = parts[5]
    actor_id = parts[6].split('.')[0]

    gender = "female" if int(actor_id) % 2 == 0 else "male"

    mfcc_features = extract_mfcc(file_path)

    row = {
        "emotion": emotion_map[emotion_code],
        "gender": gender,
        "actor_id": int(actor_id),
        "intensity": intensity,
        "statement": statement,
        "repetition": repetition
    }

    for i, val in enumerate(mfcc_features):
        row[f"mfcc_{i+1}"] = val

    rows.append(row)

df = pd.DataFrame(rows)
print(df.head())


Total wav files found: 2880
   emotion gender  actor_id intensity statement repetition      mfcc_1  \
0  neutral   male         1        01        01         01 -670.195435   
1  neutral   male         1        01        01         02 -660.230347   
2  neutral   male         1        01        02         01 -661.964478   
3  neutral   male         1        01        02         02 -657.722351   
4     calm   male         1        01        01         01 -694.579590   

      mfcc_2    mfcc_3     mfcc_4  ...   mfcc_31   mfcc_32   mfcc_33  \
0  65.063850  0.888954  14.715979  ... -2.351097 -2.504727 -3.151507   
1  63.325817 -2.630457  17.983355  ... -1.786414 -3.113372 -2.556752   
2  66.655869 -0.932158  14.899042  ... -2.264493 -2.643650 -2.937167   
3  65.035187  3.148672  15.666511  ... -2.918577 -2.849612 -3.591487   
4  72.531715  3.104562  17.112118  ... -2.507130 -1.405873 -2.290345   

    mfcc_34   mfcc_35   mfcc_36   mfcc_37   mfcc_38   mfcc_39   mfcc_40  
0 -2.190899 -3.80176

In [9]:
X = df.filter(like="mfcc").values
y = df["emotion"].values


In [14]:
#  our custom number mapping

emotion_to_num = {
    "neutral": 0,
    "calm": 1,
    "happy": 2,
    "sad": 3,
    "angry": 4,
    "fear": 5,
    "disgust": 6,
    "surprise": 7
}
# add numeric column
df["emotion_num"] = df["emotion"].map(emotion_to_num)
# Split dataset
X = df.filter(like="mfcc").values
y = df["emotion_num"].values

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


In [15]:
X,y

(array([[-6.7019543e+02,  6.5063850e+01,  8.8895434e-01, ...,
         -2.1449544e+00, -4.1521730e+00, -1.7796154e+00],
        [-6.6023035e+02,  6.3325817e+01, -2.6304569e+00, ...,
         -3.6171253e+00, -3.7516940e+00, -2.7309139e+00],
        [-6.6196448e+02,  6.6655869e+01, -9.3215787e-01, ...,
         -3.2813096e+00, -4.2086139e+00, -2.7262053e+00],
        ...,
        [-4.7747406e+02,  3.8989948e+01, -2.4028767e+01, ...,
         -5.1988631e-01, -1.2788836e+00, -4.8216973e-02],
        [-4.4481970e+02,  2.9430105e+01, -7.3198285e+00, ...,
          1.0747341e+00, -1.7664628e+00,  1.7296300e+00],
        [-4.8524686e+02,  3.4517685e+01, -2.6824563e+00, ...,
         -3.5408673e-01, -2.0401323e+00,  4.1914690e-01]], dtype=float32),
 array([0, 0, 0, ..., 7, 7, 7], dtype=int64))

In [19]:
# Train model(Random Forest)
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

model = RandomForestClassifier(n_estimators=200, random_state=42)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))



Accuracy: 0.9270833333333334
              precision    recall  f1-score   support

           0       0.94      0.76      0.84        42
           1       0.85      1.00      0.92        69
           2       0.95      0.95      0.95        82
           3       0.96      0.90      0.93        61
           4       0.92      0.90      0.91        79
           5       0.97      0.95      0.96        80
           6       0.91      0.93      0.92        84
           7       0.93      0.95      0.94        79

    accuracy                           0.93       576
   macro avg       0.93      0.92      0.92       576
weighted avg       0.93      0.93      0.93       576



In [20]:
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.metrics import accuracy_score, classification_report

# Create pipeline: Scale â†’ SVM
svm_model = make_pipeline(
    StandardScaler(),
    SVC(kernel='rbf', C=10, gamma='scale')
)

# Train
svm_model.fit(X_train, y_train)

# Predict
y_pred_svm = svm_model.predict(X_test)

# Metrics
print("SVM Accuracy:", accuracy_score(y_test, y_pred_svm))
print(classification_report(y_test, y_pred_svm))


SVM Accuracy: 0.9618055555555556
              precision    recall  f1-score   support

           0       0.91      0.93      0.92        42
           1       0.93      1.00      0.97        69
           2       0.95      0.99      0.97        82
           3       0.98      1.00      0.99        61
           4       0.97      0.92      0.95        79
           5       0.97      0.97      0.97        80
           6       0.97      0.93      0.95        84
           7       0.97      0.95      0.96        79

    accuracy                           0.96       576
   macro avg       0.96      0.96      0.96       576
weighted avg       0.96      0.96      0.96       576



### Since the accuracy of SVM is better than RandomForest we use SVM

In [21]:
# Save the model (so you can use it later)
import joblib
joblib.dump(svm_model, "speech_emotion_model.pkl")


['speech_emotion_model.pkl']

In [22]:
def predict_emotion(audio_path):
    audio, sr = librosa.load(audio_path, duration=3, offset=0.5)
    mfcc = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=40)
    features = np.mean(mfcc.T, axis=0).reshape(1, -1)

    pred = svm_model.predict(features)[0]

    for emo, num in emotion_to_num.items():
        if num == pred:
            return emo


In [24]:
predict_emotion('harvard1.wav')

'disgust'