In [2]:
pip install kagglehub librosa sounddevice scikit-learn joblib numpy tqdm


Collecting kagglehub
  Downloading kagglehub-0.3.12-py3-none-any.whl.metadata (38 kB)
Collecting librosa
  Downloading librosa-0.11.0-py3-none-any.whl.metadata (8.7 kB)
Collecting sounddevice
  Downloading sounddevice-0.5.2-py3-none-win_amd64.whl.metadata (1.6 kB)
Collecting pyyaml (from kagglehub)
  Downloading PyYAML-6.0.2-cp312-cp312-win_amd64.whl.metadata (2.1 kB)
Collecting audioread>=2.1.9 (from librosa)
  Downloading audioread-3.0.1-py3-none-any.whl.metadata (8.4 kB)
Collecting numba>=0.51.0 (from librosa)
  Downloading numba-0.61.2-cp312-cp312-win_amd64.whl.metadata (2.9 kB)
Collecting soundfile>=0.12.1 (from librosa)
  Downloading soundfile-0.13.1-py2.py3-none-win_amd64.whl.metadata (16 kB)
Collecting pooch>=1.1 (from librosa)
  Downloading pooch-1.8.2-py3-none-any.whl.metadata (10 kB)
Collecting soxr>=0.3.2 (from librosa)
  Downloading soxr-0.5.0.post1-cp312-abi3-win_amd64.whl.metadata (5.6 kB)
Collecting lazy_loader>=0.1 (from librosa)
  Downloading lazy_loader-0.4-py3-none-

In [4]:
import os
import librosa
import numpy as np
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import classification_report
import joblib
import kagglehub

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
# Step 1: Download Dataset
print("[INFO] Downloading dataset from Kaggle...")
path = kagglehub.dataset_download("uwrfkaggler/ravdess-emotional-speech-audio")
print("[INFO] Dataset downloaded to:", path)


[INFO] Downloading dataset from Kaggle...
[INFO] Dataset downloaded to: C:\Users\MSI\.cache\kagglehub\datasets\uwrfkaggler\ravdess-emotional-speech-audio\versions\1


In [6]:
# Step 2: Emotion labels
emotion_map = {
    "01": "neutral",
    "02": "calm",
    "03": "happy",
    "04": "sad",
    "05": "angry",
    "06": "fear",
    "07": "disgust",
    "08": "surprise"
}

In [7]:
def extract_features(file_path):
    try:
        audio, sr = librosa.load(file_path, res_type='kaiser_fast')
        mfccs = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=40)
        mfccs_scaled = np.mean(mfccs.T, axis=0)
        return mfccs_scaled
    except Exception as e:
        print(f"[WARN] Could not process {file_path}: {e}")
        return None

features = []
labels = []

In [8]:

pip install librosa==0.10.1 resampy==0.4.2 numpy scipy soundfile audioread


Note: you may need to restart the kernel to use updated packages.


In [9]:
# Step 3: Traverse audio files and extract features

import os
import librosa
from tqdm import tqdm
import numpy as np
print("[INFO] Extracting features...")
for root, dirs, files in os.walk(path):
    for file in tqdm(files):
        if file.endswith(".wav"):
            emotion_code = file.split("-")[2]
            emotion = emotion_map.get(emotion_code)
            if emotion:
                file_path = os.path.join(root, file)
                mfccs = extract_features(file_path)
                if mfccs is not None:
                    features.append(mfccs)
                    labels.append(emotion)

[INFO] Extracting features...


0it [00:00, ?it/s]
  from pkg_resources import resource_filename
100%|██████████| 60/60 [00:20<00:00,  2.99it/s]
100%|██████████| 60/60 [00:02<00:00, 22.86it/s]
100%|██████████| 60/60 [00:02<00:00, 23.22it/s]
100%|██████████| 60/60 [00:02<00:00, 24.33it/s]
100%|██████████| 60/60 [00:02<00:00, 23.36it/s]
100%|██████████| 60/60 [00:02<00:00, 22.34it/s]
100%|██████████| 60/60 [00:02<00:00, 20.98it/s]
100%|██████████| 60/60 [00:02<00:00, 22.35it/s]
100%|██████████| 60/60 [00:02<00:00, 24.31it/s]
100%|██████████| 60/60 [00:02<00:00, 23.16it/s]
100%|██████████| 60/60 [00:02<00:00, 25.70it/s]
100%|██████████| 60/60 [00:02<00:00, 22.76it/s]
100%|██████████| 60/60 [00:02<00:00, 26.09it/s]
100%|██████████| 60/60 [00:02<00:00, 24.05it/s]
100%|██████████| 60/60 [00:02<00:00, 24.52it/s]
100%|██████████| 60/60 [00:02<00:00, 23.50it/s]
100%|██████████| 60/60 [00:02<00:00, 23.92it/s]
100%|██████████| 60/60 [00:02<00:00, 23.06it/s]
100%|██████████| 60/60 [00:02<00:00, 22.52it/s]
100%|██████████| 60/60 

In [10]:
# Step 4: Train/test split
print("[INFO] Training model...")
X = np.array(features)
y = np.array(labels)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


[INFO] Training model...


In [11]:
# Step 5: Train classifier (SVM or Random Forest)
model = SVC(kernel='linear', probability=True)
model.fit(X_train, y_train)


0,1,2
,C,1.0
,kernel,'linear'
,degree,3
,gamma,'scale'
,coef0,0.0
,shrinking,True
,probability,True
,tol,0.001
,cache_size,200
,class_weight,


In [12]:
# Step 6: Evaluate
print("[INFO] Model evaluation:")
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))


[INFO] Model evaluation:
              precision    recall  f1-score   support

       angry       0.77      0.71      0.74        79
        calm       0.52      0.78      0.63        69
     disgust       0.64      0.55      0.59        84
        fear       0.58      0.80      0.67        80
       happy       0.56      0.52      0.54        82
     neutral       0.64      0.33      0.44        42
         sad       0.44      0.44      0.44        61
    surprise       0.64      0.47      0.54        79

    accuracy                           0.59       576
   macro avg       0.60      0.58      0.57       576
weighted avg       0.60      0.59      0.59       576



In [13]:
joblib.dump(model, "voice_emotion_model.pkl")
print("[INFO] Model saved as voice_emotion_model.pkl")

[INFO] Model saved as voice_emotion_model.pkl


In [24]:
import sounddevice as sd
import librosa
import numpy as np
import scipy.io.wavfile as wav
import joblib

def record_audio(filename="live_audio.wav", duration=3, fs=44100):
    print("🎙️ Recording...")
    audio = sd.rec(int(duration * fs), samplerate=fs, channels=1)
    sd.wait()
    wav.write(filename, fs, audio)
    print("✅ Recording saved.")

def extract_features(file):
    y, sr = librosa.load(file)
    mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=40)
    return np.mean(mfccs.T, axis=0)

def predict_emotion(file):
    model = joblib.load("voice_emotion_model.pkl")
    features = extract_features(file).reshape(1, -1)
    prediction = model.predict(features)
    proba = model.predict_proba(features).max()
    return prediction[0], round(proba * 100, 2)

# Run detection
record_audio("live_audio.wav", duration=3)
emotion, confidence = predict_emotion("live_audio.wav")
print(f"🎭 Detected Emotion: {emotion} ({confidence}%)")


🎙️ Recording...
✅ Recording saved.
🎭 Detected Emotion: disgust (83.08%)
