In [27]:
import os
import glob
import numpy as np
import librosa
from collections import Counter

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, f1_score
from sklearn.preprocessing import LabelEncoder, StandardScaler


In [29]:
emotions = {
    '01': 'neutral',
    '02': 'calm',
    '03': 'happy',
    '04': 'sad',
    '05': 'angry',
    '06': 'fearful',
    '07': 'disgust',
    '08': 'surprised'
}
observed_emotions = ['neutral', 'calm', 'happy', 'sad', 'angry', 'fearful', 'disgust', 'surprised']


In [31]:
def extract_feature(file_name, mfcc=True, chroma=True, mel=True):
    X, sample_rate = librosa.load(file_name, res_type='scipy')
    result = np.array([])
    if chroma:
        stft = np.abs(librosa.stft(X))
    if mfcc:
        mfccs = np.mean(librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=40).T, axis=0)
        result = np.hstack((result, mfccs))
    if chroma:
        chroma_feat = np.mean(librosa.feature.chroma_stft(S=stft, sr=sample_rate).T, axis=0)
        result = np.hstack((result, chroma_feat))
    if mel:
        mel_feat = np.mean(librosa.feature.melspectrogram(y=X, sr=sample_rate).T, axis=0)
        result = np.hstack((result, mel_feat))
    return result


In [33]:
def add_noise(data, noise_factor=0.005):
    noise = np.random.randn(len(data))
    return data + noise_factor * noise

def shift(data, shift_max=0.2, shift_direction='both'):
    shift = np.random.randint(int(len(data) * shift_max))
    if shift_direction == 'right':
        shift = -shift
    elif shift_direction == 'both':
        direction = np.random.randint(0, 2)
        if direction == 1:
            shift = -shift
    return np.roll(data, shift)

def stretch(data, rate=0.8):
    return librosa.effects.time_stretch(data, rate)


In [35]:
def load_data(test_size=0.2, augment=False):
    x, y = [], []
    speech_files = glob.glob('/Users/dushyantyadav/Downloads/Audio_Speech_Actors_01-24/Actor*/**/*.wav', recursive=True)
    song_files = glob.glob('/Users/dushyantyadav/Downloads/Audio_Song_Actors_01-24/Actor*/**/*.wav', recursive=True)
    all_files = speech_files + song_files
    for file in all_files:
        file_name = os.path.basename(file)
        emotion = emotions[file_name.split("-")[2]]
        if emotion not in observed_emotions:
            continue
        # Original
        feature = extract_feature(file, mfcc=True, chroma=True, mel=True)
        x.append(feature)
        y.append(emotion)
        # Augmentation (optional)
        if augment:
            y_audio, sr = librosa.load(file, res_type='scipy')
            # Add noise
            feature_noise = extract_feature(file, mfcc=True, chroma=True, mel=True)
            x.append(feature_noise)
            y.append(emotion)
            # Shifted
            y_shift = shift(y_audio)
            temp_file = "temp_shift.wav"
            librosa.output.write_wav(temp_file, y_shift, sr)
            feature_shift = extract_feature(temp_file, mfcc=True, chroma=True, mel=True)
            x.append(feature_shift)
            y.append(emotion)
            os.remove(temp_file)
    return train_test_split(np.array(x), y, test_size=test_size, random_state=9)


In [37]:
x_train, x_test, y_train, y_test = load_data(test_size=0.2, augment=False)  # Set augment=True for augmentation

# Label encoding
le = LabelEncoder()
y_train_enc = le.fit_transform(y_train)
y_test_enc = le.transform(y_test)


In [39]:
scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(x_train)
x_test_scaled = scaler.transform(x_test)


In [41]:
print('Class distribution in training data:', Counter(y_train))


Class distribution in training data: Counter({'happy': 312, 'angry': 307, 'sad': 301, 'calm': 298, 'fearful': 296, 'surprised': 154, 'disgust': 149, 'neutral': 144})


In [45]:
param_grid = {
    'hidden_layer_sizes': [(100,), (200,), (300,)],
    'alpha': [0.0001, 0.001, 0.01],
    'learning_rate': ['constant', 'adaptive'],
    'max_iter': [500, 1000]
}
mlp = MLPClassifier()
clf = GridSearchCV(mlp, param_grid, cv=3, scoring='accuracy', verbose=2, n_jobs=-1)
clf.fit(x_train_scaled, y_train_enc)
print("Best parameters found:", clf.best_params_)


Fitting 3 folds for each of 36 candidates, totalling 108 fits
Best parameters found: {'alpha': 0.01, 'hidden_layer_sizes': (300,), 'learning_rate': 'constant', 'max_iter': 500}


In [47]:
best_model = clf.best_estimator_
best_model.fit(x_train_scaled, y_train_enc)
y_pred = best_model.predict(x_test_scaled)

print("Accuracy:", accuracy_score(y_test_enc, y_pred))
print("F1 Score (Macro):", f1_score(y_test_enc, y_pred, average='macro'))
print("Classification Report:\n", classification_report(y_test_enc, y_pred, target_names=le.classes_))
print("Confusion Matrix:\n", confusion_matrix(y_test_enc, y_pred))

Accuracy: 0.7433808553971487
F1 Score (Macro): 0.7405718936638566
Classification Report:
               precision    recall  f1-score   support

       angry       0.71      0.83      0.77        69
        calm       0.82      0.82      0.82        78
     disgust       0.74      0.65      0.69        43
     fearful       0.71      0.69      0.70        80
       happy       0.75      0.75      0.75        64
     neutral       0.82      0.73      0.77        44
         sad       0.69      0.72      0.71        75
   surprised       0.73      0.71      0.72        38

    accuracy                           0.74       491
   macro avg       0.75      0.74      0.74       491
weighted avg       0.74      0.74      0.74       491

Confusion Matrix:
 [[57  0  3  3  3  0  1  2]
 [ 0 64  0  1  2  4  7  0]
 [ 6  1 28  2  1  1  2  2]
 [ 9  0  0 55  6  0 10  0]
 [ 4  4  1  3 48  0  1  3]
 [ 0  4  1  1  1 32  3  2]
 [ 2  4  0  9  3  2 54  1]
 [ 2  1  5  3  0  0  0 27]]


In [63]:
import os
import glob
import numpy as np
import librosa
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, MaxPooling1D, LSTM, Dense, Dropout, Flatten, BatchNormalization
from tensorflow.keras.callbacks import EarlyStopping


In [65]:
def extract_mfcc_sequence(file_path, n_mfcc=40, max_len=200):
    y, sr = librosa.load(file_path, res_type='scipy')
    mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=n_mfcc)
    # Pad or truncate to fixed length for batching
    if mfcc.shape[1] < max_len:
        pad_width = max_len - mfcc.shape[1]
        mfcc = np.pad(mfcc, pad_width=((0,0),(0,pad_width)), mode='constant')
    else:
        mfcc = mfcc[:, :max_len]
    return mfcc.T  # Shape: (max_len, n_mfcc)


In [67]:
emotions = {
    '01': 'neutral', '02': 'calm', '03': 'happy', '04': 'sad',
    '05': 'angry', '06': 'fearful', '07': 'disgust', '08': 'surprised'
}
observed_emotions = list(emotions.values())

def load_data_dl(test_size=0.2, max_len=200):
    x, y = [], []
    # Update these paths for your dataset
    speech_files = glob.glob('/Users/dushyantyadav/Downloads/Audio_Speech_Actors_01-24/Actor*/**/*.wav', recursive=True)
    song_files = glob.glob('/Users/dushyantyadav/Downloads/Audio_Song_Actors_01-24/Actor*/**/*.wav', recursive=True)
    all_files = speech_files + song_files
    for file in all_files:
        file_name = os.path.basename(file)
        emotion_code = file_name.split("-")[2]
        emotion = emotions.get(emotion_code)
        if emotion not in observed_emotions:
            continue
        try:
            mfcc_seq = extract_mfcc_sequence(file, max_len=max_len)
            x.append(mfcc_seq)
            y.append(emotion)
        except Exception as e:
            print(f"Error processing {file}: {e}")
    x = np.array(x)
    y = np.array(y)
    le = LabelEncoder()
    y_enc = le.fit_transform(y)
    y_cat = to_categorical(y_enc)
    x_train, x_test, y_train, y_test = train_test_split(x, y_cat, test_size=test_size, random_state=42, stratify=y)
    return x_train, x_test, y_train, y_test, le


In [69]:
speech_files = glob.glob('/Users/dushyantyadav/Downloads/Audio_Speech_Actors_01-24/Actor*/**/*.wav', recursive=True)
song_files = glob.glob('/Users/dushyantyadav/Downloads/Audio_Song_Actors_01-24/Actor*/**/*.wav', recursive=True)
all_files = speech_files + song_files
print("Speech files found:", len(speech_files))
print("Song files found:", len(song_files))
print("Total files found:", len(all_files))


Speech files found: 1440
Song files found: 1012
Total files found: 2452


In [71]:
def build_cnn_lstm(input_shape, num_classes):
    model = Sequential()
    model.add(Conv1D(64, kernel_size=5, activation='relu', input_shape=input_shape))
    model.add(BatchNormalization())
    model.add(MaxPooling1D(pool_size=2))
    model.add(Dropout(0.3))
    model.add(Conv1D(128, kernel_size=5, activation='relu'))
    model.add(BatchNormalization())
    model.add(MaxPooling1D(pool_size=2))
    model.add(Dropout(0.3))
    model.add(LSTM(128, return_sequences=False))
    model.add(Dense(64, activation='relu'))
    model.add(Dropout(0.3))
    model.add(Dense(num_classes, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model


In [73]:
# Load data
x_train, x_test, y_train, y_test, le = load_data_dl(test_size=0.2, max_len=200)
input_shape = x_train.shape[1:]  # (max_len, n_mfcc)
num_classes = y_train.shape[1]

# Build model
model = build_cnn_lstm(input_shape, num_classes)

# Early stopping for better generalization
early_stop = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

# Train
history = model.fit(
    x_train, y_train,
    validation_data=(x_test, y_test),
    epochs=60,
    batch_size=32,
    callbacks=[early_stop]
)

# Evaluate
loss, acc = model.evaluate(x_test, y_test)
print("Test accuracy:", acc)
from sklearn.metrics import classification_report, confusion_matrix
y_pred = np.argmax(model.predict(x_test), axis=1)
y_true = np.argmax(y_test, axis=1)
print(classification_report(y_true, y_pred, target_names=le.classes_))
print("Confusion Matrix:\n", confusion_matrix(y_true, y_pred))


Epoch 1/60
Epoch 2/60
Epoch 3/60
Epoch 4/60
Epoch 5/60
Epoch 6/60
Epoch 7/60
Epoch 8/60
Epoch 9/60
Epoch 10/60
Epoch 11/60
Epoch 12/60
Epoch 13/60
Epoch 14/60
Epoch 15/60
Epoch 16/60
Epoch 17/60
Epoch 18/60
Epoch 19/60
Epoch 20/60
Epoch 21/60
Epoch 22/60
Epoch 23/60
Epoch 24/60
Epoch 25/60
Epoch 26/60
Epoch 27/60
Epoch 28/60
Epoch 29/60
Epoch 30/60
Epoch 31/60
Epoch 32/60
Epoch 33/60
Epoch 34/60
Epoch 35/60
Epoch 36/60
Epoch 37/60
Epoch 38/60
Epoch 39/60
Epoch 40/60
Epoch 41/60
Epoch 42/60
Test accuracy: 0.7800407409667969
              precision    recall  f1-score   support

       angry       0.93      0.83      0.87        75
        calm       0.90      0.85      0.88        75
     disgust       0.79      0.69      0.74        39
     fearful       0.76      0.73      0.75        75
       happy       0.79      0.84      0.81        75
     neutral       0.76      0.76      0.76        38
         sad       0.61      0.68      0.65        75
   surprised       0.70      0.82     

In [75]:

test_file = '/Users/dushyantyadav/Downloads/Crema/1001_IEO_DIS_LO.wav'
mfcc_seq = extract_mfcc_sequence(test_file, max_len=200)  # Use your pipeline's function
mfcc_seq = np.expand_dims(mfcc_seq, axis=0)  # Reshape for batch (1, max_len, n_mfcc)

In [77]:
pred = model.predict(mfcc_seq)  # For Keras/TensorFlow models
predicted_class = np.argmax(pred)
predicted_emotion = le.classes_[predicted_class]
print("Predicted emotion:", predicted_emotion)

Predicted emotion: surprised


In [111]:

test_file = '/Users/dushyantyadav/Downloads/Crema/1083_IEO_ANG_MD.wav'
mfcc_seq = extract_mfcc_sequence(test_file, max_len=200)  # Use your pipeline's function
mfcc_seq = np.expand_dims(mfcc_seq, axis=0)  # Reshape for batch (1, max_len, n_mfcc)


In [113]:
pred = model.predict(mfcc_seq)  # For Keras/TensorFlow models
predicted_class = np.argmax(pred)
predicted_emotion = le.classes_[predicted_class]
print("Predicted emotion:", predicted_emotion)

Predicted emotion: surprised
