In [1]:
import librosa
import numpy as np
import os
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder


In [13]:
def extract_features(file_path):
    audio, sample_rate = librosa.load(file_path, res_type='kaiser_fast')
    mfccs = librosa.feature.mfcc(y=audio, sr=sample_rate, n_mfcc=40)
    return np.mean(mfccs.T, axis=0)


In [14]:
data = []
labels = []
dataset_path = 'Dataset'  # Adjust the path as per your directory structure

# Loop through the dataset to extract features
for emotion in os.listdir(dataset_path):
    emotion_path = os.path.join(dataset_path, emotion)
    for file in os.listdir(emotion_path):
        file_path = os.path.join(emotion_path, file)
        mfcc = extract_features(file_path)  # Ensure this function is defined
        data.append(mfcc)
        labels.append(emotion)

# Convert data and labels to NumPy arrays
X = np.array(data)
y = np.array(labels)

# Encode string labels to integers
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)


In [15]:
# Reshape the input data for LSTM
X_train = X_train.reshape(X_train.shape[0], 40, 1)  # Adjust timesteps/features as needed
X_test = X_test.reshape(X_test.shape[0], 40, 1)

In [16]:
model = Sequential()
model.add(LSTM(128, return_sequences=True, input_shape=(X_train.shape[1], 1)))
model.add(Dropout(0.3))
model.add(LSTM(64))
model.add(Dropout(0.3))
model.add(Dense(32, activation='relu'))
model.add(Dense(len(np.unique(y_encoded)), activation='softmax'))

model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model.summary()


  super().__init__(**kwargs)


In [17]:
history = model.fit(X_train, y_train, epochs=30, batch_size=32, validation_data=(X_test, y_test))


Epoch 1/30
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 154ms/step - accuracy: 0.6039 - loss: 0.9508 - val_accuracy: 0.9750 - val_loss: 0.3557
Epoch 2/30
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 79ms/step - accuracy: 0.9593 - loss: 0.2846 - val_accuracy: 0.9750 - val_loss: 0.0594
Epoch 3/30
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 73ms/step - accuracy: 0.9879 - loss: 0.0675 - val_accuracy: 0.9917 - val_loss: 0.0212
Epoch 4/30
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 70ms/step - accuracy: 0.9963 - loss: 0.0169 - val_accuracy: 1.0000 - val_loss: 0.0033
Epoch 5/30
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 73ms/step - accuracy: 1.0000 - loss: 0.0055 - val_accuracy: 1.0000 - val_loss: 0.0017
Epoch 6/30
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 66ms/step - accuracy: 1.0000 - loss: 0.0029 - val_accuracy: 1.0000 - val_loss: 0.0012
Epoch 7/30
[1m15/15[0m [32m━━━

In [18]:
test_loss, test_accuracy = model.evaluate(X_test, y_test)
print(f'Test Accuracy: {test_accuracy:.2f}')


[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step - accuracy: 1.0000 - loss: 7.7235e-05
Test Accuracy: 1.00


In [None]:
def predict_emotion(model, file_path):
    feature = extract_features(file_path)
    feature = np.expand_dims(feature, axis=0)  # Reshape for prediction
    feature = feature[..., np.newaxis]  # Add extra dimension for LSTM
    prediction = model.predict(feature)
    predicted_label = np.argmax(prediction, axis=1)
    return label_encoder.inverse_transform(predicted_label)

# Example prediction
audio_file = 'Benchmark/Speech_Emotion_Recognition/Dataset/angry/OAF_back_angry.wav'
predicted_emotion = predict_emotion(model, audio_file)
print(f'Predicted Emotion: {predicted_emotion[0]}')
