In [1]:
!kaggle datasets download -d uldisvalainis/audio-emotions

Dataset URL: https://www.kaggle.com/datasets/uldisvalainis/audio-emotions
License(s): unknown
Downloading audio-emotions.zip to /content
 99% 1.11G/1.12G [00:19<00:00, 116MB/s]
100% 1.12G/1.12G [00:19<00:00, 62.8MB/s]


In [2]:
import zipfile
zip_ref = zipfile.ZipFile('/content/audio-emotions.zip', 'r')
zip_ref.extractall('/content')
zip_ref.close()

# **1: Import Libraries**

In [3]:
import os
import librosa
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Conv2D, Flatten, Dropout, Reshape, BatchNormalization


# **2: Define Function to Load Audio Files and Extract Features**

In [4]:
# Function to load audio files and extract features
def load_audio_files(data_path):
    features = []
    labels = []
    for emotion in os.listdir(data_path):
        emotion_folder = os.path.join(data_path, emotion)
        if os.path.isdir(emotion_folder):
            for file in os.listdir(emotion_folder):
                if file.endswith('.wav'):
                    file_path = os.path.join(emotion_folder, file)
                    y, sr = librosa.load(file_path, sr=None)
                    mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13)
                    mfccs = np.mean(mfccs.T, axis=0)
                    features.append(mfccs)
                    labels.append(emotion)
    return np.array(features), np.array(labels)


# **3: Load Data**

In [5]:
# Paths and parameters
data_path = '/content/Emotions'  # Update with your dataset path

# Load data
X, y = load_audio_files(data_path)


# **4: Encode Labels**

In [6]:
# Encode labels
le = LabelEncoder()
y_encoded = le.fit_transform(y)


# **5: Split Data**

In [7]:
# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)


# **6: Reshape Data for CNN Input**

In [8]:
# Reshape data for CNN input
X_train = X_train[..., np.newaxis]
X_test = X_test[..., np.newaxis]


# **7: Build Model**

In [9]:
# Define the model
model = Sequential([
    Reshape((13, 1, 1), input_shape=(13, 1)),
    Conv2D(32, (3, 1), activation='relu'),
    BatchNormalization(),
    Dropout(0.3),
    Conv2D(64, (3, 1), activation='relu'),
    BatchNormalization(),
    Dropout(0.3),
    Flatten(),
    Dense(128, activation='relu'),
    Dropout(0.3),
    Dense(len(le.classes_), activation='softmax')
])





  super().__init__(**kwargs)


# **8: Compile Model**

In [11]:
# Compile model
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])


In [12]:
# Define callbacks
callbacks = [
    tf.keras.callbacks.EarlyStopping(patience=10, restore_best_weights=True),
    tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=5, min_lr=0.0001)
]


# **9: Train Model**

In [13]:
# Train the model
history = model.fit(X_train, y_train, epochs=100, validation_data=(X_test, y_test), callbacks=callbacks)


Epoch 1/100
[1m320/320[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 21ms/step - accuracy: 0.2341 - loss: 1.9713 - val_accuracy: 0.3598 - val_loss: 1.6035 - learning_rate: 0.0010
Epoch 2/100
[1m320/320[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.3324 - loss: 1.6266 - val_accuracy: 0.4445 - val_loss: 1.4017 - learning_rate: 0.0010
Epoch 3/100
[1m320/320[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.3893 - loss: 1.4928 - val_accuracy: 0.4602 - val_loss: 1.3630 - learning_rate: 0.0010
Epoch 4/100
[1m320/320[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.4096 - loss: 1.4401 - val_accuracy: 0.4816 - val_loss: 1.2974 - learning_rate: 0.0010
Epoch 5/100
[1m320/320[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.4417 - loss: 1.3855 - val_accuracy: 0.4922 - val_loss: 1.2632 - learning_rate: 0.0010
Epoch 6/100
[1m320/320[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m

# **10: Evaluate Model**

In [14]:
# Evaluate model
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {accuracy}")


[1m80/80[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.5801 - loss: 1.0823
Test Accuracy: 0.5667968988418579


# **11: Save Model**

In [15]:
# Save model
model.save('emotion_recognition_model.keras')


# **12: Load and Test Model**

In [16]:
import os
import numpy as np
import librosa
import tensorflow as tf

# Load the saved model
model = tf.keras.models.load_model('emotion_recognition_model.keras')

# Function to load and preprocess a new audio file
def preprocess_audio(file_path):
    y, sr = librosa.load(file_path, sr=None)
    mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13)
    mfccs = np.mean(mfccs.T, axis=0)
    return mfccs

# Load and preprocess a new audio file
file_path = '/content/Marriage_Story.mp3'  # Update with the path to your test audio file
mfccs = preprocess_audio(file_path)

# Reshape the data to fit the model input
mfccs_reshaped = mfccs[np.newaxis, ..., np.newaxis]

# Make a prediction
predicted_label = model.predict(mfccs_reshaped)
predicted_emotion = le.inverse_transform([np.argmax(predicted_label)])

# Display the result
print(f"The predicted emotion is: {predicted_emotion[0]}")


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 406ms/step
The predicted emotion is: Angry
