In [1]:
import numpy as np
import librosa
import tensorflow as tf
from tensorflow.keras import layers, models
import pandas as pd

import os
import sys

# librosa is a Python library for analyzing audio and music. It can be used to extract the data from the audio files we will see it later.

import librosa.display
import seaborn as sns
import matplotlib.pyplot as plt

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
#1. load audio inside
from google.colab import drive
import shutil

# Set the file paths
drive_path = '/content/drive/MyDrive/ravdess/audio_speech_actors_01-24/uploaded'
colab_path = '/content/audio_speech_actors_01-24/uploaded'

# Copy the audio file from Google Drive to Colab
shutil.copytree(drive_path, colab_path)

# Verify the file has been copied successfully
import os
print("Audio file copied successfully:", os.path.exists(colab_path))

Audio file copied successfully: True


In [10]:
#2. extract the emotion part only
import os

emotion_list = ['Neutral', 'Calm', 'Happy', 'Sad', 'Angry', 'Fearful', 'Disgust', 'Surprised']
actor_emotions = {emotion: [] for emotion in emotion_list}

dataset_path = '/content/drive/MyDrive/ravdess/audio_speech_actors_01-24/uploaded'  # Replace with the actual path to your RAVDESS dataset directory

# Iterate through the files in the dataset directory
for filename in os.listdir(dataset_path):
    if filename.endswith('.wav'):
        # Define emotion_code with a default value of -1
        emotion_code = -1

        # Extract the emotion code from the filename
        try:
            emotion_code = int(filename.split('-')[2])  #this is to split the emotion- 2 is where the emotion located
        except ValueError:
            print(f"Skipping file '{filename}' due to invalid emotion code")

        # Map the emotion code to the corresponding emotion label
        if 1 <= emotion_code <= 8:
            emotion_label = emotion_list[emotion_code - 1]
            actor_emotions[emotion_label].append(filename)
        else:
            print(f"Skipping file '{filename}' due to invalid emotion code")

# Print the list of actors' emotions based on emotion
for emotion, filenames in actor_emotions.items():
    print(f"Emotion: {emotion}")
    for filename in filenames:
        print(f"- {filename}")
    print()


Emotion: Neutral
- 03-01-01-01-01-02-07.wav
- 03-01-01-01-02-02-07.wav
- 03-01-01-01-01-02-08.wav
- 03-01-01-01-02-02-08.wav
- 03-01-01-01-01-01-08.wav
- 03-01-01-01-02-01-08.wav
- 03-01-01-01-01-02-09.wav
- 03-01-01-01-02-01-09.wav
- 03-01-01-01-01-01-09.wav
- 03-01-01-01-02-02-09.wav
- 03-01-01-01-01-02-10.wav
- 03-01-01-01-01-01-10.wav
- 03-01-01-01-02-02-10.wav
- 03-01-01-01-02-01-10.wav
- 03-01-01-01-01-01-11.wav
- 03-01-01-01-02-01-11.wav
- 03-01-01-01-02-02-11.wav
- 03-01-01-01-01-02-11.wav
- 03-01-01-01-01-02-12.wav
- 03-01-01-01-02-01-12.wav
- 03-01-01-01-02-02-12.wav
- 03-01-01-01-01-01-12.wav
- 03-01-01-01-01-02-13.wav
- 03-01-01-01-01-01-13.wav
- 03-01-01-01-02-01-13.wav
- 03-01-01-01-02-02-13.wav
- 03-01-01-01-01-02-14.wav
- 03-01-01-01-01-01-14.wav
- 03-01-01-01-02-01-14.wav
- 03-01-01-01-02-02-14.wav
- 03-01-01-01-01-02-15.wav
- 03-01-01-01-02-01-15.wav
- 03-01-01-01-01-01-15.wav
- 03-01-01-01-02-02-15.wav
- 03-01-01-01-01-01-16.wav
- 03-01-01-01-01-02-16.wav
- 03-01-01-

In [15]:
#3. extraction feature audio using MFCC
import os
import librosa
import numpy as np

# Define the path to the dataset directory
dataset_path = '/content/drive/MyDrive/ravdess/audio_speech_actors_01-24/uploaded'  # Replace with the actual path to your RAVDESS dataset directory

# Define the list of emotion labels
emotion_labels = ['Neutral', 'Calm', 'Happy', 'Sad', 'Angry', 'Fearful', 'Disgust', 'Surprised']

# Define the list to store the extracted features and corresponding labels
features = []
labels = []

# Iterate through the files in the dataset directory
for filename in os.listdir(dataset_path):
    if filename.endswith('.wav'):
        # Load the audio file
        file_path = os.path.join(dataset_path, filename)
        audio, sr = librosa.load(file_path, sr=None)

        # Extract the MFCC features
        mfcc = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=13)

        # Pad or truncate the MFCC features to a fixed length (e.g., 100 frames)
        max_frames = 100
        if mfcc.shape[1] < max_frames:
            mfcc = np.pad(mfcc, ((0, 0), (0, max_frames - mfcc.shape[1])), mode='constant')
        else:
            mfcc = mfcc[:, :max_frames]

        # Store the features and labels
        try:
            emotion = int(filename.split('-')[2])
            labels.append(emotion_labels[emotion - 1])
            features.append(mfcc)
        except IndexError:
            print(f"Skipping file: {filename}. Invalid filename format.")

# Convert the features and labels to NumPy arrays
features = np.array(features)
labels = np.array(labels)

# Print the shape of the features and labels arrays
print("Features shape:", features.shape)
print("Labels shape:", labels.shape)

#the output = (1340, 13, 100) - represents the shape of the features array in 3D which 1340 is first D which total num of audio dataset, 13 is 2nd D MFCC coefficients extracted for each frame of the audio. and 100  represents the number of frames or time steps for each audio sample that fixed to 100

Features shape: (1340, 13, 100)
Labels shape: (1340,)


In [19]:
#splititng dataset and training using numpy
import os
import librosa
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from keras.models import Sequential
from keras.layers import Conv2D, MaxPooling2D, Flatten, Dense

# Define the path to the dataset directory
dataset_path = '/content/drive/MyDrive/ravdess/audio_speech_actors_01-24/uploaded'  # Replace with the actual path to your RAVDESS dataset directory

# Define the list of emotion labels
emotion_labels = ['Neutral', 'Calm', 'Happy', 'Sad', 'Angry', 'Fearful', 'Disgust', 'Surprised']

# Define the list to store the extracted features and corresponding labels
features = []
labels = []

# Iterate through the files in the dataset directory
for filename in os.listdir(dataset_path):
    if filename.endswith('.wav'):
        # Load the audio file
        file_path = os.path.join(dataset_path, filename)
        audio, sr = librosa.load(file_path, sr=None)

        # Extract the MFCC features
        mfcc = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=13)

        # Pad or truncate the MFCC features to a fixed length (e.g., 100 frames)
        max_frames = 100
        if mfcc.shape[1] < max_frames:
            mfcc = np.pad(mfcc, ((0, 0), (0, max_frames - mfcc.shape[1])), mode='constant')
        else:
            mfcc = mfcc[:, :max_frames]

        # Store the features and labels
        try:
            emotion = int(filename.split('-')[2])
            labels.append(emotion_labels[emotion - 1])
            features.append(mfcc)
        except IndexError:
            print(f"Skipping file: {filename}. Invalid filename format.")

# Convert the features and labels to NumPy arrays
features = np.array(features)
labels = np.array(labels)

# Perform label encoding on the emotion labels
label_encoder = LabelEncoder()
labels_encoded = label_encoder.fit_transform(labels)

# Split the dataset into training and testing sets
train_features, test_features, train_labels, test_labels = train_test_split(features, labels_encoded, test_size=0.2, random_state=42)

# Reshape the features to match the expected input shape of the CNN
train_features = train_features.reshape((*train_features.shape, 1))
test_features = test_features.reshape((*test_features.shape, 1))

# Build the CNN model
model = Sequential()
model.add(Conv2D(32, kernel_size=(3, 3), activation='relu', input_shape=train_features.shape[1:]))
model.add(MaxPooling2D(pool_size=(2, 2)))
model.add(Flatten())
model.add(Dense(128, activation='relu'))
model.add(Dense(len(emotion_labels), activation='softmax'))

# Compile the model
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
model.fit(train_features, train_labels, batch_size=32, epochs=10, verbose=1)

# Evaluate the model on the test set
loss, accuracy = model.evaluate(test_features, test_labels, verbose=1)
print("Test Loss:", loss)
print("Test Accuracy:", accuracy)
# 0.1828 is considered quite low

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test Loss: 2.0584983825683594
Test Accuracy: 0.18283581733703613


In [20]:
#splitting the test audio and use tenserflow as training model
import os
import librosa
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import tensorflow as tf

# Define the path to the dataset directory
dataset_path = '/content/drive/MyDrive/ravdess/audio_speech_actors_01-24/uploaded'  # Replace with the actual path to your RAVDESS dataset directory

# Define the list of emotion labels
emotion_labels = ['Neutral', 'Calm', 'Happy', 'Sad', 'Angry', 'Fearful', 'Disgust', 'Surprised']

# Define the list to store the extracted features and corresponding labels
features = []
labels = []

# Iterate through the files in the dataset directory
for filename in os.listdir(dataset_path):
    if filename.endswith('.wav'):
        # Load the audio file
        file_path = os.path.join(dataset_path, filename)
        audio, sr = librosa.load(file_path, sr=None)

        # Extract the MFCC features
        mfcc = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=13)

        # Pad or truncate the MFCC features to a fixed length (e.g., 100 frames)
        max_frames = 100
        if mfcc.shape[1] < max_frames:
            mfcc = np.pad(mfcc, ((0, 0), (0, max_frames - mfcc.shape[1])), mode='constant')
        else:
            mfcc = mfcc[:, :max_frames]

        # Store the features and labels
        try:
            emotion = int(filename.split('-')[2])
            labels.append(emotion_labels[emotion - 1])
            features.append(mfcc)
        except IndexError:
            print(f"Skipping file: {filename}. Invalid filename format.")

# Convert the features and labels to NumPy arrays
features = np.array(features)
labels = np.array(labels)

# Perform label encoding on the emotion labels
label_encoder = LabelEncoder()
labels_encoded = label_encoder.fit_transform(labels)

# Split the dataset into training and testing sets
train_features, test_features, train_labels, test_labels = train_test_split(features, labels_encoded, test_size=0.2, random_state=42)

# Reshape the features to match the expected input shape of the CNN
train_features = train_features.reshape((*train_features.shape, 1))
test_features = test_features.reshape((*test_features.shape, 1))

# Build the CNN model using TensorFlow
model = tf.keras.models.Sequential([
    tf.keras.layers.Conv2D(32, kernel_size=(3, 3), activation='relu', input_shape=train_features.shape[1:]),
    tf.keras.layers.MaxPooling2D(pool_size=(2, 2)),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dense(len(emotion_labels), activation='softmax')
])

# Compile the model
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
model.fit(train_features, train_labels, batch_size=32, epochs=10, verbose=1)

# Evaluate the model on the test set
loss, accuracy = model.evaluate(test_features, test_labels, verbose=1)
print("Test Loss:", loss)
print("Test Accuracy:", accuracy)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test Loss: 2.7817859649658203
Test Accuracy: 0.2947761118412018


In [21]:
#test splitting and training using keras
import os
import librosa
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from keras.models import Sequential
from keras.layers import Conv2D, MaxPooling2D, Flatten, Dense

# Define the path to the dataset directory
dataset_path = '/content/drive/MyDrive/ravdess/audio_speech_actors_01-24/uploaded'  # Replace with the actual path to your RAVDESS dataset directory

# Define the list of emotion labels
emotion_labels = ['Neutral', 'Calm', 'Happy', 'Sad', 'Angry', 'Fearful', 'Disgust', 'Surprised']

# Define the list to store the extracted features and corresponding labels
features = []
labels = []

# Iterate through the files in the dataset directory
for filename in os.listdir(dataset_path):
    if filename.endswith('.wav'):
        # Load the audio file
        file_path = os.path.join(dataset_path, filename)
        audio, sr = librosa.load(file_path, sr=None)

        # Extract the MFCC features
        mfcc = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=13)

        # Pad or truncate the MFCC features to a fixed length (e.g., 100 frames)
        max_frames = 100
        if mfcc.shape[1] < max_frames:
            mfcc = np.pad(mfcc, ((0, 0), (0, max_frames - mfcc.shape[1])), mode='constant')
        else:
            mfcc = mfcc[:, :max_frames]

        # Store the features and labels
        try:
            emotion = int(filename.split('-')[2])
            labels.append(emotion_labels[emotion - 1])
            features.append(mfcc)
        except IndexError:
            print(f"Skipping file: {filename}. Invalid filename format.")

# Convert the features and labels to NumPy arrays
features = np.array(features)
labels = np.array(labels)

# Perform label encoding on the emotion labels
label_encoder = LabelEncoder()
labels_encoded = label_encoder.fit_transform(labels)

# Split the dataset into training and testing sets
train_features, test_features, train_labels, test_labels = train_test_split(features, labels_encoded, test_size=0.2, random_state=42)

# Reshape the features to match the expected input shape of the CNN
train_features = train_features.reshape((*train_features.shape, 1))
test_features = test_features.reshape((*test_features.shape, 1))

# Build the CNN model using Keras
model = Sequential()
model.add(Conv2D(32, kernel_size=(3, 3), activation='relu', input_shape=train_features.shape[1:]))
model.add(MaxPooling2D(pool_size=(2, 2)))
model.add(Flatten())
model.add(Dense(128, activation='relu'))
model.add(Dense(len(emotion_labels), activation='softmax'))

# Compile the model
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
model.fit(train_features, train_labels, batch_size=32, epochs=10, verbose=1)

# Evaluate the model on the test set
loss, accuracy = model.evaluate(test_features, test_labels, verbose=1)
print("Test Loss:", loss)
print("Test Accuracy:", accuracy)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test Loss: 2.585080623626709
Test Accuracy: 0.31716418266296387
