In [None]:
# Import required libraries
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import librosa
from sklearn.model_selection import train_test_split
from tqdm import tqdm


In [None]:
# Mount Google Drive at /content/drive so the dataset folders stored there
# can be read. This import only exists inside a Google Colab runtime.
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [None]:
# Locations of the RAVDESS speech and song recordings on the mounted Drive.
# The shared parent directory is defined once, so relocating the dataset
# only requires changing a single value.
dataset_root = "/content/drive/MyDrive/emotion_classification"
speech_folder = f"{dataset_root}/Audio_Speech_Actors_01-24"
song_folder = f"{dataset_root}/Audio_Song_Actors_01-24"

In [None]:
# Emotion labels keyed by the two-digit emotion code used in RAVDESS
# filenames. Codes run from '01' to '08' in the order of the labels below.
# Per the dataset description: speech includes calm, happy, sad, angry,
# fearful, surprise, and disgust expressions, and song contains calm, happy,
# sad, angry, and fearful emotions. Each expression is produced at two levels
# of emotional intensity (normal, strong), with an additional neutral
# expression.
emotion_map = {
    f'{code:02d}': label
    for code, label in enumerate(
        ['neutral', 'calm', 'happy', 'sad', 'angry', 'fearful', 'disgust', 'surprised'],
        start=1,
    )
}

In [None]:
# Shared audio-analysis settings for the feature-extraction cells.
duration = 3          # seconds of audio analysed per clip
hop_length = 512      # hop size in samples (same as the extractor's default)
n_mels = 128          # number of mel bands in the mel spectrogram
sample_rate = 22_050  # sampling rate (Hz) used when loading audio

In [None]:
def extract_audio_features(file_path, filename, sr=22050, n_mels=128, duration=3, hop_length=512):
    """Build a fixed-length feature vector for one RAVDESS audio clip.

    The clip is loaded at ``sr`` Hz, limited to ``duration`` seconds and
    zero-padded at the end when shorter. Every frame-level feature is then
    averaged over time so that each clip yields a 1-D vector.

    Parameters
    ----------
    file_path : str
        Path of the audio file to load.
    filename : str
        File name in dash-separated form. The fourth field is read as the
        intensity and the last field (before the extension) as the actor id.
    sr : int
        Sampling rate passed to ``librosa.load`` and the feature extractors.
    n_mels : int
        Number of mel bands in the mel spectrogram.
    duration : int or float
        Number of seconds to load; shorter clips are zero-padded to this
        length.
    hop_length : int
        Hop size used for the mel spectrogram. The other features use the
        librosa default hop size.

    Returns
    -------
    numpy.ndarray or None
        Concatenation, in this order, of: mel means (``n_mels`` values),
        MFCC means (13), delta means (13), delta-delta means (13), chroma
        means, spectral-contrast means, tonnetz means, mean zero-crossing
        rate, mean RMS energy, gender flag and intensity code.
        ``None`` is returned (after printing the error) if anything fails.
    """
    try:
        # Parse the filename metadata first so a malformed name is rejected
        # before any expensive audio processing is done.
        fields = filename.split('-')
        actor_id = int(fields[-1].split('.')[0])
        gender = 0 if actor_id % 2 == 0 else 1  # 0: female, 1: male
        intensity = int(fields[3])  # intensity code from filename field index 3

        # np.pad only accepts integer widths; sr * duration is a float when a
        # fractional duration is requested, so convert explicitly.
        target_len = int(round(sr * duration))

        y, _ = librosa.load(file_path, sr=sr, duration=duration)
        if len(y) < target_len:
            y = np.pad(y, (0, target_len - len(y)))

        # Mean log-mel spectrogram (one value per mel band)
        mel_spec = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=n_mels, hop_length=hop_length)
        log_mel_spec = librosa.power_to_db(mel_spec, ref=np.max)
        mel_mean = np.mean(log_mel_spec, axis=1)

        # MFCCs
        mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13)
        mfcc_mean = np.mean(mfccs.T, axis=0)

        # Delta & Delta-Delta MFCCs
        delta = librosa.feature.delta(mfccs)
        delta2 = librosa.feature.delta(mfccs, order=2)
        delta_mean = np.mean(delta.T, axis=0)
        delta2_mean = np.mean(delta2.T, axis=0)

        # Chroma
        chroma = librosa.feature.chroma_stft(y=y, sr=sr)
        chroma_mean = np.mean(chroma.T, axis=0)

        # Spectral Contrast
        contrast = librosa.feature.spectral_contrast(y=y, sr=sr)
        contrast_mean = np.mean(contrast.T, axis=0)

        # Tonnetz (computed on the harmonic component of the signal)
        tonnetz = librosa.feature.tonnetz(y=librosa.effects.harmonic(y), sr=sr)
        tonnetz_mean = np.mean(tonnetz.T, axis=0)

        # Zero Crossing Rate and RMS energy (scalar means)
        zcr_mean = np.mean(librosa.feature.zero_crossing_rate(y))
        rmse_mean = np.mean(librosa.feature.rms(y=y))

        # Combine all features
        feature_vector = np.hstack([
            mel_mean,
            mfcc_mean,
            delta_mean,
            delta2_mean,
            chroma_mean,
            contrast_mean,
            tonnetz_mean,
            zcr_mean,
            rmse_mean,
            gender,
            intensity
        ])

        return feature_vector
    except Exception as e:
        # Best effort: report the failing file and let the caller skip it.
        print(f"Error extracting features from {file_path}: {e}")
        return None

# Data Loading

In [None]:
def load_dataset_from_folder(folder_path, emotion_map):
    """Extract features and emotion labels for every .wav file under a folder.

    The folder is walked recursively. For each ``.wav`` file the third
    dash-separated field of the file name is looked up in ``emotion_map``;
    files whose code is unknown, whose name has too few fields, or whose
    feature extraction fails are skipped.

    Parameters
    ----------
    folder_path : str
        Root directory to search.
    emotion_map : dict
        Maps the emotion code found in the file name to a label.

    Returns
    -------
    tuple of numpy.ndarray
        ``(features, labels)`` with one entry per successfully processed file.
        Both arrays are empty when nothing was loaded.
    """
    features = []
    labels = []

    print(f"\nLoading data from: {folder_path}")
    for root, dirs, files in os.walk(folder_path):
        # os.walk yields entries in arbitrary, filesystem-dependent order.
        # Sorting (dirs in place, so the traversal itself is ordered) makes
        # the row order of the result reproducible between runs.
        dirs.sort()
        wav_files = sorted(name for name in files if name.endswith(".wav"))
        if not wav_files:
            # No audio here: avoid an empty progress bar.
            continue

        for file in tqdm(wav_files, desc=os.path.basename(root)):
            fields = file.split('-')
            if len(fields) < 3:
                # Not a dash-separated dataset file name; skip it instead of
                # aborting the whole load with an IndexError.
                continue

            emotion = emotion_map.get(fields[2])
            if not emotion:
                continue

            feat = extract_audio_features(
                os.path.join(root, file),
                file,
                sr=sample_rate,
                n_mels=n_mels,
                duration=duration,
                hop_length=hop_length,  # forward the configured hop size too
            )
            if feat is not None:
                features.append(feat)
                labels.append(emotion)

    return np.array(features), np.array(labels)


In [None]:
# Extract features and labels for the speech and the song recordings
X_speech, y_speech = load_dataset_from_folder(folder_path=speech_folder, emotion_map=emotion_map)
X_song, y_song = load_dataset_from_folder(folder_path=song_folder, emotion_map=emotion_map)


Loading data from: /content/drive/MyDrive/emotion_classification/Audio_Speech_Actors_01-24


0it [00:00, ?it/s]
100%|██████████| 60/60 [00:43<00:00,  1.39it/s]
100%|██████████| 60/60 [00:38<00:00,  1.55it/s]
100%|██████████| 60/60 [00:41<00:00,  1.44it/s]
100%|██████████| 60/60 [00:40<00:00,  1.50it/s]
100%|██████████| 60/60 [00:39<00:00,  1.54it/s]
100%|██████████| 60/60 [00:43<00:00,  1.38it/s]
100%|██████████| 60/60 [00:36<00:00,  1.63it/s]
100%|██████████| 60/60 [00:42<00:00,  1.42it/s]
100%|██████████| 60/60 [00:39<00:00,  1.53it/s]
100%|██████████| 60/60 [00:36<00:00,  1.62it/s]
100%|██████████| 60/60 [00:38<00:00,  1.57it/s]
100%|██████████| 60/60 [00:39<00:00,  1.52it/s]
100%|██████████| 60/60 [00:36<00:00,  1.66it/s]
100%|██████████| 60/60 [00:36<00:00,  1.65it/s]
100%|██████████| 60/60 [00:38<00:00,  1.58it/s]
100%|██████████| 60/60 [00:37<00:00,  1.59it/s]
100%|██████████| 60/60 [00:38<00:00,  1.55it/s]
100%|██████████| 60/60 [00:38<00:00,  1.55it/s]
100%|██████████| 60/60 [00:40<00:00,  1.49it/s]
100%|██████████| 60/60 [00:39<00:00,  1.54it/s]
100%|██████████| 60/6


Loading data from: /content/drive/MyDrive/emotion_classification/Audio_Song_Actors_01-24


0it [00:00, ?it/s]
100%|██████████| 44/44 [00:30<00:00,  1.45it/s]
100%|██████████| 44/44 [00:28<00:00,  1.53it/s]
100%|██████████| 44/44 [00:28<00:00,  1.53it/s]
100%|██████████| 44/44 [00:27<00:00,  1.59it/s]
100%|██████████| 44/44 [00:27<00:00,  1.58it/s]
100%|██████████| 44/44 [00:28<00:00,  1.56it/s]
100%|██████████| 44/44 [00:27<00:00,  1.63it/s]
100%|██████████| 44/44 [00:30<00:00,  1.44it/s]
100%|██████████| 44/44 [00:29<00:00,  1.50it/s]
100%|██████████| 44/44 [00:30<00:00,  1.42it/s]
100%|██████████| 44/44 [00:29<00:00,  1.49it/s]
100%|██████████| 44/44 [00:30<00:00,  1.44it/s]
100%|██████████| 44/44 [00:27<00:00,  1.59it/s]
100%|██████████| 44/44 [00:28<00:00,  1.52it/s]
100%|██████████| 44/44 [00:29<00:00,  1.47it/s]
100%|██████████| 44/44 [00:29<00:00,  1.49it/s]
100%|██████████| 44/44 [00:30<00:00,  1.46it/s]
100%|██████████| 44/44 [00:29<00:00,  1.47it/s]
100%|██████████| 44/44 [00:29<00:00,  1.50it/s]
100%|██████████| 44/44 [00:28<00:00,  1.55it/s]
100%|██████████| 44/4

In [None]:
# Combine both sources into one feature matrix and one label vector
# (speech rows first, followed by song rows).
X = np.vstack([X_speech, X_song])
y = np.concatenate([y_speech, y_song])

In [None]:
# Number of values each feature family contributes to a feature vector
num_mel = 128
num_mfcc = 13
num_delta = 13
num_delta2 = 13
num_chroma = 12
num_contrast = 7
num_tonnetz = 6
additional = ['zcr', 'rmse', 'gender', 'intensity']

# DataFrame column names: '<family>_<1-based index>' for each family, in the
# order listed here, followed by the scalar columns in `additional`.
_feature_groups = (
    ('mel', num_mel),
    ('mfcc', num_mfcc),
    ('delta', num_delta),
    ('delta2', num_delta2),
    ('chroma', num_chroma),
    ('contrast', num_contrast),
    ('tonnetz', num_tonnetz),
)
columns = [
    f'{prefix}_{idx}'
    for prefix, count in _feature_groups
    for idx in range(1, count + 1)
]
columns.extend(additional)

In [None]:
# One row per clip: the named feature columns plus the emotion label
df = pd.DataFrame(X, columns=columns).assign(emotion=y)

In [None]:
# (rows, columns): one row per loaded clip; the columns are the named
# feature columns plus the 'emotion' label column.
df.shape

(2452, 197)

In [None]:
# Preview the first rows of the feature table
df.head()

Unnamed: 0,mel_1,mel_2,mel_3,mel_4,mel_5,mel_6,mel_7,mel_8,mel_9,mel_10,...,tonnetz_2,tonnetz_3,tonnetz_4,tonnetz_5,tonnetz_6,zcr,rmse,gender,intensity,emotion
0,-70.655457,-68.36998,-67.705742,-67.330467,-62.240829,-55.497036,-51.542595,-47.762424,-46.360947,-49.937832,...,0.039097,-0.051205,-0.02697,-0.000133,-0.019666,0.181768,0.015609,0.0,1.0,angry
1,-72.314018,-68.822762,-65.775558,-65.801338,-65.692055,-60.80302,-58.615334,-56.168446,-54.690483,-54.336365,...,0.040009,-0.086314,0.08337,0.001419,-0.001496,0.156333,0.006987,0.0,1.0,surprised
2,-73.161072,-71.349533,-69.594612,-69.925545,-67.570358,-64.556984,-62.838047,-60.212967,-57.506607,-55.255634,...,0.013919,0.00919,0.063062,-0.005817,-0.012488,0.176476,0.016331,0.0,2.0,surprised
3,-67.941032,-66.689468,-66.536446,-66.676514,-65.628128,-63.220478,-61.127796,-55.859779,-53.232128,-50.436695,...,0.043551,0.001958,0.036539,-0.013295,-0.000431,0.193014,0.010081,0.0,1.0,fearful
4,-67.721306,-67.691719,-68.883591,-69.845856,-68.610413,-66.581444,-65.777794,-63.005062,-60.18187,-57.147827,...,0.000435,0.017535,-0.024363,0.017719,0.025629,0.292691,0.023641,0.0,2.0,happy


In [None]:
# Write the feature table to CSV; index=False leaves out the row index
output_csv = 'mel_mean_plus_audio_features_gender_intensity.csv'
df.to_csv(output_csv, index=False)