In [1]:
import os
import numpy as np
import librosa
import librosa.display
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.utils.class_weight import compute_class_weight
import tensorflow as tf
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, RNN, Layer
from tensorflow.keras.optimizers import Adam


In [2]:
# Emotion labels in IEMOCAP
emotions = {'ang': 0, 'hap': 1, 'sad': 2, 'neu': 3}


In [3]:
def load_iemocap_data(data_path):
    data = []
    labels = []
    sessions = [f'Session{i}' for i in range(1, 4)]
    for session in sessions:
        wav_folder = os.path.join(data_path, session, 'sentences', 'wav')
        # print(wav_folder)
        emo_eval_folder = os.path.join(data_path, session, 'dialog', 'EmoEvaluation')
        # print(emo_eval_folder)
        for root, _, files in os.walk(wav_folder):
            for file in files:
                if file.endswith('.wav'):
                    wav_file = os.path.join(root, file)
                    # Corresponding emotion label file
                    emo_file = os.path.join(emo_eval_folder, os.path.basename(root) + '.txt')
                    with open(emo_file, 'r') as f:
                        for line in f:
                            if file[:-4] in line:
                                parts = line.strip().split('\t')
                                if len(parts) >= 4:
                                    emotion = parts[2]
                                    if emotion in emotions:
                                        data.append(wav_file)
                                        labels.append(emotions[emotion])
                                break
    return data, labels

# Load data
data_path = './IEMOCAP'  # Update this path
data_files, data_labels = load_iemocap_data(data_path)
print(f'Total samples loaded: {len(data_files)}')


Total samples loaded: 2755


In [4]:
def extract_features(file_name):
    audio, sample_rate = librosa.load(file_name, sr=None)
    # Extract MFCCs
    mfcc = librosa.feature.mfcc(y=audio, sr=sample_rate, n_mfcc=13)
    # Compute delta and delta-delta of MFCCs
    mfcc_delta = librosa.feature.delta(mfcc)
    mfcc_delta2 = librosa.feature.delta(mfcc, order=2)
    # Concatenate features
    features = np.concatenate((mfcc, mfcc_delta, mfcc_delta2), axis=0)
    return features.T  # Shape: (time_steps, features)

In [5]:
def augment_audio(audio, sample_rate):
    augmented_audios = []
    # Pitch Shifting
    audio_pitched = librosa.effects.pitch_shift(audio, sr=sample_rate, n_steps=2)
    augmented_audios.append(audio_pitched)
    # Time Stretching
    audio_stretched = librosa.effects.time_stretch(audio, rate=0.9)
    augmented_audios.append(audio_stretched)
    # Adding Noise
    noise = np.random.normal(0, 0.005, audio.shape)
    audio_noisy = audio + noise
    augmented_audios.append(audio_noisy)
    return augmented_audios



In [6]:
def extract_features_from_audio(audio, sample_rate):
    mfcc = librosa.feature.mfcc(y=audio, sr=sample_rate, n_mfcc=13)
    mfcc_delta = librosa.feature.delta(mfcc)
    mfcc_delta2 = librosa.feature.delta(mfcc, order=2)
    features = np.concatenate((mfcc, mfcc_delta, mfcc_delta2), axis=0)
    return features.T

X = []
y = []

for i, file in enumerate(data_files):
    audio, sr = librosa.load(file, sr=None)
    # Original features
    features = extract_features_from_audio(audio, sr)
    X.append(features)
    y.append(data_labels[i])
    # Data augmentation
    augmented_audios = augment_audio(audio, sr)
    for aug_audio in augmented_audios:
        # Ensure same length
        aug_audio = librosa.util.fix_length(aug_audio, size=len(audio))
        features_aug = extract_features_from_audio(aug_audio, sr)
        X.append(features_aug)
        y.append(data_labels[i])



In [None]:
# Find the maximum sequence length
sequence_lengths = [len(features) for features in X]
max_length = max(sequence_lengths)
print(f'Maximum sequence length: {max_length}')

# Pad sequences
def pad_sequence(features, max_length):
    if len(features) < max_length:
        pad_width = max_length - len(features)
        return np.pad(features, ((0, pad_width), (0, 0)), mode='constant')
    else:
        return features[:max_length, :]

X_padded = [pad_sequence(features, max_length) for features in X]
X_padded = np.array(X_padded)
y = np.array(y)
print(f'Shape of X_padded: {X_padded.shape}')


In [37]:
# Reshape for normalization
num_samples, time_steps, num_features = X_padded.shape
X_reshaped = X_padded.reshape(-1, num_features)

# Standardize features
scaler = StandardScaler()
X_normalized = scaler.fit_transform(X_reshaped)

# Reshape back to original shape
X_normalized = X_normalized.reshape(num_samples, time_steps, num_features)


In [38]:
# Compute class weights
class_weights = compute_class_weight('balanced', classes=np.unique(y), y=y)
class_weight_dict = dict(enumerate(class_weights))
np.save('class_weights.npy', class_weights)
print(f'Class weights: {class_weight_dict}')


Class weights: {0: np.float64(1.0176790571169538), 1: np.float64(1.8865546218487395), 2: np.float64(1.0355166051660516), 3: np.float64(0.6572014051522248)}


In [39]:
# One-hot encode labels
num_classes = len(emotions)
y_encoded = to_categorical(y)

# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    X_normalized, y_encoded, test_size=0.2, random_state=42, stratify=y)

print(f'Training samples: {X_train.shape[0]}, Testing samples: {X_test.shape[0]}')


Training samples: 14368, Testing samples: 3592


In [45]:
unique_labels, counts = np.unique(y, return_counts=True)
label_counts = dict(zip(unique_labels, counts))
print(f'Number of different labels in y: {len(unique_labels)}')
print(f'Counts of each label in y: {label_counts}')

Number of different labels in y: 4
Counts of each label in y: {np.int64(0): np.int64(4412), np.int64(1): np.int64(2380), np.int64(2): np.int64(4336), np.int64(3): np.int64(6832)}


In [46]:
# Save the arrays to .npy files
np.save('X_train.npy', X_train)
np.save('X_test.npy', X_test)
np.save('y_train.npy', y_train)
np.save('y_test.npy', y_test)

In [48]:
# Load the arrays from .npy files
X_train = np.load('X_train.npy')
X_test = np.load('X_test.npy')
y_train = np.load('y_train.npy')
y_test = np.load('y_test.npy')

print(f'Shape of X_train: {X_train.shape}')
print(f'Shape of X_test: {X_test.shape}')
print(f'Shape of y_train: {y_train.shape}')
print(f'Shape of y_test: {y_test.shape}')

Shape of X_train: (8816, 911, 39)
Shape of X_test: (2204, 911, 39)
Shape of y_train: (8816,)
Shape of y_test: (2204,)


In [14]:
unique_labels, counts = np.unique(y_train, return_counts=True)
label_counts = dict(zip(unique_labels, counts))
print(f'Number of different labels in y_train: {len(unique_labels)}')
print(f'Counts of each label in y_train: {label_counts}')

Number of different labels in y_train: 2
Counts of each label in y_train: {np.float64(0.0): np.int64(34059), np.float64(1.0): np.int64(11353)}
