In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


**With Duplicate**

In [None]:
import os
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import (
    Input, Dense, LSTM, Dropout, Concatenate, BatchNormalization, Lambda
)
from tensorflow.keras.applications import EfficientNetB0
from tensorflow.keras.applications.efficientnet import preprocess_input
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.utils import resample
from tensorflow.keras.utils import to_categorical

# ----------- DATA LOADING & DUPLICATION -----------
data_path = '/content/drive/MyDrive/dataset/preprocessed-data/mahnob_HCI_preprocessed_all.npy'
data = np.load(data_path, allow_pickle=True)

def duplicate_dataset(data, num_duplicates):
    duplicated_data = data.tolist()
    for _ in range(num_duplicates):
        duplicated_data.extend(data.tolist())
    return np.array(duplicated_data)

num_duplicates = 1
data = duplicate_dataset(data, num_duplicates)

# ----------- FEATURES AND LABELS -----------
X_eeg   = np.array([entry['eeg_data']  for entry in data])
X_video = np.array([entry['video_data'] for entry in data]) # (N, 32, 64, 64)
X_ecg   = np.array([entry['ecg_data']  for entry in data])
y       = np.array([entry['labels']    for entry in data])
print(f"Number of samples after duplication: {len(data)}")

# Pad video data if needed
if X_video.shape[1:] != (32, 64, 64):
    padded_video = []
    for video in X_video:
        pad_shape = [
            (0, max(0, 32 - video.shape[0])),
            (0, max(0, 64 - video.shape[1])),
            (0, max(0, 64 - video.shape[2]))
        ]
        padded_video.append(np.pad(video, pad_shape, mode='constant'))
    X_video = np.array(padded_video)

# ----------- CLASS BALANCING -----------
def balance_classes(X1, X2, X3, y, class_idx):
    labels = y[:, class_idx]
    unique_classes, class_counts = np.unique(labels, return_counts=True)
    max_count = np.max(class_counts)

    balanced_X1, balanced_X2, balanced_X3, balanced_y = [], [], [], []
    for cls in unique_classes:
        cls_mask = labels == cls
        cls_X1, cls_X2, cls_X3, cls_y = X1[cls_mask], X2[cls_mask], X3[cls_mask], y[cls_mask]

        if len(cls_y) < max_count:
            X1_oversampled, X2_oversampled, X3_oversampled, y_oversampled = resample(
                cls_X1, cls_X2, cls_X3, cls_y, replace=True,
                n_samples=max_count - len(cls_y), random_state=42)
            cls_X1 = np.concatenate([cls_X1, X1_oversampled])
            cls_X2 = np.concatenate([cls_X2, X2_oversampled])
            cls_X3 = np.concatenate([cls_X3, X3_oversampled])
            cls_y  = np.concatenate([cls_y,  y_oversampled])
        balanced_X1.append(cls_X1)
        balanced_X2.append(cls_X2)
        balanced_X3.append(cls_X3)
        balanced_y.append(cls_y)
    return (np.concatenate(balanced_X1),
            np.concatenate(balanced_X2),
            np.concatenate(balanced_X3),
            np.concatenate(balanced_y))

X_eeg, X_video, X_ecg, y = balance_classes(X_eeg, X_video, X_ecg, y, class_idx=1)
X_eeg, X_video, X_ecg, y = balance_classes(X_eeg, X_video, X_ecg, y, class_idx=2)

# ----------- SPLITTING -----------
X_eeg_train, X_eeg_test, X_video_train, X_video_test, X_ecg_train, X_ecg_test, y_train, y_test = \
    train_test_split(X_eeg, X_video, X_ecg, y, test_size=0.2, random_state=42)

num_valence_classes = int(np.max(y[:, 1])) + 1
num_arousal_classes = int(np.max(y[:, 2])) + 1

y_valence_train = to_categorical(y_train[:, 1], num_classes=num_valence_classes)
y_arousal_train = to_categorical(y_train[:, 2], num_classes=num_arousal_classes)
y_valence_test  = to_categorical(y_test[:, 1], num_classes=num_valence_classes)
y_arousal_test  = to_categorical(y_test[:, 2], num_classes=num_arousal_classes)

# ----------- VIDEO DATA FOR EFFICIENTNETB0 -----------
X_video_train_2d = X_video_train[:, 0, :, :][..., np.newaxis]  # [N, 64, 64, 1]
X_video_test_2d  = X_video_test[:, 0, :, :][..., np.newaxis]

# EfficientNet expects 3 channels and specific preprocessing
def efficientnet_preprocess(x):
    x = np.repeat(x, 3, axis=-1)  # (N, 64, 64, 3)
    return preprocess_input(x)

X_video_train_2d = efficientnet_preprocess(X_video_train_2d)
X_video_test_2d  = efficientnet_preprocess(X_video_test_2d)

# ----------- MODEL DEFINITION -----------

# ==== Video branch: EfficientNetB0 ====
video_input = Input(shape=(64, 64, 3), name="video_input")
efficientnet_base = EfficientNetB0(
    include_top=False,
    weights='imagenet',
    input_shape=(64, 64, 3),
    pooling='avg'
)
efficientnet_base.trainable = False  # Set True to fine-tune
x = efficientnet_base(video_input)
cnn_output = Dense(128, activation='relu')(x)

# ==== EEG branch ====
eeg_input = Input(shape=(5, 512), name="eeg_input")
lstm_eeg = LSTM(128, return_sequences=True)(eeg_input)
lstm_eeg = Dropout(0.2)(lstm_eeg)
lstm_eeg = LSTM(64)(lstm_eeg)
lstm_eeg_output = Dense(128, activation='relu')(lstm_eeg)

# ==== ECG branch ====
ecg_input = Input(shape=(2, 512), name="ecg_input")
lstm_ecg = LSTM(64, return_sequences=True)(ecg_input)
lstm_ecg = Dropout(0.2)(lstm_ecg)
lstm_ecg = LSTM(32)(lstm_ecg)
lstm_ecg_output = Dense(64, activation='relu')(lstm_ecg)

# ==== Combine modalities ====
combined = Concatenate()([cnn_output, lstm_eeg_output, lstm_ecg_output])
combined = Dense(256, activation='relu')(combined)
combined = Dropout(0.3)(combined)
combined = Dense(128, activation='relu')(combined)
combined = Dropout(0.2)(combined)

# ==== Output layers ====
valence_output = Dense(num_valence_classes, activation='softmax', name="valence_output")(combined)
arousal_output = Dense(num_arousal_classes, activation='softmax', name="arousal_output")(combined)

model = Model(
    inputs=[video_input, eeg_input, ecg_input],
    outputs=[valence_output, arousal_output]
)

model.compile(
    optimizer='adam',
    loss=['categorical_crossentropy', 'categorical_crossentropy'],
    metrics=['accuracy', 'accuracy']
)

model.summary()

# ----------- TRAINING -----------
history = model.fit(
    [X_video_train_2d, X_eeg_train, X_ecg_train],
    [y_valence_train, y_arousal_train],
    validation_data=(
        [X_video_test_2d, X_eeg_test, X_ecg_test],
        [y_valence_test, y_arousal_test]
    ),
    epochs=50,
    batch_size=128
)

# ----------- EVALUATION -----------
y_valence_pred, y_arousal_pred = model.predict([X_video_test_2d, X_eeg_test, X_ecg_test])
y_valence_pred_classes = np.argmax(y_valence_pred, axis=1)
y_arousal_pred_classes = np.argmax(y_arousal_pred, axis=1)
y_valence_true = np.argmax(y_valence_test, axis=1)
y_arousal_true = np.argmax(y_arousal_test, axis=1)

valence_accuracy = accuracy_score(y_valence_true, y_valence_pred_classes)
arousal_accuracy = accuracy_score(y_arousal_true, y_arousal_pred_classes)

print(f"Valence Accuracy: {valence_accuracy * 100:.2f}%")
print(f"Arousal Accuracy: {arousal_accuracy * 100:.2f}%")


Number of samples after duplication: 372
Downloading data from https://storage.googleapis.com/keras-applications/efficientnetb0_notop.h5
[1m16705208/16705208[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step


Epoch 1/50
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 2s/step - arousal_output_accuracy: 0.0918 - arousal_output_loss: 2.3176 - loss: 4.6028 - valence_output_accuracy: 0.1441 - valence_output_loss: 2.2805 - val_arousal_output_accuracy: 0.2284 - val_arousal_output_loss: 2.1845 - val_loss: 4.3705 - val_valence_output_accuracy: 0.2132 - val_valence_output_loss: 2.1797
Epoch 2/50
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 702ms/step - arousal_output_accuracy: 0.1837 - arousal_output_loss: 2.1731 - loss: 4.2924 - valence_output_accuracy: 0.2247 - valence_output_loss: 2.1186 - val_arousal_output_accuracy: 0.2538 - val_arousal_output_loss: 2.0434 - val_loss: 4.0503 - val_valence_output_accuracy: 0.2386 - val_valence_output_loss: 2.0053
Epoch 3/50
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 711ms/step - arousal_output_accuracy: 0.2340 - arousal_output_loss: 2.0789 - loss: 4.0760 - valence_output_accuracy: 0.2720 - valence_output_loss: 1.

**Without duplicate data**

In [3]:
import os
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import (
    Input, Dense, LSTM, Dropout, Concatenate, BatchNormalization, Lambda
)
from tensorflow.keras.applications import EfficientNetB0
from tensorflow.keras.applications.efficientnet import preprocess_input
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.utils import resample
from tensorflow.keras.utils import to_categorical

# ----------- DATA LOADING & DUPLICATION -----------
data_path = '/content/drive/MyDrive/dataset/preprocessed-data/mahnob_HCI_preprocessed_all.npy'
data = np.load(data_path, allow_pickle=True)

# ----------- FEATURES AND LABELS -----------
X_eeg   = np.array([entry['eeg_data']  for entry in data])
X_video = np.array([entry['video_data'] for entry in data]) # (N, 32, 64, 64)
X_ecg   = np.array([entry['ecg_data']  for entry in data])
y       = np.array([entry['labels']    for entry in data])
print(f"Number of samples after duplication: {len(data)}")

# Pad video data if needed
if X_video.shape[1:] != (32, 64, 64):
    padded_video = []
    for video in X_video:
        pad_shape = [
            (0, max(0, 32 - video.shape[0])),
            (0, max(0, 64 - video.shape[1])),
            (0, max(0, 64 - video.shape[2]))
        ]
        padded_video.append(np.pad(video, pad_shape, mode='constant'))
    X_video = np.array(padded_video)

# ----------- CLASS BALANCING -----------
def balance_classes(X1, X2, X3, y, class_idx):
    labels = y[:, class_idx]
    unique_classes, class_counts = np.unique(labels, return_counts=True)
    max_count = np.max(class_counts)

    balanced_X1, balanced_X2, balanced_X3, balanced_y = [], [], [], []
    for cls in unique_classes:
        cls_mask = labels == cls
        cls_X1, cls_X2, cls_X3, cls_y = X1[cls_mask], X2[cls_mask], X3[cls_mask], y[cls_mask]

        if len(cls_y) < max_count:
            X1_oversampled, X2_oversampled, X3_oversampled, y_oversampled = resample(
                cls_X1, cls_X2, cls_X3, cls_y, replace=True,
                n_samples=max_count - len(cls_y), random_state=42)
            cls_X1 = np.concatenate([cls_X1, X1_oversampled])
            cls_X2 = np.concatenate([cls_X2, X2_oversampled])
            cls_X3 = np.concatenate([cls_X3, X3_oversampled])
            cls_y  = np.concatenate([cls_y,  y_oversampled])
        balanced_X1.append(cls_X1)
        balanced_X2.append(cls_X2)
        balanced_X3.append(cls_X3)
        balanced_y.append(cls_y)
    return (np.concatenate(balanced_X1),
            np.concatenate(balanced_X2),
            np.concatenate(balanced_X3),
            np.concatenate(balanced_y))

X_eeg, X_video, X_ecg, y = balance_classes(X_eeg, X_video, X_ecg, y, class_idx=1)
X_eeg, X_video, X_ecg, y = balance_classes(X_eeg, X_video, X_ecg, y, class_idx=2)

# ----------- SPLITTING -----------
X_eeg_train, X_eeg_test, X_video_train, X_video_test, X_ecg_train, X_ecg_test, y_train, y_test = \
    train_test_split(X_eeg, X_video, X_ecg, y, test_size=0.2, random_state=42)

num_valence_classes = int(np.max(y[:, 1])) + 1
num_arousal_classes = int(np.max(y[:, 2])) + 1

y_valence_train = to_categorical(y_train[:, 1], num_classes=num_valence_classes)
y_arousal_train = to_categorical(y_train[:, 2], num_classes=num_arousal_classes)
y_valence_test  = to_categorical(y_test[:, 1], num_classes=num_valence_classes)
y_arousal_test  = to_categorical(y_test[:, 2], num_classes=num_arousal_classes)

# ----------- VIDEO DATA FOR EFFICIENTNETB0 -----------
X_video_train_2d = X_video_train[:, 0, :, :][..., np.newaxis]  # [N, 64, 64, 1]
X_video_test_2d  = X_video_test[:, 0, :, :][..., np.newaxis]

# EfficientNet expects 3 channels and specific preprocessing
def efficientnet_preprocess(x):
    x = np.repeat(x, 3, axis=-1)  # (N, 64, 64, 3)
    return preprocess_input(x)

X_video_train_2d = efficientnet_preprocess(X_video_train_2d)
X_video_test_2d  = efficientnet_preprocess(X_video_test_2d)

# ----------- MODEL DEFINITION -----------

# ==== Video branch: EfficientNetB0 ====
video_input = Input(shape=(64, 64, 3), name="video_input")
efficientnet_base = EfficientNetB0(
    include_top=False,
    weights='imagenet',
    input_shape=(64, 64, 3),
    pooling='avg'
)
efficientnet_base.trainable = False  # Set True to fine-tune
x = efficientnet_base(video_input)
cnn_output = Dense(128, activation='relu')(x)

# ==== EEG branch ====
eeg_input = Input(shape=(5, 512), name="eeg_input")
lstm_eeg = LSTM(128, return_sequences=True)(eeg_input)
lstm_eeg = Dropout(0.2)(lstm_eeg)
lstm_eeg = LSTM(64)(lstm_eeg)
lstm_eeg_output = Dense(128, activation='relu')(lstm_eeg)

# ==== ECG branch ====
ecg_input = Input(shape=(2, 512), name="ecg_input")
lstm_ecg = LSTM(64, return_sequences=True)(ecg_input)
lstm_ecg = Dropout(0.2)(lstm_ecg)
lstm_ecg = LSTM(32)(lstm_ecg)
lstm_ecg_output = Dense(64, activation='relu')(lstm_ecg)

# ==== Combine modalities ====
combined = Concatenate()([cnn_output, lstm_eeg_output, lstm_ecg_output])
combined = Dense(256, activation='relu')(combined)
combined = Dropout(0.3)(combined)
combined = Dense(128, activation='relu')(combined)
combined = Dropout(0.2)(combined)

# ==== Output layers ====
valence_output = Dense(num_valence_classes, activation='softmax', name="valence_output")(combined)
arousal_output = Dense(num_arousal_classes, activation='softmax', name="arousal_output")(combined)

model = Model(
    inputs=[video_input, eeg_input, ecg_input],
    outputs=[valence_output, arousal_output]
)

model.compile(
    optimizer='adam',
    loss=['categorical_crossentropy', 'categorical_crossentropy'],
    metrics=['accuracy', 'accuracy']
)

model.summary()

# ----------- TRAINING -----------
history = model.fit(
    [X_video_train_2d, X_eeg_train, X_ecg_train],
    [y_valence_train, y_arousal_train],
    validation_data=(
        [X_video_test_2d, X_eeg_test, X_ecg_test],
        [y_valence_test, y_arousal_test]
    ),
    epochs=50,
    batch_size=128
)

# ----------- EVALUATION -----------
y_valence_pred, y_arousal_pred = model.predict([X_video_test_2d, X_eeg_test, X_ecg_test])
y_valence_pred_classes = np.argmax(y_valence_pred, axis=1)
y_arousal_pred_classes = np.argmax(y_arousal_pred, axis=1)
y_valence_true = np.argmax(y_valence_test, axis=1)
y_arousal_true = np.argmax(y_arousal_test, axis=1)

valence_accuracy = accuracy_score(y_valence_true, y_valence_pred_classes)
arousal_accuracy = accuracy_score(y_arousal_true, y_arousal_pred_classes)

print(f"Valence Accuracy: {valence_accuracy * 100:.2f}%")
print(f"Arousal Accuracy: {arousal_accuracy * 100:.2f}%")


Number of samples after duplication: 186
Downloading data from https://storage.googleapis.com/keras-applications/efficientnetb0_notop.h5
[1m16705208/16705208[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step


Epoch 1/50
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 4s/step - arousal_output_accuracy: 0.1275 - arousal_output_loss: 2.3194 - loss: 4.6091 - valence_output_accuracy: 0.1287 - valence_output_loss: 2.2890 - val_arousal_output_accuracy: 0.1915 - val_arousal_output_loss: 2.2324 - val_loss: 4.5147 - val_valence_output_accuracy: 0.1596 - val_valence_output_loss: 2.2822
Epoch 2/50
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 937ms/step - arousal_output_accuracy: 0.1599 - arousal_output_loss: 2.2299 - loss: 4.4315 - valence_output_accuracy: 0.1918 - valence_output_loss: 2.2006 - val_arousal_output_accuracy: 0.3298 - val_arousal_output_loss: 2.1582 - val_loss: 4.4114 - val_valence_output_accuracy: 0.1915 - val_valence_output_loss: 2.2532
Epoch 3/50
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 812ms/step - arousal_output_accuracy: 0.1858 - arousal_output_loss: 2.1420 - loss: 4.3025 - valence_output_accuracy: 0.2085 - valence_output_loss: 2.