## Imports for CNN Training

In [1]:
# MNE      : load and handle EEG data
# NumPy    : array operations
# glob/os  : file paths
# re       : parse subject IDs and labels from filenames
# sklearn  : data splitting and evaluation metrics
# TensorFlow/Keras : build and train the CNN

import os
import glob
import re
import numpy as np
import mne

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

import tensorflow as tf
from tensorflow.keras import layers, models


## Set Data Folder Path

In [2]:
# Folder holding the cleaned "*-epo.fif" epoch files.
# Set the EEG_DATA_DIR environment variable to point the notebook at another
# location without editing it; otherwise the original local path is used.
data_folder = os.environ.get(
    "EEG_DATA_DIR",
    r"E:\Personal Project\Recognizing Mood Disorder Using EEG\EEG Data\filtered_EEG_data",
)
print("Data folder:", data_folder)

# A missing folder would otherwise only show up later as "Found 0 files."
if not os.path.isdir(data_folder):
    print("WARNING: data folder does not exist:", data_folder)

Data folder: E:\Personal Project\Recognizing Mood Disorder Using EEG\EEG Data\filtered_EEG_data


## List EEG Files & Parse Filenames

In [3]:
# Get all cleaned epoch files.
# glob returns matches in arbitrary, filesystem-dependent order, so the list is
# sorted to keep the loading order (and therefore the row order of X) the same
# on every machine and every run.
file_paths = sorted(glob.glob(os.path.join(data_folder, "*-epo.fif")))
print(f"Found {len(file_paths)} files.")
print("Example files:", [os.path.basename(fp) for fp in file_paths[:5]])

# Function to extract subject_id and label from filename
def parse_filename(file_path):
    """Pull the subject ID and class label out of an epoch file's name.

    Only the base name of `file_path` is inspected. The subject ID is the
    first run of eight consecutive digits, and the label is the run of ASCII
    letters that follows "_label_". Each element independently falls back to
    "UNKNOWN" when its pattern is not found.

    Returns:
        (subject_id, label) as a tuple of two strings.
    """
    base = os.path.basename(file_path)

    def first_group(pattern):
        # First capture group of `pattern` in the base name, or the fallback
        found = re.search(pattern, base)
        if found is None:
            return "UNKNOWN"
        return found.group(1)

    return first_group(r"(\d{8})"), first_group(r"_label_([A-Za-z]+)")

Found 53 files.
Example files: ['02010002_label_MDD-epo.fif', '02010004_label_MDD-epo.fif', '02010005_label_MDD-epo.fif', '02010006_label_MDD-epo.fif', '02010008_label_MDD-epo.fif']


## Load All Subjects (Build X, y, groups)

In [4]:
# Per-file pieces; concatenated into X / y / groups after the loop
X_list = []
y_list = []
groups_list = []

label_map = {"HC": 0, "MDD": 1}

for fp in file_paths:
    subject_id, label_str = parse_filename(fp)

    # Ignore files whose label is not one of the two known classes
    if label_str not in label_map:
        print("Skipping (unknown label):", fp)
        continue

    # Files without a parseable subject ID would all share the group "UNKNOWN"
    # and be treated as one person by the subject-wise split and evaluation
    if subject_id == "UNKNOWN":
        print("Skipping (unknown subject):", fp)
        continue

    print("Loading:", os.path.basename(fp),
          "| subject:", subject_id,
          "| label:", label_str)

    # Load epochs
    epochs = mne.read_epochs(fp, preload=True, verbose="ERROR")
    # Cast per file so the concatenated X is never held in float64; the
    # normalisation cell converts to float32 anyway, so values are unchanged
    data = epochs.get_data().astype("float32")  # (epochs, channels, times)

    n_epochs = data.shape[0]

    # One label and one subject ID per epoch, so rows stay aligned with X
    X_list.append(data)
    y_list.append(np.full(n_epochs, label_map[label_str], dtype=int))
    groups_list.append(np.full(n_epochs, subject_id))

# np.concatenate on an empty list fails with an unhelpful message
if not X_list:
    raise ValueError(
        f"No usable epoch files among {len(file_paths)} candidate file(s); "
        "check data_folder and the file naming."
    )

# Combine lists into arrays
X = np.concatenate(X_list, axis=0)       # (N, C, T)
y = np.concatenate(y_list, axis=0)       # (N,)
groups = np.concatenate(groups_list, 0)  # (N,)

print("X shape:", X.shape)
print("y shape:", y.shape)
print("Unique subjects:", len(np.unique(groups)))
# minlength keeps one count per class even when a class has no epochs
print("Class counts:", np.bincount(y, minlength=len(label_map)))


Loading: 02010002_label_MDD-epo.fif | subject: 02010002 | label: MDD
Loading: 02010004_label_MDD-epo.fif | subject: 02010004 | label: MDD
Loading: 02010005_label_MDD-epo.fif | subject: 02010005 | label: MDD
Loading: 02010006_label_MDD-epo.fif | subject: 02010006 | label: MDD
Loading: 02010008_label_MDD-epo.fif | subject: 02010008 | label: MDD
Loading: 02010010_label_MDD-epo.fif | subject: 02010010 | label: MDD
Loading: 02010011_label_MDD-epo.fif | subject: 02010011 | label: MDD
Loading: 02010012_label_MDD-epo.fif | subject: 02010012 | label: MDD
Loading: 02010013_label_MDD-epo.fif | subject: 02010013 | label: MDD
Loading: 02010015_label_MDD-epo.fif | subject: 02010015 | label: MDD
Loading: 02010016_label_MDD-epo.fif | subject: 02010016 | label: MDD
Loading: 02010018_label_MDD-epo.fif | subject: 02010018 | label: MDD
Loading: 02010019_label_MDD-epo.fif | subject: 02010019 | label: MDD
Loading: 02010021_label_MDD-epo.fif | subject: 02010021 | label: MDD
Loading: 02010022_label_MDD-epo.fi

## Subject-wise Train/Val/Test Split

In [5]:
# Subject-wise split: the split is made over subject IDs, not epochs, so every
# epoch of a given subject ends up in exactly one of train / val / test.

unique_subjects = np.unique(groups)

# Hold out 20% of the subjects for testing
trainval_subj, test_subj = train_test_split(unique_subjects, test_size=0.2, random_state=42)

# A quarter of the remaining 80% becomes validation (i.e. 20% of all subjects)
train_subj, val_subj = train_test_split(trainval_subj, test_size=0.25, random_state=42)

for split_label, split_subjects in (
    ("Train subjects:", train_subj),
    ("Val subjects:  ", val_subj),
    ("Test subjects: ", test_subj),
):
    print(split_label, len(split_subjects))

# Boolean epoch masks: True where the epoch's subject belongs to the split
train_mask, val_mask, test_mask = (
    np.isin(groups, subjects) for subjects in (train_subj, val_subj, test_subj)
)

# Epoch-level datasets selected by those masks
X_train, y_train = X[train_mask], y[train_mask]
X_val, y_val = X[val_mask], y[val_mask]
X_test, y_test = X[test_mask], y[test_mask]

print("Train epochs:", X_train.shape)
print("Val epochs:  ", X_val.shape)
print("Test epochs: ", X_test.shape)

Train subjects: 31
Val subjects:   11
Test subjects:  11
Train epochs: (4688, 128, 500)
Val epochs:   (1666, 128, 500)
Test epochs:  (1658, 128, 500)


## Normalize Data & Reshape for 1D CNN

In [6]:
def _standardize(batch, center, scale):
    """Shift `batch` by `center` and divide by `scale` (both scalars)."""
    return (batch - center) / scale


# Work in single precision from here on
X_train = X_train.astype(np.float32)
X_val = X_val.astype(np.float32)
X_test = X_test.astype(np.float32)

# One global mean/std taken from the training epochs only and applied to all
# three splits; the small constant guards against division by zero
mean = X_train.mean()
std = X_train.std() + 1e-7

X_train = _standardize(X_train, mean, std)
X_val = _standardize(X_val, mean, std)
X_test = _standardize(X_test, mean, std)

# Layout at this point is (N, channels, time)
n_epochs_train, n_channels, n_times = X_train.shape
print("Original train shape:", X_train.shape)

# Conv1D wants (N, time, channels): swap the last two axes
X_train_cnn = np.swapaxes(X_train, 1, 2)
X_val_cnn = np.swapaxes(X_val, 1, 2)
X_test_cnn = np.swapaxes(X_test, 1, 2)

print("CNN train shape:", X_train_cnn.shape)
print("CNN val shape:  ", X_val_cnn.shape)
print("CNN test shape: ", X_test_cnn.shape)

Original train shape: (4688, 128, 500)
CNN train shape: (4688, 500, 128)
CNN val shape:   (1666, 500, 128)
CNN test shape:  (1658, 500, 128)


## Define 1D CNN Model

In [7]:
# Seed Python, NumPy and TensorFlow before any weights are created so that
# weight initialisation, dropout and batch shuffling are repeatable between
# runs. NOTE(review): some GPU kernels may still be non-deterministic.
SEED = 42
tf.keras.utils.set_random_seed(SEED)

# Each sample is a (time steps, channels) sequence
input_shape = (n_times, n_channels)

model = models.Sequential([
    layers.Input(shape=input_shape),

    # Three Conv1D -> BatchNorm -> MaxPool -> Dropout stages with a growing
    # number of filters (16, 32, 64) and shrinking kernel size (7, 5, 3)
    layers.Conv1D(16, kernel_size=7, activation='relu'),
    layers.BatchNormalization(),
    layers.MaxPooling1D(2),
    layers.Dropout(0.3),

    layers.Conv1D(32, kernel_size=5, activation='relu'),
    layers.BatchNormalization(),
    layers.MaxPooling1D(2),
    layers.Dropout(0.3),

    layers.Conv1D(64, kernel_size=3, activation='relu'),
    layers.BatchNormalization(),
    layers.MaxPooling1D(2),
    layers.Dropout(0.4),

    # Classification head
    layers.Flatten(),
    layers.Dense(64, activation='relu'),
    layers.Dropout(0.5),

    layers.Dense(1, activation='sigmoid')  # binary: HC vs MDD
])

model.compile(
    optimizer=tf.keras.optimizers.Adam(1e-3),
    loss='binary_crossentropy',
    metrics=['accuracy']
)

model.summary()

## Train the CNN (with Early Stopping)

In [8]:
# Early stopping watches the validation loss with a patience of 5 epochs and
# is configured to restore the best weights.
early_stop = tf.keras.callbacks.EarlyStopping(monitor="val_loss", patience=5, restore_best_weights=True)

# Training settings gathered in one place
fit_options = dict(
    validation_data=(X_val_cnn, y_val),
    epochs=50,
    batch_size=32,
    callbacks=[early_stop],
    verbose=1,
)

history = model.fit(X_train_cnn, y_train, **fit_options)

Epoch 1/50
147/147 ━━━━━━━━━━━━━━━━━━━━ 10s 38ms/step - accuracy: 0.5689 - loss: 0.8024 - val_accuracy: 0.4178 - val_loss: 1.0695
Epoch 2/50
147/147 ━━━━━━━━━━━━━━━━━━━━ 5s 34ms/step - accuracy: 0.7931 - loss: 0.4421 - val_accuracy: 0.5738 - val_loss: 3.0559
Epoch 3/50
147/147 ━━━━━━━━━━━━━━━━━━━━ 6s 37ms/step - accuracy: 0.9384 - loss: 0.1769 - val_accuracy: 0.5072 - val_loss: 4.7481
Epoch 4/50
147/147 ━━━━━━━━━━━━━━━━━━━━ 5s 35ms/step - accuracy: 0.9597 - loss: 0.1056 - val_accuracy: 0.5462 - val_loss: 6.1808
Epoch 5/50
147/147 ━━━━━━━━━━━━━━━━━━━━ 5s 30ms/step - accuracy: 0.9810 - loss: 0.0653 - val_accuracy: 0.5084 - val_loss: 7.8002
Epoch 6/50
147/147 ━━━━━━━━━━━━━━━━━━━━ 5s 30ms/step - accuracy: 0.9857 - loss: 0.0492 - val_accuracy: 0.4598 - val_loss: 5.3514


## Evaluate CNN (Epoch-Level Performance)

In [9]:
# Flatten the model's predictions to 1-D, then threshold at 0.5 to obtain
# hard 0/1 class labels
y_prob_test = np.ravel(model.predict(X_test_cnn))
y_pred_test = (y_prob_test >= 0.5).astype(int)

# Epoch-level metrics on the held-out test subjects
acc_test = accuracy_score(y_test, y_pred_test)
cm_test = confusion_matrix(y_test, y_pred_test)
report_test = classification_report(y_test, y_pred_test)

for heading, value in (
    ("Epoch-level TEST accuracy:", acc_test),
    ("\nConfusion matrix (epoch-level):\n", cm_test),
    ("\nClassification report (epoch-level):\n", report_test),
):
    print(heading, value)

52/52 ━━━━━━━━━━━━━━━━━━━━ 1s 13ms/step
Epoch-level TEST accuracy: 0.4879372738238842

Confusion matrix (epoch-level):
 [[391 360]
 [489 418]]

Classification report (epoch-level):
               precision    recall  f1-score   support

           0       0.44      0.52      0.48       751
           1       0.54      0.46      0.50       907

    accuracy                           0.49      1658
   macro avg       0.49      0.49      0.49      1658
weighted avg       0.50      0.49      0.49      1658



## Recreate Subject Split (for Subject-Level Evaluation)

In [10]:
# Subject ID of every test epoch, row-aligned with X_test / y_test.
# The split is not recomputed here: y_test (and therefore y_pred_test) was
# selected with `test_mask` in the subject-wise split cell, so reusing that
# same mask is what guarantees the alignment. A second train_test_split call
# could silently drift if either copy were ever edited.
groups_test = groups[test_mask]

# Fail loudly if the mask and the test labels ever get out of step
assert groups_test.shape[0] == y_test.shape[0], (
    "groups_test is not aligned with y_test; re-run the split cell first"
)

print("Unique test subjects:", np.unique(groups_test))
print("groups_test shape:", groups_test.shape)

Unique test subjects: ['02010006' '02010010' '02010019' '02010021' '02010025' '02010028'
 '02020020' '02030003' '02030005' '02030014' '02030019']
groups_test shape: (1658,)


## Subject-Level Evaluation

In [11]:
# Subject-level evaluation: collapse the epoch-level predictions of each test
# subject into a single prediction by majority vote.
subject_preds = {}
subject_true = {}

# Loop through each subject in the test set
for subj in np.unique(groups_test):
    mask = (groups_test == subj)

    true_labels = y_test[mask]
    pred_labels = y_pred_test[mask]

    # Every epoch of a subject must carry the same label; averaging and
    # rounding would silently hide a mixed-label subject
    subj_labels = np.unique(true_labels)
    if subj_labels.size != 1:
        raise ValueError(
            f"Subject {subj} has inconsistent labels: {subj_labels.tolist()}"
        )

    # Majority vote in integer arithmetic: class 1 (MDD) wins only with a
    # strict majority of epochs, so an exact tie resolves to class 0 (HC)
    votes_for_mdd = int(pred_labels.sum())
    pred = int(2 * votes_for_mdd > pred_labels.size)
    true = int(subj_labels[0])

    subject_preds[subj] = pred
    subject_true[subj] = true

# Build arrays in a fixed (sorted) subject order
subj_ids = sorted(subject_preds.keys())
y_true_subj = np.array([subject_true[s] for s in subj_ids])
y_pred_subj = np.array([subject_preds[s] for s in subj_ids])

# Metrics; labels=[0, 1] keeps the confusion matrix 2x2 even if one class
# happens to be absent from this small set of subjects
acc_subj = accuracy_score(y_true_subj, y_pred_subj)
cm_subj = confusion_matrix(y_true_subj, y_pred_subj, labels=[0, 1])
report_subj = classification_report(y_true_subj, y_pred_subj)

print("SUBJECT-level accuracy:", acc_subj)
print("\nSubject-level confusion matrix:\n", cm_subj)
print("\nSubject-level classification report:\n", report_subj)

SUBJECT-level accuracy: 0.6363636363636364

Subject-level confusion matrix:
 [[4 1]
 [3 3]]

Subject-level classification report:
               precision    recall  f1-score   support

           0       0.57      0.80      0.67         5
           1       0.75      0.50      0.60         6

    accuracy                           0.64        11
   macro avg       0.66      0.65      0.63        11
weighted avg       0.67      0.64      0.63        11

