In [1]:
import os
import numpy as np
import pandas as pd
import pyedflib
from pyedflib import highlevel
from scipy.signal import resample
from sklearn.utils import shuffle
import json
import mne

In [2]:
# Target sampling rate in Hz (fs); passed to eeg_data() as the resampling target.
SAMPLE_RATE = 128
# Windowing parameters, currently disabled:
# SAMPLE_LEN = 1.0   # sample seconds
# OVERLAPPING = 0.8  # overlapping seconds

# Output sub-folder name derived from the sampling rate.
sub_folder_path = f'{SAMPLE_RATE}Hz'
sub_folder_path

'128Hz'

In [3]:
# Dataset root directory; the annotation spreadsheet is read relative to it.
root = 'CAUEEG/'
# participants file path
annotation_path = os.path.join(root, 'annotation.xlsx')
# Load the spreadsheet into a DataFrame (the displayed output shows columns
# such as serial, age and per-diagnosis flag columns).
annotations = pd.read_excel(annotation_path)
annotations

Unnamed: 0,serial,age,dementia,ad,load,eoad,vd,sivd,ad_vd_mixed,mci,...,ftd,bvftd,language_ftd,semantic_aphasia,non_fluent_aphasia,parkinson_synd,parkinson_disease,parkinson_dementia,nph,tga
0,1,78,,,,,,,,1.0,...,,,,,,,,,,
1,2,56,,,,,,,,,...,,,,,,,,,,
2,3,93,,,,,,,,1.0,...,,,,,,,,,,
3,4,78,1.0,1.0,1.0,,,,,,...,,,,,,,,,,
4,5,75,,,,,,,,1.0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1374,1384,57,1.0,1.0,,1.0,,,,,...,,,,,,,,,,
1375,1385,77,1.0,1.0,1.0,,,,,,...,,,,,,,,,,
1376,1386,80,1.0,1.0,1.0,,,,,,...,,,,,,,,,,
1377,1387,83,,,,,,,,1.0,...,,,,,,,,,,


## Labels

In [38]:
# Diagnosis flag column -> class label (De=1, MCI=2, NC=0).
# Insertion order matters: np.select uses the first condition that matches a row.
label_by_column = {
    "dementia": 1,  # Dementia: Alzheimer's Disease, Vascular Dementia, Parkinson's Dementia
    "mci": 2,       # Mild Cognitive Impairment
    "normal": 0,    # Normal control
}
conditions = [annotations[column] == 1 for column in label_by_column]
choices = list(label_by_column.values())

# Rows that match none of the conditions receive the fallback label 3.
annotations["label"] = np.select(conditions, choices, default=3)
annotations

Unnamed: 0,serial,age,dementia,ad,load,eoad,vd,sivd,ad_vd_mixed,mci,...,bvftd,language_ftd,semantic_aphasia,non_fluent_aphasia,parkinson_synd,parkinson_disease,parkinson_dementia,nph,tga,label
0,1,78,,,,,,,,1.0,...,,,,,,,,,,2
1,2,56,,,,,,,,,...,,,,,,,,,,0
2,3,93,,,,,,,,1.0,...,,,,,,,,,,2
3,4,78,1.0,1.0,1.0,,,,,,...,,,,,,,,,,1
4,5,75,,,,,,,,1.0,...,,,,,,,,,,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1374,1384,57,1.0,1.0,,1.0,,,,,...,,,,,,,,,,1
1375,1385,77,1.0,1.0,1.0,,,,,,...,,,,,,,,,,1
1376,1386,80,1.0,1.0,1.0,,,,,,...,,,,,,,,,,1
1377,1387,83,,,,,,,,1.0,...,,,,,,,,,,2


In [39]:
# Persist the labels as a 2-column array: column 0 = label, column 1 = serial.
df_label = annotations[['label','serial']]
label_path = 'Processed/' + sub_folder_path + '/CAUEEG/Label'
# exist_ok=True creates the folder when missing and is a no-op otherwise,
# without the check-then-create race of os.path.exists + os.makedirs.
os.makedirs(label_path, exist_ok=True)

np.save(label_path + '/label.npy', df_label.values)

In [40]:
# Reload the array that was just saved to check it (column 0 = label, column 1 = serial).
np.load(label_path + '/label.npy')

array([[   2,    1],
       [   0,    2],
       [   2,    3],
       ...,
       [   1, 1386],
       [   2, 1387],
       [   2, 1388]], dtype=int64)

## Features

In [41]:
# Read the signal labels of the first recording.
f = pyedflib.EdfReader("CAUEEG/signal/edf/00001.edf")
try:
    channels = f.getSignalLabels()
finally:
    # Always release the file handle, even if reading the labels fails.
    f.close()

In [42]:
# Display the signal labels read from the first EDF file.
channels

['Fp1-AVG',
 'F3-AVG',
 'C3-AVG',
 'P3-AVG',
 'O1-AVG',
 'Fp2-AVG',
 'F4-AVG',
 'C4-AVG',
 'P4-AVG',
 'O2-AVG',
 'F7-AVG',
 'T3-AVG',
 'T5-AVG',
 'F8-AVG',
 'T4-AVG',
 'T6-AVG',
 'FZ-AVG',
 'CZ-AVG',
 'PZ-AVG',
 'EKG',
 'Photic']

In [43]:
# Montage-style names, in the same order:
# 'Fp1', 'Fp2', 'F7', 'F3', 'Fz', 'F4', 'F8', 'T3', 'C3', 'Cz', 'C4', 'T4', 'T5', 'P3', 'Pz', 'P4', 'T6', 'O1', 'O2'
# The EDF signal labels spell the midline electrodes in upper case and carry
# an '-AVG' suffix, so the label list is built from those spellings.
_electrodes = (
    'Fp1', 'Fp2',
    'F7', 'F3', 'FZ', 'F4', 'F8',
    'T3', 'C3', 'CZ', 'C4', 'T4',
    'T5', 'P3', 'PZ', 'P4', 'T6',
    'O1', 'O2',
)
li_std_ch = [electrode + '-AVG' for electrode in _electrodes]

In [44]:
def generate_mask_from_events(json_path, total_samples, sample_rate=256, artifact_keywords=None):
    """
    Generate a boolean mask for valid EEG samples based on events.

    Samples between a "start recording" / "recording resumed" event and the
    next "paused" event are marked valid. A segment that is still open after
    the last event (no closing "paused") is kept up to the end of the
    recording. Afterwards, one second on either side of every event whose
    label contains an artifact keyword is marked invalid.

    Parameters
    ----------
    json_path : str
        Path to the event JSON file. The file must decode to a sequence of
        (timestamp, label) pairs, with timestamps usable as sample indices.
    total_samples : int
        Total number of EEG samples.
    sample_rate : int or float
        Sampling rate in Hz (default: 256). Only used for the 1 s artifact margin.
    artifact_keywords : list of str, optional
        Lower-case substrings that flag an event label as an artifact
        (default: common EEG noise terms).

    Returns
    -------
    mask : np.ndarray of bool
        Boolean array of length `total_samples`; True marks valid samples.
    """

    if artifact_keywords is None:
        artifact_keywords = [
            'artifact', 'blink', 'eye', 'noise', 'movement', 'move', 'cough',
            'muscle', 'chewing', 'swallow', 'talk', 'sleep', 'sweating', 'jerking', 'drowsy'
        ]

    with open(json_path, 'r') as f:
        events = json.load(f)

    mask = np.zeros(total_samples, dtype=bool)

    # Step 1: Mark valid recording periods.
    # segment_start is None while no recording segment is open.
    segment_start = None
    for timestamp, label in events:
        label_lower = label.lower()
        if label_lower in ("start recording", "recording resumed"):
            segment_start = timestamp
        elif label_lower == "paused" and segment_start is not None:
            mask[segment_start:min(timestamp, total_samples)] = True
            segment_start = None

    # A recording that was never paused after its last start/resume would
    # otherwise be dropped entirely; keep it up to the final sample.
    if segment_start is not None:
        mask[segment_start:total_samples] = True

    # Step 2: Remove artifact regions (±1s)
    margin = int(1 * sample_rate)
    for timestamp, label in events:
        label_lower = label.lower()
        if any(key in label_lower for key in artifact_keywords):
            start = max(0, timestamp - margin)
            end = min(total_samples, timestamp + margin)
            mask[start:end] = False

    return mask

In [45]:
# Resample a (T, C) time series from original_fs to target_fs, channel by channel.
def resample_time_series(data, original_fs, target_fs):
    """
    Resample every column of `data` with scipy.signal.resample.

    Args:
        data (np.ndarray): Time series of shape (T, C); time is the first axis.
        original_fs (float): Sampling frequency of `data`.
        target_fs (float): Desired sampling frequency.

    Returns:
        np.ndarray: Array of shape (int(T * target_fs / original_fs), C).
    """
    n_samples, n_channels = data.shape
    new_length = int(n_samples * target_fs / original_fs)

    out = np.zeros((new_length, n_channels))
    # Iterating over the transpose yields one channel (column) at a time.
    for idx, channel in enumerate(data.T):
        out[:, idx] = resample(channel, new_length)
    return out

def band_pass_filter(
    eeg_data: np.ndarray,
    sfreq: float,
    ch_names: list,
    verbose=True
):
    """
    Band-pass filter EEG data between 0.5 and 45 Hz using MNE.

    The data is wrapped in an MNE RawArray with all channels typed as 'eeg',
    the 'standard_1020' montage is attached, and `Raw.filter` is applied.
    No other cleaning step (ICA, bad-channel detection, re-referencing,
    epoching, resampling or normalization) is performed here.

    Args:
        eeg_data (np.ndarray): EEG data, shape (T, C).
        sfreq (float): Sampling frequency of `eeg_data`.
        ch_names (list): Channel names, one per column of `eeg_data`.
        verbose (bool): If True, print a confirmation line after each step.

    Returns:
        np.ndarray: Filtered EEG data, shape (T, C).
    """
    # Wrap the samples in an MNE Raw object; MNE expects (channels, times).
    channel_types = ['eeg'] * len(ch_names)
    info = mne.create_info(ch_names=ch_names, sfreq=sfreq, ch_types=channel_types)
    raw = mne.io.RawArray(eeg_data.T, info)

    # Attach electrode positions from the standard 10-20 montage.
    montage = mne.channels.make_standard_montage('standard_1020')
    raw.set_montage(montage)
    if verbose:
        print("✔ Montage set: 'standard_1020'.")

    # Band-pass between 0.5 Hz and 45 Hz, applied in place on `raw`.
    raw.filter(l_freq=0.5, h_freq=45.0, verbose=False)
    if verbose:
        print("✔ Bandpass filter applied (0.5–45 Hz).")

    # Back to the (T, C) layout used by the caller.
    filtered = raw.get_data()
    return filtered.T
        

def eeg_data(std_edf_path, event_path, li_std_ch, target_freq=128):
    """
    Load one EDF recording and return its cleaned, filtered, resampled samples.

    Steps: read the channels listed in `li_std_ch`, keep only the samples that
    generate_mask_from_events() marks as valid, band-pass filter them, and
    resample to `target_freq` when the recording uses a different rate.

    Args:
        std_edf_path (str): Path to the EDF file.
        event_path (str): Path to the event JSON file of the same recording.
        li_std_ch (list): Channel labels to read from the EDF file.
        target_freq (float): Desired output sampling frequency (default: 128).

    Returns:
        np.ndarray: Processed data with time as the first axis and channels
        as the second.
    """
    signals, headers, _ = highlevel.read_edf(std_edf_path, ch_names=li_std_ch)
    # The rate of the first channel is used for the whole recording.
    freq = headers[0]['sample_frequency']
    print("Original frequency ", freq)

    # Transpose so that time becomes the first axis.
    data = signals.T
    print("Raw data shape ", data.shape)

    valid = generate_mask_from_events(event_path, data.shape[0], freq)
    data = data[valid]
    print("Cleaned data shape ", data.shape)

    # Montage-compatible names, positionally matched to the channels read above.
    # NOTE(review): filtering runs on the concatenation of the kept segments,
    # so the filter sees discontinuities at the joins — confirm this is intended.
    montage_names = [
        'Fp1', 'Fp2', 'F7', 'F3', 'Fz', 'F4', 'F8',
        'T3', 'C3', 'Cz', 'C4', 'T4',
        'T5', 'P3', 'Pz', 'P4', 'T6',
        'O1', 'O2',
    ]
    data = band_pass_filter(data, freq, ch_names=montage_names)

    if freq != target_freq:
        data = resample_time_series(data, freq, target_freq)

    print("Resampled data shape ", data.shape)
    return data

In [46]:
# main
feature_path = 'Processed/' + sub_folder_path + '/CAUEEG/Feature'
os.makedirs(feature_path, exist_ok=True)


# 00955.json file does not have paused event, we manually add it to the end of the file before running the code
file_root_path = 'CAUEEG/signal/edf/'
event_root_path = 'CAUEEG/event/'

# Pair every recording with its event file by name (<stem>.edf -> <stem>.json)
# instead of zipping two os.listdir() results: directory listings come back in
# arbitrary order, and any extra file in either folder would shift the pairing.
for file in sorted(os.listdir(file_root_path)):
    if not file.endswith('.edf'):
        continue

    file_path = os.path.join(file_root_path, file)
    event = file[:-4] + '.json'
    event_path = os.path.join(event_root_path, event)
    if not os.path.isfile(event_path):
        # Without events no valid-sample mask can be built for this recording.
        print("Skipping ", file, " - missing event file ", event_path)
        continue

    print("Processing ", file)
    data = eeg_data(file_path, event_path, li_std_ch, SAMPLE_RATE)
    print("Data shape ", data.shape)
    # save the data
    np.save(os.path.join(feature_path, file[:-4] + '.npy'), data)
    print("----------------------------------------------------------------\n")

Processing  00001.edf
Original frequency  200.0
Raw data shape  (145600, 19)
Cleaned data shape  (133400, 19)
Creating RawArray with float64 data, n_channels=19, n_times=133400
    Range : 0 ... 133399 =      0.000 ...   666.995 secs
Ready.
✔ Montage set: 'standard_1020'.
✔ Bandpass filter applied (0.5–45 Hz).
Resampled data shape  (85376, 19)
Data shape  (85376, 19)
----------------------------------------------------------------

Processing  00002.edf
Original frequency  200.0
Raw data shape  (208600, 19)
Cleaned data shape  (191944, 19)
Creating RawArray with float64 data, n_channels=19, n_times=191944
    Range : 0 ... 191943 =      0.000 ...   959.715 secs
Ready.
✔ Montage set: 'standard_1020'.
✔ Bandpass filter applied (0.5–45 Hz).
Resampled data shape  (122844, 19)
Data shape  (122844, 19)
----------------------------------------------------------------

Processing  00003.edf
Original frequency  200.0
Raw data shape  (130200, 19)
Cleaned data shape  (123600, 19)
Creating RawArra

In [5]:
# Test the saved npy file
# example

path = feature_path

total_length = 0
# Sorted so the printed shapes follow a reproducible file order.
for file in sorted(os.listdir(path)):
    sub_path = os.path.join(path, file)
    # Open each file once, memory-mapped: only the shape is needed here,
    # so the sample data does not have to be read into memory (twice).
    shape = np.load(sub_path, mmap_mode='r').shape
    print(shape)
    total_length += shape[0]
print("\nTotal length:", total_length)

(85376, 19)
(122844, 19)
(79104, 19)
(97664, 19)
(101840, 19)
(96128, 19)
(105216, 19)
(57619, 19)
(123904, 19)
(106752, 19)
(87296, 19)
(87816, 19)
(77824, 19)
(104576, 19)
(74240, 19)
(69504, 19)
(105856, 19)
(69376, 19)
(94208, 19)
(107776, 19)
(71680, 19)
(116486, 19)
(152192, 19)
(72960, 19)
(111906, 19)
(109824, 19)
(119040, 19)
(101504, 19)
(109056, 19)
(105216, 19)
(74752, 19)
(112602, 19)
(81390, 19)
(108800, 19)
(92544, 19)
(101760, 19)
(114560, 19)
(90496, 19)
(96256, 19)
(71808, 19)
(102656, 19)
(116535, 19)
(113536, 19)
(114688, 19)
(103296, 19)
(153216, 19)
(98048, 19)
(78976, 19)
(121472, 19)
(77440, 19)
(80000, 19)
(105984, 19)
(95343, 19)
(124672, 19)
(56832, 19)
(102016, 19)
(61824, 19)
(102400, 19)
(126464, 19)
(105233, 19)
(78720, 19)
(85019, 19)
(90752, 19)
(99584, 19)
(100864, 19)
(97024, 19)
(104320, 19)
(105600, 19)
(100864, 19)
(103040, 19)
(112256, 19)
(73728, 19)
(72064, 19)
(71808, 19)
(96512, 19)
(112768, 19)
(114304, 19)
(102272, 19)
(85376, 19)
(82816, 19