In [1]:
import os
import scipy

In [2]:
ruta_carpeta_TDAH = '/kaggle/input/ieee-tdah-control-database/ieee/ADHD_group'  
ruta_carpeta_control = '/kaggle/input/ieee-tdah-control-database/ieee/Control_group'  

# Nombre de cada sujeto
sujetos_TDAH = [archivo[:-4] for archivo in os.listdir(ruta_carpeta_TDAH) if archivo.endswith('.mat')]
sujetos_TDAH.pop()
sujetos_control = [archivo[:-4] for archivo in os.listdir(ruta_carpeta_control) if archivo.endswith('.mat')]

diagnostico = {}

for sbj in sujetos_TDAH:
    diagnostico[sbj] = 1

for sbj in sujetos_control:
    diagnostico[sbj] = 0

# organizamos los datos de los sujetos con TDAH en un diccionario
eeg_tdah = {}

for i in range(len(sujetos_TDAH)):
    sbj = sujetos_TDAH[i]
    mat_file_path = ruta_carpeta_TDAH+'/'+sbj+'.mat'
    data = scipy.io.loadmat(mat_file_path)
    columna = list(data.keys())[-1]
    eeg_tdah[sbj] = data[columna].T

# organizamos los datos de los sujetos de control en un diccionario
eeg_control = {}

for i in range(len(sujetos_control)):
    sbj = sujetos_control[i]
    mat_file_path = ruta_carpeta_control+'/'+sbj+'.mat'
    data = scipy.io.loadmat(mat_file_path)
    columna = list(data.keys())[-1]
    eeg_control[sbj] = data[columna].T

def segmentar_senales(db, labels):
    """
    Divide las señales EEG en segmentos de 512 instantes con un traslape del 50%.
    
    Args:
        db (dict): Diccionario donde las claves son los nombres de los sujetos y los valores
                   son matrices de forma CxT_i (C = canales, T_i = tiempo).
    
    Returns:
        dict: Nuevo diccionario con los segmentos de cada sujeto.
    """
    segmentos_db = {}
    segmento_tamano = 512
    paso = int(segmento_tamano * 0.5)  # Traslape del 50%
    i = 0
    
    segmentos = []
    y = []
    sbjs = []
    
    for sujeto, senal in db.items():
        C, T = senal.shape
        
        # Crear segmentos con traslape
        for inicio in range(0, T - segmento_tamano + 1, paso):
            segmento = senal[:, inicio:inicio + segmento_tamano]
            segmentos.append(segmento)
            y.append(labels[i])
            sbjs.append(sujeto)

        i += 1
    return np.array(segmentos), np.array(y), sbjs

In [3]:
import numpy as np

data= eeg_control | eeg_tdah
zeros = np.zeros(len(eeg_control))
ones = np.ones(len(eeg_tdah))
y = np.hstack((zeros, ones))

# **To build the proposal**

## **Pipeline**

### **1.Preprocessing**

In [4]:
import numpy as np
from scipy.signal import butter, filtfilt

# Bandpass filter (0.5-63 Hz)
def bandpass_filter(signal, lowcut=0.5, highcut=63, fs=128, order=4):
    nyquist = 0.5 * fs
    b, a = butter(order, [lowcut/nyquist, highcut/nyquist], btype='band')
    return filtfilt(b, a, signal)

# Notch filter at 50 Hz
def notch_filter(signal, fs=128, freq=50.0):
    nyquist = 0.5 * fs
    low = (freq - 1) / nyquist
    high = (freq + 1) / nyquist
    b, a = butter(2, [low, high], btype='bandstop')
    return filtfilt(b, a, signal)

### **2. Segmentation**

In [5]:
def segment_signal(signal, segment_length=512, overlap=256):
    segments = []
    for i in range(0, len(signal) - segment_length + 1, overlap):
        segments.append(signal[i:i+segment_length])
    return segments

# Define sub-band ranges
subbands = {
    "delta": (0.5, 4),
    "theta": (4, 8),
    "alpha": (8, 13),
    "beta": (13, 30)
}

def extract_subbands(segment, fs=128):
    return {band: bandpass_filter(segment, low, high, fs=fs) 
            for band, (low, high) in subbands.items()}

### **3. Feature extraction**

In [6]:
from scipy.stats import skew, kurtosis
from scipy.signal import welch
import numpy as np

def shannon_entropy(signal, bins=100):
    hist, _ = np.histogram(signal, bins=bins, density=True)
    hist = hist[hist > 0]  # Avoid log(0)
    return -np.sum(hist * np.log2(hist))

def compute_features(segment):
    features = {}
    features['std'] = np.std(segment)
    features['rms'] = np.sqrt(np.mean(np.square(segment)))
    features['skew'] = skew(segment)
    features['kurt'] = kurtosis(segment)

    # Hjorth parameters
    diff1 = np.diff(segment)
    diff2 = np.diff(diff1)
    features['hjorth_activity'] = np.var(segment)
    features['hjorth_mobility'] = np.sqrt(np.var(diff1) / np.var(segment))
    features['hjorth_complexity'] = np.sqrt(np.var(diff2) / np.var(diff1)) / features['hjorth_mobility']

    # Shannon entropy (custom)
    features['shannon'] = shannon_entropy(segment)

    # Spectral entropy
    freqs, psd = welch(segment, fs=128)
    psd_norm = psd / np.sum(psd)
    features['spectral_entropy'] = -np.sum(psd_norm * np.log2(psd_norm + 1e-12))

    # Band power
    features['band_power'] = np.sum(psd)

    # Power Spectral Density - Average value
    features['psd_avg'] = np.mean(psd)

    return features

In [7]:
from sklearn.preprocessing import StandardScaler
import pandas as pd

X = []
labels = []
sbjs = []

for subj_idx, (subj_id, signals) in enumerate(data.items()):
    label = y[subj_idx]
    print(f"Processing subject {subj_idx+1}: {subj_id}. Label: {label}")

    # signals assumed shape: (19 channels, time)
    channel_segments = [segment_signal(notch_filter(bandpass_filter(channel_signal)))
                        for channel_signal in signals]

    # Number of segments per channel might differ slightly due to length—sync by taking the minimum
    num_segments = min(len(segs) for segs in channel_segments)

    for i in range(num_segments):  # Loop over segment index
        segment_features = []
        for ch in range(len(signals)):  # For each channel
            seg = channel_segments[ch][i]
            subband_signals = extract_subbands(seg)
            for band_name in ['delta', 'theta', 'alpha', 'beta']:  # fixed order
                filtered = subband_signals[band_name]
                feats = compute_features(filtered)
                segment_features.extend(list(feats.values()))  # 11 features

        X.append(segment_features)  # length should be 4 * 19 * 11 = 836
        labels.append(label)
        sbjs.append(subj_id)

X = np.array(X)
y = np.array(labels)

print("X shape:", X.shape)
print("y shape:", y.shape)

Processing subject 1: v306. Label: 0.0
Processing subject 2: v46p. Label: 0.0
Processing subject 3: v140. Label: 0.0
Processing subject 4: v108. Label: 0.0
Processing subject 5: v107. Label: 0.0
Processing subject 6: v110. Label: 0.0
Processing subject 7: v143. Label: 0.0
Processing subject 8: v307. Label: 0.0
Processing subject 9: v308. Label: 0.0
Processing subject 10: v42p. Label: 0.0
Processing subject 11: v56p. Label: 0.0
Processing subject 12: v114. Label: 0.0
Processing subject 13: v120. Label: 0.0
Processing subject 14: v305. Label: 0.0
Processing subject 15: v149. Label: 0.0
Processing subject 16: v113. Label: 0.0
Processing subject 17: v303. Label: 0.0
Processing subject 18: v297. Label: 0.0
Processing subject 19: v47p. Label: 0.0
Processing subject 20: v54p. Label: 0.0
Processing subject 21: v59p. Label: 0.0
Processing subject 22: v57p. Label: 0.0
Processing subject 23: v58p. Label: 0.0
Processing subject 24: v115. Label: 0.0
Processing subject 25: v117. Label: 0.0
Processin

In [8]:
# debería quedar de tamaño segments x 4 x 19 x 11, o sea, cada segmento tiene 836  características

In [9]:
import pickle

with open('X_preprocessed.pkl', 'wb') as f:
    pickle.dump(X, f)

with open('y.pkl', 'wb') as f:
    pickle.dump(y, f)

In [10]:
from sklearn.feature_selection import SelectKBest, f_classif

# Run ANOVA
selector = SelectKBest(score_func=f_classif, k='all')
selector.fit(X, y)

# Get p-values and scores
p_values = selector.pvalues_
scores = selector.scores_

# Create mask for p-values <= 0.5
mask = p_values <= 0.5

# Apply the mask to X
X_selected = X[:, mask]

print("Selected features:", np.sum(mask))
print("X shape after ANOVA filter:", X_selected.shape)

Selected features: 756
X shape after ANOVA filter: (8213, 756)


In [11]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Standardize the data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_selected)

# Apply PCA
pca = PCA(n_components=0.90)  # Retain 90% of the variance
X_pca = pca.fit_transform(X_scaled)

# Print results
print("Original ANOVA-selected shape:", X_selected.shape)
print("Reduced shape after PCA:", X_pca.shape)
print("Number of components selected:", X_pca.shape[1])

Original ANOVA-selected shape: (8213, 756)
Reduced shape after PCA: (8213, 177)
Number of components selected: 177


In [12]:
import pickle
import numpy as np
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

# Asumiendo que ya tienes:
# - X_pca: características reducidas (n_samples, n_features)
# - y: labels (n_samples,)
# - sbjs: lista de sujetos para cada segmento (n_samples,)
# - folds: lista de folds con sujetos de train y test
#   Ejemplo: folds = [(train_subjects, test_subjects), ...]


with open("/kaggle/input/ieee-tdah-control-database/folds.pkl", "rb") as f:
    folds = pickle.load(f)
    
# Preparar listas para resultados
scores = []
subject_accuracy = {sbj: [] for sbj in set(sbjs)}

# 🔁 Loop sobre los folds
for fold_idx, (train_subjects, test_subjects) in enumerate(folds):
    print(f"Fold {fold_idx + 1} - Test subjects: {test_subjects}")

    # Obtener índices por sujeto
    train_idx = [i for i, sbj in enumerate(sbjs) if sbj in train_subjects]
    test_idx = [i for i, sbj in enumerate(sbjs) if sbj in test_subjects]

    # Dividir datos
    X_train, X_test = X_pca[train_idx], X_pca[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]
    sbjs_test = np.array(sbjs)[test_idx]

    # Entrenar el modelo SVM (RBF kernel como en el paper)
    clf = SVC(kernel='rbf', gamma='scale', C=1.0)
    clf.fit(X_train, y_train)

    # Evaluar el fold
    y_pred = clf.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    scores.append(acc)
    print(f'Fold Accuracy: {acc:.4f}')

    # Evaluar precisión por sujeto individual
    for sbj in test_subjects:
        sbj_indices = [i for i, s in enumerate(sbjs_test) if s == sbj]
        X_sbj = X_test[sbj_indices]
        y_sbj = y_test[sbj_indices]
        y_pred_sbj = clf.predict(X_sbj)
        acc_sbj = accuracy_score(y_sbj, y_pred_sbj)
        subject_accuracy[sbj].append(acc_sbj)

# Resultado final
mean_acc = np.mean(scores)
std_acc = np.std(scores)
print(f'\n✅ Mean Accuracy: {mean_acc:.4f} ± {std_acc:.4f}')

# Guardar resultados si se desea
with open('svm_lsso_scores.pkl', 'wb') as f:
    pickle.dump(scores, f)

with open('svm_lsso_subject_accuracy.pkl', 'wb') as f:
    pickle.dump(subject_accuracy, f)


Fold 1 - Test subjects: ['v28p', 'v274', 'v1p', 'v231', 'v22p', 'v29p', 'v206', 'v238', 'v31p', 'v35p', 'v177', 'v200', 'v112', 'v113', 'v48p', 'v140', 'v131', 'v125', 'v55p', 'v143', 'v43p', 'v305', 'v134', 'v114']
Fold Accuracy: 0.6692
Fold 2 - Test subjects: ['v18p', 'v39p', 'v234', 'v32p', 'v190', 'v6p', 'v254', 'v204', 'v24p', 'v183', 'v246', 'v219', 'v298', 'v41p', 'v47p', 'v308', 'v52p', 'v300', 'v59p', 'v299', 'v302', 'v51p', 'v109', 'v127']
Fold Accuracy: 0.6643
Fold 3 - Test subjects: ['v215', 'v3p', 'v209', 'v37p', 'v213', 'v15p', 'v284', 'v181', 'v19p', 'v34p', 'v263', 'v244', 'v138', 'v121', 'v46p', 'v54p', 'v120', 'v310', 'v147', 'v50p', 'v56p', 'v107', 'v297', 'v108']
Fold Accuracy: 0.6431
Fold 4 - Test subjects: ['v227', 'v8p', 'v236', 'v14p', 'v196', 'v27p', 'v33p', 'v179', 'v173', 'v10p', 'v265', 'v20p', 'v57p', 'v45p', 'v111', 'v115', 'v53p', 'v118', 'v123', 'v44p', 'v149', 'v303', 'v116', 'v151']
Fold Accuracy: 0.7167
Fold 5 - Test subjects: ['v279', 'v30p', 'v288',