In [11]:
import os
import glob
import time
import random
import numpy as np
import pandas as pd
import argparse

from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline

from sklearn.model_selection import cross_val_score, train_test_split, StratifiedKFold, LeaveOneGroupOut
from sklearn.svm import SVC

from sklearn.decomposition import PCA
from sklearn.preprocessing import LabelEncoder, StandardScaler

from sklearn.metrics import (
    accuracy_score, precision_score, recall_score,
    f1_score, classification_report, confusion_matrix
)
from deap import base, creator, tools, algorithms

# Number of subjects expected in the dataset.
N_SUBJECT = 9


def _build_parser():
    """Create the argument parser that holds the experiment settings."""
    cli = argparse.ArgumentParser()
    # One row per option: (flag, value type, default, help text).
    option_table = (
        ('--data_folder', str, "./DATA_Updated", None),
        ('--kernel', str, "rbf", None),
        ('--n_gen', int, 10, None),
        ('--n_pop', int, 50, None),
        ('--cxpb', float, 0.5, "CX rate at each tournement"),
        ('--mutpb', float, 0.1, "Mut rate at each tournement"),
    )
    for flag, value_type, fallback, description in option_table:
        cli.add_argument(flag, type=value_type, default=fallback, help=description)
    return cli


parser = _build_parser()
# Parsing an explicit empty list ignores sys.argv, so `args` always carries
# the defaults declared above (presumably so the cell runs inside a notebook).
args = parser.parse_args(args=[])

# read/combine csv
def load_data_from_csv(folder_path):
    """Load every ``*.xls`` file in ``folder_path`` and stack them into arrays.

    The files are parsed with ``pandas.read_csv``, so despite the ``.xls``
    extension their content must be CSV text. Within each file, column 1 is
    used as the class label and columns 2 onward as the features. The subject
    id is taken from column 0 (see the note in the loop below).

    Args:
        folder_path: Directory that contains the feature files.

    Returns:
        A tuple ``(X_all, y_all, label_encoder, g_all)``:
        ``X_all`` is the row-wise stack of all feature matrices, ``y_all``
        the encoded labels shifted to start at 1, ``label_encoder`` the
        fitted ``LabelEncoder``, and ``g_all`` the per-row subject id.

    Raises:
        FileNotFoundError: If ``folder_path`` contains no ``*.xls`` file.
    """
    # glob returns files in arbitrary file-system order; sorting makes the
    # row order of the stacked arrays reproducible between runs/machines.
    all_files = sorted(glob.glob(os.path.join(folder_path, "*.xls")))
    if not all_files:
        raise FileNotFoundError(f"No *.xls files found in {folder_path!r}")

    # Read each file exactly once and reuse the frames below.
    frames = [pd.read_csv(file) for file in all_files]

    ## Collect label names from every file so that a label which first
    ## appears in a later file cannot break ``transform``.
    label_names = set()
    for df in frames:
        label_names.update(df.iloc[:, 1].unique())
    label_encoder = LabelEncoder()
    label_encoder.fit(sorted(label_names))
    label_map = dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_) + 1))
    print("Label mapping:", label_map)

    ## Convert every file to (features, labels, subject ids)
    feature_list = []
    label_list = []
    subj_list = []
    for file, df in zip(all_files, frames):
        y = label_encoder.transform(df.iloc[:, 1]) + 1   # class ids start at 1
        X = df.iloc[:, 2:].values
        feature_list.append(X)
        label_list.append(y)
        # The subject id is the third character of column 0 in the second row
        # and is applied to every row of the file.
        # NOTE(review): assumes one subject per file, ids shaped like "A05..."
        # with a single significant digit, and at least two rows — confirm.
        subj_list.append(np.full(y.shape, int(df.iloc[1, 0][2])))
        print(f"Loading {file}:  {df.shape}")

    X_all = np.vstack(feature_list)
    y_all = np.hstack(label_list)
    g_all = np.hstack(subj_list)
    print(f"Loaded {len(all_files)} csv: X.shape = {X_all.shape}, y.shape = {y_all.shape} in total.")
    return X_all, y_all, label_encoder, g_all

def evaluate(individual):
    """Fitness function: mean 5-fold CV accuracy of an SVC on chosen features.

    ``individual`` is a sequence of per-feature flags. Its non-zero positions
    select columns of the module-level ``X_global``, which are scored against
    ``y_global`` with the kernel given by ``args.kernel``.

    Returns:
        A one-element tuple holding the mean accuracy, or ``(0.0,)`` when the
        flags sum to zero (no feature selected).
    """
    # An empty selection gets the worst fitness without touching the data.
    if sum(individual) == 0:
        return (0.0,)

    flags = np.atleast_1d(individual)
    chosen_columns = np.nonzero(flags)[0]
    features = X_global[:, chosen_columns]

    # The folds are shuffled without a fixed seed, so repeated evaluations of
    # the same individual may yield slightly different scores.
    splitter = StratifiedKFold(n_splits=5, shuffle=True)
    svm = SVC(kernel=args.kernel, gamma='scale', C=1.0)
    fold_scores = cross_val_score(
        svm, features, y_global, cv=splitter, scoring='accuracy', n_jobs=-1
    )
    return (fold_scores.mean(),)

def evaluate_selected_features(X, y, g, selected_indices, label_encoder):
    """Run leave-one-group-out cross-validation on the selected feature columns.

    For every distinct value in ``g`` a StandardScaler -> PCA -> RBF-SVC
    pipeline is fitted on all other groups and scored on the held-out group.
    The per-fold accuracy and confusion matrix are printed, followed by the
    mean accuracy over all folds.

    Args:
        X: 2-D feature array (rows are samples, columns are features).
        y: Class label per row of ``X``.
        g: Group id per row of ``X``; each distinct id becomes one test fold.
        selected_indices: Iterable of column indices of ``X`` to evaluate.
            ``None`` keeps every column.
        label_encoder: Currently unused; kept so existing callers keep working.

    Returns:
        A dict mapping each held-out group id to its test accuracy.

    Raises:
        ValueError: If ``selected_indices`` selects no column.
    """
    # Restrict the data to the requested columns; without this the
    # ``selected_indices`` argument would have no effect at all.
    if selected_indices is not None:
        columns = np.asarray(list(selected_indices), dtype=int)
        X = X[:, columns]
    n_features = X.shape[1]
    if n_features == 0:
        raise ValueError("selected_indices selects no feature column")

    logo = LeaveOneGroupOut()
    # Report the real number of folds instead of a hard-coded subject count.
    n_folds = logo.get_n_splits(groups=g)

    model = Pipeline([
        ('scaler', StandardScaler()),
        # PCA cannot keep more components than there are features, so cap the
        # target of 16 when only a few columns were selected.
        ('pca', PCA(n_components=min(16, n_features), svd_solver="auto")),
        ('classifier', SVC(kernel='rbf', gamma='auto', C=1)),
    ])

    fold_accuracies = []
    test_gids = []

    print("Leave-one-subject-out CV...")
    for fold_idx, (train_index, test_index) in enumerate(logo.split(X, y, g)):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        # Every row of a test fold shares one group id, so the first is enough.
        test_gid = g[test_index][0]

        test_gids.append(test_gid)
        print(f"Fold {fold_idx + 1}/{n_folds}: Test group {test_gid}", end='')
        print(f"  Train/Test samples: {X_train.shape[0]} {X_test.shape[0]}")

        t0 = time.time()
        # fit() re-estimates every pipeline step, so no state leaks across folds.
        model.fit(X_train, y_train)

        y_pred = model.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        fold_accuracies.append(accuracy)

        print(f"  Accuracy {test_gid}: {accuracy:.4f} in {time.time()-t0:.2f} secs.")
        print(confusion_matrix(y_test, y_pred))

    print(f"Mean accuracy over {len(fold_accuracies)} folds: {np.mean(fold_accuracies):.4f}")
    return dict(zip(test_gids, fold_accuracies))

#
# Script entry point: load the data set and evaluate it with all features.
if __name__ == "__main__":

    # Features, 1-based labels, the fitted label encoder and per-row subject ids.
    X, y, label_encoder, g = load_data_from_csv(args.data_folder)
    # Baseline selection: the index of every feature column.
    selected_indices = range(X.shape[1])
    evaluate_selected_features(X, y, g, selected_indices, label_encoder)


Label mapping: {np.str_('feet'): np.int64(1), np.str_('left_hand'): np.int64(2), np.str_('right_hand'): np.int64(3), np.str_('tongue'): np.int64(4)}
Loading ./DATA_Updated/A05T_features.xls:  (288, 152)
Loading ./DATA_Updated/A01T_features.xls:  (288, 152)
Loading ./DATA_Updated/A02T_features.xls:  (288, 152)
Loading ./DATA_Updated/A06T_features.xls:  (288, 152)
Loading ./DATA_Updated/A04T_features.xls:  (144, 152)
Loading ./DATA_Updated/A08T_features.xls:  (288, 152)
Loading ./DATA_Updated/A07T_features.xls:  (288, 152)
Loading ./DATA_Updated/A09T_features.xls:  (288, 152)
Loading ./DATA_Updated/A03T_features.xls:  (288, 152)
Loaded 9 csv: X.shape = (2448, 150), y.shape = (2448,) in total.
Leave-one-subject-out CV...
Fold 1/9: Test group 1  Train/Test samples: 2160 288
  Accuracy 1: 0.2743 in 0.39 secs.
[[ 9 28 26  9]
 [ 8 31 22 11]
 [ 5 32 27  8]
 [ 7 31 22 12]]
Fold 2/9: Test group 2  Train/Test samples: 2160 288
  Accuracy 2: 0.2882 in 0.37 secs.
[[ 9 21 23 19]
 [12 27 25  8]
 [ 7 