In [2]:
import os
import glob
import time
import random
import numpy as np
import pandas as pd
import argparse

from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import cross_val_score, train_test_split, StratifiedKFold
from sklearn.svm import SVC
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score,
    f1_score, classification_report, confusion_matrix
)
from deap import base, creator, tools, algorithms

# Run configuration, declared as (flag, type, default, help text or None).
_CLI_OPTIONS = (
    ('--data_folder', str, "./DATA_Updated", None),
    ('--kernel', str, "rbf", None),
    ('--n_gen', int, 10, None),
    ('--n_pop', int, 50, None),
    ('--cxpb', float, 0.5, "CX rate at each tournement"),
    ('--mutpb', float, 0.1, "Mut rate at each tournement"),
)

parser = argparse.ArgumentParser()
for _flag, _kind, _default, _help in _CLI_OPTIONS:
    parser.add_argument(_flag, type=_kind, default=_default, help=_help)

# An explicit empty argument list is parsed, so sys.argv is never consulted
# and `args` always holds the defaults above (presumably because this code
# runs inside a notebook kernel -- confirm before reusing as a script).
args = parser.parse_args(args=[])

# read/combine csv
def load_data_from_csv(folder_path):
    """Load every ``*.xls`` file in *folder_path* and stack them into one dataset.

    Files are matched by the ``.xls`` extension but parsed with
    ``pandas.read_csv``. In each file the first column is discarded (the
    subject label, per the original author's note), the second column is the
    class label and all remaining columns are features.

    Args:
        folder_path: Directory that contains the feature files.

    Returns:
        A tuple ``(X_all, y_all, label_encoder)`` where ``X_all`` is the
        row-wise stack of all feature matrices, ``y_all`` holds the encoded
        class labels shifted to start at 1, and ``label_encoder`` is the
        fitted ``LabelEncoder`` (its own codes start at 0).

    Raises:
        FileNotFoundError: If no ``*.xls`` file exists in *folder_path*.
    """
    # glob returns files in arbitrary filesystem order; sorting makes the row
    # order of the stacked dataset reproducible between runs and machines.
    all_files = sorted(glob.glob(os.path.join(folder_path, "*.xls")))
    if not all_files:
        raise FileNotFoundError(f"No '*.xls' files found in {folder_path!r}")

    # Parse each file exactly once and drop its first column straight away.
    frames = [pd.read_csv(file).iloc[:, 1:] for file in all_files]

    # Fit the encoder on the labels of ALL files, so a class that is missing
    # from the first file cannot make transform() fail on a later one.
    label_names = set()
    for df in frames:
        label_names.update(df.iloc[:, 0].unique())
    label_encoder = LabelEncoder()
    label_encoder.fit(sorted(label_names))
    label_map = dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_) + 1))
    print("Label mapping:", label_map)

    feature_list = []
    label_list = []
    for file, df in zip(all_files, frames):
        y = label_encoder.transform(df.iloc[:, 0]) + 1   # 1-based class ids
        X = df.iloc[:, 1:].values
        feature_list.append(X)
        label_list.append(y)
        print(f"Loading {file}:  {df.shape}")

    X_all = np.vstack(feature_list)
    y_all = np.hstack(label_list)
    print(f"Loaded {len(all_files)} csv: X.shape = {X_all.shape}, y.shape = {y_all.shape} in total.")
    return X_all, y_all, label_encoder

def evaluate(individual):
    """Score a feature mask by the mean cross-validated accuracy of an SVC.

    Reads the module-level ``X_global`` / ``y_global`` data and the kernel
    name from ``args.kernel``.

    Args:
        individual: Sequence of numbers acting as a feature mask; every
            non-zero position selects the matching column of ``X_global``.

    Returns:
        A one-element tuple holding the mean accuracy over a shuffled
        5-fold stratified split, or ``(0.0,)`` when the mask sums to zero.
    """
    # A mask that sums to zero is given the lowest score without training.
    if sum(individual) == 0:
        return (0.0,)

    mask = np.atleast_1d(individual)
    chosen_columns = mask.nonzero()[0]
    features = X_global[:, chosen_columns]

    # The shuffle is unseeded, so repeated calls on one mask can differ.
    folds = StratifiedKFold(n_splits=5, shuffle=True)
    model = SVC(kernel=args.kernel, gamma='scale', C=1.0)
    fold_accuracy = cross_val_score(
        model,
        features,
        y_global,
        cv=folds,
        scoring='accuracy',
        n_jobs=-1,
    )
    return (fold_accuracy.mean(),)


def evaluate_selected_features(X, y, selected_indices, label_encoder):
    """Train a classifier on the chosen feature columns and print its metrics.

    The data is split 80/20 at random (unseeded), a 50-tree
    ``RandomForestClassifier`` is fitted on the training part, and the
    classification report, confusion matrix and macro-averaged scores for the
    held-out part are printed. Nothing is returned.

    Args:
        X: 2-D feature matrix, indexed as ``X[:, selected_indices]``.
        y: Class labels aligned with the rows of ``X``.
        selected_indices: Column indices of the features to keep.
        label_encoder: Fitted encoder; ``classes_`` supplies the row names of
            the report. Assumes its classes sort in the same order as the
            distinct values of ``y`` -- TODO confirm for callers that build
            ``y`` differently.
    """
    clf = RandomForestClassifier(n_estimators=50)
    # Name the classifier that is actually used; the message used to say
    # "SVC" although a random forest is trained here.
    print(f"{type(clf).__name__} evaluating selected feature ... ", end='')

    X_sel = X[:, selected_indices]
    X_train, X_test, y_train, y_test = train_test_split(X_sel, y, test_size=0.2)

    t0 = time.time()
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print(f"in {time.time()-t0:.2f} secs.")

    # Take the label set from the whole dataset, so the report rows and the
    # confusion-matrix shape do not depend on which classes happen to land in
    # the random test split.
    labels = np.unique(y)

    print(" Classification Report:")
    print(classification_report(
        y_test, y_pred,
        labels=labels,
        target_names=label_encoder.classes_,
        zero_division=1,
    ))

    print(" Confusion Matrix:")
    print(confusion_matrix(y_test, y_pred, labels=labels))

    # Same zero_division policy for every metric, matching the report above.
    print(" Accuracy:          ", accuracy_score(y_test, y_pred))
    print(" Precision (macro): ", precision_score(y_test, y_pred, average='macro', zero_division=1))
    print(" Recall (macro):    ", recall_score(y_test, y_pred, average='macro', zero_division=1))
    print(" F1-score (macro):  ", f1_score(y_test, y_pred, average='macro', zero_division=1))

#
if __name__ == "__main__":
    # Baseline run: load the dataset from the configured folder and evaluate
    # the classifier with every feature column selected.
    X, y, label_encoder = load_data_from_csv(folder_path=args.data_folder)
    selected_indices = range(X.shape[1])
    evaluate_selected_features(
        X=X,
        y=y,
        selected_indices=selected_indices,
        label_encoder=label_encoder,
    )


Label mapping: {np.str_('feet'): np.int64(1), np.str_('left_hand'): np.int64(2), np.str_('right_hand'): np.int64(3), np.str_('tongue'): np.int64(4)}
Loading ./DATA_Updated/A05T_features.xls:  (288, 151)
Loading ./DATA_Updated/A01T_features.xls:  (288, 151)
Loading ./DATA_Updated/A02T_features.xls:  (288, 151)
Loading ./DATA_Updated/A06T_features.xls:  (288, 151)
Loading ./DATA_Updated/A04T_features.xls:  (144, 151)
Loading ./DATA_Updated/A08T_features.xls:  (288, 151)
Loading ./DATA_Updated/A07T_features.xls:  (288, 151)
Loading ./DATA_Updated/A09T_features.xls:  (288, 151)
Loading ./DATA_Updated/A03T_features.xls:  (288, 151)
Loaded 9 csv: X.shape = (2448, 150), y.shape = (2448,) in total.
SVC evaluating selected feature ... in 1.25 secs.
 Classification Report:
              precision    recall  f1-score   support

        feet       0.27      0.20      0.23       127
   left_hand       0.26      0.28      0.27       140
  right_hand       0.26      0.30      0.28       132
      ton