### Can we classify each phase as relatively low or high anxiety for each subject? ###

In [12]:
# IMPORTING MODULES
import glob
import importlib
import matplotlib.pyplot as plt
import numpy as np
import os
cvx_path = os.path.abspath(os.path.join('..', '..', 'cvxEDA', 'src'))
module_path = os.path.abspath(os.path.join('..', '..', 'src'))
import pandas as pd
import random
import scipy.signal as ss
import sys
sys.path.append(module_path)

import tools.data_reader_apd as dr
import tools.display_tools as dt
import tools.preprocessing as preprocessing
import train

from scipy.fft import fft, fftfreq, fftshift
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split, cross_val_score, RepeatedKFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import normalize
from xgboost import XGBClassifier

import cvxopt.solvers
cvxopt.solvers.options['show_progress'] = False

import warnings
warnings.filterwarnings(
    "ignore", 
    category=RuntimeWarning
)

In [19]:
# LOAD TRAIN AND TEST DATA
importlib.reload(train)
importlib.reload(dr)
importlib.reload(dt)

# metrics = train.Metrics.ALL
# metrics = train.Metrics.ECG \
    # + train.Metrics.EDA \
    # + train.Metrics.ANKLE + train.Metrics.WRIST

metrics = [
    train.Metrics.BPM, 
    train.Metrics.RMSSD, 
    train.Metrics.HF_RR, 
    train.Metrics.LF_RR, 
    train.Metrics.IBI, 
    train.Metrics.SDNN, 
    train.Metrics.MEAN_SCL, 
    train.Metrics.SCR_RATE, 
    # train.Metrics.RESP,
    # train.Metrics.MEAN_ANKLE_ACT_L, 
    # train.Metrics.MEAN_ANKLE_ACT_R, 
    # train.Metrics.PEAK_ANKLE_ACC_L, 
    # train.Metrics.PEAK_ANKLE_ACC_R,
    # train.Metrics.MEAN_WRIST_ACT_L, 
    # train.Metrics.MEAN_WRIST_ACT_R, 
    # train.Metrics.PEAK_WRIST_ACC_L, 
    # train.Metrics.PEAK_WRIST_ACC_R
]

model_phases = [
    [
        "Baseline_Rest", 
        "BugBox_Relax", "BugBox_Anticipate", "BugBox_Exposure", "BugBox_Break",
        "Speech_Relax", "Speech_Anticipate", "Speech_Exposure", "Speech_Break"
    ],
    [
        "Baseline_Rest", 
        "BugBox_Relax", "BugBox_Anticipate", "BugBox_Break",
        "Speech_Relax", "Speech_Anticipate", "Speech_Break"
    ],
    [
        "Baseline_Rest", 
        "BugBox_Relax", "BugBox_Anticipate", 
        "Speech_Relax", "Speech_Anticipate"
    ],
    [
        "Baseline_Rest", 
        "BugBox_Relax",
        "Speech_Relax"
    ],
    ["BugBox_Break", "Speech_Break"],
    ["BugBox_Exposure", "Speech_Exposure"]
]

models = {
    "SVM": SVC(C=10, gamma=1),  # C=10, gamma=1
    "KNN": KNeighborsClassifier(n_neighbors=7),
    "DT": DecisionTreeClassifier(),
    "LogReg": LogisticRegression(max_iter=1000),
    "Bayes": GaussianNB(),
    "XGB": XGBClassifier()
}

for phases in model_phases:
    print(f"PHASES: {phases} " + "-"*30)
    anxiety_label_type = "Anxiety"
    x, y = train.Train_APD.get_apd_data_ranking(metrics, phases, verbose=False, anxiety_label_type=anxiety_label_type)
    x = x.drop(["phaseId"], axis=1)
    # drop subjects with noisy data
    x = x[x['subject'] != 84.0]
    y = y[y['subject'] != 84.0]
    # x = x[x['subject'] != 8.0]
    # y = y[y['subject'] != 8.0]

    x.drop(labels=["anxietyGroup"], axis=1)

    # 0-1 scaling
    for i in range(3, len(x.columns)):
        data_col = x[x.columns[i]]
        data_col = (data_col - data_col.min())/(data_col.max() - data_col.min())
        x[x.columns[i]] = data_col

    acc_results = {
        "SVM": [], "KNN": [],
        "DT": [],
        "LogReg": [],
        "Bayes": [],
        "XGB": []
    }
    reports = {
        "SVM": [], "KNN": [],
        "DT": [],
        "LogReg": [],
        "Bayes": [],
        "XGB": [],
    }
    num_iters = 10
    for _ in range(num_iters):
        out = train.train_predict(models, x, y, by_subject=False, save_metrics=True, test_size=0.15)
        for model_name in acc_results:
            acc_results[model_name].append(out[model_name][0])
            reports[model_name].append(out[model_name][1])

    for model_name in acc_results.keys():
        acc = np.mean(acc_results[model_name])
        print(f"{model_name} accuracy over {num_iters} rounds: {acc}")
        if acc > 0.4:
            print(f"Model evaluation metrics for {model_name}:")
            p = np.mean([report["precision"] for report in reports[model_name]])
            r = np.mean([report["recall"] for report in reports[model_name]])
            f1 = np.mean([report["f1"] for report in reports[model_name]])
            auc = np.mean([report["auc"] for report in reports[model_name]])
            report = reports[model_name]
            print(f"Precision: {p}\nRecall: {r}\nF1-score: {f1}\nAUC score: {auc}")
        print("")
    print("\n")

PHASES: ['Baseline_Rest', 'BugBox_Relax', 'BugBox_Anticipate', 'BugBox_Exposure', 'BugBox_Break', 'Speech_Relax', 'Speech_Anticipate', 'Speech_Exposure', 'Speech_Break'] ------------------------------
SVM accuracy over 10 rounds: 0.6656476151530948
Model evaluation metrics for SVM:
Precision: 0.6855120232677863
Recall: 0.9301052895415532
F1-score: 0.7892216631018754
AUC score: 0.5255668633318394

KNN accuracy over 10 rounds: 0.5916005355180756
Model evaluation metrics for KNN:
Precision: 0.670837659862495
Recall: 0.7740338833678779
F1-score: 0.7172431125086755
AUC score: 0.49519400417804055

DT accuracy over 10 rounds: 0.5904592994317203
Model evaluation metrics for DT:
Precision: 0.7108730619736282
Recall: 0.6636996778187921
F1-score: 0.6848672572375202
AUC score: 0.5519759179866967

LogReg accuracy over 10 rounds: 0.6714864379279581
Model evaluation metrics for LogReg:
Precision: 0.6858721484023816
Recall: 0.9454360306009228
F1-score: 0.7945705682422376
AUC score: 0.52657693221063

B

notes:
KNN
best results when using HR freq domain metrics instead of time domain
mean SCL > SCR rate
SVM > KNN when using both mean SCL and SCR rate
7 > 5 = 9 neighbors
  
Performance is better when using mean rather than median to divide phases into high and low anxiety
