### Can we classify each phase as relatively low or high anxiety for each subject? ###

In [1]:
# IMPORTING MODULES
# Setup cell: third-party imports, sys.path wiring for the project sources,
# project-local imports, and global solver/warning configuration.
import glob
import importlib
import matplotlib.pyplot as plt
import numpy as np
import os
# Absolute path to the cvxEDA sources, resolved relative to the current working directory.
# NOTE(review): cvx_path is computed here but never appended to sys.path in this cell — confirm it is still needed.
cvx_path = os.path.abspath(os.path.join('..', '..', 'cvxEDA', 'src'))
import pandas as pd
import random
import scipy.signal as ss
import sys
# Make the project's src directory (two levels up from the working directory) importable.
module_path = os.path.abspath(os.path.join('..', '..', 'src'))
sys.path.append(module_path)

# Project-local modules; presumably resolved through module_path appended above — verify.
import tools.data_reader_popane as dr
import tools.display_tools as dt
import tools.preprocessing as preprocessing
import train

from scipy.fft import fft, fftfreq, fftshift
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split, cross_val_score, RepeatedKFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import normalize
from xgboost import XGBClassifier

# Turn off cvxopt's solver progress output.
import cvxopt.solvers
cvxopt.solvers.options['show_progress'] = False

# Suppress every RuntimeWarning raised from here on.
# NOTE(review): this is a blanket filter, so it may also hide numerical problems
# (e.g. invalid-value warnings) in later cells — consider narrowing it.
import warnings
warnings.filterwarnings(
    "ignore", 
    category=RuntimeWarning
)


  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Refresh the project modules so code edited on disk takes effect without a kernel restart.
for _module in (train, dr, dt, preprocessing):
    importlib.reload(_module)

# Physiological features used throughout the notebook; this list is handed to
# train.Train_POPANE.get_popane_data and reused to pick the columns that get rescaled.
_M = train.Metrics
metrics = [
    _M.BPM,
    _M.RMSSD,
    _M.HF_RR,
    _M.LF_RR,
    _M.IBI,
    _M.SDNN,
    _M.MEAN_SCL,
    _M.SCR_RATE,
]

# Candidate classifiers keyed by the short name used when reporting results.
# NOTE(review): none of the estimators is given a random_state, so the
# stochastic ones will not reproduce exactly between runs.
models = dict(
    SVM=SVC(C=10, gamma=1),
    KNN=KNeighborsClassifier(n_neighbors=7),
    DT=DecisionTreeClassifier(),
    LogReg=LogisticRegression(max_iter=1000),
    Bayes=GaussianNB(),
    XGB=XGBClassifier(),
)

In [141]:
# STUDY 1
# Train every classifier in `models` on the Study 1 features and report mean
# accuracy (plus precision/recall/F1/AUC for the better-scoring models).

# Reload the project modules so that on-disk edits are picked up without restarting the kernel.
importlib.reload(train)
importlib.reload(dr)
importlib.reload(dt)
importlib.reload(preprocessing)

study = "Study1"
phases = dr.Study1.ALL
label_type = "affect"

x, y = train.Train_POPANE.get_popane_data(study, metrics, phases, verbose=False, label_type=label_type)

# Remove every sample with at least one missing feature, together with its label row.
# A boolean mask is used instead of passing positional indices to drop() as labels,
# so the filter stays correct whatever the frame's index looks like. `axis` is given
# by keyword because the positional form of DataFrame.any is deprecated.
# Assumes x and y are row-aligned and of equal length — TODO confirm.
has_missing = x.isnull().any(axis=1).to_numpy()
x = x.loc[~has_missing].drop(columns=["phaseId"]).copy()
y = y.loc[~has_missing].copy()

print(y["label"].value_counts())
print("")

# 0-1 scaling of the metric columns (the first two columns are skipped, as before).
for col in x.columns[2:]:
    if col not in metrics:
        continue
    col_min = x[col].min()
    col_range = x[col].max() - col_min
    # A constant column has zero range; map it to 0.0 instead of producing 0/0 = NaN.
    x[col] = (x[col] - col_min) / col_range if col_range != 0 else 0.0

# One list of per-round results per model, keyed exactly like `models`.
acc_results = {model_name: [] for model_name in models}
reports = {model_name: [] for model_name in models}

num_iters = 10
for _ in range(num_iters):
    out = train.train_predict(models, x, y, by_subject=False, save_metrics=True, get_shap_values=False)
    for model_name in acc_results:
        # out[model_name] is indexed as (accuracy, metrics report).
        acc_results[model_name].append(out[model_name][0])
        reports[model_name].append(out[model_name][1])

# Detailed metrics are printed only for models whose mean accuracy exceeds this value.
# NOTE(review): with the 276/91 class split recorded in this cell's output, always
# predicting the majority class already scores about 0.75 accuracy, so clearing this
# threshold does not by itself indicate a useful model — check recall/AUC.
acc_threshold = 0.65

for model_name in acc_results.keys():
    acc = np.mean(acc_results[model_name])
    print(f"{model_name} accuracy over {num_iters} rounds: {acc}")
    if acc > acc_threshold:
        print(f"Model evaluation metrics for {model_name}:")
        p = np.mean([report["precision"] for report in reports[model_name]])
        r = np.mean([report["recall"] for report in reports[model_name]])
        f1 = np.mean([report["f1"] for report in reports[model_name]])
        auc = np.mean([report["auc"] for report in reports[model_name]])
        print(f"Precision: {p}\nRecall: {r}\nF1-score: {f1}\nAUC score: {auc}")
    print("")

In a future version of pandas all arguments of DataFrame.any and Series.any will be keyword-only.


0    276
1     91
Name: label, dtype: int64

SVM accuracy over 10 rounds: 0.5624947589098532

KNN accuracy over 10 rounds: 0.7590381805476145
Model evaluation metrics for KNN:
Precision: 0.0
Recall: 0.0
F1-score: 0.0
AUC score: 0.4988095238095238

DT accuracy over 10 rounds: 0.6213938123372086

LogReg accuracy over 10 rounds: 0.7590045105139446
Model evaluation metrics for LogReg:
Precision: 0.1
Recall: 0.007142857142857143
F1-score: 0.013333333333333332
AUC score: 0.5011614401858304

Bayes accuracy over 10 rounds: 0.5390356394129978

XGB accuracy over 10 rounds: 0.6639216059970776
Model evaluation metrics for XGB:
Precision: 0.21483738483738485
Recall: 0.12053113553113552
F1-score: 0.1458678133460742
AUC score: 0.47795054029470546



In [None]:
# STUDY 5: TODO - analysis not yet implemented (placeholder cell)

In [None]:
# STUDY 1, 3, 4, 5
# Pool the samples of Studies 1, 3, 4 and 5, then train every classifier in
# `models` on the pooled features and report mean accuracy (plus
# precision/recall/F1/AUC for the better-scoring models).

# Reload the project modules so that on-disk edits are picked up without restarting the kernel.
importlib.reload(train)
importlib.reload(dr)
importlib.reload(dt)
importlib.reload(preprocessing)

label_type = "affect"


def _load_complete_rows(study, phases):
    """Load one POPANE study and keep only samples without missing features.

    Args:
        study: Study name passed to the loader, e.g. "Study1".
        phases: Phases to load for that study (e.g. dr.Study1.ALL).

    Returns:
        A (features, labels) pair with every row that has a null feature
        removed from both frames.
    """
    features, labels = train.Train_POPANE.get_popane_data(
        study, metrics, phases, verbose=False, label_type=label_type
    )
    # Boolean mask instead of positional indices passed to drop() as labels, so the
    # filter is correct for any index; `axis` is passed by keyword because the
    # positional form of DataFrame.any is deprecated.
    # Assumes features and labels are row-aligned and of equal length — TODO confirm.
    has_missing = features.isnull().any(axis=1).to_numpy()
    return features.loc[~has_missing], labels.loc[~has_missing]


study_phases = {
    "Study1": dr.Study1.ALL,
    "Study3": dr.Study3.ALL,
    "Study4": dr.Study4.ALL,
    "Study5": dr.Study5.ALL,
}
per_study = [_load_complete_rows(study, phases) for study, phases in study_phases.items()]

# NOTE(review): as before, concat keeps each study's own row labels, so index values
# can repeat across studies; x and y remain aligned by position.
x = pd.concat([features for features, _ in per_study]).drop(columns=["phaseId"])
y = pd.concat([labels for _, labels in per_study])

print(y["label"].value_counts())
print("")

# 0-1 scaling of the metric columns over the pooled data (the first two columns are skipped, as before).
for col in x.columns[2:]:
    if col not in metrics:
        continue
    col_min = x[col].min()
    col_range = x[col].max() - col_min
    # A constant column has zero range; map it to 0.0 instead of producing 0/0 = NaN.
    x[col] = (x[col] - col_min) / col_range if col_range != 0 else 0.0

# One list of per-round results per model, keyed exactly like `models`.
acc_results = {model_name: [] for model_name in models}
reports = {model_name: [] for model_name in models}

num_iters = 10
for _ in range(num_iters):
    out = train.train_predict(models, x, y, by_subject=False, save_metrics=True, get_shap_values=False)
    for model_name in acc_results:
        # out[model_name] is indexed as (accuracy, metrics report).
        acc_results[model_name].append(out[model_name][0])
        reports[model_name].append(out[model_name][1])

# Detailed metrics are printed only for models whose mean accuracy exceeds this value.
# NOTE(review): compare against the majority-class share printed above before
# reading anything into a model that clears this threshold.
acc_threshold = 0.65

for model_name in acc_results.keys():
    acc = np.mean(acc_results[model_name])
    print(f"{model_name} accuracy over {num_iters} rounds: {acc}")
    if acc > acc_threshold:
        print(f"Model evaluation metrics for {model_name}:")
        p = np.mean([report["precision"] for report in reports[model_name]])
        r = np.mean([report["recall"] for report in reports[model_name]])
        f1 = np.mean([report["f1"] for report in reports[model_name]])
        auc = np.mean([report["auc"] for report in reports[model_name]])
        print(f"Precision: {p}\nRecall: {r}\nF1-score: {f1}\nAUC score: {auc}")
    print("")