### Can we classify each phase as relatively low or high anxiety for each subject? ###

In [12]:
# IMPORTING MODULES
import glob
import importlib
import matplotlib.pyplot as plt
import numpy as np
import os
cvx_path = os.path.abspath(os.path.join('..', '..', 'cvxEDA', 'src'))
module_path = os.path.abspath(os.path.join('..', '..', 'src'))
import pandas as pd
import random
import scipy.signal as ss
import sys
sys.path.append(module_path)

import tools.data_reader_apd as dr
import tools.display_tools as dt
import tools.preprocessing as preprocessing
import train

from scipy.fft import fft, fftfreq, fftshift
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split, cross_validate, RepeatedKFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import normalize
from xgboost import XGBClassifier

import cvxopt.solvers
cvxopt.solvers.options['show_progress'] = False

import warnings
warnings.filterwarnings(
    "ignore", 
    category=RuntimeWarning
)

In [13]:
importlib.reload(train)
importlib.reload(dr)

temp_a, _ = train.Train_APD.get_apd_data_ranking([train.Metrics.BPM], phases=dr.Phases.PHASES_LIST)
idx = temp_a[temp_a["bpm"] > 200].index 
invalid_apd_subjects = set(temp_a["subject"].iloc[idx].tolist())
idx = temp_a[temp_a["bpm"] < 35].index 
invalid_apd_subjects.update(set(temp_a["subject"].iloc[idx].tolist()))

In [15]:
# LOAD TRAIN AND TEST DATA
importlib.reload(train)
importlib.reload(dr)
importlib.reload(dt)


metrics = [
    train.Metrics.BPM, 
    train.Metrics.RMSSD, 
    train.Metrics.HF_RR, 
    train.Metrics.LF_RR, 
    train.Metrics.IBI, 
    train.Metrics.SDNN, 
    train.Metrics.MEAN_SCL, 
    train.Metrics.SCR_RATE, 
]

model_phases = [
    [
        "Baseline_Rest", 
        "BugBox_Relax", "BugBox_Anticipate", "BugBox_Exposure", "BugBox_Break",
        "Speech_Relax", "Speech_Anticipate", "Speech_Exposure", "Speech_Break"
    ],
    [
        "Baseline_Rest", 
        "BugBox_Relax", "BugBox_Anticipate", "BugBox_Break",
        "Speech_Relax", "Speech_Anticipate", "Speech_Break"
    ],
    [
        "Baseline_Rest", 
        "BugBox_Relax", "BugBox_Anticipate", 
        "Speech_Relax", "Speech_Anticipate"
    ],
    [
        "Baseline_Rest", 
        "BugBox_Relax", "BugBox_Anticipate", "BugBox_Break",
        "Speech_Relax", "Speech_Anticipate", "Speech_Break"
    ],
    # ["BugBox_Break", "Speech_Break"],
    ["BugBox_Exposure", "Speech_Exposure"]
]

models = {
    "SVM": SVC(C=10, gamma=1),  # C=10, gamma=1
    # "KNN": KNeighborsClassifier(n_neighbors=7),
    # "DT": DecisionTreeClassifier(),
    "LogReg": LogisticRegression(max_iter=1000),
    # "Bayes": GaussianNB(),
    "XGB": XGBClassifier(use_label_encoder=False, objective="binary:logistic", eval_metric="logloss")
}

threshold = "fixed"

for phases in model_phases:
    print(f"PHASES: {phases} " + "-"*30)
    # anxiety_label_type = "Anxiety"
    anxiety_label_type = None
    x, y = train.Train_APD.get_apd_data_ranking(metrics, phases, verbose=False, anxiety_label_type=anxiety_label_type, threshold=threshold)
    x = x.drop(["phaseId"], axis=1)
    # drop subjects with noisy data
    x = x[~x["subject"].isin(invalid_apd_subjects)]
    y = y[~y["subject"].isin(invalid_apd_subjects)]
    # x = x[x['subject'] != 8.0]
    # y = y[y['subject'] != 8.0]

    if anxiety_label_type is not None:
        x.drop(labels=["anxietyGroup"], axis=1)
        
    print(y.loc[:, "label"].value_counts())

    # 0-1 scaling
    for i in range(3, len(x.columns)):
        data_col = x[x.columns[i]]
        data_col = (data_col - data_col.min())/(data_col.max() - data_col.min())
        x[x.columns[i]] = data_col

    acc_results = {
        "SVM": [],
        "LogReg": [],
        "XGB": []
    }
    reports = {
        "SVM": [],
        "LogReg": [],
        "XGB": []
    }
    num_iters = 10
    for _ in range(num_iters):
        try:
            out = train.train_predict(models, x, y, by_subject=True, save_metrics=True, test_size=0.15, is_resample=True)
            for model_name in acc_results:
                acc_results[model_name].append(out[model_name][0])
                reports[model_name].append(out[model_name][1])
        except Exception as e:
            print("Error in resampling train/test data")

    for model_name in acc_results.keys():
        acc = np.mean(acc_results[model_name])
        print(f"{model_name} accuracy over {num_iters} rounds: {acc}")
        if acc > 0.5:
            print(f"Model evaluation metrics for {model_name}:")
            p = np.mean([report["precision"] for report in reports[model_name]])
            r = np.mean([report["recall"] for report in reports[model_name]])
            f1 = np.mean([report["f1"] for report in reports[model_name]])
            auc = np.mean([report["auc"] for report in reports[model_name]])
            report = reports[model_name]
            print(f"Precision: {p}\nRecall: {r}\nF1-score: {f1}\nAUC score: {auc}")
        print("")
    print("\n")

PHASES: ['Baseline_Rest', 'BugBox_Relax', 'BugBox_Anticipate', 'BugBox_Exposure', 'BugBox_Break', 'Speech_Relax', 'Speech_Anticipate', 'Speech_Exposure', 'Speech_Break'] ------------------------------
1    253
0     71
Name: label, dtype: int64
Only one label in test data, rerunning train_test_split
Only one label in test data, rerunning train_test_split
Only one label in test data, rerunning train_test_split
Ratio of negative to positive labels (0.29767441860465116) is under 0.333, oversampling negative class.
Ratio of negative to positive labels (0.18421052631578946) is under 0.333, oversampling negative class.
y_train:
1    215
0     71
Name: label, dtype: int64
y_test:
1    38
0    12
Name: label, dtype: int64
Model SVM, Predictions: [1], [50]
Model LogReg, Predictions: [0 1], [ 8 42]
Model XGB, Predictions: [0 1], [14 36]
Ratio of negative to positive labels (0.18723404255319148) is under 0.333, oversampling negative class.
y_train:
1    235
0     78
Name: label, dtype: int64
y_te

In [11]:
# LOAD TRAIN AND TEST DATA -- train on some phases and test on others
importlib.reload(train)
importlib.reload(dr)
importlib.reload(dt)

# metrics = train.Metrics.ALL
# metrics = train.Metrics.ECG \
    # + train.Metrics.EDA \
    # + train.Metrics.ANKLE + train.Metrics.WRIST

metrics = [
    train.Metrics.BPM, 
    train.Metrics.RMSSD, 
    # train.Metrics.HF_RR, 
    # train.Metrics.LF_RR, 
    train.Metrics.IBI, 
    train.Metrics.SDNN, 
    train.Metrics.MEAN_SCL, 
    train.Metrics.SCR_RATE, 
]

phases_a = [
    "Baseline_Rest", 
    "BugBox_Relax", 
    "BugBox_Anticipate", 
    # "BugBox_Exposure", 
    # "BugBox_Break",
    "Speech_Relax", 
    "Speech_Anticipate", 
    # "Speech_Exposure", 
    # "Speech_Break"
]

phases_b = [
    # "Baseline_Rest", 
    # "BugBox_Relax", 
    # "BugBox_Anticipate", 
    "BugBox_Exposure", 
    # "BugBox_Break",
    # "Speech_Relax", 
    # "Speech_Anticipate", 
    "Speech_Exposure", 
    # "Speech_Break"
]

models = {
    "SVM": SVC(C=10, gamma=1),  # C=10, gamma=1
    # "KNN": KNeighborsClassifier(n_neighbors=7),
    # "DT": DecisionTreeClassifier(),
    "LogReg": LogisticRegression(max_iter=1000),
    # "Bayes": GaussianNB(),
    "XGB": XGBClassifier(use_label_encoder=False, objective="binary:logistic", eval_metric="logloss")
}

anxiety_label_type = "Anxiety"
# anxiety_label_type = None
test_size = 1.0

x_a, y_a = train.Train_APD.get_apd_data_ranking(metrics, phases_a, verbose=False, anxiety_label_type=anxiety_label_type)
x_b, y_b = train.Train_APD.get_apd_data_ranking(metrics, phases_b, verbose=False, anxiety_label_type=anxiety_label_type)
x_a = x_a.drop(["phaseId"], axis=1)
x_b = x_b.drop(["phaseId"], axis=1)

# drop subjects with noisy data

x_a = x_a[~x_a["subject"].isin(invalid_apd_subjects)]
y_a = y_a[~y_a["subject"].isin(invalid_apd_subjects)]
x_b = x_b[~x_b["subject"].isin(invalid_apd_subjects)]
y_b = y_b[~y_b["subject"].isin(invalid_apd_subjects)]

print(f"y_a:\n{y_a.loc[:, 'label'].value_counts()}")
print(f"y_b:\n{y_b.loc[:, 'label'].value_counts()}")

# x = x[x['subject'] != 8.0]
# y = y[y['subject'] != 8.0]

if anxiety_label_type is not None:
    x_a.drop(labels=["anxietyGroup"], axis=1)
    x_b.drop(labels=["anxietyGroup"], axis=1)

# 0-1 scaling
for i in range(3, len(x_a.columns)):
    data_col = x_a[x_a.columns[i]]
    data_col = (data_col - data_col.min())/(data_col.max() - data_col.min())
    x_a[x_a.columns[i]] = data_col
for i in range(3, len(x_b.columns)):
    data_col = x_b[x_b.columns[i]]
    data_col = (data_col - data_col.min())/(data_col.max() - data_col.min())
    x_b[x_b.columns[i]] = data_col
    
# make sure subjects from different datasets aren't labeled with the same index
x_b["subject"] = x_b["subject"] + 500

acc_results = {
    "SVM": [],
    "LogReg": [],
    "XGB": []
}
reports = {
    "SVM": [],
    "LogReg": [],
    "XGB": []
}
num_iters = 10
for _ in range(num_iters):
    out = train.Train_Multi_Dataset.train_across_datasets(models, x_a, y_a, x_b, y_b, by_subject=False, save_metrics=True, test_size=test_size)
    for model_name in acc_results:
        acc_results[model_name].append(out[model_name][0])
        reports[model_name].append(out[model_name][1])

for model_name in acc_results.keys():
    acc = np.mean(acc_results[model_name])
    print(f"{model_name} accuracy over {num_iters} rounds: {acc}")
    if acc > 0.4:
        print(f"Model evaluation metrics for {model_name}:")
        p = np.mean([report["precision"] for report in reports[model_name]])
        r = np.mean([report["recall"] for report in reports[model_name]])
        f1 = np.mean([report["f1"] for report in reports[model_name]])
        auc = np.mean([report["auc"] for report in reports[model_name]])
        report = reports[model_name]
        print(f"Precision: {p}\nRecall: {r}\nF1-score: {f1}\nAUC score: {auc}")
    print("")
print("\n")

y_a:
0    94
1    86
Name: label, dtype: int64
y_b:
0    57
1    15
Name: label, dtype: int64
y_train:
0    114
1     87
Name: label, dtype: int64
y_test:
0    37
1    14
Name: label, dtype: int64
Model SVM, Predictions: [0], [51]
Model LogReg, Predictions: [0], [51]
Model XGB, Predictions: [0 1], [49  2]
y_train:
0    114
1     87
Name: label, dtype: int64
y_test:
0    37
1    14
Name: label, dtype: int64
Model SVM, Predictions: [0], [51]
Model LogReg, Predictions: [0], [51]
Model XGB, Predictions: [0 1], [49  2]
y_train:
0    114
1     87
Name: label, dtype: int64
y_test:
0    37
1    14
Name: label, dtype: int64
Model SVM, Predictions: [0], [51]
Model LogReg, Predictions: [0], [51]
Model XGB, Predictions: [0 1], [49  2]
y_train:
0    114
1     87
Name: label, dtype: int64
y_test:
0    37
1    14
Name: label, dtype: int64
Model SVM, Predictions: [0], [51]
Model LogReg, Predictions: [0], [51]
Model XGB, Predictions: [0 1], [49  2]
y_train:
0    114
1     87
Name: label, dtype: int64
y

In [11]:
# K-FOLD CROSS-VALIDATION
from sklearn.model_selection import KFold


metrics = [
    train.Metrics.BPM, 
    train.Metrics.RMSSD, 
    train.Metrics.HF_RR, 
    train.Metrics.LF_RR, 
    train.Metrics.IBI, 
    train.Metrics.SDNN, 
    train.Metrics.MEAN_SCL, 
    train.Metrics.SCR_RATE, 
]

model_phases = [
    [
        "Baseline_Rest", 
        "BugBox_Relax", "BugBox_Anticipate", "BugBox_Exposure", "BugBox_Break",
        "Speech_Relax", "Speech_Anticipate", "Speech_Exposure", "Speech_Break"
    ],
]

models = {
    "SVM": SVC(C=10, gamma=1),  # C=10, gamma=1
    # "KNN": KNeighborsClassifier(n_neighbors=7),
    # "DT": DecisionTreeClassifier(),
    "LogReg": LogisticRegression(max_iter=1000),
    # "Bayes": GaussianNB(),
    "XGB": XGBClassifier(use_label_encoder=False, objective="binary:logistic", eval_metric="logloss")
}

for phases in model_phases:
    print(f"PHASES: {phases} " + "-"*30)
    # anxiety_label_type = "Anxiety"
    anxiety_label_type = None
    x, y = train.Train_APD.get_apd_data_ranking(metrics, phases, verbose=False, anxiety_label_type=anxiety_label_type)
    x = x.drop(["phaseId"], axis=1)
    # drop subjects with noisy data
    x = x[~x["subject"].isin(invalid_apd_subjects)]
    y = y[~y["subject"].isin(invalid_apd_subjects)]
    # x = x[x['subject'] != 8.0]
    # y = y[y['subject'] != 8.0]

    if anxiety_label_type is not None:
        x.drop(labels=["anxietyGroup"], axis=1)
        
    print(y.loc[:, "label"].value_counts())

    # 0-1 scaling
    for i in range(3, len(x.columns)):
        data_col = x[x.columns[i]]
        data_col = (data_col - data_col.min())/(data_col.max() - data_col.min())
        x[x.columns[i]] = data_col

    acc_results = {
        "SVM": [],
        "LogReg": [],
        "XGB": []
    }
    reports = {
        "SVM": [],
        "LogReg": [],
        "XGB": []
    }

    # num_iters = 10
    num_iters = 1
    cv = KFold(n_splits=5, shuffle=True, random_state=1)

    for _ in range(num_iters):
        for model_name in acc_results:
            model = models[model_name]
            # scores = cross_validate(model, x, y, scoring=["accuracy", "precision", "recall", "f1", "roc_auc"], cv=5, n_jobs=-1)
            scores = cross_validate(model, x, y, scoring=["accuracy"], cv=5, n_jobs=-1)
            print(scores)
            reports[model_name].append(scores)
    
    print("\n")

PHASES: ['Baseline_Rest', 'BugBox_Relax', 'BugBox_Anticipate', 'BugBox_Exposure', 'BugBox_Break', 'Speech_Relax', 'Speech_Anticipate', 'Speech_Exposure', 'Speech_Break'] ------------------------------
0    177
1    147
Name: label, dtype: int64
{'fit_time': array([0.00200129, 0.00200057, 0.00200033, 0.00300264, 0.00199938]), 'score_time': array([0., 0., 0., 0., 0.]), 'test_accuracy': array([nan, nan, nan, nan, nan])}
{'fit_time': array([0.00100255, 0.00199795, 0.00200081, 0.00100207, 0.00200129]), 'score_time': array([0., 0., 0., 0., 0.]), 'test_accuracy': array([nan, nan, nan, nan, nan])}
{'fit_time': array([0.00200248, 0.00200033, 0.00200009, 0.00199938, 0.00203395]), 'score_time': array([0., 0., 0., 0., 0.]), 'test_accuracy': array([nan, nan, nan, nan, nan])}


