In [1]:
# IMPORTING MODULES
# Imports and process-wide setup for the cross-dataset (APD <-> ASCERTAIN) experiments.
import glob
import importlib
import matplotlib.pyplot as plt
import numpy as np
import os
# Path to a cvxEDA source directory two levels above the working directory.
# NOTE(review): cvx_path is computed here but never appended to sys.path in this
# cell -- confirm whether something else relies on it or whether it is a leftover.
cvx_path = os.path.abspath(os.path.join('..', '..', 'cvxEDA', 'src'))
# Project source directory, resolved relative to the current working directory.
module_path = os.path.abspath(os.path.join('..', '..', 'src'))
import pandas as pd
import random
import scipy.signal as ss
import shap
import sys
# Must run before the `tools.*` / `train` imports below so they can be resolved.
sys.path.append(module_path)

# Project-local modules (found through module_path).
import tools.data_reader_apd as dr_a
import tools.data_reader_ascertain as dr_asc
import tools.display_tools as dt
import tools.preprocessing as preprocessing
import train

from scipy.fft import fft, fftfreq, fftshift
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split, cross_val_score, RepeatedKFold
from sklearn.preprocessing import normalize
from xgboost import XGBClassifier

# Silence the cvxopt solver's per-iteration progress output.
import cvxopt.solvers
cvxopt.solvers.options['show_progress'] = False

# Suppress RuntimeWarning and FutureWarning globally for the rest of the session.
# NOTE(review): this also hides warnings such as divide-by-zero in the scaling
# steps further down -- consider narrowing the filters.
import warnings
warnings.filterwarnings(
    "ignore", 
    category=RuntimeWarning
)
warnings.simplefilter(action='ignore', category=FutureWarning)

  from .autonotebook import tqdm as notebook_tqdm


In [29]:
# Experiment configuration shared by the training cells below.

# Features requested from both dataset loaders.
metrics = [
    train.Metrics.BPM,
    train.Metrics.RMSSD,
    train.Metrics.HF_RR,
    train.Metrics.LF_RR,
    train.Metrics.IBI,
    train.Metrics.SDNN,
    train.Metrics.MEAN_SCL,
    train.Metrics.SCR_RATE,
]

# Alternative layout: a single group containing every phase (nested list).
# model_phases_apd = [
#     [
#         "Baseline_Rest",
#         "BugBox_Relax", "BugBox_Anticipate", "BugBox_Exposure", "BugBox_Break",
#         "Speech_Relax", "Speech_Anticipate", "Speech_Exposure", "Speech_Break"
#     ]
# ]

# Flat list of APD phase names; the training cells iterate over it entry by entry.
model_phases_apd = [
    "Baseline_Rest",
    "BugBox_Relax", "BugBox_Anticipate", "BugBox_Exposure", "BugBox_Break",
    "Speech_Relax", "Speech_Anticipate", "Speech_Exposure", "Speech_Break",
]

# anxiety_label_type = "Anxiety"
anxiety_label_type = None
asc_label_type = dr_asc.SelfReports.AROUSAL

# Classifiers evaluated in every experiment; comment entries in/out to toggle them.
models = {
    "SVM": SVC(C=10, gamma=1),
    # "KNN": KNeighborsClassifier(n_neighbors=7),
    # "DT": DecisionTreeClassifier(),
    "LogReg": LogisticRegression(max_iter=1000),
    # "Bayes": GaussianNB(),
    "XGB": XGBClassifier(use_label_encoder=False, objective="binary:logistic", eval_metric="logloss"),
}

threshold = "fixed"
asc_threshold = "fixed"
test_size = 1.0

# Heart rates outside this range are treated as sensor noise.
MIN_PLAUSIBLE_BPM = 35
MAX_PLAUSIBLE_BPM = 200

# Collect every APD subject with at least one implausible BPM value in any phase.
# The rows are selected with a boolean mask through .loc: the previous version fed
# index *labels* into .iloc (positional), which only gives the right rows when the
# frame happens to carry a default 0..n-1 index.
temp_a, _ = train.Train_APD.get_apd_data_ranking([train.Metrics.BPM], phases=dr_a.Phases.PHASES_LIST)
implausible_bpm = (temp_a["bpm"] > MAX_PLAUSIBLE_BPM) | (temp_a["bpm"] < MIN_PLAUSIBLE_BPM)
invalid_apd_subjects = set(temp_a.loc[implausible_bpm, "subject"].tolist())


In [None]:
# TRAIN ON APD AND TEST ON ASCERTAIN
# Pick up on-disk edits to the project modules without restarting the kernel.
importlib.reload(train)
importlib.reload(dr_a)
importlib.reload(dr_asc)
importlib.reload(dt)


for i, phases_apd in enumerate(model_phases_apd):
    print(f"APD PHASES {i} " + "-"*50)
    # model_phases_apd may hold single phase names or lists of phases. Every other
    # call in this notebook hands the loader a list, so wrap a bare name instead of
    # passing the string itself.
    if isinstance(phases_apd, str):
        phases_apd = [phases_apd]

    # x_a / y_a: APD (training side); x_b / y_b: ASCERTAIN (test side).
    x_a, y_a = train.Train_APD.get_apd_data_ranking(metrics, phases_apd, verbose=False, anxiety_label_type=anxiety_label_type, threshold=threshold)
    x_b, y_b = train.Train_ASCERTAIN.get_ascertain_data(metrics, verbose=False, label_type=asc_label_type, threshold=asc_threshold)

    # drop subjects with noisy data
    x_a = x_a[~x_a["subject"].isin(invalid_apd_subjects)]
    y_a = y_a[~y_a["subject"].isin(invalid_apd_subjects)]

    # Drop ASCERTAIN rows with any missing feature. A positional boolean mask is used
    # for both frames; the previous version turned row *positions* into labels for
    # drop(), which removes the wrong rows unless the index is exactly 0..n-1.
    # Assumes x_b and y_b are row-aligned (same length and order) -- TODO confirm.
    complete_rows = ~x_b.isnull().any(axis=1).to_numpy()
    x_b = x_b.loc[complete_rows]
    y_b = y_b.loc[complete_rows].copy()  # copy: the "subject" column is reassigned below

    if anxiety_label_type is not None:
        # anxietyGroup exists only on the APD side; ASCERTAIN has no such feature
        x_a = x_a.drop(["anxietyGroup"], axis=1)

    x_a = x_a.drop(["phaseId"], axis=1)
    x_b = x_b.drop(["phaseId"], axis=1)

    # make sure subjects from different datasets aren't labeled with the same index
    x_b["subject"] = x_b["subject"] + 500
    y_b["subject"] = y_b["subject"] + 500

    # 0-1 scaling, done independently per dataset and per column. The first three
    # columns are left untouched (presumably identifier columns -- TODO confirm).
    # NOTE(review): a constant column gives 0/0 = NaN here.
    for frame in (x_a, x_b):
        for col in frame.columns[3:]:
            col_min, col_max = frame[col].min(), frame[col].max()
            frame[col] = (frame[col] - col_min) / (col_max - col_min)

    # One accumulator per configured model, so toggling entries in `models` cannot
    # get out of sync with these dictionaries.
    acc_results = {model_name: [] for model_name in models}
    reports = {model_name: [] for model_name in models}

    num_iters = 10
    for _ in range(num_iters):
        out = train.Train_Multi_Dataset.train_across_datasets(models, x_a, y_a, x_b, y_b, by_subject=True, save_metrics=True, test_size=test_size, is_resample=True)
        for model_name in acc_results:
            # out[model_name] is indexed as (accuracy, report dict)
            acc_results[model_name].append(out[model_name][0])
            reports[model_name].append(out[model_name][1])

    for model_name in acc_results:
        acc = np.mean(acc_results[model_name])
        print(f"{model_name} accuracy over {num_iters} rounds: {acc}")
        # Detailed metrics are only printed for models with mean accuracy above 0.5.
        if acc > 0.5:
            print(f"Model evaluation metrics for {model_name}:")
            p = np.mean([report["precision"] for report in reports[model_name]])
            r = np.mean([report["recall"] for report in reports[model_name]])
            f1 = np.mean([report["f1"] for report in reports[model_name]])
            auc = np.mean([report["auc"] for report in reports[model_name]])
            print(f"\tPrecision: {p}\n\tRecall: {r}\n\tF1-score: {f1}\n\tAUC score: {auc}")
    print("\n")

In [90]:
# Pick up on-disk edits to the project modules without restarting the kernel.
importlib.reload(train)
importlib.reload(dr_a)

# get_ratings returns two rating tables (unpacked as ha / la); each one is shown
# with the subjects flagged as noisy removed.
ha, la = (
    ratings.loc[~ratings["subject"].isin(invalid_apd_subjects)]
    for ratings in train.Train_APD.get_ratings(phases=model_phases_apd, threshold=threshold)
)
print(ha)
print(la)

# x, y = train.Train_APD.get_apd_data_ranking(metrics, model_phases_apd, verbose=False, anxiety_label_type=anxiety_label_type, threshold=threshold)
# x = x[~x["subject"].isin(invalid_apd_subjects)]
# y = y[~y["subject"].isin(invalid_apd_subjects)]
# for i in range(y.shape[0] // 52):
#     print(y.iloc[i*52:i*52+52, :])

   subject   0   1   2   3   4   5   6   7   8  mean
1        6  20   0  10  20   0   0  40  30   0    50
2        7  20   0  30  40  10  10  50  70  10    50
4       10  10   0   0  10   0   0  20  10   0    50
5       12  70  10  60  80  10  10  20  40  70    50
8       18  40  10  20  40  10  10  10  20  10    50
9       22  30  10  40  60  20  10  60  40  20    50
10      26   0   0  10  30   0   0  20  10   0    50
11      27  40   0  70  30  20   0  50  50  30    50
13      31  40  80  10  10  40   0  10  10   0    50
14      32  40  20  30  40  10   0  20  30   0    50
15      33  20  30  80  80  20  20  60  50  50    50
16      35  10   0   0  10   0   0  20  20   0    50
18      45   0   0  30  60  50   0  10   0   0    50
19      47  20  20  40  30  10  10  30  30  10    50
21      49  20  10  30  40  20  10  40  30  20    50
22      54  10  10  50  80  60  30  50  60  30    50
23      55  20   0  40  50   0   0  40  30   0    50
24      66  30  20  30  60  30  10  40  60  10

In [100]:
# TRAIN ON ASCERTAIN AND TEST ON APD
# Pick up on-disk edits to the project modules without restarting the kernel.
importlib.reload(train)
importlib.reload(dr_a)
importlib.reload(dr_asc)
importlib.reload(dt)


for phase_name in model_phases_apd:
    print(f"APD PHASES {phase_name} " + "-"*50)
    # Evaluate one APD phase at a time; the loader is given a one-element list.
    phases_apd = [phase_name]

    # x_a / y_a: ASCERTAIN (training side); x_b / y_b: APD (test side).
    x_a, y_a = train.Train_ASCERTAIN.get_ascertain_data(metrics, verbose=False, label_type=asc_label_type, threshold=asc_threshold)
    x_b, y_b = train.Train_APD.get_apd_data_ranking(metrics, phases_apd, verbose=False, anxiety_label_type=anxiety_label_type, threshold=threshold)

    # drop subjects with noisy data
    x_b = x_b[~x_b["subject"].isin(invalid_apd_subjects)]
    # copy: the "subject" column of y_b is reassigned below, and assigning into a
    # filtered slice is a chained-assignment hazard
    y_b = y_b[~y_b["subject"].isin(invalid_apd_subjects)].copy()

    # Drop ASCERTAIN rows with any missing feature. A positional boolean mask is used
    # for both frames; the previous version turned row *positions* into labels for
    # drop(), which removes the wrong rows unless the index is exactly 0..n-1.
    # Assumes x_a and y_a are row-aligned (same length and order) -- TODO confirm.
    complete_rows = ~x_a.isnull().any(axis=1).to_numpy()
    x_a = x_a.loc[complete_rows]
    y_a = y_a.loc[complete_rows]

    if anxiety_label_type is not None:
        # anxietyGroup is an APD-only column, and in this cell the APD frame is x_b.
        # The previous version dropped it from x_a (ASCERTAIN), a copy-paste slip
        # from the APD-as-training cell.
        x_b = x_b.drop(["anxietyGroup"], axis=1)

    x_a = x_a.drop(["phaseId"], axis=1)
    x_b = x_b.drop(["phaseId"], axis=1)

    # make sure subjects from different datasets aren't labeled with the same index
    x_b["subject"] = x_b["subject"] + 500
    y_b["subject"] = y_b["subject"] + 500

    # 0-1 scaling, done independently per dataset and per column. The first three
    # columns are left untouched (presumably identifier columns -- TODO confirm).
    # NOTE(review): a constant column gives 0/0 = NaN here.
    for frame in (x_a, x_b):
        for col in frame.columns[3:]:
            col_min, col_max = frame[col].min(), frame[col].max()
            frame[col] = (frame[col] - col_min) / (col_max - col_min)

    # One accumulator per configured model, so toggling entries in `models` cannot
    # get out of sync with these dictionaries.
    acc_results = {model_name: [] for model_name in models}
    reports = {model_name: [] for model_name in models}

    num_iters = 1
    for _ in range(num_iters):
        out = train.Train_Multi_Dataset.train_across_datasets(models, x_a, y_a, x_b, y_b, by_subject=False, save_metrics=True, test_size=test_size, is_resample=False)
        for model_name in acc_results:
            # out[model_name] is indexed as (accuracy, report dict)
            acc_results[model_name].append(out[model_name][0])
            reports[model_name].append(out[model_name][1])

    for model_name in acc_results:
        acc = np.mean(acc_results[model_name])
        print(f"{model_name} accuracy over {num_iters} rounds: {acc}")
        # Detailed metrics are only printed for models with mean accuracy above 0.5.
        if acc > 0.5:
            p = np.mean([report["precision"] for report in reports[model_name]])
            r = np.mean([report["recall"] for report in reports[model_name]])
            f1 = np.mean([report["f1"] for report in reports[model_name]])
            auc = np.mean([report["auc"] for report in reports[model_name]])
            print(f"\tReport:\n\tPrecision: {p}\n\tRecall: {r}\n\tF1-score: {f1}\n\tAUC score: {auc}")
    print("\n")

APD PHASES Baseline_Rest --------------------------------------------------
(52, 2)
16
(36, 2)
(24,)
y_train:
1    1319
0     240
Name: label, dtype: int64
y_test:
0    21
1     3
Name: label, dtype: int64
Model SVM, Predictions: [1], [24]
Model LogReg, Predictions: [0], [24]
Model XGB, Predictions: [0 1], [22  2]
SVM accuracy over 1 rounds: 0.125
LogReg accuracy over 1 rounds: 0.875
	Report:
	Precision: 1.0
	Recall: 0.0
	F1-score: 0.0
	AUC score: 0.5
XGB accuracy over 1 rounds: 0.7916666666666666
	Report:
	Precision: 0.0
	Recall: 0.0
	F1-score: 0.0
	AUC score: 0.4523809523809524


APD PHASES BugBox_Relax --------------------------------------------------
(52, 2)
16
(36, 2)
(24,)
y_train:
1    1319
0     240
Name: label, dtype: int64
y_test:
0    23
1     1
Name: label, dtype: int64


KeyboardInterrupt: 