In [1]:
# IMPORTING MODULES
import glob
import importlib
import matplotlib.pyplot as plt
import numpy as np
import os
cvx_path = os.path.abspath(os.path.join('..', '..', 'cvxEDA', 'src'))
module_path = os.path.abspath(os.path.join('..', '..', 'src'))
import pandas as pd
import random
import scipy.signal as ss
import shap
import sys
sys.path.append(module_path)

import tools.data_reader_wesad as dr_w
import tools.data_reader_ascertain as dr_asc
import tools.display_tools as dt
import tools.preprocessing as preprocessing
import train

from scipy.fft import fft, fftfreq, fftshift
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split, cross_val_score, RepeatedKFold
from sklearn.preprocessing import normalize
from xgboost import XGBClassifier

import cvxopt.solvers
# Turn off cvxopt's solver progress printout so it does not flood cell output.
cvxopt.solvers.options['show_progress'] = False

import warnings
# Suppress RuntimeWarning and FutureWarning messages for the whole session.
# NOTE(review): this also hides warnings that may be informative (e.g. numerical
# issues while computing features) -- re-enable when debugging.
warnings.filterwarnings(
    "ignore", 
    category=RuntimeWarning
)
warnings.simplefilter(action='ignore', category=FutureWarning)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Experiment configuration shared by every training cell below.

# Single seed for all stochastic components so that Restart & Run All reproduces
# the same numbers. The global RNGs are seeded as well in case helper code in
# `train` draws from them (e.g. while resampling) -- TODO confirm which RNG it uses.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

# Features requested from the data readers for every sample.
metrics = [
    train.Metrics.BPM,
    train.Metrics.RMSSD,
    train.Metrics.HF_RR,
    train.Metrics.LF_RR,
    # train.Metrics.IBI,
    train.Metrics.SDNN,
    train.Metrics.MEAN_SCL,
    train.Metrics.SCR_RATE
]

# WESAD phases included when the whole dataset is used.
model_phases_wesad = dr_w.Phases.PHASE_ORDER

# Which self-report is used to derive the labels of each dataset.
wesad_label_type = "stai"
asc_label_type = dr_asc.SelfReports.AROUSAL

# Classifiers to evaluate, keyed by the name used in the result tables.
# Commented-out entries are alternatives that are currently disabled.
models = {
    # "SVM": SVC(C=10, gamma=1),  # C=10, gamma=1
    # "KNN": KNeighborsClassifier(n_neighbors=7),
    # "DT": DecisionTreeClassifier(),
    "LogReg": LogisticRegression(max_iter=1000),
    # "Bayes": GaussianNB(),
    "RF": RandomForestClassifier(n_estimators=50, max_features=3, random_state=SEED),
    "XGB": XGBClassifier(use_label_encoder=False, objective="binary:logistic", eval_metric="error", random_state=SEED)
}

# Label-thresholding strategy handed to the WESAD / ASCERTAIN loaders.
# NOTE(review): the meaning of "fixed" is defined inside `train` -- confirm there.
threshold = "fixed"
asc_threshold = "fixed"
# Forwarded as `test_size` to train_across_datasets.
# NOTE(review): presumably the fraction of the second dataset held out for testing -- confirm.
test_size = 1.0


In [3]:
# Reload the local modules so that edits made to them on disk are picked up
# without restarting the kernel.
importlib.reload(train)
importlib.reload(dr_w)

# get_labels returns three objects for the requested phases; only the STAI
# table is printed here, the two dimensional-score objects are kept for later use.
stai_scores, dim_scores_arousal, dim_scores_valence = train.Train_WESAD.get_labels(model_phases_wesad)
print(stai_scores)

    subject  Base_STAI  TSST_STAI  Medi_1_STAI   Fun_STAI  Medi_2_STAI
0         2  33.333333  66.666667    23.333333  26.666667    20.000000
1         3  30.000000  66.666667    40.000000  30.000000    36.666667
2         4  30.000000  56.666667    50.000000  26.666667    43.333333
3         5  33.333333  60.000000    26.666667  40.000000    26.666667
4         6  30.000000  50.000000    30.000000  33.333333    20.000000
5         7  26.666667  60.000000    30.000000  23.333333    36.666667
6         8  43.333333  60.000000    46.666667  43.333333    40.000000
7         9  30.000000  46.666667    23.333333  33.333333    23.333333
8        10  36.666667  66.666667    33.333333  26.666667    40.000000
9        11  30.000000  63.333333    26.666667  20.000000    26.666667
10       13  46.666667  70.000000    26.666667  26.666667    53.333333
11       14  43.333333  60.000000    43.333333  40.000000    43.333333
12       15  40.000000  63.333333    30.000000  33.333333    30.000000
13    

In [5]:
# TRAIN ON ASCERTAIN AND TEST ON WESAD -- ALL
# Fits every classifier in `models` on ASCERTAIN (dataset A) and evaluates it on
# all WESAD phases (dataset B), then prints feature importances and metrics.
importlib.reload(train)
importlib.reload(dr_w)
importlib.reload(dt)


x_a, y_a = train.Train_ASCERTAIN.get_ascertain_data(metrics, verbose=False, label_type=asc_label_type, threshold=asc_threshold, normalize=True)
x_b, y_b = train.Train_WESAD.get_wesad_data(metrics, model_phases_wesad, verbose=False, label_type=wesad_label_type, threshold=threshold, normalize=True)

# Drop every row that has at least one missing feature, together with the label
# row at the same position. A positional boolean mask is used instead of
# drop(labels=...) so the filter stays correct even if the frames do not carry
# a pristine 0..n-1 index.
keep_a = ~pd.isnull(x_a).any(axis=1).to_numpy()
x_a = x_a.loc[keep_a].reset_index(drop=True)
y_a = y_a.loc[keep_a].reset_index(drop=True)
keep_b = ~pd.isnull(x_b).any(axis=1).to_numpy()
x_b = x_b.loc[keep_b].reset_index(drop=True)
y_b = y_b.loc[keep_b].reset_index(drop=True)

# x = x[x['subject'] != 8.0]
# y = y[y['subject'] != 8.0]

x_a = x_a.drop(columns=["phaseId"])
x_b = x_b.drop(columns=["phaseId"])

# make sure subjects from different datasets aren't labeled with the same index
x_b["subject"] = x_b["subject"] + 500
y_b["subject"] = y_b["subject"] + 500

# One list of per-iteration results for every configured model.
acc_results = {model_name: [] for model_name in models}
reports = {model_name: [] for model_name in models}

num_iters = 1
get_importance = True
# Names used to label the importance vector returned for each model.
# NOTE(review): assumes the vector is ordered as `metrics` followed by
# lf_hf_ratio -- confirm against train_across_datasets.
feature_names = metrics + ["lf_hf_ratio"]

for _ in range(num_iters):
    out = train.Train_Multi_Dataset.train_across_datasets(
        models, x_a, y_a, x_b, y_b,
        by_subject=True, save_metrics=True, test_size=test_size,
        is_resample=False, get_importance=get_importance, drop_subject=True,
    )
    for model_name in acc_results:
        # out[model_name] is indexed as (accuracy, report, importances).
        acc_results[model_name].append(out[model_name][0])
        reports[model_name].append(out[model_name][1])
        if get_importance:
            try:
                print("")
                feature_imp = sorted(zip(feature_names, out[model_name][2]), key=lambda pair: pair[1], reverse=True)
                print(feature_imp)
            except Exception as e:
                # Best effort: a model without importances must not abort the run.
                print(e)
            print("")

for model_name in acc_results.keys():
    print(f"Model evaluation metrics for {model_name}:")
    for acc, report in zip(acc_results[model_name], reports[model_name]):
        p = report["precision"]
        r = report["recall"]
        f1 = report["f1"]
        auc = report["auc"]
        print(f"\tAccuracy: {acc}\n\tPrecision: {p}\n\tRecall: {r}\n\tF1-score: {f1}\n\tAUC score: {auc}\n" + "-"*40)
    print("")
print("\n")

y_train:
1    3001
0    1540
Name: label, dtype: int64
y_test:
0    779
1    180
Name: label, dtype: int64
Model LogReg, Predictions: [0 1], [ 28 931]
Model RF, Predictions: [0 1], [302 657]
Model XGB, Predictions: [0 1], [473 486]

[('SCR_rate', 0.32136064076511145), ('mean_SCL', 0.16795083666647767), ('hf_rr', -0.08106088974854966), ('lf_hf_ratio', -0.11289343501967576), ('bpm', -0.299761021414726), ('lf_rr', -0.33511538607025276), ('rmssd', -0.965536298357176), ('sdnn', -1.2139235835099742)]


[('mean_SCL', 0.17167179742123442), ('bpm', 0.15341278791739107), ('rmssd', 0.14417388220157903), ('sdnn', 0.14326458679486634), ('lf_hf_ratio', 0.1326715454089462), ('hf_rr', 0.11328326121676172), ('lf_rr', 0.10622093730259703), ('SCR_rate', 0.03530120173662428)]


[('mean_SCL', 0.15020557), ('lf_rr', 0.12806016), ('bpm', 0.12679671), ('rmssd', 0.12543643), ('sdnn', 0.12512189), ('lf_hf_ratio', 0.12066133), ('hf_rr', 0.11801185), ('SCR_rate', 0.105706)]

Model evaluation metrics for LogReg:
	

In [None]:
# TRAIN ON ASCERTAIN AND TEST ON WESAD
# Same experiment as the all-phases run, but WESAD is restricted to one phase
# at a time (currently only TSST).
importlib.reload(train)
importlib.reload(dr_w)
importlib.reload(dt)


# for phase in model_phases_wesad:
for phase in [dr_w.Phases.TSST]:
    print(f"WESAD PHASES {phase} " + "-"*50)
    # The WESAD loader expects a list of phases.
    phases_wesad = [phase]
    x_a, y_a = train.Train_ASCERTAIN.get_ascertain_data(metrics, verbose=False, label_type=asc_label_type, threshold=asc_threshold, normalize=True)
    x_b, y_b = train.Train_WESAD.get_wesad_data(metrics, phases_wesad, verbose=False, label_type=wesad_label_type, threshold=threshold, normalize=True)

    # Drop every row that has at least one missing feature, together with the
    # label row at the same position. Both datasets are filtered so that no NaN
    # reaches the classifiers; a positional boolean mask is used instead of
    # drop(labels=...) so the filter does not depend on a pristine 0..n-1 index.
    keep_a = ~pd.isnull(x_a).any(axis=1).to_numpy()
    x_a = x_a.loc[keep_a].reset_index(drop=True)
    y_a = y_a.loc[keep_a].reset_index(drop=True)
    keep_b = ~pd.isnull(x_b).any(axis=1).to_numpy()
    x_b = x_b.loc[keep_b].reset_index(drop=True)
    y_b = y_b.loc[keep_b].reset_index(drop=True)

    # x = x[x['subject'] != 8.0]
    # y = y[y['subject'] != 8.0]

    x_a = x_a.drop(columns=["phaseId"])
    x_b = x_b.drop(columns=["phaseId"])

    # make sure subjects from different datasets aren't labeled with the same index
    x_b["subject"] = x_b["subject"] + 500
    y_b["subject"] = y_b["subject"] + 500

    # One list of per-iteration results for every configured model.
    acc_results = {model_name: [] for model_name in models}
    reports = {model_name: [] for model_name in models}

    num_iters = 1
    get_importance = True
    # Names used to label the importance vector returned for each model.
    # NOTE(review): assumes the vector is ordered as `metrics` followed by
    # lf_hf_ratio -- confirm against train_across_datasets.
    feature_names = metrics + ["lf_hf_ratio"]

    for _ in range(num_iters):
        out = train.Train_Multi_Dataset.train_across_datasets(
            models, x_a, y_a, x_b, y_b,
            by_subject=True, save_metrics=True, test_size=test_size,
            is_resample=False, get_importance=get_importance, drop_subject=True,
        )
        for model_name in acc_results:
            # out[model_name] is indexed as (accuracy, report, importances).
            acc_results[model_name].append(out[model_name][0])
            reports[model_name].append(out[model_name][1])
            if get_importance:
                try:
                    print("")
                    feature_imp = sorted(zip(feature_names, out[model_name][2]), key=lambda pair: pair[1], reverse=True)
                    print(feature_imp)
                except Exception as e:
                    # Best effort: a model without importances must not abort the run.
                    print(e)
                print("")

    for model_name in acc_results.keys():
        print(f"Model evaluation metrics for {model_name}:")
        for acc, report in zip(acc_results[model_name], reports[model_name]):
            p = report["precision"]
            r = report["recall"]
            f1 = report["f1"]
            auc = report["auc"]
            print(f"\tAccuracy: {acc}\n\tPrecision: {p}\n\tRecall: {r}\n\tF1-score: {f1}\n\tAUC score: {auc}\n" + "-"*40)
        print("")
    print("\n")

WESAD PHASES TSST --------------------------------------------------
y_train:
1    1522
0     262
Name: label, dtype: int64
y_test:
1    13
0     1
Name: label, dtype: int64
Model LogReg, Predictions: [0 1], [ 3 11]
Model RF, Predictions: [0 1], [ 4 10]
Model XGB, Predictions: [0 1], [5 9]

[('mean_SCL', 0.36503474097110644), ('SCR_rate', 0.34790615148435833), ('lf_rr', 0.32803677158340006), ('hf_rr', -0.0401397663341208), ('bpm', -0.1510485020310252), ('lf_hf_ratio', -0.7198460377967504), ('sdnn', -1.1643226626094145), ('rmssd', -1.4541963127225255)]


[('mean_SCL', 0.14821258468568552), ('sdnn', 0.14411646061281644), ('rmssd', 0.138591338237736), ('bpm', 0.13451961319453737), ('lf_hf_ratio', 0.12887434072763482), ('hf_rr', 0.10402247471787945), ('lf_rr', 0.10357668403624361), ('SCR_rate', 0.09808650378746674)]


[('sdnn', 0.13990077), ('lf_rr', 0.13073882), ('bpm', 0.12886035), ('mean_SCL', 0.12775761), ('rmssd', 0.12377887), ('hf_rr', 0.122103505), ('SCR_rate', 0.114196084), ('lf_hf

In [None]:
# TRAIN ON WESAD AND TEST ON ASCERTAIN
# Reverse direction: fits every classifier in `models` on WESAD (dataset A,
# all phases) and evaluates it on ASCERTAIN (dataset B), with resampling enabled.
importlib.reload(train)
importlib.reload(dr_w)
importlib.reload(dt)


print(f"WESAD PHASES {model_phases_wesad} " + "-"*50)
x_a, y_a = train.Train_WESAD.get_wesad_data(metrics, model_phases_wesad, verbose=False, label_type=wesad_label_type, threshold=threshold, normalize=True)
x_b, y_b = train.Train_ASCERTAIN.get_ascertain_data(metrics, verbose=False, label_type=asc_label_type, threshold=asc_threshold, normalize=True)

# Drop every row that has at least one missing feature, together with the label
# row at the same position. Both datasets are filtered so that no NaN reaches
# the classifiers; a positional boolean mask is used instead of drop(labels=...)
# so the filter does not depend on a pristine 0..n-1 index.
keep_a = ~pd.isnull(x_a).any(axis=1).to_numpy()
x_a = x_a.loc[keep_a].reset_index(drop=True)
y_a = y_a.loc[keep_a].reset_index(drop=True)
keep_b = ~pd.isnull(x_b).any(axis=1).to_numpy()
x_b = x_b.loc[keep_b].reset_index(drop=True)
y_b = y_b.loc[keep_b].reset_index(drop=True)

x_a = x_a.drop(columns=["phaseId"])
x_b = x_b.drop(columns=["phaseId"])

# make sure subjects from different datasets aren't labeled with the same index
x_b["subject"] = x_b["subject"] + 500
y_b["subject"] = y_b["subject"] + 500

# One list of per-iteration results for every configured model.
acc_results = {model_name: [] for model_name in models}
reports = {model_name: [] for model_name in models}

num_iters = 1
get_importance = True
# Names used to label the importance vector returned for each model.
# NOTE(review): assumes the vector is ordered as `metrics` followed by
# lf_hf_ratio -- confirm against train_across_datasets.
feature_names = metrics + ["lf_hf_ratio"]

for _ in range(num_iters):
    out = train.Train_Multi_Dataset.train_across_datasets(
        models, x_a, y_a, x_b, y_b,
        by_subject=True, save_metrics=True, test_size=test_size,
        is_resample=True, get_importance=get_importance, drop_subject=True,
    )
    for model_name in acc_results:
        # out[model_name] is indexed as (accuracy, report, importances).
        acc_results[model_name].append(out[model_name][0])
        reports[model_name].append(out[model_name][1])
        if get_importance:
            try:
                print("")
                feature_imp = sorted(zip(feature_names, out[model_name][2]), key=lambda pair: pair[1], reverse=True)
                print(feature_imp)
            except Exception as e:
                # Best effort: a model without importances must not abort the run.
                print(e)
            print("")

for model_name in acc_results.keys():
    print(f"Model evaluation metrics for {model_name}:")
    for acc, report in zip(acc_results[model_name], reports[model_name]):
        p = report["precision"]
        r = report["recall"]
        f1 = report["f1"]
        auc = report["auc"]
        print(f"\tAccuracy: {acc}\n\tPrecision: {p}\n\tRecall: {r}\n\tF1-score: {f1}\n\tAUC score: {auc}\n" + "-"*40)
    print("")
print("\n")

WESAD PHASES ['Base', 'TSST', 'Medi_1', 'Fun', 'Medi_2'] --------------------------------------------------
Ratio of positive to negative labels (0.2727272727272727) is under 0.333, oversampling positive class.
Ratio of negative to positive labels (0.1721419185282523) is under 0.333, oversampling negative class.
y_train:
0    55
1    55
Name: label, dtype: int64
y_test:
0    1522
1    1522
Name: label, dtype: int64
Model LogReg, Predictions: [0 1], [ 667 2377]
Model RF, Predictions: [0 1], [2833  211]
Model XGB, Predictions: [0 1], [1943 1101]

[('mean_SCL', 1.9428702217029432), ('lf_rr', 1.4042261142716563), ('SCR_rate', 1.3444728769125638), ('bpm', 0.4647817318927647), ('rmssd', 0.30390965035169093), ('hf_rr', 0.042443078303880034), ('lf_hf_ratio', -0.8374083305525871), ('sdnn', -0.9990047766502216)]


[('mean_SCL', 0.21496519334376607), ('hf_rr', 0.18096905883929132), ('lf_rr', 0.15848928052501798), ('SCR_rate', 0.1261696319568389), ('lf_hf_ratio', 0.10049565444549417), ('sdnn', 0.0