In [4]:
# IMPORTING MODULES
# Grouped as: standard library -> third-party -> project-local. The project
# modules live outside the notebook directory, so sys.path is extended before
# they are imported.

# --- standard library ---
import glob
import importlib
import os
import random
import sys
import warnings

# --- third-party ---
import cvxopt.solvers
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.signal as ss
from scipy.fft import fft, fftfreq, fftshift
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split, cross_val_score, RepeatedKFold
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import normalize
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier

# --- path setup ---
# cvx_path is only computed here; it is not used in this cell.
# NOTE(review): presumably consumed by a later cell or meant to be added to
# sys.path as well -- confirm.
cvx_path = os.path.abspath(os.path.join('..', '..', 'cvxEDA', 'src'))
module_path = os.path.abspath(os.path.join('..', '..', 'src'))
# Guarded so that re-running this cell does not keep appending duplicates.
if module_path not in sys.path:
    sys.path.append(module_path)

# --- project-local (resolved through module_path) ---
import tools.data_reader_popane as dr_p
import tools.display_tools as dt
import tools.preprocessing as preprocessing
import train

# Silence the cvxopt solver's per-iteration progress output.
cvxopt.solvers.options['show_progress'] = False

# Hide RuntimeWarning / FutureWarning noise raised from here on.
warnings.filterwarnings(
    "ignore",
    category=RuntimeWarning
)
warnings.simplefilter(action='ignore', category=FutureWarning)

In [None]:
# CROSS-STUDY EVALUATION ON POPANE -- experiment configuration
# NOTE(review): this cell was previously titled "TRAIN ON APD AND TEST ON POPANE",
# but the code visible here only loads POPANE studies -- confirm the intended title.

# Re-import the project modules so on-disk edits are picked up without a kernel restart.
for _module in (train, dr_p, dt):
    importlib.reload(_module)

# Physiological features requested from the data loader.
_Metrics = train.Metrics
metrics = [
    _Metrics.BPM,
    _Metrics.RMSSD,
    _Metrics.HF_RR,
    _Metrics.LF_RR,
    _Metrics.IBI,
    _Metrics.SDNN,
    _Metrics.MEAN_SCL,
    _Metrics.SCR_RATE,
]

# POPANE studies taking part in the comparison (studies 4, 6 and 7 are disabled).
_active_study_numbers = (1, 2, 3, 5)
studies_popane = [f"Study{number}" for number in _active_study_numbers]

# One phase selector per study, derived from the study names so that both
# lists always stay index-aligned: dr_p.Study1.ALL, dr_p.Study2.ALL, ...
model_phases_popane = [getattr(dr_p, study).ALL for study in studies_popane]

popane_label_type = "affect"

# Classifiers to evaluate, keyed by the name used in the printed results.
models = dict(
    SVM=SVC(C=10, gamma=1),
    LogReg=LogisticRegression(max_iter=1000),
    XGB=XGBClassifier(use_label_encoder=False, objective="binary:logistic", eval_metric="logloss"),
)

# Both values are forwarded unchanged to the project's loading / training helpers.
threshold = "dynamic"
test_size = 1.0

# Evaluate every unordered pair of POPANE studies: load both, clean and scale
# their feature tables, run the cross-dataset training routine several times
# and print the averaged scores per model.

# Number of repeated train/evaluate rounds averaged for each study pair.
num_iters = 10
# Added to the second study's subject ids so they cannot clash with the first's.
subject_id_offset = 500
# Detailed metrics are printed only for models whose mean accuracy exceeds this.
detail_acc_threshold = 0.65
# Position of the first column that is min-max scaled.
# NOTE(review): assumes the columns before it are identifiers, not features -- confirm.
first_feature_col = 3


def _drop_incomplete_rows(features, labels):
    """Remove every row whose feature vector contains a NaN, from both frames.

    Rows are addressed by index label (not by position), so the correct rows
    are removed even when the index is not a default RangeIndex. `labels` is
    expected to share its index with `features`.

    Returns:
        (features, labels) as new objects without the incomplete rows.
    """
    has_nan = features.isnull().any(axis=1)  # axis is keyword-only in pandas >= 2.0
    incomplete = features.index[has_nan]
    return features.drop(incomplete, axis=0), labels.drop(incomplete, axis=0)


def _min_max_scale(features, first_col):
    """Return a copy of `features` with columns from `first_col` on scaled to [0, 1].

    A constant column (max == min) is mapped to 0.0 instead of being divided
    by zero, which would fill it with NaN and break the downstream estimators.
    """
    scaled = features.copy()
    for name in scaled.columns[first_col:]:
        column = scaled[name]
        low = column.min()
        span = column.max() - low
        scaled[name] = (column - low) / span if span != 0 else 0.0
    return scaled


def _load_popane_study(study_idx):
    """Load one configured POPANE study and return its cleaned (x, y) pair.

    Rows with missing feature values are removed and the "phaseId" column is
    dropped from the feature table.
    """
    x, y = train.Train_POPANE.get_popane_data(studies_popane[study_idx], metrics, model_phases_popane[study_idx], verbose=False, label_type=popane_label_type, threshold=threshold)
    x, y = _drop_incomplete_rows(x, y)
    return x.drop(["phaseId"], axis=1), y


for i in range(len(studies_popane)):
    for j in range(i + 1, len(studies_popane)):
        print(f"POPANE {studies_popane[i]} " + "-"*50)
        print(f"POPANE {studies_popane[j]} " + "-"*50)
        x_a, y_a = _load_popane_study(i)
        x_b, y_b = _load_popane_study(j)

        # make sure subjects from different datasets aren't labeled with the same index
        x_b["subject"] = x_b["subject"] + subject_id_offset

        if y_a.isnull().values.any():
            print("y_a contains NaN")

        if y_b.isnull().values.any():
            print("y_b contains NaN")

        # 0-1 scaling, done independently for each study
        x_a = _min_max_scale(x_a, first_feature_col)
        x_b = _min_max_scale(x_b, first_feature_col)

        # One result list per configured model, so the keys can never drift
        # away from the `models` dictionary.
        acc_results = {name: [] for name in models}
        reports = {name: [] for name in models}

        for _ in range(num_iters):
            out = train.Train_Multi_Dataset.train_across_datasets(models, x_a, y_a, x_b, y_b, by_subject=False, save_metrics=True, test_size=test_size)
            for model_name in models:
                # out[model_name] is indexed as (accuracy, metrics report).
                acc_results[model_name].append(out[model_name][0])
                reports[model_name].append(out[model_name][1])

        for model_name in models:
            acc = np.mean(acc_results[model_name])
            print(f"{model_name} accuracy over {num_iters} rounds: {acc}")
            if acc > detail_acc_threshold:
                print(f"Model evaluation metrics for {model_name}:")
                runs = reports[model_name]
                p = np.mean([run["precision"] for run in runs])
                r = np.mean([run["recall"] for run in runs])
                f1 = np.mean([run["f1"] for run in runs])
                auc = np.mean([run["auc"] for run in runs])
                print(f"Precision: {p}\nRecall: {r}\nF1-score: {f1}\nAUC score: {auc}")
                # Each run's report also carries an "actual vs pred" entry,
                # which the previously disabled bar chart was built from.

        print("\n")