In [1]:
# IMPORTING MODULES
import glob
import importlib
import matplotlib.pyplot as plt
import numpy as np
import os
cvx_path = os.path.abspath(os.path.join('..', 'cvxEDA', 'src'))
module_path = os.path.abspath(os.path.join('..', 'src'))
sys.path.append(module_path)
import pandas as pd
import random
import scipy.signal as ss
import sys

import tools.data_reader_apd as dr
import tools.display_tools as dt
import tools.preprocessing as preprocessing

from scipy.fft import fft, fftfreq, fftshift
from sklearn import svm
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split, cross_val_score, RepeatedKFold
from sklearn.preprocessing import normalize

import cvxopt.solvers
cvxopt.solvers.options['show_progress'] = False

import warnings
warnings.filterwarnings(
    "ignore", 
    category=RuntimeWarning
)


metrics = [
    # "rmssd", "hf_rr", "lf_rr", "ibi", 
    "bpm", "rmssd", "hf_rr", "lf_rr", "ibi", 
    "mean_SCL", "SCR_rate"
]

phases = {
    "Baseline": [dr.Phases.BASE_REST, dr.Phases.BASE_SPEECH],
    "Bug baseline": [dr.Phases.BUG_RELAX],
    "Speech baseline": [dr.Phases.SPEECH_RELAX],
    "Bug all": [dr.Phases.BUG_RELAX, dr.Phases.BUG_ANTICIPATE, dr.Phases.BUG_EXPOSURE, dr.Phases.BUG_BREAK, dr.Phases.BUG_REFLECT],
    "Speech all": [dr.Phases.SPEECH_RELAX, dr.Phases.SPEECH_ANTICIPATE, dr.Phases.SPEECH_EXPOSURE, dr.Phases.SPEECH_BREAK, dr.Phases.SPEECH_REFLECT],
    "Bug pre-anxiety": [dr.Phases.BUG_RELAX, dr.Phases.BUG_ANTICIPATE],
    "Speech pre-anxiety": [dr.Phases.SPEECH_RELAX, dr.Phases.SPEECH_ANTICIPATE],
    "Bug anxiety": [dr.Phases.BUG_EXPOSURE],
    "Speech anxiety": [dr.Phases.SPEECH_EXPOSURE],
    "Bug post-anxiety": [dr.Phases.BUG_BREAK, dr.Phases.BUG_REFLECT],
    "Speech post-anxiety": [dr.Phases.SPEECH_BREAK, dr.Phases.SPEECH_REFLECT],
}

test_phases = [
    phases["Baseline"],
    phases["Bug baseline"],
    phases["Speech baseline"],
    phases["Bug baseline"] + phases["Speech baseline"],
    phases["Baseline"] + phases["Bug baseline"],
    phases["Baseline"] + phases["Speech baseline"],
    phases["Baseline"] + phases["Bug baseline"] + phases["Speech baseline"],

    phases["Bug all"],
    phases["Speech all"],
    phases["Bug all"] + phases["Speech all"],
    phases["Baseline"] + phases["Bug all"],
    phases["Baseline"] + phases["Speech all"],
    phases["Baseline"] + phases["Bug all"] + phases["Speech all"],

    phases["Bug pre-anxiety"],
    phases["Speech pre-anxiety"],
    phases["Bug pre-anxiety"] + phases["Speech pre-anxiety"],
    phases["Baseline"] + phases["Bug pre-anxiety"],
    phases["Baseline"] + phases["Speech pre-anxiety"],
    phases["Baseline"] + phases["Bug pre-anxiety"] + phases["Speech pre-anxiety"],

    phases["Bug anxiety"],
    phases["Speech anxiety"],
    phases["Bug pre-anxiety"] + phases["Speech anxiety"],
    phases["Baseline"] + phases["Bug anxiety"],
    phases["Baseline"] + phases["Speech anxiety"],
    phases["Baseline"] + phases["Bug anxiety"] + phases["Speech anxiety"],

    phases["Bug post-anxiety"],
    phases["Speech post-anxiety"],
    phases["Bug post-anxiety"] + phases["Speech post-anxiety"],
    phases["Baseline"] + phases["Bug post-anxiety"],
    phases["Baseline"] + phases["Speech post-anxiety"],
    phases["Baseline"] + phases["Bug post-anxiety"] + phases["Speech post-anxiety"],

    phases["Bug pre-anxiety"] + phases["Bug anxiety"],
    phases["Speech pre-anxiety"] + phases["Speech anxiety"],
    phases["Bug pre-anxiety"] + phases["Bug anxiety"] + phases["Speech pre-anxiety"] + phases["Speech anxiety"],
    phases["Baseline"] + phases["Bug pre-anxiety"] + phases["Bug anxiety"],
    phases["Baseline"] + phases["Speech pre-anxiety"] + phases["Speech anxiety"],
    phases["Baseline"] + phases["Bug pre-anxiety"] + phases["Bug anxiety"] + phases["Speech pre-anxiety"] + phases["Speech anxiety"],

    phases["Bug post-anxiety"] + phases["Bug anxiety"],
    phases["Speech post-anxiety"] + phases["Speech anxiety"],
    phases["Bug post-anxiety"] + phases["Bug anxiety"] + phases["Speech post-anxiety"] + phases["Speech anxiety"],
    phases["Baseline"] + phases["Bug post-anxiety"] + phases["Bug anxiety"],
    phases["Baseline"] + phases["Speech post-anxiety"] + phases["Speech anxiety"],
    phases["Baseline"] + phases["Bug post-anxiety"] + phases["Bug anxiety"] + phases["Speech post-anxiety"] + phases["Speech anxiety"],

    phases["Bug pre-anxiety"] + phases["Bug post-anxiety"],
    phases["Speech pre-anxiety"] + phases["Speech post-anxiety"],
    phases["Bug pre-anxiety"] + phases["Bug post-anxiety"] + phases["Speech pre-anxiety"] + phases["Speech post-anxiety"],
    phases["Baseline"] + phases["Bug pre-anxiety"] + phases["Bug post-anxiety"],
    phases["Baseline"] + phases["Speech pre-anxiety"] + phases["Speech post-anxiety"],
    phases["Baseline"] + phases["Bug pre-anxiety"] + phases["Bug post-anxiety"] + phases["Speech pre-anxiety"] + phases["Speech post-anxiety"],
]

In [2]:
# RANKING PHASES BY HIGH TO LOW DISTRESS
SUDS_labels = [
    "Participant",
    "Baseline_SUDS",
    "BugBox_Relax_SUDS", "BugBox_Preparation_SUDS", "BugBox_Exposure_SUDS", "BugBox_Break_SUDS",
    "Speech_Relax_SUDS", "Speech_SUDS", "Speech_Exposure_SUDS", "Speech_Break_SUDS"
]


ha_participant_indices = [
    'P4', 'P6', 'P7', 'P8', 'P10', 'P12', 'P15', 'P16', 'P18', 'P22', 'P26', 'P27', 'P29', 'P31', 'P32', 'P33', 'P35', 'P42', 'P45', 'P47', 'P48', 'P49', 'P54', 'P55', 'P66', 'P69'
]

la_participant_indices = [
    'P14', 'P21', 'P23', 'P25', 'P34', 'P39', 'P43', 'P46', 'P51', 'P57', 'P71', 'P72', 'P77', 'P78', 'P79', 'P80', 'P82', 'P83', 'P84', 'P85', 'P87', 'P88', 'P89', 'P91', 'P92', 'P93'
]

participant_file = os.path.join(dr.Paths.DATA_DIR, "participants_details.csv")
df = pd.read_csv(participant_file)

suds_df = df[SUDS_labels]
ha_suds_df = suds_df.loc[suds_df['Participant'].isin(ha_participant_indices)]
la_suds_df = suds_df.loc[suds_df['Participant'].isin(la_participant_indices)]

ha_ranked = {}
la_ranked = {}

for i in range(ha_suds_df.shape[0]):
    phases_ranked = []
    for j in range(1, ha_suds_df.shape[1]):
        phases_ranked.append((ha_suds_df.iloc[i, j], ha_suds_df.columns[j]))
    ha_ranked[ha_suds_df.iloc[i, 0]] = phases_ranked

for i in range(la_suds_df.shape[0]):
    phases_ranked = []
    for j in range(1, la_suds_df.shape[1]):
        phases_ranked.append((la_suds_df.iloc[i, j], la_suds_df.columns[j]))
    la_ranked[la_suds_df.iloc[i, 0]] = phases_ranked

for p in ha_ranked.keys():
    suds = ha_ranked[p]
    suds = suds.sort(key=lambda x:x[0])

for p in la_ranked.keys():
    suds = la_ranked[p]
    suds = suds.sort(key=lambda x:x[0])

ha_labels = []
la_labels = []

for p in ha_ranked.keys():
    ha_labels.append([SUDS_labels.index(phase[1]) for phase in ha_ranked[p]])
for p in la_ranked.keys():
    la_labels.append([SUDS_labels.index(phase[1]) for phase in la_ranked[p]])

ha_labels = np.vstack(ha_labels)
la_labels = np.vstack(la_labels)

In [22]:
NUM_SUBJECTS = 52

def get_apd_data_feature_fusion(metrics, phases):
    """
    Combines features s.t. each feature vector represents each of the metrics for a single phase.
    Return: x_train, y_train, x_test, y_test
    """
    metrics_folder = os.path.join(dr.Paths.DATA_DIR, "metrics")
    data_x = []
    data_y = []
    for phase in phases:
        ha_features = []
        la_features = []
        for metric in metrics:
            file = os.path.join(metrics_folder, f"{metric}_{phase}_ha.csv")
            arr = pd.read_csv(file, header=None, index_col=[0]).to_numpy()
            arr = arr[1:, 1:]
            col_mean = np.nanmean(arr, axis=1)
            idx = np.where(np.isnan(arr))
            arr[idx] = np.take(col_mean, idx[0])
            arr = np.nan_to_num(arr)
            # arr = normalize(arr)
            arr = np.mean(arr, axis=1)
            arr = np.reshape(arr, (arr.size, 1))
            ha_features.append(arr)

            file = os.path.join(metrics_folder, f"{metric}_{phase}_la.csv")
            arr = pd.read_csv(file, header=None, index_col=[0]).to_numpy()
            arr = arr[1:, 1:]
            col_mean = np.nanmean(arr, axis=1)
            idx = np.where(np.isnan(arr))
            arr[idx] = np.take(col_mean, idx[0])
            arr = np.nan_to_num(arr)
            # arr = normalize(arr)
            arr = np.mean(arr, axis=1)
            arr = np.reshape(arr, (arr.size, 1))
            la_features.append(arr)

        ha_features = np.hstack(ha_features)
        la_features = np.hstack(la_features)
        x = np.vstack([ha_features, la_features])
        y = np.vstack([ha_labels, la_labels])
        # print(f"x: {x.shape}")
        # print(f"y: {y.shape}")

        data_x.append(x)
        data_y.append(y)
        # print(f"x: {x.shape}")
        # print(f"y: {y.shape}")
    
    data_x = np.vstack(data_x)
    data_y = np.vstack(data_y)

    print(f"data_x: {data_x.shape}")
    print(f"data_y: {data_y.shape}")
    test_size = 0.1
    test_indices = random.sample(range(NUM_SUBJECTS), int(NUM_SUBJECTS*test_size))
    # print(test_indices)
    x_train = []
    y_train = []
    x_test = []
    y_test = []
    
    for i in range(y.shape[0]):
        if i%NUM_SUBJECTS in test_indices:
            x_test.append(x[i, :])
            y_test.append(y[i, :])
        else:
            x_train.append(x[i, :])
            y_train.append(y[i, :])
    
    x_train = np.vstack(x_train)
    y_train = np.asarray(y_train).flatten()
    x_test = np.vstack(x_test)
    y_test = np.asarray(y_test).flatten()
    # print(f"x_train: {x_train.shape}")
    # print(f"y_train: {y_train.shape}")
    # print(f"x_test: {x_test.shape}")
    # print(f"y_test: {y_test.shape}")
    
    return x_train, y_train, x_test, y_test


def get_apd_data_ensemble(metrics, phases):
    """
    Combines features s.t. each feature vector represents each of the phases for a single metric.
    Return: x_trian, y_train, x_test, y_test
    """
    metrics_folder = os.path.join(dr.Paths.DATA_DIR, "metrics")
    data_x = []
    data_y = []
    for phase in phases:
        ha_features = []
        la_features = []
        for metric in metrics:
            file = os.path.join(metrics_folder, f"{metric}_{phase}_ha.csv")
            arr = pd.read_csv(file, header=None, index_col=[0]).to_numpy()
            arr = arr[1:, 1:]
            col_mean = np.nanmean(arr, axis=1)
            idx = np.where(np.isnan(arr))
            arr[idx] = np.take(col_mean, idx[0])
            arr = np.nan_to_num(arr)
            # arr = normalize(arr)
            arr = np.mean(arr, axis=1)
            arr = np.reshape(arr, (arr.size, 1))
            ha_features.append(arr)

            file = os.path.join(metrics_folder, f"{metric}_{phase}_la.csv")
            arr = pd.read_csv(file, header=None, index_col=[0]).to_numpy()
            arr = arr[1:, 1:]
            col_mean = np.nanmean(arr, axis=1)
            idx = np.where(np.isnan(arr))
            arr[idx] = np.take(col_mean, idx[0])
            arr = np.nan_to_num(arr)
            # arr = normalize(arr)
            arr = np.mean(arr, axis=1)
            arr = np.reshape(arr, (arr.size, 1))
            la_features.append(arr)

        ha_features = np.hstack(ha_features)
        la_features = np.hstack(la_features)
        x = np.vstack([ha_features, la_features])
        y = np.asarray([ha_labels, la_labels])
        # print(x.shape)
        # print(y.shape)

        data_x.append(x)
        data_y.append(y)
        # print(f"x: {x.shape}")
        # print(f"y: {y.shape}")
    
    data_x = np.vstack(data_x)
    data_y = np.vstack(data_y)

    # print(f"data_x: {data_x.shape}")
    # print(f"data_y: {data_y.shape}")
    test_size = 0.1
    test_indices = random.sample(range(NUM_SUBJECTS), int(NUM_SUBJECTS*test_size))
    # print(test_indices)
    x_train = []
    y_train = []
    x_test = []
    y_test = []

    for i in range(y.size):
        if i%NUM_SUBJECTS in test_indices:
            x_test.append(x[i, :])
            y_test.append(y[i, :])
        else:
            x_train.append(x[i, :])
            y_train.append(y[i, :])
    
    x_train = np.asarray(x_train)
    y_train = np.asarray(y_train).flatten()
    x_test = np.asarray(x_test)
    y_test = np.asarray(y_test).flatten()
    # print(f"x_train: {x_train.shape}")
    # print(f"y_train: {y_train.shape}")
    # print(f"x_test: {x_test.shape}")
    # print(f"y_test: {y_test.shape}")

    return x_train, y_train, x_test, y_test

In [23]:
import xgboost as xgb

model_phases = phases["Baseline"]

x_train, y_train, x_test, y_test = get_apd_data_feature_fusion(metrics, model_phases)
groups = [x_train.shape[0]]
print(x_train.shape)
print(y_train.shape)

model = xgb.XGBRanker(  
    # tree_method='gpu_hist',
    booster='gbtree',
    objective='rank:map',
    random_state=42, 
    learning_rate=0.1,
    colsample_bytree=0.9, 
    eta=0.05, 
    max_depth=6, 
    n_estimators=110, 
    subsample=0.75 
)


model.fit(x_train, y_train, group=groups, verbose=True)

data_x: (104, 7)
data_y: (104, 9)
(47, 7)
(423,)


XGBoostError: [08:19:27] C:/Users/Administrator/workspace/xgboost-win64_release_1.4.0/src/data/data.cc:583: Check failed: labels_.Size() == num_row_ (423 vs. 47) : Size of labels must equal to number of rows.