In [11]:
# IMPORTING MODULES
import glob
import importlib
import matplotlib.pyplot as plt
import numpy as np
import os
cvx_path = os.path.abspath(os.path.join('..', 'cvxEDA', 'src'))
module_path = os.path.abspath(os.path.join('..', 'src'))
sys.path.append(module_path)
import pandas as pd
import random
import scipy.signal as ss
import sys

import tools.data_reader_apd as dr
import tools.display_tools as dt
import tools.preprocessing as preprocessing

from scipy.fft import fft, fftfreq, fftshift
from sklearn import svm
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split, cross_val_score, RepeatedKFold
from sklearn.preprocessing import normalize

import cvxopt.solvers
cvxopt.solvers.options['show_progress'] = False

import warnings
warnings.filterwarnings(
    "ignore", 
    category=RuntimeWarning
)


metrics = [
    # "rmssd", "hf_rr", "lf_rr", "ibi", 
    "bpm", "rmssd", "hf_rr", "lf_rr", "ibi", 
    "mean_SCL", "SCR_rate"
]

phases = {
    "Baseline": [dr.Phases.BASE_REST, dr.Phases.BASE_SPEECH],
    "Bug baseline": [dr.Phases.BUG_RELAX],
    "Speech baseline": [dr.Phases.SPEECH_RELAX],
    "Bug all": [dr.Phases.BUG_RELAX, dr.Phases.BUG_ANTICIPATE, dr.Phases.BUG_EXPOSURE, dr.Phases.BUG_BREAK, dr.Phases.BUG_REFLECT],
    "Speech all": [dr.Phases.SPEECH_RELAX, dr.Phases.SPEECH_ANTICIPATE, dr.Phases.SPEECH_EXPOSURE, dr.Phases.SPEECH_BREAK, dr.Phases.SPEECH_REFLECT],
    "Bug pre-anxiety": [dr.Phases.BUG_RELAX, dr.Phases.BUG_ANTICIPATE],
    "Speech pre-anxiety": [dr.Phases.SPEECH_RELAX, dr.Phases.SPEECH_ANTICIPATE],
    "Bug anxiety": [dr.Phases.BUG_EXPOSURE],
    "Speech anxiety": [dr.Phases.SPEECH_EXPOSURE],
    "Bug post-anxiety": [dr.Phases.BUG_BREAK, dr.Phases.BUG_REFLECT],
    "Speech post-anxiety": [dr.Phases.SPEECH_BREAK, dr.Phases.SPEECH_REFLECT],
}

test_phases = [
    phases["Baseline"],
    phases["Bug baseline"],
    phases["Speech baseline"],
    phases["Bug baseline"] + phases["Speech baseline"],
    phases["Baseline"] + phases["Bug baseline"],
    phases["Baseline"] + phases["Speech baseline"],
    phases["Baseline"] + phases["Bug baseline"] + phases["Speech baseline"],

    phases["Bug all"],
    phases["Speech all"],
    phases["Bug all"] + phases["Speech all"],
    phases["Baseline"] + phases["Bug all"],
    phases["Baseline"] + phases["Speech all"],
    phases["Baseline"] + phases["Bug all"] + phases["Speech all"],

    phases["Bug pre-anxiety"],
    phases["Speech pre-anxiety"],
    phases["Bug pre-anxiety"] + phases["Speech pre-anxiety"],
    phases["Baseline"] + phases["Bug pre-anxiety"],
    phases["Baseline"] + phases["Speech pre-anxiety"],
    phases["Baseline"] + phases["Bug pre-anxiety"] + phases["Speech pre-anxiety"],

    phases["Bug anxiety"],
    phases["Speech anxiety"],
    phases["Bug pre-anxiety"] + phases["Speech anxiety"],
    phases["Baseline"] + phases["Bug anxiety"],
    phases["Baseline"] + phases["Speech anxiety"],
    phases["Baseline"] + phases["Bug anxiety"] + phases["Speech anxiety"],

    phases["Bug post-anxiety"],
    phases["Speech post-anxiety"],
    phases["Bug post-anxiety"] + phases["Speech post-anxiety"],
    phases["Baseline"] + phases["Bug post-anxiety"],
    phases["Baseline"] + phases["Speech post-anxiety"],
    phases["Baseline"] + phases["Bug post-anxiety"] + phases["Speech post-anxiety"],

    phases["Bug pre-anxiety"] + phases["Bug anxiety"],
    phases["Speech pre-anxiety"] + phases["Speech anxiety"],
    phases["Bug pre-anxiety"] + phases["Bug anxiety"] + phases["Speech pre-anxiety"] + phases["Speech anxiety"],
    phases["Baseline"] + phases["Bug pre-anxiety"] + phases["Bug anxiety"],
    phases["Baseline"] + phases["Speech pre-anxiety"] + phases["Speech anxiety"],
    phases["Baseline"] + phases["Bug pre-anxiety"] + phases["Bug anxiety"] + phases["Speech pre-anxiety"] + phases["Speech anxiety"],

    phases["Bug post-anxiety"] + phases["Bug anxiety"],
    phases["Speech post-anxiety"] + phases["Speech anxiety"],
    phases["Bug post-anxiety"] + phases["Bug anxiety"] + phases["Speech post-anxiety"] + phases["Speech anxiety"],
    phases["Baseline"] + phases["Bug post-anxiety"] + phases["Bug anxiety"],
    phases["Baseline"] + phases["Speech post-anxiety"] + phases["Speech anxiety"],
    phases["Baseline"] + phases["Bug post-anxiety"] + phases["Bug anxiety"] + phases["Speech post-anxiety"] + phases["Speech anxiety"],

    phases["Bug pre-anxiety"] + phases["Bug post-anxiety"],
    phases["Speech pre-anxiety"] + phases["Speech post-anxiety"],
    phases["Bug pre-anxiety"] + phases["Bug post-anxiety"] + phases["Speech pre-anxiety"] + phases["Speech post-anxiety"],
    phases["Baseline"] + phases["Bug pre-anxiety"] + phases["Bug post-anxiety"],
    phases["Baseline"] + phases["Speech pre-anxiety"] + phases["Speech post-anxiety"],
    phases["Baseline"] + phases["Bug pre-anxiety"] + phases["Bug post-anxiety"] + phases["Speech pre-anxiety"] + phases["Speech post-anxiety"],
]

In [12]:
# RANKING PHASES BY LOW TO HIGH ANXIETY
SUDS_labels = [
    "Participant",
    "Baseline_SUDS",
    # "BugBox_Relax_SUDS", "BugBox_Preparation_SUDS", "BugBox_Exposure_SUDS", "BugBox_Break_SUDS",
    "BugBox_Relax_SUDS", "BugBox_Preparation_SUDS", "BugBox_Break_SUDS",
    "Speech_Relax_SUDS", "Speech_SUDS", "Speech_Exposure_SUDS", "Speech_Break_SUDS"
]


ha_participant_indices = [
    'P4', 'P6', 'P7', 'P8', 'P10', 'P12', 'P15', 'P16', 'P18', 'P22', 'P26', 'P27', 'P29', 'P31', 'P32', 'P33', 'P35', 'P42', 'P45', 'P47', 'P48', 'P49', 'P54', 'P55', 'P66', 'P69'
]

la_participant_indices = [
    'P14', 'P21', 'P23', 'P25', 'P34', 'P39', 'P43', 'P46', 'P51', 'P57', 'P71', 'P72', 'P77', 'P78', 'P79', 'P80', 'P82', 'P83', 'P84', 'P85', 'P87', 'P88', 'P89', 'P91', 'P92', 'P93'
]

participant_file = os.path.join(dr.Paths.DATA_DIR, "participants_details.csv")
df = pd.read_csv(participant_file)

suds_df = df[SUDS_labels]
ha_suds_df = suds_df.loc[suds_df['Participant'].isin(ha_participant_indices)]
la_suds_df = suds_df.loc[suds_df['Participant'].isin(la_participant_indices)]

ha_ranked = {}
la_ranked = {}

for i in range(ha_suds_df.shape[0]):
    phases_ranked = []
    for j in range(1, ha_suds_df.shape[1]):
        phases_ranked.append((ha_suds_df.iloc[i, j], ha_suds_df.columns[j]))
    ha_ranked[ha_suds_df.iloc[i, 0]] = phases_ranked

for i in range(la_suds_df.shape[0]):
    phases_ranked = []
    for j in range(1, la_suds_df.shape[1]):
        phases_ranked.append((la_suds_df.iloc[i, j], la_suds_df.columns[j]))
    la_ranked[la_suds_df.iloc[i, 0]] = phases_ranked

ha_labels = {}
la_labels = {}

for s in ha_ranked.keys():
    suds = ha_ranked[s]
    suds.sort(key=lambda x:x[0])
    ordered = [phase[1] for phase in ha_ranked[s]]
    ha_labels[int(s[1:])] = [ordered.index(p) for p in SUDS_labels[1:]]

for s in la_ranked.keys():
    suds = la_ranked[s]
    suds.sort(key=lambda x:x[0])
    ordered = [phase[1] for phase in la_ranked[s]]
    la_labels[int(s[1:])] = [ordered.index(p) for p in SUDS_labels[1:]]

# Phase index + 1 corresponds to phase name in SUDS_labels. Value corresponds to ranking from low to high anxiety.
ha_rankings = pd.DataFrame(ha_labels.values(), columns=[i for i in range(len(SUDS_labels[1:]))])
subjects = pd.DataFrame(ha_labels.keys())
ha_rankings.insert(0, "subject", subjects)
la_rankings = pd.DataFrame(la_labels.values(), columns=[i for i in range(len(SUDS_labels[1:]))])
subjects = pd.DataFrame(la_labels.keys())
la_rankings.insert(0, "subject", subjects)

In [13]:
RANKING_PHASES = [
    dr.Phases.BASE_REST,
    # dr.Phases.BUG_RELAX, dr.Phases.BUG_ANTICIPATE, dr.Phases.BUG_EXPOSURE, dr.Phases.BUG_BREAK,
    dr.Phases.BUG_RELAX, dr.Phases.BUG_ANTICIPATE, dr.Phases.BUG_BREAK,
    dr.Phases.SPEECH_RELAX, dr.Phases.SPEECH_ANTICIPATE, dr.Phases.SPEECH_EXPOSURE, dr.Phases.SPEECH_BREAK
]

NUM_SUBJECTS = 52
TEST_SIZE = 0.1

def get_apd_data_ranking(metrics, phases, verbose=False):
    metrics_folder = dr.Paths.METRICS

    columns = metrics.copy()
    columns.insert(0, "subject")

    data_x = []
    data_y = pd.concat([ha_rankings, la_rankings], axis=0).reset_index(drop=True)
    # print(data_y.head())

    for phase in phases:
        if verbose: print(f"Generating features for phase {phase} " + "-"*30)
        phase_id = phases.index(phase)
        ha_features = []
        la_features = []

        for i in range(len(metrics)):
            metric = metrics[i]
            if verbose: print(f"Generating features for metric {metric}")
            file = os.path.join(metrics_folder, f"{metric}_{phase}_ha.csv")
            arr = pd.read_csv(file, index_col=[0]).to_numpy()

            if i == 0:  # subject IDs
                ids = np.reshape(arr[:, 0], (arr[:, 0].size, 1))
                ids = pd.DataFrame(data=ids, columns=["subject"])
                ha_features.append(ids)

            # arr = arr[1:, 1:]
            col_mean = np.nanmean(arr, axis=1)
            idx = np.where(np.isnan(arr))
            arr[idx] = np.take(col_mean, idx[0])
            arr = np.nan_to_num(arr)
            arr = np.mean(arr[:, 1:], axis=1)
            arr = np.reshape(arr, (arr.size, 1))
            arr = pd.DataFrame(data=arr, columns=[f"{metric}"])
            ha_features.append(arr)

            file = os.path.join(metrics_folder, f"{metric}_{phase}_la.csv")
            arr = pd.read_csv(file, index_col=[0]).to_numpy()

            if i == 0:  # subject IDs
                ids = np.reshape(arr[:, 0], (arr[:, 0].size, 1))
                ids = pd.DataFrame(data=ids, columns=["subject"])
                la_features.append(ids)

            # arr = arr[1:, 1:]
            col_mean = np.nanmean(arr, axis=1)
            idx = np.where(np.isnan(arr))
            arr[idx] = np.take(col_mean, idx[0])
            arr = np.nan_to_num(arr)
            arr = np.mean(arr[:, 1:], axis=1)
            arr = np.reshape(arr, (arr.size, 1))
            arr = pd.DataFrame(data=arr, columns=[f"{metric}"])
            la_features.append(arr)

        ha_features = pd.concat(ha_features, axis=1)
        la_features = pd.concat(la_features, axis=1)
        x = pd.concat([ha_features, la_features], axis=0)
        # print(x["subject"].value_counts().iloc[0:8])
        phase = pd.DataFrame(data=[phase_id for _ in range(x.shape[0])])
        x.insert(1, "phaseId", phase)

        data_x.append(x)
    
    data_x = pd.concat(data_x).reset_index(drop=True)
    # data_x.sort_values(by=["phaseId", "subject"], inplace=True)

    # print(data_y)

    subjects = data_x.loc[:, "subject"]
    # print(subjects.shape)
    # print(subjects.value_counts())
    # print(subjects.unique().shape)
    phase_col = data_x.loc[:, "phaseId"]
    ranking_col = []

    for i in range(data_x.shape[0]):
        s = subjects.iloc[i]
        p = phase_col.iloc[i]
        rank = int(data_y.loc[data_y["subject"] == s].loc[:, p])
        ranking_col.append(rank)
    
    data_y = pd.DataFrame({"subject": subjects, "phaseId": phase_col, "ranking": ranking_col})
    # data_y = pd.DataFrame({"ranking": ranking_col})

    # print(data_x.shape)
    # print(data_y.shape)
    # print(data_x.head())
    # print(data_y.head())
    
    return data_x, data_y


def train_test_split(x, y):
    subjects = list(x.loc[:, "subject"].unique())
    test_subjects = random.sample(subjects, int(NUM_SUBJECTS*TEST_SIZE))
    print(f"test subjects: {test_subjects}")
    x_train = x[~x["subject"].isin(test_subjects)]
    y_train = y[~y["subject"].isin(test_subjects)]
    x_test = x[x["subject"].isin(test_subjects)]
    y_test = y[y["subject"].isin(test_subjects)]

    # print(x.shape)
    # print(y.shape)
    # print(x_train.shape)
    # print(y_train.shape)
    # print(x_test.shape)
    # print(y_test.shape)

    return x_train, y_train, x_test, y_test


def predict(data, labels, model):
    test_subjects = list(data.loc[:, "subject"].unique())
    subject_list = []
    phase_ids = []
    ranks = []
    for s in test_subjects:
        df = data.loc[data["subject"] == s]
        # print(df.iloc[0:8, 0:3])
        pred = model.predict(df)
        # print(f"pred: {pred}")
        phaseId = np.array(df.reset_index()['phaseId'])
        pred = np.argsort(pred)  # lowest to highest anxiety
        # print(f"pred indices: {pred}")
        phase_ranking = list(phaseId[pred])
        # print(f"phase ranking: {phase_ranking}")
        phase_ids.extend(phase_ranking)
        subject_list.extend([s]*len(pred))
        ranks.extend(list(range(len(pred))))
    
    actual = labels.sort_values(by=["subject"]).loc[:, "ranking"]
    results = pd.DataFrame({"subject": subject_list, "phaseId": phase_ids, "ranking": ranks, "actual": actual})
    
    return results

# print(x.head())
# print(y.head())

# print(x_train.shape)
# print(y_train.shape)
# print(x_test.shape)
# print(y_test.shape)

# print(x_train.head())
# print(y_train.head())
# print(x_test.head())
# print(y_test.head())

In [14]:
import xgboost as xgb

model_phases = [
    dr.Phases.BASE_REST,
    dr.Phases.BUG_RELAX, dr.Phases.BUG_ANTICIPATE, dr.Phases.BUG_BREAK,
    dr.Phases.SPEECH_RELAX, dr.Phases.SPEECH_ANTICIPATE, dr.Phases.SPEECH_EXPOSURE, dr.Phases.SPEECH_BREAK
]

if type(model_phases) != list:
    model_phases = [model_phases]

x, y = get_apd_data_ranking(metrics, model_phases, verbose=False)
# drop subjects with noisy data
x = x[x['subject'] != 84.0]
y = y[y['subject'] != 84.0]
NUM_SUBJECTS -= 1

# print(x.shape)
# print(y.shape)
# print(x["subject"].value_counts())

num_phases = len(model_phases)
num_rankings = y.shape[1]
x_train, y_train, x_test, y_test = train_test_split(x, y)
num_train_subjects = NUM_SUBJECTS - int(NUM_SUBJECTS*TEST_SIZE)
groups = [num_phases for _ in range(num_train_subjects)]

# print(len(groups))
# print(groups[0])

# print(x_train.shape)
# print(y_train.shape)
# print(x_test.shape)
# print(y_test.shape)

# print(x_train.head())
# print(y_train.head())
# print(x_test.head())
# print(y_test.head())

model = xgb.XGBRanker(  
    # tree_method='gpu_hist',
    # booster='gbtree',
    objective='rank:ndcg',
    n_estimators=100, 
    random_state=42, 
    learning_rate=0.1,
    # colsample_bytree=0.9, 
    # eta=0.05, 
    # max_depth=6, 
    # subsample=0.75 
)

model.fit(x_train, y_train.loc[:, "ranking"], group=groups, verbose=True)
predicted = predict(x_test, y_test, model)

test subjects: [69.0, 46.0, 27.0, 91.0, 21.0]


  from pandas import MultiIndex, Int64Index


In [15]:
# print(predicted)
# print(y_test)
subjects = list(predicted.loc[:, "subject"].unique())
predicted = predicted.sort_values(by=["subject", "phaseId"]).reset_index(drop=True)

print(predicted)

    subject  phaseId  ranking  actual
0      21.0        0        6       2
1      21.0        1        7       4
2      21.0        2        0       0
3      21.0        3        3       1
4      21.0        4        2       6
5      21.0        5        5       5
6      21.0        6        1       3
7      21.0        7        4       7
8      27.0        0        5       4
9      27.0        1        3       5
10     27.0        2        6       2
11     27.0        3        1       6
12     27.0        4        2       1
13     27.0        5        7       3
14     27.0        6        4       0
15     27.0        7        0       7
16     46.0        0        2       6
17     46.0        1        7       2
18     46.0        2        0       1
19     46.0        3        3       5
20     46.0        4        1       0
21     46.0        5        5       4
22     46.0        6        6       3
23     46.0        7        4       7
24     69.0        0        5       0
25     69.0 