In [291]:
# IMPORTING MODULES
import glob
import importlib
import matplotlib.pyplot as plt
import numpy as np
import os
cvx_path = os.path.abspath(os.path.join('..', 'cvxEDA', 'src'))
module_path = os.path.abspath(os.path.join('..', 'src'))
sys.path.append(module_path)
import pandas as pd
import random
import scipy.signal as ss
import sys

import tools.data_reader_apd as dr
import tools.display_tools as dt
import tools.preprocessing as preprocessing

from scipy.fft import fft, fftfreq, fftshift
from sklearn import svm
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split, cross_val_score, RepeatedKFold
from sklearn.preprocessing import normalize

import cvxopt.solvers
cvxopt.solvers.options['show_progress'] = False

import warnings
warnings.filterwarnings(
    "ignore", 
    category=RuntimeWarning
)


metrics = [
    # "rmssd", "hf_rr", "lf_rr", "ibi", 
    "bpm", "rmssd", "hf_rr", "lf_rr", "ibi", 
    "mean_SCL", "SCR_rate"
]

phases = {
    "Baseline": [dr.Phases.BASE_REST, dr.Phases.BASE_SPEECH],
    "Bug baseline": [dr.Phases.BUG_RELAX],
    "Speech baseline": [dr.Phases.SPEECH_RELAX],
    "Bug all": [dr.Phases.BUG_RELAX, dr.Phases.BUG_ANTICIPATE, dr.Phases.BUG_EXPOSURE, dr.Phases.BUG_BREAK, dr.Phases.BUG_REFLECT],
    "Speech all": [dr.Phases.SPEECH_RELAX, dr.Phases.SPEECH_ANTICIPATE, dr.Phases.SPEECH_EXPOSURE, dr.Phases.SPEECH_BREAK, dr.Phases.SPEECH_REFLECT],
    "Bug pre-anxiety": [dr.Phases.BUG_RELAX, dr.Phases.BUG_ANTICIPATE],
    "Speech pre-anxiety": [dr.Phases.SPEECH_RELAX, dr.Phases.SPEECH_ANTICIPATE],
    "Bug anxiety": [dr.Phases.BUG_EXPOSURE],
    "Speech anxiety": [dr.Phases.SPEECH_EXPOSURE],
    "Bug post-anxiety": [dr.Phases.BUG_BREAK, dr.Phases.BUG_REFLECT],
    "Speech post-anxiety": [dr.Phases.SPEECH_BREAK, dr.Phases.SPEECH_REFLECT],
}

test_phases = [
    phases["Baseline"],
    phases["Bug baseline"],
    phases["Speech baseline"],
    phases["Bug baseline"] + phases["Speech baseline"],
    phases["Baseline"] + phases["Bug baseline"],
    phases["Baseline"] + phases["Speech baseline"],
    phases["Baseline"] + phases["Bug baseline"] + phases["Speech baseline"],

    phases["Bug all"],
    phases["Speech all"],
    phases["Bug all"] + phases["Speech all"],
    phases["Baseline"] + phases["Bug all"],
    phases["Baseline"] + phases["Speech all"],
    phases["Baseline"] + phases["Bug all"] + phases["Speech all"],

    phases["Bug pre-anxiety"],
    phases["Speech pre-anxiety"],
    phases["Bug pre-anxiety"] + phases["Speech pre-anxiety"],
    phases["Baseline"] + phases["Bug pre-anxiety"],
    phases["Baseline"] + phases["Speech pre-anxiety"],
    phases["Baseline"] + phases["Bug pre-anxiety"] + phases["Speech pre-anxiety"],

    phases["Bug anxiety"],
    phases["Speech anxiety"],
    phases["Bug pre-anxiety"] + phases["Speech anxiety"],
    phases["Baseline"] + phases["Bug anxiety"],
    phases["Baseline"] + phases["Speech anxiety"],
    phases["Baseline"] + phases["Bug anxiety"] + phases["Speech anxiety"],

    phases["Bug post-anxiety"],
    phases["Speech post-anxiety"],
    phases["Bug post-anxiety"] + phases["Speech post-anxiety"],
    phases["Baseline"] + phases["Bug post-anxiety"],
    phases["Baseline"] + phases["Speech post-anxiety"],
    phases["Baseline"] + phases["Bug post-anxiety"] + phases["Speech post-anxiety"],

    phases["Bug pre-anxiety"] + phases["Bug anxiety"],
    phases["Speech pre-anxiety"] + phases["Speech anxiety"],
    phases["Bug pre-anxiety"] + phases["Bug anxiety"] + phases["Speech pre-anxiety"] + phases["Speech anxiety"],
    phases["Baseline"] + phases["Bug pre-anxiety"] + phases["Bug anxiety"],
    phases["Baseline"] + phases["Speech pre-anxiety"] + phases["Speech anxiety"],
    phases["Baseline"] + phases["Bug pre-anxiety"] + phases["Bug anxiety"] + phases["Speech pre-anxiety"] + phases["Speech anxiety"],

    phases["Bug post-anxiety"] + phases["Bug anxiety"],
    phases["Speech post-anxiety"] + phases["Speech anxiety"],
    phases["Bug post-anxiety"] + phases["Bug anxiety"] + phases["Speech post-anxiety"] + phases["Speech anxiety"],
    phases["Baseline"] + phases["Bug post-anxiety"] + phases["Bug anxiety"],
    phases["Baseline"] + phases["Speech post-anxiety"] + phases["Speech anxiety"],
    phases["Baseline"] + phases["Bug post-anxiety"] + phases["Bug anxiety"] + phases["Speech post-anxiety"] + phases["Speech anxiety"],

    phases["Bug pre-anxiety"] + phases["Bug post-anxiety"],
    phases["Speech pre-anxiety"] + phases["Speech post-anxiety"],
    phases["Bug pre-anxiety"] + phases["Bug post-anxiety"] + phases["Speech pre-anxiety"] + phases["Speech post-anxiety"],
    phases["Baseline"] + phases["Bug pre-anxiety"] + phases["Bug post-anxiety"],
    phases["Baseline"] + phases["Speech pre-anxiety"] + phases["Speech post-anxiety"],
    phases["Baseline"] + phases["Bug pre-anxiety"] + phases["Bug post-anxiety"] + phases["Speech pre-anxiety"] + phases["Speech post-anxiety"],
]

In [21]:
# RANKING PHASES BY HIGH TO LOW DISTRESS
SUDS_labels = [
    "Participant",
    "Baseline_SUDS",
    "BugBox_Relax_SUDS", "BugBox_Preparation_SUDS", "BugBox_Exposure_SUDS", "BugBox_Break_SUDS",
    "Speech_Relax_SUDS", "Speech_SUDS", "Speech_Exposure_SUDS", "Speech_Break_SUDS"
]


ha_participant_indices = [
    'P4', 'P6', 'P7', 'P8', 'P10', 'P12', 'P15', 'P16', 'P18', 'P22', 'P26', 'P27', 'P29', 'P31', 'P32', 'P33', 'P35', 'P42', 'P45', 'P47', 'P48', 'P49', 'P54', 'P55', 'P66', 'P69'
]

la_participant_indices = [
    'P14', 'P21', 'P23', 'P25', 'P34', 'P39', 'P43', 'P46', 'P51', 'P57', 'P71', 'P72', 'P77', 'P78', 'P79', 'P80', 'P82', 'P83', 'P84', 'P85', 'P87', 'P88', 'P89', 'P91', 'P92', 'P93'
]

participant_file = os.path.join(dr.Paths.DATA_DIR, "participants_details.csv")
df = pd.read_csv(participant_file)

suds_df = df[SUDS_labels]
ha_suds_df = suds_df.loc[suds_df['Participant'].isin(ha_participant_indices)]
la_suds_df = suds_df.loc[suds_df['Participant'].isin(la_participant_indices)]

ha_ranked = {}
la_ranked = {}

for i in range(ha_suds_df.shape[0]):
    phases_ranked = []
    for j in range(1, ha_suds_df.shape[1]):
        phases_ranked.append((ha_suds_df.iloc[i, j], ha_suds_df.columns[j]))
    ha_ranked[ha_suds_df.iloc[i, 0]] = phases_ranked

for i in range(la_suds_df.shape[0]):
    phases_ranked = []
    for j in range(1, la_suds_df.shape[1]):
        phases_ranked.append((la_suds_df.iloc[i, j], la_suds_df.columns[j]))
    la_ranked[la_suds_df.iloc[i, 0]] = phases_ranked

ha_labels = {}
la_labels = {}

for p in ha_ranked.keys():
    suds = ha_ranked[p]
    suds = suds.sort(key=lambda x:x[0])
    ha_labels[p] = [SUDS_labels.index(phase[1]) for phase in ha_ranked[p]]

for p in la_ranked.keys():
    suds = la_ranked[p]
    suds = suds.sort(key=lambda x:x[0])
    la_labels[p] = [SUDS_labels.index(phase[1]) for phase in la_ranked[p]]

# Phases ranked from low to high anxiety. Phase ID corresponds to phase index in SUDS_labels
ha_labels = pd.DataFrame.from_dict(ha_labels, orient='index')
la_labels = pd.DataFrame.from_dict(la_labels, orient='index')
# ha_labels = np.vstack(ha_labels)
# la_labels = np.vstack(la_labels)

In [575]:
NUM_SUBJECTS = 52

def get_apd_data_feature_fusion(metrics, phases):
    """
    Combines features s.t. each feature vector represents each of the metrics for a single phase.
    If there are 2 phases, then the first half of the rows correspond to the first phase, and the second half to the second phase.
    Number of columns = subject column + number of metrics
    Return: feature DataFrame, label DataFrame
    """
    metrics_folder = os.path.join(dr.Paths.DATA_DIR, "metrics")
    data_x = []
    data_y = []
    for phase in phases:
        print(f"Generating features for phase {phase} " + "-"*30)
        ha_features = []
        la_features = []
        for i in range(len(metrics)):
            metric = metrics[i]
            print(f"Generating features for metric {metric}")
            file = os.path.join(metrics_folder, f"{metric}_{phase}_ha.csv")
            arr = pd.read_csv(file, index_col=[0]).to_numpy()

            if i == 0:  # subject IDs
                ids = np.reshape(arr[:, 0], (arr[:, 0].size, 1))
                ha_features.append(ids)

            # arr = arr[1:, 1:]
            col_mean = np.nanmean(arr, axis=1)
            idx = np.where(np.isnan(arr))
            arr[idx] = np.take(col_mean, idx[0])
            arr = np.nan_to_num(arr)
            arr = np.mean(arr[:, 1:], axis=1)
            arr = np.reshape(arr, (arr.size, 1))
            # print(arr)
            # arr = normalize(arr)
            # arr = np.reshape(arr, (arr.size, 1))
            ha_features.append(arr)

            file = os.path.join(metrics_folder, f"{metric}_{phase}_la.csv")
            arr = pd.read_csv(file, index_col=[0]).to_numpy()

            if i == 0:  # subject IDs
                ids = np.reshape(arr[:, 0], (arr[:, 0].size, 1))
                la_features.append(ids)

            # arr = arr[1:, 1:]
            col_mean = np.nanmean(arr, axis=1)
            idx = np.where(np.isnan(arr))
            arr[idx] = np.take(col_mean, idx[0])
            arr = np.nan_to_num(arr)
            arr = np.mean(arr[:, 1:], axis=1)
            arr = np.reshape(arr, (arr.size, 1))
            # print(arr)
            # arr = normalize(arr)
            # arr = np.reshape(arr, (arr.size, 1))
            la_features.append(arr)

        ha_features = np.hstack(ha_features)  # horizontally concatenate metrics for one phase
        la_features = np.hstack(la_features)  # horizontally concatenate metrics for one phase
        x = np.vstack([ha_features, la_features])  # vertically concatenate metrics for one phase for HA and LA
        # print(f"x: {x.shape}")
        # print(f"y: {y.shape}")

        data_x.append(x)
        # print(f"x: {x.shape}")
        # print(f"y: {y.shape}")
    
    # phases concatenated vertically
    data_x = np.vstack(data_x)
    data_y = np.vstack([ha_labels, la_labels])
    # print(f"data_x: {data_x.shape}")
    # print(f"data_y: {data_y.shape}")

    return data_x, data_y


def format_input_for_ranking(x, y, metrics):
    num_subjects = y.shape[0]
    num_phases = x.shape[0] // num_subjects
    num_rankings = y.shape[1]
    x = np.repeat(x, repeats=num_rankings, axis=0)
    subject_col = x[:, 0]
    phases = np.reshape(y, (y.size, 1))
    phases = np.vstack([phases for _ in range(num_phases)]).flatten()
    rankings = np.asarray(list(range(1, 10))*num_subjects*num_phases)
    x = np.insert(x, 1, phases, axis=1)

    # print(x.shape)
    # print(y.shape)
    columns = metrics.copy()
    columns.insert(0, "subject")
    columns.insert(1, "phaseId")
    x = pd.DataFrame(data=x, columns=columns)
    y = pd.DataFrame({"subject": subject_col, "ranking": rankings})
    # print(x.head())
    # print(y.head())
    
    # train test split
    test_size = 0.1
    subjects = x.loc[:, "subject"].unique().tolist()
    test_subjects = random.sample(subjects, int(NUM_SUBJECTS*test_size))
    test = x.index[x["subject"].isin(test_subjects)].tolist()
    train = x.index[~x["subject"].isin(test_subjects)].tolist()

    print(f"test subjects: {test_subjects}")

    x_train = x.iloc[train, :].reset_index(drop=True)
    y_train = y.iloc[train, :].reset_index(drop=True)
    x_test = x.iloc[test, :].reset_index(drop=True)
    y_test = y.iloc[test, :].reset_index(drop=True)
    # print(f"x_train: {x_train.shape}")
    # print(f"y_train: {y_train.shape}")
    # print(f"x_test: {x_test.shape}")
    # print(f"y_test: {y_test.shape}")

    return x_train, y_train, x_test, y_test, test_subjects


def get_apd_data_ensemble(metrics, phases):
    """
    Combines features s.t. each feature vector represents each of the phases for a single metric.
    Return: x_trian, y_train, x_test, y_test
    """
    metrics_folder = os.path.join(dr.Paths.DATA_DIR, "metrics")
    data_x = []
    data_y = []
    for phase in phases:
        ha_features = []
        la_features = []
        for metric in metrics:
            file = os.path.join(metrics_folder, f"{metric}_{phase}_ha.csv")
            arr = pd.read_csv(file, header=None, index_col=[0]).to_numpy()
            arr = arr[1:, 1:]
            col_mean = np.nanmean(arr, axis=1)
            idx = np.where(np.isnan(arr))
            arr[idx] = np.take(col_mean, idx[0])
            arr = np.nan_to_num(arr)
            # arr = normalize(arr)
            arr = np.mean(arr, axis=1)
            arr = np.reshape(arr, (arr.size, 1))
            ha_features.append(arr)

            file = os.path.join(metrics_folder, f"{metric}_{phase}_la.csv")
            arr = pd.read_csv(file, header=None, index_col=[0]).to_numpy()
            arr = arr[1:, 1:]
            col_mean = np.nanmean(arr, axis=1)
            idx = np.where(np.isnan(arr))
            arr[idx] = np.take(col_mean, idx[0])
            arr = np.nan_to_num(arr)
            # arr = normalize(arr)
            arr = np.mean(arr, axis=1)
            arr = np.reshape(arr, (arr.size, 1))
            la_features.append(arr)

        ha_features = np.hstack(ha_features)
        la_features = np.hstack(la_features)
        x = np.vstack([ha_features, la_features])
        y = np.asarray([ha_labels, la_labels])
        # print(x.shape)
        # print(y.shape)

        data_x.append(x)
        data_y.append(y)
        # print(f"x: {x.shape}")
        # print(f"y: {y.shape}")
    
    data_x = np.vstack(data_x)
    data_y = np.vstack(data_y)

    # print(f"data_x: {data_x.shape}")
    # print(f"data_y: {data_y.shape}")
    test_size = 0.1
    test_indices = random.sample(range(NUM_SUBJECTS), int(NUM_SUBJECTS*test_size))
    # print(test_indices)
    x_train = []
    y_train = []
    x_test = []
    y_test = []

    for i in range(y.size):
        if i%NUM_SUBJECTS in test_indices:
            x_test.append(x[i, :])
            y_test.append(y[i, :])
        else:
            x_train.append(x[i, :])
            y_train.append(y[i, :])
    
    x_train = np.asarray(x_train)
    y_train = np.asarray(y_train).flatten()
    x_test = np.asarray(x_test)
    y_test = np.asarray(y_test).flatten()
    # print(f"x_train: {x_train.shape}")
    # print(f"y_train: {y_train.shape}")
    # print(f"x_test: {x_test.shape}")
    # print(f"y_test: {y_test.shape}")

    return x_train, y_train, x_test, y_test

In [598]:
import xgboost as xgb

model_phases = phases["Baseline"][0]
if type(model_phases) != list:
    model_phases = [model_phases]

x, y = get_apd_data_feature_fusion(metrics, model_phases)
num_subjects = y.shape[0]
num_phases = x.shape[0] // num_subjects
num_rankings = y.shape[1]
x_train, y_train, x_test, y_test, test_subjects = format_input_for_ranking(x, y, metrics)
num_train_subjects = num_subjects - len(test_subjects)
groups = [y_train.shape[0]//num_train_subjects//num_phases for _ in range(num_train_subjects*num_phases)]

# print(len(groups))
# print(groups[0])

# print(x_train.head())
# print(y_train.head())
# print(x_test.head())
# print(y_test.head())


Generating features for phase Baseline_Rest ------------------------------
Generating features for metric bpm
Generating features for metric rmssd
Generating features for metric hf_rr
Generating features for metric lf_rr
Generating features for metric ibi
Generating features for metric mean_SCL
Generating features for metric SCR_rate
test subjects: [22.0, 35.0, 18.0, 85.0, 88.0]


In [599]:
model = xgb.XGBRanker(  
    # tree_method='gpu_hist',
    # booster='gbtree',
    objective='rank:ndcg',
    n_estimators=100, 
    random_state=42, 
    learning_rate=0.1,
    # colsample_bytree=0.9, 
    # eta=0.05, 
    # max_depth=6, 
    # subsample=0.75 
)

model.fit(x_train, y_train.iloc[:, 1], group=groups, verbose=True)

In [612]:
def predict(data, model):
    subjects = data.loc[:, "subject"].unique()
    subject_list = list()
    phase_ids = list()
    ranks = list()
    for subject in subjects:
        df = data.loc[data["subject"] == subject]
        pred = model.predict(df)
        phaseId = np.array(df.reset_index()['phaseId'])
        pred = np.argsort(pred)  # lowest to highest anxiety
        phase_ids.extend(list(phaseId[pred]))
        subject_list.extend([subject]*len(pred))
        ranks.extend(list(range(1, len(pred)+1)))

    results = pd.DataFrame({'subject': subject_list, 'phaseId': phase_ids, 'ranking': ranks})
    results = results.sort_values(by=["subject", "phaseId"])
    
    return results

predicted = predict(x_test, model)

In [613]:
subjects = np.sort(predicted.loc[:, 'subject'].unique())
dfs = []
for subject in subjects:
    subject_df = predicted.loc[predicted['subject'] == subject].reset_index(drop=True)
    label = y_test[y_test['subject'] == subject].iloc[:, 1].reset_index(drop=True)
    subject_df['actual'] = label
    dfs.append(subject_df)

df_view = pd.concat(dfs, axis=1)
print(df_view)

   subject  phaseId  ranking  actual  subject  phaseId  ranking  actual  \
0     18.0      1.0        4       1     22.0      1.0        4       1   
1     18.0      2.0        3       2     22.0      2.0        3       2   
2     18.0      3.0        6       3     22.0      3.0        6       3   
3     18.0      4.0        8       4     22.0      4.0        8       4   
4     18.0      5.0        2       5     22.0      5.0        2       5   
5     18.0      6.0        1       6     22.0      6.0        1       6   
6     18.0      7.0        7       7     22.0      7.0        7       7   
7     18.0      8.0        9       8     22.0      8.0        9       8   
8     18.0      9.0        5       9     22.0      9.0        5       9   

   subject  phaseId  ranking  actual  subject  phaseId  ranking  actual  \
0     35.0      1.0        4       1     85.0      1.0        4       1   
1     35.0      2.0        3       2     85.0      2.0        3       2   
2     35.0      3.0     