### Can we identify the subject based on their physiological signals and self-reports? ###

In [1]:
# IMPORTING MODULES
import glob
import importlib
import os
import random
import sys

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.signal as ss

# `sys` must be imported before the project source directory is appended to
# the module search path; the `tools.*` imports below rely on that path.
cvx_path = os.path.abspath(os.path.join('..', 'cvxEDA', 'src'))
module_path = os.path.abspath(os.path.join('..', 'src'))
sys.path.append(module_path)

import tools.data_reader_apd as dr
import tools.display_tools as dt
import tools.preprocessing as preprocessing

from scipy.fft import fft, fftfreq, fftshift
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split, cross_val_score, RepeatedKFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import normalize

import cvxopt.solvers
cvxopt.solvers.options['show_progress'] = False

import warnings
warnings.filterwarnings(
    "ignore", 
    category=RuntimeWarning
)


# Named groups of experiment phases. Each key is a human-readable group name
# and each value is the list of `dr.Phases` members belonging to that group.
# The lists are concatenated with `+` elsewhere to build larger selections.
phases = {
    "Baseline": [
        dr.Phases.BASE_REST,
        dr.Phases.BASE_SPEECH,
    ],
    "Bug baseline": [
        dr.Phases.BUG_RELAX,
    ],
    "Speech baseline": [
        dr.Phases.SPEECH_RELAX,
    ],
    "Bug all": [
        dr.Phases.BUG_RELAX,
        dr.Phases.BUG_ANTICIPATE,
        dr.Phases.BUG_EXPOSURE,
        dr.Phases.BUG_BREAK,
        dr.Phases.BUG_REFLECT,
    ],
    "Speech all": [
        dr.Phases.SPEECH_RELAX,
        dr.Phases.SPEECH_ANTICIPATE,
        dr.Phases.SPEECH_EXPOSURE,
        dr.Phases.SPEECH_BREAK,
        dr.Phases.SPEECH_REFLECT,
    ],
    "Bug pre-anxiety": [
        dr.Phases.BUG_RELAX,
        dr.Phases.BUG_ANTICIPATE,
    ],
    "Speech pre-anxiety": [
        dr.Phases.SPEECH_RELAX,
        dr.Phases.SPEECH_ANTICIPATE,
    ],
    "Bug anxiety": [
        dr.Phases.BUG_EXPOSURE,
    ],
    "Speech anxiety": [
        dr.Phases.SPEECH_EXPOSURE,
    ],
    "Bug post-anxiety": [
        dr.Phases.BUG_BREAK,
        dr.Phases.BUG_REFLECT,
    ],
    "Speech post-anxiety": [
        dr.Phases.SPEECH_BREAK,
        dr.Phases.SPEECH_REFLECT,
    ],
}

# Phase selections to evaluate. Each entry is one flat list of `dr.Phases`
# members built by concatenating the named groups in `phases`.
#
# Apart from the first group (which also contains the plain baseline), every
# group follows the same six-entry pattern:
#   bug, speech, bug + speech,
#   baseline + bug, baseline + speech, baseline + bug + speech
test_phases = [
    # Baseline / relax phases only.
    phases["Baseline"],
    phases["Bug baseline"],
    phases["Speech baseline"],
    phases["Bug baseline"] + phases["Speech baseline"],
    phases["Baseline"] + phases["Bug baseline"],
    phases["Baseline"] + phases["Speech baseline"],
    phases["Baseline"] + phases["Bug baseline"] + phases["Speech baseline"],

    # All phases of each task.
    phases["Bug all"],
    phases["Speech all"],
    phases["Bug all"] + phases["Speech all"],
    phases["Baseline"] + phases["Bug all"],
    phases["Baseline"] + phases["Speech all"],
    phases["Baseline"] + phases["Bug all"] + phases["Speech all"],

    # Pre-anxiety phases.
    phases["Bug pre-anxiety"],
    phases["Speech pre-anxiety"],
    phases["Bug pre-anxiety"] + phases["Speech pre-anxiety"],
    phases["Baseline"] + phases["Bug pre-anxiety"],
    phases["Baseline"] + phases["Speech pre-anxiety"],
    phases["Baseline"] + phases["Bug pre-anxiety"] + phases["Speech pre-anxiety"],

    # Anxiety phases.
    phases["Bug anxiety"],
    phases["Speech anxiety"],
    # Fixed: this entry previously combined "Bug pre-anxiety" with
    # "Speech anxiety", breaking the bug + speech pattern used by every
    # other group.
    phases["Bug anxiety"] + phases["Speech anxiety"],
    phases["Baseline"] + phases["Bug anxiety"],
    phases["Baseline"] + phases["Speech anxiety"],
    phases["Baseline"] + phases["Bug anxiety"] + phases["Speech anxiety"],

    # Post-anxiety phases.
    phases["Bug post-anxiety"],
    phases["Speech post-anxiety"],
    phases["Bug post-anxiety"] + phases["Speech post-anxiety"],
    phases["Baseline"] + phases["Bug post-anxiety"],
    phases["Baseline"] + phases["Speech post-anxiety"],
    phases["Baseline"] + phases["Bug post-anxiety"] + phases["Speech post-anxiety"],

    # Pre-anxiety + anxiety phases.
    phases["Bug pre-anxiety"] + phases["Bug anxiety"],
    phases["Speech pre-anxiety"] + phases["Speech anxiety"],
    phases["Bug pre-anxiety"] + phases["Bug anxiety"] + phases["Speech pre-anxiety"] + phases["Speech anxiety"],
    phases["Baseline"] + phases["Bug pre-anxiety"] + phases["Bug anxiety"],
    phases["Baseline"] + phases["Speech pre-anxiety"] + phases["Speech anxiety"],
    phases["Baseline"] + phases["Bug pre-anxiety"] + phases["Bug anxiety"] + phases["Speech pre-anxiety"] + phases["Speech anxiety"],

    # Post-anxiety + anxiety phases.
    phases["Bug post-anxiety"] + phases["Bug anxiety"],
    phases["Speech post-anxiety"] + phases["Speech anxiety"],
    phases["Bug post-anxiety"] + phases["Bug anxiety"] + phases["Speech post-anxiety"] + phases["Speech anxiety"],
    phases["Baseline"] + phases["Bug post-anxiety"] + phases["Bug anxiety"],
    phases["Baseline"] + phases["Speech post-anxiety"] + phases["Speech anxiety"],
    phases["Baseline"] + phases["Bug post-anxiety"] + phases["Bug anxiety"] + phases["Speech post-anxiety"] + phases["Speech anxiety"],

    # Pre-anxiety + post-anxiety phases.
    phases["Bug pre-anxiety"] + phases["Bug post-anxiety"],
    phases["Speech pre-anxiety"] + phases["Speech post-anxiety"],
    phases["Bug pre-anxiety"] + phases["Bug post-anxiety"] + phases["Speech pre-anxiety"] + phases["Speech post-anxiety"],
    phases["Baseline"] + phases["Bug pre-anxiety"] + phases["Bug post-anxiety"],
    phases["Baseline"] + phases["Speech pre-anxiety"] + phases["Speech post-anxiety"],
    phases["Baseline"] + phases["Bug pre-anxiety"] + phases["Bug post-anxiety"] + phases["Speech pre-anxiety"] + phases["Speech post-anxiety"],
]

In [2]:
# RANKING PHASES BY LOW TO HIGH ANXIETY
# Columns selected from the participant details table: the participant ID
# followed by the SUDS self-report columns. "BugBox_Exposure_SUDS" is not part
# of this selection.
SUDS_labels = [
    "Participant",
    "Baseline_SUDS",
    "BugBox_Relax_SUDS",
    "BugBox_Preparation_SUDS",
    "BugBox_Break_SUDS",
    "Speech_Relax_SUDS",
    "Speech_SUDS",
    "Speech_Exposure_SUDS",
    "Speech_Break_SUDS",
]

# Participant IDs of the "ha" group (presumably high anxiety - TODO confirm).
ha_participant_indices = [
    f"P{number}"
    for number in (
        4, 6, 7, 8, 10, 12, 15, 16, 18, 22, 26, 27, 29,
        31, 32, 33, 35, 42, 45, 47, 48, 49, 54, 55, 66, 69,
    )
]

# Participant IDs of the "la" group (presumably low anxiety - TODO confirm).
la_participant_indices = [
    f"P{number}"
    for number in (
        14, 21, 23, 25, 34, 39, 43, 46, 51, 57, 71, 72, 77,
        78, 79, 80, 82, 83, 84, 85, 87, 88, 89, 91, 92, 93,
    )
]

participant_file = os.path.join(dr.Paths.DATA_DIR, "participants_details.csv")
df = pd.read_csv(participant_file)

suds_df = df[SUDS_labels]


def _prepare_group_suds(participant_ids):
    """Return the SUDS rows of the given participants, ready for ranking.

    The "Participant" column is renamed to "subject" and its values are
    converted from strings such as "P12" to the integer 12. A column named
    'median' is appended holding the row-wise MEAN of the SUDS columns; the
    name is kept because downstream code looks the column up by that key.
    """
    group_df = suds_df.loc[suds_df['Participant'].isin(participant_ids)].copy()
    group_df = group_df.rename(columns={"Participant": "subject"})
    # Strip the leading "P" and parse the remainder as an integer ID.
    group_df["subject"] = group_df["subject"].str[1:].astype(int)
    # Column 0 is the subject ID, so average only the SUDS columns.
    group_df['median'] = group_df.iloc[:, 1:].mean(axis=1)
    return group_df


ha_suds_df = _prepare_group_suds(ha_participant_indices)
la_suds_df = _prepare_group_suds(la_participant_indices)

# Map every SUDS column name to its 0-based position among the SUDS columns
# (index in SUDS_labels minus one for the leading "Participant" entry).
# The first ("subject") and last ('median') columns keep their names.
columns = {c: SUDS_labels.index(c)-1 for c in ha_suds_df.columns[1:-1]}

ha_rankings = ha_suds_df.rename(columns=columns).reset_index(drop=True)
la_rankings = la_suds_df.rename(columns=columns).reset_index(drop=True)


In [28]:
# Phases considered for ranking. BUG_EXPOSURE is not included.
# NOTE(review): the order appears to parallel the SUDS columns in SUDS_labels
# (after "Participant") - confirm before reordering either list.
RANKING_PHASES = [
    dr.Phases.BASE_REST,
    dr.Phases.BUG_RELAX,
    dr.Phases.BUG_ANTICIPATE,
    dr.Phases.BUG_BREAK,
    dr.Phases.SPEECH_RELAX,
    dr.Phases.SPEECH_ANTICIPATE,
    dr.Phases.SPEECH_EXPOSURE,
    dr.Phases.SPEECH_BREAK,
]

# Total number of subjects (presumably the two participant groups combined).
NUM_SUBJECTS = 52
# Fraction of subjects held out for testing.
TEST_SIZE = 0.1

def _subject_means_from_csv(file):
    """Load one metric CSV and reduce it to a single mean value per row.

    The first CSV column is used as the index; of the remaining columns,
    column 0 holds the subject ID and the others hold the metric values.

    Returns:
        (ids, means): two 1-D arrays with one entry per row of the file.
    """
    arr = pd.read_csv(file, index_col=[0]).to_numpy()
    ids = arr[:, 0]
    # Work on a copy of the metric columns only. The subject-ID column must
    # not take part in the imputation mean, otherwise the ID leaks into the
    # feature (and fully replaces it for rows that are all NaN).
    values = arr[:, 1:].copy()

    # Replace every NaN by the mean of the non-NaN values in the same row.
    row_mean = np.nanmean(values, axis=1)
    nan_rows, nan_cols = np.where(np.isnan(values))
    values[nan_rows, nan_cols] = row_mean[nan_rows]
    # Rows without any valid value are still NaN here; map them to 0.
    values = np.nan_to_num(values)
    return ids, np.mean(values, axis=1)


def get_apd_data_ranking(metrics, phases, verbose=False, include_anxiety_labels=False):
    """Build the feature table and subject labels for the given metrics/phases.

    For every phase and metric the files "<metric>_<phase>_ha.csv" and
    "<metric>_<phase>_la.csv" are read from the "metrics" folder under
    `dr.Paths.DATA_DIR`, and each row is reduced to its mean metric value.

    Args:
        metrics: list of metric names; must not be empty.
        phases: list of phases. The position of a phase in this list is used
            both as the "phaseId" feature and as the column key into the
            rankings table.
        verbose: print progress messages when True.
        include_anxiety_labels: when True, add an "anxiety group" column
            (1 for rows from the "ha" files, 0 for rows from the "la" files).

    Returns:
        (data_x, data_y): `data_x` has the columns "subject",
        ["anxiety group",] "phaseId", one column per metric, and
        "anxiety level" (0 if the subject's rating for the phase is below the
        value in the rankings' 'median' column, else 1). `data_y` has the
        columns "subject" and "label", both holding the subject ID.

    Raises:
        ValueError: if `metrics` is empty.
    """
    if not metrics:
        raise ValueError("metrics must contain at least one metric name")

    metrics_folder = os.path.join(dr.Paths.DATA_DIR, "metrics")
    rankings = pd.concat([ha_rankings, la_rankings], axis=0).reset_index(drop=True)

    phase_frames = []
    for phase in phases:
        if verbose:
            print(f"Generating features for phase {phase} " + "-"*30)
        # NOTE(review): the ID is the position within `phases`, while the
        # rankings columns are keyed by position among the SUDS columns.
        # The two only agree when `phases` matches that order - confirm for
        # callers that pass a subset.
        phase_id = phases.index(phase)
        group_columns = {"ha": [], "la": []}

        for i, metric in enumerate(metrics):
            if verbose:
                print(f"Generating features for metric {metric}")
            for group, frames in group_columns.items():
                file = os.path.join(metrics_folder, f"{metric}_{phase}_{group}.csv")
                ids, means = _subject_means_from_csv(file)
                if i == 0:  # subject IDs are taken from the first metric's file
                    frames.append(pd.DataFrame({"subject": ids}))
                frames.append(pd.DataFrame({f"{metric}": means}))

        ha_features = pd.concat(group_columns["ha"], axis=1)
        la_features = pd.concat(group_columns["la"], axis=1)
        x = pd.concat([ha_features, la_features], axis=0)

        # Scalars / plain lists avoid depending on pandas index alignment
        # (the stacked frame has a repeated index).
        x.insert(1, "phaseId", phase_id)
        if include_anxiety_labels:
            group_flags = [1] * len(ha_features) + [0] * len(la_features)
            x.insert(1, "anxiety group", group_flags)

        phase_frames.append(x)

    data_x = pd.concat(phase_frames).reset_index(drop=True)

    # Label each row by comparing the subject's rating for that phase with
    # the subject's value in the 'median' column.
    labels = []
    for subject, phase_id in zip(data_x["subject"], data_x["phaseId"]):
        subject_row = rankings.loc[rankings["subject"] == int(subject)]
        rating = subject_row.loc[:, int(phase_id)].values[0]
        med = subject_row.loc[:, 'median'].values[0]
        labels.append(0 if rating < med else 1)  # 0: low anxiety, 1: high anxiety

    data_x["anxiety level"] = labels

    # The classification target is the subject identity itself.
    subjects = data_x.loc[:, "subject"]
    data_y = pd.DataFrame({"subject": subjects, "label": subjects})

    return data_x, data_y


def train_test_split(x, y, test_size=None):
    """Split `x` and `y` into train and test sets by holding out whole subjects.

    NOTE: this function shadows `sklearn.model_selection.train_test_split`,
    which is imported under the same name at the top of the file.

    Args:
        x: DataFrame with a "subject" column plus feature columns.
        y: DataFrame with a "subject" column (and the label column).
        test_size: fraction of the subjects present in `x` to hold out.
            Defaults to the module-level `TEST_SIZE`.

    Returns:
        (x_train, y_train, x_test, y_test, test_subjects). The "subject"
        column is dropped from `x_train` and `x_test`; `y_train` and `y_test`
        keep it. `test_subjects` is the list of held-out subject IDs.
    """
    if test_size is None:
        test_size = TEST_SIZE

    subjects = list(x.loc[:, "subject"].unique())
    # Derive the hold-out count from the subjects actually present instead of
    # a hard-coded total, so the fraction stays correct for any input.
    num_test = int(len(subjects) * test_size)
    test_subjects = random.sample(subjects, num_test)

    x_in_test = x["subject"].isin(test_subjects)
    y_in_test = y["subject"].isin(test_subjects)

    # The subject ID is only used for splitting; it must not be a feature.
    x_train = x[~x_in_test].drop(["subject"], axis=1)
    x_test = x[x_in_test].drop(["subject"], axis=1)
    y_train = y[~y_in_test]
    y_test = y[y_in_test]

    return x_train, y_train, x_test, y_test, test_subjects


def train_predict(models, x, y, show_classification=True):
    """
    Fit every model on one subject-wise train/test split and score it.

    models: dictionary of {"name": model}
    x, y: feature and label tables passed on to train_test_split
    show_classification: print a classification report per model when True

    Returns a dictionary of {"name": accuracy on the test split}.
    """
    x_train, y_train, x_test, y_test, test_subjects = train_test_split(x, y)
    y_true = y_test.loc[:, "label"]

    accuracies = {}
    for name, estimator in models.items():
        estimator.fit(x_train, y_train.loc[:, "label"])
        predictions = estimator.predict(x_test)
        if show_classification:
            print(f"Results for {name} -------------------------")
            print(classification_report(y_true, predictions))
        accuracies[name] = accuracy_score(y_true, predictions)
    return accuracies


In [29]:
# LOAD TRAIN AND TEST DATA

# Metrics used as features; commented-out entries are currently disabled.
metrics = [
    "bpm", 
    "rmssd", 
    # "hf_rr", 
    # "lf_rr", 
    # "ibi", 
    # "mean_SCL", 
    # "SCR_rate"
]

# Phases whose data is loaded. BUG_EXPOSURE is not included.
model_phases = [
    dr.Phases.BASE_REST,
    dr.Phases.BUG_RELAX, dr.Phases.BUG_ANTICIPATE, dr.Phases.BUG_BREAK,
    dr.Phases.SPEECH_RELAX, dr.Phases.SPEECH_ANTICIPATE, dr.Phases.SPEECH_EXPOSURE, dr.Phases.SPEECH_BREAK
]

# Allow a single phase to be given instead of a list.
if not isinstance(model_phases, list):
    model_phases = [model_phases]

x, y = get_apd_data_ranking(metrics, model_phases, verbose=False, include_anxiety_labels=True)

models = {
    "SVM": SVC(), 
    "KNN": KNeighborsClassifier()
}

# One accuracy list per model, keyed from `models` so the two cannot drift
# apart when a model is added or removed.
results = {model_name: [] for model_name in models}
num_iters = 10
# Each round draws a new random set of held-out subjects.
for _ in range(num_iters):
    out = train_predict(models, x, y, show_classification=False)
    for model_name in results:
        results[model_name].append(out[model_name])

for model_name in results:
    print(f"{model_name} accuracy over {num_iters} rounds: {np.mean(results[model_name])}")

31     22.0
35     31.0
37     33.0
43     48.0
48     66.0
83     22.0
87     31.0
89     33.0
95     48.0
100    66.0
135    22.0
139    31.0
141    33.0
147    48.0
152    66.0
187    22.0
191    31.0
193    33.0
199    48.0
204    66.0
239    22.0
243    31.0
245    33.0
251    48.0
256    66.0
291    22.0
295    31.0
297    33.0
303    48.0
308    66.0
343    22.0
347    31.0
349    33.0
355    48.0
360    66.0
395    22.0
399    31.0
401    33.0
407    48.0
412    66.0
Name: label, dtype: float64
[15. 25. 54. 12. 25.  7. 27. 54. 15. 46. 54. 25. 49. 82. 25.  7. 55. 54.
 10. 10. 15. 25.  6. 82. 92.  6. 21. 54. 82. 27. 27. 27. 54. 57. 92.  7.
 21.  7. 43. 46.]
31     22.0
35     31.0
37     33.0
43     48.0
48     66.0
83     22.0
87     31.0
89     33.0
95     48.0
100    66.0
135    22.0
139    31.0
141    33.0
147    48.0
152    66.0
187    22.0
191    31.0
193    33.0
199    48.0
204    66.0
239    22.0
243    31.0
245    33.0
251    48.0
256    66.0
291    22.0
295    31.0
297 