In [None]:
%load_ext lab_black
import os
import math
import pickle
import inspect
import itertools
from time import time
from copy import deepcopy

import pywt
import mne
import scipy
import numpy as np
import plotly.graph_objects as go
import plotly.express as px
import pandas as pd
import xxhash
import matplotlib
import matplotlib.cm as cm
from cachier import cachier
from plotly.subplots import make_subplots
from ipywidgets import Dropdown, FloatRangeSlider, IntSlider, FloatSlider, interact
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.decomposition import PCA
from sklearn.decomposition import FastICA
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import Lasso
from sklearn.linear_model import LinearRegression
from ipywidgets import HBox, VBox
from ipywidgets import Dropdown, FloatRangeSlider, IntSlider, FloatSlider, interact

from utils import *
from architecture import *

In [None]:
np.set_printoptions(precision=3, suppress=True)

# ignore FastICA did not converge warnings
# TODO investigate why doesn't it converge
import warnings

warnings.filterwarnings("ignore")

# Load data

#### The data is read into a DataFrame. Each epoch is a single record.

In [None]:
df_name = "go_nogo_df_personal"
pickled_data_filename = "../data/" + df_name + ".pkl"
info_filename = "../data/Demographic_Questionnaires_Behavioral_Results_N=163.csv"

# Reuse the cached DataFrame when it exists; otherwise build it from the raw
# data and cache it so that later runs can skip the expensive loading step.
if os.path.isfile(pickled_data_filename):
    print("Pickled file found. Loading pickled data...")
    # NOTE: unpickling can execute arbitrary code - only load pickles you created.
    epochs = pd.read_pickle(pickled_data_filename)
else:
    print("Pickled file not found. Loading data...")
    epochs = create_df_data(info_filename=info_filename, personal=True)
    # Save to exactly the path the existence check above looks at, instead of
    # rebuilding the path from a second source.
    epochs.to_pickle(pickled_data_filename)

# Tag the frame with its dataset name on BOTH branches, so the kernel state is
# the same whether the data came from the cache or was freshly built.
# (Ad-hoc DataFrame attributes presumably do not survive pickling - TODO confirm.)
epochs.name = df_name

# epochs

#### Sort participants by the number of errors, descending. This way the best participants (those with the most error responses) come first.

In [None]:
# For every participant, count the ERROR and the CORRECT responses and store
# both totals on each of that participant's epoch rows.
grouped = epochs.groupby("id")
markers_by_participant = grouped["marker"]

epochs["error_sum"] = markers_by_participant.transform(
    lambda markers: (markers.values == ERROR).sum()
)
epochs["correct_sum"] = markers_by_participant.transform(
    lambda markers: (markers.values == CORRECT).sum()
)

# Participants with the most errors go first; mergesort is stable, so the
# original row order is kept among rows with equal "error_sum".
epochs = epochs.sort_values(by="error_sum", kind="mergesort", ascending=False)
# epochs

#### Get metadata

In [None]:
# A single recording is loaded only to read metadata (time axis, channel info)
# from it.
_mne_epochs = load_epochs_from_file("../data/responses/GNG_AA0303-64 el.vhdr")
_channel_info = _mne_epochs.info["chs"]

times = _mne_epochs.times
channel_names = [ch["ch_name"] for ch in _channel_info]
# first three entries of each channel's "loc" vector
channel_locations = np.array([ch["loc"][:3] for ch in _channel_info])

# Map each channel's location to a colour: shift every axis to start at 0,
# scale it to [0, 1], stretch to 0..255 and round down to whole numbers.
channel_colors = channel_locations - channel_locations.min(axis=0)
channel_colors = np.floor(channel_colors / channel_colors.max(axis=0) * 255)
channel_colors = [f"rgb({r:.0f},{g:.0f},{b:.0f})" for r, g, b in channel_colors]

# log2 of the frequencies returned by get_frequencies(); used for plotting CWT
log_freq = np.log2(get_frequencies())

# Train and test

In [None]:
# Directory passed as `memory=` to the pipelines. It is resolved relative to
# the current user's home directory instead of a hard-coded absolute path, so
# the notebook also runs on other machines and accounts.
cachedir = os.path.join(os.path.expanduser("~"), ".erpinator_cache")

steps = steps_simple  # one PCA for all

# steps = steps_parallel_pca
# steps.pop(3)  # remove CWT

# Alternative final estimators that were tried:
# StandardScaler doesn't seem to change anything for LDA
# steps = steps[:-2] + [("lasso", Lasso())]
# steps = steps[:-2] + [("lda", LinearDiscriminantAnalysis())]
# steps = steps[:-1] + [("knr", KNeighborsRegressor())]

# Swap the last step for Lasso. Slicing + concatenation builds a NEW list, so
# the item assignment below does not modify the imported `steps_simple` list.
steps = steps[:-1] + [("lasso", Lasso())]

steps[1] = ("spatial_filter", PCA(random_state=0))  # replace ICA with PCA

# Parameter grid in `<step name>__<parameter>` form; each entry holds a single
# value, so the grid describes exactly one configuration.
regressor_params = dict(
    spatial_filter__n_components=[4],
    #     cwt__mwt=["morl"],
    #     cwt__octaves=[4],
    pca__n_components=[8],
    # featurize__power__cwt__mwt=["cmor0.5-1"],
    # featurize__power__pca__n_components=[3],
    # featurize__shape__cwt__mwt=["mexh"],
    # featurize__shape__pca__n_components=[3],
    #     svr__C=[0.1],
    #     knr__n_neighbors=[11],
    lasso__alpha=[3e-7],
    # lda__solver=["lsqr"],  # to turn off scaling, to simplify visualizing
)
steps

### Separate model for each person

In [None]:
%%time


print("participant            AUROC   err/corr")
aurocs = []
auroc_sems = []
pipelines = []

# group data by participants' ids
grouped = epochs.groupby(["id"])
for participant_id in epochs["id"].unique():
    participant_df = grouped.get_group(participant_id)

    X = np.array(participant_df["epoch"].to_list())
    y = np.array(participant_df["marker"].to_list())

    pipeline = Pipeline(deepcopy(steps), memory=cachedir)
    pipeline.set_params(**ParameterGrid(regressor_params)[0])

    aurocs_personal = []
    pipelines_personal = []
    skf = StratifiedKFold(n_splits=2)
    for train_index, test_index in skf.split(X, y):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        pipeline.fit(X_train, y_train)

        if type(steps[-1][1]) == LinearDiscriminantAnalysis:
            y_pred = pipeline.predict_proba(X_test)[:, 1]
        else:
            y_pred = pipeline.predict(X_test)
        # corr = np.corrcoef(y_test, y_pred)[0][1]
        # r2 = r2_score(y_test, y_pred)
        auroc = roc_auc_score(y_test, y_pred)
        aurocs_personal.append(auroc)
        pipelines_personal.append(pipeline)

    aurocs.append(np.mean(aurocs_personal))
    auroc_sems.append(scipy.stats.sem(aurocs_personal))
    pipelines.append(pipelines_personal)

    error_size = participant_df["error_sum"].iloc[0]
    correct_size = participant_df["correct_sum"].iloc[0]
    print(
        f"{participant_id:11}    "
        f"{aurocs[-1]:.3f} ± {auroc_sems[-1]:.3f}    "
        f"{error_size:3}/{correct_size:3}"
    )

total_sem = sum(np.array(auroc_sems) ** 2) ** (1 / 2) / len(auroc_sems)
mean_auroc = f"{np.mean(aurocs):.3f} ± {total_sem:.3f}"
print("mean AUROC: " + mean_auroc)