In [None]:
%load_ext lab_black
import os
import pickle
import inspect
import itertools
from time import time

import pywt
import mne
import scipy
import numpy as np
import plotly.graph_objects as go
import plotly.express as px
import pandas as pd
import xxhash
from cachier import cachier
from plotly.subplots import make_subplots
from ipywidgets import Dropdown, FloatRangeSlider, IntSlider, FloatSlider, interact
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.decomposition import PCA
from sklearn.decomposition import FastICA

from utils import *
from architecture import *

In [None]:
np.set_printoptions(precision=3)

# ignore FastICA did not converge warnings
# TODO investigate why doesn't it converge
import warnings

warnings.filterwarnings("ignore")

# Load data

#### Data is read into a DataFrame. Each epoch is a single record.

In [None]:
df_name = "go_nogo_df"
pickled_data_filename = "../data/" + df_name + ".pkl"
info_filename = "../data/Demographic_Questionnaires_Behavioral_Results_N=163.csv"

# Build the DataFrame from the raw data only when no pickled copy exists yet;
# otherwise reuse the pickled copy.
if not os.path.isfile(pickled_data_filename):
    print("Pickled file not found. Loading data...")
    epochs = create_df_data(info_filename=info_filename)
    epochs.name = df_name
    # keep a pickled copy, so that the next run can skip the loading above
    epochs.to_pickle("../data/" + epochs.name + ".pkl")
else:
    print("Pickled file found. Loading pickled data...")
    epochs = pd.read_pickle(pickled_data_filename)

# epochs

#### Sort participants by the number of errors, descending. This way the best participants (those with the most error trials) come first.

In [None]:
# For every epoch, store how many ERROR and how many CORRECT responses
# its participant made in total.
grouped = epochs.groupby("id")
epochs["error_sum"] = grouped[["marker"]].transform(
    lambda markers: (markers.values == ERROR).sum()
)
epochs["correct_sum"] = grouped[["marker"]].transform(
    lambda markers: (markers.values == CORRECT).sum()
)

# mergesort is stable, so rows with equal error counts keep their relative order
epochs = epochs.sort_values(by="error_sum", ascending=False, kind="mergesort")
# epochs

#### Get metadata

In [None]:
_mne_epochs = load_epochs_from_file("../data/responses/GNG_AA0303-64 el.vhdr")
times = _mne_epochs.times

_channel_info = _mne_epochs.info["chs"]
channel_locations = np.array([channel["loc"][:3] for channel in _channel_info])
channel_names = [channel["ch_name"] for channel in _channel_info]

# Turn every channel location into an RGB colour: shift each axis to start at 0,
# divide by its maximum, then stretch to whole numbers between 0 and 255.
channel_colors = channel_locations - channel_locations.min(axis=0)
channel_colors /= channel_colors.max(axis=0)
channel_colors = channel_colors * 255 // 1
channel_colors = [
    f"rgb({red:.0f},{green:.0f},{blue:.0f})" for red, green, blue in channel_colors
]

log_freq = np.log2(get_frequencies())  # for plotting CWT

# Train and test

In [None]:
# Directory for the pipeline cache; it lives under the current user's home
# directory, so the notebook does not depend on one developer's absolute path.
cachedir = os.path.join(os.path.expanduser("~"), ".erpinator_cache")

# Take the predefined steps and swap the last one for the estimator under test.
steps = steps_parallel_pca
# steps = steps[:-1] + [("lda", LinearDiscriminantAnalysis())]
# steps = steps[:-1] + [("knr", KNeighborsRegressor())]
steps = steps[:-1] + [("lasso", Lasso())]

# Parameter grid; keys follow the `<step name>__<parameter>` convention.
regressor_params = dict(
    ica__n_components=[3],
    cwt__mwt=["mexh"],
    pca__n_components=[3],
    # featurize__power__cwt__mwt=["cmor0.5-1"],
    # featurize__power__pca__n_components=[3],
    # featurize__shape__cwt__mwt=["mexh"],
    # featurize__shape__pca__n_components=[3],
    #     svr__C=[0.1],
    #     knr__n_neighbors=[11],
    lasso__alpha=[0.2, 0.5, 1],
)
steps

### Separate model for each person

In [None]:
%%time

pipeline = Pipeline(steps, memory=cachedir)
# only the first parameter combination of the grid is evaluated here
pipeline.set_params(**ParameterGrid(regressor_params)[0])

print("participant            AUROC   err/corr")
aurocs = []
auroc_sems = []

# LDA is scored on class probabilities, any other final step on raw predictions
use_proba = isinstance(steps[-1][1], LinearDiscriminantAnalysis)
skf = StratifiedKFold(n_splits=5)

# Group data by participants' ids. A scalar key (instead of the list ["id"])
# keeps the group names scalar, and sort=False keeps the participants in the
# order in which they appear in `epochs`.
grouped = epochs.groupby("id", sort=False)
for participant_id, participant_df in grouped:
    X = np.array(participant_df["epoch"].to_list())
    y = np.array(participant_df["marker"].to_list())

    # cross-validated AUROC of a model trained on this participant only
    aurocs_personal = []
    for train_index, test_index in skf.split(X, y):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        pipeline.fit(X_train, y_train)

        if use_proba:
            y_pred = pipeline.predict_proba(X_test)[:, 1]
        else:
            y_pred = pipeline.predict(X_test)
        # corr = np.corrcoef(y_test, y_pred)[0][1]
        # r2 = r2_score(y_test, y_pred)
        auroc = roc_auc_score(y_test, y_pred)
        aurocs_personal.append(auroc)

    aurocs.append(np.mean(aurocs_personal))
    auroc_sems.append(scipy.stats.sem(aurocs_personal))

    # both sums are constant within a participant, so the first row is enough
    error_size = participant_df["error_sum"].iloc[0]
    correct_size = participant_df["correct_sum"].iloc[0]
    print(
        f"{participant_id:11}    "
        f"{aurocs[-1]:.3f} ± {auroc_sems[-1]:.3f}    "
        f"{error_size:3}/{correct_size:3}"
    )

# combine the per-participant SEMs: sqrt(sum of squared SEMs) / number of participants
total_sem = sum(np.array(auroc_sems) ** 2) ** (1 / 2) / len(auroc_sems)
mean_auroc = f"{np.mean(aurocs):.3f} ± {total_sem:.3f}"
print("mean AUROC: " + mean_auroc)

### One model for all people

In [None]:
def custom_gridsearch(steps, cv, regressor_params, memory, X=None, y=None):
    """Cross-validate a pipeline for every parameter combination and print the scores.

    For each combination the pipeline is fitted on every KFold split and scored
    with the correlation coefficient, r2 and AUROC. The score of each split is
    printed as it is computed, followed by one summary line (mean±SEM of each
    score) per parameter combination.

    Parameters
    ----------
    steps : list of (name, estimator) tuples
        Steps passed to ``Pipeline``. If the last estimator is a
        ``LinearDiscriminantAnalysis``, it is scored on ``predict_proba``,
        otherwise on ``predict``.
    cv : int
        Number of KFold splits.
    regressor_params : dict
        Parameter grid, expanded with ``ParameterGrid``.
    memory : str or None
        Passed to ``Pipeline(memory=...)``.
    X, y : indexable arrays, optional
        Data and targets. When omitted, the notebook-level variables ``X`` and
        ``y`` are used, which is what this function always did before. Pass them
        explicitly to be independent of cells that reassign ``X`` and ``y``.

    Raises
    ------
    KeyError
        If ``X`` or ``y`` is omitted and no notebook-level variable exists.
    """
    if X is None:
        X = globals()["X"]
    if y is None:
        y = globals()["y"]

    pipeline = Pipeline(steps, memory=memory)
    # One right-aligned title per score column of the summary lines below
    # (127 characters of parameters, then 10-character cells 3 spaces apart).
    print(f"{'':127}{'corr':>10}   {'r2':>10}   {'auroc':>10}")

    # parameter combinations are evaluated in grid order (shuffling is disabled)
    all_params = list(ParameterGrid(regressor_params))
    # shuffle(all_params)

    use_proba = isinstance(steps[-1][1], LinearDiscriminantAnalysis)

    for params in all_params:
        pipeline.set_params(**params)

        scores = []
        kf = KFold(n_splits=cv)
        for train_index, test_index in kf.split(X, y):
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]

            pipeline.fit(X_train, y_train)
            if use_proba:
                y_pred = pipeline.predict_proba(X_test)[:, 1]
            else:
                y_pred = pipeline.predict(X_test)
            corr = np.corrcoef(y_test, y_pred)[0][1]
            r2 = r2_score(y_test, y_pred)
            auroc = roc_auc_score(y_test, y_pred)  # it's different in classification!

            scores.append([corr, r2, auroc])
            print(corr, r2, auroc)

        # print scores
        print(f"{str(params):126}", end=" ")
        means = np.mean(scores, axis=0)
        sems = scipy.stats.sem(scores, axis=0)
        for mean, sem in zip(means, sems):
            print(f"{mean:5.2f}±{sem:4.2f}", end="   ")
        print()

In [None]:
# epochs and markers of all rows of `epochs`, as arrays
X = np.array(epochs["epoch"].tolist())
y = np.array(epochs["marker"].tolist())

In [None]:
%%time

# 5-fold search over `regressor_params`, without caching of the fitted steps
custom_gridsearch(steps=steps, cv=5, regressor_params=regressor_params, memory=None)

# Testing ICA stability

In [None]:
def correlations(a0, a1):
    """Find correlation matrix between 2 matrices.
    It's similar to np.corrcoef, but it doesn't subtract the mean,
    when calculating the sum of squares.

    Parameters
    ----------
    a0, a1 : array_like
        2-D arrays (or nested sequences) containing multiple variables and
        observations. Each row represents a variable, and each column a single
        observation of all those variables.
        Their number of columns must be equal.

    Returns
    -------
    numpy.ndarray
        Array of shape (rows of a0, rows of a1). Element [i, j] is the dot
        product of row i of a0 and row j of a1, divided by the product of
        their Euclidean norms. A row consisting only of zeros has norm 0,
        so its entries are nan.
    """
    # accept anything array_like, as documented (lists have no `.T` / `@`)
    a0 = np.asarray(a0, dtype=float)
    a1 = np.asarray(a1, dtype=float)

    cross_products = a0 @ a1.T
    norms0 = np.sqrt(np.sum(a0 * a0, axis=1)).reshape(-1, 1)
    norms1 = np.sqrt(np.sum(a1 * a1, axis=1)).reshape(1, -1)
    return cross_products / (norms0 * norms1)


def factor_similarity(a0, a1):
    """Score how closely the factors in a0 match the factors in a1.

    For every factor (row) of a1, the largest absolute value of its
    `correlations` with the factors (rows) of a0 is taken; the result
    is the mean of those values. Flipping the sign of a factor,
    rescaling it, or reordering the factors doesn't change the score.
    """
    magnitudes = np.abs(correlations(a0, a1))  # sign flips are irrelevant
    best_match_per_factor = magnitudes.max(axis=0)  # so is the order of factors
    return best_match_per_factor.mean()


def show_spatial_filters(filters, coefs):
    """Draw every spatial filter as a scalp map, one subplot per filter.

    Each channel is drawn as a marker at its (x, y) position, taken from the
    notebook-level ``channel_locations``, and coloured by the filter's value
    for that channel. Hovering shows the name from ``channel_names``.

    Parameters
    ----------
    filters : sequence of numpy.ndarray
        One array per filter, used as the marker colours
        (presumably one weight per channel — TODO confirm against `train`).
    coefs : iterable of float
        One number per filter, shown with 2 decimals as the subplot title.

    Returns
    -------
    plotly.graph_objects.FigureWidget
        Figure with ``len(filters)`` subplots in a single row.
    """
    # all interpolation methods in mne.viz.plot_topomap
    # give strange artifacts for some reason, so use this instead
    x, y, _ = channel_locations.T
    titles = [f"{coef:.2f}" for coef in coefs]

    scalp = go.FigureWidget(make_subplots(cols=len(filters), subplot_titles=titles))
    scalp.update_layout(**base_layout)
    scalp.update_layout(width=200 * len(filters), height=200)
    scalp.update_xaxes(showgrid=False)
    scalp.update_yaxes(showgrid=False)

    for i, filter_ in enumerate(filters):
        scalp.add_scatter(
            x=x,
            y=y,
            row=1,
            col=i + 1,
            mode="markers",
            #         mode="markers+text",
            text=channel_names,
            marker_size=15,
            marker_color=-filter_,  # negate, so that red is positive
            marker_colorscale="RdBu",
            # Centre the diverging scale on 0. Otherwise the colours span
            # min..max of each filter and a filter whose weights all share
            # one sign would still be drawn with both red and blue markers.
            marker_cmid=0,
        )
    return scalp

### Will ICA find the same factors for one person every time?

In [None]:
id_ = epochs["id"].unique()[7]
# Group by a scalar key: with the length-1 list key ["id"], recent pandas
# versions expect get_group((id_,)) instead of get_group(id_).
grouped = epochs.groupby("id")
participant_df = grouped.get_group(id_)
X = np.array(participant_df["epoch"].to_list())
y = np.array(participant_df["marker"].to_list())

# one classifier and one set of spatial filters per split
clfs = []
spatial_filters = []

skf = StratifiedKFold(n_splits=4)
# skf = StratifiedKFold(n_splits=4, shuffle=True, random_state=0)
for train_index, test_index in skf.split(X, y):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # use test sets, because they don't overlap
    # so are better to test stability
    params, clf = train(X_test, y_test, wv_weighting="single")

    # the first element of every item of `params` is a spatial filter
    single_split_spatial_filters = np.array([filt for filt, _ in params])
    spatial_filters.append(single_split_spatial_filters)
    clfs.append(clf)

In [None]:
print("correlations between factors found in the first, and the second split")
first_split_filters, second_split_filters = spatial_filters[0], spatial_filters[1]
correlations(first_split_filters, second_split_filters)

In [None]:
print(
    "similarity measures between factors found in each pair of splits, for a single participant"
)
# row j, column i holds factor_similarity(filters of split i, filters of split j)
similarity_rows = []
for sf_j in spatial_filters:
    similarity_rows.append(
        [factor_similarity(sf_i, sf_j) for sf_i in spatial_filters]
    )
similarities = np.array(similarity_rows)
print(similarities)
print("mean", similarities.mean())

In [None]:
# for clf in clfs:
#     print(clf.coef_, clf.intercept_)

In [None]:
# One figure per split. Iterate over the collected results instead of a
# hard-coded count, so this cell stays in sync with the number of splits.
for split_filters, split_clf in zip(spatial_filters, clfs):
    display(show_spatial_filters(split_filters, split_clf.coef_[0]))

### Will ICA find similar factors for different people?

In [None]:
# one classifier and one set of spatial filters per participant
clfs = []
spatial_filters = []

# Group by a scalar key: with the length-1 list key ["id"], recent pandas
# versions expect get_group((participant_id,)) instead of get_group(participant_id).
grouped = epochs.groupby("id")
for participant_id in epochs["id"].unique()[:8]:
    participant_df = grouped.get_group(participant_id)

    X = np.array(participant_df["epoch"].to_list())
    y = np.array(participant_df["marker"].to_list())

    # train
    params, clf = train(X, y, wv_weighting="single")

    # the first element of every item of `params` is a spatial filter
    one_participant_spatial_filters = np.array([filt for filt, _ in params])
    spatial_filters.append(one_participant_spatial_filters)
    clfs.append(clf)

In [None]:
print("correlations between factors found for the first, and the second participant")
first_participant_filters, second_participant_filters = (
    spatial_filters[0],
    spatial_filters[1],
)
correlations(first_participant_filters, second_participant_filters)

In [None]:
print("similarity measures between factors found for each pair of participants")
# row j, column i holds factor_similarity(filters of participant i, filters of participant j)
similarity_rows = []
for sf_j in spatial_filters:
    similarity_rows.append(
        [factor_similarity(sf_i, sf_j) for sf_i in spatial_filters]
    )
np.array(similarity_rows)

In [None]:
# show the spatial filters of the first four participants only
for participant in range(4):
    participant_filters = spatial_filters[participant]
    participant_coefs = clfs[participant].coef_[0]
    display(show_spatial_filters(participant_filters, participant_coefs))

In [None]:
# # mne plotting for comparison
# x, y, z = channel_locations.T
# mne.viz.plot_topomap(
#     spatial_filters[participant][2], np.stack((x, y), axis=-1)
# )

In [None]:
# factors of the second participant against factors of the eighth participant
second_participant_filters, eighth_participant_filters = (
    spatial_filters[1],
    spatial_filters[7],
)
corr = correlations(second_participant_filters, eighth_participant_filters)
corr

In [None]:
# try to find corresponding components
# Brute force: reorder the rows of `corr` in every possible way and keep the
# ordering with the largest mean absolute value on the diagonal.
n_components = corr.shape[0]  # one row per component of the first set of factors
best_similarity = 0
# Fall back to the original order, so that `best_perm` is always defined,
# even if no permutation scores above 0 (e.g. all correlations are 0 or nan).
best_perm = list(range(n_components))
for perm in itertools.permutations(range(n_components)):
    perm = list(perm)
    diag = corr[perm].diagonal()
    similarity = abs(diag).mean()
    if similarity > best_similarity:
        best_similarity = similarity
        best_perm = perm

print(best_similarity)
print(best_perm)
corr[best_perm]