In [None]:
%load_ext lab_black
import os
import pickle
from time import time
import pywt
import mne
import scipy
import scipy.stats
import numpy as np
import plotly.graph_objects as go
import plotly.express as px
import pandas as pd
import cesium.featurize
from time import sleep
from random import shuffle
from plotly.subplots import make_subplots
from ipywidgets import Dropdown, FloatRangeSlider, IntSlider, FloatSlider, interact

from sklearn.base import TransformerMixin, BaseEstimator
from sklearn.decomposition import FastICA
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.dummy import DummyRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import Lasso
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.model_selection import LeaveOneOut
from sklearn.model_selection import ParameterGrid
from sklearn.model_selection import ParameterSampler
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import FeatureUnion
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor


from utils import *
from architecture import *

In [None]:
# ignore np.corrcoef RuntimeWarnings
import warnings

warnings.filterwarnings("ignore")

In [None]:
# from IPython.display import HTML

# display(HTML('<span style="color: #ff0000">red</span>'))

In [None]:
# # dummy regressor for baseline
# dummy_regr = DummyRegressor(strategy="mean")

# scores = []
# kf = KFold(n_splits=5)
# for train_index, test_index in kf.split(X, y):
#     X_train, X_test = X[train_index], X[test_index]
#     y_train, y_test = y[train_index], y[test_index]

#     dummy_regr.fit(X_train, y_train)
#     y_pred = dummy_regr.predict(X_test)

#     corr = np.corrcoef(y_test, y_pred)[0][1]
#     r2 = r2_score(y_test, y_pred)

#     scores.append([corr, r2])

# # print scores
# means = np.mean(scores, axis=0)
# sems = scipy.stats.sem(scores, axis=0)
# for mean, sem in zip(means, sems):
#     print(f"{mean:5.2f}±{sem:4.2f}", end="   ")

In [None]:
# %%time

# grid_search = GridSearchCV(
#     pipeline,
#     regressor_params,
#     cv=5,
#     scoring={"r2": "r2"},
#     refit=False,
#     n_jobs=1,
#     verbose=3,
# )
# grid_search.fit(X, y)

# # predictions = grid_search.predict(X_test)
# # r2 = grid_search.score(X_test, y_test)
# None

# X_train, X_test, y_train, y_test = train_test_split(
#     X, y, test_size=0.2, random_state=0, shuffle=False
# )

### Loading data

Load the EEG data together with the data from the rumination questionnaire. By default `create_df_data` loads all information from the given file, but the selection can be narrowed by passing a list of the desired labels from the CSV file.

In [None]:
# Name of the cached DataFrame and the locations of the cache / questionnaire files.
df_name = "go_nogo_df_non_personal"
pickled_data_filename = "../data/" + df_name + ".pkl"
info_filename = "../data/Demographic_Questionnaires_Behavioral_Results_N=163.csv"

# The DataFrame is built once and cached as a pickle; later runs reuse the cache.
if os.path.isfile(pickled_data_filename):
    print("Pickled file found. Loading pickled data...")
    # NOTE: unpickling can execute arbitrary code - only load cache files
    # that were created locally by this notebook.
    epochs_df = pd.read_pickle(pickled_data_filename)
    print("Done")
else:
    print("Pickled file not found. Loading data...")
    epochs_df = create_df_data(info_filename=info_filename, personal=False)
    # Save to exactly the path that is checked above, so the cache is found on
    # the next run. (The previous ad-hoc `epochs_df.name` attribute was only
    # used to rebuild this same path and did not exist on the cached branch.)
    epochs_df.to_pickle(pickled_data_filename)
    print("Done. Pickle file created")

# Data is now read into a dataframe and each epoch is a single record.
# Annotate every record with the number of error and correct responses of the
# participant ("id") it belongs to. The boolean marker comparison is summed
# per participant and broadcast back onto that participant's rows.
participant_of_epoch = epochs_df["id"]
epochs_df["error_sum"] = (
    epochs_df["marker"].eq(ERROR).groupby(participant_of_epoch).transform("sum")
)
epochs_df["correct_sum"] = (
    epochs_df["marker"].eq(CORRECT).groupby(participant_of_epoch).transform("sum")
)

# Sort participants by their number of error responses, descending, so the
# participants with the most error epochs come first. Mergesort is a stable
# sort: records with equal error counts keep their relative order.
epochs_df = epochs_df.sort_values("error_sum", ascending=False, kind="mergesort")

## Training and predictions

In [None]:
# X_test, y_test = X_train, y_train
def custom_gridsearch(steps, cv, regressor_params, memory, X=None, y=None):
    """Evaluate a pipeline on every combination of a parameter grid and print the scores.

    For each parameter combination the pipeline is re-fitted on every K-fold
    training split and evaluated on the matching held-out split. Two scores
    are collected per fold - the correlation coefficient (``np.corrcoef``) and
    R^2 between the held-out targets and the predictions - and printed as
    ``mean±SEM`` over the folds, one line per parameter combination.

    Parameters
    ----------
    steps : list
        Pipeline steps, passed straight to ``Pipeline``.
    cv : int
        Number of folds handed to ``KFold(n_splits=cv)``.
    regressor_params : dict
        Parameter grid in ``ParameterGrid`` format (name -> list of values).
    memory : str or None
        Passed to ``Pipeline(memory=...)``.
    X, y : array-like, optional
        Samples and targets; must support indexing with the integer index
        arrays produced by ``KFold.split``. When omitted, the notebook-level
        variables ``X`` and ``y`` are used, which keeps existing calls working.

    Returns
    -------
    None
        Results are only printed.

    Raises
    ------
    NameError
        If ``X`` / ``y`` are neither passed nor defined at notebook level.
    """
    # Backward compatibility: older calls relied on the notebook globals X and y.
    if X is None:
        X = globals().get("X")
    if y is None:
        y = globals().get("y")
    if X is None or y is None:
        raise NameError(
            "custom_gridsearch needs X and y, either as arguments or as "
            "notebook-level variables"
        )

    pipeline = Pipeline(steps, memory=memory)
    # Header; the column names are right-aligned over the score columns below.
    print(" " * 133 + "corr           r2")

    # The splitter does not depend on the parameters, so build it once.
    kf = KFold(n_splits=cv)

    # Combinations are visited in ParameterGrid order (no random shuffling).
    for params in ParameterGrid(regressor_params):
        pipeline.set_params(**params)

        scores = []
        for train_index, test_index in kf.split(X, y):
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]

            pipeline.fit(X_train, y_train)
            y_pred = pipeline.predict(X_test)

            # Off-diagonal entry of the 2x2 correlation matrix.
            corr = np.corrcoef(y_test, y_pred)[0][1]
            r2 = r2_score(y_test, y_pred)
            scores.append([corr, r2])

        # One output line per combination: params, then mean±SEM of corr and r2.
        print(f"{str(params):126}", end=" ")
        means = np.mean(scores, axis=0)
        sems = scipy.stats.sem(scores, axis=0)
        for mean, sem in zip(means, sems):
            print(f"{mean:5.2f}±{sem:4.2f}", end="   ")
        print()

In [None]:
# Samples and targets built from the records marked ERROR: the first axis of X
# indexes epochs, y holds the "Rumination Full Scale" value of the same records.
error_rows = epochs_df.loc[epochs_df["marker"] == ERROR]
X = np.array(error_rows["epoch"].to_list())
y = np.array(error_rows["Rumination Full Scale"].to_list())

In [None]:
# Cache directory handed to Pipeline(memory=...). Resolved against the current
# user's home directory instead of one specific machine's absolute path.
cachedir = os.path.join(os.path.expanduser("~"), ".erpinator_cache")

steps = steps_peaks_and_power_and_shape
# steps = steps[:-1] + [("knr", KNeighborsRegressor())]

# Parameter grid in ParameterGrid format: pipeline parameter name -> candidate values.
regressor_params = dict(
    spatial_filter__n_components=[6],
    #     cwt__mwt=["mexh"],
    #     pca__n_components=[3],
    featurize__power__cwt__mwt=["cmor0.5-1"],
    featurize__power__pca__n_components=[3],
    featurize__shape__pca__n_components=[3],
    svr__C=[0.05],
    #     knr__n_neighbors=[25],
)
print(regressor_params)
# Last expression of the cell: display the pipeline steps.
steps

In [None]:
# %%time

# custom_gridsearch(steps, cv=5, regressor_params=regressor_params, memory=cachedir)

### Train the model on split participants, and then predict rumination for each of their epochs

In [None]:
# NOTE(review): despite its name, `error_epochs` holds the epochs marked
# CORRECT, whereas X / y for the grid search are built from ERROR epochs.
# Confirm which marker is intended; the selection is kept exactly as it was.
error_epochs = epochs_df[epochs_df["marker"] == CORRECT]

# Group by a scalar key (not a one-element list) so that get_group() accepts a
# plain participant id on every pandas version.
grouped = error_epochs.groupby("id")
participant_ids = error_epochs["id"].unique()

# Only the first combination of the parameter grid is used; resolve it once.
pipeline_params = ParameterGrid(regressor_params)[0]

# One entry per held-out participant: [per-epoch predictions, true score].
personal_scores = []
# kf = KFold(n_splits=len(participant_ids))
kf = KFold(n_splits=5)
# Folds are formed over participant ids rather than over epochs, so a
# participant's epochs are never on both the training and the test side.
for train_index, test_index in kf.split(participant_ids):
    p_train, p_test = participant_ids[train_index], participant_ids[test_index]

    train_epochs = error_epochs[error_epochs["id"].isin(p_train)]

    X_train = np.array(train_epochs["epoch"].to_list())
    y_train = np.array(train_epochs["Rumination Full Scale"].to_list())

    pipeline = Pipeline(steps, memory=cachedir)
    pipeline.set_params(**pipeline_params)

    pipeline.fit(X_train, y_train)

    # Predict every epoch of each held-out participant separately.
    for participant_id in p_test:
        participant_df = grouped.get_group(participant_id)
        X_test = np.array(participant_df["epoch"].to_list())
        y_test = np.array(participant_df["Rumination Full Scale"].to_list())

        y_pred = pipeline.predict(X_test)
        # y_test[0] stands in for the participant's score - presumably the
        # questionnaire value is identical on all of a participant's records.
        print(y_pred.mean(), np.median(y_pred), y_test[0])
        personal_scores.append([y_pred, y_test[0]])

In [None]:
# Collapse each participant's per-epoch predictions into a mean and a median,
# and keep the matching true score alongside (same order in all three lists).
means, medians, trues = [], [], []
for participant_preds, participant_true in personal_scores:
    means.append(participant_preds.mean())
    medians.append(np.median(participant_preds))
    trues.append(participant_true)

In [None]:
# Flatten to epoch level: one entry per single-epoch prediction, paired with
# the true score of the participant the epoch belongs to.
all_preds = []
all_trues = []
for participant_preds, participant_true in personal_scores:
    all_preds.extend(participant_preds)
    all_trues.extend([participant_true] * len(participant_preds))

In [None]:
# Correlation between every single-epoch prediction and its participant's true score.
np.corrcoef(all_preds, all_trues)[0, 1]

In [None]:
# Correlation between the per-participant mean prediction and the true score.
np.corrcoef(means, trues)[0, 1]

In [None]:
# Correlation between the per-participant median prediction and the true score.
np.corrcoef(medians, trues)[0, 1]

In [None]:
# R^2 of the single-epoch predictions against the true scores.
r2_score(y_true=all_trues, y_pred=all_preds)

In [None]:
# R^2 of the per-participant mean predictions against the true scores.
r2_score(y_true=trues, y_pred=means)

In [None]:
# R^2 of the per-participant median predictions against the true scores.
r2_score(y_true=trues, y_pred=medians)

In [None]:
# One marker per participant: true score (x) against the median of that
# participant's per-epoch predictions (y). Axis titles are set explicitly so
# the figure can be read on its own.
fig = go.FigureWidget(layout=base_layout)
fig.update_layout(
    height=600,
    width=600,
    xaxis_title="True rumination score",
    yaxis_title="Median predicted rumination score",
)
fig.add_scatter(x=trues, y=medians, mode="markers")

In [None]:
# fig = go.FigureWidget(layout=base_layout)
# fig.update_layout(height=600, width=600)
# fig.add_scatter(x=all_trues, y=all_preds, mode="markers")

In [None]:
# Order participants by their true score (ascending) and plot every
# single-epoch prediction against the participant's position in that order.
sorted_scores = sorted(personal_scores, key=lambda pair: pair[1])
preds_sorted = []
# NOTE: despite its name, this list holds the participant's rank in the
# sorted order (0 = lowest true score), not the true score itself.
trues_sorted = []
for rank, (participant_preds, participant_true) in enumerate(sorted_scores):
    preds_sorted.extend(participant_preds)
    trues_sorted.extend([rank] * len(participant_preds))

fig = go.FigureWidget(layout=base_layout)
fig.update_layout(
    height=600,
    width=600,
    xaxis_title="Participant rank (ascending true rumination score)",
    yaxis_title="Predicted rumination score (single epoch)",
)
fig.add_scatter(x=trues_sorted, y=preds_sorted, mode="markers")