# Rumination prediction

### Imports

In [None]:
%load_ext lab_black
import os
import pickle
from time import time
import pywt
import mne
import scipy
import numpy as np
import plotly.graph_objects as go
import plotly.express as px
import pandas as pd
from plotly.subplots import make_subplots
from ipywidgets import Dropdown, FloatRangeSlider, IntSlider, FloatSlider, interact
from sklearn.decomposition import FastICA
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

from utils import *

### Loading data

Load the EEG data and the data from the rumination questionnaire. By default, `create_df_data` loads only the rumination information, but one can request other information by passing a list of the desired labels from the CSV file.

In [None]:
# Names of the cached dataframe and of the questionnaire CSV it is built from.
df_name = "go_nogo_df"
pickled_data_filename = f"../data/{df_name}.pkl"
info_filename = "../data/Demographic_Questionnaires_Behavioral_Results_N=163.csv"

# Build the dataframe from the raw files only when no pickled copy exists yet;
# otherwise reuse the pickle.
if not os.path.isfile(pickled_data_filename):
    print("Pickled file not found. Loading data...")
    epochs_df = create_df_data(info_filename=info_filename)
    epochs_df.name = df_name
    # Store the freshly built dataframe so that later runs can load it directly.
    epochs_df.to_pickle("../data/" + epochs_df.name + ".pkl")
    print("Done. Pickle file created")
else:
    print("Pickled file found. Loading pickled data...")
    epochs_df = pd.read_pickle(pickled_data_filename)
    print("Done")

The data is now loaded into a dataframe in which each epoch is a single record.

In [None]:
# Show the loaded dataframe in the cell output.
display(epochs_df)

Sort the participants by their number of error responses, in descending order. This way the best participants (those with the most error trials) come first.

In [None]:
# For every participant ("id"), count how many of their epochs carry the ERROR
# marker and how many carry the CORRECT marker. transform() broadcasts each
# per-participant count back onto all rows of that participant.
grouped_df = epochs_df.groupby("id")
for count_column, marker_value in (("error_sum", ERROR), ("correct_sum", CORRECT)):
    epochs_df[count_column] = grouped_df[["marker"]].transform(
        lambda markers, wanted=marker_value: (markers.values == wanted).sum()
    )

# mergesort is a stable sort, so rows with equal error counts keep their
# current relative order.
epochs_df = epochs_df.sort_values(by="error_sum", kind="mergesort", ascending=False)

In [None]:
# Show the dataframe again, now with the count columns and the new row order.
display(epochs_df)

### Vectorization

In [None]:
from sklearn.decomposition import PCA


def vectorize(
    X, y, mwt="mexh", cwt_density=2, ica_n_components=4, pca_n_components=1
):
    """Reduce multi-channel epochs to a small feature matrix (ICA -> CWT -> PCA).

    Pipeline, applied once per independent component:

    1. ICA is fitted on all epochs concatenated along the time axis, which
       yields ``ica_n_components`` spatial filters (one weight per channel).
    2. Each spatial filter is applied with ``filter_`` to obtain a single
       virtual channel per epoch.
    3. Each filtered epoch is decomposed with ``cwt`` using the mother wavelet
       ``mwt``.
    4. The CWT coefficients of every epoch are flattened and projected onto
       their first ``pca_n_components`` principal components.

    Parameters
    ----------
    X : array-like
        Epochs; the notebook treats it as EPOCHS x CHANNELS x TIMEPOINTS.
    y : array-like
        Unused. Kept so the signature stays compatible with existing calls.
    mwt : str
        Mother wavelet name forwarded to ``cwt``.
    cwt_density : int
        Density argument forwarded to ``cwt``.
    ica_n_components : int
        Number of independent components (spatial filters) to extract.
    pca_n_components : int
        Number of principal components kept per independent component.

    Returns
    -------
    numpy.ndarray
        Feature matrix of shape
        (EPOCHS, ica_n_components * pca_n_components): the PCA scores of all
        independent components placed side by side, in the order of
        ``ica.components_``.
    """
    # ICA reduces the multi-channel signal to n components. It is fitted on a
    # (samples, channels) matrix, hence the transpose of the concatenated
    # epochs. Only the fitted ``components_`` are used afterwards.
    concat = np.concatenate(X, axis=1)
    ica = FastICA(n_components=ica_n_components)
    ica.fit_transform(concat.T)

    features = []
    for spatial_filter in ica.components_:
        # Apply the spatial filter to obtain one independent channel
        # (presumably EPOCHS x TIMEPOINTS -- depends on utils.filter_).
        X_filtered = filter_(X, spatial_filter)

        # The CWT decomposes each epoch into scaled and shifted copies of the
        # mother wavelet; per the original notes the result is
        # EPOCH x FREQUENCY x TIMEPOINT.
        X_cwts = np.array([cwt(epoch, mwt, cwt_density) for epoch in X_filtered])

        # PCA needs a 2-D (n_samples, n_features) array, so everything after
        # the epoch axis is flattened.
        X_flattened = X_cwts.reshape(X_cwts.shape[0], -1)

        # Keep only the leading principal component(s) of this channel.
        pca = PCA(n_components=pca_n_components)
        features.append(pca.fit_transform(X_flattened))

    # Previously the PCA scores were computed and thrown away; return them so
    # the caller actually receives the vectorized epochs.
    return np.concatenate(features, axis=1)

In [None]:
# get only epochs from error responses
X = np.array(epochs_df[epochs_df["marker"] == ERROR]["epoch"].to_list())
# all participant's rumination level
y = np.array(epochs_df["marker"].to_list())

In [None]:
# Run the vectorization pipeline on the selected epochs and keep whatever it
# returns under the name `ica`.
ica = vectorize(X, y)

In [None]:
# Echo the value bound to `ica` by the vectorize call in the cell output.
ica

In [None]:
def train(
    X, y, mwt="mexh", cwt_density=2, ica_n_components=4, wavelet_choice="single"
):
    """Fit spatial filters, pick wavelet features and train an LDA classifier.

    For each ICA spatial filter the epochs are filtered to one channel,
    decomposed with ``cwt`` and reduced to a single number per epoch by
    weighting the CWT coefficients. The resulting per-epoch feature vectors
    (one value per spatial filter) are used to fit a
    ``LinearDiscriminantAnalysis`` classifier.

    Parameters
    ----------
    X : numpy.ndarray
        Epochs, shaped EPOCHS x CHANNELS x TIMEPOINTS.
    y : numpy.ndarray
        Labels, shaped EPOCHS; compared against ``ERROR`` and ``CORRECT``.
    mwt : str
        Mother wavelet name forwarded to ``cwt``.
    cwt_density : int
        Density argument forwarded to ``cwt``.
    ica_n_components : int
        Number of ICA spatial filters, i.e. number of final features.
    wavelet_choice : str
        ``"single"`` selects, per spatial filter, the one CWT coefficient with
        the highest value returned by ``get_separations``. ``"LDA"`` is
        reserved but not implemented.

    Returns
    -------
    features_meta : list of (spatial_filter, wavelet_weights)
        What ``predict`` needs to recompute the features on new epochs.
    clf : LinearDiscriminantAnalysis
        Classifier fitted on the training features.

    Raises
    ------
    NotImplementedError
        If ``wavelet_choice == "LDA"``.
    ValueError
        If ``wavelet_choice`` is any other unknown value.
    """
    # Validate the option before any fitting is done. The "LDA" branch used to
    # be a bare `pass`, which left `wavelet_weights` undefined and crashed
    # later with an UnboundLocalError.
    if wavelet_choice == "LDA":
        raise NotImplementedError('wavelet_choice="LDA" is not implemented yet')
    if wavelet_choice != "single":
        raise ValueError("wrong wavelet_choice argument")

    # Compute ICA on all epochs concatenated along time.
    concat = np.concatenate(X, axis=1)
    # concat.shape == (num_of_channels, timepoints)
    ica = FastICA(n_components=ica_n_components)
    ica.fit_transform(concat.T)
    # ica.components_.shape == (n_components, num_of_channels)

    features_meta = []
    feature_values = []
    for spatial_filter in ica.components_:
        # Apply the spatial filter; the result is shaped EPOCHS x TIMEPOINTS.
        X_filtered = filter_(X, spatial_filter)

        # Apply the CWT; the result is shaped EPOCH x FREQUENCY x TIMEPOINT.
        X_cwts = np.array([cwt(epoch, mwt, cwt_density) for epoch in X_filtered])

        # Find the best separating wavelet: a one-hot weight map that keeps
        # only the coefficient with the largest separation between classes.
        separations = get_separations(X_cwts[y == ERROR], X_cwts[y == CORRECT])
        # separations are shaped FREQUENCY x TIMEPOINT
        index = np.unravel_index(separations.argmax(), separations.shape)
        wavelet_weights = np.zeros_like(separations)
        wavelet_weights[index] = 1

        # Contract the FREQUENCY and TIMEPOINT axes of X_cwts with the weight
        # map, leaving one feature value per epoch.
        X_end = np.tensordot(X_cwts, wavelet_weights, axes=([1, 2], [0, 1]))

        features_meta.append((spatial_filter, wavelet_weights))
        feature_values.append(X_end)

    # Create a classifier from the end feature values. feature_values is
    # (n_features, EPOCHS), so it is transposed to (EPOCHS, n_features).
    feature_values = np.array(feature_values)
    # TODO maybe balance class sizes or priors somehow?
    clf = LinearDiscriminantAnalysis()
    clf.fit(feature_values.T, y)

    return features_meta, clf

### Predict

In [None]:
def predict(epochs, features, clf, mwt="mexh", cwt_density=2):
    """Score epochs with the features and classifier produced by ``train``.

    Parameters
    ----------
    epochs : array-like
        Epochs to score, in the same layout ``train`` was given.
    features : iterable of (spatial_filter, wavelet_weights)
        Feature descriptions as returned by ``train``.
    clf : classifier
        Fitted classifier exposing ``predict_proba``.
    mwt : str
        Mother wavelet name forwarded to ``cwt``.
    cwt_density : int
        Density argument forwarded to ``cwt``.

    Returns
    -------
    numpy.ndarray
        For every epoch, column 1 of ``clf.predict_proba``.
    """

    def feature_column(spatial_filter, wavelet_weights):
        # Spatially filter the epochs, then wavelet-transform each of them;
        # the stacked result is EPOCH x FREQUENCY x TIMEPOINT.
        single_channel = filter_(epochs, spatial_filter)
        transformed = np.array(
            [cwt(epoch, mwt, cwt_density) for epoch in single_channel]
        )
        # Weighted sum over FREQUENCY and TIMEPOINT: one value per epoch.
        return np.tensordot(transformed, wavelet_weights, axes=([1, 2], [0, 1]))

    columns = [
        feature_column(spatial_filter, wavelet_weights)
        for spatial_filter, wavelet_weights in features
    ]
    # Stacked features are (n_features, EPOCHS); the classifier expects
    # (EPOCHS, n_features).
    design_matrix = np.array(columns).T
    return clf.predict_proba(design_matrix)[:, 1]

### Fire

In [None]:
# Per-participant evaluation: for every participant, run a stratified 4-fold
# cross-validation of train()/predict() and report the mean AUROC with its
# standard error, followed by the average over all evaluated participants.
start = time()
print("participant            AUROC   err/corr")
mwt = "mexh"
n_splits = 4
aurocs = []
auroc_sems = []


# Group data by participants' ids. sort=False keeps the groups in order of
# first appearance, i.e. the row order of epochs_df (the same order that
# epochs_df["id"].unique() yields). Grouping by the plain column name rather
# than a one-element list avoids the deprecated scalar get_group() lookup.
grouped = epochs_df.groupby("id", sort=False)
for participant_id, participant_df in grouped:
    X = np.array(participant_df["epoch"].to_list())

    # you can change y set in a easy way ---> y=np.array(participant_df["column_name"].to_list())
    y = np.array(participant_df["marker"].to_list())

    # With fewer than n_splits epochs in a class, at least one test fold
    # contains a single class and roc_auc_score raises, which would abort the
    # whole run. Skip such participants instead.
    _, class_counts = np.unique(y, return_counts=True)
    if len(class_counts) < 2 or class_counts.min() < n_splits:
        print(
            f"{participant_id:11}    "
            f"skipped (fewer than {n_splits} epochs in a class)"
        )
        continue

    aurocs_personal = []
    # KFold cross-validation
    skf = StratifiedKFold(n_splits=n_splits)
    for train_index, test_index in skf.split(X, y):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        # train
        features, clf = train(X_train, y_train, mwt, wavelet_choice="single")

        # test
        y_pred = predict(X_test, features, clf, mwt)

        auroc = roc_auc_score(y_test, y_pred)
        aurocs_personal.append(auroc)

    aurocs.append(np.mean(aurocs_personal))
    auroc_sems.append(scipy.stats.sem(aurocs_personal))

    # The count columns hold the same value on every row of a participant, so
    # the first row of the participant's own group is enough.
    error_size = participant_df["error_sum"].iloc[0]
    correct_size = participant_df["correct_sum"].iloc[0]
    #     any_statistic_you_need = participant_df["Rumination Full Scale"].iloc[0]

    print(
        f"{participant_id:11}    "
        f"{aurocs[-1]:.3f} ± {auroc_sems[-1]:.3f}    "
        f"{error_size:3}/{correct_size:3}"
    )

print(f"\ntraining time: {(time() - start) / 60:.0f} min")
if aurocs:
    # NOTE: this is the standard error of an average of n estimates under the
    # assumption that they are independent: sqrt(sum(sem_i ** 2)) / n. It only
    # propagates the fold-to-fold uncertainty of each participant; it does not
    # include the spread of AUROCs between participants.
    total_sem = sum(np.array(auroc_sems) ** 2) ** (1 / 2) / len(auroc_sems)
    print(f"mean AUROC: {np.mean(aurocs):.3f} ± {total_sem:.3f}")
else:
    print("mean AUROC: n/a (no participant had enough epochs in each class)")