In [1]:
# V0. operational
# V1. add feature to allow export of EEG time locations that contain artifact.
#         > first ARF is run, saves artifact locations into pd dataframe which contains, filename of data, 
#         > subsegment number of data, and artifact state


In [2]:
%matplotlib inline

import pickle as pkl
import fnmatch
import h5py
import matplotlib as mpl
import matplotlib.pyplot as plt
import mne
import numpy as np
import os
import pandas as pd
import random
import sys
from glob import glob
from scipy import signal
from scipy.stats import kurtosis
from scipy.stats import skew
from sklearn.preprocessing import scale

import lightgbm as lgb

import numpy as np

from sklearn.model_selection import StratifiedKFold

import seaborn as sns
sns.set()

from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

from sklearn import metrics
from sklearn.preprocessing import StandardScaler


def get_cv(subjects):
    cv = {}
    
    temp_fnames = []
    temp_lbls = []
    temp_anns = []
    temp_idcs = []

    for subject in subjects:
        base = '../data/%s/%s_' % (str(subject).zfill(3), str(subject).zfill(3))
        fnames = sorted(glob(base + '*.h5'),
                        key=lambda x: int(x.replace(base, '')[:-7]))

        fnames_finals = []
        for fname in fnames:
            ba = '../data/%s/' % (str(subject).zfill(3))
            fn = fname.replace(ba, '')
            fnames_finals.append(fn)
        temp_fnames.extend(fnames_finals)

        for fname in fnames_finals:
            temp_lbls.append(int(fname.split('.')[0][-1]))
            anns = int(fname.split('.')[0].split('_')[2])
            #print fname.split('.')[0].split('_')[2]
            temp_anns.append(anns)
            temp_idcs.append(True)

    cv['fnames'] = np.array(temp_fnames)
    cv['labels'] = np.array(temp_lbls)
    cv['anns'] = np.array(temp_anns)
    cv['indices'] = np.array(temp_idcs)

    #print cv
    return cv

  if LooseVersion(numba.__version__) < LooseVersion('0.40'):
  if LooseVersion(numba.__version__) < LooseVersion('0.40'):


In [8]:
ref_chans = ['Fp1', 'Fp2', 'F3', 'F4', 'C3', 'C4', 'P3', 'P4', 'O1', 'O2', 'F7', 
    'F8', 'T3', 'T4', 'T5', 'T6', 'Fz', 'Cz', 'Pz', 'A1', 'A2']  # Note temporal chain has different pattern!

xforms = {
            'original': ['Fp1', 'Fp2', 'F3', 'F4', 'C3', 'C4', 'P3', 'P4', 'O1', 'O2', 'F7', 'F8', 'T3', 'T4', 'T5', 'T6', 'Fz', 'Cz', 'Pz', 'A1', 'A2'],
            'leftright': ['Fp2', 'Fp1', 'F4', 'F3', 'C4', 'C3', 'P4' , 'P3', 'O2', 'O1', 'F8', 'F7', 'T4', 'T3', 'T6', 'T5','Fz', 'Cz', 'Pz', 'A2', 'A1'],
         'postant':    [ 'O1', 'O2', 'T3', 'T4', 'T5', 'T6',  'P4' , 'P3', 'C3', 'C4', 'Fz', 'Cz', 'Pz', 'A1', 'A2', 'Fp1', 'Fp2', 'F3', 'F4', 'F7', 'F8']
}

xform_map = []
for xfn, xf_lst in xforms.items():
    xform_map.append([ref_chans.index(xf_ch) for xf_ch in xf_lst])


In [9]:

subjects = np.array([1, 2, 3, 4])
subjects = np.array([3])
subjects = np.arange(1,64)
# bad_subjects = np.array([17, 21])
# subjects = np.setdiff1d(subjects,bad_subjects)

subjects = sorted(subjects)


# feature_datasets = ['relative_log_power', 'arError', 'stats', 'various']
feature_datasets = ['arError', 'relative_log_power', 'stats']
feature_datasets = ['autocorrmat']

feature_datasets = ['relative_log_power', 'stats', 'rqa_delta_bp' ]

cv = get_cv(subjects)

col_names = []
features = []

for jj, feat in enumerate(feature_datasets):    
    print(f"Feature: {feat}")
    temp_features = []
    subjs = []
    for subject in subjects:
        s_file = './features/%s/%s.npz' % (feat, str(subject).zfill(3))
        if not os.path.exists(s_file):
            print(f"{s_file} does not exist!")
            continue
        a = np.load(s_file)
        arr = a['features']
        temp_features.append(arr)
        subjs.extend([subject]*(arr.shape[0]*arr.shape[1]))
    
    if feat == 'rqa_delta_bp':
        temp_features = np.vstack(temp_features)
        print(f'\t\t{feat} shape:', temp_features.shape)

        flat_shape = (temp_features.shape[2] * temp_features.shape[3], )
        temp_features = temp_features.reshape((-1, *flat_shape))
        print('\t:', temp_features.shape)
        features.append(temp_features)
        
    elif feat == 'coherences_transposed':
        # Broken
        print('\t:', temp_features.shape)
    elif feat == 'autocorrmat':
        # Broken
        print('\t:', temp_features.shape)
    elif feat == 'stats':
        # e.g. stats: (70, 30, 21, 6)
        #        (time, 30 twenty second windows = 10 min, 21 channels, 6 stats.)
        temp_features = np.vstack(temp_features)
        temp_features_copy = np.copy(temp_features)
        print(f'\t\t{feat} shape:', temp_features.shape)
        
        xform_features = []
        for xform in xform_map:
            temp_features = np.take(temp_features_copy, xform, axis=2)
            xform_features.append(temp_features)
            
        # Reshape to (180*30, new_shape)
        flat_shape = (temp_features.shape[2] * temp_features.shape[3], )
        temp_features = temp_features.reshape((-1, *flat_shape))
        print('\t:', temp_features.shape)

        features.append(temp_features)
        
    elif feat == 'relative_log_power':
        # e.g. shape already flattened. e.g. (70, 30, 21*6)
        temp_features = np.vstack(temp_features)
        temp_features_copy = np.copy(temp_features)

        print(f'\t\t{feat} shape:', temp_features.shape)
        xform_features = []
        for xform in xform_map:
            temp_features = np.take(temp_features_copy, xform, axis=2)
            xform_features.append(temp_features)

        flat_shape = (temp_features.shape[2] * temp_features.shape[3], )
        temp_features = temp_features.reshape((-1, *flat_shape))
        print('\t:', temp_features.shape)


        features.append(temp_features)
    
    col_names.extend([f'{feat}_{i}' for i in range(temp_features.shape[1])])

# features = np.vstack(features)

print(len(features))


Feature: relative_log_power
./features/relative_log_power/017.npz does not exist!
./features/relative_log_power/021.npz does not exist!
./features/relative_log_power/024.npz does not exist!
./features/relative_log_power/027.npz does not exist!
./features/relative_log_power/040.npz does not exist!
		relative_log_power shape: (5359, 30, 21, 6)
	: (160770, 126)
Feature: stats
./features/stats/017.npz does not exist!
./features/stats/021.npz does not exist!
./features/stats/024.npz does not exist!
./features/stats/027.npz does not exist!
./features/stats/040.npz does not exist!
		stats shape: (5359, 30, 21, 6)
	: (160770, 126)
Feature: rqa_delta_bp
./features/rqa_delta_bp/017.npz does not exist!
./features/rqa_delta_bp/021.npz does not exist!
./features/rqa_delta_bp/024.npz does not exist!
./features/rqa_delta_bp/027.npz does not exist!
./features/rqa_delta_bp/040.npz does not exist!
		rqa_delta_bp shape: (5359, 30, 1, 10)
	: (160770, 10)
3


In [11]:

df_train = pd.DataFrame(np.hstack(features), columns=col_names)
df_train['Subject'] = np.array(subjs)
df_train['Subject'] = df_train['Subject'].astype('int')
df_train = df_train.drop(df_train[df_train['Subject']==2].index)
df_train = df_train.drop(df_train[df_train['Subject']==62].index)

In [12]:
df = pd.read_csv('data/SPQR_clinFeats_20190419.csv')
df = df.rename({'Unnamed: 0': 'Subject', 'Latency (NR)':'Latency - NR'}, axis='columns')

df_resp = df.copy()
df_resp = df_resp[['Subject', 'Class', 'Age(resp)', 'Latency - R']]
df_resp = df_resp[df_resp['Class'] == 0.]
df_resp = df_resp.rename({'Latency - R':'Latency', 'Age(resp)': 'Age'}, axis='columns')
df_nresp = df.copy()
df_nresp = df_nresp[['Subject', 'Class', 'Age(non-resp)', 'Latency - NR']]
df_nresp = df_nresp[df_nresp['Class'] ==1.]
df_nresp = df_nresp.rename({'Latency - NR':'Latency', 'Age(non-resp)': 'Age'}, axis='columns')

df_all = pd.concat([df_resp, df_nresp])
df_all['Subject'] = df_all['Subject'].str.split('_').apply(lambda x:x[0]).astype('int')

df_all = df_all.drop(df_all[df_all['Subject']==65].index)

In [13]:
print(df_train['Subject'].unique())
print(np.sort(df_all['Subject'].unique()))

[ 1  3  4  5  6  7  8  9 10 11 12 13 14 15 16 18 19 20 22 23 25 26 28 29
 30 31 32 33 34 35 36 37 38 39 41 42 43 44 45 46 47 48 49 50 51 52 53 54
 55 56 57 58 59 60 61 63]
[ 1  3  4  5  6  7  8  9 10 11 12 13 14 15 16 18 19 20 22 23 25 26 28 29
 30 31 32 33 34 35 36 37 38 39 41 42 43 44 45 46 47 48 49 50 51 52 53 54
 55 56 57 58 59 60 61 63]


In [14]:
merged_df = pd.merge(df_train, df_all, on='Subject', how='left')
merged_df['Class'] = merged_df['Class'].astype('int')


In [16]:
train_data_df = merged_df.copy()
train_data_df = train_data_df.drop(['Class'], axis=1)
train_ann_df = merged_df[['Subject','Class']]

In [140]:
# LightGBM parameters
params = {
    'task':'train',
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': {'binary_logloss', 'auc'},
    'early_stopping_rounds': 20,
    'max_depth': 5,
    'num_leaves': 256,
    'learning_rate': 0.1,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.9,
    'bagging_freq': 3,
    'verbose': 1
}

In [141]:
train_data_df = train_data_df[['Age', 'Latency', 'Subject']]

In [142]:
unique_subjects = train_data_df["Subject"].unique()
subj_classes= df_all.loc[train_data_df['Subject'].isin(unique_subjects), 'Class']

In [143]:
valid_ann

Unnamed: 0,Subject,Class
17040,8,1
17041,8,1
17042,8,1
17043,8,1
17044,8,1
...,...,...
107725,45,0
107726,45,0
107727,45,0
107728,45,0


In [144]:
n_folds = 10
skfold = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=42)

# Get unique subjects in the data
unique_subjects = train_data_df["Subject"].unique()

preds = []
metadata = []

val_scores = []
for fold_id, (train_ids, valid_ids) in enumerate(skfold.split(unique_subjects, subj_classes)):
    # Get the subject IDs for the train and validation sets
    train_subjects = unique_subjects[train_ids]
    valid_subjects = unique_subjects[valid_ids]
    
    # Split the data and annotations based on the subject IDs
    train_data = train_data_df[train_data_df["Subject"].isin(train_subjects)]
    valid_data = train_data_df[train_data_df["Subject"].isin(valid_subjects)]
    train_ann = train_ann_df[train_ann_df["Subject"].isin(train_subjects)]
    valid_ann = train_ann_df[train_ann_df["Subject"].isin(valid_subjects)]

    metadata.append([valid_ann, valid_data])
    
    train_data = train_data.drop(['Subject'], axis=1)
    valid_data = valid_data.drop(['Subject'], axis=1)

    print("Fold %d / %d" % (fold_id + 1, n_folds))

    lgb_train = lgb.Dataset(train_data, train_ann['Class'])
    lgb_valid  = lgb.Dataset(valid_data, valid_ann['Class'])
    
    res = {}
    
    gbm = lgb.train(params, lgb_train, num_boost_round=400, 
                    valid_sets=[lgb_valid], valid_names=['valid'],
                    evals_result=res, verbose_eval=-100)
    
    val_scores.append(res['valid']['auc'][-1])
    
    
    probas = gbm.predict(valid_data)
    preds.append(probas)

Fold 1 / 10
[LightGBM] [Info] Number of positive: 65400, number of negative: 74820
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 84
[LightGBM] [Info] Number of data points in the train set: 140220, number of used features: 2
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.466410 -> initscore=-0.134563
[LightGBM] [Info] Start training from score -0.134563
Training until validation scores don't improve for 20 rounds
Early stopping, best iteration is:
[1]	valid's binary_logloss: 0.589486	valid's auc: 1
Fold 2 / 10
[LightGBM] [Info] Number of positive: 66030, number of negative: 73470
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 84
[LightGBM] [Info] Number of data points in the train set: 139500, number of used features: 2
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.473333 -



Fold 4 / 10
[LightGBM] [Info] Number of positive: 66450, number of negative: 72450
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 78
[LightGBM] [Info] Number of data points in the train set: 138900, number of used features: 2
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.478402 -> initscore=-0.086447
[LightGBM] [Info] Start training from score -0.086447
Training until validation scores don't improve for 20 rounds
Early stopping, best iteration is:
[13]	valid's binary_logloss: 0.548527	valid's auc: 1
Fold 5 / 10
[LightGBM] [Info] Number of positive: 69540, number of negative: 70770
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 82
[LightGBM] [Info] Number of data points in the train set: 140310, number of used features: 2
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.495617 



Early stopping, best iteration is:
[1]	valid's binary_logloss: 0.816269	valid's auc: 0.694878
Fold 7 / 10
[LightGBM] [Info] Number of positive: 65070, number of negative: 79740
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 82
[LightGBM] [Info] Number of data points in the train set: 144810, number of used features: 2
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.449347 -> initscore=-0.203308
[LightGBM] [Info] Start training from score -0.203308
Training until validation scores don't improve for 20 rounds
Early stopping, best iteration is:
[4]	valid's binary_logloss: 0.657193	valid's auc: 0.713004
Fold 8 / 10
[LightGBM] [Info] Number of positive: 68460, number of negative: 71790
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 83
[LightGBM] [Info] Number of data points in the tra




Early stopping, best iteration is:
[1]	valid's binary_logloss: 0.621688	valid's auc: 1
Fold 10 / 10
[LightGBM] [Info] Number of positive: 63150, number of negative: 77790
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 84
[LightGBM] [Info] Number of data points in the train set: 140940, number of used features: 2
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.448063 -> initscore=-0.208500
[LightGBM] [Info] Start training from score -0.208500
Training until validation scores don't improve for 20 rounds
Early stopping, best iteration is:
[1]	valid's binary_logloss: 0.614763	valid's auc: 1




In [145]:
train_ann = train_ann_df[train_ann_df["Subject"].isin(train_subjects)]

In [146]:
print("avg_val_score: %4f" % (np.mean(val_scores)))

avg_val_score: 0.797873


In [147]:
metadata_df = pd.DataFrame()
c = 0
for anns, clinvars in metadata:
    curr_df = pd.concat([anns, clinvars], axis=1)
    curr_df['preds'] = preds[c]
    print(np.unique(preds[c]))
    metadata_df = pd.concat([metadata_df, curr_df], axis=0)
    c+=1

[0.42019505 0.51994958]
[0.38438611 0.39310575 0.57311632]
[0.40049212 0.44410875 0.50039956]
[0.1269504  0.46394633 0.85583883 0.86212766]
[0.44625963 0.44844364 0.47358422]
[0.37634343 0.47654841]
[0.29694708 0.29741739 0.63760355]
[0.4395824  0.46062952 0.53926116]
[0.40051487 0.50042209]
[0.4037943  0.50367065]


In [148]:
metadata_df.to_csv('spqr_v20230331.csv', index=False)


In [149]:
curr_df

Unnamed: 0,Subject,Class,Age,Latency,Subject.1,preds
17040,8,1,122.0,56.0,8,0.503671
17041,8,1,122.0,56.0,8,0.503671
17042,8,1,122.0,56.0,8,0.503671
17043,8,1,122.0,56.0,8,0.503671
17044,8,1,122.0,56.0,8,0.503671
...,...,...,...,...,...,...
107725,45,0,230.0,3.0,45,0.403794
107726,45,0,230.0,3.0,45,0.403794
107727,45,0,230.0,3.0,45,0.403794
107728,45,0,230.0,3.0,45,0.403794


In [None]:
# Iterate through kfolds
all_shap_v = []
all_Xv = []
all_subjs = []
all_yp = []
all_yv = []
all_ictal = []
kfolds_dir = 'kfolds_all_noTD_cork_huh_ucsf'


for kf in range(10):
    print('#####', kf)
    Xv_lst, yp_lst, yv_lst, y_test_ictal_lst, subj_lst, raw_data_cols, curr_shaps = run_kf(kfold=kf, kfolds_dir=kfolds_dir)
    

    raw_data_cols_plusArt = raw_data_cols + ['artifact_score']
    Xv_df = pd.DataFrame(data=Xv_lst, columns=raw_data_cols_plusArt)
    Xv_df = Xv_df.fillna(0)
    all_Xv.append(Xv_df)

    all_shap_v.append(curr_shaps)
    all_subjs.append(subj_lst)
    all_yp.append(yp_lst)
    all_yv.append(yv_lst)
    all_ictal.append(y_test_ictal_lst)
    

In [19]:
temp_features[0].shape

(70, 30, 21, 6)