# Time series classification 

## Imports

In [1]:
import os       # using operating system dependent functionality (folders)
import pandas as pd # data analysis and manipulation
import numpy as np    # numerical computing (manipulating and performing operations on arrays of data)
import copy     # Can Copy and Deepcopy files so original file is untouched.
import seaborn as sn
import matplotlib.pyplot as plt
import mne

import sys
sys.path.insert(0, '../eegyolk') # path to helper functions
import helper_functions as hf # library useful for eeg and erp data cleaning
import epod_helper
import initialization_functions

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn import tree
from sklearn.decomposition import PCA
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold

In [2]:
metadata = pd.read_csv('metadata.csv', sep = ',')

In [3]:
metadata.head()

Unnamed: 0,eeg_file,ParticipantID,test,sex,age_months,dyslexic_parent,Group_AccToParents,path_eeg,path_epoch,path_eventmarkers,epoch_file
0,101a,101,a,m,20,m,At risk,F:/Stage/ePODIUM/Data/ePodium_projectfolder/Da...,F:/Stage/ePODIUM/Data/ePodium_projectfolder/ep...,F:/Stage/ePODIUM/Data/ePodium_projectfolder/ev...,101a_epo.fif
1,102a,102,a,f,20,Nee,Control,F:/Stage/ePODIUM/Data/ePodium_projectfolder/Da...,F:/Stage/ePODIUM/Data/ePodium_projectfolder/ep...,F:/Stage/ePODIUM/Data/ePodium_projectfolder/ev...,102a_epo.fif
2,103a,103,a,f,20,m,At risk,F:/Stage/ePODIUM/Data/ePodium_projectfolder/Da...,F:/Stage/ePODIUM/Data/ePodium_projectfolder/ep...,F:/Stage/ePODIUM/Data/ePodium_projectfolder/ev...,103a_epo.fif
3,104a,104,a,m,18,f,At risk,F:/Stage/ePODIUM/Data/ePodium_projectfolder/Da...,F:/Stage/ePODIUM/Data/ePodium_projectfolder/ep...,F:/Stage/ePODIUM/Data/ePodium_projectfolder/ev...,104a_epo.fif
4,105a,105,a,f,17,f,At risk,F:/Stage/ePODIUM/Data/ePodium_projectfolder/Da...,F:/Stage/ePODIUM/Data/ePodium_projectfolder/ep...,F:/Stage/ePODIUM/Data/ePodium_projectfolder/ev...,105a_epo.fif


In [4]:
metadata['Group_AccToParents'] = np.where(
    (metadata['Group_AccToParents']=='At risk'), 1,0)

In [5]:
drop_files = ["102a","113a", "107b (deel 1+2)", "132a", "121b(2)", "113b", "107b (deel 3+4)", "147a",
                "121a", "134a", "143b", "121b(1)", "145b", "150a","152a", "184a", "165a", "151a", "163a", "179a","179b", "182b", "186a", "193b"]

metadata = metadata[~metadata['eeg_file'].isin(drop_files)]

# Get input data

In [6]:
control_files= metadata.loc[metadata['Group_AccToParents'] == 0]
atrisk_files = metadata.loc[metadata['Group_AccToParents'] == 1]

In [7]:
def read_filtered_data(metadata, to_array=False, verbose=False):
    epochs = []
    for index, file in metadata.iterrows():
        print(f"Checking out file: {file['epoch_file']}")
        path = os.path.join(file['path_epoch'], file['epoch_file'])
        epoch = mne.read_epochs(path, preload=False, verbose=verbose)
        if to_array ==True: 
            epoch = epoch.get_data()
        epochs.append(epoch)
    return epochs

In [8]:
control_epochs = initialization_functions.read_filtered_data(control_files[:3], to_array=True)

Checking out file: 117a_epo.fif
Loading data for 2435 events and 2049 original time points ...
Checking out file: 118a_epo.fif
Loading data for 2418 events and 2049 original time points ...
Checking out file: 119a_epo.fif
Loading data for 2325 events and 2049 original time points ...


In [9]:
atrisk_epochs = initialization_functions.read_filtered_data(atrisk_files[:3], to_array=True)

Checking out file: 101a_epo.fif
Loading data for 2085 events and 2049 original time points ...
Checking out file: 103a_epo.fif
Loading data for 837 events and 2049 original time points ...
Checking out file: 104a_epo.fif
Loading data for 2293 events and 2049 original time points ...


In [10]:
control_labels = control_files['Group_AccToParents'][:3].tolist()
atrisk_labels = atrisk_files['Group_AccToParents'][:3].tolist()

In [11]:
control_labels=[len(i)*[0] for i in control_epochs]
atrisk_labels=[len(i)*[1] for i in atrisk_epochs]

In [12]:
data_list = control_epochs+atrisk_epochs
label_list = control_labels+atrisk_labels

In [13]:
# a list to 
groups_list=[[i]*len(j) for i, j in enumerate(data_list)]

In [14]:
data_array=np.vstack(data_list)
label_array=np.hstack(label_list)
group_array=np.hstack(groups_list)
data_array=np.moveaxis(data_array,1,2)

print(data_array.shape,label_array.shape,group_array.shape) #number of segments, length, channels

(12393, 2049, 32) (12393,) (12393,)


In [15]:
label_array=np.hstack(label_list)

In [16]:
#def input_ts_prep(epoch, standard_events, deviant_events): 
#    print('checkpoint')
#    std_evoked = epoch[standard_events].average() 
#    dev_evoked = epoch[deviant_events].average()
#
#    # calculate the mismatch response between standard and deviant evoked
#    evoked_diff = mne.combine_evoked([std_evoked, dev_evoked], weights=[1, -1])#.get_data() # mismatch for all channels per participant
#        
#  
#    return evoked_diff

In [17]:
standard_events = ['GiepM_S'] # standards: 'GiepM_S','GiepS_S','GopM_S','GopS_S'
deviant_events = ['GiepM_D'] # deviants: 'GiepM_D','GiepS_D','GopM_D','GopS_D'

In [18]:
from tensorflow.keras.layers import Conv1D,BatchNormalization,LeakyReLU,MaxPool1D,\
GlobalAveragePooling1D,Dense,Dropout,AveragePooling1D
from tensorflow.keras.models import Sequential
from tensorflow.keras.backend import clear_session
def cnnmodel():
    clear_session()
    model=Sequential()
    model.add(Conv1D(filters=5,kernel_size=3,strides=1,input_shape=(6250,19)))#1
    model.add(BatchNormalization())
    model.add(LeakyReLU())
    model.add(MaxPool1D(pool_size=2,strides=2))#2
    model.add(Conv1D(filters=5,kernel_size=3,strides=1))#3
    model.add(LeakyReLU())
    model.add(MaxPool1D(pool_size=2,strides=2))#4
    model.add(Dropout(0.5))
    model.add(Conv1D(filters=5,kernel_size=3,strides=1))#5
    model.add(LeakyReLU())
    model.add(AveragePooling1D(pool_size=2,strides=2))#6
    model.add(Dropout(0.5))
    model.add(Conv1D(filters=5,kernel_size=3,strides=1))#7
    model.add(LeakyReLU())
    model.add(AveragePooling1D(pool_size=2,strides=2))#8
    model.add(Conv1D(filters=5,kernel_size=3,strides=1))#9
    model.add(LeakyReLU())
    model.add(GlobalAveragePooling1D())#10
    model.add(Dense(1,activation='sigmoid'))#11
    
    model.compile('adam',loss='binary_crossentropy',metrics=['accuracy'])
    return model

model=cnnmodel()
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv1d (Conv1D)             (None, 6248, 5)           290       
                                                                 
 batch_normalization (BatchN  (None, 6248, 5)          20        
 ormalization)                                                   
                                                                 
 leaky_re_lu (LeakyReLU)     (None, 6248, 5)           0         
                                                                 
 max_pooling1d (MaxPooling1D  (None, 3124, 5)          0         
 )                                                               
                                                                 
 conv1d_1 (Conv1D)           (None, 3122, 5)           80        
                                                                 
 leaky_re_lu_1 (LeakyReLU)   (None, 3122, 5)           0

In [19]:
from sklearn.model_selection import GroupKFold,LeaveOneGroupOut
from sklearn.preprocessing import StandardScaler
gkf=GroupKFold()

In [20]:
accuracy=[]
for train_index, val_index in gkf.split(data_array, label_array, groups=group_array):
    train_features,train_labels=data_array[train_index],label_array[train_index]
    val_features,val_labels=data_array[val_index],label_array[val_index]
    scaler=StandardScaler()
    train_features = scaler.fit_transform(train_features.reshape(-1, train_features.shape[-1])).reshape(train_features.shape)
    val_features = scaler.transform(val_features.reshape(-1, val_features.shape[-1])).reshape(val_features.shape)
    model=cnnmodel()
    model.fit(train_features,train_labels,epochs=50,batch_size=128,validation_data=(val_features,val_labels))
    accuracy.append(model.evaluate(val_features,val_labels)[1])

MemoryError: Unable to allocate 1.19 GiB for an array with shape (2435, 2049, 32) and data type float64

In [None]:
tot_epoch = []
for epoch in epochs:
    arr_epoch = input_ts_prep(epoch, standard_events, deviant_events)
    tot_epoch.append(arr_epoch)

In [None]:
X = tot_epoch

In [None]:
test = epochs[6].get_data()

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42)

In [None]:
X_train.shape #no of epochs, channels, length of signal

In [None]:
import tensorflow as tf
from tensorflow.keras import datasets, layers, models

In [None]:
X_train = np.array(X_train)
y_train = np.array(y_train)
X_test = np.array(X_test)
y_test = np.array(y_test)