In [1]:
import pandas as pd
import numpy as np
import math
from sklearn.preprocessing import StandardScaler

path = 'train'

**Load Data**

We start out by reading in our data and formatting it appropriately.

Define some helper functions to do so.

In [2]:
# specify our datatypes
types1 = {
    'Id': 'str', 'Fp1': 'int16', 'Fp2': 'int16', 'F7': 'int16', 'F3': 'int16', 'Fz': 'int16',
    'F4': 'int16', 'F8': 'int16', 'FC5': 'int16', 'FC1': 'int16', 'FC2': 'int16', 'FC6': 'int16',
    'T7': 'int16', 'C3': 'int16', 'Cz': 'int16', 'C4': 'int16', 'T8': 'int16', 'TP9': 'int16',
    'CP5': 'int16', 'CP1': 'int16', 'CP2': 'int16', 'CP6': 'int16', 'TP10': 'int16', 'P7': 'int16',
    'P3': 'int16', 'Pz': 'int16', 'P4': 'int16', 'P8': 'int16', 'PO9': 'int16', 'O1': 'int16',
    'Oz': 'int16', 'O2': 'int16', 'PO10': 'int16'}

types2 = {
    'Id': 'str', 'HandStart': 'int8', 'FirstDigitTouch': 'int8', 'BothStartLoadPhase': 'int8', 'LiftOff': 'int8', 
    'Replace': 'int8', 'BothReleased': 'int8'}

# define our dependent variables to predict
dep_vars = ['HandStart','FirstDigitTouch','BothStartLoadPhase','LiftOff','Replace','BothReleased']

# define our read data function
def read_data(subjects, series, path):
    df = None
    labels = None
    
    # if we are passed individual subjects or series, encode as iterables
    if isinstance(subjects,int):
         subjects = [subjects]
    
    if isinstance(series,int):
        series = [series]
    
    # for each passed subject and series, build a pandas dataframe
    for i in subjects:
        print('Loading',path,'data for subject',i)
        for j in series:
            c_df = pd.read_csv(f'../input/{path}/subj{i}_series{j}_data.csv', dtype=types1)
            df = c_df if df is None else df.append(c_df, ignore_index = True)
            
            if path == 'train':
                c_label = pd.read_csv(f'../input/{path}/subj{i}_series{j}_events.csv', dtype=types2)
                labels = c_label if labels is None else labels.append(c_label, ignore_index = True)
            else: labels = None    
            
       
    return df, labels

# optional function to add "subject" and "series" as columns to our dataframe
def add_subs_and_ser(df):
    df.insert(0,'subject', df.id.str.extract(r'(\d)'))
    df.insert(1,'series', df.id.str.extract(r'[^\d]*[\d]+[^\d]+([\d]+)'))
    return df

# formating function to strip out the "id" column from label data
def format_y(df):
    df = df.drop(columns='id', axis=1)
    return df

# formating function to strip out the "id" column from signal data. Options to encode additional signal 
# preprocessing in this function
def format_X(df):
    df = df.drop(columns='id', axis=1)
    columns = df.columns      
    
    #Preprocessing    
    scaler = StandardScaler() 
    df =np.asarray(df.astype(float))
    df = scaler.fit_transform(df)
    df = pd.DataFrame(df, columns=columns)
    
    ## additional preprocessing could go here
    
    return df

In order to demonstrate this model, I've set it up to train on Subject 4. It uses series 1-7 as the training set and series 8 as the validation set. To train, simply shift-enter your way through, and enjoy.

In [3]:
# demonstration training dataset parameters
subjects = [4]
series_train = range(1,8)
series_test = [8]

In [4]:
# reads training data for Subject 4, training on series 1-7

X_train, y_train = read_data(subjects, series_train, path='train')

Loading train data for subject 4


In [5]:
# splits series 8 to use as a validation set for the model

X_test, y_test = read_data(subjects, series_test, path='train')

Loading train data for subject 4


In [6]:
# formats the training and validation datasets, preparing them to be fed into the model

X_train = format_X(X_train)
y_train = format_y(y_train)

X_test = format_X(X_test)
y_test = format_y(y_test)

**Build the model**

Now that we have our data loaded up, let's construct some of the hyperparameters for our model.

In [7]:
### Model parameters ###

n_features = 32  # how many channels of eeg in each sample
event_types = 6 #len(set(y))  # how many different event types
look_back = 35 #add a rear-facing window
downsample = 15 #choose every 15th datapoint
# l1 = 0 #used for regularization in Haunke architecture
epochs = 30

In [8]:
# in order to feed data to our LSTM, we need a 3D dataset. We use a
# "look_back" window to add the third dimension to the model.

def add_lookback(dataset,labels,look_back):
    dataX = []
    dataY = labels[look_back:]
    for i in range(len(dataset)-look_back):
        dataX.append(dataset[i:(i+look_back), ])
    return np.array(dataX), np.array(dataY)

In [9]:
# if you are shift-entering through the script, this adds the lookback window
# to the train and validation datasets

X_train, y_train = add_lookback(
                    X_train.iloc[::downsample].values,
                    y_train.iloc[::downsample].values,
                    look_back=look_back)

X_test, y_test = add_lookback(
                    X_test.iloc[::downsample].values,
                    y_test.iloc[::downsample].values,
                    look_back=look_back)

In [10]:
# import keras layers to use in subsequent models

from keras.models import Sequential
from keras.layers import Dense, Flatten, Dropout, LSTM
from keras.layers.convolutional import Conv1D
from keras.layers.pooling import AveragePooling1D
from keras import regularizers
from keras.callbacks import EarlyStopping

callbacks=[EarlyStopping(monitor="acc", 
                         verbose=1, 
                         patience=6, 
                         restore_best_weights=True)]

Using TensorFlow backend.


In [11]:
### define an LSTM model ###

def simple_LSTM(look_back, n_features):
    # this implements a simple LSTM network that can train across all subjects, all labels.
    # network has 100 neurons and dropout. In limited optimization, 10-15 epochs worked pretty well
    
    model = Sequential()
    model.add(LSTM(100, return_sequences=False, input_shape=(look_back, n_features)))
    model.add(Dropout(0.5))
    # model.add(LSTM(100)) dramatically worse results 
    model.add(Dense(event_types, activation='sigmoid'))

    model.compile(loss='categorical_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])
    return model


In [None]:
### uncomment below to continue to shift-enter through ###

#model = simple_LSTM(look_back=look_back, n_features=n_features)
#model.fit(X_train, y_train, batch_size=32, epochs=epochs, callbacks=callbacks)

In [12]:
# per Haunke, et al, 2018

def hauke_cnn(look_back, n_features,l1):
    # I attempted to implement a CNN with layers operating across both the temporal and spatial axes.
    # Unfortunately the model wasn't working well and I couldn't get it outperforming my simple 
    # LSTM in the given time.
    
    input_shape = (look_back, n_features)

    model = Sequential()
    model.add(Conv1D(40, 30, activation="relu", kernel_regularizer=regularizers.l1(l1), padding="same", input_shape=input_shape))
    model.add(Conv1D(40, n_features, activation="relu", kernel_regularizer=regularizers.l1(l1), padding="valid"))
    model.add(AveragePooling1D(1, strides=(15)))
    model.add(Flatten())
    model.add(Dense(80, activation="relu"))
    model.add(Dense(event_types, activation="softmax"))
    
    model.compile(loss="categorical_crossentropy", 
                  optimizer="adam", 
                  metrics=["acc"])
    return model

#model.fit(X_train, y_train, batch_size=32, epochs=epochs)

In [13]:
score = model.evaluate(X_test, y_test, batch_size=32)
score

NameError: name 'model' is not defined

In [15]:
print("Accuracy: %.2f%%" % (score[1]*100))

NameError: name 'score' is not defined

In [16]:
### this function is called inside the Kaggle kernel to train the model, test it on the Kaggle "test set",
### and then submit it.

def make_submission(subjects, batch_size):
    # initialize results lists
    Results = []
    subj_ids = []
    
    ### call the full stack on each subject. For each subject we load data, train the model, predict output,
    ### and append results
    for i in subjects:
        print("Training on subject",i)
        
        ### start by training the model
        model = simple_LSTM(look_back=look_back, n_features=n_features)
        
        ### for each subject, load all training data
        series_train = range(1,9)
        
        ### load training data for our subject
        X_train, y_train = read_data(i, series_train, path='train')
        
        ### process our data
        X_train = format_X(X_train)  
        y_train = format_y(y_train)
        
        ### add our lookback window
        X_train, y_train = add_lookback(X_train.iloc[::downsample].values,
                                        y_train.iloc[::downsample].values,
                                        look_back=look_back)
        
        
        ### crop training data to fit batch size
        train_crop_to_batch = (((len(X_train)) // batch_size) * batch_size)
        
        ### fit model to training data
        model.fit(X_train[0:train_crop_to_batch], y_train[0:train_crop_to_batch], batch_size=batch_size, epochs=epochs, callbacks=callbacks)
        
        ### move on to predicting results. we use series 9 and 10, the Kaggle "test"
        ### dataset
        for j in range(9,11):
            
            ### read in test data
            X_test, y_test = read_data(i, j, path='test')
            y_test = X_test['id'] # save the 'id' column for later
            
            ### format test data
            X_test = format_X(X_test)
            
            ### record the length of the test set for future reference
            test_length = len(X_test)
            #print("length of test dataset is",test_length) #useful for debugging
            
            # add look_back window to test data so we can predict it
            X_test, y_test_smol = add_lookback(X_test.iloc[::downsample].values,
                                        y_test.iloc[::downsample].values,
                                        look_back=look_back)
            #print("length of test dataset post-lookback is",len(X_test)) #useful for debugging
            
            ### crop predicting data to fit batch size
            test_crop_to_batch = ((len(X_test) // batch_size) * batch_size)
            #print("length of test dataset cropped-to-batch is",test_crop_to_batch) #useful for debugging
            
            ### predict results on the provided test data
            result = model.predict(X_test[0:test_crop_to_batch], batch_size=batch_size)
            result = np.array(result)
            #print("length of result is",len(result)) #useful for debugging
            
            ### because of our rear window, we have to pad out the beginning of the results with 0's
            for _ in range((look_back)*downsample):
                Results.append(np.array([[0,0,0,0,0,0]]))
            
            #print("length of Results with initial zero padding is",len(Results)) #useful for debugging
            
            ### because we downsampled, we have to fill in the missing values. A simple solution is to
            ### expand the results we have to fill the space.
            result = np.repeat(result,downsample, axis=0)
            for row in result:
                Results.append(np.array([row]))
            #print("length of result after multiplying is",len(result)) #useful for debugging
            
            for _ in range(test_length-len(result)-((look_back)*downsample)):
                Results.append(np.array([[0,0,0,0,0,0]]))
            #print("length of final 0s padding is",(test_length-len(result)-(look_back*downsample))) #useful for debugging
            
            subj_ids.append(y_test)
            
            ### calculate the total length of our results and our subject ids to make sure
            ### they match
            length_results = len(np.concatenate(Results))
            length_ids = len(np.concatenate(subj_ids))
            
            if length_results == length_ids:
                print("predictions for Series",j,"are the correct length!")
            else: print("Woops! Length of results is",length_results,"but ids length is",length_ids)
            
                
    print("printing submissions")
    submission_name = "simple_LSTM.csv"
    
    ### concatenate everything into a submission dataframe
    submission = pd.DataFrame(columns=dep_vars, data=np.concatenate(Results), index=np.concatenate(subj_ids))
    
    ### sort the dataframe into the correct order for submission
    submission = pd.concat([submission[submission.index.str.contains('series9')],
                            submission[submission.index.str.contains('series10')]])
    
    ### write the submission dataframe to csv
    submission.to_csv(submission_name,index_label="id",float_format='%.3f')
            
    
    

In [17]:
# call our function inside Kaggle to submit it.
make_submission(range(1,13),500)

Training on subject 1
Loading train data for subject 1
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Restoring model weights from the end of the best epoch
Epoch 00014: early stopping
Loading test data for subject 1
predictions for Series 9 are the correct length!
Loading test data for subject 1
predictions for Series 10 are the correct length!
Training on subject 2
Loading train data for subject 2
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Restoring model weights from the end of the best epoch
Epoch 00019: early stopping
Loading test data for subject 2
predictions for Series 9 are the correct length!
Loading test data for subject 2
predictions for Series 10 are the correct length!
Training on subject 