# CNN for many subjects


Primero:

Cargamos los datos y los normalizamos. Para esto, primero pasamos un filtro pasa-bajo de 0 a 20hz, luego lo normalizamos a $N(0, 1)$

In [1]:
%pylab
%matplotlib inline

import sys
if ".." not in sys.path:
    sys.path.append("..")
import glob
import os
import mne
from keras import backend as K
from p300.preprocessing import normalize_subject, load_data

print("GPU's disponibles = {}".format(K.tensorflow_backend._get_available_gpus()))

CORPORA_PATH = "~/projects/corpora/P3Speller/P3Speller-old-y-datos/sets"

file_path = os.path.expanduser(CORPORA_PATH)
files = sorted(glob.glob(os.path.join(file_path, "*.set")))



Using matplotlib backend: TkAgg
Populating the interactive namespace from numpy and matplotlib


  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


GPU's disponibles = ['/job:localhost/replica:0/task:0/device:GPU:0']


Targets appear as 2 in the third column


We remove last channel as well

In [2]:
%%capture
%%time 

# this line is to avoid output
pretraining_no = 100

training_files = files[:pretraining_no]
testing_files = files[pretraining_no:]

X_train, y_train = load_data(training_files)
# Check that there are no overlaps!
assert(len([f for f in training_files if f in testing_files]) == 0)

In [3]:
from sklearn.utils import class_weight
y_t = y_train.reshape(-1)
class_weights = class_weight.compute_class_weight('balanced', np.unique(y_t), y_t)

class_weights = dict(zip([0,1], class_weights))

print("Class weights: {}".format(class_weights))
print(X_train.shape)

Class weights: {0: 0.6, 1: 3.0}
(196380, 14, 104, 1)


In [4]:
from keras.models import Sequential
from keras.layers import Conv1D, Conv2D, Flatten, Dense, Dropout

def create_model():
    model = Sequential()
    activation = 'relu'

    n_kernels = 12
    model.add(Conv2D(n_kernels, (14, 1), padding='same',
                    activation=activation, input_shape=(14, 104, 1)))
    model.add(Conv2D(5*n_kernels, (1, 13), padding='same',
                    activation=activation))
    model.add(Flatten())
    model.add(Dropout(0.45))
    model.add(Dense(128, activation=activation))
    model.add(Dense(1, activation='sigmoid'))

    model.compile(loss='binary_crossentropy', # using the cross-entropy loss function
                  optimizer='rmsprop', 
                  metrics=['accuracy']) # reporting the accuracy
    return model


model = create_model()

Instructions for updating:
keep_dims is deprecated, use keepdims instead
Instructions for updating:
keep_dims is deprecated, use keepdims instead


In [5]:
%%time
from keras.callbacks import ModelCheckpoint, EarlyStopping
checkpointer = ModelCheckpoint(filepath='models/model_cnn_1.h5', verbose=1, save_best_only=True)
early_stopping = EarlyStopping(monitor='val_loss', patience=5)

model.fit(
    X_train, y_train, epochs=40, 
    batch_size=256, class_weight=class_weights, validation_split=0.10,
    callbacks=[checkpointer, early_stopping]
)

Train on 176742 samples, validate on 19638 samples
Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
CPU times: user 2min 56s, sys: 41.3 s, total: 3min 37s
Wall time: 3min 28s


## Pretraining

Let's fix the first four layers, and retrain the dense layers only

In [6]:

def fix_layers(model, fixed_layers):
    for i in range(fixed_layers):
        model.layers[i].trainable = False
    
    model.compile(loss='binary_crossentropy',
              optimizer='rmsprop', 
              metrics=['accuracy'])
    
fix_layers(model, 4)    

[(l, "Trainable: {}".format(l.trainable)) for l in model.layers]

[(<keras.layers.convolutional.Conv2D at 0x7f71025dc0b8>, 'Trainable: False'),
 (<keras.layers.convolutional.Conv2D at 0x7f71025dc400>, 'Trainable: False'),
 (<keras.layers.core.Flatten at 0x7f71025dcac8>, 'Trainable: False'),
 (<keras.layers.core.Dropout at 0x7f710512eb00>, 'Trainable: False'),
 (<keras.layers.core.Dense at 0x7f710512ea58>, 'Trainable: True'),
 (<keras.layers.core.Dense at 0x7f710506bcf8>, 'Trainable: True')]

Now, the idea is to train each subject and fine tune the last layers.

In [23]:
%%capture output

from keras import backend as K
from keras.models import load_model
from sklearn.metrics import (
    precision_score, 
    recall_score, 
    roc_auc_score, 
    accuracy_score, 
    f1_score
)
from p300.preprocessing import normalize_subject, load_data, load_data_from_subject


def get_fine_tune_results(model_path, file):
    K.clear_session()
    
    model = load_model(model_path)
    
    fix_layers(model, 4)

    X_sub, y_sub = load_data([file])
    
    if X_sub is None:
        return 

    length = X_sub.shape[0] 
    limit = int(length / 2)
    X_sub_train, X_sub_test = X_sub[:limit], X_sub[limit:]
    y_sub_train, y_sub_test = y_sub[:limit], y_sub[limit:]
    
    model.fit(
        X_sub_train, y_sub_train, epochs=20, 
        batch_size=64, class_weight=class_weights, validation_split=0.1,
    )
    
    y_pred = model.predict_classes(X_sub_test)
    y_prob = model.predict(X_sub_test)

    precision = precision_score(y_sub_test, y_pred)
    recall = recall_score(y_sub_test, y_pred)
    auc = roc_auc_score(y_sub_test, y_prob)
    accuracy = accuracy_score(y_sub_test, y_pred)
    f1 = f1_score(y_sub_test, y_pred)
    
    subject_name = file.split("/")[-1].split(".")[0].split("_")[-1]
    K.clear_session()
    return {
        "subject": subject_name,
        "Accuracy": accuracy,
        "Precision": precision,
        "Recall": recall,
        "F1": f1
    }, model
    


model_path = 'models/model_cnn_1.h5'

all_results = []
for file in testing_files:
    results = get_fine_tune_results(model_path, file)
    if results is None:
        print("Skipping {}")
        continue
    all_results.append(results[0])


In [24]:
import pandas as pd

df = pd.DataFrame(all_results)
df.set_index("subject", inplace=True)
df.to_csv("results.csv")

df

Unnamed: 0_level_0,Accuracy,F1,Precision,Recall
subject,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
29257001,0.761111,0.304207,0.295597,0.313333
29273001,0.828889,0.541667,0.489247,0.606667
29426001,0.734343,0.340852,0.290598,0.412121
29789001,0.822222,0.480519,0.468354,0.493333
30243001,0.79899,0.387692,0.39375,0.381818
30261001,0.691111,0.199616,0.175676,0.231111
31056001,0.708148,0.245211,0.215488,0.284444
3109001,0.765657,0.411168,0.353712,0.490909
31102001,0.722222,0.273256,0.242268,0.313333
31397001,0.739394,0.27933,0.259067,0.30303


In [25]:
df.mean()

Accuracy     0.744343
F1           0.320001
Precision    0.296415
Recall       0.359035
dtype: float64

Accuracy     0.779232
F1           0.406924
Precision    0.373494
Recall       0.457576
dtype: float64

## Without Pretraining

In [26]:
%%capture output


def get_fine_tune_results(model_path, file):
    K.clear_session()
    
    model = create_model()

    X_sub, y_sub = load_data([file])
    
    if X_sub is None:
        return

    length = X_sub.shape[0] 
    limit = int(length / 2)
    X_sub_train, X_sub_test = X_sub[:limit], X_sub[limit:]
    y_sub_train, y_sub_test = y_sub[:limit], y_sub[limit:]
    
    model.fit(
        X_sub_train, y_sub_train, epochs=20, 
        batch_size=64, class_weight=class_weights, validation_split=0.01,
        callbacks=[early_stopping]
    )
    
    y_pred = model.predict_classes(X_sub_test)
    y_prob = model.predict(X_sub_test)

    precision = precision_score(y_sub_test, y_pred)
    recall = recall_score(y_sub_test, y_pred)
    auc = roc_auc_score(y_sub_test, y_prob)
    accuracy = accuracy_score(y_sub_test, y_pred)
    f1 = f1_score(y_sub_test, y_pred)
    
    subject_name = file.split("/")[-1].split(".")[0].split("_")[-1]
    K.clear_session()
    return {
        "subject": subject_name,
        "Accuracy": accuracy,
        "Precision": precision,
        "Recall": recall,
        "F1": f1
    }, model
    


model_path = 'models/model_cnn_1.h5'

nft_all_results = []
for file in testing_files:
    results = get_fine_tune_results(model_path, file)
    if results is None:
        print("Skipping {}")
        continue
    nft_all_results.append(results[0])


In [30]:
import pandas as pd

df_without = pd.DataFrame(nft_all_results)
df_without.set_index("subject", inplace=True)
df_without.to_csv("results_without.csv")

df_without.mean()

Accuracy     0.744764
F1           0.172782
Precision    0.266223
Recall       0.211941
dtype: float64

# Metadata

Files we have trained our CNN with:

In [31]:
print("Training files: ")
[path.split("/")[-1] for path in training_files]

Training files: 


['PruebasMuseo_10229001.set',
 'PruebasMuseo_10444001.set',
 'PruebasMuseo_10729001.set',
 'PruebasMuseo_10882001.set',
 'PruebasMuseo_10924001.set',
 'PruebasMuseo_11551001.set',
 'PruebasMuseo_11627001.set',
 'PruebasMuseo_11632001.set',
 'PruebasMuseo_11693001.set',
 'PruebasMuseo_12137001.set',
 'PruebasMuseo_12168001.set',
 'PruebasMuseo_12521001.set',
 'PruebasMuseo_12702001.set',
 'PruebasMuseo_12900001.set',
 'PruebasMuseo_13235001.set',
 'PruebasMuseo_13252001.set',
 'PruebasMuseo_13431001.set',
 'PruebasMuseo_13640002.set',
 'PruebasMuseo_13863001.set',
 'PruebasMuseo_14023001.set',
 'PruebasMuseo_1414001.set',
 'PruebasMuseo_1491001.set',
 'PruebasMuseo_14998001.set',
 'PruebasMuseo_15362001.set',
 'PruebasMuseo_15424001.set',
 'PruebasMuseo_15641001.set',
 'PruebasMuseo_16003001.set',
 'PruebasMuseo_1609001.set',
 'PruebasMuseo_16266001.set',
 'PruebasMuseo_1635001.set',
 'PruebasMuseo_16637001.set',
 'PruebasMuseo_16683001.set',
 'PruebasMuseo_16779001.set',
 'PruebasMuseo

In [32]:
print("Testing files: ")
[path.split("/")[-1] for path in testing_files]

Testing files: 


['PruebasMuseo_29257001.set',
 'PruebasMuseo_29273001.set',
 'PruebasMuseo_29426001.set',
 'PruebasMuseo_29789001.set',
 'PruebasMuseo_30243001.set',
 'PruebasMuseo_30261001.set',
 'PruebasMuseo_31056001.set',
 'PruebasMuseo_3109001.set',
 'PruebasMuseo_31102001.set',
 'PruebasMuseo_31397001.set',
 'PruebasMuseo_31777001.set',
 'PruebasMuseo_3195001.set',
 'PruebasMuseo_32459001.set',
 'PruebasMuseo_32505001.set',
 'PruebasMuseo_358001.set',
 'PruebasMuseo_36001.set',
 'PruebasMuseo_3703001.set',
 'PruebasMuseo_3800001.set',
 'PruebasMuseo_3942001.set',
 'PruebasMuseo_4305001.set',
 'PruebasMuseo_4824001.set',
 'PruebasMuseo_4949001.set',
 'PruebasMuseo_4971001.set',
 'PruebasMuseo_499001.set',
 'PruebasMuseo_5224001.set',
 'PruebasMuseo_5251001.set',
 'PruebasMuseo_5510001.set',
 'PruebasMuseo_5568001.set',
 'PruebasMuseo_5857001.set',
 'PruebasMuseo_630001.set',
 'PruebasMuseo_6694001.set',
 'PruebasMuseo_6830001.set',
 'PruebasMuseo_7330001.set',
 'PruebasMuseo_7385001.set',
 'Prueb