# CNN for many subjects


Primero:

Cargamos los datos y los normalizamos. Para esto, primero pasamos un filtro pasa-bajo de 0 a 20hz, luego lo normalizamos a $N(0, 1)$

In [1]:
%pylab
%matplotlib inline

import sys
if ".." not in sys.path:
    sys.path.append("..")
import glob
import os
import mne
from keras import backend as K
from p300.preprocessing import normalize_subject, load_data

print("GPU's disponibles = {}".format(K.tensorflow_backend._get_available_gpus()))

CORPORA_PATH = "~/projects/corpora/P3Speller/P3Speller-old-y-datos/sets"

file_path = os.path.expanduser(CORPORA_PATH)
files = glob.glob(os.path.join(file_path, "*.set"))



Using matplotlib backend: TkAgg
Populating the interactive namespace from numpy and matplotlib


  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


GPU's disponibles = ['/job:localhost/replica:0/task:0/device:GPU:0']


Targets appear as 2 in the third column


We remove last channel as well

In [2]:
%%capture

# this line is to avoid output
pretraining_no = 100

training_files = files[:pretraining_no]
testing_files = files[pretraining_no:]

X_train, y_train = load_data(training_files)
# Check that there are no overlaps!
assert(len([f for f in training_files if f in testing_files]) == 0)

In [3]:
from sklearn.utils import class_weight
y_t = y_train.reshape(-1)
class_weights = class_weight.compute_class_weight('balanced', np.unique(y_t), y_t)

class_weights = dict(zip([0,1], class_weights))

print("Class weights: {}".format(class_weights))
print(X_train.shape)

Class weights: {0: 0.6, 1: 3.0}
(198360, 14, 104, 1)


In [4]:
from keras.models import Sequential
from keras.layers import Conv1D, Conv2D, Flatten, Dense, Dropout

model = Sequential()
activation = 'relu'

n_kernels = 12
model.add(Conv2D(n_kernels, (14, 1), padding='same',
                activation=activation, input_shape=(14, 104, 1)))
model.add(Conv2D(5*n_kernels, (1, 13), padding='same',
                activation=activation))
model.add(Flatten())
model.add(Dropout(0.45))
model.add(Dense(128, activation=activation))
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy', # using the cross-entropy loss function
              optimizer='rmsprop', 
              metrics=['accuracy']) # reporting the accuracy


Instructions for updating:
keep_dims is deprecated, use keepdims instead
Instructions for updating:
keep_dims is deprecated, use keepdims instead


In [5]:
%%time
from keras.callbacks import ModelCheckpoint, EarlyStopping
checkpointer = ModelCheckpoint(filepath='models/model_cnn_1.h5', verbose=1, save_best_only=True)
early_stopping = EarlyStopping(monitor='val_loss', patience=5)

model.fit(
    X_train, y_train, epochs=40, 
    batch_size=256, class_weight=class_weights, validation_split=0.10,
    callbacks=[checkpointer, early_stopping]
)

Train on 178524 samples, validate on 19836 samples
Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
CPU times: user 2min 41s, sys: 34.7 s, total: 3min 15s
Wall time: 3min 15s


Let's fix the first four layers.

In [6]:

def fix_layers(model, fixed_layers):
    for i in range(fixed_layers):
        model.layers[i].trainable = False
    
    model.compile(loss='binary_crossentropy',
              optimizer='rmsprop', 
              metrics=['accuracy'])
    
fix_layers(model, 4)    

[(l, "Trainable: {}".format(l.trainable)) for l in model.layers]

[(<keras.layers.convolutional.Conv2D at 0x7f8fc9599a20>, 'Trainable: False'),
 (<keras.layers.convolutional.Conv2D at 0x7f8fc95999b0>, 'Trainable: False'),
 (<keras.layers.core.Flatten at 0x7f8fc9599cf8>, 'Trainable: False'),
 (<keras.layers.core.Dropout at 0x7f8fc955fe80>, 'Trainable: False'),
 (<keras.layers.core.Dense at 0x7f8fc955fe48>, 'Trainable: True'),
 (<keras.layers.core.Dense at 0x7f8fc954b320>, 'Trainable: True')]

Now, the idea is to train each subject and fine tune the last layers.

In [16]:
%%capture output
from keras import backend as K
from keras.models import load_model
from sklearn.metrics import (
    precision_score, 
    recall_score, 
    roc_auc_score, 
    accuracy_score, 
    f1_score
)
from p300.preprocessing import normalize_subject, load_data, load_data_from_subject

file = files[130]

def get_fine_tune_results(model_path, file):
    K.clear_session()
    
    model = load_model(model_path)
    
    fix_layers(model, 4)

    X_sub, y_sub = load_data([file])

    length = X_sub.shape[0] 
    limit = int(length / 2)
    X_sub_train, X_sub_test = X_sub[:limit], X_sub[limit:]
    y_sub_train, y_sub_test = y_sub[:limit], y_sub[limit:]
    
    model.fit(
        X_sub_train, y_sub_train, epochs=20, 
        batch_size=64, class_weight=class_weights, validation_split=0.01,
        callbacks=[early_stopping]
    )
    
    y_pred = model.predict_classes(X_sub_test)
    y_prob = model.predict(X_sub_test)

    precision = precision_score(y_sub_test, y_pred)
    recall = recall_score(y_sub_test, y_pred)
    auc = roc_auc_score(y_sub_test, y_prob)
    accuracy = accuracy_score(y_sub_test, y_pred)
    f1 = f1_score(y_sub_test, y_pred)
    
    subject_name = file.split("/")[-1].split(".")[0].split("_")[-1]
    K.clear_session()
    return {
        "subject": subject_name,
        "Accuracy": accuracy,
        "Precision": precision,
        "Recall": recall,
        "F1": f1
    }, model
    


model_path = 'models/model_cnn_1.h5'

all_results = []
for file in testing_files:
    try:
        all_results.append(get_fine_tune_results(model_path, file)[0])
    except Exception as e:
        # if file is not ok, discard
        print("="*80)
        print("="*80)
        print(e)
        print("="*80)
        print("="*80)


In [22]:
import pandas as pd

df = pd.DataFrame(all_results)
df.set_index("subject", inplace=True)
df.to_csv("results.csv")

df

Unnamed: 0_level_0,Accuracy,F1,Precision,Recall
subject,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
10924001,0.746825,0.405959,0.333333,0.519048
22650001,0.589683,0.262482,0.187373,0.438095
11693001,0.764646,0.371968,0.334951,0.418182
26715001,0.677778,0.372294,0.275641,0.573333
22109001,0.715556,0.396226,0.306569,0.56
31397001,0.753535,0.290698,0.27933,0.30303
20668001,0.764444,0.442105,0.365217,0.56
29426001,0.663636,0.335329,0.25,0.509091
29257001,0.747778,0.34957,0.306533,0.406667
10229001,0.683333,0.299754,0.237354,0.406667


In [29]:
df.mean()

Accuracy     0.712260
F1           0.357327
Precision    0.294537
Recall       0.469452
dtype: float64

# Metadata

Files we have trained our CNN with:

In [28]:
print("Training files: ")
[path.split("/")[-1] for path in training_files]

Training files: 


['PruebasMuseo_27030001.set',
 'PruebasMuseo_26636001.set',
 'PruebasMuseo_21011001.set',
 'PruebasMuseo_13235001.set',
 'PruebasMuseo_16003001.set',
 'PruebasMuseo_19561001.set',
 'PruebasMuseo_24227001.set',
 'PruebasMuseo_1491001.set',
 'PruebasMuseo_29789001.set',
 'PruebasMuseo_27131001.set',
 'PruebasMuseo_6694001.set',
 'PruebasMuseo_13863001.set',
 'PruebasMuseo_24053001.set',
 'PruebasMuseo_17005001.set',
 'PruebasMuseo_14998001.set',
 'PruebasMuseo_5857001.set',
 'PruebasMuseo_22072001.set',
 'PruebasMuseo_25871001.set',
 'PruebasMuseo_5510001.set',
 'PruebasMuseo_6830001.set',
 'PruebasMuseo_26721001.set',
 'PruebasMuseo_24101001.set',
 'PruebasMuseo_27157001.set',
 'PruebasMuseo_358001.set',
 'PruebasMuseo_12702001.set',
 'PruebasMuseo_9689001.set',
 'PruebasMuseo_18967001.set',
 'PruebasMuseo_16683001.set',
 'PruebasMuseo_11627001.set',
 'PruebasMuseo_31056001.set',
 'PruebasMuseo_28005001.set',
 'PruebasMuseo_2089001.set',
 'PruebasMuseo_27846001.set',
 'PruebasMuseo_1252