In [1]:
import numpy as np
from scipy import linalg

import pandas as pd

from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.cluster import KMeans

import matplotlib.pyplot as plt
import gc

%matplotlib inline

In [2]:
sample = 128 #hz
trial_time = 3 #s

origin_channel = 5 #5 channel eeg


In [3]:
def cov_mat(X):
    return np.matmul(X , X.T)/np.trace(np.matmul(X , X.T))

#计算每种样本的平均协方差矩阵
def average_norm_cov_mat(data):
    count = data.shape[0]
    sum_mat = np.zeros(shape=(data[0].shape[0] , data[0].shape[0]))
    
    for i in range(count):
        sum_mat += cov_mat(data[i])
    
    return sum_mat/count

def load_data(file_name):
    #pink and white
    
    temp = pd.read_csv(file_name)
    
    #删除前3秒和后2秒数据
    temp = temp.iloc[ : temp.shape[0] - 2*128] #后2秒 2s sample:128hz
    temp = temp.iloc[3*128 : ] #前3秒 3s sample:128hz
    
    for column in temp.columns:
        temp[column] = (temp[column] - temp[column].mean())/temp[column].std() #norm
    
    #5 channels data
    return temp[['AF3' , 'T7','Pz' , 'T8' , 'AF4']]

def sep(one_data , label):
    train_data = []
    train_labels = []
    
    size = sample*trial_time #384
    
    for i in range(one_data.shape[0] - size):
        train_data.append(one_data.iloc[i : i+size].values) #add one train sample
        train_labels.append(label) #corresponding label
    
    return train_data , train_labels

In [4]:
def concat_eeg_csv(file_names):
    #concat a big csv file
    first_file = load_data(file_name = file_names[0])
    
    file_names.remove(file_names[0])
    
    for file_name in file_names:
        first_file = first_file.append(load_data(file_name = file_name) , ignore_index =  True)
    
    return first_file

In [11]:
m = 2 #选取最高m个特征向量 m最小为1
def CSP_matrix(train_data_1 , train_data_2):
    #计算投影矩阵
    R_pink = average_norm_cov_mat(train_data_1)
    R_white = average_norm_cov_mat(train_data_2)
    
    R = R_pink + R_white
    
    eigenvalues , U0 = np.linalg.eig(R)

    sort_index = np.argsort(eigenvalues)
    sort_index = sort_index[:: -1]

    U0 = U0[: , sort_index]
    eigenvalues = sorted(eigenvalues , reverse=True)

    sigma = np.diag(eigenvalues)
    
    #白化矩阵
    P = np.matmul(np.diag(np.power(eigenvalues , -0.5)) , U0.T)
    
    S_pink = np.matmul(np.matmul(P , R_pink) , P.T)
    S_white = np.matmul(np.matmul(P , R_white) , P.T)
    
    E1 , US1 = np.linalg.eig(S_pink)
    E2 , US2 = np.linalg.eig(S_white)

    #E1+E2=I
    #US1=US2
    
    sort_index_1 = np.argsort(E1)
    sort_index_1 = sort_index_1[:: -1]
    
    E1 = sorted(E1 , reverse=True)
    US1 = US1[: , sort_index_1]
    
    W_1 = np.matmul(US1[: , 0:m].T , P) #前2列特征向量
    #=======================
    sort_index_2 = np.argsort(E2)
    sort_index_2 = sort_index_2[:: -1]
    
    E2 = sorted(E2 , reverse=True)
    US2 = US2[: , sort_index_2]
    
    W_2 = np.matmul(US2[: , 0:m].T , P) #前2列特征向量
    
    return W_1 , W_2

In [12]:
data_1 = concat_eeg_csv(['../input/white1.csv' , '../input/long_white_recording.csv' , '../input/white2.csv' , '../input/white3.csv'])
data_2 = concat_eeg_csv(['../input/pink1.csv' , '../input/long_pink_recording.csv' , '../input/pink2.csv' , '../input/pink3.csv'])

#单一数据集时
#color_1 = 'long_white_recording.csv'
#color_2 = 'long_pink_recording.csv'
#data_pink = load_data(file_name = 'test_raw_eeg/test_color1/'+color_1)
#data_white = load_data(file_name = 'test_raw_eeg/test_color1/'+color_2)

train_data_1 , train_labels_1 = sep(data_1 , 0)
train_data_2 , train_labels_2 = sep(data_2 , 1)

train_data_1 = np.array(train_data_1)
train_data_2 = np.array(train_data_2)

train_labels_1 = np.array(train_labels_1)
train_labels_2 = np.array(train_labels_2)

train_data_1 = np.transpose(train_data_1 , axes=(0 , 2 , 1))
train_data_2 = np.transpose(train_data_2 , axes=(0 , 2 , 1))

In [None]:
W_1 , W_2 = CSP_matrix(train_data_1 , train_data_2)

In [16]:
train_data = np.concatenate((train_data_1 , train_data_2))

train_labels = np.concatenate((train_labels_1 , train_labels_2))

train_data_features = []

In [17]:
def fetch_feature(xi):
    #m = 2 #2*m features
    #
    #Z = np.matmul(W , xi)
    #
    #Z_pre_m = Z[0:m ,:]
    #Z_after_m = Z[-m: , :]
    #
    #Zp = np.concatenate((Z_pre_m , Z_after_m))
    #
    #sigma_var_Zp = 0
    #
    #for i in range(2*m):
    #    sigma_var_Zp += np.var(Zp[i])
    #    
    #fi = [np.log(np.var(Zpi)/sigma_var_Zp) for Zpi in Zp]
    #
    #fi = np.array(fi)
    
    #======
    #特征维数较低时 例如低于100
    #fi = np.concatenate([np.matmul(W_1 , xi) , np.matmul(W_2 , xi)])
    #======
    
    z1 = np.matmul(W_1 , xi) #2*384
    z2 = np.matmul(W_2 , xi) #2*384
    
    #特征提取方式1
    #var_z1_1 = np.var(z1 , axis = 1)[0]
    #var_z1_2 = np.var(z1 , axis = 1)[1]
    #var_z2_1 = np.var(z2 , axis = 1)[0]
    #var_z2_2 = np.var(z2 , axis = 1)[1]
    #
    #f1 = var_z1_1/(var_z1_1+var_z2_1)
    #f2 = var_z2_1/(var_z1_1+var_z2_1)
    #f3 = var_z1_2/(var_z1_2+var_z2_2)
    #f4 = var_z2_2/(var_z1_2+var_z2_2)
    #
    #return np.array([f1 , f2 , f3 , f4])
    #==========
    return np.concatenate((z1 , z2))
    

In [18]:
for i in range(train_data.shape[0]):
    train_data_features.append(fetch_feature(train_data[i]))

train_data_features = np.array(train_data_features)

In [23]:
train_data_features = np.transpose(train_data_features , axes=(0 , 2 , 1))

In [24]:
train_data_features.shape

(60332, 384, 4)

In [29]:
#==============
#==============
#val step
color_val_1 = 'whitev1.csv'
color_val_2 = 'pinkv1.csv'

data_1_val = load_data('../input/'+ color_val_1)
data_2_val = load_data('../input/'+ color_val_2)

val_data_1 , val_labels_1 = sep(data_1_val , 0)
val_data_2 , val_labels_2 = sep(data_2_val , 1)

val_data_1 = np.array(val_data_1)
val_data_2 = np.array(val_data_2)

val_labels_1 = np.array(val_labels_1)
val_labels_2 = np.array(val_labels_2)

val_data_1 = np.transpose(val_data_1 , axes=(0 , 2 , 1))
val_data_2 = np.transpose(val_data_2 , axes=(0 , 2 , 1))


#===================
#concat
val_data_features = []

val_data = np.concatenate((val_data_1 , val_data_2))
val_labels = np.concatenate((val_labels_1 , val_labels_2))


#===================
#fetch feature
for i in range(val_data.shape[0]):
    val_data_features.append(fetch_feature(val_data[i]))

val_data_features = np.array(val_data_features)

In [30]:
val_data_features = np.transpose(val_data_features , axes=(0 , 2 , 1))

In [31]:
val_data_features.shape

(12812, 384, 4)

In [43]:
#========
#========
#ANN
from keras.models import Sequential
from keras.layers import Dense , Dropout , Conv2D , MaxPooling2D , Reshape , BatchNormalization , Flatten
from keras.regularizers import l2

In [44]:
keep_prob = 0.5

model = Sequential()
model.add(Dense(units=30 , input_shape=(sample*trial_time , 4) , activation='elu' , kernel_regularizer=l2()))
model.add(Reshape((30 , sample*trial_time , 1)))

#conv pool 1
model.add(Conv2D(60 , kernel_size=(1,15) , strides=(1,3) , padding='valid' , activation='elu' , kernel_regularizer=l2()))
model.add(MaxPooling2D(pool_size=(1,2) , strides=(1,2) , padding='valid'))

#dropout
model.add(Dropout(1 - keep_prob))
#batch norm
model.add(BatchNormalization())

#conv pool 2
model.add(Conv2D(60 , kernel_size=(1,4) , strides=(1,3) , padding='valid' , activation='elu' , kernel_regularizer=l2()))
model.add(MaxPooling2D(pool_size=(1,2) , strides=(1,2) , padding='valid'))
#dropout
model.add(Dropout(1 - keep_prob))
#batch norm
model.add(BatchNormalization())

#conv 3
model.add(Conv2D(60 , kernel_size=(30,1) , strides=(1,3) , padding='valid' , activation='elu' , kernel_regularizer=l2()))
#dropout
model.add(Dropout(1 - keep_prob))
#batch norm
model.add(BatchNormalization())

#conv pool 4
model.add(Conv2D(90 , kernel_size=(1,3) , strides=(1,1) , padding='same' , activation='elu' , kernel_regularizer=l2()))
model.add(MaxPooling2D(pool_size=(1,2) , strides=(1,2) , padding='valid'))
#dropout
model.add(Dropout(1 - keep_prob))
#batch norm
model.add(BatchNormalization())

#conv pool 5
model.add(Conv2D(120 , kernel_size=(1,3) , strides=(1,1) , padding='same' , activation='elu' , kernel_regularizer=l2()))
model.add(MaxPooling2D(pool_size=(1,2) , strides=(1,2) , padding='valid'))
#dropout
model.add(Dropout(1 - keep_prob))
#batch norm
model.add(BatchNormalization())

#flatten
model.add(Flatten())

#fc
model.add(Dense(units=24 , activation='elu' , kernel_regularizer=l2()))

#extra for cluster layer
#model.add(Dense(units=12 , activation='elu' , kernel_regularizer=l2() , name = 'feature1'))
#model.add(Dense(units=4 , activation='elu' , kernel_regularizer=l2() , name = 'feature2'))
#model.add(Dense(units=2 ,  activation='elu' , kernel_regularizer=l2() , name = 'feature3'))

#fc last layer
model.add(Dense(units=2 , activation='softmax'))

In [45]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_4 (Dense)              (None, 384, 30)           150       
_________________________________________________________________
reshape_3 (Reshape)          (None, 30, 384, 1)        0         
_________________________________________________________________
conv2d_7 (Conv2D)            (None, 30, 124, 60)       960       
_________________________________________________________________
max_pooling2d_6 (MaxPooling2 (None, 30, 62, 60)        0         
_________________________________________________________________
dropout_7 (Dropout)          (None, 30, 62, 60)        0         
_________________________________________________________________
batch_normalization_6 (Batch (None, 30, 62, 60)        240       
_________________________________________________________________
conv2d_8 (Conv2D)            (None, 30, 20, 60)        14460     
__________

In [None]:
model.compile(optimizer='adam' , loss='binary_crossentropy' , metrics=['accuracy'])

In [None]:
model.fit(train_data_features , train_labels , batch_size=16 , epochs=100 , shuffle=True , validation_data=(val_data_features , val_labels))

In [None]:
val_data_features.shape