In [1]:
import numpy as np
from scipy import linalg

import pandas as pd

from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.cluster import KMeans

import matplotlib.pyplot as plt
import gc

%matplotlib inline

In [2]:
sample = 128 #hz
trial_time = 3 #s

origin_channel = 5 #5 channel eeg


In [3]:
def cov_mat(X):
    return np.matmul(X , X.T)/np.trace(np.matmul(X , X.T))

#计算每种样本的平均协方差矩阵
def average_norm_cov_mat(data):
    count = data.shape[0]
    sum_mat = np.zeros(shape=(data[0].shape[0] , data[0].shape[0]))
    
    for i in range(count):
        sum_mat += cov_mat(data[i])
    
    return sum_mat/count

def load_data(file_name):
    #pink and white
    
    temp = pd.read_csv(file_name)
    
    #删除前3秒和后2秒数据
    temp = temp.iloc[ : temp.shape[0] - 2*128] #后2秒 2s sample:128hz
    temp = temp.iloc[3*128 : ] #前3秒 3s sample:128hz
    
    for column in temp.columns:
        temp[column] = (temp[column] - temp[column].mean())/temp[column].std() #norm
    
    #5 channels data
    return temp[['AF3' , 'T7','Pz' , 'T8' , 'AF4']]

def sep(one_data , label):
    train_data = []
    train_labels = []
    
    size = sample*trial_time #384
    
    for i in range(one_data.shape[0] - size):
        train_data.append(one_data.iloc[i : i+size].values) #add one train sample
        train_labels.append(label) #corresponding label
    
    return train_data , train_labels

In [4]:
def concat_eeg_csv(file_names):
    #concat a big csv file
    first_file = load_data(file_name = file_names[0])
    
    file_names.remove(file_names[0])
    
    for file_name in file_names:
        first_file = first_file.append(load_data(file_name = file_name) , ignore_index = True)
    
    return first_file

In [5]:
#在进行聚类 坐标系绘图时 将m设置为1 进行绘制 CSP算法后特征变为2维

m = 2 #选取最高m个特征向量 m最小为1
def CSP_matrix(train_data_1 , train_data_2):
    #计算投影矩阵
    R_pink = average_norm_cov_mat(train_data_1)
    R_white = average_norm_cov_mat(train_data_2)
    
    R = R_pink + R_white
    
    eigenvalues , U0 = np.linalg.eig(R)

    sort_index = np.argsort(eigenvalues)
    sort_index = sort_index[:: -1]

    U0 = U0[: , sort_index]
    eigenvalues = sorted(eigenvalues , reverse=True)

    sigma = np.diag(eigenvalues)
    
    #白化矩阵
    P = np.matmul(np.diag(np.power(eigenvalues , -0.5)) , U0.T)
    
    S_pink = np.matmul(np.matmul(P , R_pink) , P.T)
    S_white = np.matmul(np.matmul(P , R_white) , P.T)
    
    E1 , US1 = np.linalg.eig(S_pink)
    E2 , US2 = np.linalg.eig(S_white)

    #E1+E2=I
    #US1=US2
    
    sort_index_1 = np.argsort(E1)
    sort_index_1 = sort_index_1[:: -1]
    
    E1 = sorted(E1 , reverse=True)
    US1 = US1[: , sort_index_1]
    
    W_1 = np.matmul(US1[: , 0:m].T , P) #前2列特征向量
    #=======================
    sort_index_2 = np.argsort(E2)
    sort_index_2 = sort_index_2[:: -1]
    
    E2 = sorted(E2 , reverse=True)
    US2 = US2[: , sort_index_2]
    
    W_2 = np.matmul(US2[: , 0:m].T , P) #前2列特征向量
    
    return W_1 , W_2

In [20]:
#使用白色
#data_1 = concat_eeg_csv(['data/train_1/fei_white_1.csv' , 'data/train_1/fei_white_2.csv'])
#data_2 = concat_eeg_csv(['data/train_1/sen_white_1.csv' , 'data/train_1/sen_white_2.csv'])

#使用粉色
data_1 = concat_eeg_csv(['data/train_1/fei_pink_1.csv' , 'data/train_1/fei_pink_2.csv'])
data_2 = concat_eeg_csv(['data/train_1/sen_pink_1.csv' , 'data/train_1/sen_pink_2.csv'])

train_data_1 , train_labels_1 = sep(data_1 , 0)
train_data_2 , train_labels_2 = sep(data_2 , 1)

train_data_1 = np.array(train_data_1)
train_data_2 = np.array(train_data_2)

train_labels_1 = np.array(train_labels_1)
train_labels_2 = np.array(train_labels_2)

train_data_1 = np.transpose(train_data_1 , axes=(0 , 2 , 1))
train_data_2 = np.transpose(train_data_2 , axes=(0 , 2 , 1))

In [40]:
print(train_data_1.shape , train_data_2.shape)

(8752, 5, 384) (24762, 5, 384)


In [None]:
#此处增加滤波阶段 此时一个小样本为3秒的数据量  此时大致认为信号为平稳的

In [22]:
W_1 , W_2 = CSP_matrix(train_data_1 , train_data_2)

In [23]:
W_1

array([[ 0.74631085, -1.28168709, -0.1997141 ,  0.8326652 ,  0.02130055],
       [ 1.20387098, -0.71146031,  1.5257984 , -1.4246383 ,  0.05634279]])

In [24]:
W_2

array([[ 0.79197144,  0.65626808,  0.01052797,  0.40990003, -1.10138692],
       [ 1.13924665, -0.21978192, -1.05780713, -0.99771489, -0.47911316]])

In [25]:
train_data = np.concatenate((train_data_1 , train_data_2))

train_labels = np.concatenate((train_labels_1 , train_labels_2))

train_data_features = []

In [26]:
def fetch_feature(xi):
    #m = 2 #2*m features
    #
    #Z = np.matmul(W , xi)
    #
    #Z_pre_m = Z[0:m ,:]
    #Z_after_m = Z[-m: , :]
    #
    #Zp = np.concatenate((Z_pre_m , Z_after_m))
    #
    #sigma_var_Zp = 0
    #
    #for i in range(2*m):
    #    sigma_var_Zp += np.var(Zp[i])
    #    
    #fi = [np.log(np.var(Zpi)/sigma_var_Zp) for Zpi in Zp]
    #
    #fi = np.array(fi)
    
    #======
    #特征维数较低时 例如低于100
    #fi = np.concatenate([np.matmul(W_1 , xi) , np.matmul(W_2 , xi)])
    #======
    
    
    '''特征提取方式1'''
    z1 = np.matmul(W_1 , xi) #2*384
    z2 = np.matmul(W_2 , xi) #2*384
    var_z1_1 = np.var(z1 , axis = 1)[0]
    var_z1_2 = np.var(z1 , axis = 1)[1]
    var_z2_1 = np.var(z2 , axis = 1)[0]
    var_z2_2 = np.var(z2 , axis = 1)[1]
    
    f1 = var_z1_1/(var_z1_1+var_z2_1)
    f2 = var_z2_1/(var_z1_1+var_z2_1)
    f3 = var_z1_2/(var_z1_2+var_z2_2)
    f4 = var_z2_2/(var_z1_2+var_z2_2)
    
    return np.array([f1 , f2 , f3 , f4])
    #==========
    
    '''特征提取方式2'''
    #z1 = np.matmul(W_1 , xi) #2*384
    #z2 = np.matmul(W_2 , xi) #2*384
    #return np.concatenate((z1 , z2))
    

In [27]:
for i in range(train_data.shape[0]):
    train_data_features.append(fetch_feature(train_data[i]))

train_data_features = np.array(train_data_features)

In [32]:
#train_data_features = np.transpose(train_data_features , axes=(0 , 2 , 1))

In [28]:
print(train_data_features.shape , train_labels.shape)

(33514, 4) (33514,)


In [29]:
svc = SVC(verbose = True)
svc.fit(train_data_features , train_labels)

[LibSVM]

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=True)

In [30]:
svc.score(train_data_features , train_labels)

0.9997016172345885

In [31]:
rf = RandomForestClassifier(verbose=False)

rf.fit(train_data_features , train_labels)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
            verbose=False, warm_start=False)

In [32]:
rf.score(train_data_features , train_labels)

1.0

In [33]:
adaboost = AdaBoostClassifier()
adaboost.fit(train_data_features , train_labels)

AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
          learning_rate=1.0, n_estimators=50, random_state=None)

In [34]:
adaboost.score(train_data_features , train_labels)

1.0

In [35]:
#==============
#==============
#val step
#白色
#data_1_val = load_data('data/val_1/fei_white_3.csv')
#data_2_val = load_data('data/val_1/sen_white_3.csv')

#粉色
data_1_val = load_data('data/val_1/fei_pink_3.csv')
data_2_val = load_data('data/val_1/sen_pink_3.csv')

val_data_1 , val_labels_1 = sep(data_1_val , 0)
val_data_2 , val_labels_2 = sep(data_2_val , 1)

val_data_1 = np.array(val_data_1)
val_data_2 = np.array(val_data_2)

val_labels_1 = np.array(val_labels_1)
val_labels_2 = np.array(val_labels_2)

val_data_1 = np.transpose(val_data_1 , axes=(0 , 2 , 1))
val_data_2 = np.transpose(val_data_2 , axes=(0 , 2 , 1))


#===================
#concat
val_data_features = []

val_data = np.concatenate((val_data_1 , val_data_2))
val_labels = np.concatenate((val_labels_1 , val_labels_2))


#===================
#fetch feature
for i in range(val_data.shape[0]):
    val_data_features.append(fetch_feature(val_data[i]))

val_data_features = np.array(val_data_features)

In [36]:
val_data_features.shape

(16507, 4)

In [37]:
#SVM
svc.score(val_data_features , val_labels)

0.9205791482401405

In [38]:
#RandomForest
rf.score(val_data_features , val_labels)

0.9183376749257891

In [39]:
adaboost.score(val_data_features , val_labels)

0.9153086569334222

In [63]:
'''简单测试一下'''
val_data_fei = []
for i in range(val_data_1.shape[0]):
    val_data_fei.append(fetch_feature(val_data_1[i]))
    
val_data_fei = np.array(val_data_fei)    
rf.score(val_data_fei , val_labels_1)
'''简单测试一下'''

0.881307929969104