In [1]:
import pandas as pd
import numpy as np
import pyeeg as pe
from sklearn.model_selection import StratifiedKFold
import lightgbm as lgb
from sklearn import metrics
from sklearn import svm
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
import joblib
import time
import random
#pd.set_option('display.max_columns', None)
#pd.set_option('display.max_rows', None)

In [13]:
# Notebook-wide configuration shared by the feature-extraction and modelling cells.
n_splits=5 # Number of cross-validation folds
data_num = 7 # Number of recording files read as data0.txt ... data6.txt
band = [4,8,12,16,25,45] # Band edges in Hz; 6 edges delimit 5 bands
window_size = 1000 # Samples per analysis window: 2 s at 500 Hz
step_size = 10 # Window hop in samples: 10 / 500 Hz = 0.02 s between updates
sample_rate = 500 # Sampling rate in Hz
features_num = 11 # Feature columns per row (5 band features + 6 statistics); the label follows them
# Column names in the order the features are written: band features first, then the statistics.
feature_name = ['4-8Hz theta','8-12Hz alpha','12-16Hz lowbeta','16-25 highbeta','25-45Hz gamma','skew','std','mean','max-min','max','min']

In [3]:
# Read every recording, drop the rows whose RAW value is missing, and keep
# both the cleaned frame (`data`) and its RAW column as a numpy array (`data_raw`).
# Disabled experiment: remapping MOOD label 0 to -1.
data = []
data_raw = []
for i in range(data_num):
    frame = pd.read_csv("data"+str(i)+".txt", sep=',')
    frame = frame[frame['RAW'].notna()]
    data.append(frame)
    data_raw.append(np.array(frame['RAW']))

In [4]:
def FFT(data_,num_,band=[4,8,12,16,25,45],window_size=1000,step_size=50,sample_rate=500):
    """Extract sliding-window features from one raw recording and save them.

    Every window contributes one output row made of:
      * the per-band values taken from the second array returned by
        ``pe.bin_power`` (the band power ratio according to pyeeg's docs),
      * six statistics of the raw window: skewness, standard deviation,
        mean, max - min, max and min,
      * the label of the window's last sample, as the final column.

    Parameters
    ----------
    data_ : 1-D numpy array
        Raw samples of one recording.
    num_ : int
        Recording index; selects ``data[num_]`` for the labels and names the
        output file ``output<num_>.npy``.
    band : list of numbers
        Band edges in Hz passed to ``pe.bin_power``.
    window_size : int
        Number of samples per window.
    step_size : int
        Number of samples the window advances between rows.
    sample_rate : int
        Sampling rate in Hz passed to ``pe.bin_power``.

    Notes
    -----
    Relies on the notebook globals ``features_num`` (row width minus the
    label) and ``data`` (the cleaned frames, row-aligned with ``data_``).
    The label is read from column position 11 of ``data[num_]`` --
    presumably the MOOD column; TODO confirm against the file schema.
    Nothing is returned; the feature matrix is written with ``np.save``.
    """
    n_samples = data_.shape[0]
    # Number of complete windows; clamped so a recording shorter than one
    # window yields an empty matrix instead of a negative array shape.
    out_shape = max(0, (n_samples - window_size) // step_size + 1)
    output = np.zeros(shape=(out_shape, features_num + 1))
    labels = data[num_]
    # Iterate over exactly `out_shape` windows. The previous
    # `start + window_size < n_samples` test skipped the window ending on the
    # last sample, leaving an all-zero row (with label 0) in the saved data.
    for num in range(out_shape):
        start = num * step_size
        stop = start + window_size
        X = data_[start:stop]
        Y = pe.bin_power(X, band, sample_rate)
        x_max, x_min = X.max(), X.min()
        output_each = list(Y[1])
        # Append the statistical features in the order listed in the docstring.
        output_each.extend([pd.Series(X).skew(), X.std(), X.mean(), x_max - x_min, x_max, x_min])
        # The window is labelled by its last sample.
        output_each.append(labels.iloc[stop - 1, 11])
        output[num] = output_each
    np.save('output'+str(num_), output)

In [14]:
# Extract and save the feature matrix of every recording using the notebook configuration.
for rec_idx in range(data_num):
    FFT(
        data_raw[rec_idx],
        rec_idx,
        band=band,
        window_size=window_size,
        step_size=step_size,
        sample_rate=sample_rate,
    )

In [15]:
# Reload the per-recording feature matrices saved as output<i>.npy.
output=[]
for i in range(data_num):
    # Pass the path so NumPy opens and closes the file itself (the previous
    # open(...) handle was never closed). The matrices are plain float arrays,
    # so pickle support is not needed and is left disabled.
    output.append(np.load('output'+str(i)+'.npy', allow_pickle=False))
    print(output[-1].shape)

(39355, 12)
(31187, 12)
(22766, 12)
(40016, 12)
(32668, 12)
(22496, 12)
(30213, 12)


In [16]:
# Stack the feature matrices of all recordings row-wise into a single array.
output_all = np.concatenate([output[k] for k in range(data_num)])
output_all.shape

(218701, 12)

In [17]:
# Sanity check: True if any entry of the combined matrix is NaN.
np.any(np.isnan(output_all))

False

# 1.LightGBM

In [9]:
# LightGBM training parameters for the binary classifier.
# 'num_iterations' is only an upper bound on boosting rounds: with
# 'early_stopping_rounds' set, training halts once the validation metric has
# not improved for that many rounds.
params = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'num_iterations': 10000,
    'early_stopping_rounds': 100,
}
# Disabled alternative: 'n_estimators': 2048
# (kept as a comment; a bare string literal here would be echoed as cell output)

"\n'n_estimators':2048,\n"

In [18]:
# Stratified K-fold training of LightGBM on all feature columns.
# Feature scaling with StandardScaler is currently disabled.
# NOTE(review): rows come from windows of `window_size` samples advanced by
# `step_size` samples, so neighbouring rows share most of their raw samples.
# A shuffled K-fold places such near-duplicates in both the training and the
# validation part, so the scores below are likely optimistic -- consider
# splitting by recording or by contiguous blocks; confirm.
X = output_all[:, :features_num]   # feature columns
y = output_all[:, features_num]    # label column follows the features
fold = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=41)
models = []
oof = np.zeros(len(X))             # out-of-fold hard predictions
for index, (train_idx, val_idx) in enumerate(fold.split(X, y)):
    X_val, y_val = X[val_idx], y[val_idx]
    train_set = lgb.Dataset(X[train_idx], y[train_idx])
    val_set = lgb.Dataset(X_val, y_val)
    model = lgb.train(
        params,
        train_set,
        valid_sets=[train_set, val_set],
        verbose_eval=100,
        feature_name=feature_name,
    )
    models.append(model)
    # Turn the predicted probabilities into 0/1 labels.
    val_pred = np.round(model.predict(X_val))
    oof[val_idx] = val_pred
    print(index, 'val f1', metrics.f1_score(y_val, val_pred, average=None))



[LightGBM] [Info] Number of positive: 36124, number of negative: 138836
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2805
[LightGBM] [Info] Number of data points in the train set: 174960, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.206470 -> initscore=-1.346336
[LightGBM] [Info] Start training from score -1.346336
Training until validation scores don't improve for 100 rounds
[100]	training's binary_logloss: 0.30463	valid_1's binary_logloss: 0.310034
[200]	training's binary_logloss: 0.242426	valid_1's binary_logloss: 0.25153
[300]	training's binary_logloss: 0.198933	valid_1's binary_logloss: 0.211214
[400]	training's binary_logloss: 0.166225	valid_1's binary_logloss: 0.18082
[500]	training's binary_logloss: 0.14312	valid_1's binary_logloss: 0.159814
[600]	training's binary_logloss: 0.123412	valid_1's binary_logloss: 0.14198
[700]	training's binary_l

[3400]	training's binary_logloss: 0.00818883	valid_1's binary_logloss: 0.0472645
[3500]	training's binary_logloss: 0.00751406	valid_1's binary_logloss: 0.0470015
[3600]	training's binary_logloss: 0.00687554	valid_1's binary_logloss: 0.0467721
[3700]	training's binary_logloss: 0.00632437	valid_1's binary_logloss: 0.0464371
[3800]	training's binary_logloss: 0.00585207	valid_1's binary_logloss: 0.0461676
[3900]	training's binary_logloss: 0.0053655	valid_1's binary_logloss: 0.0458846
[4000]	training's binary_logloss: 0.0048891	valid_1's binary_logloss: 0.0456605
[4100]	training's binary_logloss: 0.00451578	valid_1's binary_logloss: 0.0454402
[4200]	training's binary_logloss: 0.00415206	valid_1's binary_logloss: 0.0452474
[4300]	training's binary_logloss: 0.00384326	valid_1's binary_logloss: 0.0451724
[4400]	training's binary_logloss: 0.00354501	valid_1's binary_logloss: 0.0451439
Early stopping, best iteration is:
[4346]	training's binary_logloss: 0.00370371	valid_1's binary_logloss: 0.045

[2300]	training's binary_logloss: 0.0212676	valid_1's binary_logloss: 0.0539308
[2400]	training's binary_logloss: 0.019527	valid_1's binary_logloss: 0.0526305
[2500]	training's binary_logloss: 0.0178369	valid_1's binary_logloss: 0.0512483
[2600]	training's binary_logloss: 0.0163754	valid_1's binary_logloss: 0.0501297
[2700]	training's binary_logloss: 0.0151154	valid_1's binary_logloss: 0.0492617
[2800]	training's binary_logloss: 0.0138664	valid_1's binary_logloss: 0.0482816
[2900]	training's binary_logloss: 0.0126774	valid_1's binary_logloss: 0.0473862
[3000]	training's binary_logloss: 0.0116046	valid_1's binary_logloss: 0.0465057
[3100]	training's binary_logloss: 0.0105965	valid_1's binary_logloss: 0.0457994
[3200]	training's binary_logloss: 0.0097324	valid_1's binary_logloss: 0.0452195
[3300]	training's binary_logloss: 0.00900663	valid_1's binary_logloss: 0.0449605
[3400]	training's binary_logloss: 0.00828803	valid_1's binary_logloss: 0.0445476
[3500]	training's binary_logloss: 0.007

In [19]:
# Out-of-fold F1 per class. f1_score expects (y_true, y_pred); the arguments
# were previously swapped. F1 is symmetric in the two, so the printed value is
# unchanged, but the order now matches the per-fold evaluation.
print('oof f1', metrics.f1_score(y, oof, average=None))

oof f1 [0.99126475 0.96565953]


In [20]:
# Persist the booster of every fold as lgb<fold>.model.
for fold_idx in range(n_splits):
    model_path = 'lgb' + str(fold_idx) + '.model'
    joblib.dump(models[fold_idx], model_path)

In [21]:
# Feature names as stored in the first fold's booster. In the recorded output
# the spaces of `feature_name` appear replaced by underscores.
models[0].feature_name()

['4-8Hz_theta',
 '8-12Hz_alpha',
 '12-16Hz_lowbeta',
 '16-25_highbeta',
 '25-45Hz_gamma',
 'skew',
 'std',
 'mean',
 'max-min',
 'max',
 'min']

In [22]:
# Build one importance table per fold (feature name, importance score, fold
# index) and stack them into a single long frame.
ret = [
    pd.DataFrame({
        'name': model.feature_name(),
        'score': model.feature_importance(),
        'fold': index,
    })
    for index, model in enumerate(models)
]
df = pd.concat(ret)

In [23]:
# Average each feature's importance over the folds and list the highest first.
df = (
    df.groupby('name', as_index=False)['score']
      .mean()
      .sort_values(['score'], ascending=False)
)
df

Unnamed: 0,name,score
5,max,16460.8
8,min,15842.8
10,std,13423.4
9,skew,13280.2
6,max-min,12679.0
2,25-45Hz_gamma,11249.2
3,4-8Hz_theta,10395.4
4,8-12Hz_alpha,10385.8
7,mean,10168.0
0,12-16Hz_lowbeta,10105.4


# 2.Random Forest Classification

In [84]:
# Stratified K-fold random-forest classifier on the band features only.
# Feature scaling with StandardScaler is currently disabled.
n_band_features = len(band) - 1   # 6 band edges -> 5 band columns at the front of each row
x = output_all[:, 0:n_band_features]
# The label is stored after the `features_num` feature columns. The previous
# hard-coded column 5 is the 'skew' feature now that the statistics are
# appended after the band features, so it can no longer serve as the target.
y = output_all[:, features_num]
fold = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=41)
models = []
oof = np.zeros(len(x))   # out-of-fold predictions
for index, (train_idx, val_idx) in enumerate(fold.split(x, y)):
    train_set_x, train_set_y = x[train_idx], y[train_idx]
    val_set_x, val_set_y = x[val_idx], y[val_idx]
    # Seeded so that repeated runs build the same forests.
    rfc = RandomForestClassifier(n_estimators=512, n_jobs=-1, random_state=41)
    rfc.fit(train_set_x, train_set_y)
    models.append(rfc)
    val_pred = rfc.predict(val_set_x)
    oof[val_idx] = val_pred
    print(index, 'val f1', metrics.f1_score(val_set_y, val_pred, average=None))

0 val f1 [0.98469662 0.95932561]
1 val f1 [0.98451113 0.95882894]
2 val f1 [0.98433598 0.95830987]
3 val f1 [0.98452014 0.95879028]
4 val f1 [0.9842591  0.95812542]


In [86]:
# Out-of-fold F1 per class. f1_score expects (y_true, y_pred); the arguments
# were previously swapped. F1 is symmetric in the two, so the printed value is
# unchanged, but the order now matches the per-fold evaluation.
print('oof f1', metrics.f1_score(y, oof, average=None))

oof f1 [0.98446458 0.95867614]


In [55]:
# Print the feature-importance vector of each fold's fitted forest.
for forest in models:
    print(forest.feature_importances_)

[0.22514058 0.19445062 0.17899713 0.17360194 0.22780974]
[0.22391495 0.19484324 0.18136518 0.17383927 0.22603736]
[0.22422827 0.1946285  0.17959088 0.17231861 0.22923375]
[0.22357922 0.19330085 0.17954152 0.1750695  0.22850891]
[0.22445352 0.19417766 0.18076557 0.17394622 0.22665703]


In [85]:
# Persist the fitted forest of every fold as rf<fold>.model.
for fold_idx in range(n_splits):
    model_path = 'rf' + str(fold_idx) + '.model'
    joblib.dump(models[fold_idx], model_path)

# 3.AdaBoost Regression

In [9]:
# Stratified K-fold AdaBoost regressor on the band features only; the
# regression output is rounded to obtain 0/1 labels.
n_band_features = len(band) - 1   # 6 band edges -> 5 band columns at the front of each row
x = output_all[:, 0:n_band_features]
# The label is stored after the `features_num` feature columns. The previous
# hard-coded column 5 is the 'skew' feature now that the statistics are
# appended after the band features, so it can no longer serve as the target.
y = output_all[:, features_num]
# Use the shared fold count instead of a second hard-coded 5.
fold = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=41)
models = []
oof = np.zeros(len(x))   # out-of-fold predictions
for index, (train_idx, val_idx) in enumerate(fold.split(x, y)):
    train_set_x, train_set_y = x[train_idx], y[train_idx]
    val_set_x, val_set_y = x[val_idx], y[val_idx]
    # Seeded so that repeated runs give the same ensemble.
    abr = AdaBoostRegressor(n_estimators=5000, learning_rate=0.01, random_state=41)
    abr.fit(train_set_x, train_set_y)
    # Keep the fitted model, as the other model sections do; the list was
    # previously created but never filled.
    models.append(abr)
    val_pred = np.around(abr.predict(val_set_x))
    oof[val_idx] = val_pred
    print(index, 'val f1', metrics.f1_score(val_set_y, val_pred, average='binary'))

0 val f1 0.24572253143681716
1 val f1 0.2323462414578588
2 val f1 0.0999250562078441
3 val f1 0.0989010989010989
4 val f1 0.09327983951855566


In [10]:
# Out-of-fold F1 of the positive class. f1_score expects (y_true, y_pred); the
# arguments were previously swapped. F1 is symmetric in the two, so the printed
# value is unchanged, but the order now matches the per-fold evaluation.
print('oof f1', metrics.f1_score(y, oof, average='binary'))

oof f1 0.16064590542099194
