## Read file and fix classes

In [10]:
import pandas as pd
from helpers import EstimatorSelectionHelper

win_overlaps = [(800, 400), (800, 0), (400, 200), (400, 0), (200, 100), (200, 0)]

def read_features_file(folder, win_size, overlap_size):
    df_features = pd.read_pickle(f"{folder}\\features_win_size_{win_size}_overlap_size_{overlap_size}.pkl")    
    df_features.loc[df_features['label'] == 0, 'label'] = 0
    df_features.loc[df_features['label'] == 1, 'label'] = 1
    df_features.loc[df_features['label'] == 3, 'label'] = 2
    return df_features


### Check for NaN

In [16]:
df_features = read_features_file('D:\\facul\\features', 400, 200)

df_features.isnull().values.any()

False

## Set models and parameters

In [26]:
from sklearn.svm import LinearSVC, SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, RobustScaler, Normalizer, PowerTransformer, MinMaxScaler
from sklearn.feature_selection import f_classif, SelectKBest, chi2, mutual_info_classif
from sklearn.decomposition import PCA
from helpers import EstimatorSelectionHelper
import numpy as np
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis 
from datetime import datetime
import pickle

models = {
    'LinearDiscriminantAnalysis': LinearDiscriminantAnalysis(),
    'RandomForestClassifier': RandomForestClassifier(),
    'LogisticRegression': Pipeline([
        ('scaling', StandardScaler()),
        ('estimator', LogisticRegression()),

    ]),
    'MLPClassifier': Pipeline([
        ('scaling', StandardScaler()),
        ('estimator', MLPClassifier(solver='lbfgs')),

    ]),
    'LinearSVC': Pipeline([
        ('scaling', StandardScaler()),
        ('estimator', LinearSVC())
    ]),
    'SVC': Pipeline([
        # ('feat', SelectKBest(mutual_info_classif, k=20)),        
        ('scaling', StandardScaler()),
        ('estimator', SVC())
    ])
}

params = {
    'LinearDiscriminantAnalysis': {},
    'RandomForestClassifier': { 'n_estimators': [16, 32, 48, 100] },
    'LogisticRegression': {'estimator__C': [0.01, 0.03, 0.1, 0.3, 1, 3]},
    'MLPClassifier': {
        'estimator__hidden_layer_sizes': [(15,10), (10,5), (5,3),(30,15,10), (20,15,10), (15,10,5)],
        'estimator__alpha': np.logspace(-1, -7, num=7)
    },
    'LinearSVC': [
        {'estimator__C': np.logspace(5, -5, base=2, num=10)},
    ],
    'SVC': [
        {'estimator__C': np.logspace(15, -15, base=2, num=31), 'estimator__gamma': np.logspace(8, -8, base=10, num=17)},
    ]
}

## Experiment 1: Subject-dependent models with sessions randomly mixed between different days 

In [17]:
%%time
%load_ext autoreload
%autoreload 
from helpers import RepeatedStratifiedGroupKFold
from sklearn.svm import LinearSVC, SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, RobustScaler, Normalizer, PowerTransformer, MinMaxScaler
from sklearn.feature_selection import f_classif, SelectKBest, chi2, mutual_info_classif
from sklearn.decomposition import PCA
from helpers import EstimatorSelectionHelper
import numpy as np
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis 
from datetime import datetime
import pickle

cols = ['session', 'trial', 'user', 'label', 'T7', 'T8', 'FT7', 'FT8', 'TP7', 'TP8']


# df_features_cols = df_features.loc[df_features['session'] == 1, df_features.filter(regex=r"({})".format('|'.join(cols)), axis=1).columns]
df_features_cols = df_features.loc[:, df_features.filter(regex=r"({})".format('|'.join(cols)), axis=1).columns]


df_user_scores = pd.DataFrame()

user_helper_list = []

folder = './models/user-dependent-mixed-days'

for user in range(1, 16):

    cv_grp = RepeatedStratifiedGroupKFold(n_splits=5, n_repeats=1)
    outer_cv_grp = RepeatedStratifiedGroupKFold(n_splits=5, n_repeats=1)

    print(outer_cv_grp.get_n_splits())

    helper = EstimatorSelectionHelper(models, params)
    user_helper_list.append(helper)

    X = df_features_cols.loc[df_features_cols['user'].isin([user]), ~df_features_cols.columns.isin(['session', 'trial', 'user', 'label'])]
    y = df_features_cols.loc[df_features_cols['user'].isin([user]), 'label'].astype(int)


    groups = df_features_cols.loc[df_features_cols['user'].isin([user]), 'trial']

    # helper.fit(X, y, scoring='f1', n_jobs=2, cv=cv_grp, outer_cv=outer_cv_grp, groups=df_features_cols['trial'])
    helper.fit(X, y, scoring='accuracy', n_jobs=-1, cv=cv_grp, outer_cv=outer_cv_grp, verbose=10, groups=groups, persist_dir=f"{folder}/user_{user}_{datetime.now().strftime('%Y-%m-%dT%H-%M-%S.pkl')}", randomSearchFor=['MLPClassifier', 'SVC'])

    temp_scores = helper.score_summary(sort_by='mean_score')
    temp_scores.insert(0, 'user', user)
    df_user_scores = df_user_scores.append(temp_scores)

with open(f"{folder}/all_users_{datetime.now().strftime('%Y-%m-%dT%H-%M-%S.pkl')}", "wb") as f:
    pickle.dump(user_helper_list, f)

df_user_scores


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
5
dict_keys(['LinearDiscriminantAnalysis', 'RandomForestClassifier'])
Running GridSearchCV for LinearDiscriminantAnalysis with nested cross validation.
[CV]  ................................................................
Fitting 5 folds for each of 1 candidates, totalling 5 fits
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    6.3s remaining:    9.5s
[Parallel(n_jobs=-1)]: Done   3 out of   5 | elapsed:    6.3s remaining:    4.2s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    6.5s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    6.5s finished
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    6.6s remaining:    0.0s
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[CV

KeyboardInterrupt: 

## Experiment 2: Subject-dependent models with sessions within the same days
#### - 1 trained per day/user/model-type 
#### - Need to calculate the average of the model performance between the 3 days 

In [186]:
%%time
%load_ext autoreload
%autoreload 
from helpers import RepeatedStratifiedGroupKFold
from sklearn.svm import LinearSVC, SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, RobustScaler, Normalizer, PowerTransformer, MinMaxScaler
from sklearn.feature_selection import f_classif, SelectKBest, chi2, mutual_info_classif
from sklearn.decomposition import PCA
from helpers import EstimatorSelectionHelper
import numpy as np
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis 
from datetime import datetime
import pickle

cols = ['session', 'trial', 'user', 'label', 'T7', 'T8', 'FT7', 'FT8', 'TP7', 'TP8']

df_features = read_features_file('D:\\facul\\features', 800, 400)

df_features_cols = df_features.loc[:, df_features.filter(regex=r"({})".format('|'.join(cols)), axis=1).columns]


df_user_scores = pd.DataFrame()

folder = './models/user-dependent-same-days'

for day in range(1, 4):
    user_helper_list = []

    for user in range(1, 16):

        cv_grp = RepeatedStratifiedGroupKFold(n_splits=5, n_repeats=1)
        outer_cv_grp = RepeatedStratifiedGroupKFold(n_splits=5, n_repeats=1)

        print(outer_cv_grp.get_n_splits())

        helper = EstimatorSelectionHelper(models, params)
        user_helper_list.append(helper)

        user_day_index = (df_features_cols['user'].isin([user])) & (df_features_cols['session'] == day)

        X = df_features_cols.loc[user_day_index, ~df_features_cols.columns.isin(['session', 'trial', 'user', 'label'])]
        
        y = df_features_cols.loc[user_day_index, 'label'].astype(int)

        groups = df_features_cols.loc[user_day_index, 'trial']

        # helper.fit(X, y, scoring='f1', n_jobs=2, cv=cv_grp, outer_cv=outer_cv_grp, groups=df_features_cols['trial'])
        helper.fit(X, y, scoring='accuracy', n_jobs=-1, cv=cv_grp, outer_cv=outer_cv_grp, verbose=10, groups=groups, persist_dir=f"{folder}/user_{user}_day_{day}_{datetime.now().strftime('%Y-%m-%dT%H-%M-%S.pkl')}", randomSearchFor=['MLPClassifier', 'SVC'])

        temp_scores = helper.score_summary(sort_by='mean_score')
        temp_scores.insert(0, 'user', user)
        temp_scores.insert(0, 'day', day)
        df_user_scores = df_user_scores.append(temp_scores)

    with open(f"{folder}/all_users_day_{day}_{datetime.now().strftime('%Y-%m-%dT%H-%M-%S.pkl')}", "wb") as f:
        pickle.dump(user_helper_list, f)

df_user_scores


ng:    0.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    0.0s finished
[CV] ................ , score=(train=0.819, test=0.782), total=   0.1s
Object persisted
Running GridSearchCV for RandomForestClassifier with nested cross validation.
[CV]  ................................................................
Fitting 5 folds for each of 4 candidates, totalling 20 fits
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    0.5s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    0.5s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Batch computation too fast (0.0728s.) Setting batch_size=2.
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:    1.1s finished
[Parallel(n_jobs

Unnamed: 0,day,user,estimator,min_score,max_score,mean_score,std_score,train_mean_score,n_estimators,estimator__C,estimator__alpha,estimator__hidden_layer_sizes,estimator__gamma
0,1,1,LinearDiscriminantAnalysis,0.277512,0.511111,0.382965,0.083175,0.512056,,,,,
0,1,1,RandomForestClassifier,0.367698,0.472222,0.413429,0.043020,0.998772,16,,,,
0,1,1,LogisticRegression,0.315789,0.420635,0.366543,0.036591,0.470970,,0.01,,,
0,1,1,MLPClassifier,0.311005,0.460317,0.378056,0.057923,0.660978,,,0.001,"(10, 5)",
0,1,1,LinearSVC,0.287081,0.444444,0.389457,0.053441,0.494454,,32.0,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
0,3,15,RandomForestClassifier,0.698864,0.838346,0.798162,0.050510,1.000000,16,,,,
0,3,15,LogisticRegression,0.590308,0.924812,0.744512,0.131901,0.898900,,0.01,,,
0,3,15,MLPClassifier,0.555024,0.890977,0.731323,0.113036,0.995100,,,0.1,"(20, 15, 10)",
0,3,15,LinearSVC,0.555024,0.909774,0.741159,0.135552,0.892107,,32.0,,,


### The first implementation was generated with accumulated results (it was a bug), but given the time constraints, we did not regenerate the data, but we fixed the bug. This means that all the data is accumulated on the day 3 file.

In [11]:
all_users_data = EstimatorSelectionHelper.load("./models/user-dependent-same-days/all_users_day_3_2020-06-20T18-06-38.pkl")
# data_days = [
all_users_data[:15]

[<helpers.EstimatorSelectionHelper at 0x238b56cbc18>,
 <helpers.EstimatorSelectionHelper at 0x238b965a2e8>,
 <helpers.EstimatorSelectionHelper at 0x238ba790c50>,
 <helpers.EstimatorSelectionHelper at 0x238bb870940>,
 <helpers.EstimatorSelectionHelper at 0x238bc9b3160>,
 <helpers.EstimatorSelectionHelper at 0x238bdc221d0>,
 <helpers.EstimatorSelectionHelper at 0x238bed5b9b0>,
 <helpers.EstimatorSelectionHelper at 0x238bfef6fd0>,
 <helpers.EstimatorSelectionHelper at 0x238bca88d30>,
 <helpers.EstimatorSelectionHelper at 0x238c32d3898>,
 <helpers.EstimatorSelectionHelper at 0x238c4400630>,
 <helpers.EstimatorSelectionHelper at 0x238c54eda58>,
 <helpers.EstimatorSelectionHelper at 0x238c6635278>,
 <helpers.EstimatorSelectionHelper at 0x238c7785a58>,
 <helpers.EstimatorSelectionHelper at 0x238c88863c8>]

### Get models with best scores for each user

In [129]:
data_days = [ all_users_data[:15], all_users_data[15:30], all_users_data[30:]]

df_results = pd.DataFrame()
for data_day in data_days:
    for user, user_data in enumerate(data_day):
        score_summary = data_day[user].score_summary()
        best_score = score_summary.loc[score_summary['mean_score'] == score_summary['mean_score'].max()]

        df_results = df_results.append({'user': user+1, 'estimator': best_score.loc[0, 'estimator'], 'mean_score': best_score.loc[0, 'mean_score'] }, ignore_index=True)

df_results.groupby(['user', 'estimator'])[['mean_score']].max()


Unnamed: 0_level_0,Unnamed: 1_level_0,mean_score
user,estimator,Unnamed: 2_level_1
1.0,RandomForestClassifier,0.484388
2.0,LinearDiscriminantAnalysis,0.713086
2.0,LogisticRegression,0.629984
2.0,RandomForestClassifier,0.651907
3.0,LinearDiscriminantAnalysis,0.487387
3.0,RandomForestClassifier,0.509086
4.0,LinearSVC,0.614193
4.0,RandomForestClassifier,0.73875
5.0,LogisticRegression,0.575294
5.0,RandomForestClassifier,0.585441


### Get the mean score for the best models for all users

In [133]:
df_results.groupby(['user'])[['mean_score']].mean().mean()

mean_score    0.58898
dtype: float64

### Get a table with all the models (best ones) for each user

In [151]:
data_days = [all_users_data[:15], all_users_data[15:30], all_users_data[30:]]
data_days[0][0].score_summary()[['estimator', 'mean_score']]

df_users_scores = pd.DataFrame(columns=['user'] + list(models.keys()))
for day, data_day in enumerate(data_days):
    for user, user_data in enumerate(data_day):
        score_summary = data_day[user].score_summary()[['estimator', 'mean_score']].set_index('estimator').T
        score_summary.insert(0, column='user', value=int(user) + 1)
        score_summary.insert(1, column='day', value=int(day))
        # score_summary.drop(columns=['estimator'], inplace=True, axis=1)

        df_users_scores = df_users_scores.append(score_summary)
        # break

# df_users_scores.rename_axis(index='None', axis=1, inplace=True)
# df_users_scores.index.name = None
# df_users_scores.reset_index(drop=False, inplace=True)

df_users_scores.columns.name = None
df_users_scores.reset_index(drop=True, inplace=True)
# df_user_scores.rename({})
df_users_scores = df_users_scores.drop(columns=['day']).groupby(['user']).mean().round(4)*100
df_for_excel = df_users_scores.reset_index().drop(columns=['LinearSVC','SVC'])

df_for_excel =df_for_excel.rename(columns={
    'user':                         'Usuário',
    'LinearDiscriminantAnalysis':   'LDA',
    'RandomForestClassifier':       'Random Forest',
    'LogisticRegression':           'Regressão Logística',
    'MLPClassifier':                'Rede Neural (MLP)',
})

df_for_excel['SVM'] = df_users_scores[['LinearSVC','SVC']].max(axis=1)

df_for_excel.to_excel('user-dependent-same-days.xlsx')

In [143]:
df_users_scores[['LinearSVC','SVC']].max(axis=1)

0     0.399592
1     0.694628
2     0.393085
3     0.614193
4     0.448867
5     0.454028
6     0.663884
7     0.499767
8     0.576516
9     0.479004
10    0.438735
11    0.486102
12    0.507980
13    0.524749
14    0.730137
15    0.448810
16    0.604981
17    0.460137
18    0.636266
19    0.571678
20    0.776448
21    0.509074
22    0.610411
23    0.426553
24    0.498326
25    0.506523
26    0.419296
27    0.442421
28    0.469671
29    0.814016
30    0.401138
31    0.613689
32    0.493714
33    0.684257
34    0.566229
35    0.816302
36    0.698844
37    0.743879
38    0.573736
39    0.608932
40    0.548680
41    0.434427
42    0.670619
43    0.624228
44    0.741159
dtype: float64

In [None]:
data_days = [ all_users_data[:15], all_users_data[15:30], all_users_data[30:]]

df_all_results = pd.DataFrame()
for data_day in data_days:
    for user, user_data in enumerate(data_day):
        score_summary = data_day[user].score_summary()
        best_score = score_summary.loc[score_summary['mean_score'] == score_summary['mean_score'].max()]

        df_all_results = df_results.append({'user': user+1, 'estimator': best_score.loc[0, 'estimator'], 'mean_score': best_score.loc[0, 'mean_score'] }, ignore_index=True)

df_results.groupby(['user', 'estimator'])[['mean_score']].max()


## Experiment 3: Subject dependent models with varying windows and overlaps
#### - To limit execution time, only 3 users on the first day/session are used for comparison

In [8]:
%%time
%load_ext autoreload
%autoreload 
from helpers import StratifiedGroupKFold
from sklearn.svm import LinearSVC, SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, RobustScaler, Normalizer, PowerTransformer, MinMaxScaler
from sklearn.feature_selection import f_classif, SelectKBest, chi2, mutual_info_classif
from sklearn.decomposition import PCA
from helpers import EstimatorSelectionHelper
import numpy as np
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis 
from datetime import datetime
import pickle
from sklearn.model_selection import GroupKFold, StratifiedShuffleSplit, StratifiedKFold

cols = ['session', 'trial', 'user', 'label', 'T7', 'T8', 'FT7', 'FT8', 'TP7', 'TP8']




df_user_scores = pd.DataFrame()



folder = './models/user-dependent-varying-window-and-overlap'
win_overlaps = [(800, 400), (800, 0), (400, 200), (400, 0), (200, 100), (200, 0)]
# win_overlaps = [(400, 200), (400, 0), (200, 100), (200, 0)]


for window_size, overlap_size in win_overlaps:
    user_helper_list = []

    df_features = read_features_file('D:\\facul\\features', window_size, overlap_size)
    # Use only first day/session
    df_features_cols = df_features.loc[df_features['session'] == 1, df_features.filter(regex=r"({})".format('|'.join(cols)), axis=1).columns]

    # Only first 3 users
    for user in range(1, 4):

        # cv_grp = RepeatedStratifiedGroupKFold(n_splits=5, n_repeats=1)
        # outer_cv_grp = RepeatedStratifiedGroupKFold(n_splits=5, n_repeats=1)
        cv_grp = StratifiedKFold(n_splits=5, shuffle=True)
        outer_cv_grp = StratifiedGroupKFold(n_splits=5)   


        print(outer_cv_grp.get_n_splits())

        helper = EstimatorSelectionHelper(models, params)
        user_helper_list.append(helper)

        user_day_index = df_features_cols['user'].isin([user])

        X = df_features_cols.loc[user_day_index, ~df_features_cols.columns.isin(['session', 'trial', 'user', 'label'])]
        y = df_features_cols.loc[user_day_index, 'label'].astype(int)

        groups = df_features_cols.loc[user_day_index, 'trial']

        # helper.fit(X, y, scoring='f1', n_jobs=2, cv=cv_grp, outer_cv=outer_cv_grp, groups=df_features_cols['trial'])
        helper.fit(X, y, scoring='accuracy', n_jobs=-1, cv=cv_grp, outer_cv=outer_cv_grp, verbose=10, groups=groups, persist_dir=f"{folder}/user_{user}_win_{window_size}_overlap_{overlap_size}_{datetime.now().strftime('%Y-%m-%dT%H-%M-%S.pkl')}", randomSearchFor=['MLPClassifier', 'SVC'])

        temp_scores = helper.score_summary(sort_by='mean_score')
        temp_scores.insert(0, 'user', user)
        df_user_scores = df_user_scores.append(temp_scores)

    with open(f"{folder}/all_users_win_{window_size}_overlap_{overlap_size}_{datetime.now().strftime('%Y-%m-%dT%H-%M-%S.pkl')}", "wb") as f:
        pickle.dump(user_helper_list, f)

df_user_scores


out of   5 | elapsed:   35.7s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   35.7s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Batch computation too fast (0.0199s.) Setting batch_size=2.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   3 out of   5 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Batch computation too fast (0.0209s.) Setting batch_size=2.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    0.0s remaining:    0.0s
[Parall

Unnamed: 0,user,estimator,min_score,max_score,mean_score,std_score,train_mean_score,n_estimators,estimator__C,estimator__alpha,estimator__hidden_layer_sizes
0,1,LinearDiscriminantAnalysis,0.319588,0.436508,0.384530,0.039319,0.513476,,,,
0,1,RandomForestClassifier,0.364261,0.472222,0.419792,0.044398,1.000000,16,,,
0,1,LogisticRegression,0.316151,0.428571,0.376207,0.039490,0.518120,,0.01,,
0,1,MLPClassifier,0.267943,0.444444,0.359600,0.066116,0.691797,,,0.1,"(20, 15, 10)"
0,1,LinearSVC,0.216495,0.448413,0.351649,0.087931,0.528748,,32.0,,
...,...,...,...,...,...,...,...,...,...,...,...
0,3,LinearDiscriminantAnalysis,0.135593,0.565141,0.403401,0.147465,0.561222,,,,
0,3,RandomForestClassifier,0.218884,0.538732,0.343459,0.109975,1.000000,16,,,
0,3,LogisticRegression,0.057203,0.538732,0.368412,0.163541,0.576678,,0.01,,
0,3,MLPClassifier,0.246753,0.445423,0.322530,0.068454,0.762025,,,1e-07,"(15, 10, 5)"


In [36]:
files_dir = './models/user-dependent-varying-window-and-overlap'

windows_configs = {
    'win_200_overlap_0': EstimatorSelectionHelper.load(f"{files_dir}/all_users_win_200_overlap_0_2020-06-21T00-15-44.pkl"),
    'win_200_overlap_100': EstimatorSelectionHelper.load(f"{files_dir}/all_users_win_200_overlap_100_2020-06-21T00-10-13.pkl"),
    'win_400_overlap_0': EstimatorSelectionHelper.load(f"{files_dir}/all_users_win_400_overlap_0_2020-06-20T23-57-51.pkl"),
    'win_400_overlap_200': EstimatorSelectionHelper.load(f"{files_dir}/all_users_win_400_overlap_200_2020-06-20T23-54-51.pkl"),
    'win_800_overlap_0': EstimatorSelectionHelper.load(f"{files_dir}/all_users_win_800_overlap_0_2020-06-20T23-49-31.pkl"),
    'win_800_overlap_400': EstimatorSelectionHelper.load(f"{files_dir}/all_users_win_800_overlap_400_2020-06-20T23-47-36.pkl"),
}

df_user_scores = pd.DataFrame()
for conf in windows_configs:
    win_size, overlap_size = conf.split('_')[1], conf.split('_')[3]
    df_window_scores = df_window_scores.append({'window': win_size, 'overlap': overlap_size}, ignore_index=True)

    for user_results in windows_configs[conf]:

        temp_scores = user_results.score_summary(sort_by='mean_score')
        temp_scores.insert(0, 'window', win_size)
        temp_scores.insert(1, 'overlap', overlap_size)

        df_user_scores = df_user_scores.append(temp_scores)


    # df_window_scores[conf].score_summary().groupby
df_user_scores.groupby(['window', 'overlap']).agg({'mean_score': 'mean'})

Unnamed: 0_level_0,Unnamed: 1_level_0,mean_score
window,overlap,Unnamed: 2_level_1
200,0,0.466692
200,100,0.472884
400,0,0.476674
400,200,0.475305
800,0,0.451227
800,400,0.479198


## Experiment 4: Subject-independent models with sessions within the same days

In [9]:
%%time
%load_ext autoreload
%autoreload 
from helpers import RepeatedStratifiedGroupKFold
from sklearn.svm import LinearSVC, SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, RobustScaler, Normalizer, PowerTransformer, MinMaxScaler
from sklearn.feature_selection import f_classif, SelectKBest, chi2, mutual_info_classif
from sklearn.decomposition import PCA
from helpers import EstimatorSelectionHelper
import numpy as np
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis 
from datetime import datetime
import pickle

cols = ['session', 'trial', 'user', 'label', 'T7', 'T8', 'FT7', 'FT8', 'TP7', 'TP8']

df_features = read_features_file('D:\\facul\\features', 800, 400)




df_scores = pd.DataFrame()
helper_list = []
folder = './models/user-independent-same-days'

for day in range(1, 4):
    df_features_cols = df_features.loc[df_features['session'] == day, df_features.filter(regex=r"({})".format('|'.join(cols)), axis=1).columns]
    cv_grp = RepeatedStratifiedGroupKFold(n_splits=5, n_repeats=1)
    outer_cv_grp = RepeatedStratifiedGroupKFold(n_splits=5, n_repeats=1)

    helper = EstimatorSelectionHelper(models, params)
    helper_list.append(helper)


    X = df_features_cols.loc[:, ~df_features_cols.columns.isin(['session', 'trial', 'user', 'label'])]
    
    y = df_features_cols.loc[:, 'label'].astype(int)

    groups = df_features_cols.loc[:, 'trial']

    # helper.fit(X, y, scoring='f1', n_jobs=2, cv=cv_grp, outer_cv=outer_cv_grp, groups=df_features_cols['trial'])
    helper.fit(X, y, scoring='accuracy', n_jobs=-1, cv=cv_grp, outer_cv=outer_cv_grp, verbose=10, groups=groups, persist_dir=f"{folder}/day_{day}_{datetime.now().strftime('%Y-%m-%dT%H-%M-%S.pkl')}", randomSearchFor=['MLPClassifier', 'SVC'])

    temp_scores = helper.score_summary(sort_by='mean_score')
    temp_scores.insert(0, 'day', day)
    df_scores = df_scores.append(temp_scores)

with open(f"{folder}/all_days_{datetime.now().strftime('%Y-%m-%dT%H-%M-%S.pkl')}", "wb") as f:
    pickle.dump(helper_list, f)

df_user_scores


f  50 | elapsed:  4.7min finished
[CV] ................ , score=(train=0.482, test=0.422), total= 4.9min
[CV]  ................................................................
Fitting 5 folds for each of 10 candidates, totalling 50 fits
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  5.0min remaining:    0.0s
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:  4.5min
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:  6.1min
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:  7.3min
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:  7.9min
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  9.3min
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed: 10.2min
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed: 10.9min finished
[CV] ................ , score=(train=0.446, test=0.367), total=11.6min
[CV]  ................................................................
Fitting 5 folds for ea

NameError: name 'df_user_scores' is not defined

In [10]:
df_scores

Unnamed: 0,day,estimator,min_score,max_score,mean_score,std_score,train_mean_score,estimator__C,estimator__hidden_layer_sizes,estimator__alpha,estimator__gamma
0,1,LogisticRegression,0.367927,0.483749,0.403516,0.042812,0.414465,0.01,,,
0,1,MLPClassifier,0.362074,0.46121,0.404945,0.037572,0.429209,,"(15, 10)",0.001,
0,1,LinearSVC,0.370074,0.476868,0.423402,0.042487,0.443583,32.0,,,
0,1,SVC,0.442153,0.525678,0.472046,0.029666,0.537359,0.0001220703125,,,0.1
0,2,LogisticRegression,0.380952,0.514603,0.434235,0.050916,0.467147,0.01,,,
0,2,MLPClassifier,0.43483,0.550554,0.493385,0.037853,0.595323,,"(20, 15, 10)",1e-05,
0,2,LinearSVC,0.380408,0.49873,0.446042,0.040429,0.482866,32.0,,,
0,2,SVC,0.367347,0.503526,0.450435,0.050632,0.592567,6.103515625e-05,,,10000000.0
0,3,LogisticRegression,0.325552,0.476136,0.38726,0.056558,0.397168,0.01,,,
0,3,MLPClassifier,0.340904,0.487419,0.399655,0.062583,0.447798,,"(30, 15, 10)",0.1,


In [33]:
df_user_scores[(df_user_scores['window'] == '800') & (df_user_scores['overlap'] == '400')].mean()

window              5.338672e+43
overlap             2.669336e+43
min_score           3.365907e-01
max_score           5.947198e-01
mean_score          4.791984e-01
std_score           9.289172e-02
train_mean_score    7.498655e-01
dtype: float64

In [23]:
windows_configs['win_800_overlap_0'].score_summary()

AttributeError: 'list' object has no attribute 'score_summary'

In [47]:
cv_grp = RepeatedStratifiedGroupKFold(n_splits=5, n_repeats=1)
cv_grp2 = RepeatedStratifiedGroupKFold(n_splits=5, n_repeats=1)

df_features = read_features_file('D:\\facul\\features', 400, 200)
df_features_cols = df_features.loc[df_features['session'] == 1, df_features.filter(regex=r"({})".format('|'.join(cols)), axis=1).columns]
X = df_features_cols.loc[df_features_cols['user'].isin([1]), ~df_features_cols.columns.isin(['session', 'trial', 'user', 'label'])]

X.shape
y = df_features_cols.loc[df_features_cols['user'].isin([1]), 'label'].astype(int)

groups = df_features_cols.loc[df_features_cols['user'].isin([1]), 'trial']
for train_idx, test_idx in cv_grp.split(X, y=y, groups=groups):
    df_features_cols2 = df_features_cols[df_features_cols.index.isin(train_idx)]
    X2 = df_features_cols2.loc[:, ~df_features_cols2.columns.isin(['session', 'trial', 'user', 'label'])]
    y2 = df_features_cols2.loc[:, 'label'].astype(int)
    print(y2.value_counts())
    raise
    for train_idx2, test_idx2 in cv_grp2.split(X2, y=y2, groups=groups):
    # for train_idx, test_idx in stratified_group_k_fold(train_x, y=train_y, groups=train_groups, k=4, seed=0):
        print("TRAIN:")
        print(df_features_cols2[df_features_cols2.index.isin(train_idx2)]['label'].value_counts(normalize=True))
        print("TEST:")
        print(df_features_cols2[df_features_cols2.index.isin(test_idx2)]['label'].value_counts(normalize=True))

        print("-----------------")

1    820
0    697
2    437
Name: label, dtype: int64


RuntimeError: No active exception to reraise

In [198]:
# df_user_scores[df_user_scores['user'] == 6]
df_user_scores.groupby('user')['mean_score'].max().mean()
df_user_scores

Unnamed: 0,day,user,estimator,min_score,max_score,mean_score,std_score,train_mean_score,n_estimators,estimator__C,estimator__alpha,estimator__hidden_layer_sizes,estimator__gamma
0,1,1,LinearDiscriminantAnalysis,0.277512,0.511111,0.382965,0.083175,0.512056,,,,,
0,1,1,RandomForestClassifier,0.367698,0.472222,0.413429,0.043020,0.998772,16,,,,
0,1,1,LogisticRegression,0.315789,0.420635,0.366543,0.036591,0.470970,,0.01,,,
0,1,1,MLPClassifier,0.311005,0.460317,0.378056,0.057923,0.660978,,,0.001,"(10, 5)",
0,1,1,LinearSVC,0.287081,0.444444,0.389457,0.053441,0.494454,,32.0,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
0,3,15,RandomForestClassifier,0.698864,0.838346,0.798162,0.050510,1.000000,16,,,,
0,3,15,LogisticRegression,0.590308,0.924812,0.744512,0.131901,0.898900,,0.01,,,
0,3,15,MLPClassifier,0.555024,0.890977,0.731323,0.113036,0.995100,,,0.1,"(20, 15, 10)",
0,3,15,LinearSVC,0.555024,0.909774,0.741159,0.135552,0.892107,,32.0,,,


In [21]:
df_features.columns.values[4:][helper1.grid_searches['SVC'].best_estimator_.named_steps['feat'].get_support()]

array(['FP1_de_theta', 'FP1_de_gamma', 'FP2_de_theta', 'FP2_de_alpha',
       'FP2_de_gamma', 'AF4_de_theta', 'F7_de_beta', 'F7_de_gamma',
       'F4_de_gamma', 'F6_de_gamma', 'F8_de_gamma', 'FT7_de_gamma',
       'FC5_de_beta', 'FC5_de_gamma', 'FC4_de_gamma', 'FC6_de_beta',
       'FC6_de_gamma', 'FT8_de_gamma', 'T7_de_delta', 'T7_de_beta',
       'T7_de_gamma', 'C5_de_gamma', 'C3_de_gamma', 'C2_de_gamma',
       'C4_de_beta', 'C4_de_gamma', 'C6_de_beta', 'C6_de_gamma',
       'T8_de_gamma', 'TP7_de_beta', 'TP7_de_gamma', 'CP5_de_beta',
       'CP5_de_gamma', 'CP3_de_gamma', 'CP1_de_gamma', 'CP4_de_beta',
       'CP4_de_gamma', 'TP8_de_gamma', 'P7_de_beta', 'P7_de_gamma',
       'P5_de_gamma', 'P3_de_gamma', 'PZ_de_gamma', 'PO7_de_gamma',
       'PO5_de_gamma', 'PO3_de_gamma', 'PO4_de_gamma', 'CB1_de_gamma',
       'O1_de_beta', 'O1_de_gamma'], dtype=object)

## Save models to Production

### Load all model's for an experiment

In [6]:
temp = EstimatorSelectionHelper.load('./models/user-dependent-same-days/all_users_day_2_2020-06-20T17-49-08.pkl')

### Get model with highest score

In [9]:
df = temp[14].score_summary()
df
df[df['mean_score'] == df['mean_score'].max()]

Unnamed: 0,estimator,min_score,max_score,mean_score,std_score,train_mean_score,n_estimators,estimator__C,estimator__alpha,estimator__hidden_layer_sizes,estimator__gamma
0,LinearSVC,0.460481,0.853333,0.730137,0.138959,0.883166,,32.0,,,


In [10]:
temp[14].grid_searches['LinearSVC']

GridSearchCV(cv=<helpers.RepeatedStratifiedGroupKFold object at 0x00000240F82B80B8>,
             error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('scaling',
                                        StandardScaler(copy=True,
                                                       with_mean=True,
                                                       with_std=True)),
                                       ('estimator',
                                        LinearSVC(C=1.0, class_weight=None,
                                                  dual=True, fit_intercept=True,
                                                  intercept_scaling=1,
                                                  loss='squared_hinge',
                                                  max_iter=1000,
                                                  multi_class='ovr'...
                                                  random_state=None, tol=0.0001,
          

In [11]:
# Check if model is fitted
# from https://stackoverflow.com/a/48046685

import inspect

def is_fitted(model):
        """Checks if model object has any attributes ending with an underscore"""
        return 0 < len( [k for k,v in inspect.getmembers(model) if k.endswith('_') and not k.startswith('__')] )

is_fitted(temp[3].grid_searches['LinearSVC'])

True

In [12]:
any_features = np.random.rand(5,5).reshape(1, -1)
temp[3].grid_searches['LinearSVC'].predict(any_features)

ValueError: operands could not be broadcast together with shapes (1,25) (30,) (1,25) 

### Save model for production usage

In [13]:
import joblib

joblib.dump(temp[14].grid_searches['LinearSVC'], '../../openbci/best_model_with_scaling.sav')

['../../openbci/best_model_with_scaling.sav']