In [1]:
import numpy as np
import pandas as pd

In [2]:
clean_data = pd.read_csv("clean_data.csv")
clean_data.shape

(1584, 134)

In [3]:
clean_data['Valence'].value_counts(normalize=True)

Positive    0.5
Negative    0.5
Name: Valence, dtype: float64

In [4]:
# Distribution of label

print('Training Positive:', len(clean_data[(clean_data['Type'] == 'Training') & (clean_data['Valence'] == 'Positive')]))
print('Training Negative:', len(clean_data[(clean_data['Type'] == 'Training') & (clean_data['Valence'] == 'Negative')]))
print('Testing Positive:', len(clean_data[(clean_data['Type'] == 'Testing') & (clean_data['Valence'] == 'Positive')]))
print('Testing Negative:', len(clean_data[(clean_data['Type'] == 'Testing') & (clean_data['Valence'] == 'Negative')]))

Training Positive: 168
Training Negative: 168
Testing Positive: 624
Testing Negative: 624


In [5]:
train_data = clean_data[clean_data['Type'] == 'Training']
test_data = clean_data[clean_data['Type'] == 'Testing']

In [7]:
train_data_X = train_data.loc[:, ~train_data.columns.isin(['Type', 'Valence', 'Time', 'Time.1'])]
train_data_y = train_data.loc[:, 'Valence']

test_data_X = test_data.loc[:, ~test_data.columns.isin(['Type', 'Valence', 'Time', 'Time.1'])]
test_data_y = test_data.loc[:, 'Valence']

In [65]:
from sklearn.svm import SVC
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler, LabelEncoder, Normalizer, MaxAbsScaler
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.svm import LinearSVC
from scipy.stats import reciprocal, uniform
from sklearn.metrics import confusion_matrix
from sklearn.metrics import mean_squared_error
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif
from sklearn.decomposition import PCA
from sklearn.inspection import permutation_importance

In [9]:
train_data_X.shape

(336, 130)

In [10]:
# Choosing Model

pipe = Pipeline([
            ('scaler', StandardScaler()),
            ('classifier', RandomForestClassifier(random_state=42))
        ])
hp = {
            'scaler': [StandardScaler(), RobustScaler(), MinMaxScaler(), Normalizer(), MaxAbsScaler()],
            'classifier': [SVC(), DecisionTreeClassifier(), GaussianNB(), KNeighborsClassifier(), RandomForestClassifier(random_state=42)]
        }
grid = RandomizedSearchCV(pipe, hp, n_iter=25, scoring='accuracy', n_jobs=1, cv=10, random_state=42)
final = grid.fit(train_data_X, train_data_y)
final.best_estimator_

Pipeline(steps=[('scaler', StandardScaler()),
                ('classifier', RandomForestClassifier(random_state=42))])

In [11]:
final.best_score_
from pandas import DataFrame   
result_df = DataFrame.from_dict(final.cv_results_, orient='columns')
result_df[["params", "mean_test_score"]]

Unnamed: 0,params,mean_test_score
0,"{'scaler': StandardScaler(), 'classifier': SVC()}",0.74385
1,"{'scaler': RobustScaler(), 'classifier': SVC()}",0.723351
2,"{'scaler': MinMaxScaler(), 'classifier': SVC()}",0.785829
3,"{'scaler': Normalizer(), 'classifier': SVC()}",0.555704
4,"{'scaler': MaxAbsScaler(), 'classifier': SVC()}",0.798128
5,"{'scaler': StandardScaler(), 'classifier': Dec...",0.797504
6,"{'scaler': RobustScaler(), 'classifier': Decis...",0.761854
7,"{'scaler': MinMaxScaler(), 'classifier': Decis...",0.803565
8,"{'scaler': Normalizer(), 'classifier': Decisio...",0.779768
9,"{'scaler': MaxAbsScaler(), 'classifier': Decis...",0.809358


In [12]:
final.best_estimator_.score(test_data_X, test_data_y)

0.5865384615384616

In [14]:
scaler = StandardScaler()
train_data_X_scaled = scaler.fit_transform(train_data_X.astype(np.float32))
test_data_X_scaled = scaler.transform(test_data_X.astype(np.float32))

In [13]:
#Hyperparameter Tuning for Random Forest Classifier

# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 100, stop = 2000, num = 20)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

print(random_grid)

{'n_estimators': [100, 200, 300, 400, 500, 600, 700, 800, 900, 1000, 1100, 1200, 1300, 1400, 1500, 1600, 1700, 1800, 1900, 2000], 'max_features': ['auto', 'sqrt'], 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4], 'bootstrap': [True, False]}


In [15]:
# First create the base model to tune
rf = RandomForestClassifier()

# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 10, verbose=2, random_state=42, n_jobs = -1)

# Fit the random search model
rf_random.fit(train_data_X_scaled, train_data_y)

Fitting 10 folds for each of 100 candidates, totalling 1000 fits


RandomizedSearchCV(cv=10, estimator=RandomForestClassifier(), n_iter=100,
                   n_jobs=-1,
                   param_distributions={'bootstrap': [True, False],
                                        'max_depth': [10, 20, 30, 40, 50, 60,
                                                      70, 80, 90, 100, 110,
                                                      None],
                                        'max_features': ['auto', 'sqrt'],
                                        'min_samples_leaf': [1, 2, 4],
                                        'min_samples_split': [2, 5, 10],
                                        'n_estimators': [100, 200, 300, 400,
                                                         500, 600, 700, 800,
                                                         900, 1000, 1100, 1200,
                                                         1300, 1400, 1500, 1600,
                                                         1700, 1800, 1900,
     

In [16]:
result_df_rf = DataFrame.from_dict(rf_random.cv_results_, orient='columns')
result_df_rf[["params", "mean_test_score"]]

Unnamed: 0,params,mean_test_score
0,"{'n_estimators': 600, 'min_samples_split': 5, ...",0.857308
1,"{'n_estimators': 1100, 'min_samples_split': 2,...",0.857308
2,"{'n_estimators': 300, 'min_samples_split': 10,...",0.866043
3,"{'n_estimators': 500, 'min_samples_split': 5, ...",0.848307
4,"{'n_estimators': 1900, 'min_samples_split': 2,...",0.854278
...,...,...
95,"{'n_estimators': 2000, 'min_samples_split': 5,...",0.863191
96,"{'n_estimators': 1900, 'min_samples_split': 10...",0.857308
97,"{'n_estimators': 100, 'min_samples_split': 10,...",0.854189
98,"{'n_estimators': 2000, 'min_samples_split': 10...",0.863191


In [17]:
print(rf_random.best_estimator_)

RandomForestClassifier(bootstrap=False, max_depth=80, n_estimators=1800)


In [18]:
print(rf_random.best_estimator_.score(test_data_X_scaled, test_data_y))

0.6201923076923077
[CV] END bootstrap=False, max_depth=70, max_features=sqrt, min_samples_leaf=4, min_samples_split=5, n_estimators=600; total time=   1.1s
[CV] END bootstrap=False, max_depth=10, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=1100; total time=   2.2s
[CV] END bootstrap=True, max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=10, n_estimators=300; total time=   0.5s
[CV] END bootstrap=True, max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=10, n_estimators=300; total time=   0.4s
[CV] END bootstrap=True, max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=10, n_estimators=300; total time=   0.5s
[CV] END bootstrap=True, max_depth=None, max_features=auto, min_samples_leaf=1, min_samples_split=5, n_estimators=500; total time=   0.8s
[CV] END bootstrap=True, max_depth=10, max_features=sqrt, min_samples_leaf=4, min_samples_split=2, n_estimators=1900; total time=   2.8s
[CV] END bootstrap=F

[CV] END bootstrap=False, max_depth=70, max_features=sqrt, min_samples_leaf=4, min_samples_split=5, n_estimators=600; total time=   1.1s
[CV] END bootstrap=False, max_depth=10, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=1100; total time=   2.2s
[CV] END bootstrap=True, max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=10, n_estimators=300; total time=   0.5s
[CV] END bootstrap=True, max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=10, n_estimators=300; total time=   0.5s
[CV] END bootstrap=True, max_depth=None, max_features=auto, min_samples_leaf=1, min_samples_split=5, n_estimators=500; total time=   0.8s
[CV] END bootstrap=True, max_depth=None, max_features=auto, min_samples_leaf=1, min_samples_split=5, n_estimators=500; total time=   0.8s
[CV] END bootstrap=True, max_depth=10, max_features=sqrt, min_samples_leaf=4, min_samples_split=2, n_estimators=1900; total time=   2.7s
[CV] END bootstrap=False, max_depth=70

[CV] END bootstrap=False, max_depth=70, max_features=sqrt, min_samples_leaf=4, min_samples_split=5, n_estimators=600; total time=   1.1s
[CV] END bootstrap=False, max_depth=10, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=1100; total time=   2.2s
[CV] END bootstrap=False, max_depth=10, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=1100; total time=   2.2s
[CV] END bootstrap=True, max_depth=10, max_features=sqrt, min_samples_leaf=4, min_samples_split=2, n_estimators=1900; total time=   2.7s
[CV] END bootstrap=True, max_depth=10, max_features=sqrt, min_samples_leaf=4, min_samples_split=2, n_estimators=1900; total time=   3.3s
[CV] END bootstrap=False, max_depth=70, max_features=sqrt, min_samples_leaf=1, min_samples_split=10, n_estimators=1300; total time=   2.6s
[CV] END bootstrap=True, max_depth=30, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=1700; total time=   2.5s
[CV] END bootstrap=False, max_depth=1

In [19]:
# Based accuracy for all fatures
rfc = RandomForestClassifier(bootstrap=False, max_depth=80, n_estimators=1800)

np.mean(cross_val_score(rfc,train_data_X_scaled, train_data_y,cv=10,scoring="accuracy"))

0.8662210338680927

In [20]:
# Feature selection with classif

pp = Pipeline([('selector', SelectKBest(f_classif, k = 10)),
                       ('classifier', rfc)])
fc_random = {
    'selector__k': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 120, 130],
}

rf_random_pp = RandomizedSearchCV(estimator = pp, param_distributions = fc_random, n_iter = 11, cv = 10, verbose=2, random_state=42, n_jobs = -1)

# Fit the random search model
rf_random_pp.fit(train_data_X_scaled, train_data_y)

Fitting 10 folds for each of 11 candidates, totalling 110 fits


RandomizedSearchCV(cv=10,
                   estimator=Pipeline(steps=[('selector', SelectKBest()),
                                             ('classifier',
                                              RandomForestClassifier(bootstrap=False,
                                                                     max_depth=80,
                                                                     n_estimators=1800))]),
                   n_iter=11, n_jobs=-1,
                   param_distributions={'selector__k': [10, 20, 30, 40, 50, 60,
                                                        70, 80, 90, 100, 120,
                                                        130]},
                   random_state=42, verbose=2)

In [21]:
result_df_rf_pp = DataFrame.from_dict(rf_random_pp.cv_results_, orient='columns')
result_df_rf_pp[["params", "mean_test_score"]]

Unnamed: 0,params,mean_test_score
0,{'selector__k': 120},0.857219
1,{'selector__k': 100},0.857308
2,{'selector__k': 10},0.845276
3,{'selector__k': 90},0.848485
4,{'selector__k': 60},0.851426
5,{'selector__k': 30},0.878075
6,{'selector__k': 20},0.854367
7,{'selector__k': 130},0.863191
8,{'selector__k': 50},0.857308
9,{'selector__k': 80},0.845455


[CV] END ....................................selector__k=120; total time=   3.3s
[CV] END ....................................selector__k=100; total time=   3.2s
[CV] END ....................................selector__k=100; total time=   3.3s
[CV] END .....................................selector__k=90; total time=   3.0s
[CV] END .....................................selector__k=60; total time=   2.4s
[CV] END .....................................selector__k=60; total time=   2.4s
[CV] END .....................................selector__k=30; total time=   1.8s
[CV] END .....................................selector__k=20; total time=   1.6s
[CV] END ....................................selector__k=130; total time=   3.5s
[CV] END .....................................selector__k=50; total time=   2.3s
[CV] END .....................................selector__k=50; total time=   2.3s
[CV] END .....................................selector__k=80; total time=   2.7s
[CV] END ...................

In [24]:
# feature selection

print(rf_random_pp.best_estimator_.score(test_data_X_scaled, test_data_y))

#Yeah lower a bit

0.5817307692307693


In [30]:
# Feature selection first before training

select_class = SelectKBest(k=30, score_func=f_classif)
select_class.fit(train_data_X, train_data_y)
train_data_X_30 = select_class.transform(train_data_X)

print("Num Features before:", train_data_X.shape[1])
print("Num Features after:", train_data_X_30.shape[1])

Num Features before: 130
Num Features after: 30


In [33]:
train_data_X_30 = train_data_X.iloc[:,select_class.get_support()]
train_data_X_30

Unnamed: 0,EOG,EOG.1,ACC,EDA,EDA.1,EDA.2,EMG,EMG.1,EMG.2,EEGth,...,EEGConn.23,EEGConn.25,EEGConn.27,EEGConn.30,EEGConn.36,EEGConn.40,EEGConn.41,EEGConn.42,EEGConn.49,EEGConn.51
0,0.03501,0.18852,0.08911,0.0,0.00030,0.01733,3.98510,4.16329,3.40618,7.04455,...,0.01336,0.69433,0.06634,0.68213,0.00892,0.16736,0.11560,0.05529,0.10725,0.32123
1,0.03269,0.18852,0.07023,0.0,0.00003,0.00579,3.69995,3.90775,3.15758,7.82496,...,0.06691,0.59508,0.02569,0.74953,0.36614,0.42193,0.39327,0.03909,0.02751,0.35654
2,0.03522,0.16972,0.03413,0.0,0.00001,0.00305,3.11420,3.74782,3.16962,7.98087,...,0.35400,0.86812,0.21092,0.68888,0.31714,0.39651,0.44351,0.01514,0.02362,0.25614
3,0.03522,0.16972,0.02725,0.0,0.00004,0.00617,3.11963,3.70944,3.17337,8.28311,...,0.38487,0.89461,0.21361,0.69747,0.35363,0.36143,0.37128,0.03072,0.03808,0.16711
4,0.03608,0.16972,0.02725,0.0,0.00003,0.00531,3.33145,3.79952,3.22674,8.63026,...,0.47137,0.92771,0.36036,0.60497,0.33469,0.50176,0.44076,0.04727,0.01227,0.20394
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
331,0.03787,0.15418,0.04489,0.0,0.00000,0.00088,3.05837,6.81899,5.77793,7.70822,...,0.55377,0.80561,0.49863,0.73317,0.19910,0.36342,0.34314,0.33728,0.04079,0.05548
332,0.03787,0.15418,0.03114,0.0,0.00001,0.00297,2.82540,6.29711,5.69343,8.05088,...,0.42996,0.82607,0.49569,0.73311,0.15318,0.38391,0.33310,0.31985,0.02409,0.05373
333,0.03787,0.15418,0.03077,0.0,0.00004,0.00610,2.70717,5.73415,5.15523,8.08889,...,0.41142,0.76754,0.51573,0.67625,0.28401,0.45603,0.31596,0.32416,0.04635,0.09566
334,0.02792,0.15072,0.03329,0.0,0.00004,0.00611,2.48429,5.20472,4.90899,8.56130,...,0.43870,0.77325,0.41026,0.49273,0.35570,0.26878,0.09229,0.06666,0.02965,0.22586


In [34]:
# Choosing Model after feature selection

pipe = Pipeline([
            ('scaler', StandardScaler()),
            ('classifier', RandomForestClassifier(random_state=42))
        ])
hp = {
            'scaler': [StandardScaler(), RobustScaler(), MinMaxScaler(), Normalizer(), MaxAbsScaler()],
            'classifier': [SVC(), DecisionTreeClassifier(), GaussianNB(), KNeighborsClassifier(), RandomForestClassifier(random_state=42)]
        }
grid = RandomizedSearchCV(pipe, hp, n_iter=25, scoring='accuracy', n_jobs=1, cv=10, random_state=42)
final = grid.fit(train_data_X_30, train_data_y)
final.best_estimator_

Pipeline(steps=[('scaler', RobustScaler()), ('classifier', SVC())])

In [35]:
final.best_score_

0.8934937611408198

In [36]:
result_df = DataFrame.from_dict(final.cv_results_, orient='columns')
result_df[["params", "mean_test_score"]]

Unnamed: 0,params,mean_test_score
0,"{'scaler': StandardScaler(), 'classifier': SVC()}",0.884135
1,"{'scaler': RobustScaler(), 'classifier': SVC()}",0.893494
2,"{'scaler': MinMaxScaler(), 'classifier': SVC()}",0.887255
3,"{'scaler': Normalizer(), 'classifier': SVC()}",0.857219
4,"{'scaler': MaxAbsScaler(), 'classifier': SVC()}",0.893137
5,"{'scaler': StandardScaler(), 'classifier': Dec...",0.812567
6,"{'scaler': RobustScaler(), 'classifier': Decis...",0.806506
7,"{'scaler': MinMaxScaler(), 'classifier': Decis...",0.841889
8,"{'scaler': Normalizer(), 'classifier': Decisio...",0.788859
9,"{'scaler': MaxAbsScaler(), 'classifier': Decis...",0.806595


In [37]:
test_data_X_30 = test_data_X.iloc[:,select_class.get_support()]

final.best_estimator_.score(test_data_X_30, test_data_y)

0.6049679487179487

In [38]:
# hyperparameter tuning for SVC

pipeline = Pipeline([
                     ('scaler', RobustScaler()),
                     ('classifier', SVC())
                      ],
                     memory=None)

params = {"classifier__C":[0.1, 1, 10, 100, 1000],
          "classifier__gamma": [1, 0.1, 0.01, 0.001, 0.0001],
          "classifier__kernel": ['rbf','linear']
         }

param_grid = {'C': [0.1, 1, 10, 100, 1000], 
              'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
              'kernel': ['rbf','linear']} 
  
grid = GridSearchCV(pipeline, params, refit = True, verbose =0,cv=10)
  
# fitting the model for grid search
grid.fit(train_data_X_30, train_data_y)

print("tuned hyperparameters :(best parameters) ",grid.best_params_) 

print("accuracy :",grid.best_score_*100)

tuned hyperparameters :(best parameters)  {'classifier__C': 100, 'classifier__gamma': 0.01, 'classifier__kernel': 'rbf'}
accuracy : 92.59358288770052


In [40]:
grid_predictions = grid.predict(test_data_X_30) 
test_accuracy=accuracy_score(test_data_y,grid_predictions)*100
print("Accuracy for our testing dataset with tuning is : {:.2f}%".format(test_accuracy) )

Accuracy for our testing dataset with tuning is : 62.18%


In [41]:
# hyperparemeter tuning for LinearSVC

pipeline = Pipeline([
                     ('scaler', RobustScaler()),
                     ('classifier', LinearSVC())
                      ],
                     memory=None)

params = {"classifier__C":[0.1, 1, 10, 100, 1000]
#          "classifier__gamma": [1, 0.1, 0.01, 0.001, 0.0001],
#          "classifier__kernel": ['rbf','linear']
         }
 
grid_linsvc = GridSearchCV(pipeline, params, refit = True, verbose =0,cv=10)
  
# fitting the model for grid search
grid_linsvc.fit(train_data_X_30, train_data_y)

print("tuned hyperparameters :(best parameters) ",grid_linsvc.best_params_) 

print("accuracy :",grid_linsvc.best_score_*100)



tuned hyperparameters :(best parameters)  {'classifier__C': 10}
accuracy : 86.6488413547237




In [45]:
grid_predictions = grid_linsvc.best_estimator_.predict(test_data_X_30) 
test_accuracy=accuracy_score(test_data_y,grid_predictions)*100
print("Accuracy for our testing dataset with tuning is : {:.2f}%".format(test_accuracy) )

Accuracy for our testing dataset with tuning is : 62.58%


In [46]:
lin_svc_fay = LinearSVC(C=1, dual=False, class_weight='balanced', max_iter = 100, random_state=42)
lin_svc_fay.fit(train_data_X_30, train_data_y)
predictions = lin_svc_fay.predict(test_data_X_30)

print('training accuracy:', np.mean(cross_val_score(lin_svc_fay,train_data_X, train_data_y,cv=10,scoring="accuracy")))
print('testing accuracy:', accuracy_score(test_data_y, predictions))
print('confusion matrix:', confusion_matrix(test_data_y, predictions))

# https://datascience.stackexchange.com/questions/28426/train-accuracy-vs-test-accuracy-vs-confusion-matrix

training accuracy: 0.8038324420677361
testing accuracy: 0.6217948717948718
confusion matrix: [[409 215]
 [257 367]]


In [80]:
lin_svc_fay_2 = LinearSVC(C=1, dual=False, class_weight='balanced', max_iter = 100, random_state=42)
lin_svc_fay_2.fit(train_data_X, train_data_y)
predictions = lin_svc_fay_2.predict(test_data_X)

print('training accuracy:', np.mean(cross_val_score(lin_svc_fay_2,train_data_X, train_data_y,cv=10,scoring="accuracy")))
print('testing accuracy:', accuracy_score(test_data_y, predictions))
print('confusion matrix:', confusion_matrix(test_data_y, predictions))

# https://datascience.stackexchange.com/questions/28426/train-accuracy-vs-test-accuracy-vs-confusion-matrix

training accuracy: 0.8038324420677361
testing accuracy: 0.6466346153846154
confusion matrix: [[507 117]
 [324 300]]


In [50]:
# selecting features with feature importance permutation

my_model = RandomForestClassifier(n_estimators=100,
                                  random_state=0).fit(train_data_X_scaled, train_data_y)

In [52]:
import eli5
from eli5.sklearn import PermutationImportance

perm = PermutationImportance(lin_svc_fay, random_state=1).fit(test_data_X_scaled, test_data_y)
eli5.show_weights(perm, feature_names = test_data_X.columns.tolist(), top=150)

Weight,Feature
0.0715  ± 0.0106,EMG
0.0093  ± 0.0089,ACC.1
0.0066  ± 0.0076,EMG.1
0.0059  ± 0.0046,EEGg.5
0.0037  ± 0.0030,EEGb.9
0.0034  ± 0.0071,EEGConn.16
0.0030  ± 0.0045,EEGConn.3
0.0030  ± 0.0031,EEGa.13
0.0022  ± 0.0037,EEGa.11
0.0022  ± 0.0049,EEGConn.20


In [66]:
def permutation_importance_feature(model, X, y, n_repeats):
    result = permutation_importance(model, X, y, n_repeats=n_repeats, random_state=42)
    chosen_feats = []
    
    for i in result.importances_mean.argsort()[::-1]:
        if result.importances_mean[i] - 2 * result.importances_std[i] >0:
            chosen_feats.append(X.columns[i])
    
    return chosen_feats, result

In [74]:
chosen, result = permutation_importance_feature(lin_svc_fay_2, train_data_X, train_data_y, 30)

In [76]:
chosen

['EMG',
 'EEGa.1',
 'EEGa.4',
 'EEGa.5',
 'EEGa.9',
 'EEGa.10',
 'EEGa.13',
 'EEGa',
 'EMG.1',
 'EEGa.3',
 'EMG.2',
 'EEGb.1',
 'EEGb.4',
 'EEGa.8',
 'EEGg.13',
 'EEGb.5',
 'EEGb.8',
 'EEGg.8',
 'EEGg.6',
 'EEGth.12',
 'EEGg.4',
 'EEGth.7',
 'ACC.1',
 'EEGth.9',
 'EEGg.12',
 'EEGa.7']

In [79]:
train_data_X_perm = train_data_X.loc[:, chosen]
test_data_X_perm = test_data_X.loc[:, chosen]

train_data_X_perm

Unnamed: 0,EMG,EEGa.1,EEGa.4,EEGa.5,EEGa.9,EEGa.10,EEGa.13,EEGa,EMG.1,EEGa.3,...,EEGb.8,EEGg.8,EEGg.6,EEGth.12,EEGg.4,EEGth.7,ACC.1,EEGth.9,EEGg.12,EEGa.7
0,3.98510,5.57593,7.39746,8.47643,6.31080,4.46472,10.16926,5.77247,4.16329,7.39446,...,3.76727,2.72814,5.63534,4.81479,8.11057,2.56450,6520.83036,4.03971,4.34951,2.75862
1,3.69995,4.26961,6.64888,6.66284,5.23405,3.52305,8.38724,5.80802,3.90775,6.70635,...,2.43230,1.14240,1.70475,4.70753,4.88182,2.04336,6531.95043,4.15952,2.38991,2.91310
2,3.11420,5.73762,8.76481,8.63043,7.01924,4.70875,11.85267,9.15003,3.74782,8.80649,...,2.77989,1.10680,1.31853,4.63060,4.42747,1.89560,6524.41338,4.86761,2.09693,3.18623
3,3.11963,5.51253,8.44539,8.27111,6.70220,5.11825,11.36607,9.46040,3.70944,8.43309,...,2.81840,1.02949,1.26493,4.92243,4.32785,2.04153,6522.16183,4.54632,2.13329,3.30245
4,3.33145,6.25030,10.27402,9.28880,8.05664,5.53824,13.33634,10.19491,3.79952,8.89868,...,2.82263,1.03245,1.16770,4.53448,4.08216,2.67223,6520.32162,4.32416,2.08000,3.09618
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
331,3.05837,9.09050,13.60958,11.99537,11.92132,6.38031,16.29183,12.93556,6.81899,13.89958,...,3.68325,2.22607,4.96396,4.96253,6.64343,3.26539,6514.44701,5.90982,3.63299,4.75679
332,2.82540,8.42773,13.03771,11.15911,10.90212,6.41473,14.62573,12.62698,6.29711,12.64504,...,3.86937,2.27881,5.24971,5.80558,6.99304,3.66933,6502.08566,5.73583,3.76306,4.71150
333,2.70717,7.71033,13.34288,10.27384,10.10264,6.39639,14.10281,12.65701,5.73415,11.09673,...,3.65032,2.01472,4.41513,5.77886,6.50700,3.78054,6497.39222,5.75372,3.64639,5.01581
334,2.48429,7.28189,13.02084,10.72958,9.77572,6.74874,15.16358,12.84769,5.20472,10.94737,...,2.67342,1.13931,1.70142,4.95164,3.73468,3.81643,6491.18975,5.68587,2.92127,5.43976


In [82]:
lin_svc_fay_3 = LinearSVC(C=1, dual=False, class_weight='balanced', max_iter = 100, random_state=42)
lin_svc_fay_3.fit(train_data_X_perm, train_data_y)
predictions = lin_svc_fay_3.predict(test_data_X_perm)

print('training accuracy:', np.mean(cross_val_score(lin_svc_fay_3,train_data_X_perm, train_data_y,cv=10,scoring="accuracy")))
print('testing accuracy:', accuracy_score(test_data_y, predictions))
print('confusion matrix:', confusion_matrix(test_data_y, predictions))

# https://datascience.stackexchange.com/questions/28426/train-accuracy-vs-test-accuracy-vs-confusion-matrix

training accuracy: 0.8390374331550803
testing accuracy: 0.6282051282051282
confusion matrix: [[518 106]
 [358 266]]


In [84]:
# choosing model based on feature importance

# Choosing Model

pipe = Pipeline([
            ('scaler', StandardScaler()),
            ('classifier', RandomForestClassifier(random_state=42))
        ])
hp = {
            'scaler': [StandardScaler(), RobustScaler(), MinMaxScaler(), Normalizer(), MaxAbsScaler()],
            'classifier': [SVC(), DecisionTreeClassifier(), GaussianNB(), KNeighborsClassifier(), RandomForestClassifier(random_state=42)]
        }
grid = RandomizedSearchCV(pipe, hp, n_iter=25, scoring='accuracy', n_jobs=1, cv=10, random_state=42)
final = grid.fit(train_data_X_perm, train_data_y)
final.best_estimator_

Pipeline(steps=[('scaler', StandardScaler()),
                ('classifier', RandomForestClassifier(random_state=42))])

In [85]:
final.best_score_

0.869073083778966

In [86]:
result_df = DataFrame.from_dict(final.cv_results_, orient='columns')
result_df[["params", "mean_test_score"]]

Unnamed: 0,params,mean_test_score
0,"{'scaler': StandardScaler(), 'classifier': SVC()}",0.767647
1,"{'scaler': RobustScaler(), 'classifier': SVC()}",0.758824
2,"{'scaler': MinMaxScaler(), 'classifier': SVC()}",0.797504
3,"{'scaler': Normalizer(), 'classifier': SVC()}",0.579234
4,"{'scaler': MaxAbsScaler(), 'classifier': SVC()}",0.812389
5,"{'scaler': StandardScaler(), 'classifier': Dec...",0.856774
6,"{'scaler': RobustScaler(), 'classifier': Decis...",0.842068
7,"{'scaler': MinMaxScaler(), 'classifier': Decis...",0.862834
8,"{'scaler': Normalizer(), 'classifier': Decisio...",0.838681
9,"{'scaler': MaxAbsScaler(), 'classifier': Decis...",0.841979


In [87]:
final.best_estimator_.score(test_data_X_perm, test_data_y)

0.6434294871794872

In [89]:
rfc_perm = RandomForestClassifier(random_state=42)
rfc_perm.fit(train_data_X, train_data_y)

chosen, result = permutation_importance_feature(rfc_perm, train_data_X, train_data_y, 30)

In [90]:
chosen

#damn. I can not just predict by one feature tho 😂

['EMG']

In [91]:
robust_scaler = RobustScaler()
train_data_X_perm_scaled = robust_scaler.fit_transform(train_data_X_perm.astype(np.float32))
test_data_X_perm_scaled = robust_scaler.transform(test_data_X_perm.astype(np.float32))

In [96]:
rfc_perm.fit(train_data_X_perm_scaled, train_data_y)
predictions = rfc_perm.predict(test_data_X_perm_scaled)

print('training accuracy:', np.mean(cross_val_score(rfc_perm, train_data_X_perm_scaled, train_data_y,cv=10,scoring="accuracy")))
print('testing accuracy:', accuracy_score(test_data_y, predictions))
print('confusion matrix:', confusion_matrix(test_data_y, predictions))

training accuracy: 0.869073083778966
testing accuracy: 0.6434294871794872
confusion matrix: [[507 117]
 [328 296]]


In [None]:
#hyperparameter tuning for random forest after permutation importance