# Machine Learning models on connectivity data

In this notebook: 
- Necessary imports
- SVM model 
- Logistic Regression model
- Decision Tree model

## Imports

In [14]:
import os       # using operating system dependent functionality (folders)
import pandas as pd # data analysis and manipulation
import numpy as np    # numerical computing (manipulating and performing operations on arrays of data)
import copy     # Can Copy and Deepcopy files so original file is untouched.
import seaborn as sn
import matplotlib.pyplot as plt

import sys
sys.path.insert(0, '../eegyolk') # path to helper functions
import helper_functions as hf # library useful for eeg and erp data cleaning
#import initialization_functions #library to import data
import epod_helper

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn import tree

In [15]:
df = pd.read_csv('df_connectivity.csv', sep = ',')

In [16]:
df

Unnamed: 0,32,64,65,96,97,98,128,129,130,131,...,1014,1015,1016,1017,1018,1019,1020,1021,1022,Group_AccToParents
0,0.152030,0.069506,0.028244,0.216019,0.236540,0.110768,0.146072,0.234775,0.123786,0.224846,...,0.092895,0.109223,0.091130,0.110989,0.126434,0.059135,0.116726,0.058032,0.164387,1
1,0.049602,0.027627,0.060067,0.021557,0.016116,0.090414,0.022394,0.053370,0.050649,0.114483,...,0.114692,0.096693,0.043533,0.169527,0.042905,0.028882,0.031812,0.038928,0.098577,0
2,0.380034,0.336761,0.322622,0.404456,0.434876,0.360326,0.452442,0.464439,0.064267,0.492716,...,0.339332,0.338046,0.392888,0.127249,0.395458,0.368038,0.422022,0.341902,0.328620,1
3,0.044057,0.063730,0.048361,0.123975,0.148566,0.029508,0.107992,0.155943,0.071107,0.087090,...,0.045082,0.057582,0.068238,0.159016,0.117418,0.093033,0.115574,0.086475,0.150410,1
4,0.137384,0.124695,0.082479,0.244021,0.198389,0.036115,0.171791,0.170327,0.102733,0.158370,...,0.027086,0.053197,0.029771,0.063202,0.110786,0.061249,0.121279,0.102245,0.085652,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
85,0.066393,0.027664,0.032787,0.065574,0.107992,0.030533,0.080123,0.150205,0.065164,0.136680,...,0.021107,0.042828,0.038525,0.177869,0.128074,0.068852,0.125820,0.072951,0.175820,1
86,0.126844,0.023566,0.020287,0.169672,0.138115,0.068033,0.156762,0.223156,0.099385,0.187500,...,0.070287,0.071107,0.085656,0.211066,0.145902,0.079508,0.137295,0.077049,0.206352,1
87,0.105123,0.071107,0.068033,0.112910,0.087705,0.031762,0.079713,0.110246,0.099590,0.098566,...,0.155738,0.183811,0.116598,0.162090,0.189959,0.080123,0.158402,0.123156,0.202049,1
88,0.121926,0.013934,0.025000,0.178689,0.180533,0.065779,0.159016,0.177664,0.090164,0.148770,...,0.071926,0.055328,0.032787,0.115984,0.097131,0.053689,0.102869,0.072131,0.128484,1


## Split data

In [17]:
y = df['Group_AccToParents'].values # dependant variable
X = df.drop(['Group_AccToParents'],axis=1).values   # independant features


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [18]:
y

array([1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0,
       1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0,
       0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1,
       1, 0])

## Scale data

In [19]:
sc = StandardScaler()
sc.fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

## SVM model

In [20]:
svm = SVC(kernel= 'linear', random_state=1, C=0.1)
svm.fit(X_train, y_train)

In [21]:
y_pred = svm.predict(X_test)
print('Accuracy: %.3f' % accuracy_score(y_test, y_pred))

Accuracy: 0.519


In [22]:
y_pred

array([0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0,
       0, 0, 0, 0, 1])

In [23]:
# OK let's try a k-fold approach instead

In [42]:
from sklearn.model_selection import cross_val_score
svm_scores = cross_val_score(svm, X, y, cv=7)
svm_scores

array([0.53846154, 0.53846154, 0.53846154, 0.53846154, 0.53846154,
       0.53846154, 0.5       ])

In [43]:
print("%0.2f accuracy with a standard deviation of %0.2f" % (svm_scores.mean(), svm_scores.std()))

0.53 accuracy with a standard deviation of 0.01


## Logistic Regression model

In [10]:
lr = LogisticRegression(solver='liblinear', random_state=0)
lr.fit(X_train, y_train)

In [11]:
y_pred = lr.predict(X_test)

In [12]:
print('Accuracy: %.3f' % accuracy_score(y_test, y_pred))

Accuracy: 0.519


In [13]:
y_pred

array([0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0,
       0, 0, 0, 0, 1])

In [44]:
print(lr.coef_)

[[ 2.92624810e-01 -5.56565669e-02 -7.27771977e-02  2.78741247e-01
   2.74218231e-02  1.52318120e-01  1.07227274e-01 -1.30908560e-01
   1.76854080e-01 -1.16429114e-01  1.47627831e-02 -9.62285537e-02
   1.69211862e-01 -2.72913164e-01 -1.87572998e-01 -9.51591420e-02
   5.16049131e-03 -1.83982315e-01 -1.71120642e-01 -9.06896955e-02
  -3.19999346e-02  1.65016226e-01 -7.89033163e-02  1.16655401e-02
  -5.78456021e-02 -1.24686669e-01  1.04039667e-01 -2.62832026e-02
  -1.51643191e-01 -1.59677891e-01 -8.12428738e-02 -3.80832414e-02
  -8.99005569e-03  1.02317613e-01 -6.68204835e-02  1.43322280e-01
   4.48290529e-02 -1.63747496e-02 -3.87396725e-02 -6.43959429e-02
   1.86223597e-02  7.05462274e-02 -4.06584221e-02 -7.48669920e-03
  -1.19168180e-02  1.29068940e-01  1.59098377e-01  3.81249020e-02
  -1.51324702e-03 -8.53993547e-02 -3.38066151e-02  1.11487516e-01
   8.68426390e-03 -4.24554480e-02  7.44769384e-02  9.27079156e-02
   1.02414269e-01  8.00207628e-02 -4.34146407e-02 -3.09361261e-02
   2.06009

In [31]:
# let'sa try some kfoling on that:
lr_scores = cross_val_score(lr, X, y, cv=7)
lr_scores

array([0.61538462, 0.53846154, 0.46153846, 0.76923077, 0.53846154,
       0.38461538, 0.75      ])

In [32]:
# notice here you have a lot of variation depending on the fold you pick
print("%0.2f accuracy with a standard deviation of %0.2f" % (lr_scores.mean(), lr_scores.std()))

0.58 accuracy with a standard deviation of 0.13


So the problem here is believability...the more your accuracy scores vary, the more of a problem you have.

## Decision Tree model

In [35]:
dt = tree.DecisionTreeClassifier()
dt.fit(X_train, y_train)

In [36]:
y_pred = dt.predict(X_test)

In [37]:
print('Accuracy: %.3f' % accuracy_score(y_test, y_pred))

Accuracy: 0.593


In [38]:
y_pred

array([0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1,
       1, 0, 0, 0, 1])

In [39]:
# let'sa try some kfoling on that:
dt_scores = cross_val_score(dt, X, y, cv=7)
dt_scores

array([0.69230769, 0.61538462, 0.53846154, 0.76923077, 0.61538462,
       0.69230769, 0.83333333])

In [41]:
# notice here you have a lot of variation depending on the fold you pick
print("%0.2f accuracy with a standard deviation of %0.2f" % (dt_scores.mean(), dt_scores.std()))

0.68 accuracy with a standard deviation of 0.09


Suddenly decision tree looks hot?