# Machine Learning models on connectivity data

In this notebook: 
- Necessary imports
- SVM model 
- Logistic Regression model
- Decision Tree model

## Imports

In [1]:
import os       # using operating system dependent functionality (folders)
import pandas as pd # data analysis and manipulation
import numpy as np    # numerical computing (manipulating and performing operations on arrays of data)
import copy     # Can Copy and Deepcopy files so original file is untouched.
import seaborn as sn
import matplotlib.pyplot as plt

import sys
sys.path.insert(0, '../eegyolk') # path to helper functions
import helper_functions as hf # library useful for eeg and erp data cleaning
#import initialization_functions #library to import data
import epod_helper

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn import tree

In [2]:
df = pd.read_csv('df_connectivity.csv', sep = ',')

In [3]:
df

Unnamed: 0,32,64,65,96,97,98,128,129,130,131,...,1014,1015,1016,1017,1018,1019,1020,1021,1022,Group_AccToParents
0,0.152030,0.069506,0.028244,0.216019,0.236540,0.110768,0.146072,0.234775,0.123786,0.224846,...,0.092895,0.109223,0.091130,0.110989,0.126434,0.059135,0.116726,0.058032,0.164387,1
1,0.049602,0.027627,0.060067,0.021557,0.016116,0.090414,0.022394,0.053370,0.050649,0.114483,...,0.114692,0.096693,0.043533,0.169527,0.042905,0.028882,0.031812,0.038928,0.098577,0
2,0.380034,0.336761,0.322622,0.404456,0.434876,0.360326,0.452442,0.464439,0.064267,0.492716,...,0.339332,0.338046,0.392888,0.127249,0.395458,0.368038,0.422022,0.341902,0.328620,1
3,0.044057,0.063730,0.048361,0.123975,0.148566,0.029508,0.107992,0.155943,0.071107,0.087090,...,0.045082,0.057582,0.068238,0.159016,0.117418,0.093033,0.115574,0.086475,0.150410,1
4,0.137384,0.124695,0.082479,0.244021,0.198389,0.036115,0.171791,0.170327,0.102733,0.158370,...,0.027086,0.053197,0.029771,0.063202,0.110786,0.061249,0.121279,0.102245,0.085652,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
85,0.066393,0.027664,0.032787,0.065574,0.107992,0.030533,0.080123,0.150205,0.065164,0.136680,...,0.021107,0.042828,0.038525,0.177869,0.128074,0.068852,0.125820,0.072951,0.175820,1
86,0.126844,0.023566,0.020287,0.169672,0.138115,0.068033,0.156762,0.223156,0.099385,0.187500,...,0.070287,0.071107,0.085656,0.211066,0.145902,0.079508,0.137295,0.077049,0.206352,1
87,0.105123,0.071107,0.068033,0.112910,0.087705,0.031762,0.079713,0.110246,0.099590,0.098566,...,0.155738,0.183811,0.116598,0.162090,0.189959,0.080123,0.158402,0.123156,0.202049,1
88,0.121926,0.013934,0.025000,0.178689,0.180533,0.065779,0.159016,0.177664,0.090164,0.148770,...,0.071926,0.055328,0.032787,0.115984,0.097131,0.053689,0.102869,0.072131,0.128484,1


## Split data

In [4]:
y = df['Group_AccToParents'].values # dependant variable
X = df.drop(['Group_AccToParents'],axis=1).values   # independant features


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [5]:
y

array([1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0,
       1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0,
       0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1,
       1, 0], dtype=int64)

## Scale data

In [6]:
sc = StandardScaler()
sc.fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

## SVM model

In [7]:
svm = SVC(kernel= 'linear', random_state=1, C=0.1)
svm.fit(X_train, y_train)

SVC(C=0.1, kernel='linear', random_state=1)

In [8]:
y_pred = svm.predict(X_test)
print('Accuracy: %.3f' % accuracy_score(y_test, y_pred))

Accuracy: 0.611


In [9]:
y_pred

array([1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0], dtype=int64)

## Logistic Regression model

In [10]:
lr = LogisticRegression(solver='liblinear', random_state=0)
lr.fit(X_train, y_train)

LogisticRegression(random_state=0, solver='liblinear')

In [11]:
y_pred = lr.predict(X_test)

In [12]:
print('Accuracy: %.3f' % accuracy_score(y_test, y_pred))

Accuracy: 0.611


In [13]:
y_pred

array([1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0], dtype=int64)

In [14]:
print(lr.coef_)

[[ 1.82264678e-01 -2.46356357e-01 -1.61989107e-01  2.03102281e-01
  -3.90751863e-02  1.10828294e-01  1.25036855e-01 -1.02349548e-01
   1.81476512e-01 -3.54700486e-02  3.10194690e-03 -3.83065065e-02
   1.30639454e-01 -2.72338440e-01 -1.48288388e-01 -1.89644300e-01
  -9.50992805e-02 -1.98478053e-01 -2.16623801e-01 -1.44611288e-01
   9.22329747e-02  1.57791841e-01 -1.77657557e-02  4.95403365e-02
   1.46797613e-02 -1.40013507e-01  1.59556844e-01 -6.02384260e-02
  -1.67136751e-01 -1.49634134e-01 -1.93114104e-02 -4.05247159e-02
  -9.22482751e-02  4.99064437e-02 -1.57698153e-01  1.05791427e-01
  -6.08847576e-02 -4.39272807e-02  6.53146251e-02 -7.44479399e-02
   6.03309791e-02  3.31876632e-02 -6.62620549e-02 -2.41845091e-02
  -3.20552906e-02  1.06157440e-01  1.92170764e-01  2.25618483e-01
   1.73422747e-01  2.27062931e-02  2.93362786e-04  3.88264352e-02
  -2.58387007e-02 -5.94348552e-02  3.87257675e-02  1.03435497e-01
   1.24419030e-01  7.46216808e-02  6.78564700e-02  9.00189107e-02
   2.50670

## Decision Tree model

In [15]:
dt = tree.DecisionTreeClassifier()
dt.fit(X_train, y_train)

DecisionTreeClassifier()

In [16]:
y_pred = dt.predict(X_test)

In [17]:
print('Accuracy: %.3f' % accuracy_score(y_test, y_pred))

Accuracy: 0.444


In [18]:
y_pred

array([0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0], dtype=int64)

<font color='blue'>Review:  If you have time later there are couple possible typo-s in the comments, and you can add more explanation markdown cells. But overall, good job! </font>