# Machine Learning models on connectivity data

In this notebook:
- Necessary imports
- Loading the connectivity data, train/test split and feature scaling
- SVM model
- Logistic Regression model
- Decision Tree model

## Imports

In [1]:
import os       # operating-system dependent functionality (folders)
import pandas as pd # data analysis and manipulation
import numpy as np    # numerical computing (manipulating and performing operations on arrays of data)
import copy     # shallow and deep copies of Python objects, so the original object stays untouched
import seaborn as sn  # statistical plotting
import matplotlib.pyplot as plt  # general plotting

import sys
sys.path.insert(0, '../eegyolk') # put the folder with the helper functions first on the import path
import helper_functions as hf # library useful for eeg and erp data cleaning
#import initialization_functions #library to import data
import epod_helper

# scikit-learn pieces used below: data splitting, scaling, the three classifiers and the accuracy metric
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn import tree

In [2]:
# Load the connectivity table; the path is relative to the notebook's working directory.
connectivity_csv = 'df_connectivity.csv'
df = pd.read_csv(connectivity_csv, sep=',')

In [3]:
# Show the loaded table as the cell output (pandas elides the middle rows/columns with "...").
df

Unnamed: 0,32,64,65,96,97,98,128,129,130,131,...,1014,1015,1016,1017,1018,1019,1020,1021,1022,Group_AccToParents
0,0.245155,0.090722,0.026804,0.285567,0.198144,0.146186,0.274433,0.292371,0.226598,0.251134,...,0.116289,0.041856,0.120619,0.158969,0.094639,0.065567,0.097113,0.084948,0.091340,1
1,0.242668,0.084469,0.034490,0.290582,0.234201,0.144775,0.295952,0.298843,0.241223,0.250516,...,0.009913,0.023337,0.013631,0.009707,0.005370,0.033251,0.007435,0.021272,0.010739,1
2,0.205882,0.031632,0.059378,0.298557,0.283019,0.171476,0.352386,0.380133,0.260266,0.303552,...,0.119312,0.171476,0.218091,0.209212,0.261931,0.225305,0.219201,0.210877,0.229745,1
3,0.246960,0.037317,0.106289,0.121384,0.022013,0.099161,0.084067,0.042977,0.072746,0.051782,...,0.151363,0.137107,0.145702,0.054507,0.106918,0.120964,0.100210,0.101048,0.050943,1
4,0.166667,0.065981,0.011354,0.242502,0.179306,0.118038,0.302699,0.327335,0.191088,0.301628,...,0.052057,0.038346,0.029349,0.063410,0.035347,0.011568,0.021422,0.021851,0.009426,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
96,0.248847,0.050154,0.062068,0.279593,0.263836,0.170061,0.338586,0.371253,0.250192,0.291699,...,0.027095,0.017487,0.004612,0.024981,0.013259,0.027479,0.019216,0.032091,0.027671,1
97,0.172960,0.019555,0.093729,0.228928,0.183749,0.185098,0.257249,0.271409,0.220836,0.207687,...,0.020904,0.015846,0.042819,0.051247,0.055293,0.040796,0.047876,0.047876,0.077883,1
98,0.180275,0.072018,0.018578,0.202752,0.144725,0.120183,0.141972,0.147248,0.120183,0.125917,...,0.023165,0.084862,0.038303,0.015826,0.024083,0.033945,0.032110,0.032339,0.016972,0
99,0.182128,0.052524,0.069577,0.141201,0.091405,0.085266,0.104366,0.096180,0.021146,0.061392,...,0.056617,0.074352,0.047067,0.023192,0.022510,0.039563,0.030696,0.033424,0.027285,1


## Split data

In [4]:
# Separate the class label from the feature columns.
# 'Unnamed: 0' is the column pandas creates when a CSV's first header cell is empty
# (presumably the saved row index - verify against how df_connectivity.csv was written).
# It is an identifier, not a connectivity value, so it must not be fed to the models.
# errors='ignore' keeps this cell working if that column is absent.
y = df['Group_AccToParents'].values  # dependent variable (class label)
X = df.drop(columns=['Group_AccToParents', 'Unnamed: 0'], errors='ignore').values  # independent features


# Hold out 30% of the samples for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [5]:
# Display the label vector taken from the Group_AccToParents column.
y

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1,
       0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1,
       1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0,
       1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1,
       1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0])

## Scale data

In [6]:
# Fit the scaler on the training split only, then apply that same transform to
# both splits, so the test set contributes nothing to the scaling statistics.
sc = StandardScaler().fit(X_train)
X_train, X_test = sc.transform(X_train), sc.transform(X_test)

## SVM model

In [7]:
# Linear-kernel support vector classifier with regularisation parameter C=0.1,
# trained on the scaled training split.
svm = SVC(C=0.1, kernel='linear', random_state=1)
svm.fit(X_train, y_train)

In [8]:
# Predict labels for the held-out split and report the fraction that match y_test.
y_pred = svm.predict(X_test)
svm_accuracy = accuracy_score(y_test, y_pred)
print('Accuracy: %.3f' % svm_accuracy)

Accuracy: 0.355


In [9]:
# Display the predicted test-set labels currently stored in y_pred.
y_pred

array([1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1,
       1, 1, 0, 0, 0, 1, 1, 0, 1])

## Logistic Regression model

In [10]:
# Logistic regression using the liblinear solver, trained on the scaled training split.
lr = LogisticRegression(random_state=0, solver='liblinear')
lr.fit(X_train, y_train)

In [11]:
# Predict labels for the held-out split with the fitted logistic regression.
# NOTE: this reuses the name y_pred, replacing any earlier model's predictions.
y_pred = lr.predict(X_test)

In [12]:
# Fraction of test samples whose label in y_pred matches y_test.
lr_accuracy = accuracy_score(y_test, y_pred)
print('Accuracy: %.3f' % lr_accuracy)

Accuracy: 0.387


In [13]:
# Display the predicted test-set labels currently stored in y_pred.
y_pred

array([1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1,
       1, 1, 0, 0, 0, 1, 1, 0, 1])

In [14]:
# Print the fitted logistic-regression weights: a 2-D array with a single row
# holding one coefficient per input feature.
print(lr.coef_)

[[ 4.56377491e-02  1.54483198e-03 -1.59213948e-01  8.63815008e-02
  -8.93552139e-02  1.16779902e-01 -9.62270039e-03 -9.96036121e-02
  -1.78624558e-02 -5.41832738e-02 -2.80969793e-02  9.80350481e-02
  -2.12823803e-01  4.46138953e-02  1.03831484e-01  9.65786969e-02
  -1.19444055e-01 -3.33566928e-02 -2.55017861e-02 -7.95170218e-02
  -1.38692049e-01  2.84467040e-02  2.26616599e-02 -2.47218109e-02
   1.50522137e-02  2.78673041e-02 -1.37854507e-01 -3.20953535e-01
  -7.44751589e-02 -9.65723901e-02  4.47276994e-03 -1.56808103e-02
  -1.43189092e-01  2.71386284e-02  5.12089086e-02  1.55237222e-02
   1.01957359e-01  2.96517748e-03 -1.18569460e-02  6.84902734e-02
  -2.25423007e-03 -5.63001009e-02 -1.75706122e-01  2.60956045e-03
  -8.54226009e-02  1.46817906e-01 -8.08954595e-03  5.75979243e-02
   8.20560474e-02 -9.92649310e-02 -1.44934984e-01 -1.06985814e-01
  -1.62979976e-01  3.14846856e-03  1.77728779e-01  8.82277711e-02
   2.93342069e-02 -3.03734341e-02  1.09019872e-02  1.49992025e-02
  -8.95133

## Decision Tree model

In [15]:
# Decision tree trained on the scaled training split.
# scikit-learn permutes the candidate features at every split, so without a seed
# tied splits can be resolved differently on each run. Pinning random_state makes
# the fitted tree, and the accuracy reported below, reproducible, in line with
# the seeded SVM and logistic-regression cells.
dt = tree.DecisionTreeClassifier(random_state=0)
dt.fit(X_train, y_train)

In [16]:
# Predict labels for the held-out split with the fitted decision tree.
# NOTE: this reuses the name y_pred, replacing any earlier model's predictions.
y_pred = dt.predict(X_test)

In [17]:
# Fraction of test samples whose label in y_pred matches y_test.
dt_accuracy = accuracy_score(y_test, y_pred)
print('Accuracy: %.3f' % dt_accuracy)

Accuracy: 0.387


In [18]:
# Display the predicted test-set labels currently stored in y_pred.
y_pred

array([0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0,
       1, 0, 0, 1, 0, 0, 1, 1, 1])