# Machine Learning models on Mismatch Response data

In this notebook: 
- Necessary imports
- SVM model 
- Logistic Regression model
- Decision Tree model

## Imports

In [3]:
import os       # using operating system dependent functionality (folders)
import pandas as pd # data analysis and manipulation
import numpy as np    # numerical computing (manipulating and performing operations on arrays of data)
import copy     # Can Copy and Deepcopy files so original file is untouched.
import seaborn as sn
import matplotlib.pyplot as plt

import sys
sys.path.insert(0, '../eegyolk') # path to helper functions
import helper_functions as hf # library useful for eeg and erp data cleaning
#import initialization_functions #library to import data
import epod_helper

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn import tree

In [4]:
df = pd.read_csv('df_avg_mmr.csv', sep = ',')

In [21]:
df = df.iloc[:,0:643]
df

Unnamed: 0,Group_AccToParents,mean_2_Fp1_a,mean_2_AF3_a,mean_2_F7_a,mean_2_F3_a,mean_2_FC1_a,mean_2_FC5_a,mean_2_T7_a,mean_2_C3_a,mean_2_CP1_a,...,var_11_FC6_a,var_11_FC2_a,var_11_F4_a,var_11_F8_a,var_11_AF4_a,var_11_Fp2_a,var_11_Fz_a,var_11_Cz_a,sex,age_months
0,1,-11.0947,-39.574672,-11.122825,-44.410338,-43.448037,-19.790442,-25.778037,-30.380207,-11.238418,...,503.579841,730.976986,784.8698,721.226241,1370.065441,760.620246,929.182064,1065.772537,1,20
1,0,-10.255731,4.886056,-4.822772,2.250893,14.713139,-1.191946,-7.311456,6.309722,18.437361,...,941.338939,1208.099385,1280.764,1490.128614,2036.161613,1843.434216,1658.302136,1515.948233,0,20
2,1,-42.86275,-20.541033,-46.932457,-12.579665,-8.396244,-33.579034,-24.454217,-17.723062,-21.572698,...,1249.377675,900.42468,832.1268,645.467029,564.436353,605.366144,745.656255,1906.713129,0,20
3,1,-37.661116,-42.125754,-13.50291,-33.103372,-29.136874,-18.201769,-7.275028,-19.956663,-34.235385,...,989.609171,1009.165475,921.7093,636.801686,625.633195,755.2334,1225.743668,782.28392,1,18
4,1,24.722609,44.529421,18.75399,49.012522,52.431363,15.691906,12.532885,26.294814,21.042783,...,537.69612,557.030569,1363.116,2798.693892,1767.691503,2017.651641,1669.381724,458.075286,0,17
5,1,14.837136,-1.695921,4.502377,2.457229,14.347111,18.023014,12.07313,7.606589,9.965992,...,14683.377962,8702.502506,62413.61,16752.989582,23377.504234,9063.182196,13844.466336,10879.205411,1,19
6,1,-114.897729,-99.320951,-72.577558,-71.137474,-60.711771,-56.764269,-40.362471,-49.45482,-50.112234,...,1217.717552,957.389539,351.415,1008.717479,1727.657034,528.757315,999.88712,1549.942881,1,21
7,1,28.061978,44.306052,48.808774,38.364482,23.570651,44.139625,58.580132,43.368189,35.704214,...,994.373334,1080.185547,1471.881,1016.152712,977.294104,2211.740382,1536.969999,692.872617,1,17
8,1,62.661759,68.311398,53.358203,61.922873,39.727281,34.262291,40.425435,33.129862,8.731025,...,1164.608036,942.012104,1243.602,1470.014151,1063.431573,972.086706,948.139421,1007.78058,0,17
9,0,29.842044,32.006968,45.005228,67.691553,-4.687907,40.554216,19.159306,33.255319,32.839679,...,615.077184,507.035807,415.1826,898.905451,952.316884,247.857593,1540.12057,464.358244,0,16


## Split data

In [67]:
y = df['Group_AccToParents'].values # dependant variable
X = df.drop(['Group_AccToParents'],axis=1).values   # independant features


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [68]:
y

array([1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1,
       1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0],
      dtype=int64)

## Scale data

In [69]:
sc = StandardScaler()
sc.fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

## SVM model

In [70]:
svm = SVC(kernel= 'linear', random_state=1, C=0.1)
svm.fit(X_train, y_train)

SVC(C=0.1, kernel='linear', random_state=1)

In [71]:
y_pred = svm.predict(X_test)
print('Accuracy: %.3f' % accuracy_score(y_test, y_pred))

Accuracy: 0.556


In [72]:
y_pred

array([1, 0, 0, 1, 1, 1, 0, 1, 0], dtype=int64)

## Logistic Regression model

In [73]:
lr = LogisticRegression(solver='liblinear', random_state=0)
lr.fit(X_train, y_train)

LogisticRegression(random_state=0, solver='liblinear')

In [74]:
y_pred = lr.predict(X_test)

In [75]:
print('Accuracy: %.3f' % accuracy_score(y_test, y_pred))

Accuracy: 0.667


In [76]:
y_pred

array([1, 0, 0, 0, 1, 1, 0, 1, 0], dtype=int64)

In [77]:
print(lr.coef_)

[[ 6.41811197e-02  2.75145732e-02  1.35146123e-02  2.05616046e-02
  -2.75276018e-02  4.20352233e-02  1.08929871e-01 -4.75028549e-03
  -4.79039788e-03 -2.67187759e-02  5.11330230e-03 -4.29447010e-02
   1.95960586e-02 -1.75116451e-02 -4.26822452e-02  3.95199879e-02
   3.98348827e-02  2.06267617e-02 -3.90275590e-02  2.17615617e-02
   1.65409526e-02  4.10138287e-02  2.72113939e-02  7.21841725e-02
   5.53025128e-02  1.73291966e-03 -2.36820467e-02  1.88735169e-02
   2.99252845e-02  4.51022390e-02 -1.34694137e-02 -4.22283722e-02
  -4.02823964e-02 -7.82539599e-02  2.85007800e-02 -1.63710260e-02
   1.79988452e-03  3.87799204e-02 -7.87185430e-03  2.93790330e-02
  -1.32680021e-02  8.08099140e-03  5.32882604e-02  1.11493094e-02
  -1.67280295e-02  1.45521099e-02  6.48038457e-02 -8.93906910e-03
   3.78827079e-02  3.45422069e-02  5.04167075e-02 -5.61144210e-02
  -5.93865594e-02 -2.45490493e-02 -5.71872133e-03 -2.48196450e-02
   1.84401571e-03 -1.90091989e-02 -3.12066130e-02  7.07290873e-02
  -3.08861

## Decision Tree model

In [78]:
dt = tree.DecisionTreeClassifier()
dt.fit(X_train, y_train)

DecisionTreeClassifier()

In [79]:
y_pred = dt.predict(X_test)

In [80]:
print('Accuracy: %.3f' % accuracy_score(y_test, y_pred))

Accuracy: 0.778


In [81]:
y_pred

array([0, 0, 0, 0, 1, 1, 0, 1, 0], dtype=int64)