# Machine Learning models on Mismatch Response data

In this notebook: 
- Necessary imports
- SVM model 
- Logistic Regression model
- Decision Tree model

## Imports

In [1]:
import os       # using operating system dependent functionality (folders)
import pandas as pd # data analysis and manipulation
import numpy as np    # numerical computing (manipulating and performing operations on arrays of data)
import copy     # Can Copy and Deepcopy files so original file is untouched.
import seaborn as sn
import matplotlib.pyplot as plt

import sys
sys.path.insert(0, '../eegyolk') # path to helper functions
import helper_functions as hf # library useful for eeg and erp data cleaning
#import initialization_functions #library to import data
import epod_helper

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn import tree

In [2]:
df = pd.read_csv('df_avg_mmr.csv', sep = ',')

In [3]:
df = df.iloc[:,0:643]
df

Unnamed: 0,Group_AccToParents,mean_2_Fp1_a,mean_2_AF3_a,mean_2_F7_a,mean_2_F3_a,mean_2_FC1_a,mean_2_FC5_a,mean_2_T7_a,mean_2_C3_a,mean_2_CP1_a,...,var_11_FC6_a,var_11_FC2_a,var_11_F4_a,var_11_F8_a,var_11_AF4_a,var_11_Fp2_a,var_11_Fz_a,var_11_Cz_a,sex,age_months
0,1,3.981499,11.639081,-18.811622,18.343454,10.023632,-4.024143,-13.999364,-17.671889,-29.638338,...,829.777806,1753.590043,1872.421,1177.183239,1552.274569,1118.361656,2046.250029,4111.674289,1,20
1,0,2.390371,-17.843979,-3.338771,7.53674,-10.998419,18.220947,16.625307,-4.156407,-17.93516,...,1716.894816,715.093446,1024.329,481.196607,709.788804,3294.746439,1030.659485,648.467398,0,20
2,1,-18.065883,-17.785199,-12.858838,-8.709094,9.764843,-1.953743,-2.150446,11.090374,0.289517,...,3921.895014,4897.122661,6606.174,5596.697825,6677.814226,6812.380543,8331.428201,5447.032441,0,20
3,1,52.196651,51.457712,22.673943,32.572107,5.180163,18.782386,20.845702,7.723524,5.208093,...,748.065695,533.67245,928.1573,896.222899,1581.711611,1264.485741,1650.761041,924.774325,1,18
4,1,-841.737108,-801.240011,-879.743229,-923.354318,-801.476465,-847.688256,-737.339387,-853.147914,-732.422316,...,4005.195721,3422.435494,2176.505,5619.503662,1758.419391,1983.67301,2476.343447,6167.777136,0,17
5,1,1.179806,6.366317,-22.585827,-16.469787,-26.659719,-20.820073,-44.714848,-5.861646,-4.621675,...,789.29627,603.855148,537.8956,615.502619,273.243307,306.952173,301.945278,546.46901,1,19
6,1,-104.473044,-114.661477,-115.992143,-104.786412,-97.066618,-104.369716,-116.949484,-105.987364,-99.901829,...,1045.064913,284.027252,400.1896,838.326418,439.621607,1428.227169,394.316814,306.835182,1,21
7,1,-36.590509,-35.888334,-32.303728,-41.009655,-39.813567,-35.966233,-32.699715,-45.066325,-43.378386,...,666.194308,648.813327,389.1094,449.531289,425.906571,333.908982,559.57066,1421.601679,1,17
8,1,-13.833605,-3.938048,11.836307,-27.299734,-8.398938,-128.276705,25.8612,-4.607624,-26.808643,...,4297.169544,2505.760442,176761.7,1957.545428,1909.534267,1745.253906,3390.764319,3640.787988,0,17
9,0,32.232969,33.826705,38.582741,10.956048,42.392605,35.946838,31.716506,53.21449,52.874939,...,1640.500114,646.921421,2145.911,3722.645729,1250.317139,548.790188,2443.149767,794.242652,0,16


## Split data

In [4]:
y = df['Group_AccToParents'].values # dependant variable
X = df.drop(['Group_AccToParents'],axis=1).values   # independant features


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [5]:
y

array([1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1,
       1, 0, 0, 0, 0, 0], dtype=int64)

## Scale data

In [6]:
sc = StandardScaler()
sc.fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

## SVM model

In [7]:
svm = SVC(kernel= 'linear', random_state=1, C=0.1)
svm.fit(X_train, y_train)

SVC(C=0.1, kernel='linear', random_state=1)

In [8]:
y_pred = svm.predict(X_test)
print('Accuracy: %.3f' % accuracy_score(y_test, y_pred))

Accuracy: 0.667


In [9]:
y_pred

array([0, 1, 1, 1, 0, 1], dtype=int64)

## Logistic Regression model

In [10]:
lr = LogisticRegression(solver='liblinear', random_state=0)
lr.fit(X_train, y_train)

LogisticRegression(random_state=0, solver='liblinear')

In [11]:
y_pred = lr.predict(X_test)

In [12]:
print('Accuracy: %.3f' % accuracy_score(y_test, y_pred))

Accuracy: 0.833


In [13]:
y_pred

array([0, 0, 1, 1, 0, 1], dtype=int64)

In [18]:
print(lr.coef_)

[[-5.14953576e-03 -2.77462255e-03 -1.00069983e-02 -1.18191132e-02
  -9.94070994e-03 -1.30945375e-02 -2.58100193e-02 -1.69386666e-02
  -8.08934859e-03 -1.50886081e-02 -1.80055660e-02 -2.03021165e-02
  -4.78572162e-03 -8.53964509e-03 -6.53431313e-03 -7.96875862e-03
  -1.20840989e-02  1.53384910e-03 -4.33872643e-02 -2.19073796e-02
  -2.50808209e-02 -1.51817318e-02 -2.15998313e-02 -2.05969932e-02
  -1.80484757e-02 -1.14540060e-02 -4.06972570e-03 -1.36692870e-02
  -6.10947256e-03 -5.70386988e-03 -1.11813631e-02 -2.03130421e-02
  -1.40701783e-02  1.83243945e-02  1.25360948e-02  2.16307959e-02
   5.51880275e-02  6.53926746e-03  2.46286596e-02  4.74691494e-02
   7.68423470e-02  2.51163328e-02  4.38036359e-02  7.28593404e-02
   4.26400108e-02  4.39943164e-02  3.74946574e-02  2.62622062e-04
  -1.26153010e-02  8.32849246e-03 -4.37237176e-02  2.17883665e-02
   2.95457171e-02  3.46799062e-03  3.04534779e-02  3.44774526e-02
   4.47323045e-02  5.16134815e-02 -3.69683342e-02  3.92147799e-02
  -1.39614

## Decision Tree model

In [14]:
dt = tree.DecisionTreeClassifier()
dt.fit(X_train, y_train)

DecisionTreeClassifier()

In [15]:
y_pred = dt.predict(X_test)

In [16]:
print('Accuracy: %.3f' % accuracy_score(y_test, y_pred))

Accuracy: 0.167


In [17]:
y_pred

array([0, 1, 0, 0, 0, 0], dtype=int64)