# Machine Learning models on Mismatch Response data

In this notebook: 
- Necessary imports
- SVM model 
- Logistic Regression model
- Decision Tree model

## Imports

In [1]:
import os       # using operating system dependent functionality (folders)
import pandas as pd # data analysis and manipulation
import numpy as np    # numerical computing (manipulating and performing operations on arrays of data)
import copy     # Can Copy and Deepcopy files so original file is untouched.
import seaborn as sn
import matplotlib.pyplot as plt

import sys
sys.path.insert(0, '../eegyolk') # path to helper functions
import helper_functions as hf # library useful for eeg and erp data cleaning
import epod_helper

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn import tree
from sklearn.decomposition import PCA
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold 

In [2]:
# Load the mismatch-response feature table from CSV and preview its first rows.
df = pd.read_csv('df_avg_mmr.csv')  # comma is already read_csv's default separator
df.head()

Unnamed: 0,Group_AccToParents,mean_AF3,mean_AF4,mean_C3,mean_C4,mean_CP1,mean_CP2,mean_CP5,mean_CP6,mean_Cz,...,var_P8,var_PO3,var_PO4,var_Pz,var_T7,var_T8,ParticipantID,test,sex,age_months
0,1,-0.307508,-1.117406,-0.265137,-2.490048,-0.344755,-1.952034,0.244291,-2.060369,-0.430365,...,4.081511,2.823812,3.707833,1.865558,1.246092,4.157734,101,a,1,20
1,0,-1.286638,-1.29736,-0.781005,0.5587,0.314373,0.903432,-1.469916,0.213751,0.833553,...,4.967815,2.167772,2.121805,0.864177,0.62834,2.540204,102,a,0,20
2,1,-1.565712,-0.48195,0.163128,2.782407,2.052576,3.038309,-2.244892,4.449279,1.505256,...,18.145947,5.509024,10.550081,10.769883,4.60933,10.190182,103,a,0,20
3,1,-0.059614,-0.635098,0.239789,0.663436,0.053435,-0.615029,0.61374,1.628879,-0.009549,...,3.938772,1.845088,2.675353,1.101257,1.635812,6.517099,104,a,1,18
4,1,1.518115,3.254148,0.475445,0.2399,0.444679,-0.945208,1.471047,0.360562,0.115756,...,5.143383,0.971683,1.68868,1.125295,11.47491,6.96574,105,a,0,17


In [3]:
# Columns used for the correlation analysis: five summary statistics for each
# of seven channels (statistic-major order), followed by sex and age in months.
features_of_interest = df[
    ['{}_{}'.format(stat, channel)
     for stat in ('mean', 'kurt', 'skew', 'std', 'var')
     for channel in ('AF3', 'F3', 'F7', 'FC1', 'FC5', 'Fp1', 'Fz')]
    + ['sex', 'age_months']
]

In [4]:
# Pairwise correlation matrix of the selected feature columns (DataFrame.corr with its default settings).
dfcor = features_of_interest.corr()

In [5]:
# Display the correlation matrix.
# NOTE(review): the stored output shows coefficients of ~1.0 between almost all
# EEG features (mean, kurt, std and var columns alike) - that looks implausible;
# TODO confirm the input table is not dominated by a few extreme rows.
dfcor

Unnamed: 0,mean_AF3,mean_F3,mean_F7,mean_FC1,mean_FC5,mean_Fp1,mean_Fz,kurt_AF3,kurt_F3,kurt_F7,...,std_Fz,var_AF3,var_F3,var_F7,var_FC1,var_FC5,var_Fp1,var_Fz,sex,age_months
mean_AF3,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.99999,0.999419,0.999981,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,-0.181848,0.059284
mean_F3,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.99999,0.999419,0.999981,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,-0.181848,0.059284
mean_F7,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.99999,0.999419,0.999981,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,-0.181848,0.059284
mean_FC1,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.99999,0.999419,0.999981,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,-0.181848,0.059284
mean_FC5,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.99999,0.999419,0.999981,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,-0.181848,0.059284
mean_Fp1,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.99999,0.999419,0.999981,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,-0.181848,0.059284
mean_Fz,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.99999,0.999419,0.999981,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,-0.181848,0.059284
kurt_AF3,0.99999,0.99999,0.99999,0.99999,0.99999,0.99999,0.99999,1.0,0.999511,0.99998,...,0.99999,0.99999,0.99999,0.99999,0.99999,0.99999,0.99999,0.99999,-0.181755,0.05923
kurt_F3,0.999419,0.999419,0.999419,0.999419,0.999419,0.999419,0.999419,0.999511,1.0,0.99951,...,0.999419,0.999419,0.999419,0.999419,0.999419,0.999419,0.999419,0.999419,-0.177811,0.063848
kurt_F7,0.999981,0.999981,0.999981,0.999981,0.999981,0.999981,0.999981,0.99998,0.99951,1.0,...,0.999981,0.999981,0.999981,0.999981,0.999981,0.999981,0.999981,0.999981,-0.181131,0.061298


In [None]:
def get_redundant_pairs(df):
    '''Return the label pairs on and below the diagonal of a square column grid.

    For every column position ``row`` the result contains the tuples
    ``(columns[row], columns[col])`` for all ``col <= row``, i.e. each
    column paired with itself and with every column that precedes it.

    Parameters
    ----------
    df : pandas.DataFrame
        Only ``df.columns`` is read.

    Returns
    -------
    set of tuple
        The (label, label) pairs described above; empty if ``df`` has no columns.
    '''
    labels = df.columns
    width = len(labels)
    return {
        (labels[row], labels[col])
        for row in range(width)
        for col in range(row + 1)
    }

def get_top_abs_correlations(df, n=5):
    '''Return the ``n`` largest absolute pairwise correlations between columns of ``df``.

    The correlation matrix is computed here (``df.corr()``), so ``df`` is the
    table of observations. Self-pairs and the mirrored duplicate of each pair
    are removed via ``get_redundant_pairs`` before ranking.

    Parameters
    ----------
    df : pandas.DataFrame
        Table whose columns are correlated with each other.
    n : int, default 5
        Number of leading entries to keep after sorting in descending order.

    Returns
    -------
    pandas.Series
        Absolute correlations indexed by a pair of column labels, largest first.
    '''
    abs_corr = df.corr().abs()
    pairwise = abs_corr.unstack()
    redundant = get_redundant_pairs(df)
    ranked = pairwise.drop(labels=redundant).sort_values(ascending=False)
    return ranked[:n]

print("Top Absolute Correlations")
# Pass the feature table itself: get_top_abs_correlations() calls .corr()
# internally, so handing it the already-computed matrix `dfcor` would rank
# correlations *of correlation columns* instead of correlations between features.
print(get_top_abs_correlations(features_of_interest, 50))

## Split data

In [None]:
# Dependent variable: the group label of every row, as a NumPy array.
y = df['Group_AccToParents'].values

# Predictors: four summary statistics for each of seven channels
# (statistic-major order) plus sex and age in months.
# The var_* columns are not part of the model input.
X = df[
    ['{}_{}'.format(stat, channel)
     for stat in ('mean', 'kurt', 'skew', 'std')
     for channel in ('AF3', 'F3', 'F7', 'FC1', 'FC5', 'Fp1', 'Fz')]
    + ['sex', 'age_months']
]

# Hold out 40% of the rows for testing; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.4,
    random_state=42,
)

## Scale data and apply PCA

In [None]:
# Standardise the features. The scaler is fitted on the training split only
# and the same fitted transformation is then applied to the test split.
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [None]:
# Project the scaled features onto principal components.
# The projection is fitted on the training split and re-used for the test split.
# No n_components is passed, so PCA's library default applies
# (presumably no components are discarded - TODO confirm this is intended).
pca = PCA()
X_train = pca.fit_transform(X_train)
X_test = pca.transform(X_test)

## SVM model

In [None]:
# Grid search over the SVC kernel type and the C parameter, fitted on the training data.
svm = SVC()
parameters = dict(kernel=('linear', 'rbf'), C=[0.001, 1])
clf = GridSearchCV(estimator=svm, param_grid=parameters)
clf.fit(X_train, y_train)

# List the keys recorded in the search results.
sorted(clf.cv_results_)

In [None]:
# Parameter combination selected by the grid search.
clf.best_params_

In [None]:
# Score of the fitted grid search on the training data itself.
# NOTE(review): this is not a held-out estimate - the same rows were used for fitting.
clf.score(X_train, y_train)

In [None]:
# Final SVM, trained on the training split.
# C and kernel are hard-coded - presumably the grid-search winner; verify
# against clf.best_params_ whenever the data or the parameter grid changes.
# random_state expects an int seed (or None); the previous boolean False only
# worked because it is coerced to the integer 0, so the seed is now written as 0.
svm = SVC(C=1, kernel='linear', random_state=0)
svm.fit(X_train, y_train)

In [None]:
# Predict the test samples with the SVM and report the share of correct labels.
y_pred = svm.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)
print('Accuracy: %.3f' % test_accuracy)

In [None]:
# Show the raw predicted labels produced by the SVM.
y_pred

In [None]:
# Confusion matrix of y_pred against y_test, drawn as an annotated heatmap.
cm = confusion_matrix(y_test, y_pred)
ax = plt.subplot()
sn.heatmap(cm, annot=True, fmt='g', ax=ax)
ax.set_xlabel('Predicted labels')
ax.set_ylabel('True labels')
ax.set_title('Confusion Matrix')
# Tick labels assume class 0 is 'control' and class 1 is 'at risk' -
# TODO confirm against the encoding of Group_AccToParents.
ax.xaxis.set_ticklabels(['control', 'at risk'])
ax.yaxis.set_ticklabels(['control', 'at risk']);

In [None]:
# K-fold cross-validation of the SVM on the full feature table.
#
# The folds use their own variable names on purpose: rebinding X_train /
# X_test / y_train / y_test inside the loop would replace the scaled hold-out
# split, and every later cell that reads those names would silently work on
# the last fold's rows instead.
#
# NOTE(review): folds are taken in row order (no shuffle) and on the unscaled
# columns of X, unlike the hold-out evaluation, which runs on scaled and
# PCA-projected data - confirm that this difference is intended.
k = 3
kf = KFold(n_splits=k, random_state=None)
# Work on a copy so the estimator fitted on the training split stays untouched.
model = copy.deepcopy(svm)

acc_score = []

for train_index, test_index in kf.split(X):
    X_fold_train, X_fold_test = X.iloc[train_index, :], X.iloc[test_index, :]
    y_fold_train, y_fold_test = y[train_index], y[test_index]

    model.fit(X_fold_train, y_fold_train)
    pred_values = model.predict(X_fold_test)

    # accuracy_score takes (y_true, y_pred) in that order.
    acc = accuracy_score(y_fold_test, pred_values)
    acc_score.append(acc)

avg_acc_score = sum(acc_score) / len(acc_score)

print('accuracy of each fold - {}'.format(acc_score))
print('Avg accuracy : {}'.format(avg_acc_score))

## Logistic Regression model

In [None]:
# Logistic regression with the liblinear solver and a fixed seed,
# fitted on the current X_train / y_train.
lr = LogisticRegression(
    solver='liblinear',
    random_state=0,
)
lr.fit(X_train, y_train)

In [None]:
# Predict labels for X_test with the fitted logistic-regression model.
y_pred = lr.predict(X_test)

In [None]:
# Share of predictions in y_pred that match y_test, printed with three decimals.
print('Accuracy: %.3f' % accuracy_score(y_test, y_pred))

In [None]:
# Show the raw predicted labels produced by the logistic-regression model.
y_pred

In [None]:
# Confusion matrix of y_pred against y_test, drawn as an annotated heatmap.
cm = confusion_matrix(y_test, y_pred)
ax = plt.subplot()
sn.heatmap(cm, annot=True, fmt='g', ax=ax)
ax.set_xlabel('Predicted labels')
ax.set_ylabel('True labels')
ax.set_title('Confusion Matrix')
# Tick labels assume class 0 is 'control' and class 1 is 'at risk' -
# TODO confirm against the encoding of Group_AccToParents.
ax.xaxis.set_ticklabels(['control', 'at risk'])
ax.yaxis.set_ticklabels(['control', 'at risk']);

In [None]:
# K-fold cross-validation of the logistic-regression model on the full feature table.
#
# The folds use their own variable names on purpose: rebinding X_train /
# X_test / y_train / y_test inside the loop would replace the hold-out split,
# and every later cell that reads those names would silently work on the
# last fold's rows instead.
#
# NOTE(review): folds are taken in row order (no shuffle) and on the unscaled
# columns of X - confirm that this is intended.
k = 3
kf = KFold(n_splits=k, random_state=None)
# Work on a copy so the estimator fitted earlier as `lr` stays untouched.
model = copy.deepcopy(lr)

acc_score = []

for train_index, test_index in kf.split(X):
    X_fold_train, X_fold_test = X.iloc[train_index, :], X.iloc[test_index, :]
    y_fold_train, y_fold_test = y[train_index], y[test_index]

    model.fit(X_fold_train, y_fold_train)
    pred_values = model.predict(X_fold_test)

    # accuracy_score takes (y_true, y_pred) in that order.
    acc = accuracy_score(y_fold_test, pred_values)
    acc_score.append(acc)

avg_acc_score = sum(acc_score) / len(acc_score)

print('accuracy of each fold - {}'.format(acc_score))
print('Avg accuracy : {}'.format(avg_acc_score))

## Decision Tree model

In [None]:
# Decision tree limited to depth 5, fitted on the current X_train / y_train.
# The seed is fixed so that re-running the notebook rebuilds the same tree;
# without it, equally good candidate splits may be resolved differently per run.
dt = tree.DecisionTreeClassifier(max_depth=5, random_state=0)
dt.fit(X_train, y_train)

In [None]:
# Predict labels for X_test with the fitted decision tree.
y_pred = dt.predict(X_test)

In [None]:
# Share of predictions in y_pred that match y_test, printed with three decimals.
print('Accuracy: %.3f' % accuracy_score(y_test, y_pred))

In [None]:
# Show the raw predicted labels produced by the decision tree.
y_pred

In [None]:
# Draw the fitted decision tree.
tree.plot_tree(dt)

In [None]:
# Confusion matrix of y_pred against y_test, drawn as an annotated heatmap.
cm = confusion_matrix(y_test, y_pred)
ax = plt.subplot()
sn.heatmap(cm, annot=True, fmt='g', ax=ax)
ax.set_xlabel('Predicted labels')
ax.set_ylabel('True labels')
ax.set_title('Confusion Matrix')
# Tick labels assume class 0 is 'control' and class 1 is 'at risk' -
# TODO confirm against the encoding of Group_AccToParents.
ax.xaxis.set_ticklabels(['control', 'at risk'])
ax.yaxis.set_ticklabels(['control', 'at risk']);

In [None]:
# K-fold cross-validation of a depth-5 decision tree on the full feature table.
#
# The folds use their own variable names on purpose: rebinding X_train /
# X_test / y_train / y_test inside the loop would replace the hold-out split
# for any cell that reads those names afterwards.
#
# NOTE(review): folds are taken in row order (no shuffle) and on the unscaled
# columns of X - confirm that this is intended.
k = 3
kf = KFold(n_splits=k, random_state=None)
# A fresh, seeded tree so the fold accuracies are repeatable between runs.
model = tree.DecisionTreeClassifier(max_depth=5, random_state=0)

acc_score = []

for train_index, test_index in kf.split(X):
    X_fold_train, X_fold_test = X.iloc[train_index, :], X.iloc[test_index, :]
    y_fold_train, y_fold_test = y[train_index], y[test_index]

    model.fit(X_fold_train, y_fold_train)
    pred_values = model.predict(X_fold_test)

    # accuracy_score takes (y_true, y_pred) in that order.
    acc = accuracy_score(y_fold_test, pred_values)
    acc_score.append(acc)

avg_acc_score = sum(acc_score) / len(acc_score)

print('accuracy of each fold - {}'.format(acc_score))
print('Avg accuracy : {}'.format(avg_acc_score))