## import packages

In [15]:
import os
import pickle
import warnings

# silence warnings before the third-party imports so import-time warnings stay hidden
warnings.filterwarnings("ignore")

# general packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# sklearn models
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

# mne
import mne
from mne.datasets import sample
from mne.decoding import (SlidingEstimator, GeneralizingEstimator,
                          cross_val_multiscore, LinearModel, get_coef)

## sklearn models

In [16]:
from sklearn.model_selection import train_test_split
from sklearn import linear_model
from sklearn.metrics import confusion_matrix

from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier

## directories

In [17]:
# Data locations, given relative to the notebook's working directory.
# These must be live assignments: the data-loading cell joins file names onto
# preproc_dir, so leaving them commented out makes a fresh
# "Restart Kernel -> Run All" fail with a NameError.
# Judging by the directory names, these hold behavioural, electrophysiology and
# preprocessed data respectively.
beh_dir = '../../data/decision-making/data/data_behav'
neur_dir = '../../data/decision-making/data/data_ephys'
preproc_dir = '../../data/decision-making/data/data_preproc'

## load preprocessed data

In [11]:
# Read the preprocessed feature array (X) and its label array (y) from preproc_dir.
X, y = (
    np.load(os.path.join(preproc_dir, file_name))
    for file_name in ('X_use_56_950ms.npy', 'y_use_56.npy')
)

## modeling (adapted from Lillian and Adi)

In [12]:
# Hold out 20% of the samples for testing; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=100,
)
for split_name, split_features in (('training', x_train), ('test', x_test)):
    print('Number of samples in ' + split_name + ' data:', len(split_features))

Number of samples in training data: 7017
Number of samples in test data: 1755


## naive model: proportion of gambles (majority-class baseline)

In [13]:
# Fraction of training labels equal to 'Gamble'.
(y_train == 'Gamble').mean()

0.5699016673792219

In [14]:
# Fraction of test labels equal to 'Gamble'.
(y_test == 'Gamble').mean()

0.5698005698005698

## logistic regression

In [10]:
# Fit a logistic regression classifier with scikit-learn's default settings
# on the training split, then report mean accuracy on both splits.
print ('Training a logistic Regression Model...')
LogisticRegressionModel = linear_model.LogisticRegression().fit(x_train, y_train)

# score() returns mean accuracy on the given data and labels
training_accuracy = LogisticRegressionModel.score(x_train, y_train)
print ('Training Accuracy:', training_accuracy)

test_accuracy = LogisticRegressionModel.score(x_test, y_test)
print('Accuracy of the model on unseen test data: ', test_accuracy)

Training a logistic Regression Model...
Training Accuracy: 0.6009690751033205
Accuracy of the model on unseen test data:  0.5863247863247864


## confusion matrix

In [15]:
#CONFUSION MATRIX
# Ground-truth test labels alongside the logistic regression's predictions for them.
y_true, y_pred = y_test, LogisticRegressionModel.predict(x_test)

In [14]:
# Build a labelled confusion matrix for the test predictions.
# Row/column names are derived from the fitted model's classes_ and the same
# label order is passed to confusion_matrix explicitly, so the names can never
# drift out of sync with the matrix (the previous hard-coded names silently
# relied on the labels sorting as Gamble, Safe).
class_labels = list(LogisticRegressionModel.classes_)
ConfusionMatrix = pd.DataFrame(
    confusion_matrix(y_true, y_pred, labels=class_labels),
    columns=['Predicted {}'.format(label) for label in class_labels],
    index=['Actual {}'.format(label) for label in class_labels],
)
print ('Confusion matrix of test data is: \n',ConfusionMatrix)

Confusion matrix of test data is: 
                Predicted Gamble  Predicted Safe
Actual Gamble               805             195
Actual Safe                 531             224


In [18]:
# Class counts in the test split (disabled).
# NOTE(review): y is read with np.load, so y_test is presumably a NumPy array,
# which has no .value_counts(); wrapping it as pd.Series(y_test) would be
# needed to run this - TODO confirm and either enable or delete this cell.
# y_test.value_counts()

## precision and recall

In [19]:
from sklearn.metrics import precision_score, recall_score

# average=None returns one score per class (in sorted label order) rather than
# a single averaged value, so the printed labels say "Per-class", not "Average".
per_class_precision = precision_score(y_true, y_pred, average=None)
print("Per-class precision for the 2 classes is - ", per_class_precision)

per_class_recall = recall_score(y_true, y_pred, average=None)
print("Per-class recall for the 2 classes is - ", per_class_recall)

Average precision for the 2 classes is -  [0.60254491 0.53460621]
Average recall for the 2 classes is -  [0.805      0.29668874]


## perceptron

In [20]:
# Perceptron classifier with max_iter=100, scored on both splits.
perceptron = Perceptron(max_iter=100).fit(x_train, y_train)
perceptron_train_acc, perceptron_test_acc = (
    perceptron.score(x_train, y_train),
    perceptron.score(x_test, y_test),
)
print ('perceptron training acuracy= ', perceptron_train_acc)
print('perceptron test accuracy= ', perceptron_test_acc)

perceptron training acuracy=  0.5091919623770842
perceptron test accuracy=  0.5088319088319089


Terrible. Test accuracy (~0.51) is below the majority-class baseline of ~0.57 (always predicting 'Gamble').

In [22]:
# Adaboost
# Default AdaBoost ensemble, scored on both splits. random_state is fixed
# (same seed as the train/test split) so that re-running the notebook gives the
# same accuracies; without it the scores can change between runs.
adaboost = AdaBoostClassifier(random_state=100)
adaboost.fit(x_train, y_train)
adaboost_train_acc = adaboost.score(x_train, y_train)
adaboost_test_acc = adaboost.score(x_test, y_test)
print ('adaboost training acuracy= ',adaboost_train_acc)
print('adaboost test accuracy= ',adaboost_test_acc)

adaboost training acuracy=  0.6340316374519025
adaboost test accuracy=  0.5709401709401709


OK, better than perceptron, but worse than logistic regression.

In [23]:
# Random Forest
# Default random forest, scored on both splits. The forest is built from random
# bootstrap samples and feature subsets, so random_state is fixed (same seed as
# the train/test split) to make the reported accuracies reproducible.
random_forest = RandomForestClassifier(random_state=100)
random_forest.fit(x_train, y_train)
random_forest_train_acc = random_forest.score(x_train, y_train)
random_forest_test_acc = random_forest.score(x_test, y_test)
print('random_forest training acuracy= ',random_forest_train_acc)
print('random_forest test accuracy= ',random_forest_test_acc)

random_forest training acuracy=  0.9851788513609805
random_forest test accuracy=  0.5943019943019943


Hooray! Best performance yet on test data, but overfitting *a lot* on train data.

In [25]:
# Total number of labelled samples (train + test).
y.shape[0]

8772

In [None]:
# Reference list only (nothing here executes): these repeat the classifier
# imports already made in the "sklearn models" cell near the top. In the cells
# shown in this notebook, only RandomForestClassifier, AdaBoostClassifier and
# Perceptron from this list are actually used.
# from sklearn.svm import SVC, LinearSVC
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.ensemble import AdaBoostClassifier
# from sklearn.neighbors import KNeighborsClassifier
# from sklearn.naive_bayes import GaussianNB
# from sklearn.linear_model import Perceptron
# from sklearn.linear_model import SGDClassifier
# from sklearn.tree import DecisionTreeClassifier

## TODO: are there more models imported above that I haven't tried (SVC, LinearSVC, KNeighborsClassifier, GaussianNB, SGDClassifier, DecisionTreeClassifier) that would lend themselves to this?

## Note from Lillian and Adi: we need to run k-fold cross-validation. The dataset is too small for a single train/test split to be reliable.