In [16]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.patches as patches
import seaborn as sns

from imblearn.over_sampling import SMOTE, ADASYN

from sklearn import metrics
#from sklearn.metrics import confusion_matrix
#sfrom sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import *
from sklearn.preprocessing import MinMaxScaler

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import LeaveOneOut
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from sklearn.utils import class_weight

from sklearn.utils.multiclass import unique_labels

from sklearn.externals.six import StringIO  
from IPython.display import Image  
from sklearn.tree import export_graphviz
import pydotplus

# Keras
from keras import models
from keras import layers
from keras.models import Sequential
from keras.layers import Dense
from keras import metrics as km
from keras import losses
from keras.wrappers.scikit_learn import KerasClassifier


pd.options.mode.chained_assignment = None  # default='warn'
%matplotlib inline

In [17]:
def printTree(feature_cols,tree):
    dot_data = StringIO()
    export_graphviz(tree, out_file=dot_data,  
                    filled=True, rounded=True,
                    special_characters=True, feature_names = feature_cols,class_names=['0','1'])
    graph = pydotplus.graph_from_dot_data(dot_data.getvalue())  
    graph.write_png('magic_tree.png')
    Image(graph.create_png())

In [18]:
def plot_confusion_matrix(y_true, y_pred, classes,
                          normalize=False,
                          title=None,
                          cmap=plt.cm.Blues):
    """
    This function prints and plots the confusion matrix.
    Normalization can be applied by setting `normalize=True`.
    """
    if not title:
        if normalize:
            title = 'Normalized confusion matrix'
        else:
            title = 'Confusion matrix, without normalization'

    # Compute confusion matrix
    cm = confusion_matrix(y_true, y_pred)
    # Only use the labels that appear in the data
    #classes = classes[unique_labels(y_true, y_pred)]
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)

    fig, ax = plt.subplots()
    im = ax.imshow(cm, interpolation='nearest', cmap=cmap)
    ax.figure.colorbar(im, ax=ax)
    # We want to show all ticks...
    ax.set(xticks=np.arange(cm.shape[1]),
           yticks=np.arange(cm.shape[0]),
           # ... and label them with the respective list entries
           xticklabels=classes, yticklabels=classes,
           title=title,
           ylabel='True label',
           xlabel='Predicted label')

    # Rotate the tick labels and set their alignment.
    plt.setp(ax.get_xticklabels(), rotation=45, ha="right",
             rotation_mode="anchor")

    # Loop over data dimensions and create text annotations.
    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i in range(cm.shape[0]):
        for j in range(cm.shape[1]):
            ax.text(j, i, format(cm[i, j], fmt),
                    ha="center", va="center",
                    color="white" if cm[i, j] > thresh else "black")
    fig.tight_layout()
    return ax


In [70]:
def calcMetrics(Yt, Yp, classes, model, norm=True, cols=None, print_metrics=False):
    
    if(len(classes) == 2):
        average = "binary"
    else:
        average = None
    
    # Metrics
    acc = metrics.accuracy_score(Yt, Yp)
    acc_bal = metrics.balanced_accuracy_score(Yt, Yp)
    prec = metrics.precision_score(Yt, Yp, average=average)
    recall = metrics.recall_score(Yt, Yp, average=average)
    f1 = metrics.f1_score(Yt, Yp, average=average)
    #roc_auc = roc_auc_score(Yt, Yp, average=average)
    roc_auc = 0.0
    
    feature_imp = pd.Series(model.feature_importances_,index=cols).sort_values(ascending=True)
    
    if(print_metrics):
        print("")
        plot_confusion_matrix(Yt, Yp, classes, normalize=norm)
        print("")
        print("Accuracy:", acc)
        print("Balanced Accuracy: ", acc_bal)
        print("Precision: ", prec)
        print("Recall: ", recall)
        print("F1 Score: ", f1)
        print("ROC AuC Score: ", roc_auc)
       
        fig, ax = plt.subplots(figsize=(10, 10))
        ax.barh(feature_imp.index,feature_imp.values, align='center')
        ax.set_xlabel('Performance')
        ax.set_title('How fast do you want to go today?')
        plt.grid(True)
        
        plt.show()
    
    return acc, acc_bal, prec, recall, f1, roc_auc, feature_imp
    
    

In [50]:
def decisionTreeFactory(depth=4, criterion='gini', weights=None, classes=[0,1]):
    
    def trainDecisionTree(Xtr, Xte, Ytr, Yte, feat_cols, print_metrics=False):
        model = DecisionTreeClassifier(max_depth=depth, criterion=criterion, class_weight=weights)
        model = model.fit(Xtr,Ytr)
        y_pred = model.predict(Xte)    
        return calcMetrics(Yte, y_pred, classes, model, norm=True, cols=feat_cols, 
                           print_metrics=print_metrics), model
    
    return trainDecisionTree

In [51]:
def randomForestFactory(n_estimators=1000, depth=None, weights=None, classes=[0,1]):
    
    def trainRandomForest(Xtr, Xte, Ytr, Yte, feat_cols, print_metrics=False):
        model=RandomForestClassifier(n_estimators=n_estimators, class_weight=weights, max_depth=depth)
        model.fit(Xtr,Ytr)    
        y_pred=model.predict(Xte)

        return calcMetrics(Yte, y_pred, classes, model, norm=True, cols=feat_cols,
                           print_metrics=print_metrics), model
    
    return trainRandomForest

In [52]:
def ADAfactory(estimator=None, n_estim=1000, classes=[0,1]):

    def trainADA(Xtr, Xte, Ytr, Yte, feat_cols, print_metrics=False):
        model = AdaBoostClassifier(base_estimator=estimator, n_estimators=n_estim)
        model.fit(Xtr , Ytr)
        y_pred = model.predict(Xte)

        return calcMetrics(Yte, y_pred, classes, model, norm=True,
                           cols=feat_cols, print_metrics=print_metrics), model
    
    return trainADA


In [53]:
def gradientBoostingFactory(n_estim=1000, classes=[0,1]):
    
    def trainGradientBoosting(Xtr, Xte, Ytr, Yte, feat_cols, print_metrics=False):
        model = GradientBoostingClassifier(n_estimators=n_estim)
        model.fit(Xtr , Ytr)
        y_pred = model.predict(Xte)

        return calcMetrics(Yte, y_pred, classes, model, norm=True,
                           cols=feat_cols, print_metrics=print_metrics), model
    
    return trainGradientBoosting

In [45]:
def trainDNN(Xtr, Xte, Xv, Ytr, Yte, Yv, feat_cols, print_metrics=False):
    model = models.Sequential()
    model.add(Dense(13,input_dim = 31, activation='relu'))
    model.add(Dense(13, activation='relu'))
    model.add(Dense(10, activation='relu'))
    model.add(Dense(1, activation='softmax'))
    model.compile(loss="binary_crossentropy", optimizer = "rmsprop", metrics = ['accuracy'])

    history = model.fit(Xtr, Ytr,
                       epochs=7,
                       batch_size=20,
                       validation_data=(Xv, Yv))
    
    y_pred = model.predict(Xte)
    print(y_pred)

    return classification_report(Yte, y_pred)

    #h_dict = history.history
    #loss_values = h_dict['loss']
    #valid_loss_values = h_dict['val_loss']
    #acc_values = h_dict['acc']
    #valid_acc_values = h_dict['val_acc']
    #epochs = range(1, len(loss_values) +1)
    #fig, axes = plt.subplots(2, figsize=(10,10))
    #axes[0].plot(epochs, loss_values, 'bo', label='Training loss')
    #axes[0].plot(epochs, valid_loss_values, 'b', label='Validation loss')
    #axes[1].plot(epochs, acc_values, 'bo', label='Training Accuracy')
    #axes[1].plot(epochs, valid_acc_values, 'b', label='Validation Accuracy')
    #plt.legend()
    #plt.show()

In [25]:
class ModelMetricAggregator:
    
    def __init__(self, model, mode=''):
        self.model = model
        self.mode = mode
        self.accuracy = []
        self.balanced_accuracy = []
        self.precision = []
        self.recall = []
        self.f1_score = []
        self.roc_auc = []
        
    def addMetrics(self,acc, bal_acc, prec, rec, f1, roc_auc):
        self.accuracy.append(acc)
        self.balanced_accuracy.append(bal_acc)
        self.precision.append(prec)
        self.recall.append(rec)
        self.f1_score.append(f1)
        self.roc_auc.append(roc_auc)
        
    def getMetrics(self):
        return [
            self.model,
            self.mode,
            np.mean(self.accuracy),
            np.mean(self.balanced_accuracy),
            np.mean(self.precision),
            np.mean(self.recall),
            np.mean(self.f1_score),
            np.mean(self.roc_auc)
        ]

In [26]:
def crossValSMOTE(X, y, cols, model_names, train_eval_methods, split_mode='stratified'):

    metrics_cols = ['baseline','model', 'accuracy', 'bal_accuracy', 'precision', 'recall', 'f1_score', 'roc_auc']
    mDF = pd.DataFrame(columns=metrics_cols)
    
    if(len(model_names) != len(train_eval_methods)):
        return
    
    splitter = StratifiedKFold(n_splits=4)
    split_gen = splitter.split(X, y)
    
    if(split_mode == 'kfold'):
        splitter = KFold(n_splits=4, random_state=11)
        split_gen = splitter.split(X)
    
    aggs = []
    
    for i in range(0, len(model_names)):
        aggregator = ModelMetricAggregator("sub", model_names[i])
        aggs.append(aggregator)
        
    
    for train_index, test_index in split_gen:
        # Generate Train and Test set
        X_train = X.iloc[train_index]        
        y_train = y.iloc[train_index]
        X_test = X.iloc[test_index]
        y_test = y.iloc[test_index]
    
        # SMOTE inside the fold
        X_train, y_train = SMOTE().fit_resample(X_train, y_train)
        
        for i in range(0, len(model_names)):
            (acc, bal_acc, prec, rec, f1, auc, best), model = \
                train_eval_methods[i](X_train, X_test, y_train, y_test, cols)
            
            aggs[i].addMetrics(acc, bal_acc, prec, rec, f1, auc)            
     
    
    for agg in aggs:
        row = agg.getMetrics()
        series = pd.Series(row, index=metrics_cols)
        mDF = mDF.append(series, ignore_index=True)
    
    return mDF

In [27]:
# 0 	tree 	sub 	0.637019 	0.524242 	0.199107 	0.354167 	0.245040 	0.35

# 3 	ada 	sub 	0.763636 	0.616389 	0.300000 	0.40 	0.320000 	0.6250

In [28]:
feature_files = [ #sub is better, from the literature
    "features/features_sub.csv",
    #"features/features_div.csv"
]

feature_cols = [
        #'duration',
        #'show_order',
        'card_class',
        'fix_freq','sacc_freq',
        'pupil_diam_right_mean',
        'pupil_diam_right_std',
        'pupil_diam_right_min',
        'pupil_diam_right_max',
        'pupil_diam_left_mean',
        'pupil_diam_left_std',
        'pupil_diam_left_min',
        'pupil_diam_left_max',
    
        'sre_fix_freq','sre_sacc_freq',
        'sre_pupil_diam_right_mean',
        'sre_pupil_diam_right_std',
        'sre_pupil_diam_right_min',
        'sre_pupil_diam_right_max',
        'sre_pupil_diam_left_mean',
        'sre_pupil_diam_left_std',
        'sre_pupil_diam_left_min',
        'sre_pupil_diam_left_max',
    
        'srl_fix_freq','srl_sacc_freq',
        'srl_pupil_diam_right_mean',
        'srl_pupil_diam_right_std',
        'srl_pupil_diam_right_min',
        'srl_pupil_diam_right_max',
        'srl_pupil_diam_left_mean',
        'srl_pupil_diam_left_std',
        'srl_pupil_diam_left_min',
        'srl_pupil_diam_left_max',
]

In [29]:
def computeWeights(Ytr):
    class_weights = class_weight.compute_class_weight('balanced',
                                                 np.unique(Ytr),
                                                 Ytr)
    return class_weights

In [30]:
file = "features/features_none.csv"
file_test = "features/PILOT/features_none.csv"
f = file.split('/')[-1]
mode = file.split('/')[-1].split('.')[0].split('_')
    
# Load the file
data = pd.read_csv(file, sep='\t')
test = pd.read_csv(file_test, sep='\t')
    
if(mode[1] == "sub"):
    data = data.fillna(0)
    test = test.fillna(0)
else:
    data = data.fillna(1)
    test = test.fillna(1)
    
# Extract Feature Columns
X = data[feature_cols]
y = data['label']

w_list = computeWeights(y)
w_train = {
    0:w_list[0],
    1:w_list[1]
}

X_test = test[feature_cols]
y_test = test['label']

print("============ VALIDATION =============")

valDF = crossValSMOTE(X, y,
                      feature_cols,
                      ['tree', 'forest', 'ada', 'gb'],
                      [decisionTreeFactory(),
                       randomForestFactory(),
                       ADAfactory(),
                       gradientBoostingFactory()
                      ],
                      split_mode='stratified')
valDF.head()

#print("============ TEST =============")
# SMOTE X y
#X, y = SMOTE().fit_resample(X, y)
#e_metr, model = trainRandomForest(X, X_test, y, y_test, feature_cols, print_metrics=True)





Unnamed: 0,baseline,model,accuracy,bal_accuracy,precision,recall,f1_score,roc_auc
0,sub,tree,0.558148,0.531358,0.182765,0.4875,0.262522,0.531358
1,sub,forest,0.703704,0.543128,0.291667,0.3,0.255357,0.543128
2,sub,ada,0.723704,0.58033,0.305556,0.3625,0.294185,0.58033
3,sub,gb,0.704444,0.52408,0.170455,0.25,0.188095,0.52408


In [82]:
from sklearn.preprocessing import LabelEncoder

card_features_col = [
        #'duration',
        'fix_freq','sacc_freq',
        #'pupil_diam_right_mean',
        'pupil_diam_right_std',
        'pupil_diam_right_min',
        #'pupil_diam_right_max',
        #'pupil_diam_left_mean',
        'pupil_diam_left_std',
        'pupil_diam_left_min',
        #'pupil_diam_left_max',
    
        #'sre_fix_freq','sre_sacc_freq',
        #'sre_pupil_diam_right_mean',
        #'sre_pupil_diam_right_std',
        #'sre_pupil_diam_right_min',
        #'sre_pupil_diam_right_max',
        #'sre_pupil_diam_left_mean',
        #'sre_pupil_diam_left_std',
        #'sre_pupil_diam_left_min',
        #'sre_pupil_diam_left_max',
    
        #'srl_fix_freq','srl_sacc_freq',
        #'srl_pupil_diam_right_mean',
        #'srl_pupil_diam_right_std',
        #'srl_pupil_diam_right_min',
        #'srl_pupil_diam_right_max',
        #'srl_pupil_diam_left_mean',
        #'srl_pupil_diam_left_std',
        #'srl_pupil_diam_left_min',
        #'srl_pupil_diam_left_max',
]

file = "features/features_sub.csv"
file_test = "features/PILOT/features_sub.csv"

# Load the file
data = pd.read_csv(file, sep='\t')
test = pd.read_csv(file_test, sep='\t')

encoder = LabelEncoder()
encoder.fit(data['card_class'])
data['card_class_enc'] = encoder.transform(data['card_class'])
test['card_class_enc'] = encoder.transform(test['card_class'])
    
if(mode[1] == "sub"):
    data = data.fillna(0)
    test = test.fillna(0)
else:
    data = data.fillna(1)
    test = test.fillna(1)
    
# Extract Feature Columns
X = data[card_features_col]
y = data['card_class']

X_test = test[card_features_col]
y_test = test['card_class']

#classes = ['unicorn', 'hedge', 'pepper', 'aliens', 'minion', 'pig']
classes = [1,2,3,4,5,6]

print("============ VALIDATION =============")

#X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.33, random_state=42)
#randomForestFactory(classes=classes)(X_train, X_test, y_train, y_test, card_features_col, print_metrics=True)

model = models.Sequential()
model.add(Dense(8,input_dim = len(card_features_col), activation='relu'))
model.add(Dense(6, activation='softmax'))
model.compile(loss="categorical_crossentropy", optimizer = "rmsprop", metrics = ['accuracy'])

history = model.fit(X_train, y_train,
                       epochs=20,
                       batch_size=10,
                       validation_data=(X_val, y_val))
    
#res = model.evaluate(X_test, y_test)

h_dict = history.history
loss_values = h_dict['loss']
valid_loss_values = h_dict['val_loss']
acc_values = h_dict['acc']

valid_acc_values = h_dict['val_acc']
epochs = range(1, len(loss_values) +1)
fig, axes = plt.subplots(2, figsize=(10,10))

axes[0].plot(epochs, loss_values, 'bo', label='Training loss')
axes[0].plot(epochs, valid_loss_values, 'b', label='Validation loss')
axes[1].plot(epochs, acc_values, 'bo', label='Training Accuracy')
axes[1].plot(epochs, valid_acc_values, 'b', label='Validation Accuracy')
plt.legend()
plt.show()




ValueError: Error when checking target: expected dense_12 to have shape (6,) but got array with shape (1,)