In [None]:
import time
import sys


import numpy as np
import pandas as pd
#import matplotlib.pyplot as plt
import joblib

#from sklearn.cross_validation import StratifiedKFold
from sklearn.metrics import roc_auc_score

from keras.models import Sequential
from keras.layers.core import Dense, Activation
from keras.utils import np_utils
from keras.callbacks import ModelCheckpoint
from keras.optimizers import SGD
from keras.regularizers import l2, l1


In [None]:
# Load the training table; the 'TARGET' column holds the labels and every
# remaining column is used as a model input.
trainDataFrame = pd.read_csv('./data/trainLevel1Preds.csv')
trainFeatures = trainDataFrame.drop('TARGET', axis=1)
trainLabels = trainDataFrame.TARGET

In [None]:
# Report the dimensions of the full table, the label vector and the feature
# matrix. The parenthesised single-argument form prints the same text on
# Python 2 and Python 3.
print(trainDataFrame.shape)
print(trainLabels.shape)
print(trainFeatures.shape)

In [None]:
# Sanity check (displayed as the cell output): features and labels must share
# exactly the same row index, in the same order.
np.array_equal(trainFeatures.index, trainLabels.index)

In [None]:
def vectorized_result(j, num_classes=2):
    """Return a one-hot column vector for class label ``j``.

    The result is a ``(num_classes, 1)`` float array of zeros with 1.0 at
    row ``j``. With the default ``num_classes=2`` this converts a binary
    target (0 or 1) into the two-component desired output of the network.

    Parameters
    ----------
    j : int
        Class label, used directly as the row index into the vector.
    num_classes : int, optional
        Length of the one-hot vector (default 2).

    Returns
    -------
    numpy.ndarray
        Array of shape ``(num_classes, 1)``.

    Raises
    ------
    IndexError
        If ``j`` is outside the valid index range of the vector.
    """
    e = np.zeros((num_classes, 1))
    e[j] = 1.0
    return e

In [None]:
# Fix the NumPy seed so the random splits drawn later are repeatable, then
# separate the rows by class (TARGET == 0 vs. TARGET == 1).
np.random.seed(1)
dataTarget0 = trainDataFrame[trainDataFrame['TARGET'] == 0]
dataTarget1 = trainDataFrame[trainDataFrame['TARGET'] == 1]

def getBalancedTrainAndValidationSets(scale=False, n_train_neg=3000,
                                      n_train_pos=1500, n_val_neg=3000):
    """Draw a random, class-rebalanced train/validation split.

    Reads the module-level frames ``dataTarget0`` and ``dataTarget1``,
    shuffles each one independently with the global NumPy RNG and slices
    them by position:

    * train      = first ``n_train_neg`` rows of class 0
                   + first ``n_train_pos`` rows of class 1
    * validation = next ``n_val_neg`` rows of class 0
                   + all remaining rows of class 1

    With the defaults the training set is 3000 negatives to 1500 positives
    (a 2:1 ratio, not strictly balanced).

    Parameters
    ----------
    scale : bool, optional
        If True, standardise the features with a ``StandardScaler`` fitted
        on the training features only and applied to both sets.
    n_train_neg, n_train_pos : int, optional
        Number of class-0 / class-1 rows placed in the training set.
    n_val_neg : int, optional
        Number of class-0 rows placed in the validation set.

    Returns
    -------
    (X_train, y_train, X_val, y_val)
        Feature DataFrames and 'TARGET' Series. When ``scale`` is True the
        scaled frames keep the original row index and column names so that
        they stay aligned with ``y_train`` / ``y_val``.
    """
    # Shuffle each class separately; the slicing below is purely positional.
    # The order of the two RNG calls is kept so seeded runs draw the same rows.
    dt0 = dataTarget0.reindex(np.random.permutation(dataTarget0.index))
    dt1 = dataTarget1.reindex(np.random.permutation(dataTarget1.index))

    trn = pd.concat([dt0.iloc[:n_train_neg], dt1.iloc[:n_train_pos]])
    val = pd.concat([dt0.iloc[n_train_neg:n_train_neg + n_val_neg],
                     dt1.iloc[n_train_pos:]])

    y_train = trn['TARGET']
    X_train = trn.drop(['TARGET'], axis=1)
    y_val = val['TARGET']
    X_val = val.drop(['TARGET'], axis=1)

    if scale:
        # Imported locally: the notebook's import cell does not bring
        # StandardScaler into scope.
        from sklearn.preprocessing import StandardScaler
        # Fit on the training features only so no validation statistics leak
        # into the transform.
        scaler = StandardScaler().fit(X_train)
        X_train = pd.DataFrame(scaler.transform(X_train),
                               index=X_train.index, columns=X_train.columns)
        X_val = pd.DataFrame(scaler.transform(X_val),
                             index=X_val.index, columns=X_val.columns)
    return X_train, y_train, X_val, y_val

In [None]:
def get_model(input_dim=188):
    """Build and compile a small feed-forward network with a 2-way softmax.

    Architecture: ``Dense(2) -> sigmoid -> Dense(2) -> softmax``, compiled
    with binary cross-entropy loss, the stock ``'sgd'`` optimizer and an
    accuracy metric.

    Parameters
    ----------
    input_dim : int, optional
        Number of input features fed to the first layer (default 188). It
        must equal the number of columns in the feature matrix passed to
        ``fit``.

    Returns
    -------
    (model, checkpointer)
        The compiled ``Sequential`` model and a checkpoint callback.
        Checkpointing is currently disabled, so ``checkpointer`` is None.
    """
    # Checkpointing is switched off. To enable it, create a callback such as
    #   ModelCheckpoint(filepath="./models/keras/last_keras_model_checkpoint.hdf5",
    #                   monitor='val_loss', verbose=1, save_best_only=False)
    # (monitor='val_loss' tracks validation loss; use monitor='val_acc' for
    # validation accuracy) and pass it to model.fit(..., callbacks=[checkpointer]).
    checkpointer = None

    model = Sequential()

    # Hidden layer. Weight regularisation can be added via
    # W_regularizer=l1(0.01) or W_regularizer=l2(0.1).
    model.add(Dense(output_dim=2, input_dim=input_dim))
    model.add(Activation('sigmoid'))

    # Optional extra hidden layers can be inserted here, e.g.:
    #   model.add(Dense(output_dim=100, W_regularizer=l2(0.01)))
    #   model.add(Activation('sigmoid'))

    # Output layer: one unit per class, normalised with softmax.
    model.add(Dense(output_dim=2))
    model.add(Activation("softmax"))

    # Alternative optimizer: SGD(lr=0.005, momentum=0.1, nesterov=True)
    model.compile(loss='binary_crossentropy', optimizer='sgd', metrics=['accuracy'])
    return model, checkpointer

## CROSS VALIDATION

In [None]:
# CROSS VALIDATE
# Each round draws a fresh random train/validation split, trains a brand-new
# network on it, and records the train and validation ROC AUC.
n_folds = 100
start = time.time()
trn_scores = []
val_scores = []
for i in range(n_folds):
    # A new, untrained model per round so no weights carry over.
    model, checkpointer = get_model()
    X_train, y_train, X_valid, y_valid = getBalancedTrainAndValidationSets()

    # One-hot encode the labels: (n, 2, 1) from the helper -> (n, 2).
    y_train_vect = np.array([vectorized_result(x) for x in y_train])
    y_train_vect = np.reshape(y_train_vect, (y_train_vect.shape[0], 2))
    y_valid_vect = np.array([vectorized_result(x) for x in y_valid])
    y_valid_vect = np.reshape(y_valid_vect, (y_valid_vect.shape[0], 2))

    # Convert to plain arrays once per round. `.values` works on old and new
    # pandas, whereas `as_matrix()` has been removed from current releases.
    X_train_mat = X_train.values
    X_valid_mat = X_valid.values

    model.fit(X_train_mat, y_train_vect, verbose=0, batch_size=10, nb_epoch=400,
              validation_data=(X_valid_mat, y_valid_vect))  # , callbacks=[checkpointer]

    # Column 1 of the softmax output is the predicted probability of TARGET == 1.
    trn_score = roc_auc_score(y_train, model.predict_proba(X_train_mat)[:, 1])
    val_score = roc_auc_score(y_valid, model.predict_proba(X_valid_mat)[:, 1])

    # Progress counter is 1-based so the last round reads n_folds/n_folds.
    print("(%d/%d) Train (AUC): %.4f Validation (AUC): %.4f" % (i + 1, n_folds, trn_score, val_score))
    trn_scores.append(trn_score)
    val_scores.append(val_score)

stop = time.time()
print("-----------------------")
print("Results for %d fold cross validation:" % (n_folds))
print("Train Mean: %.4f" % np.mean(trn_scores))
print("Train std: %.4f" % np.std(trn_scores))
print("Validation Mean: %.4f" % np.mean(val_scores))
print("Validation std: %.4f" % np.std(val_scores))
print("Total time (mins): %.1f (%.1f/fold)" % ((stop-start)/60., (stop-start)/(60.*n_folds)))