In [39]:
import numpy as np
import pandas as pd
from sklearn import cross_validation, metrics, preprocessing
from sklearn.ensemble import ExtraTreesClassifier, AdaBoostClassifier
import matplotlib.pyplot as plt
import seaborn as sns

import xgboost as xgb

from nolearn.lasagne import NeuralNet
import lasagne
import theano
import theano.tensor as T

%matplotlib inline

# Seed NumPy's global random number generator with a fixed value.
# NOTE(review): presumably meant to make shuffled CV splits / model training
# repeatable; libraries that keep their own RNG state are not affected by
# this call -- confirm whether they need seeding too.
np.random.seed(8888)

## Load different feature sets

In [None]:
# Feature set "basic": train/test feature tables plus the training labels.
# Running this cell rebinds `train_dat`, `test_dat` and `train_labels`.
train_dat = pd.read_csv(
    filepath_or_buffer='features/basic_train.csv',
)
test_dat = pd.read_csv(
    filepath_or_buffer='features/basic_test.csv',
)

# NOTE(review): this label file is read from the working directory, while the
# other loader cells read 'features/train_labels.npy' -- confirm which is meant.
train_labels = np.load(file='train_labels_parsed.npy')

In [None]:
# Feature set "zeros_ones": train/test feature tables plus the training labels.
# Running this cell rebinds `train_dat`, `test_dat` and `train_labels`.
train_dat = pd.read_csv(
    filepath_or_buffer='features/zeros_ones_train.csv',
)
test_dat = pd.read_csv(
    filepath_or_buffer='features/zeros_ones_test.csv',
)

train_labels = np.load(file='features/train_labels.npy')

In [3]:
# Feature set "cat_sums": train/test feature tables plus the training labels.
# Running this cell rebinds `train_dat`, `test_dat` and `train_labels`.
train_dat = pd.read_csv(
    filepath_or_buffer='features/cat_sums_train.csv',
)
test_dat = pd.read_csv(
    filepath_or_buffer='features/cat_sums_test.csv',
)

train_labels = np.load(file='features/train_labels.npy')

In [None]:
# Feature set "trimmed": only the test table is loaded here; the train table
# and labels are left commented out.
# NOTE(review): running this cell replaces `test_dat` but leaves `train_dat`
# from whichever loader cell ran before -- confirm the two share columns.
#train_dat = pd.read_csv('features/trimmed_train.csv')
test_dat = pd.read_csv(
    filepath_or_buffer='features/trimmed_test.csv',
)

#train_labels = np.load('features/train_labels.npy')

## Stratified K-fold cross-validation

In [5]:
# Build a 3-fold stratified splitter over the training labels, with shuffling.
skf = cross_validation.StratifiedKFold(
    y=train_labels,
    n_folds=3,
    shuffle=True,
)

In [120]:
# Model and data preparation.
#
# NOTE: `clf` is rebound several times in this cell. Only the LAST assignment
# (the NeuralNet at the bottom) is the estimator left in `clf` afterwards; the
# XGBoost and ExtraTrees instances are constructed and then discarded. They are
# kept as ready-to-use alternatives (move the one you want to the end of the
# cell, or comment the others out).
clf = xgb.XGBClassifier(n_estimators=100,
                        learning_rate=0.023,
                        objective='binary:logistic',
                        max_depth=6,
                        colsample_bytree=0.77,
                        subsample=0.83)

clf = ExtraTreesClassifier(n_estimators=100, max_depth=6)

# Network input width follows the number of feature columns; the output layer
# has one unit per class.
n_features = len(train_dat.columns)
n_classes = 2

# Training features: cast to float32, then standardise. The scaler is fit on
# the training features only.
X = train_dat.as_matrix().astype(np.float32)
scalar = preprocessing.StandardScaler()
X = scalar.fit_transform(X)

Y = train_labels.astype(np.int32)

# Fix: the test features must go through the SAME scaling that was fit on the
# training features; previously X_test was left unscaled, so a model trained
# on standardised X would see inputs on a different scale at prediction time.
# `transform` (not `fit_transform`) reuses the training statistics. This
# requires test_dat to have the same columns as train_dat. The trailing cast
# keeps X_test float32, as it was before this fix.
X_test = scalar.transform(test_dat.as_matrix().astype(np.float32)).astype(np.float32)

# Network topology: input -> dense0 -> dropout0 -> softmax output.
# The commented entries are optional extra layers; enabling one also requires
# enabling its matching `*_num_units` / `*_p` keyword below.
layers = [
            ('input', lasagne.layers.InputLayer),
            ('dense0', lasagne.layers.DenseLayer),
            ('dropout0', lasagne.layers.DropoutLayer),
            #('dense1', lasagne.layers.DenseLayer),
            #('dropout1', lasagne.layers.DropoutLayer),
            #('dense2', lasagne.layers.DenseLayer),
            #('dense3', lasagne.layers.DenseLayer),
            ('output', lasagne.layers.DenseLayer)
        ]

# `custom_score` reports an extra 'auc' column computed from column 1 of the
# predicted probabilities. The lambda looks up `auc_score2` only when it is
# called, so the cell defining `auc_score2` must have been run before `clf`
# is fitted.
clf = NeuralNet(layers=layers,
                 input_shape=(None, n_features),
                 dense0_num_units=512,
                 dropout0_p=0.3,
                 #dense1_num_units=128,
                 #dropout1_p=0.1,
                 #dense2_num_units=64,
                 #dense3_num_units=16,
                 output_num_units=n_classes,
                 output_nonlinearity=lasagne.nonlinearities.softmax,
                 update=lasagne.updates.adagrad,
                 update_learning_rate=0.01,
                 #train_split=0.0,
                 # objective_loss_function = binary_accuracy,
                 custom_score=('auc', lambda y_true, y_proba: auc_score2(y_true, y_proba[:,1])),
                 verbose=1,
                 max_epochs=100)

#clf = AdaBoostClassifier(n_estimators=100)

In [121]:
def auc_score(clf, X, y):
    """Scorer callable: ROC AUC of `clf` on (X, y).

    Uses column 1 of ``clf.predict_proba(X)`` as the score passed to
    ``metrics.roc_auc_score`` together with the true labels `y`.
    """
    class_probabilities = clf.predict_proba(X)
    positive_scores = class_probabilities[:, 1]
    return metrics.roc_auc_score(y, positive_scores)

def auc_score2(Y,y):
    """ROC AUC of scores `y` against true labels `Y`, with a safe fallback.

    Returns ``metrics.roc_auc_score(Y, y)``. If that call raises
    ``ValueError``, returns 0.0 instead of propagating the error.
    """
    try:
        return metrics.roc_auc_score(Y, y)
    except ValueError:
        # Deliberate best-effort: report 0.0 rather than abort the caller.
        return 0.0

In [122]:
# Cross-validate `clf` on (X, Y) with 2 folds, scoring each fold with the
# `auc_score` callable; `scores` holds one value per fold.
# NOTE(review): the `skf` splitter built earlier is not passed here (cv=2 is
# used instead) -- confirm that is intended.
scores = cross_validation.cross_val_score(
    estimator=clf,
    X=X,
    y=Y,
    scoring=auc_score,
    cv=2,
)
# Alternative: fit on the full training set instead of cross-validating.
#clf.fit(X,Y)

# Neural Network with 158722 learnable parameters

## Layer information

  #  name        size
---  --------  ------
  0  input        307
  1  dense0       512
  2  dropout0     512
  3  output         2

# Neural Network with 158722 learnable parameters

## Layer information

  #  name        size
---  --------  ------
  0  input        307
  1  dense0       512
  2  dropout0     512
  3  output         2

  epoch    train loss    valid loss    train/val    valid acc      auc  dur
-------  ------------  ------------  -----------  -----------  -------  ------
      1       0.24000       0.19827      1.21045      0.91543  0.94775  12.69s
  epoch    train loss    valid loss    train/val    valid acc      auc  dur
-------  ------------  ------------  -----------  -----------  -------  ------
      1       0.24000       0.19827      1.21045      0.91543  0.94775  12.69s
      2       0.19520       0.19043      1.02505      0.91945  0.9

In [123]:
# `scores` comes from cross_val_score(..., scoring=auc_score), so the values
# are per-fold ROC AUC, not accuracy; the label now says so. The spread shown
# is two standard deviations across folds.
print("ROC AUC: %0.6f (+/- %0.6f)" % (scores.mean(), scores.std() * 2))

Accuracy: 0.955573 (+/- 0.000178)
