In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
import tensorflow as tf
import keras

import random as rn

# Seed NumPy's and Python's global random number generators.
np.random.seed(69)
rn.seed(2468)

# Restrict TensorFlow to one thread for both intra-op and inter-op work
# (presumably to reduce run-to-run nondeterminism -- TODO confirm intent).
session_conf = tf.ConfigProto(intra_op_parallelism_threads=1,
                              inter_op_parallelism_threads=1)


from keras import backend as K

# Seed TensorFlow's graph-level RNG before the session below is created.
tf.set_random_seed(13579)

# Create a session on the default graph with the config above and hand it to Keras.
sess = tf.Session(graph=tf.get_default_graph(), config=session_conf)
K.set_session(sess)

# Mode switch read by the later cells:
#   True  -> train on a train_test_split hold-out, score it with sklearn and write a CSV;
#   False -> fit on all of x/y with Keras' validation_split and track the auroc metric.
generate_data = True


# Feature matrix and labels, loaded from .npy files relative to the working directory.
x = np.load("data/x.npy")
y = np.load("data/y.npy")
# Use this when generating the actual roc score
# (33% of the rows are held out; random_state pins the split).
if generate_data:
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.33, random_state=42)

Using TensorFlow backend.


In [2]:
from keras.models import Sequential
from keras.layers.core import Dense, Activation, Flatten, Dropout
from keras.layers import Conv2D, MaxPooling2D, Flatten, BatchNormalization

In [3]:
# 2nd submission: test_size 0.3, batch_size=32, epochs=10, binary_crossentropy, adam
# model = Sequential()
# model.add(BatchNormalization(input_shape=(len(x_train[0]),)))
# model.add(Dense(300))
# model.add(Activation('relu'))
# model.add(Dropout(0.6))

# model.add(Dense(200))
# model.add(BatchNormalization())
# model.add(Activation('relu'))
# model.add(Dropout(0.6))


# model.add(Dense(100))
# model.add(BatchNormalization())
# model.add(Activation('relu'))
# model.add(Dropout(0.3))




# Classifier body: batch-normalise the raw features, then three
# Dense -> ReLU -> Dropout stages of 300, 200 and 100 units. The 200- and
# 100-unit stages insert BatchNormalization between Dense and the activation.
# Layers are listed in the exact order in which they are stacked.
model = Sequential([
    BatchNormalization(input_shape=(x.shape[1],)),
    Dense(300),
    Activation('relu'),
    Dropout(0.6),

    Dense(200),
    BatchNormalization(),
    Activation('relu'),
    Dropout(0.6),

    Dense(100),
    BatchNormalization(),
    Activation('relu'),
    Dropout(0.3),
])

In [4]:
# Output head: a single unit followed by a sigmoid activation.
# NOTE: this cell appends to the existing `model`; re-running it without first
# re-running the cell that creates `model` stacks a second output head.
model.add(Dense(1))
model.add(Activation('sigmoid'))

In [5]:
# Print the layer-by-layer summary (output shapes and parameter counts).
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
batch_normalization_1 (Batch (None, 382)               1528      
_________________________________________________________________
dense_1 (Dense)              (None, 300)               114900    
_________________________________________________________________
activation_1 (Activation)    (None, 300)               0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 300)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 200)               60200     
_________________________________________________________________
batch_normalization_2 (Batch (None, 200)               800       
_________________________________________________________________
activation_2 (Activation)    (None, 200)               0         
__________

In [6]:
def _batch_roc_auc(y_true, y_pred):
    """Compute ROC AUC for one batch of NumPy arrays, tolerating one-class batches.

    ``roc_auc_score`` raises ``ValueError`` when ``y_true`` contains a single
    class, which would abort training as soon as one such batch shows up.
    In that case the chance-level value 0.5 is returned instead.

    Args:
        y_true: Array of ground-truth labels for the batch.
        y_pred: Array of predicted scores for the batch.

    Returns:
        The batch ROC AUC as ``np.float64`` (0.5 when it is undefined).
    """
    if np.unique(y_true).size < 2:
        return np.float64(0.5)
    # Cast explicitly so the dtype always matches the tf.double declared below.
    return np.float64(roc_auc_score(y_true, y_pred))


def auroc(y_true, y_pred):
    """Keras metric wrapping sklearn's ROC AUC via ``tf.py_func``.

    Args:
        y_true: Tensor of ground-truth labels.
        y_pred: Tensor of predicted scores.

    Returns:
        A ``tf.double`` tensor holding the ROC AUC of the tensors passed in,
        or 0.5 when they contain only one class.
    """
    return tf.py_func(_batch_roc_auc, (y_true, y_pred), tf.double)

# Accuracy is always tracked. The auroc metric is added only when
# generate_data is False; in the generate_data path the hold-out AUROC is
# computed with sklearn after training instead.
tracked_metrics = ['accuracy']
if not generate_data:
    tracked_metrics.append(auroc)

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=tracked_metrics)

In [7]:
# `epochs` is the Keras 2 name of the argument formerly called `nb_epoch`.
if generate_data:
    # Hold-out mode: train only on the training part of the train_test_split.
    fit = model.fit(x_train, y_train, batch_size=32, epochs=12, verbose=1)
else:
    # Original author's note: "batch_size can be at most 33" with
    #   np.random.seed(69), rn.seed(2468), tf.set_random_seed(13579)
    # (at least for that run... it seemed that adding the validation split
    # breaks the ROC calculation).
    # NOTE(review): 33 matches the epoch count below, not batch_size=32, so the
    # note presumably refers to the number of epochs -- confirm.
    fit = model.fit(x, y, validation_split=0.33, batch_size=32, epochs=33, verbose=1)

  


Epoch 1/12
Epoch 2/12
Epoch 3/12
Epoch 4/12
Epoch 5/12
Epoch 6/12
Epoch 7/12
Epoch 8/12
Epoch 9/12
Epoch 10/12
Epoch 11/12
Epoch 12/12


In [8]:
def generate_csv(model, x_data, name):
    """Write the model's predictions for ``x_data`` to ``<name>.csv``.

    The file has two columns: ``id`` (0-based row position in ``x_data``) and
    ``target`` (the flattened output of ``model.predict(x_data)``). No index
    column is written.

    Args:
        model: Object exposing ``predict(x_data)``; the result must provide
            ``flatten()`` and yield one value per input row.
        x_data: Sized collection of input rows (``len(x_data)`` is used to
            build the ids).
        name: Output path without the ``.csv`` suffix. A missing parent
            directory is created.

    Raises:
        ValueError: If the number of predictions differs from ``len(x_data)``.
    """
    import os  # local import: the notebook's import cell does not bring in os

    out_path = name + ".csv"
    # to_csv does not create directories, so make sure e.g. "submissions/" exists.
    parent_dir = os.path.dirname(out_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    predictions = model.predict(x_data).flatten()

    df = pd.DataFrame(
        {'id': np.arange(len(x_data)), 'target': predictions},
        columns=['id', 'target'],
    )
    df.to_csv(out_path, index=False)

#     print df.head()


if generate_data:
    # Use only when split earlier: x_test / y_test come from train_test_split.
    holdout_scores = model.predict(x_test).flatten()
    print(roc_auc_score(y_test, holdout_scores))

    # Load the features from data/x_test.npy under their own name. Reusing
    # `x_test` here would overwrite the hold-out split, so re-running this cell
    # would score y_test against predictions for a different dataset.
    x_submission = np.load("data/x_test.npy")
    print(x_submission.shape)

    generate_csv(model, x_submission, "submissions/third_test")

0.7732769078355981
(16000, 382)


In [9]:
# import matplotlib.pyplot as plt

# # history = model.fit(x, y, validation_split=0.25, epochs=50, batch_size=16, verbose=1)

# # Plot training & validation accuracy values
# plt.plot(fit.history['acc'])
# # plt.plot(fit.history['val_acc'])
# plt.title('Model accuracy')
# plt.ylabel('Accuracy')
# plt.xlabel('Epoch')
# # plt.legend(['Train', 'Test'], loc='upper left')
# plt.show()

# # Plot training & validation loss values
# plt.plot(fit.history['loss'])
# # plt.plot(fit.history['val_loss'])
# plt.title('Model loss')
# plt.ylabel('Loss')
# plt.xlabel('Epoch')
# # plt.legend(['Train', 'Test'], loc='upper left')
# plt.show()


def _plot_train_vs_validation(history, key, title, ylabel, filename):
    """Plot a training curve against its validation counterpart and save it.

    Args:
        history: Mapping of metric name to per-epoch values (``fit.history``).
        key: Name of the training series; the validation series is read
            from ``'val_' + key``.
        title: Figure title.
        ylabel: Label of the y axis.
        filename: Path the figure is saved to.
    """
    fig, ax = plt.subplots()  # own figure per plot, so curves never overlap
    ax.plot(history[key])
    ax.plot(history['val_' + key])
    ax.set_title(title)
    ax.set_ylabel(ylabel)
    ax.set_xlabel('Epoch')
    ax.legend(['Train', 'Test'], loc='upper left')
    # Save BEFORE show(): once the figure has been shown (and, with the
    # notebook inline backend, closed), savefig would write an empty canvas.
    fig.savefig(filename)
    plt.show()


if not generate_data:
    # Use when validation was done in keras (fit.history then has val_* series).
    history = fit.history

    # Training vs. validation AUROC per epoch.
    _plot_train_vs_validation(history, 'auroc', 'Model AUCROC', 'AUCROC', "plot.png")
    best_auroc_epoch = np.argmax(history['val_auroc'])
    print('Test AUCROC maximized at iteration = {} with value of {}'.format(best_auroc_epoch + 1, history['val_auroc'][best_auroc_epoch]))

    # Training vs. validation loss per epoch.
    _plot_train_vs_validation(history, 'loss', 'Model loss', 'Loss', "ploooot.png")
    best_loss_epoch = np.argmin(history['val_loss'])
    print('Test error minimized at iteration = {} with value of {}'.format(best_loss_epoch + 1, history['val_loss'][best_loss_epoch]))
