In [1]:
from __future__ import absolute_import, division, print_function
from sklearn.model_selection import train_test_split
from math import sqrt
import matplotlib.pyplot as plt
import tensorflow as tf
import pandas as pd
import numpy as np
import keras
import os
import time

Using TensorFlow backend.


## Validate with Neural Network

In [2]:
# Read the list of validation files: one path per line of valid-list.txt.
# strip() removes the line terminator without chopping a real character when
# the final line has no trailing newline; blank lines are skipped.
with open('valid-list.txt') as valid_files:
    filenames = [entry.strip() for entry in valid_files if entry.strip()]

data = []        # feature rows that contain no NaN
numindex = []    # running row index (across all files) of every row kept in `data`
nanindex = []    # running row index of every row dropped because of NaN
filecount = 0.0
datacount = -1   # running row index across all files; first row is 0
for filename in filenames:
    filecount += 1
    print("Loading {:.2%}".format(filecount/len(filenames)), end="\r")
    # The context manager closes the file even if a line fails to parse.
    with open(filename) as file:
        for line in file:
            datacount += 1
            # The last whitespace-separated token of each line is not converted.
            features = list(map(float, line.split()[:-1]))
            # A NaN anywhere in the row makes the sum NaN.
            if np.isnan(sum(features)):
                nanindex.append(datacount)
                continue
            numindex.append(datacount)
            data.append(features)

Loading 100.00%

In [3]:
# Number of NaN-free rows collected in `data` by the loading loop.
len(data)

24798521

In [5]:
%%time
# Materialise the loaded rows as one 2-D array (rows x columns).
data = np.array(data)

# Model inputs: keep columns 0-2 and 6-34 (32 columns in total).
# Columns 3, 4 and 5 are left out; column 4 is the source of the label below.
X = data[:, [0, 1, 2]+list(range(6,35))]

# Divide every column by its L2 norm taken over all rows (axis=0).
# NOTE(review): the scale therefore depends on the rows present in this array.
X = keras.utils.normalize(X, axis=0, order=2)

# Binary target: 1 where column 4 equals 9999, otherwise 0.
# Vectorised comparison instead of a Python-level loop over every row.
y = (data[:, 4] == 9999).astype(int)

CPU times: user 1min 27s, sys: 5.6 s, total: 1min 33s
Wall time: 1min 32s


In [6]:
# Hyper-parameters
# Training hyper-parameters, gathered in one place.
# `epochs` is expected to be tuned; larger `batch_size` values run faster.
epochs = 20
batch_size = 64
learning_rate = 1e-3

In [7]:
class Model_FC:
    """Fully connected binary classifier.

    Four ReLU dense layers (128, 64, 32, 16 units) followed by a single
    sigmoid unit. The class only groups two static helpers; it holds no
    state and is used as ``Model_FC.build()``.
    """

    @staticmethod
    def forward(inputs):
        """Apply the dense stack to ``inputs`` and return the output tensor.

        Args:
            inputs: Keras tensor fed to the first dense layer.

        Returns:
            Output tensor of the final one-unit sigmoid layer.
        """
        x = inputs
        # Layers are created in the same order as before, so automatic layer
        # naming and order-based weight loading are unaffected.
        for units in (128, 64, 32, 16):
            x = keras.layers.Dense(units, activation=tf.nn.relu)(x)
        return keras.layers.Dense(1, activation=tf.nn.sigmoid)(x)

    @staticmethod
    def build(input_dim=None):
        """Create the Keras model.

        Args:
            input_dim: Number of input features. When omitted, the width of
                the notebook-level feature matrix ``X`` is used, which is the
                original behaviour. Pass it explicitly when ``X`` may have
                been rebound to something else.

        Returns:
            An uncompiled ``keras.Model`` mapping the input layer to the
            output of ``forward``.
        """
        if input_dim is None:
            input_dim = X.shape[1]
        inputs = keras.layers.Input(shape=(input_dim,))
        outputs = Model_FC.forward(inputs)
        return keras.Model(inputs=inputs, outputs=outputs)

# Instantiate the network, then attach the Adam optimizer, the binary
# cross-entropy loss and the precision / recall metrics.
model = Model_FC.build()
optimizer = keras.optimizers.Adam(learning_rate)
model.compile(
    optimizer=optimizer,
    loss='binary_crossentropy',
    metrics=[tf.keras.metrics.Precision(), tf.keras.metrics.Recall()],
)
# Print the layer-by-layer overview.
model.summary()

Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 32)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 128)               4224      
_________________________________________________________________
dense_2 (Dense)              (None, 64)                8256      
_________________________________________________________________
dense_3 (Dense)              (None, 32)                2080      
_________________________________________________________________
dense_4 (Dense)              (None, 16)                528       
_________________________________________________________________
dense_5 (Dense)              (None, 1)                 17        
Total params: 15,105
Trainable params: 15,105
Non-trainable params: 0
_______________________________________________________

In [8]:
# Load the network weights from the HDF5 file into `model`.
model.load_weights('weightsAfterUS_multi_FINALWEIGHTS.hdf5')
# Score every row of X with the network.
# NOTE(review): X must still be the normalized 32-column feature matrix here.
# A later cell rebinds X to the two-column ensemble input, so re-running this
# cell after that one feeds the wrong array to the model.
pred_NN = model.predict(X)

## Validate with Boosted Trees

In [12]:
# BT = boosted trees. scores.txt holds one score per line.
# float() ignores the trailing newline, so no character is sliced off; slicing
# would corrupt the last value of a file that lacks a final newline.
with open('scores.txt') as file:
    pred_BT = [float(score) for score in file]
print(len(pred_BT))

pred_BT = np.array(pred_BT)
# Integer dtype keeps this usable as an index array even when it is empty.
numindex = np.array(numindex, dtype=int)
# Keep only the scores of rows that survived the NaN filter, so that they line
# up row-for-row with the network predictions.
pred_BT_nonan = pred_BT[numindex]
print(len(pred_BT_nonan))

30485738
24798521


## Train an Ensemble Model

In [48]:
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
# Ensemble inputs: one row per NaN-free sample, columns = (BT score, NN score).
# NOTE: this rebinds X (previously the 32-column network input) and y.
X = np.stack((pred_BT_nonan, pred_NN.squeeze())).T
# Binary target: 1 where column 4 equals 9999, otherwise 0 (vectorised).
y = (data[:, 4] == 9999).astype(int)
clf = LogisticRegression(random_state=0).fit(X, y)
# NOTE(review): these probabilities are computed on the rows used for fitting,
# so they are in-sample estimates.
clf.predict_proba(X)

array([[0.99415873, 0.00584127],
       [0.99448336, 0.00551664],
       [0.99395738, 0.00604262],
       ...,
       [0.95098441, 0.04901559],
       [0.99350139, 0.00649861],
       [0.99404092, 0.00595908]])

In [127]:
# Hard labels at the 0.5 cut. The BT comparison runs the other way round
# (score below 0.5 maps to label 1).
label_NN = (pred_NN > 0.5).squeeze().astype(int)
label_BT = (pred_BT_nonan < 0.5).astype(int)
label_EN = clf.predict(X)

# Fraction of rows whose predicted label equals y, one line per model.
for caption, labels in (('NN Accuracy: ', label_NN),
                        ('BT Accuracy: ', label_BT),
                        ('EN Accuracy: ', label_EN)):
    print(caption, (labels == y).sum() / len(y))

NN Accuracy:  0.9530953075790286
BT Accuracy:  0.9459002413893958
EN Accuracy:  0.9738177530829358


## Inference on Test Data

In [58]:
# Collect every entry of the test directory.
# NOTE(review): os.listdir() returns entries in arbitrary order, while the row
# indices recorded below are later used to index an externally produced score
# array. Confirm both were generated with the same file order.
test_dir = '/home/ubuntu/data/test/'
test_files = os.listdir(test_dir)
filenames = [os.path.join(test_dir, x) for x in test_files]

test_data = []   # feature rows that contain no NaN
numindex = []    # running row index (across all files) of every row kept
nanindex = []    # running row index of every row dropped because of NaN
filecount = 0.0
datacount = -1   # running row index across all files; first row is 0
for filename in filenames:
    filecount += 1
    print("Loading {:.2%}".format(filecount/len(filenames)), end="\r")
    # The context manager closes the file even if a line fails to parse.
    with open(filename) as file:
        for line in file:
            datacount += 1
            # The last whitespace-separated token of each line is not converted.
            features = list(map(float, line.split()[:-1]))
            # A NaN anywhere in the row makes the sum NaN.
            if np.isnan(sum(features)):
                nanindex.append(datacount)
                continue
            numindex.append(datacount)
            test_data.append(features)

Loading 100.00%

In [60]:
# Turn the loaded rows into a 2-D array and select the 32 input columns
# (0-2 and 6-34).
test_data = np.array(test_data)
X_test = test_data[:, [0, 1, 2] + list(range(6, 35))]

# Column-wise L2 normalisation over the rows of this array.
# NOTE(review): the norms come from the test rows themselves, not from the
# array the network was validated on. Confirm this matches training.
X_test = keras.utils.normalize(X_test, axis=0, order=2)

# Network scores flattened to one value per row.
test_pred_NN = model.predict(X_test).squeeze()

In [69]:
# Read the boosted-tree test scores from the 'scores' entry of the archive.
# The context manager closes the underlying .npz file once the array is read.
with np.load('final_test_scores.npz') as score_archive:
    test_pred_BT = score_archive['scores']

In [73]:
# Integer dtype matters: np.array([]) is float64 and cannot be used as an index
# array, which would fail when no row (or every row) contains NaN.
numindex = np.array(numindex, dtype=int)
nanindex = np.array(nanindex, dtype=int)
# BT scores of the NaN-free rows, aligned with the network predictions.
test_pred_BT_nonan = test_pred_BT[numindex]
# Ensemble inputs: columns = (BT score, NN score), the order used for fitting.
# NOTE: this rebinds X once more.
X = np.stack((test_pred_BT_nonan, test_pred_NN)).T
# Column 0 of predict_proba is the probability of the first entry in
# clf.classes_. NOTE(review): presumably chosen so the result follows the BT
# score direction (low score = label 1) - confirm.
final_pred_nonan = clf.predict_proba(X)[:,0]

In [95]:
# One output slot per boosted-tree score; rows in neither index list stay 0.
final_pred = np.zeros(len(test_pred_BT))
# Coerce both index collections to integer arrays: an empty list converted
# without a dtype becomes float64, which NumPy rejects as an index.
kept_rows = np.asarray(numindex, dtype=int)
nan_rows = np.asarray(nanindex, dtype=int)
# NaN-free rows receive the ensemble output.
final_pred[kept_rows] = final_pred_nonan
# Rows with NaN features had no network score; fall back to the raw BT score.
final_pred[nan_rows] = test_pred_BT[nan_rows]

In [96]:
# Make sure the scores are an ndarray, then write them to disk in .npy format.
final_pred = np.asarray(final_pred)
np.save('final_pred_ensemble.npy', final_pred)

In [97]:
# Display the merged score vector that was just saved.
final_pred

array([0.8550002 , 0.88063872, 0.8397234 , ..., 0.99463934, 0.99470902,
       0.99469946])

In [99]:
# Fitted logistic-regression coefficients, in the column order used for
# fitting: (BT score, NN score).
clf.coef_

array([[-6.37796625,  0.44220557]])