In [1]:
import numpy as np

from keras.layers import Input, Dense, Dropout
from keras.layers.advanced_activations import LeakyReLU
from keras.models import Model, Sequential
import keras.backend as K

from keras.models import load_model

import xgboost as xgb

Using TensorFlow backend.


###### Load the data

In [2]:
features_ID_y = np.genfromtxt("features_ID_y.csv", delimiter=',')
np.random.shuffle(features_ID_y)
n_rows, n_cols = features_ID_y.shape
print(features_ID_y[0:10,0:5])
print("Number of rows: " + str(n_rows))
print("Number of cols: " + str(n_cols))

[[  2.85900000e+03   9.13199997e+01   4.50000000e+01   1.00000000e+00
    9.00000000e+00]
 [  3.07800000e+03   1.11419998e+02   4.90000000e+01   1.40000000e+01
    5.00000000e+00]
 [  8.73000000e+02   8.96600037e+01   4.00000000e+01   1.30000000e+01
    1.90000000e+01]
 [  6.11900000e+03   1.11050003e+02   5.00000000e+01   2.30000000e+01
    2.90000000e+01]
 [  3.10300000e+03   1.11150002e+02   2.30000000e+01   1.00000000e+01
    1.90000000e+01]
 [  7.08400000e+03   7.63199997e+01   2.80000000e+01   2.50000000e+01
    3.00000000e+00]
 [  5.31400000e+03   1.08860001e+02   6.00000000e+00   2.00000000e+01
    1.90000000e+01]
 [  7.46800000e+03   8.90500031e+01   4.60000000e+01   1.00000000e+00
    5.00000000e+00]
 [  8.39000000e+03   9.99300003e+01   5.10000000e+01   1.60000000e+01
    3.70000000e+01]
 [  6.36500000e+03   8.08199997e+01   2.40000000e+01   1.30000000e+01
    3.80000000e+01]]
Number of rows: 4209
Number of cols: 378


###### Load the encoder

In [3]:
model_file = "encoder_dim_64_mean_absolute_error.h5"
# model_file = "encoder_dim_48_mean_absolute_error.h5"
encoder = load_model(model_file)



In [4]:
encoder.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
autoencoder_input (InputLaye (None, 376)               0         
_________________________________________________________________
encoder_first_layer (Dense)  (None, 128)               48256     
_________________________________________________________________
leaky_re_lu_1 (LeakyReLU)    (None, 128)               0         
_________________________________________________________________
encoder_first_layer_dropout  (None, 128)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 64)                8256      
_________________________________________________________________
leaky_re_lu_2 (LeakyReLU)    (None, 64)                0         
_________________________________________________________________
encoder (Dropout)            (None, 64)                0         
Total para

In [5]:
print(features_ID_y[:,2:].shape)
encoded_features = encoder.predict(features_ID_y[:,2:])
print(encoded_features.shape)

(4209, 376)
(4209, 64)


###### Loss function

In [6]:
def R2(y_true, y_pred):
    S_res = K.sum(K.square(y_true - y_pred))
    y_bar = K.mean(y_true)
    S_tot = K.sum(K.square(y_true - y_bar))
    return -(1.0 - (S_res/S_tot))

def R2_np(y_true, y_pred):
    S_res = np.sum((y_true - y_pred)*(y_true - y_pred))
    y_bar = np.mean(y_true)
    S_tot = np.sum((y_true - y_bar)*(y_true - y_bar))
    return (1.0 - (S_res/S_tot))

In [7]:
a1 = K.random_normal_variable(shape=(3, 1), mean=0.0, scale=1.0) 
a2 = K.random_normal_variable(shape=(3, 1), mean=0.0, scale=1.0)

r = R2(a1, a2)
K.eval(r)

7.1304522

In [8]:
b1 = np.array([1,2,3,4])
b2 = np.array([1,2.1,3,3.9])

R2_np(b1, b2)

0.996

###### Simple Network

In [9]:
class Network:
    
    def __init__(self, 
                 input_shape=16, 
                 output_shape=1, 
                 layers = []):

        dropout_rate = 0.5
        alpha = 0.3
        
        model_input = Input(shape=(input_shape,))
        x = model_input
        for l in range(len(layers)):
            x = Dense(layers[l], activation="linear")(x)
            x = LeakyReLU(alpha=alpha)(x)
            x = Dropout(dropout_rate)(x)
        
        x = Dense(output_shape, activation='linear')(x)
        
        self.model = Model(model_input, x)
        self.model.compile(optimizer="adadelta", loss=R2)
        
    def fit(self, x, y, xv, yv, batch_size=128, epochs=100, verbose=0):
        self.model.fit(x=x, 
                       y=y, 
                       batch_size=batch_size, 
                       epochs=epochs, 
                       verbose=verbose,
                       validation_data=(xv, yv))
        
    def predict(self, x, batch_size=128):
        return self.model.predict(x, batch_size=batch_size)
    
    def evaluate(self, x, y, verbose=0):
        return self.model.evaluate(x=x, y=y, verbose=verbose)
        

In [10]:
network_model = Network(input_shape=364, layers=[32, 32, 32, 32])
network_model.model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 364)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 32)                11680     
_________________________________________________________________
leaky_re_lu_1 (LeakyReLU)    (None, 32)                0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 32)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 32)                1056      
_________________________________________________________________
leaky_re_lu_2 (LeakyReLU)    (None, 32)                0         
_________________________________________________________________
dropout_2 (Dropout)          (None, 32)                0         
__________

###### Many model cross validation approach

In [11]:
def corss_validation(data,
                     n_sections,
                     ts=0.2,
                     epochs=10,
                     layers=[32, 32, 32, 32]):
    n_sections = 10
    n_rows = len(data)
    step = int(n_rows/n_sections)
    print("step (validation size): " + str(step))
    ptr = 0
    network_models = []
    
    
    train_valid = data[0:int((1 - ts)*n_rows),:]
    test = data[int((1 - ts)*n_rows):,:]
    
    y_test = test[:, 0]
    x_test = test[:, 1:]
    y_test = np.reshape(y_test, (-1, 1))
    y_test_pred_avarage = np.zeros((len(y_test), 1))
    
    print("train_valid shape: " + str(train_valid.shape))
    print("test shape: " + str(test.shape))
    
    for i in range(n_sections):
        print("\nIn section i = " + str(i))
        
        local_valid = train_valid[ptr:(ptr + step), :]

        rows_to_delete = np.arange(ptr, ptr + step)
        local_train = np.delete(train_valid, rows_to_delete, 0)
        ptr = ptr + step
        
        y_valid = local_valid[:, 0]
        x_valid = local_valid[:, 1:]
        
        y_train = local_train[:, 0]
        x_train = local_train[:, 1:]
        
        lrv, lcv = x_valid.shape
        lrt, lct = x_train.shape
        
        if (lcv != lct):
            print("x_valid and x_train dimension mismatch!")
            break
        
        local_network_model = Network(input_shape=lcv, layers=layers)
        local_network_model.fit(x_train, 
                                y_train, 
                                x_valid, 
                                y_valid, 
                                epochs=epochs, 
                                verbose=0)
        
        y_valid_pred = local_network_model.predict(x_valid)
        y_train_pred = local_network_model.predict(x_train)
        
        y_valid = np.reshape(y_valid, (-1, 1))
        local_valid_R2 = R2_np(y_valid, y_valid_pred)
        
        y_train = np.reshape(y_train, (-1, 1))
        local_train_R2 = R2_np(y_train, y_train_pred)
        
        print("local_train_R2: " + str(local_train_R2))
        print("local_valid_R2: " + str(local_valid_R2))
        print("local_valid shape: " + str(local_valid.shape))
        print("local_train shape: " + str(local_train.shape))
        
        network_models.append(local_network_model)
    
    
        y_test_pred = local_network_model.predict(x_test)
        local_test_R2 = R2_np(y_test, y_test_pred)
        
        y_test_pred_avarage = y_test_pred_avarage + y_test_pred
        print("local_test_R2: " + str(local_test_R2))
        
    y_test_pred_avarage = y_test_pred_avarage/n_sections
    average_R2 = R2_np(y_test, y_test_pred_avarage)
    print("\naverage R2: " + str(average_R2))
    
    return network_models

In [12]:
y = np.reshape(features_ID_y[:,1],(-1, 1))
features_y = np.hstack((y, encoded_features))

print(features_y.shape)

many_models = corss_validation(data=features_y, 
                               n_sections=10,
                               ts=0.1,
                               epochs=2000,
                               layers=[64, 64, 64, 64])

(4209, 65)
step (validation size): 420
train_valid shape: (3788, 65)
test shape: (421, 65)

In section i = 0
local_train_R2: 0.57435781179
local_valid_R2: 0.695356057214
local_valid shape: (420, 65)
local_train shape: (3368, 65)
local_test_R2: 0.566097912475

In section i = 1
local_train_R2: 0.595267972407
local_valid_R2: 0.450356332061
local_valid shape: (420, 65)
local_train shape: (3368, 65)
local_test_R2: 0.567660546192

In section i = 2
local_train_R2: 0.574437083792
local_valid_R2: 0.550496028076
local_valid shape: (420, 65)
local_train shape: (3368, 65)
local_test_R2: 0.545233451579

In section i = 3
local_train_R2: 0.585851985914
local_valid_R2: 0.495668631224
local_valid shape: (420, 65)
local_train shape: (3368, 65)
local_test_R2: 0.558006394812

In section i = 4
local_train_R2: 0.577496343767
local_valid_R2: 0.599821858762
local_valid shape: (420, 65)
local_train shape: (3368, 65)
local_test_R2: 0.543777240878

In section i = 5
local_train_R2: 0.593926774037
local_valid_R2: 



local_train_R2: 0.586824907206
local_valid_R2: 0.656831560681
local_valid shape: (8, 65)
local_train shape: (3780, 65)
local_test_R2: 0.562288580852

average R2: 0.560490945917


# XGBoost models

# Dispatch on real test data

###### Read the test data

In [20]:
test_features_ID_y = np.genfromtxt("test_features_ID_y.csv", delimiter=',')
n_rows_ts, n_cols_ts = test_features_ID_y.shape
print("Test featuers shape: " + str(test_features_ID_y.shape))

Test featuers shape: (4209, 378)


###### Encoder the test data

In [21]:
encoded_test_features = encoder.predict(test_features_ID_y[:,2:])
print(encoded_test_features.shape)

(4209, 64)


In [22]:
def predict_with_many_models(many_models, data):
    
    n_rows_data, n_cols_data = data.shape
    res = np.zeros((n_rows_data, 1))
    
    n_models = len(many_models)
    for i in range(n_models):
        res = res + many_models[i].predict(data)
    res = res/n_models
    
    return res

def save_res(file_name, res, id_col):

    f = open(file_name, "w")
    f.write("ID,y\n")
    for i in range(len(res)):
        s = str(int(id_col[i])) + "," + str(res[i][0]) + "\n"
        f.write(s)
    f.close()

In [23]:
res = predict_with_many_models(many_models, encoded_test_features)

In [24]:
save_res("temp.csv", res, test_features_ID_y[:,0])

In [18]:
res[0:10]

array([[  78.41011658],
       [  93.10310135],
       [  78.26832428],
       [  78.28930206],
       [ 109.22426605],
       [  90.99972382],
       [ 107.75654449],
       [  94.8952652 ],
       [ 115.06167526],
       [  93.6342926 ]])

# Scratchpad

In [19]:
test_features_ID_y[:,0:1]

array([[  1.00000000e+00],
       [  2.00000000e+00],
       [  3.00000000e+00],
       ..., 
       [  8.41300000e+03],
       [  8.41400000e+03],
       [  8.41600000e+03]])