In [1]:
import numpy as np
import pandas as pd
import itertools

#feature scaling
from sklearn.preprocessing import StandardScaler, RobustScaler

# Utils
from sklearn.model_selection import train_test_split, StratifiedKFold

In [2]:
# Limit TensorFlow GPU memory usage: with allow_growth the GPU memory is
# allocated on demand instead of being reserved up front.

import tensorflow as tf

config = tf.ConfigProto()
config.gpu_options.allow_growth = True
# NOTE(review): this session is never handed to Keras explicitly in this
# notebook; presumably creating it first is what makes the GPU option take
# effect -- confirm, or register it via keras.backend.set_session(session).
session = tf.Session(config=config)

# Fix the NumPy and TensorFlow seeds used by the stochastic parts of the
# neural network code below.
np.random.seed(43)
tf.set_random_seed(43)

In [3]:
from keras.models import Sequential
from keras.layers import Dense, Activation, Merge, Reshape, Dropout
from keras.layers.embeddings import Embedding

Using TensorFlow backend.


In [4]:
# Load the training and test sets; pandas infers the gzip compression from
# the '.gz' extension.
df_train = pd.read_csv('train.gz')
df_test = pd.read_csv('test.gz')

In [5]:
# Separate features from the label: training features are taken from column
# index 3 onwards, test features from column index 1 onwards, and the label
# is the `target` column of the training frame.
# NOTE(review): assumes the columns before those offsets hold only
# ids/target (no features) -- confirm against the layout of the input files.
X_train, y_train = df_train.iloc[:,3:], df_train.target
X_test = df_test.iloc[:,1:]

In [6]:
# Keep every feature column except the ones whose name starts with 'ps_calc_'.
cols_use = X_train.columns[~X_train.columns.str.startswith('ps_calc_')].tolist()

In [7]:
# Restrict both feature frames to the selected columns, in the same order.
X_train = X_train[cols_use]
X_test = X_test[cols_use]

In [8]:
# Distinct values seen in the training data for every '*_cat' column.
cat_cols = [name for name in X_train.columns if name.endswith('_cat')]
col_vals_dict = {name: list(X_train[name].unique()) for name in cat_cols}

In [9]:
# Categorical columns with more than two distinct values are embedded; print
# their cardinalities, which are needed to size the embedding layers.
embed_cols = [c for c in col_vals_dict if len(col_vals_dict[c]) > 2]
for c in embed_cols:
    print(c + ': %d values' % len(col_vals_dict[c]))

print('\n')

ps_ind_02_cat: 5 values
ps_ind_04_cat: 3 values
ps_ind_05_cat: 8 values
ps_car_01_cat: 13 values
ps_car_02_cat: 3 values
ps_car_03_cat: 3 values
ps_car_04_cat: 10 values
ps_car_05_cat: 3 values
ps_car_06_cat: 18 values
ps_car_07_cat: 3 values
ps_car_09_cat: 6 values
ps_car_10_cat: 3 values
ps_car_11_cat: 104 values




In [10]:
# (column, number of distinct values, embedding size) for every embedded
# categorical column, in the order in which the inputs are fed to the network.
EMBEDDING_SPECS = (
    ('ps_ind_02_cat', 5, 3),
    ('ps_ind_04_cat', 3, 2),
    ('ps_ind_05_cat', 8, 5),
    ('ps_car_01_cat', 13, 7),
    ('ps_car_02_cat', 3, 2),
    ('ps_car_03_cat', 3, 2),
    ('ps_car_04_cat', 10, 5),
    ('ps_car_05_cat', 3, 2),
    ('ps_car_06_cat', 18, 8),
    ('ps_car_07_cat', 3, 2),
    ('ps_car_09_cat', 6, 3),
    ('ps_car_10_cat', 3, 2),
    ('ps_car_11_cat', 104, 10),
)


def build_embedding_network(embedding_specs=None, n_other_features=24):
    """Build and compile the entity-embedding network.

    Each entry of `embedding_specs` yields one branch that embeds a single
    integer-coded categorical input. One extra Dense(16) branch receives the
    remaining `n_other_features` columns. All branches are concatenated and
    followed by three Dense/ReLU/Dropout blocks and a sigmoid output unit.

    Parameters
    ----------
    embedding_specs : iterable of (name, n_values, embed_dim), optional
        One tuple per embedded column; `name` is informational only. Defaults
        to EMBEDDING_SPECS. The order must match the order of the input list
        passed to fit/predict.
    n_other_features : int, optional
        Width of the non-embedded feature block (default 24).

    Returns
    -------
    A Sequential model compiled with binary cross-entropy loss and the Adam
    optimizer.
    """
    if embedding_specs is None:
        embedding_specs = EMBEDDING_SPECS

    models = []

    # One small embedding branch per categorical column.
    for _name, n_values, embed_dim in embedding_specs:
        branch = Sequential()
        branch.add(Embedding(n_values, embed_dim, input_length=1))
        # Flatten the embedding output to a plain vector of size embed_dim.
        branch.add(Reshape(target_shape=(embed_dim,)))
        models.append(branch)

    # Branch for all remaining (non-embedded) columns.
    model_rest = Sequential()
    model_rest.add(Dense(16, input_dim=n_other_features))
    models.append(model_rest)

    model = Sequential()
    model.add(Merge(models, mode='concat'))
    for units, drop_rate in ((80, .35), (20, .15), (10, .15)):
        model.add(Dense(units))
        model.add(Activation('relu'))
        model.add(Dropout(drop_rate))
    model.add(Dense(1))
    model.add(Activation('sigmoid'))

    model.compile(loss='binary_crossentropy', optimizer='adam')

    return model

In [11]:
#converting data to list format to match the network structure
def preproc(X_train, X_val, X_test):
    """Turn three feature frames into the list-of-arrays format of the network.

    For every column in the global `embed_cols` the raw values are replaced by
    integer codes 0..k-1, following the sorted order of the values present in
    `X_train`. Values of `X_val` / `X_test` that do not occur in `X_train`
    map to NaN and are then replaced by code 0. All remaining columns are
    appended as one 2-D array at the end of each list.

    Returns the tuple (train inputs, validation inputs, test inputs).
    """
    train_inputs, val_inputs, test_inputs = [], [], []

    # one integer-coded array per embedded column
    for col in embed_cols:
        codes = {raw: code for code, raw in enumerate(np.unique(X_train[col]))}
        train_inputs.append(X_train[col].map(codes).values)
        val_inputs.append(X_val[col].map(codes).fillna(0).values)
        test_inputs.append(X_test[col].map(codes).fillna(0).values)

    # everything that is not embedded goes in as a single block
    dense_cols = [col for col in X_train.columns if col not in embed_cols]
    for frame, inputs in ((X_train, train_inputs),
                          (X_val, val_inputs),
                          (X_test, test_inputs)):
        inputs.append(frame[dense_cols].values)

    return train_inputs, val_inputs, test_inputs

In [12]:
#gini scoring function from kernel at: 
#https://www.kaggle.com/tezdhar/faster-gini-calculation
def ginic(actual, pred):
    """Unnormalized Gini score of `actual` ordered by ascending `pred`.

    Parameters
    ----------
    actual : array-like
        True labels.
    pred : array-like
        Predicted scores, same length as `actual`.

    Returns
    -------
    float
        The raw (unnormalized) score; divide by ginic(actual, actual) to
        normalize.
    """
    # Work on plain ndarrays so that indexing below is positional; a pandas
    # Series would be indexed by label and a list cannot be fancy-indexed.
    actual = np.asarray(actual)
    pred = np.asarray(pred)
    n = len(actual)
    a_s = actual[np.argsort(pred)]
    a_c = a_s.cumsum()
    giniSum = a_c.sum() / a_c[-1] - (n + 1) / 2.0
    return giniSum / n

In [13]:
def gini_normalizedc(a, p):
    """Normalized Gini: score of predictions `p` relative to ranking by the labels `a`."""
    model_score = ginic(a, p)
    # score obtained when the labels themselves are used as the ranking
    reference_score = ginic(a, a)
    return model_score / reference_score

In [14]:
# Network training: cross-validation settings and result buffers.
# NOTE: a later cell runs `import keras.backend as K`, which rebinds `K`;
# re-running this cell or the training loop after that cell will break.
K = 8              # number of cross-validation folds
runs_per_fold = 3  # networks trained (and averaged) per fold
n_epochs = 15      # training epochs per network

cv_ginis = []                                    # one normalized Gini per fold
full_val_preds = np.zeros(np.shape(X_train)[0])  # out-of-fold prediction per training row
y_preds = np.zeros((np.shape(X_test)[0],K))      # test predictions, one column per fold

# Stratified, shuffled folds with a fixed seed.
kfold = StratifiedKFold(n_splits = K, 
                            random_state = 231, 
                            shuffle = True)    

In [15]:
# Cross-validated training. For every fold: duplicate the positive training
# rows, train `runs_per_fold` networks, and average their predictions on the
# held-out rows and on the test set.
for i, (f_ind, outf_ind) in enumerate(kfold.split(X_train, y_train)):

    # kfold.split yields positional indices, so rows are selected with .iloc
    # (label-based .loc only works by accident with a default RangeIndex).
    X_train_f, X_val_f = X_train.iloc[f_ind].copy(), X_train.iloc[outf_ind].copy()
    y_train_f, y_val_f = y_train.iloc[f_ind], y_train.iloc[outf_ind]
    
    X_test_f = X_test.copy()
    
    #upsampling adapted from kernel: 
    #https://www.kaggle.com/ogrellier/xgb-classifier-upsampling-lb-0-283
    pos = (pd.Series(y_train_f == 1))
    
    # Add positive examples (each positive row of the fold appears twice)
    X_train_f = pd.concat([X_train_f, X_train_f.loc[pos]], axis=0)
    y_train_f = pd.concat([y_train_f, y_train_f.loc[pos]], axis=0)
    
    # Shuffle data (same permutation for features and labels)
    idx = np.arange(len(X_train_f))
    np.random.shuffle(idx)
    X_train_f = X_train_f.iloc[idx]
    y_train_f = y_train_f.iloc[idx]
    
    #preprocessing
    proc_X_train_f, proc_X_val_f, proc_X_test_f = preproc(X_train_f, X_val_f, X_test_f)
    
    #track oof prediction for cv scores
    val_preds = 0
    
    for j in range(runs_per_fold):
    
        # A fresh network per run; predictions are averaged over the runs.
        NN = build_embedding_network()
        NN.fit(proc_X_train_f, y_train_f.values, epochs=n_epochs, batch_size=4096)
   
        val_preds += NN.predict(proc_X_val_f)[:,0] / runs_per_fold
        y_preds[:,i] += NN.predict(proc_X_test_f)[:,0] / runs_per_fold
        
    # outf_ind is positional, matching the plain ndarray buffer
    full_val_preds[outf_ind] += val_preds
        
    cv_gini = gini_normalizedc(y_val_f.values, val_preds)
    cv_ginis.append(cv_gini)
    print ('\nFold %i prediction cv gini: %.5f\n' %(i,cv_gini))





Fold 0 prediction cv gini: 0.28914


Fold 1 prediction cv gini: 0.25813


Fold 2 prediction cv gini: 0.28737


Fold 3 prediction cv gini: 0.27478


Fold 4 prediction cv gini: 0.26562


Fold 5 prediction cv gini: 0.28751


Fold 6 prediction cv gini: 0.27874


Fold 7 prediction cv gini: 0.28062



In [16]:
# Report the mean of the per-fold scores and the score of the concatenated
# out-of-fold predictions over the whole training set.
print('Mean out of fold gini: %.5f' % np.mean(cv_ginis))
print('Full validation gini: %.5f' % gini_normalizedc(y_train.values, full_val_preds))

# Final test prediction: average over the per-fold columns.
y_pred_final = np.mean(y_preds, axis=1)

Mean out of fold gini: 0.27774
Full validation gini: 0.27738


In [18]:
# Write the submission file (id, target) for the test set.
# NOTE(review): the file names say "10fold" although the number of folds set
# in this notebook is 8 -- confirm which one is intended.
df_sub = pd.DataFrame({'id' : df_test.id, 
                       'target' : y_pred_final},
                       columns = ['id','target'])
df_sub.to_csv('NN_EntityEmbed_10fold-sub.csv', index=False)

# Also store the out-of-fold predictions for the training rows.
pd.DataFrame(full_val_preds).to_csv('NN_EntityEmbed_10fold-val_preds.csv',index=False)

In [None]:
import keras.backend as K

class WeightedBinaryCrossEntropy(object):
    """Callable loss: binary cross-entropy with an up-weighted positive class.

    The positive term is weighted by (1 - pos_ratio) / pos_ratio and the whole
    cost is then scaled by pos_ratio.

    Parameters
    ----------
    pos_ratio : float
        Presumably the fraction of positive samples in the data; must be
        non-zero because it is used as a divisor.
    """

    def __init__(self, pos_ratio):
        neg_ratio = 1. - pos_ratio
        self.pos_ratio = tf.constant(pos_ratio, tf.float32)
        self.weights = tf.constant(neg_ratio / pos_ratio, tf.float32)
        # Keras reads __name__ from the loss object, so give it a readable one.
        self.__name__ = "weighted_binary_crossentropy({0})".format(pos_ratio)

    def __call__(self, y_true, y_pred):
        """Make the instance usable directly as `loss=` in compile()."""
        return self.weighted_binary_crossentropy(y_true, y_pred)

    def weighted_binary_crossentropy(self, y_true, y_pred):
        """Return the weighted cross-entropy of probabilities `y_pred`."""
        # Transform to logits; clip first so that log() stays finite.
        # K.epsilon() is the public accessor for the backend fuzz factor.
        epsilon = tf.convert_to_tensor(K.epsilon(), y_pred.dtype.base_dtype)
        y_pred = tf.clip_by_value(y_pred, epsilon, 1 - epsilon)
        y_pred = tf.log(y_pred / (1 - y_pred))

        cost = tf.nn.weighted_cross_entropy_with_logits(y_true, y_pred, self.weights)
        return K.mean(cost * self.pos_ratio, axis=-1)

In [None]:
# Autoencoder used to pretrain the first two Dense layers (the encoder).
# Layout: 133 inputs -> 8000 -> 1000 (encoder) -> 1000 -> 8000 -> 133 (decoder);
# every hidden Dense layer is followed by ELU, batch normalization and 50% dropout.
# NOTE(review): 133 is presumably the width of the scaled feature matrix
# built elsewhere -- confirm.
inputs=tf.keras.layers.Input(shape=(133,))
#x=tf.keras.layers.Dense(37, kernel_initializer='he_normal', bias_initializer='he_normal')(inputs)
#x=tf.keras.layers.ELU()(x)
#x=tf.keras.layers.BatchNormalization()(x)
#x=tf.keras.layers.Dropout(0.5)(x)
#x=tf.keras.layers.add([inputs,x])
# Encoder; `encoded` ends up being the output tensor of the second block.
encoded=tf.keras.layers.Dense(8000, kernel_initializer='he_normal', bias_initializer='he_normal')(inputs)
encoded=tf.keras.layers.ELU()(encoded)
encoded=tf.keras.layers.BatchNormalization()(encoded)
encoded=tf.keras.layers.Dropout(0.5)(encoded)
encoded=tf.keras.layers.Dense(1000, kernel_initializer='he_normal', bias_initializer='he_normal')(encoded)
encoded=tf.keras.layers.ELU()(encoded)
encoded=tf.keras.layers.BatchNormalization()(encoded)
encoded=tf.keras.layers.Dropout(0.5)(encoded)

# Decoder: mirrors the encoder and maps back to the 133 input dimensions.
decoded=tf.keras.layers.Dense(1000, kernel_initializer='he_normal', bias_initializer='he_normal')(encoded)
decoded=tf.keras.layers.ELU()(decoded)
decoded=tf.keras.layers.BatchNormalization()(decoded)
decoded=tf.keras.layers.Dropout(0.5)(decoded)
decoded=tf.keras.layers.Dense(8000, kernel_initializer='he_normal', bias_initializer='he_normal')(decoded)
decoded=tf.keras.layers.ELU()(decoded)
decoded=tf.keras.layers.BatchNormalization()(decoded)
decoded=tf.keras.layers.Dropout(0.5)(decoded)
# NOTE(review): a sigmoid output can only reproduce values in (0, 1); this
# assumes the inputs are scaled to that range -- confirm.
decoded=tf.keras.layers.Dense(133, activation='sigmoid')(decoded)
#loss=WeightedBinaryCrossEntropy(0.036)
# Reconstruction objective: mean squared error between input and output.
autoencoder = tf.keras.models.Model(inputs, decoded)
autoencoder.compile(optimizer='nadam', loss='mean_squared_error')

In [None]:
# summary() prints the table itself and returns None, so it must not be
# wrapped in print() (that would emit an extra "None" line).
autoencoder.summary()

In [None]:
from time import time

# TensorBoard callback writing to a log directory named after the current
# timestamp, so every execution of this cell gets its own directory.
tensorboard=tf.keras.callbacks.TensorBoard(log_dir="logs/{}".format(time()))

In [None]:
# Train the autoencoder to reconstruct its own input inside a dedicated
# session and persist the result before the session is closed.
# NOTE(review): `combined_scaled` and `x2_scaled` are not defined in the
# visible part of this notebook -- confirm where they come from.
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    %time autoencoder.fit(combined_scaled, combined_scaled, epochs=10, batch_size=1000, shuffle=True, callbacks=[tensorboard], validation_data=(x2_scaled, x2_scaled))
    #save model
    # architecture as JSON
    autoencoder_json = autoencoder.to_json()
    with open("nn_autoencoder.json", "w") as json_file:
        json_file.write(autoencoder_json)
    # serialize weights to HDF5
    autoencoder.save_weights("nn_autoencoder.h5")

In [None]:
# Classifier: reuse the autoencoder's encoder and add a prediction head.
# Freeze the encoder. `trainable` has to be set on the layers themselves
# (setting it on the `encoded` output tensor has no effect), and before
# compile() so that it is taken into account.
# NOTE(review): freezing only pays off if the pretrained encoder weights are
# present in the session that trains `model` -- confirm.
encoder = tf.keras.models.Model(inputs=inputs, outputs=encoded)
for layer in encoder.layers:
    layer.trainable = False
x=tf.keras.layers.Dense(500, kernel_initializer='he_normal', bias_initializer='he_normal')(encoded)
x=tf.keras.layers.ELU()(x)
x=tf.keras.layers.BatchNormalization()(x)
x=tf.keras.layers.Dropout(0.5)(x)
predictions=tf.keras.layers.Dense(1, activation='sigmoid')(x)
model=tf.keras.models.Model(inputs=inputs, outputs=predictions)
# Class-weighted cross-entropy, parameterized with a positive ratio of 0.036.
loss=WeightedBinaryCrossEntropy(0.036)
model.compile(optimizer=tf.keras.optimizers.Nadam(),  
              loss=loss,
              #loss=pair_loss,
              #optimizer=tf.keras.optimizers.SGD(lr=0.03, momentum=0.9, nesterov=True),
              #metrics=['binary_accuracy']
             )


In [None]:
# summary() prints the table itself and returns None, so it must not be
# wrapped in print() (that would emit an extra "None" line).
model.summary()

In [None]:
from time import time

# Fresh TensorBoard callback with its own timestamped log directory, so the
# classifier run is logged separately from earlier runs.
tensorboard=tf.keras.callbacks.TensorBoard(log_dir="logs/{}".format(time()))

In [None]:
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    %time temp=model.fit (x1_scaled, y1, validation_data=(x2_scaled,y2), callbacks=[tensorboard], epochs=10, batch_size=100)
    %time prediction=model.predict(test_scaled)
    %time val_prediction=model.predict(x2_scaled)
    %time train_prediction=model.predict(x1_scaled)
    scores=model.evaluate(x2_scaled,y2,verbose=1)
    
    #save model
    model_json = model.to_json()
    with open("nn_model.json", "w") as json_file:
        json_file.write(model_json)
    # serialize weights to HDF5
    model.save_weights("nn_model.h5")

