Change Log:
1. No expm1
2. No log1p
3. Only +ve dependent vars values
6. Added segment and country embeddings
7. Added MinMaxScaler instead of Standard Scaler
8. Added RMSE calculations at the end
9. Removed last dropout
15. Scaling X_test same as X_train
16. Connecting Embeddings properly
17. More categorical cols as embeddings
18. New Architecture
19. Same loss weights but longer value branch

In [71]:
from keras.models import Sequential, Model, load_model
from keras.layers import Input, Dense, Activation, Reshape, Dropout, PReLU, Concatenate, concatenate, multiply
from keras.layers.normalization import BatchNormalization
from keras.layers.embeddings import Embedding
from keras.optimizers import Adam, Nadam
from keras.callbacks import ReduceLROnPlateau, EarlyStopping, CSVLogger, ModelCheckpoint, LearningRateScheduler
from keras.constraints import nonneg

import pickle
import sys
import pandas as pd
import h5py
from math import sqrt
import numpy
import numpy as np
from sklearn.preprocessing import MinMaxScaler, StandardScaler, Normalizer, OneHotEncoder
from sklearn.metrics import mean_squared_error, roc_curve, confusion_matrix, accuracy_score

numpy.random.seed(321)

In [72]:
# Importing Data
df = pd.read_csv("../data/train.csv", sep=",")

In [73]:
# Importing Data
df_test = pd.read_csv("../data/test.csv", sep=",")

In [83]:
model = load_model('../results/BestModel1.h5')

In [75]:
# Sanity Checks
print("df-",df.shape)
print("df-",df.columns)

print(df.columns[0:2])
print(df.columns[2:])

df- (200000, 202)
df- Index(['ID_code', 'target', 'var_0', 'var_1', 'var_2', 'var_3', 'var_4',
       'var_5', 'var_6', 'var_7',
       ...
       'var_190', 'var_191', 'var_192', 'var_193', 'var_194', 'var_195',
       'var_196', 'var_197', 'var_198', 'var_199'],
      dtype='object', length=202)
Index(['ID_code', 'target'], dtype='object')
Index(['var_0', 'var_1', 'var_2', 'var_3', 'var_4', 'var_5', 'var_6', 'var_7',
       'var_8', 'var_9',
       ...
       'var_190', 'var_191', 'var_192', 'var_193', 'var_194', 'var_195',
       'var_196', 'var_197', 'var_198', 'var_199'],
      dtype='object', length=200)


In [76]:
X = df[df.columns[2:]].astype(float).values
y_clf = np.array(df['target']).astype(float)

X_test = df_test[df_test.columns[1:]].astype(float).values

In [77]:
X_backup = X

In [78]:
scaler = MinMaxScaler(feature_range=(0, 1))
X = scaler.fit_transform(X)

scaler_x_test = MinMaxScaler(feature_range=(0, 1))
X_test = scaler.fit_transform(X_test)

In [79]:
y_clf

array([0., 0., 0., ..., 0., 0., 0.])

In [80]:
print(X.shape)
print(y_clf.shape)

print(X_test.shape)

(200000, 200)
(200000,)
(200000, 200)


In [81]:
dropout_value = 0.2

In [82]:
input_model = Input(shape = (X.shape[1],))

output_model = Dense(512, activation='relu')(input_model)
output_model = Dense(256, activation='relu')(output_model)
output_model = Dense(128, activation='relu')(output_model)
output_model = Dense(1, activation='sigmoid')(output_model)

model = Model(inputs=input_model, 
              outputs=output_model)

lr1 = Adam(lr=0.001)

model.compile(loss='binary_crossentropy',
              optimizer=lr1,
              metrics = ['binary_accuracy'])

MCP = ModelCheckpoint(filepath = "../results/BestModel1.h5", monitor='val_loss', verbose=0, save_best_only=True, 
                      save_weights_only=False, mode='auto', period=1)

RLROP = ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=5, verbose=1, 
                          mode='auto', min_delta=0.00001, cooldown=1, min_lr=0)

CSVL = CSVLogger(filename = "../results/LogFile1.txt", separator=',', append=False)

In [27]:
epochs = 100 # 20

model.fit(x = X, 
          y = y_clf,
          validation_split = 0.3,
          epochs=epochs, 
          batch_size= 1024,
          callbacks = [MCP, RLROP, CSVL],
          verbose = 1,
          shuffle = True)

Train on 140000 samples, validate on 60000 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100

Epoch 00008: ReduceLROnPlateau reducing learning rate to 0.00010000000474974513.
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100

Epoch 00040: ReduceLROnPlateau reducing learning rate to 1.0000000474974514e-05.
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100

Epoch 00047: ReduceLROnPlateau reducing learning rate to 1.0000000656873453e-06.
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100

Epoch 00052: ReduceLROnPlateau redu

Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100

Epoch 00097: ReduceLROnPlateau reducing learning rate to 1.0000001095066122e-16.
Epoch 98/100
Epoch 99/100
Epoch 100/100


<keras.callbacks.History at 0x16b014912e8>

In [40]:
epochs = 100 # 20

model.fit(x = X, 
          y = y_clf,
          validation_split = 0.3,
          epochs=epochs, 
          batch_size= 1024,
          callbacks = [MCP, RLROP, CSVL],
          verbose = 1,
          shuffle = True)

Train on 140000 samples, validate on 60000 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100

Epoch 00006: ReduceLROnPlateau reducing learning rate to 1.0000000656873453e-06.
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100

Epoch 00011: ReduceLROnPlateau reducing learning rate to 1.0000001111620805e-07.
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100

Epoch 00016: ReduceLROnPlateau reducing learning rate to 1.000000082740371e-08.
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100

Epoch 00021: ReduceLROnPlateau reducing learning rate to 1.000000082740371e-09.
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100

Epoch 00026: ReduceLROnPlateau reducing learning rate to 1.000000082740371e-10.
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100

Epoch 00031: ReduceLROnPlateau reducing learning rate to 1.000000082740371e-11.
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100

E

Epoch 90/100
Epoch 91/100

Epoch 00091: ReduceLROnPlateau reducing learning rate to 1.0000000944832675e-23.
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100

Epoch 00096: ReduceLROnPlateau reducing learning rate to 1.0000000787060494e-24.
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


<keras.callbacks.History at 0x16b025c2da0>

In [84]:
# On Training Data
model.evaluate(x = X, 
               y = y_clf,
              batch_size = 1024)



[0.22326479040145875, 0.91733]

In [85]:
best_model = load_model("../results/BestModel1.h5")

In [86]:
# On Training Data
best_model.evaluate(x = X, 
               y = y_clf,
              batch_size = 1024)



[0.22326479040145875, 0.91733]

In [87]:
old_model = model
model = best_model

In [90]:
y_preds = model.predict(X, batch_size = 1024)

y_test_preds = model.predict(X_test, batch_size = 1024)

In [91]:
y_preds.shape

(200000, 1)

In [92]:
(y_clf,y_preds.reshape(y_preds.shape[0],))

(array([0., 0., 0., ..., 0., 0., 0.]),
 array([0.02684054, 0.7223495 , 0.04017243, ..., 0.09523162, 0.02365717,
        0.00561482], dtype=float32))

In [93]:
from sklearn.metrics import roc_auc_score

roc_auc_score(y_clf,y_preds.reshape(y_preds.shape[0],))

0.8720867789797286

In [94]:
from keras.models import load_model

old_model.save('../results/Trial1.h5')

In [95]:
# How to threshold for an imbalanced problem
def Find_Optimal_Cutoff(target, predicted):
    """ Find the optimal probability cutoff point for a classification model related to event rate
    Parameters
    ----------
    target : Matrix with dependent or target data, where rows are observations

    predicted : Matrix with predicted data, where rows are observations

    Returns
    -------     
    list type, with optimal cutoff value

    """
    fpr, tpr, threshold = roc_curve(target, predicted)
    i = np.arange(len(tpr)) 
    roc = pd.DataFrame({'tf' : pd.Series(tpr-(1-fpr), index=i), 'threshold' : pd.Series(threshold, index=i)})
    roc_t = roc.iloc[(roc.tf-0).abs().argsort()[:1]]

    return list(roc_t['threshold']) 

In [97]:
threshold = Find_Optimal_Cutoff(y_clf, y_preds)[0]
threshold

0.09574279189109802

In [98]:
y_p = np.where(y_preds>threshold,1,0).reshape(-1)
y_test_p = np.where(y_test_preds>threshold,1,0).reshape(-1)

In [99]:
confusion_matrix(y_clf, y_p)

array([[142280,  37622],
       [  4204,  15894]], dtype=int64)

In [100]:
np.array(y_clf == y_p).mean()

0.79087

In [101]:
accuracy_score(y_clf,y_p, normalize = False)

158174

In [104]:
Train_results = pd.DataFrame({'ID_code' : df['ID_code'].tolist(),'actual' : y_clf,'pred' : y_preds[:,0]})

Train_results.to_csv("../results/TrainResults_Trial1.csv", index = False)

Test_results = pd.DataFrame({'ID_code' : df_test['ID_code'].tolist(),'target' : y_test_p})

Test_results.to_csv("../results/TestResults_Trial1.csv", index = False)

In [105]:
Train_results.columns

Index(['ID_code', 'actual', 'pred'], dtype='object')

In [106]:
Test_results.columns

Index(['ID_code', 'target'], dtype='object')

In [108]:
Train_results.head()

Unnamed: 0,ID_code,actual,pred
0,train_0,0.0,0.026841
1,train_1,0.0,0.72235
2,train_2,0.0,0.040172
3,train_3,0.0,0.244595
4,train_4,0.0,0.08465


In [107]:
Test_results.head()

Unnamed: 0,ID_code,target
0,test_0,1
1,test_1,1
2,test_2,0
3,test_3,1
4,test_4,0
