In [None]:
import numpy as np
import pandas as pd
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout
from keras.callbacks import EarlyStopping
from keras import regularizers
import time
from datetime import datetime

In [None]:
# Fixed seed for the row shuffle below: without it every kernel restart yields a
# different row order, so any partition taken from that order changes between runs.
SEED = 42

traindata = pd.read_csv('../input/train.csv')
# Shuffle all rows once and renumber them 0..N-1.
traindata = traindata.sample(frac=1, random_state=SEED).reset_index(drop=True)
# Binary label column; the misspelled name `train_lable` is read by a later cell.
train_lable = traindata['target']
# Every remaining column except the label and the row identifier is a feature.
train_input = traindata.drop(['target', 'ID_code'], axis=1)
print("loading done")

In [None]:
def split(X, Y, splitFrac):
    """Partition X and Y row-wise into disjoint train, test and dev pieces.

    The rows are taken in their existing order (no shuffling happens here):

    * train: the first ``N - 2*n`` rows
    * test:  the next ``n`` rows
    * dev:   the last ``n`` rows

    where ``N = X.shape[0]`` and ``n = int(N * splitFrac)``.

    Earlier versions sliced everything with ``.tail()``, which made the test
    set identical to the dev set and left both inside the training set.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature rows.
    Y : pandas.Series or pandas.DataFrame
        Labels, positionally aligned with ``X``.
    splitFrac : float
        Fraction of rows given to dev and, separately, to test. Must lie in
        ``[0, 0.5]`` so that the three pieces can be disjoint.

    Returns
    -------
    tuple
        ``(Xtrain, Ytrain, Xdev, Ydev, Xtest, Ytest)``; original index labels
        are preserved.

    Raises
    ------
    ValueError
        If ``splitFrac`` is outside ``[0, 0.5]``.
    """
    if not 0 <= splitFrac <= 0.5:
        raise ValueError("splitFrac must be between 0 and 0.5, got {}".format(splitFrac))

    total = X.shape[0]
    n = int(total * splitFrac)
    trainEnd = total - 2 * n   # first row that no longer belongs to train
    testEnd = total - n        # first row of the dev piece

    # Positional slices so the three pieces can never overlap.
    Xtrain, Ytrain = X.iloc[:trainEnd], Y.iloc[:trainEnd]
    Xtest, Ytest = X.iloc[trainEnd:testEnd], Y.iloc[trainEnd:testEnd]
    Xdev, Ydev = X.iloc[testEnd:], Y.iloc[testEnd:]

    return Xtrain, Ytrain, Xdev, Ydev, Xtest, Ytest

In [None]:
def augument(X, Y):
    """Balance a binary data set by repeating minority-class rows, then shuffle.

    The class with fewer rows (label 0 is treated as the minority on a tie)
    is replicated until ``max(nCount, pCount)`` copies of its rows have been
    added to the rows of the other class. Only the minority replicas and the
    untouched majority rows are returned, so both classes end up with the
    same number of rows.

    (The function name is a misspelling of "augment"; it is kept because
    other cells call it by this name.)

    Parameters
    ----------
    X : pandas.DataFrame
        Feature rows.
    Y : pandas.Series
        Labels (0 or 1) sharing ``X``'s index.

    Returns
    -------
    tuple
        ``(xTo, yTo)``: the shuffled, re-indexed features and the matching
        label Series named ``'target'``. The inputs are not modified.
    """
    nCount = Y[Y == 0].shape[0]
    pCount = Y[Y == 1].shape[0]

    if nCount > pCount:
        minorityLabel, majorityLabel = 1, 0
    else:
        minorityLabel, majorityLabel = 0, 1
    addCount = max(nCount, pCount)    # minority rows wanted in the result
    batchCount = min(nCount, pCount)  # minority rows available per copy

    xFrom = X[Y == minorityLabel]
    yFrom = Y[Y == minorityLabel]
    xParts = [X[Y == majorityLabel]]
    yParts = [Y[Y == majorityLabel]]

    # With no minority rows there is nothing to replicate; the former
    # `while addCount > 0` loop never terminated in that situation.
    if batchCount > 0:
        fullCopies, remainder = divmod(addCount, batchCount)
        xParts += [xFrom] * fullCopies
        yParts += [yFrom] * fullCopies
        if remainder:
            xParts.append(xFrom[:remainder])
            yParts.append(yFrom[:remainder])

    # One concat instead of repeated DataFrame.append (removed in pandas 2.0
    # and quadratic when used in a loop).
    xTo = pd.concat(xParts, ignore_index=True)
    yTo = pd.concat(yParts, ignore_index=True).rename('target')

    # Shuffle the features and carry the labels along through the shuffled
    # index, so no temporary 'target' column is written into the features.
    xTo = xTo.sample(frac=1)
    yTo = yTo.loc[xTo.index].reset_index(drop=True)
    xTo = xTo.reset_index(drop=True)
    return xTo, yTo


In [None]:
def normalize(df, withGiven=False, meanIn = 0, varIn = 0):
    """Centre and scale every column, then add three derived features per column.

    For each original column ``f`` the result contains, in this order:

    * ``f``        : ``(df[f] - mean_f) / var_f``
    * ``sq_f``     : the scaled value squared
    * ``sqrt_f``   : square root of the absolute scaled value
    * ``qube_f``   : cube of the absolute scaled value

    Statistics are computed per column (``axis=0``). Earlier versions used
    ``axis=1``, i.e. per-row statistics, and then applied the statistic of
    row ``i`` to column ``i``.

    Parameters
    ----------
    df : pandas.DataFrame
        Numeric features with string column names.
    withGiven : bool
        When True, reuse ``meanIn``/``varIn`` instead of computing statistics
        from ``df`` (e.g. to apply training statistics to other data).
    meanIn, varIn : sequence
        Per-column mean and variance, matched to ``df``'s columns by
        position. Only read when ``withGiven`` is True.

    Returns
    -------
    tuple
        ``(newDf, mean, var)`` where ``mean`` and ``var`` are the statistics
        that were used, in the form they were computed or supplied.
    """
    orig_columns = df.columns
    mean = meanIn if withGiven else df.mean(axis=0)
    var = varIn if withGiven else df.var(axis=0)

    # Plain positional arrays: avoids label-vs-position ambiguity of Series[int]
    # and accepts a Series, list or ndarray for the supplied statistics.
    meanArr = np.asarray(mean, dtype=float)
    varArr = np.asarray(var, dtype=float)
    # A constant column has zero variance; divide by 1 instead so it becomes
    # all zeros rather than NaN/inf.
    scale = np.where(varArr == 0, 1.0, varArr)
    # NOTE(review): the divisor is the variance, not the standard deviation;
    # kept as is because the returned `var` is part of the contract - confirm intent.

    # Collect all columns first and build the frame once; inserting columns one
    # at a time into a growing DataFrame fragments it.
    features = {}
    for i, f in enumerate(orig_columns):
        scaled = (df[f] - meanArr[i]) / scale[i]
        features[f] = scaled
        features['sq_'+f] = np.power(scaled, 2)
        features['sqrt_'+f] = np.power(np.abs(scaled), 0.5)
        features['qube_'+f] = np.power(np.abs(scaled), 3)
    newDf = pd.DataFrame(features, index=df.index)
    return newDf, mean, var

def calcWeights(Y):
    """Return per-class weights that up-weight the rarer label.

    The more frequent class gets weight 1.0 and the other class gets the
    ratio (frequent count / rare count). When label 0 is not strictly more
    frequent than label 1, label 1 is the one that receives weight 1.0.

    Y is a label container supporting boolean-mask indexing (e.g. a pandas
    Series); the result is a dict keyed by the labels 0 and 1.
    """
    negatives = Y[Y == 0].shape[0]
    positives = Y[Y == 1].shape[0]
    if negatives <= positives:
        return {0: positives / negatives, 1: 1.0}
    return {0: 1.0, 1: negatives / positives}

In [None]:
# Expand the raw features with normalize(); `mean` and `var` are kept so the
# same statistics can be re-applied to the submission data in a later cell.
# NOTE(review): the statistics are computed on all rows, before the split below,
# so the dev/test rows contribute to them.
X, mean, var = normalize(train_input)
Y = train_lable

# 0.01 is the splitFrac argument of split(); the result is unpacked in the
# order (train, dev, test), each as a features/labels pair.
Xtrain, Ytrain, Xdev, Ydev,Xtest, Ytest = split(X, Y,0.01)
# Rebalance only the training piece; dev and test keep their class ratio.
Xtrain, Ytrain = augument(Xtrain, Ytrain)

# Class weighting is disabled; the training set is rebalanced by augument() instead.
#weights = calcWeights(Ytrain)
print("data preparation done")
# Threshold read by fitModel(): a regularisation strength or dropout rate at or
# below this value is treated as "off".
epsilon = 0.0001

In [None]:
def fitModel(layer_dims, Xt, Yt, Xd, Yd, e, b, rl1, rl2, dr0, dr1):
    """Build, train and time a fully connected binary classifier.

    Every entry of ``layer_dims`` becomes one Dense layer; all layers use
    ReLU except the last, which uses a sigmoid. Training uses Adam with
    binary cross-entropy and stops early when ``val_loss`` has not improved
    for 20 epochs (best weights are restored by the callback).

    Parameters
    ----------
    layer_dims : sequence of int
        Units per Dense layer, input side first; the last entry is the output layer.
    Xt, Yt : training features and labels.
    Xd, Yd : validation features and labels (passed as ``validation_data``).
    e : int
        Maximum number of epochs.
    b : int
        Batch size.
    rl1 : float
        L1 strength, applied as the *activity* regulariser of every layer.
    rl2 : float
        L2 strength, applied as the *kernel* regulariser of every layer.
    dr0 : float
        Dropout rate after the first layer.
    dr1 : float
        Dropout rate after every later hidden layer.

    A strength or rate not greater than the module-level ``epsilon`` is
    treated as disabled.

    Returns
    -------
    tuple
        ``(model, hist, elapsed_ms)``: the trained model, the Keras History
        object, and the wall-clock training time in milliseconds.
    """
    model = Sequential()
    lastLayer = len(layer_dims) - 1
    for i in range(len(layer_dims)):
        isOutput = (i == lastLayer)
        act = 'sigmoid' if isOutput else 'relu'
        kr11 = regularizers.l1(rl1) if rl1 > epsilon else None
        kr12 = regularizers.l2(rl2) if rl2 > epsilon else None
        if i == 0:
            # Input width comes from the data actually passed in, not from a global.
            model.add(Dense(layer_dims[i], activation=act, input_dim=Xt.shape[1], kernel_regularizer=kr12, activity_regularizer=kr11))
            rate = dr0
        else:
            model.add(Dense(layer_dims[i], activation=act, kernel_regularizer=kr12, activity_regularizer=kr11))
            rate = dr1
        # Never put Dropout behind the output layer: it would randomly zero the
        # predicted probability during training.
        if rate > epsilon and not isOutput:
            model.add(Dropout(rate))

    es = EarlyStopping(monitor='val_loss', patience=20, restore_best_weights=True)
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

    start = time.time()
    hist = model.fit(Xt, Yt, epochs=e, batch_size=b, verbose=1, shuffle=True, validation_data=(Xd,Yd), callbacks = [es])#, class_weight=weights)
    end = time.time()

    # Depending on the Keras version the accuracy metric is recorded as
    # 'acc' or 'accuracy'; accept either spelling.
    accKey = 'acc' if 'acc' in hist.history else 'accuracy'
    val_acc = hist.history['val_' + accKey][-1] #last element
    print("(",e,b,")", "took", int(end-start), "sec, acc", val_acc, "and loss acc hist ", hist.history[accKey])
    return (model, hist, (end-start)*1000)


In [None]:
# Sanity check: shapes of every piece, then the number of label-0 rows and the
# number of label-1 rows in the training and dev labels.
print(Xtrain.shape, Ytrain.shape, Xdev.shape, Ydev.shape, Xtest.shape, Ytest.shape)
print((Ytrain == 0).sum(), (Ydev == 0).sum())
print((Ytrain == 1).sum(), (Ydev == 1).sum())

In [None]:
# Upper bound on training epochs for every configuration below.
epochs = 40
# One tuple per model to train, in fitModel() argument order:
# (layer_dims, epochs, batch size, rl1, rl2, dr0, dr1).
params = [([1024, 512, 64, 8, 1], epochs, 300, 0.00, 0.00, 0.0, 0.0)]
# `hist` collects what fitModel() returns for each configuration:
# (model, Keras History, elapsed milliseconds).
hist = []
for (ldims, e, b, rl1, rl2, dr0, dr1) in params:    
    hist.append(fitModel(ldims, Xtrain, Ytrain, Xdev, Ydev, e, b, rl1, rl2, dr0, dr1))
print(hist)

In [None]:
def calcNewStats(ht):
    """Compute confusion-matrix counts and the F1 score for binary predictions.

    Parameters
    ----------
    ht : pandas.DataFrame
        Must contain column ``'t'`` (predicted label, 0 or 1) and column
        ``'o'`` (original/true label).

    Returns
    -------
    tuple
        ``(fp, fn, tp, tn, pOrig, nOrig, pTarg, nTarg, f1)`` where
        ``pOrig``/``nOrig`` count true positives/negatives in the data,
        ``pTarg``/``nTarg`` count predicted positives/negatives, and ``f1``
        is ``2 * precision * recall / (precision + recall)`` (0 when undefined).
    """
    fn = (ht[(ht['t'] == 0) & (ht['o'] == 1)]).shape[0]
    tn = (ht[(ht['t'] == 0) & (ht['o'] != 1)]).shape[0]
    fp = (ht[(ht['t'] == 1) & (ht['o'] != 1)]).shape[0]
    tp = (ht[(ht['t'] == 1) & (ht['o'] == 1)]).shape[0]
    pOrig = ht[(ht['o'] == 1)].shape[0]
    nOrig = ht[(ht['o'] == 0)].shape[0]
    pTarg = ht[(ht['t'] == 1)].shape[0]
    nTarg = ht[(ht['t'] == 0)].shape[0]

    prec = 0 if (tp + fp == 0) else tp / (tp + fp)
    recl = 0 if (pOrig == 0) else tp / pOrig
    # Harmonic mean of precision and recall; the factor 2 was previously
    # missing, which halved every reported score.
    f1 = 0 if (prec + recl == 0) else (2 * prec * recl) / (prec + recl)
    return(fp, fn, tp, tn, pOrig, nOrig, pTarg, nTarg, f1)

In [None]:
def predictAndPrintStats(model, X, Y):
    """Score X with the model and summarise the predictions against Y.

    Despite the name nothing is printed here; the caller receives a pair:
    a DataFrame with columns 'p' (raw model output), 't' (1 where the raw
    output exceeds 0.5, else 0) and 'o' (the labels from Y), plus the tuple
    produced by calcNewStats for that DataFrame.
    """
    rawScores = model.predict(X, batch_size=300, steps=None)
    hardLabels = (rawScores > 0.5)*1
    table = pd.DataFrame(
        np.column_stack([rawScores, hardLabels, Y]),
        columns=['p','t','o'],
    )
    return (table, calcNewStats(table))

In [None]:
# Evaluate every trained model on the train, dev and test pieces.
# hist[i][0] is the model; predictAndPrintStats() returns (frame, stats), so
# index [1] selects the stats tuple
# (fp, fn, tp, tn, pOrig, nOrig, pTarg, nTarg, f1) that is printed per piece.
for i in range(len(hist)):
    model = hist[i][0] 
    res1 = predictAndPrintStats(model, Xtrain, Ytrain)
    res2 = predictAndPrintStats(model, Xdev, Ydev)
    res3 = predictAndPrintStats(model, Xtest, Ytest)
    print(i, "train", res1[1], "dev", res2[1], "test", res3[1])


In [None]:
import matplotlib.pyplot as plt
# Solid line = training accuracy, dashed line = validation accuracy; one colour per run.
clrs1=['b-', 'g-', 'r-', 'c-', 'm-', 'y-', 'k-', 'w-']
clrs2=['b--','g--','r--','c--','m--','y--','k--','w--']
bestModel = None
bestAcc = 0
for i in range(len(hist)):
    history = hist[i][1].history
    # Depending on the Keras version the accuracy metric is recorded as
    # 'acc' or 'accuracy'; accept either spelling.
    accKey = 'acc' if 'acc' in history else 'accuracy'
    # Wrap around the colour tables so more than eight runs do not raise IndexError.
    plt.plot(history[accKey], clrs1[i % len(clrs1)], label="run {} train".format(i))
    plt.plot(history['val_' + accKey], clrs2[i % len(clrs2)], label="run {} val".format(i))
    # Rank runs by their last recorded validation accuracy. hist[i][2] is the
    # training time in milliseconds, so comparing it picked the slowest run.
    valAcc = history['val_' + accKey][-1]
    if bestModel is None or bestAcc < valAcc:
        bestModel = hist[i][0]
        bestAcc = valAcc
plt.xlabel("epoch")
plt.ylabel("accuracy")
plt.legend();

In [None]:
# Score the competition test file with the selected model and write a submission CSV.
print("ready to test")
test_input = pd.read_csv('../input/test.csv')
test_input = test_input.drop(['ID_code'], axis=1)
# Re-apply the statistics computed on the training data (withGiven=True) rather
# than recomputing them on the test data; [0] keeps only the feature frame.
Xsub = normalize(test_input, True, mean, var)[0]
# Hard 0/1 labels: model output thresholded at 0.5.
targets = (bestModel.predict(Xsub, batch_size=300, steps=None) > 0.5)*1
print("saving output")
# NOTE(review): assumes the rows of sample_submission.csv are in the same order
# as test.csv, since predictions are assigned positionally - confirm.
toSubmit = pd.read_csv('../input/sample_submission.csv')
toSubmit['target'] = targets
# File name carries a yymmddHHMM timestamp so earlier submissions are not overwritten.
filename = "sub-{:%y%m%d%H%M}.csv".format(datetime.now())
toSubmit.to_csv(filename, index=False)
print("saved", filename)