In [None]:
import numpy as np
import pandas as pd
import os
import keras
import tensorflow as tf
from sklearn.metrics import roc_auc_score, roc_curve, auc, accuracy_score
import gc
from sklearn.model_selection import StratifiedKFold, train_test_split
import matplotlib.pyplot as plt
from time import time, sleep
import seaborn as sns
from IPython.display import display, clear_output
# Make sure automatic garbage collection is switched on.
gc.enable()
# Apply seaborn's default theme to the matplotlib figures drawn below.
sns.set()

In [None]:
def trainingPlot(hh):
    """Plot the train and validation curve of every metric in a training history.

    Parameters
    ----------
    hh : dict
        Maps a metric name to its per-epoch values (``history.history``).
        The validation series of a metric is looked up under the metric's
        name prefixed with ``"val_"``.

    One subplot is drawn per training metric. Series are paired by name and
    not by their position in the dict, so the plot stays correct regardless
    of the order in which the keys were inserted. A metric without a
    validation series is plotted on its own.
    """
    train_keys = [key for key in hh.keys() if not key.startswith("val_")]
    if not train_keys:
        # Nothing to draw; avoid opening an empty figure.
        return

    fig = plt.figure(figsize=(20,9))
    for position, key in enumerate(train_keys, start=1):
        fig.add_subplot(1, len(train_keys), position)
        plt.plot(hh[key], label="train")
        valid_key = "val_" + key
        if valid_key in hh:
            plt.plot(hh[valid_key], label="valid")
        plt.xlabel("epoch")
        plt.ylabel(key)
        plt.legend()
        plt.grid(True)
    plt.show()

## Load the preencoded data based on https://www.kaggle.com/bogorodvo/lightgbm-baseline-model-using-sparse-matrix
Because of memory limits and the training and prediction time, we will use just the first 2.5M rows of the data.

In [None]:
%%time
#%% Load the preencoded data based on https://www.kaggle.com/bogorodvo/lightgbm-baseline-model-using-sparse-matrix
train_raw = pd.read_pickle("../input/forked-from-lightgbm-to-get-all-as-category/train.pkl")
# The test set is loaded later, right before the leaderboard prediction.

#%% Feature selection based on https://www.kaggle.com/jiegeng94/everyone-do-this-at-the-beginning and others.
used_cols = [i for i in train_raw.columns if i not in ["MachineIdentifier", "HasDetections",
                                                       "PuaMode", "Census_ProcessorClass",  # mostly missing
                                                       "Census_IsWIMBootEnabled","IsBeta",
                                                       "Census_IsFlightsDisabled","Census_IsFlightingInternal",
                                                       "AutoSampleOptIn","Census_ThresholdOptIn",
                                                       "SMode","Census_IsPortableOperatingSystem",
                                                       "Census_DeviceFamily","UacLuaenable", "Census_IsVirtualDevice",  # too skewed columns
                                                       "Census_OSSkuName",    # highly-correlated features
                                                       "Processor", "Census_OSInstallLanguageIdentifier", "train"]]

# Keep only the first `chunk_size` rows. Rows are taken by position for both
# the features and the target so the two can never get out of step, and the
# truncation happens before the encoding so only the kept rows are processed.
chunk_size = int(2.5e6)
y = train_raw["HasDetections"].iloc[:chunk_size]
# .copy() gives an independent frame, so the column assignments below do not
# trigger pandas' chained-assignment warning.
X = train_raw.iloc[:chunk_size][used_cols].copy()
del train_raw

# encoding_map: column -> {category value: integer code}; reused for the test set.
# embedded_layer_parameters: column -> vocabulary size of its Embedding layer.
encoding_map = {}
embedded_layer_parameters = {}
for col in used_cols:
    key_map = {i:n for n,i in enumerate(X[col].cat.categories)}
    encoding_map[col] = key_map
    X[col] = X[col].map(key_map).astype("category")
    # Codes run from 0 to len(key_map)-1, so the embedding needs exactly
    # len(key_map) rows. Counting only the values observed in the training
    # rows would be too small whenever a category is unused there.
    embedded_layer_parameters[col] = len(key_map)

gc.collect()

### Unfortunately, Keras doesn't provide ROC AUC as a built-in metric. Therefore I use the following two functions as a workaround.

In [None]:
#%% Some helper function
def my_metric_func(y_true, y_pred):
    """Return the ROC AUC of ``y_pred`` against ``y_true``, or 0.5 if undefined.

    ``roc_auc_score`` raises ``ValueError`` when the score cannot be computed,
    e.g. when ``y_true`` contains a single class only. In that case the
    chance-level value 0.5 is returned. Any other exception is a genuine
    error and is allowed to propagate.
    """
    try:
        return roc_auc_score(y_true, y_pred)
    except ValueError:
        return 0.5

def auroc(y_true, y_pred):
    """Keras-compatible metric that evaluates ``my_metric_func`` via ``tf.py_func``."""
    metric_inputs = (y_true, y_pred)
    return tf.py_func(my_metric_func, metric_inputs, tf.double)

### The following function creates a NN model with ```keras.layers.Embedding``` for all categorical features, followed by two blocks of ```Dense```, ```Dropout``` and ```BatchNormalization``` layers. The ```use_in_prediction``` parameter controls the dropout rate while predicting: ```use_in_prediction = False``` --> dropout rate == 0.0 | ```use_in_prediction = True``` --> dropout rate != 0.0

In [None]:
#%% Model creating based on https://www.kaggle.com/learn/embeddings
def create_model(embedded_layer_parameters, use_in_prediction=True,
                 hidden_units=(500, 500), dpo_values=(0.8, 0.8), embedding_size=4):
    """Build and compile the embedding network.

    Every entry of ``embedded_layer_parameters`` gets its own scalar input and
    an ``Embedding`` layer. The embeddings are concatenated, flattened and fed
    through one Dense -> Dropout -> BatchNormalization block per entry of
    ``hidden_units``, followed by a single sigmoid output unit.

    Parameters
    ----------
    embedded_layer_parameters : dict
        Maps an input (column) name to the ``input_dim`` of its embedding.
    use_in_prediction : bool
        Forwarded as the ``training`` argument of every Dropout call.
    hidden_units : sequence of int
        Width of each Dense layer.
    dpo_values : sequence of float
        Dropout rate following each Dense layer; same length as ``hidden_units``.
    embedding_size : int
        ``output_dim`` shared by all embedding layers.

    Returns
    -------
    A compiled ``keras.Model`` (binary cross-entropy loss, Adam, ``auroc`` metric).

    Raises
    ------
    ValueError
        If ``hidden_units`` and ``dpo_values`` differ in length.
    """
    # zip() would silently drop the surplus entries of the longer sequence.
    if len(hidden_units) != len(dpo_values):
        raise ValueError("hidden_units and dpo_values must have the same length")

    embedded = []
    inputs = []

    for col, n_categories in embedded_layer_parameters.items():
        input_layer = keras.Input(shape=(1,), name=col)
        embedded_layer = keras.layers.Embedding(input_dim=n_categories,
                                                output_dim=embedding_size,
                                                input_length=1, name=f"{col}_emb")(input_layer)

        inputs.append(input_layer)
        embedded.append(embedded_layer)

    concatenated = keras.layers.Concatenate()(embedded)
    out = keras.layers.Flatten()(concatenated)

    for n_hidden, dpo_val in zip(hidden_units, dpo_values):
        out = keras.layers.Dense(n_hidden, activation='relu')(out)
        out = keras.layers.Dropout(dpo_val)(out, training=use_in_prediction)
        out = keras.layers.BatchNormalization()(out)

    out = keras.layers.Dense(1, activation='sigmoid')(out)

    model = keras.Model(inputs=inputs, outputs=out)
    adam = keras.optimizers.Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=None, decay=0.0, amsgrad=False)
    model.compile(loss='binary_crossentropy', optimizer=adam, metrics=[auroc])
    return model

### Now we have to know the indices of the dropout layers.

In [None]:
model = create_model(embedded_layer_parameters, use_in_prediction=True)

# Print the position of every Dropout layer. The layer type is checked
# directly instead of searching the layer's string representation.
for n, layer in enumerate(model.layers):
    if isinstance(layer, keras.layers.Dropout):
        print(n, layer)

## Control the dropout-range while training
### The idea behind the ```dropout_control``` callback class is to add more variance to the model. Before every batch, a new dropout rate is drawn at random from a specified range.

In [None]:
class dropout_control(keras.callbacks.Callback):
    """Callback that re-samples the rate of every Dropout layer before each batch.

    Parameters
    ----------
    rate_min, rate_max : float
        Bounds of the uniform distribution the rates are drawn from.
    """

    def __init__(self, rate_min=0.3, rate_max=1):
        super(dropout_control, self).__init__()
        self.rate_min = rate_min
        self.rate_max = rate_max

    def on_batch_begin(self, batch, logs=None):
        """Assign a freshly drawn rate to each Dropout layer of the model."""
        # The layers are located by type, so the callback keeps working when
        # the number of inputs or hidden blocks (and thus the indices) changes.
        # NOTE(review): this only sets the Python attribute `rate`; whether an
        # already built graph picks the new value up depends on the backend --
        # TODO confirm.
        for layer in self.model.layers:
            if isinstance(layer, keras.layers.Dropout):
                layer.rate = np.random.uniform(self.rate_min, self.rate_max, 1)[0]

## Training
### The large size of the NN should lead to more bias, and the stochastic dropout range while predicting should add more variance to the model.

In [None]:
# Hold out 33% of the rows for the bagging experiment; the seed is fixed so
# the split is reproducible.
X, X_test, y, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

In [None]:
# Train with dropout active and its rate re-drawn from [0.7, 0.95) before every batch.
model = create_model(embedded_layer_parameters, use_in_prediction=True)
dpo = dropout_control(0.7, 0.95)
history = model.fit(
    [X[name].values for name in X.columns],  # one array per named model input
    y,
    batch_size=100000,
    epochs=100,
    verbose=0,
    callbacks=[dpo],
    validation_split=0.01,
)
model.save('my_model.h5')
trainingPlot(history.history)

## Testing
### Now let's create some different models out of the trained model. 
>  ```model_normal``` with ```dropout_rate = 0.0```  <br>
> ```model_dpo_on``` with ```dropout_rate = 0.8``` <br>
> ```dpo_with_range``` with ```dropout_rate_range = 0.0 - 0.9```

In [None]:
# Dropout switched off: behaves like a conventional, deterministic network.
model_normal = create_model(embedded_layer_parameters, use_in_prediction=False)
model_normal.load_weights("my_model.h5")

# Dropout kept active while predicting.
model_dpo_on = create_model(embedded_layer_parameters, use_in_prediction=True)
model_dpo_on.load_weights("my_model.h5")

# "dpo_on" and "dpo_with_range" deliberately refer to the same model object.
model_bagg = {
    "normal": model_normal,
    "dpo_on": model_dpo_on,
    "dpo_with_range": model_dpo_on,
}

In [None]:
# Turn the hold-out frame into one array per model input. The type check
# makes the cell safe to re-run: once X_test is a list it is left untouched.
if isinstance(X_test, pd.DataFrame):
    X_test = [X_test[col].values for col in X_test.columns]

In [None]:
%%time
# Compare the three prediction modes: after every additional forward pass the
# ROC AUC of the averaged prediction is recorded and finally plotted.
dpo = dropout_control(0, 0.9)
max_estimators = 200
fig = plt.figure(figsize=(20,9))
for name, model in model_bagg.items():

    # Without dropout every pass is identical, so two passes are enough to
    # draw the baseline.
    n_estimators = 2 if name == "normal" else max_estimators
    scores = []
    # Running sum of the predictions: the mean after n passes is sum / n, so
    # neither a (rows x estimators) matrix nor re-averaging is needed.
    prediction_sum = np.zeros(X_test[0].shape[0])

    for n in range(n_estimators):

        prediction_sum += model.predict(X_test, verbose=0, batch_size=100000)[:,0]
        if name == "dpo_with_range":
            # Draw a new rate for each Dropout layer, located by type.
            for layer in model.layers:
                if isinstance(layer, keras.layers.Dropout):
                    layer.rate = np.random.uniform(dpo.rate_min, dpo.rate_max, 1)[0]

        y_pred = prediction_sum / (n + 1)
        res = roc_auc_score(y_test, y_pred)
        print(f"\r{name} {n+1} | auroc: {res*100:.4f}%", flush=True, end="")
        scores.append(res*100)
    print("")
    # Spread the scores over 1..max_estimators so that the short baseline
    # spans the whole axis and the k-th score of a full run sits at x = k.
    x_ax = np.linspace(1, max_estimators, len(scores))
    plt.plot(x_ax, scores, "-", label=name)

plt.xlabel("estimators")
plt.ylabel("roc score")
plt.grid(True)
plt.legend()
plt.show()


## Conclusion
### The results show that it's possible to increase the accuracy of a NN by using the dropout layer while training as well as while predicting. The increase in accuracy in this simple example isn't very high, but I believe that with some tuning it's possible to get more out of the NN.

## Feedback
### This is basically my first kernel with the intent to share information. To improve my kernel skills I would be happy to get some feedback on the content, as well as on everything else you think is important, such as style or the English language.

In [None]:
# Drop the training and hold-out data before the test set is loaded.
del X
del X_test
del y
del y_test
gc.collect()

# Let's make a prediction for LB with 100 estimators

In [None]:
def chunker(seq, size):
    """Return a generator over consecutive slices of ``seq`` of length ``size``.

    The last slice is shorter when ``len(seq)`` is not a multiple of ``size``.
    """
    offsets = range(0, len(seq), size)
    return (seq[begin:begin + size] for begin in offsets)

In [None]:
%%time
chunk = int(2e6) #7853253
###################### Load, encode and transform test data ############################
test = pd.read_pickle("../input/forked-from-lightgbm-to-get-all-as-category/test.pkl")
test = test[used_cols]

# Apply the integer encoding that was built on the training data.
for col, key_map in encoding_map.items():
    print(f"\r{col}", flush=True, end="")
    test[col] = test[col].map(key_map).astype("category")

n_estimators = 100
predictions = np.zeros(test.shape[0])
model = model_bagg["dpo_with_range"]
print("Predicting.")
for m, chunk_df in enumerate(chunker(test, chunk)):
    print(f"\rchunk: {m}", flush=True, end="")
    # chunker() slices by position, so the target range in `predictions` is
    # derived from the chunk number instead of the frame's index labels.
    start = m * chunk
    end = start + len(chunk_df)

    x = [chunk_df[col].values for col in test.columns]
    gc.collect()

    # Accumulate the passes in a running sum; the mean over the estimators is
    # obtained without storing a (rows x estimators) matrix.
    chunk_sum = np.zeros(end - start)
    for j in range(n_estimators):
        chunk_sum += model.predict(x, verbose=0, batch_size=100000)[:,0]

    predictions[start:end] = chunk_sum / n_estimators

print("\nDone")
b = plt.hist(predictions, bins=200)

In [None]:
%%time
# Fill the sample submission with the averaged predictions and write it out.
my_submission = pd.read_csv(
    '../input/microsoft-malware-prediction/sample_submission.csv'
).assign(HasDetections=predictions)
my_submission.to_csv('submission.csv', index=False)
my_submission.head()