In [None]:
import gc
import pandas as pd 
import numpy as np
import tensorflow as tf
from sklearn.metrics import classification_report
import matplotlib.pyplot as plt

## Modifying our data for TensorFlow neural networks

The datasets we use are too big to pass directly into the neural networks, so we need to transform and preprocess the data before we can train our models efficiently.

Most of this code follows the [Load a pandas DataFrame](https://www.tensorflow.org/tutorials/load_data/pandas_dataframe#full_example) preprocessing guide.

In [None]:
# Read the training features and their labels; both files are stored as
# gzip-compressed pickles in the same Kaggle input dataset.
# NOTE(review): unpickling can execute arbitrary code -- only load trusted files.
X_train = pd.read_pickle(
    '/kaggle/input/amex-imputed-aggregate-data/X_train_agg.pkl',
    compression='gzip',
)
target = pd.read_pickle(
    '/kaggle/input/amex-imputed-aggregate-data/y_train_agg.pkl',
    compression='gzip',
)

In [None]:
# Partition the column names by type. A column counts as "binary" when its first
# three underscore-separated tokens, re-joined, match one of the prefixes below;
# every other column is treated as numeric.
binary_features = ['D_63_last', 'D_64_last']

binary_feature_names = []
numeric_feature_names = []
for col in X_train:
    prefix = '_'.join(col.split("_", 3)[:3])
    if prefix in binary_features:
        binary_feature_names.append(col)
    else:
        numeric_feature_names.append(col)

In [None]:
# Create one symbolic tf.keras.Input per DataFrame column, keyed by column name.
# Every input has shape=() (one scalar per example); binary columns are declared
# as int64 and all remaining columns as float32.
binary_name_set = set(binary_feature_names)
inputs = {
    name: tf.keras.Input(
        shape=(),
        name=name,
        dtype=tf.int64 if name in binary_name_set else tf.float32,
    )
    for name in X_train.columns
}

## Preprocessing inputs 

To conserve memory and train more efficiently, we pass each feature through a preprocessing pipeline. For our purposes, this essentially just converts every column to `tf.float32` and concatenates the results into a single tensor.

In [None]:
# Start the list of preprocessed tensors with the binary features: each binary
# input gets a trailing axis added and is cast to float32 so that it can later be
# concatenated with the numeric features.
preprocessed = [
    tf.cast(inputs[name][:, tf.newaxis], tf.float32)
    for name in binary_feature_names
]

In [None]:
def stack_dict(inputs, fun=tf.stack):
    """
    Cast every tensor in a dict to tf.float32 and combine them along the last axis.

    Parameters
    ----------
    inputs: dict{string: tf.keras.Input()}
        Python dictionary mapping each column name to its tf.keras.Input() object
    fun: function()
        Callable used to combine the list of cast tensors; it is invoked as
        ``fun(values, axis=-1)``. Default is tf.stack()

    Returns
    -------
    tf.Tensor
        Whatever ``fun`` returns; with the default tf.stack() this is a float32
        Tensor whose last axis follows the sorted order of the keys
    """
    # Sorting the keys makes each column's position independent of the order in
    # which the dictionary was populated.
    ordered_keys = sorted(inputs.keys())
    as_float = [tf.cast(inputs[key], tf.float32) for key in ordered_keys]
    return fun(as_float, axis=-1)

In [None]:
# Collect the Input objects of the numeric columns, stack them into a single
# float32 tensor, and add that tensor after the binary features.
# NOTE: this cell appends to `preprocessed`, so re-running it without first
# re-running the cell that creates `preprocessed` adds the numeric block twice.
numeric_inputs = stack_dict({name: inputs[name] for name in numeric_feature_names})
preprocessed.append(numeric_inputs)

# Bare expression: display the list of preprocessed tensors.
preprocessed

In [None]:
# Concatenate the binary tensors and the stacked numeric tensor along the last
# axis into a single feature tensor.
# The variable name is spelled with three s's; it is kept as-is because later
# cells refer to it by this exact name.
preprocesssed_result = tf.concat(preprocessed, axis=-1)
# Bare expression: display the resulting symbolic tensor as the cell output.
preprocesssed_result

In [None]:
# Wrap the preprocessing graph (dict of named Inputs -> concatenated feature
# tensor) in its own tf.keras.Model so that it can be called again later.
preprocessor = tf.keras.Model(
    inputs=inputs,
    outputs=preprocesssed_result,
)

In [None]:
# Sanity check: run the preprocessor on the first row of the training frame only.
# dict() turns the one-row frame into {column name: one-element Series}, which
# matches the named inputs of the model.
first_row = X_train.iloc[:1]
preprocessor(dict(first_row))

## Creating the neural network model

In [None]:
# Width of the concatenated feature tensor produced by the preprocessing graph.
num_features = preprocesssed_result.shape.as_list()[1]

# Each hidden layer is half as wide as the layer before it, rounded up.
layer_widths = []
width = num_features
for _ in range(4):
    width = (width + 1) // 2
    layer_widths.append(width)
hidden_nodes_l1, hidden_nodes_l2, hidden_nodes_l3, hidden_nodes_l4 = layer_widths

nn = tf.keras.Sequential()

# hidden layers: four tanh Dense layers with the widths computed above
for units in layer_widths:
    nn.add(tf.keras.layers.Dense(units, activation='tanh'))

# output layer: a single sigmoid unit
nn.add(tf.keras.layers.Dense(1, activation='sigmoid'))

In [None]:
# Apply the saved preprocessor model to the symbolic inputs; `x` is the
# concatenated feature tensor that is fed to the dense network.
x = preprocessor(inputs)

In [None]:
# Run the dense network on the preprocessed features; `result` is the symbolic
# output tensor that is passed to tf.keras.Model() in the next cell.
result = nn(x)
# Bare expression: display the output tensor.
result

In [None]:
# End-to-end model: raw named inputs -> preprocessing -> dense network output.
model = tf.keras.Model(inputs=inputs, outputs=result)

# Adam optimizer with binary cross-entropy loss; accuracy is tracked as a metric.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Train on the whole training frame, passed as {column name: Series} to match the
# named inputs. 30% of the data is used for validation, with 500 epochs and
# batches of 10,000 rows.
# NOTE(review): no callbacks are passed, so all 500 epochs always run.
history = model.fit(
    x=dict(X_train),
    y=target,
    validation_split=0.3,
    epochs=500,
    batch_size=10000,
)

In [None]:
# Training is finished: drop the references to the training frame and its labels,
# then force a garbage-collection pass before the test set is loaded.
del X_train
del target
gc.collect()

## Loss and accuracy plots

In [None]:
# Plot the accuracy values recorded in `history` for the validation and training
# data. Labels are attached to each line directly, so the legend cannot drift out
# of sync with the plotting order, and both axes are labelled so the figure can
# be read on its own.
plt.plot(history.history['val_accuracy'], label="Val")
plt.plot(history.history['accuracy'], label="Train")
plt.title("Accuracy")
plt.xlabel("Epoch")
plt.ylabel("Accuracy")
plt.legend()
plt.show()

In [None]:
# Plot the loss values recorded in `history` for the validation and training
# data. Labels are attached to each line directly, so the legend cannot drift out
# of sync with the plotting order, and both axes are labelled so the figure can
# be read on its own.
plt.plot(history.history['val_loss'], label="Val")
plt.plot(history.history['loss'], label="Train")
plt.title("Loss")
plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.legend()
plt.show()

In [None]:
# Save the trained model to 'DNN_agg.h5' (a path relative to the working directory).
model.save('DNN_agg.h5')

In [None]:
# Load the test features (gzip-compressed pickle) and score them with the model.
X_test = pd.read_pickle(
    '/kaggle/input/amex-imputed-aggregate-data/X_test_agg.pkl',
    compression='gzip',
)
test_scores = model.predict(dict(X_test), batch_size=5000, verbose=1)

# Build a one-column 'prediction' frame aligned to the test index, then move that
# index into an ordinary column with reset_index(). The CSV is written with
# index=False so that only the former index column and 'prediction' are emitted.
submission = (
    pd.DataFrame(test_scores, index=X_test.index, columns=['prediction'])
    .reset_index()
)
submission.to_csv('DNN_submission.csv', index=False)