# Learning a Neural Network

We can now try to learn Neural Network models to predict the aggregated outputs.

First, let's load the new datasets:

In [None]:
import numpy as np
import keras
import pandas as pd
import os

In [None]:
# Load the pre-processed datasets and their metadata.
# The store is opened read-only: pandas' default mode ('a') opens the file for
# reading *and* writing and creates an empty store when the path does not
# exist, which would turn a wrong path into a confusing KeyError below.
data_fname = os.path.join('shared', 'za_processed.h5')
with pd.HDFStore(data_fname, mode='r') as store:
    # Training / validation / test tables
    data_tr = store['data_tr']
    data_vl = store['data_vl']
    data_ts = store['data_ts']
    # Loaded alongside the data; not used by the code visible in this section
    means_in = store['means_in']
    stds_in = store['stds_in']
    # Column selectors for the input and output parts of each table
    sim_in = store['sim_in']
    sim_out = store['sim_out']
    in_defaults = store['in_defaults']
    pop_size = store['meta']['pop_size']

# Separate input and output columns of every split
X_tr = data_tr[sim_in]
Y_tr = data_tr[sim_out]
X_vl = data_vl[sim_in]
Y_vl = data_vl[sim_out]
X_ts = data_ts[sim_in]
Y_ts = data_ts[sim_out]

## Scalar Output

Let's first see what happens with scalar outputs. Let's start directly from the deeper model:

In [None]:
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras import backend as K
import tensorflow
from keras.callbacks import EarlyStopping, ModelCheckpoint

# Give NumPy's and TensorFlow's global RNGs the same fixed seed (NumPy first)
for _seed_rng in (np.random.seed, tensorflow.set_random_seed):
    _seed_rng(42)

# Custom R2 metric (courtesy of https://www.kaggle.com/c/mercedes-benz-greener-manufacturing/discussion/34019)
def r2_score(y_true, y_pred):
    """Coefficient of determination (R2) built from Keras backend ops.

    Computes ``1 - SS_res / SS_tot`` over the tensors Keras hands in, i.e.
    over the current batch only.

    Args:
        y_true: tensor of ground-truth targets.
        y_pred: tensor of model predictions, same shape as ``y_true``.

    Returns:
        A scalar tensor holding the R2 value for this batch.
    """
    ss_res = K.sum(K.square(y_true - y_pred))
    # Centre every target column on its own mean (axis 0 is the batch axis).
    # With a single target this equals the global mean; with several targets
    # it keeps the offset between columns from inflating SS_tot.
    ss_tot = K.sum(K.square(y_true - K.mean(y_true, axis=0)))
    # K.epsilon() avoids a division by zero when the targets are constant
    return 1 - ss_res / (ss_tot + K.epsilon())

# Input shape: one feature per input column
input_shape = (X_tr.shape[1],)

# Train one independent network per output column
max_epochs = 50
for target in sim_out:
    y_tr = Y_tr[target].values
    y_vl = Y_vl[target].values
    y_ts = Y_ts[target].values

    # Define a Neural Network model to predict the current target
    model = Sequential()
    model.add(Dense(16, activation='relu', input_shape=input_shape))
    model.add(Dense(16, activation='relu'))
    model.add(Dense(8, activation='relu'))
    model.add(Dense(1, activation='linear'))

    # Compile
    model.compile(optimizer='rmsprop',
                  loss='mse',
                  metrics=[r2_score])

    # Setup and perform training: stop once val_loss has not improved for 5
    # epochs, and keep on disk the model with the best val_loss seen so far
    weight_fname = os.path.join('shared', 'nn_reg_%s.h5' % target)
    callbacks = [EarlyStopping(monitor='val_loss', patience=5),
                 ModelCheckpoint(filepath=weight_fname, monitor='val_loss', save_best_only=True)]
    model.fit(X_tr, y_tr, epochs=max_epochs, batch_size=32, callbacks=callbacks,
              validation_data=(X_vl, y_vl), verbose=0)

    # After fit() the in-memory model holds the weights of the *last* epoch,
    # which can be up to `patience` epochs past the best one. Reload the
    # checkpoint so that the scores below describe the model that was saved.
    model.load_weights(weight_fname)

    # Save the model architecture
    arch_fname = os.path.join('shared', 'nn_reg_%s.json' % target)
    with open(arch_fname, 'w') as fp:
        fp.write(model.to_json())

    # Evaluate on the training, validation and test sets. Each split is fed
    # as a single batch so that R2 is computed over the whole split rather
    # than averaged over mini-batches.
    print('=== Results for target "{}"'.format(target))
    res_tr = model.evaluate(X_tr, y_tr, batch_size=len(X_tr))
    print('Loss and R2 on the training set: {}, {}'.format(*res_tr))
    res_vl = model.evaluate(X_vl, y_vl, batch_size=len(X_vl))
    print('Loss and R2 on the validation set: {}, {}'.format(*res_vl) )
    res_ts = model.evaluate(X_ts, y_ts, batch_size=len(X_ts))
    print('Loss and R2 on the test set: {}, {}'.format(*res_ts))

Way better! There's still a little bit of overfitting, unfortunately.

Training is faster, because the aggregation has effectively reduced the number of examples by a factor of 20 (which is both good and bad).

## Vector Output

And here's our vector output version:

In [None]:
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras import backend as K
import tensorflow
from keras.callbacks import EarlyStopping, ModelCheckpoint

# Give NumPy's and TensorFlow's global RNGs the same fixed seed (NumPy first)
for _seed_rng in (np.random.seed, tensorflow.set_random_seed):
    _seed_rng(42)

# Custom R2 metric (courtesy of https://www.kaggle.com/c/mercedes-benz-greener-manufacturing/discussion/34019)
def r2_score(y_true, y_pred):
    """Coefficient of determination (R2) built from Keras backend ops.

    Computes ``1 - SS_res / SS_tot`` over the tensors Keras hands in, i.e.
    over the current batch only. With several target columns the residual
    and total sums of squares are pooled over all columns, each column
    being centred on its own mean.

    Args:
        y_true: tensor of ground-truth targets.
        y_pred: tensor of model predictions, same shape as ``y_true``.

    Returns:
        A scalar tensor holding the R2 value for this batch.
    """
    ss_res = K.sum(K.square(y_true - y_pred))
    # Centre every target column on its own mean (axis 0 is the batch axis).
    # A single global mean would add the offset between the columns to
    # SS_tot and make R2 look better than it is.
    ss_tot = K.sum(K.square(y_true - K.mean(y_true, axis=0)))
    # K.epsilon() avoids a division by zero when the targets are constant
    return 1 - ss_res / (ss_tot + K.epsilon())

# Input shape: one feature per input column
input_shape = (X_tr.shape[1],)

# Both outputs at the same time
max_epochs = 50

# Define a Neural Network model that predicts all output columns jointly;
# the width of the last layer follows the number of columns in Y_tr
model = Sequential()
model.add(Dense(16, activation='relu', input_shape=input_shape))
model.add(Dense(16, activation='relu'))
model.add(Dense(8, activation='relu'))
model.add(Dense(Y_tr.shape[1], activation='linear'))

# Compile
model.compile(optimizer='rmsprop',
              loss='mse',
              metrics=[r2_score])

# Setup and perform training: stop once val_loss has not improved for 5
# epochs, and keep on disk the model with the best val_loss seen so far
weight_fname = os.path.join('shared', 'nn_reg.h5')
callbacks = [EarlyStopping(monitor='val_loss', patience=5),
             ModelCheckpoint(filepath=weight_fname, monitor='val_loss', save_best_only=True)]
model.fit(X_tr, Y_tr, epochs=max_epochs, batch_size=32, callbacks=callbacks,
          validation_data=(X_vl, Y_vl), verbose=0)

# After fit() the in-memory model holds the weights of the *last* epoch, which
# can be up to `patience` epochs past the best one. Reload the checkpoint so
# that the scores below describe the model that was saved.
model.load_weights(weight_fname)

# Save the model architecture
arch_fname = os.path.join('shared', 'nn_reg.json')
with open(arch_fname, 'w') as fp:
    fp.write(model.to_json())

# Evaluate on the training, validation and test sets. Each split is fed as a
# single batch so that R2 is computed over the whole split rather than
# averaged over mini-batches.
res_tr = model.evaluate(X_tr, Y_tr, batch_size=len(X_tr))
print('Loss and R2 on the training set: {}, {}'.format(*res_tr))
res_vl = model.evaluate(X_vl, Y_vl, batch_size=len(X_vl))
print('Loss and R2 on the validation set: {}, {}'.format(*res_vl) )
res_ts = model.evaluate(X_ts, Y_ts, batch_size=len(X_ts))
print('Loss and R2 on the test set: {}, {}'.format(*res_ts))

Even better!

In principle we should do something about the overfitting. Actually, it is mostly due to the reduced size of the validation and the test sets, so generating more data should be the way to go.

For now, let's just move to the next step: we will train a different Machine Learning model.