# Test to Ensure GPU Functional and Available

In [None]:
from __future__ import absolute_import, division, print_function, unicode_literals
from  IPython import display
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras import regularizers
from sklearn import datasets
import pandas as pd
import numpy as np
import datetime
import os
from matplotlib import pyplot as plt
import pathlib
import shutil
import tempfile
from IPython.display import Image
import time
print(tf.__version__)

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

__DD:__ Check to see if CUDA GPU is available:

In [None]:
# Legacy CUDA availability probe; the cell's value (True/False) is displayed.
# No minimum compute capability is required (min_cuda_compute_capability=None).
tf.test.is_gpu_available(cuda_only=True, min_cuda_compute_capability=None)

__DD:__ Try new function based on warnings:

In [None]:
# List every physical device TensorFlow can see; no device-type filter is given.
tf.config.list_physical_devices()

In [None]:
# Count the visible GPUs through the stable tf.config API (the same entry
# point the previous cell uses) rather than the tf.config.experimental alias.
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

__DD:__ I was wondering how to make sure the GPU does the processing rather than the CPU. I found the following excerpt in the TensorFlow documentation:  

_"If a TensorFlow operation has both CPU and GPU implementations, by default the GPU devices will be given priority when the operation is assigned to a device. For example, tf.matmul has both CPU and GPU kernels. On a system with devices CPU:0 and GPU:0, the GPU:0 device will be selected to run tf.matmul unless you explicitly request running it on another device."_  

## Case Study 12

__DD:__ To get the tensorflow docs packages working I had to clone the github repository to my local file system and run:  
_pip install -q C:\Users\danie\Documents\GitHub\docs_

In [None]:
# Fetch the gzipped HIGGS CSV via the Keras download helper.
gz = tf.keras.utils.get_file(
    'HIGGS.csv.gz',
    'https://archive.ics.uci.edu/ml/machine-learning-databases/00280/HIGGS.csv.gz',
)

# Number of feature columns per row; each row has one extra column (the label).
FEATURES = 28

# Read the file as FEATURES + 1 float columns, decompressing on the fly.
ds = tf.data.experimental.CsvDataset(
    gz,
    [float()] * (FEATURES + 1),
    compression_type="GZIP",
)

def pack_row(*row):
    """Split a CSV record into a ``(features, label)`` pair.

    The first column is returned as the label; the remaining columns are
    stacked along axis 1 into a single features tensor.
    """
    features = tf.stack(row[1:], axis=1)
    return features, row[0]

# Pack rows 10000 at a time, then unbatch so the dataset yields
# individual (features, label) examples again.
packed_ds = (
    ds.batch(10000)
    .map(pack_row)
    .unbatch()
)

In [None]:
# Sanity check on one batch of 1000 examples: print the first feature vector
# and plot a 101-bin histogram of every feature value in the batch.
for features, label in packed_ds.batch(1000).take(1):
    print(features[0])
    plt.hist(features.numpy().flatten(), bins=101)

__DD:__ This is where we will tweak the settings to mimic the article.

In [None]:
# From the paper: "parameters were chosen using a subset of the HIGGS data
# consisting of 2.6 million training examples and 100,000 validation
# examples."
# NOTE(review): the quote says 2.6 million training examples, but N_TRAIN
# below is 2.5 million -- confirm which figure is intended.

N_VALIDATION = 100000        # examples held out for validation
N_TRAIN = 2500000            # examples used for training
BUFFER_SIZE = N_TRAIN        # shuffle buffer spans the whole training split
BATCH_SIZE = 1000
STEPS_PER_EPOCH = N_TRAIN // BATCH_SIZE

In [None]:
# Validation split: the first N_VALIDATION examples, cached, then batched.
validate_ds = (
    packed_ds.take(N_VALIDATION)
    .cache()
    .batch(BATCH_SIZE)
)

# Training split: the next N_TRAIN examples, cached before shuffling so the
# cache holds the raw examples; repeated indefinitely and batched.
train_ds = (
    packed_ds.skip(N_VALIDATION)
    .take(N_TRAIN)
    .cache()
    .shuffle(BUFFER_SIZE)
    .repeat()
    .batch(BATCH_SIZE)
)

__DD:__ Original study's code has ".MLP"; which indicates they are using a multilayer perceptron as per pylearn documentation.  

According to the TensorFlow documentation, `Sequential()` is the equivalent of a multilayer perceptron.  

_"To build a simple, fully-connected network (i.e. multi-layer perceptron): model = tf.keras.Sequential()"_

__DD:__ Excerpts from the paper:

In [None]:
#"We selected a five-layer neural network with 300 hidden units
#in each layer, a learning rate of 0.05, and a weight decay 
#coefficient of 1×10^-5."

#"Hidden units all used the tanh activation function."

#"Weights were initialized from a normal distribution with
#zero mean and standard deviation 0.1 in the first layer,
#0.001 in the output layer, and 0.05 all other hidden layers."

#"Gradient computations were made on mini-batches of size 100.
#A momentum term increased linearly over the first 200 epochs 
#from 0.9 to 0.99, at which point it remained constant."

#"The learning rate decayed by a factor of 1.0000002 every
#batch update until it reached a minimum of 10^-6."

#"Training ended when the momentum had reached its maximum
#value and the minimum error on the validation set (500,000
#examples) had not decreased by more than a factor of 
#0.00001 over 10 epochs. This early stopping prevented 
#overfitting and resulted in each neural network being trained
#for 200-1000 epochs."

#"Computations were performed using machines with 16 Intel Xeon
#cores, an NVIDIA Tesla C2070 graphics processor, and 64 GB 
#memory. All neural networks were trained using the GPU-accelerated
#Theano and Pylearn2 software libraries [24, 25]. Our code is 
#available at https://github.com/uci-igb/higgs-susy."

__DD:__ For reference: [higgs/analysis/template.py](higgs/analysis/template.py) is code from the Github described in the article. I used it to verify some of the parameters.

The article explains that they used _"Weight Decay 10^-5"_, which is equivalent to L2 regularization in TensorFlow; hence we have __kernel_regularizer=regularizers.l2(0.00001)__ in our code.

_L2 regularization is also called weight decay in the context of neural networks_

__DD:__ We set __patience=10__ in the callbacks to mirror the original, i.e. _"minimum error on the validation set (500,000 examples) had not decreased by more than a factor of 0.00001 over 10 epochs"_

In [None]:
# Reproduction of the paper's deep network: four tanh hidden layers of 300
# units each, followed by a single linear output unit that emits a logit.
# The L2 penalty (1e-5) stands in for the paper's weight-decay coefficient.
# NOTE(review): the penalty is attached to the first hidden layer only, while
# the paper excerpt reads as a network-wide weight decay -- confirm whether
# the remaining layers should be regularized as well.
reproduced_model = tf.keras.Sequential([
    layers.Dense(300, activation='tanh',
                 kernel_regularizer=regularizers.l2(0.00001),
                 input_shape=(FEATURES,)),
    layers.Dense(300, activation='tanh'),
    layers.Dense(300, activation='tanh'),
    layers.Dense(300, activation='tanh'),
    # A fifth Dense(300, activation='tanh') layer is deliberately left out:
    # using 5 layers gave us 370,201 parameters. I suspect that the paper is
    # counting the input layer, therefore they actually only used 4 hidden
    # layers, so we only use 4.
    layers.Dense(1)
])

# Early stopping must watch the *validation* cross-entropy ("val_" prefix);
# the un-prefixed key is the training metric, which would not detect
# overfitting. patience=10 mirrors the paper's 10-epoch window.
callback = tf.keras.callbacks.EarlyStopping(monitor='val_binary_crossentropy', patience=10)

# NOTE(review): the paper excerpt describes SGD with a momentum schedule and a
# decaying learning rate; Adam at 0.05 is a deviation -- confirm it is intended.
reproduced_model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.05),
    loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
    metrics=[
        tf.keras.losses.BinaryCrossentropy(from_logits=True, name='binary_crossentropy'),
        # The model outputs logits, so the decision boundary is 0.0; the plain
        # 'accuracy' string would threshold the raw logits at 0.5 instead.
        tf.keras.metrics.BinaryAccuracy(name='accuracy', threshold=0.0),
    ],
    run_eagerly=False)



__DD:__ Using 5 layers gave us 370,201 parameters.  
__I suspect that the paper is counting the input layer therefore they actually only used 4.__

_The largest shallow network had 300,001 parameters, slightly more than the 279,901 parameters in the largest deep network_

Setting the model up with 4 layers yields 279,901 parameters. Here is our reproduced model, built as best we can given the information available in the paper and the time that we have:

In [None]:
# Print the layer-by-layer summary, including the total parameter count.
reproduced_model.summary()

In [None]:
%%time
# Train silently (verbose=0). train_ds repeats forever, so an epoch is
# defined by STEPS_PER_EPOCH; the 10000-epoch cap is an upper bound and the
# early-stopping callback is expected to end training first.
reproduced_model.fit(
    train_ds,
    validation_data=validate_ds,
    steps_per_epoch=STEPS_PER_EPOCH,
    epochs=10000,
    callbacks=[callback],
    verbose=0,
)

In [None]:
# Display the GPU activity image (presumably a screenshot taken during training -- confirm).
Image("GPU_Activity.png")

In [None]:
# Display the CPU activity image (presumably a screenshot taken during training -- confirm).
Image("CPU_Activity.png")

__DD__: Here are our results;

In [None]:
# Evaluate on the validation split; returns the loss followed by the compiled metrics.
reproduced_model.evaluate(validate_ds)

__DD:__ Model took approximately 3 minutes to run. Results are terrible in comparison to the original study. This is likely due to:  

- We did not explicitly replicate the weight initialization used in the original study.
- We are using TensorFlow 2.1.0, whereas the original used Theano and Pylearn2.

In [None]:
# result=reproduced_model(features)
# regularization_loss=tf.add_n(reproduced_model.losses)

__DD:__ Given that our hardware is more powerful than what was available in 2014, we could attempt a grid search to determine the number of layers.  
Here is an excerpt of code that uses a grid search, from https://machinelearningmastery.com/grid-search-hyperparameters-deep-learning-models-python-keras/ 

Original paper's hardware specs:

In [None]:
# Display the image of the original paper's hardware specs.
Image("OriginalHardware.PNG")

Our hardware specs:

In [None]:
# Display the image of our own hardware specs.
Image("OurHardware.PNG")

__DD:__ Try adjusting parameters based on tutorial:

In [None]:
# Shallow variant: one ELU hidden layer of 300 units with a stronger L2
# penalty (1e-3) and light dropout, followed by a single logit output.
# Deeper variants are built in later cells.
adjusted_model_shallow = tf.keras.Sequential([
    layers.Dense(300, activation='elu',
                 kernel_regularizer=regularizers.l2(0.001),
                 input_shape=(FEATURES,)),
    layers.Dropout(0.1),
    layers.Dense(1)
])

# Early stopping must watch the *validation* cross-entropy ("val_" prefix);
# the un-prefixed key is the training metric, which would not detect
# overfitting.
callback = tf.keras.callbacks.EarlyStopping(monitor='val_binary_crossentropy', patience=10)

adjusted_model_shallow.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.05),
    loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
    metrics=[
        tf.keras.losses.BinaryCrossentropy(from_logits=True, name='binary_crossentropy'),
        # The model outputs logits, so the decision boundary is 0.0; the plain
        # 'accuracy' string would threshold the raw logits at 0.5 instead.
        tf.keras.metrics.BinaryAccuracy(name='accuracy', threshold=0.0),
    ],
    run_eagerly=False)

# Print the layer-by-layer summary, including the total parameter count.
adjusted_model_shallow.summary()

In [None]:
%%time
# Train silently (verbose=0). train_ds repeats forever, so an epoch is
# defined by STEPS_PER_EPOCH; the 10000-epoch cap is an upper bound and the
# early-stopping callback is expected to end training first.
adjusted_model_shallow.fit(
    train_ds,
    validation_data=validate_ds,
    steps_per_epoch=STEPS_PER_EPOCH,
    epochs=10000,
    callbacks=[callback],
    verbose=0,
)

__DD__: With only 1 layer we see a significant improvement, but the model is still only 70% accurate:

In [None]:
# Evaluate on the validation split; returns the loss followed by the compiled metrics.
adjusted_model_shallow.evaluate(validate_ds)

__DD__: Let's try the same as above, but deep, with 5 layers:

In [None]:
# Deep variant: four ReLU hidden layers of 300 units, each followed by 10%
# dropout, and a single logit output. Only the first hidden layer carries
# the L2 penalty (1e-3).
adjusted_model_deep5 = tf.keras.Sequential([
    layers.Dense(300, activation='relu',
                 kernel_regularizer=regularizers.l2(0.001),
                 input_shape=(FEATURES,)),
    layers.Dropout(0.1),
    layers.Dense(300, activation='relu'),
    layers.Dropout(0.1),
    layers.Dense(300, activation='relu'),
    layers.Dropout(0.1),
    layers.Dense(300, activation='relu'),
    layers.Dropout(0.1),
    layers.Dense(1)
])

# Early stopping must watch the *validation* cross-entropy ("val_" prefix);
# the un-prefixed key is the training metric, which would not detect
# overfitting.
callback = tf.keras.callbacks.EarlyStopping(monitor='val_binary_crossentropy', patience=10)

adjusted_model_deep5.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.05),
    loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
    metrics=[
        tf.keras.losses.BinaryCrossentropy(from_logits=True, name='binary_crossentropy'),
        # The model outputs logits, so the decision boundary is 0.0; the plain
        # 'accuracy' string would threshold the raw logits at 0.5 instead.
        tf.keras.metrics.BinaryAccuracy(name='accuracy', threshold=0.0),
    ],
    run_eagerly=False)


In [None]:
# Print the layer-by-layer summary, including the total parameter count.
adjusted_model_deep5.summary()

In [None]:
%%time
# Train silently (verbose=0). train_ds repeats forever, so an epoch is
# defined by STEPS_PER_EPOCH; the 10000-epoch cap is an upper bound and the
# early-stopping callback is expected to end training first.
adjusted_model_deep5.fit(
    train_ds,
    validation_data=validate_ds,
    steps_per_epoch=STEPS_PER_EPOCH,
    epochs=10000,
    callbacks=[callback],
    verbose=0,
)

In [None]:
# Evaluate on the validation split; returns the loss followed by the compiled metrics.
adjusted_model_deep5.evaluate(validate_ds)

__DD__: Let's try the same as above, but with early stopping given more patience: __patience=1000__

In [None]:
# Same architecture as the deep ReLU model (four hidden layers of 300 units,
# 10% dropout after each, single logit output); only the early-stopping
# patience differs.
adjusted_model_deep5_patience1000 = tf.keras.Sequential([
    layers.Dense(300, activation='relu',
                 kernel_regularizer=regularizers.l2(0.001),
                 input_shape=(FEATURES,)),
    layers.Dropout(0.1),
    layers.Dense(300, activation='relu'),
    layers.Dropout(0.1),
    layers.Dense(300, activation='relu'),
    layers.Dropout(0.1),
    layers.Dense(300, activation='relu'),
    layers.Dropout(0.1),
    layers.Dense(1)
])

# Early stopping must watch the *validation* cross-entropy ("val_" prefix);
# the un-prefixed key is the training metric, which would not detect
# overfitting. Training may continue for up to 1000 epochs without improvement.
callback = tf.keras.callbacks.EarlyStopping(monitor='val_binary_crossentropy', patience=1000)

adjusted_model_deep5_patience1000.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.05),
    loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
    metrics=[
        tf.keras.losses.BinaryCrossentropy(from_logits=True, name='binary_crossentropy'),
        # The model outputs logits, so the decision boundary is 0.0; the plain
        # 'accuracy' string would threshold the raw logits at 0.5 instead.
        tf.keras.metrics.BinaryAccuracy(name='accuracy', threshold=0.0),
    ],
    run_eagerly=False)

In [None]:
# Print the layer-by-layer summary, including the total parameter count.
adjusted_model_deep5_patience1000.summary()

In [None]:
%%time
# Train silently (verbose=0). train_ds repeats forever, so an epoch is
# defined by STEPS_PER_EPOCH; training ends at the 10000-epoch cap unless the
# early-stopping callback fires first.
adjusted_model_deep5_patience1000.fit(
    train_ds,
    validation_data=validate_ds,
    steps_per_epoch=STEPS_PER_EPOCH,
    epochs=10000,
    callbacks=[callback],
    verbose=0,
)

In [None]:
# Evaluate on the validation split; returns the loss followed by the compiled metrics.
adjusted_model_deep5_patience1000.evaluate(validate_ds)

In [None]:
# # Use scikit-learn to grid search the batch size and epochs
# import numpy
# from sklearn.model_selection import GridSearchCV
# from keras.models import Sequential
# from keras.layers import Dense
# from keras.wrappers.scikit_learn import KerasClassifier
# # Function to create model, required for KerasClassifier
# def create_model():
# 	# create model
# 	model = Sequential()
# 	model.add(Dense(12, input_dim=8, activation='relu'))
# 	model.add(Dense(1, activation='sigmoid'))
# 	# Compile model
# 	model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# 	return model
# # fix random seed for reproducibility
# seed = 7
# numpy.random.seed(seed)
# # load dataset
# dataset = numpy.loadtxt("pima-indians-diabetes.csv", delimiter=",")
# # split into input (X) and output (Y) variables
# X = dataset[:,0:8]
# Y = dataset[:,8]
# # create model
# model = KerasClassifier(build_fn=create_model, verbose=0)
# # define the grid search parameters
# batch_size = [10, 20, 40, 60, 80, 100]
# epochs = [10, 50, 100]
# param_grid = dict(batch_size=batch_size, epochs=epochs)
# grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=-1, cv=3)
# grid_result = grid.fit(X, Y)
# # summarize results
# print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
# means = grid_result.cv_results_['mean_test_score']
# stds = grid_result.cv_results_['std_test_score']
# params = grid_result.cv_results_['params']
# for mean, stdev, param in zip(means, stds, params):
#     print("%f (%f) with: %r" % (mean, stdev, param))