# MNIST playground

## Overview
In this notebook we will use **TensorFlow 2** to build a simple network and train it on the well-known
[MNIST dataset](http://yann.lecun.com/exdb/mnist/). Later, we will improve the model by tuning its hyperparameters, first "by hand" and then using **Keras Tuner**.

## Import modules

In [None]:
%matplotlib inline
import random
import time

import IPython
import keras
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sklearn
import tensorflow as tf
from sklearn.preprocessing import StandardScaler

## Download and prepare the dataset

In [None]:
# Handle to the MNIST loader that ships with tf.keras
mnist = tf.keras.datasets.mnist
# load_data() returns ((train images, train labels), (test images, test labels)),
# the same structure the next cell unpacks.
# NOTE(review): `dataset` is not read again in the visible cells -- confirm it is still needed.
dataset = mnist.load_data()

In [None]:
# Load the train/test split (per the original note, the samples come already randomized)
(X_train, Y_train), (X_test, Y_test) = mnist.load_data()

# Report the raw shapes before any transformation
print("Training set data shape:", X_train.shape)
print("Training set target shape:", Y_train.shape)
# Convert the rank-one label array to a column array: one row per sample, one column
Y_train = Y_train.reshape(-1, 1)
print("Labels shape after transformation:", Y_train.shape)
print("Test set data shape:", X_test.shape)
# Same column layout for the test labels
Y_test = Y_test.reshape(-1, 1)
print("Test set target shape:", Y_test.shape)

### Single example from dataset

In [None]:
# Show sample number 10 as an image; the explicit reshape to 28x28 means the cell
# also works when X_train has already been flattened by the cell below.
plt.imshow(X_train[10].reshape(28, 28), interpolation='nearest')
# Label that belongs to the displayed sample
print(Y_train[10])

### Flatten input data

In [None]:
# Collapse each image into a single feature vector: (samples, features)
X_train = X_train.reshape((len(X_train), -1))
print(X_train.shape)
# Same layout for the test images
X_test = X_test.reshape((len(X_test), -1))
print(X_test.shape)

### Normalize data
For the **MNIST** dataset it is common to scale the input data by dividing it by 255.0. We will instead use Z-normalization (standardization), since this method transfers more easily to other **ML** problems.

In [None]:
# StandardScaler is imported explicitly at the top of the notebook: a bare
# `import sklearn` does not guarantee that the `preprocessing` submodule is loaded.
scaler = StandardScaler()
# Means before scaling, for comparison with the values printed at the end of the cell
print(X_train.mean(), X_test.mean())
# Estimate the per-feature mean and standard deviation on the training data only
# and scale the training data with them
X_train = scaler.fit_transform(X_train)
# Reuse the training statistics for the test data so both sets share one scale
X_test = scaler.transform(X_test)
print(X_train.mean(), X_test.mean())

### Change output to categorical
We will use one-hot encoding to represent the target.

In [None]:
# First label before the conversion (a single class index)
print(Y_train[0])
# Replace the class indices with one-hot vectors.
# NOTE(review): the cell rebinds Y_train / Y_test, so running it a second time would
# encode the already encoded labels again -- restart the kernel before re-running.
Y_train = tf.keras.utils.to_categorical(Y_train)
Y_test = tf.keras.utils.to_categorical(Y_test)
# Same label after the conversion
print(Y_train[0])

## Plotting functions

In [None]:
def plot_curve(epochs, hist, list_of_metrics):
  """Draw one curve per requested metric against the epoch number.

  Arguments:
  epochs -- list of epoch indices (x axis)
  hist -- training history given as pd.DataFrame, one column per metric
  list_of_metrics -- names of the metrics (columns of hist) to plot

  The first entry of every series is left out of the plot.

  Metric names should be as given in: https://www.tensorflow.org/tutorials/structured_data/imbalanced_data#define_the_model_and_metrics
  """
  _, axes = plt.subplots()
  axes.set_xlabel("Epoch")
  axes.set_ylabel("Value")

  # Every curve shares the same x values, so slice them once
  x_values = epochs[1:]
  for metric_name in list_of_metrics:
    axes.plot(x_values, hist[metric_name][1:], label=metric_name)

  axes.legend()

## First Model

In [None]:
# Layer sizes: input and output widths follow the data, hidden widths are fixed
input_dim = X_train.shape[1]
hl1_dim = hl2_dim = hl3_dim = 64
output_layer_dim = Y_train.shape[1]

# Three hidden layers followed by the classification layer
model = tf.keras.models.Sequential([
    tf.keras.layers.Dense(hl1_dim, input_dim=input_dim, activation='relu'),
    tf.keras.layers.Dense(hl2_dim, activation='elu'),
    tf.keras.layers.Dense(hl3_dim, activation='relu'),
    # Softmax is the standard choice for classification:
    # the network output looks like a probability distribution over the classes
    tf.keras.layers.Dense(output_layer_dim, activation='softmax'),
])
model.summary()

### Compile model

In [None]:
learning_rate = 0.01
optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
# The targets are one-hot encoded, hence loss="categorical_crossentropy";
# accuracy and mean squared error are tracked next to the loss
model.compile(
    loss="categorical_crossentropy",
    optimizer=optimizer,
    metrics=['accuracy', tf.keras.metrics.MSE],
)

### Training

In [None]:
# Start with (full) batch gradient descent: the batch size equals the number of samples
batch_size = len(X_train)
epochs_number = 50
# Fraction of the training data that Keras holds out for validation
validation_split_size = 0.2
print("Batch size used to train gradient:", batch_size)
history = model.fit(
    X_train,
    Y_train,
    batch_size=batch_size,
    epochs=epochs_number,
    shuffle=True,
    validation_split=validation_split_size,
)

### Model accuracy
Show learning history and plot learning curves

In [None]:
# Per-epoch metrics as a table, plus the matching epoch indices
hist = pd.DataFrame(history.history)
epochs = history.epoch
# Metrics that are reported and plotted
list_of_metrics_to_plot = ['accuracy', 'val_accuracy']
print(hist.head())
# Values reached in the last epoch
print(hist[list_of_metrics_to_plot].iloc[[-1]])
# Plot a graph of the metrics vs. epochs
plot_curve(epochs, hist, list_of_metrics_to_plot)

### Evaluate model on test set

In [None]:
# evaluate() returns the loss followed by the compiled metrics ('accuracy', then MSE),
# so index 1 is the accuracy on the test set
print("Accuracy:", model.evaluate(X_test, Y_test)[1])

## Building model with regularization
In our case the difference between the training accuracy and the dev set accuracy was relatively large. Adding regularization to the model might reduce this gap.

In [None]:
# For now use constant regularization and dropout rates
regularization_rate = 0.01
droput_rate = 0.2
# build_model() below reads both values as globals:
# every regularized layer gets the same regularization rate
# and every dropout layer the same dropout rate
def build_model():
    """Create the regularized variant of the simple network (not compiled).

    Layout: three hidden Dense layers (relu, elu, relu) with kernel
    regularizers (L2, L1, L2), a Dropout layer after the first and after the
    second hidden layer, and a softmax output layer.

    using global variables:
    input_dim, hl1_dim, hl2_dim, hl3_dim, output_layer_dim,
    regularization_rate and droput_rate
    """
    dense = tf.keras.layers.Dense
    dropout = tf.keras.layers.Dropout
    l1 = tf.keras.regularizers.l1
    l2 = tf.keras.regularizers.l2

    network = tf.keras.models.Sequential()
    network.add(dense(units=hl1_dim,
                      input_dim=input_dim,
                      activation='relu',
                      kernel_regularizer=l2(regularization_rate)))
    network.add(dropout(droput_rate))
    network.add(dense(units=hl2_dim,
                      activation='elu',
                      kernel_regularizer=l1(regularization_rate)))
    network.add(dropout(droput_rate))
    network.add(dense(units=hl3_dim,
                      activation='relu',
                      kernel_regularizer=l2(regularization_rate)))
    network.add(dense(units=output_layer_dim, activation='softmax'))
    return network


def compile_model(model):
    """Compile `model` for one-hot classification with its own Adam optimizer.

    A new optimizer is created on every call instead of reusing one shared
    instance: an optimizer keeps internal state for the model it trains, and a
    shared instance would also freeze the learning rate at the value it was
    created with.

    using global variables:
    learning_rate

    Arguments:
    model -- Keras model to compile (compiled in place, nothing is returned)
    """
    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
                  loss="categorical_crossentropy",
                  metrics=['accuracy', tf.keras.metrics.MSE])


def train_model(model, verbose=1):
    """Fit `model` on the training data and return the Keras History object.

    using global variables:
    X_train, Y_train, batch_size, epochs_number, validation_split_size

    Arguments:
    model -- compiled Keras model
    verbose -- verbosity level forwarded to model.fit
    """
    fit_options = dict(
        batch_size=batch_size,
        epochs=epochs_number,
        shuffle=True,
        validation_split=validation_split_size,
        verbose=verbose,
    )
    return model.fit(x=X_train, y=Y_train, **fit_options)

### Test model

In [None]:
# Use mini-batch gradient descent this time;
# the batch size should fit in GPU memory
batch_size = 64
epochs_number = 15
# Build, compile and train the model with regularization
model = build_model()
compile_model(model)
history = train_model(model)
# Per-epoch metrics and the matching epoch indices
hist = pd.DataFrame(history.history)
epochs = history.epoch
# Accuracy reached in the last epoch
print(hist[['accuracy', 'val_accuracy']].iloc[[-1]])
# Plot a graph of the metrics vs. epochs.
plot_curve(epochs, hist, list_of_metrics_to_plot)

## Simple hyperparameters tuning
We will start with a simple random search. In each iteration we draw a new value for every hyperparameter at the same time (this way we can test more models than by varying one hyperparameter at a time) and use the new set to create and train a model. After the given time budget the function returns the best-performing hyperparameters it found. Because training is itself random, the values found by the function might still perform poorly when the model is retrained.

In [None]:
def random_learning_rate():
    """Draw a random learning rate on a logarithmic scale.

    The exponent is sampled uniformly between -4 and 0 and the result is
    10 raised to that exponent. Compared with sampling the rate itself
    uniformly, this produces small learning rates more often.
    """
    exponent = -4 * random.random()
    return 10 ** exponent

def random_regularization_rate():
    """Draw a random regularization rate from a uniform (linear) distribution.

    The value is whatever random.random() returns, i.e. a float in [0, 1).
    """
    rate = random.random()
    return rate

def random_droput_rat():
    """Draw a random dropout rate from a uniform (linear) distribution.

    The value of random.random() is scaled by 0.75, so the result lies in [0, 0.75).
    """
    max_rate = 0.75
    return max_rate * random.random()

def tune_hyperparameters(avaible_time):
    """Random search: return the best set of hyperparameters found in the given time.

    build_model / compile_model take no hyperparameter arguments and read
    module-level names instead. Each trial therefore rebinds the module-level
    `learning_rate`, `regularization_rate`, `droput_rate` and `optimizer`
    before building the model; assigning them to local variables would leave
    every trial training the same configuration. The previous values of these
    names are restored before the function returns.

    Arguments:
    avaible_time -- time budget in seconds; a new trial is started as long as
                    the elapsed time is below it (a running trial is not interrupted)

    Returns dictionary where:
    'lr' -- best found learning rate
    'rr' -- best found regularization_rate
    'dr' -- best found droput_rate
    'acc' -- validation accuracy (last epoch) of the best trained model

    All values are 0 when no trial could be run.
    """
    global learning_rate, regularization_rate, droput_rate, optimizer

    # Remember the current settings so the search leaves them untouched
    saved_settings = (learning_rate, regularization_rate, droput_rate, optimizer)

    # Initiate best hyperparameters
    best = {'lr': 0, 'rr': 0, 'dr': 0, 'acc': 0}
    st_time = time.time()
    try:
        while time.time() - st_time < avaible_time:
            # Generate new set of hyperparameters and make it visible to the builders
            learning_rate = random_learning_rate()
            regularization_rate = random_regularization_rate()
            droput_rate = random_droput_rat()
            # Fresh optimizer per trial so it carries the sampled learning rate
            optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)

            # Create and train new model
            model = build_model()
            compile_model(model)
            history = train_model(model, verbose=0)
            new_model_val_accuracy = history.history['val_accuracy'][-1]

            # If the new model has higher accuracy than the currently highest
            # update hyperparameters
            if new_model_val_accuracy > best['acc']:
                best = {
                    'lr': learning_rate,
                    'rr': regularization_rate,
                    'dr': droput_rate,
                    'acc': new_model_val_accuracy,
                }
    finally:
        # Put the settings back, also when a trial fails or is interrupted
        learning_rate, regularization_rate, droput_rate, optimizer = saved_settings

    return best



In [None]:
# Time budget for the random search; it is compared against time.time() differences,
# so the unit is seconds (3600 s = 1 hour)
avaible_time = 3600
parameters = tune_hyperparameters(avaible_time)

In [None]:
# Result of the search: dictionary with the keys 'lr', 'rr', 'dr' and 'acc'
print(parameters)

### Build model using found hyperparameters

In [None]:
# Adopt the hyperparameters returned by the search
learning_rate = parameters['lr']
regularization_rate = parameters['rr']
droput_rate = parameters['dr']
# Rebuild the module-level optimizer so that it carries the tuned learning rate;
# the existing instance was created with the initial value and has already been used
optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
epochs_number = 50
batch_size = 64
# Build, compile and train the model with the found hyperparameters
model = build_model()
compile_model(model)
history = train_model(model)
epochs = history.epoch
hist = pd.DataFrame(history.history)
# Accuracy reached in the last epoch
print(hist[['accuracy', 'val_accuracy']].iloc[[-1]])
# Plot a curve of the metric vs. epochs.
list_of_metrics_to_plot = ['accuracy', 'val_accuracy']
plot_curve(epochs, hist, list_of_metrics_to_plot)

In [None]:
# evaluate() returns the loss followed by the compiled metrics ('accuracy', then MSE),
# so index 1 is the accuracy on the test set
print("Accuracy:", model.evaluate(X_test, Y_test)[1])

## Hyperparameters tuning with Keras Tuner

In [None]:
import kerastuner as kt

### Define model

In [None]:
def build_model_hp(hp):
  """Keras Tuner model builder: create and compile one candidate model.

  The tuner calls this function with a hyperparameters object `hp`; every
  hp.Float / hp.Int / hp.Choice call registers one tunable value under the
  given name and returns the value to use for the current trial.

  The model is built with tf.keras throughout (the same API the rest of the
  notebook uses) rather than mixing it with the standalone `keras` package.

  using global variables:
  input_dim, output_layer_dim

  Arguments:
  hp -- hyperparameters object supplied by Keras Tuner

  Returns the compiled model.
  """
  model = tf.keras.Sequential()

  # Tuned variables
  hp_learning_rate = hp.Float('learning_rate', 1e-4, 1e-1, sampling='log')
  hp_regularization_rate = hp.Float('regularization_rate', 0, 1, sampling='linear')
  hp_droput_rate = hp.Float('droput_rate', 0, 0.75, sampling='linear')
  hp_units1 = hp.Int('units1', min_value = 32, max_value = 512, step = 32)
  hp_units2 = hp.Int('units2', min_value = 32, max_value = 512, step = 32)
  hp_units3 = hp.Int('units3', min_value = 32, max_value = 512, step = 32)
  hp_first_activation = hp.Choice('activation1', ['relu', 'tanh'])

  # Build model using tuned variables: same layout as build_model
  # (L2 / L1 / L2 kernel regularizers, dropout after the first two hidden layers)
  model.add(tf.keras.layers.Dense(units=hp_units1,
                                  input_dim=input_dim,
                                  activation=hp_first_activation,
                                  kernel_regularizer=tf.keras.regularizers.l2(hp_regularization_rate)))
  model.add(tf.keras.layers.Dropout(hp_droput_rate))
  model.add(tf.keras.layers.Dense(units=hp_units2,
                                  activation='elu',
                                  kernel_regularizer=tf.keras.regularizers.l1(hp_regularization_rate)))
  model.add(tf.keras.layers.Dropout(hp_droput_rate))
  model.add(tf.keras.layers.Dense(units=hp_units3,
                                  activation='relu',
                                  kernel_regularizer=tf.keras.regularizers.l2(hp_regularization_rate)))
  model.add(tf.keras.layers.Dense(units=output_layer_dim, activation='softmax'))

  # Compile created model; a new optimizer per model carries the tuned learning rate
  model.compile(optimizer = tf.keras.optimizers.Adam(learning_rate = hp_learning_rate),
                loss = "categorical_crossentropy",
                metrics = ['accuracy'])

  return model

### Initiate the tuner

In [None]:
# Hyperband tuner that builds its candidates with build_model_hp and ranks them by
# validation accuracy.
# Logs and checkpoints are saved in keras_tuners/mist_playgrand
# (a path relative to the working directory).
tuner = kt.Hyperband(build_model_hp,
                     objective = 'val_accuracy', 
                     max_epochs = 10,
                     factor = 3,
                     directory = 'keras_tuners',
                     project_name = 'mist_playgrand') 

### Define callback to clear output after every step

In [None]:
class ClearTrainingOutput(tf.keras.callbacks.Callback):
  """Callback that clears the notebook cell output whenever a training run ends."""

  def on_train_end(self, logs=None):
    """Clear the cell output; `logs` is accepted for the Callback interface and ignored."""
    # wait=True delays the clearing until new output is available to replace the old one
    IPython.display.clear_output(wait = True)

### Create validation set used by Keras Tuner

In [None]:
from sklearn.model_selection import train_test_split
# Randomize and split labeled data into a new train set and dev set.
# Up till now TF took care of that during training (validation_split in model.fit).
# 20% of the samples go to the dev set; random_state fixes the shuffle so the
# split is the same on every run.
new_X_train, X_dev_test, new_Y_train, Y_dev_test = train_test_split(
    X_train, Y_train, test_size=0.20, random_state=17)

### Hyperparameter search

In [None]:
# Run the search on the new train set, scoring every trial on the dev set;
# the callback clears the cell output after each training run
tuner.search(new_X_train, new_Y_train, epochs = 10, validation_data = (X_dev_test, Y_dev_test), callbacks = [ClearTrainingOutput()])

# Get the optimal hyperparameters (the single best trial)
best_hps = tuner.get_best_hyperparameters(num_trials = 1)[0]
# The names passed to best_hps.get() are the ones registered in build_model_hp
print(f"""
The hyperparameter search is complete. The optimal parameters found are:\n
learninig rate: {best_hps.get('learning_rate')} \n
regularyzation rate: {best_hps.get('regularization_rate')} \n
droput rate: {best_hps.get('droput_rate')}
units in first layer: {best_hps.get('units1')} \n
units in second layer: {best_hps.get('units2')} \n
units in third layer: {best_hps.get('units3')} \n
first activation function: {best_hps.get('activation1')} \n
""")

### Build model using hyperparameters found by Keras Tuner

In [None]:
# Build the model with the optimal hyperparameters and train it on the full training data
model = tuner.hypermodel.build(best_hps)
# NOTE(review): the test set is passed as validation data here and is used again for the
# final evaluation below -- consider a separate dev split if the validation curve is
# ever used for decisions (early stopping, epoch selection).
history = model.fit(X_train, Y_train, epochs = 50, validation_data = (X_test, Y_test))

### Evaluate final model

In [None]:
# evaluate() returns the loss followed by the compiled metrics; build_model_hp compiles
# with metrics = ['accuracy'], so index 1 is the accuracy on the test set
print("Accuracy:", model.evaluate(X_test, Y_test)[1])