In [2]:
#keras.backend.clear_session()
import tensorflow as tf
from tensorflow import keras
import numpy as np
import sklearn
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
# Larger axis/tick label fonts so inline figures stay readable.
mpl.rcParams.update({
    "axes.labelsize": 14,
    "xtick.labelsize": 12,
    "ytick.labelsize": 12,
})

# Seed NumPy and TensorFlow, then clear any Keras state left over from
# earlier runs in this kernel.
np.random.seed(23)
tf.random.set_seed(23)
keras.backend.clear_session()

# Subclassing API to Build Dynamic Models

In [4]:
class WideAndDeepModel(keras.Model):
    """Wide & Deep regression model written with the Keras subclassing API.

    The "deep" input flows through two hidden Dense layers; the "wide" input
    is concatenated with the output of the second hidden layer. The model
    returns a main output (from the concatenation) and an auxiliary output
    (from the deep path only).

    Args:
        units: Number of units in each hidden Dense layer.
        activation: Activation used by both hidden layers.
        **kwargs: Forwarded to ``keras.Model.__init__``.
    """

    def __init__(self, units=30, activation="relu", **kwargs):
        # Must call the *parent* constructor; calling self.__init__ here
        # would recurse forever.
        super().__init__(**kwargs)
        self.hidden1 = keras.layers.Dense(units, activation=activation)
        # Second hidden layer: call() relies on this attribute being hidden2.
        self.hidden2 = keras.layers.Dense(units, activation=activation)
        self.main_output = keras.layers.Dense(1)
        self.aux_output = keras.layers.Dense(1)

    def call(self, inputs):
        """Run the forward pass.

        Args:
            inputs: A pair ``(input_A, input_B)`` -- the wide and deep inputs.

        Returns:
            A tuple ``(main_output, aux_output)``.
        """
        input_A, input_B = inputs
        hidden1 = self.hidden1(input_B)
        hidden2 = self.hidden2(hidden1)
        # Wide path skips the hidden layers and joins the deep path here.
        concat = keras.layers.concatenate([input_A, hidden2])
        main_output = self.main_output(concat)
        aux_output = self.aux_output(hidden2)
        return main_output, aux_output


model = WideAndDeepModel()

### Saving and Restoring a Model
model.save()
keras.models.load_model("") # only works for models built with the Sequential or Functional API

In [None]:
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# California housing data: hold out a test set, then carve a validation set
# out of the remaining training data (same random_state for both splits).
housing = fetch_california_housing()

X_train_full, X_test, y_train_full, y_test = train_test_split(
    housing.data, housing.target, random_state=42
)
X_train, X_valid, y_train, y_valid = train_test_split(
    X_train_full, y_train_full, random_state=42
)

# Standardise using statistics from the training split only.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_valid = scaler.transform(X_valid)
X_test = scaler.transform(X_test)

# "A" (wide) inputs use columns 0-4, "B" (deep) inputs use columns 2 onward,
# so columns 2-4 are fed to both paths.
X_train_A, X_valid_A, X_test_A = X_train[:, :5], X_valid[:, :5], X_test[:, :5]
X_train_B, X_valid_B, X_test_B = X_train[:, 2:], X_valid[:, 2:], X_test[:, 2:]

# First three test rows stand in for "new" instances to predict on.
X_new_A = X_test_A[:3]
X_new_B = X_test_B[:3]

In [3]:

# Functional-API wide & deep model: the deep input goes through two hidden
# layers, the wide input is concatenated just before the output layer.
input_A = keras.layers.Input(shape=[5], name="wide_input")
input_B = keras.layers.Input(shape=[6], name="deep_input")
hidden1 = keras.layers.Dense(30, activation="relu")(input_B)
hidden2 = keras.layers.Dense(30, activation="relu")(hidden1)
concat = keras.layers.concatenate([input_A, hidden2])
output = keras.layers.Dense(1, name="output")(concat)
model = keras.models.Model(inputs=[input_A, input_B], outputs=[output])

# `learning_rate` replaces the deprecated `lr` alias, which newer Keras
# releases no longer accept.
model.compile(loss="mse", optimizer=keras.optimizers.SGD(learning_rate=1e-3))

## Using CallBacks

<p>The fit() method accepts a callbacks argument that lets you specify a list of objects
that Keras will call at the start and end of training, at the start and end of each epoch,
and even before and after processing each batch</p>
<br>
<p>
You can combine the ModelCheckpoint and EarlyStopping callbacks to save checkpoints of your
model (in case your computer crashes) and interrupt training early when there is no
more progress (to avoid wasting time and resources).
</p>

```python
# Checkpoint callback used below; the file name is only an example.
checkpoint_cb = keras.callbacks.ModelCheckpoint("my_keras_model.h5",
                                                save_best_only=True)
# Stop after 10 epochs without improvement and roll back to the best weights.
early_stopping_cb = keras.callbacks.EarlyStopping(patience=10,
                                                  restore_best_weights=True)
history = model.fit(X_train, y_train, epochs=100,
                    validation_data=(X_valid, y_valid),
                    callbacks=[checkpoint_cb, early_stopping_cb])
```

Write custom callbacks
As you might expect, you can implement on_train_begin(), on_train_end(),
on_epoch_begin(), on_epoch_end(), on_batch_begin(), and on_batch_end(). Callbacks
can also be used during evaluation and predictions, should you ever need them
(e.g., for debugging). For evaluation, you should implement on_test_begin(),
on_test_end(), on_test_batch_begin(), or on_test_batch_end() (called by evaluate()), and for prediction you should implement on_predict_begin(), on_predict_end(), on_predict_batch_begin(), or on_predict_batch_end() (called by
predict()).

```python
class PrintValTrainRatioCallback(keras.callbacks.Callback):
    """Print the validation-to-training loss ratio after every epoch."""

    def on_epoch_end(self, epoch, logs):
        # Reads the "val_loss" and "loss" entries of the epoch's logs dict.
        ratio = logs["val_loss"] / logs["loss"]
        print("\nval/train: {:.2f}".format(ratio))
```

In [4]:
EPOCHS = 10
checkpoint_filepath = './jupyter_images/acheck'

# Weights-only checkpoints, written only when the monitored metric improves.
model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    checkpoint_filepath,
    save_best_only=True,
    save_weights_only=True,
)

In [None]:
# Train on the (wide, deep) input pair, checkpointing during training.
model.fit(
    x=(X_train_A, X_train_B),
    y=y_train,
    epochs=20,
    validation_data=((X_valid_A, X_valid_B), y_valid),
    callbacks=[model_checkpoint_callback],
)

In [None]:
# Loss on the held-out test set, then predictions for the "new" instances.
mse_test = model.evaluate(x=(X_test_A, X_test_B), y=y_test)
y_pred = model.predict(x=(X_new_A, X_new_B))

## Using Tensorboard for visualization

In [4]:
import os
# All TensorBoard runs are written under this directory.
root_logdir = os.path.join(os.curdir, "my_logs")


def get_run_logdir():
    """Return a fresh, timestamped sub-directory path under ``root_logdir``.

    The run id is built from the current local time, so each call made in a
    different second yields a different directory name.
    """
    from time import strftime

    return os.path.join(root_logdir, strftime("run_%Y_%m_%d-%H_%M_%S"))


run_logdir = get_run_logdir()  # e.g., './my_logs/run_2019_06_07-15_15_22'

# Log this training run to its own TensorBoard directory.
tensorboard_cb = keras.callbacks.TensorBoard(log_dir=run_logdir)
history = model.fit(
    x=(X_train_A, X_train_B),
    y=y_train,
    epochs=20,
    validation_data=((X_valid_A, X_valid_B), y_valid),
    callbacks=[tensorboard_cb],
)

Train on 11610 samples, validate on 3870 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<h1 style="color:red;">Fine Tuning Hyperparameters</h1>

In [3]:

fashion_mnist = keras.datasets.fashion_mnist
(X_train_full, y_train_full), (X_test, y_test) = fashion_mnist.load_data()

# The first 5000 training samples become the validation set; all image
# arrays are divided by 255.
X_valid = X_train_full[:5000] / 255.
X_train = X_train_full[5000:] / 255.
y_valid = y_train_full[:5000]
y_train = y_train_full[5000:]
X_test = X_test / 255.

<h3 style="color:orange">To use GridSearchCV or RandomizedSearchCV to explore the hyperparameter space, we need to wrap the Keras model in an object that mimics a regular Scikit-Learn regressor.</h3>

In [4]:

from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Reload the California housing data (this replaces the Fashion MNIST
# arrays that currently hold the same variable names).
housing = fetch_california_housing()

X_train_full, X_test, y_train_full, y_test = train_test_split(
    housing.data, housing.target, random_state=42
)
X_train, X_valid, y_train, y_valid = train_test_split(
    X_train_full, y_train_full, random_state=42
)

# Standardise using statistics from the training split only.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_valid = scaler.transform(X_valid)
X_test = scaler.transform(X_test)

# First three test rows stand in for "new" instances to predict on.
X_new = X_test[:3]

In [5]:
def build_model(n_hidden=1, n_neurons=30, learning_rate =3e-3, input_shape=[8]):
    """Build and compile a simple Sequential regression MLP.

    Args:
        n_hidden: Number of hidden Dense layers (0 gives a linear model).
        n_neurons: Number of units in each hidden layer.
        learning_rate: Learning rate for the SGD optimizer.
        input_shape: Shape of a single input instance (never mutated here).

    Returns:
        A compiled ``keras.models.Sequential`` model with a single-unit
        output layer, MSE loss and an SGD optimizer.
    """
    model = keras.models.Sequential()
    model.add(keras.layers.InputLayer(input_shape=input_shape))
    for _ in range(n_hidden):
        model.add(keras.layers.Dense(n_neurons, activation="relu"))
    model.add(keras.layers.Dense(1))
    # `learning_rate` replaces the deprecated `lr` alias, which newer Keras
    # releases no longer accept.
    optimizer = keras.optimizers.SGD(learning_rate=learning_rate)
    model.compile(loss="mse", optimizer=optimizer)
    return model

In [6]:
# Wrap the model factory so it can be used like a Scikit-Learn regressor.
# NOTE(review): keras.wrappers.scikit_learn is only available in older
# TensorFlow releases -- confirm the pinned version.
keras_reg = keras.wrappers.scikit_learn.KerasRegressor(build_fn=build_model)

In [7]:
# Train silently for up to 100 epochs, with early stopping (patience 10).
keras_reg.fit(
    X_train,
    y_train,
    epochs=100,
    verbose=0,
    validation_data=(X_valid, y_valid),
    callbacks=[keras.callbacks.EarlyStopping(patience=10)],
)

<tensorflow.python.keras.callbacks.History at 0x14382cfd0>

In [11]:
# Score on the test set and predict the three held-out "new" rows.
mse_test = keras_reg.score(
    X_test,
    y_test,
    verbose=0,
)
y_pred = keras_reg.predict(
    X_new,
)

<h2>QUESTION: In relation to a model's hyperparameters, why is it preferable to use a randomized search rather than a grid search?<br>
ANSWER: There are many possible combinations of hyperparameters, and we want to train many variants (possibly hundreds) and see which one performs best on the validation set. A randomized search samples a fixed number of combinations instead of exhaustively trying every one.</h2>

<h2 style="color:#0078d7"> QUESTION: Why is it preferable to use a randomized search rather than a grid search for hyperparameter tuning?<br> ANSWER: `RandomizedSearchCV` is preferable because, when working with neural networks, we want to try many combinations of hyperparameters and see which one works best on the validation set.</h2>

In [11]:
from scipy.stats import reciprocal
from sklearn.model_selection import RandomizedSearchCV

# Search space: 0-3 hidden layers, 1-99 neurons per layer, and a learning
# rate drawn from a reciprocal distribution over [3e-4, 3e-2].
param_distribs = dict(
    n_hidden=[0, 1, 2, 3],
    n_neurons=np.arange(1, 100),
    learning_rate=reciprocal(3e-4, 3e-2),
)

# 10 sampled configurations, 3-fold cross-validation each; the extra keyword
# arguments given to fit() are fit parameters for the wrapped Keras model.
rnd_search_cv = RandomizedSearchCV(
    estimator=keras_reg,
    param_distributions=param_distribs,
    n_iter=10,
    cv=3,
)
rnd_search_cv.fit(
    X_train,
    y_train,
    epochs=100,
    verbose=0,
    validation_data=(X_valid, y_valid),
    callbacks=[keras.callbacks.EarlyStopping(patience=10)],
)




RandomizedSearchCV(cv=3, error_score='raise-deprecating',
                   estimator=<tensorflow.python.keras.wrappers.scikit_learn.KerasRegressor object at 0x1407023d0>,
                   iid='warn', n_iter=10, n_jobs=None,
                   param_distributions={'learning_rate': <scipy.stats._distn_infrastructure.rv_frozen object at 0x1434abc90>,
                                        'n_hidden': [0, 1, 2, 3],
                                        'n_neurons': array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12,...
       18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34,
       35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51,
       52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68,
       69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85,
       86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99])},
                   pre_dispatch='2*n_jobs', random_state=None, refit=True,
                   return_tr

In [19]:
# Best cross-validated score found by the search.
# NOTE(review): the value is negative, presumably because the wrapper's
# score is the negated MSE (higher is better) -- confirm.
rnd_search_cv.best_score_

-0.3140690041399536

<p style="color:yellow">Restricting the search space and prototyping quickly both benefit from developing an understanding of the hyperparameters involved in neural networks.</p>

<h2 style="color:orange;"> Number of Hidden Layers</h2>

### QUESTION: Describe an example of the hierarchical architecture of a DNN. For example, in the case of recognizing faces, what low-level, intermediate-level, and high-level structures would the **model** represent in its lower, intermediate, and highest hidden layers, respectively? Hint: think hierarchically about what it takes to draw a face.
### POSSIBLE ANSWER: Lower hidden layers model low-level structures (e.g., line segments of various shapes and orientations), intermediate hidden layers combine these low-level structures to model intermediate-level structures (e.g., squares, circles), and the highest hidden layers and the output layer combine these intermediate structures to model high-level structures (e.g., faces).

### QUESTION: How does the hierarchical architecture of DNNs improve their ability to generalize to new datasets? For example, which parts of a trained neural network used to recognize faces in pictures can be reused to train a new neural network to recognize hairstyles?
### ANSWER: You can reuse the weights and biases of the lower layers of the already-trained model. This way the network does not have to learn from scratch all the low-level structures that occur in most pictures; it only has to learn the higher-level structures (e.g., hairstyles). This is called transfer learning.

<h2 style="color:orange"> Number of Neurons per Hidden Layer</h2>

### QUESTION: For the number of neurons per hidden layer, you could try increasing the number of neurons gradually until the network starts overfitting. But can you think of a simpler, more time-efficient approach that avoids bottleneck layers that could ruin your model?
### POSSIBLE ANSWER: Pick a model with more layers and neurons than you actually need, then use early stopping, and other regularization techniques to prevent it from overfitting.

### QUESTION: In general the optimal learning rate is about half of the maximum learning rate (i.e., the learning rate above which the training algorithm starts to diverge). But if you had the computational power and time how could you train a model to find a good learning rate?
### POSSIBLE ANSWER: Train the model for a few hundred iterations, starting with a very low learning rate (e.g., 10^-5) and gradually increasing it up to a very large value (e.g., 10). This is done by multiplying the learning rate by a constant factor at each iteration (e.g., by exp(log(10^6)/500) to go from 10^-5 to 10 in 500 iterations). If you plot the loss as a function of the learning rate (using a log scale for the learning rate), the optimal learning rate will typically be about 10 times lower than the turning point where the loss stops dropping and starts to climb.

### QUESTION: The batch size can have a significant impact on your model's performance and training time. Large batch sizes can lead to training instabilities, especially at the beginning of training, and the resulting model may not generalize as well as a model trained with a small batch size. What strategy involving the learning rate hyperparameter can be used to counter this?
### ANSWER: Papers by Elad Hoffer et al. and Priya Goyal et al. showed that it is possible to use very large batch sizes (up to 8,192) using various techniques such as warming up the learning rate (i.e., starting training with a small learning rate, then ramping it up).