# Chapter 11
https://github.com/ageron/handson-ml2/blob/master/11_training_deep_neural_networks.ipynb

In [21]:
from tensorflow import keras
import tensorflow as tf
import numpy as np

# Vanishing Gradients and Exploding Gradients

Gradients often get smaller and smaller as the algorithm progresses down to the lower layers. As a result, the Vanishing Gradients \ 
We should initialize all the parameters using the following functions: \
Normal distribution whose varience is $\sigma^{2} = \frac{2}{fan_{in}+fan_{out}}$ or a uniform distribution $[-\sqrt{3/fan_{avg}},\sqrt{3/fan_{avg}}]$ \
when you use Non, tanh, logistic, softmax as activation func





### Total Initializer

In [66]:
[name for name in dir(keras.initializers) if not name.startswith("_")]

['Constant',
 'GlorotNormal',
 'GlorotUniform',
 'Identity',
 'Initializer',
 'Ones',
 'Orthogonal',
 'RandomNormal',
 'RandomUniform',
 'TruncatedNormal',
 'VarianceScaling',
 'Zeros',
 'constant',
 'deserialize',
 'get',
 'glorot_normal',
 'glorot_uniform',
 'he_normal',
 'he_uniform',
 'identity',
 'lecun_normal',
 'lecun_uniform',
 'ones',
 'orthogonal',
 'serialize',
 'zeros']

In [2]:
keras.layers.Dense(10,activation='relu',kernel_initializer = 'he_normal')

<tensorflow.python.keras.layers.core.Dense at 0x14141cc10>

Or

In [None]:
he_avg_init = keras.initializers.VarianceScaling(scale=2,mode='fan_avg',distribution='uniform')
keras.layers.Dense(10,activation='relu',kernel_initializer=he_avg_init)

# Nonsaturating Activation Functions

### total activation func:

In [67]:
[m for m in dir(keras.activations) if not m.startswith("_")]

['deserialize',
 'elu',
 'exponential',
 'get',
 'hard_sigmoid',
 'linear',
 'relu',
 'selu',
 'serialize',
 'sigmoid',
 'softmax',
 'softplus',
 'softsign',
 'tanh']

### ReLU family

In [68]:
[m for m in dir(keras.layers) if "relu" in m.lower()]


['LeakyReLU', 'PReLU', 'ReLU', 'ThresholdedReLU']

leaky ReLU

In [None]:
keras.layers.Dense(10,kernel_initializtion='he_normal')
keras.layers.LeakyReLU(alpha=0.2)

PreLU

In [None]:
keras.layers.Dense(10,kernel_initializtion='he_normal')
keras.layers.PReLU()

SELU

In [None]:
keras.layers.Dense(10,activation='selu',kernel_initializer='lecun_normal')

# Batch Normaliztion

In [3]:
model = keras.models.Sequential([
    keras.layers.Flatten(input_shape=[28,28]),
    keras.layers.BatchNormalization(),
    keras.layers.Dense(300,activation='elu',kernel_initializer = 'he_normal'),
    keras.layers.BatchNormalization(),
    keras.layers.Dense(100,activation='elu',kernel_initializer = 'he_normal'),
    keras.layers.BatchNormalization(),
    keras.layers.Dense(10,activation='softmax'),
])

In [4]:
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
flatten (Flatten)            (None, 784)               0         
_________________________________________________________________
batch_normalization (BatchNo (None, 784)               3136      
_________________________________________________________________
dense (Dense)                (None, 300)               235500    
_________________________________________________________________
batch_normalization_1 (Batch (None, 300)               1200      
_________________________________________________________________
dense_1 (Dense)              (None, 100)               30100     
_________________________________________________________________
batch_normalization_2 (Batch (None, 100)               400       
_________________________________________________________________
dense_2 (Dense)              (None, 10)                1

In [5]:
[(var.name,var.trainable) for var in model.layers[1].variables]

[('batch_normalization/gamma:0', True),
 ('batch_normalization/beta:0', True),
 ('batch_normalization/moving_mean:0', False),
 ('batch_normalization/moving_variance:0', False)]

In [6]:
model.layers[1].updates

[<tf.Operation 'cond/Identity' type=Identity>,
 <tf.Operation 'cond_1/Identity' type=Identity>]

In [7]:
model.layers

[<tensorflow.python.keras.layers.core.Flatten at 0x143e67390>,
 <tensorflow.python.keras.layers.normalization_v2.BatchNormalization at 0x143e66650>,
 <tensorflow.python.keras.layers.core.Dense at 0x143e66210>,
 <tensorflow.python.keras.layers.normalization_v2.BatchNormalization at 0x143e65c90>,
 <tensorflow.python.keras.layers.core.Dense at 0x143e49410>,
 <tensorflow.python.keras.layers.normalization_v2.BatchNormalization at 0x143e40ed0>,
 <tensorflow.python.keras.layers.core.Dense at 0x143e2a3d0>]

### Adding the BN before activatioin function

In [9]:
model = keras.models.Sequential([
    keras.layers.Flatten(input_shape=[28,28]),
    keras.layers.BatchNormalization(),
    keras.layers.Dense(300,kernel_initializer = 'he_normal',use_bias=False),
    keras.layers.BatchNormalization(),
    keras.layers.Activation('elu'),
    keras.layers.Dense(100,kernel_initializer = 'he_normal',use_bias=False),
    keras.layers.BatchNormalization(),
    keras.layers.Activation('elu'),
    keras.layers.Dense(10,activation='softmax'),
])

# Gradient Clipping

In [None]:
optimizer = keras.optimizers.SGD(clipvalue=1.)
optimizer = keras.optimizers.SGD(clipnorm=1.)
model.compile(loss='mse',optimizer=optimizer)

# ReUse PreTrained Layers

#### pretrain a layer

In [18]:
(X_train_full, y_train_full), (X_test, y_test) = keras.datasets.fashion_mnist.load_data()
X_train_full = X_train_full / 255.0
X_test = X_test / 255.0
X_valid, X_train = X_train_full[:5000], X_train_full[5000:]
y_valid, y_train = y_train_full[:5000], y_train_full[5000:]

In [19]:
def split_dataset(X, y):
    y_5_or_6 = (y == 5) | (y == 6) # sandals or shirts
    y_A = y[~y_5_or_6]
    y_A[y_A > 6] -= 2 # class indices 7, 8, 9 should be moved to 5, 6, 7
    y_B = (y[y_5_or_6] == 6).astype(np.float32) # binary classification task: is it a shirt (class 6)?
    return ((X[~y_5_or_6], y_A),
            (X[y_5_or_6], y_B))

(X_train_A, y_train_A), (X_train_B, y_train_B) = split_dataset(X_train, y_train)
(X_valid_A, y_valid_A), (X_valid_B, y_valid_B) = split_dataset(X_valid, y_valid)
(X_test_A, y_test_A), (X_test_B, y_test_B) = split_dataset(X_test, y_test)
X_train_B = X_train_B[:200]
y_train_B = y_train_B[:200]

In [22]:
tf.random.set_seed(42)
np.random.seed(42)

model = keras.models.Sequential([
    keras.layers.Flatten(input_shape=[28, 28]),
    keras.layers.Dense(300, kernel_initializer="he_normal"),
    keras.layers.LeakyReLU(),
    keras.layers.Dense(100, kernel_initializer="he_normal"),
    keras.layers.LeakyReLU(),
    keras.layers.Dense(10, activation="softmax")
])

#### model A

In [None]:
tf.random.set_seed(42)
np.random.seed(42)
model_A = keras.models.Sequential()
model_A.add(keras.layers.Flatten(input_shape=[28, 28]))
for n_hidden in (300, 100, 50, 50, 50):
    model_A.add(keras.layers.Dense(n_hidden, activation="selu"))
model_A.add(keras.layers.Dense(8, activation="softmax"))
model_A.compile(loss="sparse_categorical_crossentropy",
                optimizer=keras.optimizers.SGD(lr=1e-3),
                metrics=["accuracy"])
history = model_A.fit(X_train_A, y_train_A, epochs=20,
                    validation_data=(X_valid_A, y_valid_A))

In [36]:
model_A.save("my_model_A.h5")

#### model B

In [27]:
model_B = keras.models.Sequential()
model_B.add(keras.layers.Flatten(input_shape=[28, 28]))
for n_hidden in (300, 100, 50, 50, 50):
    model_B.add(keras.layers.Dense(n_hidden, activation="selu"))
model_B.add(keras.layers.Dense(1, activation="sigmoid"))

In [28]:
model_B.compile(loss="binary_crossentropy",
                optimizer=keras.optimizers.SGD(lr=1e-3),
                metrics=["accuracy"])

history = model_B.fit(X_train_B, y_train_B, epochs=20,
                      validation_data=(X_valid_B, y_valid_B))

In [42]:
model_B.evaluate(X_test_B,y_test_B)



[0.14263126027584075, 0.9695]

#### model B on A

In [37]:
model_A = keras.models.load_model("my_model_A.h5")
model_B_on_A = keras.models.Sequential(model_A.layers[:-1])
model_B_on_A.add(keras.layers.Dense(1, activation="sigmoid"))

#### If we train model model_B_on_A, it will also affect model A, so we need a clone of model A
This layer is only reuse the layer in model A, except the output layer. Also, because the parameters in the output layer is initialized randomly, so we can have large loss, we need to train 4 epoches to train this layer.

In [38]:
model_A_clone = keras.models.clone_model(model_A)
model_A_clone.set_weights(model_A.get_weights())

In [39]:
#all the layers except the very up one.
for layer in model_B_on_A.layers[:-1]:
    layer.trainable = False
    
# train the very up layer by 4 epoches
model_B_on_A.compile(loss="binary_crossentropy",
                     optimizer=keras.optimizers.SGD(lr=1e-3),
                     metrics=["accuracy"])
history = model_B_on_A.fit(X_train_B, y_train_B, epochs=4,
                           validation_data=(X_valid_B, y_valid_B))

Train on 200 samples, validate on 986 samples
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


In [40]:
# reset them to be trainabel
for layer in model_B_on_A.layers[:-1]:
    layer.trainable = True

model_B_on_A.compile(loss="binary_crossentropy",
                     optimizer=keras.optimizers.SGD(lr=1e-3),
                     metrics=["accuracy"])
history = model_B_on_A.fit(X_train_B, y_train_B, epochs=16,
                           validation_data=(X_valid_B, y_valid_B))

Train on 200 samples, validate on 986 samples
Epoch 1/16
Epoch 2/16
Epoch 3/16
Epoch 4/16
Epoch 5/16
Epoch 6/16
Epoch 7/16
Epoch 8/16
Epoch 9/16
Epoch 10/16
Epoch 11/16
Epoch 12/16
Epoch 13/16
Epoch 14/16
Epoch 15/16
Epoch 16/16


In [41]:
#higher than model B
model_B_on_A.evaluate(X_test_B,y_test_B)



[0.06506970712542534, 0.9935]

# Optimizer
### Momentum optimization

$$m = \beta m - \eta \nabla_{\theta}J(\theta)$$ \
$$\theta = \theta + m$$



In [46]:
optimizer = keras.optimizers.SGD(lr=0.001, momentum=0.9)

### Nesterov Accelerated Gradient
$$m = \beta m - \eta\nabla_{\theta}J(\theta+\beta m)$$ \
$$\theta = \theta + m$$

In [48]:
optimizer = keras.optimizers.SGD(lr=0.001, momentum=0.9,nesterov = True)

### AdaGrad
$$s = s + \nabla_{\theta}J(\theta) \otimes \nabla_{\theta}(\theta)$$  
$$ \theta = \theta - \eta\nabla_{\theta}J(\theta) \oslash \sqrt{s+\epsilon}$$ \
$\otimes$ means element-wise mutiplicatoin \
$\oslash$ means element-wise division \
$\epsilon$ is smoothing term, very small

In [49]:
optimizer = keras.optimizers.Adagrad()

#### RMSProp
$$s = \beta s + (1-\beta)\nabla_{\theta}J(\theta) \otimes \nabla_{\theta}J(\theta)$$
$$\theta = \theta - \eta\nabla_{\theta}J(\theta)\oslash \sqrt{s+\epsilon}$$

In [50]:
optimizer = keras.optimizers.RMSprop(lr=0.001,rho=0.9)

### Adam
$$m = \beta_{1}m-(1-\beta_{1})\nabla_{\theta}J(\theta)$$
$$s = \beta_{2}s + (1-\beta_{2})\nabla_{\theta}J(\theta) \otimes \nabla_{\theta}J(\theta)$$
$$\hat{m} = \frac{m}{1-\beta_{1}^{t}}$$
$$\hat{s} = \frac{s}{1-\beta_{2}^{t}}$$
$$\bf{\theta} = \theta +\eta\hat{m}\oslash\sqrt{\hat{s}+\epsilon}$$

In [51]:
optimizer = keras.optimizers.Adam(lr=0.001,beta_1=0.9,beta_2=0.999)

### Nadam
Nadam is a combination of Adam and Nesterov trick

$$m = \beta_{1}m-(1-\beta_{1})\nabla_{\theta}J(\theta+\beta m)$$
$$s = \beta_{2}s + (1-\beta_{2})\nabla_{\theta}J(\theta) \otimes \nabla_{\theta}J(\theta)$$
$$\hat{m} = \frac{m}{1-\beta_{1}^{t}}$$
$$\hat{s} = \frac{s}{1-\beta_{2}^{t}}$$
$$\bf{\theta} = \theta +\eta\hat{m}\oslash\sqrt{\hat{s}+\epsilon}$$

In [53]:
optimizer = keras.optimizers.Nadam(lr=0.001,beta_1=0.9,beta_2=0.999)

# Learning Rate Scheduling

### Power Scheduling
set the lerning rate to a function of the iteration number $\eta(t) = \eta_{0}/(1+t/s)^{c}$

In [55]:
optimizer = keras.optimizers.SGD(lr= 0.01, decay = 1e-4)

### Exponential Scheduling
set the learning rate to $\eta(t) = \eta_{0} 0.1^{t/s}$ (drop by a factor of 10 every s step gradually)

In [None]:
def exponential_decay_fn(epoch):
    return 0.01*0.1**(epoch/20)
#or
def exponential_decay(lr0,s):
    def exponential_decay_fn(epoch):
        return lr0*0.1**(epoch/s)
    return exponential_decay_fn
exponential_decay_fun = exponential_decay(0.01,20)

lr_scheduler = keras.callbacks.LearningRateScheduler(exponential_decay_fn)
history = model.fit(X_train_scaled, y_train,callbacks = [lr_scheduler])

### Piecewice constant Scheduling
Use a constant learning rate for a number of epochs

In [None]:
def piecewide_constant_fn(epoch):
    if epoch<5:
        return 0.01
    if epoch<15:
        return 0.005
    else:
        return 0.001

### Performance Scheduling
Measure the validation error every N steps and reduce the learning rate by factor $\lambda$ when the errors stops droping

# Avoid Overfitting Through Regularization

### l1 and l2 Regularization

In [60]:
# l1
layer = keras.layers.Dense(100,activation='elu',kernel_initializer='he_normal',
                           kernel_regularizer=keras.regularizers.l1(0.01))
# l2
layer = keras.layers.Dense(100,activation='elu',kernel_initializer='he_normal',
                           kernel_regularizer=keras.regularizers.l2(0.01))
# l1 and l2
layer = keras.layers.Dense(100,activation='elu',kernel_initializer='he_normal',
                           kernel_regularizer=keras.regularizers.l1_l2(0.01))

### Dropout
At every training step, every neuron(including the input layer, but excluding the output layer), has a probability $p$ of being temperorily 'drop out', but it will be activitive in the next step.

In [65]:
model = keras.models.Sequential([
    keras.layers.Flatten(input_shape=[28,28]),
    keras.layers.Dropout(rate=0.2),
    keras.layers.Dense(300,activation='elu',kernel_initializer = 'he_normal'),
    keras.layers.Dropout(rate=0.2),
    keras.layers.Dense(100,activation='elu',kernel_initializer = 'he_normal'),
    keras.layers.Dropout(rate=0.2),
    keras.layers.Dense(10,activation = 'softmax')
])

### AlphaDropout
A varient of Dropout, which perserves the mean and std of its input. Useful for SELU

In [71]:
(X_train_full, y_train_full), (X_test, y_test) = keras.datasets.fashion_mnist.load_data()
X_train_full = X_train_full / 255.0
X_test = X_test / 255.0
X_valid, X_train = X_train_full[:5000], X_train_full[5000:]
y_valid, y_train = y_train_full[:5000], y_train_full[5000:]
pixel_means = X_train.mean(axis=0, keepdims=True)
pixel_stds = X_train.std(axis=0, keepdims=True)
X_train_scaled = (X_train - pixel_means) / pixel_stds
X_valid_scaled = (X_valid - pixel_means) / pixel_stds
X_test_scaled = (X_test - pixel_means) / pixel_stds

In [None]:
model = keras.models.Sequential([
    keras.layers.Flatten(input_shape=[28, 28]),
    keras.layers.AlphaDropout(rate=0.2),
    keras.layers.Dense(300, activation="selu", kernel_initializer="lecun_normal"),
    keras.layers.AlphaDropout(rate=0.2),
    keras.layers.Dense(100, activation="selu", kernel_initializer="lecun_normal"),
    keras.layers.AlphaDropout(rate=0.2),
    keras.layers.Dense(10, activation="softmax")
])
optimizer = keras.optimizers.SGD(lr=0.01, momentum=0.9, nesterov=True)
model.compile(loss="sparse_categorical_crossentropy", optimizer=optimizer, metrics=["accuracy"])
n_epochs = 20
history = model.fit(X_train_scaled, y_train, epochs=n_epochs,
                    validation_data=(X_valid_scaled, y_valid))

### MC Dropout
make 100 prediction using the X_test_scaled, set the train=True, so that Dropout layer is active.

In [73]:
y_probas = np.stack([model(X_test_scaled, training=True)
                     for sample in range(100)])
y_proba = y_probas.mean(axis=0)
y_std = y_probas.std(axis=0)



To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.



In [87]:
np.round(y_proba[0],2)

array([0.  , 0.  , 0.  , 0.  , 0.  , 0.06, 0.  , 0.1 , 0.  , 0.84],
      dtype=float32)

In [86]:
np.round(model.predict(X_test_scaled[:1]),3)

array([[0.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.002, 0.   ,
        0.998]], dtype=float32)

In [88]:
y_pred = np.argmax(y_proba, axis=1)

In [89]:
accuracy = sum(y_pred==y_test)/len(y_test)

In [90]:
accuracy

0.8634

### Max Norm Regularization
for each neuron, it constrains the weights $w$ of the incoming connections such that ||$w$|| < r, means that, if ||$w$||> r, then $w = w*\frac{r}{||w||_2}$, change the Norm2 to 1

In [91]:
layer = keras.layers.Dense(100, activation="selu", kernel_initializer="lecun_normal",
                           kernel_constraint=keras.constraints.max_norm(1.))

In [94]:
MaxNormDense = partial(keras.layers.Dense,
                       activation="selu", kernel_initializer="lecun_normal",
                       kernel_constraint=keras.constraints.max_norm(1.))

model = keras.models.Sequential([
    keras.layers.Flatten(input_shape=[28, 28]),
    MaxNormDense(300),
    MaxNormDense(100),
    keras.layers.Dense(10, activation="softmax")
])
model.compile(loss="sparse_categorical_crossentropy", optimizer="nadam", metrics=["accuracy"])
n_epochs = 2
history = model.fit(X_train_scaled, y_train, epochs=n_epochs,
                    validation_data=(X_valid_scaled, y_valid))

NameError: name 'partial' is not defined