### TensorFlow Addons
TensorFlow SIG Addons is a repository of community contributions that conform to well-established API patterns, but implement new functionality not available in core TensorFlow.

In [11]:
import tensorflow_addons as tfa
import tensorflow as tf

In [12]:
# Default use of AdamW, with a piecewise-constant schedule that switches
# value at steps 10000 and 15000.
step = tf.Variable(0, trainable=False)
schedule = tf.optimizers.schedules.PiecewiseConstantDecay(
    boundaries=[10000, 15000],
    values=[1e-0, 1e-1, 1e-2],
)

# lr and wd can be a function or a tensor.
# Here lr is a tensor: under eager execution `schedule(step)` is evaluated
# once, on this line.
lr = 1e-1 * schedule(step)


def wd():
    """Weight decay as a callable, re-evaluated against the current `step`."""
    return 1e-4 * schedule(step)


optimizer = tfa.optimizers.AdamW(weight_decay=wd, learning_rate=lr)

In [13]:
# Another way to use AdamW: drive the learning rate with a schedule object.
# Only Lr_type[0] is used below; the other entries are alternative schedules.
# NOTE(review): CosineDecayRestarts is built with first_decay_steps=0, which
# looks like it would divide by zero once that schedule is evaluated - confirm
# and replace with a positive step count before using Lr_type[1].
Lr_type = [
    tf.keras.experimental.CosineDecay(1e-3, 800),
    tf.keras.experimental.CosineDecayRestarts(0.001, 0, t_mul=2.0, m_mul=1.0, alpha=0.0),
    tf.keras.experimental.NoisyLinearCosineDecay(
        0.01, 0.8, initial_variance=1.0, variance_decay=0.55,
        num_periods=0.5, alpha=0.0, beta=0.001),
]

# AdamW's first positional parameter is `weight_decay`, not the learning rate,
# so both arguments are passed by keyword: the schedule drives the learning
# rate and the weight decay is a plain float.
opt = tfa.optimizers.AdamW(weight_decay=1e-4, learning_rate=Lr_type[0])

In [14]:
# Default use of RectifiedAdam with warm-up.
# `learning_rate` is used instead of the deprecated `lr` alias.
opt = tfa.optimizers.RectifiedAdam(
    learning_rate=1e-3,
    total_steps=10000,
    warmup_proportion=0.1,
    min_lr=1e-5,
)

In [15]:
# Another way to use RectifiedAdam: wrap it in Lookahead.

import tensorflow_addons as tfa
opt = tfa.optimizers.RectifiedAdam()
hard_opt = tfa.optimizers.Lookahead(opt, sync_period=10)

# Lookahead can also wrap any of the regular TensorFlow optimizers.
# `learning_rate` is used instead of the deprecated `lr` alias.

opt = tf.keras.optimizers.SGD(learning_rate=0.001)
opt = tfa.optimizers.Lookahead(opt, sync_period=10)
##############################################################
opt = tf.keras.optimizers.Adam(learning_rate=0.001)
opt = tfa.optimizers.Lookahead(opt, sync_period=10)


### SGDW
It computes the update step of tf.keras.optimizers.SGD and additionally decays the variable. Note that this is different from adding L2 regularization on the variables to the loss. Decoupling the weight decay from other hyperparameters (in particular the learning rate) simplifies hyperparameter search.
For further information, see [SGDW](https://arxiv.org/abs/1711.05101).

In [16]:
# Step counter that the schedule is evaluated against.
step = tf.Variable(0, trainable=False)
schedule = tf.optimizers.schedules.PiecewiseConstantDecay(
    boundaries=[10000, 15000],
    values=[1e-0, 1e-1, 1e-2],
)

# lr and wd can be a function or a tensor: lr is passed as a tensor,
# wd as a zero-argument callable.
lr = 1e-1 * schedule(step)


def wd():
    """Return the weight decay for the current value of `step`."""
    return 1e-4 * schedule(step)


optimizer = tfa.optimizers.SGDW(momentum=0.9, weight_decay=wd, learning_rate=lr)

### How to use MultiOptimizer
Each optimizer will optimize only the weights associated with its paired layer. This can be used to implement discriminative layer training by assigning different learning rates to each optimizer layer pair. (tf.keras.optimizers.Optimizer, List[tf.keras.layers.Layer]) pairs are also supported. Please note that the layers must be instantiated before instantiating the optimizer.

In [17]:
# How to use MultiOptimizer: pair each optimizer with the layer(s) it updates.
# Note: `model` stands in for whatever model you will actually use to solve
# your problem (classification, for example).
model = tf.keras.Sequential(
    [
        tf.keras.Input(shape=(4,)),
        tf.keras.layers.Dense(8),
        tf.keras.layers.Dense(16),
        tf.keras.layers.Dense(32),
    ]
)

optimizers = [
    tf.keras.optimizers.Adam(learning_rate=1e-4),
    tf.keras.optimizers.SGD(learning_rate=1e-2),
]

# First optimizer -> first entry of model.layers;
# second optimizer -> the list of all remaining layers.
optimizers_and_layers = list(zip(optimizers, (model.layers[0], model.layers[1:])))

optimizer = tfa.optimizers.MultiOptimizer(optimizers_and_layers)
model.compile(loss="mse", optimizer=optimizer)

[For more optimizers, check this link](https://www.tensorflow.org/addons/api_docs/python/tfa/optimizers)

### How to change the optimizer during the training process

In [None]:
# SGDR taken from: https://github.com/YeongHyeon/ResNet-with-SGDR-TF2/blob/master/function_sgdr.py
import math  # needed by lrfn (math.cos / math.pi); was missing in the original cell

import numpy as np
import matplotlib.pyplot as plt

LR = 0.0008
WEIGHT_DECAY = 0
EPOCHS = 100
WARMUP = 25


def get_cosine_schedule_with_warmup(lr, num_warmup_steps, num_training_steps, num_cycles=0.5):
    """
    Build a Keras callback implementing a cosine schedule with linear warm-up.

    Adapted for TensorFlow from huggingface's get_cosine_schedule_with_warmup
    (https://huggingface.co/transformers/_modules/transformers/optimization.html#get_cosine_schedule_with_warmup).

    During the first `num_warmup_steps` epochs the learning rate increases
    linearly from 0 towards `lr`. Afterwards it follows
    `lr * 0.5 * (1 + cos(pi * 2 * num_cycles * progress))`, clipped at 0,
    where `progress` is the fraction of the post-warm-up epochs completed.

    Args:
        lr: peak learning rate reached at the end of the warm-up.
        num_warmup_steps: number of warm-up epochs.
        num_training_steps: total number of epochs covered by the schedule.
        num_cycles: number of cosine cycles after warm-up; the default 0.5
            gives a single half-cosine.

    Returns:
        A `tf.keras.callbacks.LearningRateScheduler` wrapping the schedule.
    """

    def lrfn(epoch):
        # Linear warm-up phase.
        if epoch < num_warmup_steps:
            return float(epoch) / float(max(1, num_warmup_steps)) * lr
        # Cosine phase; max(1, ...) guards against a zero-length decay span.
        progress = float(epoch - num_warmup_steps) / float(max(1, num_training_steps - num_warmup_steps))
        return max(0.0, 0.5 * (1.0 + math.cos(math.pi * float(num_cycles) * 2.0 * progress))) * lr

    return tf.keras.callbacks.LearningRateScheduler(lrfn, verbose=True)


lr_schedule = get_cosine_schedule_with_warmup(lr=LR, num_warmup_steps=WARMUP, num_training_steps=EPOCHS)

In [None]:
# Every 25 epochs we switch to the next optimizer. Each `fit` call starts
# counting epochs from 0 again, so the LearningRateScheduler callback restarts
# its schedule together with the optimizer change, i.e. a warm restart as in
# Stochastic Gradient Descent with Warm Restarts (SGDR). This helps to see
# which optimizer performs better with SGDR.
# NOTE(review): the callback sets the learning rate every epoch, so the
# per-optimizer `learning_rate` values below are presumably overridden during
# training - confirm this is intended.
# NOTE(review): with WARMUP = 25 and 25 epochs per stage, each stage only
# covers the linear warm-up part of the schedule - tune WARMUP / EPOCHS.
optimizers = [tf.keras.optimizers.Adam(learning_rate=1e-4),
              tf.keras.optimizers.SGD(learning_rate=1e-2),
              tf.keras.optimizers.Adam(learning_rate=1e-6),
              tf.keras.optimizers.SGD(learning_rate=1e-4)]

opts = ['Adam', 'SGD', 'Adam', 'SGD']
lrs = [1e-4, 1e-2, 1e-6, 1e-4]
epochs = [25, 25, 25, 25]
EPOCH_STEPS = 250

for i in range(len(optimizers)):
    print('Using optimizer: ' + opts[i] +' with lr: '+str(lrs[i])+', Epoch: ' + str(epochs[i]))

    # Recompile with the optimizer of this stage. The original indexed
    # `optimizer`, which is the single MultiOptimizer object, not this list.
    model.compile(loss=tf.keras.losses.CategoricalCrossentropy(label_smoothing=0.3,
                                                               reduction=tf.keras.losses.Reduction.AUTO,
                                                               name='categorical_crossentropy'),
                  optimizer=optimizers[i],
                  metrics=[tf.keras.metrics.CategoricalAccuracy()])

    # `fit` accepts generators directly; `fit_generator` is deprecated.
    # `lr_schedule` is the callback built in the previous cell; the original
    # call here referenced names that are not defined in the notebook.
    train_history = model.fit(
            train_generator,
            steps_per_epoch=EPOCH_STEPS,
            epochs=epochs[i],
            callbacks=[lr_schedule],
            shuffle=True
            )

### TFlearn
TFlearn is a modular and transparent deep learning library built on top of Tensorflow. It was designed to provide a higher-level API to TensorFlow in order to facilitate and speed-up experimentations, while remaining fully transparent and compatible with it.

In [None]:
#install tflearn
!pip install tflearn
import tflearn

In [None]:
# Plain SGD from TFLearn, with every argument spelled out.
opt = tflearn.optimizers.SGD(
    learning_rate=0.001,
    lr_decay=0.0,
    decay_step=100,
    staircase=False,
    use_locking=False,
    name='SGD',
)

In [None]:
# Adam from TFLearn, with every argument spelled out.
opt = tflearn.optimizers.Adam(
    learning_rate=0.001,
    beta1=0.9,
    beta2=0.999,
    epsilon=1e-08,
    use_locking=False,
    name='Adam',
)

In [None]:
# AdaGrad from TFLearn, with every argument spelled out.
opt = tflearn.optimizers.AdaGrad(
    learning_rate=0.001,
    initial_accumulator_value=0.1,
    use_locking=False,
    name='AdaGrad',
)

In [None]:
# Ftrl from TFLearn, with every argument spelled out.
opt = tflearn.optimizers.Ftrl(
    learning_rate=3.0,
    learning_rate_power=-0.5,
    initial_accumulator_value=0.1,
    l1_regularization_strength=0.0,
    l2_regularization_strength=0.0,
    use_locking=False,
    name='Ftrl',
)

In [None]:
# ProximalAdaGrad from TFLearn, with every argument spelled out.
opt = tflearn.optimizers.ProximalAdaGrad(
    learning_rate=0.001,
    initial_accumulator_value=0.1,
    use_locking=False,
    name='AdaGrad',
)

In [None]:
# Nesterov momentum optimizer from TFLearn, with every argument spelled out.
opt = tflearn.optimizers.Nesterov(
    learning_rate=0.001,
    momentum=0.9,
    lr_decay=0.0,
    decay_step=100,
    staircase=False,
    use_locking=False,
    name='Nesterov',
)