In [1]:
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
from IPython import display
tf.__version__

'2.0.0'

In [65]:
def use_svg_display():
    """Ask IPython to render matplotlib figures as SVG in notebook output."""
    # NOTE(review): newer IPython releases reportedly deprecate this helper in
    # favour of the matplotlib_inline package - confirm before upgrading.
    figure_format = 'svg'
    display.set_matplotlib_formats(figure_format)

def set_figsize(figsize=(3.5, 2.5)):
    """Switch to SVG output and set matplotlib's default figure size.

    Args:
        figsize: Value stored under the 'figure.figsize' rcParams key
            (a ``(width, height)`` pair).
    """
    use_svg_display()
    rc_params = plt.rcParams
    rc_params['figure.figsize'] = figsize
    
def show_trace_2d(f, results):
    """Show the trace of 2D variables during optimization.

    Args:
        f: Callable taking two coordinate arrays ``(x1, x2)`` and returning
            the objective values that are drawn as contour lines.
        results: Sequence of visited points. Each entry may be a plain
            ``(x1, x2)`` pair or an array holding exactly two values, such as
            the ``(1, 2)``-shaped arrays recorded by ``train_2d``.
    """
    set_figsize((3.5, 2.5))
    # Normalise every entry to one (x1, x2) row. Unpacking with zip(*results)
    # only works for flat pairs: with (1, 2)-shaped entries it collapses to a
    # single argument and matplotlib plots the coordinates against the step
    # index instead of drawing the optimisation path.
    points = np.asarray(results).reshape(-1, 2)
    plt.plot(points[:, 0], points[:, 1], '-o', color='#ff7f0e')
    # Fixed plotting window for the contour grid.
    x1, x2 = np.meshgrid(np.arange(-5.5, 1.0, 0.1), np.arange(-3.0, 1.0, 0.1))
    plt.contour(x1, x2, f(x1, x2), colors='#1f77b4')
    plt.xlabel('x1')
    plt.ylabel('x2')
    
def train_2d(trainer):
    """Run 20 steps of a custom trainer on a 2-dim variable and record the path.

    Args:
        trainer: Callable invoked as ``trainer(x, s1, s2)`` that returns the
            updated ``(x, s1, s2)`` triple.

    Returns:
        List with the starting point followed by the value of ``x`` after
        each step, each stored as the NumPy array returned by ``x.numpy()``.
    """
    # Starting point of the optimisation.
    x = tf.constant([-5.0, -2.0], shape=(1, 2))
    # s1 and s2 are internal state variables and will
    # be used later in the chapter
    s1 = tf.constant([0.0, 0.0], shape=(1, 2))
    s2 = tf.constant([0.0, 0.0], shape=(1, 2))

    results = [x.numpy()]
    for step in range(20):
        x, s1, s2 = trainer(x, s1, s2)
        results.append(x.numpy())
    # Report only the final step.
    print('epoch :', step + 1, '  x: ', x.numpy())
    return results

## 介绍
AdaDelta 算法同样是为了解决 AdaGrad 算法在迭代后期学习率过小、难以找到有用解的问题。有趣的是，AdaDelta 算法没有学习率这个超参数。
AdaDelta算法也像RMSProp算法一样，使用了小批量随机梯度$g_t$按元素平方的指数加权移动平均变量$s_t$。在时间步0，它的所有元素被初始化为0。给定超参数$0 \leq \rho < 1$ （对应RMSProp算法中的 γ ），在时间步 t>0 ，同RMSProp算法一样计算
$$\boldsymbol{s}_t \leftarrow \rho \boldsymbol{s}_{t-1} + (1 - \rho) \boldsymbol{g}_t \odot \boldsymbol{g}_t.$$

与RMSProp算法不同的是，AdaDelta算法还维护一个额外的状态变量$\Delta\boldsymbol{x}_t$ ，其元素同样在时间步0时被初始化为0。我们使用 $\Delta\boldsymbol{x}_{t-1}$ 来计算自变量的变化量
$$\boldsymbol{g}_t' \leftarrow \sqrt{\frac{\Delta\boldsymbol{x}_{t-1} + \epsilon}{\boldsymbol{s}_t + \epsilon}}   \odot \boldsymbol{g}_t,$$
接着更新自变量:
$$\boldsymbol{x}_t \leftarrow \boldsymbol{x}_{t-1} - \boldsymbol{g}'_t.$$
最后，我们使用$\Delta\boldsymbol{x}_t$ 来记录自变量变化量$\boldsymbol{g}'_t$按元素平方的指数加权移动平均：
$$\Delta\boldsymbol{x}_t \leftarrow \rho \Delta\boldsymbol{x}_{t-1} + (1 - \rho) \boldsymbol{g}'_t \odot \boldsymbol{g}'_t.$$

可以看到，如不考虑 $\epsilon$ 的影响，AdaDelta算法与RMSProp算法的不同之处在于使用$\sqrt{\Delta\boldsymbol{x}_{t-1}}$来替代超参数 $\eta$（学习率）。

In [38]:
def fx(x):
    """Noisy linear target: ``sum(x * [4, -3], axis=1) + 5.0`` plus normal noise.

    Args:
        x: Tensor whose axis 1 is multiplied element-wise by the two weights
            and summed; ``x.shape[0]`` sets the number of noise samples.

    Returns:
        Tensor with one noisy target value per row of ``x``.
    """
    w = tf.constant([4, -3], dtype=tf.float32)
    linear_part = tf.reduce_sum(x * w, axis=1)
    # One noise sample per row, drawn with tf.random.normal's defaults.
    noise = tf.random.normal(shape=(x.shape[0],))
    return linear_part + 5.0 + noise

# Fix TensorFlow's global seed so the synthetic data (and the later random
# ops that derive from it) are reproducible after Restart & Run All.
tf.random.set_seed(42)

# 1000 two-feature samples and their noisy linear targets.
x = tf.random.normal(shape=(1000, 2))
y = fx(x)

def create_ds(x, y, batch_size, shuffle_buffer=1000):
    """Build a shuffled, batched, repeating dataset of ``(x, y)`` pairs.

    Args:
        x: Feature tensor, sliced along its first axis.
        y: Target tensor, sliced along its first axis together with ``x``.
        batch_size: Number of examples per batch.
        shuffle_buffer: Size of the shuffle buffer. The default of 1000 keeps
            the previous behaviour; pass at least the number of examples to
            get a full shuffle on larger datasets.

    Returns:
        A ``tf.data.Dataset`` yielding ``(x_batch, y_batch)`` tuples. It is
        repeated without a count, so the consumer must bound iteration
        itself (for example with ``steps_per_epoch``).
    """
    ds = tf.data.Dataset.from_tensor_slices((x, y))
    # Shuffle before batching so batch membership is shuffled, not just
    # batch order.
    return ds.shuffle(shuffle_buffer).batch(batch_size).repeat()

# Training pipeline with batches of 20 examples.
ds = create_ds(x, y, batch_size=20)

# Pull a single batch for interactive inspection.
temp_x, temp_y = next(iter(ds))

In [39]:
class MyLayer(tf.keras.layers.Layer):
    """Linear layer computing ``reduce_sum(x * w, axis=1) + b``.

    Both weights are created in the constructor with fixed shapes:
    ``w`` is ``(1, 2)`` and ``b`` is ``(1,)``.
    """

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        # Trainable weight row, broadcast against every input row in call().
        self.w = self.add_weight(
            name="w",
            shape=(1, 2),
            dtype=tf.float32,
            initializer=tf.keras.initializers.he_normal(),
            trainable=True,
        )
        # Trainable scalar offset added to each output.
        self.b = self.add_weight(
            name="b",
            shape=(1,),
            dtype=tf.float32,
            initializer=tf.keras.initializers.he_normal(),
            trainable=True,
        )

    def call(self, x):
        """Return the weighted sum over axis 1 of ``x`` plus the bias."""
        weighted = x * self.w
        summed = tf.reduce_sum(weighted, axis=1, name="sum")
        return summed + self.b

# Single-layer model: the custom linear layer is the whole network.
model = tf.keras.Sequential([MyLayer()])

In [62]:
# Fit with Adadelta on mean squared error, tracking MAE. learning_rate=1 is
# passed explicitly; the update rule described above has no learning-rate
# hyperparameter of its own.
model.compile(
    optimizer=tf.keras.optimizers.Adadelta(learning_rate=1, rho=0.95),
    loss=tf.losses.mean_squared_error,
    metrics=["mae"],
)
# ds repeats without a count, so steps_per_epoch bounds each epoch.
history = model.fit(ds, steps_per_epoch=100, epochs=20)

Train for 100 steps
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [63]:
# Inspect the fitted parameters. The training targets come from fx, which
# uses weights [4, -3] and offset 5.0, so w and b can be compared to those.
model.variables

[<tf.Variable 'w:0' shape=(1, 2) dtype=float32, numpy=array([[ 3.9851315, -3.0006926]], dtype=float32)>,
 <tf.Variable 'b:0' shape=(1,) dtype=float32, numpy=array([5.002375], dtype=float32)>]