In [1]:
import tensorflow as tf

from d2l.tensorflow import activations, config

config.setup()

Tensorflow running on CPU


## Product of Random Matrices

In [2]:
def prod_rand_matrices(scale, k):
    Y = tf.linalg.diag(tf.ones(k))
    for i in range(100):
        W = tf.random.normal(shape=(k,k), stddev=scale)
        Y = tf.matmul(W, Y)
    return Y

In [3]:
print(prod_rand_matrices(.5, 4))
print(prod_rand_matrices(.7, 4))

tf.Tensor(
[[-2.1154235e-06 -1.1732027e-06  5.2702433e-07  5.2780308e-07]
 [-4.5664733e-06 -2.5325417e-06  1.1376648e-06  1.1393455e-06]
 [ 1.5081496e-05  8.3641171e-06 -3.7573168e-06 -3.7628677e-06]
 [ 2.9725579e-06  1.6485643e-06 -7.4056584e-07 -7.4165985e-07]], shape=(4, 4), dtype=float32)
tf.Tensor(
[[ -123450.984   135300.22    -71703.45    -73859.516]
 [-1506923.9    1651563.     -875273.8    -901579.94 ]
 [ 2089105.8   -2289624.     1213433.     1249895.6  ]
 [ -432368.3     473867.88   -251139.94   -258683.12 ]], shape=(4, 4), dtype=float32)


## Synthetic Gradients for MLP

In [4]:
def synthetic_grad(k, sigma, d_sigma, get_weight):
    res = []
    for repeat in range(10):
        x = tf.random.normal(shape=(k,))
        h = tf.ones(k)
        Y = tf.linalg.diag(tf.ones(k))
        for i in range(50):
            W = get_weight((k, k))
            Wh = tf.tensordot(W, h, axes=1)
            Y = tf.matmul(tf.cast(d_sigma(Wh), Y.dtype)*tf.linalg.matrix_transpose(W), Y)
            h = sigma(Wh)
        res.append(tf.reduce_mean(tf.abs(Y)))
    return sum(res)/len(res)

## ReLU 

In [5]:
k = 100
sigma = activations.relu
d_sigma = lambda x : x > 0

def get_weight(scale):
    return lambda shape : tf.random.normal(stddev=scale, shape=shape)

for scale in [0.1, 0.2, 0.4, 0.8]:
    print('scale: {}, gradient mean: {}'.format(scale, synthetic_grad(k, sigma, d_sigma, get_weight(scale))))

scale: 0.1, gradient mean: 1.508836611208153e-09
scale: 0.2, gradient mean: 1234917.875
scale: 0.4, gradient mean: 1.4625296237647563e+21
scale: 0.8, gradient mean: inf


## Xavier Initialization

In [6]:
scale = (6.0/(k+k))**.5

xavier = lambda shape : tf.random.uniform(minval=-scale, maxval=scale, shape=shape)

print('scale: {}, gradient mean: {}'.format(scale, synthetic_grad(k, sigma, d_sigma, xavier)))

scale: 0.17320508075688773, gradient mean: 1.0979059883453601e-09


## Sigmoid

In [7]:
sigma = activations.sigmoid

d_sigma = lambda x : (1 - activations.sigmoid(x)) * activations.sigmoid(x)

for scale in [0.1, 0.2, 0.4, 0.8]:
   print('scale: {}, gradient mean: {}'.format(scale, synthetic_grad(k, sigma, d_sigma, get_weight(scale))))

scale: 0.1, gradient mean: 2.5721869837685743e-33
scale: 0.2, gradient mean: 3.481075305139694e-21
scale: 0.4, gradient mean: 3.0426941348415948e-12
scale: 0.8, gradient mean: 3.7135923776077107e-05


## Scaled Sigmoid

In [8]:
sigma = lambda x: 4 * activations.sigmoid(x) - 2

d_sigma = lambda x : (1 - activations.sigmoid(x)) * activations.sigmoid(x)

for scale in [0.1, 0.2, 0.4, 0.8]:
   print('scale: {}, gradient mean: {}'.format(scale, synthetic_grad(k, sigma, d_sigma, get_weight(scale))))

scale: 0.1, gradient mean: 1.2319835400034187e-32
scale: 0.2, gradient mean: 7.945435420778586e-29
scale: 0.4, gradient mean: 4.8646900012430355e-23
scale: 0.8, gradient mean: 5.992310100804416e-17
