In [1]:
import tensorflow as tf

from d2l.tensorflow import activations, config

config.setup()

## Product of Random Matrices

In [2]:
def prod_rand_matrices(scale, k):
    Y = tf.linalg.diag(tf.ones(k))
    for i in range(100):
        W = tf.random.normal(shape=(k,k), stddev=scale)
        Y = tf.matmul(W, Y)
    return Y

In [3]:
print(prod_rand_matrices(.5, 4))
print(prod_rand_matrices(.7, 4))

tf.Tensor(
[[-4.2779675e-06 -3.5363096e-06 -3.0927210e-06 -2.3479388e-05]
 [ 5.3980875e-06  4.4622416e-06  3.9025076e-06  2.9627141e-05]
 [-1.1556385e-05 -9.5528976e-06 -8.3546038e-06 -6.3426662e-05]
 [-2.4241519e-05 -2.0038859e-05 -1.7525233e-05 -1.3304842e-04]], shape=(4, 4), dtype=float32)
tf.Tensor(
[[ 2.1075683e+09 -4.7513046e+08  2.1987050e+08 -1.3153852e+09]
 [ 1.5215660e+09 -3.4302051e+08  1.5873787e+08 -9.4964397e+08]
 [ 1.6398280e+09 -3.6968288e+08  1.7107405e+08 -1.0234568e+09]
 [-6.0103840e+07  1.3549240e+07 -6.2708960e+06  3.7511200e+07]], shape=(4, 4), dtype=float32)


## Synthetic Gradients for MLP

In [4]:
def synthetic_grad(k, sigma, d_sigma, get_weight):
    res = []
    for repeat in range(10):
        x = tf.random.normal(shape=(k,))
        h = tf.ones(k)
        Y = tf.linalg.diag(tf.ones(k))
        for i in range(50):
            W = get_weight((k, k))
            Wh = tf.tensordot(W, h, axes=1)
            Y = tf.matmul(tf.cast(d_sigma(Wh), Y.dtype)*tf.linalg.matrix_transpose(W), Y)
            h = sigma(Wh)
        res.append(tf.reduce_mean(tf.abs(Y)))
    return sum(res)/len(res)

## ReLU 

In [5]:
k = 100
sigma = activations.relu
d_sigma = lambda x : x > 0

def get_weight(scale):
    return lambda shape : tf.random.normal(stddev=scale, shape=shape)

for scale in [0.1, 0.2, 0.4, 0.8]:
    print('scale: {}, gradient mean: {}'.format(scale, synthetic_grad(k, sigma, d_sigma, get_weight(scale))))

scale: 0.1, gradient mean: 9.869954853414242e-10
scale: 0.2, gradient mean: 1718322.25
scale: 0.4, gradient mean: 1.2986619291982302e+21
scale: 0.8, gradient mean: inf


## Xavier Initialization

In [6]:
scale = (6.0/(k+k))**.5

xavier = lambda shape : tf.random.uniform(minval=-scale, maxval=scale, shape=shape)

print('scale: {}, gradient mean: {}'.format(scale, synthetic_grad(k, sigma, d_sigma, xavier)))

scale: 0.17320508075688773, gradient mean: 1.2340025667967325e-09


## Sigmoid

In [7]:
sigma = activations.sigmoid

d_sigma = lambda x : (1 - activations.sigmoid(x)) * activations.sigmoid(x)

for scale in [0.1, 0.2, 0.4, 0.8]:
   print('scale: {}, gradient mean: {}'.format(scale, synthetic_grad(k, sigma, d_sigma, get_weight(scale))))

scale: 0.1, gradient mean: 2.8542174658896117e-33
scale: 0.2, gradient mean: 3.0173028430005577e-21
scale: 0.4, gradient mean: 2.605374032835228e-12
scale: 0.8, gradient mean: 3.6048055335413665e-05


## Scaled Sigmoid

In [8]:
sigma = lambda x: 4 * activations.sigmoid(x) - 2

d_sigma = lambda x : (1 - activations.sigmoid(x)) * activations.sigmoid(x)

for scale in [0.1, 0.2, 0.4, 0.8]:
   print('scale: {}, gradient mean: {}'.format(scale, synthetic_grad(k, sigma, d_sigma, get_weight(scale))))

scale: 0.1, gradient mean: 1.1707341864486327e-32
scale: 0.2, gradient mean: 1.1733920779173673e-28
scale: 0.4, gradient mean: 8.963709714612384e-23
scale: 0.8, gradient mean: 8.619116765428632e-17
