# Numerical Stability and Initialization

In [1]:
%matplotlib inline
import math
from mxnet import nd, autograd
from matplotlib import pyplot as plt
from IPython import display
display.set_matplotlib_formats('svg')

## Product of Random Matrices

In [2]:
def prod_rand_matrices(scale, k, num_matrices=100):
    """Return the product of `num_matrices` random Gaussian k-by-k matrices.

    Starts from the k-by-k identity and repeatedly left-multiplies it by a
    freshly sampled matrix whose entries are drawn via
    `nd.random.normal(scale=scale)`.

    Args:
        scale: `scale` argument forwarded to `nd.random.normal` for each factor.
        k: Side length of the square matrices.
        num_matrices: How many random factors to multiply together
            (defaults to 100, the value that used to be hard-coded).

    Returns:
        The (k, k) NDArray W_n ... W_2 W_1.
    """
    Y = nd.diag(nd.ones(k))  # identity: the empty product
    for _ in range(num_matrices):
        W = nd.random.normal(shape=(k, k), scale=scale)
        Y = nd.dot(W, Y)
    return Y

## Sensitivity to the Weight Scale

In [3]:
# Show the product of random 4x4 matrices at two nearby weight scales.
for weight_scale in (.5, .7):
    print(prod_rand_matrices(weight_scale, 4))


[[ 7.7827990e-06  6.3514190e-06 -6.3311933e-07 -4.8334055e-06]
 [-1.2880527e-05 -1.0511586e-05  1.0478124e-06  7.9992806e-06]
 [-3.1772654e-06 -2.5929123e-06  2.5846600e-07  1.9731981e-06]
 [ 3.1247484e-05  2.5500563e-05 -2.5419379e-06 -1.9405838e-05]]
<NDArray 4x4 @cpu(0)>

[[ 9.0700534e+09 -1.3179782e+10  3.0834102e+10 -5.8214457e+10]
 [-1.0173414e+10  1.4783087e+10 -3.4585031e+10  6.5296171e+10]
 [-1.4496120e+10  2.1064452e+10 -4.9280287e+10  9.3040656e+10]
 [-3.3476849e+09  4.8645524e+09 -1.1380619e+10  2.1486488e+10]]
<NDArray 4x4 @cpu(0)>


## Synthetic Gradients for MLP


In [4]:
def synthetic_grad(k, sigma, d_sigma, get_weight, num_layers=50, num_repeats=10):
    """Average magnitude of a synthetic gradient product through a deep MLP.

    For each repeat, an all-ones activation vector of length k is pushed
    through `num_layers` layers `h <- sigma(W h)` with freshly sampled
    weights, while accumulating the product of the per-layer factors
    `d_sigma(W h) * W.T`. The mean absolute entry of that accumulated
    product is recorded, and the average over all repeats is returned.

    Args:
        k: Width of every layer (weights are requested with shape (k, k)).
        sigma: Activation function applied to the pre-activation `W h`.
        d_sigma: Derivative of `sigma`, evaluated at the pre-activation.
            Presumably elementwise, so its result has shape (k,) and
            broadcasts against `W.T` -- confirm for custom callables.
        get_weight: Callable taking a shape tuple and returning a weight
            matrix of that shape.
        num_layers: Number of layers to chain (defaults to 50, the value
            that used to be hard-coded).
        num_repeats: Number of independent trials to average over
            (defaults to 10, the value that used to be hard-coded).

    Returns:
        Python float: mean over repeats of `Y.abs().mean()`.

    Raises:
        ValueError: If `num_repeats` is less than 1 (nothing to average).
    """
    if num_repeats < 1:
        raise ValueError('num_repeats must be at least 1')
    res = []
    for _ in range(num_repeats):
        # The forward pass starts from an all-ones activation. The random
        # input that used to be sampled here was never read, so it is gone.
        h = nd.ones(k)
        Y = nd.diag(nd.ones(k))  # identity: the empty product
        for _ in range(num_layers):
            W = get_weight((k, k))
            Wh = nd.dot(W, h)
            # Fold this layer's factor into the running product.
            Y = nd.dot(d_sigma(Wh) * W.T, Y)
            h = sigma(Wh)
        res.append(Y.abs().mean().asscalar())
    return sum(res) / len(res)

## ReLU 



In [5]:
# Layer width shared by the experiments below.
k = 100

# ReLU activation and its derivative (an indicator of positive inputs).
sigma = nd.relu


def d_sigma(x):
    return x > 0


def get_weight(scale):
    """Build a sampler that draws Gaussian weights with the given `scale`."""
    def sample(shape):
        return nd.random.normal(scale=scale, shape=shape)
    return sample


# Sweep the weight scale and report the average gradient magnitude.
for scale in [0.1, 0.2, 0.4, 0.8]:
    print(
        'scale', scale,
        'gradient mean', synthetic_grad(k, sigma, d_sigma, get_weight(scale)),
    )

scale 0.1 gradient mean 1.500646751306789e-09
scale 0.2 gradient mean 942470.1375
scale 0.4 gradient mean 1.5052397923059011e+21
scale 0.8 gradient mean nan


## Xavier

In [6]:
# Xavier (Glorot) uniform bound sqrt(6 / (fan_in + fan_out)); here both fans equal k.
scale = (6.0/(k+k))**.5


def xavier(shape, bound=scale):
    """Sample weights uniformly from [-bound, bound].

    `bound` is captured as a default argument so the sampler keeps the
    Xavier bound computed above even if the global `scale` is reassigned
    afterwards (for example by a later `for scale in ...` loop).
    """
    return nd.random.uniform(low=-bound, high=bound, shape=shape)


synthetic_grad(k, sigma, d_sigma, xavier)

1.1741994654368427e-09

## Sigmoid

In [7]:
# Logistic sigmoid activation and its derivative (1 - s) * s, where s = sigmoid(x).
sigma = nd.sigmoid


def d_sigma(x):
    s = nd.sigmoid(x)
    return (1 - s) * s


# Same scale sweep as before, now with the sigmoid activation.
for scale in [0.1, 0.2, 0.4, 0.8]:
    print(
        'scale', scale,
        'gradient mean', synthetic_grad(k, sigma, d_sigma, get_weight(scale)),
    )

scale 0.1 gradient mean 2.6142704253955312e-33
scale 0.2 gradient mean 3.3892738677569894e-21
scale 0.4 gradient mean 3.753343100730555e-12
scale 0.8 gradient mean 5.702634398403461e-05


## Scaled Sigmoid


In [8]:
# Scaled and shifted sigmoid 4 * sigmoid(x) - 2, with derivative 4 * (1 - s) * s.
def sigma(x):
    return 4 * nd.sigmoid(x) - 2


def d_sigma(x):
    s = nd.sigmoid(x)
    return 4 * (1 - s) * s


# Same scale sweep as before, now with the scaled sigmoid activation.
for scale in [0.1, 0.2, 0.4, 0.8]:
    print(
        'scale', scale,
        'gradient mean', synthetic_grad(k, sigma, d_sigma, get_weight(scale)),
    )

scale 0.1 gradient mean 0.014177062083035707
scale 0.2 gradient mean 127.29893550872802
scale 0.4 gradient mean 53951930.55
scale 0.8 gradient mean 128571859009536.0
