In [7]:
import numpy as np
import cPickle as pickle
import gzip

----

In [4]:
class LinearLayer(object):
    def __init__(self, input_dim, output_dim):
        self.W = np.random.randn(input_dim, output_dim)
    def forward_propagation(self, input_):
        return input_.dot(self.W)
    def backward_propagation(self, gradient_wrt_output):
        pass
    def update_parameters(self, input_, gradient_wrt_output, learning_rate):
        pass

In [93]:
class SigmoidLayer(object):
    def __init__(self):
        pass
    def forward_propagation(self, input_):
        return np.apply_along_axis(lambda x : 1 / (1 + np.exp(-x)), axis=1, arr=input_)
    def backward_propagation(self, gradient_wrt_output):
        pass
    def update_parameters(self, input_, gradient_wrt_output, learning_rate):
        pass

In [223]:
class SoftmaxLayer(object):
    def __init__(self):
        pass
    def forward_propagation(self, input_):
        return np.apply_along_axis( lambda x: np.exp(x) / np.sum(np.exp(x)), axis=1, arr=input_)

In [152]:
def one_hot(indices):
    b = np.zeros( (indices.shape[0], np.max(indices)+1) )
    for i in range(0, b.shape[0]):
        b[i, indices[i]-1 ] = 1
    return b

----

In [9]:
with gzip.open("../data/mnist.pkl.gz") as f:
    train_set, valid_set, test_set = pickle.load(f)

In [15]:
Xt, yt = train_set

In [16]:
Xt.shape, yt.shape

((50000, 784), (50000,))

$X \in R^{b \times p}$, where $b$ is the batch size and $p$ is the number of attributes

In [156]:
Xt_batch = Xt[0:10]
yt_batch = one_hot( yt[0:10] )

In [95]:
layers = [LinearLayer(784, 100), SigmoidLayer(), LinearLayer(100, 10), SoftmaxLayer()]

In [183]:
inputs = [Xt_batch]
print "output shape: %s" % str(inputs[0].shape)
for layer in layers:
    inputs.append( layer.forward_propagation(inputs[-1]))
    print "output shape: %s" % (str(inputs[-1].shape))

output shape: (10, 784)
output shape: (10, 100)
output shape: (10, 100)
output shape: (10, 10)
output shape: (10, 10)


In [184]:
activations = inputs[-1]

In [102]:
for row in inputs[-1]:
    assert sum(row) - 1.0 < 1e-6

Let us now define the loss function

In [170]:
def loss(input_, targets):
    assert input_.shape == targets.shape
    return -np.log(np.sum(targets * activations, axis=1))

In [172]:
loss(activations, yt_batch).mean()

10.232014651559407

----

We want to compute $\frac{dL}{dout}$, where $out$ is the softmax output

In [226]:
# dimensions are (bs, k)
dL_wrt_out = activations - yt_batch

In [225]:
dL_wrt_out.shape

(10, 10)

The derivative of a softmax

In [210]:
tmp = activations[0]
tmp

array([  1.12467179e-06,   2.47123412e-02,   3.72375201e-10,
         8.71982089e-09,   6.62640793e-05,   2.06727097e-08,
         9.66170143e-01,   2.71365332e-09,   1.77181747e-06,
         9.04832314e-03])

In [215]:
jacobian = np.zeros( (tmp.shape[0], tmp.shape[0]) )
jacobian

array([[ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.]])

In [244]:
#jacobian_tensor = []
for z in range(0, activations.shape[0]):
    row = activations[z]
    d_softmaxout_wrt_out = np.zeros( (row.shape[0], row.shape[0]) )
    for i in range(0, jacobian.shape[0]):
        for j in range(0, jacobian.shape[1]):
            if i == j:
                d_softmaxout_wrt_out[i,j] = row[i] * (1 - row[i])
            else:
                d_softmaxout_wrt_out[i,j] = row[i]*row[j]
                
    dL_wrt_softmaxout = dL_wrt_out[z:z+1].T
    #jacobian_tensor.append(jacobian)
    print d_softmaxout_wrt_out.T.shape
    print dL_wrt_softmaxout.shape
    print np.dot( d_softmaxout_wrt_out.T, dL_wrt_softmaxout ).shape
    break

(10, 10)
(10, 1)
(10, 1)


In [233]:
activations.shape

(10, 10)