In [1]:
import torch as th
from torch import nn
import torch.nn.functional as F
from torch.autograd import Variable
import numpy as np
%load_ext autoreload 
%autoreload 2

from matplotlib import pyplot as plt
%matplotlib inline

Test LSTMCell
=========

In [8]:
def rand_inputs(shape, precision=1):
    inputs = th.randn(*shape)*(10**1)
    inputs = inputs.round()/(10**1) 
    return inputs

In [16]:
T = 6
batch_size = 3
input_dim = 10
output_dim = 5
%precision 3
rnn = nn.LSTMCell(input_dim, output_dim, bias=True)
inputs = Variable(rand_inputs([T, batch_size, input_dim]))
h = Variable(rand_inputs([batch_size, output_dim], precision=2))
c = Variable(rand_inputs([batch_size, output_dim], precision=2))
output = []
for t in range(T):
    print(t, 'inputs:', inputs[t].size())
    h_new, c_new = rnn(inputs[t], (h, c))
    print('h_old:', h.data.numpy(), sep='\n')
    print('h_new:', h_new.data.numpy(), sep='\n')
    h, c = h_new, c_new

0 inputs: torch.Size([3, 10])
h_old:
[[ 0.9 -1.8 -0.2 -0.8  1. ]
 [-0.2 -0.1 -1.7 -0.1 -0.1]
 [-0.7 -0.3  0.1 -0.5  1.2]]
h_new:
[[ 0.007  0.06   0.071  0.314  0.225]
 [ 0.092  0.009  0.028  0.005 -0.171]
 [-0.01  -0.135  0.164  0.351  0.098]]
1 inputs: torch.Size([3, 10])
h_old:
[[ 0.007  0.06   0.071  0.314  0.225]
 [ 0.092  0.009  0.028  0.005 -0.171]
 [-0.01  -0.135  0.164  0.351  0.098]]
h_new:
[[ 0.003  0.346 -0.073  0.19   0.087]
 [-0.1    0.204  0.063  0.095 -0.181]
 [-0.189 -0.103  0.182  0.152 -0.101]]
2 inputs: torch.Size([3, 10])
h_old:
[[ 0.003  0.346 -0.073  0.19   0.087]
 [-0.1    0.204  0.063  0.095 -0.181]
 [-0.189 -0.103  0.182  0.152 -0.101]]
h_new:
[[-0.153  0.413  0.224  0.188 -0.021]
 [-0.242  0.292  0.237  0.214 -0.309]
 [-0.161  0.192 -0.038  0.098  0.054]]
3 inputs: torch.Size([3, 10])
h_old:
[[-0.153  0.413  0.224  0.188 -0.021]
 [-0.242  0.292  0.237  0.214 -0.309]
 [-0.161  0.192 -0.038  0.098  0.054]]
h_new:
[[-0.116  0.293 -0.041  0.243 -0.061]
 [-0.183  0

LSTM ForgetGates initialization:
================

In [54]:
def test_lstm(cell, batch_size):
    bias_size = cell.bias_ih.size()[0]
    forget_start, forget_end = bias_size//4, bias_size//2
    print('forgetgate coords:', forget_start, forget_end)
    print('bias_ih:', cell.bias_ih.data.numpy())
    cell.bias_ih.data.fill_(0.)
    cell.bias_ih.data[forget_start:forget_end].fill_(5.)
    
    print('bias_ih:', cell.bias_ih.data.numpy())
    inputs = th.ones(batch_size, cell.input_size)
    gates = F.linear(Variable(inputs), cell.weight_ih, cell.bias_ih)
    
    print('all gates_shape:', gates.size())
    gates = gates.chunk(4, 1)
    gate_names = ['ingate', 'forgetgate','cellgate', 'outgate']
    gate_funcs = [F.sigmoid, F.sigmoid, F.tanh, F.sigmoid]
        
    for name,gate, nl in zip(gate_names, gates, gate_funcs):
        out = nl(gate)
        print('{0}:'.format(name), 'mean: {0}'.format(out.mean().data.numpy()), out.data.numpy(), sep='\n')

rnn = nn.LSTMCell(input_dim, output_dim, bias=True)
test_lstm(rnn, 1)

forgetgate coords: 5 10
bias_ih: [ 0.306  0.313  0.295  0.409 -0.055 -0.371 -0.002 -0.3    0.189 -0.429
 -0.252  0.005 -0.355 -0.06   0.18  -0.045 -0.328 -0.078  0.005  0.049]
bias_ih: [ 0.  0.  0.  0.  0.  5.  5.  5.  5.  5.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.]
all gates_shape: torch.Size([1, 20])
ingate:
mean: [ 0.514]
[[ 0.454  0.722  0.557  0.408  0.429]]
forgetgate:
mean: [ 0.993]
[[ 0.996  0.998  0.996  0.979  0.996]]
cellgate:
mean: [ 0.149]
[[-0.89   0.258 -0.142  0.923  0.596]]
outgate:
mean: [ 0.506]
[[ 0.65   0.38   0.588  0.572  0.341]]


In [21]:
def check_lstm_init():
    from network import LSTMNetwork, FFNetwork
    net = LSTMNetwork(4, th.cuda.FloatTensor)
    bias_ih, bias_hh = net.lstm.bias_ih, net.lstm.bias_hh
    print('lstm_bias size:', bias_ih.size())
    hidden_size = net.lstm.hidden_size
    gates_names = ['ingate', 'forgetgate', 'cellgate', 'outgate']
    for i, gate_name in enumerate(gates_names):
        gate_ih = bias_ih.data[hidden_size*i:hidden_size*(i+1)]
        gate_hh = bias_hh.data[hidden_size*i:hidden_size*(i+1)]
        print('{0:<12} size:{1}'.format(gate_name, gate_ih.size()[0]), end=' ')
        print('biases mins:', (gate_ih.min(), gate_hh.min()), end=' ')
        print('biases maxs:', (gate_ih.max(), gate_hh.max()))

check_lstm_init()

lstm_bias size: torch.Size([1024])
ingate       size:256 biases mins: (0.0, 0.0) biases maxs: (0.0, 0.0)
forgetgate   size:256 biases mins: (0.5, 0.5) biases maxs: (0.5, 0.5)
cellgate     size:256 biases mins: (0.0, 0.0) biases maxs: (0.0, 0.0)
outgate      size:256 biases mins: (0.0, 0.0) biases maxs: (0.0, 0.0)


In [24]:
def check_net_init():
    print('init LSTMnet:')
    from network import LSTMNetwork, FFNetwork
    net = LSTMNetwork(4, th.cuda.FloatTensor)
    print('\n\ninit FFnet:')
    net = FFNetwork(4, th.cuda.FloatTensor)
    
check_net_init()

init LSTMnet:
CONV2D_INIT: Conv2d(3, 16, kernel_size=(8, 8), stride=(4, 4))
CONV2D_INIT: Conv2d(16, 32, kernel_size=(4, 4), stride=(2, 2))
LSTM_INIT: LSTMCell(2592, 256)
LINEAR_INIT: Linear (256 -> 4)
LINEAR_INIT: Linear (256 -> 1)


init FFnet:
CONV2D_INIT: Conv2d(4, 16, kernel_size=(8, 8), stride=(4, 4))
CONV2D_INIT: Conv2d(16, 32, kernel_size=(4, 4), stride=(2, 2))
LINEAR_INIT: Linear (2592 -> 256)
LINEAR_INIT: Linear (256 -> 4)
LINEAR_INIT: Linear (256 -> 1)


Simple example to check gradients correctness after clone() and inplace operations:
=================

In [24]:
def simple_example(W_prime_grads=False):
    def create_W_and_W_prime():
        W = Variable(th.zeros(4,3), requires_grad=True)
        W_prime = Variable(th.zeros(4,3), requires_grad=W_prime_grads)
        return W, W_prime
    
    def id_fun(a, b):
        return a
    
    def complex_fun(a, b):
        output = a.clone()
        output[change_rows,:] = b[change_rows,:]
        return output
    
    def print_grads(title, W, W_prime):
        print(title)
        print('W.grads:', W.grad, sep='\n')
        print('W_prime.grads:', W_prime.grad, sep='\n')

    T = Variable(th.ones(4,3)) # Target
    l1 = nn.L1Loss()
    change_rows = th.Tensor([0,1,1,0]).nonzero().view(-1)
    
    W, W_prime = create_W_and_W_prime()
    loss = l1(id_fun(W, W_prime), T)
    loss.backward()
    print_grads('grads for the identity function:', W, W_prime)
    print('\n\n')
    W, W_prime = create_W_and_W_prime()
    loss = l1(complex_fun(W, W_prime), T)
    loss.backward()
    print_grads('grads for the complex function:', W, W_prime)
    
    
simple_example(True)

grads for the identity function:
W.grads:
Variable containing:
1.00000e-02 *
 -8.3333 -8.3333 -8.3333
 -8.3333 -8.3333 -8.3333
 -8.3333 -8.3333 -8.3333
 -8.3333 -8.3333 -8.3333
[torch.FloatTensor of size 4x3]

W_prime.grads:
None



grads for the complex function:
W.grads:
Variable containing:
1.00000e-02 *
 -8.3333 -8.3333 -8.3333
  0.0000  0.0000  0.0000
  0.0000  0.0000  0.0000
 -8.3333 -8.3333 -8.3333
[torch.FloatTensor of size 4x3]

W_prime.grads:
Variable containing:
1.00000e-02 *
  0.0000  0.0000  0.0000
 -8.3333 -8.3333 -8.3333
 -8.3333 -8.3333 -8.3333
  0.0000  0.0000  0.0000
[torch.FloatTensor of size 4x3]

