In [1]:
import torch
import torch.nn as nn
import numpy as np
from torch.utils.data import Dataset, DataLoader
from torch.autograd import Variable
import torch.optim as optim

In [2]:
class AdditionDataset(Dataset):
    """Addition dataset as introduced in the original LSTM paper.
    This implementation is from p.11 of 'On the difficulty of training recurrent neural networks' """

    def __init__(self, dataset_length, len_sequence):
        self.dataset_length = dataset_length  # This is what is returned by len(), see def __len__(self) below
        self.t = len_sequence  # Length of sequence
        # Check that sequence length is at least 10
        # If not, there is no randomness in the position of the first number to be added
        assert (self.t > 10), 'Sequence length must be at least 10'

    def __len__(self):
        return self.dataset_length

    def __getitem__(self, dummy_index):
        # The dummy index is required for the dataloader to work,
        # but since we are sampling data randomly it has no effect

        # Sample the length of the sequence and positions of numbers to add
        t_dash = np.random.randint(self.t, int(self.t * 11.0 / 10.0))  # Length of the sequence
        t_1 = np.random.randint(0, int(t_dash / 10.0))  # Indicator of position of first number to add
        t_2 = np.random.randint(int(t_dash / 10.0), int(t_dash / 2.0))  # Indicator of position of second number to add

        # We generate random numbers uniformly sampled from [0,1]
        # as depicted in Figure 2 of
        # "Learning Recurrent Neural Networks with Hessian-Free Optimization"
        # Details of how to sample the numbers was not given in
        # "On the difficulty of training recurrent neural networks"
        sequence = torch.zeros((2, t_dash))  # Initialize empty sequence
        sequence[0, :] = torch.rand((1, t_dash))  # Make first row random numbers

        # Set second row to indicate which numbers to add
        sequence[1, t_1] = 1.0
        sequence[1, t_2] = 1.0

        # Calculate target
        target = torch.Tensor([sequence[0, t_1] + sequence[0, t_2]])

        # Collect sequence and target into a sample
        sample = (sequence, target)

        return sample

In [3]:
def addition_problem(train_length, test_length, sequence_length, num_workers=4):
    """
    This is the addition problem


    Args:
        train_length:       Number of training examples for each epoch
        test_length:        Number of test examples for each test
        sequence_length:    Length of each sequence
        num_workers:        Number of workers loading the data

    Returns:
        train_loader    Loads training data
        test_loader     Loads test data

    """
    # Batch size should be 1 to prevent sequences in the same batch having different lengths
    batch_size = 1

    train_loader = DataLoader(AdditionDataset(train_length, sequence_length),
                              batch_size=batch_size,
                              num_workers=num_workers)
    test_loader = DataLoader(AdditionDataset(test_length, sequence_length),
                             batch_size=batch_size,
                             num_workers=num_workers)
    return train_loader, test_loader

In [4]:
class RNN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(RNN, self).__init__()

        self.hidden_size = hidden_size

        self.i2h = nn.Linear(input_size + hidden_size, hidden_size)
        self.i2o = nn.Linear(input_size + hidden_size, output_size)

    def forward(self, input, hidden):
        combined = torch.cat((input, hidden))
        hidden = self.i2h(combined)
#         hidden = nn.functional.sigmoid(hidden)
        output = self.i2o(combined)
        return output, hidden

    def initHidden(self):
        return Variable(torch.zeros(self.hidden_size))

In [21]:
class Isgd_LSTM(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(Isgd_LSTM, self).__init__()

        self.hidden_size = hidden_size

        self.out2output = nn.Linear(hidden_size, output_size)

        self.lstm = nn.LSTM(input_size, hidden_size)
        
    def forward(self, input, hidden):
        out, hidden = self.lstm(input, hidden)
        output = self.out2output(out)
        return output, hidden

    def initHidden(self):
        h0 = Variable(torch.zeros(1, 1, self.hidden_size)) # [1 x 1 x h]
        c0 = Variable(torch.zeros(1, 1, self.hidden_size)) # [1 x 1 x h]
        return (h0, c0)

In [23]:
# See if can do one feed forward through the LSTM
input = data[:,0].unsqueeze(0).unsqueeze(0) # [1 x 1 x 2]
h0 = Variable(torch.zeros(1, 1, n_hidden)) # [1 x 1 x h]
c0 = Variable(torch.zeros(1, 1, n_hidden)) # [1 x 1 x h]
hidden = (h0, c0)
# print('input: ', input.size())
# print('hidden: ', hidden[0].size())

# lstm = nn.LSTM(input_size, n_hidden)
# out2output = nn.Linear(n_hidden, 1)

# out, hidden = lstm(input, hidden)
# output = out2output(out)

output, hidden = isgd_lstm(input, hidden)

print('output: ', output)
print('hidden: ', hidden)

output:  Variable containing:
(0 ,.,.) = 
  0.1434
[torch.FloatTensor of size 1x1x1]

hidden:  (Variable containing:
(0 ,.,.) = 
 -0.0619 -0.0115 -0.1607 -0.0478 -0.0190
[torch.FloatTensor of size 1x1x5]
, Variable containing:
(0 ,.,.) = 
 -0.1449 -0.0194 -0.2813 -0.0877 -0.0633
[torch.FloatTensor of size 1x1x5]
)


In [6]:
# Load datasets
train_data = AdditionDataset(dataset_length=8, len_sequence=11)
train_loader, test_loader = addition_problem(train_length=8, test_length=9, sequence_length=11)

In [7]:
# Take a single sample from the dataset
for i_batch, (data, target) in enumerate(test_loader):
    data = Variable(data[0,:,:]) # [1 x 2 x t]
    target = Variable(target)
    break

print(data)
print(target)

Variable containing:

Columns 0 to 9 
 0.7184  0.2618  0.5362  0.8539  0.5778  0.3738  0.2416  0.4440  0.0909  0.9868
 1.0000  0.0000  0.0000  0.0000  1.0000  0.0000  0.0000  0.0000  0.0000  0.0000

Columns 10 to 10 
 0.9027
 0.0000
[torch.FloatTensor of size 2x11]

Variable containing:
 1.2963
[torch.FloatTensor of size 1x1]



In [8]:
# Set neural network parameters
n_hidden = 5
input_size = 2
output_size = 1

In [22]:
# Initialize RNN and lstm
rnn = RNN(2, n_hidden, 1)
isgd_lstm = Isgd_LSTM(2, n_hidden, 1)

In [10]:
# See if can do one feed forward through the RNN
input = data[:,0] # [2]
hidden = Variable(torch.zeros(n_hidden)) # [h]

combined = torch.cat((input, hidden)) # [h+2]

i2h = nn.Linear(input_size + n_hidden, n_hidden)
i2o = nn.Linear(input_size + n_hidden, output_size)

hidden_new = i2h(combined)
rnn(input, hidden)

(Variable containing:
  0.2859
 [torch.FloatTensor of size 1], Variable containing:
 -0.1871
 -0.1375
  0.0956
  0.1869
 -0.8787
 [torch.FloatTensor of size 5])

In [None]:
# Run one iteration of the rnn
criterion = nn.MSELoss()
learning_rate = 0.005 # If you set this too high, it might explode. If too low, it might not learn
optimizer = optim.SGD(rnn.parameters(), lr=learning_rate)
rnn.train()
for epoch in range(10):
    for i_batch, (data, target) in enumerate(test_loader):
        
        data = Variable(data) # [1 x 2 x t]
        target = Variable(target) # [1]

        # Get rid of zeroth dimension, since the minibatch is of size 1
        data = data[0,:,:] # [2 x t]
        
        hidden = rnn.initHidden() # [h]
        hidden = hidden # [h]
        rnn.zero_grad()

        for i in range(data.size()[1]):
            input = data[:,i] # [2]
            output, hidden = rnn(input, hidden)

        loss = nn.MSELoss()(output, target)
        if i_batch == 0:
            print('loss: ', float(loss))
        loss.backward()

        optimizer.step()


In [25]:
# Run one iteration of the lstm
criterion = nn.MSELoss()
learning_rate = 0.005 # If you set this too high, it might explode. If too low, it might not learn
optimizer = optim.SGD(isgd_lstm.parameters(), lr=learning_rate)
isgd_lstm.train()
for epoch in range(10):
    for i_batch, (data, target) in enumerate(test_loader):
        data = Variable(data) # [1 x 2 x t]
        target = Variable(target) # [1]
        
        
        hidden = isgd_lstm.initHidden() # [h]
        lstm.zero_grad()

        for i in range(data.size()[1]):
            input = data[:,:,i].unsqueeze(0) # [1 x 1 x 2]
            output, hidden = isgd_lstm(input, hidden)

        loss = nn.MSELoss()(output, target)
        if i_batch == 0:
            print('loss: ', float(loss))
        loss.backward()

        optimizer.step()


loss:  0.5140122771263123
loss:  0.49200868606567383
loss:  0.22727467119693756
loss:  1.3614734411239624
loss:  0.5532548427581787
loss:  0.15613891184329987
loss:  0.2769795358181
loss:  2.326101303100586
loss:  0.011181870475411415
loss:  0.5309451818466187


In [None]:
# Evaluate the rnn
rnn.eval()
data, target = train_data[0]
data = Variable(data) # [2 x t]
target = Variable(target) # [1]
hidden = rnn.initHidden()

for i in range(data.size()[1]):
    input = data[:,i]
    output, hidden = rnn(input, hidden)
    
print('data: ', data)
print('target: ', float(target))
print('output: ', float(output))