In [2]:
import torch

In [3]:
ori_data = torch.randint(0, 10, (10005, 1))
print(max(ori_data), min(ori_data), len(ori_data), ori_data.shape)

tensor([9]) tensor([0]) 10005 torch.Size([10005, 1])


In [4]:
feature = torch.cat((ori_data[0: -5], ori_data[1: -4], ori_data[2: -3], ori_data[3: -2]), axis=-1)
label = ori_data[4: -1]
feature = torch.reshape(feature, shape=(-1, 4))
label = torch.reshape(label, shape=(-1,))
print(feature.shape, label.shape)

torch.Size([10000, 4]) torch.Size([10000])


In [5]:
feature = torch.nn.functional.one_hot(feature, 10)
label = torch.nn.functional.one_hot(label, 10)

x_train, x_test = feature[:7000], feature[7000:]
y_train, y_test = label[:7000], label[7000:]
print(x_train.shape, x_train.shape, x_test.shape, y_test.shape)

torch.Size([7000, 4, 10]) torch.Size([7000, 4, 10]) torch.Size([3000, 4, 10]) torch.Size([3000, 10])


In [6]:
def data_iter(batch_size, features, labels):
    num_examples = len(features)
    indices = list(range(num_examples))
    for i in range(0, num_examples, batch_size):
        batch_indices = torch.tensor(indices[i: i + batch_size])
        yield features[batch_indices], labels[batch_indices]

In [7]:
batch_size = 200
for x, y in data_iter(batch_size, x_train, y_train):
    print(x.shape, '\n', y.shape)
    break

torch.Size([200, 4, 10])
torch.Size([200, 10])

torch.Size([200, 4, 10]) 
 torch.Size([200, 10])


torch.Size([200, 10])

In [8]:
def normal(shape):
    return torch.randn(size=shape) * 0.01

In [9]:
def get_params(vocab_size, num_hiddens):
    num_inputs = num_outputs = vocab_size
    # input part params
    W_xh = normal((num_inputs, num_hiddens))
    W_hh = normal((num_hiddens, num_hiddens))
    b_h = torch.zeros(num_hiddens)
    # output part params
    W_hq = normal((num_hiddens, num_outputs))
    b_q = torch.zeros(num_outputs)
    # add gradient
    params = [W_xh, W_hh, b_h, W_hq, b_q]
    for param in params:
        param.requires_grad_(True)
    return params

In [10]:
batch_size = 200
for x, y in data_iter(batch_size, x_train, y_train):
    print(x.shape, '\n', y.shape)
    break

torch.Size([200, 4, 10])
torch.Size([200, 10])

torch.Size([200, 4, 10]) 
 torch.Size([200, 10])


torch.Size([200, 10])

In [11]:
def normal(shape):
    return torch.randn(size=shape) * 0.01

In [12]:
def get_params(vocab_size, num_hiddens):
    num_inputs = num_outputs = vocab_size
    # input part params
    W_xh = normal((num_inputs, num_hiddens))
    W_hh = normal((num_hiddens, num_hiddens))
    b_h = torch.zeros(num_hiddens)
    # output part params
    W_hq = normal((num_hiddens, num_outputs))
    b_q = torch.zeros(num_outputs)
    # add gradient
    params = [W_xh, W_hh, b_h, W_hq, b_q]
    for param in params:
        param.requires_grad_(True)
    return params

In [13]:
def init_rnn_state(batch_size, num_hiddens):
    return (torch.zeros(((batch_size, num_hiddens)),))

In [14]:
def rnn(inputs, state, params):
    W_xh, W_hh, b_h, W_hq, b_q = params
    H = state[0]
    outputs = []
    for X in inputs:
        # **This is the critical line to add:**
        # It forces X to be a 2D matrix of shape (BatchSize, Features),
        # guaranteeing torch.mm has valid inputs.
        X = X.reshape(-1, X.shape[-1])

        H = torch.tanh(X @ W_xh + H @ W_hh + b_h)
        Y = H @ W_hq + b_q
        outputs.append(Y)
    return outputs[-1], (H,)

In [15]:
def accuracy(y, y_hat):
    y_hat = y_hat.argmax(axis=1)
    y = y.argmax(axis=1)
    count = y_hat.type(y.dtype) == y
    return float(count.type(y.dtype).sum() / count.shape[0])

In [16]:
class RNNModel:
    def __init__(self, vocab_size, num_hiddens, get_params, init_state, forward_fn):
        self.vocab_size, self.num_hiddens = vocab_size, num_hiddens
        self.params = get_params(vocab_size, num_hiddens)
        self.init_state, self.forward_fn = init_state, forward_fn

    def __call__(self, X, state):
        X = X.type(torch.float32)
        return self.forward_fn(X, state, self.params)

    def begin_state(self, batch_size):
        return self.init_state(batch_size, self.num_hiddens)

In [17]:
RNN_net = RNNModel(vocab_size=10, num_hiddens=32, init_state=init_rnn_state,
                   forward_fn=rnn, get_params=get_params)

In [18]:
loss = torch.nn.MSELoss()
updater = torch.optim.Adam(RNN_net.params, lr=0.001)

In [19]:
def train_epoch(net, loss, updater, batch_size, x_train, y_train):
    L = []
    ACC = []
    for X, y in data_iter(batch_size, x_train, y_train):
        state = net.begin_state(batch_size)
        y_hat, state = net(torch.transpose(X, 0, 1), state)
        y = y.type(y_hat.dtype)
        l = loss(y, y_hat)
        L.append(l)
        ACC.append(accuracy(y, y_hat))
        updater.zero_grad()
        l.backward()
        updater.step()
    return float(sum(L) / len(L)), float(sum(ACC) / len(ACC))

In [20]:
RNN_torch_layer = torch.nn.RNN(10, 32)
RNN_torch_class = torch.nn.Sequential(
    torch.nn.Linear(32, 10),
    torch.nn.Softmax()
)

RNN_torch_params = list(RNN_torch_layer.parameters()) + list(RNN_torch_class.parameters())

RNN_torch_updater = torch.optim.Adam(RNN_torch_params, lr=0.001)

In [21]:
epoch = 100
for _ in range(epoch):
    L = []
    ACC = []
    for x, y in data_iter(batch_size, x_train, y_train):
        x = x.type(torch.float32)
        y_hat, state = RNN_torch_layer(torch.transpose(x, 0, 1))
        y_hat = RNN_torch_class(y_hat)
        y = y.type(y_hat.dtype)
        l = loss(y, y_hat[-1])
        L.append(l)
        ACC.append(accuracy(y, y_hat[-1]))
        RNN_torch_updater.zero_grad()
        l.backward()
        RNN_torch_updater.step()
    print('epoch:', _, 'loss:', float(sum(L) / len(L)), 'accuracy:', float(sum(ACC) / len(ACC)))

  return self._call_impl(*args, **kwargs)
Consider using tensor.detach() first. (Triggered internally at /pytorch/torch/csrc/autograd/generated/python_variable_methods.cpp:836.)
  print('epoch:', _, 'loss:', float(sum(L) / len(L)), 'accuracy:', float(sum(ACC) / len(ACC)))


epoch: 0 loss: 0.10459031164646149 accuracy: 0.101857143001897
epoch: 1 loss: 0.09071774035692215 accuracy: 0.10357142825211797
epoch: 2 loss: 0.09003285318613052 accuracy: 0.10428571413670268
epoch: 3 loss: 0.09002270549535751 accuracy: 0.09985714256763459
epoch: 4 loss: 0.0900166779756546 accuracy: 0.10185714342764446
epoch: 5 loss: 0.09001239389181137 accuracy: 0.10442857167550496
epoch: 6 loss: 0.09000804275274277 accuracy: 0.10442857167550496
epoch: 7 loss: 0.09000374376773834 accuracy: 0.10499999980841364
epoch: 8 loss: 0.0899994820356369 accuracy: 0.10514285638928414
epoch: 9 loss: 0.08999525755643845 accuracy: 0.10457142793706485
epoch: 10 loss: 0.08999105542898178 accuracy: 0.10614285724503654
epoch: 11 loss: 0.08998681604862213 accuracy: 0.10657142837132727
epoch: 12 loss: 0.08998256921768188 accuracy: 0.10671428591012955
epoch: 13 loss: 0.08997827768325806 accuracy: 0.10799999982118606
epoch: 14 loss: 0.08997391164302826 accuracy: 0.10971428560359138
epoch: 15 loss: 0.089969

In [22]:
test_loss = []
test_accuracy = []
for x, y in data_iter(batch_size, x_test, y_test):
    state = RNN_net.begin_state(batch_size)
    y_hat, state = RNN_net(torch.transpose(x, 0, 1), state)
    y = y.type(y_hat.dtype)
    l = loss(y, y_hat)
    test_loss.append(l)
    test_accuracy.append(accuracy(y, y_hat))
    updater.zero_grad()
    l.backward()
    updater.step()
print('test loss:', float(sum(test_loss) / len(test_loss)), 'accuracy:', float(sum(test_accuracy) / len(test_accuracy)))

test loss: 0.09791617840528488 accuracy: 0.09899999871850014


In [23]:
LSTM_torch_layer = torch.nn.LSTM(10, 32)
LSTM_torch_class = torch.nn.Sequential(
    torch.nn.Linear(32, 10),
    torch.nn.Softmax()
)

LSTM_torch_params = list(LSTM_torch_layer.parameters()) + list(LSTM_torch_class.parameters())

LSTM_torch_updater = torch.optim.Adam(LSTM_torch_params, lr=0.001)

In [24]:
epoch = 100
for _ in range(epoch):
    L = []
    ACC = []
    for x, y in data_iter(batch_size, x_train, y_train):
        x = x.type(torch.float32)
        y_hat, state = LSTM_torch_layer(torch.transpose(x, 0, 1))
        y_hat = LSTM_torch_class(y_hat)
        y = y.type(y_hat.dtype)
        l = loss(y, y_hat[-1])
        L.append(l)
        ACC.append(accuracy(y, y_hat[-1]))
        LSTM_torch_updater.zero_grad()
        l.backward()
        LSTM_torch_updater.step()
    print('epoch:', _, 'loss:', float(sum(L) / len(L)), 'accuracy:', float(sum(ACC) / len(ACC)))

epoch: 0 loss: 0.11041231453418732 accuracy: 0.09971428683825902
epoch: 1 loss: 0.09715361893177032 accuracy: 0.10499999948910305
epoch: 2 loss: 0.09011740237474442 accuracy: 0.09614285741533551
epoch: 3 loss: 0.09000716358423233 accuracy: 0.09771428661687033
epoch: 4 loss: 0.09000106900930405 accuracy: 0.09757142939737865
epoch: 5 loss: 0.08999975770711899 accuracy: 0.09942857154778072
epoch: 6 loss: 0.08999839425086975 accuracy: 0.10071428577814784
epoch: 7 loss: 0.08999689668416977 accuracy: 0.10242857124124255
epoch: 8 loss: 0.08999526500701904 accuracy: 0.10214285637651171
epoch: 9 loss: 0.08999354392290115 accuracy: 0.10271428674459457
epoch: 10 loss: 0.0899917259812355 accuracy: 0.10371428621666771
epoch: 11 loss: 0.08998984098434448 accuracy: 0.10414285681077412
epoch: 12 loss: 0.0899878740310669 accuracy: 0.10457142868212291
epoch: 13 loss: 0.08998587727546692 accuracy: 0.10385714343615941
epoch: 14 loss: 0.08998379111289978 accuracy: 0.10571428537368774
epoch: 15 loss: 0.0899