<a href="https://colab.research.google.com/github/fannix/timeseries_generation/blob/master/attention.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import random
import torch

class PeriodicSeriesDataset(torch.utils.data.Dataset):
  def __init__(self, sequence):
    
    self.sequence = sequence
    self.start_symbol = "SOS"
    self.end_symbol = "EOS"
    self.x = []
    self.y = []
    self.id2word = {i+2: w for (i, w) in enumerate(sequence)}
    self.id2word[0] = self.start_symbol
    self.id2word[1] = self.end_symbol
    self.word2id = {w: i for (i, w) in self.id2word.items()}
    seq2id = np.array([self.word2id[w] for w in sequence])
    print(sequence)
    place_holder = np.zeros(len(sequence) + 2, dtype=np.int)
    place_holder[-1] = 1
    for i in range(len(sequence)):
      place_holder[1:-1] = np.roll(seq2id, i)
      self.x.append(place_holder.copy())
      place_holder[1:-1] = np.roll(seq2id, i+1)
      self.y.append(place_holder.copy())
      
  def __len__(self):
    return len(self.id2word) - 2

  def onehot_seq(self, word_seq):
    num_seq = [self.word2id[w] for w in word_seq]
    return self.onehot_num(num_seq)
  
  def onehot_num(self, num_seq):
    y = torch.LongTensor(num_seq).view(-1, 1)
    onehot = torch.FloatTensor(len(num_seq), len(self.word2id))
    onehot.zero_()
    onehot.scatter_(1, y, 1)
    return onehot
  
  def onecold_num(self, tensor):
    dim_n = tensor.shape[0]
    dim_c = tensor.shape[1]
    onecold = tensor.argmax(dim=1)
    return onecold

  def onecold_seq(self, tensor):
    onecold = self.onecold_num(tensor)
    print(onecold)
    return [self.id2word[i.item()] for i in onecold]

  def __getitem__(self, index):
    return  self.x[index], self.y[index]

import string
sequence = list(string.ascii_letters[:6])
pseries = PeriodicSeriesDataset(sequence)
for i in range(len(pseries)):
  print(pseries[i])

onehot = pseries.onehot_seq(['a', 'b', 'c'])
print(onehot)
onecold = pseries.onecold_seq(onehot)
print(onecold)

['a', 'b', 'c', 'd', 'e', 'f']
(array([0, 2, 3, 4, 5, 6, 7, 1]), array([0, 7, 2, 3, 4, 5, 6, 1]))
(array([0, 7, 2, 3, 4, 5, 6, 1]), array([0, 6, 7, 2, 3, 4, 5, 1]))
(array([0, 6, 7, 2, 3, 4, 5, 1]), array([0, 5, 6, 7, 2, 3, 4, 1]))
(array([0, 5, 6, 7, 2, 3, 4, 1]), array([0, 4, 5, 6, 7, 2, 3, 1]))
(array([0, 4, 5, 6, 7, 2, 3, 1]), array([0, 3, 4, 5, 6, 7, 2, 1]))
(array([0, 3, 4, 5, 6, 7, 2, 1]), array([0, 2, 3, 4, 5, 6, 7, 1]))
tensor([[0., 0., 1., 0., 0., 0., 0., 0.],
        [0., 0., 0., 1., 0., 0., 0., 0.],
        [0., 0., 0., 0., 1., 0., 0., 0.]])
tensor([2, 3, 4])
['a', 'b', 'c']


In [2]:
batch_size = 5
nb_digits = 10
y = torch.LongTensor(batch_size,1).random_() % nb_digits
y
y_onehot = torch.FloatTensor(batch_size, nb_digits)
y_onehot.zero_()
y_onehot.scatter_(1, y, 1)

tensor([[0., 0., 0., 0., 0., 0., 1., 0., 0., 0.],
        [0., 1., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 1., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 1.],
        [0., 0., 0., 0., 0., 1., 0., 0., 0., 0.]])

In [0]:
?y_onehot.scatter_

In [0]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [10]:
import torch
from torch import nn

from torch.utils.data import DataLoader


class DecoderRNN(nn.Module):
  def __init__(self, hidden_size, output_size):
    super().__init__()
    self.hidden_size = hidden_size

    self.embedding = nn.Embedding(output_size, hidden_size)
    self.lstm = nn.LSTM(hidden_size, hidden_size, batch_first=True)
    self.out = nn.Linear(hidden_size, output_size)

  def forward(self, input, hc):
    output = self.embedding(input)
    output = torch.relu(output)
    output, (hidden, cell) = self.lstm(output, hc)
    #print(output.shape)
    output = self.out(output)
    return output, (hidden, cell)

  def init_hidden(self, batch_size):
    return (torch.zeros(1, batch_size, self.hidden_size, device=device), 
            torch.zeros(1, batch_size, self.hidden_size, device=device))


class EncoderRNN(nn.Module):
  def __init__(self, input_size, hidden_size):
    super().__init__()
    self.hidden_size = hidden_size

    self.embedding = nn.Embedding(input_size, hidden_size)
    self.lstm = nn.LSTM(hidden_size, hidden_size, batch_first=True)

  def forward(self, input, hc):
    embedded = self.embedding(input)
    output = embedded
    output, (hidden, cell) = self.lstm(output, hc)
    return output, (hidden, cell)

  def init_hidden(self, batch_size):
    return (torch.zeros(1, batch_size, self.hidden_size, device=device),
            torch.zeros(1, batch_size, self.hidden_size, device=device))

class Seq2Seq(nn.Module):
  def __init__(self, input_size, hidden_size):
    super().__init__()
    self.encoder = EncoderRNN(input_size, hidden_size)
    self.decoder = DecoderRNN(hidden_size, input_size)
  
  def forward(self, input, expected = None):
    batch_size = input.shape[0]
    h0, c0 = self.encoder.init_hidden(batch_size)
    encode_output, (encode_hidden, encode_cell) = self.encoder(input, (h0, c0))

    inp = torch.zeros(batch_size, dtype=torch.long, device=device)
    inp = inp.view(batch_size, 1)
    h, c = encode_hidden, encode_cell

    #print(inp.shape)
    output_list = []
    while True:
      if expected == None:
        out, (h, c) = self.decoder(inp, (h, c))
      # teacher forcing
      else:
        pass
      output_list.append(out.squeeze(1))
      if len(output_list) == input.shape[1] - 1:
        break
    # print(output_list[0].shape)
    return torch.stack(output_list, 2)

loader = DataLoader(pseries, 4)

model = Seq2Seq(len(pseries.word2id), 20).to(device)

criterion = torch.nn.functional.cross_entropy
optimizer = torch.optim.RMSprop(model.parameters())
for epoch in range(10):
  sum_criterion = 0
  n_instance = 0
  for x, y in loader:
    x = x.to(device)
    y = y.to(device)
    n_instance += x.shape[0]
    optimizer.zero_grad()
    res = model(x.to(device))
    loss = criterion(res, y[:, 1:])
    loss.backward()
    sum_criterion += loss.item()
    
    optimizer.step()

  print(f'{epoch}: {sum_criterion/n_instance}')
  #print(res.shape)



0: 0.6856725215911865
1: 0.7008873224258423
2: 0.6543954809506735
3: 0.633760670820872
4: 0.6333734194437662
5: 0.617169717947642
6: 0.6080129941304525
7: 0.5717541376749674
8: 0.614373524983724
9: 0.5609776377677917


In [0]:
model(
    torch.LongTensor(
    [[0, 2, 3, 4, 5, 6, 7, 1],
     [0, 4, 5, 6, 7, 2, 3, 1]
     ])
).argmax(dim=1)

In [0]:
model(
    torch.LongTensor(
    [[0, 2, 3, 4, 5, 6, 1],
     [0, 4, 5, 6, 7, 2, 1]
     ])
).argmax(dim=1)

In [14]:
model(
    torch.LongTensor(
    [[0, 3, 5, 6, 2, 7, 4, 1],
     [0, 5, 6, 2, 3, 7, 4, 1]
     ]).to(device)
).argmax(dim=1)

tensor([[6, 6, 6, 6, 7, 7, 7],
        [5, 5, 5, 5, 5, 7, 7]], device='cuda:0')

In [0]:
criterion(res, y[:, 1:])

In [0]:
y[:, 1:].shape

In [0]:
res.shape

In [0]:
criterion(torch.rand((1, 8, 7)), y[1:, 1:])

In [0]:
len(pseries)

In [0]:
input = torch.randn(4, 8, requires_grad=True)
target = torch.randint(8, (4,), dtype=torch.int64)

torch.nn.functional.cross_entropy(input, target)

Attention LSTM Seq2Seq

In [26]:
from torch.utils import data
from random import choice, randrange
import numpy as np
class ReverseDataset(data.Dataset):
    """
    Inspired from https://talbaumel.github.io/blog/attention/
    """
    def __init__(self, min_length=5, max_length=20, type='train'):
        self.SOS = "<s>"  # id 0
        self.EOS = "</s>" # id 1
        self.characters = list("abcd")
        self.int2char = list(self.characters)
        self.char2int = {c: i+2 for i, c in enumerate(self.characters)}
        self.VOCAB_SIZE = len(self.characters)
        self.min_length = min_length
        self.max_length = max_length
        if type=='train':
            self.set = [self._sample() for _ in range(3000)]
        else:
            self.set = [self._sample() for _ in range(300)]

    def __len__(self):
        return len(self.set)

    def __getitem__(self, item):
        return self.set[item]

    def _sample(self):
        if self.min_length != self.max_length:
            random_length = randrange(self.min_length, self.max_length)# Pick a random length
        else:
            random_length = self.min_length
        random_char_list = [choice(self.characters[:-1]) for _ in range(random_length)]  # Pick random chars
        random_string = ''.join(random_char_list)
        a = np.array([self.char2int.get(x) for x in random_string] + [1])
        b = np.array([self.char2int.get(x) for x in random_string[::-1]] + [1]) # Return the random string and its reverse
        #x = np.zeros((random_length, self.VOCAB_SIZE))
        #x[np.arange(random_length), a-2] = 1
        return a, b

reverse_dataset = ReverseDataset(4, 4)
reverse_dataset[0]

(array([3, 3, 4, 2, 1]), array([2, 4, 3, 3, 1]))

In [0]:
import torch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device = torch.device("cpu")

In [0]:
import torch
from torch import nn
import math

from torch.utils.data import DataLoader

class AttenEncoderRNN(nn.Module):
  def __init__(self, input_size, hidden_size):
    super().__init__()
    self.hidden_size = hidden_size

    self.embedding = nn.Embedding(input_size, hidden_size)
    self.lstm = nn.LSTM(hidden_size, hidden_size, batch_first=True)

  def forward(self, input, hc):
    embedded = self.embedding(input)
    output = embedded
    output, (hidden, cell) = self.lstm(output, hc)
    return output, (hidden, cell)

  def init_hidden(self, batch_size):
    return (torch.zeros(1, batch_size, self.hidden_size, device=device),
            torch.zeros(1, batch_size, self.hidden_size, device=device))

class AttenDecoderRNN(nn.Module):
  def __init__(self, hidden_size, output_size):
    super().__init__()
    self.hidden_size = hidden_size

    self.embedding = nn.Embedding(output_size, hidden_size)
    self.lstm = nn.LSTM(hidden_size, hidden_size, batch_first=True)
    self.out = nn.Linear(hidden_size, output_size)
    self.combine = nn.Linear(2 * hidden_size, hidden_size)

  def forward(self, input, hc, encode_out):
    embed = self.embedding(input)
    attn = attention(embed, encode_out, encode_out)
    comb = self.combine(torch.cat([embed, attn[0]], -1))
    output = torch.relu(comb)
    output, (hidden, cell) = self.lstm(output, hc)
    #print(output.shape)
    output = self.out(output)
    return output, (hidden, cell)

  def init_hidden(self, batch_size):
    return (torch.zeros(1, batch_size, self.hidden_size, device=device), 
            torch.zeros(1, batch_size, self.hidden_size, device=device))


def attention(query, key, value, mask=None, dropout=None):
    """Compute 'Scaled Dot Product Attention
    query: N x 1 x D
    key: N x T x D
    value: N x T x D. key and value are the same. query, key and value are the same for self attention
    scores: N x 1 x T
    p_attn: N x 1 x T
    result: N x 1 x D
    
    """
    d_k = query.size(-1)
    scores = torch.matmul(query, key.transpose(-2, -1)) \
             / math.sqrt(d_k)
    if mask is not None:
        scores = scores.masked_fill(mask == 0, -1e9)
    p_attn = torch.nn.functional.softmax(scores, dim = -1)
    if dropout is not None:
        p_attn = dropout(p_attn)
    return torch.matmul(p_attn, value), p_attn

class AttenSeq2Seq(nn.Module):
  def __init__(self, input_size, hidden_size):
    super().__init__()
    self.encoder = AttenEncoderRNN(input_size, hidden_size)
    self.decoder = AttenDecoderRNN(hidden_size, input_size)
  
  def forward(self, input, expected = None):
    batch_size = input.shape[0]
    h0, c0 = self.encoder.init_hidden(batch_size)
    encode_output, (encode_hidden, encode_cell) = self.encoder(input, (h0, c0))
    # 0 is the start_symbol
    inp = torch.zeros(batch_size, dtype=torch.long, device=device)
    inp = inp.view(batch_size, 1)
    h, c = encode_hidden, encode_cell
    #print(inp.shape)
    output_list = []
    while True:
      if expected == None:
        out, (h, c) = self.decoder(inp, (h, c), encode_output)
      # teacher forcing
      else:
        pass
      output_list.append(out.squeeze(1))
      if len(output_list) == input.shape[1]:
        break
    # print(output_list[0].shape)
    return torch.stack(output_list, 2)



In [39]:
reverse_dataset = ReverseDataset(3, 10)

loader = DataLoader(reverse_dataset, 1)

model = AttenSeq2Seq(len(reverse_dataset.char2int) + 2, 128).to(device)

criterion = torch.nn.functional.cross_entropy
optimizer = torch.optim.Adam(model.parameters())
for epoch in range(10):
  sum_criterion = 0
  n_instance = 0
  for x, y in loader:
    n_instance += x.shape[0]
    optimizer.zero_grad()
    res = model(x.to(device))
    loss = criterion(res, y.to(device))
    loss.backward()
    sum_criterion += loss.item()
    
    optimizer.step()

  print(f'{epoch}: {sum_criterion/n_instance}')

0: 0.24724922055195686
1: 0.06262208303797343
2: 0.041106641849943726
3: 0.0277314139242244
4: 0.01948141407735587
5: 0.017707403165288346
6: 0.013074126324757457
7: 0.010106217944389755
8: 0.009920213843760071
9: 0.008330162984554893


In [42]:
print(y)

res = model(x.to(device))

print(res.argmax(dim=1))

print(torch.sum(res.argmax(dim=1) == y.to(device)) / float(res.shape[0] * res.shape[2]))

tensor([[4, 4, 3, 4, 2, 2, 3, 2, 1]])
tensor([[4, 4, 3, 4, 2, 2, 3, 2, 1]])
tensor(1.)


In [44]:
a = torch.LongTensor([[2, 3, 4, 2, 1], [4, 3, 2, 3, 1]]).to(device)
model(a).argmax(1)

tensor([[2, 4, 3, 2, 1],
        [3, 2, 3, 4, 1]])

In [57]:
reverse_dataset_val = ReverseDataset(3, 10, type="test")
val_loader = DataLoader(reverse_dataset_val, 1)
with torch.no_grad():
  ncorrect = 0
  nwrong = 0
  for x, y in val_loader:
    predict = model(x.to(device)).argmax(1)
    if torch.equal(y, predict):
      ncorrect += 1
    else:
      nwrong += 1
print(f'{ncorrect}, {nwrong}')


296, 4


Test PeriodicDataset

In [22]:
loader = DataLoader(pseries, 16)

model = AttenSeq2Seq(len(pseries.word2id), 32).to(device)

criterion = torch.nn.functional.cross_entropy
optimizer = torch.optim.RMSprop(model.parameters())
for epoch in range(50):
  sum_criterion = 0
  n_instance = 0
  for x, y in loader:
    n_instance += x.shape[0]
    optimizer.zero_grad()
    res = model(x.to(device))
    loss = criterion(res, y[:, 1:].to(device))
    loss.backward()
    sum_criterion += loss.item()
    
    optimizer.step()

  print(f'{epoch}: {sum_criterion/n_instance}')
  #print(res.shape)


0: 0.34284647305806476
1: 0.33279256025950116
2: 0.364102562268575
3: 0.35499246915181476
4: 0.3365637461344401
5: 0.3467588424682617
6: 0.32402437925338745
7: 0.31968875726064044
8: 0.32248131434122723
9: 0.3161206046740214
10: 0.3083093762397766
11: 0.29815038045247394
12: 0.29484466711680096
13: 0.3263248602549235
14: 0.2754681309064229
15: 0.24778000513712564
16: 0.2203899621963501
17: 0.26362089316050213
18: 0.2042824625968933
19: 0.17681060234705606
20: 0.18516016006469727
21: 0.2149167855580648
22: 0.14177588621775308
23: 0.11447614431381226
24: 0.09494992097218831
25: 0.07967820266882579
26: 0.07398969928423564
27: 0.11055134733517964
28: 0.07617683211962382
29: 0.08176023761431377
30: 0.13822277386983237
31: 0.09190871318181355
32: 0.1032823920249939
33: 0.08659808834393819
34: 0.045922686656316124
35: 0.039204212526480355
36: 0.0348358949025472
37: 0.03247294326623281
38: 0.038141533732414246
39: 0.04629839460055033
40: 0.039137559632460274
41: 0.02876174698273341
42: 0.02461

In [23]:
model(
    torch.LongTensor(
    [[0, 2, 3, 4, 5, 6, 7, 1],
     [0, 4, 5, 6, 7, 2, 3, 1]
     ]).to(device)
).argmax(dim=1)

tensor([[7, 2, 3, 4, 5, 6, 1],
        [3, 4, 5, 6, 7, 2, 1]], device='cuda:0')

In [24]:
model(
    torch.LongTensor(
    [[0, 2, 3, 4, 5, 1],
     [0, 3, 4, 5, 6, 1]
     ]).to(device)
).argmax(dim=1)

tensor([[5, 6, 7, 2, 3],
        [6, 7, 2, 3, 4]], device='cuda:0')

In [25]:
model(
    torch.LongTensor(
    [[0, 3, 5, 6, 2, 7, 4, 1],
     [0, 5, 6, 2, 3, 7, 4, 1]
     ]).to(device)
).argmax(dim=1)

tensor([[4, 5, 6, 7, 2, 3, 1],
        [4, 5, 6, 7, 2, 3, 1]], device='cuda:0')