# GPU

In [231]:
import copy
import math
import time

import numpy as np

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable

import torchtext

In [243]:
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

In [244]:
device

device(type='cpu')

In [246]:
torch.cuda.is_available()

False

# Vocab

In [233]:
#### from torchtext.datasets import Multi30k
from torchtext.data import Field, BucketIterator, interleave_keys

def itos(field, batch):  # batch에서 원본 sentence 얻는 함수
    with torch.cuda.device_of(batch):
        #batch = batch.T.tolist()
        batch = batch.tolist()
    batch = [[field.vocab.itos[ind] for ind in ex] for ex in batch]  # denumericalize
    
    def trim(s, t):  # 현재 token ~ <EOS> token 사이의 문장 return
        sentence = []
        for w in s:
            if w == t:
                break
            sentence.append(w)
        return sentence

    batch = [trim(ex, field.eos_token) for ex in batch]  # batch를 문장으로 
    
    def filter_special(tok):
        return tok not in (field.init_token, field.pad_token)

    batch = [list(filter(filter_special, ex)) for ex in batch]
    return batch

In [122]:
SRC = Field(tokenize = "spacy",
            tokenizer_language = "de",
            init_token = '<sos>',
            eos_token = '<eos>',
            lower = True)

TRG = Field(tokenize = "spacy",
            tokenizer_language = "en",
            init_token = '<sos>',
            eos_token = '<eos>',
            lower = True)

In [123]:
from torchtext.datasets import Multi30k

train_data, valid_data, test_data = Multi30k.splits(exts = ('.de', '.en'), fields = (SRC, TRG))

In [124]:
SRC.build_vocab(train_data)
TRG.build_vocab(train_data)

# Model

<img src="./transformer.jpg" width="500">

In [183]:
class Transformer(nn.Module):
    
    def __init__(self, src_vocab, trg_vocab, src_embed, trg_embed, encoder, decoder, fc_layer):
        super(Transformer, self).__init__()
        self.src_vocab = src_vocab
        self.trg_vocab = trg_vocab
        self.src_embed = src_embed
        self.trg_embed = trg_embed
        self.encoder = encoder
        self.decoder = decoder
        self.fc_layer = fc_layer
        
    def forward(self, src, trg, src_mask, trg_mask):
        out = self.encode(src, src_mask)
        out = self.decode(trg, trg_mask, out, src_mask) # Decoder's src: Encoder's output
        out = self.fc_layer(out)
        #out = F.Logsoftmax(out, dim=-1)
        return out
    
    def encode(self, x, mask):
        out = self.encoder(self.src_embed(x), mask)
        return out
    
    def decode(self, x, mask, encoder_output, encoder_mask):
        out = self.decoder(self.trg_embed(x), mask, encoder_output, encoder_mask)
        return out

In [126]:
class Encoder(nn.Module):
    
    def __init__(self, sub_layer, n_layer):
        super(Encoder, self).__init__()
        self.layers = []
        for i in range(n_layer):
            self.layers.append(copy.deepcopy(sub_layer))
    
    def forward(self, x, mask):
        out = x
        for layer in self.layers: 
            out = layer(out, mask)
        return out

In [127]:
class Decoder(nn.Module):
    
    def __init__(self, sub_layer, n_layer):
        super(Decoder, self).__init__()
        self.layers = []
        for i in range(n_layer):
            self.layers.append(copy.deepcopy(sub_layer))
    
    def forward(self, x, mask, encoder_output, encoder_mask):
        out = x
        for layer in self.layers: 
            out = layer(x, mask, encoder_output, encoder_mask)
        return out

In [128]:
class EncoderLayer(nn.Module):
    
    def __init__(self, multi_head_attention_layer, fc_layer, residual_connection=True):
        super(EncoderLayer, self).__init__()
        if residual_connection:
            self.multi_head_attention_layer = ResidualConnectionLayer(multi_head_attention_layer)
            self.fc_layer = ResidualConnectionLayer(fc_layer)
        else:
            self.multi_head_attention_layer = multi_head_attention_layer
            self.fc_layer = fc_layer
    
    def forward(self, x, mask):
        out = self.multi_head_attention_layer(query=x, key=x, value=x, mask=mask)
        out = self.fc_layer(x=out)
        return out

In [129]:
class DecoderLayer(nn.Module):
    
    def __init__(self, multi_head_attention_layer, masked_multi_head_attention_layer, fc_layer, residual_connection=True):
        super(DecoderLayer, self).__init__()
        if residual_connection:
            self.masked_multi_head_attention_layer = ResidualConnectionLayer(multi_head_attention_layer)
            self.multi_head_attention_layer = ResidualConnectionLayer(masked_multi_head_attention_layer)
            self.fc_layer = ResidualConnectionLayer(fc_layer)
        else:
            self.masked_multi_head_attention_layer = multi_head_attention_layer
            self.multi_head_attention_layer = masked_multi_head_attention_layer
            self.fc_layer = fc_layer
    
    def forward(self, x, mask, encoder_output, encoder_mask):
        out = self.masked_multi_head_attention_layer(query=x, key=x, value=x, mask=mask)
        out = self.multi_head_attention_layer(query=x, key=encoder_output, value=encoder_output, mask=encoder_mask)
        out = self.fc_layer(x=out)
        return out

<img src="./self_attention.jpg" width="200"> <img src="./multi_head_attention.jpg" width="300">

<img src="./attention.png" width="1000">

In [130]:
class MultiHeadAttentionLayer(nn.Module):
    
    def __init__(self, d_embed, d_model, n_head, qkv_fc_layer, fc_layer):
        super(MultiHeadAttentionLayer, self).__init__()
        self.d_embed = d_embed
        self.d_model = d_model
        self.n_head = n_head
        self.query_fc_layer = copy.deepcopy(qkv_fc_layer)
        self.key_fc_layer = copy.deepcopy(qkv_fc_layer)
        self.value_fc_layer = copy.deepcopy(qkv_fc_layer)
        self.fc_layer = copy.deepcopy(fc_layer)
    
    def forward(self, query, key, value, mask=None):
        # query, key, value's shape: (n_batch, seq_len, d_embed)
        n_batch = query.shape[0]
        
        # reshape (n_batch, seq_len, d_embed) to (n_batch, n_head, seq_len, d_k)
        def transform(x, fc_layer):
            # x's shape: (n_batch, seq_len, d_embed)
            out = fc_layer(x) # d_embed -> d_model, out's shape: (n_batch, seq_len, d_model)
            out = out.view(n_batch, -1, self.n_head, self.d_model//self.n_head) # out's shape: (n_batch, seq_len, n_head, d_k) notice: d_k == d_model//n_head
            out = out.transpose(1, 2) # out's shape: (n_batch, n_head, seq_len, d_k)
            return out
        
        query = transform(query, self.query_fc_layer)      # query, key, value's shape: (n_batch, n_head, seq_len, d_k)
        key = transform(key, self.key_fc_layer)
        value = transform(value, self.value_fc_layer)
        
        if mask is not None:
            mask = mask.unsqueeze(1)
            
        out = self.calculate_attention(query, key, value, mask) # out's shape: (n_batch, n_head, seq_len, d_k)
        out = out.transpose(1, 2)  # out's shape: (n_batch, seq_len, n_head, d_k)
        out = out.contiguous().view(n_batch, -1, self.d_model)  # out's shape: (n_batch, seq_len, d_model)
        out = self.fc_layer(out)  # d-model -> d_embed, out's shape: (n_batch, seq_len, d_embed)
        return out
    
    def calculate_attention(self, query, key, value, mask): 
        d_k = key.size(-1) # query, key, value's shape: (n_batch, n_head, seq_len, d_k)
        score = torch.matmul(query, key.transpose(-2, -1)) # Q x K^T
        score = score / math.sqrt(d_k)  # scaling
        if mask is not None:
            score = score.masked_fill(mask==0, -1e9)  # masking (Decoder's Masked Multi-Attention Layer)
        out = F.softmax(score, dim = -1) # get softmax score
        out = torch.matmul(out, value) # score x V
        return out

In [131]:
class ResidualConnectionLayer(nn.Module):
    def __init__(self, sub_layer):
        super(ResidualConnectionLayer, self).__init__()
        self.sub_layer = sub_layer
    
    def forward(self, **kwargs):
        if 'x' in kwargs.keys():
            x = kwargs['x']
            out = x + self.sub_layer(x)
        elif 'query' in kwargs.keys():
            x = kwargs['query']
            out = x + self.sub_layer(**kwargs)
        return out

In [132]:
class PositionalEncoding(nn.Module):
    def __init__(self, d_embed, max_seq_len=5000):
        super(PositionalEncoding, self).__init__()
        encoding = torch.zeros(max_seq_len, d_embed)
        position = torch.arange(0, max_seq_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_embed, 2) * -(math.log(10000.0) / d_embed))
        encoding[:, 0::2] = torch.sin(position * div_term)
        encoding[:, 1::2] = torch.cos(position * div_term)
        encoding = encoding.unsqueeze(0)
        self.encoding = encoding
        
    def forward(self, x):
        out = x + Variable(self.encoding[:, :x.size(1)], requires_grad=False).to(device)
        return out

In [133]:
class Embedding(nn.Module):
    def __init__(self, d_embed, vocab):
        super(Embedding, self).__init__()
        self.embedding = nn.Embedding(len(vocab), d_embed)
        self.vocab = vocab
        self.d_embed = d_embed
    
    def forward(self, x):
        out = self.embedding(x) * math.sqrt(self.d_embed)
        return out

In [134]:
class TransformerEmbedding(nn.Module):
    def __init__(self, embedding, positional_encoding):
        super(TransformerEmbedding, self).__init__()
        self.embedding = nn.Sequential(embedding, positional_encoding)
    
    def forward(self, x):
        out = self.embedding(x)
        return out

In [135]:
def make_model(src_vocab, trg_vocab, d_embed=512, n_layer=6, d_model=512, n_head=8):
    multi_head_attention_layer = MultiHeadAttentionLayer(d_embed = d_embed,
                                                         d_model = d_model,
                                                         n_head = n_head,
                                                         qkv_fc_layer = nn.Linear(d_embed, d_model),
                                                         fc_layer = nn.Linear(d_model, d_embed))
    model = Transformer(src_vocab = src_vocab, 
                        trg_vocab = trg_vocab, 
                        src_embed = TransformerEmbedding(Embedding(d_embed, src_vocab), PositionalEncoding(d_embed)), 
                        trg_embed = TransformerEmbedding(Embedding(d_embed, trg_vocab), PositionalEncoding(d_embed)),
                        encoder = Encoder(sub_layer = EncoderLayer(multi_head_attention_layer = copy.deepcopy(multi_head_attention_layer).to(device),
                                                                   fc_layer = nn.Linear(d_embed, d_embed),
                                                                   residual_connection = True),
                                          n_layer = n_layer),
                        decoder = Decoder(sub_layer = DecoderLayer(multi_head_attention_layer = copy.deepcopy(multi_head_attention_layer).to(device),
                                                                  masked_multi_head_attention_layer = copy.deepcopy(multi_head_attention_layer).to(device),
                                                                  fc_layer = nn.Linear(d_embed, d_embed),
                                                                  residual_connection = True),
                                          n_layer = n_layer),
                        fc_layer = nn.Linear(d_model, len(trg_vocab)).to(device))
    return model

# Dataset, Batch

In [136]:
#### from torchtext.datasets import Multi30k
from torchtext.data import Field, BucketIterator, interleave_keys

def itos(field, batch):  # batch에서 원본 sentence 얻는 함수
    with torch.cuda.device_of(batch):
        #batch = batch.T.tolist()
        batch = batch.tolist()
    batch = [[field.vocab.itos[ind] for ind in ex] for ex in batch]  # denumericalize
    
    def trim(s, t):  # 현재 token ~ <EOS> token 사이의 문장 return
        sentence = []
        for w in s:
            if w == t:
                break
            sentence.append(w)
        return sentence

    batch = [trim(ex, field.eos_token) for ex in batch]  # batch를 문장으로 
    
    def filter_special(tok):
        return tok not in (field.init_token, field.pad_token)

    batch = [list(filter(filter_special, ex)) for ex in batch]
    return batch

In [238]:
BATCH_SIZE = 128

# get iterator (train, valid, test)
train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train_data, valid_data, test_data),
    batch_size = BATCH_SIZE,
    device = device)

In [204]:
def subsequent_mask(size):
    "Mask out subsequent positions."
    attn_shape = (1, size, size)
    subsequent_mask = np.triu(np.ones(attn_shape), k=1).astype('uint8')  # masking with upper triangle
    return torch.from_numpy(subsequent_mask) == 0 # reverse (masking=False, non-masking=True)

class Batch:
    
    "Object for holding a batch of data with masking during training." 
    def __init__(self, src, trg=None, pad=1):
        self.src = src.T
        self.src_mask = (self.src != pad).unsqueeze(-2)  # source mask, <pad>: False, other tokens: True
        if trg is not None:
            self.trg = trg.T[:, :-1]  # target sentence 0 ~ -1
            self.trg_y = trg.T[:, 1:]  # target sentence 1 ~ end
            self.trg_mask = self.make_std_mask(self.trg, pad) # target mask
            self.ntokens = (self.trg_y != pad).data.sum() # number of tokens
    
    def __len__(self):
        return len(self.src)
    
    @staticmethod
    def make_std_mask(tgt, pad):
        "Create a mask to hide padding and future words."
        tgt_mask = (tgt != pad).unsqueeze(-2) # <pad>: False, other tokens: True, reshape (batch_size, seq_len) -> (batch_size, 1, seq_len)
        tgt_mask = tgt_mask & Variable(subsequent_mask(tgt.size(-1)).type_as(tgt_mask.data)) # not <pad> && non-masking: True, others: False
        return tgt_mask

In [139]:
# 데이터 확인

pad_index = SRC.vocab[SRC.pad_token]

for i, batch_without_mask in enumerate(train_iterator):
    batch = Batch(batch_without_mask.src, batch_without_mask.trg, pad_index)
    print(batch.src.shape, batch.src_mask.shape, batch.trg.shape, batch.trg_mask.shape)
    
    print('src[0]: ', ' '.join(itos(SRC, batch.src)[0]))
    print()
    
    print('trg[0]: ', ' '.join(itos(TRG, batch.trg)[0]))
    print()
    
    print('trg[1]: ', ' '.join(itos(TRG, batch.trg)[1]))
    print()
    
    print('trg_y[0]: ', ' '.join(itos(TRG, batch.trg_y)[0]))
    print()
    break

torch.Size([128, 25]) torch.Size([128, 1, 25]) torch.Size([128, 25]) torch.Size([128, 25, 25])
src[0]:  bauarbeiter balancieren auf einem schmalen balken , hoch über dem boden .

trg[0]:  construction workers balance on a narrow beam far above the ground .

trg[1]:  one man is visible riding a black horse with a brown horse and bull in the background .

trg_y[0]:  construction workers balance on a narrow beam far above the ground .



# Tranining

In [205]:
model = make_model(SRC.vocab, TRG.vocab, d_embed=512, n_layer=6, d_model=512, n_head=8)
model.to(device)
print(model)

# get num of parameters
model_parameters = filter(lambda p: p.requires_grad, model.parameters())
n_parameter = sum([np.prod(p.size()) for p in model_parameters])
print(n_parameter)

Transformer(
  (src_embed): TransformerEmbedding(
    (embedding): Sequential(
      (0): Embedding(
        (embedding): Embedding(18668, 512)
      )
      (1): PositionalEncoding()
    )
  )
  (trg_embed): TransformerEmbedding(
    (embedding): Sequential(
      (0): Embedding(
        (embedding): Embedding(9799, 512)
      )
      (1): PositionalEncoding()
    )
  )
  (encoder): Encoder()
  (decoder): Decoder()
  (fc_layer): Linear(in_features=512, out_features=9799, bias=True)
)
19601991


In [221]:
def run_epoch(data_iter, model, loss_compute, optimizer):
    "Standard Training and Logging Function"
    start = time.time()
    total_tokens = 0
    total_loss = 0
    tokens = 0
    pad_index = SRC.vocab[SRC.pad_token]

    for i, batch_without_mask in enumerate(data_iter):
        # mask 적용
        batch = Batch(batch_without_mask.src, batch_without_mask.trg, pad_index)

        batch.src = batch.src.to(device)
        batch.trg = batch.trg.to(device)
        batch.src_mask = batch.src_mask.to(device)
        batch.trg_mask = batch.trg_mask.to(device)
        out = model.forward(batch.src, batch.trg, batch.src_mask, batch.trg_mask)
        out = out.transpose(1, -1) # reshape for CrossEntropyLoss (#batch, #seq, #vocab) -> (#batch, #vocab, #seq)
        #print(out.shape)
        #print(batch.trg_y.shape)
        #loss = loss_compute(out.contiguous().transpose(-2, -1), batch.trg_y.contiguous())
        loss = loss_compute(out, batch.trg_y)
        #print(loss)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += loss
        total_tokens += batch.ntokens
        tokens += batch.ntokens
        if i % 50 == 1:
            elapsed = time.time() - start
            print("Epoch Step: %d Loss: %f Tokens per Sec: %f" % (i, total_loss / total_tokens, tokens / elapsed))
            start = time.time()
            tokens = 0

    return total_loss / total_tokens

In [223]:
lr = 0.01
n_epoch = 5

#criterion = nn.NLLLoss()
criterion = nn.CrossEntropyLoss(reduction='sum')
optimizer = optim.Adam(model.parameters(), lr=lr, betas=(0.9, 0.98), eps=1e-9)

for epoch in range(n_epoch):
    model.train()
    run_epoch(train_iterator, model, criterion, optimizer)

Epoch Step: 1 Loss: 283.147400 Tokens per Sec: 1851.262207
Epoch Step: 51 Loss: 287.860504 Tokens per Sec: 1737.088257
Epoch Step: 101 Loss: 280.713104 Tokens per Sec: 1717.550659
Epoch Step: 151 Loss: 278.570465 Tokens per Sec: 1779.993408
Epoch Step: 201 Loss: 278.388885 Tokens per Sec: 1664.557495
Epoch Step: 1 Loss: 217.146210 Tokens per Sec: 1558.296143
Epoch Step: 51 Loss: 220.008453 Tokens per Sec: 1697.605225
Epoch Step: 101 Loss: 229.331635 Tokens per Sec: 1744.115723
Epoch Step: 151 Loss: 235.345016 Tokens per Sec: 1708.278198
Epoch Step: 201 Loss: 240.260284 Tokens per Sec: 1733.173828


KeyboardInterrupt: 

# Test Code

In [182]:
x = torch.Tensor([[0.8982, 0.805, 0.6393, 0.9983, 0.5731, 0.0469, 0.556, 0.1476, 0.8404, 0.1476, 0.8404, 0.5544]])
y = torch.LongTensor([1])

cross_entropy_loss = torch.nn.CrossEntropyLoss()
print(cross_entropy_loss(x, y))

log_softmax = nn.LogSoftmax(dim=1)
x_log = log_softmax(x)
def NLLLoss(logs, targets):
    out = torch.zeros_like(targets, dtype=torch.float)
    for i in range(len(targets)):
        out[i] = logs[i][targets[i]]
    return -out.sum()/len(out)
print(NLLLoss(x_log, y))

nll_loss = torch.nn.NLLLoss()
print(nll_loss(x_log, y))

tensor(2.3112)
tensor(2.3112)
tensor(2.3112)


In [33]:
lr = 0.01

criterion = nn.NLLLoss()
optimizer = optim.Adam(model.parameters(), lr=lr, betas=(0.9, 0.98), eps=1e-9)

@torch.no_grad()
def evaluate(data_iter):
    model.eval()
    start = time.time()
    total_tokens = 0
    total_loss = 0
    tokens = 0
    pad_index = SRC.vocab[SRC.pad_token]

    for i, batch_without_mask in enumerate(data_iter):
        # mask 적용
        batch = Batch(batch_without_mask.src, batch_without_mask.trg, pad_index)

        batch.src = batch.src.to(device)
        batch.trg = batch.trg.to(device)
        batch.src_mask = batch.src_mask.to(device)
        batch.trg_mask = batch.trg_mask.to(device)
        out = model.forward(batch.src, batch.trg, batch.src_mask, batch.trg_mask)
        loss = loss_compute(out.contiguous().transpose(-2, -1), batch.trg_y.contiguous())
        optimizer.zero_grad()
        total_loss += loss
        total_tokens += batch.ntokens
        tokens += batch.ntokens
        if i % 50 == 1:
            elapsed = time.time() - start
            print("Epoch Step: %d Loss: %f Tokens per Sec: %f" % (i, loss / batch.ntokens, tokens / elapsed))
            start = time.time()
            tokens = 0

    print("Final: ", total_loss / total_tokens)

In [147]:
m = nn.Softmax(dim=1)
loss = nn.NLLLoss()
input = torch.randn(3, 5, requires_grad=True)

target = torch.tensor([1, 0, 4])
softmax = m(input)
output = loss(softmax, target)
output.backward()

print(softmax)
print(target)
print(torch.argmax(softmax, dim=1))
print(output)

tensor([[0.1793, 0.2996, 0.1393, 0.1840, 0.1978],
        [0.1153, 0.0821, 0.4615, 0.3128, 0.0282],
        [0.0667, 0.1453, 0.4121, 0.3134, 0.0625]], grad_fn=<SoftmaxBackward>)
tensor([1, 0, 4])
tensor([1, 2, 2], grad_fn=<NotImplemented>)
tensor(-0.1591, grad_fn=<NllLossBackward>)


In [154]:
N, C = 5, 4
loss = nn.NLLLoss()
data = torch.randn(N, 16, 10, 10)
conv = nn.Conv2d(16, C, (3,3))
m = nn.Softmax(dim=1)
c = conv(data)
softmax = m(c)
target = torch.empty(N, 8, 8, dtype=torch.long).random_(0,C)
output = loss(softmax, target)
output.backward()

print(c.shape)
print(softmax.shape)
print(softmax)
print(target.shape)
print(output)

torch.Size([5, 4, 8, 8])
torch.Size([5, 4, 8, 8])
tensor([[[[0.1257, 0.3147, 0.1313,  ..., 0.0672, 0.2222, 0.3530],
          [0.3009, 0.3475, 0.1587,  ..., 0.4168, 0.2699, 0.1418],
          [0.1525, 0.1346, 0.1601,  ..., 0.4446, 0.2420, 0.2255],
          ...,
          [0.1057, 0.1527, 0.0767,  ..., 0.4231, 0.1867, 0.3147],
          [0.2915, 0.3935, 0.4222,  ..., 0.2901, 0.0910, 0.2017],
          [0.2523, 0.4173, 0.1394,  ..., 0.1240, 0.1669, 0.2435]],

         [[0.3368, 0.3739, 0.3243,  ..., 0.1857, 0.2341, 0.2075],
          [0.2632, 0.1699, 0.2498,  ..., 0.1559, 0.2550, 0.3730],
          [0.3154, 0.4212, 0.2729,  ..., 0.2056, 0.3012, 0.2725],
          ...,
          [0.4808, 0.4178, 0.1734,  ..., 0.2439, 0.3152, 0.2723],
          [0.1382, 0.2605, 0.2284,  ..., 0.1918, 0.3403, 0.3044],
          [0.2736, 0.1364, 0.4364,  ..., 0.2642, 0.2774, 0.1141]],

         [[0.3692, 0.1042, 0.1943,  ..., 0.1924, 0.1289, 0.1675],
          [0.2561, 0.2766, 0.2956,  ..., 0.2508, 0.2486, 0