<a href="https://colab.research.google.com/github/datajcthemax/playdata/blob/main/day30_Attention_Transfromer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 30번째 이야기
- Attention
- Transformer

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import random

In [None]:
# Calculator dataset: random addition problems and their answers
def generate_data(num_samples, max_num):
  """Build a toy addition dataset.

  Args:
    num_samples: number of problems to generate.
    max_num: inclusive upper bound for each operand (lower bound is 1).

  Returns:
    A pair ``(inputs, targets)`` of equally long lists, where
    ``inputs[i]`` is a string such as ``"12+7"`` and ``targets[i]`` is
    the decimal string of the sum (``"19"``).
  """
  # Draw both operands of a problem together so the left operand is
  # always sampled before the right one.
  operand_pairs = [
    (random.randint(1, max_num), random.randint(1, max_num))
    for _ in range(num_samples)
  ]
  inputs = [f"{left}+{right}" for left, right in operand_pairs]
  targets = [str(left + right) for left, right in operand_pairs]
  return inputs, targets

In [None]:
# Prepare one (expression, answer) pair as tensors for the model
def prepare_data(x, y, max_seq_len):
  """Encode one addition problem and its answer as padded index tensors.

  Token ids: digits ``'0'``-``'9'`` map to 0-9, SOS is 10, EOS is 11,
  ``'+'`` is 12 and PAD is 13.

  Args:
    x: source expression, e.g. ``"12+7"``.
    y: target answer as a digit string, e.g. ``"19"``.
    max_seq_len: length every encoded sequence is padded to.

  Returns:
    ``(src, tgt, tgt_y, src_mask, tgt_mask)`` where ``src`` and ``tgt``
    are ``torch.long`` tensors of shape ``(1, max_seq_len)``; ``tgt`` is
    wrapped in SOS/EOS before padding; ``tgt_y`` is ``tgt`` without its
    first column (shape ``(1, max_seq_len - 1)``); both masks are
    ``None``.

  Raises:
    KeyError: if ``x`` or ``y`` contains a character other than a digit
      or ``'+'``.
    ValueError: if an encoded sequence (including SOS/EOS for the
      target) is longer than ``max_seq_len``.
  """
  sos_token = 10
  eos_token = 11
  pad_token = 13

  # Built once per call instead of once per helper invocation.
  char_to_idx = {str(digit): digit for digit in range(10)}
  char_to_idx['+'] = 12

  def str_to_tensor(s, is_target=False):
    ids = [char_to_idx[char] for char in s]
    if is_target:
      ids = [sos_token] + ids + [eos_token]
    # A negative pad count would multiply to an empty list and silently
    # return a tensor longer than max_seq_len, so reject it explicitly.
    if len(ids) > max_seq_len:
      raise ValueError(
        f"encoded sequence {s!r} has {len(ids)} tokens, "
        f"which exceeds max_seq_len={max_seq_len}"
      )
    ids = ids + [pad_token] * (max_seq_len - len(ids))
    return torch.tensor(ids, dtype=torch.long).unsqueeze(0)

  src = str_to_tensor(x)
  tgt = str_to_tensor(y, is_target=True)
  tgt_y = tgt[:, 1:]  # next-token labels: tgt shifted left by one
  src_mask = None
  tgt_mask = None
  return src, tgt, tgt_y, src_mask, tgt_mask

In [None]:
# 모델 생성
# 속도!

class PositionalEncoding(nn.Module):
  """Add fixed sinusoidal position information to a batch of embeddings.

  Even feature columns hold ``sin(position * freq)`` and odd columns hold
  ``cos(position * freq)``. The table is stored as a buffer named
  ``pos_enc`` with shape ``(1, max_seq_len, d_model)``, so it follows the
  module across devices but is not a trainable parameter.

  Args:
    d_model: size of the feature dimension (even or odd).
    max_seq_len: number of positions to precompute.
  """

  def __init__(self, d_model, max_seq_len):
    super().__init__()
    position = torch.arange(0, max_seq_len).unsqueeze(1)
    div_term = torch.exp(torch.arange(0, d_model, 2)*-(torch.log(torch.tensor(10000.0))/d_model))
    angles = position * div_term  # (max_seq_len, ceil(d_model / 2))
    pos_enc = torch.zeros(max_seq_len, d_model)
    pos_enc[:, 0::2] = torch.sin(angles)
    # For odd d_model there is one fewer odd column than even column, so
    # trim the angles; for even d_model the slice is a no-op.
    pos_enc[:, 1::2] = torch.cos(angles[:, :d_model // 2])
    pos_enc = pos_enc.unsqueeze(0)
    self.register_buffer('pos_enc', pos_enc)

  def forward(self, x):
    """Return ``x`` plus the encodings of its first ``x.size(1)`` positions.

    The buffer is sliced along dimension 1, so ``x`` is expected to be
    laid out as ``(batch, seq_len, d_model)`` with
    ``seq_len <= max_seq_len``.
    """
    return x + self.pos_enc[:, :x.size(1)]

In [None]:
class Transformer(nn.Module):
  """Encoder-decoder Transformer over token-index sequences.

  Source and target indices share one embedding table, receive
  positional encodings, pass through ``nn.Transformer`` and are projected
  to ``output_vocab_size`` logits per target position.

  Args:
    input_vocab_size: number of rows in the shared embedding table.
    output_vocab_size: number of logits produced per target position.
    d_model: embedding / model width.
    nhead: number of attention heads.
    num_layers: depth used for BOTH the encoder and the decoder stack.
    max_seq_len: longest sequence the positional encoding supports.
  """

  def __init__(self, input_vocab_size, output_vocab_size, d_model, nhead, num_layers, max_seq_len):
    super().__init__()
    # NOTE(review): target tokens are looked up in this same table, so
    # every target id must be < input_vocab_size.
    self.embedding = nn.Embedding(input_vocab_size, d_model)
    self.pos_enc = PositionalEncoding(d_model, max_seq_len)
    # Keyword arguments on purpose: the third positional argument of
    # nn.Transformer is only the encoder depth, which left the decoder at
    # its default depth. batch_first=True matches the (batch, seq, feature)
    # layout that the positional encoding slices along dimension 1.
    self.transformer = nn.Transformer(
      d_model=d_model,
      nhead=nhead,
      num_encoder_layers=num_layers,
      num_decoder_layers=num_layers,
      batch_first=True,
    )
    self.fc = nn.Linear(d_model, output_vocab_size)

  def forward(self, src, tgt, src_mask=None, tgt_mask=None):
    """Compute per-position logits for ``tgt`` given ``src``.

    Args:
      src: integer tensor of source token ids, ``(batch, src_len)``.
      tgt: integer tensor of target token ids, ``(batch, tgt_len)``.
      src_mask: optional attention mask forwarded to the encoder.
      tgt_mask: optional decoder self-attention mask. When omitted, a
        causal mask is built so position ``i`` attends only to
        positions ``<= i``.

    Returns:
      Float tensor of shape ``(batch, tgt_len, output_vocab_size)``.
    """
    if tgt_mask is None:
      steps = tgt.size(1)
      # -inf strictly above the diagonal blocks attention to future tokens.
      tgt_mask = torch.triu(
        torch.full((steps, steps), float('-inf'), device=tgt.device),
        diagonal=1,
      )
    src = self.pos_enc(self.embedding(src))
    tgt = self.pos_enc(self.embedding(tgt))
    out = self.transformer(src, tgt, src_mask=src_mask, tgt_mask=tgt_mask)
    return self.fc(out)

In [None]:
def train(model, criterion, optimizer, src, tgt, tgt_y, src_mask, tgt_mask):
  """Run one optimisation step and return the scalar loss.

  The model is called as ``model(src, tgt)``. Its prediction at step
  ``i`` is scored against ``tgt_y[:, i]`` (the token that follows
  ``tgt[:, i]``), so the prediction at the final step, which has no
  following token, is dropped.

  Args:
    model: callable module returning logits; assumed to be laid out as
      ``(batch, tgt_len, vocab)`` - TODO confirm for other models.
    criterion: loss taking ``(N, vocab)`` logits and ``(N,)`` labels.
    optimizer: optimizer over ``model``'s parameters.
    src: source token ids.
    tgt: decoder input token ids, ``(batch, tgt_len)``.
    tgt_y: next-token labels, ``(batch, tgt_len - 1)``.
    src_mask: accepted for interface compatibility; not used here.
    tgt_mask: accepted for interface compatibility; not used here.

  Returns:
    The loss of this step as a Python float.
  """
  model.train()
  optimizer.zero_grad()
  output = model(src, tgt)
  # Score against the shifted labels rather than the decoder input itself;
  # comparing with tgt would only teach the model to copy its input.
  logits = output[:, :-1]
  # reshape (not view): both slices may be non-contiguous for batch > 1.
  loss = criterion(logits.reshape(-1, logits.size(-1)), tgt_y.reshape(-1))
  loss.backward()
  optimizer.step()
  return loss.item()

In [None]:
# Hyperparameters
num_samples = 10000
max_num = 99  # operands are drawn from 1..max_num (e.g. max_num=1000 => 987+999)
max_seq_len = 10
input_vocab_size = output_vocab_size = 14  # ids 0-13: digits, SOS, EOS, '+', PAD
d_model = 64
nhead = 4
num_layers = 2
num_epochs = 10
lr = 0.001

# Use the GPU when one is available, otherwise fall back to the CPU so
# the cell still runs on machines without CUDA.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Build the dataset
data_x, data_y = generate_data(num_samples, max_num)

model = Transformer(input_vocab_size, output_vocab_size, d_model, nhead, num_layers, max_seq_len)
model.to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=lr)

for epoch in range(num_epochs):
  epoch_loss = 0.
  # One sample per optimisation step (batch size 1).
  for expression, answer in zip(data_x, data_y):
    src, tgt, tgt_y, src_mask, tgt_mask = prepare_data(expression, answer, max_seq_len)
    loss = train(model, criterion, optimizer, src.to(device), tgt.to(device), tgt_y.to(device), src_mask, tgt_mask)
    epoch_loss += loss
  print(f"Epoch {epoch+1}, Loss : {epoch_loss / num_samples}")

Epoch 1, Loss : 0.6231853401087224
Epoch 2, Loss : 0.642166246059537
Epoch 3, Loss : 0.6395856628447771
Epoch 4, Loss : 0.6363807393372058
Epoch 5, Loss : 0.6345647348046303


KeyboardInterrupt: ignored

# 예측 어떻게 되는지 보여주는 코드 작성
# GPU 활용도를 높이는 코드
# IMDB를 적용
# 다른 분야에 적용
# positional encoding 알고리즘을 변경 etc