<a href="https://colab.research.google.com/github/dltnqls9788/AI-YangJaeHub/blob/main/Seq2Seq(Multi30k).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Print the Python version of the current runtime.
!python -V

Python 3.7.13


In [None]:
# Run this cell when the runtime's versions do not match what the notebook needs.
# torchtext is pinned to 0.6.0; the import cell below takes Field and BucketIterator
# from torchtext.data. The two spaCy downloads are the models loaded by spacy.load() below.
!apt install python3.7
!pip install -U torchtext==0.6.0
!python -m spacy download en_core_web_sm
!python -m spacy download de_core_news_sm

Reading package lists... Done
Building dependency tree       
Reading state information... Done
python3.7 is already the newest version (3.7.13-1+bionic3).
The following package was automatically installed and is no longer required:
  libnvidia-common-460
Use 'apt autoremove' to remove it.
0 upgraded, 0 newly installed, 0 to remove and 49 not upgraded.
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
2022-07-26 11:51:54.812025: E tensorflow/stream_executor/cuda/cuda_driver.cc:271] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting en-core-web-sm==3.4.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.4.0/en_core_web_sm-3.4.0-py3-none-any.whl (12.8 MB)
[K     |████████████████████████████████| 12.8 MB 5.1 MB/s 
[38;5;2m✔ Download and installation successful

In [None]:
import torch 
import torch.nn as nn 
import torch.optim as optim 

from torchtext.datasets import Multi30k
from torchtext.data import Field, BucketIterator 

import spacy 
import numpy as np

import random 
import math
import time 

# Seed every random number generator the notebook touches so that runs are repeatable.
SEED = 1234

random.seed(SEED)  # Python's `random` (used for the teacher-forcing coin flip in Seq2Seq.forward)
np.random.seed(SEED)  # NumPy's global generator
torch.manual_seed(SEED)  # PyTorch generator
torch.cuda.manual_seed(SEED)  # PyTorch generator for the current CUDA device
torch.backends.cudnn.deterministic = True  # ask cuDNN to use deterministic algorithms

In [None]:
# spaCy pipelines for German and English; only their tokenizers are used
# (see tokenize_de / tokenize_en).
spacy_de = spacy.load('de_core_news_sm')
spacy_en = spacy.load('en_core_web_sm')

In [None]:
def tokenize_de(text):
    """Tokenize German text with spaCy and return the token strings in reversed order."""
    tokens = [token.text for token in spacy_de.tokenizer(text)]
    # The source sentence is deliberately fed to the encoder back to front.
    tokens.reverse()
    return tokens

In [None]:
def tokenize_en(text):
    """Tokenize English text with spaCy and return the token strings in their original order."""
    token_strings = []
    for token in spacy_en.tokenizer(text):
        token_strings.append(token.text)
    return token_strings

In [None]:
# Source-side field: tokenized (and reversed) by tokenize_de, lower-cased,
# with '<sos>' / '<eos>' markers around every sentence.
SRC = Field(tokenize= tokenize_de,
            init_token = '<sos>',
            eos_token = '<eos>',
            lower = True)

# Target-side field: same settings, but tokenized by tokenize_en (no reversal).
TRG = Field(tokenize= tokenize_en,
            init_token = '<sos>',
            eos_token = '<eos>',
            lower = True)

In [None]:
# Multi30k: roughly 30k parallel English / German / French sentences, about 12 words each.
# `exts` selects the language pair as (source, target): '.de' is German, '.en' is English.
# `fields` gives the Field that processes each side, in the same order.
train_data, valid_data, test_data = Multi30k.splits(exts = ('.de', '.en'), fields = (SRC, TRG))

In [None]:
# Number of sentence pairs in each split.
print(f'train_data : {len(train_data)}')
print(f'valid_data : {len(valid_data)}')
print(f'test_data : {len(test_data)}')

train_data : 29000
valid_data : 1014
test_data : 1000


In [None]:
# Source-side token counts of the first two training examples.
print(len(vars(train_data.examples[0])['src']))
print(len(vars(train_data.examples[1])['src']))

# The examples themselves; the 'src' tokens come out back to front because
# tokenize_de reverses them, while the 'trg' tokens keep their order.
print(vars(train_data.examples[0]))
print(vars(train_data.examples[1]))

13
8
{'src': ['.', 'büsche', 'vieler', 'nähe', 'der', 'in', 'freien', 'im', 'sind', 'männer', 'weiße', 'junge', 'zwei'], 'trg': ['two', 'young', ',', 'white', 'males', 'are', 'outside', 'near', 'many', 'bushes', '.']}
{'src': ['.', 'antriebsradsystem', 'ein', 'bedienen', 'schutzhelmen', 'mit', 'männer', 'mehrere'], 'trg': ['several', 'men', 'in', 'hard', 'hats', 'are', 'operating', 'a', 'giant', 'pulley', 'system', '.']}


In [None]:
# Build the vocabularies, using the training split only.
SRC.build_vocab(train_data, min_freq=2) # a token must appear at least twice
TRG.build_vocab(train_data, min_freq=2)

In [None]:
# Vocabulary sizes; these become INPUT_DIM and OUTPUT_DIM of the model below.
print(f'Unique tokens in SRC vocab: {len(SRC.vocab)}')
print(f'Unique tokens in TRG vocab: {len(TRG.vocab)}')

Unique tokens in SRC vocab: 7853
Unique tokens in TRG vocab: 5893


In [None]:
# The 20 most frequent target-side tokens with their counts.
print(f'가장 자주 나오는 단어들 20개 in TRG : \n{TRG.vocab.freqs.most_common(20)}\n')

가장 자주 나오는 단어들 20개 in TRG : 
[('a', 49165), ('.', 27623), ('in', 14886), ('the', 10955), ('on', 8035), ('man', 7781), ('is', 7525), ('and', 7379), ('of', 6871), ('with', 6179), ('woman', 3973), (',', 3963), ('two', 3886), ('are', 3717), ('to', 3128), ('people', 3122), ('at', 2927), ('an', 2861), ('wearing', 2623), ('shirt', 2324)]



In [None]:
# Iterators: first record which PyTorch version this runtime provides.
print(torch.__version__)

1.12.0+cu113


In [None]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
# Number of sentence pairs per batch.
BATCH_SIZE = 128

# One iterator per split; every batch tensor is created on `device`.
# NOTE(review): per the torchtext docs BucketIterator groups examples of similar
# length to reduce padding — confirm for the pinned 0.6.0 release.
train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train_data, valid_data, test_data),
    batch_size = BATCH_SIZE,
    device = device
)

In [None]:
# Index of the <pad> token in the target vocabulary (1 in the recorded output).
print(TRG.vocab.stoi[TRG.pad_token])

# Look at the first training batch only, then stop.
for i, batch in enumerate(train_iterator):
    src = batch.src
    trg = batch.trg

    # Swap the two axes so that each printed row is one sentence.
    src = src.transpose(1,0)
    print(f"첫 번째 배치의 text 크기: {src.shape}")
    print(src[0])
    print(src[1])

    break

# Batches per epoch, and how many examples that many full batches would hold.
# The batch size comes from BATCH_SIZE rather than a repeated literal, so the
# figure stays correct when the configuration cell is changed.
print(len(train_iterator))
print(len(train_iterator) * BATCH_SIZE)

1
첫 번째 배치의 text 크기: torch.Size([128, 23])
tensor([  2,   4, 268,  24,  12,  31, 256, 331,  14,   7,  16,   8,   3,   1,
          1,   1,   1,   1,   1,   1,   1,   1,   1])
tensor([   2,    4,   62,  501,   14,   22,   87,   77,   11,  308,    5,   35,
           9, 1551,   54,    3,    1,    1,    1,    1,    1,    1,    1])
227
29056


## 모델 build 

### Encoder : 2개의 LSTM layer로 구성 (논문에서는 4개 사용)
#### h1_t = Encoder(e(x_t), h1_(t-1))
#### h2_t = Encoder(h1_t, h2_(t-1))
#### LSTM의 cell state = c_t 라고 하면, 이것도 같이 입력으로 들어감
___

#### (h1_t, c1_t) = Encoder LSTM(e(x_t), (h1_(t-1), c1_(t-1) ))
#### (h2_t, c2_t) = Encoder LSTM(h1_t, (h2_(t-1), c2_(t-1) ))

In [None]:
class Encoder(nn.Module):
    """Stacked-LSTM encoder: embeds source token ids and returns the LSTM's final states.

    Args:
        input_dim: number of rows of the embedding table (the source vocabulary size).
        emb_dim: size of each token embedding.
        hid_dim: hidden size of the LSTM.
        n_layers: number of stacked LSTM layers.
        dropout: dropout probability, applied to the embeddings and handed to the LSTM.
    """

    def __init__(self, input_dim, emb_dim, hid_dim, n_layers, dropout):
        super().__init__()

        # Seq2Seq checks these two attributes against the decoder's.
        self.hid_dim = hid_dim
        self.n_layers = n_layers

        self.embedding = nn.Embedding(num_embeddings=input_dim, embedding_dim=emb_dim)
        self.rnn = nn.LSTM(input_size=emb_dim,
                           hidden_size=hid_dim,
                           num_layers=n_layers,
                           dropout=dropout)
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, src):
        """Encode a batch of source sentences.

        Args:
            src: token ids laid out as [src len, batch size] (the LSTM is built
                without batch_first).

        Returns:
            (hidden, cell): the LSTM's final hidden and cell states, each
            [n layers, batch size, hid dim].
        """
        # [src len, batch size] -> [src len, batch size, emb dim]
        token_vectors = self.embedding(src)
        lstm_input = self.dropout(token_vectors)

        # The per-step outputs of the top layer are not needed; only the final
        # states are passed on to the decoder.
        _, final_states = self.rnn(lstm_input)
        hidden, cell = final_states

        return hidden, cell

In [None]:
class Decoder(nn.Module):
    """Stacked-LSTM decoder that predicts one target token per call.

    Args:
        output_dim: target vocabulary size (embedding rows and width of the prediction).
        emb_dim: size of each token embedding.
        hid_dim: hidden size of the LSTM.
        n_layers: number of stacked LSTM layers.
        dropout: dropout probability, applied to the embeddings and handed to the LSTM.
    """

    def __init__(self, output_dim, emb_dim, hid_dim, n_layers, dropout):
        super().__init__()

        # Read by Seq2Seq: output_dim sizes its result tensor, the other two are
        # compared with the encoder's.
        self.output_dim = output_dim
        self.hid_dim = hid_dim
        self.n_layers = n_layers

        self.embedding = nn.Embedding(num_embeddings=output_dim, embedding_dim=emb_dim)
        self.rnn = nn.LSTM(input_size=emb_dim,
                           hidden_size=hid_dim,
                           num_layers=n_layers,
                           dropout=dropout)
        self.fc_out = nn.Linear(in_features=hid_dim, out_features=output_dim)
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, input, hidden, cell):
        """Run a single decoding step.

        Args:
            input: the current token id of every sentence, shape [batch size].
            hidden: previous hidden state, [n layers, batch size, hid dim].
            cell: previous cell state, [n layers, batch size, hid dim].

        Returns:
            (prediction, hidden, cell): prediction is [batch size, output dim];
            hidden and cell are the updated states with unchanged shapes.
        """
        # Only one token is decoded per call, so add a sequence axis of length 1:
        # [batch size] -> [1, batch size]
        step_tokens = input.unsqueeze(0)

        # [1, batch size] -> [1, batch size, emb dim], with dropout on the embeddings
        step_vectors = self.dropout(self.embedding(step_tokens))

        # output = [1, batch size, hid dim] (one step, one direction)
        output, (hidden, cell) = self.rnn(step_vectors, (hidden, cell))

        # Drop the length-1 sequence axis before projecting onto the vocabulary.
        prediction = self.fc_out(output.squeeze(0))

        return prediction, hidden, cell

## Seq 2 Seq


#### encoder에 src sentence를 넣는다 
#### encoder를 학습시켜서 고정된 크기의 context vector를 출력한다 
#### context vector를 decoder에 넣어서 예측된 trg sentence를 생성한다 

#### Teacher Forcing은 다음 입력으로 디코더의 예측을 사용하는 대신, 실제 목표 출력을 다음 입력으로 사용하는 컨셉
#### Ground truth 진짜 레이블을 디코더의 다음 입력으로 넣어줌으로써 더 정확한 예측을 가능하게 

In [None]:
class Seq2Seq(nn.Module):
    """Encoder-decoder wrapper that decodes the target one position at a time.

    Args:
        encoder: module returning (hidden, cell) for a source batch; must expose
            `hid_dim` and `n_layers`.
        decoder: module mapping (input, hidden, cell) to (prediction, hidden, cell);
            must expose `hid_dim`, `n_layers` and `output_dim`.
        device: device on which the result tensor is allocated.

    Raises:
        AssertionError: if encoder and decoder disagree on `hid_dim` or `n_layers`.
    """

    def __init__(self, encoder, decoder, device):
        super().__init__()

        self.encoder = encoder
        self.decoder = decoder
        self.device = device

        # The encoder's final states are used directly as the decoder's initial
        # states, so their shapes have to match.
        assert encoder.hid_dim == decoder.hid_dim, \
            "Hidden dimensions of encoder and decoder must be equal!"
        assert encoder.n_layers == decoder.n_layers, \
            "Encoder and decoder must have equal number of layers!"

    def forward(self, src, trg, teacher_forcing_ratio = 0.5):
        """Produce a prediction for every target position after the first.

        Args:
            src: source token ids, [src len, batch size].
            trg: target token ids, [trg len, batch size]; row 0 holds the first
                decoder input (the <sos> tokens).
            teacher_forcing_ratio: probability, drawn once per step for the whole
                batch, of feeding the ground-truth token as the next input
                instead of the decoder's own best guess.

        Returns:
            Tensor [trg len, batch size, output dim]. Row 0 is never written and
            stays all zeros, because decoding starts at position 1.
        """
        trg_len, batch_size = trg.shape[0], trg.shape[1]
        vocab_size = self.decoder.output_dim

        # Zero-filled buffer that collects the prediction of every step.
        outputs = torch.zeros(trg_len, batch_size, vocab_size, device=self.device)

        # The encoder's final states become the decoder's initial states.
        hidden, cell = self.encoder(src)

        # First decoder input: row 0 of trg, shape [batch size].
        next_input = trg[0, :]

        # Every pass decodes one position for all sentences of the batch at once,
        # so a sentence is fully decoded only after all trg_len - 1 passes.
        for t in range(1, trg_len):
            # step_scores = [batch size, output dim]
            step_scores, hidden, cell = self.decoder(next_input, hidden, cell)
            outputs[t] = step_scores

            # One coin flip per step decides teacher forcing for the whole batch.
            use_ground_truth = random.random() < teacher_forcing_ratio

            if use_ground_truth:
                next_input = trg[t]
            else:
                # Highest-scoring token of each sentence, shape [batch size].
                next_input = step_scores.argmax(1)

        return outputs

In [None]:
# Vocabulary sizes fix the size of the embedding tables and of the output layer.
INPUT_DIM = len(SRC.vocab)
OUTPUT_DIM = len(TRG.vocab)
# Embedding width of encoder and decoder.
ENC_EMB_DIM = 256
DEC_EMB_DIM = 256
# LSTM hidden size and depth; Seq2Seq asserts both are shared by encoder and decoder.
HID_DIM = 512
N_LAYERS = 2
# Dropout probabilities.
ENC_DROPOUT = 0.5
DEC_DROPOUT = 0.5

enc = Encoder(INPUT_DIM, ENC_EMB_DIM, HID_DIM, N_LAYERS, ENC_DROPOUT)
dec = Decoder(OUTPUT_DIM, DEC_EMB_DIM, HID_DIM, N_LAYERS, DEC_DROPOUT)

# Move the assembled model to `device`; the same device is handed to Seq2Seq,
# which allocates its output buffer there.
model = Seq2Seq(enc, dec, device).to(device)

In [None]:
def init_weights(m):
    """Refill every parameter of `m`, submodules included, with samples from U(-0.08, 0.08)."""
    for param in m.parameters():
        nn.init.uniform_(param.data, -0.08, 0.08)

# Module.apply runs init_weights on every submodule and on the model itself, then
# returns the model, which is why the cell displays the architecture.
model.apply(init_weights)

Seq2Seq(
  (encoder): Encoder(
    (embedding): Embedding(7853, 256)
    (rnn): LSTM(256, 512, num_layers=2, dropout=0.5)
    (dropout): Dropout(p=0.5, inplace=False)
  )
  (decoder): Decoder(
    (embedding): Embedding(5893, 256)
    (rnn): LSTM(256, 512, num_layers=2, dropout=0.5)
    (fc_out): Linear(in_features=512, out_features=5893, bias=True)
    (dropout): Dropout(p=0.5, inplace=False)
  )
)

In [None]:
def count_paramters(model):
    """Return the total number of elements in the parameters of `model` that require gradients."""
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            total += param.numel()
    return total

# Trainable parameter count of the model, printed with thousands separators.
print(f'Trainable Param num: {count_paramters(model):,}')

Trainable Param num: 13,898,501


## Training Model 

In [None]:
# Adam over all model parameters, with the optimizer's default settings.
optimizer = optim.Adam(model.parameters())

In [None]:
# Index of the padding token in the target vocabulary.
TRG_PAD_IDX = TRG.vocab.stoi[TRG.pad_token]

# Cross-entropy that ignores target positions holding the padding index.
criterion = nn.CrossEntropyLoss(ignore_index = TRG_PAD_IDX)

In [None]:
def train(model, iterator, optimizer, criterion, clip):
    """Train `model` for one pass over `iterator` and return the mean batch loss.

    Args:
        model: callable as model(src, trg), returning [trg len, batch size, output dim].
        iterator: sized iterable of batches exposing `.src` and `.trg`.
        optimizer: optimizer over the model's parameters.
        criterion: loss taking (flattened predictions, flattened targets).
        clip: maximum gradient norm applied before every optimizer step.

    Returns:
        Sum of the per-batch losses divided by len(iterator).
    """
    model.train()

    running_loss = 0

    for batch in iterator:
        src, trg = batch.src, batch.trg

        optimizer.zero_grad()

        # trg = [trg len, batch size]; output = [trg len, batch size, output dim]
        output = model(src, trg)
        vocab_size = output.shape[-1]

        # Position 0 is skipped on both sides (the model never predicts it), then
        # time and batch are merged:
        # flat_scores = [(trg len - 1) * batch size, output dim]
        # flat_targets = [(trg len - 1) * batch size]
        flat_scores = output[1:].view(-1, vocab_size)
        flat_targets = trg[1:].view(-1)

        loss = criterion(flat_scores, flat_targets)
        loss.backward()

        # Rescale the gradients so that their overall norm does not exceed `clip`.
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)

        optimizer.step()

        running_loss += loss.item()

    return running_loss / len(iterator)

### Evaluation

In [None]:
def evaluate(model, iterator, criterion):
    """Compute the mean batch loss of `model` over `iterator` without updating it.

    Args:
        model: callable as model(src, trg, teacher_forcing_ratio), returning
            [trg len, batch size, output dim].
        iterator: sized iterable of batches exposing `.src` and `.trg`.
        criterion: loss taking (flattened predictions, flattened targets).

    Returns:
        Sum of the per-batch losses divided by len(iterator).
    """
    model.eval()

    running_loss = 0

    # No gradients are needed for evaluation.
    with torch.no_grad():
        for batch in iterator:
            src, trg = batch.src, batch.trg

            # Teacher forcing is switched off: the model is fed its own predictions.
            # trg = [trg len, batch size]; output = [trg len, batch size, output dim]
            output = model(src, trg, 0)
            vocab_size = output.shape[-1]

            # Skip position 0 on both sides and merge time and batch:
            # flat_scores = [(trg len - 1) * batch size, output dim]
            # flat_targets = [(trg len - 1) * batch size]
            flat_scores = output[1:].view(-1, vocab_size)
            flat_targets = trg[1:].view(-1)

            running_loss += criterion(flat_scores, flat_targets).item()

    return running_loss / len(iterator)

In [None]:
def epoch_time(start_time, end_time):
    """Split the span between two timestamps (in seconds) into whole minutes and leftover whole seconds."""
    duration = end_time - start_time
    whole_minutes = int(duration / 60)
    leftover_seconds = int(duration - whole_minutes * 60)
    return whole_minutes, leftover_seconds

## learning through epochs 

In [None]:
# Number of passes over the training data, and the gradient-norm limit handed to train().
N_EPOCHS = 10
CLIP = 1

# Lowest validation loss seen so far; starts at infinity so the first epoch always saves.
best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
    
    start_time = time.time()
    
    train_loss = train(model, train_iterator, optimizer, criterion, CLIP)
    valid_loss = evaluate(model, valid_iterator, criterion)
    
    end_time = time.time()
    
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    # Checkpoint the weights only when the validation loss improves.
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'tut1-model.pt')
    
    # Perplexity is reported as exp(loss).
    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {math.exp(valid_loss):7.3f}')

Epoch: 01 | Time: 25m 36s
	Train Loss: 5.054 | Train PPL: 156.724
	 Val. Loss: 4.863 |  Val. PPL: 129.352
Epoch: 02 | Time: 25m 18s
	Train Loss: 4.463 | Train PPL:  86.711
	 Val. Loss: 4.758 |  Val. PPL: 116.516
Epoch: 03 | Time: 24m 59s
	Train Loss: 4.170 | Train PPL:  64.714
	 Val. Loss: 4.624 |  Val. PPL: 101.949
Epoch: 04 | Time: 25m 30s
	Train Loss: 3.968 | Train PPL:  52.899
	 Val. Loss: 4.503 |  Val. PPL:  90.262


In [None]:
# model.load_state_dict(torch.load('tut1-model.pt')) ### load the saved model (the checkpoint written by the training loop)

In [None]:
# evaluate() returns one number, the mean loss over the batches. Unpacking it into
# (loss, accuracy) raised a TypeError, and no accuracy is computed anywhere in this
# notebook, so the test loss is reported with its perplexity, like the epoch log.
test_loss = evaluate(model, test_iterator, criterion)
print(f'\tTest Loss: {test_loss:.3f} | Test PPL: {math.exp(test_loss):7.3f}')