<a href="https://colab.research.google.com/github/gagyeomkim/Deep-Learning-Paper-Review-and-Practice/blob/main/code_practice/Seq2Seq.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### **Sequence to Sequence Learning with Neural Networks (NIPS 2014) 실습**

- 본 코드는 **Attention을 이용하지 않은 Seq2Seq** 모델을 다룹니다.
- 코드 실행 전에 **[런타임] → [런타임 유형 변경]** → 유형을 **GPU**로 설정합니다.
- code by: https://github.com/graykode/nlp-tutorial?tab=readme-ov-file

In [3]:
import numpy as np
import torch
import torch.nn as nn

In [4]:
# S: Symbol that shows starting of decoding input
# E: Symbol that marks the end of the decoding output
# P: Symbol that pads a sequence when it is shorter than the number of time steps

def make_batch():
    """Build the training tensors from the module-level ``seq_data``.

    Reads the globals ``seq_data``, ``num_dic``, ``n_step`` and ``n_class``.
    Both words of every pair are right-padded with 'P' up to ``n_step``
    characters. The padding is applied to local copies, so ``seq_data``
    itself is left unmodified.

    Returns:
        A tuple ``(input_batch, output_batch, target_batch)``:
        - input_batch: float tensor [batch_size, n_step, n_class], the
          one-hot encoded source words (encoder input).
        - output_batch: float tensor [batch_size, n_step + 1, n_class], the
          one-hot encoded 'S' + target word (decoder input for teacher
          forcing).
        - target_batch: long tensor [batch_size, n_step + 1], the class
          indices of target word + 'E' (not one-hot).

    Raises:
        KeyError: if a word contains a character that is not in ``num_dic``.
    """
    one_hot = np.eye(n_class)  # row i is the one-hot vector of class i
    input_batch, output_batch, target_batch = [], [], []

    for seq in seq_data:
        # str.ljust leaves words that already have n_step characters as-is.
        source_word = seq[0].ljust(n_step, 'P')
        target_word = seq[1].ljust(n_step, 'P')

        enc_idx = [num_dic[ch] for ch in source_word]         # char -> index
        dec_idx = [num_dic[ch] for ch in 'S' + target_word]   # decoder input
        tgt_idx = [num_dic[ch] for ch in target_word + 'E']   # expected output

        input_batch.append(one_hot[enc_idx])
        output_batch.append(one_hot[dec_idx])
        target_batch.append(tgt_idx)  # kept as indices, not one-hot

    # Stack into a single ndarray first: building a tensor from a Python list
    # of ndarrays is very slow.
    return (torch.FloatTensor(np.array(input_batch)),
            torch.FloatTensor(np.array(output_batch)),
            torch.LongTensor(target_batch))

# Model
class Seq2Seq(nn.Module):
    """Encoder-decoder model made of two single-layer RNNs (no attention).

    The layer sizes are taken from the module-level ``n_class`` and
    ``n_hidden`` at construction time.
    """

    def __init__(self):
        super().__init__()
        # nn.RNN applies dropout only between stacked layers. With the default
        # single layer a non-zero dropout has no effect and only triggers a
        # UserWarning, so it is not passed here.
        self.enc_cell = nn.RNN(input_size=n_class, hidden_size=n_hidden)
        self.dec_cell = nn.RNN(input_size=n_class, hidden_size=n_hidden)
        self.fc = nn.Linear(n_hidden, n_class)

    def forward(self, enc_input, enc_hidden, dec_input):
        """Encode ``enc_input`` and decode ``dec_input`` from the final encoder state.

        Args:
            enc_input: [batch_size, max_len, n_class] one-hot encoder input.
            enc_hidden: [num_layers * num_directions, batch_size, n_hidden]
                initial hidden state of the encoder.
            dec_input: [batch_size, max_len + 1, n_class] one-hot decoder
                input (one step longer because of the 'S' symbol).

        Returns:
            Logits of shape [max_len + 1, batch_size, n_class]
            (time-major, i.e. the time step is the first dimension).
        """
        # nn.RNN expects time-major input unless batch_first=True is set.
        enc_input = enc_input.transpose(0, 1)   # [max_len, batch_size, n_class]
        dec_input = dec_input.transpose(0, 1)   # [max_len + 1, batch_size, n_class]

        # enc_states: [num_layers(=1) * num_directions(=1), batch_size, n_hidden]
        _, enc_states = self.enc_cell(enc_input, enc_hidden)
        # The final encoder state is the decoder's initial hidden state.
        # outputs: [max_len + 1, batch_size, num_directions(=1) * n_hidden]
        outputs, _ = self.dec_cell(dec_input, enc_states)

        # Project every decoder step onto the vocabulary.
        return self.fc(outputs)     # [max_len + 1, batch_size, n_class]


if __name__ == '__main__':
    n_step = 5  # maximum word length; shorter words are padded with 'P'
    n_hidden = 128

    char_arr = list('SEPabcdefghijklmnopqrstuvwxyz')
    num_dic = {n: i for i, n in enumerate(char_arr)}  # char -> index
    n_class = len(num_dic)
    seq_data = [['man', 'women'],
                ['black', 'white'],
                ['king', 'queen'],
                ['girl', 'boy'],
                ['up', 'down'],
                ['high', 'low']]
    batch_size = len(seq_data)

    # input_batch:  [batch_size, n_step, n_class]      one-hot encoder input
    # output_batch: [batch_size, n_step + 1, n_class]  one-hot decoder input (leading 'S')
    # target_batch: [batch_size, n_step + 1]           class indices (trailing 'E'), not one-hot
    # The batch is the same for every epoch, so it is built exactly once.
    input_batch, output_batch, target_batch = make_batch()

    model = Seq2Seq()
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

    # Initial encoder hidden state: [num_layers * num_directions, batch_size, n_hidden].
    # It is all zeros in every epoch, so it is created once outside the loop.
    hidden = torch.zeros(1, batch_size, n_hidden)

    for epoch in range(5000):
        optimizer.zero_grad()
        # output: [n_step + 1, batch_size, n_class]
        output = model(input_batch, hidden, output_batch)
        output = output.transpose(0, 1)  # [batch_size, n_step + 1, n_class]

        # Sum of the per-word losses; each term averages over the characters
        # of one word.
        loss = 0
        for word_logits, word_target in zip(output, target_batch):
            # word_logits: [n_step + 1, n_class], word_target: [n_step + 1]
            loss += criterion(word_logits, word_target)

        if (epoch + 1) % 1000 == 0:
            print(f'Epoch: {epoch+1:#04d} cost:{loss:.6f}')
        loss.backward()
        optimizer.step()

    # Test
    def make_testbach(input_word):
        """Build the one-hot encoder and decoder inputs for a single word.

        The decoder input is 'S' followed by n_step padding symbols, because
        no target word is available at inference time.

        Args:
            input_word: str, the word to translate. Words shorter than
                n_step are right-padded with 'P'.

        Returns:
            A tuple of two float tensors:
            - encoder input, [1, n_step, n_class] for words of at most
              n_step characters,
            - decoder input, [1, n_step + 1, n_class].

        Raises:
            KeyError: if input_word contains a character not in num_dic.
        """
        one_hot = np.eye(n_class)  # row i is the one-hot vector of class i

        padded_word = input_word.ljust(n_step, 'P')
        enc_idx = [num_dic[ch] for ch in padded_word]          # char -> index
        dec_idx = [num_dic[ch] for ch in 'S' + 'P' * n_step]

        # unsqueeze(0) adds the batch dimension (batch size 1).
        return (torch.FloatTensor(one_hot[enc_idx]).unsqueeze(0),
                torch.FloatTensor(one_hot[dec_idx]).unsqueeze(0))

    def translate(word):
        """Translate a single word with the trained model.

        Args:
            word: str, the source word.

        Returns:
            The predicted characters up to (not including) the first 'E',
            with padding symbols 'P' removed. If the model predicts no 'E'
            at all, every predicted character is kept.
        """
        input_batch, output_batch = make_testbach(word)

        # Initial encoder hidden state: [num_layers * num_directions, batch_size(=1), n_hidden]
        hidden = torch.zeros(1, 1, n_hidden)
        with torch.no_grad():  # inference only, no autograd graph needed
            # output: [n_step + 1, batch_size(=1), n_class]
            output = model(input_batch, hidden, output_batch)

        # The logits live in the class dimension (dim=2). Taking column 0
        # drops the batch dimension, and tolist() yields plain ints for indexing.
        predict = output.argmax(dim=2)[:, 0].tolist()
        decoded = [char_arr[idx] for idx in predict]

        # Cut at the end symbol when there is one, instead of raising
        # ValueError from list.index on an unexpected prediction.
        if 'E' in decoded:
            decoded = decoded[:decoded.index('E')]

        return ''.join(decoded).replace('P', '')

    # Demo: translate a few words, including ones not seen during training.
    print('test')
    demo_cases = (
        ('man -> ', 'man'),
        ('mans-> ', 'mans'),
        ('king ->', 'king'),
        ('black ->', 'black'),
        ('upp ->', 'upp'),
    )
    for label, source_word in demo_cases:
        print(label, translate(source_word))



Epoch: 1000 cost:0.003316
Epoch: 2000 cost:0.000902
Epoch: 3000 cost:0.000384
Epoch: 4000 cost:0.000192
Epoch: 5000 cost:0.000104
test
man ->  women
mans->  women
king -> queen
black -> white
upp -> down


### 참조

- `output, hidden = rnn(input, h_0)` (`rnn = nn.RNN(...)`)
    - https://docs.pytorch.org/docs/stable/generated/torch.nn.RNN.html
    - output: RNN의 각 시점(time step)마다 계산된 **최종 계층(last layer)의 은닉 상태(hidden state)**들을 모두 모아놓은 텐서
    - hidden: RNN이 모든 시퀀스를 처리한 후, **마지막 시점(last time step)에서의 모든 계층(all layers)의 은닉 상태**를 담고 있는 텐서

- `unsqueeze(dim)`: 지정한 위치(`dim`)에 크기가 1인 차원을 추가하는 함수 (예: `[5, 29]` → `unsqueeze(0)` → `[1, 5, 29]`)

### Code Test

In [5]:
# Sanity check: range(2) covers the indices 0 and 1.
range(2)

range(0, 2)

In [6]:
# Sanity check: a flat Python list of three numbers becomes a 1-D tensor of shape [3].
target = [1,2,4]
torch.FloatTensor(target).shape

torch.Size([3])

In [7]:
# Vocabulary: the three special symbols (S, E, P) followed by the lowercase alphabet.
char_arr = list('SEPabcdefghijklmnopqrstuvwxyz')
# Map every character to its position in char_arr (char -> index).
num_dic = dict(zip(char_arr, range(len(char_arr))))
num_dic

{'S': 0,
 'E': 1,
 'P': 2,
 'a': 3,
 'b': 4,
 'c': 5,
 'd': 6,
 'e': 7,
 'f': 8,
 'g': 9,
 'h': 10,
 'i': 11,
 'j': 12,
 'k': 13,
 'l': 14,
 'm': 15,
 'n': 16,
 'o': 17,
 'p': 18,
 'q': 19,
 'r': 20,
 's': 21,
 't': 22,
 'u': 23,
 'v': 24,
 'w': 25,
 'x': 26,
 'y': 27,
 'z': 28}

In [8]:
# Scratch check of how the inference inputs for a single word are built.
# NOTE(review): `input` below shadows the builtin input() for the rest of the
# session - consider renaming it once no other cell depends on the name.
input_word = 'test'
n_step = 5
n_class = len(num_dic)

# These empty lists are overwritten by the one-hot arrays further down.
input_batch, output_batch = [], []

input_w = input_word + 'P'*(n_step - len(input_word))  # right-pad with 'P' up to n_step characters
input = [num_dic[n] for n in input_w]  # char -> index
output = [num_dic[n] for n in 'S' + 'P'*n_step]  # 'S' followed by n_step padding symbols

input_batch = np.eye(n_class)[input]    # one-hot
output_batch = np.eye(n_class)[output]  # one-hot
# input_batch.shape would be (5, 29): n_step rows of n_class entries
output_batch.shape

(6, 29)