In [1]:
import os
import pickle
# 수학 관련 라이브러리
import numpy as np
import math
# pytorch 관련 라이브러리
import torch
import torch.nn as nn 
import torch.nn.functional as F 

import pandas as pd
import nltk
import tqdm
from torch.utils import data # dataset 관련된 utility 를 사용하려는 용도
from random import choice, randrange # random
from itertools import zip_longest 
import json 
import random
import pdb


nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [2]:
# 파일 올리기
from google.colab import files
uploaded = files.upload()

Saving data_files.zip to data_files.zip


In [3]:
!unzip data_files.zip

Archive:  data_files.zip
  inflating: english_id2word.pkl     
  inflating: english_vocab.pkl       
  inflating: english_word2id.pkl     
   creating: korean_data/
  inflating: korean_data/test_english.csv  
  inflating: korean_data/test_korean.csv  
  inflating: korean_data/train.csv   
  inflating: korean_id2word.pkl      
  inflating: korean_vocab.pkl        
  inflating: korean_word2id.pkl      
  inflating: train_english.pkl       
  inflating: train_korean.pkl        


In [4]:
def split_last(x, shape):
    "split the last dimension to given shape"
    shape = list(shape)
    assert shape.count(-1) <= 1
    if -1 in shape:
        shape[shape.index(-1)] = int(x.size(-1) / -np.prod(shape))
    return x.view(*x.size()[:-1], *shape)

def merge_last(x, n_dims):
    "merge the last n_dims to a dimension"
    s = x.size()
    assert n_dims > 1 and n_dims < len(s)
    return x.view(*s[:-n_dims], -1)

# Activation function

In [5]:
def gelu(x):
    return x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0)))

# Layer normalization

In [6]:
class LayerNorm(nn.Module):
    def __init__(self, cfg, variance_epsilon=1e-12):
        super().__init__()
        self.gamma = nn.Parameter(torch.ones(cfg.dim))
        self.beta  = nn.Parameter(torch.zeros(cfg.dim))
        self.variance_epsilon = variance_epsilon

    def forward(self, x):
        # get mean, variance
        u = x.mean(-1, keepdim=True) # sequence 방향 mean
        s = (x - u).pow(2).mean(-1, keepdim=True) # sequence 방향 variance
        
        # normalize
        x = (x - u) / torch.sqrt(s + self.variance_epsilon) # (x - mean)/std 
        
        return self.gamma * x + self.beta # gamma, beta를 이용해 mean, std 조정

# Embedding

In [7]:
def get_sinusoid_encoding_table(n_position, d_model):
    def cal_angle(position, hid_idx):
        return position / np.power(10000, 2 * (hid_idx // 2) / d_model)
    def get_posi_angle_vec(position):
        return [cal_angle(position, hid_j) for hid_j in range(d_model)]

    sinusoid_table = np.array([get_posi_angle_vec(pos_i) for pos_i in range(n_position)])
    sinusoid_table[:, 0::2] = np.sin(sinusoid_table[:, 0::2])  # dim 2i
    sinusoid_table[:, 1::2] = np.cos(sinusoid_table[:, 1::2])  # dim 2i+1
    return torch.FloatTensor(sinusoid_table)


class Embeddings(nn.Module):
    "The embedding module from word, position and token_type embeddings."
    def __init__(self, cfg, vocab_size):
        super().__init__()
        
        self.tok_embed = nn.Embedding(vocab_size, cfg.dim) # token embedding
        self.pos_embed = nn.Embedding.from_pretrained(get_sinusoid_encoding_table(512, cfg.dim),freeze=True) # position embedding

        self.norm = LayerNorm(cfg)
        self.drop = nn.Dropout(cfg.p_drop_hidden)

    def forward(self, x):
        seq_len = x.size(1)
        pos = torch.arange(seq_len, dtype=torch.long, device=x.device) # 0,1,2,3,4,5, ..., seq_len-1
        pos = pos.unsqueeze(0).expand_as(x) # (S,) -> (B, S)

        e = self.tok_embed(x) + self.pos_embed(pos)
        return self.drop(self.norm(e))

#  Transformer encoder

In [8]:
class Attention(nn.Module):
    #Scaled Dot Product Attention
    
    def forward(self, query, key, value, mask=None, dropout=None):
        scores = torch.matmul(query, key.transpose(-2, -1)) / math.sqrt(query.size(-1)) # scale
        
        
        if mask is not None:
            scores = scores.masked_fill(mask == 0, -1e9)

        p_attn = F.softmax(scores, dim=-1)

        if dropout is not None:
            p_attn = dropout(p_attn)

        return torch.matmul(p_attn, value), p_attn

In [9]:
class MultiHeadAttention(nn.Module):
    """ Multi-Headed Dot Product Attention """
    def __init__(self, cfg):
        super().__init__()
        self.proj_q = nn.Linear(cfg.dim, cfg.dim)
        self.proj_k = nn.Linear(cfg.dim, cfg.dim)
        self.proj_v = nn.Linear(cfg.dim, cfg.dim)
        self.drop = nn.Dropout(cfg.p_drop_attn)
        self.scores = None # for visualization
        self.n_heads = cfg.n_heads

    def forward(self, x, mask, x_q=None):
        """
        x, q(query), k(key), v(value) : (B(batch_size), S(seq_len), D(dim))
        mask : (B(batch_size) x S(seq_len))
        * split D(dim) into (H(n_heads), W(width of head)) ; D = H * W
        """
        
        # (B, S, D) -proj-> (B, S, D) -split-> (B, S, H, W) -trans-> (B, H, S, W)
        if x_q is None:
            q, k, v = self.proj_q(x), self.proj_k(x), self.proj_v(x)
        else:
            q, k, v = self.proj_q(x_q), self.proj_k(x), self.proj_v(x)
        q, k, v = (split_last(x, (self.n_heads, -1)).transpose(1, 2)
                   for x in [q, k, v])
        # (B, H, S, W) @ (B, H, W, S) -> (B, H, S, S) -softmax-> (B, H, S, S)
        scores = torch.matmul(q, k.transpose(-2, -1)) / np.sqrt(k.size(-1)) # @ == torch.matmul (dot product)
        if mask is not None:
            mask = mask.unsqueeze(1).repeat(1, self.n_heads, 1, 1)
            scores = scores.masked_fill_(mask, -1e9)
        scores = self.drop(F.softmax(scores, dim=-1))
        # (B, H, S, S) @ (B, H, S, W) -> (B, H, S, W) -trans-> (B, S, H, W)
        h = torch.matmul(scores, v).transpose(1,2).contiguous()
        # -merge-> (B, S, D)
        h = merge_last(h, 2)
        self.scores = scores
        return h

# Base feedforward network

In [10]:
class PositionWiseFeedForward(nn.Module):
    """ FeedForward Neural Networks for each position """
    def __init__(self, cfg):
        super().__init__()
        self.fc1 = nn.Linear(cfg.dim, cfg.dim_ff)
        self.fc2 = nn.Linear(cfg.dim_ff, cfg.dim)

    def forward(self, x):
        # (B, S, D) -> (B, S, D_ff) -> (B, S, D)
        return self.fc2(gelu(self.fc1(x)))

# Transformer

In [25]:
class Encoder_Block(nn.Module):
    """ Transformer Block """
    def __init__(self, cfg):
        super().__init__()
        self.attn = MultiHeadAttention(cfg)
        self.proj = nn.Linear(cfg.dim, cfg.dim)
        self.norm1 = LayerNorm(cfg)
        self.pwff = PositionWiseFeedForward(cfg)
        self.norm2 = LayerNorm(cfg)
        self.drop = nn.Dropout(cfg.p_drop_hidden)

    def forward(self, x, mask):
        h = self.attn(x, mask)
        h = self.norm1(x + self.drop(self.proj(h)))
        h = self.norm2(h + self.drop(self.pwff(h)))
        return h
    
def get_attn_pad_mask(seq_q, seq_k):
    batch_size, len_q = seq_q.size()
    batch_size, len_k = seq_k.size()
    # eq(zero) is PAD token
    pad_attn_mask = seq_k.data.eq(0).unsqueeze(1)  # batch_size x 1 x len_k(=len_q), one is masking
    return pad_attn_mask.expand(batch_size, len_q, len_k)  # batch_size x len_q x len_k
    
def get_attn_subsequent_mask(seq):
    attn_shape = [seq.size(0), seq.size(1), seq.size(1)]
    subsequent_mask = np.triu(np.ones(attn_shape), k=1)
    subsequent_mask = torch.tensor(subsequent_mask, device=seq.device).byte()
    return subsequent_mask
    
    
class Decoder_Block(nn.Module):
    def __init__(self, cfg):
        super().__init__()
        self.self_attention = MultiHeadAttention(cfg)
        self.encoder_attention = MultiHeadAttention(cfg)
        
        self.norm1 = LayerNorm(cfg)
        self.proj1 = nn.Linear(cfg.dim, cfg.dim)
        self.norm2 = LayerNorm(cfg)
        self.proj2 = nn.Linear(cfg.dim, cfg.dim)
        
        self.pwff = PositionWiseFeedForward(cfg)
        self.norm3 = LayerNorm(cfg)
        
        self.drop = nn.Dropout(cfg.p_drop_hidden)
        
    def forward(self,x , enc_outputs, dec_self_attn_mask, dec_enc_attn_mask):
        
        # self-attention -> add&norm
        h = self.self_attention(x, dec_self_attn_mask)
        h = self.norm1(x + self.drop(self.proj1(h)))
        
        # encoder attention -> add&norm
        h2 = self.encoder_attention(enc_outputs, dec_enc_attn_mask, x_q=h)
        h = self.norm2(h + self.drop(self.proj2(h2))) 
        
        # feedforward network
        h = self.norm3(h + self.drop(self.pwff(h)))
        
        return h

class Transformer(nn.Module):
    """ Transformer with Self-Attentive Blocks"""
    def __init__(self, cfg):
        super().__init__()
        #====================encoder===========================
        self.encoder_embed = Embeddings(cfg, len(korean_vocab))
        self.encoder_blocks = nn.ModuleList([Encoder_Block(cfg) for _ in range(cfg.n_layers)])

        #====================decoder============================
        self.decoder_embed = Embeddings(cfg, len(english_vocab))
        self.decoder_blocks = nn.ModuleList([Decoder_Block(cfg) for _ in range(cfg.n_layers)])
        
        #=========================================================
        self.projection = nn.Linear(cfg.dim, len(english_vocab))
        
        
    def forward(self, enc_inputs, dec_inputs):
        #============encoder============
        h = self.encoder_embed(enc_inputs)
        enc_self_attn_mask = get_attn_pad_mask(enc_inputs, enc_inputs)
        for block in self.encoder_blocks:
            h = block(h, enc_self_attn_mask)
            
        enc_outputs = h
        
        
        #============decoder============
        
        # self attention mask
        dec_self_attn_pad_mask = get_attn_pad_mask(dec_inputs, dec_inputs).float()
        dec_self_attn_subsequent_mask = get_attn_subsequent_mask(dec_inputs).float()
        dec_self_attn_mask = torch.gt((dec_self_attn_pad_mask + dec_self_attn_subsequent_mask), 0)

        # encoder attention mask
        dec_enc_attn_mask = get_attn_pad_mask(dec_inputs, enc_inputs)
        
        
        # embedding
        h = self.decoder_embed(dec_inputs)
        
        
        for block in self.decoder_blocks:
            h = block(h, enc_outputs, dec_self_attn_mask, dec_enc_attn_mask)
        #============projection==========
        
        out = self.projection(h)
        
        return out
    
    def greedy_decoding(self, enc_inputs, start_token_index = 1, end_token_index = 2, generation_max_len=128):
        
        with torch.no_grad():
            batch_size, max_length = enc_inputs.size()
            generation_end_flag = [0 for i in range(batch_size)]
            predicted_sentences = []
            #=================encoding=============
            h = self.encoder_embed(enc_inputs)
            enc_self_attn_mask = get_attn_pad_mask(enc_inputs, enc_inputs)
            for block in self.encoder_blocks:
                h = block(h, enc_self_attn_mask)    
            enc_outputs = h

            #================ greedy decoding ==================
            # dec_inputs : (batch size, 1) # <s>
            dec_inputs = torch.ones(batch_size, 1, device=enc_inputs.device) * start_token_index
            dec_inputs = dec_inputs.long()

            for i in range(generation_max_len):

                #====================== decoder =======================
                # self attention mask
                dec_self_attn_pad_mask = None
                dec_self_attn_subsequent_mask = None
                dec_self_attn_mask = None

                # encoder attention mask
                dec_enc_attn_mask = None


                # embedding
                h = self.decoder_embed(dec_inputs)


                for block in self.decoder_blocks:
                    h = block(h, enc_outputs, dec_self_attn_mask, dec_enc_attn_mask)


                out = self.projection(h[:,-1,:]) # [B, S, D] # -1는 마지막 새 단어를 만들어야하므로 
                pred = out.argmax(-1) # argmax: 가장 확률이 높은 값을 반환 # max는 값, argmax는 인덱스

#                 print(out.size(), pred)

                dec_inputs = torch.cat((dec_inputs, pred.unsqueeze(1)),dim=1)
                # (B, 1) -> (B, 2) -> (B, 3) -> ... -> (B, max_length)

    #             print(dec_inputs)

                predicted_sentences.append(pred)
                for j, boolean in enumerate(pred==end_token_index): # <s>
                    if boolean == True:
                        generation_end_flag[j] = 1
                if sum(generation_end_flag) == batch_size:
                    break

        return torch.stack(predicted_sentences, dim=1)
        
        

In [26]:
# 한 -> 영
data_ = pd.read_csv("./korean_data/train.csv")
korean_data = data_["Korean"].values
english_data = data_["English"].values

In [27]:
nltk.tokenize.word_tokenize(korean_data[100])
# nltk / spacy 차이는 구글에게 물어보기 

['그것을', '막기', '위해', ',', '많은', '과학자는', '친환경적인', '사용', '방법을', '연구했어요', '.']

In [28]:
def build_dict(seqs):
    num_skip_sent = 0
    word_count = 4
    vocab = ["<pad>","<s>","</s>","<unk>"]
    word2id = {"<pad>": 0, "<s>": 1, "</s>": 2, "<unk>": 3}
    id2word = {0: "<pad>", 1: "<s>", 2: "</s>", 3: "<unk>"}
    print("Building vocab and dict..")
    for line in seqs:
        words = line.strip().split(' ') # tokenized by space 
        for word in words:
            if word not in vocab:
                word_count += 1 # increment word_count
                vocab.append(word) # append new unique word
                index = word_count - 1 # word index (consider index 0)
                word2id[word] = index # word to index
                id2word[index] = word # index to word
    
    print("Number of unique words: %d" % len(vocab))
    print("Finised building vocab and dict!")

    return vocab, word2id, id2word

In [29]:
if os.path.isfile("./korean_vocab.pkl"):
    with open("./train_korean.pkl", "rb") as f:
        korean_data_token_ = pickle.load(f)
    with open("./train_english.pkl", "rb") as f:
        english_data_token_ = pickle.load(f)    
    
    with open("./korean_vocab.pkl", "rb") as f:
        korean_vocab = pickle.load(f)
    with open("./korean_word2id.pkl", "rb") as f:
        korean_word2id = pickle.load(f)
    with open("./korean_id2word.pkl", "rb") as f:
        korean_id2word = pickle.load(f)
    with open("./english_vocab.pkl", "rb") as f:
        english_vocab = pickle.load(f)
    with open("./english_word2id.pkl", "rb") as f:
        english_word2id = pickle.load(f)
    with open("./english_id2word.pkl", "rb") as f:
        english_id2word = pickle.load(f)

else:
    korean_data_token = []
    for sent in korean_data:
        korean_data_token.append([token for token in nltk.tokenize.word_tokenize(sent)])

    english_data_token = []
    for sent in english_data:
        english_data_token.append([token for token in nltk.tokenize.word_tokenize(sent)])

    korean_data_token_ = [' '.join(token) for token in korean_data_token]
    english_data_token_ = [' '.join(token) for token in english_data_token]    
    
    korean_vocab, korean_word2id, korean_id2word = build_dict(korean_data_token_)
    english_vocab, english_word2id, english_id2word = build_dict(english_data_token_)
    
    pickle.dump(korean_data_token_, open("./train_korean.pkl", "wb" ))
    pickle.dump(english_data_token_, open("./train_english.pkl", "wb" ))    
    
    pickle.dump(korean_vocab, open("./korean_vocab.pkl", "wb" ))
    pickle.dump(korean_word2id, open("./korean_word2id.pkl", "wb" ))
    pickle.dump(korean_id2word, open("./korean_id2word.pkl", "wb" ))

    pickle.dump(english_vocab, open("./english_vocab.pkl", "wb" ))
    pickle.dump(english_word2id, open("./english_word2id.pkl", "wb" ))
    pickle.dump(english_id2word, open("./english_id2word.pkl", "wb" ))

In [19]:
korean_vocab

['<pad>',
 '<s>',
 '</s>',
 '<unk>',
 '그들은',
 '현실에',
 '대해',
 '지나치게',
 '냉소적인',
 '태도를',
 '지니고',
 '있었습니다',
 '.',
 '경기장',
 '근처의',
 '공영',
 '주차장을',
 '이용하시면',
 '됩니다',
 '티켓은',
 '앞면에',
 '명시된',
 '경기장에서만',
 '유효합니다',
 '매장에',
 '좋은',
 '리뷰를',
 '올려주신',
 '고객님께',
 '추첨을',
 '통해',
 '1만원권',
 '쿠폰을',
 '지급해드립니다',
 '어제',
 '백화점에',
 '물건',
 '사러',
 '갔다',
 '왔어요',
 '나는',
 '지금',
 '전혀',
 '괜찮지',
 '않아서',
 '좀',
 '조용히',
 '혼자',
 '있고',
 '싶어요',
 '우선',
 '애쉬번햄에서',
 '하는',
 '일들을',
 '익히고',
 '최선을',
 '다하고',
 '싶습니다',
 '당신은',
 '트럭을',
 '운전할',
 '수',
 '있습니까',
 '?',
 '저는',
 '프로그래머',
 '이정수입니다',
 '불량',
 '학생들을',
 '쉽게',
 '지도할',
 '있습니다',
 '자세가',
 '바르게',
 '되어',
 '어깨와',
 '허리',
 '통증이',
 '완화됩니다',
 '몇',
 '가지',
 '질문이',
 '있어서',
 '메일을',
 '드립니다',
 '너는',
 '그동안',
 '나한테',
 '연락하지',
 '않았잖아',
 '우리',
 '아기가',
 '원래',
 '열이',
 '많은',
 '체질이라',
 '여름에',
 '피부트러블이',
 '것',
 '같아요',
 '그렇게',
 '생각한다면',
 '나에게도',
 '아주',
 '바쁘고',
 '힘든',
 '하루였어',
 '위의',
 '문장',
 '유형은',
 '일찍',
 '만나서',
 '수영을',
 '즐길',
 '있음을',
 '의미합니다',
 '그들에게',
 '부산에서의',
 '오픈을',
 '위한',
 '큰',
 '에너지를',
 '받았습니다',
 '내',

In [20]:
english_vocab

['<pad>',
 '<s>',
 '</s>',
 '<unk>',
 'They',
 'were',
 'way',
 'too',
 'much',
 'cynical',
 'to',
 'reality',
 '.',
 'You',
 'can',
 'use',
 'a',
 'public',
 'parking',
 'lot',
 'near',
 'the',
 'stadium',
 'Tickets',
 'are',
 'only',
 'valid',
 'for',
 'admission',
 'venue',
 'listed',
 'on',
 'face',
 'of',
 'ticket',
 'A',
 'million',
 'won',
 'cash',
 'coupon',
 'will',
 'be',
 'raffled',
 'off',
 'customers',
 'who',
 'upload',
 'good',
 'review',
 'Store',
 'I',
 'went',
 'shopping',
 'in',
 'department',
 'store',
 'yesterday',
 "'m",
 'not',
 'okay',
 'at',
 'all',
 'right',
 'now',
 ',',
 'so',
 'want',
 'alone',
 'quietly',
 'First',
 'become',
 'familiar',
 'with',
 'what',
 'should',
 'do',
 'Ashburnham',
 'and',
 'would',
 'like',
 'my',
 'best',
 'Can',
 'you',
 'drive',
 'truck',
 '?',
 'am',
 'programmer',
 'Jungsoo',
 'Lee',
 'easily',
 'teach',
 'disorderly',
 'students',
 'The',
 'fixed',
 'stance',
 'help',
 'relieve',
 'back',
 'pain',
 'sending',
 'this',
 'email

In [30]:
def batch(iterable, n=1):
    args = [iter(iterable)] * n
    return zip_longest(*args)


def pad_tensor(vec, pad, value=0, dim=0):
    """
    pad token으로 채우는 용도 
    args:
        vec - tensor to pad
        pad - the size to pad to
        dim - dimension to pad
    return:
        a new tensor padded to 'pad' in dimension 'dim'
    """
    pad_size = pad - vec.shape[0] # 문장 길이에서 모자라는 만큼 

    if len(vec.shape) == 2:
        zeros = torch.ones((pad_size, vec.shape[-1])) * value
    elif len(vec.shape) == 1:
        zeros = torch.ones((pad_size,)) * value
    else:
        raise NotImplementedError
    return torch.cat([torch.Tensor(vec), zeros], dim=dim)


In [31]:
def pad_collate(batch, values=(0, 0), dim=0):
    """
    데이터 로더에 들어가기전에 batch화 할 때 거치는 함수 
    args:
        batch - list of (tensor, label)
    reutrn:
        xs - a tensor of all examples in 'batch' after padding
        ys - a LongTensor of all labels in batch
        ws - a tensor of sequence lengths
    """

    # [[s1, s2], [s1, s2], [s1, s2], ...]

    sequence_lengths = torch.Tensor([int(x[0].shape[dim]) for x in batch]) # 각 batch 마다 길이를 얻어내고 
    sequence_lengths, xids = sequence_lengths.sort(descending=True) # 감소하는 순서로 정렬
    target_lengths = torch.Tensor([int(x[1].shape[dim]) for x in batch])

    # find longest sequence (가장 긴 sequence의 길이를 구함 )
    src_max_len = max(map(lambda x: x[0].shape[dim], batch))
    tgt_max_len = max(map(lambda x: x[1].shape[dim], batch))

    # pad according to max_len (max length 만큼 padd를 추가 )
    # [[x, y], [x, y], [x, y], ... ]
    batch = [(pad_tensor(x, pad=src_max_len, dim=dim), pad_tensor(y, pad=tgt_max_len, dim=dim)) for (x, y) in batch]

    # stack all
    xs = torch.stack([x[0] for x in batch], dim=0)
    ys = torch.stack([x[1] for x in batch], dim=0) # 하나의 tensor로 만들어줌

    # transformer에서는 잘 사용하지 않음(별다른 영향이 없음). RNN에서 주로 쓰임. 
    # 불필요한 계산을 없애기 위함 (pad)
    xs = xs[xids].contiguous() # decreasing order로 다시 나열 
    ys = ys[xids].contiguous() # xids 와 같은 순서로 

    target_lengths = target_lengths[xids] 
    return xs.long(), ys.long(), sequence_lengths.int(), target_lengths.int() # int32 / long: int64


class ToyDataset(data.Dataset): # 한 -> 영 
    """
    https://talbaumel.github.io/blog/attention/
    """
    def __init__(self,  ko_path, en_path , ko_word2id, en_word2id):
        with open(ko_path, "rb") as f:
            self.ko_seqs = pickle.load(f)
        with open(en_path, "rb") as f:
            self.en_seqs = pickle.load(f)
        self.ko_word2id = ko_word2id
        self.en_word2id = en_word2id


    def __len__(self):
        return len(self.ko_seqs)

    def __getitem__(self, index):
        ko_seqs = self.ko_seqs[index]
        en_seqs = self.en_seqs[index]
        ko_seqs = self.process(ko_seqs, self.ko_word2id)
        en_seqs = self.process(en_seqs, self.en_word2id)

        # 한국어 문장 -> 영어 문장 
        # <s> 한국어 문장 인덱스들 </s> 
        # <s> 영어 문장 인덱스들 </s>

        return ko_seqs, en_seqs       

    def process(self, seq, word2id):

        # <s> 한국어 문장 인덱스들 </s> 
        # <s> 영어 문장 인덱스들 </s>
        sequence = []
        sequence.append(word2id["<s>"]) # <SOS>
        words = seq.strip().split(' ') # space 단위로 쪼갬 
        for word in words:
            if len(sequence) < model_config.max_len:
                if word in word2id:
                    sequence.append(word2id[word]) # 만약 vocabulary에 있으면 ,해당 index 넣기
                else: # 없으면 <unk> 넣기 
                    sequence.append(3) # replace by <unk> token 
            else:
                break
        sequence.append(word2id["</s>"])
        sequence = torch.Tensor(sequence)
        return sequence

In [32]:
sample_config = { # 적당한 하이퍼파라미터
    "dim": 32,
    "dim_ff": 32,
    "n_layers": 2,
    "p_drop_attn": 0.1,
    "n_heads": 4,
    "p_drop_hidden": 0.1,
    "max_len": 30,
    "n_segments": 2,
    "vocab_size": 30522,
    "batch_size": 32
}

class AttributeDict(dict):
    def __getattr__(self, name):
        return self[name]

model_config = AttributeDict(sample_config)
model = Transformer(model_config)
model = model.cuda()
# out = model(sample[0].cuda(),sample[0].cuda())
# out.size()

In [33]:
optimizer = torch.optim.Adam(model.parameters(), lr= 0.001)
loss_fn = torch.nn.CrossEntropyLoss(ignore_index=0) # pad token(0번 토큰) 무시 

In [34]:
dataset = ToyDataset("train_korean.pkl", "train_english.pkl", korean_word2id, english_word2id)
train_loader = data.DataLoader(dataset, batch_size=model_config.batch_size, shuffle=True, collate_fn=pad_collate, drop_last=True)

In [35]:
# 학습하기 
model.train()
for epoch in range(10):
    loss_list =[]
    for idx, batch in enumerate(train_loader):
        x, y, x_len, y_len = batch
        x = x.cuda()
        y = y.cuda()
        logits = model(x, y[:,:-1]) # </s>(<EOS>) token 제외 
        loss = loss_fn(logits.view(-1,len(english_vocab)) , 
                       y[:,1:].contiguous().view(-1)) # loss 구하기 우리는 cross entropy 사용 
                       # <s>(<SOS>) token 제외
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        loss_list.append(loss.item())
        
        if (idx+1) % 400 == 0 :
            print('epoch {} iteration {}/{} loss {:.4f}'.format(epoch+1, idx+1, len(train_loader), np.mean(loss_list)))
            loss_list=[]
            
        if (idx+1) % 2000 == 0:
            with torch.no_grad():
                # greedy_decoding
                ## 생성할 때 확률이 높은 순서대로 나열하는 것 
                # beam search
                ## 모든 경우의 수의 확률을 계산해 가장 높은 것을 채택 

                greedy_pred = model.greedy_decoding(x[0].unsqueeze(0))
            greedy_pred_indices = greedy_pred[0].data.cpu().tolist()
            pred_indices = logits[0].argmax(-1).data.cpu().tolist()
            input_indices = x[0].data.cpu().tolist()
            label_indices = y[0].data.cpu().tolist()
            
            if 2 in pred_indices:
                greedy_pred_len = pred_indices.index(2)
            else:
                greedy_pred_len = 128
            
            if 2 in pred_indices:
                pred_len = pred_indices.index(2)
            else:
                pred_len = 128

            if 2 in input_indices:
                input_len = input_indices.index(2)
            else:
                input_len = 128

            if 2 in label_indices:
                label_len = label_indices.index(2)
            else:
                label_len = 128
            
            greedy_pred_words = [english_id2word[idx] for i, idx in enumerate(greedy_pred_indices) if i<= greedy_pred_len]
            pred_words = [english_id2word[idx] for i, idx in enumerate(pred_indices) if i<=pred_len]
            input_words = [korean_id2word[idx] for i, idx in enumerate(input_indices) if i<=input_len]
            label_words = [english_id2word[idx] for i, idx in enumerate(label_indices) if i<=label_len]

            print('=====================================')
            print('입력:{}'.format(' '.join(input_words[1:-1])))
            print('출력(teacher forcing):{}'.format(' '.join(pred_words[:-1])))
            print('출력(greedy decoding):{}'.format(' '.join(greedy_pred_words[:-1])))
            print('정답:{}'.format(' '.join(label_words[1:-1])))
            print('=====================================')

epoch 1 iteration 400/11250 loss 6.9128
epoch 1 iteration 800/11250 loss 5.7098
epoch 1 iteration 1200/11250 loss 5.4449
epoch 1 iteration 1600/11250 loss 5.3082
epoch 1 iteration 2000/11250 loss 5.1909
입력:이것은 제가 분석업무를 수행할 때 상사 또는 고객사의 니즈를 더욱 더 명확히 파악할 수 있으리라 생각합니다 .
출력(teacher forcing):I do you can be a to be to to same , the friend . the friend . I can n't same .
출력(greedy decoding):I 'm going to the same to the same .
정답:I think I will be able to clearly tell the needs of my clients or my boss when I do the analysis .
epoch 1 iteration 2400/11250 loss 5.0978
epoch 1 iteration 2800/11250 loss 5.0465
epoch 1 iteration 3200/11250 loss 4.9885
epoch 1 iteration 3600/11250 loss 4.9441
epoch 1 iteration 4000/11250 loss 4.9029
입력:이러한 이유로 , FTA는 한국 경제에 많은 도움이 되고 있다고 생각하며 FTA가 계속 이루어져야 한다고 생각해요 .
출력(teacher forcing):The , , , the have the the product , not important . be product . of the world . be .
출력(greedy decoding):The product of the product , and the product , the product is a lot of th

In [36]:
#결과값 출력
model.eval()
for idx, batch in enumerate(train_loader):
    x, y, x_len, y_len = batch
    x = x.cuda()
    y = y.cuda()
    pred = model.greedy_decoding(x)

    for i in range(pred.shape[0]):
        pred_indices = pred[i].data.cpu().tolist()
        input_indices = x[i].data.cpu().tolist()
        label_indices = y[i].data.cpu().tolist()

        if 2 in pred_indices:
            pred_len = pred_indices.index(2)
        else:
            pred_len = 128

        if 2 in input_indices:
            input_len = input_indices.index(2)
        else:
            input_len = 128
        
        if 2 in label_indices:
            label_len = label_indices.index(2)
        else:
            label_len = 128
        
        pred_words = [english_id2word[idx] for i,idx in enumerate(pred_indices) if i <= pred_len]
        input_words = [korean_id2word[idx] for i,idx in enumerate(input_indices) if i <= input_len]
        label_words = [english_id2word[idx] for i,idx in enumerate(label_indices) if i <= label_len]
        if pred_len>1:
            print('===================')
            print('입력:{}'.format(' '.join(input_words)))
            print('출력(greedy decoding):{}'.format(' '.join(pred_words)))
            print('답안:{}'.format(' '.join(label_words)))

[1;30;43m스트리밍 출력 내용이 길어서 마지막 5000줄이 삭제되었습니다.[0m
입력:<s> 당신의 수당은 총 3,000불입니다 . </s>
출력(greedy decoding):I have to know the total of the total of the total of the hotel . </s>
답안:<s> Your payment is a total of 3,000 dollars . </s>
입력:<s> 너 여행 때문에 피곤했나봐 . </s>
출력(greedy decoding):I have to know if you have to know the trip . </s>
답안:<s> You seemed to be tired from traveling . </s>
입력:<s> 각 도면은 그 앞에 오는 도면과 그리고 그 뒤에 오는 도면과 약간 다를 뿐이에요 . </s>
출력(greedy decoding):Each number is the screen , and the same time of the screen . </s>
답안:<s> Each drawing differ only slightly from the drawing that comes before it and the one that comes after it . </s>
입력:<s> 베지마이트는 야채에서 추출한 즙과 소금 , 이스트를 추출물로 만든 잼이라고 보면 됩니다 . </s>
출력(greedy decoding):If the water , it is the water , it is the natural cream of the water . </s>
답안:<s> Vegemite is a jam which is made with vegetable extracts , salt , and yeast . </s>
입력:<s> 지난번에 Park 모델 관련 문의드렸었는데 , 현재에도 가지고 계신 원숭이 모델이 없으신지요 ? </s>
출력(greedy decoding):I have received the

# model save

In [37]:
path = 'trained_model.pth'
torch.save(model.state_dict(), path)

## model load 

In [38]:
path = 'trained_model.pth'
model.load_state_dict(torch.load(path))

<All keys matched successfully>

## TEST

In [39]:
def test_pad_collate(batch, values=(0, 0), dim=0):
    """
    데이터 로더에 들어가기전에 batch화 할 때 거치는 함수 
    args:
        batch - list of (tensor, label)
    reutrn:
        xs - a tensor of all examples in 'batch' after padding
        ys - a LongTensor of all labels in batch
        ws - a tensor of sequence lengths
    """

    sequence_lengths = torch.Tensor([int(x.shape[dim]) for x in batch]) # 각 batch 마다 길이를 얻어내고 
    # find longest sequence (가장 긴 sequence의 길이를 구함 )
    src_max_len = max(map(lambda x: x.shape[dim], batch))
    # pad according to max_len (max length 만큼 padd를 추가 )
    batch = [pad_tensor(x, pad=src_max_len, dim=dim) for x in batch]

    # stack all
    xs = torch.stack(batch, dim=0)
    return xs.long(), sequence_lengths.int()

class TestDataset(data.Dataset):
    def __init__(self,  ko_path, ko_word2id):
        self.load_data(ko_path)
        self.ko_word2id = ko_word2id
        
        
    def load_data(self, ko_path):
        korean_test_data_ = pd.read_csv("./korean_data/test_korean.csv")
        korean_test_data =korean_test_data_['Korean'].iloc[:2000].values
        korean_test_data_token = []
        for sent in korean_test_data:
            korean_test_data_token.append([token for token in nltk.tokenize.word_tokenize(sent)])
        korean_test_data_token = [' '.join(token) for token in korean_test_data_token]
        self.ko_seqs = korean_test_data_token
        
    def __len__(self):
        return len(self.ko_seqs)

    def __getitem__(self, index):
        ko_seqs = self.ko_seqs[index]
        ko_seqs = self.process(ko_seqs, self.ko_word2id)
        return ko_seqs       

    def process(self, seq, word2id):
        sequence = []
        sequence.append(word2id["<s>"])
        words = seq.strip().split(' ')
        for word in words:
            if word in word2id:
                sequence.append(word2id[word]) # 
            else:
                sequence.append(3) # replace by <unk> token
        sequence.append(word2id["</s>"])
        sequence = torch.Tensor(sequence)
        return sequence
    
    
def test(model):
    
    with open("./korean_word2id.pkl", "rb") as f:
        korean_word2id = pickle.load(f)
    
    test_dataset = TestDataset('./test_korean.pkl', korean_word2id)
    test_loader = data.DataLoader(test_dataset, batch_size=model_config.batch_size, shuffle=False, collate_fn=test_pad_collate, drop_last=False)
    model.eval()
    j = 0
    
    f = open('prediction_result.txt', 'w')
    for idx, batch in enumerate(test_loader):
        x, x_len= batch
        x = x.cuda()
        pred = model.greedy_decoding(x)

        for i in range(pred.shape[0]):
            j+=1
            
            pred_indices = pred[i].data.cpu().tolist()
            input_indices = x[i].data.cpu().tolist()

            if 2 in pred_indices:
                pred_len = pred_indices.index(2)
            else:
                pred_len = 128

            if 2 in input_indices:
                input_len = input_indices.index(2)
            else:
                input_len = 128

            pred_words = [english_id2word[idx] for i,idx in enumerate(pred_indices) if i <= pred_len]
            input_words = [korean_id2word[idx] for i,idx in enumerate(input_indices) if i <= input_len]
            
            if j%50 == 0 :
                print('========== index {} ========='.format(j))
                print('입력:{}'.format(' '.join(input_words[1:-1])))
                print('출력:{}'.format(' '.join(pred_words[:-1])))

            f.write(' '.join(pred_words[:-1]))
            f.write('\n')
            

In [40]:
test(model)

입력:나이가 들어도 <unk> 맨날 싸우는 것 같아 .
출력:I have to know how I have to get to get to the way to the way .
입력:너는 무섭거나 어려워하는 때가 언젠데 ?
출력:I know if you know the time , I know you know ?
입력:내가 영어 잘 못 하니까 네가 한국어 공부해야 해 .
출력:I have to know how I have to study English .
입력:두 번째로 좋아하는 음악은 <unk> .
출력:Second , I have to know the music .
입력:주제 : 여러분이 세상을 <unk> 믿는 한 명의 한국인을 생각해 보세요 .
출력:To be aware of the world , you are going to see the world .
입력:<unk> 작품은 <unk> 나타나면서 한국의 <unk> 주체적으로 표현한 <unk> 평가해요 .
출력:I have to know the Korean traditional store and the place to be able to be able to be able to know the right now .
입력:내 친구는 호주에 있어서 너무 보고 싶어 .
출력:I have to know my friend before I have been now .
입력:<unk> <unk> 삼각형 모양을 매우 잘 <unk> 할 수 있습니다 .
출력:I have to know the shape of the shape of the shape of the shape of the shape .
입력:그중에서도 연어 초밥과 참치 초밥을 가장 좋아합니다 .
출력:I have to know the most popular food .
입력:나는 정말 <unk> 갖고 싶어요 .
출력:I have to know how I have to know .
입력:너는 대학교에서 어떤 걸 <unk> ?
출력:What is you know wh

In [None]:
# 과제

1. query, key, value 간의 관계에 대해 작성해주세요.
- query: 현재 처리하고자 하는 token을 나타내는 vector 
  key: label. sequence 내에 있는 모든 token에 대한 id 
  value: key와 연결된 실제 token을 나타내는 vector 
  즉, key가 폴더, value가 key라는 폴더에 위치한 data, query가 찾고자하는 값이라고 볼 수 있다. 

2. multi-head 의 필요성에 대해 작성해주세요.
- 한번의 attention을 하는 것보다 병렬로 여러 번 사용하는 게 더 효과적이다. 
  다른 시각에서 정보를 수집하는 역할을 한다. 

3. BERT 사전 학습의 두가지 태스크에 대해 설명해주세요.
- MLM: 양방향성으로 언어 모델 학습 
  NSP: 입력값에 다음 문장도 함께 추가하여 다음 문장인지 여부를 분류(Binary Classification)

4. Transformer embedding과 BERT embedding의 차이점 두가지를 작성해주세요.
- Transformer embedding: positional encoding을 통해 위치 정보를 포함한다. 
                        sin 함수를 사용 neural network가 상대적 위치를 학습할 수 있게 한다. 
  BERT embedding: positional encoding 대신 positional embedding을 사용하고, 추가로 segment embedding을 사용한다. 
                  segment embedding을 사용해 앞뒤 문장을 연결할 수 있다.
