In [1]:
import numpy as np

import torch
import torch.nn as nn
import torch.nn.functional as F

In [2]:
pip install sentencepiece

Note: you may need to restart the kernel to use updated packages.


In [4]:
import sentencepiece as spm

# Locations of the trained SentencePiece models (German / English)
de_vocab_file, en_vocab_file = '../vocab/de.model', '../vocab/en.model'

# One tokenizer instance per language
de_vocab = spm.SentencePieceProcessor()
en_vocab = spm.SentencePieceProcessor()

# Load the German and English vocabularies; the last call's result is the cell output
de_vocab.load(de_vocab_file)
en_vocab.load(en_vocab_file)

True

In [6]:
import pandas as pd

train_df = pd.read_csv('../dataset/train.csv')

In [7]:
# data.py

from torch.utils.data import Dataset, DataLoader
from torch.utils.data.distributed import DistributedSampler

# Machine-translation dataset
class MtDataset(Dataset):
  '''
  Parallel-corpus dataset that tokenizes every (source, target) sentence pair up front.

  Args:
      src_vocab: source-language tokenizer; must provide encode_as_ids(str) -> list of ids
      trg_vocab: target-language tokenizer; must provide encode_as_ids(str) -> list of ids
      df: table holding one sentence pair per row
      src_name: name of the df column with the source sentences
      trg_name: name of the df column with the target sentences
  '''
  def __init__(self, src_vocab, trg_vocab, df, src_name, trg_name):
    self.src_vocab = src_vocab
    self.trg_vocab = trg_vocab
    self.src_train = []
    self.trg_train = []

    # Iterate the two columns directly; df.iterrows() builds a Series per row and is much slower.
    for src_line, trg_line in zip(df[src_name], df[trg_name]):
      # Skip pairs where either side is not text (e.g. NaN produced by an empty CSV field).
      if not (isinstance(src_line, str) and isinstance(trg_line, str)):
        continue
      # Tokenize the source and target sentence separately.
      self.src_train.append(src_vocab.encode_as_ids(src_line))
      self.trg_train.append(trg_vocab.encode_as_ids(trg_line))

  def __len__(self):
    assert len(self.src_train) == len(self.trg_train)
    return len(self.src_train)

  def __getitem__(self, idx):
    '''Return the idx-th pair as (src_ids, trg_ids), both 1-D long tensors.'''
    # dtype is forced because torch.tensor([]) would otherwise be float32, which
    # mixes dtypes inside a batch whenever a sentence tokenizes to nothing.
    return (torch.tensor(self.src_train[idx], dtype=torch.long),
            torch.tensor(self.trg_train[idx], dtype=torch.long))


# Collate function for the MT DataLoader:
# converts a list of samples into one padded batch.
def mt_collate_fn(inputs):
  '''
  Args:
      inputs: list of (src_ids, trg_ids) tensor pairs, one per sample
  Returns:
      [enc_inputs, dec_inputs], each of shape (batch_size, longest sequence in the batch);
      this list is what iterating the DataLoader yields.
  '''
  src_seqs, trg_seqs = zip(*inputs)
  pad = torch.nn.utils.rnn.pad_sequence

  # Sequences differ in length, so right-pad each side with 0 up to its longest member.
  return [
      pad(src_seqs, batch_first=True),
      pad(trg_seqs, batch_first=True),
  ]


# DataLoader factory
def build_mt_data_loader(src_vocab, trg_vocab, df, src_name, trg_name, args, shuffle=True):
  '''
  Tokenize df into an MtDataset and wrap it in a DataLoader.

  Args:
      src_vocab, trg_vocab, df, src_name, trg_name: forwarded unchanged to MtDataset
      args: dict providing 'n_gpu' and 'batch_size'
      shuffle: whether batches should be drawn in random order
  Returns:
      (loader, sampler); sampler is a DistributedSampler when more than one GPU is
      configured and shuffle is on, otherwise None.
  '''
  dataset = MtDataset(src_vocab, trg_vocab, df, src_name, trg_name)

  if not (args['n_gpu'] > 1 and shuffle):
    # Single-GPU or ordered iteration: let the DataLoader handle shuffling itself.
    loader = DataLoader(dataset, batch_size=args['batch_size'], sampler=None, shuffle=shuffle, collate_fn=mt_collate_fn)
    return loader, None

  # Multi-GPU with shuffling: ordering is delegated to the DistributedSampler.
  sampler = DistributedSampler(dataset)
  loader = DataLoader(dataset, batch_size=args['batch_size'], sampler=sampler, collate_fn=mt_collate_fn)
  return loader, sampler

In [8]:
# Hyper-parameters shared by the data loader and every model component below.
tmp_config = {
    "n_gpu": 1, # temporary; values above 1 make build_mt_data_loader use a DistributedSampler
    "n_layer": 6, # number of EncoderLayer / DecoderLayer blocks in each stack
    "batch_size": 256, # batch size handed to the DataLoader
    "n_enc_vocab": 8000, # temporary; number of rows in the encoder token embedding
    "n_dec_vocab": 8000, # temporary; number of rows in the decoder token embedding
    "n_enc_seq": 80, # temporary; encoder positional table gets n_enc_seq + 1 rows
    "n_dec_seq": 80, # temporary; decoder positional table gets n_dec_seq + 1 rows
    "d_model": 512, # width of embeddings and of every sub-layer output
    "d_ff": 2048, # hidden width inside the FFN sub-layer
    "h": 8, # number of attention heads
    "d_h": 64, # width of each attention head
    "dropout": 0.1, # dropout probability used by attention and FFN
    "layer_norm_epsilon": 1e-12, # eps passed to every nn.LayerNorm
    "i_pad": 0, # token id treated as padding when building masks and positions
}

In [9]:
# Only the loader-related settings are forwarded to build_mt_data_loader.
args = {
    'n_gpu': tmp_config['n_gpu'],
    'batch_size': tmp_config['batch_size'],
}

# English is the source side ('en' column, en_vocab) and German the target side ('de' column, de_vocab).
loader, sampler = build_mt_data_loader(en_vocab, de_vocab, train_df, 'en', 'de', args)

In [10]:
# Sinusoidal position representations
def get_sinusoidal(n_seq, d_model):
  '''
  Build the fixed sinusoidal position-encoding table.

  Entry [pos, i] is sin(angle) for even i and cos(angle) for odd i, where
  angle = pos / 10000 ** (2 * (i // 2) / d_model).

  Args:
      n_seq: number of positions (rows) to generate
      d_model: encoding width (columns)
  Returns:
      float64 numpy array of shape (n_seq, d_model); n_seq == 0 yields an empty (0, d_model) table.
  '''
  # Column vector of positions and row vector of dimension indices broadcast to (n_seq, d_model).
  positions = np.arange(n_seq, dtype=np.float64)[:, np.newaxis]
  dims = np.arange(d_model)
  # Each sin/cos pair (2k, 2k + 1) shares one wavelength, hence the i // 2.
  wavelengths = np.power(10000, 2 * (dims // 2) / d_model)

  pos_enc_table = positions / wavelengths
  pos_enc_table[:, 0::2] = np.sin(pos_enc_table[:, 0::2]) # even idx
  pos_enc_table[:, 1::2] = np.cos(pos_enc_table[:, 1::2]) # odd idx

  return pos_enc_table

In [11]:
class FFN(nn.Module):
  '''
  Position-wise feed-forward sub-layer: d_model -> d_ff -> d_model, built from two
  kernel-size-1 convolutions with a ReLU in between and dropout on the result.
  '''
  def __init__(self, config):
    super().__init__()
    self.config = config
    d_model, d_ff = config["d_model"], config["d_ff"]

    self.conv1 = nn.Conv1d(in_channels=d_model, out_channels=d_ff, kernel_size=1)
    self.conv2 = nn.Conv1d(in_channels=d_ff, out_channels=d_model, kernel_size=1)
    self.active = F.relu
    self.dropout = nn.Dropout(config["dropout"])

  def forward(self, inputs):
    '''
    Args:
        inputs: (batch_size, n_seq, d_model)
    Returns:
        (batch_size, n_seq, d_model)
    '''
    # Conv1d expects channels first: (batch_size, d_model, n_seq)
    channels_first = inputs.transpose(1, 2)
    # (batch_size, d_ff, n_seq)
    hidden = self.active(self.conv1(channels_first))
    # (batch_size, d_model, n_seq)
    projected = self.conv2(hidden)
    # back to (batch_size, n_seq, d_model)
    return self.dropout(projected.transpose(1, 2))

In [12]:
# attention pad mask
def get_attn_pad_mask(query, key, i_pad):
  '''
  Mark the key positions that hold padding so attention can ignore them.

  Args:
      query: (batch_size, len_q) token ids on the query side
      key: (batch_size, len_k) token ids on the key side
      i_pad: padding token id
  Returns:
      bool tensor (batch_size, len_q, len_k); True where the key token equals i_pad.
  '''
  n_batch, n_query = query.size()
  n_batch, n_key = key.size()
  # (batch_size, len_k) -> (batch_size, 1, len_k)
  key_is_pad = key.data.eq(i_pad).unsqueeze(1)
  # repeat the same row for every query position: (batch_size, len_q, len_k)
  return key_is_pad.expand(n_batch, n_query, n_key)


# attention decoder mask
def get_attn_decoder_mask(seq):
  '''
  Build the look-ahead mask that hides future positions from each query position.

  Args:
      seq: (batch_size, len_seq) token ids
  Returns:
      (batch_size, len_seq, len_seq) tensor with seq's dtype; 1 strictly above the
      diagonal (future positions), 0 elsewhere.
  '''
  n_batch, n_tok = seq.size(0), seq.size(1)
  all_ones = torch.ones_like(seq).unsqueeze(-1).expand(n_batch, n_tok, n_tok)
  # keep only the strictly upper triangle
  return all_ones.triu(diagonal=1)

In [13]:
class ScaledDotProductAttention(nn.Module):
  '''softmax(Q K^T / sqrt(d_h)) V with masking and dropout on the attention weights.'''
  def __init__(self, config):
    super().__init__()
    self.config = config
    self.dropout = nn.Dropout(config["dropout"])
    self.scale = 1 / (config["d_h"] ** 0.5)

  def forward(self, Q, K, V, attn_mask):
    '''
    Args:
        Q: (batch_size, h, len_q, d_h)
        K: (batch_size, h, len_k, d_h)
        V: (batch_size, h, len_v, d_h)
        attn_mask: (batch_size, h, len_q, len_k), True where attention is not allowed
    Returns:
        output (batch_size, h, len_q, d_h), attn_weights (batch_size, h, len_q, len_k)
    '''
    # raw similarity scores: (batch_size, h, len_q, len_k)
    scores = torch.matmul(Q, K.transpose(-1, -2)) * self.scale
    # a very negative score gives masked positions zero weight after the softmax
    scores = scores.masked_fill(attn_mask, -1e9)

    normalize = nn.Softmax(dim=-1)
    attn_weights = self.dropout(normalize(scores))

    # weighted sum of the values: (batch_size, h, len_q, d_h)
    return torch.matmul(attn_weights, V), attn_weights

In [14]:
class MultiHeadAttention(nn.Module):
  '''
  Multi-head attention: project Q/K/V into h heads of width d_h, run scaled
  dot-product attention per head, concatenate the heads and project back to d_model.
  '''
  def __init__(self, config):
    super().__init__()
    self.config = config
    self.W_Q = nn.Linear(self.config['d_model'], self.config['h'] * self.config['d_h'])
    self.W_K = nn.Linear(self.config['d_model'], self.config['h'] * self.config['d_h'])
    self.W_V = nn.Linear(self.config['d_model'], self.config['h'] * self.config['d_h'])
    self.scaled_dot_attn = ScaledDotProductAttention(self.config)
    self.linear = nn.Linear(self.config['h'] * self.config['d_h'], self.config['d_model'])
    self.dropout = nn.Dropout(self.config['dropout'])

  def forward(self, Q, K, V, attn_mask):
    '''
    Args:
        Q: (batch_size, len_q, d_model)
        K: (batch_size, len_k, d_model)
        V: (batch_size, len_k, d_model)
        attn_mask: (batch_size, len_q, len_k), True where attention is not allowed
    Returns:
        output (batch_size, len_q, d_model), attn_weights (batch_size, h, len_q, len_k)
    '''
    # Take the batch dimension from the input rather than config['batch_size']:
    # the last batch of an epoch (or any inference batch) can be smaller than the
    # configured size, and view() would then fail or silently mis-shape the heads.
    batch_size = Q.size(0)
    n_head, d_head = self.config['h'], self.config['d_h']

    # linearly project the queries, keys and values
    # (batch_size, len, d_model) -> (batch_size, len, h * d_h)
    # -> (batch_size, len, h, d_h) -> (batch_size, h, len, d_h)
    pjted_Q = self.W_Q(Q).view(batch_size, -1, n_head, d_head).transpose(1, 2)
    pjted_K = self.W_K(K).view(batch_size, -1, n_head, d_head).transpose(1, 2)
    pjted_V = self.W_V(V).view(batch_size, -1, n_head, d_head).transpose(1, 2)
    # the same mask applies to every head:
    # (batch_size, len_q, len_k) -> (batch_size, h, len_q, len_k)
    attn_mask = attn_mask.unsqueeze(1).repeat(1, n_head, 1, 1)
    # scaled dot product attention
    # (batch_size, h, len_q, d_h), (batch_size, h, len_q, len_k)
    context, attn_weights = self.scaled_dot_attn(pjted_Q, pjted_K, pjted_V, attn_mask)
    # concatenate the heads
    # (batch_size, h, len_q, d_h) -> (batch_size, len_q, h * d_h)
    context = context.transpose(1, 2).contiguous().view(batch_size, -1, n_head * d_head)
    # output projection: (batch_size, len_q, h * d_h) -> (batch_size, len_q, d_model)
    output = self.linear(context)
    output = self.dropout(output)
    # (batch_size, len_q, d_model), (batch_size, h, len_q, len_k)
    return output, attn_weights

In [15]:
# encoder layer
class EncoderLayer(nn.Module):
  '''
  One encoder block: self-attention and a feed-forward network, each followed by a
  residual connection and layer normalization.
  '''
  def __init__(self, config):
    super().__init__()
    self.config = config

    self.self_attn = MultiHeadAttention(config)
    self.layer_norm1 = nn.LayerNorm(config["d_model"], eps=config["layer_norm_epsilon"])
    self.ffn = FFN(config)
    self.layer_norm2 = nn.LayerNorm(config["d_model"], eps=config["layer_norm_epsilon"])

  def forward(self, inputs, attn_mask):
    '''
    Args:
        inputs: (batch_size, len_seq, d_model)
        attn_mask: (batch_size, len_q, len_k)
    Returns:
        output (batch_size, len_q, d_model), attn_weights (batch_size, h, len_q, len_k)
    '''
    # self-attention sub-layer: queries, keys and values are all the layer input
    attended, attn_weights = self.self_attn(inputs, inputs, inputs, attn_mask)
    hidden = self.layer_norm1(inputs + attended)
    # feed-forward sub-layer with its own residual + norm
    output = self.layer_norm2(self.ffn(hidden) + hidden)
    return output, attn_weights

In [16]:
# encoder
class Encoder(nn.Module):
  '''
  Transformer encoder: token embedding plus frozen sinusoidal position embedding,
  followed by a stack of EncoderLayer blocks.
  '''
  def __init__(self, config):
    super().__init__()
    self.config = config

    self.enc_emb = nn.Embedding(config["n_enc_vocab"], config["d_model"])
    # one extra row: position 0 is used for padding tokens, real tokens start at 1
    sinusoid = get_sinusoidal(config["n_enc_seq"] + 1, config["d_model"])
    self.pos_emb = nn.Embedding.from_pretrained(torch.FloatTensor(sinusoid), freeze=True)

    self.layers = nn.ModuleList([EncoderLayer(config) for _ in range(config["n_layer"])])

  def forward(self, inputs):
    '''
    Args:
        inputs: (batch_size, len_enc_seq) token ids
    Returns:
        output (batch_size, len_enc_seq, d_model) and a list with one
        (batch_size, h, len_enc_seq, len_enc_seq) attention-weight tensor per layer
    '''
    i_pad = self.config["i_pad"]
    n_batch, n_tok = inputs.size(0), inputs.size(1)

    # 1-based positions for every token, 0 wherever the token is padding
    # (batch_size, len_enc_seq)
    positions = torch.arange(1, n_tok + 1, device=inputs.device, dtype=inputs.dtype).expand(n_batch, n_tok)
    positions = positions.masked_fill(inputs.eq(i_pad), 0)

    # (batch_size, len_enc_seq, d_model)
    output = self.enc_emb(inputs) + self.pos_emb(positions)

    # (batch_size, len_enc_seq, len_enc_seq)
    attn_mask = get_attn_pad_mask(inputs, inputs, i_pad)

    attn_weights_history = []
    for layer in self.layers:
      # each layer consumes the previous layer's output
      output, attn_weights = layer(output, attn_mask)
      attn_weights_history.append(attn_weights)

    return output, attn_weights_history

In [17]:
class DecoderLayer(nn.Module):
  '''
  One decoder block: masked self-attention, encoder-decoder attention and a
  feed-forward network, each followed by a residual connection and layer normalization.
  '''
  def __init__(self, config):
    super().__init__()
    self.config = config

    self.self_attn = MultiHeadAttention(config)
    self.layer_norm1 = nn.LayerNorm(config["d_model"], eps=config["layer_norm_epsilon"])
    self.enc_dec_attn = MultiHeadAttention(config)
    self.layer_norm2 = nn.LayerNorm(config["d_model"], eps=config["layer_norm_epsilon"])
    self.ffn = FFN(config)
    self.layer_norm3 = nn.LayerNorm(config["d_model"], eps=config["layer_norm_epsilon"])

  def forward(self, dec_inputs, enc_outputs, self_attn_mask, enc_dec_attn_mask):
    '''
    Args:
        dec_inputs: (batch_size, len_dec_seq, d_model)
        enc_outputs: (batch_size, len_enc_seq, d_model)
        self_attn_mask: (batch_size, len_dec_seq, len_dec_seq)
        enc_dec_attn_mask: (batch_size, len_dec_seq, len_enc_seq)
    Returns:
        output (batch_size, len_dec_seq, d_model),
        self_attn_weights (batch_size, h, len_dec_seq, len_dec_seq),
        enc_dec_attn_weights (batch_size, h, len_dec_seq, len_enc_seq)
    '''
    # 1) masked self-attention over the decoder sequence
    attended, self_attn_weights = self.self_attn(dec_inputs, dec_inputs, dec_inputs, self_attn_mask)
    hidden = self.layer_norm1(dec_inputs + attended)

    # 2) attention from decoder states (queries) to encoder outputs (keys and values)
    cross, enc_dec_attn_weights = self.enc_dec_attn(hidden, enc_outputs, enc_outputs, enc_dec_attn_mask)
    hidden = self.layer_norm2(hidden + cross)

    # 3) position-wise feed-forward network
    output = self.layer_norm3(hidden + self.ffn(hidden))
    return output, self_attn_weights, enc_dec_attn_weights

In [18]:
class Decoder(nn.Module):
  '''
  Transformer decoder: token embedding plus frozen sinusoidal position embedding,
  followed by a stack of DecoderLayer blocks applied one after another.
  '''
  def __init__(self, config):
    super().__init__()
    self.config = config

    self.dec_emb = nn.Embedding(self.config["n_dec_vocab"], self.config["d_model"])
    # one extra row: position 0 is used for padding tokens, real tokens start at 1
    pos_enc_table = torch.FloatTensor(get_sinusoidal(self.config["n_dec_seq"] + 1, self.config["d_model"]))
    self.pos_emb = nn.Embedding.from_pretrained(pos_enc_table, freeze=True)

    self.layers = nn.ModuleList([DecoderLayer(self.config) for _ in range(self.config["n_layer"])])

  def forward(self, dec_inputs, enc_inputs, enc_outputs):
    '''
    Args:
        dec_inputs: (batch_size, len_dec_seq) target token ids
        enc_inputs: (batch_size, len_enc_seq) source token ids, used only to locate source padding
        enc_outputs: (batch_size, len_enc_seq, d_model) encoder output
    Returns:
        dec_output (batch_size, len_dec_seq, d_model),
        list of per-layer self-attention weights (batch_size, h, len_dec_seq, len_dec_seq),
        list of per-layer encoder-decoder attention weights (batch_size, h, len_dec_seq, len_enc_seq)
    '''
    # 1-based positions for every token, 0 wherever the token is padding
    # (batch_size, len_dec_seq)
    positions = torch.arange(dec_inputs.size(1), device=dec_inputs.device, dtype=dec_inputs.dtype).expand(dec_inputs.size(0), dec_inputs.size(1)).contiguous() + 1
    pos_mask = dec_inputs.eq(self.config["i_pad"])
    positions.masked_fill_(pos_mask, 0)

    # (batch_size, len_dec_seq, d_model)
    dec_output = self.dec_emb(dec_inputs) + self.pos_emb(positions)

    # (batch_size, len_dec_seq, len_dec_seq)
    attn_pad_mask = get_attn_pad_mask(dec_inputs, dec_inputs, self.config["i_pad"])
    # (batch_size, len_dec_seq, len_dec_seq)
    attn_decoder_mask = get_attn_decoder_mask(dec_inputs)
    # a position is blocked if it is padding OR lies in the future
    # (batch_size, len_dec_seq, len_dec_seq)
    self_attn_mask = torch.gt((attn_pad_mask + attn_decoder_mask), 0)
    # (batch_size, len_dec_seq, len_enc_seq)
    enc_dec_attn_mask = get_attn_pad_mask(dec_inputs, enc_inputs, self.config["i_pad"])

    self_attn_weights_history, enc_dec_attn_weights_history = [], []
    for layer in self.layers:
      # Feed each layer the previous layer's output. Rebinding dec_output is what
      # actually stacks the layers; it also keeps the return value defined when
      # the stack is empty.
      dec_output, self_attn_weights, enc_dec_attn_weights = layer(dec_output, enc_outputs, self_attn_mask, enc_dec_attn_mask)
      self_attn_weights_history.append(self_attn_weights)
      enc_dec_attn_weights_history.append(enc_dec_attn_weights)

    return dec_output, self_attn_weights_history, enc_dec_attn_weights_history

In [19]:
def returnExampleBatch():
  '''
  Return the first (enc, dec) batch produced by the module-level `loader`.
  Falls through and returns None when the loader yields nothing.
  '''
  for enc, dec in loader:
    # leave on the very first batch
    return enc, dec

q, k = returnExampleBatch()

In [20]:
# Size the positional tables from the padded lengths of the single example batch (q = source, k = target).
# NOTE(review): Encoder/Decoder build pos_emb with n_*_seq + 1 rows and index it with positions up to
# the batch's padded length, so a later batch padded longer than this one would index out of range - confirm
# before running these models over the full loader.
tmp_config["n_enc_seq"] = q.size(1)
tmp_config["n_dec_seq"] = k.size(1)
encoder = Encoder(tmp_config)
decoder = Decoder(tmp_config)

In [21]:
dec_emb = nn.Embedding(tmp_config["n_dec_vocab"], tmp_config["d_model"])
dec_emb(k)

tensor([[[-0.3198,  1.2951, -1.3993,  ...,  1.0435, -1.2243,  0.1747],
         [ 1.3494,  0.3532, -1.1673,  ..., -0.5503,  0.7476,  0.9111],
         [ 0.8210,  0.0699,  0.6038,  ...,  2.5298,  0.4141, -0.2870],
         ...,
         [-0.1503,  0.7005,  0.1998,  ...,  0.9149, -0.3655, -2.8497],
         [-0.1503,  0.7005,  0.1998,  ...,  0.9149, -0.3655, -2.8497],
         [-0.1503,  0.7005,  0.1998,  ...,  0.9149, -0.3655, -2.8497]],

        [[ 0.4927, -0.2389, -0.9813,  ..., -0.7799, -0.1842, -0.6516],
         [-0.4706,  1.0932, -0.8204,  ..., -0.1230, -0.8389,  0.0110],
         [-0.2819, -1.6732, -0.8879,  ...,  1.0908, -1.8238,  0.2782],
         ...,
         [-0.1503,  0.7005,  0.1998,  ...,  0.9149, -0.3655, -2.8497],
         [-0.1503,  0.7005,  0.1998,  ...,  0.9149, -0.3655, -2.8497],
         [-0.1503,  0.7005,  0.1998,  ...,  0.9149, -0.3655, -2.8497]],

        [[ 1.6670, -1.5437, -0.7686,  ...,  0.6676,  0.5874, -2.2881],
         [ 0.2686,  1.6338, -1.7464,  ...,  2

In [22]:
enc_output, enc_attn_weights_history = encoder(q)

In [23]:
dec_output, self_attn_weights_history, enc_dec_attn_weights_history = decoder(k, q, enc_output)

In [24]:
dec_output

tensor([[[-0.6147,  0.2046, -0.5662,  ...,  0.4568,  0.7345, -0.0306],
         [-2.1806, -1.8684, -0.7533,  ...,  0.2911,  0.5270, -0.4030],
         [-0.0437, -1.8510, -0.2617,  ...,  0.8317,  0.6737, -0.4886],
         ...,
         [-1.1348,  1.0266, -1.9929,  ...,  0.9832, -0.1817,  1.6928],
         [-1.1243,  0.9976, -1.9546,  ...,  0.9526, -0.1631,  1.6141],
         [-1.2238,  1.0436, -2.0170,  ...,  0.9551, -0.2663,  1.7579]],

        [[ 0.1220, -1.9133, -0.6859,  ...,  1.6220, -0.4513,  1.1376],
         [-0.3976, -0.3339, -0.6933,  ..., -0.4622, -0.7073,  1.1712],
         [-0.7005, -0.5292, -1.0053,  ..., -0.1962, -0.3625, -1.0097],
         ...,
         [-0.7120,  1.1325, -2.0580,  ...,  0.8775, -0.8116,  1.5938],
         [-0.9712,  1.2309, -1.9925,  ...,  0.7699, -0.4765,  1.6661],
         [-0.7355,  1.0696, -1.4077,  ...,  0.8887, -0.3985,  1.6437]],

        [[-0.2350, -0.3345, -1.1349,  ...,  0.9967,  0.3326, -1.3287],
         [ 0.3488, -2.0294, -0.2325,  ...,  0

In [25]:
dec_output.size()

torch.Size([256, 78, 512])

In [28]:
len(self_attn_weights_history)

6

In [29]:
self_attn_weights_history[0].size()

torch.Size([256, 8, 78, 78])

In [30]:
len(enc_dec_attn_weights_history)

6

In [32]:
enc_dec_attn_weights_history[0].size()

torch.Size([256, 8, 78, 92])