In [1]:
 # This mounts your Google Drive to the Colab VM.
from google.colab import drive
drive.mount('/content/drive')

FOLDERNAME = 'backup'
assert FOLDERNAME is not None, "[!] Enter the foldername."

import sys
sys.path.append('/content/drive/MyDrive/{}'.format(FOLDERNAME))

# Change directory to the current folder
%cd /content/drive/MyDrive/$FOLDERNAME

Mounted at /content/drive
/content/drive/MyDrive/backup


In [2]:
import numpy as np

import torch
import torch.nn as nn
import torch.nn.functional as F

In [3]:
! pip install sentencepiece

Collecting sentencepiece
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m17.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sentencepiece
Successfully installed sentencepiece-0.1.99


In [4]:
import sentencepiece as spm

# SentencePiece model files for German and English. The paths are relative,
# so they are resolved against the current working directory.
de_vocab_file = 'de.model'
en_vocab_file = 'en.model'

de_vocab = spm.SentencePieceProcessor()
en_vocab = spm.SentencePieceProcessor()

# Load the German and English vocabularies
de_vocab.load(de_vocab_file)
en_vocab.load(en_vocab_file)

True

In [5]:
import pandas as pd

# Read the training data from a relative path. The 'en' and 'de' columns of
# this frame are the ones passed to the data loader further down.
train_df = pd.read_csv('train.csv')

In [6]:
# data.py

from torch.utils.data import Dataset, DataLoader
from torch.utils.data.distributed import DistributedSampler

# mt Dataset
class MtDataset(Dataset):
  '''
  Parallel-sentence dataset for machine translation.

  Every row of `df` whose source and target cells are both strings is
  tokenized once, up front, into lists of vocabulary ids.

  Args:
      src_vocab: tokenizer for the source side; must provide `encode_as_ids(str)`
      trg_vocab: tokenizer for the target side; must provide `encode_as_ids(str)`
      df: table holding the sentence pairs
      src_name: name of the column with source sentences
      trg_name: name of the column with target sentences
  '''
  def __init__(self, src_vocab, trg_vocab, df, src_name, trg_name):
    self.src_vocab = src_vocab
    self.trg_vocab = trg_vocab
    self.src_train = []
    self.trg_train = []

    # Walk the two columns in lockstep instead of iterrows(), which builds a
    # Series object for every single row.
    for src_line, trg_line in zip(df[src_name], df[trg_name]):
      # Skip rows where either side is not text (e.g. NaN for a missing cell).
      if not (isinstance(src_line, str) and isinstance(trg_line, str)):
        continue
      # Tokenize the source and target sentence separately
      self.src_train.append(src_vocab.encode_as_ids(src_line))
      self.trg_train.append(trg_vocab.encode_as_ids(trg_line))

  def __len__(self):
    '''Return the number of sentence pairs that were kept.'''
    assert len(self.src_train) == len(self.trg_train)
    return len(self.src_train)

  def __getitem__(self, idx):
    '''Return the (source ids, target ids) pair at `idx` as 1-D long tensors.'''
    # dtype is pinned: torch.tensor([]) would otherwise infer float32 for an
    # empty id list, which cannot be used as embedding indices.
    return (torch.tensor(self.src_train[idx], dtype=torch.long),
            torch.tensor(self.trg_train[idx], dtype=torch.long))


# mt data collate_fn
# Process the data one batch at a time
def mt_collate_fn(inputs):
  '''
  Collate a list of (source, target) tensor pairs into two padded batches.

  Args:
      inputs: sequence of (src_tensor, trg_tensor) pairs of varying length

  Returns:
      [enc_inputs, dec_inputs]: two batch-first tensors, each padded with 0
      up to the longest sequence on its own side.
  '''
  # TODO: carried over from the original code, where it was left unspecified
  src_seqs, trg_seqs = zip(*inputs)

  # Sequences differ in length, so pad each side to its longest member
  pad = torch.nn.utils.rnn.pad_sequence

  # This list is what iterating over the DataLoader yields
  return [pad(src_seqs, batch_first=True), pad(trg_seqs, batch_first=True)]


# DataLoader
def build_mt_data_loader(src_vocab, trg_vocab, df, src_name, trg_name, args, shuffle=True):
  '''
  Build the translation dataset and wrap it in a DataLoader.

  Args:
      src_vocab, trg_vocab: tokenizers handed to MtDataset
      df: table holding the sentence pairs
      src_name, trg_name: column names of the source / target sentences
      args: mapping that provides 'n_gpu' and 'batch' (batch size)
      shuffle: whether the data order should be randomized

  Returns:
      (loader, sampler): sampler is a DistributedSampler when more than one
      GPU is configured and shuffling is requested, otherwise None.
  '''
  # Create the dataset
  dataset = MtDataset(src_vocab, trg_vocab, df, src_name, trg_name)

  if args['n_gpu'] > 1 and shuffle:
    # The sampler decides the ordering here, so `shuffle` is not passed on.
    sampler = DistributedSampler(dataset)
    loader = DataLoader(dataset, batch_size=args['batch'], sampler=sampler, collate_fn=mt_collate_fn)
    return loader, sampler

  loader = DataLoader(dataset, batch_size=args['batch'], sampler=None, shuffle=shuffle, collate_fn=mt_collate_fn)
  return loader, None

In [7]:
# Loader settings: one GPU, 256 sentence pairs per batch.
args = dict(n_gpu=1, batch=256)

# English is the source side and German the target side.
loader, sampler = build_mt_data_loader(
    src_vocab=en_vocab,
    trg_vocab=de_vocab,
    df=train_df,
    src_name='en',
    trg_name='de',
    args=args,
)

In [8]:
cnt = 0

# Print the target-side ids of the first batch and the number of rows in it.
# NOTE(review): the loop only breaks on its second iteration, so when the
# loader has more than one batch a second batch is fetched first and
# `enc`/`dec` end up bound to that second batch, not the one printed.
for [enc, dec] in loader:
  if (cnt < 1):
    print(dec)
    print(len(dec))
    cnt+= 1
  else:
    break

tensor([[2917,  249,  269,  ...,    0,    0,    0],
        [2852,   95,  138,  ...,    0,    0,    0],
        [ 118,  198,   18,  ...,    0,    0,    0],
        ...,
        [ 934,  615,  153,  ...,    0,    0,    0],
        [1954,  200, 3551,  ...,    0,    0,    0],
        [ 145,  850, 1287,  ...,    0,    0,    0]])
256


In [9]:
# Sinusoidal position representations
def get_sinusoidal(n_seq, d_model):
  '''
  Build the fixed sinusoidal position-encoding table.

  Entry (pos, i) is sin(angle) for even i and cos(angle) for odd i, where
  angle = pos / 10000 ** (2 * (i // 2) / d_model).

  Args:
      n_seq: sequence length (= number of tokens in one sentence)
      d_model: width of each position vector (e.g. 512)

  Returns:
      np.ndarray of shape (n_seq, d_model) and dtype float64. An empty
      table of shape (0, d_model) is returned when n_seq is 0.
  '''
  # Column vector of positions, broadcast against one divisor per dimension.
  positions = np.arange(n_seq, dtype=np.float64)[:, np.newaxis]

  # Dimensions 2k and 2k+1 share the same frequency, hence the floor division.
  pair_index = np.arange(d_model) // 2
  divisors = np.power(10000, 2 * pair_index / d_model)

  pos_enc_table = positions / divisors
  pos_enc_table[:, 0::2] = np.sin(pos_enc_table[:, 0::2]) # even idx
  pos_enc_table[:, 1::2] = np.cos(pos_enc_table[:, 1::2]) # odd idx

  return pos_enc_table

In [10]:
# encoder
class Encoder(nn.Module):
  '''
  Encoder skeleton: a token embedding plus a fixed positional embedding.

  Args:
      config: mapping that provides "n_enc_vocab", "n_enc_seq" and "d_model"
  '''
  def __init__(self, config):
    super().__init__()
    self.config = config

    # Trainable token embedding: one d_model-wide row per vocabulary id.
    n_vocab, d_model = config["n_enc_vocab"], config["d_model"]
    self.enc_emb = nn.Embedding(n_vocab, d_model)

    # Positional embedding initialised from the sinusoidal table and frozen,
    # so it is never updated during training.
    sinusoid = get_sinusoidal(config["n_enc_seq"], config["d_model"])
    self.pos_emb = nn.Embedding.from_pretrained(torch.FloatTensor(sinusoid), freeze=True)

    # TODO: EncoderLayer

  # TODO: forward

In [11]:
# Placeholder model settings; the vocabulary size and the maximum sequence
# length are temporary values.
tmp_config = dict(
    n_enc_vocab=8000,
    n_enc_seq=80,
    d_model=512,
    d_ff=2048,
    dropout=0.1,
)

In [12]:
# Build an encoder from the placeholder config and display its sub-modules.
Encoder(tmp_config)

Encoder(
  (enc_emb): Embedding(8000, 512)
  (pos_emb): Embedding(80, 512)
)

In [13]:
# Token-embedding weights of a freshly constructed encoder; a new Encoder is
# built here, so this is a new random initialisation.
print(Encoder(tmp_config).enc_emb.weight)

Parameter containing:
tensor([[ 0.8210, -0.0443, -2.0217,  ..., -0.1930, -0.1446, -0.1005],
        [ 0.7124,  0.9599,  0.1027,  ..., -0.2614,  1.2562, -1.2971],
        [-0.5088,  0.2574,  0.9511,  ..., -0.7704, -0.6936, -1.3515],
        ...,
        [ 0.0284, -1.7049,  0.7069,  ...,  0.1039, -0.6297, -1.7581],
        [-0.9041,  0.3820, -0.5600,  ...,  0.8880,  1.3087,  1.7325],
        [-0.7475, -0.7080,  1.1771,  ...,  0.9510,  1.1564, -2.3144]],
       requires_grad=True)


In [14]:
# Positional-embedding weights: the sinusoidal table, loaded with freeze=True.
print(Encoder(tmp_config).pos_emb.weight)

Parameter containing:
tensor([[ 0.0000e+00,  1.0000e+00,  0.0000e+00,  ...,  1.0000e+00,
          0.0000e+00,  1.0000e+00],
        [ 8.4147e-01,  5.4030e-01,  8.2186e-01,  ...,  1.0000e+00,
          1.0366e-04,  1.0000e+00],
        [ 9.0930e-01, -4.1615e-01,  9.3641e-01,  ...,  1.0000e+00,
          2.0733e-04,  1.0000e+00],
        ...,
        [ 9.9952e-01, -3.0975e-02, -8.9979e-01,  ...,  9.9997e-01,
          7.9820e-03,  9.9997e-01],
        [ 5.1398e-01, -8.5780e-01, -1.5400e-01,  ...,  9.9996e-01,
          8.0856e-03,  9.9997e-01],
        [-4.4411e-01, -8.9597e-01,  7.2432e-01,  ...,  9.9996e-01,
          8.1893e-03,  9.9997e-01]])


In [15]:
class FFN(nn.Module):
  '''
  Position-wise feed-forward block: d_model -> d_ff -> d_model.

  Both projections are Conv1d layers with kernel_size=1, so the same
  transform is applied independently at every sequence position.

  Args:
      config: mapping that provides "d_model", "d_ff" and "dropout"
  '''
  def __init__(self, config):
    super().__init__()
    self.config = config

    d_model, d_ff = config["d_model"], config["d_ff"]
    self.conv1 = nn.Conv1d(in_channels=d_model, out_channels=d_ff, kernel_size=1)
    self.conv2 = nn.Conv1d(in_channels=d_ff, out_channels=d_model, kernel_size=1)
    self.active = F.relu
    self.dropout = nn.Dropout(config["dropout"])

  def forward(self, inputs):
    '''
    Args:
        inputs: tensor of shape (batch, n_seq, d_model)

    Returns:
        tensor of shape (batch, n_seq, d_model)
    '''
    # Conv1d expects channels before length: (batch, d_model, n_seq)
    channels_first = inputs.transpose(1, 2)
    # Expand to the inner width and apply the non-linearity: (batch, d_ff, n_seq)
    hidden = self.active(self.conv1(channels_first))
    # Project back to the model width: (batch, d_model, n_seq)
    projected = self.conv2(hidden)
    # Restore the (batch, n_seq, d_model) layout, then apply dropout
    return self.dropout(projected.transpose(1, 2))

In [16]:
# Feed-forward block built from the placeholder config, used by the demo
# cells below.
ffn = FFN(tmp_config)

In [17]:
# Stand-alone token embedding (not attached to an Encoder) used to produce
# inputs for the feed-forward demo.
enc_emb = nn.Embedding(tmp_config["n_enc_vocab"], tmp_config["d_model"])

In [18]:
def returnExampleBatch(data_loader=None):
  '''
  Return the target-side tensor of the first batch.

  Args:
      data_loader: iterable yielding (enc, dec) pairs. When omitted, the
          module-level `loader` is used.

  Returns:
      The second element of the first batch, or None if the iterable
      yields nothing.
  '''
  if data_loader is None:
    data_loader = loader

  # Only the first batch is needed, so return from inside the first iteration.
  for _enc, dec in data_loader:
    return dec
  return None

# Rebinds the module-level `dec` (previously set by the preview loop) to the
# target-side tensor of a newly drawn first batch.
dec = returnExampleBatch()

In [19]:
# Show the example batch of padded target-side ids.
dec

tensor([[ 190,  253,   73,  ...,    0,    0,    0],
        [ 180, 3355,  226,  ...,    0,    0,    0],
        [3754, 2520,   95,  ...,    0,    0,    0],
        ...,
        [ 255,   79,  617,  ...,    0,    0,    0],
        [ 667,  251,  125,  ...,    0,    0,    0],
        [ 551, 5833, 3959,  ...,    0,    0,    0]])

In [20]:
# Look up an embedding vector for every id in the example batch.
# NOTE(review): `dec` holds ids from the target (German) vocabulary, while
# `enc_emb` is sized by the placeholder n_enc_vocab; confirm this pairing is
# only meant as a shape demonstration.
enc_emb(dec)

tensor([[[ 1.0415,  0.8546, -1.0981,  ..., -0.4615, -0.8794, -0.8114],
         [-1.1174, -0.2138, -0.4115,  ...,  0.5420, -1.4181,  0.8671],
         [-0.4778,  0.5613,  0.1543,  ..., -0.3974,  0.5664, -0.8370],
         ...,
         [-1.2571,  0.3741, -0.4790,  ..., -0.1764,  0.2920, -0.6411],
         [-1.2571,  0.3741, -0.4790,  ..., -0.1764,  0.2920, -0.6411],
         [-1.2571,  0.3741, -0.4790,  ..., -0.1764,  0.2920, -0.6411]],

        [[ 0.9699,  1.3379,  0.5686,  ..., -1.5771, -0.0311, -0.8929],
         [ 0.3331,  0.3030, -1.5208,  ..., -0.4909,  0.9014, -0.3108],
         [ 0.2615, -1.4597,  1.2918,  ..., -0.7148, -0.9845, -2.3205],
         ...,
         [-1.2571,  0.3741, -0.4790,  ..., -0.1764,  0.2920, -0.6411],
         [-1.2571,  0.3741, -0.4790,  ..., -0.1764,  0.2920, -0.6411],
         [-1.2571,  0.3741, -0.4790,  ..., -0.1764,  0.2920, -0.6411]],

        [[-0.4639, -0.7064, -0.0405,  ..., -0.1532,  1.0295, -0.2411],
         [ 1.8425, -1.4005,  0.0723,  ...,  0

In [21]:
# Shape of the embedded batch; the last dimension is d_model.
enc_emb(dec).shape

torch.Size([256, 99, 512])

In [22]:
# Run the embedded batch through the feed-forward block.
# NOTE(review): no .eval() call on `ffn` appears in the visible cells, so
# dropout is presumably active and repeated calls will give different values.
ffn(enc_emb(dec))

tensor([[[-0.0621,  0.1647,  0.0645,  ...,  0.4189, -0.0120,  0.1492],
         [ 0.0017,  0.0761,  0.0288,  ..., -0.2232, -0.0000,  0.3664],
         [-0.2253,  0.0000,  0.0461,  ...,  0.0597, -0.3720,  0.2997],
         ...,
         [ 0.1333, -0.0858, -0.0000,  ...,  0.2197,  0.4001,  0.1680],
         [ 0.1333, -0.0858, -0.1067,  ...,  0.2197,  0.4001,  0.1680],
         [ 0.1333, -0.0858, -0.1067,  ...,  0.2197,  0.0000,  0.1680]],

        [[ 0.1283, -0.1453, -0.0175,  ...,  0.2233, -0.1305, -0.3449],
         [-0.2191,  0.1587, -0.1020,  ...,  0.0289,  0.2410,  0.5558],
         [-0.3985, -0.1711, -0.1567,  ...,  0.1896, -0.4181,  0.0000],
         ...,
         [ 0.1333, -0.0858, -0.0000,  ...,  0.2197,  0.4001,  0.1680],
         [ 0.1333, -0.0858, -0.1067,  ...,  0.0000,  0.4001,  0.1680],
         [ 0.1333, -0.0858, -0.1067,  ...,  0.2197,  0.4001,  0.1680]],

        [[ 0.0702, -0.0319, -0.1256,  ..., -0.0000,  0.1472,  0.0247],
         [-0.0154,  0.2240, -0.3654,  ...,  0

In [23]:
# The feed-forward block keeps the input shape unchanged.
ffn(enc_emb(dec)).shape

torch.Size([256, 99, 512])