In [None]:
!pip install konlpy
!pip install transformers
!pip install sentencepiece

from konlpy.tag import Komoran, Okt
from tqdm import tqdm, trange
import sentencepiece as spm
import re
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.data as data



In [None]:
%matplotlib inline
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
# `device` is read as a global by the training/check helpers defined below.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# Seed PyTorch's global RNG with a fixed value.
torch.manual_seed(515)

<torch._C.Generator at 0x7f58dd6f0a50>

In [None]:
# Show which device string was selected.
print(device)

cpu


In [None]:
# Colab only: mount Google Drive at /content/drive; the vocab model and the
# rating files loaded further down are read from paths under this mount.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# preprocessing
def load_vocab(file):
    """Create a SentencePiece processor and load a model file into it.

    Args:
        file: Path of the SentencePiece model file handed to ``load``.

    Returns:
        The ``spm.SentencePieceProcessor`` instance after ``load(file)`` was
        called on it.
    """
    processor = spm.SentencePieceProcessor()
    processor.load(file)
    return processor

In [None]:
# utils

def train(config, model, criterion, optimizer, trn_dataloader, nb_epochs):
    """Train an encoder/decoder classifier and return the per-epoch loss totals.

    Every batch is moved to the notebook-level ``device`` before the forward
    pass. The model is called as ``model(enc_inputs, dec_inputs)`` and the
    first element of its result is used as the logits.

    Args:
        config: Not read by this function; kept so existing calls keep working.
        model: Module to optimise; put into training mode once at the start.
        criterion: Loss called as ``criterion(logits, labels)``.
        optimizer: Optimizer stepped once per batch.
        trn_dataloader: Iterable yielding ``(labels, enc_inputs, dec_inputs)``.
        nb_epochs: Number of passes over ``trn_dataloader``.

    Returns:
        list[float]: for each epoch, the SUM of the batch losses of that epoch.
    """
    trn_losses = []
    model.train()

    for epoch in range(nb_epochs):
        epoch_loss = 0.0
        n_batches = 0
        for labels, enc_inputs, dec_inputs in trn_dataloader:
            labels = labels.to(device)
            enc_inputs = enc_inputs.to(device)
            dec_inputs = dec_inputs.to(device)

            optimizer.zero_grad()
            logits = model(enc_inputs, dec_inputs)[0]
            train_loss = criterion(logits, labels)
            train_loss.backward()
            optimizer.step()

            epoch_loss += train_loss.item()
            n_batches += 1
        trn_losses.append(epoch_loss)
        # Report this epoch's mean batch loss. The previous version printed the
        # mean over all accumulated epoch sums, which blends epochs together and
        # scales with the number of batches.
        print("Epoch: {:3d} | Train Loss: {:.6f}".format(epoch + 1, epoch_loss / max(n_batches, 1)))
    return trn_losses

def _train(model, criterion, optimizer, input_size, output_size,
          train_dataloader, validation_dataloader,
          nb_epochs, max_norm=5.0):
    """Train ``model`` and run a validation pass after every epoch.

    Each batch is an ``(x, y)`` pair. ``x`` gets a leading dimension via
    ``unsqueeze(0)`` before being fed to the model, ``y`` is cast to float, and
    the model output is reshaped to ``(-1, output_size)`` before the loss is
    computed. Tensors are moved to the notebook-level ``device``.

    Args:
        model: Module called as ``model(x)``.
        criterion: Loss called as ``criterion(prediction, y)``.
        optimizer: Optimizer stepped once per training batch.
        input_size: Not read by this function; kept for call-site compatibility.
        output_size: Size of the last dimension the model output is viewed as.
        train_dataloader: Sized iterable of ``(x, y)`` training batches.
        validation_dataloader: Sized iterable of ``(x, y)`` validation batches.
        nb_epochs: Number of epochs to run.
        max_norm: Maximum gradient norm handed to ``clip_grad_norm_``. This name
            was previously read without being defined, so the function raised
            ``NameError`` on the first batch.

    Returns:
        tuple[list[float], list[float]]: mean batch loss per epoch for the
        training set and for the validation set.
    """
    trn_loss_list = []
    val_loss_list = []
    for epoch in range(nb_epochs):

        # ---- training pass ----
        model.train()  # validation below switches to eval mode, so reset each epoch
        trn_loss = 0.0
        for x_train, y_train in train_dataloader:
            x_train = x_train.unsqueeze(0).to(device)
            y_train = y_train.float().to(device)

            optimizer.zero_grad()
            hypothesis = model(x_train).view(-1, output_size)
            train_loss = criterion(hypothesis, y_train)
            train_loss.backward()
            # Clip BEFORE the optimizer step; clipping afterwards has no effect
            # because the gradients have already been applied.
            nn.utils.clip_grad_norm_(model.parameters(), max_norm)
            optimizer.step()

            # accumulate the mean batch loss of this epoch
            trn_loss += train_loss.item() / len(train_dataloader)
        trn_loss_list.append(trn_loss)

        # ---- validation pass (no gradients) ----
        model.eval()
        with torch.no_grad():
            val_loss = 0.0
            for x_validation, y_validation in validation_dataloader:
                x_validation = x_validation.unsqueeze(0).to(device)
                y_validation = y_validation.float().to(device)

                prediction = model(x_validation).view(-1, output_size)
                validation_loss = criterion(prediction, y_validation)

                # accumulate the mean batch loss of this epoch
                val_loss += validation_loss.item() / len(validation_dataloader)
            val_loss_list.append(val_loss)

        print("Epoch: {:3d} | Train Loss: {:.6f} | Val Loss: {:.6f}".format(epoch + 1, trn_loss, val_loss))
    #torch.save(model, './data/temperature_model.pt')
    return trn_loss_list, val_loss_list

def checkdata(dataloader, max_batches=6):
    """Print the input sizes of the first few batches and return the last one seen.

    Iteration stops after ``max_batches`` batches. The previous version kept
    pulling every remaining batch from the loader even though nothing was done
    with them.

    Args:
        dataloader: Iterable yielding ``(label, enc_inputs, dec_inputs)``; the
            two inputs must provide ``.size()``.
        max_batches: How many leading batches to inspect (default 6).

    Returns:
        tuple: ``(label, enc_inputs, dec_inputs)`` of the last inspected batch.

    Raises:
        ValueError: If no batch could be inspected (empty loader or
            ``max_batches`` < 1).
    """
    from itertools import islice

    inspected = None
    for batch, values in enumerate(islice(dataloader, max(max_batches, 0))):
        label, enc_inputs, dec_inputs = values
        print("{} Batch".format(batch))
        print("Input: {}".format(enc_inputs.size()))
        print("Target: {}".format(dec_inputs.size()))
        inspected = (label, enc_inputs, dec_inputs)

    if inspected is None:
        raise ValueError("checkdata: no batch available to inspect")
    return inspected

def checkfunction(criterion, dataloader, output_size, epoch):
    """Run forward passes on the first batches and print diagnostics.

    Relies on the notebook-level globals ``model`` and ``device`` (neither is a
    parameter). No backward pass or optimizer step is performed.

    Args:
        criterion: Loss called as ``criterion(logits, labels)``.
        dataloader: Iterable yielding ``(labels, enc_inputs, dec_inputs)``.
        output_size: Not read by this function.
        epoch: Zero-based index of the last batch to process; iteration stops
            once the batch with this index has been printed.

    Returns:
        None.
    """
    model.train()
    for step, (labels, enc_inputs, dec_inputs) in enumerate(dataloader):
        labels = labels.long().to(device)
        enc_inputs = enc_inputs.to(device)
        dec_inputs = dec_inputs.to(device)

        # the model returns a tuple; its first element holds the logits
        logits = model(enc_inputs, dec_inputs)[0]
        loss = criterion(logits, labels)

        print("Batch:", step + 1)
        print("enc_inputs size:", enc_inputs.size())
        print("dec_inputs size:", dec_inputs.size())
        print("logits", logits)
        print("logits size:", logits.size())
        print("Loss:", loss)
        print()

        if step == epoch:
            break

In [None]:
class Config(dict):
    """Dictionary whose items can also be read and written as attributes.

    ``cfg.key`` returns ``cfg["key"]`` and ``cfg.key = v`` stores ``cfg["key"] = v``.
    Names that resolve through normal attribute lookup (for example dict methods
    such as ``items``) are not looked up in the mapping.
    """

    def __getattr__(self, name):
        # Only reached when normal attribute lookup fails. A missing key must
        # surface as AttributeError (not KeyError) so that hasattr(),
        # getattr(obj, name, default) and copy.deepcopy() behave correctly.
        try:
            return self[name]
        except KeyError:
            raise AttributeError(
                "'{}' object has no attribute '{}'".format(type(self).__name__, name)
            ) from None

    __setattr__ = dict.__setitem__

    @classmethod
    def load(cls, file):
        """Read a JSON object from ``file`` and return it as an instance of ``cls``."""
        with open(file, 'r', encoding='utf-8') as f:
            # build via `cls` so subclasses get an instance of their own type
            return cls(json.load(f))

class CustomDataSet(data.Dataset):
    """Dataset built from a JSON-lines file of labelled, pre-tokenised documents.

    Each non-blank line must be a JSON object with a ``"label"`` entry and a
    ``"doc"`` entry (a list of pieces). Pieces are mapped to ids with
    ``vocab.piece_to_id`` while loading.
    """

    def __init__(self, vocab, infile):
        """Load every record of ``infile`` into memory.

        Args:
            vocab: Object providing ``piece_to_id(piece)``.
            infile: Path of the JSON-lines file (read as UTF-8).
        """
        self.vocab = vocab
        self.labels = []
        self.sentences = []

        # First pass only counts lines so the progress bar can show a total.
        with open(infile, "r", encoding="utf-8") as f:
            line_cnt = sum(1 for _ in f)

        with open(infile, "r", encoding="utf-8") as f:
            for line in tqdm(f, total=line_cnt, desc=f"Loading {infile}", unit=" lines"):
                if not line.strip():
                    # a blank line would make json.loads raise
                    continue
                # named `record`, not `data`, to avoid shadowing the
                # torch.utils.data alias used for the base class
                record = json.loads(line)
                self.labels.append(record["label"])
                self.sentences.append([vocab.piece_to_id(p) for p in record["doc"]])

    def __len__(self):
        """Return the number of loaded records."""
        assert len(self.labels) == len(self.sentences)
        return len(self.labels)

    def __getitem__(self, item):
        """Return ``(label, encoder ids, decoder ids)`` tensors for one record.

        The decoder input is a single id: the id of the ``"[BOS]"`` piece.
        """
        return (torch.tensor(self.labels[item]),
                # explicit dtype: an empty id list would otherwise become float32
                torch.tensor(self.sentences[item], dtype=torch.long),
                torch.tensor([self.vocab.piece_to_id("[BOS]")]))
        
def movie_collate_fn(inputs):
    """Collate ``(label, enc_input, dec_input)`` samples into one padded batch.

    Args:
        inputs: Sequence of 3-tuples of tensors, one tuple per sample.

    Returns:
        list: ``[labels, enc_inputs, dec_inputs]`` where the labels are stacked
        along dim 0 and both input groups are right-padded with 0 to the longest
        sequence in the batch (batch dimension first).
    """
    label_seq, enc_seq, dec_seq = zip(*inputs)
    pad = torch.nn.utils.rnn.pad_sequence

    padded_enc = pad(enc_seq, batch_first=True, padding_value=0)
    padded_dec = pad(dec_seq, batch_first=True, padding_value=0)
    return [torch.stack(label_seq, dim=0), padded_enc, padded_dec]

In [None]:
# Model
class Classification(nn.Module):
    """Transformer followed by a max-pool over dim 1 and a bias-free linear head."""

    def __init__(self, config):
        """Build the model.

        Args:
            config: Mapping read with ``config['hidden_size']`` and
                ``config['output_size']``; also handed to ``Transformer``.
        """
        super().__init__()

        # values
        self.config = config

        # layers
        self.transformer = Transformer(self.config)
        self.projection = nn.Linear(self.config['hidden_size'], self.config['output_size'], bias = False)

    def forward(self, enc_inputs, dec_inputs):
        """Return ``(logits, enc_self_attn_probs, dec_self_attn_probs, dec_enc_attn_probs)``."""
        dec_outputs, enc_self_attn_probs, dec_self_attn_probs, dec_enc_attn_probs = self.transformer(enc_inputs, dec_inputs)
        # element-wise maximum over dim 1 of the decoder output
        dec_outputs, _ = torch.max(dec_outputs, dim=1)
        logits = self.projection(dec_outputs)
        return logits, enc_self_attn_probs, dec_self_attn_probs, dec_enc_attn_probs

    def save(self, epoch, loss, score, path):
        """Write ``epoch``, ``loss``, ``score`` and the state dict to ``path``."""
        torch.save({
            "epoch": epoch,
            "loss": loss,
            "score": score,
            "state_dict": self.state_dict()
        }, path)

    def load(self, path):
        """Restore weights written by ``save`` and return ``(epoch, loss, score)``."""
        # Deserialize onto the CPU so a checkpoint written on a GPU also loads
        # on a CPU-only runtime; load_state_dict then copies the values into
        # this module's parameters on whatever device they live on.
        save = torch.load(path, map_location="cpu")
        self.load_state_dict(save["state_dict"])
        return save["epoch"], save["loss"], save["score"]

class Transformer(nn.Module):
    """Encoder/decoder pair built from one shared config object."""

    def __init__(self, config):
        """Create the ``Encoder`` and ``Decoder`` from ``config``."""
        super().__init__()

        # values
        self.config = config

        # layers
        self.encoder = Encoder(config)
        self.decoder = Decoder(config)

    def forward(self, enc_inputs, dec_inputs):
        """Encode ``enc_inputs``, decode ``dec_inputs`` against the result.

        Returns:
            tuple: ``(dec_outputs, enc_self_attn_probs, dec_self_attn_probs,
            dec_enc_attn_probs)``.
        """
        enc_outputs, enc_self_attn_probs = self.encoder(enc_inputs)
        decoded = self.decoder(dec_inputs, enc_inputs, enc_outputs)
        dec_outputs, dec_self_attn_probs, dec_enc_attn_probs = decoded
        return dec_outputs, enc_self_attn_probs, dec_self_attn_probs, dec_enc_attn_probs

In [None]:
# Encoding
class Encoder(nn.Module):
    """Token embedding plus frozen sinusoidal positions, then a stack of encoder layers."""

    def __init__(self, config):
        """Build embeddings and ``config.n_layer`` encoder layers.

        Reads ``n_enc_vocab``, ``hidden_size``, ``n_enc_seq`` and ``n_layer``
        from ``config`` as attributes.
        """
        super().__init__()

        self.config = config

        self.enc_emb = nn.Embedding(config.n_enc_vocab, config.hidden_size)
        # one extra row because real tokens are numbered from 1 (see forward)
        table = get_sinusoid_encoding_table(config.n_enc_seq + 1, config.hidden_size)
        self.pos_emb = nn.Embedding.from_pretrained(torch.FloatTensor(table), freeze=True)
        self.layers = nn.ModuleList([EncoderLayer(config) for _ in range(config.n_layer)])

    def forward(self, x):
        """Encode a batch of token ids.

        Args:
            x: 2-D tensor of token ids (``x.size(0)`` by ``x.size(1)``).

        Returns:
            tuple: ``(outputs, attn_probs)`` where ``attn_probs`` is a list with
            one entry per layer.
        """
        batch_size, seq_len = x.size(0), x.size(1)
        # positions run 1..seq_len; entries equal to i_pad get position 0
        positions = torch.arange(1, seq_len + 1, device=x.device, dtype=x.dtype)
        positions = positions.expand(batch_size, seq_len).masked_fill(x.eq(self.config.i_pad), 0)

        outputs = self.enc_emb(x) + self.pos_emb(positions)
        attn_mask = get_attn_pad_mask(x, x, self.config.i_pad)

        attn_probs = []
        for layer in self.layers:
            outputs, layer_prob = layer(outputs, attn_mask)
            attn_probs.append(layer_prob)
        return outputs, attn_probs

class EncoderLayer(nn.Module):
    """Self-attention and position-wise feed-forward, each with residual + LayerNorm."""

    def __init__(self, config):
        """Build the sub-layers; reads ``hidden_size`` and ``layer_norm_epsilon``."""
        super().__init__()

        # values
        self.config = config

        # layers
        eps = config.layer_norm_epsilon
        self.self_attn = MultiHeadAttention(config)
        self.layer_norm1 = nn.LayerNorm(config.hidden_size, eps=eps)
        self.pos_ffn = PoswiseFeedForwardNet(config)
        self.layer_norm2 = nn.LayerNorm(config.hidden_size, eps=eps)

    def forward(self, x, attn_mask):
        """Apply the layer to ``x``.

        Returns:
            tuple: ``(outputs, attn_prob)`` where ``attn_prob`` comes from the
            self-attention sub-layer.
        """
        attn_delta, attn_prob = self.self_attn(x, x, x, attn_mask)
        hidden = self.layer_norm1(x + attn_delta)   # residual around attention
        ffn_delta = self.pos_ffn(hidden)
        outputs = self.layer_norm2(ffn_delta + hidden)  # residual around feed-forward
        return outputs, attn_prob

class MultiHeadAttention(nn.Module):
    """Multi-head attention: project, split into heads, attend, merge, project."""

    def __init__(self, config):
        """Build the projections; reads ``hidden_size``, ``n_head``, ``d_head``, ``dropout``."""
        super().__init__()

        # values
        self.config = config

        # layers
        inner_dim = config.n_head * config.d_head
        self.W_Q = nn.Linear(config.hidden_size, inner_dim)
        self.W_K = nn.Linear(config.hidden_size, inner_dim)
        self.W_V = nn.Linear(config.hidden_size, inner_dim)
        self.scaled_dot_attn = ScaledDotProductAttention(config)
        self.linear = nn.Linear(inner_dim, config.hidden_size)
        self.dropout = nn.Dropout(config.dropout)

    def forward(self, Q, K, V, attn_mask):
        """Attend from ``Q`` over ``K``/``V``.

        Args:
            Q, K, V: Tensors whose first dimension is the batch and whose last
                dimension is ``hidden_size``.
            attn_mask: Mask without a head dimension; it is repeated ``n_head``
                times along a new dim 1 before use.

        Returns:
            tuple: ``(output, attn_prob)``.
        """
        batch_size = Q.size(0)
        n_head, d_head = self.config.n_head, self.config.d_head

        def split_heads(projected):
            # (batch, seq, n_head * d_head) -> (batch, n_head, seq, d_head)
            return projected.view(batch_size, -1, n_head, d_head).transpose(1, 2)

        q_s = split_heads(self.W_Q(Q))
        k_s = split_heads(self.W_K(K))
        v_s = split_heads(self.W_V(V))

        head_mask = attn_mask.unsqueeze(1).repeat(1, n_head, 1, 1)

        context, attn_prob = self.scaled_dot_attn(q_s, k_s, v_s, head_mask)
        # merge the heads back: (batch, n_head, seq, d_head) -> (batch, seq, n_head * d_head)
        merged = context.transpose(1, 2).contiguous().view(batch_size, -1, n_head * d_head)
        return self.dropout(self.linear(merged)), attn_prob

class PoswiseFeedForwardNet(nn.Module):
    """Two kernel-size-1 convolutions with GELU in between, followed by dropout."""

    def __init__(self, config):
        """Build the layers; reads ``hidden_size``, ``d_ff`` and ``dropout``."""
        super().__init__()

        # values
        self.config = config

        # layers
        self.conv1 = nn.Conv1d(in_channels=config.hidden_size, out_channels=config.d_ff, kernel_size=1)
        self.conv2 = nn.Conv1d(in_channels=config.d_ff, out_channels=config.hidden_size, kernel_size=1)
        self.active = F.gelu
        self.dropout = nn.Dropout(config.dropout)

    def forward(self, x):
        """Apply the feed-forward block; the output has the same layout as ``x``."""
        # Conv1d convolves over the last dim, so swap dims 1 and 2 going in ...
        channels_first = x.transpose(1, 2)
        expanded = self.active(self.conv1(channels_first))
        # ... and swap them back coming out
        projected = self.conv2(expanded).transpose(1, 2)
        return self.dropout(projected)

class ScaledDotProductAttention(nn.Module):
    """Softmax attention over ``Q·Kᵀ`` scaled by ``1 / sqrt(d_head)``."""

    def __init__(self, config):
        """Read ``d_head`` (for the scale) and ``dropout`` from ``config``."""
        super().__init__()

        # values
        self.config = config
        self.dropout = nn.Dropout(config.dropout)
        self.scale = 1 / (self.config.d_head ** 0.5)

    def forward(self, Q, K, V, attn_mask):
        """Compute attention.

        Positions where ``attn_mask`` is true are set to -1e9 before the
        softmax over the last dimension.

        Returns:
            tuple: ``(context, attn_prob)``; ``attn_prob`` is the probability
            tensor after dropout.
        """
        scores = torch.matmul(Q, K.transpose(-1, -2)) * self.scale
        scores = scores.masked_fill(attn_mask, -1e9)
        attn_prob = self.dropout(torch.softmax(scores, dim=-1))
        return torch.matmul(attn_prob, V), attn_prob

In [None]:
# Decoding
class Decoder(nn.Module):
    """Token embedding plus frozen sinusoidal positions, then a stack of decoder layers."""

    def __init__(self, config):
        """Build embeddings and ``config.n_layer`` decoder layers.

        Reads ``n_dec_vocab``, ``hidden_size``, ``n_dec_seq`` and ``n_layer``
        from ``config`` as attributes.
        """
        super().__init__()

        # values
        self.config = config

        # layers
        self.dec_emb = nn.Embedding(config.n_dec_vocab, config.hidden_size)
        # one extra row because real tokens are numbered from 1 (see forward)
        table = get_sinusoid_encoding_table(config.n_dec_seq + 1, config.hidden_size)
        self.pos_emb = nn.Embedding.from_pretrained(torch.FloatTensor(table), freeze=True)

        self.layers = nn.ModuleList([DecoderLayer(config) for _ in range(config.n_layer)])

    def forward(self, dec_inputs, enc_inputs, enc_outputs):
        """Decode ``dec_inputs`` while attending to the encoder result.

        Args:
            dec_inputs: 2-D tensor of decoder token ids.
            enc_inputs: 2-D tensor of encoder token ids; only used to build the
                padding mask for decoder-to-encoder attention.
            enc_outputs: Encoder output passed to every decoder layer.

        Returns:
            tuple: ``(dec_outputs, self_attn_probs, dec_enc_attn_probs)``; the
            last two are lists with one entry per layer.
        """
        i_pad = self.config.i_pad
        batch_size, seq_len = dec_inputs.size(0), dec_inputs.size(1)

        # positions run 1..seq_len; entries equal to i_pad get position 0
        positions = torch.arange(1, seq_len + 1, device=dec_inputs.device, dtype=dec_inputs.dtype)
        positions = positions.expand(batch_size, seq_len).masked_fill(dec_inputs.eq(i_pad), 0)

        dec_outputs = self.dec_emb(dec_inputs) + self.pos_emb(positions)

        # self-attention is blocked where either the padding mask or the
        # upper-triangular mask is set
        pad_mask = get_attn_pad_mask(dec_inputs, dec_inputs, i_pad)
        subsequent_mask = get_attn_decoder_mask(dec_inputs)
        dec_self_attn_mask = torch.gt((pad_mask + subsequent_mask), 0)
        dec_enc_attn_mask = get_attn_pad_mask(dec_inputs, enc_inputs, i_pad)

        self_attn_probs = []
        dec_enc_attn_probs = []
        for layer in self.layers:
            dec_outputs, self_prob, cross_prob = layer(dec_outputs, enc_outputs, dec_self_attn_mask, dec_enc_attn_mask)
            self_attn_probs.append(self_prob)
            dec_enc_attn_probs.append(cross_prob)
        return dec_outputs, self_attn_probs, dec_enc_attn_probs

class DecoderLayer(nn.Module):
    """Self-attention, decoder-to-encoder attention and feed-forward, each with residual + LayerNorm."""

    def __init__(self, config):
        """Build the sub-layers; reads ``hidden_size`` and ``layer_norm_epsilon``."""
        super().__init__()

        # values
        self.config = config

        # layers
        eps = config.layer_norm_epsilon
        self.self_attn = MultiHeadAttention(config)
        self.layer_norm1 = nn.LayerNorm(config.hidden_size, eps=eps)
        self.dec_enc_attn = MultiHeadAttention(config)
        self.layer_norm2 = nn.LayerNorm(config.hidden_size, eps=eps)
        self.pos_ffn = PoswiseFeedForwardNet(config)
        self.layer_norm3 = nn.LayerNorm(config.hidden_size, eps=eps)

    def forward(self, dec_inputs, enc_outputs, self_attn_mask, dec_enc_attn_mask):
        """Apply the layer.

        Returns:
            tuple: ``(outputs, self_attn_prob, dec_enc_attn_prob)``.
        """
        # 1) self-attention over the decoder stream
        self_delta, self_attn_prob = self.self_attn(dec_inputs, dec_inputs, dec_inputs, self_attn_mask)
        after_self = self.layer_norm1(dec_inputs + self_delta)

        # 2) attention from the decoder stream over the encoder outputs
        cross_delta, dec_enc_attn_prob = self.dec_enc_attn(after_self, enc_outputs, enc_outputs, dec_enc_attn_mask)
        after_cross = self.layer_norm2(after_self + cross_delta)

        # 3) position-wise feed-forward
        outputs = self.layer_norm3(after_cross + self.pos_ffn(after_cross))
        return outputs, self_attn_prob, dec_enc_attn_prob

In [None]:
# Encoding/Decoding util functions
def get_sinusoid_encoding_table(n_seq, hidden_size):
    """Build a sinusoidal position table as a NumPy array.

    Entry ``[pos, i]`` starts as ``pos / 10000 ** (2 * (i // 2) / hidden_size)``;
    even columns are then replaced by their sine and odd columns by their cosine.

    Args:
        n_seq: Number of positions (rows), starting at position 0.
        hidden_size: Number of columns.

    Returns:
        numpy.ndarray: table with ``n_seq`` rows and ``hidden_size`` columns.
    """
    # the divisor depends only on the column, so compute it once per column
    divisors = [np.power(10000, 2 * (column // 2) / hidden_size) for column in range(hidden_size)]

    table = np.array([[position / divisor for divisor in divisors] for position in range(n_seq)])
    table[:, 0::2] = np.sin(table[:, 0::2])
    table[:, 1::2] = np.cos(table[:, 1::2])
    return table

def get_attn_pad_mask(seq_q, seq_k, i_pad):
    """Return a boolean mask marking key positions that equal ``i_pad``.

    Args:
        seq_q: 2-D query id tensor; only its length (dim 1) is used.
        seq_k: 2-D key id tensor.
        i_pad: Id treated as padding.

    Returns:
        Tensor of shape ``(batch, len_q, len_k)``, true wherever the key id is
        ``i_pad``; the same key row is repeated for every query position.
    """
    _, len_q = seq_q.size()
    batch_size, len_k = seq_k.size()
    key_is_pad = seq_k.data.eq(i_pad)  # <pad>
    return key_is_pad.unsqueeze(1).expand(batch_size, len_q, len_k)

def get_attn_decoder_mask(seq):
    """Return a strictly-upper-triangular mask of ones for each sequence in ``seq``.

    Args:
        seq: 2-D tensor; only its sizes, dtype and device are used.

    Returns:
        Tensor of shape ``(seq.size(0), seq.size(1), seq.size(1))`` with the
        dtype of ``seq``: 1 above the main diagonal, 0 elsewhere.
    """
    batch_size, seq_len = seq.size(0), seq.size(1)
    all_ones = torch.ones_like(seq).unsqueeze(-1).expand(batch_size, seq_len, seq_len)
    # keep only the entries above the main diagonal
    return all_ones.triu(diagonal=1)

In [None]:
# Load the SentencePiece model from the mounted Drive; `vocab` is used below
# for the vocabulary sizes in `config` and for piece-to-id lookups in the datasets.
vocab = load_vocab("/content/drive/My Drive/data/kowiki.model")

In [None]:
# Hyper-parameters shared by the model classes and the training cells.
# NOTE(review): "num_sequence", "weight_decay", "max_norm", "warmup_steps" and
# "nb_epochs" are not read anywhere in the visible part of this notebook.
config = Config({
    # DataLoader batch size
    "batch_size": 128, 
    # both embedding tables are sized from the SentencePiece vocabulary
    "n_enc_vocab": len(vocab),
    "n_dec_vocab": len(vocab),
    "num_sequence": 64, 
    # the position tables are built with n_*_seq + 1 rows
    "n_enc_seq": 256,
    "n_dec_seq": 256,
    # number of encoder layers and of decoder layers
    "n_layer": 6,
    "hidden_size": 256,
    # token id treated as padding (the collate function also pads with 0)
    "i_pad": 0,
    # inner width of the position-wise feed-forward block
    "d_ff": 1024,
    # n_head * d_head = 256, the same as hidden_size
    "n_head": 4,
    "d_head": 64,
    "dropout": 0.1,
    "layer_norm_epsilon": 1e-12,
    # number of classes produced by the projection head
    "output_size": 2,
    "weight_decay": 0,
    "learning_rate": 5e-5,
    "max_norm": 5,
    "adam_epsilon": 1e-8,
    "warmup_steps": 0,
    "nb_epochs": 100
})

In [None]:
# Build the train/test datasets from the JSON-lines rating files and wrap them
# in DataLoaders. Only the training loader shuffles; both pad each batch
# through movie_collate_fn.
trn_dataset = CustomDataSet(vocab, "/content/drive/My Drive/data/ratings_train.json")
trn_dataloader = data.DataLoader(trn_dataset, batch_size=config['batch_size'], shuffle=True, collate_fn=movie_collate_fn)
test_dataset = CustomDataSet(vocab, "/content/drive/My Drive/data/ratings_test.json")
test_dataloader = data.DataLoader(test_dataset, batch_size=config['batch_size'], shuffle=False, collate_fn=movie_collate_fn)

Loading /content/drive/My Drive/data/ratings_train.json: 100%|██████████| 149995/149995 [00:05<00:00, 25582.77 lines/s]
Loading /content/drive/My Drive/data/ratings_test.json: 100%|██████████| 49997/49997 [00:02<00:00, 24983.40 lines/s]


In [None]:
# Print the tensor sizes of the first batches of each loader and keep one batch
# per loader (the last one printed) for the inspection cells below.
trn_label, trn_enc_inputs, trn_dec_inputs = checkdata(trn_dataloader)
test_label, test_enc_inputs, test_dec_inputs = checkdata(test_dataloader)

0 Batch
Input: torch.Size([128, 100])
Target: torch.Size([128, 1])
1 Batch
Input: torch.Size([128, 92])
Target: torch.Size([128, 1])
2 Batch
Input: torch.Size([128, 97])
Target: torch.Size([128, 1])
3 Batch
Input: torch.Size([128, 95])
Target: torch.Size([128, 1])
4 Batch
Input: torch.Size([128, 95])
Target: torch.Size([128, 1])
5 Batch
Input: torch.Size([128, 89])
Target: torch.Size([128, 1])
0 Batch
Input: torch.Size([128, 97])
Target: torch.Size([128, 1])
1 Batch
Input: torch.Size([128, 86])
Target: torch.Size([128, 1])
2 Batch
Input: torch.Size([128, 84])
Target: torch.Size([128, 1])
3 Batch
Input: torch.Size([128, 93])
Target: torch.Size([128, 1])
4 Batch
Input: torch.Size([128, 95])
Target: torch.Size([128, 1])
5 Batch
Input: torch.Size([128, 109])
Target: torch.Size([128, 1])


In [None]:
# Show the first sample of the kept train and test batches.
print("Train label:", trn_label[0])
print("Train encoder input:", trn_enc_inputs[0])
print("Train decoder input:", trn_dec_inputs[0])
print("Test label:", test_label[0])
print("Test encoder input:", test_enc_inputs[0])
print("Test decoder input:", test_dec_inputs[0])

Train label: tensor(1)
Train encoder input: tensor([ 886, 3616,   85, 3652,  723, 3620,  418,   26, 3668, 3613, 3857, 3664,
          10, 1102, 4071, 2723, 3590,  339, 3593, 1920,  139, 3699, 3783,  760,
        3817, 2723, 3590,  542,  125, 3062, 4097, 3692,  165, 3592, 2413,  444,
        3355, 1421, 3588, 3766,  331, 1453, 3037, 3603,  571, 3719,    9, 3595,
        3048, 3588,  247,   96, 3892, 3635, 3587, 4552, 4552, 2378,   26, 3963,
        3603, 1744, 3929, 3941, 3882, 3591,  639, 3621, 4163,  571, 3719, 3798,
        1234, 3590,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0])
Train decoder input: tensor([2])
Test label: tensor(0)
Test encoder input: tensor([3587, 7191, 3601, 1006, 3603, 3344,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0

In [None]:
# Standalone Encoder/Decoder instances for inspection only; they are separate
# objects from the ones created inside `Classification` further down and are
# not moved to `device`.
encoder = Encoder(config)
decoder = Decoder(config)

# One forward pass on the training batch kept by checkdata.
trn_enc_outputs, trn_enc_attn_probs = encoder(trn_enc_inputs)
trn_dec_outputs, trn_self_attn_probs, trn_dec_enc_attn_probs = decoder(trn_dec_inputs, trn_enc_inputs, trn_enc_outputs)

In [None]:
# Print the module trees of the standalone encoder and decoder.
print(encoder)
print(decoder)

Encoder(
  (enc_emb): Embedding(8007, 256)
  (pos_emb): Embedding(257, 256)
  (layers): ModuleList(
    (0): EncoderLayer(
      (self_attn): MultiHeadAttention(
        (W_Q): Linear(in_features=256, out_features=256, bias=True)
        (W_K): Linear(in_features=256, out_features=256, bias=True)
        (W_V): Linear(in_features=256, out_features=256, bias=True)
        (scaled_dot_attn): ScaledDotProductAttention(
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (linear): Linear(in_features=256, out_features=256, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (layer_norm1): LayerNorm((256,), eps=1e-12, elementwise_affine=True)
      (pos_ffn): PoswiseFeedForwardNet(
        (conv1): Conv1d(256, 1024, kernel_size=(1,), stride=(1,))
        (conv2): Conv1d(1024, 256, kernel_size=(1,), stride=(1,))
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (layer_norm2): LayerNorm((256,), eps=1e-12, elementwise_affine=True)
    )
    (1

In [None]:
# Print the first sample's outputs; for each attention list: first layer,
# first sample, first head.
print(trn_enc_outputs[0])
print(trn_enc_attn_probs[0][0][0])
print(trn_dec_outputs[0])
print(trn_self_attn_probs[0][0][0])
print(trn_dec_enc_attn_probs[0][0][0])

tensor([[-0.3533, -1.2610,  1.6061,  ...,  1.9826, -3.1601,  0.9536],
        [-1.6468, -0.0844,  0.2194,  ...,  0.0312,  0.3796,  1.6402],
        [-0.5145, -0.8300, -0.4607,  ..., -1.1830,  1.2419,  0.4088],
        ...,
        [-0.2472,  2.0135, -0.3530,  ...,  2.9525, -1.1697,  0.5054],
        [ 0.0717,  1.9396, -0.4761,  ...,  3.1744, -1.0924,  0.6159],
        [ 0.0821,  1.8500, -0.4951,  ...,  2.8323, -1.1104,  0.4999]],
       grad_fn=<SelectBackward>)
tensor([[0.0130, 0.0000, 0.0145,  ..., 0.0000, 0.0000, 0.0000],
        [0.0113, 0.0125, 0.0123,  ..., 0.0000, 0.0000, 0.0000],
        [0.0081, 0.0174, 0.0183,  ..., 0.0000, 0.0000, 0.0000],
        ...,
        [0.0038, 0.0061, 0.0124,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0061, 0.0124,  ..., 0.0000, 0.0000, 0.0000],
        [0.0038, 0.0061, 0.0124,  ..., 0.0000, 0.0000, 0.0000]],
       grad_fn=<SelectBackward>)
tensor([[ 1.7225, -1.7472, -0.2956,  1.2938, -0.1310,  0.3033,  1.2102,  0.6799,
          0.2558,  0.

In [None]:
# Outputs are tensors; the *_attn_probs values are Python lists with one tensor
# per layer. np.shape() on such a list forces NumPy to convert autograd tensors
# into an array, which is fragile, so report the tensor shape directly and the
# list length as a 1-tuple (the printed text keeps the same form).
print(trn_enc_outputs.shape)
print((len(trn_enc_attn_probs),))
print(trn_dec_outputs.shape)
print((len(trn_self_attn_probs),))
print((len(trn_dec_enc_attn_probs),))

torch.Size([128, 89, 256])
(6,)
torch.Size([128, 1, 256])
(6,)
(6,)


In [None]:
# Model, loss and optimizer used by the training cells below.
model = Classification(config).to(device)
criterion = nn.CrossEntropyLoss().to(device)
# Pass weight_decay from config as well: it was declared there but never handed
# to the optimizer, so AdamW silently used its own default instead.
optimizer = optim.AdamW(model.parameters(), lr = config['learning_rate'], eps=config['adam_epsilon'],
                        weight_decay=config['weight_decay'])

In [None]:
print("Model:", model)
print("Criterion:", criterion)
print("Optimizer:", optimizer)
# checkfunction prints its own diagnostics and returns None, so call it directly
# instead of wrapping it in print(), which only appended a stray "None".
checkfunction(criterion = criterion, 
              dataloader = trn_dataloader, 
              output_size = config['output_size'], epoch = 0)

Model: Classification(
  (transformer): Transformer(
    (encoder): Encoder(
      (enc_emb): Embedding(8007, 256)
      (pos_emb): Embedding(257, 256)
      (layers): ModuleList(
        (0): EncoderLayer(
          (self_attn): MultiHeadAttention(
            (W_Q): Linear(in_features=256, out_features=256, bias=True)
            (W_K): Linear(in_features=256, out_features=256, bias=True)
            (W_V): Linear(in_features=256, out_features=256, bias=True)
            (scaled_dot_attn): ScaledDotProductAttention(
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (linear): Linear(in_features=256, out_features=256, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (layer_norm1): LayerNorm((256,), eps=1e-12, elementwise_affine=True)
          (pos_ffn): PoswiseFeedForwardNet(
            (conv1): Conv1d(256, 1024, kernel_size=(1,), stride=(1,))
            (conv2): Conv1d(1024, 256, kernel_size=(1,), stride=(1,))
    

In [None]:
nb_epochs = 3

for epoch in range(nb_epochs):
    losses = []      # per-batch losses of the current epoch
    trn_loss = 0.0   # running sum of the batch losses of the current epoch
    model.train()
    for batch, values in enumerate(trn_dataloader):
        labels, enc_inputs, dec_inputs = values
        # The model was moved to `device`, so every batch has to follow it;
        # without this the loop fails as soon as a GPU is used.
        labels, enc_inputs, dec_inputs = labels.to(device), enc_inputs.to(device), dec_inputs.to(device)

        # clear stale gradients before the forward/backward pass
        optimizer.zero_grad()
        hypothesis = model(enc_inputs, dec_inputs)
        logits = hypothesis[0]  # first element of the model output holds the logits
        loss = criterion(logits, labels)
        trn_loss += loss.item()
        losses.append(loss.item())
        loss.backward()
        optimizer.step()
        print("{} / {}".format(batch+1, len(trn_dataloader)))
    # mean batch loss of the epoch
    print("loss", trn_loss / len(trn_dataloader))
1 / 1172
2 / 1172
3 / 1172
4 / 1172
5 / 1172
6 / 1172
7 / 1172
8 / 1172
9 / 1172
10 / 1172
11 / 1172
12 / 1172
13 / 1172
