## Mount google drive and change directory

In [1]:
from google.colab import drive
drive.mount('/content/drive', force_remount = False)

!git clone https://github.com/drumpt/NMT_practice.git/ /content/drive/My\ Drive/Colab\ Notebooks/CS495\ Individual\ Study/1회차
%cd /content/drive/My\ Drive/Colab\ Notebooks/CS495\ Individual\ Study/1회차/NMT_practice
!pip install -r requirements.txt

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
fatal: destination path '/content/drive/My Drive/Colab Notebooks/CS495 Individual Study/1회차' already exists and is not an empty directory.
/content/drive/My Drive/Colab Notebooks/CS495 Individual Study/1회차/NMT_practice


## Import libraries and choose device

In [2]:
import os
import math
# import argparse
import easydict
import gc

import torch
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt
from tqdm import tqdm

from dataset.dataloader import load_data, get_loader
from dataset.field import Vocab
from utils import seq2sen

if torch.cuda.is_available():
    torch.cuda.set_device(0)
    device = "cuda"
else:
    device = "cpu"
print(f"Use {device} for torch")

Use cuda for torch


## Define Encoder

In [3]:
class Encoder(nn.Module):
    def __init__(self, model_dim, h):
        super().__init__()
        self.model_dim = model_dim
        self.h = h

        # TODO(completed) : num_identical_layers를 이용할 수 있을까? clone을 이용하면 됨.
        self.identical_layer1 = EncoderIdenticalLayer(self.model_dim, self.h).to(device)
        self.identical_layer2 = EncoderIdenticalLayer(self.model_dim, self.h).to(device)
        self.identical_layer3 = EncoderIdenticalLayer(self.model_dim, self.h).to(device)
        self.identical_layer4 = EncoderIdenticalLayer(self.model_dim, self.h).to(device)
        self.identical_layer5 = EncoderIdenticalLayer(self.model_dim, self.h).to(device)
        self.identical_layer6 = EncoderIdenticalLayer(self.model_dim, self.h).to(device)

    def forward(self, x):
        x = self.identical_layer1(x)
        x = self.identical_layer2(x)
        x = self.identical_layer3(x)
        x = self.identical_layer4(x)
        x = self.identical_layer5(x)
        x = self.identical_layer6(x)
        return x

class EncoderIdenticalLayer(nn.Module):
    def __init__(self, model_dim, h):
        super().__init__()
        self.model_dim = model_dim
        self.h = h

        self.w_i_Q_list = [nn.Linear(in_features = self.model_dim, out_features = self.model_dim // self.h, bias = False).to(device) for _ in range(self.h)]
        self.w_i_K_list = [nn.Linear(in_features = self.model_dim, out_features = self.model_dim // self.h, bias = False).to(device) for _ in range(self.h)]
        self.w_i_V_list = [nn.Linear(in_features = self.model_dim, out_features = self.model_dim // self.h, bias = False).to(device) for _ in range(self.h)]
        self.w_O = nn.Linear(in_features = self.h * (self.model_dim // self.h), out_features = self.model_dim, bias = False).to(device)
        self.layer_normalization1 = nn.LayerNorm(self.model_dim).to(device) # exclude batch dimension

        self.feed_forward_network1 = nn.Linear(in_features = 512, out_features = 2048, bias = True).to(device)
        self.feed_forward_network2 = nn.Linear(in_features = 2048, out_features = 512, bias = True).to(device)
        self.layer_normalization2 = nn.LayerNorm(self.model_dim).to(device) # TODO(completed): 맞나?

        self.dropout = nn.Dropout(p = 0.1).to(device)
        self.softmax = nn.Softmax(dim = 2).to(device) # TODO(completed): 이게 맞나? 2인 것 같다.
        self.relu = nn.ReLU().to(device)

    def forward(self, x):
        ### Sublayer 1
        # Multi-Head Attention
        splitted_x_list = []
        for i in range(self.h):
            in_softmax = torch.matmul(
                self.w_i_Q_list[i](x),
                self.w_i_K_list[i](x).transpose(1, 2)
            ) / math.sqrt(self.model_dim // self.h)
            out_softmax = self.w_i_V_list[i](x)
            splitted_x = torch.matmul(self.softmax(in_softmax), out_softmax)
            splitted_x_list.append(splitted_x)
        concatenated_x = torch.cat(splitted_x_list, dim = 2) # TODO(completed): 맞나?
        multi_head_attention_output = self.dropout(self.w_O(concatenated_x))

        # Add & Layer Normalization
        multi_head_attention_output += x
        multi_head_attention_output = self.layer_normalization1(multi_head_attention_output)

        ### Sublayer 2
        # Feed-Forward Network
        positionwise_output_list = [] # (0, 2, 1)
        for position in range(multi_head_attention_output.shape[1]): # token length
            positionwise_input = multi_head_attention_output[:, position, :]
            positionwise_output = self.relu(self.feed_forward_network1(positionwise_input))
            positionwise_output = self.feed_forward_network2(positionwise_output)
            positionwise_output_list.append(positionwise_output)
        ffn_output = self.dropout(torch.stack(positionwise_output_list, dim = 1))

        # Add & Layer Normalization
        ffn_output += multi_head_attention_output
        ffn_output = self.layer_normalization2(ffn_output)
        return ffn_output

## Define Decoder

In [4]:
class Decoder(nn.Module):
    def __init__(self, model_dim, h):
        super().__init__()
        self.model_dim = model_dim
        self.h = h

        self.identical_layer1 = DecoderIdenticalLayer(self.model_dim, self.h).to(device)
        self.identical_layer2 = DecoderIdenticalLayer(self.model_dim, self.h).to(device)
        self.identical_layer3 = DecoderIdenticalLayer(self.model_dim, self.h).to(device)
        self.identical_layer4 = DecoderIdenticalLayer(self.model_dim, self.h).to(device)
        self.identical_layer5 = DecoderIdenticalLayer(self.model_dim, self.h).to(device)
        self.identical_layer6 = DecoderIdenticalLayer(self.model_dim, self.h).to(device)

    def forward(self, x, y):
        x = self.identical_layer1(x, y)
        x = self.identical_layer2(x, y)
        x = self.identical_layer3(x, y)
        x = self.identical_layer4(x, y)
        x = self.identical_layer5(x, y)
        x = self.identical_layer6(x, y)
        return x

class DecoderIdenticalLayer(nn.Module):
    def __init__(self, model_dim, h):
        super().__init__()
        self.model_dim = model_dim
        self.h = h

        self.w_i_Q_list_first = [nn.Linear(in_features = self.model_dim, out_features = self.model_dim // self.h, bias = False).to(device) for _ in range(self.h)]
        self.w_i_K_list_first = [nn.Linear(in_features = self.model_dim, out_features = self.model_dim // self.h, bias = False).to(device) for _ in range(self.h)]
        self.w_i_V_list_first = [nn.Linear(in_features = self.model_dim, out_features = self.model_dim // self.h, bias = False).to(device) for _ in range(self.h)]
        self.w_O_first = nn.Linear(in_features = self.h * (self.model_dim // self.h), out_features = self.model_dim, bias = False).to(device)
        self.layer_normalization1 = nn.LayerNorm(self.model_dim).to(device) # exclude batch dimension

        self.w_i_Q_list_second = [nn.Linear(in_features = self.model_dim, out_features = self.model_dim // self.h, bias = False).to(device) for _ in range(self.h)]
        self.w_i_K_list_second = [nn.Linear(in_features = self.model_dim, out_features = self.model_dim // self.h, bias = False).to(device) for _ in range(self.h)]
        self.w_i_V_list_second = [nn.Linear(in_features = self.model_dim, out_features = self.model_dim // self.h, bias = False).to(device) for _ in range(self.h)]
        self.w_O_second = nn.Linear(in_features = self.h * (self.model_dim // self.h), out_features = self.model_dim, bias = False).to(device)
        self.layer_normalization2 = nn.LayerNorm(self.model_dim).to(device) # exclude batch dimension
        
        self.feed_forward_network1 = nn.Linear(in_features = 512, out_features = 2048, bias = True).to(device)
        self.feed_forward_network2 = nn.Linear(in_features = 2048, out_features = 512, bias = True).to(device)
        self.layer_normalization3 = nn.LayerNorm(self.model_dim).to(device)

        self.dropout = nn.Dropout(p = 0.1).to(device)
        self.softmax = nn.Softmax(dim = 2).to(device)
        self.relu = nn.ReLU().to(device)

    def forward(self, x, y): # TODO(completed): masking 구현.
        ### Sublayer 1
        # Masked Multi-Head Attention
        splitted_x_list = []
        for i in range(self.h):
            in_softmax = torch.matmul(
                self.w_i_Q_list_first[i](x),
                self.w_i_K_list_first[i](x).transpose(1, 2)
            ) / math.sqrt(self.model_dim // self.h)
            in_softmax = self.masking(in_softmax)
            out_softmax = self.w_i_V_list_first[i](x)
            splitted_x = torch.matmul(self.softmax(in_softmax), out_softmax)
            splitted_x_list.append(splitted_x)
        concatenated_x = torch.cat(splitted_x_list, dim = 2)
        multi_head_attention_output_first = self.dropout(self.w_O_first(concatenated_x))

        # Add & Layer Normalization
        multi_head_attention_output_first += x
        multi_head_attention_output_first = self.layer_normalization1(multi_head_attention_output_first)

        ## Sublayer 2
        # Multi-Head Attention
        """
        queries : come from previous decoder layer
        keys, values : come from the output of the encoder
        """
        splitted_x_list = []
        for i in range(self.h):
            in_softmax = torch.matmul(
                self.w_i_Q_list_second[i](multi_head_attention_output_first),
                self.w_i_K_list_second[i](y).transpose(1, 2)
            ) / math.sqrt(self.model_dim // self.h)
            out_softmax = self.w_i_V_list_second[i](y)
            splitted_x = torch.matmul(self.softmax(in_softmax), out_softmax)
            splitted_x_list.append(splitted_x)
        concatenated_x = torch.cat(splitted_x_list, dim = 2)
        multi_head_attention_output_second = self.dropout(self.w_O_second(concatenated_x))

        # Masked Multi-Head Attention
        multi_head_attention_output_second += multi_head_attention_output_first
        multi_head_attention_output_second = self.layer_normalization2(multi_head_attention_output_second)

        ### Sublayer 3
        # Feed-Forward Network
        positionwise_output_list = [] # (0, 2, 1)
        for position in range(multi_head_attention_output_second.shape[1]): # token length
            positionwise_input = multi_head_attention_output_second[:, position, :]
            positionwise_output = self.relu(self.feed_forward_network1(positionwise_input))
            positionwise_output = self.feed_forward_network2(positionwise_output)
            positionwise_output_list.append(positionwise_output)
        ffn_output = self.dropout(torch.stack(positionwise_output_list, dim = 1))

        # Add & Layer Normalization
        ffn_output += multi_head_attention_output_second
        ffn_output = self.layer_normalization3(ffn_output)
        return ffn_output

    def masking(self, x):
        masking_tensor = torch.triu(torch.empty(x.shape[1], x.shape[2]).fill_(float("-inf")), diagonal = 1).to(device)
        for idx in range(x.shape[0]):
            x[idx] += masking_tensor
        return x

## Define Transformer

In [5]:
class Transformer(nn.Module):
    def __init__(self, model_dim, src_vocab_size, tgt_vocab_size, max_length):
        super().__init__()
        self.src_vocab_size = src_vocab_size
        self.tgt_vocab_size = tgt_vocab_size
        self.max_length = max_length
        self.model_dim = model_dim
        self.h = 8
        self.positional_encoding_constants = self.get_positional_encoding_constants()

        self.input_embedding = nn.Embedding(self.src_vocab_size, self.model_dim).to(device)
        self.output_embedding = nn.Embedding(self.tgt_vocab_size, self.model_dim).to(device)
        self.embedding_dropout = nn.Dropout(p = 0.1).to(device)
        self.encoder = Encoder(self.model_dim, self.h).to(device)
        self.decoder = Decoder(self.model_dim, self.h).to(device)
        self.final_linear = nn.Linear(self.model_dim, self.tgt_vocab_size).to(device)
        self.softmax = nn.Softmax(dim = 2).to(device)

    def forward(self, x, y): # x : encoder_input, y : decoder_input
        encoder_input = self.embedding_dropout(self.positional_encoding(self.input_embedding(x) * math.sqrt(self.model_dim)))
        encoder_output = self.encoder(encoder_input)
        decoder_output = self.decoder(self.embedding_dropout(self.positional_encoding(self.output_embedding(y) * math.sqrt(self.model_dim))), encoder_output)
        final_output = self.softmax(self.final_linear(decoder_output))
        final_output = self.final_linear(decoder_output)
        return final_output

    def positional_encoding(self, embedded_sentence): # TODO(completed): batch 단위가 아니라 문장 단위로 전달되는지 확인. 안 됨.
        for batch_idx in range(embedded_sentence.shape[0]):
            embedded_sentence[batch_idx] += self.positional_encoding_constants[:embedded_sentence.shape[1], :]
        return embedded_sentence

    def get_positional_encoding_constants(self): # avoid redundant calculations
        positional_encoding_constants = []
        for pos in range(self.max_length):
            pos_constants = []
            for idx in range(self.model_dim):
                if idx % 2 == 0: # idx = 2 * i -> 2 * i = idx
                    pos_constants.append(math.sin(pos / 10000 ** (idx / self.model_dim)))
                else: # idx = 2 * i + 1 -> 2 * i = idx - 1
                    pos_constants.append(math.cos(pos / 10000 ** ((idx  - 1) / self.model_dim)))
            positional_encoding_constants.append(pos_constants)
        return torch.tensor(positional_encoding_constants).to(device)
    
    def save_model(self, output_path, epoch, loss, val_loss):
        if not os.path.exists(output_path):
            os.makedirs(output_path)

        output_filename = os.path.join(output_path, f"weights_{epoch:03d}_{loss:.4f}_{val_loss:.4f}.pt")
        torch.save(self.state_dict(), output_filename)
        return output_filename

    def plot(self, output_path, history):
        plt.subplot(2, 1, 1)
        plt.title('Accuracy versus Epoch')
        plt.plot(history['accuracy'])
        plt.plot(history['val_accuracy'])
        plt.legend(['accuracy', 'val_accuracy'], loc = 'upper right')
        plt.xlabel('Epoch')
        plt.ylabel('Accuracy')

        plt.subplot(2, 1, 2)
        plt.title('Loss versus Epoch')
        plt.plot(history['loss'])
        plt.plot(history['val_loss'])
        plt.legend(['loss', 'val_loss'], loc = 'upper right')
        plt.xlabel('Epoch')
        plt.ylabel('Loss')

        plt.tight_layout()
        plt.savefig(os.path.join(output_path, "training_result.png"))

## Define loss function and some functions

In [6]:
class LabelSmoothingsLoss(nn.Module):
    def __init__(self, num_classes, smoothing, dim, is_train):
        super().__init__()
        self.confidence = 1 - smoothing
        self.smoothing = smoothing
        self.num_classes = num_classes
        self.dim = dim
        self.is_train = is_train

    def forward(self, pred, target, pad_start_idx_list = None):
        pred = torch.log(pred.clone().detach().requires_grad_(True))
        true_dist = torch.zeros_like(pred)
        true_dist.fill_(self.smoothing / (self.num_classes - 1))
        true_dist.scatter_(self.dim, target.data.unsqueeze(self.dim), self.confidence)
        return torch.mean(torch.sum(-true_dist * pred, dim = self.dim))

def get_pad_start_idx_list(tgt_batch):
    pad_start_idx_list = []
    for batch_idx in range(tgt_batch.shape[0]):
        try:
            pad_start_idx = list(tgt_batch[batch_idx]).index(2) # <pad>
        except ValueError:
            pad_start_idx = tgt_batch[batch_idx].shape[0]
        pad_start_idx_list.append(pad_start_idx)
    return pad_start_idx_list

def get_learning_rate(model_dim, step_num, warmup_steps):
    return (model_dim ** (-0.5)) * min(step_num ** (-0.5), step_num * (warmup_steps ** (-1.5)))

## Define main function



In [24]:
def main(args):
    src, tgt = load_data(args.path)

    src_vocab = Vocab(init_token='<sos>', eos_token='<eos>', pad_token='<pad>', unk_token='<unk>')
    src_vocab.load(os.path.join(args.path, 'vocab.en'))
    tgt_vocab = Vocab(init_token='<sos>', eos_token='<eos>', pad_token='<pad>', unk_token='<unk>')
    tgt_vocab.load(os.path.join(args.path, 'vocab.de'))

    # TODO(completed): use these information.
    sos_idx = 0
    eos_idx = 1
    pad_idx = 2
    max_length = 50

    # TODO: use these values to construct embedding layers
    src_vocab_size = len(src_vocab)
    tgt_vocab_size = len(tgt_vocab)

    # Define training parameters
    model_dim = 512
    warmup_steps = 4000

    # Define model
    transformer = Transformer(model_dim, src_vocab_size, tgt_vocab_size, max_length)
    if args.test and args.model_path:
        transformer.load_state_dict(torch.load(args.model_path))
        transformer.eval()

    # Define optimizer
    step_num = 1
    learning_rate = get_learning_rate(model_dim, step_num, warmup_steps)
    optimizer = optim.Adam(transformer.parameters(), lr = learning_rate, betas = (0.9, 0.98), eps = 1e-8)

    # Define loss function
    ## smoothing = 0.1
    ## train_loss_function = LabelSmoothingsLoss(tgt_vocab_size, smoothing, dim = -1, is_train = True)
    ## validation_loss_function = LabelSmoothingsLoss(tgt_vocab_size, smoothing, dim = -1, is_train = False)
    train_loss_function = nn.CrossEntropyLoss(ignore_index = pad_idx).to(device)
    # train_loss_function.requires_grad = True
    validation_loss_function = nn.CrossEntropyLoss().to(device)

    if not args.test:
        train_loader = get_loader(src['train'], tgt['train'], src_vocab, tgt_vocab, batch_size=args.batch_size, shuffle=True)
        valid_loader = get_loader(src['valid'], tgt['valid'], src_vocab, tgt_vocab, batch_size=args.batch_size)

        history = {
            "loss" : [],
            "val_loss" : [],
            "accuracy" : [],
            "val_accuracy" : []
        }

        for epoch in range(args.epochs):
            print(f"Epoch {epoch + 1}/{args.epochs}")
            total_train_size, total_validation_size = 0, 0
            epoch_train_loss, epoch_validation_loss = 0, 0
            epoch_train_correct, epoch_validation_correct = 0, 0

            # TODO(completed): train
            for src_batch, tgt_batch in tqdm(train_loader):
                for g in optimizer.param_groups: # update learning rate first
                    g['lr'] = get_learning_rate(model_dim, step_num, warmup_steps)
                optimizer.zero_grad()

                prd_batch = transformer(torch.tensor(src_batch).to(device), torch.tensor(tgt_batch).to(device))

                loss = train_loss_function( \
                    torch.tensor(prd_batch.transpose(-1, -2), dtype = torch.float, requires_grad = True).clone().to(device), \
                    torch.tensor(tgt_batch).clone().to(device) \
                )
                loss.backward()
                optimizer.step()
                step_num += 1

                total_train_size += len(src_batch)
                epoch_train_loss += loss
                epoch_train_correct += int(torch.sum(torch.argmax(torch.tensor(prd_batch).data, -1) == torch.tensor(tgt_batch).to(device)))

                gc.collect()
                torch.cuda.empty_cache()

            epoch_train_loss /= total_train_size
            epoch_train_accuracy = epoch_train_correct / total_train_size

            history["loss"].append(epoch_train_loss)    
            history["accuracy"].append(epoch_train_accuracy)

            # TODO: validation
            for src_batch, tgt_batch in tqdm(valid_loader):
                prd_batch = transformer(torch.tensor(src_batch).to(device), torch.tensor(tgt_batch).to(device))
                loss = validation_loss_function( \
                    torch.tensor(prd_batch.transpose(-1, -2), dtype = torch.float, requires_grad = True).clone().to(device), \
                    torch.tensor(tgt_batch).clone().to(device), \
                )

                total_validation_size += len(src_batch)
                epoch_validation_loss += loss
                epoch_validation_correct += int(torch.sum(torch.argmax(torch.tensor(prd_batch).data, -1) == torch.tensor(tgt_batch).to(device)))

                gc.collect()
                torch.cuda.empty_cache()

            epoch_validation_loss /= total_validation_size
            epoch_validation_accuracy = epoch_validation_correct / total_validation_size

            history["val_loss"].append(epoch_validation_loss)
            history["val_accuracy"].append(epoch_validation_accuracy)

            print(f"loss : {epoch_train_loss:.6f}, val_loss : {epoch_validation_loss:.6f}") 
            print(f"accuracy : {epoch_train_accuracy:.6f}, val_accuracy : {epoch_validation_accuracy:.6f}")

            transformer.save_model(args.output_path, epoch + 1, epoch_train_loss, epoch_validation_loss)
        transformer.plot(args.output_path, history)

    else: # test
        test_loader = get_loader(src['test'], tgt['test'], src_vocab, tgt_vocab, batch_size=args.batch_size)

        pred = []
        for src_batch, tgt_batch in tqdm(test_loader):
            # TODO: predict pred_batch from src_batch with your model.
            result_batch = torch.tensor([sos_index for _ in range(len(src_batch))]).unsqueeze(-1)
            for idx in range(max_length):
                if idx == max_length - 1:
                    pred_batch = torch.full((len(src_batch), 1), eos_idx)
                else:
                    pred_batch = transformer(torch.tensor(src_batch).to(device), result_batch.to(device))[:, -1, :] # consider only last result
                    pred_batch[:, sos_index] += float("-inf") # exclude <sos>
                    pred_batch[:, pad_index] += float("-inf") # exclude <pad>
                    pred_batch = torch.argmax(pred_batch, dim = -1).unsqueeze(-1)
                result_batch = torch.cat((result_batch.to(device), pred_batch.to(device)), dim = -1)
            result_batch = result_batch[1:, :] # remove <sos> from the result

            max_length = 0
            for batch_idx in range(result_batch.shape[0]):
                max_length = max(max_length, list(result_batch[batch_idx]).find(eos_index)) # find <eos>
            result_batch = result_batch[:max_length, :].tolist()
            pred += seq2sen(result_batch, tgt_vocab)

            gc.collect()
            torch.cuda.empty_cache()

        with open('results/pred.txt', 'w') as f:
            for line in pred:
                f.write('{}\n'.format(line))
            f.close()

        os.system('bash scripts/bleu.sh results/pred.txt multi30k/test.de.atok')

## Call main function

In [28]:
args = easydict.EasyDict({
    "path": "multi30k",
    "epochs": 50,
    "batch_size": 1,
    "test": False,
    "output_path": "resources",
    "model_path": ""
})

main(args)

1it [00:00,  9.33it/s]

Epoch 1/50


24219it [29:37, 14.28it/s]

RuntimeError: ignored

## Use console

In [37]:
from kora import console
console.start()

Console URL: https://teleconsole.com/s/fb5f10c3f764a6f4980f7e501189c471d0175932
