# AI504 Project 2

## To-Do : Find better hyperparameters
The goal of this project is to improve the performance of a Neural Machine Translation (NMT) system. In this project, you will tune the hyperparameters to achieve a higher BLEU score without changing the architecture or the dataset.

In [1]:
from easydict import EasyDict
import os 
# import time

# Restrict this process to the CUDA device with index 1. This is set here, before
# `torch` is imported in a later cell.
# NOTE(review): on a machine that has a single GPU (index 0) this presumably hides it,
# in which case the later `torch.cuda.is_available()` check falls back to CPU - confirm.
os.environ.update(CUDA_VISIBLE_DEVICES="1")

# Hyperparameters to tune for the project. Each key is read further down in the notebook.
config = EasyDict(
    emb_dim=256,          # embedding size, also used as the Transformer's d_model
    ffn_dim=512,          # dim_feedforward of every encoder/decoder layer
    attention_heads=8,    # nhead of the Transformer
    dropout=0.25,         # dropout passed to nn.Transformer
    encoder_layers=4,     # num_encoder_layers
    decoder_layers=4,     # num_decoder_layers
    lr=0.001,             # learning rate handed to Adam
    batch_size=1000,      # batch_size handed to BucketIterator.splits
    nepochs=100,          # upper bound on the number of training epochs
    patience=10,          # early-stopping tolerance used by the training loop
)


## Template codes
This code is based on the code from [Week 10](https://classum.com/main/course/16076/54). Please refer to the code and descriptions at that link for details.

### Prelims

In [11]:
# !pip install --upgrade torchtext
# !python -m spacy download de
# !python -m spacy download en
# !pip install -Iv --upgrade nltk==3.5
# !pip install tensorflow --use-feature=2020-resolver


# start_time = time.time()

### Data loader

In [12]:
# !pip install msgpack==0.5.6
# !pip install tensorflow --use-feature=2020-resolver

# !python -m spacy download de


In [2]:
import torch
from torchtext.legacy.datasets import Multi30k
from torchtext.legacy.data import Field, BucketIterator

# Fix the random seeds of the CPU generator and of every CUDA generator.
torch.manual_seed(1234)
torch.cuda.manual_seed_all(1234)

# Options shared by the German (source) and English (target) fields.
_shared_field_options = dict(tokenize="spacy", eos_token='<eos>', lower=True)

# Source sentences only get an end-of-sentence token ...
SRC = Field(tokenizer_language="de", **_shared_field_options)
# ... while target sentences are additionally prefixed with a start-of-sentence token.
TRG = Field(tokenizer_language="en", init_token='<sos>', **_shared_field_options)

# German -> English sentence pairs, split into train / validation / test.
train_data, valid_data, test_data = Multi30k.splits(
    exts=('.de', '.en'),
    fields=(SRC, TRG),
)

# Both vocabularies are built from the training split only; min_freq = 3 is the
# frequency threshold passed to build_vocab.
for vocab_field in (SRC, TRG):
    vocab_field.build_vocab(train_data, min_freq=3)

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# NOTE(review): shuffle=False is passed to all three iterators, including the training
# one - confirm that an unshuffled training order is intended.
train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train_data, valid_data, test_data),
    batch_size=config.batch_size,
    device=device,
    shuffle=False,
)

# Index of the padding token in the target vocabulary; the model also uses it to
# build the source padding mask.
PAD_IDX = TRG.vocab.stoi['<pad>']



### Load model & optimizer

In [3]:
import torch.nn as nn
import torch.optim as optim


class Transformer(nn.Module):
    """Encoder-decoder translation model: embeddings -> ``nn.Transformer`` -> vocabulary logits.

    The vocabulary sizes are read from the module-level ``SRC`` and ``TRG`` fields and
    padding positions are detected with the module-level ``PAD_IDX``.  The embeddings
    are fed to ``nn.Transformer`` directly; no positional encoding is added here.

    Args:
        config: object exposing ``emb_dim``, ``attention_heads``, ``encoder_layers``,
            ``decoder_layers``, ``ffn_dim`` and ``dropout``.
    """

    def __init__(self, config):
        super().__init__()
        self.encoder_embedding = nn.Embedding(len(SRC.vocab), config.emb_dim)
        self.decoder_embedding = nn.Embedding(len(TRG.vocab), config.emb_dim)
        self.transformer = nn.Transformer(
            d_model=config.emb_dim,
            nhead=config.attention_heads,
            num_encoder_layers=config.encoder_layers,
            num_decoder_layers=config.decoder_layers,
            dim_feedforward=config.ffn_dim,
            dropout=config.dropout,
            activation='gelu',
        )
        # Projects every decoder output vector onto the target vocabulary.
        self.prediction_head = nn.Linear(config.emb_dim, len(TRG.vocab))

    def forward(self, src, trg):
        """Return target-vocabulary logits for every target position.

        Args:
            src: source token ids.  Assumed to be laid out as (src_len, batch), since
                the padding masks below are transposed with ``permute(1, 0)`` before
                being used as key-padding masks - TODO confirm against the iterator.
            trg: target token ids, same layout assumption as ``src``.

        Returns:
            The output of ``prediction_head`` applied to the Transformer output; its
            last dimension has size ``len(TRG.vocab)``.
        """
        src_emb = self.encoder_embedding(src)
        trg_emb = self.decoder_embedding(trg)

        # The comparisons already live on the same device as their inputs, so the
        # masks follow the data instead of a module-level `device` global.
        # PAD_IDX comes from the target vocabulary and is applied to the source too.
        src_padding_mask = src.eq(PAD_IDX).permute(1, 0)
        trg_padding_mask = trg.eq(PAD_IDX).permute(1, 0)

        # Causal mask over the target sequence, moved to wherever `trg` lives.
        causal_mask = self.transformer.generate_square_subsequent_mask(trg.size(0)).to(trg.device)

        output = self.transformer(
            src_emb,
            trg_emb,
            tgt_mask=causal_mask,
            src_key_padding_mask=src_padding_mask,
            # The encoder memory has the same padding layout as the source.
            memory_key_padding_mask=src_padding_mask,
            tgt_key_padding_mask=trg_padding_mask,
        )
        prediction = self.prediction_head(output)
        return prediction

# Maximum gradient norm; forwarded to train(), which passes it to clip_grad_norm_.
CLIP = 1

# Loss over target tokens; positions whose label is the padding index are ignored.
criterion = nn.CrossEntropyLoss(ignore_index=PAD_IDX)

# Build the network and place it on the selected device (Module.to returns the module).
model = Transformer(config).to(device)

optimizer = optim.Adam(model.parameters(), lr=config.lr)

# Multiplies the learning rate by 0.5 at milestone 60 whenever it is stepped.
# NOTE: the only `scheduler.step()` call in the training loop is commented out, so as
# the notebook stands this schedule never takes effect.
scheduler = optim.lr_scheduler.MultiStepLR(optimizer, gamma=0.5, milestones=[60])

### Train & Evaluation

In [4]:
# !pip uninstall pytorch
# # !pip install torch==1.8.0+cu111 torchvision==0.9.0+cu111 torchaudio==0.8.0 -f https://download.pytorch.org/whl/torch_stable.html
# !pip install torch==1.10.0+cu113 torchvision==0.11.1+cu113 torchaudio==0.10.0+cu113 -f https://download.pytorch.org/whl/cu113/torch_stable.html

In [4]:
from nltk.translate.bleu_score import sentence_bleu
from tqdm import tqdm
from copy import deepcopy
import json

# Early-stopping bookkeeping: the best BLEU seen so far (starting below any score the
# training loop compares it with) and the model copy that achieved it.
prev_bleu, best_model = -1, None

def train(model: nn.Module,
          iterator: BucketIterator,
          optimizer: optim.Optimizer,
          criterion: nn.Module,
          clip: float):
    """Run one optimisation pass over ``iterator`` and return the mean batch loss.

    Args:
        model: network called as ``model(batch.src, batch.trg)``.
        iterator: sized iterable of batches exposing ``.src`` and ``.trg``.
        optimizer: optimizer stepped once per batch.
        criterion: loss applied to the flattened logits and flattened targets.
        clip: maximum gradient norm passed to ``clip_grad_norm_``.

    Returns:
        Sum of the per-batch losses divided by ``len(iterator)``.
    """
    model.train()

    batch_losses = []
    for batch in iterator:
        source, target = batch.src, batch.trg

        optimizer.zero_grad()
        logits = model(source, target)

        # The output at position t is scored against the target token at t + 1, so the
        # last output step and the first target token are dropped before flattening.
        vocab_size = logits.shape[-1]
        flat_logits = logits[:-1].reshape(-1, vocab_size)
        flat_targets = target[1:].reshape(-1)

        loss = criterion(flat_logits, flat_targets)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()

        batch_losses.append(loss.item())

    return sum(batch_losses) / len(iterator)


def evaluate(model: nn.Module,
             iterator: BucketIterator,
             criterion: nn.Module):
    """Compute the mean batch loss over ``iterator`` without updating the model.

    Args:
        model: network called as ``model(batch.src, batch.trg)``; switched to eval mode.
        iterator: sized iterable of batches exposing ``.src`` and ``.trg``.
        criterion: loss applied to the flattened logits and flattened targets.

    Returns:
        Sum of the per-batch losses divided by ``len(iterator)``.
    """
    model.eval()

    running_loss = 0
    with torch.no_grad():
        for batch in iterator:
            logits = model(batch.src, batch.trg)

            # Same alignment as in training: drop the last output step and the first
            # target token, then flatten both for the criterion.
            shifted_logits = logits[:-1]
            shifted_targets = batch.trg[1:]
            batch_loss = criterion(
                shifted_logits.reshape(-1, logits.shape[-1]),
                shifted_targets.reshape(-1),
            )

            running_loss += batch_loss.item()

    return running_loss / len(iterator)

def measure_BLEU(model: nn.Module,
             iterator: BucketIterator
                ):
    """Return the mean sentence-level BLEU of ``model`` over ``iterator``.

    Sentences are scored one at a time: the iterator's ``batch_size`` is set to 1 for
    the duration of the call and restored afterwards, so the caller's iterator is not
    left permanently modified.

    The model is called as ``model(src, trg)``, i.e. it receives the reference target
    as decoder input, and the prediction is the per-position argmax of its output.
    Token ids are mapped to strings with the module-level ``TRG`` vocabulary and
    padding ids (``PAD_IDX``) are dropped from both sequences.

    Args:
        model: network to score; switched to eval mode.
        iterator: iterator with a writable ``batch_size`` attribute whose batches
            expose ``.src`` and ``.trg``.

    Returns:
        Average of ``sentence_bleu`` over all sentences, or ``0.0`` when the iterator
        yields no batches.
    """
    model.eval()
    original_batch_size = iterator.batch_size
    iterator.batch_size = 1
    BLEU_scores = list()

    try:
        with torch.no_grad():
            for batch in iterator:
                src = batch.src
                trg = batch.trg
                output = model(src, trg)

                # Remove only the batch axis (dim 1). An argument-less squeeze() would
                # also collapse a one-token sequence to a 0-d tensor, whose tolist()
                # is a bare int and cannot be iterated.
                predicted_ids = output[:-1].argmax(dim=2).squeeze(1).tolist()
                reference_ids = trg[1:].squeeze(1).tolist()

                predicted = [TRG.vocab.itos[token] for token in predicted_ids if token != PAD_IDX]
                GT = [TRG.vocab.itos[token] for token in reference_ids if token != PAD_IDX]
                BLEU_scores.append(sentence_bleu([GT], predicted))
    finally:
        # Hand the iterator back with the batch size the caller configured.
        iterator.batch_size = original_batch_size

    if not BLEU_scores:
        # Nothing was scored; avoid dividing by zero.
        return 0.0
    return sum(BLEU_scores) / len(BLEU_scores)

# Number of consecutive epochs whose test BLEU was strictly below the best one so far.
patience = 0

epoch_progress = tqdm(range(config.nepochs), total=config.nepochs)
for epoch in epoch_progress:
    train_loss = train(model=model,
                       iterator=train_iterator,
                       optimizer=optimizer,
                       criterion=criterion,
                       clip=CLIP)
    # NOTE: the scheduler is never stepped, so the learning rate stays at config.lr.
    # scheduler.step()
    valid_loss = evaluate(model=model, iterator=valid_iterator, criterion=criterion)
    test_bleu = measure_BLEU(model=model, iterator=test_iterator)
    print("Test BLEU score : {}".format(test_bleu * 100))
    print("Epoch : {} / Training loss : {} / Validation loss : {}".format(epoch+1, train_loss, valid_loss))

    # Early stopping
    # You can change early stop criterion
    # NOTE(review): both the stopping decision and the choice of `best_model` are
    # driven by the BLEU measured on the test iterator.
    if not prev_bleu > test_bleu:
        # Not worse than the best so far: record it, reset the counter, keep a copy.
        prev_bleu = test_bleu
        patience = 0
        best_model = deepcopy(model)
        continue

    # Worse than the best so far: stop once the counter exceeds config.patience.
    patience += 1
    if patience > config.patience:
        break

  0%|          | 0/100 [00:00<?, ?it/s]

## Test your model

In [5]:
# Total_time = time.time() - start_time
# print(Total_time)

# Score the model copy kept by the early-stopping loop on the test iterator.
# `best_model` is still None if that loop never completed an epoch.
test_bleu = measure_BLEU(model=best_model, iterator=test_iterator)
print("Test BLEU score : {}".format(100 * test_bleu))

Test BLEU score : 29.65953206705337


## Save the result

In [6]:
# Persist the hyperparameters as JSON next to the notebook ...
with open('config.json', 'w') as config_file:
    config_file.write(json.dumps(vars(config)))

# ... followed by the weights of the best model kept by the training loop.
torch.save(best_model.state_dict(), 'model.pt')

## Download files
Before executing this code, you should run the template code first. This code will automatically download the state_dict of your model and the configuration file that you used for training and evaluation.

Please change the student ID before you run this.

__CAUTION__ : Please run this code with *Google Chrome* browser. 

In [None]:
# from google.colab import files
# import os

# os.environ['STUDENT_ID']="20201234"

# if os.path.isdir('result'):
#   !rm -rf result

# %mkdir result
# %mv config.json model.pt result

# !zip $STUDENT_ID.zip result/*
# files.download('{}.zip'.format(os.environ['STUDENT_ID']))

In [7]:
# ########################################################################
# If you're using your lab server, uncomment and run the below commands #
# ########################################################################

import os
os.environ['STUDENT_ID']="20201234"

if os.path.isdir('result'):
      !rm -rf result

%mkdir result
%cp ai504_project2.ipynb result/
%mv config.json model.pt result
!zip $STUDENT_ID.zip result/*

updating: result/ai504_project2.ipynb (deflated 80%)
updating: result/config.json (deflated 31%)
updating: result/model.pt (deflated 8%)
