transformer.py

# -*- coding: utf-8 -*-
"""Transformer.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/19zHIVjOUvVlFD2OlDaja0gmW2TsLsXuS
"""

import torch
import argparse
import math
import time
import argparse
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
from collections import OrderedDict
from tqdm import tqdm
import torch.optim as optim
import torch.utils.data
from torch.utils.data.sampler import SubsetRandomSampler
from torch.nn.parallel import DataParallel
from tensorflow.python.client import device_lib

import matplotlib.pyplot as plt


PAD = 0
UNK = 1
BOS = 2
EOS = 3

PAD_WORD = '<blank>'
UNK_WORD = '<unk>'
BOS_WORD = '<s>'
EOS_WORD = '</s>'

class ScaledDotProductAttention(nn.Module):
    ''' Scaled Dot-Product Attention '''

    def __init__(self, temperature, attn_dropout=0.1):
        super().__init__()
        self.temperature = temperature
        self.dropout = nn.Dropout(attn_dropout)
        self.softmax = nn.Softmax(dim=2)

    def forward(self, q, k, v, mask=None):

        attn = torch.bmm(q, k.transpose(1, 2))
        attn = attn / self.temperature

        if mask is not None:
            attn = attn.masked_fill(mask, -np.inf)

        attn = self.softmax(attn)
        attn = self.dropout(attn)
        output = torch.bmm(attn, v)

        return output, attn

class MultiHeadAttention(nn.Module):
    ''' Multi-Head Attention module '''

    def __init__(self, n_head, d_model, d_k, d_v, dropout=0.1):
        super().__init__()

        self.n_head = n_head
        self.d_k = d_k
        self.d_v = d_v

        self.w_qs = nn.Linear(d_model, n_head * d_k)
        self.w_ks = nn.Linear(d_model, n_head * d_k)
        self.w_vs = nn.Linear(d_model, n_head * d_v)
        nn.init.normal_(self.w_qs.weight, mean=0,
                        std=np.sqrt(2.0 / (d_model + d_k)))
        nn.init.normal_(self.w_ks.weight, mean=0,
                        std=np.sqrt(2.0 / (d_model + d_k)))
        nn.init.normal_(self.w_vs.weight, mean=0,
                        std=np.sqrt(2.0 / (d_model + d_v)))

        self.attention = ScaledDotProductAttention(
            temperature=np.power(d_k, 0.5))
        self.layer_norm = nn.LayerNorm(d_model)

        self.fc = nn.Linear(n_head * d_v, d_model)
        nn.init.xavier_normal_(self.fc.weight)

        self.dropout = nn.Dropout(dropout)

    def forward(self, q, k, v, mask=None):

        d_k, d_v, n_head = self.d_k, self.d_v, self.n_head

        sz_b, len_q, _ = q.size()
        sz_b, len_k, _ = k.size()
        sz_b, len_v, _ = v.size()

        residual = q

        q = self.w_qs(q).view(sz_b, len_q, n_head, d_k)
        k = self.w_ks(k).view(sz_b, len_k, n_head, d_k)
        v = self.w_vs(v).view(sz_b, len_v, n_head, d_v)

        q = q.permute(2, 0, 1, 3).contiguous().view(-1,
                                                    len_q, d_k)  # (n*b) x lq x dk
        k = k.permute(2, 0, 1, 3).contiguous().view(-1,
                                                    len_k, d_k)  # (n*b) x lk x dk
        v = v.permute(2, 0, 1, 3).contiguous().view(-1,
                                                    len_v, d_v)  # (n*b) x lv x dv

        mask = mask.repeat(n_head, 1, 1)  # (n*b) x .. x ..
        output, attn = self.attention(q, k, v, mask=mask)

        output = output.view(n_head, sz_b, len_q, d_v)
        output = output.permute(1, 2, 0, 3).contiguous().view(
            sz_b, len_q, -1)  # b x lq x (n*dv)

        output = self.dropout(self.fc(output))
        output = self.layer_norm(output + residual)

        return output, attn


class PositionwiseFeedForward(nn.Module):
    ''' A two-feed-forward-layer module '''

    def __init__(self, d_in, d_hid, dropout=0.1):
        super().__init__()
        self.w_1 = nn.Conv1d(d_in, d_hid, 1)  # position-wise
        self.w_2 = nn.Conv1d(d_hid, d_in, 1)  # position-wise
        self.layer_norm = nn.LayerNorm(d_in)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        residual = x
        output = x.transpose(1, 2)
        output = self.w_2(F.relu(self.w_1(output)))
        output = output.transpose(1, 2)
        output = self.dropout(output)
        output = self.layer_norm(output + residual)
        return output

class ScheduledOptim():
    '''A simple wrapper class for learning rate scheduling'''

    def __init__(self, optimizer, d_model, n_warmup_steps):
        self._optimizer = optimizer
        self.n_warmup_steps = n_warmup_steps
        self.n_current_steps = 0
        self.init_lr = np.power(d_model, -0.5)

    def step_and_update_lr(self):
        "Step with the inner optimizer"
        self._update_learning_rate()
        self._optimizer.step()

    def zero_grad(self):
        "Zero out the gradients by the inner optimizer"
        self._optimizer.zero_grad()

    def _get_lr_scale(self):
        return np.min([
            np.power(self.n_current_steps, -0.5),
            np.power(self.n_warmup_steps, -1.5) * self.n_current_steps])

    def _update_learning_rate(self):
        ''' Learning rate scheduling per step '''

        self.n_current_steps += 1
        lr = self.init_lr * self._get_lr_scale()

        for param_group in self._optimizer.param_groups:
            param_group['lr'] = lr

class EncoderLayer(nn.Module):
    ''' Compose with two layers '''

    def __init__(self, d_model, d_inner, n_head, d_k, d_v, dropout=0.1):
        super(EncoderLayer, self).__init__()
        self.slf_attn = MultiHeadAttention(
            n_head, d_model, d_k, d_v, dropout=dropout)
        self.pos_ffn = PositionwiseFeedForward(
            d_model, d_inner, dropout=dropout)

    def forward(self, enc_input, non_pad_mask=None, slf_attn_mask=None):
        enc_output, enc_slf_attn = self.slf_attn(
            enc_input, enc_input, enc_input, mask=slf_attn_mask)
        enc_output *= non_pad_mask

        enc_output = self.pos_ffn(enc_output)
        enc_output *= non_pad_mask

        return enc_output, enc_slf_attn


class DecoderLayer(nn.Module):
    ''' Compose with three layers '''

    def __init__(self, d_model, d_inner, n_head, d_k, d_v, dropout=0.1):
        super(DecoderLayer, self).__init__()
        self.slf_attn = MultiHeadAttention(
            n_head, d_model, d_k, d_v, dropout=dropout)
        self.enc_attn = MultiHeadAttention(
            n_head, d_model, d_k, d_v, dropout=dropout)
        self.pos_ffn = PositionwiseFeedForward(
            d_model, d_inner, dropout=dropout)

    def forward(self, dec_input, enc_output, non_pad_mask=None, slf_attn_mask=None, dec_enc_attn_mask=None):
        dec_output, dec_slf_attn = self.slf_attn(
            dec_input, dec_input, dec_input, mask=slf_attn_mask)
        dec_output *= non_pad_mask

        dec_output, dec_enc_attn = self.enc_attn(
            dec_output, enc_output, enc_output, mask=dec_enc_attn_mask)
        dec_output *= non_pad_mask

        dec_output = self.pos_ffn(dec_output)
        dec_output *= non_pad_mask

        return dec_output, dec_slf_attn, dec_enc_attn

class Beam():
    ''' Beam search '''

    def __init__(self, size, device=False, without_eos_bos=False):

        self.size = size
        self._done = False

        # The score for each translation on the beam.
        self.scores = torch.zeros((size,), dtype=torch.float, device=device)
        self.all_scores = []

        # The backpointers at each time-step.
        self.prev_ks = []

        self.without_eos_bos = without_eos_bos
        # The outputs at each time-step.
        self.next_ys = [torch.full((size,), Constants.PAD, dtype=torch.long, device=device)]
        if not without_eos_bos:
            self.next_ys[0][0] = Constants.BOS

    def get_current_state(self):
        "Get the outputs for the current timestep."
        return self.get_tentative_hypothesis()

    def get_current_origin(self):
        "Get the backpointers for the current timestep."
        return self.prev_ks[-1]

    @property
    def done(self):
        return self._done

    def advance(self, word_prob):
        "Update beam status and check if finished or not."
        num_words = word_prob.size(1)

        # Sum the previous scores.
        if len(self.prev_ks) > 0:
            beam_lk = word_prob + self.scores.unsqueeze(1).expand_as(word_prob)
        else:
            beam_lk = word_prob[0]

        flat_beam_lk = beam_lk.view(-1)

        best_scores, best_scores_id = flat_beam_lk.topk(self.size, 0, True, True)  # 1st sort
        best_scores, best_scores_id = flat_beam_lk.topk(self.size, 0, True, True)  # 2nd sort

        self.all_scores.append(self.scores)
        self.scores = best_scores

        # bestScoresId is flattened as a (beam x word) array,
        # so we need to calculate which word and beam each score came from
        prev_k = best_scores_id / num_words
        self.prev_ks.append(prev_k)
        self.next_ys.append(best_scores_id - prev_k * num_words)

        # End condition is when top-of-beam is EOS.
        # TODO: Add case for without_eos_bos
        end_word = Constants.EOS
        if self.without_eos_bos:
            end_word = Constants.PAD
            
        if self.next_ys[-1][0].item() == end_word:
            self._done = True
            self.all_scores.append(self.scores)

        return self._done

    def sort_scores(self):
        "Sort the scores."
        return torch.sort(self.scores, 0, True)

    def get_the_best_score_and_idx(self):
        "Get the score of the best in the beam."
        scores, ids = self.sort_scores()
        return scores[1], ids[1]

    def get_tentative_hypothesis(self):
        "Get the decoded sequence for the current timestep."

        if len(self.next_ys) == 1:
            dec_seq = self.next_ys[0].unsqueeze(1)
        else:
            _, keys = self.sort_scores()
            hyps = [self.get_hypothesis(k) for k in keys]
            hyps = [[Constants.BOS] + h for h in hyps]
            dec_seq = torch.LongTensor(hyps)

        return dec_seq

    def get_hypothesis(self, k):
        """ Walk back to construct the full hypothesis. """
        hyp = []
        for j in range(len(self.prev_ks) - 1, -1, -1):
            hyp.append(self.next_ys[j+1][k])
            k = self.prev_ks[j][k]

        return list(map(lambda x: x.item(), hyp[::-1]))

def get_non_pad_mask(seq):
    assert seq.dim() == 2
    return seq.ne(PAD).type(torch.float).unsqueeze(-1)


def get_sinusoid_encoding_table(n_position, d_hid, padding_idx=None):
    ''' Sinusoid position encoding table '''

    def cal_angle(position, hid_idx):
        return position / np.power(10000, 2 * (hid_idx // 2) / d_hid)

    def get_posi_angle_vec(position):
        return [cal_angle(position, hid_j) for hid_j in range(d_hid)]

    sinusoid_table = np.array([get_posi_angle_vec(pos_i)
                               for pos_i in range(n_position)])

    sinusoid_table[:, 0::2] = np.sin(sinusoid_table[:, 0::2])  # dim 2i
    sinusoid_table[:, 1::2] = np.cos(sinusoid_table[:, 1::2])  # dim 2i+1

    if padding_idx is not None:
        # zero vector for padding dimension
        sinusoid_table[padding_idx] = 0.

    return torch.FloatTensor(sinusoid_table)


def get_attn_key_pad_mask(seq_k, seq_q):
    ''' For masking out the padding part of key sequence. '''

    # Expand to fit the shape of key query attention matrix.
    len_q = seq_q.size(1)
    padding_mask = seq_k.eq(PAD)
    padding_mask = padding_mask.unsqueeze(
        1).expand(-1, len_q, -1)  # b x lq x lk

    return padding_mask


def get_subsequent_mask(seq):
    ''' For masking out the subsequent info. '''

    sz_b, len_s = seq.size()
    subsequent_mask = torch.triu(
        torch.ones((len_s, len_s), device=seq.device, dtype=torch.uint8), diagonal=1)
    subsequent_mask = subsequent_mask.unsqueeze(
        0).expand(sz_b, -1, -1)  # b x ls x ls

    return subsequent_mask


class Encoder(nn.Module):
    ''' A encoder model with self attention mechanism. '''

    def __init__(
            self,
            n_src_vocab, len_max_seq, d_word_vec,
            n_layers, n_head, d_k, d_v,
            d_model, d_inner, dropout=0.1):

        super().__init__()

        n_position = len_max_seq + 1

        self.linear = nn.Linear(21, d_word_vec)

        self.src_word_emb = nn.Embedding(
            n_src_vocab, d_word_vec, padding_idx=PAD)

        self.position_enc = nn.Embedding.from_pretrained(
            get_sinusoid_encoding_table(n_position, d_word_vec, padding_idx=0),
            freeze=True)

        self.layer_stack = nn.ModuleList([
            EncoderLayer(d_model, d_inner, n_head, d_k, d_v, dropout=dropout)
            for _ in range(n_layers)])

    def forward(self, src_seq, src_sp, src_pos, return_attns=False):
        src_sp = src_sp.transpose(1, 2)
        enc_slf_attn_list = []

        # -- Prepare masks
        slf_attn_mask = get_attn_key_pad_mask(seq_k=src_seq, seq_q=src_seq)
        non_pad_mask = get_non_pad_mask(src_seq)

        # -- Forward
        enc_output = self.src_word_emb(
            src_seq) + self.linear(src_sp) + self.position_enc(src_pos)

        for enc_layer in self.layer_stack:
            enc_output, enc_slf_attn = enc_layer(
                enc_output,
                non_pad_mask=non_pad_mask,
                slf_attn_mask=slf_attn_mask)
            if return_attns:
                enc_slf_attn_list += [enc_slf_attn]

        if return_attns:
            return enc_output, enc_slf_attn_list
        return enc_output,


class Decoder(nn.Module):
    ''' A decoder model with self attention mechanism. '''

    def __init__(
            self,
            n_tgt_vocab, len_max_seq, d_word_vec,
            n_layers, n_head, d_k, d_v,
            d_model, d_inner, dropout=0.1):

        super().__init__()
        n_position = len_max_seq + 1

        self.tgt_word_emb = nn.Embedding(
            n_tgt_vocab, d_word_vec, padding_idx=PAD)

        self.position_enc = nn.Embedding.from_pretrained(
            get_sinusoid_encoding_table(n_position, d_word_vec, padding_idx=0),
            freeze=True)

        self.layer_stack = nn.ModuleList([
            DecoderLayer(d_model, d_inner, n_head, d_k, d_v, dropout=dropout)
            for _ in range(n_layers)])

    def forward(self, tgt_seq, tgt_pos, src_seq, enc_output, return_attns=False):

        dec_slf_attn_list, dec_enc_attn_list = [], []

        # -- Prepare masks
        non_pad_mask = get_non_pad_mask(tgt_seq)

        slf_attn_mask_subseq = get_subsequent_mask(tgt_seq)
        slf_attn_mask_keypad = get_attn_key_pad_mask(
            seq_k=tgt_seq, seq_q=tgt_seq)
        slf_attn_mask = (slf_attn_mask_keypad.type(torch.uint8) + slf_attn_mask_subseq.type(torch.uint8)).gt(0)

        dec_enc_attn_mask = get_attn_key_pad_mask(seq_k=src_seq, seq_q=tgt_seq)

        # -- Forward
        dec_output = self.tgt_word_emb(tgt_seq) + self.position_enc(tgt_pos)

        for dec_layer in self.layer_stack:
            dec_output, dec_slf_attn, dec_enc_attn = dec_layer(
                dec_output, enc_output,
                non_pad_mask=non_pad_mask,
                slf_attn_mask=slf_attn_mask,
                dec_enc_attn_mask=dec_enc_attn_mask)

            if return_attns:
                dec_slf_attn_list += [dec_slf_attn]
                dec_enc_attn_list += [dec_enc_attn]

        if return_attns:
            return dec_output, dec_slf_attn_list, dec_enc_attn_list
        return dec_output,


class Transformer(nn.Module):
    ''' A sequence to sequence model with attention mechanism. '''

    def __init__(
            self,
            n_src_vocab, n_tgt_vocab, len_max_seq,
            d_word_vec=512, d_model=512, d_inner=2048,
            n_layers=6, n_head=8, d_k=64, d_v=64, dropout=0.1,
            tgt_emb_prj_weight_sharing=True,
            emb_src_tgt_weight_sharing=True):

        super().__init__()

        self.encoder = Encoder(
            n_src_vocab=n_src_vocab, len_max_seq=len_max_seq,
            d_word_vec=d_word_vec, d_model=d_model, d_inner=d_inner,
            n_layers=n_layers, n_head=n_head, d_k=d_k, d_v=d_v,
            dropout=dropout)

        self.decoder = Decoder(
            n_tgt_vocab=n_tgt_vocab, len_max_seq=len_max_seq,
            d_word_vec=d_word_vec, d_model=d_model, d_inner=d_inner,
            n_layers=n_layers, n_head=n_head, d_k=d_k, d_v=d_v,
            dropout=dropout)

        self.tgt_word_prj = nn.Linear(d_model, n_tgt_vocab, bias=False)
        nn.init.xavier_normal_(self.tgt_word_prj.weight)

        assert d_model == d_word_vec, \
            'To facilitate the residual connections, \
         the dimensions of all module outputs shall be the same.'

        if tgt_emb_prj_weight_sharing:
            # Share the weight matrix between target word embedding & the final logit dense layer
            self.tgt_word_prj.weight = self.decoder.tgt_word_emb.weight
            self.x_logit_scale = (d_model ** -0.5)
        else:
            self.x_logit_scale = 1.

        if emb_src_tgt_weight_sharing:
            # Share the weight matrix between source & target word embeddings
            assert n_src_vocab == n_tgt_vocab, \
                "To share word embedding table, the vocabulary size of src/tgt shall be the same."
            self.encoder.src_word_emb.weight = self.decoder.tgt_word_emb.weight

    def forward(self, src_seq, src_sp, src_pos, tgt_seq, tgt_pos):

        tgt_seq, tgt_pos = tgt_seq[:, :-1], tgt_pos[:, :-1]

        enc_output, *_ = self.encoder(src_seq, src_sp, src_pos)
        dec_output, *_ = self.decoder(tgt_seq, tgt_pos, src_seq, enc_output)
        seq_logit = self.tgt_word_prj(dec_output) * self.x_logit_scale

        return seq_logit.view(-1, seq_logit.size(2))

class Translator(object):
    ''' Load with trained model and handle the beam search '''

    def __init__(self, opt):
        self.opt = opt
        self.device = torch.device('cuda' if opt.cuda else 'cpu')

        checkpoint = torch.load(opt.model)
        model_opt = checkpoint['settings']
        self.model_opt = model_opt

        model = Transformer(
            model_opt.src_vocab_size,
            model_opt.tgt_vocab_size,
            model_opt.max_token_seq_len,
            tgt_emb_prj_weight_sharing=model_opt.proj_share_weight,
            emb_src_tgt_weight_sharing=model_opt.embs_share_weight,
            d_k=model_opt.d_k,
            d_v=model_opt.d_v,
            d_model=model_opt.d_model,
            d_word_vec=model_opt.d_word_vec,
            d_inner=model_opt.d_inner_hid,
            n_layers=model_opt.n_layers,
            n_head=model_opt.n_head,
            dropout=model_opt.dropout)

        model_state = OrderedDict()
        for key, value in checkpoint['model'].items():
            key = key[7:]
            model_state[key] = value

        model.load_state_dict(model_state)

        print('[Info] Trained model state loaded.')

        model.word_prob_prj = nn.LogSoftmax(dim=1)

        model = model.to(self.device)

        self.model = model
        self.model.eval()

    def translate_batch(self, src_seq, src_sp, src_pos):
        ''' Translation work in one batch '''

        def get_inst_idx_to_tensor_position_map(inst_idx_list):
            ''' Indicate the position of an instance in a tensor. '''
            return {inst_idx: tensor_position for tensor_position, inst_idx in enumerate(inst_idx_list)}

        def collect_active_part(beamed_tensor, curr_active_inst_idx, n_prev_active_inst, n_bm):
            ''' Collect tensor parts associated to active instances. '''

            _, *d_hs = beamed_tensor.size()
            n_curr_active_inst = len(curr_active_inst_idx)
            new_shape = (n_curr_active_inst * n_bm, *d_hs)

            beamed_tensor = beamed_tensor.view(n_prev_active_inst, -1)
            beamed_tensor = beamed_tensor.index_select(0, curr_active_inst_idx)
            beamed_tensor = beamed_tensor.view(*new_shape)

            return beamed_tensor

        def collate_active_info(
                src_seq, src_enc, inst_idx_to_position_map, active_inst_idx_list):
            # Sentences which are still active are collected,
            # so the decoder will not run on completed sentences.
            n_prev_active_inst = len(inst_idx_to_position_map)
            active_inst_idx = [inst_idx_to_position_map[k] for k in active_inst_idx_list]
            active_inst_idx = torch.LongTensor(active_inst_idx).to(self.device)

            active_src_seq = collect_active_part(src_seq, active_inst_idx, n_prev_active_inst, n_bm)
            active_src_enc = collect_active_part(src_enc, active_inst_idx, n_prev_active_inst, n_bm)
            active_inst_idx_to_position_map = get_inst_idx_to_tensor_position_map(active_inst_idx_list)

            return active_src_seq, active_src_enc, active_inst_idx_to_position_map

        def beam_decode_step(
                inst_dec_beams, len_dec_seq, src_seq, enc_output, inst_idx_to_position_map, n_bm):
            ''' Decode and update beam status, and then return active beam idx '''

            def prepare_beam_dec_seq(inst_dec_beams, len_dec_seq):
                dec_partial_seq = [b.get_current_state() for b in inst_dec_beams if not b.done]
                dec_partial_seq = torch.stack(dec_partial_seq).to(self.device)
                dec_partial_seq = dec_partial_seq.view(-1, len_dec_seq)
                return dec_partial_seq

            def prepare_beam_dec_pos(len_dec_seq, n_active_inst, n_bm):
                dec_partial_pos = torch.arange(1, len_dec_seq + 1, dtype=torch.long, device=self.device)
                dec_partial_pos = dec_partial_pos.unsqueeze(0).repeat(n_active_inst * n_bm, 1)
                return dec_partial_pos

            def predict_word(dec_seq, dec_pos, src_seq, enc_output, n_active_inst, n_bm):
                dec_output, *_ = self.model.decoder(dec_seq, dec_pos, src_seq, enc_output)
                dec_output = dec_output[:, -1, :] # Pick the last step: (bh * bm) * d_h
                word_prob = F.log_softmax(self.model.tgt_word_prj(dec_output), dim=1)
                word_prob = word_prob.view(n_active_inst, n_bm, -1)

                return word_prob

            def collect_active_inst_idx_list(inst_beams, word_prob, inst_idx_to_position_map):
                active_inst_idx_list = []
                for inst_idx, inst_position in inst_idx_to_position_map.items():
                    is_inst_complete = inst_beams[inst_idx].advance(word_prob[inst_position])
                    if not is_inst_complete:
                        active_inst_idx_list += [inst_idx]

                return active_inst_idx_list

            n_active_inst = len(inst_idx_to_position_map)

            dec_seq = prepare_beam_dec_seq(inst_dec_beams, len_dec_seq)
            dec_pos = prepare_beam_dec_pos(len_dec_seq, n_active_inst, n_bm)
            word_prob = predict_word(dec_seq, dec_pos, src_seq, enc_output, n_active_inst, n_bm)

            # Update the beam with predicted word prob information and collect incomplete instances
            active_inst_idx_list = collect_active_inst_idx_list(
                inst_dec_beams, word_prob, inst_idx_to_position_map)

            return active_inst_idx_list

        def collect_hypothesis_and_scores(inst_dec_beams, n_best):
            all_hyp, all_scores = [], []
            for inst_idx in range(len(inst_dec_beams)):
                scores, tail_idxs = inst_dec_beams[inst_idx].sort_scores()
                all_scores += [scores[:n_best]]

                hyps = [inst_dec_beams[inst_idx].get_hypothesis(
                    i) for i in tail_idxs[:n_best]]
                all_hyp += [hyps]
            return all_hyp, all_scores

        with torch.no_grad():
            #-- Encode
            src_seq, src_sp, src_pos = src_seq.to(self.device), src_sp.to(self.device), src_pos.to(self.device)
            src_enc, *_ = self.model.encoder(src_seq, src_sp, src_pos)

            # -- Repeat data for beam search
            n_bm = self.opt.beam_size
            n_inst, len_s, d_h = src_enc.size()
            src_seq = src_seq.repeat(1, n_bm).view(n_inst * n_bm, len_s)
            src_enc = src_enc.repeat(1, n_bm, 1).view(n_inst * n_bm, len_s, d_h)

            # -- Prepare beams
            inst_dec_beams = [Beam(n_bm, device=self.device) for _ in range(n_inst)]

            # -- Bookkeeping for active or not
            active_inst_idx_list = list(range(n_inst))
            inst_idx_to_position_map = get_inst_idx_to_tensor_position_map(active_inst_idx_list)

            #-- Decode
            for len_dec_seq in range(1, self.model_opt.max_token_seq_len + 1):

                active_inst_idx_list = beam_decode_step(inst_dec_beams, len_dec_seq, src_seq, src_enc, inst_idx_to_position_map, n_bm)

                if not active_inst_idx_list:
                    break  # all instances have finished their path to <EOS>

                src_seq, src_enc, inst_idx_to_position_map = collate_active_info(
                    src_seq, src_enc, inst_idx_to_position_map, active_inst_idx_list)

        batch_hyp, batch_scores = collect_hypothesis_and_scores(inst_dec_beams, self.opt.n_best)

        return batch_hyp, batch_scores

def plot(train_loss, val_loss):
    epoch_count = range(1, len(train_loss) + 1)
    plt.plot(epoch_count, train_loss, 'r-')
    plt.plot(epoch_count, val_loss, 'b-')
    plt.legend(['Training Loss', 'Validation Loss'])
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    return plt

def plot2(train_loss, val_loss):
    epoch_count = range(1, len(train_loss) + 1)
    plt.plot(epoch_count, train_loss, 'r-')
    plt.plot(epoch_count, val_loss, 'b-')
    plt.legend(['Training Accuracy', 'Validation Accuracy'])
    plt.xlabel('Epoch')
    plt.ylabel('Accuracy')
    return plt


# calculating accuracy
def get_acc(gt, pred):
    assert len(gt) == len(pred)
    correct = 0
    for i in range(len(gt)):
        if gt[i] == pred[i]:
            correct += 1

    return (1.0 * correct) / len(gt)


def cal_performance(pred, gold, smoothing=False, crossEntropy=None):
    ''' Apply label smoothing if needed '''
    loss = cal_loss(pred, gold, smoothing, crossEntropy)
    pred = pred.max(1)[1]

    gold = gold.contiguous().view(-1)

    non_pad_mask = gold.ne(PAD)
    n_correct = pred.eq(gold)
    n_correct = n_correct.masked_select(non_pad_mask).sum().item()

    test1 = pred.masked_select(pred.ne(PAD)).tolist()
    test2 = gold.masked_select(non_pad_mask).tolist()

    # TODO: Fixing here
    list_of_lists1 = []
    acc = []
    for i in test1:
        acc.append(i)
        if (i == EOS):
            list_of_lists1.append(acc)
            acc = []

    list_of_lists2 = []
    acc = []
    for i in test2:
        acc.append(i)
        if (i == EOS):
            # print(acc)
            list_of_lists2.append(acc)
            acc = []

    accuracies = []
    for test1, test2 in zip(list_of_lists1, list_of_lists2):
        if (len(test1) == len(test2)):
            accuracies.append(get_acc(test1, test2))

    if len(accuracies) == 0:
        mean = 0
    else:
        mean = np.mean(accuracies)

    return loss, n_correct, mean


def cal_loss(pred, gold, smoothing, crossEntropy):
    ''' Calculate cross entropy loss, apply label smoothing if needed. '''
    gold = gold.contiguous().view(-1)
    # return FocalLoss()(pred, gold)

    if smoothing:
        eps = 0.1
        n_class = pred.size(1)

        one_hot = torch.zeros_like(pred).scatter(1, gold.view(-1, 1), 1)
        one_hot = one_hot * (1 - eps) + (1 - one_hot) * eps / (n_class - 1)
        log_prb = F.log_softmax(pred, dim=1)

        non_pad_mask = gold.ne(PAD)
        loss = -(one_hot * log_prb).sum(dim=1)
        loss = loss.masked_select(non_pad_mask).sum()  # average later
    else:
        loss = crossEntropy(pred, gold)
        # loss = F.cross_entropy(
        #    pred, gold, ignore_index=Constants.PAD, reduction='sum')

    return loss


def train_epoch(model, training_data, optimizer, device, smoothing, crossEntropy):
    ''' Epoch operation in training phase'''
    model.train()

    total_loss = 0
    n_word_batch_mean = 0
    n_batch = 0

    accu = []
    for batch in tqdm(
            training_data, mininterval=2,
            desc='  - (Training)   ', leave=False):
        # prepare data

        src_seq, src_sp, src_pos, tgt_seq, tgt_pos = map(
            lambda x: x.to(device), batch)
        gold = tgt_seq[:, 1:]

        # forward
        optimizer.zero_grad()
        pred = model(src_seq, src_sp, src_pos, tgt_seq, tgt_pos)

        # backward
        loss, n_correct, accuracy2 = cal_performance(pred, gold, smoothing=smoothing, crossEntropy=crossEntropy)
        loss.backward()

        accu.append(accuracy2)
        # update parameters
        optimizer.step_and_update_lr()

        n_batch += 1
        non_pad_mask = gold.ne(PAD)
        n_word = non_pad_mask.sum().item()
        total_loss += loss.item() / n_word
        n_word_batch_mean += n_correct / n_word

    # loss_per_word = total_loss/n_word_total
    mean_loss = total_loss / n_batch
    accuracy = n_word_batch_mean / n_batch

    return mean_loss, accuracy, np.mean(accu)


def eval_epoch(model, validation_data, device, crossEntropy):
    ''' Epoch operation in evaluation phase '''

    model.eval()

    total_loss = 0
    n_word_batch_mean = 0
    n_batch = 0

    accu = []
    with torch.no_grad():
        for batch in tqdm(
                validation_data, mininterval=2,
                desc='  - (Validation) ', leave=False):
            # prepare data
            src_seq, src_sp, src_pos, tgt_seq, tgt_pos = map(
                lambda x: x.to(device), batch)
            gold = tgt_seq[:, 1:]

            # forward
            pred = model(src_seq, src_sp, src_pos, tgt_seq, tgt_pos)
            loss, n_correct, accuracy2 = cal_performance(pred, gold, smoothing=False, crossEntropy=crossEntropy)

            n_batch += 1
            # note keeping

            non_pad_mask = gold.ne(PAD)
            n_word = non_pad_mask.sum().item()
            total_loss += loss.item() / n_word
            n_word_batch_mean += n_correct / n_word
            accu.append(accuracy2)

    mean_loss = total_loss / n_batch
    accuracy = n_word_batch_mean / n_batch
    return mean_loss, accuracy, np.mean(accu)


def test(model, test_data, device, opt, crossEntropy):
    log_test_file = None

    if opt.log:
        log_test_file = opt.log + '.test.log'

        with open(log_test_file, 'w') as log_test:
            log_test.write('loss,accuracy,real_accuracy,elapsed\n')

    start = time.time()
    valid_loss, valid_accu, new_accu = eval_epoch(model, test_data, device, crossEntropy)
    print('  - (Validation) ppl: {ppl: 8.5f}, accuracy: {accu:3.3f} %, right_accuracy: {accu2:3.3f} % '
          'elapsed: {elapse:3.3f} min'.format(
        ppl=math.exp(min(valid_loss, 100)), accu=100 * valid_accu, accu2=100 * new_accu,
        elapse=(time.time() - start) / 60))

    if log_test_file:
        with open(log_test_file, 'a') as log_test:
            log_test.write('{loss: 8.5f},{accu1: 3.3f},{accu2:3.3f},{elapse:3.3f}\n'.format(
                loss=valid_loss, accu1=100 * valid_accu, accu2=100 * new_accu,
                elapse=(time.time() - start) / 60))


def train(model, training_data, validation_data, optimizer, device, opt, crossEntropy):
    ''' Start training '''

    log_train_file = None
    log_valid_file = None

    if opt.log:
        log_train_file = opt.log + '.train.log'
        log_valid_file = opt.log + '.valid.log'

        print('[Info] Training performance will be written to file: {} and {}'.format(
            log_train_file, log_valid_file))

        with open(log_train_file, 'w') as log_tf, open(log_valid_file, 'w') as log_vf:
            log_tf.write('epoch,loss,accuracy,real_accuracy,elapsed\n')
            log_vf.write('epoch,loss,accuracy,real_accuracy,elapsed\n')

    train_acc_all = []
    val_acc_all = []
    train_loss_all = []
    val_loss_all = []
    valid_losses = []
    patience_fixed = 50
    patience = patience_fixed
    for epoch_i in range(opt.epoch):
        print('[ Epoch', epoch_i, ']')

        start = time.time()
        train_loss, train_accu, train_accuracy2 = train_epoch(
            model, training_data, optimizer, device, smoothing=opt.label_smoothing, crossEntropy=crossEntropy)
        print('  - (Training)   loss: {ppl: 8.5f}, accuracy: {accu:3.3f} %, accuracy_right: {accu2:3.3f} % '
              'elapsed: {elapse:3.3f} min'.format(
            ppl=train_loss, accu=100 * train_accu, accu2=100 * train_accuracy2,
            elapse=(time.time() - start) / 60))

        start = time.time()
        valid_loss, valid_accu, val_accuracy2 = eval_epoch(model, validation_data, device, crossEntropy=crossEntropy)
        print('  - (Validation) loss: {ppl: 8.5f}, accuracy: {accu:3.3f} %, accuracy_right: {accu2:3.3f} % '
              'elapsed: {elapse:3.3f} min'.format(
            ppl=valid_loss, accu=100 * valid_accu, accu2=100 * val_accuracy2,
            elapse=(time.time() - start) / 60))

        valid_losses += [valid_loss]

        model_state_dict = model.state_dict()
        checkpoint = {
            'model': model_state_dict,
            'settings': opt,
            'epoch': epoch_i}

        if opt.save_model:
            if opt.save_mode == 'all':
                model_name = opt.save_model + \
                             '_accu_{accu:3.3f}.chkpt'.format(accu=100 * valid_accu)
                torch.save(checkpoint, model_name)
            elif opt.save_mode == 'best':
                model_name = opt.save_model + '.chkpt'
                if valid_loss <= min(valid_losses):
                    torch.save(checkpoint, model_name)
                    print('    - [Info] The checkpoint file has been updated.')

        if valid_loss > min(valid_losses):
            patience = patience - 1
        else:
            patience = patience_fixed

        if patience < 1:
            print("- [Info] Early Stopping...")
            return train_loss_all, val_loss_all

        if log_train_file and log_valid_file:
            with open(log_train_file, 'a') as log_tf, open(log_valid_file, 'a') as log_vf:
                log_tf.write('{epoch},{loss: 8.5f},{accu1: 3.3f},{accu2:3.3f},{elapse:3.3f}\n'.format(
                    epoch=epoch_i, loss=train_loss, accu1=100 * train_accu, accu2=100 * train_accuracy2,
                    elapse=(time.time() - start) / 60))
                log_vf.write('{epoch},{loss: 8.5f},{accu1: 3.3f},{accu2:3.3f},{elapse:3.3f}\n'.format(
                    epoch=epoch_i, loss=valid_loss, accu1=100 * valid_accu, accu2=100 * val_accuracy2,
                    elapse=(time.time() - start) / 60))
        train_loss_all.append(train_loss)
        val_loss_all.append(valid_loss)
        train_acc_all.append(train_accu)
        val_acc_all.append(valid_accu)

    return train_loss_all, val_loss_all, train_acc_all, val_acc_all


def paired_collate_fn(insts):
        src_insts, tgt_insts, sp_insts = list(zip(*insts))

        src_insts = collate_fn_x(src_insts, sp_insts)
        tgt_insts = collate_fn(tgt_insts)

        return (*src_insts, *tgt_insts)

def collate_fn_x(insts, sp_insts):
        ''' Pad the instance to the max seq length in batch '''

        max_len = max(len(inst) for inst in insts)
        # print(max_len)
        # max_len_sp = max(len(inst) for inst in [sp for sp in sp_insts])
        # print(max_len_sp)

        # print(insts)
        # print(sp_insts)

        batch_seq = np.array([
            inst + [PAD] * (max_len - len(inst))
            for inst in insts])

        batch_sp = np.array([[
            inst.tolist() + [PAD] * (max_len - len(inst))
            for inst in sp]
            for sp in sp_insts])
        batch_pos = np.array([
            [pos_i + 1 if w_i != PAD else 0
             for pos_i, w_i in enumerate(inst)] for inst in batch_seq])

        # print(np.shape(batch_seq))
        # print(np.shape(batch_sp))
        batch_seq = torch.LongTensor(batch_seq)
        batch_sp = torch.FloatTensor(batch_sp)
        batch_pos = torch.LongTensor(batch_pos)

        return batch_seq, batch_sp, batch_pos

def collate_fn_test(insts):
        src_insts, sp_insts = list(zip(*insts))

        max_len = max(len(inst) for inst in src_insts)
        batch_seq = np.array([
            inst + [PAD] * (max_len - len(inst))
            for inst in src_insts])

        batch_sp = np.array([[
            inst.tolist() + [PAD] * (max_len - len(inst))
            for inst in sp]
            for sp in sp_insts])

        batch_pos = np.array([
            [pos_i + 1 if w_i != PAD else 0
             for pos_i, w_i in enumerate(inst)] for inst in batch_seq])

        batch_seq = torch.LongTensor(batch_seq)
        batch_pos = torch.LongTensor(batch_pos)
        batch_sp = torch.FloatTensor(batch_sp)

        return batch_seq, batch_sp, batch_pos

def collate_fn(insts):
        ''' Pad the instance to the max seq length in batch '''

        max_len = max(len(inst) for inst in insts)

        batch_seq = np.array([
            inst + [PAD] * (max_len - len(inst))
            for inst in insts])

        batch_pos = np.array([
            [pos_i + 1 if w_i != PAD else 0
             for pos_i, w_i in enumerate(inst)] for inst in batch_seq])

        batch_seq = torch.LongTensor(batch_seq)
        batch_pos = torch.LongTensor(batch_pos)

        return batch_seq, batch_pos
class TranslationDataset(torch.utils.data.Dataset):
    def __init__(
            self, src_word2idx, tgt_word2idx,
            src_insts=None, tgt_insts=None, sp_insts=None):

        assert src_insts
        assert not tgt_insts or (len(src_insts) == len(tgt_insts))

        src_idx2word = {idx: word for word, idx in src_word2idx.items()}
        self._src_word2idx = src_word2idx
        self._src_idx2word = src_idx2word
        self._src_insts = src_insts

        tgt_idx2word = {idx: word for word, idx in tgt_word2idx.items()}
        self._tgt_word2idx = tgt_word2idx
        self._tgt_idx2word = tgt_idx2word
        self._tgt_insts = tgt_insts

        self.sp_insts = sp_insts

    @property
    def n_insts(self):
        ''' Property for dataset size '''
        return len(self._src_insts)

    @property
    def src_vocab_size(self):
        ''' Property for vocab size '''
        return len(self._src_word2idx)

    @property
    def tgt_vocab_size(self):
        ''' Property for vocab size '''
        return len(self._tgt_word2idx)

    @property
    def src_word2idx(self):
        ''' Property for word dictionary '''
        return self._src_word2idx

    @property
    def tgt_word2idx(self):
        ''' Property for word dictionary '''
        return self._tgt_word2idx

    @property
    def src_idx2word(self):
        ''' Property for index dictionary '''
        return self._src_idx2word

    @property
    def tgt_idx2word(self):
        ''' Property for index dictionary '''
        return self._tgt_idx2word

    def __len__(self):
        return self.n_insts

    def __getitem__(self, idx):
        if self._tgt_insts:
            return self._src_insts[idx], self._tgt_insts[idx], self.sp_insts[idx]
        return self._src_insts[idx], self.sp_insts[idx]

def prepare_dataloaders(data, opt):

    validation_split = 0.1
    shuffle_dataset = True
    random_seed = 42

    initDataset = TranslationDataset(
        src_word2idx=data['dict']['src'],
        tgt_word2idx=data['dict']['tgt'],
        src_insts=data['train']['src'],
        tgt_insts=data['train']['tgt'],
        sp_insts=data['train']['sp'])
    # Creating data indices for training and validation splits:
    dataset_size = len(initDataset)
    indices = list(range(dataset_size))
    split = int(np.floor(validation_split * dataset_size))
    if shuffle_dataset:
        np.random.seed(random_seed)
        np.random.shuffle(indices)
    train_indices, val_indices = indices[split:], indices[:split]

    # Creating PT data samplers and loaders:
    train_sampler = SubsetRandomSampler(train_indices)
    valid_sampler = SubsetRandomSampler(val_indices)

    # ========= Preparing DataLoader =========#
    train_loader = torch.utils.data.DataLoader(
        initDataset,
        num_workers=4,
        batch_size=opt.batch_size,
        collate_fn=paired_collate_fn,
        sampler=train_sampler)

    valid_loader = torch.utils.data.DataLoader(
        initDataset,
        num_workers=4,
        batch_size=opt.batch_size,
        collate_fn=paired_collate_fn,
        sampler=valid_sampler)

    test_loader = torch.utils.data.DataLoader(
        TranslationDataset(
            src_word2idx=data['dict']['src'],
            tgt_word2idx=data['dict']['tgt'],
            src_insts=data['valid']['src'],
            tgt_insts=data['valid']['tgt'],
            sp_insts=data['valid']['sp']
        ),
        num_workers=4,
        batch_size=opt.batch_size,
        collate_fn=paired_collate_fn)

    return train_loader, valid_loader, test_loader


def evaluate_saved_model(test_data, device, opt, crossEntropy):
  model = Transformer(
        opt.src_vocab_size,
        opt.tgt_vocab_size,
        opt.max_token_seq_len,
        tgt_emb_prj_weight_sharing=opt.proj_share_weight,
        emb_src_tgt_weight_sharing=opt.embs_share_weight,
        d_k=opt.d_k,
        d_v=opt.d_v,
        d_model=opt.d_model,
        d_word_vec=opt.d_word_vec,
        d_inner=opt.d_inner_hid,
        n_layers=opt.n_layers,
        n_head=opt.n_head,
        dropout=opt.dropout)
  checkpoint = torch.load("modelTransformer2.chkpt" ,map_location='cpu')
  model.load_state_dict(checkpoint['model'])
  test(model, test_data, device, opt, crossEntropy)

print(device_lib.list_local_devices())   
parser = argparse.ArgumentParser()
parser.add_argument("-f", "--fff", help="a dummy argument to fool ipython", default="1")

parser.add_argument('-data', default="data.pt")

parser.add_argument('-epoch', type=int, default=30)
parser.add_argument('-batch_size', type=int, default=9)

    #parser.add_argument('-d_word_vec', type=int, default=512)
parser.add_argument('-d_model', type=int, default=256)
parser.add_argument('-d_inner_hid', type=int, default=512)
parser.add_argument('-d_k', type=int, default=64)
parser.add_argument('-d_v', type=int, default=64)

parser.add_argument('-n_head', type=int, default=8)
parser.add_argument('-n_layers', type=int, default=6)
parser.add_argument('-n_warmup_steps', type=int, default=50000)

parser.add_argument('-dropout', type=float, default=0.1)
parser.add_argument('-embs_share_weight', action='store_true')
parser.add_argument('-proj_share_weight', action='store_true')

parser.add_argument('-log', default=None)
parser.add_argument('-save_model', default="model")
parser.add_argument('-save_plot', default="loss.png")
parser.add_argument('-save_mode', type=str,
                        choices=['all', 'best'], default='best')


parser.add_argument('-no_cuda', action='store_true')
parser.add_argument('-label_smoothing', action='store_true')

opt = parser.parse_args()
opt.cuda = not opt.no_cuda
opt.d_word_vec = opt.d_model

    #========= Loading Dataset =========#
data = torch.load(opt.data)
opt.max_token_seq_len = data['settings'].max_token_seq_len

    #training_data, validation_data, test_data = prepare_dataloaders(data, opt)
training_data, validation_data, test_data = prepare_dataloaders(data, opt)

opt.src_vocab_size = training_data.dataset.src_vocab_size
opt.tgt_vocab_size = training_data.dataset.tgt_vocab_size

opt.vocab_src = training_data.dataset.src_word2idx
opt.vocab_tgt = training_data.dataset.tgt_word2idx

    # ========= Preparing Model =========#

if opt.embs_share_weight:
    assert training_data.dataset.src_word2idx == training_data.dataset.tgt_word2idx, \
            'The src/tgt word2idx table are different but asked to share word embedding.'

print(opt)

device = torch.device('cuda' if opt.cuda else 'cpu')

transformer = Transformer(
        opt.src_vocab_size,
        opt.tgt_vocab_size,
        opt.max_token_seq_len,
        tgt_emb_prj_weight_sharing=opt.proj_share_weight,
        emb_src_tgt_weight_sharing=opt.embs_share_weight,
        d_k=opt.d_k,
        d_v=opt.d_v,
        d_model=opt.d_model,
        d_word_vec=opt.d_word_vec,
        d_inner=opt.d_inner_hid,
        n_layers=opt.n_layers,
        n_head=opt.n_head,
        dropout=opt.dropout)

transformer = DataParallel(transformer, range(0, torch.cuda.device_count())).to(device)

optimizer = ScheduledOptim(
        optim.Adam(
            filter(lambda x: x.requires_grad, transformer.parameters()),
            betas=(0.9, 0.98), eps=1e-09),
        opt.d_model, opt.n_warmup_steps)

weight_mask = None

crossEntropy = nn.CrossEntropyLoss(weight_mask, reduction='sum', ignore_index=PAD)

train_loss, val_loss, train_acc, val_acc = train(
        transformer, training_data, validation_data, optimizer, device, opt, crossEntropy)
print("Starting Test...")
test(transformer, test_data, device, opt, crossEntropy)
print("Making loss graph...")
plt = plot(train_loss, val_loss)
plt.savefig("loss.png")
plt.show()
plt2 = plot2(train_acc, val_acc)
plt2.savefig("accuracy.png")
plt2.show()
print("Finished!")