In [1]:
import numpy as np
import torch
import torch.nn as nn
import random
import time
import copy
from torch.nn import functional as F
from torch.autograd import Variable
from torch import optim
from rouge import Rouge
from data import *
from utils import *

## Tokens
e.g.
```
[["Musicians to tackle US red tape Musicians ' groups are to tackle US visa regulations which are blamed for hindering",
  "Nigel McCune from the Musicians ' Union said British musicians"],
 ["U2 's desire to be number one U2 , who have won three prestigious Grammy Awards for their hit Vertigo",
  'But they still want more.They have to want to be'],
 ["Rocker Doherty in on-stage fight Rock singer Pete Doherty has been involved in a fight with his band 's guitarist",
  'Babyshambles , which he formed after his acrimonious departure from']]
```

In [2]:
train, dev = load_datasets('./Datasets/BBC_News_business.pkl', './Datasets/BBC_News_business.pkl')

In [3]:
train[0]

["UK economy facing 'major risks '  The UK manufacturing sector will continue to face `` serious challenges '' over the next two years , the British Chamber of Commerce ( BCC ) has said .  The group 's quarterly survey of companies found exports had picked up in the last three months of 2004 to their best levels in eight years . The rise came despite exchange rates being cited as a major concern . However , the BCC found the whole UK economy still faced `` major risks '' and warned that growth is set to slow . It recently forecast economic growth will slow from more than 3 % in 2004 to a little below 2.5 % in both 2005 and 2006 .  Manufacturers ' domestic sales growth fell back slightly in the quarter , the survey of 5,196 firms found . Employment in manufacturing also fell and job expectations were at their lowest level for a year .  `` Despite some positive news for the export sector , there are worrying signs for manufacturing , '' the BCC said . `` These results reinforce our conce

In [4]:
# train = train[0:10]
# dev = dev[0:1]

## Tokens Index
e.g.
```
[Musicians to tackle US red tape Musicians ' groups are to tackle US visa regulations which are blamed for hindering => Nigel McCune from the Musicians ' Union said British musicians
    indexed as: [2, 3, 4, 5, 6, 7, 2, 8, 9, 10, 3, 4, 5, 11, 12, 13, 10, 14, 15, 16] => [3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 2],
 U2 's desire to be number one U2 , who have won three prestigious Grammy Awards for their hit Vertigo => But they still want more.They have to want to be
    indexed as: [17, 18, 19, 3, 20, 21, 22, 17, 23, 24, 25, 26, 27, 28, 29, 30, 15, 31, 32, 33] => [13, 14, 15, 16, 17, 18, 19, 16, 19, 20, 2],
 Rocker Doherty in on-stage fight Rock singer Pete Doherty has been involved in a fight with his band 's guitarist => Babyshambles , which he formed after his acrimonious departure from
    indexed as: [34, 35, 36, 37, 38, 39, 40, 41, 35, 42, 43, 44, 36, 45, 38, 46, 47, 48, 18, 49] => [21, 22, 23, 24, 25, 26, 27, 28, 29, 5, 2]]
```

In [5]:
train_data_indexed, dev_data_indexed, vocab_indexer = index_datasets(train, dev)

## Padding
- Pad the train/dev input vectors to the max length of the train/dev input documents.
- Pad the train/dev output vectors to the max length of the train/dev output summerization.

![](https://i.imgur.com/gGlkEEF.png)

In [6]:
def make_padded_input_tensor(exs, vocab_indexer, max_len):
    return np.array([[ex.x_indexed[i] if i < len(ex.x_indexed) else vocab_indexer.index_of(PAD_SYMBOL)
                        for i in range(0, max_len)] for ex in exs])

In [7]:
def make_padded_output_tensor(exs, vocab_indexer, max_len):
    return np.array([[ex.y_indexed[i] if i < len(ex.y_indexed) else vocab_indexer.index_of(PAD_SYMBOL)
                        for i in range(0, max_len)] for ex in exs])

## Batch

In [8]:
def batch_data(input_array, batch_size=2, cuda=False):
    input_batches = []
    batch_num = (int)(input_array.shape[0] / batch_size)
    start = 0
    for i in range(batch_num):
        batch = torch.from_numpy(input_array[start:start+batch_size, :])
        if cuda:
            batch = batch.cuda()
        input_batches.append(batch)
        start += batch_size
    if start != input_array.shape[0]:
        batch = torch.from_numpy(input_array[start:, :])
        if cuda:
            batch = batch.cuda()
        input_batches.append(batch)
    return input_batches

## Embedding

In [9]:
class EmbeddingLayer(nn.Module):
    # Parameters: dimension of the word embeddings, number of words, and the dropout rate to apply
    # (0.2 is often a reasonable value)
    def __init__(self, input_dim, full_dict_size, embedding_dropout_rate):
        super(EmbeddingLayer, self).__init__()
        self.dropout = nn.Dropout(embedding_dropout_rate)
        self.word_embedding = nn.Embedding(full_dict_size, input_dim)

    # Takes either a non-batched input [sent len x input_dim] or a batched input
    # [batch size x sent len x input dim]
    def forward(self, input):
        embedded_words = self.word_embedding(input)
        final_embeddings = self.dropout(embedded_words)
        return final_embeddings

## Encoder

In [10]:
# One-layer RNN encoder for batched inputs -- handles multiple sentences at once. You're free to call it with a
# leading dimension of 1 (batch size 1) but it does expect this dimension.
class RNNEncoder(nn.Module):
    # Parameters: input size (should match embedding layer), hidden size for the LSTM, dropout rate for the RNN,
    # and a boolean flag for whether or not we're using a bidirectional encoder
    def __init__(self, input_size, hidden_size, dropout, bidirect, CUDA=False):
        super(RNNEncoder, self).__init__()
        self.CUDA = CUDA
        self.bidirect = bidirect
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.reduce_h_W = nn.Linear(hidden_size * 2, hidden_size, bias=True)
        self.reduce_c_W = nn.Linear(hidden_size * 2, hidden_size, bias=True)
        self.rnn = nn.LSTM(input_size, hidden_size, num_layers=1, batch_first=True,
                               dropout=dropout, bidirectional=self.bidirect)
        self.init_weight()

    # Initializes weight matrices using Xavier initialization
    def init_weight(self):
        nn.init.xavier_uniform_(self.rnn.weight_hh_l0, gain=1)
        nn.init.xavier_uniform_(self.rnn.weight_ih_l0, gain=1)
        if self.bidirect:
            nn.init.xavier_uniform_(self.rnn.weight_hh_l0_reverse, gain=1)
            nn.init.xavier_uniform_(self.rnn.weight_ih_l0_reverse, gain=1)
        nn.init.constant_(self.rnn.bias_hh_l0, 0)
        nn.init.constant_(self.rnn.bias_ih_l0, 0)
        if self.bidirect:
            nn.init.constant_(self.rnn.bias_hh_l0_reverse, 0)
            nn.init.constant_(self.rnn.bias_ih_l0_reverse, 0)

    def get_output_size(self):
        return self.hidden_size * 2 if self.bidirect else self.hidden_size

    def sent_lens_to_mask(self, lens, max_length):
        return torch.from_numpy(np.asarray([[1 if j < lens.data[i].item() else 0 for j in range(0, max_length)] for i in range(0, lens.shape[0])]))

    # embedded_words should be a [batch size x sent len x input dim] tensor
    # input_lens is a tensor containing the length of each input sentence
    # Returns output (each word's representation), context_mask (a mask of 0s and 1s
    # reflecting where the model's output should be considered), and h_t, a *tuple* containing
    # the final states h and c from the encoder for each sentence.
    def forward(self, embedded_words, input_lens):
        # Takes the embedded sentences, "packs" them into an efficient Pytorch-internal representation
        packed_embedding = nn.utils.rnn.pack_padded_sequence(embedded_words, input_lens, batch_first=True)

        # Runs the RNN over each sequence. Returns output at each position as well as the last vectors of the RNN
        # state for each sentence (first/last vectors for bidirectional)
        output, hn = self.rnn(packed_embedding)
        

        # Unpacks the Pytorch representation into normal tensors
        output, sent_lens = nn.utils.rnn.pad_packed_sequence(output)
#         print('kdjfksdjfs: ', output.shape)
        
        # print('input_lens:', input_lens)
        max_length = input_lens.data[0].item()
        context_mask = self.sent_lens_to_mask(sent_lens, max_length)
        if self.CUDA:
            context_mask = context_mask.cuda()

        # Grabs the encoded representations out of hn, which is a weird tuple thing.
        # Note: if you want multiple LSTM layers, you'll need to change this to consult the penultimate layer
        # or gather representations from all layers.
        if self.bidirect:
            h, c = hn[0], hn[1]          # [2, 20, 200]
            # print('encoder hidden:----- ', h.shape)
            # print('encoder cell:----- ', c.shape)
            # Grab the representations from forward and backward LSTMs
            h_, c_ = torch.cat((h[0], h[1]), dim=1), torch.cat((c[0], c[1]), dim=1)      # [20, 400]
            # print('kdjfksdddddddddddjfs: ',h_.shape)
            # Reduce them by multiplying by a weight matrix so that the hidden size sent to the decoder is the same
            # as the hidden size in the encoder
            new_h = self.reduce_h_W(h_)
            new_c = self.reduce_c_W(c_)
            h_t = (new_h, new_c)
        else:
            h, c = hn[0][0], hn[1][0]
            h_t = (h, c)
        return (output, context_mask, h_t)

## Attention-based Decoder

In [11]:
class AttnRNNDecoderBahdanau(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, dropout, bidirect):
        super(AttnRNNDecoderBahdanau, self).__init__()

        self.input_size = input_size
#         self.sent_lens = sent_lens
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.dropout = dropout
        self.bidirect = bidirect
        self.rnn = nn.LSTM(input_size, hidden_size, num_layers=1, 
                                dropout=dropout, bidirectional=bidirect)
        
        
        self.context = nn.Linear(hidden_size * 2 + input_size, input_size)
        self.W_h = nn.Linear(2*hidden_size, 2*hidden_size)
        self.W_c = nn.Linear(1, hidden_size * 2, bias=False)
        self.W_s = nn.Linear(hidden_size * 2, hidden_size * 2)
        self.v = nn.Linear(hidden_size * 2, 1, bias=False)
        self.V = nn.Linear(hidden_size * 2 + hidden_size, hidden_size)
        self.V_p = nn.Linear(hidden_size, output_size)
        self.P_gen_layer = nn.Linear(hidden_size * 4 + input_size, 1)

        self.init_weight()
        
    # Initializes weight matrices using Xavier initialization
    def init_weight(self):
        nn.init.xavier_uniform_(self.rnn.weight_hh_l0, gain=1)
        nn.init.xavier_uniform_(self.rnn.weight_ih_l0, gain=1)
        if self.bidirect:
            nn.init.xavier_uniform_(self.rnn.weight_hh_l0_reverse, gain=1)
            nn.init.xavier_uniform_(self.rnn.weight_ih_l0_reverse, gain=1)
        nn.init.constant_(self.rnn.bias_hh_l0, 0)
        nn.init.constant_(self.rnn.bias_ih_l0, 0)
        if self.bidirect:
            nn.init.constant_(self.rnn.bias_hh_l0_reverse, 0)
            nn.init.constant_(self.rnn.bias_ih_l0_reverse, 0)

    def forward(self, embedded_words, dec_hidden, enc_output, context_mask, pre_cont, coverage):
        sent_lens = enc_output.shape[0]
        enc_feature = enc_output.view(-1, 2*hidden_size)                # batch_size*sent_lens, 2*hidden_size
        rnn_input = self.context(torch.cat((pre_cont, embedded_words), 1))   # batch_size, input_size
        dec_output, hn = self.rnn(rnn_input.unsqueeze(1).transpose(0, 1), dec_hidden)   # 1, batch_size, hidden_size
        dec_output = dec_output.transpose(0, 1)                             # batch_size, 1, hidden_size
        h_dec, c_dec = hn
        s_t_hat = torch.cat((h_dec.view(-1, hidden_size),
                             c_dec.view(-1, hidden_size)), 1)           # batch_size, 2*hidden_size
        del h_dec, c_dec
        
        # Attention Distribution
        dec_state = self.W_s(s_t_hat)                              # batch_size, 2*hidden_size
        dec_state_expanded = dec_state.unsqueeze(1).expand(dec_state.shape[0], sent_lens, dec_state.shape[1]).contiguous() # batch_size, sent_lens, 2*hidden_size
        dec_state_expanded = dec_state_expanded.view(-1, dec_state.shape[1])  # batch_size*sent_lens, 2*hidden_size
        del dec_state
        
#         print(enc_feature.shape)
#         print(dec_state_expanded.shape)
#         print(coverage.shape)
        e = self.v(torch.tanh(self.W_h(enc_feature) + dec_state_expanded + self.W_c(coverage.view(-1, 1)))).view(-1, sent_lens)  # batch_size, sent_lens
#         att_feature = enc_feature + dec_state_expanded   # batch_size*sent_lens, 2*hidden_size
#         # Coverage
#         coverage_feature = self.W_c(coverage.view(-1, 1))  # batch_size*sent_lens, 2*hidden_size
#         att_feature = att_feature + coverage_feature   # batch_size*sent_lens, 2*hidden_size
#         e = torch.tanh(att_feature)       # batch_size*sent_lens, 2*hidden_size
#         attn_scores = self.v(e).view(-1, self.sent_lens)      # batch_size, sent_lens
        del enc_feature
        attn_distrib_ = F.softmax(e, dim=1)*context_mask.float()   # batch_size, sent_lens
        del e
        norm_factor = attn_distrib_.sum(1, keepdim=True)
        attn_distrib = attn_distrib_ / norm_factor
        del attn_distrib_, norm_factor
        coverage = coverage + attn_distrib                  # batch_size, sent_lens
        attn_distrib = attn_distrib.unsqueeze(1)            # batch_size, 1, sent_lens
        
        # Context Vector
        cont_vec = torch.bmm(attn_distrib, enc_outputs.transpose(0, 1))  # batch_size, 1, 2*hidden_size
        concat_input = torch.cat((dec_output, cont_vec), dim=-1)           # batch_size, 1, enc_hidden_size * num_directions + dec_hidden_size

        # Vocabulary Distribution
        vocab_distrib = torch.softmax(self.V_p(self.V(concat_input)), dim=-1).squeeze(1)  # batch_size, output_size

        # Pointer Generator
        P_gen_input = torch.cat((cont_vec.squeeze(1), s_t_hat, rnn_input), dim=1)  # batch_size, (2*2*hidden_size + input_size)
        P_gen = self.P_gen_layer(P_gen_input)
        P_gen = torch.sigmoid(P_gen)       # batch_size, 1
        
        return (P_gen, vocab_distrib, hn, attn_distrib.squeeze(1), cont_vec.squeeze(1), coverage)
        

In [12]:
class AttnRNNDecoder(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, dropout, bidirect):
        super(AttnRNNDecoder, self).__init__()

        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.dropout = dropout
        self.bidirect = bidirect
        self.rnn = nn.LSTM(input_size, hidden_size, num_layers=1, 
                                dropout=dropout, bidirectional=bidirect)
        self.out = nn.Linear(hidden_size, output_size)
        self.concat = nn.Linear(hidden_size * 2 + hidden_size, hidden_size)
        self.linear = nn.Linear(hidden_size * 2, hidden_size)
        
        self.W_h = nn.Linear(hidden_size * 2, 1)
        self.W_s = nn.Linear(hidden_size, 1)
        self.W_x = nn.Linear(input_size, 1)
        self.init_weight()
        
    # Initializes weight matrices using Xavier initialization
    def init_weight(self):
        nn.init.xavier_uniform_(self.rnn.weight_hh_l0, gain=1)
        nn.init.xavier_uniform_(self.rnn.weight_ih_l0, gain=1)
        if self.bidirect:
            nn.init.xavier_uniform_(self.rnn.weight_hh_l0_reverse, gain=1)
            nn.init.xavier_uniform_(self.rnn.weight_ih_l0_reverse, gain=1)
        nn.init.constant_(self.rnn.bias_hh_l0, 0)
        nn.init.constant_(self.rnn.bias_ih_l0, 0)
        if self.bidirect:
            nn.init.constant_(self.rnn.bias_hh_l0_reverse, 0)
            nn.init.constant_(self.rnn.bias_ih_l0_reverse, 0)

    # enc_output: batch_size, sent_lens, 2*hidden_size
    def forward(self, embedded_words, dec_hidden, enc_outputs, context_mask):
        embedded_words = embedded_words.view(1, embedded_words.size(0), embedded_words.size(1))   # 1, batch_size, input_size
        context_mask = context_mask.type(torch.uint8).unsqueeze(1)      # batch_size, 1, sent_lens
  
        rnn_output, hn = self.rnn(embedded_words, dec_hidden)           # 1, batch_size, hidden_size
        rnn_output = rnn_output.transpose(0, 1)                         # batch_size, 1, hidden_size        
        
        attn_scores = rnn_output.bmm(self.linear(enc_outputs).transpose(0, 1).transpose(1, 2))      # batch_size, 1, sent_lens
        attn_scores.data.masked_fill(context_mask == 0, float('inf'))      # batch_size, 1, sent_lens
#         print(attn_scores.shape)
        
        # Attention Distribution
        attn_weights = F.softmax(attn_scores.squeeze(1), dim=1).unsqueeze(1)        # batch_size, 1, sent_lens
        # Context Vector
        context = attn_weights.bmm(enc_outputs.transpose(0, 1))                     # batch_size, 1, hidden_size * num_directions
        concat_input = torch.cat((context, rnn_output), dim=-1)     # batch_size, 1, enc_hidden_size * num_directions + dec_hidden_size
        concat_output = torch.tanh(self.concat(concat_input))                       # batch_size, 1, dec_hidden_size
        # Vocabulary Distribution
        output = self.out(concat_output).squeeze(1)                                 # batch_size, output_size
        # Pointer-Generator
        p_gen = torch.sigmoid(self.W_h(context) + self.W_s(rnn_output) + self.W_x(embedded_words.transpose(0, 1))).squeeze(1)  # batch_size, 1
#         print(p_gen.shape)
        
        return (p_gen, output, hn, attn_weights.squeeze(1))

## Encoder to Decoder

In [13]:
def encode_input_for_decoder(x_tensor, inp_lens_tensor, model_input_emb, model_enc):
    input_emb = model_input_emb.forward(x_tensor)
    (enc_output_each_word, enc_context_mask, enc_final_states) = model_enc.forward(input_emb, inp_lens_tensor)
    enc_final_states_reshaped = (enc_final_states[0].unsqueeze(0), enc_final_states[1].unsqueeze(0))
    return (enc_output_each_word, enc_context_mask, enc_final_states_reshaped)

## Loss Function

In [14]:
# Implementation of loss function: masked cross entropy
# Reference to https://github.com/spro/practical-pytorch, make some modifications
def masked_cross_entropy(logits, target, length, context_mask):
    logits_flat = logits.view(-1, logits.size(-1))                  # batch_size * sent_len, vocab_size
    log_probs_flat = F.log_softmax(logits_flat, dim=-1)             # batch_size * sent_len, vocab_size
    target_flat = target.view(-1, 1)                                # batch * sent_len, 1
    losses_flat = -torch.gather(log_probs_flat, dim=1, index=target_flat)    # batch * max_len, 1
    losses = losses_flat.view(*target.size())     # batch, max_len
    losses = losses * context_mask.float()
    loss = losses.sum() / length.float().sum()
    return loss

In [15]:
def cal_step_loss(final_distrib, attn_distrib, coverage, step_Y_tensor, step_context_mask):
    step_distrib = torch.gather(final_distrib, dim=1, index=step_Y_tensor).squeeze(1)
    step_coverage_loss = torch.sum(torch.min(attn_distrib, coverage), 1)
    step_loss = -torch.log(step_distrib + eps) + cov_loss_wt*step_coverage_loss
    step_loss = step_loss * step_context_mask
    return step_loss

## Training Step

**Training Copy**

In [16]:
def pointer_generate_train(p_gen, dec_output, dec_attn, X_tensors, CUDA):
#     final_distrib = torch.zeros(dec_output.shape, dtype=torch.float)     # batch_size, vocab_size
    final_distrib = p_gen * dec_output
    dec_attn_padding = torch.zeros(X_tensors.shape, dtype=torch.float)
    if CUDA:
        dec_attn_padding = dec_attn_padding.cuda()
    dec_attn_padding[:, 0:dec_attn.shape[1]] = dec_attn 
    final_distrib = final_distrib.scatter_add(1, X_tensors, (1-p_gen)*dec_attn_padding)
    return final_distrib

In [17]:
BATCH_SIZE = 1
lr = 0.0005
input_dim = 100
output_dim = 100
hidden_size = 256
emb_dropout = 0.2
rnn_dropout = 0.2
bidirectional = True
num_epochs = 25
teacher_forcing_ratio = 1
cov_loss_wt = 1.0
eps = 1e-12
CUDA = True
pad_idx = vocab_indexer.index_of(PAD_SYMBOL)
criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)

**Create indexed input/output for training**
- X_tensors_batch/Y_tensors_batch, list[array: [batch_size, sent_len], batch_num]
- inp_lens_batch/oup_lens_batch, list[array: [batch_size,], batch_num]

In [18]:
# Create indexed input/output for training
train_data_indexed.sort(key=lambda ex: len(ex.x_indexed), reverse=True)
input_train_max_len = np.max(np.asarray([len(ex.x_indexed) for ex in train_data_indexed]))
all_train_input_data = make_padded_input_tensor(train_data_indexed, vocab_indexer, input_train_max_len).astype(np.int64)

output_train_max_len = np.max(np.asarray([len(ex.y_indexed) for ex in train_data_indexed]))
all_train_output_data = make_padded_output_tensor(train_data_indexed, vocab_indexer, output_train_max_len).astype(np.int64)

X_tensors_batch = batch_data(all_train_input_data, BATCH_SIZE, cuda=CUDA)   # batch_num, batch_size, sent_len
Y_tensors_batch = batch_data(all_train_output_data, BATCH_SIZE, cuda=CUDA)  # batch_num, batch_size, sent_len
if CUDA:
    inp_lens_batch = [torch.tensor([torch.sum(X_tensor != 0) for X_tensor in X_tensors]).cuda() for X_tensors in X_tensors_batch]  # batch_num, batch_size
    oup_lens_batch = [torch.tensor([torch.sum(Y_tensor != 0) for Y_tensor in Y_tensors]).cuda() for Y_tensors in Y_tensors_batch]  # batch_num, batch_size
else:
    inp_lens_batch = [torch.tensor([torch.sum(X_tensor != 0) for X_tensor in X_tensors]) for X_tensors in X_tensors_batch]  # batch_num, batch_size
    oup_lens_batch = [torch.tensor([torch.sum(Y_tensor != 0) for Y_tensor in Y_tensors]) for Y_tensors in Y_tensors_batch]  # batch_num, batch_size

**Create model**
- model_input_emb/model_output_emb: embedding layer
- model_enc/model_dec: encoder/decoder
- optimizers: encoder/decoder

In [None]:
# Create model
model_input_emb = EmbeddingLayer(input_dim, len(vocab_indexer), emb_dropout)
model_enc = RNNEncoder(input_dim, hidden_size, rnn_dropout, bidirectional, CUDA=CUDA)
model_output_emb = EmbeddingLayer(output_dim, len(vocab_indexer), emb_dropout)
model_dec = AttnRNNDecoderBahdanau(input_size=output_dim, hidden_size=hidden_size, output_size=len(vocab_indexer), dropout=rnn_dropout, bidirect=False)
# CUDA
if CUDA:
    model_input_emb.cuda()
    model_enc.cuda()
    model_output_emb.cuda()
    model_dec.cuda()
# model_dec = AttnRNNDecoder(input_size=output_dim, hidden_size=hidden_size, output_size=len(vocab_indexer), dropout=rnn_dropout, bidirect=False)
enc_optimizer = optim.Adam(model_enc.parameters(), lr=lr)
dec_optimizer = optim.Adam(model_dec.parameters(), lr=lr)

  "num_layers={}".format(dropout, num_layers))


**Train Iteration**

In [None]:
start = time.time()
for epoch in range(0, num_epochs):
    print('--------------------- Epoch %d ---------------------'%(epoch+1))
    for X_tensors, Y_tensors, inp_lens_tensor, oup_lens_tensor in zip(X_tensors_batch, Y_tensors_batch, inp_lens_batch, oup_lens_batch):

        model_enc.train()
        model_dec.train()

        enc_optimizer.zero_grad()
        dec_optimizer.zero_grad()
        
        # Encoder
        enc_outputs, enc_context_mask, enc_hidden = encode_input_for_decoder(X_tensors, inp_lens_tensor, model_input_emb, model_enc)
        init_dec_inp = Variable(torch.LongTensor([vocab_indexer.index_of(SOS_SYMBOL)] * X_tensors.shape[0]))  
        if CUDA:
            init_dec_inp = init_dec_inp.cuda()
        dec_input = model_output_emb.forward(init_dec_inp)
        dec_hidden = enc_hidden
        cont = torch.zeros((X_tensors.shape[0], 2 * hidden_size))    # batch_size, 2*hidden_size
        coverage = torch.zeros((X_tensors.shape[0], enc_outputs.shape[0]))             # batch_size, sent_lens
        all_context_mask = torch.from_numpy(np.asarray([[1 if j < oup_lens_tensor.data[i].item() \
            else 0 for j in range(0, Y_tensors.size(1))] for i in range(0, oup_lens_tensor.shape[0])]))
        agr_loss = torch.zeros(X_tensors.shape[0])
        
        if CUDA:
            cont = cont.cuda()
            coverage = coverage.cuda()
            all_context_mask = all_context_mask.cuda()
            agr_loss = agr_loss.cuda()
        
#         # Decoder
#         all_dec_outputs = Variable(torch.zeros(output_train_max_len, X_tensors.shape[0], len(vocab_indexer)))   # sent_len, batch_size, ext_output_size
#         for idx in range(output_train_max_len):
#             p_gen, dec_output, dec_hidden, dec_attn = model_dec.forward(dec_input, dec_hidden, enc_outputs, enc_context_mask)
#             all_dec_outputs[idx] = pointer_generate_train(p_gen, dec_output, dec_attn, X_tensors)
#             max_prob_idx = torch.argmax(all_dec_outputs[idx], dim=1)
#             print('max_prob_idx:', max_prob_idx)
#             print('Y_tensors:', Y_tensors[:, idx])
# #             print(agr_loss)
# #             agr_loss = agr_loss + cal_step_loss(final_distrib, dec_attn, coverage, Y_tensors[:, idx].unsqueeze(1), all_context_mask[:, idx].float())
#             dec_input = model_output_emb.forward(Y_tensors[:, idx])
# #             coverage = next_coverage         
#         loss = masked_cross_entropy(all_dec_outputs.transpose(0, 1).contiguous(), Y_tensors, oup_lens_tensor, all_context_mask)       # batch_size, sent_len, output_size
        
        
        # Decoder2
        for idx in range(output_train_max_len):
            p_gen, dec_output, dec_hidden, dec_attn, cont, next_coverage = model_dec.forward(dec_input, dec_hidden, enc_outputs, enc_context_mask, cont, coverage)
            final_distrib = pointer_generate_train(p_gen, dec_output, dec_attn, X_tensors, CUDA)
            max_prob_idx = torch.argmax(final_distrib, dim=1)
#             print(final_distrib)
#             print('max_prob_idx:', max_prob_idx)
#             print('Y_tensors:', Y_tensors[:, idx])
#             print(agr_loss)
            agr_loss = agr_loss + cal_step_loss(final_distrib, dec_attn, coverage, Y_tensors[:, idx].unsqueeze(1), all_context_mask[:, idx].float())
            dec_input = model_output_emb.forward(Y_tensors[:, idx])
            coverage = next_coverage
        
        batch_avg_loss = agr_loss/oup_lens_tensor.float()
        loss = torch.mean(batch_avg_loss)
        
        loss.backward()

        enc_optimizer.step()
        dec_optimizer.step()

        print('loss', loss.item())
        
elapsed_time = time.time() - start
print('Time: %.2fs'%(elapsed_time))

--------------------- Epoch 1 ---------------------
loss 7.236730575561523
loss 7.124670505523682
loss 6.96816349029541
loss 6.975282669067383
loss 7.01713752746582
loss 6.953428745269775
loss 6.878820419311523
loss 6.938361167907715
loss 6.715261459350586
loss 6.705573081970215
loss 6.748377799987793
loss 6.699498653411865
loss 6.476116180419922
loss 6.441355228424072
loss 6.766495704650879
loss 6.412702560424805
loss 6.577016830444336
loss 6.421826362609863
loss 6.380306720733643
loss 6.497630596160889
loss 6.3838887214660645
loss 6.344750881195068
loss 6.3715739250183105
loss 6.437589168548584
loss 6.234041690826416
loss 6.296297550201416
loss 6.486415863037109
loss 6.428764820098877
loss 6.443448543548584
loss 6.4879841804504395
loss 6.483123302459717
loss 6.384026527404785
loss 6.303886413574219
loss 6.330592155456543
loss 6.124269962310791
loss 6.469512462615967
loss 6.269493579864502
loss 6.2835307121276855
loss 6.2869873046875
loss 6.265032768249512
loss 6.252707481384277
loss 

## Evaluation Step

**Entended Vocabulary**

In [20]:
ext_vocab_indexer = copy.deepcopy(vocab_indexer)
for (x, y) in dev:
    for x_tok, y_tok in zip(tokenize(x), tokenize(y)):
        ext_vocab_indexer.get_index(x_tok)
        ext_vocab_indexer.get_index(y_tok)
        
ext_dev_data_indexed = index_data(dev, ext_vocab_indexer)
ext_dev_data_indexed.sort(key=lambda ex: len(ex.x_indexed), reverse=True)
ext_input_dev_max_len = np.max(np.asarray([len(ex.x_indexed) for ex in ext_dev_data_indexed]))
ext_all_dev_input_data = make_padded_input_tensor(ext_dev_data_indexed, ext_vocab_indexer, ext_input_dev_max_len).astype(np.int64)
ext_X_tensors_batch_dev = batch_data(ext_all_dev_input_data, BATCH_SIZE)   # batch_num, batch_size, sent_len

**Evaluation Copy**

In [21]:
def pointer_generate_dev(p_gen, dec_output, dec_attn, ext_X_tensors, vocab_indexer, ext_vocab_indexer):
    final_distrib = torch.zeros([dec_output.shape[0], len(ext_vocab_indexer)], dtype=torch.float)    # add new words in the end
    final_distrib[:, 0:len(vocab_indexer)] = p_gen * dec_output
    dec_attn_padding = torch.zeros(ext_X_tensors.shape, dtype=torch.float)
    dec_attn_padding[:, 0:dec_attn.shape[1]] = dec_attn
    final_distrib = final_distrib.scatter_add(1, ext_X_tensors, (1-p_gen)*dec_attn_padding)
    final_distrib[:, vocab_indexer.index_of(UNK_SYMBOL)] = 0
    return final_distrib

**Create indexed input/output for development**

In [22]:
# Create indexed input/output for dev
dev_data_indexed.sort(key=lambda ex: len(ex.x_indexed), reverse=True)
input_dev_max_len = np.max(np.asarray([len(ex.x_indexed) for ex in dev_data_indexed]))
all_dev_input_data = make_padded_input_tensor(dev_data_indexed, vocab_indexer, input_dev_max_len).astype(np.int64)
output_dev_max_len = np.max(np.asarray([len(ex.y_indexed) for ex in dev_data_indexed]))
X_tensors_batch_dev = batch_data(all_dev_input_data, BATCH_SIZE)   # batch_num, batch_size, sent_len
inp_lens_batch_dev = [torch.tensor([torch.sum(X_tensor != 0) for X_tensor in X_tensors]) for X_tensors in X_tensors_batch_dev]  # batch_num, batch_size

In [23]:
best_data = []
model_enc.eval()
model_dec.eval()
for X_tensors, inp_lens_tensor, ext_X_tensors in zip(X_tensors_batch_dev, inp_lens_batch_dev, ext_X_tensors_batch_dev):
    enc_outputs, enc_context_mask, enc_hidden = encode_input_for_decoder(X_tensors, inp_lens_tensor, model_input_emb, model_enc)
    dec_hidden = enc_hidden
    dec_input = model_output_emb.forward(Variable(torch.LongTensor([vocab_indexer.index_of(SOS_SYMBOL)] * X_tensors.shape[0])))
    cont = torch.zeros((X_tensors.shape[0], 2 * hidden_size))    # batch_size, 2*hidden_size
    coverage = torch.zeros((X_tensors.shape[0], enc_outputs.shape[0]))             # batch_size, sent_lens
    all_dec_outputs = Variable(torch.zeros(output_dev_max_len, X_tensors.shape[0], len(ext_vocab_indexer)))   # sent_len, batch_size, ext_output_size
    for idx in range(output_dev_max_len):
        p_gen, dec_output, dec_hidden, dec_attn, cont, next_coverage = model_dec.forward(dec_input, dec_hidden, enc_outputs, enc_context_mask, cont, coverage)
        all_dec_outputs[idx] = pointer_generate_dev(p_gen, dec_output, dec_attn, ext_X_tensors, vocab_indexer, ext_vocab_indexer)
        max_prob_idx = torch.argmax(all_dec_outputs[idx], dim=1)
        max_prob_idx[max_prob_idx >= len(vocab_indexer)] = vocab_indexer.index_of(UNK_SYMBOL)   # new words should be UNK when serving as next input
        dec_input = model_output_emb.forward(max_prob_idx)
    for best_sent in torch.argmax(all_dec_outputs, dim=2).transpose(0, 1).contiguous():
        best_ex = []
        for word_idx in best_sent:            # don't need to include EOS tok
            if word_idx.item() == ext_vocab_indexer.index_of(EOS_SYMBOL):
                break
            best_ex.append(ext_vocab_indexer.get_object(word_idx.item()))     # pred tok
        best_data.append(best_ex)
            
rouge = Rouge()
for test_ex, best_ex in zip(dev_data_indexed, best_data):
    test_str = ' '.join(test_ex.y_tok)
    best_str = ' '.join(best_ex)
    print(test_str)
    print(best_str)
    scores = rouge.get_scores(best_str, test_str)
    print(scores)

Nigel McCune from the Musicians ' Union said British musicians
Nigel McCune To sold anyone Musicians firm theory to swiftly
[{'rouge-1': {'f': 0.2999999950000001, 'p': 0.3, 'r': 0.3}, 'rouge-2': {'f': 0.11111110611111134, 'p': 0.1111111111111111, 'r': 0.1111111111111111}, 'rouge-l': {'f': 0.2999999999995, 'p': 0.3, 'r': 0.3}}]
But they still want more.They have to want to be
But the video want want to want to want to
[{'rouge-1': {'f': 0.4285714239795918, 'p': 0.6, 'r': 0.3333333333333333}, 'rouge-2': {'f': 0.24999999531250006, 'p': 0.3333333333333333, 'r': 0.2}, 'rouge-l': {'f': 0.3723653395784906, 'p': 0.6, 'r': 0.3333333333333333}}]
Babyshambles , which he formed after his acrimonious departure from
Babyshambles , who has been done to match from a
[{'rouge-1': {'f': 0.2999999950000001, 'p': 0.3, 'r': 0.3}, 'rouge-2': {'f': 0.11111110611111134, 'p': 0.1111111111111111, 'r': 0.1111111111111111}, 'rouge-l': {'f': 0.2999999999995, 'p': 0.3, 'r': 0.3}}]
A Series of Unfortunate Events als

In a separate case relating to the same incident ,
In a separate case relating relating to the same incident
[{'rouge-1': {'f': 0.9473684160664821, 'p': 1.0, 'r': 0.9}, 'rouge-2': {'f': 0.888888883888889, 'p': 0.8888888888888888, 'r': 0.8888888888888888}, 'rouge-l': {'f': 0.9421631000574496, 'p': 1.0, 'r': 0.9}}]
The Jimi Hendrix Experience 's first single Hey Joe ,
The Jimi Hendrix Experience has been licensed by the Scottish
[{'rouge-1': {'f': 0.3999999950000001, 'p': 0.4, 'r': 0.4}, 'rouge-2': {'f': 0.3333333283333334, 'p': 0.3333333333333333, 'r': 0.3333333333333333}, 'rouge-l': {'f': 0.39999999999950003, 'p': 0.4, 'r': 0.4}}]
His song Here We Go Again with Norah Jones won
A Windows Here many with Go Again only Again Norah
[{'rouge-1': {'f': 0.5263157844875347, 'p': 0.5555555555555556, 'r': 0.5}, 'rouge-2': {'f': 0.11111110611111134, 'p': 0.1111111111111111, 'r': 0.1111111111111111}, 'rouge-l': {'f': 0.4187391555808737, 'p': 0.4444444444444444, 'r': 0.4}}]
A controversial film star

The figures confounded hopes hopes , the of life expansion
[{'rouge-1': {'f': 0.5999999950500001, 'p': 0.6666666666666666, 'r': 0.5454545454545454}, 'rouge-2': {'f': 0.31578946869806096, 'p': 0.3333333333333333, 'r': 0.3}, 'rouge-l': {'f': 0.5883495145628266, 'p': 0.6666666666666666, 'r': 0.5454545454545454}}]
The IMF will also reduce its growth estimate for the
Mr Straw said that 10 % of the gadget phones
[{'rouge-1': {'f': 0.09999999500000027, 'p': 0.1, 'r': 0.1}, 'rouge-2': {'f': 0.0, 'p': 0.0, 'r': 0.0}, 'rouge-l': {'f': 0.0999999999995, 'p': 0.1, 'r': 0.1}}]
Net profit surged 70 % to 2.39bn rupees ( $
Net profit surged surged 70 ( created ) ( )
[{'rouge-1': {'f': 0.5555555508024692, 'p': 0.7142857142857143, 'r': 0.45454545454545453}, 'rouge-2': {'f': 0.31578946869806096, 'p': 0.3333333333333333, 'r': 0.3}, 'rouge-l': {'f': 0.5077658303464327, 'p': 0.7142857142857143, 'r': 0.45454545454545453}}]
The US stock market regulator is investigating troubled insurance broker
The US The Tro

European leaders say Asian states must let their currencies rise
European leaders say Asian say say let currencies currencies the
[{'rouge-1': {'f': 0.7058823480968859, 'p': 0.8571428571428571, 'r': 0.6}, 'rouge-2': {'f': 0.3333333283333334, 'p': 0.3333333333333333, 'r': 0.3333333333333333}, 'rouge-l': {'f': 0.6656738644823815, 'p': 0.8571428571428571, 'r': 0.6}}]
Trade Minister Mark Vaile has said that the bid may
Trade Minister Mark Vaile said that the bid may broadband
[{'rouge-1': {'f': 0.899999995, 'p': 0.9, 'r': 0.9}, 'rouge-2': {'f': 0.7777777727777778, 'p': 0.7777777777777778, 'r': 0.7777777777777778}, 'rouge-l': {'f': 0.8999999999995, 'p': 0.9, 'r': 0.9}}]
`` Obtaining majority control over Gazprom is the beginning of
`` You do the majority mouse control to the beginning
[{'rouge-1': {'f': 0.5263157844875347, 'p': 0.5555555555555556, 'r': 0.5}, 'rouge-2': {'f': 0.11111110611111134, 'p': 0.1111111111111111, 'r': 0.1111111111111111}, 'rouge-l': {'f': 0.5234239444761888, 'p': 0.5

Kenteris said that them % of all camera networks many
[{'rouge-1': {'f': 0.09999999500000027, 'p': 0.1, 'r': 0.1}, 'rouge-2': {'f': 0.0, 'p': 0.0, 'r': 0.0}, 'rouge-l': {'f': 0.0999999999995, 'p': 0.1, 'r': 0.1}}]
Sotherton has also displayed promise , with a new high
The event will be on the US series was the
[{'rouge-1': {'f': 0.0, 'p': 0.0, 'r': 0.0}, 'rouge-2': {'f': 0.0, 'p': 0.0, 'r': 0.0}, 'rouge-l': {'f': 0.0, 'p': 0.0, 'r': 0.0}}]
Just Chepkemei and Radcliffe were left in contention as the
Just Chepkemei , who died the most of Grid networks
[{'rouge-1': {'f': 0.2999999950000001, 'p': 0.3, 'r': 0.3}, 'rouge-2': {'f': 0.11111110611111134, 'p': 0.1111111111111111, 'r': 0.1111111111111111}, 'rouge-l': {'f': 0.2999999999995, 'p': 0.3, 'r': 0.3}}]
Brentford face a home tie against holders Manchester United in
The tie the in home Department said that games
[{'rouge-1': {'f': 0.31578946869806096, 'p': 0.3333333333333333, 'r': 0.3}, 'rouge-2': {'f': 0.0, 'p': 0.0, 'r': 0.0}, 'rouge-l':

[{'rouge-1': {'f': 0.2999999950000001, 'p': 0.3, 'r': 0.3}, 'rouge-2': {'f': 0.11111110611111134, 'p': 0.1111111111111111, 'r': 0.1111111111111111}, 'rouge-l': {'f': 0.2999999999995, 'p': 0.3, 'r': 0.3}}]
The five unions meeting Mr Prescott want the government to
The five unions meeting Mr Prescott was the most of
[{'rouge-1': {'f': 0.699999995, 'p': 0.7, 'r': 0.7}, 'rouge-2': {'f': 0.5555555505555557, 'p': 0.5555555555555556, 'r': 0.5555555555555556}, 'rouge-l': {'f': 0.6999999999994999, 'p': 0.7, 'r': 0.7}}]
Asked whether he had not returned to work immediately because
Asked whether technology thought Mr Brown was stopping to rule
[{'rouge-1': {'f': 0.2999999950000001, 'p': 0.3, 'r': 0.3}, 'rouge-2': {'f': 0.11111110611111134, 'p': 0.1111111111111111, 'r': 0.1111111111111111}, 'rouge-l': {'f': 0.2999999999995, 'p': 0.3, 'r': 0.3}}]
`` But let us not forget two thirds of those
`` The technology is going going to be increasingly as
[{'rouge-1': {'f': 0.1052631529085875, 'p': 0.11111111

In [30]:
dev

[["Musicians to tackle US red tape Musicians ' groups are to tackle US visa regulations which are blamed for hindering",
  "Nigel McCune from the Musicians ' Union said British musicians"],
 ["U2 's desire to be number one U2 , who have won three prestigious Grammy Awards for their hit Vertigo",
  'But they still want more.They have to want to be'],
 ["Rocker Doherty in on-stage fight Rock singer Pete Doherty has been involved in a fight with his band 's guitarist",
  'Babyshambles , which he formed after his acrimonious departure from'],
 ['Snicket tops US box office chart The film adaptation of Lemony Snicket novels has topped the North America box office',
  'A Series of Unfortunate Events also stars Scottish comedian Billy'],
 ["Ocean 's Twelve raids box office Ocean 's Twelve , the crime caper sequel starring George Clooney , Brad Pitt",
  "Ocean 's Twelve , the crime caper sequel starring George"],
 ["'Landmark movies ' of 2004 hailed US film professionals have declared Fahrenhei

In [37]:
dev_data_indexed

[Musicians to tackle US red tape Musicians ' groups are to tackle US visa regulations which are blamed for hindering British acts ' chances of succeeding across the Atlantic . A singer hoping to perform in the US can expect to pay $ 1,300 ( £680 ) simply for obtaining a visa . Groups including the Musicians ' Union are calling for an end to the `` raw deal '' faced by British performers . US acts are not faced with comparable expense and bureaucracy when visiting the UK for promotional purposes . Nigel McCune from the Musicians ' Union said British musicians are `` disadvantaged '' compared to their US counterparts . A sponsor has to make a petition on their behalf , which is a form amounting to nearly 30 pages , while musicians face tougher regulations than athletes and journalists . `` If you make a mistake on your form , you risk a five-year ban and thus the ability to further your career , '' says Mr McCune . `` The US is the world 's biggest music market , which means something ha

In [None]:
# Create indexed input/output for training
# train_data_indexed.sort(key=lambda ex: len(ex.x_indexed), reverse=True)
# input_train_max_len = np.max(np.asarray([len(ex.x_indexed) for ex in train_data_indexed]))
# all_train_input_data = make_padded_input_tensor(train_data, input_indexer, input_train_max_len, args.reverse_input)
# output_train_max_len = np.max(np.asarray([len(ex.y_indexed) for ex in train_data]))
# all_train_output_data = make_padded_output_tensor(train_data, output_indexer, output_train_max_len)


# # Create indexed input/output for dev
# dev_data_indexed.sort(key=lambda ex: len(ex.x_indexed), reverse=True)
# input_dev_max_len = np.max(np.asarray([len(ex.x_indexed) for ex in dev_data_indexed]))
# all_dev_input_data = make_padded_input_tensor(dev_data_indexed, input_indexer, input_dev_max_len, args.reverse_input)
# output_dev_max_len = np.max(np.asarray([len(ex.y_indexed) for ex in dev_data_indexed]))
# all_dev_output_data = make_padded_output_tensor(dev_data_indexed, output_indexer, output_dev_max_len)



#         else:
#             for idx in range(output_train_max_len):
# #                 dec_output, dec_hidden = model_dec.forward(dec_input, dec_hidden)
#                 p_gen, dec_output, dec_hidden, dec_attn = model_dec.forward(dec_input, dec_hidden, enc_outputs, enc_context_mask)
#                 all_dec_outputs[idx] = pointer_generate_train(p_gen, dec_output, dec_attn, X_tensors)
#                 max_prob_idx = torch.argmax(all_dec_outputs[idx], dim=1)
#                 dec_input = model_output_emb.forward(max_prob_idx)

#         all_context_mask = torch.from_numpy(np.asarray([[1 if j < oup_lens_tensor.data[i].item() \
#             else 0 for j in range(0, Y_tensors.size(1))] for i in range(0, oup_lens_tensor.shape[0])], dtype=np.uint8))
#         loss = masked_cross_entropy(all_dec_outputs.transpose(0, 1).contiguous(), Y_tensors, oup_lens_tensor, all_context_mask)       # batch_size, sent_len, output_size
#                                                                                                                                       # batch_size, sent_len
#         Y_resize = torch.transpose(Y_tensors, 0, 1).contiguous()
#         loss = criterion(all_dec_outputs.view(-1, all_dec_outputs.shape[2]), Y_resize.view(-1))

# use_teacher_forcing = True if random.random() < teacher_forcing_ratio else False