#  Sequence to Sequence - Article Summarization

In [75]:
from __future__ import unicode_literals
from io import open
import unicodedata
import string
import re
import random


import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pack_padded_sequence

import numpy as np
import pandas as pd
import pickle

from gensim.models import KeyedVectors
from rouge import Rouge

In [2]:
path = './'
vocab = pickle.load(open(path + 'vocab.pkl', "rb" ))
inv_vocab = pickle.load(open(path + 'inv_vocab.pkl', "rb" ))

In [3]:
path = './'
train_df = pickle.load(open(path + 'train_df.pkl', "rb"))
train_df.head()

Unnamed: 0,text,summary
0,"[1, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, ...","[1, 148, 17, 149, 150, 112, 151, 136, 60, 79, ..."
1,"[1, 742, 743, 11, 646, 307, 744, 132, 596, 745...","[1, 11, 762, 763, 764, 769, 770, 771, 272, 772..."
2,"[1, 910, 940, 840, 941, 132, 942, 569, 943, 94...","[1, 11, 954, 947, 948, 949, 950, 72, 951, 1027..."
3,"[1, 136, 1153, 1154, 910, 1155, 1156, 1157, 11...","[1, 910, 1155, 1156, 1157, 1158, 265, 167, 422..."
4,"[1, 1413, 132, 1414, 1415, 1416, 1417, 1418, 3...","[1, 1510, 1435, 1427, 1428, 1511, 1413, 17, 15..."


In [None]:
path = './'
dev_df = pickle.load(open(path + 'dev_df.pkl', "rb"))
dev_df.head()

In [4]:
type(train_df.text[0][0])

int

In [5]:
print(vocab['bro'])
print(inv_vocab[27578])

27578
bro


# Dataset

In [6]:
train_df.head()

Unnamed: 0,text,summary
0,"[1, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, ...","[1, 148, 17, 149, 150, 112, 151, 136, 60, 79, ..."
1,"[1, 742, 743, 11, 646, 307, 744, 132, 596, 745...","[1, 11, 762, 763, 764, 769, 770, 771, 272, 772..."
2,"[1, 910, 940, 840, 941, 132, 942, 569, 943, 94...","[1, 11, 954, 947, 948, 949, 950, 72, 951, 1027..."
3,"[1, 136, 1153, 1154, 910, 1155, 1156, 1157, 11...","[1, 910, 1155, 1156, 1157, 1158, 265, 167, 422..."
4,"[1, 1413, 132, 1414, 1415, 1416, 1417, 1418, 3...","[1, 1510, 1435, 1427, 1428, 1511, 1413, 17, 15..."


In [None]:
dev_df.head()

In [7]:
a = [1, 2, 4, 6]
torch.Tensor(a)

tensor([1., 2., 4., 6.])

In [8]:
def collate_fn(data):
    """Creates mini-batch tensors from the list of tuples (sentences, labels).
    
    Need custom collate_fn because merging sequences (including padding) is not 
    supported in default. Sequences are padded to the maximum length of mini-batch 
    sequences (dynamic padding).
    
    Args:
        data: list of tuple (article, summary). 
            - each is list of word indices of variable length
    Returns:
        packed_batch: (PackedSequence), see torch.nn.utils.rnn.pack_padded_sequence
        sencences: torch tensor of shape (batch_size, max_len).
        labels: torch tensor of shape (batch_size, 1).
        lengths: list; valid length for each padded sentence. 
    """
    # Sort a data list by sentences length (descending order).
    data.sort(key=lambda x: len(x[0]), reverse=True)
    articles, summaries = zip(*data)
        
    # Merge sentences
    lengths1 = [len(s) for s in articles]
    lengths2 = [len(s) for s in summaries]
   
    arts = torch.zeros(len(articles), max(lengths1)).long()
    summs = torch.zeros(len(summaries), max(lengths2)).long()
    
    for i, a in enumerate(articles):
        l = lengths1[i]
        arts[i, -l:] = torch.Tensor(a[:l])
        
    for i, s in enumerate(summaries):
        l = lengths2[i]
        summs[i, :l] = torch.Tensor(s[:l])
    
    return arts, summs, lengths1

In [9]:
class SummarizationDataset(Dataset):
    def __init__(self, df): #pairs, input_lang, output_lang):
        self.df = df
        self.article = df.text.values
        self.summary = df.summary.values
    
    def __len__(self):
        return len(self.article)
    
    def __getitem__(self, idx):
        x = self.article[idx]
        y = self.summary[idx]
        return x, y

In [10]:
# import ast
# df['text'].apply(lambda x: ast.literal_eval(x))
# df['summary'].apply(lambda x: ast.literal_eval(x))

In [11]:
text_lens = train_df.text.map(lambda x: len(x))
summary_lens = train_df.summary.map(lambda x: len(x))
text_lens.describe(), summary_lens.describe()

(count    995041.000000
 mean        587.993332
 std         764.028830
 min           2.000000
 25%         246.000000
 50%         467.000000
 75%         751.000000
 max      102471.000000
 Name: text, dtype: float64, count    995041.000000
 mean         25.583946
 std          23.694143
 min           2.000000
 25%          16.000000
 50%          22.000000
 75%          28.000000
 max        5678.000000
 Name: summary, dtype: float64)

In [12]:
train_df = train_df[(text_lens > 246) &(text_lens < 751) & (summary_lens < 28) & (summary_lens > 16)]
train_df.shape

(55508, 2)

In [14]:
train_ds = SummarizationDataset(train_df)
valid_ds = SummarizationDataset(dev_df)

In [15]:
train_ds[0][1]

[1,
 6579,
 106,
 1276,
 1302,
 103,
 3992,
 3719,
 4567,
 132,
 12973,
 2849,
 2129,
 4991,
 14,
 9,
 3385,
 3704,
 3294,
 2]

In [16]:
batch_size=5
train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
valid_dl = DataLoader(valid_ds, batch_size=batch_size, collate_fn=collate_fn)

In [17]:
arts, summs, l = next(iter(train_dl))

In [18]:
arts.shape, summs.shape

(torch.Size([5, 433]), torch.Size([5, 21]))

## The Seq2Seq Model

A Recurrent Neural Network, or RNN, is a network that operates on a
sequence and uses its own output as input for subsequent steps.

A `Sequence to Sequence network <https://arxiv.org/abs/1409.3215>`__, or
seq2seq network, or `Encoder Decoder
network <https://arxiv.org/pdf/1406.1078v3.pdf>`__, is a model
consisting of two RNNs called the encoder and decoder. The encoder reads
an input sequence and outputs a single vector, and the decoder reads
that vector to produce an output sequence.

### The Encoder

The encoder of a seq2seq network is a RNN that outputs some value for
every word from the input sentence. For every input word the encoder
outputs a vector and a hidden state, and uses the hidden state for the
next input word.

![](imgs/encoder-network.png)

In [19]:
word2vec_path = '~/GoogleNews-vectors-negative300.bin'
word2vec = KeyedVectors.load_word2vec_format(word2vec_path, binary=True)

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [20]:
def create_embedding_matrix(word_vecs, inv_vocab, D=300):
    """Creates embedding matrix from word vectors. """
    V = len(inv_vocab)
    W = np.random.randn(V, D)

    for i in range(V):
        if inv_vocab[i] in word_vecs:
            W[i] = word_vecs[inv_vocab[i]]

    return W

In [21]:
embedding_dim = 300

emb_matrix = create_embedding_matrix(word2vec, inv_vocab, embedding_dim)
emb_matrix

array([[ 0.00402832, -0.24707031,  0.09814453, ...,  0.1640625 ,
         0.24023438,  0.6875    ],
       [-0.19433594, -0.05932617, -0.18066406, ..., -0.12988281,
        -0.15527344,  0.14941406],
       [-0.15039062, -0.03063965,  0.02770996, ...,  0.11132812,
         0.06225586,  0.04003906],
       ...,
       [ 0.04492188,  0.05664062,  0.09863281, ..., -0.03173828,
         0.125     ,  0.05371094],
       [-0.11425781,  0.12109375, -0.04418945, ...,  0.10351562,
         0.08203125,  0.08154297],
       [ 0.0402832 , -0.03149414, -0.15332031, ..., -0.00570679,
        -0.06396484, -0.15625   ]])

In [22]:
# encoder is RNN
# input size is number of words in french vocabulary
# choose hidden size for ...
# embedding layer, gru , and dropout
# get output and hidden, output both of them

class EncoderRNN(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, embedding_matrix):
        super(EncoderRNN, self).__init__()
        self.hidden_size = hidden_dim
        self.embeddings = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.embeddings.weight.data.copy_(torch.from_numpy(embedding_matrix))
        self.embeddings.weight.requires_grad = False
        self.gru = nn.GRU(hidden_dim, hidden_dim, batch_first=True)
        self.dropout = nn.Dropout(0.3)

    def forward(self, x, lengths):
        x = self.embeddings(x)
        #x = self.dropout(x)
        pack = pack_padded_sequence(x, lengths, batch_first=True)
        output, hidden = self.gru(pack)
        return output, hidden

In [23]:
x, y, l = next(iter(train_dl))

In [24]:
x, y

(tensor([[    1,   496,   132,  ...,  1053,  6362,     2],
         [    0,     1,   136,  ...,  1769,   233,     2],
         [    0,     0,     0,  ...,  3689, 22253,     2],
         [    0,     0,     0,  ...,    22,  8303,     2],
         [    0,     0,     0,  ...,   272,  3321,     2]]),
 tensor([[    1,   496,   132,    38,   264,  2841, 32027,  2086,  7811,  7847,
           1423, 26012, 32284,    22, 27431, 30362,     2,     0,     0,     0,
              0],
         [    1,  3421, 21456, 22391, 23444,  3060,    93,  1431,  5030,   768,
            525,   576,  2776,   551,   261,  3658,     8,  2772,   106,  2853,
              2],
         [    1,    11, 16077,  6768,  3475,  6778,    72,  3935, 10654,  6624,
             86,  2554,  4950,  1420,  1756,   181,     2,     0,     0,     0,
              0],
         [    1, 17258,  8170,  2144, 14644,   497,   231,   264,   209,     8,
              9, 29466,    58, 11876,  1178, 43904,    17,    41,    10,   538,
         

In [25]:
vocab_size = len(vocab)
hidden_size = 300
encoder = EncoderRNN(vocab_size, embedding_dim, hidden_size, emb_matrix)

In [26]:
enc_outputs, enc_hidden = encoder(x.long(), l)

In [27]:
# enc_outputs.shape, enc_hidden.shape
# what is size of encoder output and encoder hidden
# 5 is batch size, 17 is max length
# 5 is last state of each of 5 sentences

In [28]:
enc_outputs

PackedSequence(data=tensor([[ 0.0384,  0.0521,  0.0305,  ...,  0.0287,  0.0760,  0.0350],
        [-0.0445, -0.0282,  0.0371,  ..., -0.0816, -0.0561, -0.1053],
        [-0.0445, -0.0282,  0.0371,  ..., -0.0816, -0.0561, -0.1053],
        ...,
        [ 0.0559,  0.0291,  0.0178,  ...,  0.0553, -0.0208, -0.1013],
        [ 0.0617,  0.0107, -0.0138,  ..., -0.0115,  0.0139, -0.0077],
        [ 0.0652,  0.0491,  0.0828,  ...,  0.0892,  0.1814, -0.1452]],
       grad_fn=<CatBackward>), batch_sizes=tensor([5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
        5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
        5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
        5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
        5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
        5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
        5, 5, 5, 5, 5, 

The  Decoder
   -----------

In [29]:
# encoder has embedding layer, gru, 
# output size is size of english vocabulary
# loss function is trying to produce 
# output is taking the hidden state of decoder,
# going through linear layer to try to produce "the"
# run the decoder (GRU) word by word, because we need "the" to predict
# next word, "poor"
# sometimes we use the prediction or sometimes we use the actual
class DecoderRNN(nn.Module):
    def __init__(self, output_size, hidden_size):
        super(DecoderRNN, self).__init__()

        self.embedding = nn.Embedding(output_size, hidden_size, padding_idx=0)
        self.gru = nn.GRU(hidden_size, hidden_size, batch_first=True)
        self.out = nn.Linear(hidden_size, output_size)
        self.dropout = nn.Dropout(0.3)

    def forward(self, x, hidden):
        embedded = self.embedding(x)
        embedded = self.dropout(embedded)
        output, hidden = self.gru(embedded, hidden)
        output = self.out(hidden[-1]) # output is a function of the hidden, what we are comparing to the y
        return output, hidden

In [30]:
output_size = vocab_size
hidden_size = 300

In [31]:
SOS_token = 1
batch_size = y.size(0)
decoder_input = SOS_token*torch.ones(batch_size,1).long()
decoder_input.shape

torch.Size([5, 1])

In [32]:
decoder = DecoderRNN(output_size, hidden_size)

In [33]:
output, hidden = decoder(decoder_input, enc_hidden)

In [34]:
hidden.shape, output.shape

(torch.Size([1, 5, 300]), torch.Size([5, 140326]))

Training
========

In [35]:
def train_batch(x, y, l1, encoder, decoder, encoder_optimizer, decoder_optimizer,
                teacher_forcing_ratio=0.5):
    
    #two models so two optimizers
    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()
    
    batch_size = y.size(0)
    target_length = y.size(1)

    enc_outputs, enc_hidden = encoder(x, l1)

    loss = 0
    dec_input = y[:,0].unsqueeze(1) # allways SOS (ec always a 1 which is index of start of sequence)
    hidden = enc_hidden

    use_teacher_forcing = True if random.random() < teacher_forcing_ratio else False

    for di in range(1, target_length):
        output, hidden = decoder(dec_input, hidden) # getting new hidden and output
        # output is prediction, bunch of probabilities (kind of) for each of the words in vocab
        yi =  y[:, di]
        if (yi>0).sum() > 0:
            # ignoring padding
            # ec computing loss to ignore index 0, padding gets ignored
            # summing so can divide over number of non-zeros that we have
            loss += F.cross_entropy(output, yi, ignore_index = 0, reduction="sum")/(yi>0).sum()
        if use_teacher_forcing:
            # need to decide what is next input
            # by teacher forcing, help at the beginning to make things go faster
            dec_input = y[:, di].unsqueeze(1)  # Teacher forcing: Feed the target as the next input
        else:                
            dec_input = output.argmax(dim=1).unsqueeze(1).detach()
    # loss depends on all the parameters. Produce gradients for all the paramters
    
    loss.backward()

    encoder_optimizer.step()
    decoder_optimizer.step()

    return loss.item()

In [36]:
def train(encoder, decoder, enc_optimizer, dec_optimizer, epochs = 10,
          teacher_forcing_ratio=0.5):
    for i in range(epochs):
        total_loss = 0
        total = 0
        encoder.train()
        decoder.train()
#        for x, y in train_dl:
        for x, y, l1 in train_dl:
            x = x.long().cuda()
            y = y.long().cuda()
            loss = train_batch(x, y, l1, encoder, decoder, enc_optimizer, dec_optimizer,
                               teacher_forcing_ratio)
            total_loss = loss*x.size(0)
            total += x.size(0)
        if i%2 == 0:
            print("train loss %.3f" % (total_loss / total))   

In [37]:
input_size = vocab_size
output_size = vocab_size
hidden_size = 300
encoder = EncoderRNN(vocab_size, embedding_dim, hidden_size, emb_matrix).cuda()
decoder = DecoderRNN(output_size, hidden_size).cuda()
# same thing just twice
enc_optimizer = optim.Adam(encoder.parameters(), lr=0.01)
dec_optimizer = optim.Adam(decoder.parameters(), lr=0.01) 

In [38]:
batch_size= 70
train_dl = DataLoader(train_ds, batch_size=batch_size, collate_fn=collate_fn, shuffle=True)
valid_dl = DataLoader(valid_ds, collate_fn=collate_fn, batch_size=batch_size)

In [39]:
train(encoder, decoder, enc_optimizer, dec_optimizer, epochs = 10)

train loss 0.083


In [40]:
torch.save(encoder.state_dict(), 'encoder')
torch.save(decoder.state_dict(), 'decoder')

In [None]:
enc_optimizer = optim.Adam(encoder.parameters(), lr=0.001)
dec_optimizer = optim.Adam(decoder.parameters(), lr=0.001) 
train(encoder, decoder, enc_optimizer, dec_optimizer, epochs = 40)

In [None]:
train(encoder, decoder, enc_optimizer, dec_optimizer, epochs = 300, teacher_forcing_ratio=0.0)

In [None]:
train(encoder, decoder, enc_optimizer, dec_optimizer, epochs = 300, teacher_forcing_ratio=0.0)

Evaluation
==========

Evaluation is mostly the same as training, but there are no targets so
we simply feed the decoder's predictions back to itself for each step.
Every time it predicts a word we add it to the output string, and if it
predicts the EOS token we stop there. We also store the decoder's
attention outputs for display later.




* `model.eval()` will notify all your layers that you are in eval mode, that way, batchnorm or dropout layers will work in eval mode instead of training mode.
* `torch.no_grad()` impacts the autograd engine and deactivate it. It will reduce memory usage and speed up computations but you won’t be able to backprop (which you don’t want in an eval script).

In [67]:
# ec torch.no_grad() makes faster and more efficient
def decoding(x, y, l, encoder, decoder, max_length=50):
    decoder = decoder.eval()
    loss = 0
    with torch.no_grad():   
        batch_size = x.size(0)
        enc_outputs, hidden = encoder(x, l)
        dec_input = SOS_token*torch.ones(batch_size, 1).long().cuda()  # SOS
        decoded_words = []
        # ec decide in advance max length. how big are we going to allow the output to be?
        for di in range(1, y.shape[1]):
            output, hidden = decoder(dec_input, hidden)
            pred = output.argmax(dim=1) # ec this is hard prediction (index of right word)
            # ec bc we want to keep the prediction around
            decoded_words.append(pred.cpu().numpy())
            dec_input = output.argmax(dim=1).unsqueeze(1).detach()
            yi =  y[:, di]
            # without if you will get a None or NA(?) due to divide by zero
            if (yi>0).sum() > 0:
                # ignoring padding
                loss += F.cross_entropy(
                    output, yi, ignore_index = 0, reduction="sum")/(yi>0).sum()
        return loss.item()/batch_size, np.transpose(decoded_words)

In [None]:
x, y = next(iter(valid_dl)) 
x = x.long().cuda()
y = y.long().cuda()

loss, _ = decoding(x, y, encoder, decoder)
loss

### Training example predictions

In [68]:
batch_size=5
train_dl_2 = DataLoader(train_ds, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)

x, y, l = next(iter(train_dl_2)) 
x = x.long().cuda()
y = y.long().cuda()

We can evaluate random sentences from the training set and print out the
input, target, and output to make some subjective quality judgements:




In [98]:
def print_results(x, y, l, encoder, decoder, rouge):
    _, decoded_words = decoding(x, y, l, encoder, decoder)
    for i in range(x.shape[0]):
        xi = x[i].cpu().numpy()
        yi = y[i].cpu().numpy()
        y_hat = decoded_words[i]
        x_sent = ' '.join([inv_vocab[t] for t in xi if t > 3])
        y_sent = ' '.join([inv_vocab[t] for t in yi if t > 3])
        y_hat_sent = ' '.join([inv_vocab[t] for t in y_hat if t > 3])
#         print('>', x_sent)
        print('=', y_sent)
        print('<', y_hat_sent)
        if len(yi) > len(y_hat):
             y_hat_sent = y_hat_sent + ' <unk>' * (len(yi) - len(y_hat))
        elif len(yi) < len(y_hat):
             y_sent = y_sent + ' <unk>' * (len(y_hat) - len(yi))
        print(rouge.get_scores(y_hat_sent, y_sent)[0]['rouge-1'])
        print('')

In [99]:
rouge = Rouge()
print_results(x, y, l, encoder, decoder, rouge)

= america made renewed attempt talk up its flagging economy on tuesday despite raft disappointing economic data
< the obama the the the the the the the the the the
{'f': 0.0, 'p': 0.0, 'r': 0.0}

= royal sister princess charlotte is causing global fashion frenzy after her adorable tour wardrobe sells out fast
< the prince is the e princess the the the the the the the
{'f': 0.17391303962192828, 'p': 0.3333333333333333, 'r': 0.11764705882352941}

= the mexican authorities say mart nez planned the killings dozens central south americans hauled from buses trucks heading north
< the hauled nez nez nez nez nez nez nez nez nez nez nez nez nez nez nez nez nez nez
{'f': 0.2727272697520661, 'p': 0.75, 'r': 0.16666666666666666}

= the murder in west jerusalem four men at prayer including three rabbis is tragedy for all israelis palestinians
< the square men are in the men in rifles the men in
{'f': 0.23999999596800003, 'p': 0.42857142857142855, 'r': 0.16666666666666666}

= obama rides high in pol

### Rouge Scores

### Validation example predictions

In [None]:
batch_size=10
valid_dl_2 = DataLoader(valid_ds, batch_size=batch_size, shuffle=True)

x, y = next(iter(valid_dl_2)) 
x = x.long().cuda()
y = y.long().cuda()

In [None]:
print_results(x, y, l, encoder, decoder)

## Exercise
-  Replace the embeddings with pre-trained word embeddings. Here are word embeddings for various languages.

https://fasttext.cc/docs/en/crawl-vectors.html 

# Credits
The original notebook was written by Sean Robertson <https://github.com/spro/practical-pytorch>_