#  Sequence to Sequence - Article Summarization

In [2]:
from __future__ import unicode_literals
from io import open
import unicodedata
import string
import re
import random


import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

import numpy as np
import pandas as pd
import pickle

from gensim.models import KeyedVectors
from rouge import Rouge

In [3]:
path = './'
vocab = pickle.load(open(path + 'vocab_new.pkl', "rb" ))
inv_vocab = pickle.load(open(path + 'inv_vocab_new.pkl', "rb" ))

In [4]:
path = './'
train_df = pickle.load(open(path + 'train_df_new.pkl', "rb"))
train_df.head()

Unnamed: 0,text,summary,text_length,summ_length,compression
0,"[1, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, ...","[1, 16, 31, 20, 21, 22, 23, 24, 25, 150, 68, 1...",728,23,31.652174
1,"[1, 164, 338, 339, 340, 163, 61, 90, 341, 97, ...","[1, 164, 338, 339, 340, 354, 355, 262, 16, 52,...",409,28,14.607143
2,"[1, 8, 407, 7, 59, 482, 483, 484, 485, 132, 48...","[1, 528, 630, 8, 407, 7, 59, 482, 483, 484, 48...",404,19,21.263158
3,"[1, 631, 632, 385, 633, 75, 634, 47, 635, 636,...","[1, 752, 760, 711, 761, 762, 763, 767, 768, 97...",705,26,27.115385
4,"[1, 917, 918, 919, 41, 16, 920, 921, 427, 110,...","[1, 917, 918, 919, 41, 16, 920, 90, 853, 127, ...",708,21,33.714286


In [5]:
path = './'
dev_df = pickle.load(open(path + 'val_df_new.pkl', "rb"))
dev_df.head()

Unnamed: 0,text,summary,text_length,summ_length,compression
0,"[1, 141, 27985, 1907, 4217, 718, 1538, 1060, 1...","[1, 1060, 1184, 9781, 781, 8153, 12564, 3858, ...",516,32,16.125
1,"[1, 47, 52, 1318, 10385, 6517, 12543, 16, 694,...","[1, 12543, 1186, 712, 10511, 8110, 145, 320, 9...",565,17,33.235294
2,"[1, 70, 16, 23828, 3437, 28360, 14235, 145, 29...","[1, 41, 320, 1956, 9097, 551, 4208, 628, 2325,...",484,28,17.285714
3,"[1, 43, 399, 1594, 6502, 19817, 90, 390, 1841,...","[1, 16, 339, 551, 1367, 12088, 1956, 5600, 487...",367,16,22.9375
4,"[1, 1546, 4998, 78, 16, 13929, 11574, 5272, 91...","[1, 16, 17219, 90, 907, 551, 526, 7297, 1080, ...",443,36,12.305556


In [6]:
type(train_df.text[0][0])

int

In [7]:
print(vocab['backpack'])
print(inv_vocab[11282])

11282
backpack


In [8]:
len(vocab)

107447

In [9]:
# Calculating Text and Summary lengths for train dataset
train_text_lens = train_df.text.map(lambda x: len(x))
train_summary_lens = train_df.summary.map(lambda x: len(x))
train_text_lens.describe(), train_summary_lens.describe()

(count    190802.000000
 mean        474.481745
 std         122.320603
 min         119.000000
 25%         374.000000
 50%         465.000000
 75%         573.000000
 max        1289.000000
 Name: text, dtype: float64, count    190802.000000
 mean         24.302513
 std           5.304360
 min           6.000000
 25%          20.000000
 50%          24.000000
 75%          28.000000
 max          49.000000
 Name: summary, dtype: float64)

In [10]:
# Filtering down train dataset
train_df = train_df[(train_text_lens > 374) &
                    (train_text_lens < 573) & 
                    (train_summary_lens < 50) & 
                    (train_summary_lens > 16)]
train_df.shape

(90861, 5)

In [11]:
# Calculating Text and Summary lengths for validation dataset
dev_text_lens = dev_df.text.map(lambda x: len(x))
dev_summary_lens = dev_df.summary.map(lambda x: len(x))
dev_text_lens.describe(), dev_summary_lens.describe()

(count    20892.000000
 mean       474.069500
 std        122.335284
 min        166.000000
 25%        373.000000
 50%        465.000000
 75%        573.000000
 max       1144.000000
 Name: text, dtype: float64, count    20892.000000
 mean        24.262062
 std          5.283030
 min          7.000000
 25%         20.000000
 50%         24.000000
 75%         28.000000
 max         46.000000
 Name: summary, dtype: float64)

In [12]:
# Filtering down validation dataset on same conditions as train
dev_df = dev_df[(dev_text_lens > 374) &
                (dev_text_lens < 573) & 
                (dev_summary_lens < 50) & 
                (dev_summary_lens > 16)]
dev_df.shape

(9903, 5)

## Padding

In [13]:
max_seq_length_text = train_text_lens.max()
max_seq_length_summ = train_summary_lens.max()
max_seq_length_text, max_seq_length_summ

(1289, 49)

In [14]:
train_df.loc[:, 'text'] = train_df['text'].apply(lambda x: ((max_seq_length_text - len(x)) * [0]) + x)
train_df.loc[:, 'summary'] = train_df['summary'].apply(lambda x: (x + (max_seq_length_summ - len(x)) * [0]))

In [15]:
dev_df.loc[:, 'text'] = dev_df['text'].apply(lambda x: np.array((max_seq_length_text - len(x)) * [0] + x))
dev_df.loc[:, 'summary'] = dev_df['summary'].apply(lambda x: np.array(x + (max_seq_length_summ - len(x)) * [0]))

In [16]:
dev_df.head()

Unnamed: 0,text,summary,text_length,summ_length,compression
0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1060, 1184, 9781, 781, 8153, 12564, 3858, ...",516,32,16.125
2,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 41, 320, 1956, 9097, 551, 4208, 628, 2325,...",484,28,17.285714
4,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 16, 17219, 90, 907, 551, 526, 7297, 1080, ...",443,36,12.305556
6,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 551, 6746, 12779, 267, 262, 327, 73, 628, ...",438,21,20.857143
8,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 13640, 262, 10200, 50165, 385, 10470, 47, ...",435,31,14.032258


# Dataset

In [24]:
class SummarizationDataset(Dataset):
    def __init__(self, df): #pairs, input_lang, output_lang):
#         self.df = df
        self.article = df.text.values
        self.summary = df.summary.values
    
    def __len__(self):
        return len(self.article)
    
    def __getitem__(self, idx):
        x = np.array(self.article[idx])
        y = np.array(self.summary[idx])
        return x, y

In [25]:
train_ds = SummarizationDataset(train_df)
valid_ds = SummarizationDataset(dev_df)

#### Dataset/DataLoader Testing

In [26]:
train_ds[0][1]

array([   1, 1180, 1184, 1185, 1186,  208, 1187, 1188,   43,   16, 1189,
       1190,  198,  327,   47, 1191,   83, 1192, 1193,  570, 1194, 1195,
       1196, 1197, 1198, 1199, 1200,  163,   16,    2,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0])

In [27]:
batch_size=5
train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
valid_dl = DataLoader(valid_ds, batch_size=batch_size)

In [28]:
arts, summs = next(iter(train_dl))

In [30]:
arts.shape, summs.shape

(torch.Size([5, 1289]), torch.Size([5, 49]))

In [31]:
arts, summs

(tensor([[   0,    0,    0,  ..., 1087, 2019,    2],
         [   0,    0,    0,  ...,   18, 2459,    2],
         [   0,    0,    0,  ..., 3851, 5151,    2],
         [   0,    0,    0,  ...,   16, 3383,    2],
         [   0,    0,    0,  ...,   18, 4170,    2]]),
 tensor([[     1,    551,   1409,    512,     16,  34290,   2380,    145,     68,
             818,    141,   4017,  12812,   1391,  13891,    816,    311,    366,
               2,      0,      0,      0,      0,      0,      0,      0,      0,
               0,      0,      0,      0,      0,      0,      0,      0,      0,
               0,      0,      0,      0,      0,      0,      0,      0,      0,
               0,      0,      0,      0],
         [     1,     43,    864,    908,    327, 104295,   1300,   2387,    512,
            5309,    192,   1218,     70,   1615,  30079,   2460,   2906,    171,
            2607,   6419,    598,     43,  11587,   6664,  10832,     47,   1365,
            4719,     41,   1350, 

## The Seq2Seq Model

A Recurrent Neural Network, or RNN, is a network that operates on a
sequence and uses its own output as input for subsequent steps.

A `Sequence to Sequence network <https://arxiv.org/abs/1409.3215>`__, or
seq2seq network, or `Encoder Decoder
network <https://arxiv.org/pdf/1406.1078v3.pdf>`__, is a model
consisting of two RNNs called the encoder and decoder. The encoder reads
an input sequence and outputs a single vector, and the decoder reads
that vector to produce an output sequence.

### The Encoder

The encoder of a seq2seq network is a RNN that outputs some value for
every word from the input sentence. For every input word the encoder
outputs a vector and a hidden state, and uses the hidden state for the
next input word.

![](imgs/encoder-network.png)

In [32]:
word2vec_path = '~/GoogleNews-vectors-negative300.bin'
word2vec = KeyedVectors.load_word2vec_format(word2vec_path, binary=True)

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [33]:
def create_embedding_matrix(word_vecs, inv_vocab, D=300):
    """Creates embedding matrix from word vectors. """
    V = len(inv_vocab)
    W = np.random.randn(V, D)

    for i in range(V):
        if inv_vocab[i] in word_vecs:
            W[i] = word_vecs[inv_vocab[i]]

    return W

In [34]:
embedding_dim = 300

emb_matrix = create_embedding_matrix(word2vec, inv_vocab, embedding_dim)
emb_matrix

array([[ 0.00402832, -0.24707031,  0.09814453, ...,  0.1640625 ,
         0.24023438,  0.6875    ],
       [-0.19433594, -0.05932617, -0.18066406, ..., -0.12988281,
        -0.15527344,  0.14941406],
       [-0.15039062, -0.03063965,  0.02770996, ...,  0.11132812,
         0.06225586,  0.04003906],
       ...,
       [-0.08984375, -0.16210938,  0.08251953, ..., -0.01513672,
        -0.03039551,  0.14550781],
       [ 0.06591797,  0.140625  , -0.09619141, ...,  0.05761719,
         0.12255859,  0.16796875],
       [ 0.02526855,  0.24804688,  0.08349609, ...,  0.02160645,
         0.11181641,  0.01239014]])

In [35]:
emb_matrix.shape

(107447, 300)

In [36]:
class EncoderRNN(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, embedding_matrix):
        super(EncoderRNN, self).__init__()
        self.hidden_size = hidden_dim
        self.embeddings = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.embeddings.weight.data.copy_(torch.from_numpy(embedding_matrix))
        self.embeddings.weight.requires_grad = False
        self.gru = nn.GRU(hidden_dim, hidden_dim, batch_first=True)
        self.dropout = nn.Dropout(0.3)

    def forward(self, x):
        x = self.embeddings(x)
        x = self.dropout(x)
        output, hidden = self.gru(x)
        return output, hidden

The  Decoder
   -----------

In [37]:
class DecoderRNN(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, embedding_matrix):
        super(DecoderRNN, self).__init__()

        self.embeddings = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.embeddings.weight.data.copy_(torch.from_numpy(embedding_matrix))
        self.embeddings.weight.requires_grad = False
        self.gru = nn.GRU(hidden_dim, hidden_dim, batch_first=True)
        self.out = nn.Linear(hidden_size, output_size)
        self.dropout = nn.Dropout(0.3)
        
    def forward(self, x, hidden):
        embedded = self.embeddings(x)
        embedded = self.dropout(embedded)
        output, hidden = self.gru(embedded, hidden)
        output = self.out(hidden[-1]) # output is a function of the hidden, what we are comparing to the y
        return output, hidden

#### Encoder Testing

In [38]:
input_size = len(vocab)
hidden_size = 300
encoder = EncoderRNN(input_size, embedding_dim, hidden_size, emb_matrix)

In [39]:
enc_outputs, enc_hidden = encoder(arts.long())

In [40]:
enc_outputs[0]

tensor([[ 0.0005, -0.0833, -0.0856,  ..., -0.0276,  0.1284,  0.0565],
        [ 0.0226, -0.1819, -0.1398,  ..., -0.0573,  0.1693,  0.0593],
        [ 0.0063, -0.1936, -0.1692,  ..., -0.0806,  0.0995,  0.0564],
        ...,
        [ 0.0694, -0.0193,  0.0985,  ...,  0.0370,  0.1077, -0.1241],
        [ 0.0073,  0.0551,  0.0967,  ...,  0.0359,  0.0728, -0.0816],
        [ 0.0307,  0.0716,  0.0371,  ...,  0.0131,  0.0945, -0.0470]],
       grad_fn=<SelectBackward>)

#### Decoder Testing

In [41]:
SOS_token = 1
batch_size = summs.size(0)
decoder_input = SOS_token*torch.ones(batch_size,1).long()
decoder_input.shape

torch.Size([5, 1])

In [42]:
output_size = len(vocab)
hidden_dim = 300
decoder = DecoderRNN(output_size, embedding_dim, hidden_dim, emb_matrix)

In [43]:
output, hidden = decoder(decoder_input, enc_hidden)

In [44]:
hidden.shape, output.shape

(torch.Size([1, 5, 300]), torch.Size([5, 107447]))

Training
========

In [45]:
def rouge_format(arr):
    sents = []
    for l in arr:
        sent = []
        for n in l:
            if n not in (0,1,2):
                sent.append(str(n))
        sents.append(' '.join(sent))
    
    return sents

In [54]:
# ec torch.no_grad() makes faster and more efficient
def decoding(x, y, encoder, decoder, max_length=40):
    decoder = decoder.eval()
    loss = 0
    with torch.no_grad():   
        batch_size = x.size(0)
        enc_outputs, hidden = encoder(x)
        dec_input = SOS_token*torch.ones(batch_size, 1).long().cuda()  # SOS
        decoded_words = []
        # ec decide in advance max length. how big are we going to allow the output to be?
#         unpacked_out = pad_packed_sequence(enc_outputs, batch_first=True) # FOR ATTENTION
        for di in range(1, y.shape[1]):
#             output, hidden, attention = decoder(dec_input, hidden, unpacked_out[0]) # FOR ATTENTION
            output, hidden = decoder(dec_input, hidden)
            pred = output.argmax(dim=1) # ec this is hard prediction (index of right word)
            # ec bc we want to keep the prediction around
            decoded_words.append(pred.cpu().numpy())
            dec_input = output.argmax(dim=1).unsqueeze(1).detach()
            yi =  y[:, di]
            # without if you will get a None or NA(?) due to divide by zero
            if (yi>0).sum() > 0:
                # ignoring padding
                loss += F.cross_entropy(output, yi, ignore_index = 0, reduction="sum")/(yi>0).sum()

        refs = y.cpu().numpy()[:, 1:]
        refs = rouge_format(refs)

        hyps = np.transpose(decoded_words)
        hyps = rouge_format(hyps)
        
        r = Rouge()
        return loss.item(), r.get_scores(hyps, refs, avg=True), np.transpose(decoded_words)

In [55]:
def combine_rouge_scores(score_list):
    combined_rouge = {'rouge-1': {'f': 0, 'p': 0, 'r': 0},
                      'rouge-2': {'f': 0, 'p': 0, 'r': 0},
                      'rouge-l': {'f': 0, 'p': 0, 'r': 0}}
    combined_rouge = pd.DataFrame(combined_rouge)
    for score in score_list:
        combined_rouge = combined_rouge + pd.DataFrame(score)
    
    return combined_rouge/len(score_list)

In [62]:
def val_metrics(encoder, decoder, valid_dl):
    rouge_scores = []
    
    sum_loss = 0
    total = 0
    for x, y in valid_dl:
        x = x.long().cuda()
        y = y.long().cuda()
        loss, rouge_score, _ = decoding(x, y, encoder, decoder)
        sum_loss += loss
        total += y.shape[0]
        rouge_scores.append(rouge_score)
    
    return sum_loss/total, combine_rouge_scores(rouge_scores)

In [57]:
def train_batch(x, y, encoder, decoder, encoder_optimizer, decoder_optimizer,
                teacher_forcing_ratio=0.5):
    
    #two models so two optimizers
    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()
    
    batch_size = y.size(0)
    target_length = y.size(1)

    enc_outputs, enc_hidden = encoder(x)

    loss = 0
    dec_input = y[:,0].unsqueeze(1) # allways SOS (ec always a 1 which is index of start of sequence)
    hidden = enc_hidden

    use_teacher_forcing = True if random.random() < teacher_forcing_ratio else False

    for di in range(1, target_length):
#         unpacked_out = pad_packed_sequence(enc_outputs, batch_first=True) # FOR ATTENTION
#         output, hidden, attention = decoder(dec_input, hidden, unpacked_out[0]) # FOR ATTENTION
        output, hidden = decoder(dec_input, hidden)
        # output is prediction, bunch of probabilities (kind of) for each of the words in vocab
        yi =  y[:, di]
        if (yi>0).sum() > 0:
            # ignoring padding
            # ec computing loss to ignore index 0, padding gets ignored
            # summing so can divide over number of non-zeros that we have
            loss += F.cross_entropy(output, yi, ignore_index = 0, reduction="sum")/(yi>0).sum()
        if use_teacher_forcing:
            # need to decide what is next input
            # by teacher forcing, help at the beginning to make things go faster
            dec_input = y[:, di].unsqueeze(1)  # Teacher forcing: Feed the target as the next input
        else:                
            dec_input = output.argmax(dim=1).unsqueeze(1).detach()
    # loss depends on all the parameters. Produce gradients for all the paramters
    
    loss.backward()

    encoder_optimizer.step()
    decoder_optimizer.step()

    return loss.item()

In [58]:
def train(train_dl, valid_dl, encoder, decoder, enc_optimizer, dec_optimizer, epochs = 10,
          teacher_forcing_ratio=0.5):
    for i in range(epochs):
        total_loss = 0
        total = 0
        encoder.train()
        decoder.train()
        for x, y in train_dl:
            x = x.long().cuda()
            y = y.long().cuda()
            loss = train_batch(x, y, encoder, decoder, enc_optimizer, dec_optimizer,
                               teacher_forcing_ratio)
            total_loss = loss*x.size(0)
            total += x.size(0)
        val_loss, scores = val_metrics(encoder, decoder, valid_dl)
#         if i%2 == 0:
        print("train loss %.3f val loss %.3f" % (total_loss/total, val_loss))
        print(scores)

In [59]:
hidden_size = 300
encoder = EncoderRNN(input_size, embedding_dim, hidden_size, emb_matrix).cuda()

decoder = DecoderRNN(output_size, embedding_dim, hidden_size, emb_matrix).cuda()


# same thing just twice
enc_optimizer = optim.Adam(encoder.parameters(), lr=0.01)
dec_optimizer = optim.Adam(decoder.parameters(), lr=0.01) 

In [60]:
batch_size= 50
train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
valid_dl = DataLoader(valid_ds, batch_size=batch_size)

In [None]:
train(train_dl, valid_dl, encoder, decoder, enc_optimizer, dec_optimizer, epochs = 10) # 

train loss 0.039 val loss 6.788
    rouge-1  rouge-2   rouge-l
f  0.067145      0.0  0.037301
p  0.382965      0.0  0.382915
r  0.036953      0.0  0.036950
train loss 0.027 val loss 6.951
    rouge-1   rouge-2   rouge-l
f  0.080608  0.003460  0.049915
p  0.252152  0.009367  0.252152
r  0.048339  0.002144  0.048339


In [None]:
# running validation separate from training
val_loss, scores = val_metrics(encoder, decoder, valid_dl)
print("val loss %.3f" % (val_loss))
print(scores)

In [None]:
# saving models
torch.save(encoder.state_dict(), 'encoder_s2s_wa')
torch.save(decoder.state_dict(), 'decoder_s2s_wa')

In [None]:
enc_optimizer = optim.Adam(encoder.parameters(), lr=0.001)
dec_optimizer = optim.Adam(decoder.parameters(), lr=0.001) 
train(encoder, decoder, enc_optimizer, dec_optimizer, epochs = 40)

In [None]:
train(encoder, decoder, enc_optimizer, dec_optimizer, epochs = 300, teacher_forcing_ratio=0.0)

In [None]:
train(encoder, decoder, enc_optimizer, dec_optimizer, epochs = 300, teacher_forcing_ratio=0.0)

Evaluation
==========

Evaluation is mostly the same as training, but there are no targets so
we simply feed the decoder's predictions back to itself for each step.
Every time it predicts a word we add it to the output string, and if it
predicts the EOS token we stop there. We also store the decoder's
attention outputs for display later.




* `model.eval()` will notify all your layers that you are in eval mode, that way, batchnorm or dropout layers will work in eval mode instead of training mode.
* `torch.no_grad()` impacts the autograd engine and deactivate it. It will reduce memory usage and speed up computations but you won’t be able to backprop (which you don’t want in an eval script).

In [None]:
x, y = next(iter(valid_dl)) 
x = x.long().cuda()
y = y.long().cuda()

loss, _ = decoding(x, y, encoder, decoder)
loss

### Training example predictions

In [None]:
batch_size=5
train_dl_2 = DataLoader(train_ds, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)

x, y = next(iter(train_dl_2)) 
x = x.long().cuda()
y = y.long().cuda()

We can evaluate random sentences from the training set and print out the
input, target, and output to make some subjective quality judgements:




In [None]:
def print_results(x, y, encoder, decoder, rouge):
    _, _, decoded_words = decoding(x, y, l, encoder, decoder)
    for i in range(x.shape[0]):
        xi = x[i].cpu().numpy()
        yi = y[i].cpu().numpy()
        y_hat = decoded_words[i]
        x_sent = ' '.join([inv_vocab[t] for t in xi if t > 3])
        y_sent = ' '.join([inv_vocab[t] for t in yi if t > 3])
        y_hat_sent = ' '.join([inv_vocab[t] for t in y_hat if t > 3])
        print('=', y_sent)
        print('<', y_hat_sent)
#         if len(yi) > len(y_hat):
#              y_hat_sent = y_hat_sent + ' <unk>' * (len(yi) - len(y_hat))
#         elif len(yi) < len(y_hat):
#              y_sent = y_sent + ' <unk>' * (len(y_hat) - len(yi))
#         print(rouge.get_scores(y_hat_sent, y_sent)[0]['rouge-1'])
        print('')

In [None]:
rouge = Rouge()
print_results(x, y, l, encoder, decoder, rouge)

### Rouge Scores

### Validation example predictions

In [None]:
batch_size=10
valid_dl_2 = DataLoader(valid_ds, batch_size=batch_size, shuffle=True)

x, y = next(iter(valid_dl_2)) 
x = x.long().cuda()
y = y.long().cuda()

In [None]:
print_results(x, y, l, encoder, decoder)

## Exercise
-  Replace the embeddings with pre-trained word embeddings. Here are word embeddings for various languages.

https://fasttext.cc/docs/en/crawl-vectors.html 

# Credits
The original notebook was written by Sean Robertson <https://github.com/spro/practical-pytorch>_