#  Sequence to Sequence - Article Summarization

In [54]:
from __future__ import unicode_literals
from io import open
import unicodedata
import string
import re
import random

import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pack_padded_sequence

import numpy as np
import pandas as pd

%matplotlib inline

In [122]:
import pickle
path = '/Users/evan/data/deep_learning/newsroom/df_vocabs/'
vocab = pickle.load(open(path + 'vocab.pkl', "rb" ))
inv_vocab = pickle.load(open(path + 'inv_vocab.pkl', "rb" ))

In [None]:
path = '/Users/evan/data/deep_learning/train_df.pkl'
df = pickle.load(open(path, "rb"))
df.head()

In [63]:
type(df.text[0][0])

int

In [9]:
print(vocab['bro'])
print(inv_vocab[27578])

27578
bro


# Dataset

In [10]:
df.head()

Unnamed: 0,text,summary
0,"[1, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, ...","[1, 148, 17, 149, 150, 112, 151, 136, 60, 79, ..."
1,"[1, 742, 743, 11, 646, 307, 744, 132, 596, 745...","[1, 11, 762, 763, 764, 769, 770, 771, 272, 772..."
2,"[1, 910, 940, 840, 941, 132, 942, 569, 943, 94...","[1, 11, 954, 947, 948, 949, 950, 72, 951, 1027..."
3,"[1, 136, 1153, 1154, 910, 1155, 1156, 1157, 11...","[1, 910, 1155, 1156, 1157, 1158, 265, 167, 422..."
4,"[1, 1413, 132, 1414, 1415, 1416, 1417, 1418, 3...","[1, 1510, 1435, 1427, 1428, 1511, 1413, 17, 15..."


In [65]:
a = [1, 2, 4, 6]
torch.Tensor(a)

tensor([1., 2., 4., 6.])

In [97]:
def collate_fn(data):
    """Creates mini-batch tensors from the list of tuples (sentences, labels).
    
    Need custom collate_fn because merging sequences (including padding) is not 
    supported in default. Sequences are padded to the maximum length of mini-batch 
    sequences (dynamic padding).
    
    Args:
        data: list of tuple (article, summary). 
            - each is list of word indices of variable length
    Returns:
        packed_batch: (PackedSequence), see torch.nn.utils.rnn.pack_padded_sequence
        sencences: torch tensor of shape (batch_size, max_len).
        labels: torch tensor of shape (batch_size, 1).
        lengths: list; valid length for each padded sentence. 
    """
    # Sort a data list by sentences length (descending order).
    data.sort(key=lambda x: len(x[0]), reverse=True)
    articles, summaries = zip(*data)
        
    # Merge sentences
    lengths1 = [len(s) for s in articles]
    lengths2 = [len(s) for s in summaries]
   
    arts = torch.zeros(len(articles), max(lengths1)).long()
    summs = torch.zeros(len(summaries), max(lengths2)).long()
    
    for i, a in enumerate(articles):
        l = lengths1[i]
        arts[i, -l:] = torch.Tensor(a[:l])
        
    for i, s in enumerate(summaries):
        l = lengths2[i]
        summs[i, :l] = torch.Tensor(s[:l])
    
    return arts, summs, lengths1

In [98]:
class SummarizationDataset(Dataset):
    def __init__(self, df): #pairs, input_lang, output_lang):
        self.df = df
        self.article = df.text.values
        self.summary = df.summary.values
    
    def __len__(self):
        return len(self.article)
    
    def __getitem__(self, idx):
        x = self.article[idx]
        y = self.summary[idx]
        return x, y

In [99]:
# import ast
# df['text'].apply(lambda x: ast.literal_eval(x))
# df['summary'].apply(lambda x: ast.literal_eval(x))

In [100]:
from sklearn.model_selection import train_test_split

train, test = train_test_split(df, test_size=0.2)

In [101]:
train_ds = SummarizationDataset(train)
valid_ds = SummarizationDataset(test)

In [102]:
train_ds[0][1]

[1,
 1600,
 24863,
 30,
 27489,
 17,
 2995,
 8,
 7039,
 1277,
 3988,
 16338,
 11228,
 846,
 197,
 10291,
 2]

In [103]:
batch_size=5
train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
valid_dl = DataLoader(valid_ds, batch_size=batch_size, collate_fn=collate_fn)

In [104]:
arts, summs, l = next(iter(train_dl))

In [105]:
arts.shape, summs.shape

(torch.Size([5, 1073]), torch.Size([5, 97]))

## The Seq2Seq Model

A Recurrent Neural Network, or RNN, is a network that operates on a
sequence and uses its own output as input for subsequent steps.

A `Sequence to Sequence network <https://arxiv.org/abs/1409.3215>`__, or
seq2seq network, or `Encoder Decoder
network <https://arxiv.org/pdf/1406.1078v3.pdf>`__, is a model
consisting of two RNNs called the encoder and decoder. The encoder reads
an input sequence and outputs a single vector, and the decoder reads
that vector to produce an output sequence.

### The Encoder

The encoder of a seq2seq network is a RNN that outputs some value for
every word from the input sentence. For every input word the encoder
outputs a vector and a hidden state, and uses the hidden state for the
next input word.

![](imgs/encoder-network.png)

In [114]:
# encoder is RNN
# input size is number of words in french vocabulary
# choose hidden size for ...
# embedding layer, gru , and dropout
# get output and hidden, output both of them

class EncoderRNN(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(EncoderRNN, self).__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(input_size, hidden_size, padding_idx=0)
        self.gru = nn.GRU(hidden_size, hidden_size, batch_first=True)
        self.dropout = nn.Dropout(0.3)

    def forward(self, x, lengths):
        x = self.embedding(x)
        #x = self.dropout(x)
        pack = pack_padded_sequence(x, lengths, batch_first=True)
        output, hidden = self.gru(pack)
        return output, hidden

In [115]:
x, y, l = next(iter(train_dl))

In [116]:
x, y

(tensor([[   1, 7272,   30,  ...,  163,  377,    2],
         [   0,    0,    0,  ..., 1883, 1296,    2],
         [   0,    0,    0,  ...,  215, 2963,    2],
         [   0,    0,    0,  ...,  118, 3374,    2],
         [   0,    0,    0,  ..., 6037, 1652,    2]]),
 tensor([[    1, 10726, 59482,  3916,  6149,  1101,     9,  2310,  4448,  7131,
           3203,   177,   440,  2548,  1892,    11,  1294,   516,   251, 20922,
            371,   820,  1101,     2,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0],
         [    1, 11607,    11,  2577,    11,  2955,  2809,  4540,  4995,   291,
            343,   699,   601,    41,  1276,  4093,  4545,  1898,  2052,    11,
           2288,     2,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,   

In [117]:
input_size = len(vocab.keys())
hidden_size = 300
encoder = EncoderRNN(input_size, hidden_size)

In [118]:
enc_outputs, enc_hidden = encoder(x.long(), l)

In [119]:
# enc_outputs.shape, enc_hidden.shape
# what is size of encoder output and encoder hidden
# 5 is batch size, 17 is max length
# 5 is last state of each of 5 sentences

AttributeError: 'PackedSequence' object has no attribute 'shape'

In [121]:
enc_outputs

PackedSequence(data=tensor([[ 0.0171, -0.4171, -0.2677,  ..., -0.0815, -0.2681,  0.2939],
        [ 0.0211, -0.0014,  0.0323,  ..., -0.0088,  0.0007, -0.0148],
        [ 0.0211, -0.0014,  0.0323,  ..., -0.0088,  0.0007, -0.0148],
        ...,
        [ 0.4204,  0.2869,  0.2777,  ...,  0.1623,  0.0882,  0.2575],
        [ 0.1629,  0.1709,  0.0658,  ...,  0.0047,  0.1071,  0.4158],
        [-0.0920,  0.3704,  0.0194,  ...,  0.0372,  0.2614, -0.2748]],
       grad_fn=<CatBackward>), batch_sizes=tensor([5, 5, 5,  ..., 1, 1, 1]))

The  Decoder
   -----------

In [36]:
# encoder has embedding layer, gru, 
# output size is size of english vocabulary
# loss function is trying to produce 
# output is taking the hidden state of decoder,
# going through linear layer to try to produce "the"
# run the decoder (GRU) word by word, because we need "the" to predict
# next word, "poor"
# sometimes we use the prediction or sometimes we use the actual
class DecoderRNN(nn.Module):
    def __init__(self, output_size, hidden_size):
        super(DecoderRNN, self).__init__()

        self.embedding = nn.Embedding(output_size, hidden_size, padding_idx=0)
        self.gru = nn.GRU(hidden_size, hidden_size, batch_first=True)
        self.out = nn.Linear(hidden_size, output_size)
        self.dropout = nn.Dropout(0.3)

    def forward(self, x, hidden):
        embedded = self.embedding(x)
        embedded = self.dropout(embedded)
        output, hidden = self.gru(embedded, hidden)
        output = self.out(hidden[-1]) # output is a function of the hidden, what we are comparing to the y
        return output, hidden

In [37]:
output_size = output_lang.n_words
hidden_size = 300

In [38]:
batch_size = y.size(0)
decoder_input = SOS_token*torch.ones(batch_size,1).long()
decoder_input.shape

torch.Size([5, 1])

In [39]:
decoder = DecoderRNN(output_size, hidden_size)

In [40]:
output, hidden = decoder(decoder_input, enc_hidden)

In [41]:
hidden.shape, output.shape

(torch.Size([1, 5, 300]), torch.Size([5, 3331]))

Training
========

In [42]:
def train_batch(x, y, encoder, decoder, encoder_optimizer, decoder_optimizer,
                teacher_forcing_ratio=0.5):
    
    #two models so two optimizers
    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()
    
    batch_size = y.size(0)
    target_length = y.size(1)

    enc_outputs, enc_hidden = encoder(x, l1, l2)

    loss = 0
    dec_input = y[:,0].unsqueeze(1) # allways SOS (ec always a 1 which is index of start of sequence)
    hidden = enc_hidden

    use_teacher_forcing = True if random.random() < teacher_forcing_ratio else False

    for di in range(1, target_length):
        output, hidden = decoder(dec_input, hidden) # getting new hidden and output
        # output is prediction, bunch of probabilities (kind of) for each of the words in vocab
        yi =  y[:, di]
        if (yi>0).sum() > 0:
            # ignoring padding
            # ec computing loss to ignore index 0, padding gets ignored
            # summing so can divide over number of non-zeros that we have
            loss += F.cross_entropy(output, yi, ignore_index = 0, reduction="sum")/(yi>0).sum()
        if use_teacher_forcing:
            # need to decide what is next input
            # by teacher forcing, help at the beginning to make things go faster
            dec_input = y[:, di].unsqueeze(1)  # Teacher forcing: Feed the target as the next input
        else:                
            dec_input = output.argmax(dim=1).unsqueeze(1).detach()
    # loss depends on all the parameters. Produce gradients for all the paramters
    
    loss.backward()

    encoder_optimizer.step()
    decoder_optimizer.step()

    return loss.item()

In [43]:
def train(encoder, decoder, enc_optimizer, dec_optimizer, epochs = 10,
          teacher_forcing_ratio=0.5):
    for i in range(epochs):
        total_loss = 0
        total = 0
        encoder.train()
        decoder.train()
#        for x, y in train_dl:
        for x, y, l1, l2 in train_dl:
            x = x.long().cuda()
            y = y.long().cuda()
            loss = train_batch(x, y, encoder, decoder, enc_optimizer, dec_optimizer,
                               teacher_forcing_ratio)
            total_loss = loss*x.size(0)
            total += x.size(0)
        if i%10 == 0:
            print("train loss %.3f" % (total_loss / total))   

In [44]:
input_size = input_lang.n_words
output_size = output_lang.n_words
hidden_size = 300
encoder = EncoderRNN(input_size, hidden_size).cuda()
decoder = DecoderRNN(output_size, hidden_size).cuda()
# same thing just twice
enc_optimizer = optim.Adam(encoder.parameters(), lr=0.01)
dec_optimizer = optim.Adam(decoder.parameters(), lr=0.01) 

In [45]:
batch_size= 1000
train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
valid_dl = DataLoader(valid_ds, batch_size=batch_size)

In [46]:
train(encoder, decoder, enc_optimizer, dec_optimizer, epochs = 20)

train loss 2.365
train loss 1.618


In [47]:
enc_optimizer = optim.Adam(encoder.parameters(), lr=0.001)
dec_optimizer = optim.Adam(decoder.parameters(), lr=0.001) 
train(encoder, decoder, enc_optimizer, dec_optimizer, epochs = 40)

train loss 1.128
train loss 1.058
train loss 0.338
train loss 0.299


In [48]:
train(encoder, decoder, enc_optimizer, dec_optimizer, epochs = 300, teacher_forcing_ratio=0.0)

train loss 0.818
train loss 0.836
train loss 0.806
train loss 0.738
train loss 0.808
train loss 0.657
train loss 0.688
train loss 0.580
train loss 0.561
train loss 0.540
train loss 0.636
train loss 0.624
train loss 0.588
train loss 0.484
train loss 0.490
train loss 0.501
train loss 0.489
train loss 0.463
train loss 0.445
train loss 0.478
train loss 0.389
train loss 0.412
train loss 0.348
train loss 0.462
train loss 0.416
train loss 0.544
train loss 0.332
train loss 0.380
train loss 0.500
train loss 0.417


In [49]:
train(encoder, decoder, enc_optimizer, dec_optimizer, epochs = 300, teacher_forcing_ratio=0.0)

train loss 0.335
train loss 0.292
train loss 0.306
train loss 0.311
train loss 0.280
train loss 0.345
train loss 0.372
train loss 0.290
train loss 0.262
train loss 0.355
train loss 0.258
train loss 0.352
train loss 0.252
train loss 0.444
train loss 0.236
train loss 0.238
train loss 0.257
train loss 0.266
train loss 0.240
train loss 0.237
train loss 0.248
train loss 0.321
train loss 0.247
train loss 0.169
train loss 0.208
train loss 0.207
train loss 0.206
train loss 0.244
train loss 0.198
train loss 0.172


Evaluation
==========

Evaluation is mostly the same as training, but there are no targets so
we simply feed the decoder's predictions back to itself for each step.
Every time it predicts a word we add it to the output string, and if it
predicts the EOS token we stop there. We also store the decoder's
attention outputs for display later.




* `model.eval()` will notify all your layers that you are in eval mode, that way, batchnorm or dropout layers will work in eval mode instead of training mode.
* `torch.no_grad()` impacts the autograd engine and deactivate it. It will reduce memory usage and speed up computations but you won’t be able to backprop (which you don’t want in an eval script).

In [50]:
# ec torch.no_grad() makes faster and more efficient
def decoding(x, y, encoder, decoder, max_length=MAX_LENGTH+2):
    decoder = decoder.eval()
    loss = 0
    with torch.no_grad():   
        batch_size = x.size(0)
        enc_outputs, hidden = encoder(x)
        dec_input = SOS_token*torch.ones(batch_size, 1).long().cuda()  # SOS
        decoded_words = []
        # ec decide in advance max length. how big are we going to allow the output to be?
        for di in range(1, max_length):
            output, hidden = decoder(dec_input, hidden)
            pred = output.argmax(dim=1) # ec this is hard prediction (index of right word)
            # ec bc we want to keep the prediction around
            decoded_words.append(pred.cpu().numpy())
            dec_input = output.argmax(dim=1).unsqueeze(1).detach()
            yi =  y[:, di]
            # without if you will get a None or NA(?) due to divide by zero
            if (yi>0).sum() > 0:
                # ignoring padding
                loss += F.cross_entropy(
                    output, yi, ignore_index = 0, reduction="sum")/(yi>0).sum()
        return loss.item()/batch_size, np.transpose(decoded_words)

In [51]:
batch_size=300
valid_dl_2 = DataLoader(valid_ds, batch_size=batch_size, shuffle=True)

x, y = next(iter(valid_dl_2)) 
x = x.long().cuda()
y = y.long().cuda()

loss, _ = decoding(x, y, encoder, decoder)
loss

0.14845184326171876

In [52]:
batch_size=5
train_dl_2 = DataLoader(train_ds, batch_size=batch_size, shuffle=True)

x, y = next(iter(train_dl_2)) 
x = x.long().cuda()
y = y.long().cuda()

We can evaluate random sentences from the training set and print out the
input, target, and output to make some subjective quality judgements:




In [53]:
def print_results(x, y, encoder, decoder):
    _, decoded_words = decoding(x, y, encoder, decoder)
    for i in range(x.shape[0]):
        xi = x[i].cpu().numpy()
        yi = y[i].cpu().numpy()
        y_hat = decoded_words[i]
        x_sent = ' '.join([input_lang.index2word[t] for t in xi if t > 3])
        y_sent = ' '.join([output_lang.index2word[t] for t in yi if t > 3])
        y_hat_sent = ' '.join([output_lang.index2word[t] for t in y_hat if t > 3])
        print('>', x_sent)
        print('=', y_sent)
        print('<', y_hat_sent)
        print('')

In [54]:
print_results(x, y, encoder, decoder)

> je suis quelqu un de bien .
= i m a nice guy .
< i m a nice guy .

> vous etes rusee .
= you re crafty .
< you re crafty .

> je suis un peu desoriente .
= i m a little confused .
< i m a little confused .

> actuellement je me trouve a l aeroport de narita .
= i m at narita airport right now .
< i m at narita airport right now .

> je suis juste ici .
= i m right here .
< i m just here .



In [55]:
batch_size=10
valid_dl_2 = DataLoader(valid_ds, batch_size=batch_size, shuffle=True)

x, y = next(iter(valid_dl_2)) 
x = x.long().cuda()
y = y.long().cuda()

In [56]:
print_results(x, y, encoder, decoder)

> je ne suis pas cette sorte de fille .
= i m not that kind of girl .
< i m not in a of girl .

> nous sommes en securite ici .
= we re safe here .
< we re here here . couple .

> ils le font correctement .
= they re doing it right .
< they re doing it right .

> j ai raison .
= i m right .
< i m correct .

> vous etes tres avises .
= you re very wise .
< you re very wise .

> nous nous marions .
= we re getting married .
< we re undressing .

> j ai une mauvaise impression .
= i m getting a bad feeling .
< i m a of . .

> ce n est pas le genre de type a abandonner facilement .
= he is not the sort of guy who gives in easily .
< he s not very well off at at s . .

> c est un homme cruel .
= he is a cruel person .
< he s a man of .

> ils ne sont pas plus semblables qu une vache a un canari .
= they are no more alike than a cow and a canary .
< they are as a rock rock band in . .



## Exercise
-  Replace the embeddings with pre-trained word embeddings. Here are word embeddings for various languages.

https://fasttext.cc/docs/en/crawl-vectors.html 

# Credits
The original notebook was written by Sean Robertson <https://github.com/spro/practical-pytorch>_