In [None]:
!pip install rouge-score

In [None]:
from __future__ import unicode_literals, print_function, division
import random
import math
import time
import numpy as np
import pandas as pd
import torch
import unicodedata
import re
import spacy
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
from torch.nn.utils.rnn import pad_packed_sequence, pack_padded_sequence
from sklearn.model_selection import train_test_split
from torch.nn.utils.rnn import pad_sequence

# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("device:", device)

# Seed every random number generator this notebook draws from, not only
# PyTorch's: `random` drives batch shuffling and the teacher-forcing coin flip,
# NumPy draws the random vectors for words missing from GloVe, and PyTorch
# initialises the model weights.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
# Request deterministic cuDNN kernels and switch off the cuDNN auto-tuner,
# which may otherwise pick different algorithms from run to run.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

In [None]:
def asMinutes(s):
    """Format a duration in seconds as ``'<minutes>m <seconds>s'``.

    Args:
        s: Duration in seconds (int or float).

    Returns:
        A string such as ``'2m 5s'``. Minutes are not folded into hours, and
        fractional seconds are truncated.
    """
    whole_minutes = math.floor(s / 60)
    leftover_seconds = s - whole_minutes * 60
    return '{:d}m {:d}s'.format(whole_minutes, int(leftover_seconds))

def timeSince(since, percent):
    """Report elapsed time and a linear estimate of the time still remaining.

    Args:
        since: Start timestamp as returned by ``time.time()``.
        percent: Fraction of the work already done. Despite the name this is a
            ratio (the training loop passes ``epoch_i / n_epochs``), and it
            must be non-zero.

    Returns:
        A string ``'<elapsed> (- <remaining>)'``, both parts formatted by
        ``asMinutes``.
    """
    elapsed = time.time() - since
    # If `percent` of the job took `elapsed` seconds, the whole job is
    # estimated at elapsed / percent seconds.
    estimated_total = elapsed / percent
    remaining = estimated_total - elapsed
    return '%s (- %s)' % (asMinutes(elapsed), asMinutes(remaining))

# Abstractive Text Summarization
- Automatic text summarization is the task of producing a coherent and fluent summary while preserving key information
- Abstractive summarization techniques generate completely new sentences, similar to the way humans do it.
- A common solution for this NLP task is using a Seq2seq deep learning model based on coupled RNNs and Attention Mechanism


# Dataset
- Almost 300,000 news articles like:

>    Sky have won the bidding war for the rights to screen Floyd Mayweather v Manny Pacquiao in the UK, as revealed by Sportsmail last Friday. The richest fight of all time will not come cheap either — for Sky Sports or their subscribers — even though Sky are keeping faith with their core following by keeping the base price below £20. It has taken what is described by industry insiders as ‘a very substantial offer’ for Sky to fend off fierce competition from Frank Warren’s BoxNation. Floyd Mayweather's hotly-anticipated bout with Manny Pacquiao will be shown on Sky Sports. Pacquiao headed for the playground after working out in Los Angeles previously. The price for the fight has been set at £19.95 until midnight of Friday May 1. The cost will remain the same for those paying via remote control or online, but will be £24.95 if booked via phone after Friday.Sky are flirting with their threshold of £20 by charging £19.95 a buy on their Sports Box Office channel until midnight on May 1, rising to £24.95 on May 2, the day of the fight in Las Vegas. Since they are understood to have broken past protocol by offering the US promoters a cut of that revenue as well as a hefty up-front payment, it is expected they will have to shatter the pay-per-view record in this country to break even. The current Sky record stands at 1.2million buys for Ricky Hatton’s Vegas loss to Mayweather in 2007. Warren is believed to have offered a higher lump sum than Sky in the hope of attracting another two million customers to his £12-a-month subscription channel. It is doubtful if Sky can reach that number at £20 per sale at 4am on a Sunday morning, but if they get 1.5m buys they should be out of the red. Mayweather continued to work on the pads in his Las Vegas gym as he prepares for the fight. Pacquiao will take on Mayweather at the MGM Grand in Las Vegas on May 2 in one of the biggest fights ever
>
>    **@highlight**
    Sky has been in fierce competition with Frank Warren's BoxNation
>
>    **@highlight**
    The broadcaster has won the right to show the $300m (£200m) bout
>
>    **@highlight**
    Sky has set the price for Floyd Mayweather vs Manny Pacquiao at £19.95
>
>    **@highlight**
    The mega-fight takes place at the MGM Grand in Las Vegas on May 2
>
>    **@highlight**
    Read how Jeff Powell broke the news of Sky's deal 
 


---
# Reading Data

- Extract all the articles' and summaries' text into one DataFrame
- Save data to a .csv file

In [None]:
import glob
from io import open

# Glob patterns that match every raw CNN / Daily Mail ``.story`` file.
path_CNN = r'../input/cnnsummarizationraw/cnn/stories' + '/*.story'
path_DailyMail = r'../input/dailymailsummarizationraw/dailymail/stories' + '/*.story'
# CSV file that DataReader.saveDataFrame writes and DataReader.readDataFrame reads.
path_CSV = r'../input/cnndm150/data_all_raw_500.csv'
# Pre-trained GloVe vectors in text format, read by Tokenizer.initGloVe
# (300-dimensional, judging by the file name).
path_GloVe=r'../input/glove6b/glove.6B.300d.txt'

class DataReader():
    """Load CNN / Daily Mail stories and convert them to a text/summary DataFrame.

    Each raw ``.story`` file holds the article body followed by one or more
    ``@highlight`` sections. The highlights together form the reference summary.
    """

    def readAllStories(self):
        """Read both corpora and return them as one DataFrame (CNN rows first)."""
        dailyMail = self.readStories(path_DailyMail)
        cnn = self.readStories(path_CNN)
        # DataFrame.append was removed in pandas 2.0; concat is the replacement.
        return pd.concat([cnn, dailyMail], ignore_index=True)

    def readStories(self, path):
        """Read every file matching the glob pattern ``path``.

        Args:
            path: Glob pattern, e.g. ``'stories/*.story'``.

        Returns:
            DataFrame with the columns ``text`` and ``summary``.
        """
        print("Reading stories from files:", path)

        all_stories = []
        for file_name in glob.glob(path):
            with open(file_name, 'rt', encoding="utf-8") as file:
                all_stories.append(file.read())

        return self.transformStoriesToDataFrame(all_stories)

    def transformStoriesToDataFrame(self, stories):
        """Split raw story strings into article text and summary.

        Args:
            stories: Iterable of raw story strings.

        Returns:
            DataFrame with the columns ``text`` (everything before the first
            ``@highlight``) and ``summary`` (the highlights joined by ``' . '``).
            Runs of whitespace are collapsed to single spaces in both columns.
        """
        print("Transforming stories to DataFrame")

        all_stories = []
        for story in stories:
            body, *highlights = story.split("@highlight")
            summary = " . ".join(highlights)
            all_stories.append({
                "text": " ".join(body.split()),
                "summary": " ".join(summary.split()),
            })

        # Naming the columns keeps the schema stable even when no story was found.
        return pd.DataFrame(all_stories, columns=["text", "summary"])

    def saveDataFrame(self, df):
        """Write ``df`` to ``path_CSV`` as a ';'-separated UTF-8 file without the index."""
        print("Saving stories to CSV:", path_CSV)
        df.to_csv(path_CSV, encoding='utf-8', index=False, sep=";")

    def readDataFrame(self):
        """Read the ';'-separated UTF-8 CSV at ``path_CSV`` into a DataFrame."""
        print("Reading stories from CSV:", path_CSV)
        return pd.read_csv(path_CSV, encoding='utf-8', sep=";")

In [None]:
# Read data: load the previously extracted stories from the CSV at `path_CSV`
# instead of re-parsing the raw .story files.
dataReader = DataReader()
data = dataReader.readDataFrame()
# Last expression of the cell: shows the first rows as a quick sanity check.
data.head()

---
# Pre-processing Data

- **Tokenizing** with SpaCy
- **Filtering** data by number of words
- Optional **cleaning**: removing stop words, punctuations and lemmatizing

In [None]:
# Upper bound on article length in whitespace-separated words (checked before
# normalisation, so normalised texts can be longer). Also the default limit on
# the number of decoding steps.
MAX_LENGTH = 300


class DataPreprocesser():
    """Length-filter, normalise and tokenise the ``text`` / ``summary`` columns.

    Args:
        rm_punctation: Drop tokens that spaCy marks as punctuation.
        rm_stop_words: Drop tokens that spaCy marks as stop words.
        lemmatizing: Emit each token's lemma instead of its surface form.
    """

    def __init__(self, rm_punctation=False, rm_stop_words=False, lemmatizing=False):
        self.removing_punctation = rm_punctation
        self.removing_stop_words = rm_stop_words
        self.lemmatizing = lemmatizing

        self.spacy = spacy.load('en_core_web_sm')

    def preprocessData(self, data):
        """Filter ``data`` by length, then normalise both text columns.

        Args:
            data: DataFrame with the string columns ``text`` and ``summary``.

        Returns:
            A new DataFrame holding only the rows that passed ``filterData``,
            with normalised ``text`` / ``summary`` and the added columns
            ``text_len`` / ``summary_len``. The input frame is not modified.
        """
        print("Preprocessing data")
        print(f" -Rm stopwords: {self.removing_stop_words}, Rm punct: {self.removing_punctation}, Lemmatizing: {self.lemmatizing}")

        # filterData returns an independent copy, so the column assignments
        # below never write into a slice of the caller's frame.
        data = self.filterData(data)

        data["text"] = data["text"].apply(self.normalizeText)
        data["summary"] = data["summary"].apply(self.normalizeText)
        return data

    def normalizeText(self, text):
        """Lower-case, strip accents, isolate symbols and re-join spaCy tokens.

        Args:
            text: Raw text; it is converted with ``str()`` first.

        Returns:
            The tokens (or lemmas, when lemmatizing) joined by single spaces.
        """
        text = self.unicodeToAscii(str(text).lower().strip())

        # Surround every character that is not a letter or digit with spaces so
        # it becomes a standalone token, then collapse whitespace runs to one.
        text = " ".join(re.sub(r"([^a-zA-Z0-9])", r" \1 ", text).split())

        tokens = list(self.spacy(text))

        if self.removing_punctation:
            tokens = [token for token in tokens if not token.is_punct]

        if self.removing_stop_words:
            tokens = [token for token in tokens if not token.is_stop]

        if self.lemmatizing:
            return " ".join(token.lemma_ for token in tokens)
        return " ".join(token.text for token in tokens)

    def filterData(self, data, max_length=MAX_LENGTH, min_length=10):
        """Keep only the rows whose word counts fall inside the given limits.

        A row is kept when ``min_length <= text_len <= max_length`` and
        ``min_length <= summary_len < text_len``, where both lengths count
        whitespace-separated words.

        Args:
            data: DataFrame with the columns ``text`` and ``summary``.
            max_length: Largest allowed article length in words.
            min_length: Smallest allowed article and summary length in words.

        Returns:
            A new DataFrame (the input is left untouched) with the surviving
            rows and the added columns ``text_len`` and ``summary_len``.
        """
        print("Number of stories:", len(data.index))

        def count_words(value):
            return len(str(value).split())

        # assign() builds a new frame, so the caller's DataFrame is not changed.
        data = data.assign(
            text_len=data["text"].apply(count_words),
            summary_len=data["summary"].apply(count_words),
        )

        keep = (
            (data["text_len"] <= max_length)
            & (data["text_len"] >= min_length)
            & (data["summary_len"] < data["text_len"])
            & (data["summary_len"] >= min_length)
        )
        # Explicit copy: later column assignments must not hit a slice of `data`.
        data = data[keep].copy()

        print(" -after length filtering:", len(data.index))
        return data

    # Turn a Unicode string to plain ASCII, thanks to # https://stackoverflow.com/a/518232/2809427
    def unicodeToAscii(self, s):
        """Remove combining marks (accents) after NFD decomposition.

        Characters that have no decomposition are returned unchanged.
        """
        return ''.join(c for c in unicodedata.normalize('NFD', s) if unicodedata.category(c) != 'Mn')

In [None]:
# Pre-process data: length-filter the stories, then normalise and tokenise the
# "text" and "summary" columns. All optional cleaning steps are switched off.
dataPreprocesser = DataPreprocesser(rm_stop_words=False, rm_punctation=False, lemmatizing=False)
data = dataPreprocesser.preprocessData(data)
data.head()

___
# Indexing & Embedding

- Indexing word tokens
- Calculate GloVe embedding vectors
- **word/token** for human representation -> **index** for model input -> **GloVe vector** for model embedding


In [None]:
# Reserved tokens and their fixed vocabulary indexes.
PAD_token = "<PAD>"
SOS_token = "<SOS>"
EOS_token = "<EOS>"
UNK_token = "<UNK>"
PAD_index = 0
SOS_index = 1
EOS_index = 2
UNK_index = 3


class Tokenizer():
    """Word-level vocabulary mapping tokens to indexes and back.

    Attributes are plain dictionaries:
      - word2index: token -> index (the four reserved tokens come first)
      - index2word: index -> token
      - word2count: token -> number of occurrences seen by ``addWord``
      - n_words:    vocabulary size, reserved tokens included
    """

    def __init__(self):
        self.word2index = {
            PAD_token: PAD_index,
            SOS_token: SOS_index,
            EOS_token: EOS_index,
            UNK_token: UNK_index,
        }
        self.word2count = {}
        self.index2word = {index: word for word, index in self.word2index.items()}
        self.n_words = len(self.word2index)  # Count PAD, SOS, EOS and UNK

    def tokenizeData(self, data):
        """Add every whitespace-separated word of ``data['text']`` and ``data['summary']``."""
        print(f"Tokenizing data")

        # Create and store word indexes from texts
        for text, summary in zip(data["text"], data["summary"]):
            for word in text.split():
                self.addWord(word)
            for word in summary.split():
                self.addWord(word)

    def addWord(self, word):
        """Register one occurrence of ``word``, assigning a new index if it is unseen.

        Reserved tokens are already indexed but start without a count, so the
        count is read with a default instead of assuming the key exists.
        """
        if word not in self.word2index:
            self.word2index[word] = self.n_words
            self.index2word[self.n_words] = word
            self.n_words += 1
        self.word2count[word] = self.word2count.get(word, 0) + 1

    def indexesFromText(self, text):
        """Convert ``text`` to a list of word indexes terminated by ``EOS_index``.

        Words that are not in the vocabulary map to ``UNK_index``.
        """
        indexes = [self.word2index.get(word, UNK_index) for word in str(text).split()]
        indexes.append(EOS_index)
        return indexes

    def initGloVe(self):
        """Load the GloVe vectors from ``path_GloVe`` into ``self.word2gloVe``.

        Also sets ``self.weights_len`` to the vector dimension and adds vectors
        for the reserved tokens (ones for SOS/EOS/UNK, zeros for PAD).

        Raises:
            ValueError: If the file contains no vectors.
        """
        print("Initialing GloVe tokenizer with:", path_GloVe)
        # Initialize
        self.word2gloVe = {}
        self.weights_len = None

        # Loading GloVe vectors
        # GloVe vectors' len (50...300) and hidden size should be equal for this embedding
        with open(path_GloVe, 'rt', encoding="utf-8") as file:
            for line in file:
                values = line.split()
                if not values:
                    continue  # tolerate blank lines
                word = values[0]  # the word
                vector = np.asarray(values[1:], dtype="float32")  # its embedding

                self.word2gloVe[word] = vector

                if self.weights_len is None:
                    self.weights_len = len(vector)

        if self.weights_len is None:
            raise ValueError(f"No GloVe vectors found in {path_GloVe}")

        # Set up vectors for PAD, SOS, EOS and UNK
        self.word2gloVe[SOS_token] = np.ones((self.weights_len, ), dtype="float32")
        self.word2gloVe[EOS_token] = np.ones((self.weights_len, ), dtype="float32")
        self.word2gloVe[UNK_token] = np.ones((self.weights_len, ), dtype="float32")
        self.word2gloVe[PAD_token] = np.zeros((self.weights_len, ), dtype="float32")

    def getGloVeEmbedding(self):
        """Build the embedding weight matrix for the current vocabulary.

        Returns:
            Float tensor of shape ``(n_words, weights_len)`` on ``device``.
            Row ``i`` is the GloVe vector of ``index2word[i]``; words missing
            from GloVe get a random normal vector (std 0.6). This is a raw
            weight tensor, not an ``nn.Module``.
        """
        # Load GloVe vectors for calculating pretrained embeddings
        self.initGloVe()

        # Initialize the embedding matrix
        weights_matrix = np.zeros((self.n_words, self.weights_len))

        # Create embedding matrix from GloVe weights
        for word, index in self.word2index.items():
            vector = self.word2gloVe.get(word)
            if vector is None:
                # Random weights for out-of-vocabulary tokens
                vector = np.random.normal(scale=0.6, size=(self.weights_len, ))
            weights_matrix[index] = vector

        return torch.tensor(weights_matrix, dtype=torch.float, device=device)
    

In [None]:
# Convert data to word indexes: build one vocabulary over both the articles
# and the summaries.
tokenizer = Tokenizer()
tokenizer.tokenizeData(data)
# Smoke test: index a sample sentence (the result ends with EOS_index).
tokenizer.indexesFromText("san lorenzo will play real madrid in saturday")

---
# Data Loader

- **Split** dataset to train/validation/test set
- **Sort** datasets by decreasing lengths
- Data **batch generator** for train/validation/test iterations
    - with optional batch shuffling
    - padding properly for PyTorch



In [None]:
class TextSummaryData():
    """Train/validation/test splits plus a padded mini-batch generator.

    Construction indexes the ``text`` and ``summary`` columns of ``data`` (four
    helper columns are added to ``data`` in place), splits the rows into 90%
    train, 5% validation and 5% test, and sorts every split by decreasing text
    length so each batch can be packed for the encoder.

    Args:
        data: DataFrame with the string columns ``text`` and ``summary``.
        textTokenizer: Object whose ``indexesFromText`` indexes the articles.
        summaryTokenizer: Object whose ``indexesFromText`` indexes the summaries.
        batch_size: Number of rows per batch.
        shuffle: When true, batches are yielded in random order.
        pad_index: Index used to pad shorter sequences.
    """

    def __init__(self, data, textTokenizer, summaryTokenizer, batch_size=1, shuffle=False, pad_index=PAD_index):
        # Index vectors and their lengths are stored as new columns of `data`.
        data["text_vec"] = data["text"].apply(textTokenizer.indexesFromText)
        data["summary_vec"] = data["summary"].apply(summaryTokenizer.indexesFromText)
        data["text_vec_len"] = data["text_vec"].apply(len)
        data["summary_vec_len"] = data["summary_vec"].apply(len)

        # First cut: 10% held out, 90% training. The held-out part is then
        # halved into the test and validation sets.
        held_out, train = train_test_split(data, test_size=0.9, random_state=SEED, shuffle=True)
        test, validation = train_test_split(held_out, test_size=0.5, random_state=SEED)

        def longest_first(frame):
            return frame.sort_values(by=["text_vec_len"], ascending=False)

        self.train_data = longest_first(train)
        self.validation_data = longest_first(validation)
        self.test_data = longest_first(test)

        # Bookkeeping used by the training loop
        self.batch_size = batch_size
        self.train_batches = math.ceil(len(train.index) / batch_size)
        self.validation_batches = math.ceil(len(validation.index) / batch_size)
        self.test_batches = math.ceil(len(test.index) / batch_size)
        self.pad_index = pad_index
        self.shuffle = shuffle

    def batch(self, mode="train"):
        """Yield padded batches of the chosen split.

        Args:
            mode: ``"test"`` or ``"validation"``; any other value selects the
                training split.

        Yields:
            The tuples produced by ``padding_Batch``.
        """
        if mode == "test":
            frame = self.test_data
        elif mode == "validation":
            frame = self.validation_data
        else:
            frame = self.train_data

        starts = range(0, len(frame.index), self.batch_size)
        if self.shuffle:
            # Only the order of the batches changes; rows stay sorted inside each one.
            starts = random.sample(starts, len(starts))

        for start in starts:
            yield self.padding_Batch(frame.iloc[start: start + self.batch_size])

    def padding_Batch(self, batch):
        """Turn a slice of rows into padded tensors.

        Returns:
            ``(texts_data, summaries_data, current_batch_size)`` where each
            ``*_data`` is ``(padded, lengths, max_length)``: ``padded`` is the
            ``pad_sequence`` result on ``device``, ``lengths`` is an int64 CPU
            tensor and ``max_length`` is its maximum as a 0-d tensor.
        """
        text_tensors = [self.createBatchTensor(vec) for vec in batch["text_vec"].values]
        summary_tensors = [self.createBatchTensor(vec) for vec in batch["summary_vec"].values]

        text_lengths = self.createLengthsTensor(batch["text_vec_len"].values)
        summary_lengths = self.createLengthsTensor(batch["summary_vec_len"].values)

        texts_data = (
            pad_sequence(text_tensors, padding_value=self.pad_index),
            text_lengths,
            torch.max(text_lengths),
        )
        summaries_data = (
            pad_sequence(summary_tensors, padding_value=self.pad_index),
            summary_lengths,
            torch.max(summary_lengths),
        )
        return texts_data, summaries_data, len(batch)

    def createBatchTensor(self, vectors):
        """Return ``vectors`` as a long tensor on the compute device."""
        return torch.tensor(vectors, dtype=torch.long, device=device)

    def createLengthsTensor(self, vectors):
        """Return ``vectors`` as an int64 tensor on the CPU."""
        return torch.tensor(vectors, dtype=torch.int64, device=torch.device("cpu"))

In [None]:
# The same tokenizer (one shared vocabulary) indexes both texts and summaries.
dataLoader = TextSummaryData(data, tokenizer, tokenizer, batch_size=32, shuffle=False)
# Smoke test: iterate the training split once and print the size of each batch.
for batch in dataLoader.batch():
    print("batch size:", batch[2])
    
# TextSummaryData added the index-vector and length columns to `data` in place.
data.head()

---
# Model

- Encoder-Decoder Seq2seq with Attention Mechanism

## Encoder

- Embedding
- Bi-LSTM with a given number of layers

In [None]:
class Encoder(nn.Module):
    """Bidirectional LSTM encoder over embedded token indexes.

    Args:
        hidden_size: Size of the LSTM hidden state. The LSTM input size is set
            to the same value, so ``embedding`` must produce vectors of this size.
        embedding: Module that maps token indexes to vectors.
        n_layers: Number of stacked LSTM layers.
        dropout: Dropout between LSTM layers; forced to 0 for a single layer.
    """

    def __init__(self, hidden_size, embedding, n_layers=1, dropout=0.1):
        super().__init__()
        self.n_layers = n_layers
        self.hidden_size = hidden_size
        self.embedding = embedding

        # Dropout is only passed on when there is more than one layer.
        inter_layer_dropout = dropout if n_layers != 1 else 0
        self.lstm = nn.LSTM(
            input_size=hidden_size,
            hidden_size=hidden_size,
            num_layers=n_layers,
            bidirectional=True,
            dropout=inter_layer_dropout,
        )
        # Projects the concatenated forward/backward outputs back to hidden_size.
        self.dense = nn.Linear(in_features=hidden_size * 2, out_features=hidden_size)

    def forward(self, input_seq, input_lengths):
        """Encode a padded batch.

        Args:
            input_seq: Padded token indexes, time-major ``(max_len, batch)``.
            input_lengths: True sequence lengths, passed to
                ``pack_padded_sequence`` with its default settings.

        Returns:
            ``(outputs, hidden, cell)``: ``outputs`` is ``(max_len, batch,
            hidden_size)`` after the projection; ``hidden`` and ``cell`` are the
            LSTM's final states, unchanged.
        """
        packed_input = pack_padded_sequence(self.embedding(input_seq), input_lengths)
        packed_output, (hidden, cell) = self.lstm(packed_input)
        padded_output, _ = pad_packed_sequence(packed_output)
        return self.dense(padded_output), hidden, cell

## Attention

- **Luong attention mechanism** with 3 different attention calculations:
    - dot 
    - concat 
    - general 

In [None]:
# Luong attention layer
class Attention(nn.Module):
    """Luong-style global attention with the 'dot', 'general' or 'concat' score.

    Args:
        hidden_size: Feature size of the decoder state and the encoder outputs.
        method: One of ``'dot'``, ``'general'`` or ``'concat'``.

    Raises:
        ValueError: If ``method`` is not one of the supported names.

    NOTE: the softmax runs over every encoder position, padded ones included;
    no length mask is applied.
    """

    def __init__(self, hidden_size, method='dot'):
        super(Attention, self).__init__()
        if method not in ['dot', 'general', 'concat']:
            raise ValueError(method, "is not an appropriate attention method.")
        self.method = method
        self.hidden_size = hidden_size

        if method == 'general':
            self.attn = nn.Linear(self.hidden_size, hidden_size)
        elif method == 'concat':
            self.attn = nn.Linear(self.hidden_size * 2, hidden_size)
            # torch.FloatTensor(n) only allocates memory and may contain NaN or
            # huge values, so the vector is initialised explicitly, using the
            # same 1/sqrt(fan_in) range as nn.Linear.
            bound = hidden_size ** -0.5
            self.v = nn.Parameter(torch.empty(hidden_size).uniform_(-bound, bound))

    def dot_score(self, hidden, encoder_output):
        """score = <hidden, encoder_output>, summed over the feature axis."""
        return torch.sum(hidden * encoder_output, dim=2)

    def general_score(self, hidden, encoder_output):
        """score = <hidden, W * encoder_output>."""
        energy = self.attn(encoder_output)
        return torch.sum(hidden * energy, dim=2)

    def concat_score(self, hidden, encoder_output):
        """score = <v, tanh(W * [hidden; encoder_output])>."""
        # Repeat the single decoder step along the time axis of the encoder outputs.
        expanded_hidden = hidden.expand(encoder_output.size(0), -1, -1)
        energy = self.attn(torch.cat((expanded_hidden, encoder_output), 2)).tanh()
        return torch.sum(self.v * energy, dim=2)

    def forward(self, hidden, encoder_outputs):
        """Compute attention weights for one decoder step.

        Args:
            hidden: Decoder output of the current step, ``(1, batch, hidden_size)``.
            encoder_outputs: Encoder outputs, ``(max_len, batch, hidden_size)``.

        Returns:
            Weights of shape ``(batch, 1, max_len)`` that sum to 1 over ``max_len``.
        """
        # Calculate the attention energies, shape (max_len, batch)
        if self.method == 'general':
            attn_energies = self.general_score(hidden, encoder_outputs)
        elif self.method == 'concat':
            attn_energies = self.concat_score(hidden, encoder_outputs)
        else:  # 'dot' (validated in __init__)
            attn_energies = self.dot_score(hidden, encoder_outputs)

        # Transpose max_length and batch_size dimensions
        attn_energies = attn_energies.t()
        # Return the softmax normalized probability scores (with added dimension)
        return F.softmax(attn_energies, dim=1).unsqueeze(1)

## Decoder

- Embedding
- Uni-LSTM with optional number of layers
- Attention layer


In [None]:
class AttnDecoder(nn.Module):
    """Unidirectional LSTM decoder with attention, run one time step per call.

    Args:
        hidden_size: Size of the LSTM state; also the LSTM input size, so
            ``embedding`` must produce vectors of this size.
        output_size: Number of output classes (the vocabulary size).
        embedding: Module that maps token indexes to vectors.
        attn_model: Score method name handed to ``Attention``.
        n_layers: Number of stacked LSTM layers.
        dropout: Dropout between LSTM layers; forced to 0 for a single layer.
    """

    def __init__(self, hidden_size, output_size, embedding, attn_model, n_layers=1, dropout=0.1):
        super().__init__()

        # Configuration, kept for reference
        self.attn_model = attn_model
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.n_layers = n_layers
        self.dropout = dropout

        # Layers
        self.embedding = embedding
        inter_layer_dropout = dropout if n_layers != 1 else 0
        self.lstm = nn.LSTM(
            input_size=hidden_size,
            hidden_size=hidden_size,
            num_layers=n_layers,
            dropout=inter_layer_dropout,
        )
        self.concat = nn.Linear(in_features=hidden_size * 2, out_features=hidden_size)
        self.out = nn.Linear(in_features=hidden_size, out_features=output_size)

        self.attn = Attention(hidden_size, attn_model)

    def forward(self, input_step, last_hidden, last_cell, encoder_outputs):
        """Decode a single time step.

        Args:
            input_step: Current input tokens, ``(1, batch)``.
            last_hidden: Previous LSTM hidden state.
            last_cell: Previous LSTM cell state.
            encoder_outputs: Encoder outputs, ``(max_len, batch, hidden_size)``.

        Returns:
            ``(output, hidden, cell)`` where ``output`` holds the
            log-probabilities over the vocabulary, ``(batch, output_size)``,
            and ``hidden`` / ``cell`` are the updated LSTM states.
        """
        step_embedding = self.embedding(input_step)

        # One step through the LSTM (an LSTMCell would work here as well)
        lstm_out, (hidden, cell) = self.lstm(step_embedding, (last_hidden, last_cell))

        # Attention weights from the current LSTM output, then the weighted
        # sum of the encoder outputs as the context vector
        weights = self.attn(lstm_out, encoder_outputs)
        context = weights.bmm(encoder_outputs.transpose(0, 1))

        # Luong eq. 5: combine the LSTM output and the context vector
        combined = torch.cat((lstm_out.squeeze(0), context.squeeze(1)), dim=1)
        attentional = torch.tanh(self.concat(combined))

        # Luong eq. 6: distribution over the next word
        log_probs = F.log_softmax(self.out(attentional), dim=1)
        return log_probs, hidden, cell

---
# Training

### One Training Iteration
- passing through: encoder in one step, decoder step by step
- **Teacher forcing** for more efficient training
- **Gradient clipping** to avoid exploding gradients

In [None]:
def trainStep(input_batch, input_lens, target_batch, target_lens, target_max_len, batch_size,
          encoder, decoder, encoder_optimizer, decoder_optimizer, criterion, clip, max_length=MAX_LENGTH, teacher_forcing_ratio=0.5):
    """Run one optimisation step on a single batch.

    Args:
        input_batch: Padded source tokens, ``(max_input_len, batch)``.
        input_lens: Source lengths handed to the encoder.
        target_batch: Padded target tokens, ``(target_max_len, batch)``.
        target_lens: Target lengths; only their sum is used, to scale the
            returned loss.
        target_max_len: Number of decoding steps (int or 0-d tensor).
        batch_size: Number of sequences in the batch.
        encoder, decoder: The models to train.
        encoder_optimizer, decoder_optimizer: Their optimizers.
        criterion: Loss applied to each step's log-probabilities.
        clip: Maximum gradient norm, applied to each model separately.
        max_length: Unused; kept for interface compatibility.
        teacher_forcing_ratio: Probability of teacher forcing the whole batch.

    Returns:
        The summed step losses divided by ``target_lens.sum()`` (a tensor).
    """
    # Initialize loss
    loss = 0
    # Zero gradients
    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    # Forward pass through encoder
    encoder_outputs, encoder_hidden, encoder_cell = encoder(input_batch, input_lens)

    # Create initial decoder input (start with SOS tokens for each sentence)
    decoder_input = torch.tensor([[SOS_index] * batch_size], device=device, dtype=torch.long)
    # The decoder starts from the last `decoder.n_layers` entries of the
    # encoder's final states.
    decoder_hidden = encoder_hidden[-decoder.n_layers:]
    decoder_cell = encoder_cell[-decoder.n_layers:]

    # One decision per batch: teacher forcing is used for every step or for none
    use_teacher_forcing = random.random() < teacher_forcing_ratio

    # Forward batch of sequences through decoder one time step at a time
    for t in range(int(target_max_len)):
        decoder_output, decoder_hidden, decoder_cell = decoder(
            decoder_input, decoder_hidden, decoder_cell, encoder_outputs
        )

        if use_teacher_forcing:
            # Teacher forcing: next input is current target
            decoder_input = target_batch[t].view(1, -1)  # (+adding new dim around)
        else:
            # No teacher forcing: next input is the decoder's own best guess.
            # The whole batch is reshaped at once instead of copying one
            # element at a time through Python.
            _, topi = decoder_output.topk(1)
            decoder_input = topi.detach().squeeze(1).unsqueeze(0)

        # Calculate and accumulate loss
        loss += criterion(decoder_output, target_batch[t])

    # Perform backpropagation
    loss.backward()

    # Clip gradients to avoid exploding gradients
    _ = nn.utils.clip_grad_norm_(encoder.parameters(), clip)
    _ = nn.utils.clip_grad_norm_(decoder.parameters(), clip)

    # Adjust model weights
    encoder_optimizer.step()
    decoder_optimizer.step()

    # NOTE(review): if `criterion` already averages over the batch (the default
    # of nn.NLLLoss), this divides a sum of per-step means by the total token
    # count, so the value is a relative indicator rather than a true per-token
    # loss. Left unchanged so reported numbers stay comparable.
    return loss / target_lens.sum()

### One Validation Iteration

- Same as training, but without backward propagation and teacher forcing 

In [None]:
def validationStep(input_batch, input_lens, target_batch, target_lens, target_max_len, batch_size,
          encoder, decoder, criterion, max_length=MAX_LENGTH):
    """Compute the loss of one batch with greedy decoding and no weight update.

    The caller is responsible for putting the models in eval mode and for
    disabling gradient tracking.

    Args:
        input_batch: Padded source tokens, ``(max_input_len, batch)``.
        input_lens: Source lengths handed to the encoder.
        target_batch: Padded target tokens, ``(target_max_len, batch)``.
        target_lens: Target lengths; only their sum is used, to scale the
            returned loss.
        target_max_len: Number of decoding steps (int or 0-d tensor).
        batch_size: Number of sequences in the batch.
        encoder, decoder: The models to evaluate.
        criterion: Loss applied to each step's log-probabilities.
        max_length: Unused; kept for interface compatibility.

    Returns:
        The summed step losses divided by ``target_lens.sum()`` (a tensor).
    """
    # Initialize loss
    loss = 0

    # Forward pass through encoder
    encoder_outputs, encoder_hidden, encoder_cell = encoder(input_batch, input_lens)

    # Create initial decoder input (start with SOS tokens for each sentence)
    decoder_input = torch.tensor([[SOS_index] * batch_size], device=device, dtype=torch.long)

    # The decoder starts from the last `decoder.n_layers` entries of the
    # encoder's final states.
    decoder_hidden = encoder_hidden[-decoder.n_layers:]
    decoder_cell = encoder_cell[-decoder.n_layers:]

    for t in range(int(target_max_len)):
        decoder_output, decoder_hidden, decoder_cell = decoder(
            decoder_input, decoder_hidden, decoder_cell, encoder_outputs
        )
        # No teacher forcing: next input is the decoder's own best guess,
        # reshaped for the whole batch at once (no per-element Python loop).
        _, topi = decoder_output.topk(1)
        decoder_input = topi.detach().squeeze(1).unsqueeze(0)

        # Calculate and accumulate loss
        loss += criterion(decoder_output, target_batch[t])

    return loss / target_lens.sum()

### The Whole Training

- Iterating through epochs and batches
- **Validation**, **early stopping** with patience
- Optional learning rate scheduler

In [None]:
def trainIters(dataLoader, 
              encoder, decoder, encoder_optimizer, decoder_omptimzer, encoder_scheduler=None, decoder_scheduler=None, 
              clip=0.5, n_epochs=1, stopping_patience=5):
    """Train for up to ``n_epochs`` epochs with validation and early stopping.

    Args:
        dataLoader: Provides ``batch(mode=...)``, ``train_batches``,
            ``validation_batches`` and ``batch_size``.
        encoder, decoder: The models to train.
        encoder_optimizer: Optimizer of the encoder.
        decoder_omptimzer: Optimizer of the decoder. The misspelled name is kept
            so existing keyword callers keep working.
        encoder_scheduler, decoder_scheduler: Optional learning-rate
            schedulers, stepped once per epoch.
        clip: Maximum gradient norm passed to ``trainStep``.
        n_epochs: Maximum number of epochs.
        stopping_patience: Training stops once the validation loss has failed
            to improve for more than this many consecutive epochs.
    """
    # The body previously read `decoder_optimizer`, which does not match the
    # parameter name and so silently resolved to a global; bind the argument.
    decoder_optimizer = decoder_omptimzer

    print("\nTrainig started:")
    print(f" -number of epochs: {n_epochs}, number of batches: {dataLoader.train_batches}, batch size: {dataLoader.batch_size}")

    # Initializations for early stopping
    start = time.time()
    less_val_loss = float('inf')
    epochs_no_improve = 0

    criterion = nn.NLLLoss(ignore_index=PAD_index)

    # Training loop per epoch
    for epoch_i in range(1, n_epochs+1):
        # Loss accumulators are reset at the start of every epoch
        tr_losses = 0
        val_losses = 0

        # Ensure models are in train mode
        encoder.train()
        decoder.train()

        # Training loop per batch
        for batch_data in dataLoader.batch():
            # Pack out the batch of data
            input_data, target_data, batch_size = batch_data
            input_batch, input_lens, max_input_len = input_data
            target_batch, target_lens, max_target_len = target_data

            # Run a training iteration with a batch
            loss = trainStep(input_batch, input_lens, target_batch, target_lens, max_target_len, batch_size,
                         encoder, decoder, encoder_optimizer, decoder_optimizer, criterion, clip)
            tr_losses += loss.detach().item()

        # Step with learning rate schedulers
        if encoder_scheduler is not None:
            encoder_scheduler.step()
        if decoder_scheduler is not None:
            decoder_scheduler.step()

        # Ensure models are in evaluation mode
        encoder.eval()
        decoder.eval()

        # Validation loop per batch
        with torch.no_grad():
            for batch_data in dataLoader.batch(mode="validation"):
                # Packing out the batch of data
                input_data, target_data, batch_size = batch_data
                input_batch, input_lens, max_input_len = input_data
                target_batch, target_lens, max_target_len = target_data

                loss = validationStep(input_batch, input_lens, target_batch, target_lens, max_target_len, batch_size,
                                      encoder, decoder, criterion)
                val_losses += loss.detach().item()

        # Print one epoch's progress
        avg_tr_loss = tr_losses / dataLoader.train_batches
        avg_val_loss = val_losses / dataLoader.validation_batches
        print(" {}   Epoch: {} ({:.1f}%); Train loss: {:.4f}; Validation loss: {:.4f}"
              .format(timeSince(start, epoch_i / n_epochs), epoch_i, epoch_i / n_epochs * 100, avg_tr_loss, avg_val_loss))

        # Early stopping: an epoch counts as "no improvement" only when its
        # summed validation loss is strictly above the best one seen so far.
        if val_losses > less_val_loss:
            epochs_no_improve += 1
            if epochs_no_improve > stopping_patience:
                print(f"Training stopped early after {epoch_i} of {n_epochs} epochs!")
                break
        else:
            epochs_no_improve = 0
            less_val_loss = val_losses

    print("Trainig finished")

---
# Evaluation
- Evaluating with human readable text outputs
- Testing with **Rouge** scores


In [None]:
from rouge_score import rouge_scorer

def evaluate(input_text, encoder, decoder, encoderTokenizer, decoderTokenizer, max_length=MAX_LENGTH):
    """Greedily decode a summary for one input text.

    Args:
        input_text: Pre-processed article text (whitespace-separated tokens).
        encoder, decoder: Trained models; both are switched to eval mode.
        encoderTokenizer: Tokenizer used to index ``input_text``.
        decoderTokenizer: Tokenizer whose ``index2word`` decodes the output.
        max_length: Maximum number of decoding steps.

    Returns:
        ``(summary, scores)``: the decoded words joined by spaces, and a 1-D
        tensor with the log-probability of each emitted word. Decoding stops at
        the first ``EOS_index``; the end marker itself is not part of either
        result, so it can neither show up in the printed summary nor count as a
        word when the summary is scored.
    """
    # Ensure models are in evaluation mode
    encoder.eval()
    decoder.eval()

    with torch.no_grad():
        # Index the text and shape it as a batch of one: (seq_len, 1)
        input_indexes = encoderTokenizer.indexesFromText(input_text)
        input_batch = torch.tensor(input_indexes, dtype=torch.long, device=device).unsqueeze(1)
        input_length = torch.tensor([input_batch.size(0)], dtype=torch.int64, device=torch.device("cpu"))

        # Forward input through encoder model
        encoder_outputs, encoder_hidden, encoder_cell = encoder(input_batch, input_length)

        # The decoder starts from the last `decoder.n_layers` entries of the
        # encoder's final states.
        decoder_hidden = encoder_hidden[-decoder.n_layers:]
        decoder_cell = encoder_cell[-decoder.n_layers:]

        # Initialize decoder input with SOS_index
        decoder_input = torch.tensor([[SOS_index]], device=device, dtype=torch.long)

        decoded_indexes = []
        step_scores = []

        # Iteratively decode one word token at a time
        for _ in range(max_length):
            # Forward pass through decoder
            decoder_output, decoder_hidden, decoder_cell = decoder(
                decoder_input, decoder_hidden, decoder_cell, encoder_outputs
            )
            # Most likely token and its log-probability
            decoder_scores, best_token = torch.max(decoder_output, dim=1)

            token_index = best_token.item()
            if token_index == EOS_index:
                # End of summary: stop before recording the marker
                break

            decoded_indexes.append(token_index)
            step_scores.append(decoder_scores)
            # Prepare current token to be next decoder input (add a dimension)
            decoder_input = torch.unsqueeze(best_token, 0)

        if step_scores:
            all_scores = torch.cat(step_scores, dim=0)
        else:
            all_scores = torch.zeros([0], device=device)

        # Compute words from output indexes
        decoded_words = [decoderTokenizer.index2word[index] for index in decoded_indexes]
        return ' '.join(decoded_words), all_scores


def testIters(dataLoader, encoder, decoder, encoderTokenizer, decoderTokenizer):
    """Print the mean ROUGE-1, ROUGE-2 and ROUGE-L F-measures over the test split.

    Args:
        dataLoader: Object whose ``test_data`` frame has the columns ``text``
            and ``summary``.
        encoder, decoder: Trained models handed to ``evaluate``.
        encoderTokenizer, decoderTokenizer: Tokenizers handed to ``evaluate``.
    """
    print("\nTesting:")

    metrics = ["rouge1", "rouge2", "rougeL"]  # choosing Rouge metric types
    scorer = rouge_scorer.RougeScorer(metrics, use_stemmer=True)  # configure the scoring module
    # Plain lists: appending is O(1), whereas np.append copies the whole array
    # on every call.
    results = {metric: [] for metric in metrics}

    # Loop iteration through test set and evaluation
    test = dataLoader.test_data
    for text, target_summary in zip(test["text"], test["summary"]):

        # Evaluate text summarization
        predicted_summary, _ = evaluate(text, encoder, decoder, encoderTokenizer, decoderTokenizer)
        # Calculate score between prediction and target summary
        scores = scorer.score(target_summary, predicted_summary)

        for metric in metrics:  # record rouge scores
            results[metric].append(scores[metric].fmeasure)

    for metric in metrics:
        avg = np.mean(results[metric])
        print(" -{:s}: {:.4f}".format(metric, avg))
    

def evaluateRandomly(data, encoder, decoder, encoderTokenizer, decoderTokenizer, n=1):
    """Print text, target summary and predicted summary for ``n`` sampled rows.

    Args:
        data: DataFrame with the columns ``text`` and ``summary``.
        encoder, decoder: Trained models handed to ``evaluate``.
        encoderTokenizer, decoderTokenizer: Tokenizers handed to ``evaluate``.
        n: Number of rows to sample; the sample is drawn with
            ``random_state=SEED``, so it is the same on every call.
    """
    print("\nEvaluation:")

    samples = data.sample(n=n, random_state=SEED)
    for text, target_summary in zip(samples["text"], samples["summary"]):
        # Summarise the sampled text; the per-word scores are not needed here
        predicted_summary, _ = evaluate(text, encoder, decoder, encoderTokenizer, decoderTokenizer)

        print('\n > TEXT:')
        print(text)
        print(' = TARGET SUMMARY:')
        print(target_summary)
        print(' < PREDICTED SUMMARY:')
        print(predicted_summary)

In [None]:
# Model hyper-parameters
attn_model = 'dot' # concat/general
hidden_size = 300
n_layers = 1
dropout = 0.1
# Trainable embeddings learned from scratch; index PAD_index is the padding row.
encoderEmbedding = nn.Embedding(tokenizer.n_words, hidden_size, padding_idx=PAD_index)
# NOTE(review): getGloVeEmbedding returns a weight tensor, not a module, so it
# would need wrapping before it could replace the nn.Embedding above.
# encoderEmbedding = tokenizer.getGloVeEmbedding()
decoderEmbedding = nn.Embedding(tokenizer.n_words, hidden_size, padding_idx=PAD_index)
# decoderEmbedding = tokenizer.getGloVeEmbedding()

# Initialize encoder & decoder models
encoder = Encoder(hidden_size, encoderEmbedding, n_layers, dropout)
decoder = AttnDecoder(hidden_size, tokenizer.n_words, decoderEmbedding, attn_model, n_layers, dropout)
# Use appropriate device
encoder = encoder.to(device)
decoder = decoder.to(device)

# Configure training: one Adam optimizer per model, same learning rate
learning_rate = 0.0005
encoder_optimizer = optim.Adam(encoder.parameters(), lr=learning_rate)
decoder_optimizer = optim.Adam(decoder.parameters(), lr=learning_rate)

# Run training iterations with given parameters
trainIters(dataLoader, encoder, decoder, encoder_optimizer, decoder_optimizer,n_epochs=5)

# Run testing (ROUGE scores on the test split)
testIters(dataLoader, encoder, decoder, tokenizer, tokenizer)
    
# Run readable evaluation on 5 sampled rows of the full data set
evaluateRandomly(data, encoder, decoder, tokenizer, tokenizer, n=5)