In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset, DataLoader

import numpy as np
import pandas as pd
import random
from torch.utils.tensorboard import SummaryWriter
# from utils import translate_sentence, bleu, save_checkpoint, load_checkpoint

## LOAD DATA

In [2]:
# val_df = pd.read_csv("./cnn_dailymail/validation.csv")
# test_df = pd.read_csv("./cnn_dailymail/test.csv")
train_df = pd.read_csv("./cnn_dailymail/train.csv")
train_df.head()

Unnamed: 0,id,article,highlights
0,0001d1afc246a7964130f43ae940af6bc6c57f01,By . Associated Press . PUBLISHED: . 14:11 EST...,"Bishop John Folda, of North Dakota, is taking ..."
1,0002095e55fcbd3a2f366d9bf92a95433dc305ef,(CNN) -- Ralph Mata was an internal affairs li...,Criminal complaint: Cop used his role to help ...
2,00027e965c8264c35cc1bc55556db388da82b07f,A drunk driver who killed a young woman in a h...,"Craig Eccleston-Todd, 27, had drunk at least t..."
3,0002c17436637c4fe1837c935c04de47adb18e9a,(CNN) -- With a breezy sweep of his pen Presid...,Nina dos Santos says Europe must be ready to a...
4,0003ad6ef0c37534f80b55b4235108024b407f0b,Fleetwood are the only team still to have a 10...,Fleetwood top of League One after 2-0 win at S...


In [3]:
train_df = train_df.drop(columns=['id'])
train_df.head()

Unnamed: 0,article,highlights
0,By . Associated Press . PUBLISHED: . 14:11 EST...,"Bishop John Folda, of North Dakota, is taking ..."
1,(CNN) -- Ralph Mata was an internal affairs li...,Criminal complaint: Cop used his role to help ...
2,A drunk driver who killed a young woman in a h...,"Craig Eccleston-Todd, 27, had drunk at least t..."
3,(CNN) -- With a breezy sweep of his pen Presid...,Nina dos Santos says Europe must be ready to a...
4,Fleetwood are the only team still to have a 10...,Fleetwood top of League One after 2-0 win at S...


## PREPROCESSING

#### TOKENIZATION

In [4]:
articles_to_tokenize = []
summaries_to_tokenize = []
data = train_df.sample(1100)
articles = list(data['article'])
summaries = list(data['highlights'])
for i in range(len(data)):
    article = data.iloc[i]['article']
    summary = data.iloc[i]['highlights']
    new_summary = "<sos> " + summary + " <eos>"
    articles_to_tokenize.append(article)
    summaries_to_tokenize.append(new_summary)

tokenizer = get_tokenizer('basic_english')

def yield_article_tokens():
    for article in articles_to_tokenize:
        tokens = tokenizer(article)
        yield tokens

articles_token_generator = yield_article_tokens()
articles_vocab = build_vocab_from_iterator(articles_token_generator, min_freq=5, specials=['<unk>'], max_tokens=20000) # to be used in the encoder
articles_vocab.set_default_index(articles_vocab['<unk>'])

def yield_summary_tokens():
    for summary in summaries_to_tokenize:
        tokens = tokenizer(summary)
        yield tokens

summaries_token_generator = yield_summary_tokens()
summaries_vocab = build_vocab_from_iterator(summaries_token_generator, min_freq=3, specials=['<sos>', '<unk>', '<eos>'], max_tokens=20000) # to be used in the decoder
summaries_vocab.set_default_index(summaries_vocab['<unk>'])

In [5]:
# How to tokenize a summary and get back its original version

summary_to_tokenize = "<sos> " + data.iloc[0]['highlights'] + " <eos>"
tokenized_summary = tokenizer(summary_to_tokenize)

tokens = [summaries_vocab[token] for token in tokenized_summary]

def tokens_to_text(tokens, vocab):
    itos = vocab.get_itos()  # Get the list of tokens
    return " ".join(itos[index] for index in tokens)

original_summary = tokens_to_text(tokens, summaries_vocab)
# data.iloc[0]['highlights'], original_summary

In [11]:
encoder_vocab_size = len(articles_vocab)
decoder_vocab_size = len(summaries_vocab)
encoder_vocab_size, decoder_vocab_size

(11869, 3114)

## ARCHITECTURE

#### ENCODER

In [6]:
class Encoder(nn.Module):
    def __init__(self, input_size, embedding_size, hidden_size, n_layers, dropout_p):
        super(Encoder, self).__init__()
        self.hidden_size = hidden_size
        self.n_layers = n_layers

        self.embedding = nn.Embedding(input_size, embedding_size) # (vocab size of articles - encoder input, embedding dimensions)
        self.rnn = nn.LSTM(embedding_size, hidden_size, n_layers, bidirectional=True)

        self.fc_hidden = nn.Linear(hidden_size*2, hidden_size) # hidden state
        self.fc_cell = nn.Linear(hidden_size*2, hidden_size) # LSTM cell output
        self.dropout = nn.Dropout(dropout_p)

    def forward(self, x):
        # Create embedding based on current token x
        embedding = self.dropout(self.embedding(x))

        # Pass embedding to the LSTM unit
        # concatenated hidden states (since LSTM is bidirectional) for each token in the sequence, hidden states (one for each direction) of last layer, cell states of last layer
        encoder_states, (hidden, cell) = self.rnn(embedding)

        # Concatenate last hidden state returned by the LSTM on dimension 2 (2,1,50) -> (1,1,100), then pass that to a linear layer to reduce it to (1,1,50) for the decoder
        hidden = self.fc_hidden(torch.cat((hidden[0:1], hidden[1:2]), dim=2))
        cell = self.fc_cell(torch.cat((cell[0:1], cell[1:2]), dim=2))

        return encoder_states, hidden, cell

In [7]:
class Decoder(nn.Module):
    def __init__(self, input_size, embedding_size, hidden_size, output_size, n_layers, dropout_p):
        super(Decoder, self).__init__()
        self.hidden_size = hidden_size
        self.n_layers = n_layers

        self.embedding = nn.Embedding(input_size, embedding_size)
        self.rnn = nn.LSTM(hidden_size*2 + embedding_size, hidden_size, n_layers) # will take the context vector and the embedding of the current token

        self.energy = nn.Linear(hidden_size*3, 1) 
        # will calculate the energy for each encoder token for the current step based on its encoder states (hidden_size*2 because of bidirectional LSTM) and the last hidden state
        self.fc = nn.Linear(hidden_size, output_size) # fianl fully-connected layer
        self.dropout = nn.Dropout(dropout_p)
        self.softmax = nn.Softmax(dim=0)
        self.relu = nn.ReLU()

    def forward(self, x, encoder_states, hidden, cell):
        # Add a dimension to x: 1 --> 2. x here is the current token
        x = x.unsqueeze(0)

        # Create embedding for the current token
        embedding = self.dropout(self.embedding(x))

        # Calculate energy for the attention mechanism
        sequence_length = encoder_states.shape[0] # number of tokens in the sequence
        h_reshaped = hidden.repeat(sequence_length, 1, 1) # make a copy of the last hidden state of the encoder for every token in the sequence
        energy = self.relu(self.energy(torch.cat((h_reshaped, encoder_states), dim=2)))

        # Pass energy score through a softmax to get the attention score for each token in the sequence
        attention = self.softmax(energy)

        # Compute the context vector for the current token based on the attention scores and encoder states
        # attention: (seq_length, N, 1), snk
        # encoder_states: (seq_length, N, hidden_size*2), snl
        # we want context_vector: (1, N, hidden_size*2), i.e knl
        context_vector = torch.einsum("snk,snl->knl", attention, encoder_states)

        # Pass the concatenated context vector and embedding through an LSTM layer. The hidden and cell states will be used in the next pass through the decoder
        rnn_input = torch.cat((context_vector, embedding), dim=2)
        outputs, (hidden, cell) = self.rnn(rnn_input, (hidden, cell))

        # Make the prediction based on the outputs of the LSTM layer, passing it into a final linear layer
        predictions = self.fc(outputs).squeeze(0)
        
        return predictions, hidden, cell

In [12]:
# Test pass on a sample sentence
source = data.iloc[0]['article']
source_ = "<sos> " + source + " <eos>"
tokenized_source = tokenizer(source_)
tokens = [articles_vocab[token] for token in tokenized_source]
article_tensor = torch.LongTensor(tokens).unsqueeze(1)

target = data.iloc[0]['highlights']
target_ = "<sos> " + target + " <eos>"
tokenized_target = tokenizer(target_)
tgt_tokens = [summaries_vocab[token] for token in tokenized_target]
summary_tensor = torch.LongTensor(tgt_tokens).unsqueeze(1)

embedding_size = 100
hidden_size = 50
n_layers = 1
dropout_p = 0.2

enc = Encoder(encoder_vocab_size, embedding_size, hidden_size, n_layers, dropout_p)
encoder_states, hidden, cell = enc(article_tensor)
encoder_states.shape, hidden.shape, cell.shape

output_size = decoder_vocab_size
current_word = "hello"
tokenized_word = tokenizer(current_word)
current_token = [summaries_vocab[token] for token in tokenized_word]
current_tensor = torch.LongTensor(current_token)

dec = Decoder(decoder_vocab_size, embedding_size, hidden_size, output_size, n_layers, dropout_p)
predictions, hidden, cell = dec(current_tensor, encoder_states, hidden, cell)
predictions.shape

torch.Size([1, 3114])

In [13]:
class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder):
        super(Seq2Seq, self).__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, source, target, teacher_force_ratio=0.5):
        batch_size = source.shape[1] # number of sequences in the source batch
        target_len = target.shape[0] # number of tokens in the target vector
        target_vocab_size = decoder_vocab_size

        # outputs variable to store the predictions - a prediction for each target token in the batch, the prediction is a one-hot encoded vector
        outputs = torch.zeros(target_len, batch_size, target_vocab_size).to(device)
        # run encoder through the batch's sequences
        encoder_states, hidden, cell = self.encoder(source)

        # Define the first decoder input token - will always be the <sos> token
        x = target[0]

        # Loop through every token in the target after the <sos>
        for t in range(1, target_len):
            # run decoder based on encoder's outputs and update the hidden and cell states
            output, hidden, cell = self.decoder(x, encoder_states, hidden, cell)

            # Store prediction for time step t (current time step)
            outputs[t] = output

            # Best word prediction - index in the vocab
            best_guess = output.argmax(1)

            # Teacher forcing
            x = target[t] if random.random() < teacher_force_ratio else best_guess

        return outputs # outputs will end up being a sequence with a mapping of indexes to words a in the summaries_vocab

In [14]:
# Test summary done by model with random weights - now need to set up training
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

s2s = Seq2Seq(enc, dec)
preds = s2s(article_tensor, summary_tensor)

final_pred = ""
for pred_tensor in preds[1:]:
    for pred in pred_tensor:
        guess_idx = pred.argmax(0)
        word_guess = summaries_vocab.lookup_token(guess_idx)
        final_pred += ' ' + word_guess

final_pred

' actor details does confirms yesterday dancers politics adam enjoys short action tent efforts reagan reagan wind november development countries porn mental learned development want well palace we victims between lift golden so are increasing tent actions foot between row between compared witness outbreak apparel cable kenny korea faith 30 bigger systems taylor moved customers overcome announced turkey breathe reject bodies spread compared'

## Training

In [15]:
# Data Preparation
train_data = data[:100]
test_data = data[100:110]

## add '<sos>' and '<eos>' tokens to each input and target
train_data.loc[:, 'article'] = '<sos> ' + train_data['article'] + ' <eos>'
train_data.loc[:, 'highlights'] = '<sos> ' + train_data['highlights'] + ' <eos>'
test_data.loc[:, 'article'] = '<sos> ' + test_data['article'] + ' <eos>'
test_data.loc[:, 'highlights'] = '<sos> ' + test_data['highlights'] + ' <eos>'

## reset indexes
train_data = train_data.reset_index(drop=True)
test_data = test_data.reset_index(drop=True)

## tokenize and convert to numerical form
train_data['article'] = train_data['article'].apply(lambda text: [articles_vocab[token] for token in tokenizer(text)])
train_data['highlights'] = train_data['highlights'].apply(lambda text: [summaries_vocab[token] for token in tokenizer(text)])

test_data['article'] = test_data['article'].apply(lambda text: [articles_vocab[token] for token in tokenizer(text)])
test_data['highlights'] = test_data['highlights'].apply(lambda text: [summaries_vocab[token] for token in tokenizer(text)])


In [16]:
class TextDataset(Dataset):
    def __init__(self, articles, highlights):
        self.articles = articles
        self.highlights = highlights

    def __len__(self):
        return len(self.articles)

    def __getitem__(self, idx):
        return self.articles[idx], self.highlights[idx]
    
def collate_fn(data):
    data.sort(key=lambda x: len(x[0]), reverse=True)
    articles, highlights = zip(*data)

    # pad articles and highlights
    pad_idx = summaries_vocab["<pad>"]
    articles = pad_sequence([torch.tensor(sentence) for sentence in articles], padding_value=pad_idx, batch_first=True)
    highlights = pad_sequence([torch.tensor(sentence) for sentence in highlights], padding_value=pad_idx, batch_first=True)

    return articles, highlights

# Dataset creation
train_dataset = TextDataset(train_data['article'].tolist(), train_data['highlights'].tolist())
test_dataset = TextDataset(test_data['article'].tolist(), test_data['highlights'].tolist())

batch_size = 16
# Dataloader creation
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, collate_fn=collate_fn, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, collate_fn=collate_fn, shuffle=False)


In [21]:
learning_rate = 0.001
n_epochs = 3
encoder_embedding_size = 256
decoder_embedding_size = 256
hidden_size = 512
n_layers = 1
dropout_p = 0.2
decoder_output_size = decoder_vocab_size
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Model and optimizer definition
encoder = Encoder(encoder_vocab_size, encoder_embedding_size, hidden_size, n_layers, dropout_p).to(device)
decoder = Decoder(decoder_vocab_size, decoder_embedding_size, hidden_size, decoder_output_size, n_layers, dropout_p).to(device)
model = Seq2Seq(encoder, decoder).to(device)

optimizer = optim.Adam(model.parameters(), lr=learning_rate)
pad_idx = summaries_vocab["<pad>"]
criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)

# Training loop
model.train()
for epoch in range(n_epochs):
    total_loss = 0
    for batch_idx, (articles, highlights) in enumerate(train_dataloader): # dataloader needs to be defined

        articles = articles.to(device)
        highlights = highlights.to(device)

        # Permute the dimensions of the tensors
        articles = articles.permute(1, 0)
        highlights = highlights.permute(1, 0)
        
        # Pass inputs through the model
        output = model(articles, highlights)

        output = output[1:].view(-1, output.shape[-1])
        highlights = highlights[1:].reshape(-1)

        # Reset gradients
        optimizer.zero_grad()

        # Compute loss and backpropagate
        loss = criterion(output, highlights)
        loss.backward()

        # Clip to avoid exploding gradient issues
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)

        # Gradient descent step
        optimizer.step()

        total_loss += loss.item()

        print(batch_idx)

    print(f"Epoch: {epoch}, Loss: {total_loss / len(train_dataloader)}")


0
1
2
3
4
5
6
Epoch: 0, Loss: 7.297936303274972
0
1
2
3
4
5
6
Epoch: 1, Loss: 6.137227603367397
0
1
2
3
4
5
6
Epoch: 2, Loss: 5.989615372249058


In [23]:
# Test summary done by model after one epoch of training in 1000 samples
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

preds = model(article_tensor, summary_tensor)

final_pred = ""
for pred_tensor in preds[1:]:
    for pred in pred_tensor:
        guess_idx = pred.argmax(0)
        word_guess = summaries_vocab.lookup_token(guess_idx)
        final_pred += ' ' + word_guess

final_pred

' the the the . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .'