In [25]:
import re
import random

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

import torch
import torch.nn as nn
from collections import Counter
from torchtext.data.utils import get_tokenizer
from torch.nn.utils.rnn import pad_sequence
from torchtext.vocab import build_vocab_from_iterator
import torch.nn.functional as F

In [2]:
train_data = pd.read_csv("../cnn_dailymail/train.csv")
val_data = pd.read_csv("../cnn_dailymail/validation.csv")
test_data = pd.read_csv("../cnn_dailymail/test.csv")

train_data.head()

Unnamed: 0,id,article,highlights
0,0001d1afc246a7964130f43ae940af6bc6c57f01,By . Associated Press . PUBLISHED: . 14:11 EST...,"Bishop John Folda, of North Dakota, is taking ..."
1,0002095e55fcbd3a2f366d9bf92a95433dc305ef,(CNN) -- Ralph Mata was an internal affairs li...,Criminal complaint: Cop used his role to help ...
2,00027e965c8264c35cc1bc55556db388da82b07f,A drunk driver who killed a young woman in a h...,"Craig Eccleston-Todd, 27, had drunk at least t..."
3,0002c17436637c4fe1837c935c04de47adb18e9a,(CNN) -- With a breezy sweep of his pen Presid...,Nina dos Santos says Europe must be ready to a...
4,0003ad6ef0c37534f80b55b4235108024b407f0b,Fleetwood are the only team still to have a 10...,Fleetwood top of League One after 2-0 win at S...


## PRE-PROCESSING

In [3]:
train_data = train_data.drop(['id'], axis=1)
train_data = train_data.reset_index(drop=True)
test_data = test_data.drop(['id'], axis=1)
test_data = test_data.reset_index(drop=True)

In [4]:
contractions = { 
"ain't": "am not",
"aren't": "are not",
"can't": "cannot",
"can't've": "cannot have",
"'cause": "because",
"could've": "could have",
"couldn't": "could not",
"couldn't've": "could not have",
"didn't": "did not",
"doesn't": "does not",
"don't": "do not",
"hadn't": "had not",
"hadn't've": "had not have",
"hasn't": "has not",
"haven't": "have not",
"he'd": "he would",
"he'd've": "he would have",
"he'll": "he will",
"he's": "he is",
"how'd": "how did",
"how'll": "how will",
"how's": "how is",
"i'd": "i would",
"i'll": "i will",
"i'm": "i am",
"i've": "i have",
"isn't": "is not",
"it'd": "it would",
"it'll": "it will",
"it's": "it is",
"let's": "let us",
"ma'am": "madam",
"mayn't": "may not",
"might've": "might have",
"mightn't": "might not",
"must've": "must have",
"mustn't": "must not",
"needn't": "need not",
"oughtn't": "ought not",
"shan't": "shall not",
"sha'n't": "shall not",
"she'd": "she would",
"she'll": "she will",
"she's": "she is",
"should've": "should have",
"shouldn't": "should not",
"that'd": "that would",
"that's": "that is",
"there'd": "there had",
"there's": "there is",
"they'd": "they would",
"they'll": "they will",
"they're": "they are",
"they've": "they have",
"wasn't": "was not",
"we'd": "we would",
"we'll": "we will",
"we're": "we are",
"we've": "we have",
"weren't": "were not",
"what'll": "what will",
"what're": "what are",
"what's": "what is",
"what've": "what have",
"where'd": "where did",
"where's": "where is",
"who'll": "who will",
"who's": "who is",
"won't": "will not",
"wouldn't": "would not",
"you'd": "you would",
"you'll": "you will",
"you're": "you are"
}

In [5]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

def clean_text(text, remove_stopwords=True):
    # Lowercase all letters
    text = text.lower()
    text = text.split()
    tmp = []
    # Transform contractions
    for word in text:
        if word in contractions:
            tmp.append(contractions[word])
        else:
            tmp.append(word)
    text = ' '.join(tmp)
    
    # Remove URLs
    text = re.sub(r'https?:\/\/.*[\r\n]*', '', text, flags=re.MULTILINE)
    # Remove <a HTML character
    text = re.sub(r'\<a href', ' ', text)
    # Remove &amp character
    text = re.sub(r'&amp;', '', text)
    # Remove some special characters
    text = re.sub(r'[_"\-;%()|+&=*%.,!?:#$@\[\]/]', ' ', text)
    # Remove the <br /> HTML character
    text = re.sub(r'<br />', ' ', text)
    # Remove single quotation marks
    text = re.sub(r'\'', ' ', text)
    
    # Remove stopwords
    if remove_stopwords:
        text = text.split()
        stops = set(stopwords.words('english'))
        text = [w for w in text if w not in stops]
        text = ' '.join(text)
        
    return text

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/guilherme/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [6]:
# Get list of cleaned summaries (not removing stopwords)
clean_summaries = []
for summary in train_data.highlights:
    clean_summaries.append(clean_text(summary, remove_stopwords=False))
print('Cleaning Summaries Complete')

# Get list of cleaned training texts (removing stopwords)
clean_texts = []
for text in train_data.article:
    clean_texts.append(clean_text(text))
print('Cleaning Texts Complete')

# Delete the training data variable to save space
del train_data

Cleaning Summaries Complete
Cleaning Texts Complete


In [7]:
# Create new dataframe to store the training data to be used (not all data is used to minimize memory consumption)
clean_df = pd.DataFrame()
clean_df['text'] = clean_texts[:10000]
clean_df['summary'] = clean_summaries[:10000]
# Drop rows with empty summaries
clean_df['summary'].replace('', np.nan, inplace=True)
clean_df.dropna(axis=0, inplace=True)

# Add start of sentence and end of sentence tokens to the summaries (targets)
clean_df['summary'] = clean_df['summary'].apply(lambda x: '<sostok>' + ' ' + x + ' ' + '<eostok>')

# Delete variables that will not be used anymore
del clean_texts
del clean_summaries

In [8]:
# Train test split
train_x, test_x, train_y, test_y = train_test_split(clean_df['text'], clean_df['summary'], test_size=0.1, random_state=0)
del clean_df

## TOKENIZATION

In [9]:
# Get a tokenizer
tokenizer = get_tokenizer('basic_english')

# Tokenize your text and count the words
word_counter = Counter()
for sentence in train_x:
    word_counter.update(tokenizer(sentence))

# Then you can continue as before
thresh = 4
count = 0
total_count = 0
frequency = 0
total_frequency = 0

for key, value in word_counter.items():
    total_count += 1
    total_frequency += value
    if value < thresh:
        count += 1
        frequency += value

In [10]:
print('% of rare words in vocabulary: ', (count/total_count)*100.0)
print('Total Coverage of rare words: ', (frequency/total_frequency)*100.0)
t_max_features = total_count - count
print('Text Vocab: ', t_max_features)

% of rare words in vocabulary:  58.125684692759684
Total Coverage of rare words:  2.448459711489284
Text Vocab:  42046


In [11]:
# Same as above but for summaries and have a higher threshold (word needs to appear a minimum of 6 times to not be rare - rare words are not included in the vocab)

# Get a tokenizer
tokenizer = get_tokenizer('basic_english')

# Tokenize your text and count the words
word_counter = Counter()
for sentence in train_y:
    word_counter.update(tokenizer(sentence))

# Then you can continue as before
thresh = 6
count = 0
total_count = 0
frequency = 0
total_frequency = 0

for key, value in word_counter.items():
    total_count += 1
    total_frequency += value
    if value < thresh:
        count += 1
        frequency += value

In [12]:
print('% of rare words in vocabulary: ', (count/total_count)*100.0)
print('Total Coverage of rare words: ', (frequency/total_frequency)*100.0)
s_max_features = total_count-count
print('Summary Vocab: ', s_max_features)

% of rare words in vocabulary:  77.1330121363664
Total Coverage of rare words:  9.840701372995234
Summary Vocab:  7499


In [13]:
# Set a max length for both the article and the summary - if any goes over this limit, only the first part of the sample will be included (first n tokens)
maxlen_text = 800
maxlen_summ = 150

In [14]:
val_x = test_x
# Convert text data to integer IDs
word_counter_x = Counter()
for sentence in train_x:
    word_counter_x.update(sentence)

# Keep only the most common `t_max_features` tokens
most_common_words_x = word_counter_x.most_common(t_max_features)
vocab_x = {word: i+1 for i, (word, _) in enumerate(most_common_words_x)}  # +1 to leave 0 for padding

# Convert tokens to numerical ids using the built vocab_x
train_x = [[vocab_x.get(token, 0) for token in tokens] for tokens in train_x]  # 0 for unknown tokens
val_x = [[vocab_x.get(token, 0) for token in tokens] for tokens in val_x]  # 0 for unknown tokens

# Pad the sequences
train_x = pad_sequence([torch.tensor(sentence) for sentence in train_x], batch_first=True, padding_value=0)
val_x = pad_sequence([torch.tensor(sentence) for sentence in val_x], batch_first=True, padding_value=0)

# Truncate or pad each sequence to have exactly `maxlen_text` tokens
if train_x.size(1) > maxlen_text:
    train_x = train_x[:, :maxlen_text]
else:
    train_x = F.pad(train_x, (0, maxlen_text - train_x.size(1)), value=0)

if val_x.size(1) > maxlen_text:
    val_x = val_x[:, :maxlen_text]
else:
    val_x = F.pad(val_x, (0, maxlen_text - val_x.size(1)), value=0)


In [15]:
val_y = test_y
# Convert text data to integer IDs
word_counter_y = Counter()
for sentence in train_y:
    word_counter_y.update(sentence)

# Keep only the most common `t_max_features` tokens
most_common_words_y = word_counter_y.most_common(s_max_features)
vocab_y = {word: i+1 for i, (word, _) in enumerate(most_common_words_y)}  # +1 to leave 0 for padding

# Convert tokens to numerical ids using the built vocab_y
train_y = [[vocab_y.get(token, 0) for token in tokens] for tokens in train_y]  # 0 for unknown tokens
val_y = [[vocab_y.get(token, 0) for token in tokens] for tokens in val_y]  # 0 for unknown tokens

# Pad the sequences
train_y = pad_sequence([torch.tensor(sentence) for sentence in train_y], batch_first=True, padding_value=0)
val_y = pad_sequence([torch.tensor(sentence) for sentence in val_y], batch_first=True, padding_value=0)

# Truncate or pad each sequence to have exactly `maxlen_summ` tokens
if train_y.size(1) > maxlen_summ:
    train_y = train_y[:, :maxlen_summ]
else:
    train_y = F.pad(train_y, (0, maxlen_summ - train_y.size(1)), value=0)

if val_y.size(1) > maxlen_summ:
    val_y = val_y[:, :maxlen_summ]
else:
    val_y = F.pad(val_y, (0, maxlen_summ - val_y.size(1)), value=0)

In [16]:
print("Training Sequence", train_x.shape)
print('Target Values Shape', train_y.shape)
print('Test Sequence', val_x.shape)
print('Target Test Shape', val_y.shape)

Training Sequence torch.Size([9000, 800])
Target Values Shape torch.Size([9000, 150])
Test Sequence torch.Size([1000, 800])
Target Test Shape torch.Size([1000, 150])


## EMBEDDINGS

In [17]:
embed_index = {}
embed_dim = 100
with open('./glove.6B.100d.txt') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embed_index[word] = coefs


In [18]:
t_embed = torch.zeros((t_max_features, embed_dim))
for word, i in vocab_x.items():
    vec = embed_index.get(word)
    if i < t_max_features and vec is not None:
        t_embed[i] = torch.from_numpy(vec)


In [21]:
s_embed = torch.zeros((s_max_features, embed_dim))
for word, i in vocab_y.items():
    vec = embed_index.get(word)
    if i < s_max_features and vec is not None:
        s_embed[i] = torch.from_numpy(vec)

In [23]:
del embed_index

## MODEL ARCHITECTURE

In [26]:
class Encoder(nn.Module):
    def __init__(self, input_dim, emb_dim, enc_hid_dim, dropout):
        super().__init__()

        self.embedding = nn.Embedding(input_dim, emb_dim)
        self.rnn = nn.LSTM(emb_dim, enc_hid_dim, bidirectional = True)
        self.fc = nn.Linear(enc_hid_dim * 2, enc_hid_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, src):
        embedded = self.dropout(self.embedding(src))
        outputs, (hidden, cell) = self.rnn(embedded)

        # Concatenate the final forward (hidden[-2,:,:]) and backward (hidden[-1,:,:]) hidden layers
        hidden = torch.tanh(self.fc(torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim = 1)))

        return outputs, hidden


class Decoder(nn.Module):
    def __init__(self, output_dim, emb_dim, dec_hid_dim, dropout):
        super().__init__()

        self.embedding = nn.Embedding(output_dim, emb_dim)
        self.rnn = nn.LSTM(emb_dim, dec_hid_dim)
        self.fc_out = nn.Linear(dec_hid_dim, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, input, hidden):
        input = input.unsqueeze(0)
        embedded = self.dropout(self.embedding(input))
        output, (hidden, cell) = self.rnn(embedded, hidden.unsqueeze(0))
        prediction = self.fc_out(output.squeeze(0))

        return prediction, hidden.squeeze(0)


class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, device):
        super().__init__()

        self.encoder = encoder
        self.decoder = decoder
        self.device = device

    def forward(self, src, trg, teacher_forcing_ratio = 0.5):
        batch_size = trg.shape[1]
        trg_len = trg.shape[0]
        trg_vocab_size = self.decoder.output_dim

        outputs = torch.zeros(trg_len, batch_size, trg_vocab_size).to(self.device)

        encoder_outputs, hidden = self.encoder(src)

        input = trg[0,:]

        for t in range(1, trg_len):
            output, hidden = self.decoder(input, hidden)
            outputs[t] = output
            top1 = output.argmax(1) 

            input = trg[t] if random.random() < teacher_forcing_ratio else top1

        return outputs

In [27]:
import torch.optim as optim
from torch.nn.utils.rnn import pack_padded_sequence
from torch.nn.functional import cross_entropy

# Define loss function (criterion) and optimizer
criterion = nn.CrossEntropyLoss(ignore_index=PAD_IDX)
optimizer = optim.Adam(model.parameters())

# Define the early stopping criteria
no_improve = 0
min_val_loss = float('inf')
early_stop_epochs = 5  # stop if no improvement for 5 epochs

# Training loop
for epoch in range(num_epochs):
    model.train()
    epoch_loss = 0
    
    for batch in train_dataloader:
        input_tensor = batch[0].to(device)
        target_tensor = batch[1].to(device)
        
        optimizer.zero_grad()
        output = model(input_tensor, target_tensor)
        
        # Pack the sequence of the output tensor to calculate loss
        output_packed = pack_padded_sequence(output, lengths, batch_first=True).data
        target_packed = pack_padded_sequence(target_tensor, lengths, batch_first=True).data
        
        loss = criterion(output_packed, target_packed)
        loss.backward()
        optimizer.step()
        
        epoch_loss += loss.item()

    # Validation
    model.eval()
    with torch.no_grad():
        val_loss = 0
        for batch in val_dataloader:
            input_tensor = batch[0].to(device)
            target_tensor = batch[1].to(device)
            
            output = model(input_tensor, target_tensor)
            
            # Pack the sequence of the output tensor to calculate loss
            output_packed = pack_padded_sequence(output, lengths, batch_first=True).data
            target_packed = pack_padded_sequence(target_tensor, lengths, batch_first=True).data
            
            loss = criterion(output_packed, target_packed)
            val_loss += loss.item()
    
    print(f"Epoch {epoch}, Train Loss: {epoch_loss / len(train_dataloader)}, Val Loss: {val_loss / len(val_dataloader)}")

    # Early stopping
    if val_loss < min_val_loss:
        min_val_loss = val_loss
        no_improve = 0
    else:
        no_improve += 1
        if no_improve == early_stop_epochs:
            print('Early stopping')
            break

NameError: name 'PAD_IDX' is not defined