<a href="https://colab.research.google.com/github/ipavlopoulos/modern_nlp/blob/main/Modern_NLP_S2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# ModernNLP: #2
* Discussing text restoration by [Sommerschield et al.](https://www.aclweb.org/anthology/D19-1668/).
* Experimenting with a vanilla RNN encoder in Pytorch.
* Performing text classification to predict the next character.
* Instead of Ancient Greek text, we will use Plato in English. 

> Authored by John Pavlopoulos & Vasiliki Kougia

In [1]:
import nltk; nltk.download('punkt')
from urllib import request
from nltk import tokenize
from nltk.corpus import stopwords
import random; random.seed(42)
import numpy as np
import math
import torch
import torch.nn as nn
from torch.nn.utils import rnn
import torch.nn.functional as F
from torch import autograd

[nltk_data] Downloading package punkt to /Users/guilherme/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


### Download and pre-process the data

In [2]:
class Corpus:
    def __init__(self):
        data = request.urlopen("http://www.gutenberg.org/cache/epub/1497/pg1497.txt").read().decode("utf8")
        self.data = data[760:-19110] # cut editorial notes and licences
        sentences = tokenize.sent_tokenize(self.data)
        self.sentences = [sentence.strip().replace("\n", "").replace("\r", " ").lower() for sentence in sentences]
    
    @property
    def all_letters(self):
        return list(set(" ".join(self.sentences)))

In [3]:
# tokenise the text, and remove any noise
corpus = Corpus()
np.random.shuffle(corpus.sentences)

# The vocabulary will comprise characters
print(corpus.all_letters)

['w', 'e', 'k', 'p', 'x', '+', 'a', 'j', ':', '7', '3', '0', 'r', '6', '4', '!', 'i', 'z', 's', '8', '1', '/', 't', 'l', 'f', 'h', 'd', '-', ';', 'm', '"', 'c', '*', 'b', 'o', 'v', 'u', '2', 'n', ',', "'", ' ', 'g', '.', '(', 'y', '?', '5', 'q', ')', '=', '9']


In [4]:
print(corpus.sentences[np.random.randint(len(corpus.sentences))])

the resolution of some philosophical or theological question seems to them more interesting and important than any substantial knowledge of literature or science or even than a good life.


### Build the dataset
* Use text sequences.
* The |sequence|+1 will be the target.

In [5]:
class Data:
    def __init__(self, sentences, maxlen: int):
        self.maxlen = 128
        self._inputs = []
        self._targets = []
        for sentence in sentences:
            if len(sentence)<10: 
                continue
            txt = sentence[-maxlen*2:]
            split = len(txt) // 2
            self._inputs.append(txt[:split])
            self._targets.append(txt[split:])
        self._train_limit = 5000
        self._val_limit = 5500
        
    @property
    def train(self):
        return self._inputs[:self._train_limit], self._targets[:self._train_limit] 
    
    @property
    def val(self):
        return self._inputs[:self._val_limit], self._targets[:self._val_limit] 
    
    @property
    def test(self):
        return self._inputs[self._val_limit:], self._targets[self._val_limit:]
    
    @property
    def inputs(self):
        return self._inputs
    
    @property
    def targets(self):
        return self._targets
    

In [6]:
class Vocabulary:
    def __init__(self, inputs, targets):
        self.inputs = sorted(list(set("".join(inputs))))
        self.targets = sorted(list(set("".join(targets))))

In [7]:
maxlen = 64

In [8]:
data = Data(corpus.sentences, maxlen=maxlen)

In [9]:
vocabulary = Vocabulary(data.inputs, data.targets)

In [10]:
_x, _y = data.test
_x[0], _y[0]

('like, he', ' replied.')

* Use the character indices as input/output.

In [11]:
def encode_chars(text, vocabulary, maxlen):
    sequence = np.zeros(maxlen, dtype=int)
    # Assign an index to each input character
    for i, char in enumerate(text):
        if i < maxlen:
            sequence[i] = vocabulary.index(char) + 1 # Index 0 is used for padding
    return sequence

In [12]:
batch_size = 16

* Build a generator

In [13]:
def create_generator_fn(maxlen, vocabulary):
    def generator(inputs, targets, batch_size):
        lengths = [min(len(sentence), maxlen) for sentence in inputs]
        while True:
            # Loop over all instances
            data = list(zip(inputs, lengths, targets))
            random.shuffle(data)
            inputs, lengths, targets = zip(*data)
            for i in range(0, len(inputs), batch_size):
                x_inputs, x_lengths, y_targets = list(), list(), list()
                # Loop over the images in the batch and yield their instances
                for j in range(i, min(len(inputs), i + batch_size)):
                    x_inputs.append(encode_chars(inputs[j], vocabulary.inputs, maxlen))
                    x_lengths.append(lengths[j])
                    y_targets.append(encode_chars(targets[j], vocabulary.targets, maxlen))

                yield torch.LongTensor(x_inputs), torch.LongTensor(x_lengths), torch.tensor(y_targets)
    return generator

In [14]:
generator_fn = create_generator_fn(maxlen, vocabulary)

In [15]:
train_generator = generator_fn(data.train[0], data.train[1], batch_size)
val_generator = generator_fn(data.val[0], data.val[1], batch_size)

### Build the model

In [16]:
class Encoder(nn.Module):
    def __init__(
        self,
        vocab_size,
        embed_size=200,
        hidden_size=128,
        embedding_tensor=None,
        padding_index=0,
        num_layers=1,
        dropout=0,
    ):
        super(Encoder, self).__init__()
        self.hidden_size = hidden_size
        self.dropout = dropout
        self.num_layers = num_layers

        # Define the layers in our architecture
        self.embedding_layer = nn.Embedding(
            vocab_size, embed_size, padding_idx=padding_index, _weight=embedding_tensor
        )
        self.drop = nn.Dropout(self.dropout)
        self.rnn = nn.GRU(
            input_size=embed_size,
            hidden_size=self.hidden_size,
            num_layers=self.num_layers,
            batch_first=True,
        )

    def forward(self, inputs, seq_lengths):
        # Pass the input through the embedding layer
        text_embed = self.embedding_layer(inputs)
        # Apply dropout
        x_embed = self.drop(text_embed)

        # Pass the inputs to the GRU
        packed_input = rnn.pack_padded_sequence(
            x_embed, seq_lengths, batch_first=True, enforce_sorted=False
        )
        # this fn returns the last hidden state,
        # but we want to apply dropout
        # so we ignore it
        packed_output, _ = self.rnn(packed_input)
        # Get the hidden states of all time steps
        output_rnn, lengths = rnn.pad_packed_sequence(packed_output, batch_first=True)
        # Apply dropout
        output_rnn = self.drop(output_rnn)

        # Get the last hidden state as sentence representation
        row_indices = torch.arange(0, inputs.size(0)).long()
        col_indices = seq_lengths - 1

        last_hidden_state = output_rnn[row_indices, col_indices, :]
        return last_hidden_state


class Decoder(nn.Module):
    def __init__(
        self,
        vocab_size,
        num_output,
        embed_size=200,
        hidden_size=128,
        embedding_tensor=None,
        padding_index=0,
        num_layers=1,
        dropout=0,
    ):
        super(Decoder, self).__init__()
        self.hidden_size = hidden_size
        self.dropout = dropout
        self.num_output = num_output
        self.num_layers = num_layers

        # Define the layers in our architecture
        self.embedding_layer = nn.Embedding(
            vocab_size, embed_size, padding_idx=padding_index, _weight=embedding_tensor
        )
        self.drop = nn.Dropout(self.dropout)
        self.rnn = nn.GRU(
            input_size=embed_size + hidden_size,
            hidden_size=self.hidden_size,
            num_layers=self.num_layers,
            batch_first=True,
        )
        self.fc = nn.Linear(hidden_size, num_output)
        self.tanh = nn.Tanh()
        self.fc_2 = nn.Linear(num_output, num_output)
        self.softmax = nn.Softmax(dim=-1)

    def forward(self, inputs, context, initial_hidden_state=None):
        # Pass the input through the embedding layer
        text_embed = self.embedding_layer(inputs)
        x_embed = torch.cat((text_embed, torch.unsqueeze(context, 1)), dim=-1)
        # Apply dropout
        x_embed = self.drop(x_embed)
        # Pass the inputs to the GRU
        packed_input = rnn.pack_padded_sequence(
            x_embed, torch.ones(len(inputs)), batch_first=True, enforce_sorted=False
        )
        # this fn returns the last hidden state,
        # but we want to apply dropout
        # so we ignore it
        packed_output, _ = self.rnn(packed_input, initial_hidden_state)
        # Get the hidden states of all time steps
        output_rnn, _ = rnn.pad_packed_sequence(packed_output, batch_first=True)
        # Apply dropout
        # B x T x 128
        output_rnn = self.drop(output_rnn)

        # Get the last hidden state as sentence representation
        row_indices = torch.arange(0, inputs.size(0)).long()
        # for each sample in batch
        # run GRU using last layer as a starting point, and repeat until max_seq (we have no <eos>)
        last_hidden_state = output_rnn[row_indices, -1, :]
        # Feed the representation to the classifier and return its output
        # out = torch.cat((context, last_hidden_state), dim=-1)
        out = self.fc(last_hidden_state)
        out = out.squeeze(1)
        out = self.tanh(out)
        out = self.fc_2(out)
        out = self.softmax(out)
        return out, last_hidden_state


class SeqModel(nn.Module):
    def __init__(self, encoder, decoder):
        super(SeqModel, self).__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.bos = torch.LongTensor([0])
    
    # TODO: toggle single prediction instead of max
    def forward(self, inputs, seq_lengths):
        encoded_seq_ht = self.encoder(inputs, seq_lengths)
        hidden_state = None
        context = encoded_seq_ht
        max_length = torch.max(seq_lengths)
        decoder_inputs = (
            torch.ones(size=(inputs.shape[0], 1), dtype=torch.int32) * self.bos
        )
        predictions = []
        for step in range(max_length):
            softmax_prob, hidden_state = self.decoder(
                decoder_inputs, context, hidden_state
            )
            hidden_state = torch.unsqueeze(hidden_state, dim=0)
            y_pred = torch.argmax(softmax_prob, dim=1)
            decoder_inputs = torch.unsqueeze(y_pred, dim=1)
            predictions.append(softmax_prob)
        predictions = torch.reshape(
            torch.cat(predictions), shape=(len(inputs), max_length, -1)
        )
        return predictions


### Training

In [17]:
from tqdm.notebook import tqdm
from sklearn import metrics

In [47]:
def train(model, epochs, name, data):
    # Define optimizer and loss
    optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), lr=0.001)
    criterion = nn.NLLLoss()

    # Train and validate at the epoch's end, keep the best (based on val f1)
    highest_val_f1 = 0

    for idx in tqdm(range(epochs), desc="Epoch"):
        epoch = idx+1
        #Switch to train mode
        model.train()
        
        # TODO compute loss for the final token only
        for batch in tqdm(range(math.ceil(len(data.train[0])/batch_size)), desc="Iteration"):
            input_t, lengths_t, target_t = next(train_generator)
            output = model(input_t, lengths_t)
#             loss = 0.0
#             for i in range(len(target_t)):
#                 y_pred = torch.squeeze(output[i, :, :])
#                 y_true = target_t[i, :]
#                 print(y_pred.shape, y_true.shape)
#                 loss += criterion(y_pred, y_true)
            print(output.shape, target_t.shape)
            loss = criterion(output, target_t)
            
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        #Switch to eval mode
        model.eval()
        val_loss = []
        val_targets = []
        val_outputs = []

        for i in range(math.ceil(len(data.train[0])/batch_size)):
            input_t, lengths_t, target_t = next(val_generator)
            output = model(input_t,lengths_t)
            val_outputs.append(torch.argmax(torch.squeeze(output[:, -1, :]), dim=1))
            val_targets.append(target_t[:, -1])
            val_loss.append(criterion(torch.squeeze(output[:, -1, :]), target_t[:, -1]).detach().numpy())

        val_outputs = torch.cat(val_outputs) 
        val_targets = torch.cat(val_targets)        
        f1 = metrics.f1_score(val_targets.cpu().numpy(), val_outputs.cpu().detach().numpy(), average="macro")
        print(f"EPOCH: {epoch} val loss: {sum(val_loss)/len(val_loss):.4f}, val f1: {f1:.3f}")
        if f1 > highest_val_f1:
            print("Save model....")
            torch.save({'model_state_dict': model.state_dict()}, f"{name}_pytorch_model.bin")
            highest_val_f1 = f1
    return model

In [19]:
def load_model(name, model):
    checkpoint = torch.load(f"{name}_pytorch_model.bin", map_location="cpu")
    model.load_state_dict(checkpoint['model_state_dict'])
    return model

In [48]:
encoder = Encoder(vocab_size=len(vocabulary.inputs) + 1, dropout=0.2)
decoder = Decoder(vocab_size=len(vocabulary.targets) + 1, num_output=len(vocabulary.targets), dropout=0.2)
seq_model = SeqModel(encoder, decoder)
seq_model = train(seq_model, epochs=20, name="seq", data=data)

Epoch:   0%|          | 0/20 [00:00<?, ?it/s]

Iteration:   0%|          | 0/313 [00:00<?, ?it/s]

torch.Size([16, 64, 51]) torch.Size([16, 64])


ValueError: Expected target size (16, 51), got torch.Size([16, 64])

In [22]:
loaded_model = load_model(name="seq", model=SeqModel(encoder, decoder))

In [23]:
loaded_model

SeqModel(
  (encoder): Encoder(
    (embedding_layer): Embedding(52, 200, padding_idx=0)
    (drop): Dropout(p=0.2, inplace=False)
    (rnn): GRU(200, 128, batch_first=True)
  )
  (decoder): Decoder(
    (embedding_layer): Embedding(52, 200, padding_idx=0)
    (drop): Dropout(p=0.2, inplace=False)
    (rnn): GRU(328, 128, batch_first=True)
    (fc): Linear(in_features=128, out_features=51, bias=True)
    (tanh): Tanh()
    (fc_2): Linear(in_features=51, out_features=51, bias=True)
    (softmax): Softmax(dim=-1)
  )
)

In [24]:
_x, _y = data.test

In [25]:
_x[0], _y[0]

('like, he', ' replied.')

In [27]:
def evaluate_model(model, data, vocabulary, idx):
    model.eval()
    input_seq, labels = data.test
    prompt = input_seq[idx]
    text = prompt[:10]
    print("Prompt: ", prompt)
#     for i in range(50):
    encoded_text = np.expand_dims(encode_chars(text, vocabulary.inputs, maxlen), 0)
    # Get the character with the largest probability as the next character
    inputs = torch.LongTensor(encoded_text)
    seq_len = torch.LongTensor([len(text)])
    outputs = model(inputs, seq_len)
    outputs =  outputs[0, :, :]
    print(outputs.shape)
    output_argmax = outputs.argmax(-1)
    print(output_argmax.shape)
    print(output_argmax)
    predicted = [vocabulary.targets[i] for i in output_argmax]
    print(f"{text} --> {predicted}")
    # Add the predicted character to the input
#     text = text+predicted
evaluate_model(loaded_model, data, vocabulary, 0)        

Prompt:  like, he
torch.Size([8, 51])
torch.Size([8])
tensor([0, 0, 0, 0, 0, 0, 0, 0])
like, he --> [' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ']


* Infer some characters for a test text to see how it works.

In [None]:
#evaluate_model(model_loaded)

# Missing parts (try to improve it)
* Improve the decoding: We used an RNN encoder and simply fed the sentence representation to the classifier to produce the next character. Use an RNN decoder to generate the next characters of the sentence.
* Add attention: Compute the self attention of the encoder and feed the attention vector to the decoder. Remember to mask.
* Bi-direction: Use a bi-directional encoder and also use bi-directional context.