<a href="https://colab.research.google.com/github/ghchen18/habvln/blob/dev/SUFE_NER_with_LSTMs.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Writing Code
Look for the keyword "TODO" and fill in your code in the empty space.
Feel free to change function signatures, but be careful that you might need to also change how they are called in other parts of the notebook.

Let's import all the packages at once:

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torchtext.vocab import Vocab
import numpy as np
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
import re
from collections import Counter
from typing import List, Tuple, Dict, Optional, Any

## Long Short-term Memory (LSTM) 

Now we implement a one-layer LSTM for the NER task.

### Data Loading 

Like before, we first implement the data loader. Each data example is a variable-length sentence. How can we pack multiple sentences with different lengths into the same batch? One possible solution is to pad them to the same length using a special token. The code below illustrates the idea:

In [None]:
# Three example sentences (given as word indices) of lengths 3, 5 and 2
sentence_1 = torch.tensor([6, 1, 2])
sentence_2 = torch.tensor([4, 2, 7, 7, 9])
sentence_3 = torch.tensor([3, 4])
# Right-pad every sentence with 0 up to the longest length and stack them:
#     [[6, 1, 2, 0, 0],
#      [4, 2, 7, 7, 9],
#      [3, 4, 0, 0, 0]]
sentence_batch = torch.nn.utils.rnn.pad_sequence(
    [sentence_1, sentence_2, sentence_3], batch_first=True, padding_value=0
)

We implement the above idea in a customized batching function `form_batch`. Optionally, see [here](https://pytorch.org/docs/stable/data.html#loading-batched-and-non-batched-data) for how batching works in PyTorch.

In [None]:
class SequenceDataset(Dataset):
    """
    Each data example is a sentence, including its words and NER tags.

    The dataset also owns the word/tag vocabularies and provides `form_batch`,
    a collate function that pads variable-length sentences into dense tensors.
    """

    def __init__(
        self, datapath: str, vocab: Optional[Dict[str, Vocab]] = None
    ) -> None:
        """
        Initialize the dataset by reading from datapath.
        Parameters:
            `datapath`: path of the data file, handed to `read_data_file`
            `vocab`: optional dict with the keys "words" and "tags". When given,
                     it is reused as-is (e.g. the training vocabulary for the
                     validation set); otherwise it is built from the loaded data.
        """
        super().__init__()
        self.sentences: List[Sentence] = []
        UNKNOWN = "<UNKNOWN>"  # Special token standing in for out-of-vocabulary words
        PAD = "<PAD>"  # Special token used for padding

        print("Loading data from %s" % datapath)
        self.sentences, word_cnt, tag_cnt = read_data_file(datapath)
        print("%d sentences loaded." % len(self.sentences))

        if vocab is None:
            # If the vocabulary is not provided, calculate it from data.
            self.vocab = {
                "words": Vocab(word_cnt, specials=[PAD, UNKNOWN]),
                "tags": Vocab(tag_cnt, specials=[]),
            }
        else:
            # Otherwise, reuse the existing vocabulary.
            self.vocab = vocab
        self.unknown_idx = self.vocab["words"].stoi[UNKNOWN]
        self.pad_idx = self.vocab["words"].stoi[PAD]

    def __getitem__(self, idx: int) -> Sentence:
        """
        Get the idx'th sentence in the dataset.
        """
        return self.sentences[idx]

    def __len__(self) -> int:
        """
        Return the number of sentences in the dataset.
        """
        return len(self.sentences)

    def form_batch(self, sentences: List[Sentence]) -> Dict[str, Any]:
        """
        A customized function for batching a number of sentences together.
        Different sentences have different lengths. Let max_len be the longest length.
        When packing them into one tensor, we need to pad all sentences to max_len.
        Padded positions hold `self.pad_idx` in `word_idxs`, the index of the "O" tag
        in `tag_idxs`, and False in `valid_mask`.
        Words missing from the vocabulary are mapped to `self.unknown_idx`.
        Return values:
            `words`: a list in which each element itself is a list of words in a sentence
            `word_idxs`: a batch_size x max_len tensor.
                       word_idxs[i][j] is the index of the j'th word in the i'th sentence.
            `tags`: a list in which each element itself is a list of tags in a sentence
            `tag_idxs`: a batch_size x max_len tensor
                      tag_idxs[i][j] is the index of the j'th tag in the i'th sentence.
            `valid_mask`: a batch_size x max_len tensor
                        valid_mask[i][j] is True if the i'th sentence has the j'th word.
                        Otherwise, valid_mask[i][j] is False.
        """
        words: List[List[str]] = []
        tags: List[List[str]] = []
        for sent in sentences:
            sent_words: List[str] = []
            sent_tags: List[str] = []
            for w, t in sent:
                sent_words.append(w)
                sent_tags.append(t)
            words.append(sent_words)
            tags.append(sent_tags)

        # Length of the longest sentence; 0 for an empty batch so that the
        # tensor shapes below stay valid.
        max_len = max((len(sent_words) for sent_words in words), default=0)

        batch_size = len(sentences)
        word_idxs = torch.full(
            (batch_size, max_len), fill_value=self.pad_idx, dtype=torch.int64
        )
        tag_idxs = torch.full_like(word_idxs, fill_value=self.vocab["tags"].stoi["O"])
        valid_mask = torch.zeros_like(word_idxs, dtype=torch.bool)

        word_stoi = self.vocab["words"].stoi
        tag_stoi = self.vocab["tags"].stoi
        for i, (sent_words, sent_tags) in enumerate(zip(words, tags)):
            n = len(sent_words)
            # .get() with an explicit fallback handles out-of-vocabulary words
            # (e.g. in validation data) regardless of how `stoi` treats misses.
            word_idxs[i, :n] = torch.tensor(
                [word_stoi.get(w, self.unknown_idx) for w in sent_words],
                dtype=torch.int64,
            )
            tag_idxs[i, :n] = torch.tensor(
                [tag_stoi[t] for t in sent_tags], dtype=torch.int64
            )
            valid_mask[i, :n] = True

        return {
            "words": words,
            "word_idxs": word_idxs,
            "tags": tags,
            "tag_idxs": tag_idxs,
            "valid_mask": valid_mask,
        }


def create_sequence_dataloaders(
    batch_size: int, shuffle: bool = True
) -> Tuple[DataLoader, DataLoader, Dict[str, Vocab]]:
    """
    Create the dataloaders for training and validation.
    Parameters:
        `batch_size`: number of sentences per batch (used for both loaders)
        `shuffle`: whether to shuffle the training data; validation data is never shuffled
    Return values:
        1. the training dataloader (incomplete last batch is dropped)
        2. the validation dataloader
        3. the vocabulary dict of the training dataset (keys "words" and "tags"),
           which is also the one used by the validation dataset
    """
    ds_train = SequenceDataset("eng.train")
    # Reuse the training vocabulary so that indices agree across both datasets.
    ds_val = SequenceDataset("eng.val", vocab=ds_train.vocab)
    loader_train = DataLoader(
        ds_train,
        batch_size=batch_size,
        shuffle=shuffle,
        collate_fn=ds_train.form_batch,  # customized function for batching
        drop_last=True,
        pin_memory=True,
    )
    loader_val = DataLoader(
        ds_val,
        batch_size=batch_size,
        collate_fn=ds_val.form_batch,
        pin_memory=True,
    )
    return loader_train, loader_val, ds_train.vocab

Here is a simple sanity-check. Try to understand its output.

In [None]:
def check_sequence_dataloader() -> None:
    """
    Sanity-check the sequence dataloader: walk through all training batches
    (unshuffled, 3 sentences each) and print only the very first one.
    """
    train_loader = create_sequence_dataloaders(batch_size=3, shuffle=False)[0]
    print("Iterating on the training data..")
    for batch_no, batch in enumerate(train_loader):
        if batch_no != 0:
            # Keep iterating so every batch is still formed, but stay quiet.
            continue
        print(batch)
    print("Done!")


check_sequence_dataloader()

### Implement the Model 

Next, implement an LSTM for predicting NER tags from input words. [nn.LSTM](https://pytorch.org/docs/stable/generated/torch.nn.LSTM.html#torch.nn.LSTM) is definitely useful. Further, it is tricky to handle sentences in the same batch with different lengths. You will probably need to use [nn.utils.rnn.pack_padded_sequence](https://pytorch.org/docs/stable/generated/torch.nn.utils.rnn.pack_padded_sequence.html#torch.nn.utils.rnn.pack_padded_sequence) and [nn.utils.rnn.pad_packed_sequence](https://pytorch.org/docs/stable/generated/torch.nn.utils.rnn.pad_packed_sequence.html#torch.nn.utils.rnn.pad_packed_sequence). Please read their documentation in detail.


In [None]:
class LSTM(nn.Module):
    """
    Long short-term memory for NER

    Pipeline: word embedding -> one-layer (optionally bidirectional) LSTM
    -> linear layer producing one logit per tag at every position.
    """

    def __init__(self, vocab: Dict[str, Vocab], d_emb: int, d_hidden: int, bidirectional: bool) -> None:
        """
        Initialize an LSTM
        Parameters:
            `vocab`: vocabulary of words and tags (keys "words" and "tags")
            `d_emb`: dimension of word embeddings (D)
            `d_hidden`: dimension of the hidden layer (H)
            `bidirectional`: whether the LSTM also reads the sentence right-to-left
        """
        super().__init__()
        num_words = len(vocab["words"])
        num_tags = len(vocab["tags"])
        self.embedding = nn.Embedding(num_words, d_emb)
        self.lstm = nn.LSTM(
            input_size=d_emb,
            hidden_size=d_hidden,
            num_layers=1,
            batch_first=True,  # inputs are batch_size x max_len x D
            bidirectional=bidirectional,
        )
        # A bidirectional LSTM concatenates the forward and backward states.
        d_out = 2 * d_hidden if bidirectional else d_hidden
        self.output = nn.Linear(d_out, num_tags)

    def forward(
        self, word_idxs: torch.Tensor, valid_mask: torch.Tensor
    ) -> torch.Tensor:
        """
        Given words in sentences, predict the logits of the NER tag.
        Parameters:
            `word_idxs`: a batch_size x max_len tensor
            `valid_mask`: a batch_size x max_len tensor
        Return values:
            `logits`: a batch_size x max_len x num_tags tensor, where num_tags
                      is the size of the tag vocabulary. Values at padded
                      positions carry no information and should be masked out
                      by the caller.
        Note: every sentence in the batch must contain at least one valid word,
        because packed sequences do not support zero lengths.
        """
        max_len = word_idxs.size(1)
        # Number of real (non-padded) words per sentence; pack_padded_sequence
        # expects the lengths on the CPU.
        lengths = valid_mask.sum(dim=1).cpu()

        embeddings = self.embedding(word_idxs)
        # Packing makes the LSTM skip padded positions, so padding cannot leak
        # into the hidden states (important for the backward direction).
        packed_input = nn.utils.rnn.pack_padded_sequence(
            embeddings, lengths, batch_first=True, enforce_sorted=False
        )
        packed_output, _ = self.lstm(packed_input)
        # total_length restores the original max_len so that the logits line up
        # with word_idxs / valid_mask.
        hidden, _ = nn.utils.rnn.pad_packed_sequence(
            packed_output, batch_first=True, total_length=max_len
        )
        logits = self.output(hidden)
        return logits

We do a sanity check by loading a batch of data examples and passing it through the network.

In [None]:
def check_lstm() -> None:
    """
    Sanity-check the LSTM: build a small model, run a single training batch
    through it, and print the input and output shapes.
    """
    # Hyperparameters
    batch_size = 2
    d_emb = 64
    d_hidden = 128
    bidirectional = True
    # Create the dataloaders and the model
    loader_train, _, vocab = create_sequence_dataloaders(batch_size)
    model = LSTM(vocab, d_emb, d_hidden, bidirectional)
    device = get_device()
    model.to(device)
    print(model)
    # Get the first batch
    data_batch = next(iter(loader_train))
    # Move the model inputs to the target device (tags are not needed here)
    word_idxs = data_batch["word_idxs"].to(device, non_blocking=True)
    valid_mask = data_batch["valid_mask"].to(device, non_blocking=True)
    # Calculate the model
    print("Input word_idxs shape:", word_idxs.size())
    print("Input valid_mask shape:", valid_mask.size())
    # Only the output shape is inspected, so no gradients are required.
    with torch.no_grad():
        logits = model(word_idxs, valid_mask)
    print("Output logits shape:", logits.size())


check_lstm()

### Training and Validation 

Complete the functions for training and validating the LSTM model. When calculating the loss function, you only want to include values from valid positions (where `valid_mask` is `True`). The `reduction` parameter in [F.cross_entropy](https://pytorch.org/docs/stable/nn.functional.html#torch.nn.functional.cross_entropy) may be useful.

In [None]:
def train_lstm(
    model: nn.Module,
    loader: DataLoader,
    optimizer: optim.Optimizer,
    device: torch.device,
    silent: bool = False,  # whether to print the training loss
) -> Tuple[float, Dict[str, Any]]:
    """
    Train the LSTM model for one pass over `loader`.
    Parameters:
        `model`: called as model(word_idxs, valid_mask); must return logits of
                 shape batch_size x max_len x num_tags
        `loader`: yields batches with the keys "word_idxs", "tag_idxs" and "valid_mask"
        `optimizer`: optimizer over the model's parameters
        `device`: device the batches are moved to
        `silent`: if True, do not print the running training loss
    Return values:
        1. the average training loss
        2. training metrics such as accuracy and F1 score
    """
    model.train()
    ground_truth = []
    predictions = []
    losses = []
    report_interval = 100

    for i, data_batch in enumerate(loader):
        word_idxs = data_batch["word_idxs"].to(device, non_blocking=True)
        tag_idxs = data_batch["tag_idxs"].to(device, non_blocking=True)
        valid_mask = data_batch["valid_mask"].to(device, non_blocking=True)

        logits = model(word_idxs, valid_mask)
        # Flatten batch and time so that each row is one word position.
        logits_flat = logits.reshape(-1, logits.size(-1))
        tag_idxs_flat = tag_idxs.reshape(-1)
        valid_mask_flat = valid_mask.reshape(-1)
        # Select the valid positions first, so padding contributes neither to
        # the loss nor to the gradient; the loss is the mean over real words.
        loss = F.cross_entropy(
            logits_flat[valid_mask_flat], tag_idxs_flat[valid_mask_flat]
        )

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        losses.append(loss.item())
        ground_truth.extend(tag_idxs_flat[valid_mask_flat].tolist())
        predictions.extend(logits_flat[valid_mask_flat].argmax(dim=-1).tolist())

        if not silent and i > 0 and i % report_interval == 0:
            print(
                "\t[%06d/%06d] Loss: %f"
                % (i, len(loader), np.mean(losses[-report_interval:]))
            )

    return np.mean(losses), eval_metrics(ground_truth, predictions)


def validate_lstm(
    model: nn.Module, loader: DataLoader, device: torch.device
) -> Tuple[float, Dict[str, Any]]:
    """
    Validate the model on every batch of `loader` without updating it.
    Parameters:
        `model`: called as model(word_idxs, valid_mask); must return logits of
                 shape batch_size x max_len x num_tags
        `loader`: yields batches with the keys "word_idxs", "tag_idxs" and "valid_mask"
        `device`: device the batches are moved to
    Return the validation loss (averaged over batches) and metrics.
    """
    model.eval()
    ground_truth = []
    predictions = []
    losses = []

    with torch.no_grad():

        for data_batch in loader:
            word_idxs = data_batch["word_idxs"].to(device, non_blocking=True)
            tag_idxs = data_batch["tag_idxs"].to(device, non_blocking=True)
            valid_mask = data_batch["valid_mask"].to(device, non_blocking=True)

            logits = model(word_idxs, valid_mask)
            # Flatten batch and time so that each row is one word position.
            logits_flat = logits.reshape(-1, logits.size(-1))
            tag_idxs_flat = tag_idxs.reshape(-1)
            valid_mask_flat = valid_mask.reshape(-1)
            # Only real (non-padded) words count towards the loss.
            loss = F.cross_entropy(
                logits_flat[valid_mask_flat], tag_idxs_flat[valid_mask_flat]
            )

            losses.append(loss.item())
            ground_truth.extend(tag_idxs_flat[valid_mask_flat].tolist())
            predictions.extend(logits_flat[valid_mask_flat].argmax(dim=-1).tolist())

    return np.mean(losses), eval_metrics(ground_truth, predictions)


def train_val_loop_lstm(hyperparams: Dict[str, Any]) -> None:
    """
    Run a complete LSTM experiment: build the dataloaders, the model and an
    RMSprop optimizer, then alternate one training pass and one validation
    pass per epoch, printing the loss and metrics of each.
    Parameters:
        `hyperparams`: dict with the keys "batch_size", "d_emb", "d_hidden",
                       "bidirectional", "learning_rate", "l2" (weight decay)
                       and "num_epochs"
    """

    def show_metrics(metrics: Dict[str, Any]) -> None:
        # One indented "name :  value" line per metric.
        for name, value in metrics.items():
            print("\t", name, ": ", value)

    print("Hyperparameters:", hyperparams)

    # Data
    train_loader, val_loader, vocab = create_sequence_dataloaders(
        hyperparams["batch_size"]
    )

    # Model
    model = LSTM(
        vocab,
        hyperparams["d_emb"],
        hyperparams["d_hidden"],
        hyperparams["bidirectional"],
    )
    device = get_device()
    model.to(device)
    print(model)

    # Optimizer; "l2" is applied as weight decay
    optimizer = optim.RMSprop(
        model.parameters(),
        lr=hyperparams["learning_rate"],
        weight_decay=hyperparams["l2"],
    )

    # Epoch loop: train, report, validate, report
    for epoch in range(hyperparams["num_epochs"]):
        print("Epoch #%d" % epoch)

        print("Training..")
        train_loss, train_metrics = train_lstm(model, train_loader, optimizer, device)
        print("Training loss: ", train_loss)
        print("Training metrics:")
        show_metrics(train_metrics)

        print("Validating..")
        val_loss, val_metrics = validate_lstm(model, val_loader, device)
        print("Validation loss: ", val_loss)
        print("Validation metrics:")
        show_metrics(val_metrics)

    print("Done!")

Run the experiment:

In [None]:
# Launch the full training/validation experiment with these hyperparameters.
train_val_loop_lstm({
    "bidirectional": True,  # passed to the LSTM constructor
    "batch_size": 512,  # sentences per batch for both dataloaders
    "d_emb": 64,  # dimension of word embeddings
    "d_hidden": 128,  # dimension of the hidden layer
    "num_epochs": 15,  # number of train + validate rounds
    "learning_rate": 0.005,  # learning rate given to RMSprop
    "l2": 1e-6,  # given to RMSprop as weight_decay
})