In [None]:
# Install miscellaneous libraries.
!pip install torchdata==0.4.1
!pip install transformers==4.22.2
!pip install torchvision==0.13.1
!pip install torchtext==0.13.1
!pip install torchaudio==0.12.1

Collecting torchdata==0.4.1
  Downloading torchdata-0.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (4.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.4/4.4 MB[0m [31m27.9 MB/s[0m eta [36m0:00:00[0m
Collecting portalocker>=2.0.0 (from torchdata==0.4.1)
  Downloading portalocker-2.8.2-py3-none-any.whl (17 kB)
Collecting torch==1.12.1 (from torchdata==0.4.1)
  Downloading torch-1.12.1-cp310-cp310-manylinux1_x86_64.whl (776.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m776.3/776.3 MB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: torch, portalocker, torchdata
  Attempting uninstall: torch
    Found existing installation: torch 2.0.1+cu118
    Uninstalling torch-2.0.1+cu118:
      Successfully uninstalled torch-2.0.1+cu118
  Attempting uninstall: torchdata
    Found existing installation: torchdata 0.6.1
    Uninstalling torchdata-0.6.1:
      Successfully uninstalled torchdata-0.6.1
[31mERROR:

In [None]:
# Import libraries used throughout - if you need other libraries,
# you are free to import them.
import functools
import random
import gc
from typing import Any

import numpy as np
import torch
import torchtext
import torchtext.functional as F
import torchtext.transforms
from torch import nn
from torch.utils.data import DataLoader
from torchtext.datasets import UDPOS
from torchtext.vocab import build_vocab_from_iterator
from transformers import BertModel, BertTokenizer

In [None]:
# Constants and hyperparameters - you are free to change these for
# the bonus question.
SEED = 42
BATCH_SIZE = 32
EPOCHS = 3
LR = 2e-5
# Prefer the GPU, but fall back to the CPU so that the notebook does not crash
# at the first `.to(DEVICE)` call on a machine without CUDA.
# NOTE: the hard-coded reference tensors further down are created on
# 'cuda:0', so those comparison cells still require a GPU runtime.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
TRANSFORMER = "bert-base-uncased"

In [None]:
# Reproducibility: give PyTorch, Python's `random` module and NumPy the
# same seed.
for seed_fn in (torch.manual_seed, random.seed, np.random.seed):
    seed_fn(SEED)
# Deterministic-only kernels are explicitly left switched off.
torch.use_deterministic_algorithms(False)

In [None]:
# Setting up dataloaders for training.
# Tokenizer plus the two special symbols (CLS as the initial token, PAD for
# padding) and their indices in the BERT vocabulary.
tokenizer = BertTokenizer.from_pretrained(TRANSFORMER)
init_token, pad_token = tokenizer.cls_token, tokenizer.pad_token
init_token_idx, pad_token_idx = tokenizer.convert_tokens_to_ids(
    [init_token, pad_token]
)
max_input_length = tokenizer.max_model_input_sizes[TRANSFORMER]

train_datapipe = UDPOS(split="train")
valid_datapipe = UDPOS(split="valid")
# Tag vocabulary: built from the second field of every training sample, with
# the initial and padding symbols registered as special entries.
pos_vocab = build_vocab_from_iterator(
    (sample[1] for sample in train_datapipe),
    specials=[init_token, pad_token],
)


def prepare_words(tokens, tokenizer, max_input_length):
    """Turn a sequence of word strings into vocabulary indices for BERT.

    The sequence is truncated to at most `max_input_length` entries, every
    remaining word is lowercased, and the result is mapped to indices with
    `tokenizer.convert_tokens_to_ids`. No beginning or end of sentence
    markers are added here.

    Parameters
    ---
    tokens : List
        List of strings, each of which corresponds to one token in a sequence.
    tokenizer : transformers.models.bert.tokenization_bert.BertTokenizer
        Tokenizer to be used for transforming word strings into word indices
        to be used with BERT.
    max_input_length : int
        Maximum number of tokens kept from the sequence.

    Returns
    ---
    tokens : List
        List of word indices, one per kept token.
    """
    lowered = [word.lower() for word in tokens[:max_input_length]]
    return tokenizer.convert_tokens_to_ids(lowered)


def prepare_tags(tokens, max_input_length):
    """Convert tag strings into indices for use with torch.

    Mirrors the truncation applied to the words: the tag sequence is cut to
    at most `max_input_length` entries and then looked up in the module-level
    `pos_vocab`. No beginning or end of sentence markers are added here.

    Parameters
    ---
    tokens : List
        List of strings, each of which corresponds to the tag of one token
        in a sequence.
    max_input_length : int
        Maximum number of tags kept from the sequence.

    Returns
    ---
    tokens : List
        List of tag indices, one per kept tag.
    """
    truncated = tokens[:max_input_length]
    to_indices = torchtext.transforms.VocabTransform(pos_vocab)
    return to_indices(truncated)


# Bind the shared configuration once so that each preprocessor only needs
# the raw sequence as its argument.
text_preprocessor = functools.partial(
    prepare_words, tokenizer=tokenizer, max_input_length=max_input_length
)
tag_preprocessor = functools.partial(
    prepare_tags, max_input_length=max_input_length
)


def apply_transform(x):
    """Preprocess one sample.

    `x[0]` is sent through `text_preprocessor` and `x[1]` through
    `tag_preprocessor`; any further fields of `x` are ignored.

    Returns
    ---
    Tuple
        The preprocessed words and the preprocessed tags.
    """
    words, tags = x[0], x[1]
    return text_preprocessor(words), tag_preprocessor(tags)

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [None]:
def _to_columnar_batches(datapipe):
    """Preprocess every sample, group BATCH_SIZE samples and expose them as
    a dict with the columns "words" and "pos"."""
    return (
        datapipe.map(apply_transform)
        .batch(BATCH_SIZE)
        .rows2columnar(["words", "pos"])
    )


train_datapipe = _to_columnar_batches(train_datapipe)
valid_datapipe = _to_columnar_batches(valid_datapipe)
# Batching already happens inside the datapipe, hence batch_size=None.
train_dataloader = DataLoader(train_datapipe, batch_size=None, shuffle=False)
valid_dataloader = DataLoader(valid_datapipe, batch_size=None, shuffle=False)

In [None]:
class TagLSTM(nn.Module):
    """Models an LSTM on top of a transformer to predict POS in a Neural CRF.

    A bidirectional LSTM reads the input embeddings and a linear layer maps
    every LSTM output to one score per label. The module does not pin itself
    to a device: it lives wherever the caller moves it (e.g. `.to(DEVICE)`)
    and creates its initial hidden state on the device of the input.
    """

    def __init__(self, nb_labels, emb_dim, hidden_dim=256):
        """Constructor.

        Parameters
        ---
        nb_labels : int
            Number of POS tags to be considered.

        emb_dim : int
            Input_size of the LSTM - effectively embedding dimension of our pretrained transformer.

        hidden_dim : int
            Hidden dimension of the LSTM. Each direction uses
            `hidden_dim // 2` units.
        """
        super().__init__()
        self.hidden_dim = hidden_dim
        self.lstm = nn.LSTM(
            emb_dim, hidden_dim // 2, bidirectional=True, batch_first=True
        )
        # The LSTM emits 2 * (hidden_dim // 2) features per token. This equals
        # hidden_dim for even values and keeps the layers consistent for odd
        # ones, where the two numbers would otherwise disagree.
        self.tag = nn.Linear(2 * (hidden_dim // 2), nb_labels)
        self.hidden = None

    def init_hidden(self, batch_size, device=None):
        """Draw a random initial (hidden state, cell state) pair.

        Parameters
        ---
        batch_size : int
            Number of sequences in the batch.
        device : torch.device, optional
            Device the states are moved to. Defaults to the device of the
            module's own parameters.

        Returns
        ---
        Tuple[torch.tensor, torch.tensor]
            Two tensors of dimension 2 x batch_size x (hidden_dim // 2).
        """
        if device is None:
            device = self.tag.weight.device
        state_shape = (2, batch_size, self.hidden_dim // 2)
        # Sample first and move afterwards, so the random numbers come from
        # the same generator regardless of the target device.
        return (
            torch.randn(*state_shape).to(device),
            torch.randn(*state_shape).to(device),
        )

    def forward(self, x):
        """Score every label for every token.

        Parameters
        ---
        x : torch.tensor
            Embeddings of dimension batch size x sequence length x emb_dim.

        Returns
        ---
        torch.tensor
            Scores of dimension batch size x sequence length x nb_labels.
        """
        # A fresh random initial state is drawn on every call.
        self.hidden = self.init_hidden(x.shape[0], x.device)
        x, self.hidden = self.lstm(x, self.hidden)
        x = self.tag(x)
        return x


class NeuralCRF(nn.Module):
    """Class modeling a neural CRF for POS tagging.
    We model tag-tag dependencies with a weight for each transition
    and word-tag influence through an LSTM on top of a pretrained transformer.
    """

    def __init__(
        self,
        pad_idx_word,
        pad_idx_pos,
        bot_idx,
        t_cal,
        transformer,
        lstm_hidden_dim=64,
        beta=0,
    ):
        """Constructor.

        Parameters
        ---
        pad_idx_word : int
            Index corresponding to padding in the word sequences.
        pad_idx_pos : int
            Index corresponding to padding in the tag sequences.
        bot_idx : int
            Index corresponding to beginning of tag marker in the tag sequences.
        t_cal : List[int]
            List containing all indices corresponding to tags in the tag sequences.
        transformer : BertModel
            Pretrained transformer used to embed sentences before feeding them
            into the LSTM.
        lstm_hidden_dim : int
            Hidden dimension of the LSTM used for POS tagging. Note that
            since we are bidirectional, the effective hidden dimension
            is half of this number.
        beta : float
            Regularization hyperparameter of the entropy regularizer.
            Entropy regularization is only applied for beta > 0.
        """
        super().__init__()
        self.pad_idx_word = pad_idx_word
        self.pad_idx_pos = pad_idx_pos
        self.bot_idx = bot_idx
        self.t_cal = t_cal
        self.transformer = transformer
        self.lstm_hidden_dim = lstm_hidden_dim
        self.beta = beta
        # Allocate the transition scores directly on DEVICE. Calling
        # `.to(DEVICE)` on an `nn.Parameter` returns a plain (non-leaf) tensor
        # whenever a copy is made, so the attribute would not be registered
        # as a parameter and would never be seen by an optimizer.
        self.transitions = nn.Parameter(
            torch.empty(len(t_cal), len(t_cal), device=DEVICE)
        )
        self.emissions = TagLSTM(
            len(t_cal),
            transformer.config.to_dict()["hidden_size"],
            lstm_hidden_dim,
        ).to(DEVICE)
        self.init_weights()

    def init_weights(self):
        nn.init.uniform_(self.transitions, -0.1, 0.1)

    def forward(self, W):
        """Decode each sentence within W and return predicted tagging.

        Parameters
        ---
        W : torch.tensor
            Word sequences of dimension batch size x max sentence length within batch.

        Returns
        ---
        sequences : list
            List of tensors, each of which contains the predicted tag indices for a particular
            word sequence.
        """
        # Calculate scores.
        emissions = self.calculate_emissions(W)
        # Run viterbi sentence by sentence.
        sequences = []
        for sentence in range(W.shape[0]):
            # Exclude beginning and end markers from each word sequence.
            scores, backpointers = self.backward_viterbi_log(
                W[sentence, ], emissions[sentence, :]
            )
            sequences += [self.get_viterbi(backpointers)]
        return sequences

    def calculate_emissions(self, W):
        """Calculate emissions (i.e., scores for each word and batch).

        Parameters
        ---
        W : torch.tensor
            Word sequences of dimension batch size x max sentence
            length within batch.

        Returns
        ---
        emissions : torch.tensor
            Word level scores for each tag of dimension batch_size x max
            sentence length within batch x |T|.
        """
        return self.emissions(self.transformer(W)[0])

    def loss(self, T, W):
        """Calculate the loss for a batch.

        Parameters
        ---
        T : torch.tensor
            True taggings for each sequence within the batch.
            Of dimension batch size x longest sequence within batch.
            Note the paddings, have been added to T for symmetry.
        W : torch.tensor
            Words for each sequence within the batch.
            Of dimension batch size x longest sequence within batch.
            Note that paddings have been added to W.

        Returns
        ---
        torch.tensor
            Mean loss for the batch.
        """
        emissions = self.calculate_emissions(W)
        scores = self.score(emissions, W, T)
        log_normalizer = self.backward_log_Z(W, emissions)

        loss = torch.negative(torch.mean(scores - log_normalizer))
        if self.beta > 0.0:
            unnormalized_entropy = self.backward_entropy(
                W, emissions
            )
            entropy = (
                (unnormalized_entropy / torch.exp(log_normalizer))
                + log_normalizer
            )
            if torch.isinf(torch.max(torch.exp(log_normalizer))):
                return loss
            else:
                return loss + torch.negative(self.beta * torch.mean(entropy))
        else:
            return loss

    def score(self, emissions, W, T):
        """Calculate scores for specified taggings and word sequences.

        Parameters
        ---
        emissions : torch.tensor
        T : torch.tensor
            Taggings for each sequence within the batch.
            Of dimension batch size x longest sequence within batch.
            Note the paddings have been added to T.
            We expect T to already have the initial BOT tag indices removed
            (see `loss` for details).
        W : torch.tensor
            Words for each sequence within the batch.
            Of dimension batch size x longest sequence within batch.
            Note the paddings have been added to W so we mask them out here.
            (see `loss` for details).

        Returns
        ---
        scores : torch.tensor
            score(T, W) for all samples in W.
        """
        scores = (
            emissions[:, 0].gather(1, (T[:, 0]).unsqueeze(1)).squeeze()
            + self.transitions[self.bot_idx, T[:, 0]]
        )
        for word in range(1, emissions.shape[1]):
            mask = torch.where(
                W[:, word] == self.pad_idx_word, 0, 1
            )
            scores += mask * (
                emissions[:, word]
                .gather(1, (T[:, word]).unsqueeze(1))
                .squeeze()
                + self.transitions[T[:, word - 1], T[:, word]]
            )
        return scores

    def viterbi_naive(self, W, emissions):
        """Calculate best tagging naively and return both the best score and best tagging in log space.

        NB: This naive version is not vectorized over samples.

        Parameters
        ---
        W : torch.tensor
            Of dimension longest sequence within batch or less.
            Note the paddings have been added to W so we manually remove them here if present.
        emissions : torch.tensor
            Word level scores for each tag of dimension max
            sentence length within batch x |T|

        Returns
        ---
        Tuple[torch.tensor, torch.tensor]
            Tuple containing the log-score of the best tagging and the
            indices of the best tagging for W.
        """
        T = self.t_cal
        # Remove padding.
        if torch.any(W == self.pad_idx_word):
            W = W[torch.where(W != self.pad_idx_word)[0]]
        T_abs = len(T)
        combinations = torch.combinations(
            T, r=W.shape[0], with_replacement=True
        ).to(DEVICE)
        combinations = torch.cartesian_prod(*[T for ix in range(W.shape[0])]).to(DEVICE)
        best_score = torch.tensor(0.0, dtype=torch.float64).to(DEVICE)
        best_tag = torch.tensor([]).to(DEVICE)
        for ix, combination in enumerate(combinations):
            if W.shape[0] == 1:
                current_score = (
                    emissions[0, combination]
                    + self.transitions[self.bot_idx, combination]
                )
            else:
                current_score = (
                    emissions[0, combination[0]]
                    + self.transitions[self.bot_idx, combination[0]]
                )
                for qx in range(1, combination.shape[0]):
                    current_score += (
                        emissions[qx, combination[qx]]
                        + self.transitions[
                            combination[qx - 1], combination[qx]
                        ]
                    )

            if (current_score) > best_score:
                best_score = current_score.double()
                best_tag = combination
        return best_score, best_tag

    def log_Z_naive(self, W, emissions):
        """Calculate log Z naively.

        NB: This naive version is not vectorized over samples.

        Parameters
        ---
        W : torch.tensor
            Of dimension longest sequence within batch or less.
            Note the paddings have been added to W so we manually remove them here if present.
        emissions : torch.tensor
            Word level scores for each tag of dimension max
            sentence length within batch x |T|

        Returns
        ---
        torch.tensor
            Log Z for W.
        """
        T = self.t_cal
        # Remove padding
        W = W[torch.where(W != self.pad_idx_word)[0]]
        T_abs = len(T)

        # Generate \mathcal{T}^N.
        combinations = torch.cartesian_prod(*[T for ix in range(W.shape[0])]).to(DEVICE)
        log_normalizer = torch.zeros(
            combinations.shape[0], dtype=torch.float64
        ).to(DEVICE)
        # Loop over all possible combinations naively.
        # NB: This is essentially line one on Slide 50.
        for ix, combination in enumerate(combinations):
            # Kludge since indexing is slightly different for one-dim
            # tensors vs two tensors.
            if W.shape[0] == 1:
                # Calculate score as the sum of emissions (i.e., how well
                # does a word match a tag based on BERT embeddings) and
                # transitions (globally, how likely is a transition
                # from the previous tag to the current tag).
                # NB: For the first word, the initial tag is always BOT.
                # NB 2: Since we are in log-space, the exp
                # of the score goes away.
                log_normalizer[ix] = (
                    emissions[0, combination]
                    + self.transitions[self.bot_idx, combination]
                )

            else:
                # Initial score is identical to above.
                log_normalizer[ix] = (
                    emissions[0, combination[0]]
                    + self.transitions[self.bot_idx, combination[0]]
                )
                for qx in range(1, combination.shape[0]):
                    # Score within each potential tagging
                    # is calculated the same as above except that we now
                    # actually use the previous tag instead of always
                    # BOT.
                    log_normalizer[ix] += (
                        emissions[qx, combination[qx]]
                        + self.transitions[
                            combination[qx - 1], combination[qx]
                        ]
                    )
        # Calculate logsumexp numerically stable
        # since we are in log-space.
        return torch.logsumexp(log_normalizer, 0)


    def entropy_naive(self, W, emissions):
        """Calculate the unnormalized entropy naively.

        NB: This naive version is not vectorized over samples.

        Parameters
        ---
        W : torch.tensor
            Words for each sequence within the batch.
            Of dimension longest sequence within batch or less.
            Note the paddings have been added to W so we manually remove them here if present.
        emissions : torch.tensor
            Word level scores for each tag of dimension max
            sentence length within batch x |T|

        Returns
        ---
        torch.tensor
            Log Z for W.
        """
        T = self.t_cal
        # Remove padding
        W = W[torch.where(W != self.pad_idx_word)[0]]
        T_abs = len(T)
        combinations = torch.combinations(
            T, r=W.shape[0], with_replacement=True
        )
        combinations = torch.cartesian_prod(T, T)
        combinations = torch.cartesian_prod(*[T for ix in range(W.shape[0])]).to(DEVICE)
        entropy = torch.zeros(1, dtype=torch.float64).to(DEVICE)
        for ix, combination in enumerate(combinations):
            if W.shape[0] == 1:
                entropy -= torch.exp((
                    emissions[0, combination]
                    + self.transitions[self.bot_idx, combination]
                )) * (
                    emissions[0, combination]
                    + self.transitions[self.bot_idx, combination]
                )

            else:
                local_score = (
                    emissions[0, combination[0]]
                    + self.transitions[self.bot_idx, combination[0]]
                )
                for qx in range(1, combination.shape[0]):
                    local_score += (
                        emissions[qx, combination[qx]]
                        + self.transitions[
                            combination[qx - 1], combination[qx]
                        ]
                    )
                entropy -= torch.exp(local_score) * local_score
        return entropy

    def get_viterbi(self, backpointer_matrix):
        """Return the best tagging based on a backpointer matrix.

        Placeholder: the method is not implemented yet and always raises.

        Parameters
        ---
        backpointer_matrix : torch.tensor
            Backpointer matrix from Viterbi indicating which
            tag is the highest scoring for each element in the sequence.

        Returns
        ---
        torch.tensor
            Indices of the best tagging based on `backpointer_matrix`.

        Raises
        ---
        NotImplementedError
            Always, until the method is implemented.
        """
        raise NotImplementedError

    def backward_log_Z(self, W, emissions):
        """Calculate log Z using the backward algorithm.

        Placeholder: the method is not implemented yet and always raises.

        NB: You do need to vectorize this over samples.

        Parameters
        ---
        W : torch.tensor
            Words for each sequence within the batch.
            Of dimension batch size x longest sequence within batch.
            Note that paddings have been added to W, so they need to be
            masked out here.
        emissions : torch.tensor
            Word level scores for each tag of dimension batch_size x max
            sentence length within batch x |T|

        Returns
        ---
        torch.tensor
            Log Z for each sample in W.

        Raises
        ---
        NotImplementedError
            Always, until the method is implemented.
        """
        raise NotImplementedError

    def forward_log_Z(self, W, emissions):
        """Calculate log Z using the forward algorithm.

        Placeholder: the method is not implemented yet and always raises.

        NB: You do need to vectorize this over samples.

        Parameters
        ---
        W : torch.tensor
            Words for each sequence within the batch.
            Of dimension batch size x longest sequence within batch.
            Note that paddings have been added to W, so they need to be
            masked out here.
        emissions : torch.tensor
            Word level scores for each tag of dimension batch_size x max
            sentence length within batch x |T|

        Returns
        ---
        torch.tensor
            Log Z for each sample in W.

        Raises
        ---
        NotImplementedError
            Always, until the method is implemented.
        """
        raise NotImplementedError

    def backward_entropy(self, W, emissions):
        """Calculate the unnormalized entropy using the backward algorithm.

        Placeholder: the method is not implemented yet and always raises.

        NB: You do need to vectorize this over samples.

        Parameters
        ---
        W : torch.tensor
            Words for each sequence within the batch.
            Of dimension batch size x longest sequence within batch.
            Note that paddings have been added to W, so they need to be
            masked out here.
        emissions : torch.tensor
            Word level scores for each tag of dimension batch_size x max
            sentence length within batch x |T|

        Returns
        ---
        torch.tensor
            Unnormalized entropy for each sample in W.

        Raises
        ---
        NotImplementedError
            Always, until the method is implemented.
        """
        raise NotImplementedError

    def backward_viterbi_log(self, W, emissions):
        """Calculate the best tagging using the backward algorithm and return
            both the scoring matrix in log-space and the backpointer matrix.

        Placeholder: the method is not implemented yet and always raises.

        NB: You do not need to vectorize this over samples.

        Parameters
        ---
        W : torch.tensor
            Of dimension longest sequence within batch or less.
            Note that paddings have been added to W, so they need to be
            removed here if present.
        emissions : torch.tensor
            Word level scores for each tag of dimension max
            sentence length within batch x |T|

        Returns
        ---
        Tuple[torch.tensor, torch.tensor]
            Tuple containing the scoring matrix in log-space and the
            backpointer matrix for recovering the best tagging.

        Raises
        ---
        NotImplementedError
            Always, until the method is implemented.
        """
        raise NotImplementedError

    def dijkstra_viterbi_log(self, W, emissions):
        """Calculate the best tagging using Dijkstra's algorithm and return
            both the best score and best tagging in log space.

        Placeholder: the method is not implemented yet and always raises.

        NB: You do not need to vectorize this over samples.

        Parameters
        ---
        W : torch.tensor
            Of dimension longest sequence within batch or less.
            Note that paddings have been added to W, so they need to be
            removed here if present.
        emissions : torch.tensor
            Word level scores for each tag of dimension max
            sentence length within batch x |T|


        Returns
        ---
        Tuple[torch.tensor, torch.tensor]
            Tuple containing the log-score of the best tagging and log_Z.
            NB: log_Z is returned when it is already computed within the
            method to calculate probabilities, presumably so that it does
            not have to be computed a second time.
            NOTE(review): the summary above speaks of returning the best
            tagging while this section lists log_Z - confirm the intended
            return value before implementing.

        Raises
        ---
        NotImplementedError
            Always, until the method is implemented.
        """
        raise NotImplementedError

In [None]:
# Only the first training batch is needed: pad its words and tags into
# rectangular tensors and stop iterating.
for data in train_dataloader:
    W_train = F.to_tensor(data["words"], padding_value=pad_token_idx).to(DEVICE)
    T_train = F.to_tensor(data["pos"], padding_value=pos_vocab[pad_token]).to(DEVICE)
    break

# Pretrained transformer, switched to evaluation mode.
bert = BertModel.from_pretrained(TRANSFORMER).to(DEVICE)
bert.eval()
# Every index of the tag vocabulary, special entries included.
T_CAL = torch.tensor(list(range(len(pos_vocab)))).to(DEVICE)
crf = NeuralCRF(
    pad_idx_word=pad_token_idx,
    pad_idx_pos=pos_vocab[pad_token],
    bot_idx=pos_vocab[init_token],
    t_cal=T_CAL,
    transformer=bert,
)


Downloading pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


## Q3a)

In [None]:
# Compare the vectorised backward algorithm against the brute-force
# normaliser on sentence prefixes of one, two and three words.
emissions = crf.calculate_emissions(W_train)
for word_index in [1, 2, 3]:
    # `backward_log_Z` handles the whole batch at once, so it is evaluated
    # once per prefix length instead of once per sentence.
    log_Z_batch = crf.backward_log_Z(W_train[:, :word_index], emissions)
    for sentence in range(W_train.shape[0]):
        assert torch.isclose(
            crf.log_Z_naive(
                W_train[sentence, :word_index],
                emissions[
                    sentence,
                ],
            ),
            log_Z_batch[sentence],
            atol=1e-07
        )


NotImplementedError: ignored

In [None]:
# Re-seed every random number generator before rebuilding the CRF: its
# transition scores are initialised at random, and the comparison below is
# against fixed numbers.
torch.manual_seed(SEED)
random.seed(SEED)
np.random.seed(SEED)

crf = NeuralCRF(
    pad_idx_word=pad_token_idx,
    pad_idx_pos=pos_vocab[pad_token],
    bot_idx=pos_vocab[init_token],
    t_cal=T_CAL,
    transformer=bert,
)
emissions = crf.calculate_emissions(W_train)

# Reference log Z values, one per sentence of the first training batch.
# The reference tensor is float64 and lives on 'cuda:0', so the result of
# `backward_log_Z` must be on that device as well.
assert torch.all(torch.isclose(crf.backward_log_Z(W_train, emissions),
    torch.tensor([ 87.7046677 ,  53.91382176,  51.06687669,  47.74253652,
       108.39011677,  39.48809894,  38.95036134,  47.93318056,
       105.83247378,  60.44181916,  57.37825015,  87.27584595,
        56.36454825,  51.35881652,  81.8586711 ,  78.36976768,
        80.75507746,  32.99132527,  60.26593105,  26.69618798,
        44.96849009,  48.03838924,  51.79710605, 121.1404292 ,
       148.84424765, 154.71288978,  39.11101607,  30.1374774 ,
        26.99081028,  91.06613232,  60.98234626,  79.20111824], device='cuda:0',
       dtype=torch.float64), atol=1e-07))

## Q3b)

In [None]:
# The forward and the backward algorithm must agree on log Z for every
# sentence of the batch.
emissions = crf.calculate_emissions(W_train)
log_Z_backward = crf.backward_log_Z(W_train, emissions)
log_Z_forward = crf.forward_log_Z(W_train, emissions)
assert torch.all(torch.isclose(log_Z_backward, log_Z_forward, atol=1e-07))


## Q3c)

In [None]:
# Compare Viterbi decoding against exhaustive search on sentence prefixes
# of one, two and three words.
emissions = crf.calculate_emissions(W_train)

for sentence in range(W_train.shape[0]):
    sentence_emissions = emissions[sentence,]
    for word_index in [1, 2, 3]:
        prefix = W_train[sentence, :word_index]
        score_naive, sequence_naive = crf.viterbi_naive(
            prefix, sentence_emissions
        )
        score_viterbi, backpointers_viterbi = crf.backward_viterbi_log(
            prefix, sentence_emissions
        )
        sequence_viterbi = crf.get_viterbi(backpointers_viterbi)
        # Entry [0, 0] of the Viterbi score matrix is compared with the
        # brute-force best score.
        assert torch.isclose(score_viterbi[0, 0], score_naive, atol=1e-07)
        assert torch.all(sequence_viterbi == sequence_naive)


In [None]:
# NB: Our evaluation expects Viterbi to only predict tags for actual
# words, thus Viterbi (or get_viterbi) is supposed to remove instances
# of PAD. Example: If Viterbi is asked to predict the sequence
# ["I", "like", "dogs", "PAD", "PAD"], it should return a tagging
# of length 3 (one for each valid word in the input).
for seed_fn in (torch.manual_seed, random.seed, np.random.seed):
    seed_fn(SEED)

crf = NeuralCRF(
    pad_idx_word=pad_token_idx,
    pad_idx_pos=pos_vocab[pad_token],
    bot_idx=pos_vocab[init_token],
    t_cal=T_CAL,
    transformer=bert,
)
emissions = crf.calculate_emissions(W_train)
sequences_overall = []
first_batch_sequences_test =  [torch.tensor([10, 10, 10, 10, 10, 10, 10, 10, 15, 14, 10, 15, 14,  1,  3,  2,  4,  4,
                                        4,  4,  4,  4, 14, 14,  4,  4,  4,  4, 15], device='cuda:0'),
                                torch.tensor([14, 10, 10, 15, 15, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 15, 14],
                                        device='cuda:0'),
                                torch.tensor([14, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10],
                                        device='cuda:0'),
                                torch.tensor([14, 13,  3, 13,  3,  3, 13,  4, 15, 15, 15, 15, 15,  6,  9, 10],
                                        device='cuda:0'),
                                torch.tensor([11, 10, 15, 10, 10, 10, 10, 10, 10, 10, 15, 10, 10,  3, 14, 14, 14, 13,
                                        10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 15,  6,  3,  2,  2, 18],
                                        device='cuda:0'),
                                torch.tensor([14,  9,  4, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15], device='cuda:0'),
                                torch.tensor([10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10], device='cuda:0'),
                                torch.tensor([14,  1, 10,  3, 14,  1,  3, 14,  8, 14, 14, 14, 14, 14,  0, 16],
                                        device='cuda:0'),
                                torch.tensor([10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
                                        10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10],
                                        device='cuda:0'),
                                torch.tensor([14, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
                                        10, 10], device='cuda:0'),
                                torch.tensor([14, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
                                        2], device='cuda:0'),
                                torch.tensor([ 9, 15, 15, 15, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
                                        10,  5, 14, 13, 14, 13,  4, 15,  2,  2,  2], device='cuda:0'),
                                torch.tensor([14,  9, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 14,  4, 14, 14,  0,
                                        16], device='cuda:0'),
                                torch.tensor([14, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10],
                                        device='cuda:0'),
                                torch.tensor([14, 14, 14, 13,  3, 17, 10, 10, 15, 14, 14, 14, 14, 14, 14, 14, 14,  9,
                                        10, 10, 10, 10, 10, 10, 10,  2,  2], device='cuda:0'),
                                torch.tensor([14, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
                                        10, 10, 10, 10, 10, 10,  8, 14], device='cuda:0'),
                                torch.tensor([14, 10, 10, 10, 10, 15, 10, 10, 15,  1, 10, 10, 10, 10, 10, 10, 10, 10,
                                        10, 10, 10, 10, 15,  4, 14, 14, 14], device='cuda:0'),
                                torch.tensor([14, 10, 10, 10, 10, 10, 10, 10, 10, 10, 15], device='cuda:0'),
                                torch.tensor([14,  1, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
                                        10, 10], device='cuda:0'),
                                torch.tensor([11, 10, 10, 10, 10, 10, 10, 10, 10], device='cuda:0'),
                                torch.tensor([10, 10, 10, 10, 10, 10, 10, 10,  8, 14,  8, 14, 14, 14, 14],
                                        device='cuda:0'),
                                torch.tensor([14, 11, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10],
                                        device='cuda:0'),
                                torch.tensor([ 9,  1,  9, 10, 10, 10, 15, 15, 10, 10, 10, 10, 10, 10,  6,  0,  6],
                                        device='cuda:0'),
                                torch.tensor([14, 10, 10, 15, 14, 15, 14, 14, 14, 15, 14, 11, 10, 10,  3, 14, 11, 10,
                                        10, 15, 14, 14, 10, 10, 15, 14, 14, 10, 10, 10, 10, 10, 10, 15, 15, 15,
                                        15, 15, 15, 14], device='cuda:0'),
                                torch.tensor([10, 10, 10,  3, 14, 14,  9, 18, 14, 10, 10, 10, 10, 10, 10, 10, 10, 10,
                                        10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
                                        10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 15,  9], device='cuda:0'),
                                torch.tensor([10, 10,  3, 10, 10, 10, 10, 10,  3,  2,  4,  4, 15,  3,  9, 10, 10, 10,
                                        10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,  0, 11, 10, 15,
                                        6, 16, 10, 10, 10, 10, 15, 10, 10, 10, 10, 10, 10, 10, 10],
                                        device='cuda:0'),
                                torch.tensor([14, 14, 14, 14, 14, 14, 14, 11, 10, 10,  8, 17, 14], device='cuda:0'),
                                torch.tensor([14,  3, 14, 11, 10,  3, 14, 11, 10, 15], device='cuda:0'),
                                torch.tensor([14,  1, 14, 14, 14,  8,  3, 14, 14], device='cuda:0'),
                                torch.tensor([15, 15, 10, 10, 10, 15,  1, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
                                        10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 15], device='cuda:0'),
                                torch.tensor([15, 15, 15, 15, 15,  3, 10, 10, 10, 15, 10, 10, 10,  3, 17,  4,  4, 15,
                                        10, 15], device='cuda:0'),
                                torch.tensor([10, 10, 10, 10, 10, 10, 15,  3, 14, 14, 10, 10, 10, 10, 15, 15, 14, 10,
                                        10, 10, 10, 10, 10, 10, 10, 10], device='cuda:0')]

# Decode every row of the batch with the backward Viterbi recursion and
# collect the resulting tag sequences.
for sentence in range(W_train.shape[0]):
    sentence_words = W_train[sentence, :]
    sentence_emissions = emissions[sentence]
    score_viterbi, backpointers_viterbi = crf.backward_viterbi_log(
        sentence_words, sentence_emissions
    )
    sequence_viterbi = crf.get_viterbi(backpointers_viterbi)
    sequences_overall.append(sequence_viterbi)

# Each decoded sequence has to agree element-wise with its hard-coded
# reference from `first_batch_sequences_test`.
sequence_matches = [
    torch.all(first_batch_sequences_test[ix].to(DEVICE) == sequences_overall[ix])
    for ix in range(len(sequences_overall))
]
assert torch.all(torch.tensor(sequence_matches))

## Q3d)

In [None]:
# Q3d check: on the 1-, 2- and 3-word prefixes of every row of W_train,
# the backward Viterbi score must equal the Dijkstra score plus
# (number of non-padding words) * log_Z.
emissions = crf.calculate_emissions(W_train)

for sentence in range(W_train.shape[0]):
    for word_index in (1, 2, 3):
        prefix_words = W_train[sentence, :word_index]
        prefix_emissions = emissions[sentence, :word_index]

        score_viterbi, backpointers_viterbi = crf.backward_viterbi_log(
            prefix_words, prefix_emissions
        )
        score_dijkstra, log_Z = crf.dijkstra_viterbi_log(
            prefix_words, prefix_emissions
        )

        # Count only the positions that are not word padding.
        n_real_words = torch.sum(prefix_words != crf.pad_idx_word)
        expected_score = score_dijkstra + n_real_words * log_Z
        assert torch.isclose(score_viterbi[0, 0], expected_score, atol=1e-07)

## Q3e)

In [None]:
%%timeit -n 5 -r2
# Time the backward Viterbi recursion on the first three word positions
# of row 0 of W_train (5 loops per run, 2 runs).
# NOTE(review): the words are sliced to `:3` but the emissions are passed
# unsliced, whereas the Q3d check slices both - confirm this is intended.
score_viterbi, backpointers_viterbi = crf.backward_viterbi_log(W_train[0, :3], emissions[0, :])

In [None]:
%%timeit -n 5 -r2
# Time the Dijkstra-based Viterbi on the first three word positions of
# row 0 of W_train (5 loops per run, 2 runs).
# NOTE(review): the words are sliced to `:3` but the emissions are passed
# unsliced, whereas the Q3d check slices both - confirm this is intended.
score_dijkstra, log_Z = crf.dijkstra_viterbi_log(W_train[0, :3], emissions[0, :])

In [None]:
%%timeit -n 5 -r2
# Time the naive Viterbi implementation on the first three word positions
# of row 0 of W_train (5 loops per run, 2 runs).
# NOTE(review): the words are sliced to `:3` but the emissions are passed
# unsliced, whereas the Q3d check slices both - confirm this is intended.
score_naive, sequence_naive = crf.viterbi_naive(W_train[0, :3], emissions[0, :])

## Q3f)

In [None]:
def train_model_report_accuracy(
    crf,
    lr,
    epochs,
    train_dataloader,
    dev_dataloader,
    pad_token_idx_word,
    pad_token_idx_tag,
):
    """Train model for `epochs` epochs and report performance on
        dev set after each epoch.

    The reported accuracy is the mean over dev sentences of the
    per-sentence fraction of correctly predicted tags.

    Parameters
    ---
    crf : NeuralCRF
        Model to train. It must provide `loss(T, W)`, be callable on a
        padded word batch (returning one tag tensor per sentence) and
        expose its encoder as `crf.transformer`.
    lr : float
        Learning rate to train with.
    epochs : int
        For how many epochs to train.
    train_dataloader : torch.DataLoader
        Yields batches with "words" and "pos" entries used for training.
    dev_dataloader : torch.DataLoader
        Yields batches with "words" and "pos" entries used for the
        accuracy report after each epoch.
    pad_token_idx_word : int
        Index with which to pad the word indices.
    pad_token_idx_tag : int
        Index with which to pad the tag indices.

    Returns
    ---
    None
        Results are only printed.
    """
    optimizer = torch.optim.Adam(crf.parameters(), lr=lr)
    for epoch in range(epochs):
        # ---- Training pass -------------------------------------------
        crf.train()
        crf.transformer.train()
        for data in train_dataloader:
            W = F.to_tensor(data["words"], padding_value=pad_token_idx_word).to(DEVICE)
            T = F.to_tensor(data["pos"], padding_value=pad_token_idx_tag).to(DEVICE)
            # Drop stale gradients (set to None rather than zero-filled).
            optimizer.zero_grad(set_to_none=True)
            loss = crf.loss(T, W)
            loss.backward()
            optimizer.step()

        # ---- Evaluation pass -----------------------------------------
        crf.eval()
        crf.transformer.eval()
        with torch.no_grad():
            predicted_sequences = []
            true_sequences = []
            # Evaluate on the loader that was passed in, not on a
            # notebook-level global.
            for data_dev in dev_dataloader:
                W_dev = F.to_tensor(
                    data_dev["words"], padding_value=pad_token_idx_word
                ).to(DEVICE)
                T_dev = F.to_tensor(
                    data_dev["pos"], padding_value=pad_token_idx_tag
                ).to(DEVICE)
                sequence_viterbi = crf(W_dev)
                predicted_sequences += sequence_viterbi
                # Cut each gold row to the length of its prediction so the
                # padded tail is not compared.
                true_sequences += [
                    T_dev[ix, : sequence_viterbi[ix].shape[0]]
                    for ix in range(W_dev.shape[0])
                ]

            acc = torch.tensor(0.0).to(DEVICE)
            for predicted, gold in zip(predicted_sequences, true_sequences):
                acc += torch.mean((predicted == gold).float())
            acc = acc / len(predicted_sequences)
            print("-------------------------")
            print(f"Epoch: {epoch + 1} / {epochs}")
            print(f"Development set accuracy: {acc}")
            print("-------------------------")
    return None


In [None]:
# Baseline run: reseed every RNG, release cached GPU memory, then train a
# freshly initialised BERT + CRF model.
torch.manual_seed(SEED)
random.seed(SEED)
np.random.seed(SEED)

torch.cuda.empty_cache()
gc.collect()

bert = BertModel.from_pretrained(TRANSFORMER).to(DEVICE)
crf_config = dict(
    pad_idx_word=pad_token_idx,
    pad_idx_pos=pos_vocab[pad_token],
    bot_idx=pos_vocab[init_token],
    t_cal=T_CAL,
    transformer=bert,
)
crf = NeuralCRF(**crf_config)

train_model_report_accuracy(
    crf=crf,
    lr=LR,
    epochs=EPOCHS,
    train_dataloader=train_dataloader,
    dev_dataloader=valid_dataloader,
    pad_token_idx_word=pad_token_idx,
    pad_token_idx_tag=pos_vocab[pad_token],
)


## Q3g)

In [None]:
# Q3g check: for the 1-, 2- and 3-word prefixes, the batched backward
# entropy must agree with the naive per-sentence entropy.
emissions = crf.calculate_emissions(W_train)

for word_index in [1, 2, 3]:
    # The batched call takes no per-sentence input, so it is evaluated once
    # per prefix length instead of once per (sentence, prefix) pair.
    batch_entropy = crf.backward_entropy(W_train[:, :word_index], emissions)
    for sentence in range(W_train.shape[0]):
        naive_entropy = crf.entropy_naive(
            W_train[sentence, :word_index],
            emissions[sentence],
        )
        assert torch.isclose(naive_entropy, batch_entropy[sentence], atol=1e-04)


In [None]:
# Regression check: with a freshly seeded, untrained model the backward
# entropy of the batch must reproduce the stored reference values.
torch.manual_seed(SEED)
random.seed(SEED)
np.random.seed(SEED)
bert = BertModel.from_pretrained(TRANSFORMER).to(DEVICE)
crf_config = dict(
    pad_idx_word=pad_token_idx,
    pad_idx_pos=pos_vocab[pad_token],
    bot_idx=pos_vocab[init_token],
    t_cal=T_CAL,
    transformer=bert,
)
crf = NeuralCRF(**crf_config)
emissions = crf.calculate_emissions(W_train)

actual_entropy = crf.backward_entropy(W_train, emissions)
expected_entropy = torch.tensor(
    [
        -2.69665035e+37,  2.23318055e+22, -1.43720747e+21, -3.77172095e+19,
        -1.47468585e+47,  4.60982311e+15, -8.96114624e+15, -2.66620540e+20,
        -6.01803307e+45,  5.64486026e+24, -1.41724632e+25, -6.25432556e+37,
        -5.13592636e+23, -9.27744961e+21, -4.01553440e+35, -3.38492877e+33,
        -1.01823554e+35,  1.75573266e+13, -5.52993136e+24, -6.16945452e+10,
         1.83060232e+18,  7.18576385e+18, -1.06388015e+22, -3.28524515e+51,
        -1.60395407e+64, -1.20556752e+67, -3.29681723e+16, -3.47751146e+12,
        -8.19066087e+10, -1.66010499e+39, -1.45616448e+26, -2.05484106e+34,
    ],
    device='cuda:0',
    dtype=torch.float64,
)
assert torch.all(
    torch.isclose(actual_entropy, expected_entropy, atol=1e-04, rtol=1e-04)
)

In [None]:
# Entropy-regularised run with beta = 10.0: reseed every RNG, release
# cached GPU memory, then train a freshly initialised model.
torch.manual_seed(SEED)
random.seed(SEED)
np.random.seed(SEED)

torch.cuda.empty_cache()
gc.collect()

bert = BertModel.from_pretrained(TRANSFORMER).to(DEVICE)
crf_config = dict(
    pad_idx_word=pad_token_idx,
    pad_idx_pos=pos_vocab[pad_token],
    bot_idx=pos_vocab[init_token],
    t_cal=T_CAL,
    transformer=bert,
    beta=10.0,
)
entropy_regularized_crf = NeuralCRF(**crf_config)

train_model_report_accuracy(
    crf=entropy_regularized_crf,
    lr=LR,
    epochs=EPOCHS,
    train_dataloader=train_dataloader,
    dev_dataloader=valid_dataloader,
    pad_token_idx_word=pad_token_idx,
    pad_token_idx_tag=pos_vocab[pad_token],
)


In [None]:
# Entropy-regularised run with beta = 1.0: reseed every RNG, release
# cached GPU memory, then train a freshly initialised model.
torch.manual_seed(SEED)
random.seed(SEED)
np.random.seed(SEED)

torch.cuda.empty_cache()
gc.collect()

bert = BertModel.from_pretrained(TRANSFORMER).to(DEVICE)
crf_config = dict(
    pad_idx_word=pad_token_idx,
    pad_idx_pos=pos_vocab[pad_token],
    bot_idx=pos_vocab[init_token],
    t_cal=T_CAL,
    transformer=bert,
    beta=1.0,
)
entropy_regularized_crf = NeuralCRF(**crf_config)

train_model_report_accuracy(
    crf=entropy_regularized_crf,
    lr=LR,
    epochs=EPOCHS,
    train_dataloader=train_dataloader,
    dev_dataloader=valid_dataloader,
    pad_token_idx_word=pad_token_idx,
    pad_token_idx_tag=pos_vocab[pad_token],
)


In [None]:
# Entropy-regularised run with beta = 0.1: reseed every RNG, release
# cached GPU memory, then train a freshly initialised model.
torch.manual_seed(SEED)
random.seed(SEED)
np.random.seed(SEED)

torch.cuda.empty_cache()
gc.collect()

bert = BertModel.from_pretrained(TRANSFORMER).to(DEVICE)
crf_config = dict(
    pad_idx_word=pad_token_idx,
    pad_idx_pos=pos_vocab[pad_token],
    bot_idx=pos_vocab[init_token],
    t_cal=T_CAL,
    transformer=bert,
    beta=0.1,
)
entropy_regularized_crf = NeuralCRF(**crf_config)

train_model_report_accuracy(
    crf=entropy_regularized_crf,
    lr=LR,
    epochs=EPOCHS,
    train_dataloader=train_dataloader,
    dev_dataloader=valid_dataloader,
    pad_token_idx_word=pad_token_idx,
    pad_token_idx_tag=pos_vocab[pad_token],
)
