In [1]:
%load_ext autoreload
%autoreload 2

In [6]:
!curl -so train.txt https://raw.githubusercontent.com/chaojiang06/chaojiang06.github.io/master/TA/spring2022_CS4650/train.txt

In [10]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import random

device = 'mps'
# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

### Data

In [11]:
# ===========================================================================
# Run some preprocessing code for our dataset. Don't modify anything in this cell.
# ===========================================================================

def load_tag_data(tag_file):
    all_sentences = []
    all_tags = []
    sent = []
    tags = []
    with open(tag_file, 'r') as f:
        for line in f:
            if line.strip() == "":
                all_sentences.append(sent)
                all_tags.append(tags)
                sent = []
                tags = []
            else:
                word, tag, _ = line.strip().split()
                sent.append(word)
                tags.append(tag)
    return all_sentences, all_tags

train_sentences, train_tags = load_tag_data('train.txt')

unique_tags = set([tag for tag_seq in train_tags for tag in tag_seq])

# Create train-val split from train data
train_val_data = list(zip(train_sentences, train_tags))
random.shuffle(train_val_data)
split = int(0.8 * len(train_val_data))
training_data = train_val_data[:split]
val_data = train_val_data[split:]

print("Train Data: ", len(training_data))
print("Val Data: ", len(val_data))
print("Total tags: ", len(unique_tags))

Train Data:  7148
Val Data:  1788
Total tags:  44


In [12]:
# ===========================================================================
# Don't modify anything in this cell.
# ===========================================================================

word_to_idx = {}
for sent in train_sentences:
    for word in sent:
        if word not in word_to_idx:
            word_to_idx[word] = len(word_to_idx)

tag_to_idx = {}
for tag in unique_tags:
    if tag not in tag_to_idx:
        tag_to_idx[tag] = len(tag_to_idx)

idx_to_tag = {}
for tag in tag_to_idx:
    idx_to_tag[tag_to_idx[tag]] = tag

print("Total tags", len(tag_to_idx))
print("Vocab size", len(word_to_idx))

Total tags 44
Vocab size 19122


In [13]:
def prepare_sequence(sent, idx_mapping):
    idxs = [idx_mapping[word] for word in sent]
    return torch.tensor(idxs, dtype=torch.long)

In [23]:
prepare_sequence(["sometimes", "experiment"], word_to_idx)

tensor([1138, 4074])

### Modeling

In [2]:
# If you are interested in what other models are available, you can find a
# list of model names here (e.g., roberta-base, bert-base-uncased):
# https://huggingface.co/transformers/pretrained_models.html

from transformers import DistilBertModel, DistilBertTokenizerFast
bert_model = DistilBertModel.from_pretrained('distilbert-base-uncased')
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

  from .autonotebook import tqdm as notebook_tqdm
tokenizer.json: 100%|██████████| 466k/466k [00:00<00:00, 5.62MB/s]


In [24]:
from torch.utils.data import Dataset

class POSDataset(Dataset):
  def __init__(self, data, tokenizer, max_len):
    self.data = data
    self.tokenizer = tokenizer
    self.max_len = max_len

  def __len__(self):
    return len(self.data)

  def __getitem__(self, index):
    """
    Given an index, return the value in your training data (self.data). Make
    sure the full output dict from self.tokenizer is returned, with an additional
    value for your labels.

    Remember! Your BERT tokenizer will give multiple tokens to words with the
    same POS tag. We want the FIRST token be given the tag and all other tokens
    to be given -100.

    Hint: You may use the prepare_sequence() function from earlier sections
    Hint: Our training data is already tokenized, so you may find the `is_split_into_words=True`
      and `return_offsets_mapping=True` arguments helpful for getting the token offsets.
    Hint: When using the tokenizer, you can also use padding='max_length' for [PAD]
      tokens to be added for you.
    """
    encoding = None

    ### BEGIN YOUR CODE ###

    # Get the sentence and POS tags
    sentence, tags = self.data[index]
    tags = prepare_sequence(tags, tag_to_idx).to(device)

    # Use the BERT tokenizer (self.tokenizer) to encode the sentence. Make sure to
    # truncate the sentence if it is longer than self.max_len, and pad the sentence if it
    # is less than self.max_len.
    encoding = self.tokenizer(sentence,
                              is_split_into_words=True,
                              return_offsets_mapping=True,
                              padding='max_length',
                              truncation=True,
                              max_length=self.max_len)

    # Create token labels, where the first token of each word is the POS tag, and
    # all others are -100.
    encoded_labels = torch.ones(len(encoding["offset_mapping"]), dtype=int) * -100
    i = 0
    for idx, mapping in enumerate(encoding["offset_mapping"]):
      if mapping[0] == 0 and mapping[1] != 0:
        encoded_labels[idx] = tags[i]
        i += 1

    # Add the token labels back to the tokenized dict
    encoding['labels'] = torch.as_tensor(encoded_labels)

    # Make sure both your encoded sentence, labels and attention mask are PyTorch tensors
    encoding = {k: torch.as_tensor(v) for k, v in encoding.items()}

    ### END YOUR CODE ###

    return encoding

In [25]:
# Use your POSDataset class to create a train and test set
MAX_LEN = 128

# Further split your train data into train/test. You now have train/test/val.
train_test_data, split = training_data, int(0.7 * len(training_data))
random.shuffle(train_test_data)
split_training_data, split_test_data = train_test_data[:split], train_test_data[split:]

training_set = POSDataset(split_training_data, tokenizer, MAX_LEN)
testing_set = POSDataset(split_test_data, tokenizer, MAX_LEN)
validation_set = POSDataset(val_data, tokenizer, MAX_LEN)

In [26]:
# Print a few values from your Dataloader!
print(training_set.__getitem__(0)['input_ids'])
print(training_set.__getitem__(0)['labels'])

tensor([  101,  1039,  1011, 11138,  9313,  3279,  1997,  2274, 16653,  1037,
         3745,  1998,  9313,  5114,  1997,  2403, 16653,  1037,  3745,  2013,
        23260,  3466,  1997,  9529,  2689,  1012,   102,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0])

In [27]:
# Create PyTorch dataloaders from the POSDataset
from torch.utils.data import DataLoader

training_loader = DataLoader(training_set, batch_size=64, shuffle=True)
testing_loader = DataLoader(testing_set, batch_size=64, shuffle=True)
validating_loader = DataLoader(validation_set, batch_size=8, shuffle=True)

In [28]:
class BertForPOSTagging(DistilBertModel):
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels

        ### BEGIN YOUR CODE ###

        classifier_dropout = 0.5

        self.bert = DistilBertModel(config)
        self.dropout = nn.Dropout(classifier_dropout)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)

        self.loss = nn.CrossEntropyLoss()

        ### END YOUR CODE ###

        self.post_init()

    def forward(self, input_ids, attention_mask, labels=None):
        """
        Forward pass through your model. Returns output logits for each POS
        label and the loss (if labels is not None)

        Hint: You may use nn.CrossEntropyLoss() to calculate your loss.
        """
        loss, logits = None, None

        ### BEGIN YOUR CODE ###

        outputs = self.bert(input_ids, attention_mask=attention_mask)
        sequence_output = outputs[0]
        sequence_output = self.dropout(sequence_output)
        logits = self.classifier(sequence_output)

        if labels is not None:
            loss = self.loss(logits.view(-1, self.num_labels), labels.view(-1))

        ### END YOUR CODE ###

        if loss is not None:
          return loss, logits
        return logits

In [33]:
model = BertForPOSTagging.from_pretrained(
    'distilbert-base-uncased',
    num_labels=len(tag_to_idx)
).to(device)

MAX_GRAD_NORM = 10
EPOCHS = 5

optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-04)

Some weights of BertForPOSTagging were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['distilbert.bert.embeddings.LayerNorm.bias', 'distilbert.bert.embeddings.LayerNorm.weight', 'distilbert.bert.embeddings.position_embeddings.weight', 'distilbert.bert.embeddings.word_embeddings.weight', 'distilbert.bert.transformer.layer.0.attention.k_lin.bias', 'distilbert.bert.transformer.layer.0.attention.k_lin.weight', 'distilbert.bert.transformer.layer.0.attention.out_lin.bias', 'distilbert.bert.transformer.layer.0.attention.out_lin.weight', 'distilbert.bert.transformer.layer.0.attention.q_lin.bias', 'distilbert.bert.transformer.layer.0.attention.q_lin.weight', 'distilbert.bert.transformer.layer.0.attention.v_lin.bias', 'distilbert.bert.transformer.layer.0.attention.v_lin.weight', 'distilbert.bert.transformer.layer.0.ffn.lin1.bias', 'distilbert.bert.transformer.layer.0.ffn.lin1.weight', 'distilbert.bert.transformer.layer.0.ffn.lin2.bias', 'distilbe

In [34]:
# print(f"Currently allocated GPU memory: {torch.cuda.memory_allocated(device) / 1024**3:.2f} GB / {torch.cuda.get_device_properties(0).total_memory / 1024**3:.2f} GB")
# torch.cuda.empty_cache()

In [35]:
def train(epoch):
    train_loss = 0
    train_examples, train_steps = 0, 0

    model.train()
    model.zero_grad()

    for idx, batch in enumerate(training_loader):
        ids = batch['input_ids'].to(device, dtype=torch.long)
        mask = batch['attention_mask'].to(device, dtype=torch.long)
        labels = batch['labels'].to(device, dtype=torch.long)

        ### BEGIN YOUR CODE ###

        loss, _ = model(input_ids=ids, attention_mask=mask, labels=labels)
        train_loss += loss.item()

        # Clip gradients (Not required, but helps training)
        torch.nn.utils.clip_grad_norm_(parameters=model.parameters(), max_norm=MAX_GRAD_NORM)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        ### END YOUR CODE ###

        train_steps += 1
        train_examples += labels.size(0)

    avg_train_loss = train_loss / train_steps
    avg_val_loss, val_accuracy = evaluate_bert(model)

    print(f"Epoch: {epoch}/{EPOCHS}\tAvg Train Loss: {avg_train_loss:.4f}\tAvg Val Loss: {avg_val_loss:.4f}\t Val Accuracy: {val_accuracy:.0f}")

def evaluate_bert(model):
    correct, val_loss, val_examples = 0, 0, 0

    model.eval()
    with torch.no_grad():
        for idx, batch in enumerate(validating_loader):
            """
            Implement the evaluate method. Find the average validation loss
            along with the validation accuracy.

            Remember! You have labeled only the first token of each word. Make
            sure you only calculate accuracy on values which are not -100.
            """
            ids = batch['input_ids'].to(device, dtype=torch.long)
            mask = batch['attention_mask'].to(device, dtype=torch.long)
            labels = batch['labels'].to(device, dtype=torch.long)

            ### BEGIN YOUR CODE ###

            loss, target_logits = model(input_ids=ids, attention_mask=mask, labels=labels)

            # Compute training accuracy
            flattened_targets = labels.view(-1)
            active_logits = target_logits.view(-1, model.num_labels)
            flattened_predictions = torch.argmax(active_logits, axis=1)

            # Only compute accuracy at active labels
            active_accuracy = labels.view(-1) != -100

            # Get the predicted labels
            labels = torch.masked_select(flattened_targets, active_accuracy)
            pred = torch.masked_select(flattened_predictions, active_accuracy)

            # Get number of correct predictions
            correct += pred.eq(labels.data.view_as(pred)).sum()

            # Increase running total loss and the number of past valid samples
            val_loss += loss
            val_examples += labels.size(0)

            ### END YOUR CODE ###

    val_accuracy = 100 * correct / val_examples
    avg_val_loss = val_loss / val_examples
    return avg_val_loss, val_accuracy

In [36]:
EPOCHS = 1

for epoch in range(EPOCHS):
    train(epoch)

Epoch: 0/1	Avg Train Loss: 1.1670	Avg Val Loss: 0.0031	 Val Accuracy: 83


In [37]:
def generate_prediction(model, sentence):
    """
    Given a sentence, generate a full prediction of POS tags.

    In this case, you are given a full sentence (not array of tokens), so you
    will need to use your tokenizer differently.

    Return your prediction in the format:
      [(token 1, POS prediction 1), (token 2, POS prediction 2), ...]

    E.g., "The imperatives that" => [('the', 'DT'), ('imperative', 'NNS'), ('that', 'WDT')]
    """
    prediction = []

    ### BEGIN YOUR CODE ###

    reversed_tag_to_idx = {v: k for k, v in tag_to_idx.items()}

    inputs = tokenizer(sentence,
                       return_offsets_mapping=True,
                       padding='max_length',
                       truncation=True,
                       max_length=MAX_LEN,
                       return_tensors="pt")

    ids = inputs["input_ids"].to(device)
    mask = inputs["attention_mask"].to(device)

    outputs = model(input_ids=ids, attention_mask=mask)
    logits = outputs[0]

    active_logits = logits.view(-1, model.num_labels)
    flattened_predictions = torch.argmax(active_logits, axis=1)

    tokens = tokenizer.convert_ids_to_tokens(ids.squeeze().tolist())
    token_predictions = [reversed_tag_to_idx[i] for i in flattened_predictions.cpu().numpy()]
    wp_preds = list(zip(tokens, token_predictions))

    for token_pred, mapping in zip(wp_preds, inputs["offset_mapping"].squeeze().tolist()):
      if mapping[0] != 0 or mapping[1] != 0:
        prediction += [token_pred]
      else:
        continue

    ### END YOUR CODE ###

    return prediction

In [38]:
sentence = "The imperatives that can be obeyed by a machine that has no limbs are bound to be of a rather intellectual character."
print(generate_prediction(model, sentence))

[('the', 'DT'), ('imperative', 'NN'), ('##s', 'NN'), ('that', 'IN'), ('can', 'MD'), ('be', 'VB'), ('obeyed', 'VBG'), ('by', 'IN'), ('a', 'DT'), ('machine', 'NN'), ('that', 'IN'), ('has', 'VBZ'), ('no', 'DT'), ('limbs', 'VB'), ('are', 'VBP'), ('bound', 'NN'), ('to', 'TO'), ('be', 'VB'), ('of', 'IN'), ('a', 'DT'), ('rather', 'RB'), ('intellectual', 'JJ'), ('character', 'NN'), ('.', '.')]
