# Week 9: Contextualized embeddings and Fine-tuning

In [None]:
# Load IPython's autoreload extension
%load_ext autoreload
# Set autoreload mode to 2, which automatically reloads modules before executing code.
# This means you don't need to restart the kernel when you modify imported Python files;
# objects created before the edit may still need to be re-created.
%autoreload 2

## New dependencies

Make sure to install the following dependencies:
```
pip install datasets transformers evaluate
pip install altair vegafusion vegafusion-python-embed vl-convert-python
```




In [None]:
from collections import Counter

import torch
import pandas as pd
import numpy as np
import altair as alt
import evaluate

from torch.utils.data import DataLoader

from datasets import load_dataset

from transformers import AutoModelForMaskedLM, BertForMaskedLM
from transformers import (
    AutoModelForSequenceClassification,
    BertForSequenceClassification,
)
from transformers import AutoTokenizer, BertTokenizer
from transformers import Trainer, TrainingArguments, TrainerCallback


from sklearn.manifold import TSNE
import matplotlib.pyplot as plt

# Switch Altair to the "vegafusion" data transformer (requires the vegafusion
# packages from the pip command above).
alt.data_transformers.enable("vegafusion")

In [1]:
# Placeholder so that the name `model` is always defined, even if the cells that
# load BERT are skipped; it is rebound as soon as a model is loaded.
model = None

## Contextualized embeddings

The first part of this exercise will show how to extract contextualized embeddings from BERT.

We start by loading the Winogrande dataset.

In [None]:
# Load the "winogrande_debiased" configuration of the Winogrande dataset;
# the bare name on the last line displays the loaded object.
winogrande = load_dataset("coref-data/winogrande_raw", "winogrande_debiased")
winogrande

In [None]:
# Let's look at an example from the dataset:
# integer indexing on the "train" split returns that single example.
winogrande["train"][0]

Next, we use huggingface transformers to load the BERT model and tokenizer.

In [None]:
# Model and tokenizer are loaded from the same checkpoint name so that the
# token ids produced by the tokenizer match the model's vocabulary.
model_name = "google-bert/bert-base-uncased"
model = AutoModelForMaskedLM.from_pretrained(model_name)  # BERT with a masked-LM head
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [None]:
# Let's look at the model architecture
# (a bare `model` as the last line of the cell displays its repr)
model

Now we are ready to encode the training set. For this, we first need to write a function that encodes a list of sequences from the dataset. We then use the map function of the dataset to encode the entire training set.

In [None]:
def encode_sentence(examples):
    """Tokenize a batch of Winogrande sentences for BERT.

    Args:
        examples: Batch passed by `Dataset.map(..., batched=True)`; only its
            "sentence" column (a list of strings) is read.

    Returns:
        The tokenizer output for the whole batch, with every sequence padded or
        truncated to exactly 128 tokens. `map` adds its fields as new columns.
    """
    # The dataset marks the blank to fill in with "_"; replace it with BERT's mask token.
    sentences = [sentence.replace("_", tokenizer.mask_token) for sentence in examples["sentence"]]
    # Call the tokenizer directly: `batch_encode_plus` is deprecated in favour of `__call__`.
    return tokenizer(
        sentences,
        add_special_tokens=True,
        padding="max_length",
        truncation=True,
        max_length=128,
    )


# encode the training set
data = winogrande["train"].map(encode_sentence, batched=True, batch_size=1000)

In [38]:
# set the format to torch and only keep the input_ids, token_type_ids, and attention_mask
# (the other columns are hidden, so a batch can later be passed to the model as **batch)
data.set_format(type="torch", columns=["input_ids", "token_type_ids", "attention_mask"])

Next, we will create a dataloader to iterate over the dataset in batches.

In [39]:
# To speed up the computations, we will subsample the dataset to (at most) 100 examples.
# Sampling without replacement guarantees 100 *distinct* examples (np.random.randint
# may draw the same index several times), and the fixed seed makes the subset reproducible.
n_samples = min(100, len(data))
indices = np.random.default_rng(42).choice(len(data), size=n_samples, replace=False)
data = data.select(indices)
# Create a dataloader; shuffle=False keeps the batches in the same order as `data`,
# which the later token/representation alignment relies on.
dataloader = DataLoader(data, batch_size=10, shuffle=False)

Now it's time to extract the contextualized embeddings from BERT. For this, we first need to set the model to evaluation mode.
Then, we can iterate over the dataloader and extract the contextualized embeddings.

In [None]:
# Evaluation mode (must be set before running the forward passes below)
model.eval()

# Per-batch results are gathered here and concatenated after the loop
embedding_batches = []
hidden_batches = []

# No gradients are needed for inference
with torch.no_grad():
    for batch in dataloader:
        # Static (input) word embeddings, looked up directly from the embedding table
        embedding_batches.append(
            model.bert.embeddings.word_embeddings(batch["input_ids"])
        )
        # Full forward pass; hidden_states[-1] is the output of the last layer
        bert_output = model(**batch, output_hidden_states=True, output_attentions=True)
        # Useful for inspection:
        #   bert_output.logits.shape
        #   len(bert_output.hidden_states), bert_output.hidden_states[0].shape
        #   len(bert_output.attentions), bert_output.attentions[0].shape
        hidden_batches.append(bert_output.hidden_states[-1])

# Stack the batches along the example dimension
all_embeddings = torch.cat(embedding_batches, dim=0)
print(all_embeddings.shape)

all_hidden_representations = torch.cat(hidden_batches, dim=0)
print(all_hidden_representations.shape)

# Merge the first two dimensions so that each row is one token position
all_embeddings = all_embeddings.flatten(0, 1)
print(all_embeddings.shape)

all_hidden_representations = all_hidden_representations.flatten(0, 1)
print(all_hidden_representations.shape)



Later, we will visualize the representations of BERT in 2D using t-SNE. To make our life easier, we will limit ourselves to the tokens that are not [PAD] tokens.

In [41]:
# Build two parallel lists with one entry per token position (padding included):
# the token string, and the decoded sentence that the token belongs to.
all_tokens, sentence_for_token = [], []
pad_id = tokenizer.pad_token_id
for sample in data:
    ids = sample["input_ids"]
    toks = tokenizer.convert_ids_to_tokens(ids)
    # Decode the sentence back to a string, leaving out the padding ids
    text = tokenizer.decode([tok_id for tok_id in ids if tok_id != pad_id])
    all_tokens += toks
    # Repeat the sentence once per token so both lists stay aligned
    sentence_for_token += [text] * len(toks)

assert len(sentence_for_token) == len(all_tokens)

In [42]:
# Next, we filter out embeddings/representations of [PAD] tokens.
# non_pad_indices are positions in the flattened token list; the same indices are
# used afterwards to pick the matching rows of the representation matrix.
non_pad_indices = [i for i, token in enumerate(all_tokens) if token != "[PAD]"]
non_pad_tokens = [all_tokens[i] for i in non_pad_indices]
sentence_for_non_pad_token = [sentence_for_token[i] for i in non_pad_indices]

# Sanity-check the filtered lists: they must stay aligned with non_pad_indices
assert len(non_pad_tokens) == len(sentence_for_non_pad_token) == len(non_pad_indices)

Now we are ready to compute the t-SNE embeddings. t-SNE is a dimensionality reduction technique that is often used to visualize high-dimensional data in 2D.

In [None]:
# Compute TSNE
# X = all_embeddings
X = all_hidden_representations

# Filter out embeddings of [PAD] tokens.
# One vectorized index replaces the per-row Python loop; detach()/cpu() make the
# conversion to NumPy work regardless of device or autograd state.
X = X[non_pad_indices].detach().cpu().numpy()
print(X.shape)

tsne = TSNE(n_components=2, random_state=42, perplexity=30, max_iter=1000)
X_2d = tsne.fit_transform(X)
print(X_2d.shape)

We are now ready to visualize the t-SNE embeddings using matplotlib.

In [None]:
# Static scatter plot of the 2D t-SNE projection
fig, ax = plt.subplots(figsize=(6, 5))
xs, ys = X_2d[:, 0], X_2d[:, 1]
ax.scatter(xs, ys, marker=".", s=10)
ax.set(
    xlabel="t-SNE dimension 1",
    ylabel="t-SNE dimension 2",
    title="2D projection of BERT representations",
)
fig.tight_layout()
plt.show()

To create an interactive plot, we can use altair. Altair is a declarative statistical visualization library for Python. It's great for creating interactive plots.

In [45]:
# Altair plots from a pandas dataframe: one row per non-[PAD] token with its
# 2D coordinates, the token string, and the sentence it came from
embeddings_df = pd.DataFrame(X_2d, columns=["x", "y"]).assign(
    token=non_pad_tokens,
    sentence=sentence_for_non_pad_token,
)

In [None]:
# Interactive scatter plot with altair
# (add color="sentence" to the encoding to colour the points by sentence)
token_chart = (
    alt.Chart(embeddings_df)
    .mark_circle(size=40, opacity=0.5)
    .encode(x="x", y="y", tooltip=["sentence", "token"])
    .properties(width=600, height=500)
    .configure_legend(disable=True)
    .interactive()
)
token_chart

# --> Hover over the points to see the token and sentence that generated the embedding!


## Fine-tuning BERT for sequence classification

In [2]:
# Release the model from the previous exercise.
# Rebinding the name to None drops the reference just like `del model` would, but
# keeps the name defined, so this cell can be re-run without raising a NameError.
model = None

The second part of this exercise will show how to fine-tune BERT for sequence classification. We will use the IMDB dataset, which is a commonly used benchmark for sentiment analysis of movie reviews.

In [None]:
# First, we load the dataset; the bare name on the last line displays the loaded object
imdb = load_dataset("stanfordnlp/imdb")
imdb

In [None]:
# Let's look at an example from the dataset:
# integer indexing on the "train" split returns that single example.
imdb["train"][0]

In [None]:
# Let's look at the distribution of the labels in the training set
labels = imdb["train"]["label"]
label_counts = Counter(labels)
print(label_counts)

In [None]:
# Sample a random subset from the training set
data = imdb["train"].select(np.random.randint(0, len(imdb["train"]), size=1000))
label_counts = Counter(data["label"])
print(label_counts)

Now we load the BERT model for sequence classification and tokenizer.

In [None]:
# Same checkpoint name as in the first part, now loaded through the
# sequence-classification class; num_labels=2 requests a two-class output
# (presumably IMDB's negative/positive labels — see the label counts above).
model_name = "google-bert/bert-base-uncased"
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=2)
tokenizer = AutoTokenizer.from_pretrained(model_name)

Again, we need to encode the sentences before we can fine-tune the model.

In [None]:
def encode_sentence(examples):
    """Tokenize a batch of IMDB reviews for BERT.

    NOTE: this redefines (shadows) the `encode_sentence` used for Winogrande
    earlier in the notebook.

    Args:
        examples: Batch passed by `Dataset.map(..., batched=True)`; only its
            "text" column is read.

    Returns:
        The tokenizer output for the whole batch, with every review padded or
        truncated to exactly 128 tokens. `map` adds its fields as new columns.
    """
    # Call the tokenizer directly: `batch_encode_plus` is deprecated in favour of `__call__`.
    return tokenizer(
        list(examples["text"]),
        add_special_tokens=True,
        padding="max_length",
        truncation=True,
        max_length=128,
    )


data = data.map(encode_sentence, batched=True, batch_size=1000)
# Expose only the model inputs and the label, as torch tensors
data.set_format(
    type="torch", columns=["input_ids", "token_type_ids", "attention_mask", "label"]
)
data

We will fine-tune the model for a few steps using the HF trainer class. The HF trainer class takes care of the training loop, including learning rate scheduling, gradient accumulation, and evaluation. You can find more information about the trainer class in the [HF documentation](https://huggingface.co/docs/transformers/main/en/main_classes/trainer).

In [93]:
# Define a callback to collect the loss and history of the training logs
class LossCallback(TrainerCallback):
    """Trainer callback that records everything passed to `on_log`.

    Attributes:
        logs: dict mapping each logged key to the list of values seen for it, in order.
        losses: the values logged under the key "loss".
        global_steps: `state.global_step` at each "loss" log (same length as `losses`).
    """

    def __init__(self):
        self.losses = []
        self.logs = {}
        self.global_steps = []

    def on_log(self, args, state, control, logs=None, **kwargs):
        """Store the values of one logging event; does nothing if `logs` is None or empty."""
        # `logs` defaults to None, so guard before iterating over it
        if not logs:
            return
        for key, value in logs.items():
            self.logs.setdefault(key, []).append(value)
        if "loss" in logs:
            self.losses.append(logs["loss"])
            self.global_steps.append(state.global_step)

We will use the evaluate library to compute the accuracy and F1 score.

In [94]:
# We simply use the accuracy and F1 score from the evaluate library.
# Both are loaded once here and used as globals inside compute_metrics.
accuracy = evaluate.load("accuracy")
f1 = evaluate.load("f1")


# We define a function to compute the accuracy and F1 score given the predictions and labels
def compute_metrics(eval_pred):
    """Compute accuracy and weighted F1 for one evaluation run.

    Args:
        eval_pred: Pair `(predictions, labels)`; `predictions` holds one row of
            per-class scores per example.

    Returns:
        Dict with the keys "accuracy" and "f1".
    """
    scores, references = eval_pred
    # The predicted class is the index of the highest score in each row
    predicted = np.argmax(scores, axis=1)
    accuracy_result = accuracy.compute(predictions=predicted, references=references)
    f1_result = f1.compute(
        predictions=predicted, references=references, average="weighted"
    )
    return {"accuracy": accuracy_result["accuracy"], "f1": f1_result["f1"]}

In [95]:
# Define the training arguments
# NOTE(review): with 1000 training examples and a batch size of 16 on a single
# device, one epoch is about 63 optimizer steps, so the 50 warmup steps cover
# most of training — confirm this is intended.
training_args = TrainingArguments(
    output_dir="./output",
    num_train_epochs=1,  # a single pass over the training subset
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    warmup_steps=50,
    learning_rate=1e-5,
    weight_decay=0.01,
    logging_strategy="steps",
    logging_steps=10,  # log every 10 steps; LossCallback collects these logs
    logging_first_step=True,  # also log step 1
    eval_strategy="epoch",  # evaluate at the end of each epoch
    save_strategy="no",  # do not write checkpoints
    report_to="none",  # no external experiment trackers
)

In [96]:
# Build a small held-out evaluation set from the IMDB test split. Evaluating on the
# training data itself would report training-set metrics and overstate performance.
# The split is shuffled first so that the 200 selected reviews are a random sample.
eval_data = (
    imdb["test"]
    .shuffle(seed=42)
    .select(range(200))
    .map(encode_sentence, batched=True, batch_size=1000)
)
eval_data.set_format(
    type="torch", columns=["input_ids", "token_type_ids", "attention_mask", "label"]
)

# Initialize the LossCallback and the Trainer
loss_callback = LossCallback()
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=data,
    eval_dataset=eval_data,
    callbacks=[loss_callback],
    compute_metrics=compute_metrics,
)

In [None]:
# Train the model
train_output = trainer.train()

In [None]:
# Plot the training loss collected by the callback
global_steps = loss_callback.global_steps
losses = loss_callback.losses
print(global_steps)
print(losses)

fig, ax = plt.subplots(figsize=(6, 4))
ax.plot(losses)
# The points are evenly spaced (one per logging event); label each with its global step
tick_positions = range(len(global_steps))
ax.set_xticks(tick_positions)
ax.set_xticklabels(global_steps)
ax.set(xlabel="Logging step", ylabel="Loss", title="Training Loss")
fig.tight_layout()
plt.show()
plt.close()

In [None]:
# loss_callback.logs

In [None]:
# trainer.state.log_history