In [1]:
# cloneright 2020 The HuggingFace Team. All rights reserved.
# Copyright 2022 Vladislav Lialin
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

If you're opening this Notebook on colab, you will probably need to

1. install 🤗 Transformers and 🤗 Datasets. Uncomment the following cell and run it.
2. make sure your runtime is GPU

In [2]:
! pip install datasets transformers wandb

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting datasets
  Downloading datasets-2.11.0-py3-none-any.whl (468 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m468.7/468.7 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting transformers
  Downloading transformers-4.28.1-py3-none-any.whl (7.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m21.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting wandb
  Downloading wandb-0.14.2-py3-none-any.whl (2.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m42.3 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0.0,>=0.11.0
  Downloading huggingface_hub-0.13.4-py3-none-any.whl (200 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m200.1/200.1 kB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting responses<0.19
  Downloading responses-0.18.0-py3-none-an

In [3]:
# Uncomment this to verify that you have a GPU
# !nvidia-smi

Make sure your version of Transformers is at least 4.11.0 since the functionality was introduced in that version:

In [4]:
import torch
import torch.nn as nn
import torch.nn.functional as F

import transformers
from tqdm.auto import tqdm

print(transformers.__version__)

4.28.1


# Fine-tuning a model on a text classification task

In this notebook, we will see how to fine-tune one of the [🤗 Transformers](https://github.com/huggingface/transformers) model to a text classification task of the [GLUE Benchmark](https://gluebenchmark.com/).

The GLUE Benchmark is a group of nine classification tasks on sentences or pairs of sentences which are:

- [CoLA](https://nyu-mll.github.io/CoLA/) (Corpus of Linguistic Acceptability) Determine if a sentence is grammatically correct or not.is a  dataset containing sentences labeled grammatically correct or not.
- [MNLI](https://arxiv.org/abs/1704.05426) (Multi-Genre Natural Language Inference) Determine if a sentence entails, contradicts or is unrelated to a given hypothesis. (This dataset has two versions, one with the validation and test set coming from the same distribution, another called mismatched where the validation and test use out-of-domain data.)
- [MRPC](https://www.microsoft.com/en-us/download/details.aspx?id=52398) (Microsoft Research Paraphrase Corpus) Determine if two sentences are paraphrases from one another or not.
- [QNLI](https://rajpurkar.github.io/SQuAD-explorer/) (Question-answering Natural Language Inference) Determine if the answer to a question is in the second sentence or not. (This dataset is built from the SQuAD dataset.)
- [QQP](https://data.quora.com/First-Quora-Dataset-Release-Question-Pairs) (Quora Question Pairs2) Determine if two questions are semantically equivalent or not.
- [RTE](https://aclweb.org/aclwiki/Recognizing_Textual_Entailment) (Recognizing Textual Entailment) Determine if a sentence entails a given hypothesis or not.
- [SST-2](https://nlp.stanford.edu/sentiment/index.html) (Stanford Sentiment Treebank) Determine if the sentence has a positive or negative sentiment.
- [WNLI](https://cs.nyu.edu/faculty/davise/papers/WinogradSchemas/WS.html) (Winograd Natural Language Inference) Determine if a sentence with an anonymous pronoun and a sentence with this pronoun replaced are entailed or not. (This dataset is built from the Winograd Schema Challenge dataset.)

> We do not use STSB here, because it uses slightly different evaluation schema than the rest and we don't want to complicate code.

In [5]:
GLUE_TASKS = ["cola", "mnli", "mrpc", "qnli", "qqp", "rte", "sst2", "wnli"]

In [6]:
# Task 1.1: Select a task (from GLUE_TASKS) and a model
# We recommend "cola" and "bert-base-uncased"
# You can also try "bert-large-uncased" (a larger model) and "distilbert-base-cased" (a smaller model)
# Full list of models: https://huggingface.co/transformers/pretrained_models.html (not all of them will work with this script)

# YOUR CODE STARTS HERE
task = "cola"
# task = "mnli"
# task = "mrpc"
model_name = "bert-base-uncased"

# YOUR CODE ENDS HERE
assert task in GLUE_TASKS

## Loading the dataset

We will use the [🤗 Datasets](https://github.com/huggingface/datasets) library to download the data and get the metric we need to use for evaluation (to compare our model to the benchmark). This can be easily done with the functions `load_dataset` and `load_metric`.  

In [7]:
from datasets import load_dataset, load_metric

We can directly pass our task name to those functions. `load_dataset` will cache the dataset to avoid downloading it again the next time you run this cell.

In [8]:
dataset = load_dataset("glue", task)
metric = load_metric("glue", task)

Downloading builder script:   0%|          | 0.00/28.8k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/28.7k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/27.9k [00:00<?, ?B/s]

Downloading and preparing dataset glue/cola to /root/.cache/huggingface/datasets/glue/cola/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad...


Downloading data:   0%|          | 0.00/377k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/8551 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1043 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1063 [00:00<?, ? examples/s]

Dataset glue downloaded and prepared to /root/.cache/huggingface/datasets/glue/cola/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

  metric = load_metric("glue", task)


Downloading builder script:   0%|          | 0.00/1.84k [00:00<?, ?B/s]

The `dataset` object itself is [`DatasetDict`](https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasetdict), which contains one key for the training, validation and test set (with more keys for the mismatched validation and test set in the special case of `mnli`).

In [9]:
dataset

DatasetDict({
    train: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 8551
    })
    validation: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 1043
    })
    test: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 1063
    })
})

In [10]:
import random

# Task 1.1: To get a sense of what the data looks like, print 5 random examples from the "train" subset
# YOUR CODE STARTS HERE

num_samples = len(dataset["train"])
[print(dataset["train"][random.randint(0,num_samples)]) for _ in range(0,5)]

# YOUR CODE ENDS HERE

{'sentence': 'Megan marveled at the beauty of the Grand Canyon.', 'label': 1, 'idx': 3003}
{'sentence': 'Emma slighted Miss Bates.', 'label': 1, 'idx': 6691}
{'sentence': 'The candidate was dogged by charges of infidelity, or at least trying to.', 'label': 0, 'idx': 955}
{'sentence': 'The collection of syntax articles with the red cover bores students of syntax in Tucson.', 'label': 1, 'idx': 5814}
{'sentence': 'Some professor admires every student.', 'label': 1, 'idx': 1024}


[None, None, None, None, None]

The metric is an instance of [`datasets.Metric`](https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Metric):

In [11]:
metric

Metric(name: "glue", features: {'predictions': Value(dtype='int64', id=None), 'references': Value(dtype='int64', id=None)}, usage: """
Compute GLUE evaluation metric associated to each GLUE dataset.
Args:
    predictions: list of predictions to score.
        Each translation should be tokenized into a list of tokens.
    references: list of lists of references for each translation.
        Each reference should be tokenized into a list of tokens.
Returns: depending on the GLUE subset, one or several of:
    "accuracy": Accuracy
    "f1": F1 score
    "pearson": Pearson Correlation
    "spearmanr": Spearman Correlation
    "matthews_correlation": Matthew Correlation
Examples:

    >>> glue_metric = datasets.load_metric('glue', 'sst2')  # 'sst2' or any of ["mnli", "mnli_mismatched", "mnli_matched", "qnli", "rte", "wnli", "hans"]
    >>> references = [0, 1]
    >>> predictions = [0, 1]
    >>> results = glue_metric.compute(predictions=predictions, references=references)
    >>> print(res

You can call its `compute` method with your predictions and labels directly and it will return a dictionary with the metric(s) value:

In [12]:
import numpy as np

# Task 1.2: Compute the metric using `fake_preds` and `fake_labels`
# YOUR CODE STARTS HERE
# feel free to modify fake_preds and fake_labels if your metric does not work for these inputs
fake_preds = np.array([1, 2, 0, 0, 0, 1, 1, 2])
fake_labels = np.array([1, 1, 0, 2, 0, 2, 2, 2])

metric_value = metric.compute(predictions=fake_preds, references=fake_labels)
print(metric_value)
# YOUR CODE ENDS HERE

{'matthews_correlation': 0.29277002188455997}


Note that `load_metric` has loaded the proper metric associated to your task, which is:

- for CoLA: [Matthews Correlation Coefficient](https://en.wikipedia.org/wiki/Matthews_correlation_coefficient)
- for MNLI (matched or mismatched): Accuracy
- for MRPC: Accuracy and [F1 score](https://en.wikipedia.org/wiki/F1_score)
- for QNLI: Accuracy
- for QQP: Accuracy and [F1 score](https://en.wikipedia.org/wiki/F1_score)
- for RTE: Accuracy
- for SST-2: Accuracy
- for WNLI: Accuracy

so the metric object only computes the one(s) needed for your task.

## Preprocessing the data

Before we can feed those texts to our model, we need to preprocess them. This is done by a 🤗 Transformers `Tokenizer` which will (as the name indicates) tokenize the inputs (including converting the tokens to their corresponding IDs in the pretrained vocabulary) and put it in a format the model expects, as well as generate the other inputs that model requires.

To do all of this, we instantiate our tokenizer with the `AutoTokenizer.from_pretrained` method, which will ensure:

- we get a tokenizer that corresponds to the model architecture we want to use,
- we download the vocabulary used when pretraining this specific checkpoint.

That vocabulary will be cached, so it's not downloaded again the next time we run the cell.

In [13]:
from transformers import AutoTokenizer

# Task 1.3: Load a pre-trained tokenizer.
# Remember that we want to work with a pre-trained model,
# which means that we need to use exactly the same tokenizer that was used to pre-train the model.
# Provide your model name to the .from_pretrained method and it will download the right tokenizer files for you.
# Provide `use_fast=True` to use the fast tokenizer.
# You can learn more about fast tokenizers vs slow tokenizers here: https://www.youtube.com/watch?v=g8quOxoqhHQ
# YOUR CODE STARTS HERE

tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
print(type(tokenizer))

# YOUR CODE ENDS HERE

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

<class 'transformers.models.bert.tokenization_bert_fast.BertTokenizerFast'>


We pass along `use_fast=True` to the call above to use one of the fast tokenizers (backed by Rust) from the 🤗 Tokenizers library. Those fast tokenizers are available for almost all models, but if you got an error with the previous call, remove that argument.

You can directly call this tokenizer on one sentence or a pair of sentences:

In [14]:
tokenizer("Hello, this one sentence!", text_pair="And this sentence goes with it.")

{'input_ids': [101, 7592, 1010, 2023, 2028, 6251, 999, 102, 1998, 2023, 6251, 3632, 2007, 2009, 1012, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

Depending on the model you selected, you will see different keys in the dictionary returned by the cell above. They don't matter much for what we're doing here (just know they are required by the model we will instantiate later), you can learn more about them in [this tutorial](https://huggingface.co/transformers/preprocessing.html) if you're interested.

To preprocess our dataset, we will thus need the names of the columns containing the sentence(s). The following dictionary keeps track of the correspondence task to column names:

In [15]:
task_to_keys = {
    "cola": ("sentence", None),
    "mnli": ("premise", "hypothesis"),
    "mnli-mm": ("premise", "hypothesis"),
    "mrpc": ("sentence1", "sentence2"),
    "qnli": ("question", "sentence"),
    "qqp": ("question1", "question2"),
    "rte": ("sentence1", "sentence2"),
    "sst2": ("sentence", None),
    "stsb": ("sentence1", "sentence2"),
    "wnli": ("sentence1", "sentence2"),
}

We can double check it does work on our current dataset:

In [16]:
sentence1_key, sentence2_key = task_to_keys[task]
if sentence2_key is None:
    print(f"Sentence: {dataset['train'][0][sentence1_key]}")
else:
    print(f"Sentence 1: {dataset['train'][0][sentence1_key]}")
    print(f"Sentence 2: {dataset['train'][0][sentence2_key]}")

Sentence: Our friends won't buy this analysis, let alone the next one we propose.


We can them write the function that will preprocess our samples. We just feed them to the `tokenizer` with the argument `truncation=True`. This will ensure that an input longer that what the model selected can handle will be truncated to the maximum length accepted by the model.

In [17]:
# Task 1.4: Implement preprocess_function that applies tokenizer to either a single sentence or to a pair of sentences.
# Truncate the sequences via passing `truncation=True` to the tokenizer in case some texts have larger length that the model can process (512 tokens).
# YOUR CODE STARTS HERE

def preprocess_function(sample):
    sentence1_key, sentence2_key = task_to_keys[task]
    if sentence2_key is None:
        # Tokenize a single sentence
        tokenized = tokenizer(sample[sentence1_key], truncation=True)
        return tokenized

    # Tokenize a pair of sentences: one defined by sentence1_key, the other defined by sentence2_key
    tokenized = [tokenizer(sample[sentence1_key], truncation=True),
                 tokenizer(sample[sentence2_key], truncation=True)]
    return tokenized

# YOUR CODE ENDS HERE

This function works with one or several examples. In the case of several examples, the tokenizer will return a list of lists for each key:

In [18]:
# Task 1.5 to make sure your preprocess_function works, apply it to the first 5 elements of the train set
# You can get them vias dataset["train"][:5]
# YOUR CODE STARTS HERE

dataset = load_dataset("glue", task)
metric = load_metric("glue", task)
y = preprocess_function(dataset["train"][:5])
for i in range(0,5):
    print(y[i])
# [print(yy) for yy in y]

# YOUR CODE ENDS HERE



  0%|          | 0/3 [00:00<?, ?it/s]

Encoding(num_tokens=19, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])
Encoding(num_tokens=14, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])
Encoding(num_tokens=14, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])
Encoding(num_tokens=15, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])
Encoding(num_tokens=13, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])


To apply this function on all the sentences (or pairs of sentences) in our dataset, we just use the `map` method of our `dataset` object we created earlier. This will apply the function on all the elements of all the splits in `dataset`, so our training, validation and testing data will be preprocessed in one single command.

Because you apply this fuction to the `dataset` object that contains all `train`, `valid` and `test` subsets,
encoded_dataset will have all of the same subsets.
use `remove_columns` to remove all of the columns that are not needed for the model
(all `dataset["train"].column_names`) except `label` (we have already created one for you -- `old_column_names`)

In [19]:
encoded_dataset = dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/8551 [00:00<?, ? examples/s]

Map:   0%|          | 0/1043 [00:00<?, ? examples/s]

Map:   0%|          | 0/1063 [00:00<?, ? examples/s]

In [20]:
old_column_names = [c for c in dataset["train"].column_names if c != "label"]

# Task 1.6: map the preprocess_function to the dataset
# Use batched=True, so it would run faster
# (batched operations are always faster than a single-element ones, even though they are a bit harder to write)
# Assign the results to encoded_dataset
# YOUR CODE STARTS HERE

encoded_dataset = dataset.map(preprocess_function, batched=True)
# print(f'before: {encoded_dataset["train"].column_names}')
# print(f'old_columnn_names {old_column_names}')
for c in old_column_names:
    encoded_dataset = encoded_dataset.remove_columns(c)
# encoded_dataset = dataset.rename_column("idx", "input_ids")
print(f'after: {encoded_dataset["train"].column_names}')

# YOUR CODE ENDS HERE




after: ['label', 'input_ids', 'token_type_ids', 'attention_mask']


The `.map` results are automatically cached by the 🤗 Datasets library to avoid spending time on this step the next time you run your notebook. The 🤗 Datasets library is normally smart enough to detect when the function you pass to map has changed (and thus requires to not use the cache data). For instance, it will properly detect if you change the task in the first cell and rerun the notebook. 🤗 Datasets warns you when it uses cached files, you can pass `load_from_cache_file=False` in the call to `map` to not use the cached files and force the preprocessing to be applied again.

Note that we passed `batched=True` to encode the texts by batches together. This is to leverage the full benefit of the fast tokenizer we loaded earlier, which will use multi-threading to treat the texts in a batch concurrently.

## Fine-tuning the model

Now that our data is ready, we can download the pretrained model and fine-tune it. Since all our tasks are about sentence classification, we use the `AutoModel` class. Like with the tokenizer, the `from_pretrained` method will download and cache the model for us. This model will return us the hidden states of the pre-trained transformers. We need to take them and use to predict the classes. To do that we need to pool them across the time dimension (to get a fixed-size vector independent on the sequence length). The most common way to do that is to wrap the model into a new `nn.Module` class that will have our pre-trained transformer and an extra linear layer (feel free to use more than one layer).

But first, let's load the model using `AutoModel.from_pretrained` and play with it to learn its interfaces.

In [21]:
from transformers import AutoModel

# Task 1.7: Load a pre-trained model using the same name you used for the tokenizer
# name it bert_model
# YOUR CODE STARTS HERE

bert_model = AutoModel.from_pretrained(model_name)

# YOUR CODE ENDS HERE

Downloading pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


To tokenize the sentence (or a pair of sentences), we use a tokenizer.
More that that, we can ask the tokenizer to return us pytorch tensors if we provide `return_tensors="pt"`, where "pt" means PyTorch.

In [22]:
texts = ["Hello, this is the sentence number one!",
         "This is another sentence!"]
text_pairs = ["And this sentence goes with the first sentence.",
              "And this sentence goes with the second one."]

# Task 1.8: Tokenize the batch of texts and text pairs
# Note that you need to make the tokenizer to
# 1. return pytorch tensors
# 2. do padding
# If padding is not turned on, the model won't be able to create a batch of these two examples.
# YOUR CODE STARTS HERE

batch = [z for z in zip(texts, text_pairs)]
input_object = tokenizer(batch, return_tensors="pt", padding=True)

# YOUR CODE ENDS HERE
input_object

{'input_ids': tensor([[ 101, 7592, 1010, 2023, 2003, 1996, 6251, 2193, 2028,  999,  102, 1998,
         2023, 6251, 3632, 2007, 1996, 2034, 6251, 1012,  102],
        [ 101, 2023, 2003, 2178, 6251,  999,  102, 1998, 2023, 6251, 3632, 2007,
         1996, 2117, 2028, 1012,  102,    0,    0,    0,    0]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0]])}

Your `input_object` should look like this:

```python
{
'input_ids': tensor(
        [[ 101, 7592, 1010, 2023, 2003, 1996, 6251, 2193, 2028,  999,  102, 1998, 2023, 6251, 3632, 2007, 1996, 2034, 6251, 1012,  102],
        [ 101, 2023, 2003, 2178, 6251,  999,  102, 1998, 2023, 6251, 3632, 2007, 1996, 2117, 2028, 1012,  102,    0,    0,    0,    0]]), 
'token_type_ids': tensor(
        [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
         [0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0]]),
'attention_mask': tensor(
        [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
         [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0]])
}
```

We can see that a tokenizer returns a dictionary with the following keys:
    
    - "input_ids": a list of token ids
    - "attention_mask": a list of booleans, where 1 means that the corresponding token is part of the input, and 0 means that it is padding
    - "token_type_ids": a list of integers, where 0 means that the corresponding token is the first sentence and 1 means that the corresponding token is the second sentence. These are very imporant for tasks like MNLI or SST.
    
The model expects the arguments with the same keys, so we can forward it with

In [23]:
ouput_obj = bert_model(
    input_ids=input_object["input_ids"],
    attention_mask=input_object["attention_mask"],
    token_type_ids=input_object["token_type_ids"],
)

In [24]:
# Now, look at ouput_obj and figure out how to get the hidden states of the last transformer layer.
# Task 1.9: Get the hidden states of the last transformer layer
# YOUR CODE STARTS HERE

# print(type(bert_model))
# print(type(ouput_obj))
# print(ouput_obj)

bert_model.config.output_hidden_states=True
ouput_obj = bert_model(
    input_ids=input_object["input_ids"],
    attention_mask=input_object["attention_mask"],
    token_type_ids=input_object["token_type_ids"],
)

hidden_states = ouput_obj.hidden_states
last_hidden_state = ouput_obj.last_hidden_state
pooler_output = ouput_obj.pooler_output

# YOUR CODE ENDS HERE

_batch_size = len(input_object["input_ids"])
_seq_len = len(input_object["input_ids"][0])
_hidden = bert_model.config.hidden_size
assert last_hidden_state.shape == (_batch_size, _seq_len, _hidden)

Now it's time to wrap the model into PyTorch `nn.Module`.
Your task is to:

1. Create BertForClassification class that inherits from `nn.Module`
2. Implement `__init__` which should take `pre_trained_encoder` (an object of type `BertModel`) and `num_classes` (integer). Then, as usual, you assign the layer objects (and create new ones, if nesessary) to the object arguments (e.g., `self.pre_trained_encoder` and `self.output_layer`).
3. Implement `forward()`. Remember to pool the sequence dimension via extracting the first token hidden (it corresponds to the CLS token).

This model should take `input_ids`, `attention_mask`, and `token_type_ids` and to output the logits of the `num_classes`.

> DO NOT hard-code any values (e.g., do not assume that model hidden is always equal to 768). Extract this number from the model config as we did in the previuos cell.


**Extra points:** Feel free to have more than one layer after the pre_trained_encoder and/or to use Batch Normalization, Dropout, and other tricks. Remember that you need nonlinearities between linear layers, but not after the last layer. You can additionally use different learning rates for the body and the head of the model. You can learn more about common tricks in [fast.ai Lesson 1](https://course.fast.ai/videos/?lesson=1).

In [25]:
# YOUR CODE STARTS HERE

from collections import OrderedDict

class BertForClassification(nn.Module):
    def __init__(self, pre_trained_encoder, num_classes):
        super(BertForClassification, self).__init__()

        self.pre_trained_encoder = pre_trained_encoder

        self.bert_out_size = pre_trained_encoder.config.hidden_size

        self.layer1 = nn.Linear(self.bert_out_size, self.bert_out_size)
        self.act1 = nn.ReLU()
        self.layer2 = nn.Linear(self.bert_out_size, self.bert_out_size)
        self.act2 = nn.ReLU()
        self.layer3 = nn.Linear(self.bert_out_size, self.bert_out_size)
        self.act3 = nn.ReLU()
        self.output = nn.Linear(self.bert_out_size, num_classes)
        self.sigmoid = nn.Sigmoid()


    def forward(self, input_ids, attention_mask, token_type_ids):
        x = self.pre_trained_encoder(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
        x = x.pooler_output

        x = self.act1(self.layer1(x))
        x = self.act2(self.layer2(x))
        x = self.act3(self.layer3(x))
        x = self.sigmoid(self.output(x))

        return x

# YOUR CODE ENDS HERE

bert_model = AutoModel.from_pretrained(model_name)
model = BertForClassification(pre_trained_encoder=bert_model, num_classes=17)
output_logits = model(input_object["input_ids"], input_object["attention_mask"], input_object["token_type_ids"])
assert output_logits.shape == (2, 17)
print("Passed!")

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Passed!


The warning is telling us we are throwing away some weights. This is absolutely normal in this case, because we are removing the head used to pretrain the model on a masked language modeling objective and replacing it with a new head for which we don't have pretrained weights, so the library warns us we should fine-tune this model before using it for inference, which is exactly what we are going to do.

In [26]:
num_labels = 3 if task.startswith("mnli") else 1 if task=="stsb" else 2
metric_name = "pearson" if task == "stsb" else "matthews_correlation" if task == "cola" else "accuracy"

# Feel free to change the parameters below
# YOUR CODE STARTS HERE

# from https://arxiv.org/pdf/1810.04805.pdf appendix A.3
learning_rate = 2e-5
batch_size = 16
num_train_epochs = 1
weight_decay = 0.01

# YOUR CODE ENDS HERE


Now, it's time to create the dataloaders that will batch the data from our datasets.

To do that, you need a collation function that combines multiplle examples (of different lengths in general) into a batch. In machine translation homework we had written our own collation function and it looked like this:

```python
def collation_function_for_seq2seq(batch, source_pad_token_id, target_pad_token_id):
    input_ids_list = [ex["input_ids"] for ex in batch]
    decoder_input_ids_list = [ex["decoder_input_ids"] for ex in batch]
    labels_list = [ex["labels"] for ex in batch]

    collated_batch = {
        "input_ids": pad(input_ids_list, source_pad_token_id),
        "decoder_input_ids": pad(decoder_input_ids_list, target_pad_token_id),
        "labels": pad(labels_list, target_pad_token_id),
    }

    collated_batch["encoder_padding_mask"] = collated_batch["input_ids"] == source_pad_token_id
    return collated_batch

def pad(sequence_list, pad_id):
    max_len = max(len(x) for x in sequence_list)
    padded_sequence_list = []
    for sequence in sequence_list:
        padding = [pad_id] * (max_len - len(sequence))
        padded_sequence = sequence + padding
        padded_sequence_list.append(padded_sequence)

    return torch.LongTensor(padded_sequence_list)
```

We can either write a similar function by here or to use `transformers.data.data_collator.DataCollatorWithPadding`.

> NOTE: remember to shuffle the order of the dataset elements in your training dataloader (and do not shuffle test dataloader)

In [27]:
from transformers.data.data_collator import DataCollatorWithPadding

# Task 1.10: Create collator, training set dataloader, and validation set dataloader
# (these dataloaders should use the collator to collate the examples of different lengths into batches)
# YOUR CODE STARTS HERE (our implementation is 3 lines)

collator = DataCollatorWithPadding(tokenizer=tokenizer, padding=True, return_tensors='pt')
train_dataloader = torch.utils.data.DataLoader(encoded_dataset["train"],      batch_size=batch_size, shuffle=True, collate_fn=collator)
valid_dataloader = torch.utils.data.DataLoader(encoded_dataset["validation"], batch_size=batch_size, shuffle=True, collate_fn=collator)

# YOUR CODE ENDS HERE

> If dataloader fails, it might happen because it is trying to collate non-collatable objects like strings. This could happend if you forgot to remove the extra columns from the dataset during `.map` a few cells above.

In [28]:
batch = next(iter(valid_dataloader))
batch["input_ids"].shape

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


torch.Size([16, 29])

In [29]:
# make sure this line does not fail and gives you a reasonable output
batch = next(iter(train_dataloader))
batch

{'input_ids': tensor([[  101,  2017,  2323,  2681,  1010,  5807,  1005,  1056,  2017,  1029,
           102,     0,     0,     0,     0],
        [  101,  1996,  2879,  1005,  1055,  6697,  1005,  1055,  2057,  2700,
         11194,  2343,  1012,   102,     0],
        [  101,  5035,  2038,  4999,  1999,  2029,  2282,  5639,  4370,  1012,
           102,     0,     0,     0,     0],
        [  101,  2577,  2003,  2383,  2973,  1999, 13163,  2005,  4228,  2086,
          1012,   102,     0,     0,     0],
        [  101,  1999,  2023,  7551,  1010,  7426, 10857,  2442,  2022,  2921,
          2650,  1997,  7453,  1012,   102],
        [  101,  1996,  3648, 12279,  2094,  1996,  3453,  2007,  1037,  3396,
          1012,   102,     0,     0,     0],
        [  101,  1045,  4912,  1996,  5249,  2005,  2208,  1012,   102,     0,
             0,     0,     0,     0,     0],
        [  101,  1045,  2031,  2416,  7720,  2015,  2205,  2116,  1012,   102,
             0,     0,     0,     0,   

Now it's time to build a training loop. It will be similar (but not completely similar) to the one you built in the previous homework.

1. iterate over the number of epochs,
1. iterate over your training examples in the train_dataloader,
1. compute the loss and update model parameters,
1. in the end of epoch, compute the metric (using `metric`` object) on the validation set.
1. **after all training is done, print the final validation metric**

Log your training loss and accuracy and your validtion accuracy to wandb (you will need to submit a link to the most succesfull wandb run).

You can use `tqdm` to make a progress bar for your epochs and/or iterations within one epohc (over `train_dataloader`).

It is not nesessary, but recommended to wrap evaluation loop into a block

```python
    with torch.no_grad():
        ...
```

This can make evaluation a bit faster, because pytorch won't track the gradient values. 

> Remember to put your model into eval mode in the beginning of the evaluation and to put it back into train mode at the end or the model would use dropout during evaluatoin which will **very significantly** affect your results.

> If your epoch for CoLA takes **more than 5 minutes**, you are probably not utilizing the GPU (forgot to move your model and data to the correct device).

Please refer to [this guide](https://docs.wandb.ai/guides/track/jupyter) on how to work with wandb inside jupyter notebooks.

In [30]:
import random
# random classifier metric on the eval dataset
_labels = encoded_dataset["validation"]["label"]
_shuffled_labels = _labels.copy()
random.shuffle(_shuffled_labels)

_random_metric_value = metric.compute(predictions=_shuffled_labels, references=_labels)
print(f"Random classifier {metric_name} on {task} is {_random_metric_value}. Your model should perform **way** better than that.")

Random classifier matthews_correlation on cola is {'matthews_correlation': 0.025116082735331363}. Your model should perform **way** better than that.


In [31]:
import wandb

In [33]:
run = wandb.init(project=f"bert_classification_{task}")

# Feel free to modify the code below
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

bert_model = AutoModel.from_pretrained(model_name)
model = BertForClassification(pre_trained_encoder=bert_model, num_classes=num_labels)
model = model.to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay=weight_decay)

# Task 1.11: Training loop
# YOUR CODE STARTS HERE
import math
import datetime
log_name = datetime.datetime.now().strftime("%m%d_%H%M%S")

validation_frequency = 50
best_val_loss = math.inf

# loss_fn = torch.nn.CrossEntropyLoss()
loss_fn = torch.nn.MSELoss()

wandb.config.update({
    "device": device,
    "num_labels": num_labels,
    "model_name": model_name,
    "optimizer": optimizer,
    "learning_rate": learning_rate,
    "weight_decay": weight_decay,
    "loss_fn": loss_fn,
    "num_train_epochs": num_train_epochs,
    "validation_frequency": validation_frequency,
    "logfile_name": log_name,
    })


global_step = -1
for epoch in tqdm(range(0, num_train_epochs)):

    for trainbatch in train_dataloader:
        model.train()  # put model in training mode
        optimizer.zero_grad()
        global_step += 1

        input_ids = trainbatch["input_ids"].to(device)
        attention_mask = trainbatch["attention_mask"].to(device)
        token_type_ids = trainbatch["token_type_ids"].to(device)
        reference_labels = trainbatch["labels"].float().to(device)

        outputs = model(input_ids, attention_mask, token_type_ids)
        predicted_labels = outputs[:,1].to(device)  #  0=sequence 1=pooled

        loss = loss_fn(input=predicted_labels, target=reference_labels)
        loss.backward()
        optimizer.step()

        wandb.log(
            {
                "training_loss": loss,
                "epoch": epoch,
            },
            step=global_step
        )

        pred_int_list = predicted_labels.int().tolist()
        refe_int_list = reference_labels.int().tolist()
        metric_value = metric.compute(predictions=pred_int_list,
                                 references=refe_int_list)

        wandb.log(
            data=metric_value,
            step=global_step,
        )

        if global_step % validation_frequency == 0:
            model.eval()  # put model into evaluation mode
            with torch.no_grad():

                total_val_loss = 0.
                total_val_metric = 0.
                num_val_batches = 0
                for validbatch in valid_dataloader:
                    num_val_batches += 1

                    input_ids = validbatch["input_ids"].to(device)
                    attention_mask = validbatch["attention_mask"].to(device)
                    token_type_ids = validbatch["token_type_ids"].to(device)
                    reference_labels = validbatch["labels"].float().to(device)

                    outputs = model(input_ids, attention_mask, token_type_ids)
                    predicted_labels = outputs[:,1].to(device)  #  0=sequence 1=pooled

                    loss = loss_fn(input=predicted_labels, target=reference_labels)
                    total_val_loss += loss

                avg_val_loss = total_val_loss / num_val_batches
                avg_val_metric = total_val_metric / num_val_batches

                wandb.log(
                    {
                        "validation_loss": avg_val_loss,
                    },
                    step=global_step,
                    )
                
                # if avg_val_loss < best_val_loss:
                #     best_val_batch = global_step
                #     best_val_loss = avg_val_loss
                #     model_string = f"./models/m_{log_name}_{best_val_batch}"
                #     print(f"Saving model checkpoint to {model_string}")
                #     torch.save(model,              f"{model_string}_all.cpk")
                #     torch.save(model.state_dict(), f"{model_string}_state.cpk")

            model.train()  # put model in training mode

# model_string = f"./models/m_{log_name}_FINAL"
# print(f"Saving model checkpoint to {model_string}")
# torch.save(model,              f"{model_string}_all.cpk")
# torch.save(model.state_dict(), f"{model_string}_state.cpk")


# YOUR CODE ENDS HERE
run.finish()  # stop wandb run

0,1
epoch,▁
matthews_correlation,▁
training_loss,▁
validation_loss,▁

0,1
epoch,0.0
matthews_correlation,0.0
training_loss,0.25019
validation_loss,0.24861


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


  0%|          | 0/1 [00:00<?, ?it/s]

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
epoch,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
matthews_correlation,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
training_loss,▆▆▇▅▇▆▃▅▂▇▄▄▂▄▇▃▄▆▂▆▂▇▇▂▁█▅▃▆▃█▄▄▅▂▅▃▆▄▄
validation_loss,█▅▅▃▂▃▂▁▁▄▂

0,1
epoch,0.0
matthews_correlation,0.0
training_loss,0.23563
validation_loss,0.16883


To see how your model fared you can compare it to the [GLUE Benchmark leaderboard](https://gluebenchmark.com/leaderboard).

### Task:
Tune your hparams until you are close to the original BERT results. Feel free to read BERT paper to find what hparam the authors used.

# Interact with your trained model

Input some examples into your model and look at the predicitons.
Find 4 inputs such that:
1. Model makes a correct positive prediction (predicts 1 when the label is 1)
1. Model makes an incorrect positive prediction (predicts 1 when the label is 0)
1. Model makes a correct negative prediction (predicts 0 when the label is 0)
1. Model makes an incorrect negative prediciton

In [None]:
dataset["train"][:5]

In [None]:
dataset = load_dataset("glue", task)
metric = load_metric("glue", task)
y = preprocess_function(dataset["train"][:5])
for i in range(0,5):
    print(y[i])

In [None]:
dataset["train"][:2]

In [None]:
# Task 1.13
# Interact with your trained model and find true/false positive/negatives
# Freel free to either come up with your own examples or to find them in the test set (but **not** from the train set).
# You can use more than one notebook cell for this task
# YOUR CODE STARTS HERE

data = {'sentence': [
    "This is a sentence that is self consistent",
    "An apple dog formerly cat diver is nonsensical.",
    "Leaf raking in the fall is a hearty task.",
    'Elephants roam, foxes trot and green pigs.',
    'The facts remain and no further claims will convince us.'],
 'label': [1, 1, 1, 1, 1],
 'idx': [0, 1, 2, 3, 4]}

tokens = preprocess_function(data)
input_ids = torch.tensor(tokens.input_ids)
token_type_ids = torch.tensor(tokens.token_type_ids)
attention_mask = torch.tensor(tokens.attention_mask)
#outputs = model(tokens.input_ids, tokens.attention_mask, tokens.token_type_ids)

# YOUR CODE ENDS HERE

# The final task (1.14)


Now, instead of using a pre-trained BERT, let's initialize it with random values and train our classifier from scratch. This way we'll see how much improvemet we get from the pre-training.

Now, please **copy** the cell with the training loop below (yes, this is generaly a bad practice, but it would help us to check your homework easier in this particular case).

In [None]:
run = wandb.init(project=f"bert_classification_{task}")

# Feel free to modify the code below
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model_config = AutoConfig.from_pretrained(model_name)  # we request a config of exactly the same model as we worked with before
bert_untrained = AutoModel(model_config)  # create the model from config, no pre-trained weights provided
model = BertForClassification(pre_trained_encoder=bert_untrained, num_classes=num_labels)
model = model.to(device)

# Training loop
# YOUR CODE STARTS HERE

# YOUR CODE ENDS HERE
run.finish()  # stop wandb run