# Fine Tuning Pretrained Models

In [1]:
# Built-in library
import re
import json
from typing import Any, Dict, List, Optional, Union
import logging
import warnings

# Standard imports
import numpy as np
from pprint import pprint
import pandas as pd
from rich import print

# Visualization
import matplotlib.pyplot as plt


# Pandas settings
pd.options.display.max_rows = 1_000
pd.options.display.max_columns = 1_000
pd.options.display.max_colwidth = 600

warnings.filterwarnings("ignore")

# Black code formatter (Optional)
%load_ext lab_black
# auto reload imports
%load_ext autoreload
%autoreload 2

### Train The Model On A Small Sequence of Data

```text
- Of course, training the model on two sentences is not going to yield very good results. 
- To get better results, you will need to prepare a bigger dataset.
```

In [2]:
import torch
from transformers import AdamW, AutoTokenizer, AutoModelForSequenceClassification


# Uncases means: Better == better
checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
sequences = [
    "I've been waiting for a HuggingFace course my whole life.",
    "This course is amazing!",
]
batch = tokenizer(sequences, padding=True, truncation=True, return_tensors="pt")

# Train the model with 2 sequences.
# This raises a warning because we're overriding the pretrained model weights
# This is new
batch["labels"] = torch.tensor([1, 1])

optimizer = AdamW(model.parameters())
loss = model(**batch).loss
loss.backward()
optimizer.step()

2023-09-08 05:28:19.211011: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### Pretraining Using [MRPC](https://huggingface.co/learn/nlp-course/chapter3/2?fw=pt)

```text
- The example dataset in this section is MRPC (Microsoft Research Paraphrase Corpus), which contains 5,801 sentence pairs labeled as paraphrases or not, 
- It's suitable for experimenting with training due to its small size.


- The Hub doesn’t just contain models; it also has multiple datasets in lots of different languages.
```
<br>

- You can browse the datasets [here](https://huggingface.co/datasets), 
- It's recommend you try to load and process a new dataset once you have gone through this section (see the general documentation [here](https://huggingface.co/docs/datasets/loading_datasets.html#from-the-huggingface-hub))

```text
Let’s focus on the MRPC dataset! This is one of the 10 datasets composing the GLUE benchmark, which is an academic benchmark that is used to measure the performance of ML models across 10 different text classification tasks.
```

In [3]:
# The 🤗 Datasets library provides a very simple command to download and cache a dataset on the Hub.
# We can download the MRPC dataset like this:
from datasets import load_dataset


raw_datasets = load_dataset(path="glue", name="mrpc")
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 3668
    })
    validation: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 408
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 1725
    })
})

In [4]:
# Access each pair of sentences in our raw_datasets object by indexing, like with a dictionary:
raw_train_dataset = raw_datasets["train"]
raw_train_dataset[0]

# The labels are already integers so no further preprocessing is required.

{'sentence1': 'Amrozi accused his brother , whom he called " the witness " , of deliberately distorting his evidence .',
 'sentence2': 'Referring to him as only " the witness " , Amrozi accused his brother of deliberately distorting his evidence .',
 'label': 1,
 'idx': 0}

In [5]:
raw_train_dataset.features

{'sentence1': Value(dtype='string', id=None),
 'sentence2': Value(dtype='string', id=None),
 'label': ClassLabel(names=['not_equivalent', 'equivalent'], id=None),
 'idx': Value(dtype='int32', id=None)}

<br>

### Ex 1: 

```text
Look at element 15 of the training set and element 87 of the validation set. What are their labels?
```

In [6]:
# Element 15!
N = 15
raw_train_dataset[N]

{'sentence1': 'Rudder was most recently senior vice president for the Developer & Platform Evangelism Business .',
 'sentence2': 'Senior Vice President Eric Rudder , formerly head of the Developer and Platform Evangelism unit , will lead the new entity .',
 'label': 0,
 'idx': 16}

In [7]:
# Element 87!
N = 87
raw_train_dataset[N]

{'sentence1': 'Tuition at four-year private colleges averaged $ 19,710 this year , up 6 percent from 2002 .',
 'sentence2': 'For the current academic year , tuition at public colleges averaged $ 4,694 , up almost $ 600 from the year before .',
 'label': 1,
 'idx': 100}

### Preprocessing Sentence Pairs

In [8]:
from transformers import AutoTokenizer


checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
tokenized_sentences_1 = tokenizer(raw_datasets["train"]["sentence1"])
tokenized_sentences_2 = tokenizer(raw_datasets["train"]["sentence2"])

In [9]:
# When determining if two sentences are paraphrases, simply passing the
# sequences to the model won't suffice. We must preprocess and handle them as a pair,
# which the tokenizer can do to align with the BERT model's expectations.

inputs = tokenizer("This is the first sentence.", "This is the second one.")
print(inputs)

#### Note

```text
- token_type_ids tell the model which part of the input is the 1st sentence and which is the 2nd sentence.
- In the input, the token type IDs distinguish between sentence1 and sentence2, with 0 representing sentence1 and 1 representing sentence2. 
- Note that not all models provide token_type_ids, and BERT is pretrained with them to model the relationship between sentence pairs.
```

In [10]:
# Decoding the IDs inside input_ids back to words gives:
tokenizer.convert_ids_to_tokens(inputs["input_ids"])

['[CLS]',
 'this',
 'is',
 'the',
 'first',
 'sentence',
 '.',
 '[SEP]',
 'this',
 'is',
 'the',
 'second',
 'one',
 '.',
 '[SEP]']

In [11]:
# Preprocess The Traning Data
# Not recommended
tokenized_dataset = tokenizer(
    raw_datasets["train"]["sentence1"],
    raw_datasets["train"]["sentence2"],
    padding=True,
    truncation=True,
)

### Note

```text 
- The current approach returns a dictionary with keys (input_ids, attention_mask, token_type_ids) and values as lists of lists, which can be memory-intensive.
- To mitigate this, we can use the Dataset.map() method for tokenization and additional preprocessing, maintaining the dataset structure. 
- By defining a function, we can apply it to each element of the dataset through map().
```

In [12]:
def tokenize_function(example):
    return tokenizer(example["sentence1"], example["sentence2"], truncation=True)

### Note

```text
- The function processes a dictionary (similar to dataset items) and returns a new dictionary with keys (input_ids, attention_mask, token_type_ids). 
- It supports multiple samples and enables faster tokenization with batched=True. 
- The tokenizer, powered by Rust implementation from 🤗 Tokenizers library, performs best with larger input batches.
- We currently exclude the padding argument in the tokenization function to improve efficiency. 
- Padding only occurs during batch construction, considering the maximum length within the batch, saving time and processing power. 
- Applying the tokenization function to all datasets simultaneously with batched=True enhances preprocessing speed.
```

In [13]:
# 🤗 Datasets library adds new fields corresponding to the keys in the
# dictionary returned by the preprocessing function.
# Recommended!
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 3668
    })
    validation: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 408
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1725
    })
})

<br>

### Dynamic padding

```text
- The collate function is responsible for creating batches by concatenating samples. 
- However, since our inputs have variable sizes, we postpone padding to apply it only when necessary within each batch. 
- This speeds up training but may cause issues on TPUs, which prefer fixed shapes. 
- To handle padding, we can use the DataCollatorWithPadding provided by the 🤗 Transformers library. 
- It takes a tokenizer during instantiation and handles padding based on the model's requirements.
```


In [14]:
from transformers import DataCollatorWithPadding


# Data collator that will dynamically pad the inputs received.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [15]:
# Let's select a few samples from the training set for batching.
# We exclude unnecessary columns and examine the lengths of each entry in the batch.
samples = tokenized_datasets["train"][:2]
# print(samples)
samples

{'sentence1': ['Amrozi accused his brother , whom he called " the witness " , of deliberately distorting his evidence .',
  "Yucaipa owned Dominick 's before selling the chain to Safeway in 1998 for $ 2.5 billion ."],
 'sentence2': ['Referring to him as only " the witness " , Amrozi accused his brother of deliberately distorting his evidence .',
  "Yucaipa bought Dominick 's in 1995 for $ 693 million and sold it to Safeway for $ 1.8 billion in 1998 ."],
 'label': [1, 0],
 'idx': [0, 1],
 'input_ids': [[101,
   2572,
   3217,
   5831,
   5496,
   2010,
   2567,
   1010,
   3183,
   2002,
   2170,
   1000,
   1996,
   7409,
   1000,
   1010,
   1997,
   9969,
   4487,
   23809,
   3436,
   2010,
   3350,
   1012,
   102,
   7727,
   2000,
   2032,
   2004,
   2069,
   1000,
   1996,
   7409,
   1000,
   1010,
   2572,
   3217,
   5831,
   5496,
   2010,
   2567,
   1997,
   9969,
   4487,
   23809,
   3436,
   2010,
   3350,
   1012,
   102],
  [101,
   9805,
   3540,
   11514,
   2050,


In [16]:
samples.keys()

dict_keys(['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'])

In [17]:
samples = tokenized_datasets["train"][:8]
samples = {
    k: v for k, v in samples.items() if k not in ["idx", "sentence1", "sentence2"]
}
[len(x) for x in samples["input_ids"]]

[50, 59, 47, 67, 59, 50, 62, 32]

```text
- As expected, the samples in the batch have different lengths, ranging from 32 to 67. 
- Dynamic padding ensures that all samples are padded to the maximum length within the batch (67). 
- This approach avoids padding to the maximum length of the entire dataset or model's limit. 
- Let's verify that the data_collator correctly applies dynamic padding to the batch.
```

In [18]:
batch = data_collator(samples)
{k: v.shape for k, v in batch.items()}

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'input_ids': torch.Size([8, 67]),
 'token_type_ids': torch.Size([8, 67]),
 'attention_mask': torch.Size([8, 67]),
 'labels': torch.Size([8])}

<br><hr>

### Ex 2

```text
- Replicate the preprocessing on the GLUE SST-2 dataset. It’s a little bit different since it’s composed of single sentences instead of pairs, but the rest of what we did should look the same. 
- For a harder challenge, try to write a preprocessing function that works on any of the GLUE tasks.
```

In [19]:
glue_sst_2 = load_dataset(path="glue", name="sst2")
glue_sst_2

DatasetDict({
    train: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 67349
    })
    validation: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 872
    })
    test: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 1821
    })
})

In [20]:
glue_sst_2.get("train")[0], glue_sst_2.get("train")[20]

({'sentence': 'hide new secretions from the parental units ',
  'label': 0,
  'idx': 0},
 {'sentence': 'equals the original and in some ways even betters it ',
  'label': 1,
  'idx': 20})

In [21]:
glue_sst_2.get("train").features

{'sentence': Value(dtype='string', id=None),
 'label': ClassLabel(names=['negative', 'positive'], id=None),
 'idx': Value(dtype='int32', id=None)}

In [22]:
from transformers import AutoTokenizer


# Workflow
# Tokenize the data
# For very large datasets, apply a map function and a custom tokenize function.
# Apply dynamic padding using a data_collator

checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [23]:
def tokenize_function(example):
    return tokenizer(example.get("sentence"), truncation=True)


tokenized_dataset = glue_sst_2.map(tokenize_function, batched=True)
tokenized_dataset

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Map:   0%|          | 0/1821 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['sentence', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 67349
    })
    validation: Dataset({
        features: ['sentence', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 872
    })
    test: Dataset({
        features: ['sentence', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1821
    })
})

In [24]:
from transformers import DataCollatorWithPadding


# Data collator that will dynamically pad the inputs received.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

samples = tokenized_dataset.get("train")[:5]
samples = {k: v for k, v in samples.items() if k not in ["idx", "sentence"]}
[len(x) for x in samples.get("input_ids")]

[10, 11, 15, 10, 22]

In [25]:
# Apply the data collator
batch = data_collator(samples)
{k: v.shape for k, v in batch.items()}

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'input_ids': torch.Size([5, 22]),
 'token_type_ids': torch.Size([5, 22]),
 'attention_mask': torch.Size([5, 22]),
 'labels': torch.Size([5])}