# Fine Tuning

In [20]:
import torch

In [21]:
from transformers import AdamW, AutoTokenizer, AutoModelForSequenceClassification

## Quick Example

In [22]:
# Load pretrained BERT: the tokenizer plus a model with a sequence-classification head.
# The classification head is not part of the checkpoint, so it is newly initialized
# (see the warning printed below) and has to be trained before it is useful.
checkpoint = "bert-base-uncased"

tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [23]:
# Build a tiny two-sentence training batch (inputs + labels) for the demo step
sequences = [
    "I've been waiting for a HuggingFace course my whole life.",
    "This course is amazing!",
]

# Tokenize to padded PyTorch tensors and attach one label per sentence
# in the same dictionary, so the model can compute a loss from it.
batch = dict(
    tokenizer(sequences, padding=True, truncation=True, return_tensors="pt"),
    labels=torch.tensor([1, 1]),
)
print(batch)

{'input_ids': tensor([[  101,  1045,  1005,  2310,  2042,  3403,  2005,  1037, 17662, 12172,
          2607,  2026,  2878,  2166,  1012,   102],
        [  101,  2023,  2607,  2003,  6429,   999,   102,     0,     0,     0,
             0,     0,     0,     0,     0,     0]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'labels': tensor([1, 1])}


In [24]:
# This is the fine-tuning part: a single optimisation step on the toy batch.

# `from_pretrained` hands the model back in evaluation mode; switch to training
# mode so that dropout is active while computing the training loss.
model.train()

# Use PyTorch's AdamW: the AdamW class shipped with transformers is deprecated.
# The learning rate is set explicitly to a small value commonly used for
# fine-tuning BERT instead of relying on the optimizer default.
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)

# Clear gradients first so that re-running this cell does not accumulate
# gradients left over from a previous backward pass.
optimizer.zero_grad()

# Passing labels with the inputs makes the model return the loss directly.
loss = model(**batch).loss
loss.backward()
optimizer.step()



## Fine-tuning BERT on the MRPC (Microsoft Research Paraphrase Corpus) dataset

### Loading dataset from datasets library

In [4]:
import datasets

In [5]:
# Fetch the MRPC configuration of the GLUE benchmark and show its splits
raw_datasets = datasets.load_dataset(path="glue", name="mrpc")
raw_datasets

Downloading builder script: 100%|██████████| 28.8k/28.8k [00:00<00:00, 7.18MB/s]
Downloading metadata: 100%|██████████| 28.7k/28.7k [00:00<00:00, 16.6MB/s]
Downloading readme: 100%|██████████| 27.9k/27.9k [00:00<00:00, 26.1MB/s]
Downloading data: 6.22kB [00:00, 3.86MB/s]/3 [00:00<?, ?it/s]
Downloading data: 1.05MB [00:00, 13.5MB/s]/3 [00:00<00:00,  2.95it/s]
Downloading data: 441kB [00:00, 9.28MB/s]2/3 [00:00<00:00,  3.08it/s]
Downloading data files: 100%|██████████| 3/3 [00:00<00:00,  3.25it/s]
Generating train split: 100%|██████████| 3668/3668 [00:00<00:00, 28686.06 examples/s]
Generating validation split: 100%|██████████| 408/408 [00:00<00:00, 6847.95 examples/s]
Generating test split: 100%|██████████| 1725/1725 [00:00<00:00, 47229.45 examples/s]


DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 3668
    })
    validation: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 408
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 1725
    })
})

In [6]:
# Select the training split and inspect its first example
raw_train_dataset = raw_datasets["train"]
raw_train_dataset[0]

{'sentence1': 'Amrozi accused his brother , whom he called " the witness " , of deliberately distorting his evidence .',
 'sentence2': 'Referring to him as only " the witness " , Amrozi accused his brother of deliberately distorting his evidence .',
 'label': 1,
 'idx': 0}

In [11]:
# Look at another training example, this one labelled 0.
# Note that its 'idx' field (16) is not the same as its row position (15).
raw_train_dataset[15]

{'sentence1': 'Rudder was most recently senior vice president for the Developer & Platform Evangelism Business .',
 'sentence2': 'Senior Vice President Eric Rudder , formerly head of the Developer and Platform Evangelism unit , will lead the new entity .',
 'label': 0,
 'idx': 16}

In [8]:
# Inspect the column types; the ClassLabel feature lists the label names
# (0 -> not_equivalent, 1 -> equivalent)
raw_train_dataset.features

{'sentence1': Value(dtype='string', id=None),
 'sentence2': Value(dtype='string', id=None),
 'label': ClassLabel(names=['not_equivalent', 'equivalent'], id=None),
 'idx': Value(dtype='int32', id=None)}

### Preprocessing the dataset

In [25]:
# Reload BERT from the checkpoint. This replaces the `model` object that was
# modified by the optimizer step in the quick example with freshly loaded weights.
checkpoint = "bert-base-uncased"

tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [48]:
# Tokenize each sentence column of the training split on its own
# (first "sentence1", then "sentence2").
tokenized_sentences_1, tokenized_sentences_2 = (
    tokenizer(raw_datasets["train"][column]) for column in ("sentence1", "sentence2")
)

In [50]:
# Encode a sentence pair in one call to see how the BERT tokenizer
# fills in the token type ids
inputs = tokenizer(
    text="This is the first sentence.",
    text_pair="This is the second one.",
)
inputs

{'input_ids': [101, 2023, 2003, 1996, 2034, 6251, 1012, 102, 2023, 2003, 1996, 2117, 2028, 1012, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [51]:
# Map the input ids back to tokens to see where [CLS] and [SEP] were inserted
tokenizer.convert_ids_to_tokens(inputs["input_ids"])

['[CLS]',
 'this',
 'is',
 'the',
 'first',
 'sentence',
 '.',
 '[SEP]',
 'this',
 'is',
 'the',
 'second',
 'one',
 '.',
 '[SEP]']

In [53]:
# Token type ids: 0 for the first sentence (including [CLS] and its [SEP]),
# 1 for the second sentence and the final [SEP]
inputs.token_type_ids

[0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1]

In [61]:
# Tokenize the whole training split in a single tokenizer call.
# Drawback: this returns a dictionary and keeps the entire tokenized dataset
# in RAM, so it is not a good approach; Dataset.map() is used instead.
tokenized_dataset = tokenizer(
    text=raw_datasets["train"]["sentence1"],
    text_pair=raw_datasets["train"]["sentence2"],
    truncation=True,
    padding=True,
)

tokenized_dataset.keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])

In [80]:
# Tokenizing function meant to be passed to Dataset.map().
# No padding is applied here because dynamic padding is used instead.
# For fixed-length padding, pass padding="max_length" to the tokenizer call below.
def tokenize_function(example):
    """Tokenize the sentence pairs held in ``example``, truncating long inputs.

    ``example`` is indexed with "sentence1" and "sentence2"; whatever the
    tokenizer returns for that pair is passed back unchanged.
    """
    first_sentences = example["sentence1"]
    second_sentences = example["sentence2"]
    return tokenizer(first_sentences, second_sentences, truncation=True)

In [84]:
# Tokenize every split in batches, then prepare the training split for PyTorch:
# drop the columns the model does not take, rename the target column to
# "labels", and have rows returned as torch tensors.
tokenized_dataset = raw_datasets.map(tokenize_function, batched=True)
tokenized_dataset_train = (
    tokenized_dataset["train"]
    .remove_columns(["idx", "sentence1", "sentence2"])
    .rename_column("label", "labels")
    .with_format("torch")
)

Map:   0%|          | 0/3668 [00:00<?, ? examples/s]

Map: 100%|██████████| 3668/3668 [00:00<00:00, 15348.06 examples/s]


In [85]:
# Check which columns remain after the clean-up of the training split
tokenized_dataset_train

Dataset({
    features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 3668
})

In [86]:
from transformers import DataCollatorWithPadding

# The collator pads the examples of a batch when the batch is assembled
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
train_dataloader = torch.utils.data.DataLoader(
    tokenized_dataset_train,
    batch_size=16,
    shuffle=True,
    collate_fn=data_collator,
)

# Print the input shape of the first seven batches; the padded sequence
# length is allowed to differ from one batch to the next.
for step, batch in enumerate(train_dataloader):
    print(batch["input_ids"].shape)
    if step >= 6:
        break

torch.Size([16, 83])
torch.Size([16, 72])
torch.Size([16, 70])
torch.Size([16, 80])
torch.Size([16, 78])
torch.Size([16, 81])
torch.Size([16, 74])
