- How to prepare a large dataset from the Hub
- How to use the high-level Trainer API to fine-tune a model
- How to use a custom training loop

In [1]:
import torch
from transformers import AdamW,AutoTokenizer, AutoModelForSequenceClassification
# Minimal end-to-end training step on two sentences: tokenize, attach labels,
# run a forward pass to get the loss, backpropagate, and update the weights once.
checkpoint = 'bert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
sequences = [
    "I've been waiting for a HuggingFace course my whole life.",
    "This course is amazing!",
]
# Pad/truncate so both sentences fit in one rectangular PyTorch tensor batch.
batch = tokenizer(sequences, padding=True, truncation=True, return_tensors='pt')
# Passing `labels` makes the model output carry a `.loss` (read below).
batch['labels'] = torch.tensor([1, 1])
# Use PyTorch's own AdamW: the AdamW class exported by transformers is deprecated
# in favor of torch.optim.AdamW.
optimizer = torch.optim.AdamW(model.parameters())
loss = model(**batch).loss
loss.backward()
optimizer.step()

  from .autonotebook import tqdm as notebook_tqdm
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification

In [2]:
# 🤗 Datasets downloads a dataset from the Hub and caches it locally with one call.
from datasets import load_dataset

# Load the 'mrpc' configuration of 'glue'; the result holds the
# train / validation / test splits.
raw_datasets = load_dataset(path='glue', name='mrpc')
raw_datasets

Reusing dataset glue (C:\Users\Sheraz\.cache\huggingface\datasets\glue\mrpc\1.0.0\dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)


  0%|          | 0/3 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 3668
    })
    validation: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 408
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 1725
    })
})

In [3]:
# Select the training split; indexing it with an integer returns one example as a dict.
raw_train_dataset = raw_datasets['train']
raw_train_dataset[0]

{'sentence1': 'Amrozi accused his brother , whom he called " the witness " , of deliberately distorting his evidence .',
 'sentence2': 'Referring to him as only " the witness " , Amrozi accused his brother of deliberately distorting his evidence .',
 'label': 1,
 'idx': 0}

In [4]:
# Inspect the column types. `label` is a ClassLabel whose names list gives the
# meaning of each integer: 0 -> 'not_equivalent', 1 -> 'equivalent'.
raw_train_dataset.features

{'sentence1': Value(dtype='string', id=None),
 'sentence2': Value(dtype='string', id=None),
 'label': ClassLabel(num_classes=2, names=['not_equivalent', 'equivalent'], id=None),
 'idx': Value(dtype='int32', id=None)}

In [5]:
# Look at one example from the training split and one from the validation split.
raw_valid_dataset = raw_datasets['validation']

print(raw_train_dataset[15])
print(raw_valid_dataset[87])

{'sentence1': 'Rudder was most recently senior vice president for the Developer & Platform Evangelism Business .', 'sentence2': 'Senior Vice President Eric Rudder , formerly head of the Developer and Platform Evangelism unit , will lead the new entity .', 'label': 0, 'idx': 16}
{'sentence1': 'However , EPA officials would not confirm the 20 percent figure .', 'sentence2': 'Only in the past few weeks have officials settled on the 20 percent figure .', 'label': 0, 'idx': 812}


In [6]:
# The two sentences must be handled as a pair. The tokenizer accepts both
# sentences at once and prepares them the way our BERT model expects.
# Exercise: tokenize element 15 of the training set as two separate sentences
# and as a pair, then compare the results.
sample = raw_train_dataset[15]
first_sentence = sample['sentence1']
second_sentence = sample['sentence2']

myseq1 = tokenizer(first_sentence)
myseq2 = tokenizer(second_sentence)
myseq_combined = tokenizer(
    first_sentence,
    second_sentence,
    padding=True,
    return_tensors='pt',
)

print(f"Sentence1: {myseq1['input_ids']}")
print(f"Sentence2: {myseq2['input_ids']}")
print(f"Combined : {myseq_combined['input_ids']}")

Sentence1: [101, 24049, 2001, 2087, 3728, 3026, 3580, 2343, 2005, 1996, 9722, 1004, 4132, 9340, 12439, 2964, 2449, 1012, 102]
Sentence2: [101, 3026, 3580, 2343, 4388, 24049, 1010, 3839, 2132, 1997, 1996, 9722, 1998, 4132, 9340, 12439, 2964, 3131, 1010, 2097, 2599, 1996, 2047, 9178, 1012, 102]
Combined : tensor([[  101, 24049,  2001,  2087,  3728,  3026,  3580,  2343,  2005,  1996,
          9722,  1004,  4132,  9340, 12439,  2964,  2449,  1012,   102,  3026,
          3580,  2343,  4388, 24049,  1010,  3839,  2132,  1997,  1996,  9722,
          1998,  4132,  9340, 12439,  2964,  3131,  1010,  2097,  2599,  1996,
          2047,  9178,  1012,   102]])


In [7]:
# Show the full pair encoding. In the output, token_type_ids is 0 for the first
# sentence (through its closing [SEP]) and 1 for the second sentence.
myseq_combined

{'input_ids': tensor([[  101, 24049,  2001,  2087,  3728,  3026,  3580,  2343,  2005,  1996,
          9722,  1004,  4132,  9340, 12439,  2964,  2449,  1012,   102,  3026,
          3580,  2343,  4388, 24049,  1010,  3839,  2132,  1997,  1996,  9722,
          1998,  4132,  9340, 12439,  2964,  3131,  1010,  2097,  2599,  1996,
          2047,  9178,  1012,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [8]:
# Map the ids of the second sentence back to tokens to see the [CLS]/[SEP]
# markers and the '##' sub-word pieces.
tokenizer.convert_ids_to_tokens(myseq2['input_ids'])

['[CLS]',
 'senior',
 'vice',
 'president',
 'eric',
 'rudder',
 ',',
 'formerly',
 'head',
 'of',
 'the',
 'developer',
 'and',
 'platform',
 'evan',
 '##gel',
 '##ism',
 'unit',
 ',',
 'will',
 'lead',
 'the',
 'new',
 'entity',
 '.',
 '[SEP]']

We can use the following to tokenize our complete dataset:
tokenized_dataset = tokenizer(
    raw_datasets["train"]["sentence1"],
    raw_datasets["train"]["sentence2"],
    padding=True,
    truncation=True,
)
- This works well, but it has the disadvantage of returning a dictionary. It will also only work if you have enough RAM to store your whole dataset during the tokenization (whereas the datasets from the 🤗 Datasets library are Apache Arrow files stored on the disk, so you only keep the samples you ask for loaded in memory).
- Additionally, we do not want to pad at this point because it would be very inefficient: every sample would be padded to the length of the longest sequence in the whole dataset. Instead, we will pad each batch individually.

In [9]:
# To keep the data as a dataset, we use the Dataset.map() method, which applies
# a function to the elements of the dataset.
def tokenize_function(example):
    """Tokenize the `sentence1` / `sentence2` fields of `example` as a pair.

    Truncation is enabled; no padding is requested here.
    Returns whatever the module-level `tokenizer` returns for the pair.
    """
    first, second = example["sentence1"], example["sentence2"]
    encoded = tokenizer(first, second, truncation=True)
    return encoded

In [10]:
# Tokenize every split; the result is still a DatasetDict, with input_ids,
# token_type_ids and attention_mask added as new columns.
# batched=True applies tokenize_function to multiple elements of the dataset
# at once, not to each element separately.
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
tokenized_datasets

Loading cached processed dataset at C:\Users\Sheraz\.cache\huggingface\datasets\glue\mrpc\1.0.0\dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad\cache-5fcd8e9e249ce44a.arrow


  0%|          | 0/1 [00:00<?, ?ba/s]

Loading cached processed dataset at C:\Users\Sheraz\.cache\huggingface\datasets\glue\mrpc\1.0.0\dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad\cache-9719dcf4bc76e406.arrow


DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 3668
    })
    validation: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 408
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1725
    })
})

#### Dynamic Padding

The function responsible for putting samples together into a batch is called a collate function.
To apply dynamic padding in practice, we define a collate function that adds the correct amount of padding to the items of the dataset we want to batch together.
The Transformers library provides such a function via DataCollatorWithPadding. It takes a tokenizer when you instantiate it and then does everything needed to pad each batch.

In [11]:
# Collate function that pads the samples of each batch when the batch is built;
# it is given the tokenizer at construction time.
from transformers import DataCollatorWithPadding
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [12]:
# Take the first 8 training examples and drop the raw-text columns and `idx`,
# leaving only label, input_ids, token_type_ids and attention_mask.
samples = {
    column: values
    for column, values in tokenized_datasets["train"][:8].items()
    if column not in ("idx", "sentence1", "sentence2")
}
# Un-padded length of each of the 8 token sequences.
list(map(len, samples["input_ids"]))

[50, 59, 47, 67, 59, 50, 62, 32]

In [13]:
# Dynamic padding: every sample in this batch is padded to length 67,
# the maximum length inside the batch (not the maximum of the whole dataset).
# In the output, the `label` column comes back under the key `labels`.
batch = data_collator(samples)
{k: v.shape for k, v in batch.items()}

{'input_ids': torch.Size([8, 67]),
 'token_type_ids': torch.Size([8, 67]),
 'attention_mask': torch.Size([8, 67]),
 'labels': torch.Size([8])}

In [14]:
from transformers import TrainingArguments

# Only the output directory is set; every other training option keeps its default.
training_args = TrainingArguments(output_dir="./model")

In [15]:
from transformers import AutoModelForSequenceClassification

# Re-create the model from the checkpoint (the earlier instance already took an
# optimizer step). num_labels=2 matches the two classes of the MRPC `label` column.
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [18]:
from transformers import Trainer

# Bundle the model, the training arguments, the tokenized train/validation
# splits, the padding collator and the tokenizer into a single Trainer.
# NOTE(review): the saved output of this cell is
# "ModuleNotFoundError: No module named 'click'" -- that looks like a package
# missing from the environment rather than a problem in this code; confirm by
# installing `click` and re-running.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
)

ModuleNotFoundError: No module named 'click'