# Explore PyTorch

Although I prefer using `.py` files, Jupyter notebooks do provide some advantages, especially step-by-step execution.

## Loading Model and Tokenizer

In [1]:
from transformers import BertModel, BertTokenizer

# Name of the pretrained checkpoint; the same name is reused by later cells.
checkpoint = "bert-base-uncased"

# Load the tokenizer and the model weights that belong to the same checkpoint,
# so that the ids produced by the tokenizer match what the model expects.
tokenizer: BertTokenizer = BertTokenizer.from_pretrained(checkpoint)
model: BertModel = BertModel.from_pretrained(checkpoint)

# Example sentence used by the tokenization cells below.
raw_inputs = "I've been waiting for a HuggingFace course my whole life."

Tokenization is actually a two-step process: split the text into tokens, then convert the tokens to ids.

In [2]:
# Step 1: split the raw text into sub-word tokens
# (pieces that continue a word carry a "##" prefix, e.g. "##face").
tokens = tokenizer.tokenize(raw_inputs)
# Step 2: look up the vocabulary id of every token.
ids = tokenizer.convert_tokens_to_ids(tokens)

print(tokens)
print(ids)

['i', "'", 've', 'been', 'waiting', 'for', 'a', 'hugging', '##face', 'course', 'my', 'whole', 'life', '.']
[1045, 1005, 2310, 2042, 3403, 2005, 1037, 17662, 12172, 2607, 2026, 2878, 2166, 1012]


The tokenizer interface allows padding and truncation.

You can also specify the `return_tensors` type, e.g. PyTorch (`"pt"`) or TensorFlow (`"tf"`).

In [3]:
# No padding requested: the length is that of the encoded sentence itself
# (16 ids in the recorded output, i.e. the 14 tokens plus the special tokens).
model_inputs = tokenizer(raw_inputs, return_tensors="pt")
print(model_inputs["input_ids"].size())

# "longest" pads to the longest sequence in the batch; with a single sequence
# there is nothing to pad, so the size stays the same.
model_inputs = tokenizer(raw_inputs, padding="longest", return_tensors="pt")
print(model_inputs["input_ids"].size())

# "max_length" without an explicit max_length pads up to the model's maximum
# input length (512 in the recorded output).
model_inputs = tokenizer(raw_inputs, padding="max_length", return_tensors="pt")
print(model_inputs["input_ids"].size())

# max_length=8 is shorter than the sequence, but padding never shortens an input
# and truncation is not enabled here, so the recorded size is still 16.
# NOTE(review): pass truncation=True as well if the intent is to cut the input to 8 ids.
model_inputs = tokenizer(raw_inputs, padding="max_length", max_length=8, return_tensors="pt")
print(model_inputs["input_ids"].size())


torch.Size([1, 16])
torch.Size([1, 16])
torch.Size([1, 512])
torch.Size([1, 16])


The tokenizer also takes multiple sequences as input.
Special tokens appear at the start and end of each sequence.

In [4]:
# A list of sentences is encoded as one batch; padding=True pads the shorter
# rows so that all rows share the same length.
sequences = [
    "I've been waiting for a HuggingFace course my whole life.",
    "So have I hate that!",
]

model_inputs = tokenizer(sequences, padding=True, return_tensors="pt")
print(model_inputs)
print(model_inputs["input_ids"].size())

# Decode each row back to text to make the special and padding tokens visible.
for encoded_row in model_inputs["input_ids"]:
    print(tokenizer.decode(encoded_row))

{'input_ids': tensor([[  101,  1045,  1005,  2310,  2042,  3403,  2005,  1037, 17662, 12172,
          2607,  2026,  2878,  2166,  1012,   102],
        [  101,  2061,  2031,  1045,  5223,  2008,   999,   102,     0,     0,
             0,     0,     0,     0,     0,     0]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0]])}
torch.Size([2, 16])
[CLS] i've been waiting for a huggingface course my whole life. [SEP]
[CLS] so have i hate that! [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]


# To Fine Tune / Train a model

Basic sample code.

In [5]:
import torch
from torch.optim import AdamW
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Reload the checkpoint with a sequence-classification head on top; the head
# weights are newly initialized (see the warning printed below the cell).
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
sequences = [
    "I've been waiting for a HuggingFace course my whole life.",
    "This course is amazing!",
]

# Encode both sentences as one padded batch and attach the target labels;
# passing "labels" makes the forward pass return a loss.
batch = tokenizer(sequences, padding=True, truncation=True, return_tensors="pt")
batch["labels"] = torch.tensor([1, 1])

optimizer = AdamW(model.parameters())

# from_pretrained() hands the model back in evaluation mode, so switch to
# training mode explicitly; otherwise dropout stays disabled during the step.
model.train()
# Clear any gradients left over from a previous run of this cell before
# accumulating new ones.
optimizer.zero_grad()

# One manual optimization step: forward pass, backward pass, weight update.
loss = model(**batch).loss
loss.backward()
optimizer.step()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Some exploration of the loaded dataset.

In [6]:
from datasets import load_dataset

# Load the "mrpc" configuration of the "glue" dataset; the result is a
# DatasetDict with train / validation / test splits (see the output below).
raw_datasets = load_dataset("glue", "mrpc")
# Bare last expression so the notebook displays the split overview.
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 3668
    })
    validation: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 408
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 1725
    })
})

In [7]:
# Look at the training split: one raw example, the column schema, and the
# human-readable names behind the integer labels.
train_ds = raw_datasets["train"]
print(train_ds[0])
print(train_ds.features)
print(train_ds.features["label"].names)

# For reference only (not executed): a ClassLabel feature can also be built by hand.
# from datasets import ClassLabel
# ClassLabel(num_classes=3, names=['bad', 'ok', 'good'])

{'sentence1': 'Amrozi accused his brother , whom he called " the witness " , of deliberately distorting his evidence .', 'sentence2': 'Referring to him as only " the witness " , Amrozi accused his brother of deliberately distorting his evidence .', 'label': 1, 'idx': 0}
{'sentence1': Value('string'), 'sentence2': Value('string'), 'label': ClassLabel(names=['not_equivalent', 'equivalent']), 'idx': Value('int32')}
['not_equivalent', 'equivalent']


In [None]:
def _tokenize_pairs(examples):
    """Tokenize the sentence1/sentence2 columns of a batch as sentence pairs."""
    return tokenizer(examples["sentence1"], examples["sentence2"], truncation=True)


# batched=True hands the helper a whole batch of rows at once.
# map() returns a new dataset, so the original train_ds is printed for comparison.
tokenized_ds = train_ds.map(_tokenize_pairs, batched=True)
print(train_ds)
print(tokenized_ds)

Dataset({
    features: ['sentence1', 'sentence2', 'label', 'idx'],
    num_rows: 3668
})
Dataset({
    features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 3668
})


Use a data collator to pad the sequences of each batch to a common length.

In [9]:
from transformers import DataCollatorWithPadding

batch_size = 16

# Slicing the dataset gives a dict of column name -> list of values.
samples = tokenized_ds[:batch_size]
# Drop the index and raw text columns before handing the rows to the collator.
samples = {
    column: values
    for column, values in samples.items()
    if column not in ("idx", "sentence1", "sentence2")
}

# The collator pads the selected rows to one shared length and returns tensors.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
batch = data_collator(samples)
print({name: tensor.shape for name, tensor in batch.items()})

{'input_ids': torch.Size([16, 67]), 'token_type_ids': torch.Size([16, 67]), 'attention_mask': torch.Size([16, 67]), 'labels': torch.Size([16])}


# Use a Trainer to fine tune the model

In [None]:
from transformers import TrainingArguments, AutoModelForSequenceClassification, Trainer
import numpy as np
import evaluate

# Training configuration: outputs go to "hf_test_dir"; evaluate once per epoch.
training_args = TrainingArguments(
    output_dir="hf_test_dir",
    eval_strategy="epoch",
)
# Fresh two-label classification model and the matching GLUE/MRPC metric.
model: AutoModelForSequenceClassification = AutoModelForSequenceClassification.from_pretrained(
    checkpoint,
    num_labels=2,
)
metric = evaluate.load("glue", "mrpc")

# Tokenize every split this time; tokenized_ds is now keyed by split name.
tokenized_ds = raw_datasets.map(
    lambda examples: tokenizer(examples["sentence1"], examples["sentence2"], truncation=True),
    batched=True,
)


def compute_metrics(eval_preds):
    """Score one evaluation pass.

    Args:
        eval_preds: A pair ``(logits, labels)`` as handed over by the Trainer.

    Returns:
        The result of ``metric.compute`` for the argmax class predictions
        against the reference labels.
    """
    logits, labels = eval_preds
    return metric.compute(
        predictions=np.argmax(logits, axis=-1),
        references=labels,
    )


model.train()
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_ds["train"],
    eval_dataset=tokenized_ds["validation"],
    data_collator=data_collator,
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/3668 [00:00<?, ? examples/s]

Map:   0%|          | 0/408 [00:00<?, ? examples/s]

Map:   0%|          | 0/1725 [00:00<?, ? examples/s]

In [12]:
# Run the fine-tuning loop configured in the Trainer above.
trainer.train()

# predict() returns a PredictionOutput (see the printed result below).
# Alternative kept for reference: unpack it into its parts instead.
# predictions, label_ids, metrics = trainer.predict(tokenized_ds["validation"])
predictions = trainer.predict(tokenized_ds["validation"])
# NOTE(review): this prints the full logits array for the validation split;
# consider printing only the metrics or the array shape to keep the output short.
print(predictions)



Step,Training Loss
500,0.5271
1000,0.3351




PredictionOutput(predictions=array([[-2.8588297e+00,  2.9076500e+00],
       [ 2.1174812e+00, -2.1830401e+00],
       [-1.3695407e+00,  1.0223094e+00],
       [-2.7460175e+00,  2.8163099e+00],
       [ 2.0967627e+00, -2.1941075e+00],
       [-2.8573503e+00,  2.9109869e+00],
       [-2.7460475e+00,  2.8361127e+00],
       [-2.8682790e+00,  2.9111447e+00],
       [-2.4225206e+00,  2.5764523e+00],
       [-2.8561912e+00,  2.9126270e+00],
       [-2.8786507e+00,  2.9200387e+00],
       [ 2.1351697e+00, -2.2298594e+00],
       [ 2.0381045e+00, -2.0524592e+00],
       [-2.7989392e+00,  2.8602877e+00],
       [-2.8759811e+00,  2.8983665e+00],
       [-2.7815151e+00,  2.8385062e+00],
       [-2.8701277e+00,  2.8969526e+00],
       [ 1.7477285e+00, -1.8299227e+00],
       [-2.8804173e+00,  2.9075732e+00],
       [ 1.7656301e+00, -1.8787601e+00],
       [ 1.8633493e+00, -1.9813521e+00],
       [-9.7451252e-01,  7.2711480e-01],
       [ 2.0082030e+00, -2.0762424e+00],
       [-2.8459220e+00,  2.8