In [27]:
import evaluate
import numpy as np
from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorWithPadding, TrainingArguments, AutoModelForSequenceClassification, Trainer

In [2]:
# Checkpoint name shared by the tokenizer here and the classifier loaded later.
checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# MRPC configuration of the GLUE benchmark (sentence-pair classification).
raw_datasets = load_dataset(path="glue", name="mrpc")

In [3]:
def tokenize_function(example):
    """Tokenize the two sentences of each example as a pair.

    Args:
        example: Mapping with ``"sentence1"`` and ``"sentence2"`` entries.
            In this notebook it is invoked through ``map(..., batched=True)``,
            so each entry holds a batch of sentences rather than one string.

    Returns:
        The tokenizer's output for the sentence pair(s), produced with
        truncation enabled.
    """
    first_sentences = example["sentence1"]
    second_sentences = example["sentence2"]
    return tokenizer(first_sentences, second_sentences, truncation=True)

In [4]:
# Collator built around the same tokenizer; it is handed to the Trainer below.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Apply the pair tokenizer to every split, a batch of examples at a time.
tokenized_datasets = raw_datasets.map(function=tokenize_function, batched=True)

In [5]:
# Default training configuration; only the output directory is specified here.
training_args = TrainingArguments(output_dir="../temp/test-trainer")

In [6]:
# Load the pretrained checkpoint with a 2-label sequence-classification head.
# The classifier weights are newly initialized (see the warning printed below),
# so the model has to be fine-tuned before its predictions are meaningful.
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# Examine model

In [7]:
# Bare expression: the cell output is the model's module tree.
model

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

# Fine-tune

In [8]:
# Build the training configuration in one place instead of assigning attributes
# onto an existing TrainingArguments object: values assigned after construction
# bypass the argument normalization/validation performed at init time, and the
# cell would otherwise depend on whatever state `training_args` was left in by
# earlier cells.
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` — confirm against the installed version.
training_args = TrainingArguments(
    "../temp/test-trainer",
    evaluation_strategy="epoch",  # evaluate on eval_dataset once per epoch
    num_train_epochs=3,
)

# No compute_metrics yet, so per-epoch evaluation only reports the loss.
trainer = Trainer(
    model,
    training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
)

In [9]:
# Run fine-tuning; the returned TrainOutput is shown as the cell result.
trainer.train()

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss
1,No log,0.412826
2,0.528200,0.453798
3,0.306400,0.628002


TrainOutput(global_step=1377, training_loss=0.3458745121176726, metrics={'train_runtime': 161.1505, 'train_samples_per_second': 68.284, 'train_steps_per_second': 8.545, 'total_flos': 405540469624800.0, 'train_loss': 0.3458745121176726, 'epoch': 3.0})

# Inspect some predictions

In [10]:
# Run the fine-tuned model over the validation split. The result exposes
# `.predictions`, `.label_ids` and `.metrics`, inspected in the cells below.
predictions = trainer.predict(tokenized_datasets["validation"])

In [11]:
# Shapes of the raw model outputs and of the reference labels, one per line.
for array in (predictions.predictions, predictions.label_ids):
    print(array.shape)

(408, 2)
(408,)


In [12]:
# First five rows of raw model scores (two columns, matching num_labels=2).
predictions.predictions[:5]

array([[-3.5333772,  3.5699859],
       [ 3.1833112, -3.551851 ],
       [ 1.4999326, -1.6531037],
       [-3.3935084,  3.3974605],
       [ 2.5472898, -2.9051936]], dtype=float32)

In [13]:
# Reference labels for the same five examples.
predictions.label_ids[:5]

array([1, 0, 0, 1, 0])

In [14]:
# Loss and throughput figures recorded by predict().
predictions.metrics

{'test_loss': 0.628001868724823,
 'test_runtime': 1.2494,
 'test_samples_per_second': 326.56,
 'test_steps_per_second': 40.82}

# Compute custom metrics

In [15]:
# Reference labels, and the predicted class for each example (index of the
# largest score along the last axis).
labs = predictions.label_ids
preds = np.argmax(predictions.predictions, axis=-1)

In [22]:
# Score the argmax predictions against the reference labels with the GLUE/MRPC
# metric; `labs` is the same array as `predictions.label_ids`.
metric = evaluate.load("glue", "mrpc")
metric.compute(references=labs, predictions=preds)

{'accuracy': 0.8406862745098039, 'f1': 0.8877374784110534}

In [23]:
_MRPC_METRIC = None  # loaded on first use, then reused by every evaluation pass


def compute_metrics(eval_preds):
    """Score one evaluation pass with the GLUE/MRPC metric.

    The metric object is loaded once and cached at module level, so repeated
    evaluations (one per epoch in this notebook) do not reload it each time.

    Args:
        eval_preds: A ``(logits, labels)`` pair. ``logits`` holds one row of
            per-class scores per example; ``labels`` holds the reference
            class ids.

    Returns:
        The result of the metric's ``compute`` call for the argmax
        predictions against ``labels``.
    """
    global _MRPC_METRIC
    if _MRPC_METRIC is None:
        _MRPC_METRIC = evaluate.load("glue", "mrpc")

    logits, labels = eval_preds
    # Predicted class = index of the highest score in each row.
    predictions = np.argmax(logits, axis=-1)
    return _MRPC_METRIC.compute(predictions=predictions, references=labels)

In [28]:
# Reload the model from the original checkpoint so the second training run
# starts from the pretrained weights, not from the model fine-tuned above.
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [29]:
# Build the training configuration in one place instead of assigning attributes
# onto an existing TrainingArguments object: values assigned after construction
# bypass the argument normalization/validation performed at init time, and the
# cell would otherwise depend on whatever state `training_args` was left in by
# earlier cells.
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` — confirm against the installed version.
training_args = TrainingArguments(
    "../temp/test-trainer",
    evaluation_strategy="epoch",  # evaluate on eval_dataset once per epoch
    num_train_epochs=3,
)

# Same setup as the first run, plus compute_metrics so that each per-epoch
# evaluation reports the metric's scores in addition to the loss.
trainer = Trainer(
    model,
    training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [30]:
# Fine-tune again; with compute_metrics configured, the per-epoch evaluation
# table also includes Accuracy and F1 columns.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,0.383768,0.838235,0.887372
2,0.523000,0.581205,0.848039,0.895623
3,0.292000,0.666918,0.862745,0.902778


TrainOutput(global_step=1377, training_loss=0.33292880643513556, metrics={'train_runtime': 162.7212, 'train_samples_per_second': 67.625, 'train_steps_per_second': 8.462, 'total_flos': 405540469624800.0, 'train_loss': 0.33292880643513556, 'epoch': 3.0})