In [1]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, DataCollatorWithPadding

In [3]:
raw_datasets = load_dataset("glue", "mrpc")
checkpoint = "google/bert_uncased_L-2_H-128_A-2"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

def tokenize_function(example):
    return tokenizer(example["sentence1"], example["sentence2"], truncation=True)

tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)

Map:   0%|          | 0/408 [00:00<?, ? examples/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [4]:
# 数据收集器类，自动处理数据里的填充，使得同一个batch中的序列长度保持一致
data_collator = DataCollatorWithPadding(tokenizer)

In [5]:
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
from transformers import TrainingArguments
# path to folder 保存结果和checkpoints
training_args = TrainingArguments(
    "test-trainer",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    learning_rate=3e-5,
    weight_decay=0.05,
)


In [7]:
from transformers import Trainer

trainer = Trainer(
    model,
    training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
)
trainer.train()

Step,Training Loss
500,0.6042
1000,0.5542


Checkpoint destination directory test-trainer\checkpoint-500 already exists and is non-empty.Saving will proceed but saved results may be invalid.
Checkpoint destination directory test-trainer\checkpoint-1000 already exists and is non-empty.Saving will proceed but saved results may be invalid.


TrainOutput(global_step=1150, training_loss=0.5708178578252378, metrics={'train_runtime': 107.1118, 'train_samples_per_second': 171.223, 'train_steps_per_second': 10.736, 'total_flos': 3463883657760.0, 'train_loss': 0.5708178578252378, 'epoch': 5.0})

In [8]:
predictions = trainer.predict(tokenized_datasets["validation"])
print(predictions.predictions.shape, predictions.label_ids.shape)

(408, 2) (408,)


In [9]:
import numpy as np
from datasets import load_metric

metric = load_metric("glue", "mrpc")
preds = np.argmax(predictions.predictions, axis=-1)
metric.compute(predictions=preds, references=predictions.label_ids)

  metric = load_metric("glue", "mrpc")
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


{'accuracy': 0.7058823529411765, 'f1': 0.8181818181818182}

In [10]:
def compute_metrics(eval_preds):
    logits, labels = eval_preds
    preds = np.argmax(logits, axis=-1)
    return metric.compute(predictions=preds, references=labels)

training_args = TrainingArguments(
    "test-trainer",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    learning_rate=3e-5,
    weight_decay=0.05,
    evaluation_strategy="epoch",
)
trainer = Trainer(
    model,
    training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,0.578222,0.715686,0.824242
2,No log,0.566954,0.715686,0.822086
3,0.503500,0.584563,0.727941,0.830534
4,0.503500,0.578646,0.737745,0.835637
5,0.453900,0.573223,0.735294,0.832817


Checkpoint destination directory test-trainer\checkpoint-500 already exists and is non-empty.Saving will proceed but saved results may be invalid.
Checkpoint destination directory test-trainer\checkpoint-1000 already exists and is non-empty.Saving will proceed but saved results may be invalid.


TrainOutput(global_step=1150, training_loss=0.473566025443699, metrics={'train_runtime': 247.142, 'train_samples_per_second': 74.208, 'train_steps_per_second': 4.653, 'total_flos': 3466484185920.0, 'train_loss': 0.473566025443699, 'epoch': 5.0})

In [11]:
predictions_test = trainer.predict(tokenized_datasets["test"])
preds_test = np.argmax(predictions_test.predictions, axis=-1)
metric.compute(predictions=preds_test, references=predictions_test.label_ids)

{'accuracy': 0.7194202898550724, 'f1': 0.8165276724791509}