In [None]:
from IPython.display import Image

## TrainingArguments

- 维护统一命名的训练参数

    - `output_dir`
    - `num_train_epochs`
    - `evaluation_strategy`
        - `epoch`
    - `logging_steps`:
        - 例如 100：每隔多少个 optimizer steps 记录/显示一次 loss 及其他 metrics
    - `per_device_train_batch_size`
    - `per_device_eval_batch_size`
    - `save_strategy`
        - 'epoch'
    - 优化器相关
        - `learning_rate`
        - `alpha`
        - `weight_decay`
        - `optim`
            - 'adamw_torch'
        - lr scheduler
            - `lr_scheduler_type="linear"`,
                - linear
                - cosine
            - `warmup_ratio=0.1`,
    - 混合精度训练
        - `fp16`: `True`
    - `push_to_hub`
        - `True/False`
- device_map
    - `device_map = {"": 0}`: 只使用 gpu 0，一张卡

### lr scheduler

- lr_scheduler_type
    - warmup_steps
    - warmup_ratio

```
def get_warmup_steps(self, num_training_steps: int):
    """
    Return how many steps the linear warmup phase lasts.

    A positive ``self.warmup_steps`` takes precedence; otherwise the count is
    ``num_training_steps * self.warmup_ratio`` rounded up to an integer.
    """
    if self.warmup_steps > 0:
        return self.warmup_steps
    return math.ceil(num_training_steps * self.warmup_ratio)
```

## Trainer

In [None]:
# Display the figure stored at ../imgs/trainer.png (relative to this notebook) with width=500.
Image('../imgs/trainer.png', width=500)

- Trainer pipeline

```
train()
    inner_training_loop()
        for epoch in range(num_train_epochs):
            for step, inputs in enumerate(epoch_iterator):
                tr_loss_step = self.training_step(model, inputs)
                    loss = self.compute_loss(model, inputs)
                    loss.backward()
```

- `model` or `model_init`（Function object）
    - 必须指定其一；
- 核心的成员函数
    - `compute_loss`: batch 粒度
- 数据
    - `train_dataset`
    - `eval_dataset`
- 参数：
    - `args`
- tokenizer
    - `tokenizer`
- 重要回调函数（非成员函数）
    - `compute_metrics`：参数类型为 `EvalPrediction`
    
- datasets/inputs 的关键成员
    - `labels`：Trainer looks for a column called labels 

## examples

In [None]:
import os
# Point both HTTP and HTTPS traffic of this process at the local proxy on port 7890.
# NOTE(review): presumably needed so the downloads below can reach the Hub —
# adjust or remove when no proxy is listening on 127.0.0.1:7890.
os.environ.update(
    HTTP_PROXY='http://127.0.0.1:7890',
    HTTPS_PROXY='http://127.0.0.1:7890',
)

### datasets

In [None]:
from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorWithPadding

# Checkpoint name shared by the tokenizer here and the model loaded further down.
ckpt = 'bert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(ckpt)

# The 'mrpc' configuration of the 'glue' dataset.
raw_datasets = load_dataset(path='glue', name='mrpc')

def tokenize_func(examples):
    """Tokenize sentence pairs with the module-level ``tokenizer``.

    Args:
        examples: Mapping that provides ``sentence1`` and ``sentence2`` entries
            (a batch of rows when used with ``map(..., batched=True)``).

    Returns:
        Whatever ``tokenizer`` returns for the pairs, called with
        ``truncation=True``; no padding argument is passed here.
    """
    first_sentences = examples['sentence1']
    second_sentences = examples['sentence2']
    return tokenizer(first_sentences, second_sentences, truncation=True)

# Apply tokenize_func to the dataset in batches.
tokenized_datasets = raw_datasets.map(function=tokenize_func, batched=True)
# Collator built from the same tokenizer; it is passed to the Trainer below.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
# Show the object returned by load_dataset above.
raw_datasets

In [None]:
# Inspect the first row of the 'train' split.
raw_datasets['train'][0]

In [None]:
# Tokenize the validation sentence pairs directly (same call shape as tokenize_func)
# and keep only the 'input_ids' entry.
validation_split = raw_datasets['validation']
validation_encodings = tokenizer(
    validation_split['sentence1'],
    validation_split['sentence2'],
    truncation=True,
)
input_ids = validation_encodings['input_ids']

In [None]:
# Length of the first entry of input_ids (computed from the validation split above).
len(input_ids[0])

In [None]:
# Show the dataset after tokenize_func has been mapped over it.
tokenized_datasets

### model

In [2]:
from transformers import AutoModelForSequenceClassification
# Sequence-classification model initialised from `ckpt`, configured with two labels.
model = AutoModelForSequenceClassification.from_pretrained(
    ckpt,
    num_labels=2,
)
# Evaluate `model` on its own in a cell to print the module tree.

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly i

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

### training

In [None]:
# Re-display the tokenized dataset whose 'train' / 'validation' splits go to the Trainer below.
tokenized_datasets

- `TrainingArguments`: `@dataclass`
    - 只是用来存参数配置的；
    - batch_size 与 global_step 
        - per_device_train_batch_size=16,
            - per_device_eval_batch_size=16,
        - num_train_epochs=5,
        - `5 * ceil(3668 / (16*2)) == 575`（每个 epoch 的 step 数向上取整，与下方训练输出的 `global_step=575` 一致）
            - 2 表示我本机 gpus 的数量
    - `evaluation_strategy`
        - `epoch`
        - `steps`
    - `logging_strategy`：如果不指定（默认为 `steps`，且 `logging_steps=500`），在第一次 logging 之前，输出表格中的 `Training Loss` 会显示为 `No log`
        - `epoch`
    - 梯度优化相关
        - `gradient_accumulation_steps`
        
- `Trainer`
    - `data_collator`: 
        - `DataCollatorWithPadding(tokenizer)`
            - dynamic padding

In [None]:
import math

# Total optimizer steps = epochs * steps_per_epoch, where steps_per_epoch is
# rounded UP because the last, partially filled batch still counts as a step:
#   epochs=5, samples=3668, per-device batch=16, GPUs=2  ->  5 * ceil(3668 / 32) = 575
# The plain division 5*3668/(16*2) gives 573.125 and does not match the
# global_step=575 reported by trainer.train() below.
expected_global_steps = 5 * math.ceil(3668 / (16 * 2))
expected_global_steps

In [3]:
from transformers import TrainingArguments

# Training configuration: checkpoints/logs go to ./test-trainer; evaluation and
# logging both happen once per epoch.
training_args = TrainingArguments(
    output_dir="test-trainer",
    # schedule
    num_train_epochs=5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # optimizer
    learning_rate=2e-5,
    weight_decay=0.01,
    # reporting
    evaluation_strategy='epoch',
    logging_strategy='epoch',
)

In [4]:
from datasets import load_metric
import numpy as np
# Load the GLUE/MRPC metric object used by compute_metrics below.
# NOTE(review): `load_metric` is deprecated in newer `datasets` releases in favour of
# the separate `evaluate` package — TODO confirm and migrate when upgrading.
metric = load_metric("glue", "mrpc")

def compute_metrics(eval_preds):
    """Score model outputs with the module-level ``metric``.

    Args:
        eval_preds: Two-element iterable ``(logits, labels)``.

    Returns:
        The result of ``metric.compute`` on the arg-max class of each row of
        ``logits`` (taken over the last axis) against ``labels``.
    """
    scores, gold_labels = eval_preds
    predicted_classes = np.argmax(scores, axis=-1)
    return metric.compute(predictions=predicted_classes, references=gold_labels)

  metric = load_metric("glue", "mrpc")
Using the latest cached version of the module from /home/whaow/.cache/huggingface/modules/datasets_modules/metrics/glue/91f3cfc5498873918ecf119dbf806fb10815786c84f41b85a5d3c47c1519b343 (last modified on Sun Jun 18 16:07:08 2023) since it couldn't be found locally at glue, or remotely on the Hugging Face Hub.


In [5]:
from transformers import Trainer

# Wire model, configuration, data splits, collator, tokenizer and metric callback
# into a Trainer, then start fine-tuning.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)
# Kept as the cell's last expression so its return value is displayed.
trainer.train()

[34m[1mwandb[0m: Currently logged in as: [33mlanchunhui[0m ([33mloveresearch[0m). Use [1m`wandb login --relogin`[0m to force relogin


You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.5453,0.44357,0.816176,0.877651
2,0.3501,0.367913,0.830882,0.880829
3,0.2062,0.433944,0.840686,0.888889
4,0.1179,0.515514,0.833333,0.885522
5,0.0768,0.483391,0.848039,0.891228




TrainOutput(global_step=575, training_loss=0.25926235033118206, metrics={'train_runtime': 98.4745, 'train_samples_per_second': 186.241, 'train_steps_per_second': 5.839, 'total_flos': 753299284826400.0, 'train_loss': 0.25926235033118206, 'epoch': 5.0})

### inference

In [None]:
# Run the trained model over the validation split and display the result.
predictions = trainer.predict(test_dataset=tokenized_datasets['validation'])
predictions

In [None]:
from datasets import load_metric
import numpy as np
# Recompute the GLUE/MRPC metric by hand from the prediction output:
# arg-max over the last axis of the raw predictions vs. the stored label ids.
metric = load_metric("glue", "mrpc")
raw_scores = predictions.predictions
gold_labels = predictions.label_ids
preds = np.argmax(raw_scores, axis=-1)
metric.compute(predictions=preds, references=gold_labels)