In [6]:
from transformers import BertModel, BertTokenizer
import torch
import torch.utils.data as Data
from sklearn.metrics import f1_score
from transformers import Trainer, TrainingArguments

from text_classification.models.Bert_base import Bert_base
from text_classification.data_ag_news.data.data_process_bert import Dataset, get_collate_fn

%run ..\..\models\Bert_base.py

In [7]:
# Run on the GPU when CUDA is available; otherwise stay on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model_ckpt = "bert-base-uncased"

# Tokenizer for the chosen checkpoint.
token = BertTokenizer.from_pretrained(model_ckpt)

# Test and train splits are read from CSV files next to the notebook's parent folder.
dataset_test = Dataset("../datasets/test.csv")
dataset_train = Dataset("../datasets/train.csv")

# Collate function built from the tokenizer; used for batching below and by the Trainer later.
collate_fn = get_collate_fn(token)

# Sanity check: batch the test split (4 samples per batch) and print only the first batch.
dataLoader = Data.DataLoader(dataset_test, batch_size=4, collate_fn=collate_fn)
for i in dataLoader:
    print(i)
    break

loading file https://huggingface.co/bert-base-uncased/resolve/main/vocab.txt from cache at C:\Users\dcdmm/.cache\huggingface\transformers\45c3f7a79a80e1cf0a489e5c62b43f173c15db47864303a55d623bb3c96f72a5.d789d64ebfe299b0e416afc4a169632f903f693095b4629a7ea271d5a0cf2c99
loading file https://huggingface.co/bert-base-uncased/resolve/main/added_tokens.json from cache at None
loading file https://huggingface.co/bert-base-uncased/resolve/main/special_tokens_map.json from cache at None
loading file https://huggingface.co/bert-base-uncased/resolve/main/tokenizer_config.json from cache at C:\Users\dcdmm/.cache\huggingface\transformers\c1d7f0a763fb63861cc08553866f1fc3e5a6f4f07621be277452d26d71303b7e.20430bd8e10ef77a7d2977accefe796051e01bc2fc4aa146bc862997a1a15e79
loading configuration file https://huggingface.co/bert-base-uncased/resolve/main/config.json from cache at C:\Users\dcdmm/.cache\huggingface\transformers\3c61d016573b14f7f008c02c4e51a366c67ab274726fe2910691e2a761acf43e.37395cee442ab11005b

{'input_ids': tensor([[  101, 10069,  2005,  1056,  1050, 11550,  2044,  7566,  9209,  5052,
          3667,  2012,  6769,  2047,  8095,  2360,  2027,  2024,  1005,  9364,
          1005,  2044,  7566,  2007, 16654,  6687,  3813,  2976,  9587, 24848,
          1012,   102,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,  

In [8]:
# Load the pretrained encoder and freeze every one of its parameters so that
# they receive no gradient updates during training.
pretrained = BertModel.from_pretrained(model_ckpt)
for param in pretrained.parameters():
    param.requires_grad_(False)

# Wrap the frozen encoder in the project's Bert_base classifier (4 outputs,
# cross-entropy loss) and move the whole model to the selected device.
criterion = torch.nn.CrossEntropyLoss()
model = Bert_base(pretrained, 4, criterion).to(device)

# Last expression of the cell: display the model structure.
model

loading configuration file https://huggingface.co/bert-base-uncased/resolve/main/config.json from cache at C:\Users\dcdmm/.cache\huggingface\transformers\3c61d016573b14f7f008c02c4e51a366c67ab274726fe2910691e2a761acf43e.37395cee442ab11005bcd270f3c34464dc1704b715b5d7d52b1a461abe3b9e4e
Model config BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.17.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

loading weights file https://huggingface.co/bert-base-uncased/resolve/main/pytorch_model.bin fr

Bert_base(
  (fc): Linear(in_features=768, out_features=4, bias=True)
  (pretrained): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (Lay

In [9]:
def compute_metrics(pred):
    """Evaluation callback: micro-averaged F1 over the predicted classes.

    Args:
        pred: Object exposing ``label_ids`` and ``predictions``. Per the
            original author's notes, ``label_ids`` corresponds to the
            ``labels`` input of the custom model's ``forward`` and
            ``predictions`` to the second element of its return value.

    Returns:
        dict: ``{"f1 score": value}`` where ``value`` is the micro-averaged
        F1 between ``label_ids`` and the argmax of ``predictions`` along the
        last axis.
    """
    # Highest-scoring entry along the last axis is taken as the predicted class.
    predicted_classes = pred.predictions.argmax(-1)
    return {"f1 score": f1_score(pred.label_ids, predicted_classes, average='micro')}


# Main hyperparameters to tune.
# NOTE(review): no save strategy is passed, so the library default applies;
# the training log shows a checkpoint written every 500 steps — confirm that
# this is intended for a 30000-step run.
training_args = TrainingArguments(
    # Where results go, and whether an existing directory may be overwritten.
    output_dir="output_dir",
    overwrite_output_dir=True,
    # Reproducibility.
    seed=42,
    # Training length and per-device batch sizes.
    num_train_epochs=1.0,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    # Evaluate and log once per epoch.
    evaluation_strategy="epoch",
    logging_strategy="epoch",
    # Keep the tqdm progress display enabled.
    disable_tqdm=False,
)

# Assemble the Trainer: model and arguments, the train/test datasets, the
# tokenizer-based collate function for batching, and the F1 metric callback.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset_train,
    eval_dataset=dataset_test,
    data_collator=collate_fn,
    tokenizer=token,
    compute_metrics=compute_metrics,
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [10]:
# Start training with the Trainer built in the previous cell. The call is the
# cell's last expression, so its return value is shown as the cell output.
trainer.train()  # train the model

***** Running training *****
  Num examples = 120000
  Num Epochs = 1
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 1
  Total optimization steps = 30000


Epoch,Training Loss,Validation Loss


Saving model checkpoint to output_dir\checkpoint-500
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
tokenizer config file saved in output_dir\checkpoint-500\tokenizer_config.json
Special tokens file saved in output_dir\checkpoint-500\special_tokens_map.json
Saving model checkpoint to output_dir\checkpoint-1000
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
tokenizer config file saved in output_dir\checkpoint-1000\tokenizer_config.json
Special tokens file saved in output_dir\checkpoint-1000\special_tokens_map.json
Saving model checkpoint to output_dir\checkpoint-1500
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
tokenizer config file saved in output_dir\checkpoint-1500\tokenizer_config.json
Special tokens file saved in output_dir\checkpoint-1500\special_tokens_map.json
Saving model checkpoint to output_dir\checkpoint-2000
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
tokenizer config file saved