In [1]:
from transformers import PreTrainedTokenizerFast, BertConfig, BertForMaskedLM, DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments
from datasets import Dataset

In [2]:
# 加载从头训练的分词器(也可使用已有的分词器)
tokenizer_fast = PreTrainedTokenizerFast(tokenizer_file='tokenizer-wikitext-103.json')
tokenizer_fast

PreTrainedTokenizerFast(name_or_path='', vocab_size=30000, model_max_len=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={})

In [3]:
tokenizer_fast.add_special_tokens(special_tokens_dict={'eos_token': '[EOS]',
                                                       'mask_token': '[UNK]',
                                                       'pad_token': '[PAD]',
                                                       'cls_token': '[CLS]',
                                                       'sep_token': '[SEP]'})
tokenizer_fast.all_special_tokens

['[EOS]', '[SEP]', '[PAD]', '[CLS]', '[UNK]']

In [4]:
files = [f"wikitext-103-raw/wiki.{split}.raw" for split in ["test", "train", "valid"]]
dataset = Dataset.from_text(files[2])
dataset

Using custom data configuration default-2d03098f229c8816
Reusing dataset text (/root/.cache/huggingface/datasets/text/default-2d03098f229c8816/0.0.0)


Dataset({
    features: ['text'],
    num_rows: 3760
})

In [5]:
def filter_func(data):
    text = data['text']
    return len(text) > 0 and not text.isspace()  # 过滤空白行


def map_func(data):
    batch_encoding = tokenizer_fast(data['text'], truncation=True, padding="max_length", max_length=512)
    return {'input_ids': batch_encoding['input_ids'],
            'attention_mask': batch_encoding['attention_mask'],
            'token_type_ids': batch_encoding['token_type_ids']}


dataset_filter = dataset.filter(filter_func)
dataset_map = dataset_filter.map(map_func, batched=True, batch_size=1000)  # 每次处理1000条数据
dataset_map

  0%|          | 0/4 [00:00<?, ?ba/s]

  0%|          | 0/3 [00:00<?, ?ba/s]

Dataset({
    features: ['text', 'input_ids', 'attention_mask', 'token_type_ids'],
    num_rows: 2461
})

In [6]:
# 相当于torch.utils.data.DataLoader中collate_fn的作用(可以重写,参考_way_of_training/pytorch_transformer.ipynb)
# Data collator used for language modeling
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer_fast, mlm=True, mlm_probability=0.15)
data_collator

DataCollatorForLanguageModeling(tokenizer=PreTrainedTokenizerFast(name_or_path='', vocab_size=30000, model_max_len=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'eos_token': '[EOS]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[UNK]'}), mlm=True, mlm_probability=0.15, pad_to_multiple_of=None, tf_experimental_compile=False, return_tensors='pt')

In [7]:
config = BertConfig(
    vocab_size=tokenizer_fast.vocab_size,
    hidden_size=768,
    num_hidden_layers=6,
    num_attention_heads=12,
    max_position_embeddings=512
)

# Bert Model with a language modeling head on top.
model = BertForMaskedLM(config)
print('No of parameters: ', model.num_parameters())

No of parameters:  81965648


In [8]:
training_args = TrainingArguments(
    output_dir='preTrained_Model',
    overwrite_output_dir=True,
    num_train_epochs=5,
    per_device_train_batch_size=16,
    save_strategy='epoch'
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset_map,
    tokenizer=tokenizer_fast
)

trainer.train()

The following columns in the training set  don't have a corresponding argument in `BertForMaskedLM.forward` and have been ignored: text. If text are not expected by `BertForMaskedLM.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 2461
  Num Epochs = 5
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 770


Step,Training Loss
500,7.4178


Saving model checkpoint to preTrained_Model/checkpoint-154
Configuration saved in preTrained_Model/checkpoint-154/config.json
Model weights saved in preTrained_Model/checkpoint-154/pytorch_model.bin
tokenizer config file saved in preTrained_Model/checkpoint-154/tokenizer_config.json
Special tokens file saved in preTrained_Model/checkpoint-154/special_tokens_map.json
Saving model checkpoint to preTrained_Model/checkpoint-308
Configuration saved in preTrained_Model/checkpoint-308/config.json
Model weights saved in preTrained_Model/checkpoint-308/pytorch_model.bin
tokenizer config file saved in preTrained_Model/checkpoint-308/tokenizer_config.json
Special tokens file saved in preTrained_Model/checkpoint-308/special_tokens_map.json
Saving model checkpoint to preTrained_Model/checkpoint-462
Configuration saved in preTrained_Model/checkpoint-462/config.json
Model weights saved in preTrained_Model/checkpoint-462/pytorch_model.bin
tokenizer config file saved in preTrained_Model/checkpoint-462/

TrainOutput(global_step=770, training_loss=7.229094428520698, metrics={'train_runtime': 196.0867, 'train_samples_per_second': 62.753, 'train_steps_per_second': 3.927, 'total_flos': 1631901312860160.0, 'train_loss': 7.229094428520698, 'epoch': 5.0})