In [1]:
from transformers import PreTrainedTokenizerFast, RobertaForMaskedLM, DataCollatorForLanguageModeling, AutoConfig
from datasets import Dataset
from transformers import Trainer, TrainingArguments

In [2]:
# 加载训练好的分词器
tokenizer_fast = PreTrainedTokenizerFast(tokenizer_file='tokenizer.json')

# 模仿HuggingFace transformers/models/bert/tokenizaiton_bert.py
tokenizer_fast.add_special_tokens(special_tokens_dict={'bos_token': "<s>",
                                                       'eos_token': '</s>',
                                                       'unk_token': '<unk>',
                                                       'pad_token': '<pad>',
                                                       'cls_token': '<s>',
                                                       'mask_token': '<mask>',
                                                       'sep_token': '</s>'})
tokenizer_fast

PreTrainedTokenizerFast(name_or_path='', vocab_size=6982, model_max_len=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'sep_token': '</s>', 'pad_token': '<pad>', 'cls_token': '<s>', 'mask_token': '<mask>'})

In [3]:
dataset = Dataset.from_text('../../data/csv_to_trainTxt/trainTxt_pretrain_model.txt')
dataset

Using custom data configuration default-c427600ca8f54ad1


Downloading and preparing dataset text/default to /root/.cache/huggingface/datasets/text/default-c427600ca8f54ad1/0.0.0...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Dataset text downloaded and prepared to /root/.cache/huggingface/datasets/text/default-c427600ca8f54ad1/0.0.0. Subsequent calls will reuse this data.


Dataset({
    features: ['text'],
    num_rows: 662554
})

In [4]:
def filter_func(data):
    text = data['text']
    return len(text) > 0 and not text.isspace()  # 过滤空白行


def map_func(data):
    batch_encoding = tokenizer_fast(data['text'], truncation=True, padding="max_length", max_length=512)
    return {'input_ids': batch_encoding['input_ids'],
            'attention_mask': batch_encoding['attention_mask'],
            'token_type_ids': batch_encoding['token_type_ids']}


dataset_filter = dataset.filter(filter_func)
dataset_map = dataset_filter.map(map_func, batched=True, batch_size=1000)  # 每次处理1000条数据
dataset_map

  0%|          | 0/663 [00:00<?, ?ba/s]

  0%|          | 0/570 [00:00<?, ?ba/s]

Dataset({
    features: ['text', 'input_ids', 'attention_mask', 'token_type_ids'],
    num_rows: 569598
})

In [5]:
# Data collator used for language modeling
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer_fast, mlm=True, mlm_probability=0.15)
data_collator

DataCollatorForLanguageModeling(tokenizer=PreTrainedTokenizerFast(name_or_path='', vocab_size=6982, model_max_len=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'sep_token': '</s>', 'pad_token': '<pad>', 'cls_token': '<s>', 'mask_token': '<mask>'}), mlm=True, mlm_probability=0.15, pad_to_multiple_of=None, tf_experimental_compile=False, return_tensors='pt')

In [6]:
config_roberta_base = AutoConfig.from_pretrained('roberta-base')  # 加载预训练模型roberta config参数
config_roberta_base.update({
    'vocab_size': tokenizer_fast.vocab_size
})
print(config_roberta_base)

model = RobertaForMaskedLM(config_roberta_base)  # 从0开始预训练roberta模型
# model = RobertaForMaskedLM.from_pretrained('output_idr/checkpoint-??????')  # 从checkpoint开始训练(之前模型训练中断)
print('No of parameters: ', model.num_parameters())

Downloading:   0%|          | 0.00/481 [00:00<?, ?B/s]

RobertaConfig {
  "_name_or_path": "roberta-base",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.18.0",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 6982
}

No of parameters:  91412806


In [7]:
training_args = TrainingArguments(
    output_dir='output_dir',
    overwrite_output_dir=True,
    # num_train_epochs=30.0,
    max_steps=50,
    per_device_train_batch_size=16,
    save_strategy='epoch',
    disable_tqdm=False,  # 是否使用tqdm显示进度
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset_map,
    tokenizer=tokenizer_fast
)

trainer.train()

max_steps is given, it will override any value given in num_train_epochs
The following columns in the training set  don't have a corresponding argument in `RobertaForMaskedLM.forward` and have been ignored: text. If text are not expected by `RobertaForMaskedLM.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 569598
  Num Epochs = 1
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 50


Step,Training Loss


Saving model checkpoint to output_dir/checkpoint-50
Configuration saved in output_dir/checkpoint-50/config.json
Model weights saved in output_dir/checkpoint-50/pytorch_model.bin
tokenizer config file saved in output_dir/checkpoint-50/tokenizer_config.json
Special tokens file saved in output_dir/checkpoint-50/special_tokens_map.json


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=50, training_loss=7.819699096679687, metrics={'train_runtime': 17.9658, 'train_samples_per_second': 44.529, 'train_steps_per_second': 2.783, 'total_flos': 210505998336000.0, 'train_loss': 7.819699096679687, 'epoch': 0.0})

In [None]:
# trainer.save_model("save_model")