In [1]:
import pandas as pd
from datasets import Dataset
from transformers import (RobertaForMaskedLM, RobertaTokenizer, DataCollatorForLanguageModeling, Trainer,
                          TrainingArguments)

In [2]:
model_name = 'roberta-large'

# Both objects are loaded from the same pretrained checkpoint name.
tokenizer = RobertaTokenizer.from_pretrained(model_name)
model = RobertaForMaskedLM.from_pretrained(model_name)

# Show which encoded fields the tokenizer reports as model inputs.
print(tokenizer.model_input_names)

['input_ids', 'attention_mask']


In [3]:
# Only the `excerpt` column of each split is kept.
train_data = pd.read_csv('../../datasets/train.csv')['excerpt']
test_data = pd.read_csv('../../datasets/test.csv')['excerpt']

# Stack train and test excerpts into one Series with a fresh 0..n-1 index.
all_data = pd.concat([train_data, test_data], ignore_index=True)
all_data

0       When the young people returned to the ballroom...
1       All through dinner time, Mrs. Fayre was somewh...
2       As Roger had predicted, the snow departed as q...
3       And outside before the palace a great garden w...
4       Once upon a time there were Three Bears who li...
                              ...                        
2836    It was a bright and cheerful scene that greete...
2837    Cell division is the process by which a parent...
2838    Debugging is the process of finding and resolv...
2839    To explain transitivity, let us look first at ...
2840    Milka and John are playing in the garden. Her ...
Name: excerpt, Length: 2841, dtype: object

In [4]:
# Wrap the raw excerpt values in a single-column frame named `text`,
# then convert that frame into a `Dataset`.
text_frame = pd.DataFrame(all_data.values, columns=['text'])
dataset = Dataset.from_pandas(text_frame)
dataset

Dataset({
    features: ['text'],
    num_rows: 2841
})

In [5]:
def filter_func(data):
    """Decide whether a single example should be kept.

    Args:
        data: mapping with a 'text' entry holding the example's text
            (a string, or None for a missing value).

    Returns:
        True when the text contains at least one non-whitespace character;
        False for None, the empty string, or whitespace-only text.
    """
    text = data['text']
    # Treat a missing value as blank so the row is dropped instead of
    # raising TypeError from len(None).
    if text is None:
        return False
    return len(text) > 0 and not text.isspace()  # filter out blank lines


def map_func(data):
    """Tokenize the 'text' entry of `data` with the notebook-level tokenizer.

    The tokenizer is called with truncation enabled and padding set to
    "max_length" with max_length=512.

    Args:
        data: mapping with a 'text' entry (a batch of strings when this
            function is applied with batched=True).

    Returns:
        dict with the 'input_ids' and 'attention_mask' entries of the
        tokenizer output.
    """
    encoded = tokenizer(data['text'], truncation=True, padding="max_length", max_length=512)
    kept_fields = ('input_ids', 'attention_mask')
    return {field: encoded[field] for field in kept_fields}


# Drop blank examples first, then tokenize the remainder.
dataset_filter = dataset.filter(function=filter_func)
dataset_map = dataset_filter.map(
    function=map_func,
    batched=True,
    batch_size=1000,  # process 1000 examples per call
)

  0%|          | 0/3 [00:00<?, ?ba/s]

  0%|          | 0/3 [00:00<?, ?ba/s]

In [6]:
# Collator for masked language modeling (mlm=True), configured with
# mlm_probability=0.15 and the tokenizer loaded above.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=True,
    mlm_probability=0.15,
)
data_collator

DataCollatorForLanguageModeling(tokenizer=PreTrainedTokenizer(name_or_path='roberta-large', vocab_size=50265, model_max_len=512, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'bos_token': AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'eos_token': AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'unk_token': AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'sep_token': AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'pad_token': AddedToken("<pad>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'cls_token': AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'mask_token': AddedToken("<mask>", rstrip=False, lstrip=True, single_word=False, normalized=True)}), mlm=True, mlm_probability=0.15, pad_to_multiple_of=None, tf_experimental_compile=False, return_tensors='pt')

In [7]:
# Training configuration: 5 epochs, batch size 8 per device, with logging
# and checkpoint saving both set to the 'epoch' strategy.
training_args = TrainingArguments(
    # where checkpoints go
    output_dir='output_dir',
    overwrite_output_dir=True,
    # schedule and batch sizes
    num_train_epochs=5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # reporting / checkpoint cadence
    logging_strategy='epoch',
    save_strategy='epoch',
    disable_tqdm=False,  # False leaves the tqdm progress display enabled
    # reproducibility
    seed=42,
)

# Continue training the already-pretrained model on the tokenized excerpts.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset_map,
    data_collator=data_collator,
    tokenizer=tokenizer,
)

trainer.train()

The following columns in the training set  don't have a corresponding argument in `RobertaForMaskedLM.forward` and have been ignored: text. If text are not expected by `RobertaForMaskedLM.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 2841
  Num Epochs = 5
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 1780


Step,Training Loss
356,1.4423
712,1.3319
1068,1.2492
1424,1.1772
1780,1.0997


Saving model checkpoint to output_dir/checkpoint-356
Configuration saved in output_dir/checkpoint-356/config.json
Model weights saved in output_dir/checkpoint-356/pytorch_model.bin
tokenizer config file saved in output_dir/checkpoint-356/tokenizer_config.json
Special tokens file saved in output_dir/checkpoint-356/special_tokens_map.json
Saving model checkpoint to output_dir/checkpoint-712
Configuration saved in output_dir/checkpoint-712/config.json
Model weights saved in output_dir/checkpoint-712/pytorch_model.bin
tokenizer config file saved in output_dir/checkpoint-712/tokenizer_config.json
Special tokens file saved in output_dir/checkpoint-712/special_tokens_map.json
Saving model checkpoint to output_dir/checkpoint-1068
Configuration saved in output_dir/checkpoint-1068/config.json
Model weights saved in output_dir/checkpoint-1068/pytorch_model.bin
tokenizer config file saved in output_dir/checkpoint-1068/tokenizer_config.json
Special tokens file saved in output_dir/checkpoint-1068/sp

TrainOutput(global_step=1780, training_loss=1.260038997350114, metrics={'train_runtime': 987.8622, 'train_samples_per_second': 14.38, 'train_steps_per_second': 1.802, 'total_flos': 1.324027838080512e+16, 'train_loss': 1.260038997350114, 'epoch': 5.0})

In [8]:
# Persist the further-trained model weights and config.
model.save_pretrained('save_model/')
# Also write the tokenizer files to the same directory so that `save_model/`
# can be passed on its own to both the model's and the tokenizer's
# `from_pretrained`.
tokenizer.save_pretrained('save_model/')

Configuration saved in save_model/config.json
Model weights saved in save_model/pytorch_model.bin
