先简单测试一下效果

In [1]:
from transformers import pipeline, AutoTokenizer, AutoModelForMaskedLM, BertConfig, BertForMaskedLM

# Model identifier shared by this section; the tokenizer is loaded from the same checkpoint.
checkpoint = "google-bert/bert-base-chinese"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [None]:
# Reference model: weights loaded from the published checkpoint.
fine_tuned_model = AutoModelForMaskedLM.from_pretrained(checkpoint)

# Baseline model: same architecture class, but built from a bare config, so no
# checkpoint weights are loaded. Only the vocabulary size is taken from the tokenizer.
baseline_config = BertConfig(vocab_size=tokenizer.vocab_size)
un_fine_tuned_model = BertForMaskedLM(baseline_config)

In [None]:
# Sanity check with the checkpoint-loaded model: fill the single [MASK] slot.
text = "有人问一位智者：“我觉得自己很有能力，可为什么没人[MASK]赏我呢？”智者随手捡起一块石头向远处扔去，接着叫他去捡回来，他说做不到。"

pipe = pipeline(
    task="fill-mask",
    model=fine_tuned_model,
    tokenizer=tokenizer,
    device=0,  # device index 0 is requested explicitly
)
result = pipe(text, top_k=1)  # top_k=1: ask only for the best candidate
best_guess = result[0]
print(best_guess["sequence"])

In [None]:
# Same prompt, now run through the config-only (no loaded weights) model for comparison.
text = "有人问一位智者：“我觉得自己很有能力，可为什么没人[MASK]赏我呢？”智者随手捡起一块石头向远处扔去，接着叫他去捡回来，他说做不到。"

pipe = pipeline(
    task="fill-mask",
    model=un_fine_tuned_model,
    tokenizer=tokenizer,
    device=0,  # device index 0 is requested explicitly
)
result = pipe(text, top_k=1)  # top_k=1: ask only for the best candidate
best_guess = result[0]
print(best_guess["sequence"])

# 开始训练这个模型

In [1]:
from datasets import load_dataset
from transformers import AutoModelForMaskedLM, AutoTokenizer, Trainer, TrainingArguments, DataCollatorForLanguageModeling, BertForMaskedLM, BertConfig

In [None]:
from datasets import load_dataset

# Load the "shaowenchen/wiki_zh" dataset and print its structure (splits / columns).
ds = load_dataset("shaowenchen/wiki_zh")
print(ds)

In [None]:
# Peek at the first 200 characters of the "text" field of one training example.
ds["train"][42]["text"][:200]

In [None]:
# Re-split the original "train" portion into train/test with a fixed seed.
# NOTE: this rebinds `ds`, so re-running the cell splits the already-split
# train portion again; restart from the load cell for a clean run.
full_train = ds["train"]
ds = full_train.train_test_split(
    test_size=10,
    shuffle=True,
    seed=42,
)
ds

In [None]:
checkpoint = "google-bert/bert-base-chinese"

# Load the tokenizer first so the embedding size is derived from it instead of
# being hard-coded; the model's vocabulary then always matches the tokenizer
# that produces its input ids.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# Built from a bare config: no checkpoint weights are loaded, the model is
# trained from scratch below.
model = BertForMaskedLM(BertConfig(vocab_size=tokenizer.vocab_size))

In [6]:
def processFunciton(examples):
    """Tokenize the ``"text"`` entry of a batch of examples.

    Args:
        examples: Mapping with a ``"text"`` key; its value is handed to the
            notebook-level ``tokenizer`` unchanged.

    Returns:
        The tokenizer's output for those texts, with truncation enabled.
    """
    texts = examples["text"]
    encoded = tokenizer(texts, truncation=True)
    return encoded

In [None]:
# Tokenize every split in batches and drop the original columns so that only
# the tokenizer's outputs remain.
original_columns = ds["train"].column_names
ds = ds.map(
    processFunciton,
    batched=True,
    remove_columns=original_columns,
)
ds

In [8]:
# Hyper-parameters for the masked-LM pre-training run.
args = TrainingArguments(
    output_dir="output/mlm_pretrained",
    per_device_train_batch_size=24,
    fp16=True,  # mixed-precision training; needs hardware that supports it
    max_steps=10000,
    # Checkpoint every 500 steps, keeping only the 3 most recent on disk.
    save_strategy="steps",
    save_steps=500,
    save_total_limit=3,
    # The Trainer below receives an eval_dataset, but evaluation is disabled
    # unless a strategy is set. Evaluate on the same cadence as checkpointing
    # so each saved checkpoint has a matching eval loss in the logs.
    eval_strategy="steps",
    eval_steps=500,
    logging_steps=10,
    learning_rate=1e-4,
    warmup_steps=1000,
)

In [None]:
# Collator for language-model batches, created with its default settings
# (only the tokenizer is supplied).
mlm_collator = DataCollatorForLanguageModeling(tokenizer)

# Wire model, arguments, data splits and collator together.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=ds["train"],
    eval_dataset=ds["test"],
    data_collator=mlm_collator,
    processing_class=tokenizer,
)

In [None]:
# Run the training loop configured above; checkpoints are written under args.output_dir.
trainer.train()

# 验证训练效果

In [1]:
from transformers import pipeline, AutoTokenizer, AutoModelForMaskedLM

In [None]:
# The tokenizer still comes from the original hub checkpoint; the model weights
# come from the step-10000 checkpoint written by the training section.
checkpoint = "google-bert/bert-base-chinese"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# NOTE: despite its name, this variable holds the model trained above.
trained_checkpoint_dir = "output/mlm_pretrained/checkpoint-10000"
un_fine_tuned_model = AutoModelForMaskedLM.from_pretrained(trained_checkpoint_dir)

# Same prompt as in the first section, so the outputs can be compared directly.
text = "有人问一位智者：“我觉得自己很有能力，可为什么没人[MASK]赏我呢？”智者随手捡起一块石头向远处扔去，接着叫他去捡回来，他说做不到。"

pipe = pipeline(
    task="fill-mask",
    model=un_fine_tuned_model,
    tokenizer=tokenizer,
    device=0,  # device index 0 is requested explicitly
)
result = pipe(text, top_k=1)  # top_k=1: ask only for the best candidate
best_guess = result[0]
print(best_guess["sequence"])