# **執行完重新啟動 (Restart the runtime after running this cell)**

In [None]:
! pip install -U accelerate
! pip install -U transformers

In [None]:
from huggingface_hub import notebook_login

# Ask for Hugging Face credentials through an interactive notebook widget.
# Presumably required by the push_to_hub calls later in this notebook.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

# **Masked language model**

Masked language modeling predicts a masked token in a sequence, and the model can attend to tokens bidirectionally. This means the model has full access to the tokens on the left and right. Masked language modeling is great for tasks that require a good contextual understanding of an entire sequence.

In [None]:
!pip install transformers datasets evaluate



# **Load ELI5 dataset**

In [None]:
from datasets import load_dataset

# Load only the first 5,000 rows of the "train_asks" split to keep the run small.
eli5 = load_dataset("eli5", split="train_asks[:5000]")

# **train test split**

In [None]:
# Hold out 20% of the rows for evaluation. The fixed seed makes the shuffle --
# and therefore the reported losses and perplexity -- reproducible after a
# kernel restart.
eli5 = eli5.train_test_split(test_size=0.2, seed=42)

# **Dataset format dictionary**

In [None]:
# Flatten nested fields into top-level columns with dotted names
# (e.g. "answers.text"), then display the first training example.
eli5 = eli5.flatten()
eli5["train"][0]

{'q_id': '13eoiq',
 'title': 'How does something like this form naturally?',
 'selftext': '_URL_0_\n\nIt just seems like such a major coincidence.  How does such a huge boulder come to rest on a spot like that and stay balanced?',
 'document': '',
 'subreddit': 'askscience',
 'answers.a_id': ['c73ah7d', 'c73a75c'],
 'answers.text': ["In addition to what esbenab said, think about this: there are many *many* more boulders that did not manage to come to rest like this one (and just fell off making two side by side boulders), but we don't really take that into consideration when we think about how this one could have possibly settled down this way. \n\nIt's sort of like [confirmation bias](_URL_0_). The same thing happens when a streetlight goes out while you're walking under it: we think that it's an amazing coincidence, but we don't take into account how many streetlights *didn't* turn off while we walked under them. Statistically, major coincidences happen.",
  'Ice age, glacier pushes 

## **Join the list of strings for each example and tokenize the result**

In [None]:
from transformers import AutoTokenizer

# Load the tokenizer published with the "distilroberta-base" checkpoint.
tokenizer = AutoTokenizer.from_pretrained("distilroberta-base")

def preprocess_function(examples):
    """Tokenize a batch after merging each example's answers into one string.

    ``examples["answers.text"]`` holds, per example, a list of answer strings.
    Each list is joined with single spaces, and the resulting list of strings
    is passed to the module-level ``tokenizer`` in one call; whatever the
    tokenizer returns is returned unchanged.
    """
    merged_answers = list(map(" ".join, examples["answers.text"]))
    return tokenizer(merged_answers)

# **Dataset process**

In [None]:
# Tokenize every split in batches using 4 worker processes.
tokenized_eli5 = eli5.map(
    preprocess_function,
    batched=True,
    num_proc=4,
    # Drop all original columns so only the columns produced by
    # preprocess_function remain.
    remove_columns=eli5["train"].column_names,
)

Map (num_proc=4):   0%|          | 0/4000 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (793 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (885 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (549 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1870 > 512). Running this sequence through the model will result in indexing errors


Map (num_proc=4):   0%|          | 0/1000 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (560 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (960 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (870 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (557 > 512). Running this sequence through the model will result in indexing errors


This dataset contains the token sequences, but some of them are longer than the maximum input length for the model (see the warnings above). To handle this, define a second preprocessing function that will:

1. Concatenate all the sequences.
2. Split the concatenated sequences into shorter chunks of length `block_size`.

In [None]:
from itertools import chain

# Length (in tokens) of each chunk produced by group_texts.
block_size = 128


def group_texts(examples):
    """Concatenate a batch of tokenized sequences and re-split it into chunks.

    Args:
        examples: mapping from column name to a list of per-example lists,
            e.g. ``{"input_ids": [[...], [...]], "attention_mask": [...]}``.

    Returns:
        A dict with the same keys whose values are lists of chunks of
        ``block_size`` items. The trailing remainder that does not fill a
        whole block is dropped, unless the entire batch is shorter than
        ``block_size``, in which case it is returned as one short chunk.
        A mapping without columns yields an empty dict.
    """
    keys = list(examples.keys())
    # No columns at all: nothing to group (indexing keys[0] would fail).
    if not keys:
        return {}
    # Flatten each column in linear time. sum(lists, []) copies the growing
    # accumulator on every addition, which is quadratic in the batch size.
    concatenated_examples = {k: list(chain.from_iterable(examples[k])) for k in keys}
    total_length = len(concatenated_examples[keys[0]])
    # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
    # customize this part to your needs.
    if total_length >= block_size:
        total_length = (total_length // block_size) * block_size
    # Split by chunks of block_size.
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    return result

In [None]:
# Re-chunk every split with group_texts, batch by batch, using 4 worker processes.
lm_dataset = tokenized_eli5.map(group_texts, batched=True, num_proc=4)

Map (num_proc=4):   0%|          | 0/4000 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/1000 [00:00<?, ? examples/s]

**DataCollatorForLanguageModeling**

It is more efficient to dynamically pad the sentences to the longest length in a batch during collation than to pad the whole dataset to the maximum length.

Use the end-of-sequence token as the padding token and specify `mlm_probability` to randomly mask tokens each time you iterate over the data.

In [None]:
from transformers import DataCollatorForLanguageModeling

# Reuse the end-of-sequence token as the padding token.
# NOTE(review): this replaces any pad token the loaded tokenizer already
# defines -- confirm that is intended for this checkpoint.
tokenizer.pad_token = tokenizer.eos_token
# Collator for language modeling, configured with a masking probability of 0.15.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15)

# **Load DistilRoBERTa with AutoModelForMaskedLM**



In [None]:
from transformers import AutoModelForMaskedLM

# Load the pretrained "distilroberta-base" weights into a masked-LM model.
model = AutoModelForMaskedLM.from_pretrained("distilroberta-base")

Some weights of the model checkpoint at distilroberta-base were not used when initializing RobertaForMaskedLM: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


1. Define your training hyperparameters in TrainingArguments. The only required parameter is output_dir which specifies where to save your model.
2. Pass the training arguments to Trainer along with the model, datasets, and data collator.

In [None]:
from transformers import TrainingArguments, Trainer
# Hyperparameters for the fine-tuning run.
training_args = TrainingArguments(
    # Directory that receives the checkpoints (checkpoint-500, checkpoint-1000, ...).
    output_dir="my_awesome_eli5_mlm_model",
    # Evaluate once per epoch.
    # NOTE(review): newer transformers releases may expect `eval_strategy`
    # instead -- confirm against the installed version.
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    num_train_epochs=3,
    weight_decay=0.01,
    # Presumably uploads the run to the Hub; requires the login performed earlier -- TODO confirm.
    push_to_hub=True,
)

# Wire the model, the chunked train/test splits and the masking collator together.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=lm_dataset["train"],
    eval_dataset=lm_dataset["test"],
    data_collator=data_collator,
)

# Run fine-tuning; as the last expression, the returned TrainOutput is displayed.
trainer.train()

You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss
1,2.238,2.079274
2,2.1526,2.012057
3,2.1249,2.015891


Checkpoint destination directory my_awesome_eli5_mlm_model/checkpoint-500 already exists and is non-empty.Saving will proceed but saved results may be invalid.
Checkpoint destination directory my_awesome_eli5_mlm_model/checkpoint-1000 already exists and is non-empty.Saving will proceed but saved results may be invalid.
Checkpoint destination directory my_awesome_eli5_mlm_model/checkpoint-1500 already exists and is non-empty.Saving will proceed but saved results may be invalid.
Checkpoint destination directory my_awesome_eli5_mlm_model/checkpoint-2000 already exists and is non-empty.Saving will proceed but saved results may be invalid.
Checkpoint destination directory my_awesome_eli5_mlm_model/checkpoint-2500 already exists and is non-empty.Saving will proceed but saved results may be invalid.
Checkpoint destination directory my_awesome_eli5_mlm_model/checkpoint-3000 already exists and is non-empty.Saving will proceed but saved results may be invalid.


TrainOutput(global_step=3408, training_loss=2.1863897977300653, metrics={'train_runtime': 769.0391, 'train_samples_per_second': 35.452, 'train_steps_per_second': 4.432, 'total_flos': 903950233731072.0, 'train_loss': 2.1863897977300653, 'epoch': 3.0})

# **Perplexity**

In [None]:
import math

# Evaluate on the held-out split and report perplexity, computed here as the
# exponential of the evaluation loss.
eval_results = trainer.evaluate()
print(f"Perplexity: {math.exp(eval_results['eval_loss']):.2f}")

Perplexity: 7.24


In [None]:
# Upload the fine-tuned model to the Hub repository 'eli5_mlm_model'.
model.push_to_hub('eli5_mlm_model')

model.safetensors:   0%|          | 0.00/329M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/david965154/eli5_mlm_model/commit/0b7ad16eef02fd020d6040fd6b3441d181f80307', commit_message='Upload RobertaForMaskedLM', commit_description='', oid='0b7ad16eef02fd020d6040fd6b3441d181f80307', pr_url=None, pr_revision=None, pr_num=None)

In [None]:
# Upload the tokenizer to the same Hub repository so the model can be loaded
# together with its tokenizer.
tokenizer.push_to_hub('eli5_mlm_model')

CommitInfo(commit_url='https://huggingface.co/david965154/eli5_mlm_model/commit/9e4a2088b5e7d4ac6488ade07e5977702718e535', commit_message='Upload tokenizer', commit_description='', oid='9e4a2088b5e7d4ac6488ade07e5977702718e535', pr_url=None, pr_revision=None, pr_num=None)

In [None]:
from transformers import pipeline

# Build a fill-mask pipeline from the Hub repository "david965154/eli5_mlm_model".
mask_filler = pipeline("fill-mask", "david965154/eli5_mlm_model")

# **Predict mask**

In [None]:
# Example sentence; "<mask>" marks the position the model should fill in.
# NOTE(review): "learing" looks like a typo for "learning". It is left as-is
# because the recorded outputs were produced from this exact string.
text = "Deep learing is a <mask> in this laboratory."

In [None]:
# Show the three highest-scoring candidates for the masked position.
mask_filler(text, top_k=3)

[{'score': 0.18199005722999573,
  'token': 936,
  'token_str': ' problem',
  'sequence': 'Deep learing is a problem in this laboratory.'},
 {'score': 0.06399531662464142,
  'token': 33984,
  'token_str': ' rarity',
  'sequence': 'Deep learing is a rarity in this laboratory.'},
 {'score': 0.056482452899217606,
  'token': 2212,
  'token_str': ' concern',
  'sequence': 'Deep learing is a concern in this laboratory.'}]

In [None]:
from transformers import AutoTokenizer
import torch
import torchvision
# Load the tokenizer from the Hub repository "david965154/eli5_mlm_model".
tokenizer = AutoTokenizer.from_pretrained("david965154/eli5_mlm_model")
# Encode the sentence and return PyTorch tensors.
inputs = tokenizer(text, return_tensors="pt")
# torch.where yields one index tensor per dimension; [1] keeps the second one,
# i.e. the position(s) of the mask token along the sequence axis.
mask_token_index = torch.where(inputs["input_ids"] == tokenizer.mask_token_id)[1]

In [None]:
from transformers import AutoModelForMaskedLM
import torch

# Load the fine-tuned masked-LM checkpoint from the Hub.
model = AutoModelForMaskedLM.from_pretrained("david965154/eli5_mlm_model")
# Inference only: disable gradient tracking so no autograd graph is built or
# kept alive for the forward pass.
with torch.no_grad():
    logits = model(**inputs).logits
# Keep the vocabulary logits at the mask position(s) of the first sequence.
mask_token_logits = logits[0, mask_token_index, :]

In [None]:
# Ids of the three highest-scoring vocabulary entries for the first mask position.
top_3_tokens = torch.topk(mask_token_logits, 3, dim=1).indices[0].tolist()

# Substitute each candidate into the sentence. The decoded tokens carry their
# own leading space (e.g. ' problem'), so the printed sentences show a double
# space before the filled-in word.
for token in top_3_tokens:
    print(text.replace(tokenizer.mask_token, tokenizer.decode([token])))

Deep learing is a  problem in this laboratory.
Deep learing is a  rarity in this laboratory.
Deep learing is a  concern in this laboratory.
