In [1]:
from datasets import load_dataset

raw_datasets = load_dataset("conll2003")

Reusing dataset conll2003 (C:\Users\danie\.cache\huggingface\datasets\conll2003\conll2003\1.0.0\63f4ebd1bcb7148b1644497336fd74643d4ce70123334431a3c053b7ee4e96ee)


  0%|          | 0/3 [00:00<?, ?it/s]

In [2]:
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 14042
    })
    validation: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3251
    })
    test: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3454
    })
})

In [3]:
raw_datasets["train"][0]["tokens"]

['EU', 'rejects', 'German', 'call', 'to', 'boycott', 'British', 'lamb', '.']

In [4]:
raw_datasets["train"][0]["ner_tags"]

[3, 0, 7, 0, 0, 0, 7, 0, 0]

In [5]:
ner_feature = raw_datasets["train"].features["ner_tags"]
ner_feature

Sequence(feature=ClassLabel(num_classes=9, names=['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC'], names_file=None, id=None), length=-1, id=None)

In [6]:
label_names = ner_feature.feature.names
label_names

['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']

In [7]:
words = raw_datasets["train"][10]["tokens"]
labels = raw_datasets["train"][10]["ner_tags"]
line1 = ""
line2 = ""
for word, label in zip(words, labels):
    full_label = label_names[label]
    max_length = max(len(word), len(full_label))
    line1 += word + " " * (max_length - len(word) + 1)
    line2 += full_label + " " * (max_length - len(full_label) + 1)

print(line1)
print(line2)

Spanish Farm Minister Loyola de    Palacio had earlier accused Fischler at an EU    farm ministers ' meeting of causing unjustified alarm through " dangerous generalisation . " 
B-MISC  O    O        B-PER  I-PER I-PER   O   O       O       B-PER    O  O  B-ORG O    O         O O       O  O       O           O     O       O O         O              O O 


In [8]:
from transformers import AutoTokenizer

model_checkpoint = "bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [9]:
inputs = tokenizer(raw_datasets["train"][0]["tokens"], is_split_into_words=True)
inputs.tokens()

['[CLS]',
 'EU',
 'rejects',
 'German',
 'call',
 'to',
 'boycott',
 'British',
 'la',
 '##mb',
 '.',
 '[SEP]']

In [10]:
inputs.word_ids()

[None, 0, 1, 2, 3, 4, 5, 6, 7, 7, 8, None]

In [11]:
def align_labels_with_tokens(labels, word_ids):
    new_labels = []
    current_word = None
    for word_id in word_ids:
        if word_id != current_word:
            # Start of a new word!
            current_word = word_id
            label = -100 if word_id is None else labels[word_id]
            new_labels.append(label)
        elif word_id is None:
            # Special token
            new_labels.append(-100)
        else:
            # Same word as previous token
            label = labels[word_id]
            # If the label is B-XXX we change it to I-XXX
            if label % 2 == 1:
                label += 1
            new_labels.append(label)

    return new_labels

In [12]:
labels = raw_datasets["train"][0]["ner_tags"]
word_ids = inputs.word_ids()
print(labels)
print(align_labels_with_tokens(labels, word_ids))

[3, 0, 7, 0, 0, 0, 7, 0, 0]
[-100, 3, 0, 7, 0, 0, 0, 7, 0, 0, 0, -100]


In [13]:
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples["tokens"], truncation=True, is_split_into_words=True
    )
    all_labels = examples["ner_tags"]
    new_labels = []
    for i, labels in enumerate(all_labels):
        word_ids = tokenized_inputs.word_ids(i)
        new_labels.append(align_labels_with_tokens(labels, word_ids))

    tokenized_inputs["labels"] = new_labels
    return tokenized_inputs

In [14]:
tokenized_datasets = raw_datasets.map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns=raw_datasets["train"].column_names,
)

Loading cached processed dataset at C:\Users\danie\.cache\huggingface\datasets\conll2003\conll2003\1.0.0\63f4ebd1bcb7148b1644497336fd74643d4ce70123334431a3c053b7ee4e96ee\cache-93ce4055740cd32a.arrow
Loading cached processed dataset at C:\Users\danie\.cache\huggingface\datasets\conll2003\conll2003\1.0.0\63f4ebd1bcb7148b1644497336fd74643d4ce70123334431a3c053b7ee4e96ee\cache-15f94d483a6136cc.arrow


  0%|          | 0/4 [00:00<?, ?ba/s]

In [15]:
from transformers import DataCollatorForTokenClassification

data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [16]:
batch = data_collator([tokenized_datasets["train"][i] for i in range(2)])
batch["labels"]

tensor([[-100,    3,    0,    7,    0,    0,    0,    7,    0,    0,    0, -100],
        [-100,    1,    2, -100, -100, -100, -100, -100, -100, -100, -100, -100]])

In [17]:
for i in range(2):
    print(tokenized_datasets["train"][i]["labels"])

[-100, 3, 0, 7, 0, 0, 0, 7, 0, 0, 0, -100]
[-100, 1, 2, -100]


In [18]:
!pip install seqeval



In [19]:
from datasets import load_metric

metric = load_metric("seqeval")

In [20]:
labels = raw_datasets["train"][0]["ner_tags"]
labels = [label_names[i] for i in labels]
labels

['B-ORG', 'O', 'B-MISC', 'O', 'O', 'O', 'B-MISC', 'O', 'O']

In [21]:
predictions = labels.copy()
predictions[2] = "O"
metric.compute(predictions=[predictions], references=[labels])

{'MISC': {'precision': 1.0,
  'recall': 0.5,
  'f1': 0.6666666666666666,
  'number': 2},
 'ORG': {'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'number': 1},
 'overall_precision': 1.0,
 'overall_recall': 0.6666666666666666,
 'overall_f1': 0.8,
 'overall_accuracy': 0.8888888888888888}

In [22]:
import numpy as np

def compute_metrics(eval_preds):
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)

    # Remove ignored index (special tokens) and convert to labels
    true_labels = [[label_names[l] for l in label if l != -100] for label in labels]
    true_predictions = [
        [label_names[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    all_metrics = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": all_metrics["overall_precision"],
        "recall": all_metrics["overall_recall"],
        "f1": all_metrics["overall_f1"],
        "accuracy": all_metrics["overall_accuracy"],
    }

In [23]:
id2label = {str(i): label for i, label in enumerate(label_names)}
label2id = {v: k for k, v in id2label.items()}

In [24]:
from transformers import AutoModelForTokenClassification

model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    id2label=id2label,
    label2id=label2id,
)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForTokenClassification: ['cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cas

In [25]:
model.config.num_labels

9

In [26]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center>\n<img src=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [27]:
from transformers import TrainingArguments

args = TrainingArguments(
    "bert-finetuned-ner",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    num_train_epochs=3,
    weight_decay=0.01,
    push_to_hub=False,
)

In [28]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
)
trainer.train()

***** Running training *****
  Num examples = 14042
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 5268


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.0862,0.061311,0.924416,0.938573,0.931441,0.983973
2,0.0333,0.061179,0.93096,0.946314,0.938575,0.985239
3,0.0232,0.062581,0.935687,0.950017,0.942797,0.986092


***** Running Evaluation *****
  Num examples = 3251
  Batch size = 8
Saving model checkpoint to bert-finetuned-ner\checkpoint-1756
Configuration saved in bert-finetuned-ner\checkpoint-1756\config.json
Model weights saved in bert-finetuned-ner\checkpoint-1756\pytorch_model.bin
tokenizer config file saved in bert-finetuned-ner\checkpoint-1756\tokenizer_config.json
Special tokens file saved in bert-finetuned-ner\checkpoint-1756\special_tokens_map.json
***** Running Evaluation *****
  Num examples = 3251
  Batch size = 8
Saving model checkpoint to bert-finetuned-ner\checkpoint-3512
Configuration saved in bert-finetuned-ner\checkpoint-3512\config.json
Model weights saved in bert-finetuned-ner\checkpoint-3512\pytorch_model.bin
tokenizer config file saved in bert-finetuned-ner\checkpoint-3512\tokenizer_config.json
Special tokens file saved in bert-finetuned-ner\checkpoint-3512\special_tokens_map.json
***** Running Evaluation *****
  Num examples = 3251
  Batch size = 8
Saving model checkpoin

TrainOutput(global_step=5268, training_loss=0.06740947917454516, metrics={'train_runtime': 441.2424, 'train_samples_per_second': 95.471, 'train_steps_per_second': 11.939, 'total_flos': 921690774203400.0, 'train_loss': 0.06740947917454516, 'epoch': 3.0})

In [29]:
from torch.utils.data import DataLoader

train_dataloader = DataLoader(
    tokenized_datasets["train"],
    shuffle=True,
    collate_fn=data_collator,
    batch_size=8,
)
eval_dataloader = DataLoader(
    tokenized_datasets["validation"], collate_fn=data_collator, batch_size=8
)

In [30]:
model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    id2label=id2label,
    label2id=label2id,
)

loading configuration file https://huggingface.co/bert-base-cased/resolve/main/config.json from cache at C:\Users\danie/.cache\huggingface\transformers\a803e0468a8fe090683bdc453f4fac622804f49de86d7cecaee92365d4a0f829.a64a22196690e0e82ead56f388a3ef3a50de93335926ccfa20610217db589307
Model config BertConfig {
  "_name_or_path": "bert-base-cased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "O",
    "1": "B-PER",
    "2": "I-PER",
    "3": "B-ORG",
    "4": "I-ORG",
    "5": "B-LOC",
    "6": "I-LOC",
    "7": "B-MISC",
    "8": "I-MISC"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "B-LOC": "5",
    "B-MISC": "7",
    "B-ORG": "3",
    "B-PER": "1",
    "I-LOC": "6",
    "I-MISC": "8",
    "I-ORG": "4",
    "I-PER": "2",
    "O": "0"
  },
  "layer

In [31]:
from torch.optim import AdamW

optimizer = AdamW(model.parameters(), lr=2e-5)

In [32]:
from transformers import get_scheduler

num_epochs = 3
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)
print(num_training_steps)

5268


In [33]:
pip install accelerate

Note: you may need to restart the kernel to use updated packages.


In [34]:
from accelerate import Accelerator

accelerator = Accelerator()
model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
    model, optimizer, train_dataloader, eval_dataloader
)

In [35]:
def postprocess(predictions, labels):
    predictions = predictions.detach().cpu().clone().numpy()
    labels = labels.detach().cpu().clone().numpy()

    # Remove ignored index (special tokens) and convert to labels
    true_labels = [[label_names[l] for l in label if l != -100] for label in labels]
    true_predictions = [
        [label_names[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    return true_labels, true_predictions

In [38]:
from tqdm.auto import tqdm
import torch
num_train_epochs = 3
progress_bar = tqdm(range(num_training_steps))

for epoch in range(num_train_epochs):
    # Training
    model.train()
    for batch in train_dataloader:
        outputs = model(**batch)
        loss = outputs.loss
        accelerator.backward(loss)

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

    # Evaluation
    model.eval()
    for batch in eval_dataloader:
        with torch.no_grad():
            outputs = model(**batch)

        predictions = outputs.logits.argmax(dim=-1)
        labels = batch["labels"]

        # Necessary to pad predictions and labels for being gathered
        predictions = accelerator.pad_across_processes(predictions, dim=1, pad_index=-100)
        labels = accelerator.pad_across_processes(labels, dim=1, pad_index=-100)

        predictions_gathered = accelerator.gather(predictions)
        labels_gathered = accelerator.gather(labels)

        true_predictions, true_labels = postprocess(predictions_gathered, labels_gathered)
        metric.add_batch(predictions=true_predictions, references=true_labels)

    results = metric.compute()
    print(
        f"epoch {epoch}:",
        {
            key: results[f"overall_{key}"]
            for key in ["precision", "recall", "f1", "accuracy"]
        },
    )

  0%|          | 0/5268 [00:00<?, ?it/s]

epoch 0: {'precision': 0.9276337933355773, 'recall': 0.9027186374058304, 'f1': 0.9150066401062417, 'accuracy': 0.9807499852828634}
epoch 1: {'precision': 0.9404240996297543, 'recall': 0.9081748740451812, 'f1': 0.9240181893344357, 'accuracy': 0.9829869900512156}
epoch 2: {'precision': 0.9493436553349041, 'recall': 0.9293245469522241, 'f1': 0.9392274392274392, 'accuracy': 0.9860923058809677}


In [39]:
from transformers import pipeline

# Replace this with your own checkpoint
model_checkpoint = "huggingface-course/bert-finetuned-ner"
token_classifier = pipeline(
    "token-classification", model=model_checkpoint, aggregation_strategy="simple"
)
token_classifier("My name is Sylvain and I work at Hugging Face in Brooklyn.")

https://huggingface.co/huggingface-course/bert-finetuned-ner/resolve/main/config.json not found in cache or force_download set to True, downloading to C:\Users\danie\.cache\huggingface\transformers\tmp_b28u1xl


Downloading:   0%|          | 0.00/1.01k [00:00<?, ?B/s]

storing https://huggingface.co/huggingface-course/bert-finetuned-ner/resolve/main/config.json in cache at C:\Users\danie/.cache\huggingface\transformers\89b5cbe52d0bb00707c92604aa32923af1e03e431c6dc7755afa8e12736b8611.61f3554ca83a9cd2ab0687ff37a7d0f4065bc064c4f297e5d2d42e322e4c5f26
creating metadata file for C:\Users\danie/.cache\huggingface\transformers\89b5cbe52d0bb00707c92604aa32923af1e03e431c6dc7755afa8e12736b8611.61f3554ca83a9cd2ab0687ff37a7d0f4065bc064c4f297e5d2d42e322e4c5f26
loading configuration file https://huggingface.co/huggingface-course/bert-finetuned-ner/resolve/main/config.json from cache at C:\Users\danie/.cache\huggingface\transformers\89b5cbe52d0bb00707c92604aa32923af1e03e431c6dc7755afa8e12736b8611.61f3554ca83a9cd2ab0687ff37a7d0f4065bc064c4f297e5d2d42e322e4c5f26
Model config BertConfig {
  "_name_or_path": "huggingface-course/bert-finetuned-ner",
  "architectures": [
    "BertForTokenClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": n

Downloading:   0%|          | 0.00/411M [00:00<?, ?B/s]

storing https://huggingface.co/huggingface-course/bert-finetuned-ner/resolve/main/pytorch_model.bin in cache at C:\Users\danie/.cache\huggingface\transformers\53be13866b5e6a7bf270c198a504ba00a1d6c99765a5425c8d8a6f67474c59db.34b5567d0f0878a5afa5157f6e9b2fe7742287d066ae18b16a646cba224cb46f
creating metadata file for C:\Users\danie/.cache\huggingface\transformers\53be13866b5e6a7bf270c198a504ba00a1d6c99765a5425c8d8a6f67474c59db.34b5567d0f0878a5afa5157f6e9b2fe7742287d066ae18b16a646cba224cb46f
loading weights file https://huggingface.co/huggingface-course/bert-finetuned-ner/resolve/main/pytorch_model.bin from cache at C:\Users\danie/.cache\huggingface\transformers\53be13866b5e6a7bf270c198a504ba00a1d6c99765a5425c8d8a6f67474c59db.34b5567d0f0878a5afa5157f6e9b2fe7742287d066ae18b16a646cba224cb46f
All model checkpoint weights were used when initializing BertForTokenClassification.

All the weights of BertForTokenClassification were initialized from the model checkpoint at huggingface-course/bert-f

Downloading:   0%|          | 0.00/320 [00:00<?, ?B/s]

storing https://huggingface.co/huggingface-course/bert-finetuned-ner/resolve/main/tokenizer_config.json in cache at C:\Users\danie/.cache\huggingface\transformers\b0df7b2f0fed938ea1c03e3bfce55b08731d98d1eb6ca196178bfeb9203c7507.0bbe47aa0e39b09ed05a95f7d42a27299232ce8e9ef28608e8f8a1cb57a74c0a
creating metadata file for C:\Users\danie/.cache\huggingface\transformers\b0df7b2f0fed938ea1c03e3bfce55b08731d98d1eb6ca196178bfeb9203c7507.0bbe47aa0e39b09ed05a95f7d42a27299232ce8e9ef28608e8f8a1cb57a74c0a
https://huggingface.co/huggingface-course/bert-finetuned-ner/resolve/main/vocab.txt not found in cache or force_download set to True, downloading to C:\Users\danie\.cache\huggingface\transformers\tmp8kgtao7k


Downloading:   0%|          | 0.00/208k [00:00<?, ?B/s]

storing https://huggingface.co/huggingface-course/bert-finetuned-ner/resolve/main/vocab.txt in cache at C:\Users\danie/.cache\huggingface\transformers\dbadb9cd77a95bc4e921c9457f6c9f87f9654e89c139503e43f3c6abd4aef018.437aa611e89f6fc6675a049d2b5545390adbc617e7d655286421c191d2be2791
creating metadata file for C:\Users\danie/.cache\huggingface\transformers\dbadb9cd77a95bc4e921c9457f6c9f87f9654e89c139503e43f3c6abd4aef018.437aa611e89f6fc6675a049d2b5545390adbc617e7d655286421c191d2be2791
https://huggingface.co/huggingface-course/bert-finetuned-ner/resolve/main/tokenizer.json not found in cache or force_download set to True, downloading to C:\Users\danie\.cache\huggingface\transformers\tmp4k6tc8sq


Downloading:   0%|          | 0.00/426k [00:00<?, ?B/s]

storing https://huggingface.co/huggingface-course/bert-finetuned-ner/resolve/main/tokenizer.json in cache at C:\Users\danie/.cache\huggingface\transformers\450ab56275366591009a03ebe21bfa2523a89a590ac2e4b569920025b849ecf5.1f9d100b22551a7009fb51f1fadb9158af5db04f4c188aceecaa745a1917c983
creating metadata file for C:\Users\danie/.cache\huggingface\transformers\450ab56275366591009a03ebe21bfa2523a89a590ac2e4b569920025b849ecf5.1f9d100b22551a7009fb51f1fadb9158af5db04f4c188aceecaa745a1917c983
https://huggingface.co/huggingface-course/bert-finetuned-ner/resolve/main/special_tokens_map.json not found in cache or force_download set to True, downloading to C:\Users\danie\.cache\huggingface\transformers\tmpk01b72lv


Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

storing https://huggingface.co/huggingface-course/bert-finetuned-ner/resolve/main/special_tokens_map.json in cache at C:\Users\danie/.cache\huggingface\transformers\1f7a04f6385a04a9c60686046244d8daaa06489d154d6523ca28c5d8430c74c0.dd8bd9bfd3664b530ea4e645105f557769387b3da9f79bdb55ed556bdd80611d
creating metadata file for C:\Users\danie/.cache\huggingface\transformers\1f7a04f6385a04a9c60686046244d8daaa06489d154d6523ca28c5d8430c74c0.dd8bd9bfd3664b530ea4e645105f557769387b3da9f79bdb55ed556bdd80611d
loading file https://huggingface.co/huggingface-course/bert-finetuned-ner/resolve/main/vocab.txt from cache at C:\Users\danie/.cache\huggingface\transformers\dbadb9cd77a95bc4e921c9457f6c9f87f9654e89c139503e43f3c6abd4aef018.437aa611e89f6fc6675a049d2b5545390adbc617e7d655286421c191d2be2791
loading file https://huggingface.co/huggingface-course/bert-finetuned-ner/resolve/main/tokenizer.json from cache at C:\Users\danie/.cache\huggingface\transformers\450ab56275366591009a03ebe21bfa2523a89a590ac2e4b569

[{'entity_group': 'PER',
  'score': 0.9988506,
  'word': 'Sylvain',
  'start': 11,
  'end': 18},
 {'entity_group': 'ORG',
  'score': 0.9647625,
  'word': 'Hugging Face',
  'start': 33,
  'end': 45},
 {'entity_group': 'LOC',
  'score': 0.9986118,
  'word': 'Brooklyn',
  'start': 49,
  'end': 57}]

In [1]:
from transformers import AutoModelForMaskedLM

model_checkpoint = "distilbert-base-uncased"
model = AutoModelForMaskedLM.from_pretrained(model_checkpoint)

In [2]:
distilbert_num_parameters = model.num_parameters() / 1_000_000
print(f"'>>> DistilBERT number of parameters: {round(distilbert_num_parameters)}M'")
print(f"'>>> BERT number of parameters: 110M'")

'>>> DistilBERT number of parameters: 67M'
'>>> BERT number of parameters: 110M'


In [3]:
text = "This is a great [MASK]."

In [4]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [5]:
import torch

inputs = tokenizer(text, return_tensors="pt")
token_logits = model(**inputs).logits
# Find the location of [MASK] and extract its logits
mask_token_index = torch.where(inputs["input_ids"] == tokenizer.mask_token_id)[1]
mask_token_logits = token_logits[0, mask_token_index, :]
# Pick the [MASK] candidates with the highest logits
top_5_tokens = torch.topk(mask_token_logits, 5, dim=1).indices[0].tolist()

for token in top_5_tokens:
    print(f"'>>> {text.replace(tokenizer.mask_token, tokenizer.decode([token]))}'")

'>>> This is a great deal.'
'>>> This is a great success.'
'>>> This is a great adventure.'
'>>> This is a great idea.'
'>>> This is a great feat.'


In [6]:
from datasets import load_dataset

imdb_dataset = load_dataset("imdb")
imdb_dataset

Reusing dataset imdb (C:\Users\danie\.cache\huggingface\datasets\imdb\plain_text\1.0.0\2fdd8b9bcadd6e7055e742a706876ba43f19faee861df134affd7a3f60fc38a1)


  0%|          | 0/3 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label'],
        num_rows: 50000
    })
})

In [7]:
sample = imdb_dataset["train"].shuffle(seed=42).select(range(3))

for row in sample:
    print(f"\n'>>> Review: {row['text']}'")
    print(f"'>>> Label: {row['label']}'")

Loading cached shuffled indices for dataset at C:\Users\danie\.cache\huggingface\datasets\imdb\plain_text\1.0.0\2fdd8b9bcadd6e7055e742a706876ba43f19faee861df134affd7a3f60fc38a1\cache-8a9e43a6ac4acdff.arrow



'>>> Review: There is no relation at all between Fortier and Profiler but the fact that both are police series about violent crimes. Profiler looks crispy, Fortier looks classic. Profiler plots are quite simple. Fortier's plot are far more complicated... Fortier looks more like Prime Suspect, if we have to spot similarities... The main character is weak and weirdo, but have "clairvoyance". People like to compare, to judge, to evaluate. How about just enjoying? Funny thing too, people writing Fortier looks American but, on the other hand, arguing they prefer American series (!!!). Maybe it's the language, or the spirit, but I think this series is more English than American. By the way, the actors are really good and funny. The acting is not superficial at all...'
'>>> Label: 1'

'>>> Review: This movie is a great. The plot is very true to the book which is a classic written by Mark Twain. The movie starts of with a scene where Hank sings a song with a bunch of kids called "when you stu

In [8]:
def tokenize_function(examples):
    result = tokenizer(examples["text"])
    if tokenizer.is_fast:
        result["word_ids"] = [result.word_ids(i) for i in range(len(result["input_ids"]))]
    return result


# Use batched=True to activate fast multithreading!
tokenized_datasets = imdb_dataset.map(
    tokenize_function, batched=True, remove_columns=["text", "label"]
)
tokenized_datasets

Loading cached processed dataset at C:\Users\danie\.cache\huggingface\datasets\imdb\plain_text\1.0.0\2fdd8b9bcadd6e7055e742a706876ba43f19faee861df134affd7a3f60fc38a1\cache-a447541be4899505.arrow


  0%|          | 0/25 [00:00<?, ?ba/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (532 > 512). Running this sequence through the model will result in indexing errors
Loading cached processed dataset at C:\Users\danie\.cache\huggingface\datasets\imdb\plain_text\1.0.0\2fdd8b9bcadd6e7055e742a706876ba43f19faee861df134affd7a3f60fc38a1\cache-da5232449a0927e9.arrow


DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids'],
        num_rows: 50000
    })
})

In [9]:
tokenizer.model_max_length


512

In [10]:
chunk_size = 128

In [11]:
tokenized_samples = tokenized_datasets["train"][:3]

for idx, sample in enumerate(tokenized_samples["input_ids"]):
    print(f"'>>> Review {idx} length: {len(sample)}'")

'>>> Review 0 length: 363'
'>>> Review 1 length: 304'
'>>> Review 2 length: 133'


In [12]:
concatenated_examples = {
    k: sum(tokenized_samples[k], []) for k in tokenized_samples.keys()
}
total_length = len(concatenated_examples["input_ids"])
print(f"'>>> Concatenated reviews length: {total_length}'")

'>>> Concatenated reviews length: 800'


In [13]:
chunks = {
    k: [t[i : i + chunk_size] for i in range(0, total_length, chunk_size)]
    for k, t in concatenated_examples.items()
}

for chunk in chunks["input_ids"]:
    print(f"'>>> Chunk length: {len(chunk)}'")

'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 32'


In [14]:
def group_texts(examples):
    # Concatenate all texts
    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
    # Compute length of concatenated texts
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # We drop the last chunk if it's smaller than chunk_size
    total_length = (total_length // chunk_size) * chunk_size
    # Split by chunks of max_len
    result = {
        k: [t[i : i + chunk_size] for i in range(0, total_length, chunk_size)]
        for k, t in concatenated_examples.items()
    }
    # Create a new labels column
    result["labels"] = result["input_ids"].copy()
    return result

In [15]:
lm_datasets = tokenized_datasets.map(group_texts, batched=True)
lm_datasets

Loading cached processed dataset at C:\Users\danie\.cache\huggingface\datasets\imdb\plain_text\1.0.0\2fdd8b9bcadd6e7055e742a706876ba43f19faee861df134affd7a3f60fc38a1\cache-abdb1c4b1d177e80.arrow


  0%|          | 0/25 [00:00<?, ?ba/s]

Loading cached processed dataset at C:\Users\danie\.cache\huggingface\datasets\imdb\plain_text\1.0.0\2fdd8b9bcadd6e7055e742a706876ba43f19faee861df134affd7a3f60fc38a1\cache-c2fcb759f25488d2.arrow


DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 61291
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 59904
    })
    unsupervised: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 122957
    })
})

In [16]:
tokenizer.decode(lm_datasets["train"][1]["input_ids"])

"as the vietnam war and race issues in the united states. in between asking politicians and ordinary denizens of stockholm about their opinions on politics, she has sex with her drama teacher, classmates, and married men. < br / > < br / > what kills me about i am curious - yellow is that 40 years ago, this was considered pornographic. really, the sex and nudity scenes are few and far between, even then it's not shot like some cheaply made porno. while my countrymen mind find it shocking, in reality sex and nudity are a major staple in swedish cinema. even ingmar bergman,"

In [17]:
tokenizer.decode(lm_datasets["train"][1]["labels"])

"as the vietnam war and race issues in the united states. in between asking politicians and ordinary denizens of stockholm about their opinions on politics, she has sex with her drama teacher, classmates, and married men. < br / > < br / > what kills me about i am curious - yellow is that 40 years ago, this was considered pornographic. really, the sex and nudity scenes are few and far between, even then it's not shot like some cheaply made porno. while my countrymen mind find it shocking, in reality sex and nudity are a major staple in swedish cinema. even ingmar bergman,"

In [18]:
from transformers import DataCollatorForLanguageModeling

data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15)

In [19]:
samples = [lm_datasets["train"][i] for i in range(2)]
for sample in samples:
    _ = sample.pop("word_ids")

for chunk in data_collator(samples)["input_ids"]:
    print(f"\n'>>> {tokenizer.decode(chunk)}'")


'>>> [CLS] i rented i am curious - yellow from my video store because of all [MASK] controversy that ellison it when it was first released in 1967. [MASK] also heard that at first [MASK] was seized by u [MASK] s. [MASK] if it ever tried to enter this country limitation therefore being a fan of films considered " [MASK] " i really had to see this for myself. [MASK] br / > < [MASK] / > the plot [MASK] centered around a young swedish drama student named [MASK] who [MASK] to learn everything she can about life [MASK] in particular she wants to focus her attentions to making some sort of documentary on what the average swede thought about [MASK] [MASK] issues such'

'>>> as the vietnam war and race issues in the united [MASK]. in between asking [MASK] and ordinary denizens [MASK] stockholm about their opinions on politics [MASK] she [MASK] sex with her drama teacher [MASK] classmates, and married men. < br / [MASK] < br / > what kills me about i am curious - yellow is [MASK] 40 years ago, 

In [20]:
import collections
import numpy as np

from transformers import default_data_collator

wwm_probability = 0.2


def whole_word_masking_data_collator(features):
    for feature in features:
        word_ids = feature.pop("word_ids")

        # Create a map between words and corresponding token indices
        mapping = collections.defaultdict(list)
        current_word_index = -1
        current_word = None
        for idx, word_id in enumerate(word_ids):
            if word_id is not None:
                if word_id != current_word:
                    current_word = word_id
                    current_word_index += 1
                mapping[current_word_index].append(idx)

        # Randomly mask words
        mask = np.random.binomial(1, wwm_probability, (len(mapping),))
        input_ids = feature["input_ids"]
        labels = feature["labels"]
        new_labels = [-100] * len(labels)
        for word_id in np.where(mask)[0]:
            word_id = word_id.item()
            for idx in mapping[word_id]:
                new_labels[idx] = labels[idx]
                input_ids[idx] = tokenizer.mask_token_id

    return default_data_collator(features)

In [21]:
samples = [lm_datasets["train"][i] for i in range(2)]
batch = whole_word_masking_data_collator(samples)

for chunk in batch["input_ids"]:
    print(f"\n'>>> {tokenizer.decode(chunk)}'")


'>>> [CLS] i rented [MASK] am curious [MASK] yellow from my [MASK] store because of all the controversy that [MASK] it when it [MASK] first released in 1967. i also heard that at first it was seized by [MASK]. [MASK] [MASK] [MASK] if it ever tried [MASK] [MASK] this country [MASK] therefore being a fan of films [MASK] " controversial " i really had to see this for [MASK]. < [MASK] / > < br / [MASK] the plot is [MASK] around a young swedish drama student named lena who wants to learn [MASK] she can [MASK] life. in particular she wants to focus her [MASK] [MASK] to making some sort of documentary [MASK] [MASK] the average [MASK] [MASK] thought about certain political issues such'

'>>> as [MASK] vietnam war [MASK] race issues in the united states [MASK] in between asking [MASK] and ordinary denizens [MASK] stockholm [MASK] their [MASK] on politics, she [MASK] sex with her drama teacher, [MASK], and married men. < br / > < br / > what [MASK] [MASK] about i am curious - yellow [MASK] that

In [22]:
train_size = 10_000
test_size = int(0.05 * train_size)

downsampled_dataset = lm_datasets["train"].train_test_split(
    train_size=train_size, test_size=test_size, seed=42
)
downsampled_dataset

Loading cached split indices for dataset at C:\Users\danie\.cache\huggingface\datasets\imdb\plain_text\1.0.0\2fdd8b9bcadd6e7055e742a706876ba43f19faee861df134affd7a3f60fc38a1\cache-d9dc84943453eb8f.arrow and C:\Users\danie\.cache\huggingface\datasets\imdb\plain_text\1.0.0\2fdd8b9bcadd6e7055e742a706876ba43f19faee861df134affd7a3f60fc38a1\cache-e39540bf8c6b9a22.arrow


DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 10000
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 500
    })
})

In [23]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center>\n<img src=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [24]:
from transformers import TrainingArguments

batch_size = 64
# Show the training loss with every epoch
logging_steps = len(downsampled_dataset["train"]) // batch_size
model_name = model_checkpoint.split("/")[-1]

training_args = TrainingArguments(
    output_dir=f"{model_name}-finetuned-imdb",
    overwrite_output_dir=True,
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    push_to_hub=False,
    fp16=True,
    logging_steps=logging_steps,
)

In [25]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=downsampled_dataset["train"],
    eval_dataset=downsampled_dataset["test"],
    data_collator=data_collator,
)

Using amp half precision backend


In [26]:
import math

eval_results = trainer.evaluate()
print(f">>> Perplexity: {math.exp(eval_results['eval_loss']):.2f}")

The following columns in the evaluation set  don't have a corresponding argument in `DistilBertForMaskedLM.forward` and have been ignored: word_ids.
***** Running Evaluation *****
  Num examples = 500
  Batch size = 64


>>> Perplexity: 22.03


In [27]:
trainer.train()

The following columns in the training set  don't have a corresponding argument in `DistilBertForMaskedLM.forward` and have been ignored: word_ids.
***** Running training *****
  Num examples = 10000
  Num Epochs = 3
  Instantaneous batch size per device = 64
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumulation steps = 1
  Total optimization steps = 471


Epoch,Training Loss,Validation Loss
1,2.7017,2.49999
2,2.5729,2.459273
3,2.5547,2.430537


The following columns in the evaluation set  don't have a corresponding argument in `DistilBertForMaskedLM.forward` and have been ignored: word_ids.
***** Running Evaluation *****
  Num examples = 500
  Batch size = 64
The following columns in the evaluation set  don't have a corresponding argument in `DistilBertForMaskedLM.forward` and have been ignored: word_ids.
***** Running Evaluation *****
  Num examples = 500
  Batch size = 64
The following columns in the evaluation set  don't have a corresponding argument in `DistilBertForMaskedLM.forward` and have been ignored: word_ids.
***** Running Evaluation *****
  Num examples = 500
  Batch size = 64


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=471, training_loss=2.6083076379861043, metrics={'train_runtime': 206.2779, 'train_samples_per_second': 145.435, 'train_steps_per_second': 2.283, 'total_flos': 994208670720000.0, 'train_loss': 2.6083076379861043, 'epoch': 3.0})

In [28]:
eval_results = trainer.evaluate()
print(f">>> Perplexity: {math.exp(eval_results['eval_loss']):.2f}")

The following columns in the evaluation set  don't have a corresponding argument in `DistilBertForMaskedLM.forward` and have been ignored: word_ids.
***** Running Evaluation *****
  Num examples = 500
  Batch size = 64


>>> Perplexity: 11.43
