<a href="https://colab.research.google.com/github/ftnext/practice-dl-nlp/blob/master/bert_exercise/transformers_examples/20230912_tutorial_ner_conll2003.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Reference: Hugging Face `token_classification.ipynb` example notebook — https://github.com/huggingface/notebooks/blob/226b30b12d3f8102098cd3713a568954ca238936/examples/token_classification.ipynb

In [1]:
# Record the Python interpreter version this notebook was executed with.
!python -V

Python 3.10.12


In [2]:
# Use the `%pip` magic rather than `!pip` so the packages are installed into the
# environment of the kernel that runs this notebook.
# NOTE(review): versions are unpinned, so a fresh run resolves to the latest
# releases, which may differ from the ones that produced the outputs below —
# consider pinning.
%pip install -qq datasets 'transformers[torch]' seqeval evaluate

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m519.6/519.6 kB[0m [31m8.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m91.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.4/81.4 kB[0m [31m10.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m15.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m25.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m16.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m294.8/294.8 kB[0m [31m32.5 MB/s[

In [3]:
import evaluate
import numpy as np
from datasets import load_dataset
from transformers import (
    AutoModelForTokenClassification,
    AutoTokenizer,
    DataCollatorForTokenClassification,
    Trainer,
    TrainingArguments,
)

In [4]:
# Column prefix: f"{task}_tags" selects which tag column of the dataset is used as labels.
task = "ner"
# Checkpoint name used to load both the tokenizer and the model.
model_checkpoint = "distilbert-base-uncased"

## Preprocess

In [5]:
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
# The label alignment in tokenize_and_align_labels relies on `word_ids()`,
# which only fast tokenizers provide; fail here with a clear message otherwise.
assert tokenizer.is_fast, "a fast tokenizer is required for word_ids() alignment"

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [6]:
def tokenize_and_align_labels(examples):
    """Tokenize a batch of pre-split sentences and attach token-level labels.

    ``examples`` is a batch providing a ``"tokens"`` column (one list of words
    per example) and a ``f"{task}_tags"`` column (one tag id per word).

    Every token is labelled with the tag of the word it belongs to; tokens
    whose word id is ``None`` receive the ignored index ``-100``.

    Returns the tokenizer output with an added ``"labels"`` entry.
    """
    encoded = tokenizer(
        examples["tokens"], truncation=True, is_split_into_words=True
    )

    # One aligned label sequence per example, in batch order.
    encoded["labels"] = [
        [
            -100 if word_id is None else word_tags[word_id]
            for word_id in encoded.word_ids(batch_index=batch_index)
        ]
        for batch_index, word_tags in enumerate(examples[f"{task}_tags"])
    ]
    return encoded

In [7]:
# Load the "conll2003" dataset (train / validation / test splits, see output below).
datasets = load_dataset("conll2003")
# Tag names of the label column; a label id is its index in this list.
label_list = datasets["train"].features[f"{task}_tags"].feature.names

Downloading builder script:   0%|          | 0.00/9.57k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/3.73k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/12.3k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/983k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/14041 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3250 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/3453 [00:00<?, ? examples/s]

In [8]:
# Tokenize and align labels for every split; batched=True passes batches of
# examples (not single examples) to tokenize_and_align_labels.
tokenized_datasets = datasets.map(tokenize_and_align_labels, batched=True)

Map:   0%|          | 0/14041 [00:00<?, ? examples/s]

Map:   0%|          | 0/3250 [00:00<?, ? examples/s]

Map:   0%|          | 0/3453 [00:00<?, ? examples/s]

## Fine-tune

In [9]:
# Load the pretrained checkpoint with a token-classification head that has one
# output per tag. The head weights are newly initialized (see the warning in
# the output), so the model must be fine-tuned before use.
model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint, num_labels=len(label_list)
)

Downloading model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
# Keep only the part after the last "/" so a namespaced checkpoint
# ("org/name") yields just "name".
model_name = model_checkpoint.split("/")[-1]
batch_size = 16  # shared by training and evaluation below
# NOTE(review): newer transformers releases appear to rename
# `evaluation_strategy` to `eval_strategy` — confirm against the installed version.
args = TrainingArguments(
    f"{model_name}-finetuned-{task}",  # presumably the output directory — confirm
    evaluation_strategy="epoch",  # evaluate once per epoch (see the per-epoch table in the training output)
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=3,
    weight_decay=0.01,
)

In [11]:
# Collator that assembles tokenized examples into batches for the Trainer
# (presumably pads inputs and labels to a common length per batch — confirm).
data_collator = DataCollatorForTokenClassification(tokenizer)

In [12]:
# seqeval metric; compute_metrics uses it to score predicted tag sequences
# against the reference tag sequences.
metric = evaluate.load("seqeval")


def remove_ignored_index_from_predictions(predictions, labels, label_list):
    """Convert predicted label ids to tag names, dropping ignored positions.

    Args:
        predictions: Sequence of sequences of predicted label ids.
        labels: Sequence of sequences of reference label ids, parallel to
            ``predictions``; ``-100`` marks a position to ignore.
        label_list: Tag names indexed by label id.

    Returns:
        One list of predicted tag names per sequence, containing only the
        positions whose reference label is not ``-100``.
    """
    kept_sequences = []
    for predicted_ids, reference_ids in zip(predictions, labels):
        kept = []
        for predicted_id, reference_id in zip(predicted_ids, reference_ids):
            # The reference label, not the prediction, decides what is ignored.
            if reference_id != -100:
                kept.append(label_list[predicted_id])
        kept_sequences.append(kept)
    return kept_sequences


def remove_ignored_index_from_labels(predictions, labels, label_list):
    """Convert reference label ids to tag names, dropping ignored positions.

    Args:
        predictions: Sequence of sequences of predicted label ids. Only used
            for pairing: each label sequence is zipped with its prediction
            sequence, so it is cut to the shorter of the two.
        labels: Sequence of sequences of reference label ids; ``-100`` marks
            a position to ignore.
        label_list: Tag names indexed by label id.

    Returns:
        One list of reference tag names per sequence, without the ``-100``
        positions.
    """
    kept_sequences = []
    for predicted_ids, reference_ids in zip(predictions, labels):
        kept = []
        for _, reference_id in zip(predicted_ids, reference_ids):
            if reference_id != -100:
                kept.append(label_list[reference_id])
        kept_sequences.append(kept)
    return kept_sequences


def compute_metrics(p):
    """Compute overall seqeval scores for one evaluation pass.

    ``p`` unpacks into ``(predictions, labels)``. The predictions are reduced
    to label ids with an argmax over axis 2; positions whose reference label
    is ``-100`` are then dropped before scoring.

    Returns a dict with the keys ``precision``, ``recall``, ``f1`` and
    ``accuracy``, taken from the metric's ``overall_*`` entries.
    """
    class_scores, label_ids = p
    predicted_ids = np.argmax(class_scores, axis=2)

    tag_predictions = remove_ignored_index_from_predictions(
        predicted_ids, label_ids, label_list
    )
    tag_references = remove_ignored_index_from_labels(
        predicted_ids, label_ids, label_list
    )

    scores = metric.compute(
        predictions=tag_predictions, references=tag_references
    )

    # (reported name, key in the seqeval result) — per-entity entries are not reported.
    reported = (
        ("precision", "overall_precision"),
        ("recall", "overall_recall"),
        ("f1", "overall_f1"),
        ("accuracy", "overall_accuracy"),
    )
    return {short_name: scores[result_key] for short_name, result_key in reported}

Downloading builder script:   0%|          | 0.00/6.34k [00:00<?, ?B/s]

In [13]:
# Wire the model, training arguments, data splits, collator and metric
# function into a single Trainer.
trainer = Trainer(
    model,
    args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],  # split scored by evaluate() and the per-epoch evaluation
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [14]:
# Fine-tune for the configured number of epochs; the returned TrainOutput is
# displayed as the cell result.
trainer.train()

You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.2519,0.072562,0.90667,0.92158,0.914064,0.980031
2,0.05,0.06278,0.924667,0.930977,0.927811,0.982811
3,0.0315,0.062369,0.925357,0.936123,0.930708,0.983891


TrainOutput(global_step=2634, training_loss=0.0874490069003384, metrics={'train_runtime': 287.6068, 'train_samples_per_second': 146.46, 'train_steps_per_second': 9.158, 'total_flos': 512107577001720.0, 'train_loss': 0.0874490069003384, 'epoch': 3.0})

In [15]:
# Score the fine-tuned model on the Trainer's eval_dataset (the validation split).
trainer.evaluate()

{'eval_loss': 0.06236886978149414,
 'eval_precision': 0.9253566294371337,
 'eval_recall': 0.9361226087929299,
 'eval_f1': 0.9307084862640419,
 'eval_accuracy': 0.9838912100656108,
 'eval_runtime': 6.5424,
 'eval_samples_per_second': 496.761,
 'eval_steps_per_second': 31.181,
 'epoch': 3.0}

In [16]:
from pprint import pprint

In [17]:
# Re-run inference on the validation split to see the metric's full result,
# including the per-entity-type scores that compute_metrics does not return
# (it keeps only the overall_* values).
predictions, labels, _ = trainer.predict(tokenized_datasets["validation"])
predictions = np.argmax(predictions, axis=2)  # axis=2 is the label dimension (9 labels)
# Drop the positions labelled -100 and map ids to tag names before scoring.
true_predictions = remove_ignored_index_from_predictions(predictions, labels, label_list)
true_labels = remove_ignored_index_from_labels(predictions, labels, label_list)
results = metric.compute(predictions=true_predictions, references=true_labels)
pprint(results)

{'LOC': {'f1': 0.9558461246920599,
         'number': 2618,
         'precision': 0.9484768710041369,
         'recall': 0.9633307868601986},
 'MISC': {'f1': 0.8213141025641025,
          'number': 1231,
          'precision': 0.8102766798418972,
          'recall': 0.8326563769293257},
 'ORG': {'f1': 0.8979493365500603,
         'number': 2056,
         'precision': 0.8908568693154619,
         'recall': 0.9051556420233463},
 'PER': {'f1': 0.9762532981530343,
         'number': 3034,
         'precision': 0.976897689768977,
         'recall': 0.975609756097561},
 'overall_accuracy': 0.9838912100656108,
 'overall_f1': 0.9307084862640419,
 'overall_precision': 0.9253566294371337,
 'overall_recall': 0.9361226087929299}
