#### Setup Env


In [None]:
!pip install transformers datasets tokenizers evaluate seqeval accelerate -q

In [None]:
from datasets import load_dataset
import numpy as np
from transformers import BertTokenizerFast
from transformers import DataCollatorForTokenClassification
from transformers import AutoModelForTokenClassification

## Load Dataset

In [None]:
raw_datasets = load_dataset("conll2003")

Downloading builder script:   0%|          | 0.00/9.57k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/3.73k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/12.3k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/983k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/14041 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3250 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/3453 [00:00<?, ? examples/s]

In [None]:
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 14041
    })
    validation: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3250
    })
    test: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3453
    })
})

## Explore Dataset

In [None]:
raw_datasets["train"][0]["tokens"]

['EU', 'rejects', 'German', 'call', 'to', 'boycott', 'British', 'lamb', '.']

In [None]:
raw_datasets["train"][0]["ner_tags"]

[3, 0, 7, 0, 0, 0, 7, 0, 0]

In [None]:
raw_datasets["train"][0]

{'id': '0',
 'tokens': ['EU',
  'rejects',
  'German',
  'call',
  'to',
  'boycott',
  'British',
  'lamb',
  '.'],
 'pos_tags': [22, 42, 16, 21, 35, 37, 16, 21, 7],
 'chunk_tags': [11, 21, 11, 12, 21, 22, 11, 12, 0],
 'ner_tags': [3, 0, 7, 0, 0, 0, 7, 0, 0]}

In [None]:
ner_feature = raw_datasets["train"].features["ner_tags"]
ner_feature

Sequence(feature=ClassLabel(names=['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC'], id=None), length=-1, id=None)

In [None]:
label_names = ner_feature.feature.names
label_names

['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']

In [None]:
# Print one training sentence with its NER tags underneath, padding every
# word/tag pair to a common width so the two rows line up column by column.
words = raw_datasets["train"][4]["tokens"]
labels = raw_datasets["train"][4]["ner_tags"]

tag_names = [label_names[tag_id] for tag_id in labels]
# Each column is as wide as the longer of (word, tag) plus one separating space.
widths = [max(len(w), len(t)) + 1 for w, t in zip(words, tag_names)]

line1 = "".join(w.ljust(width) for w, width in zip(words, widths))
line2 = "".join(t.ljust(width) for t, width in zip(tag_names, widths))

print(line1)
print(line2)

Germany 's representative to the European Union 's veterinary committee Werner Zwingmann said on Wednesday consumers should buy sheepmeat from countries other than Britain until the scientific advice was clearer . 
B-LOC   O  O              O  O   B-ORG    I-ORG O  O          O         B-PER  I-PER     O    O  O         O         O      O   O         O    O         O     O    B-LOC   O     O   O          O      O   O       O 


In [None]:
tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

# Preprocess Data
## Consecutive Subword Problem
Pretrained transformers are usually trained with a subword tokenizer. This means that even though our input has already been split into words, each of those words may be further split into several tokens by the tokenizer. We therefore need to post-process our labels, because the input IDs returned by the tokenizer are longer than the list of labels contained in our dataset.

This happens first because some special tokens may be added (such as [CLS] and [SEP]) and then because of the possible splitting of words into multiple tokens:

A strategy to solve the above problem is to set the labels of all special tokens to -100 (an index ignored by PyTorch), and set the labels of all other tokens to the label of the word they come from. Another strategy is to set the label of only the first token obtained from a particular word, and assign the label -100 to the other subtokens of the same word. Both strategies are supported by the function defined below; switch between them with its `label_all_tokens` flag:

- If you want to use the first strategy, you can set the labels of special tokens (such as [CLS] and [SEP]) to -100 and set the labels of other tokens to be the labels of the origin word.
- If you want to use the second strategy, you can set the label only on the first token of each word and give the label -100 on the other subtokens of the same word.

The choice between these two strategies depends on the needs of your natural language processing task.

We assign -100 as the label for the special tokens and for the subwords we want to hide during training.
Why do we choose -100 as the ID to hide them? Because in PyTorch, the cross-entropy loss class `torch.nn.CrossEntropyLoss` has a parameter called `ignore_index` whose default value is -100. Targets with this index are ignored when the loss is computed, so they do not contribute to training.

We can also use it to ignore tokens associated with consecutive subwords.

The cell below only inspects a few intermediate values before we apply `tokenize_align_labels()`.

In [None]:
# Take the first training example to inspect what the tokenizer produces for it.
example_text = raw_datasets['train'][0]

# The example is already a list of words, hence is_split_into_words=True.
tokenized_input = tokenizer(example_text["tokens"], is_split_into_words=True)
# Human-readable tokens for the produced ids (kept for manual inspection).
tokens = tokenizer.convert_ids_to_tokens(tokenized_input["input_ids"])
# word_ids() gives, per token, the index of the word it came from;
# the first and last entries are None (see the printed output below).
word_ids = tokenized_input.word_ids()

print(word_ids)
tokenized_input

[None, 0, 1, 2, 3, 4, 5, 6, 7, 8, None]


{'input_ids': [101, 7327, 19164, 2446, 2655, 2000, 17757, 2329, 12559, 1012, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

Sub-token problem: the input IDs returned by the tokenizer are longer than the list of labels in our dataset.

In [None]:
print("Input length before tokenizer:", len(example_text['ner_tags']))
print("Input length after tokenizer:", len(tokenized_input["input_ids"]))


Input length before tokenizer: 9
Input length after tokenizer: 11


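The function `tokenize_align_labels` below does two jobs:
- it sets -100 as the label of the special tokens, which do not come from any word
- for the subwords that follow the first subword of a word, it either copies the word's label (`label_all_tokens=True`, the default) or sets -100 so they are hidden during training (`label_all_tokens=False`)

In this way the labels are aligned with the token IDs using our chosen strategy: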

In [None]:
def tokenize_align_labels(examples, label_all_tokens=True) -> dict:
    """
    Tokenize a batch of pre-split sentences and build token-level NER labels.

    The module-level ``tokenizer`` may split one word into several tokens and add
    special tokens, so the word-level tags are re-aligned to the token sequence.

    Parameters:
    examples (dict): A batch with
                     - "tokens": a list of sentences, each a list of words.
                     - "ner_tags": a list of tag-id lists, one tag per word.

    label_all_tokens (bool): If True, every token of a word receives that word's tag.
                             If False, only the first token of a word receives the tag and
                             the remaining tokens of the same word are labelled -100.

    Returns:
    The tokenizer output for the batch with an extra "labels" entry holding one
    aligned label list per sentence.
    """
    # -100 marks positions that the loss function should skip.
    ignored = -100

    encoded = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    def _align(word_tags, token_word_ids):
        # Walk the tokens of one sentence and pick a label for each position.
        aligned = []
        last_word = None
        for current_word in token_word_ids:
            if current_word is None:
                # Tokens without a source word (special tokens) are never scored.
                aligned.append(ignored)
            elif current_word == last_word and not label_all_tokens:
                # Continuation piece of the previous word, masked on request.
                aligned.append(ignored)
            else:
                # First piece of a word, or a continuation piece that keeps the word's tag.
                aligned.append(word_tags[current_word])
            last_word = current_word
        return aligned

    # word_ids(batch_index=row) maps every token of sentence `row` to its word index.
    encoded["labels"] = [
        _align(word_tags, encoded.word_ids(batch_index=row))
        for row, word_tags in enumerate(examples["ner_tags"])
    ]
    return encoded

In [None]:
q = tokenize_align_labels(raw_datasets['train'][4:5])
print(q)

{'input_ids': [[101, 2762, 1005, 1055, 4387, 2000, 1996, 2647, 2586, 1005, 1055, 15651, 2837, 14121, 1062, 9328, 5804, 2056, 2006, 9317, 10390, 2323, 4965, 8351, 4168, 4017, 2013, 3032, 2060, 2084, 3725, 2127, 1996, 4045, 6040, 2001, 24509, 1012, 102]], 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], 'labels': [[-100, 5, 0, 0, 0, 0, 0, 3, 4, 0, 0, 0, 0, 1, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, -100]]}


So before applying `tokenize_align_labels()`, the tokenized input has 3 keys:
- input_ids
- token_type_ids
- attention_mask

After applying `tokenize_align_labels()` we have one extra key: `labels`.

================================================

In [None]:
# Check
for token, label in zip(tokenizer.convert_ids_to_tokens(q["input_ids"][0]),q["labels"][0]):
    print(f"{token:_<40} {label}")

[CLS]___________________________________ -100
germany_________________________________ 5
'_______________________________________ 0
s_______________________________________ 0
representative__________________________ 0
to______________________________________ 0
the_____________________________________ 0
european________________________________ 3
union___________________________________ 4
'_______________________________________ 0
s_______________________________________ 0
veterinary______________________________ 0
committee_______________________________ 0
werner__________________________________ 1
z_______________________________________ 2
##wing__________________________________ 2
##mann__________________________________ 2
said____________________________________ 0
on______________________________________ 0
wednesday_______________________________ 0
consumers_______________________________ 0
should__________________________________ 0
buy_____________________________________ 0
sheep___

In [None]:
tokenized_datasets = raw_datasets.map(tokenize_align_labels, batched=True)

Map:   0%|          | 0/14041 [00:00<?, ? examples/s]

Map:   0%|          | 0/3250 [00:00<?, ? examples/s]

Map:   0%|          | 0/3453 [00:00<?, ? examples/s]

## Model
```
id2label = {i: label for i, label in enumerate(label_names)}
label2id = {v: k for k, v in id2label.items()}

from transformers import AutoModelForTokenClassification

model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    id2label=id2label,
    label2id=label2id,
)
```

In [None]:
# Build the id <-> label maps from the dataset's label names and hand them to the
# model config at load time. The size of the classification head then follows the
# dataset instead of a hard-coded 9, and the saved config already carries the
# entity names, so it does not have to be patched by hand afterwards.
id2label = dict(enumerate(label_names))
label2id = {name: idx for idx, name in id2label.items()}

model = AutoModelForTokenClassification.from_pretrained(
    "bert-base-uncased",
    id2label=id2label,
    label2id=label2id,
)

Downloading model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### Trainer

In [None]:
from transformers import TrainingArguments, Trainer

In [None]:
# Hyper-parameters for fine-tuning: evaluation runs once per epoch, and
# push_to_hub=True asks the Trainer to upload to the Hugging Face Hub.
args = TrainingArguments(
    output_dir="bert-ner",
    num_train_epochs=5,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    evaluation_strategy="epoch",
    push_to_hub=True,
)

We can't just use DataCollatorWithPadding as in Chapter 3 because that only pads the input (input ID, attention mask, and token type ID). Here, our labels must be padded in exactly the same way as the input so that the size remains the same, using -100 as the value so that the corresponding predictions are ignored in the loss calculation.

In [None]:
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

## Evaluate Metric

In [None]:
import evaluate
metric = evaluate.load("seqeval")

Downloading builder script:   0%|          | 0.00/6.34k [00:00<?, ?B/s]

In [None]:
example = raw_datasets["train"][0]
label_list = raw_datasets["train"].features["ner_tags"].feature.names
label_list

['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']

In [None]:
# Sanity check of the metric: convert the gold tag ids of one example to their
# string names and score them against themselves; every score should be 1.0.
labels = [label_list[i] for i in example["ner_tags"]]

metric.compute(predictions=[labels], references=[labels])

{'MISC': {'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'number': 2},
 'ORG': {'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'number': 1},
 'overall_precision': 1.0,
 'overall_recall': 1.0,
 'overall_f1': 1.0,
 'overall_accuracy': 1.0}

The `seqeval` package is a Python library used to measure the performance of models in token-based classification tasks such as Named Entity Recognition (NER) or Part-of-Speech (POS) tagging. `Seqeval` expects input in the form of a list of lists, where each list within the list contains tokens representing one example in the dataset. Each of these tokens also has a corresponding label.

In NER or POS tagging tasks, tokens are sometimes split into subtokens by a tokenizer, and labels must be adjusted to match the tokens present in the input. This includes handling the labeling IDs associated with the next subtoken to match the format expected by `seqeval`.

For example, if we have the sentence "Ibu sedang memasak," and this sentence has been tokenized into subtokens by a tokenizer, such as: ["Ibu", "sedang", "m", "##em", "##asa", "##k"], then the corresponding labels must be adjusted to match these subtokens.

The original labels might look like this:
```
Labels: ["O", "O", "B-VERB", "I-VERB", "I-VERB", "I-VERB"]
```

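However, `seqeval` expects a list of lists with one inner list of string labels per sentence, so a batch that contains only this sentence is passed as:
```
Labels expected by Seqeval: [["O", "O", "B-VERB", "I-VERB", "I-VERB", "I-VERB"]]
```

In the example above, every subtoken of the word "memasak" keeps a label, and predictions and references line up position by position. Positions whose label is -100 (special tokens or masked subwords) are removed from both lists before they are scored.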

So, to integrate `seqeval` metrics during training, you need to create a function that performs this adjustment on the output from your model to produce the label format expected by `seqeval`, which is a list of lists representing tokens and labels in the examples from your dataset.

This compute_metrics() function first takes argmax from logit to convert it to a prediction (as usual, logit and probability are in the same order, so we don't need to apply softmax). Then we have to convert the labels and predictions from integers to strings. We remove all values that have the label -100, then pass the result to the metric.compute() method:

In [None]:
def compute_metrics(eval_preds) -> dict:
    """
    Score a batch of token-classification predictions with the notebook-level ``metric``.

    Parameters:
    eval_preds (tuple): A ``(logits, labels)`` pair. The class dimension of the logits is
                        axis 2; labels hold tag ids, with -100 marking positions to skip.

    Returns:
    A dictionary with the overall "precision", "recall", "f1" and "accuracy"
    reported by ``metric.compute``.
    """
    logits, gold_ids = eval_preds
    # argmax over the class axis; softmax would not change which class wins.
    predicted_ids = np.argmax(logits, axis=2)

    predictions = []
    true_labels = []
    for predicted_row, gold_row in zip(predicted_ids, gold_ids):
        kept_predictions = []
        kept_gold = []
        for predicted_id, gold_id in zip(predicted_row, gold_row):
            if gold_id == -100:
                # Ignored position: drop it from both sequences so they stay aligned.
                continue
            kept_predictions.append(label_list[predicted_id])
            kept_gold.append(label_list[gold_id])
        predictions.append(kept_predictions)
        true_labels.append(kept_gold)

    results = metric.compute(predictions=predictions, references=true_labels)

    summary = {}
    summary["precision"] = results["overall_precision"]
    summary["recall"] = results["overall_recall"]
    summary["f1"] = results["overall_f1"]
    summary["accuracy"] = results["overall_accuracy"]
    return summary


## Train

In [None]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
# Wire the model, training arguments, tokenized splits, padding collator and
# metric function together into a single Trainer.
trainer = Trainer(
    model=model,
    args=args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [None]:
trainer.train()

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.2234,0.064832,0.911048,0.932655,0.921725,0.982064
2,0.0443,0.055226,0.934493,0.94317,0.938812,0.985369
3,0.0258,0.057149,0.938458,0.945072,0.941754,0.985623
4,0.0139,0.062311,0.941255,0.949994,0.945604,0.986322
5,0.0098,0.064756,0.941958,0.951337,0.946624,0.986417


TrainOutput(global_step=4390, training_loss=0.05107230271185178, metrics={'train_runtime': 890.7088, 'train_samples_per_second': 78.819, 'train_steps_per_second': 4.929, 'total_flos': 1705041168096870.0, 'train_loss': 0.05107230271185178, 'epoch': 5.0})

In [None]:
trainer.push_to_hub(commit_message="Training complete")

pytorch_model.bin:   0%|          | 0.00/436M [00:00<?, ?B/s]

'https://huggingface.co/fahmiaziz/bert-ner/tree/main/'

https://huggingface.co/learn/nlp-course/chapter7/2#using-the-fine-tuned-model

# Using fine-tuned model
Changing the labels: if you do not want to change the labels manually, you can instead use the approach shown above and pass `id2label` / `label2id` when loading the model.

In [None]:
model.save_pretrained("bert-ner")
tokenizer.save_pretrained("tokenizer")

('tokenizer/tokenizer_config.json',
 'tokenizer/special_tokens_map.json',
 'tokenizer/vocab.txt',
 'tokenizer/added_tokens.json',
 'tokenizer/tokenizer.json')

In [None]:
# Label maps to be written into the saved model config.
# id2label uses string keys because JSON object keys are always strings.
id2label = {
    str(i): label for i, label in enumerate(label_list)
}
# label2id must map each label name to an *integer* class id; storing str(i)
# here would leave string ids in the config for anything that reads label2id.
label2id = {
    label: i for i, label in enumerate(label_list)
}

In [None]:
import json


# Read the saved config through a context manager so the file handle is closed
# deterministically, then attach the human-readable label maps.
with open("bert-ner/config.json") as config_file:
    config = json.load(config_file)
config["id2label"] = id2label
config["label2id"] = label2id

In [None]:
# Write the patched config back; the context manager guarantees the data is
# flushed and the file closed before the model is reloaded from this directory.
with open("bert-ner/config.json", "w") as config_file:
    json.dump(config, config_file)

In [None]:
model_fine_tuned = AutoModelForTokenClassification.from_pretrained("bert-ner")

In [None]:
from transformers import pipeline

# Build an "ner" pipeline from the reloaded fine-tuned model and the tokenizer
# used during training, then run it on a raw (unsplit) sentence.
nlp = pipeline("ner", model=model_fine_tuned, tokenizer=tokenizer)
example = "My name is Clara and I live in Berkeley, California."
ner_results = nlp(example)

print(ner_results)

[{'entity': 'B-PER', 'score': 0.9938585, 'index': 4, 'word': 'clara', 'start': 11, 'end': 16}, {'entity': 'B-LOC', 'score': 0.99393773, 'index': 9, 'word': 'berkeley', 'start': 31, 'end': 39}, {'entity': 'B-LOC', 'score': 0.99586004, 'index': 11, 'word': 'california', 'start': 41, 'end': 51}]


In [None]:
example = "My name is Clara and I live in Berkeley, California."
nlp(example)


[{'entity': 'B-PER',
  'score': 0.9938585,
  'index': 4,
  'word': 'clara',
  'start': 11,
  'end': 16},
 {'entity': 'B-LOC',
  'score': 0.99393773,
  'index': 9,
  'word': 'berkeley',
  'start': 31,
  'end': 39},
 {'entity': 'B-LOC',
  'score': 0.99586004,
  'index': 11,
  'word': 'california',
  'start': 41,
  'end': 51}]

In [None]:
model_fine_tuned.push_to_hub("fahmiaziz/bert-ner")

CommitInfo(commit_url='https://huggingface.co/fahmiaziz/bert-ner/commit/d1ae4baeeac285d4e20469e05ad882ec095b3aaa', commit_message='Upload BertForTokenClassification', commit_description='', oid='d1ae4baeeac285d4e20469e05ad882ec095b3aaa', pr_url=None, pr_revision=None, pr_num=None)