In [1]:
from datasets import load_dataset

dataset = load_dataset("conll2003")

Reusing dataset conll2003 (/Users/filippovarini/.cache/huggingface/datasets/conll2003/conll2003/1.0.0/9a4d16a94f8674ba3466315300359b0acd891b68b6c8743ddf60b9c702adce98)


  0%|          | 0/3 [00:00<?, ?it/s]

### Dataset Exploration

In [27]:
dataset["train"].features["ner_tags"]
dataset["train"].features["ner_tags"].feature.names

['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']

# Preprocessing
You need to pass to the model a `dictionary` of *only* two things: `words` and `labels`

In [3]:
from transformers import AutoTokenizer

model_checkpoint = "bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [10]:
# Add is_split_into_word = True
inputs = tokenizer(dataset["train"][0]["tokens"], is_split_into_words=True)

In [16]:
# Match tokens with word ids
print(inputs.tokens())
print(inputs.word_ids())

['[CLS]', 'EU', 'rejects', 'German', 'call', 'to', 'boycott', 'British', 'la', '##mb', '.', '[SEP]']
[None, 0, 1, 2, 3, 4, 5, 6, 7, 7, 8, None]


### Align tokens and labels
With a tiny bit of work, we can then expand our label list to match the tokens. The first rule we’ll apply is that special tokens get a label of -100. This is because by default -100 is an index that is ignored in the loss function we will use (cross entropy). Then, each token gets the same label as the token that started the word it’s inside, since they are part of the same entity. For tokens inside a word but not at the beginning, we replace the B- with I- (since the token does not begin the entity):

In [17]:
IGNORED_LABEL = -100

In [24]:
def align_labels_with_tokens(labels, word_ids):
    new_labels = []
    current_word_id = None
    for word_id in word_ids:
        if word_id is None:
            # Special token
            new_labels.append(IGNORED_LABEL)
            continue
            
        if word_id != current_word_id:
            # Start of a new word!
            current_word_id = word_id
            new_labels.append(labels[word_id])
        else:
            # Same word as previous token
            label = labels[word_id]
            # If the label is B-XXX we change it to I-XXX
            # using the fact that B is at odd pos
            if label % 2 == 1:
                label += 1
            new_labels.append(label)

    return new_labels

In [30]:
labels = dataset["train"][0]["ner_tags"]
word_ids = inputs.word_ids()
print(labels)
print(align_labels_with_tokens(labels, word_ids))

[3, 0, 7, 0, 0, 0, 7, 0, 0]
[-100, 3, 0, 7, 0, 0, 0, 7, 0, 0, 0, -100]


In [35]:
def tokenize_and_align_labels(examples):
    """Tokenize input and align with the labels. all_labels is 
    the list of [labels] for each instance of the dataset. 
    
    Note that here .word_ids needs the index of the instance to get
    the word ids from as we tokenized in batch."""
     tokenized_inputs = tokenizer(
        examples["tokens"], truncation=True, is_split_into_words=True
    )
    all_labels = examples["ner_tags"]
    new_labels = []
    for i, labels in enumerate(all_labels):
        word_ids = tokenized_inputs.word_ids(i)
        new_labels.append(align_labels_with_tokens(labels, word_ids))

    tokenized_inputs["labels"] = new_labels
    return tokenized_inputs

In [37]:
# We use the dataset.map function that maps over each dataset["train/test/validate"] automatically!
""" NOTE: we use remove_columns. When you remove a column, it is only removed after the example has 
been provided to the mapped function. This allows the mapped function to use the content of the 
columns before they are removed."""
tokenize_dataset = dataset.map(tokenize_and_align_labels, batched=True, remove_columns=dataset["train"].column_names)

  0%|          | 0/15 [00:00<?, ?ba/s]

  0%|          | 0/4 [00:00<?, ?ba/s]

  0%|          | 0/4 [00:00<?, ?ba/s]

In [39]:
tokenize_dataset["train"][100]

{'input_ids': [101,
  16890,
  25473,
  11690,
  1110,
  14042,
  1146,
  1117,
  1858,
  1112,
  9088,
  119,
  102],
 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
 'labels': [-100, 1, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, -100]}

In [40]:
dataset["train"].column_names

['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags']

# Data Collation
Now we have the tokenized dataset, but every instance is of different length. 
Therefore, we need to perform **data collation**. 

In this case, we use `DataCollatorForTokenClassification` to add padding not only to the tokens, but also to the labels, to align them

In [41]:
from transformers import DataCollatorForTokenClassification

data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [49]:
# Test it
batch = data_collator([tokenize_dataset["train"][i] for i in range(2)])
batch["labels"]

tensor([[-100,    3,    0,    7,    0,    0,    0,    7,    0,    0,    0, -100],
        [-100,    1,    2, -100, -100, -100, -100, -100, -100, -100, -100, -100]])

# Metric
We want our `Trainer` to evaluate the metric at each epoch, and learn from them. For token classification, `seqeval` is the traditional framework

In [52]:
pip install seqeval

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
[0m

In [54]:
import evaluate

metric = evaluate.load("seqeval")

> NOTE: This metric does not behave like the standard accuracy: it will actually take the lists of labels as strings, not integers, 
so we will need to fully decode the predictions and labels before passing them to the metric. 

In [58]:
label_names = dataset["train"].features["ner_tags"].feature.names
labels = dataset["train"][0]["ner_tags"]
labels = [label_names[i] for i in labels]
labels

['B-ORG', 'O', 'B-MISC', 'O', 'O', 'O', 'B-MISC', 'O', 'O']

In [59]:
# Test a fake prediction
predictions = labels.copy()
predictions[2] = "O" # insert error
metric.compute(predictions=[predictions], references= [labels])

{'MISC': {'precision': 1.0,
  'recall': 0.5,
  'f1': 0.6666666666666666,
  'number': 2},
 'ORG': {'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'number': 1},
 'overall_precision': 1.0,
 'overall_recall': 0.6666666666666666,
 'overall_f1': 0.8,
 'overall_accuracy': 0.8888888888888888}

# Compute metrics function
The `compute_metrics` function is what is used to evaluate the model at each epoc. It takes an `EvalPrediction` object that contains:
- `predictions` in the form of logits *A `logit` is an array of probability of each label for a given token.* 
- `label_ids` the actual results
Therefore, if we predict one instance, `predictions` will be 2D: `[[logits_token1], [logits_token2], ...]` and `label_ids` will be 1D `[label_token1, label_token2, ...]`

In [61]:
import numpy as np


def compute_metrics(eval_preds):
    logits, labels = eval_preds
    
    print(logits,labels)
    # Get the most probable label by taking the argmax of the logits. In this case the 
    # logits are for the whole dataset. Therefore, we argmax on the last axis=-1 to be 
    # sure to take the row-wise for the instance.
    predictions = np.argmax(logits, axis=-1)
    print(predictions)

    # Remove ignored index (special tokens) and convert to labels
    true_labels = [[label_names[l] for l in label if l != -100] for label in labels]
    true_predictions = [
        [label_names[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    print(true_predictions)
    all_metrics = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": all_metrics["overall_precision"],
        "recall": all_metrics["overall_recall"],
        "f1": all_metrics["overall_f1"],
        "accuracy": all_metrics["overall_accuracy"],
    }

# Load the Model
We want to `AutoLoad` the model. Since we are performing classification task, we must use the `AutoModelForTokenClassification` class.

We need to specify the labels. We can do it by passing `label2id` and `id2label`

In [62]:
id2label = {str(i): label for i, label in enumerate(label_names)}
label2id = {v: k for k, v in id2label.items()}

In [66]:
from transformers import AutoModelForTokenClassification

model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    id2label = id2label,
    label2id = label2id
)

Downloading pytorch_model.bin:   0%|          | 0.00/416M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForTokenClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cas

> NOTE: the warning says that some weights have been randomly set $\rightarrow$ we need to train it!

# Train the model
> NOTE: If you want to push to hub, We need to login to hugging face

Why should I save my model?
While the training happens, each time the model is saved (here, every epoch) it is uploaded to the Hub in the background. This way, you will be able to to resume your training on another machine if necessary.

In [71]:
from huggingface_hub import notebook_login
notebook_login()

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Login successful
Your token has been saved to /Users/filippovarini/.huggingface/token
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
[1m[31mAuthenticated through git-credential store but this isn't the helper defined on your machine.
You might have to re-authenticate when pushing to the Hugging Face Hub. Run the following command in your terminal in case you want to set this credential helper as the default

git config --global credential.helper store[0m


In [74]:
from transformers import TrainingArguments

# Set HYPERPARAMETERS!
args = TrainingArguments(
    "bert-finetuned-ner",           # Output-dir where predictions will be written
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    num_train_epochs=3,
    weight_decay=0.01,
    push_to_hub=True
#     hub_model_id="name-of-repo"      # By default same name as output-dir
)

In [76]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenize_dataset["train"],
    eval_dataset=tokenize_dataset["validation"],
    data_collator=data_collator,
    compute_metrics= compute_metrics,
    tokenizer=tokenizer
)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Cloning https://huggingface.co/varinoskij/bert-finetuned-ner into local empty directory.


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

In [None]:
# Now, let's train!
trainer.train()

***** Running training *****
  Num examples = 14041
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 5268


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.0888,0.075274,0.91358,0.934029,0.923691,0.980927


***** Running Evaluation *****
  Num examples = 3250
  Batch size = 8


[[[ 6.2220645e+00 -5.4900855e-01 -1.7422593e+00 ... -1.4417371e+00
   -1.5177847e-01  2.7877894e-01]
  [ 7.4457636e+00 -6.6507077e-01 -1.7666076e+00 ... -1.6523329e+00
   -6.3868892e-01 -9.9472336e-02]
  [ 6.9196234e+00 -1.6217829e+00  4.9500793e-01 ... -8.0439705e-01
   -2.2267303e+00  8.4461546e-01]
  ...
  [-1.0000000e+02 -1.0000000e+02 -1.0000000e+02 ... -1.0000000e+02
   -1.0000000e+02 -1.0000000e+02]
  [-1.0000000e+02 -1.0000000e+02 -1.0000000e+02 ... -1.0000000e+02
   -1.0000000e+02 -1.0000000e+02]
  [-1.0000000e+02 -1.0000000e+02 -1.0000000e+02 ... -1.0000000e+02
   -1.0000000e+02 -1.0000000e+02]]

 [[ 7.5574007e+00 -6.9704777e-01 -2.1817431e+00 ... -1.0223368e+00
   -4.6605745e-01 -1.0871850e+00]
  [-9.5453456e-02  5.6740099e-01 -1.9258015e+00 ... -1.3232652e+00
   -9.8212552e-01 -1.9256837e+00]
  [ 6.8924701e-01 -2.6000779e+00 -7.1997643e-01 ...  7.6813955e+00
   -2.6718538e+00  3.2962540e-01]
  ...
  [-1.0000000e+02 -1.0000000e+02 -1.0000000e+02 ... -1.0000000e+02
   -1.0000

Saving model checkpoint to bert-finetuned-ner/checkpoint-1756
Configuration saved in bert-finetuned-ner/checkpoint-1756/config.json
Model weights saved in bert-finetuned-ner/checkpoint-1756/pytorch_model.bin
tokenizer config file saved in bert-finetuned-ner/checkpoint-1756/tokenizer_config.json
Special tokens file saved in bert-finetuned-ner/checkpoint-1756/special_tokens_map.json
tokenizer config file saved in bert-finetuned-ner/tokenizer_config.json
Special tokens file saved in bert-finetuned-ner/special_tokens_map.json


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

In [None]:
# Once the training is complete, we use the push_to_hub() 
# method to make sure we upload the most recent version of the model:
trainer.push_to_hub(commit_message="Training complete")