In [2]:
import os
import itertools
import pandas as pd
import numpy as np
from datasets import Dataset
from datasets import load_metric
from transformers import AutoTokenizer
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer
from transformers import DataCollatorForTokenClassification
import torch


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
label_list = ['O','B-MISC','I-MISC','B-PER','I-PER','B-ORG','I-ORG','B-LOC','I-LOC']
label_encoding_dict = {'I-PRG': 2,'I-I-MISC': 2, 'I-OR': 6, 'O': 0, 'I-': 0, 'VMISC': 0, 'B-PER': 3,
                       'I-PER': 4, 'B-ORG': 5, 'I-ORG': 6, 'B-LOC': 7, 'I-LOC': 8, 'B-MISC': 1, 'I-MISC': 2}

task = "ner" 
model_checkpoint = "distilbert-base-uncased"
batch_size = 16
    
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [None]:
def get_all_tokens_and_ner_tags(directory):
    return pd.concat([get_tokens_and_ner_tags(os.path.join(directory, filename)) 
                      for filename in os.listdir(directory)]).reset_index().drop('index', axis=1)
    
def get_tokens_and_ner_tags(filename):
    with open(filename, 'r', encoding="utf8") as f:
        lines = f.readlines()
        split_list = [list(y) for x, y in itertools.groupby(lines, lambda z: z == '\n') if not x]
        tokens = [[x.split('\t')[0] for x in y] for y in split_list]
        entities = [[x.split('\t')[1][:-1] for x in y] for y in split_list] 
    return pd.DataFrame({'tokens': tokens, 'ner_tags': entities})
  
def get_un_token_dataset(train_directory, test_directory):
    train_df = get_all_tokens_and_ner_tags(train_directory)
    test_df = get_all_tokens_and_ner_tags(test_directory)
    train_dataset = Dataset.from_pandas(train_df)
    test_dataset = Dataset.from_pandas(test_df)
    return (train_dataset, test_dataset)

In [4]:
train_dataset, test_dataset = get_un_token_dataset('./UN-named-entity-recognition/tagged-training/',
                                                   './UN-named-entity-recognition/tagged-test/')

In [9]:
pd.DataFrame(train_dataset)

Unnamed: 0,tokens,ner_tags
0,"[Since, the, last, time, we, gathered, here, i...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
1,"[We, have, witnessed, the, hope-inspiring, glo...","[O, O, O, O, O, O, O, O, O, O, O, I-MISC, I-MI..."
2,"[We, have, a, growing, global, understanding, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
3,"[Since, I, spoke, at, this, rostrum, in, the, ...","[O, O, O, O, O, O, O, O, O, O, O, O, I-ORG, I-..."
4,"[Terrorist, forces, have, significantly, incre...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
...,...,...
3652,"[He, should, also, be, informed, that, the, pe...","[O, O, O, O, O, O, O, O, O, I-LOC, O, O, O, O,..."
3653,"[We, want, to, be, left, in, peace, to, carry,...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
3654,"[We, refuse, to, be, an, extension, of, Europe...","[O, O, O, O, O, O, O, I-LOC, O]"
3655,"[We, are, Africans, and, shall, remain, so, fo...","[O, O, I-MISC, O, O, O, O, O, O]"


In [6]:
pd.DataFrame(test_dataset)

Unnamed: 0,tokens,ner_tags
0,"[May, I, ,, at, the, outset, ,, convey, my, he...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
1,"[I, have, no, doubt, that, your, experience, a...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
2,"[I, also, want, to, take, this, opportunity, t...","[O, O, O, O, O, O, O, O, O, O, O, O, O, I-PER,..."
3,"[These, General, Assembly, meetings, provide, ...","[O, I-ORG, I-ORG, O, O, O, O, O, O, O, O, O, O..."
4,"[Today, ,, the, international, community, face...","[O, O, O, O, O, O, O, O, O, O, O, I-MISC, I-MI..."
...,...,...
2069,"[I, believe, that, the, new, millennium, is, o...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
2070,"[I, believe, that, an, international, peace, s...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
2071,"[We, must, join, our, strength, to, build, an,...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
2072,"[A, stable, ,, secure, ,, and, prosperous, wor...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."


In [11]:
def tokenize_and_align_labels(examples):
    label_all_tokens = True
    tokenized_inputs = tokenizer(list(examples["tokens"]), truncation=True, is_split_into_words=True)

    labels = []
    for i, label in enumerate(examples[f"{task}_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)
            elif label[word_idx] == '0':
                label_ids.append(0)
            elif word_idx != previous_word_idx:
                label_ids.append(label_encoding_dict[label[word_idx]])
            else:
                label_ids.append(label_encoding_dict[label[word_idx]] if label_all_tokens else -100)
            previous_word_idx = word_idx
        labels.append(label_ids)
        
    tokenized_inputs["labels"] = labels
    return tokenized_inputs

In [12]:
train_tokenized_datasets = train_dataset.map(tokenize_and_align_labels, batched=True)
test_tokenized_datasets = test_dataset.map(tokenize_and_align_labels, batched=True)

100%|██████████| 4/4 [00:00<00:00,  5.23ba/s]
100%|██████████| 3/3 [00:00<00:00,  6.56ba/s]


In [14]:
model = AutoModelForTokenClassification.from_pretrained(model_checkpoint, num_labels=len(label_list))

args = TrainingArguments(
    f"test-{task}",
    evaluation_strategy = "epoch",
    learning_rate=1e-4,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=3,
    weight_decay=1e-5,
)

data_collator = DataCollatorForTokenClassification(tokenizer)
metric = load_metric("seqeval")

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForTokenClassification: ['vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_projector.weight', 'vocab_layer_norm.bias', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN t

In [15]:
def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    true_predictions = [[label_list[p] for (p, l) in zip(prediction, label) if l != -100] 
                        for prediction, label in zip(predictions, labels)]
    true_labels = [[label_list[l] for (p, l) in zip(prediction, label) if l != -100] 
                   for prediction, label in zip(predictions, labels)]

    results = metric.compute(predictions=true_predictions, references=true_labels)
    return {"precision": results["overall_precision"],
            "recall": results["overall_recall"],
            "f1": results["overall_f1"],
            "accuracy": results["overall_accuracy"]}

In [16]:
trainer = Trainer(model,
                  args,
                  train_dataset=train_tokenized_datasets,
                  eval_dataset=test_tokenized_datasets,
                  data_collator=data_collator,
                  tokenizer=tokenizer,
                  compute_metrics=compute_metrics)
trainer.train()
trainer.evaluate()
trainer.save_model('un-ner.model')

The following columns in the training set don't have a corresponding argument in `DistilBertForTokenClassification.forward` and have been ignored: ner_tags, tokens. If ner_tags, tokens are not expected by `DistilBertForTokenClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 3657
  Num Epochs = 3
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 687
  Number of trainable parameters = 66369801
  0%|          | 0/687 [00:00<?, ?it/s]You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
 33%|███▎      | 229/687 [11:15<19:14,  2.52s/it]The following columns in the evaluation set don't have a corresponding argument in `DistilBertForTok

{'eval_loss': 0.052030667662620544, 'eval_precision': 0.8221218961625282, 'eval_recall': 0.8442280945757997, 'eval_f1': 0.833028362305581, 'eval_accuracy': 0.9825567168874751, 'eval_runtime': 108.987, 'eval_samples_per_second': 19.03, 'eval_steps_per_second': 1.193, 'epoch': 1.0}


 67%|██████▋   | 458/687 [24:34<09:23,  2.46s/it]  The following columns in the evaluation set don't have a corresponding argument in `DistilBertForTokenClassification.forward` and have been ignored: ner_tags, tokens. If ner_tags, tokens are not expected by `DistilBertForTokenClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 2074
  Batch size = 16
                                                 
 67%|██████▋   | 458/687 [26:20<09:23,  2.46s/it]

{'eval_loss': 0.053790051490068436, 'eval_precision': 0.8344640434192673, 'eval_recall': 0.8553546592489569, 'eval_f1': 0.8447802197802198, 'eval_accuracy': 0.9842521061278708, 'eval_runtime': 106.337, 'eval_samples_per_second': 19.504, 'eval_steps_per_second': 1.223, 'epoch': 2.0}


 73%|███████▎  | 500/687 [28:21<08:51,  2.84s/it]  Saving model checkpoint to test-ner\checkpoint-500
Configuration saved in test-ner\checkpoint-500\config.json


{'loss': 0.0661, 'learning_rate': 2.7219796215429405e-05, 'epoch': 2.18}


Model weights saved in test-ner\checkpoint-500\pytorch_model.bin
tokenizer config file saved in test-ner\checkpoint-500\tokenizer_config.json
Special tokens file saved in test-ner\checkpoint-500\special_tokens_map.json
100%|██████████| 687/687 [37:22<00:00,  2.52s/it]The following columns in the evaluation set don't have a corresponding argument in `DistilBertForTokenClassification.forward` and have been ignored: ner_tags, tokens. If ner_tags, tokens are not expected by `DistilBertForTokenClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 2074
  Batch size = 16
                                                 
100%|██████████| 687/687 [39:06<00:00,  2.52s/it]

Training completed. Do not forget to share your model on huggingface.co/models =)


100%|██████████| 687/687 [39:06<00:00,  3.42s/it]
The following columns in the evaluation set don't have a corresponding argument in `DistilBertForTokenClassification.forward` and have bee

{'eval_loss': 0.0575423538684845, 'eval_precision': 0.8445945945945946, 'eval_recall': 0.8692628650904033, 'eval_f1': 0.8567511994516792, 'eval_accuracy': 0.9854231481805152, 'eval_runtime': 104.063, 'eval_samples_per_second': 19.93, 'eval_steps_per_second': 1.249, 'epoch': 3.0}
{'train_runtime': 2346.5426, 'train_samples_per_second': 4.675, 'train_steps_per_second': 0.293, 'train_loss': 0.05203347469868445, 'epoch': 3.0}


100%|██████████| 130/130 [01:41<00:00,  1.28it/s]
Saving model checkpoint to un-ner.model
Configuration saved in un-ner.model\config.json
Model weights saved in un-ner.model\pytorch_model.bin
tokenizer config file saved in un-ner.model\tokenizer_config.json
Special tokens file saved in un-ner.model\special_tokens_map.json


## Predict on new sentences

In [21]:
tokenizer = AutoTokenizer.from_pretrained('./un-ner.model/')

loading file vocab.txt
loading file tokenizer.json
loading file added_tokens.json
loading file special_tokens_map.json
loading file tokenizer_config.json


In [22]:
paragraph = '''Before proceeding further, I should like to inform members that action on draft resolution iv, entitled situation of human rights of Rohingya Muslims and other minorities in Myanmar is postponed to a later date to allow time for the review of its programme budget implications by the fifth committee. The assembly will take action on draft resolution iv as soon as the report of the fifth committee on the programme budget implications is available. I now give the floor to delegations wishing to deliver explanations of vote or position before voting or adoption.'''
tokens = tokenizer(paragraph)
torch.tensor(tokens['input_ids']).unsqueeze(0).size()

torch.Size([1, 103])

In [23]:
model = AutoModelForTokenClassification.from_pretrained('./un-ner.model/', num_labels=len(label_list))
predictions = model.forward(input_ids=torch.tensor(tokens['input_ids']).unsqueeze(0), attention_mask=torch.tensor(tokens['attention_mask']).unsqueeze(0))
predictions = torch.argmax(predictions.logits.squeeze(), axis=1)
predictions = [label_list[i] for i in predictions]

loading configuration file ./un-ner.model/config.json
Model config DistilBertConfig {
  "_name_or_path": "./un-ner.model/",
  "activation": "gelu",
  "architectures": [
    "DistilBertForTokenClassification"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5",
    "6": "LABEL_6",
    "7": "LABEL_7",
    "8": "LABEL_8"
  },
  "initializer_range": 0.02,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2,
    "LABEL_3": 3,
    "LABEL_4": 4,
    "LABEL_5": 5,
    "LABEL_6": 6,
    "LABEL_7": 7,
    "LABEL_8": 8
  },
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "torch_dtype": "float32",
  "transformers_version": "4.26.1",
  "

In [24]:
words = tokenizer.batch_decode(tokens['input_ids'])
pd.DataFrame({'ner': predictions, 'words': words}).to_csv('un_ner.csv')