In [None]:
!pip install transformers datasets

In [None]:
from datasets import load_dataset

In [None]:
# Load the CoNLL-2003 dataset; the result is indexed by split name below (e.g. data['train']).
data = load_dataset('conll2003')

In [None]:
# Display the loaded dataset object.
data

In [None]:
# First example of the training split.
data['train'][0]

In [None]:
# Feature schema of the training split.
data['train'].features

In [None]:
# Feature definition of the 'ner_tags' column.
data["train"].features['ner_tags']

In [None]:
# Names of the NER tag classes; later cells index this list by label id.
data['train'].features['ner_tags'].feature.names

In [None]:
# save for later: label_names[i] is the tag string for label id i
label_names = data['train'].features['ner_tags'].feature.names

In [None]:
from transformers import AutoTokenizer

# cased DistilBERT checkpoint; a BERT checkpoint could be tried here instead
checkpoint = 'distilbert-base-cased'
# load the tokenizer that belongs to the chosen checkpoint
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [None]:
idx = 0  # index of the training example used in the demonstrations below
# the example is already a list of words, hence is_split_into_words=True
t = tokenizer(data['train'][idx]['tokens'], is_split_into_words=True)
t

In [None]:
# Check the type of the object returned by the tokenizer.
type(t)

In [None]:
# Token strings produced for the example.
t.tokens()

In [None]:
# an entry with value i means that token belongs to the i'th word of the
# input sentence (counting from 0); None marks tokens that belong to no word
t.word_ids()

In [None]:
# Label order: O, B-PER, I-PER, B-ORG, I-ORG, B-LOC, I-LOC, B-MISC, I-MISC
# Each B-<tag> id (odd) maps to the I-<tag> id that directly follows it.
begin2inside = {begin: begin + 1 for begin in (1, 3, 5, 7)}


def align_targets(labels, word_ids):
    """Expand word-level labels to token-level labels.

    Args:
        labels: one label id per word of the sentence.
        word_ids: one entry per token giving the index of the word the token
            came from, or None for tokens that belong to no word (for example
            the [CLS] and [SEP] special tokens).

    Returns:
        A list with one label per token: -100 where the word id is None, the
        word's label for the first token of a word, and for every further
        token of the same word the word's label with B-<tag> replaced by
        I-<tag>.
    """
    aligned = []
    previous = None

    for current in word_ids:
        if current is None:
            # no underlying word, so no real label
            aligned.append(-100)
        else:
            tag = labels[current]
            if current == previous:
                # continuation of the previous word: B-<tag> becomes I-<tag>
                tag = begin2inside.get(tag, tag)
            aligned.append(tag)
        previous = current

    return aligned

In [None]:
# try the function on the example tokenized above
labels = data['train'][idx]['ner_tags']  # word-level label ids of the example
word_ids = t.word_ids()
aligned_targets = align_targets(labels, word_ids)  # token-level label ids (-100 where there is no word)
aligned_targets

In [None]:
# map token-level label ids to tag names (None for the -100 positions)
# and print each token next to its tag
aligned_labels = [
    None if target < 0 else label_names[target] for target in aligned_targets
]
for x, y in zip(t.tokens(), aligned_labels):
    print(f"{x}\t{y}")

In [None]:
# hand-made tokenization used to exercise align_targets on words that are
# split into several tokens
words = ['[CLS]', "Ger", "##man", "call", "to", "boycott", "Micro", "##soft", "[SEP]"]
word_ids = [None, 0, 0, 1, 2, 3, 4, 4, None]
labels = [7, 0, 0, 0, 3]
aligned_targets = align_targets(labels, word_ids)
aligned_labels = [
    None if target < 0 else label_names[target] for target in aligned_targets
]
for x, y in zip(words, aligned_labels):
    print(f"{x}\t{y}")

In [None]:
# tokenize both inputs and targets
def tokenize_fn(batch):
    """Tokenize a batch of pre-split sentences and attach token-level labels.

    Args:
        batch: mapping with a 'tokens' entry (the sentences, each a list of
            words) and an 'ner_tags' entry (the word-level label ids).

    Returns:
        The tokenizer output for the batch with an extra 'labels' entry that
        holds, per sentence, the labels aligned to the tokens.
    """
    # the tokenizer call fills in the model inputs (input_ids, attention_mask, ...)
    encoded = tokenizer(
        batch['tokens'], truncation=True, is_split_into_words=True
    )

    # recall: the targets must be stored under the key "labels"
    encoded['labels'] = [
        align_targets(word_labels, encoded.word_ids(row))
        for row, word_labels in enumerate(batch['ner_tags'])
    ]
    return encoded

In [None]:
# these columns are to be removed after tokenization: they are neither
# model inputs nor targets
data['train'].column_names

In [None]:
# run tokenize_fn over the dataset in batches and drop the original columns
tokenized_datasets = data.map(
    tokenize_fn,
    batched=True,
    remove_columns=data['train'].column_names,
)

In [None]:
# Display the tokenized dataset object.
tokenized_datasets

In [None]:
from transformers import DataCollatorForTokenClassification

# collator that turns a list of tokenized examples into one batch
# NOTE(review): per the transformers docs it pads both inputs and labels — confirm
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [None]:
# Slice of the first two training examples.
# NOTE(review): presumably column-oriented (one list per column) — confirm against the displayed output
tokenized_datasets['train'][0:2]

In [None]:
# The same two examples as a list with one entry per example; this is the
# form passed to the collator in the next cell.
[tokenized_datasets['train'][i] for i in range(2)]

In [None]:
# example: collate the first two training examples into one batch
# and look at its labels
batch = data_collator([tokenized_datasets['train'][i] for i in range(2)])
batch['labels']

In [None]:
!pip install seqeval 

In [None]:
from datasets import load_metric 

# seqeval metric object; its compute(predictions=..., references=...) method
# is used in the cells below
# NOTE(review): load_metric is deprecated in newer `datasets` releases in
# favour of the separate `evaluate` package — confirm against the installed version
metric = load_metric('seqeval')

In [None]:
# test it out with flat lists: this errors, because the metric expects
# batches of sequences (a list of lists). The error is caught and printed so
# that the demonstration does not stop a "Restart & Run All".
try:
    print(metric.compute(
        predictions = [0, 0, 0],
        references = [0, 0, 1]
    ))
except Exception as err:  # the exact exception type depends on the library version
    print(f"{type(err).__name__}: {err}")

In [None]:
# test it out with a batch (list of lists) of integer labels
metric.compute(
    predictions = [[0, 0, 0]], 
    references = [[0, 0, 1]]
)
# warnings: the labels are not tags (they must be strings)

In [None]:
# string labels that are not tags
metric.compute(
    predictions = [["A", "A", "A"]], 
    references = [["A", "B", "A"]]
)
# warns again, because the labels are not tags

In [None]:
# proper tags: "O" plus B-/I- prefixed entity types
metric.compute(
    predictions = [["O", "O", "I-ORG", "B-MISC"]], 
    references = [["O", "B-ORG", "I-ORG", "B-MISC"]]
)
# the metric does special computations based on the IOB format of the tags

In [None]:
import numpy as np 

def compute_metrics(logits_and_labels):
    """Score predictions with the seqeval metric.

    Args:
        logits_and_labels: a pair (logits, labels). The predicted label id is
            the argmax of the logits over the last axis; labels holds the
            target ids, with -100 at positions to be ignored.

    Returns:
        A dict with the keys 'precision', 'recall', 'f1' and 'accuracy',
        taken from the metric's 'overall_*' entries.
    """
    logits, labels = logits_and_labels
    preds = np.argmax(logits, axis=-1)

    # targets as tag names, skipping the -100 positions
    str_labels = []
    for label_row in labels:
        str_labels.append([label_names[t] for t in label_row if t != -100])

    # predictions as tag names, skipping positions whose target is -100
    str_preds = []
    for pred_row, label_row in zip(preds, labels):
        str_preds.append(
            [label_names[p] for p, t in zip(pred_row, label_row) if t != -100]
        )

    scores = metric.compute(predictions=str_preds, references=str_labels)
    return {
        name: scores[f"overall_{name}"]
        for name in ("precision", "recall", "f1", "accuracy")
    }


In [None]:
# lookup tables between label ids and tag names, in both directions
id2label = dict(enumerate(label_names))
label2id = {name: index for index, name in id2label.items()}

In [None]:
from transformers import AutoModelForTokenClassification

# token-classification model loaded from the same checkpoint as the
# tokenizer; the id<->label mappings are passed along with it
model = AutoModelForTokenClassification.from_pretrained(
    checkpoint, 
    id2label = id2label, 
    label2id = label2id
)

In [None]:
from transformers import TrainingArguments

# training configuration: evaluate and save once per epoch, 3 epochs
training_args = TrainingArguments(
    "distilbert-finetuned-ner",  # presumably the output directory (first positional argument) — confirm
    evaluation_strategy='epoch',  # NOTE(review): newer transformers releases name this eval_strategy — confirm against the installed version
    save_strategy='epoch', 
    learning_rate=2e-5, 
    num_train_epochs=3, 
    weight_decay=0.01
)

In [None]:
from transformers import Trainer

# wire together the model, the training arguments, the tokenized train and
# validation splits, the collator and the metric function
trainer = Trainer(
    model=model, 
    args = training_args, 
    train_dataset=tokenized_datasets['train'], 
    eval_dataset=tokenized_datasets['validation'], 
    data_collator=data_collator, 
    compute_metrics=compute_metrics, 
    tokenizer=tokenizer
)

# run the fine-tuning
trainer.train()

In [None]:
# save the fine-tuned model under 'my_saved_model'; the pipeline cell below
# loads it from that path
trainer.save_model('my_saved_model')

In [None]:
from transformers import pipeline

import torch

# Token-classification pipeline built from the model saved above.
# Run on the first GPU when one is available and fall back to the CPU
# (device=-1) otherwise; a hard-coded device=0 fails on machines without CUDA.
ner = pipeline(
    "token-classification",
    model = 'my_saved_model',
    aggregation_strategy='simple',
    device=0 if torch.cuda.is_available() else -1
)

In [None]:
# run the pipeline on a sample sentence
s = "Bill Gates was the CEO of Microsoft in Seattle, Washington."
ner(s)