In [None]:
!pip install transformers datasets

In [None]:
import nltk 
from nltk.corpus import brown

In [None]:
# Fetch the NLTK resources this notebook reads: the 'brown' corpus and the
# 'universal_tagset' resource (the corpus is loaded with tagset='universal'
# in the next cell).
nltk.download('brown')
nltk.download('universal_tagset')

In [None]:
# Tagged sentences from the Brown corpus, requested with the 'universal'
# tagset. The loop further down unpacks each sentence as (token, tag) pairs.
corpus = brown.tagged_sents(tagset='universal')
corpus

In [None]:
# Split every tagged sentence into two parallel lists:
#   inputs[i]  -> list of the words of sentence i
#   targets[i] -> list of the tags of sentence i (same length as inputs[i])
inputs = []
targets = []

for sentence_tag_pairs in corpus:
    tokens = [token for token, _ in sentence_tag_pairs]
    target = [tag for _, tag in sentence_tag_pairs]

    # Store the whole word list. Appending the loop variable `token` instead
    # would keep only the last word of each sentence as a bare string.
    inputs.append(tokens)
    targets.append(target)

In [None]:
# Dump the dataset as JSON lines: one {'inputs': ..., 'targets': ...} object
# per sentence, each on its own line of data.json.
import json

with open('data.json', 'w') as out_file:
    for sentence_inputs, sentence_targets in zip(inputs, targets):
        record = json.dumps({'inputs': sentence_inputs, 'targets': sentence_targets})
        out_file.write(f"{record} \n")

In [None]:
from datasets import load_dataset

In [None]:
# Read the JSON-lines file written above back in through the `datasets`
# library; later cells access the rows via data['train'].
data = load_dataset("json", data_files = "data.json")

In [None]:
data

In [None]:
# Work on a subset: shuffle the 'train' rows with a fixed seed, then keep the
# first 20,000 of them.
small = data['train'].shuffle(seed = 42).select(range(20_000))
small

In [None]:
# Split the subset into train/test parts with a fixed seed.
# NOTE: this rebinds `data`; from here on data['train'] refers to the train
# part of `small`, not to the full dataset loaded from data.json.
data = small.train_test_split(seed=42) 

In [None]:
data["train"][0]

In [None]:
data['train'].features

In [None]:
# Collect every distinct tag that occurs anywhere in `targets`;
# these are the classes that get mapped to integer ids next.
target_set = set()
for target in targets:
    target_set = target_set | set(target)
target_set

In [None]:
# Build the id <-> tag lookup tables used by the model and by align_targets.
# The tags are sorted so that each tag gets the same id on every run; the
# iteration order of a set of strings can change between interpreter sessions.
target_list = sorted(target_set)
id2label = dict(enumerate(target_list))
label2id = {label: label_id for label_id, label in id2label.items()}

In [None]:
from transformers import AutoTokenizer

# also try using bert 
# `checkpoint` names the pretrained weights; the same name is reused further
# down when the token-classification model is loaded.
checkpoint = "distilbert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint) 

In [None]:
# Tokenize a single training example so its encoding can be inspected in the
# following cells. is_split_into_words=True is passed because 'inputs' is
# expected to hold the sentence as a list of words rather than one string.
idx = 0 
t = tokenizer(data['train'][idx]['inputs'], is_split_into_words=True) 
t

In [None]:
type(t) 

In [None]:
t.tokens()

In [None]:
# A value of i means the token belongs to the i'th word
# of the input sentence (counting from 0); tokens that belong
# to no word (e.g. [CLS]) are reported as None.
t.word_ids()

In [None]:
def align_targets(labels, word_ids): 
    """Expand word-level tags to one integer label per token.

    Args:
        labels: tag names of one sentence, indexed by word position.
        word_ids: for every token, the index of the word it belongs to, or
            None for tokens that belong to no word (e.g. [CLS]).

    Returns:
        A list with one entry per element of `word_ids`: the id from the
        module-level `label2id` for the token's word, or -100 where the word
        id is None. Every sub-token of a word receives that word's label.
    """
    return [
        -100 if word_index is None else label2id[labels[word_index]]
        for word_index in word_ids
    ]

In [None]:
# try our function 
# Uses the same example (`idx`) and encoding (`t`) as the cells above, so the
# resulting list has one label id per token of that example.
labels = data['train'][idx]['targets'] 
word_ids = t.word_ids()
aligned_targets = align_targets(labels, word_ids) 
aligned_targets

In [None]:
# Translate the aligned ids back to tag names (negative ids, i.e. the -100
# placeholders, become None) and print each token next to its tag.
aligned_labels = [
    None if label_id < 0 else id2label[label_id]
    for label_id in aligned_targets
]
for token_text, tag_name in zip(t.tokens(), aligned_labels):
    print(f"{token_text}\t{tag_name}")

In [None]:
# tokenize both inputs and targets
def tokenize_fn(batch):
    """Tokenize a batch of pre-split sentences and attach token-level labels.

    Args:
        batch: mapping with an 'inputs' entry (one word list per sentence)
            and a 'targets' entry (one tag list per sentence).

    Returns:
        The tokenizer output for batch['inputs'], extended with a 'labels'
        entry holding, for each sentence, the result of `align_targets`.
    """
    # Produces input_ids, attention_mask, etc. for the whole batch.
    encoded = tokenizer(
        batch['inputs'], truncation=True, is_split_into_words=True
    )

    # The targets must live under the key 'labels'; word_ids(row) gives the
    # token -> word mapping of sentence `row` within the batch.
    encoded['labels'] = [
        align_targets(sentence_labels, encoded.word_ids(row))
        for row, sentence_labels in enumerate(batch['targets'])
    ]

    return encoded

In [None]:
# want to remove these from model inputs - they are neither inputs nor targets
data['train'].column_names

In [None]:
# Apply tokenize_fn to both splits. batched=True because tokenize_fn works on
# a batch (it looks up word_ids per row index); the original columns are
# removed so only the fields returned by tokenize_fn remain.
tokenized_datasets = data.map(
    tokenize_fn, 
    batched = True, 
    remove_columns = data["train"].column_names, 
)

In [None]:
tokenized_datasets

In [None]:
from transformers import DataCollatorForTokenClassification

# Collator handed to the Trainer below to assemble examples into batches.
# NOTE(review): presumably pads both the inputs and the 'labels' lists to a
# common length per batch - confirm against the transformers documentation.
data_collator = DataCollatorForTokenClassification(tokenizer = tokenizer) 

In [None]:
def flatten(list_of_lists): 
    """Concatenate the items of every sub-iterable into one flat list.

    Only one level of nesting is removed; the order of the items is kept.
    """
    flattened = []
    for sublist in list_of_lists:
        flattened.extend(sublist)
    return flattened

In [None]:
import numpy as np 
from sklearn.metrics import f1_score, accuracy_score

def compute_metrics(logits_and_labels): 
    """Compute token-level accuracy and macro F1, skipping -100 labels.

    Args:
        logits_and_labels: pair (logits, labels). The predicted class of a
            token is the argmax of `logits` over the last axis; `labels`
            holds the true class ids per sequence, with -100 marking
            positions that must not be scored.

    Returns:
        dict with the keys "f1" (macro-averaged) and "accuracy".
    """
    logits, labels = logits_and_labels
    preds = np.argmax(logits, axis=-1)

    # Keep only the true labels that are not the -100 placeholder ...
    kept_labels = flatten(
        [[gold for gold in label_row if gold != -100] for label_row in labels]
    )

    # ... and the predictions at exactly those same positions.
    kept_preds = flatten(
        [
            [pred for pred, gold in zip(pred_row, label_row) if gold != -100]
            for pred_row, label_row in zip(preds, labels)
        ]
    )

    return {
        "f1": f1_score(kept_labels, kept_preds, average='macro'),
        "accuracy": accuracy_score(kept_labels, kept_preds),
    }

In [None]:
# Hand-made check of compute_metrics: one sequence of 7 tokens and 3 classes.
# The first and last positions are labelled -100 and are therefore not scored.
# The argmax of the logits is 0 for the first three rows and 1 for the rest,
# so 4 of the 5 scored positions match their label.
labels = [[-100, 0, 0, 1, 2, 1, -100]]
logits = np.array([[
    [0.8, 0.1, 0.1], 
    [0.8, 0.1, 0.1],
    [0.8, 0.1, 0.1], 
    [0.1, 0.8, 0.1], 
    [0.1, 0.8, 0.1], 
    [0.1, 0.8, 0.1], 
    [0.1, 0.8, 0.1]
]])

compute_metrics((logits, labels))

In [None]:
from transformers import AutoModelForTokenClassification

# Load the pretrained checkpoint for token classification and pass it the
# id <-> tag tables built earlier in the notebook.
model = AutoModelForTokenClassification.from_pretrained(
    checkpoint, 
    id2label = id2label, 
    label2id = label2id
)

In [None]:
from transformers import Trainer, TrainingArguments

# Evaluate and save a checkpoint once per epoch, for two epochs in total.
# The first argument is the directory name used for the training output.
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` - confirm against the installed version.
training_args = TrainingArguments(
    "distilbert-finetuned-ner",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    num_train_epochs=2,
)

# Wire the model, the tokenized splits, the batch collator and the metric
# function together, then start fine-tuning.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['test'],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
)
trainer.train()

In [None]:
# Save the trained model under 'my_saved_model'; the pipeline cell below
# loads it again from that same path.
trainer.save_model('my_saved_model')

In [None]:
import torch
from transformers import pipeline 

# Build an inference pipeline from the model saved above. Use the first GPU
# when one is available and fall back to the CPU (device=-1) otherwise, so
# the cell also runs on machines without CUDA.
pipe = pipeline(
    "token-classification", 
    model="my_saved_model", 
    device=0 if torch.cuda.is_available() else -1,
)

In [None]:
# Run the saved model on a raw (untokenized) example sentence; the pipeline
# output is displayed as the cell result.
s = "Bill Gates was the CEO of Microsoft in Seattle, Washington" 
pipe(s)