In [1]:
# Example sentence transformed throughout the thesis methodology section

# 1. Original CanSat sentence
# 2. Word wise split with IOB format
# 3. Tokenized WordPiece split with word-ids
# 4. Word-id aligned
# 5. Early model iteration inference with logit and prediction
# 6. Late model iteration inference with logit and prediction
# 7. Query strategy calculations on sentence
# 8. F1 evaluation metric

In [2]:
# General packages
import numpy as np
import pandas as pd

# HuggingFace packages
from datasets import load_dataset
from datasets import concatenate_datasets
from transformers import AutoTokenizer
from transformers import Trainer
from transformers import TrainingArguments
from transformers import DataCollatorForTokenClassification
from transformers import AutoModelForTokenClassification
import evaluate
from evaluate import evaluator
from tqdm.notebook import trange

import torch
from numba import jit

In [3]:
# Tokenizer of the cased BERT base checkpoint; every tokenization step in
# this notebook (word-id demo, label alignment, inference) goes through it.
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

In [8]:
# Toy batch of two pre-split sentences: the CanSat example sentence and a
# two-word filler. `ner_tags` carries one integer tag id per word.
tokens = [
    ["The", "Payload", "shall", "resist", "an", "acceleration", "of", "at", "least", "60Gs", "of", "shock"],
    ["test", "test"],
]
ner_tags = [
    [0, 9, 0, 1, 0, 3, 0, 5, 6, 8, 0, 0],
    [0, 0],
]
data = dict(tokens=tokens, ner_tags=ner_tags)

In [9]:
# 3. Tokenized WordPiece split with word-ids
# Tokenize the whole toy batch; the input is already split into words, so the
# tokenizer only breaks individual words into sub-word pieces.
test = tokenizer(data['tokens'], truncation=False, is_split_into_words=True)
# For sentence 0: the index of the source word for every produced token.
# None marks tokens that map to no word; a repeated index marks a split word.
test.word_ids(0)

[None, 0, 1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 9, 9, 10, 11, None]

In [23]:
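# Alignment function
# - special tokens (word id None) get the label -100
# - when a word labelled B-XXX has been split into several tokens, the first
#   token keeps B-XXX and the following tokens of that word become I-XXX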

def align_labels_with_tokens(labels, word_ids):
    """Expand word-level label ids to one label id per token.

    Args:
        labels: label ids, one per word of the sentence.
        word_ids: for every token, the index of the word it was produced
            from, or None when the token belongs to no word (special token).

    Returns:
        A list with one entry per element of ``word_ids``: -100 for tokens
        without a word, the word's label for the first token of a word, and
        for further tokens of the same word that label with odd ids raised
        by one (B-XXX -> I-XXX in this label scheme).
    """
    aligned = []
    previous_word = None
    for word in word_ids:
        if word is None:
            # Special token
            aligned.append(-100)
        elif word != previous_word:
            # First token of a new word keeps the word's label unchanged
            aligned.append(labels[word])
        else:
            # Continuation token of the previous word: B-XXX ids are odd and
            # the matching I-XXX id is the next integer
            tag = labels[word]
            aligned.append(tag + 1 if tag % 2 == 1 else tag)
        previous_word = word

    return aligned

In [24]:
# Scale align_labels_with_tokens up to entire datasets: works on a batch, i.e. a list of lists [[...], [...]]

def tokenize_and_align_labels(examples):
    """Tokenize a batch of pre-split sentences and attach token-level labels.

    Args:
        examples: mapping with ``"tokens"`` (one list of words per sentence)
            and ``"ner_tags"`` (one list of word-level label ids per sentence).

    Returns:
        The tokenizer output for the batch with an added ``"labels"`` entry
        holding, per sentence, the labels produced by
        ``align_labels_with_tokens`` for that sentence's word ids.
    """
    batch = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    # Sentence `row` of the batch is aligned against its own word-id sequence.
    batch["labels"] = [
        align_labels_with_tokens(word_labels, batch.word_ids(row))
        for row, word_labels in enumerate(examples["ner_tags"])
    ]
    return batch

In [30]:
# 4. Word-id aligned
# Token-level labels of the first sentence (the CanSat example) after
# tokenizing the toy batch and aligning its word-level tags.
tokenize_and_align_labels(data)['labels'][0]

[-100, 0, 9, 10, 0, 1, 0, 3, 0, 5, 6, 8, 8, 8, 0, 0, -100]

In [32]:
# Load the fine-tuned token-classification checkpoint that the inference
# cells below pass to `inference`.
inference_model = AutoModelForTokenClassification.from_pretrained("dxiao/bert-finetuned-ner-100percent")

In [34]:
def inference(sentence, inference_model):
    """Return per-token label probabilities for one pre-split sentence.

    The forward pass runs in evaluation mode and without gradient tracking,
    so repeated calls on the same input give the same probabilities and no
    autograd graph is built. Inputs are moved to the model's device and the
    result is brought back to the CPU, so the function also works for a
    model that lives on an accelerator.

    Args:
        sentence: list of words (the sentence already split into words).
        inference_model: token-classification model whose output exposes
            ``.logits``.

    Returns:
        A NumPy array with the softmax over the last logits axis for the
        first (only) sentence of the batch: one row per token produced by
        the tokenizer, one column per label.
    """
    encoding = tokenizer(sentence, return_tensors="pt", truncation=True, is_split_into_words=True) #same params as in evalrun

    # Feed the inputs from the same device as the model's weights.
    anchor = next(inference_model.parameters(), None)
    if anchor is not None:
        encoding = {
            key: value.to(anchor.device) if torch.is_tensor(value) else value
            for key, value in encoding.items()
        }

    # Switch off dropout for the forward pass, then restore whatever mode
    # the caller had the model in.
    was_training = inference_model.training
    inference_model.eval()
    try:
        with torch.no_grad():
            logits = inference_model(**encoding).logits
            predictions = torch.nn.functional.softmax(logits, dim=-1)[0]
    finally:
        inference_model.train(was_training)

    # .cpu() is a no-op for CPU tensors and required before .numpy() otherwise.
    return predictions.cpu().numpy()

In [36]:
# Run inference on the example sentence; swap in other pre-split sentences to experiment.
# The trailing [0] keeps only the first token position, whose word id is None
# in the word_ids output above (a special token), so the result is a single
# probability vector over the labels.
inference(data['tokens'][0], inference_model)[0]

array([9.8735946e-01, 1.4781930e-03, 3.4862154e-04, 2.1089052e-03,
       1.2852497e-03, 1.0302414e-03, 5.0840829e-04, 7.2974205e-04,
       3.0446865e-03, 1.0607493e-03, 1.0458402e-03], dtype=float32)