In [1]:
! pip3 install numpy torch torchvision torchaudio transformers datasets evaluate



In [2]:
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments
from transformers import Trainer, DataCollatorForTokenClassification
from datasets import load_dataset
import numpy as np
from evaluate import load


In [3]:
label_list = [
    "O",
    "LDEMO",
    "LB",
    "VBA",
    "LGROUND",
    "LBALL",
    "LSP",
    "VSPEED",
    "LD",
    "VDIR",
    "LBRAKE",
    "LSTE",
    "VSTEER",
    "LTHROT",
    "VTHROTTLE",
    "LBOOST",
    "LPOS"
]

In [4]:
# id2label = {
#     0: "O",
#     1: "L-DEMO",
#     2: "L-BA",
#     3: "V-BA",
#     4: "L-GROUND",
#     5: "L-BALL",
#     6: "L-SPEED",
#     7: "V-SPEED",
#     8: "L-DIR",
#     9: "V-DIR",
#     10: "L-BRAKE",
#     11: "L-STEER",
#     12: "V-STEER",
#     13: "L-THROTTLE",
#     14: "V-THROTTLE",
#     15: "L-BOOST",
#     16: "L-POS"
# }

In [5]:
# label2id = {
#     "O": 0,
#     "L-DEMO": 1,
#     "L-BA": 2,
#     "V-BA": 3,
#     "L-GROUND": 4,
#     "L-BALL": 5,
#     "L-SPEED": 6,
#     "V-SPEED": 7,
#     "L-DIR": 8,
#     "V-DIR": 9,
#     "L-BRAKE": 10,
#     "L-STEER": 11,
#     "V-STEER": 12,
#     "L-THROTTLE": 13,
#     "V-THROTTLE": 14,
#     "L-BOOST": 15,
#     "L-POS": 16
# }

In [6]:
dataset = load_dataset("cw1521/nl-st")
dataset["train"] = dataset["train"].shard(10, 0)
dataset["validation"] = dataset["validation"].shard(10, 0)
dataset["test"] = dataset["test"].shard(10, 0)

In [7]:
dataset

DatasetDict({
    train: Dataset({
        features: ['ner_ids', 'ner_sentence', 'ner_tags', 'sentence', 'state', 'tokens'],
        num_rows: 86533
    })
    validation: Dataset({
        features: ['ner_ids', 'ner_sentence', 'ner_tags', 'sentence', 'state', 'tokens'],
        num_rows: 24724
    })
    test: Dataset({
        features: ['ner_ids', 'ner_sentence', 'ner_tags', 'sentence', 'state', 'tokens'],
        num_rows: 12362
    })
})

In [8]:
example = dataset["train"][0]

In [9]:
model_name = "bert-base-cased"

In [10]:
tokenizer = AutoTokenizer.from_pretrained(model_name)



In [11]:
example = dataset["train"][0]

tokenized_input = tokenizer(example["tokens"], is_split_into_words=True)



tokens = tokenizer.convert_ids_to_tokens(tokenized_input["input_ids"])

tokens

['[CLS]',
 'I',
 "'",
 'm',
 'heading',
 'east',
 '.',
 'I',
 'cu',
 '##rre',
 '##n',
 '##ly',
 'have',
 '34',
 'percent',
 'boost',
 '.',
 'I',
 "'",
 'm',
 'not',
 'in',
 'the',
 'air',
 '.',
 'I',
 "'",
 'm',
 'in',
 'q',
 '##uad',
 '##rant',
 '4',
 '.',
 'I',
 "'",
 'm',
 'about',
 'to',
 'turn',
 'right',
 'and',
 'I',
 "'",
 'm',
 'travelling',
 'forwards',
 '.',
 'I',
 "'",
 'm',
 'travelling',
 '140',
 '##7',
 'miles',
 'per',
 'hour',
 '.',
 '[SEP]']

In [12]:
tokenized_input.word_ids()

[None,
 0,
 0,
 0,
 1,
 2,
 3,
 4,
 5,
 5,
 5,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 11,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 17,
 17,
 18,
 19,
 19,
 19,
 20,
 21,
 22,
 22,
 22,
 23,
 24,
 25,
 26,
 27,
 28,
 28,
 28,
 29,
 30,
 31,
 32,
 32,
 32,
 33,
 34,
 34,
 35,
 36,
 37,
 38,
 None]

In [13]:
# def tokenize_and_align_labels(examples):
#     tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

#     labels = []
#     for i, label in enumerate(examples[f"ner_ids"]):
#         word_ids = tokenized_inputs.word_ids(batch_index=i)  # Map tokens to their respective word.
#         previous_word_idx = None
#         label_ids = []
#         for word_idx in word_ids:  # Set the special tokens to -100.
#             if word_idx is None:
#                 label_ids.append(-100)
#             elif word_idx != previous_word_idx:  # Only label the first token of a given word.
#                 label_ids.append(label[word_idx])
#             else:
#                 label_ids.append(-100)
#             previous_word_idx = word_idx
#         labels.append(label_ids)

#     tokenized_inputs["labels"] = labels
#     return tokenized_inputs

In [14]:
words = dataset["train"][0]["tokens"]
labels = dataset["train"][0]["ner_ids"]
line1 = ""
line2 = ""
for word, label in zip(words, labels):
    full_label = label_list[label]
    max_length = max(len(word), len(full_label))
    line1 += word + " " * (max_length - len(word) + 1)
    line2 += full_label + " " * (max_length - len(full_label) + 1)
print(line1)
print(line2)

I'm heading east . I currenly have 34  percent boost . I'm not     in the air     . I'm in quadrant 4    . I'm about to turn right  and I'm travelling forwards  . I'm travelling 1407   miles per hour . 
O   LD      VDIR O O LB       O    VBA O       LB    O O   LGROUND O  O   LGROUND O O   O  LPOS     LPOS O O   O     O  LSTE VSTEER O   O   LTHROT     VTHROTTLE O O   O          VSPEED LSP   LSP LSP  O 


In [15]:
def align_labels_with_tokens(labels, word_ids):
    new_labels = []
    current_word = None
    for word_id in word_ids:
        if word_id != current_word:
            # Start of a new word!
            current_word = word_id
            label = -100 if word_id is None else labels[word_id]
            new_labels.append(label)
        elif word_id is None:
            # Special token
            new_labels.append(-100)
        else:
            # Same word as previous token
            label = labels[word_id]
            # # If the label is B-XXX we change it to I-XXX
            # if label % 2 == 1:
            #     label += 1
            # new_labels.append(label)

    return new_labels

In [16]:
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples["tokens"], truncation=True, is_split_into_words=True
    )
    all_labels = examples["ner_ids"]
    new_labels = []
    for i, labels in enumerate(all_labels):
        word_ids = tokenized_inputs.word_ids(i)
        new_labels.append(align_labels_with_tokens(labels, word_ids))

    tokenized_inputs["labels"] = new_labels
    return tokenized_inputs

In [17]:
tokenized_dataset = dataset.map(tokenize_and_align_labels, batched=True, remove_columns=dataset["train"].column_names)

In [18]:
tokenized_dataset["train"][0]

{'input_ids': [101,
  146,
  112,
  182,
  5312,
  1746,
  119,
  146,
  16408,
  11604,
  1179,
  1193,
  1138,
  3236,
  3029,
  14112,
  119,
  146,
  112,
  182,
  1136,
  1107,
  1103,
  1586,
  119,
  146,
  112,
  182,
  1107,
  186,
  18413,
  6922,
  125,
  119,
  146,
  112,
  182,
  1164,
  1106,
  1885,
  1268,
  1105,
  146,
  112,
  182,
  9169,
  22453,
  119,
  146,
  112,
  182,
  9169,
  8183,
  1559,
  1829,
  1679,
  2396,
  119,
  102],
 'token_type_ids': [0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0],
 'attention_mask': [1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
 

In [19]:
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [20]:
labels = [label_list[i] for i in example[f"ner_ids"]]

In [21]:
labels

['O',
 'LD',
 'VDIR',
 'O',
 'O',
 'LB',
 'O',
 'VBA',
 'O',
 'LB',
 'O',
 'O',
 'LGROUND',
 'O',
 'O',
 'LGROUND',
 'O',
 'O',
 'O',
 'LPOS',
 'LPOS',
 'O',
 'O',
 'O',
 'O',
 'LSTE',
 'VSTEER',
 'O',
 'O',
 'LTHROT',
 'VTHROTTLE',
 'O',
 'O',
 'O',
 'VSPEED',
 'LSP',
 'LSP',
 'LSP',
 'O']

In [22]:
batch = data_collator([tokenized_dataset["train"][i] for i in range(50)])
batch["labels"]

tensor([[-100,    0,    8,  ..., -100, -100, -100],
        [-100,    0,    0,  ..., -100, -100, -100],
        [-100,    0,   11,  ..., -100, -100, -100],
        ...,
        [-100,    0,    0,  ..., -100, -100, -100],
        [-100,    0,    8,  ..., -100, -100, -100],
        [-100,    0,    4,  ..., -100, -100, -100]])

In [23]:
metric = load("seqeval")

In [24]:

predictions = labels.copy()
predictions[2] = "O"
metric.compute(predictions=[predictions], references=[labels], zero_division=0.5)



{'B': {'precision': np.float64(1.0),
  'recall': np.float64(1.0),
  'f1': np.float64(1.0),
  'number': np.int64(2)},
 'BA': {'precision': np.float64(1.0),
  'recall': np.float64(1.0),
  'f1': np.float64(1.0),
  'number': np.int64(1)},
 'D': {'precision': np.float64(1.0),
  'recall': np.float64(1.0),
  'f1': np.float64(1.0),
  'number': np.int64(1)},
 'DIR': {'precision': np.float64(1.0),
  'recall': np.float64(0.0),
  'f1': np.float64(0.0),
  'number': np.int64(1)},
 'GROUND': {'precision': np.float64(1.0),
  'recall': np.float64(1.0),
  'f1': np.float64(1.0),
  'number': np.int64(2)},
 'POS': {'precision': np.float64(1.0),
  'recall': np.float64(1.0),
  'f1': np.float64(1.0),
  'number': np.int64(1)},
 'SP': {'precision': np.float64(1.0),
  'recall': np.float64(1.0),
  'f1': np.float64(1.0),
  'number': np.int64(1)},
 'SPEED': {'precision': np.float64(1.0),
  'recall': np.float64(1.0),
  'f1': np.float64(1.0),
  'number': np.int64(1)},
 'STE': {'precision': np.float64(1.0),
  'recall'

In [25]:

def compute_metrics(p):
    metric = load("seqeval")
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    true_predictions = [[label_list[p] for (p, l) in zip(prediction, label) if l != -100] for prediction, label in zip(predictions, labels)]
    true_labels = [[label_list[l] for (p, l) in zip(prediction, label) if l != -100] for prediction, label in zip(predictions, labels)]

    results = metric.compute(predictions=true_predictions, references=true_labels, zero_division=0.5)
    return {
        "precision": results["overall_precision"], 
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"]
    }

In [26]:
id2label = {i: label for i, label in enumerate(label_list)}
label2id = {v: k for k, v in id2label.items()}

In [27]:
model = AutoModelForTokenClassification.from_pretrained(
    model_name, num_labels=17, id2label=id2label, label2id=label2id
)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [28]:
training_args = TrainingArguments(
    output_dir="nl-ner",
    learning_rate=2e-5,
    per_device_train_batch_size=512,
    per_device_eval_batch_size=512,
    num_train_epochs=1,
    weight_decay=0.01,
    save_steps=50,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_steps=50
    # load_best_model_at_end=True
)



In [29]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [30]:
trainer.train()

Epoch,Training Loss,Validation Loss
