In [1]:
! pip3 install numpy torch torchvision torchaudio transformers datasets seqeval evaluate



In [2]:
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments
from transformers import Trainer, DataCollatorForTokenClassification
from datasets import load_dataset
import evaluate
import numpy as np


In [3]:
# Tag vocabulary of the token classifier. A tag's position in this list is its
# integer id, so the order must match id2label / label2id.
# NOTE(review): the "L-"/"V-" prefixes presumably mark the word naming a
# quantity and the word carrying its value -- confirm against the dataset card.
label_list = [
    "O",
    "L-DEMO",
    "L-BA", "V-BA",
    "L-GROUND",
    "L-BALL",
    "L-SPEED", "V-SPEED",
    "L-DIR", "V-DIR",
    "L-BRAKE",
    "L-STEER", "V-STEER",
    "L-THROTTLE", "V-THROTTLE",
    "L-BOOST",
    "L-POS",
]

In [4]:
# Integer class id -> tag name, passed to the model config.
# The ids are assigned by position, in the same order as label_list.
id2label = dict(
    enumerate(
        (
            "O",
            "L-DEMO",
            "L-BA", "V-BA",
            "L-GROUND",
            "L-BALL",
            "L-SPEED", "V-SPEED",
            "L-DIR", "V-DIR",
            "L-BRAKE",
            "L-STEER", "V-STEER",
            "L-THROTTLE", "V-THROTTLE",
            "L-BOOST",
            "L-POS",
        )
    )
)

In [5]:
# Tag name -> integer class id (the inverse of id2label), passed to the model config.
# Ids follow the position of each tag, in the same order as label_list.
label2id = {
    tag: index
    for index, tag in enumerate(
        [
            "O",
            "L-DEMO",
            "L-BA", "V-BA",
            "L-GROUND",
            "L-BALL",
            "L-SPEED", "V-SPEED",
            "L-DIR", "V-DIR",
            "L-BRAKE",
            "L-STEER", "V-STEER",
            "L-THROTTLE", "V-THROTTLE",
            "L-BOOST",
            "L-POS",
        ]
    )
}

In [6]:
# Load the "cw1521/nl-st" dataset via datasets.load_dataset; the result holds
# the train / validation / test splits used in the rest of the notebook.
dataset = load_dataset("cw1521/nl-st")

In [7]:
# Display the splits with their column names and row counts.
dataset

DatasetDict({
    train: Dataset({
        features: ['ner_ids', 'ner_sentence', 'ner_tags', 'sentence', 'state', 'tokens'],
        num_rows: 865328
    })
    validation: Dataset({
        features: ['ner_ids', 'ner_sentence', 'ner_tags', 'sentence', 'state', 'tokens'],
        num_rows: 247238
    })
    test: Dataset({
        features: ['ner_ids', 'ner_sentence', 'ner_tags', 'sentence', 'state', 'tokens'],
        num_rows: 123619
    })
})

In [8]:
# Keep the first training row around as a small sample for the inspection cells below.
example = dataset["train"][0]

In [9]:
# Tokenizer of the "distilbert-base-uncased" checkpoint, the same checkpoint the model is loaded from later.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")



In [10]:
# `example` (the first training row) is already bound by an earlier cell, so it
# is not fetched a second time here.

# Tokenize the row's pre-split words; is_split_into_words=True tells the
# tokenizer that the input is already a list of words rather than raw text.
tokenized_input = tokenizer(example["tokens"], is_split_into_words=True)

# Map the ids back to token strings to see how each word was split into pieces.
tokens = tokenizer.convert_ids_to_tokens(tokenized_input["input_ids"])

tokens

['[CLS]',
 'i',
 "'",
 'm',
 'heading',
 'east',
 '.',
 'i',
 'cu',
 '##rre',
 '##nl',
 '##y',
 'have',
 '34',
 'percent',
 'boost',
 '.',
 'i',
 "'",
 'm',
 'not',
 'in',
 'the',
 'air',
 '.',
 'i',
 "'",
 'm',
 'in',
 'quadrant',
 '4',
 '.',
 'i',
 "'",
 'm',
 'about',
 'to',
 'turn',
 'right',
 'and',
 'i',
 "'",
 'm',
 'travelling',
 'forwards',
 '.',
 'i',
 "'",
 'm',
 'travelling',
 '140',
 '##7',
 'miles',
 'per',
 'hour',
 '.',
 '[SEP]']

In [11]:
# Show the raw tokenizer output (input_ids and attention_mask) for the sample row.
tokenized_input

{'input_ids': [101, 1045, 1005, 1049, 5825, 2264, 1012, 1045, 12731, 14343, 20554, 2100, 2031, 4090, 3867, 12992, 1012, 1045, 1005, 1049, 2025, 1999, 1996, 2250, 1012, 1045, 1005, 1049, 1999, 29371, 1018, 1012, 1045, 1005, 1049, 2055, 2000, 2735, 2157, 1998, 1045, 1005, 1049, 8932, 19390, 1012, 1045, 1005, 1049, 8932, 8574, 2581, 2661, 2566, 3178, 1012, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [12]:
def tokenize_and_align_labels(examples):
    """Tokenize a batch of pre-split sentences and align word labels to tokens.

    Args:
        examples: A batch with a "tokens" entry (one list of words per row)
            and an "ner_ids" entry (one list of per-word label ids per row).

    Returns:
        The tokenizer output for the batch with an added "labels" entry that
        holds one label list per row, parallel to that row's tokens. The
        first token of every word carries the word's label id; special tokens
        and the remaining pieces of a word are set to -100 (positions with
        this value are dropped again in compute_metrics).
    """
    encoded = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    aligned = []
    for row, word_tags in enumerate(examples["ner_ids"]):
        last_word = None
        row_labels = []
        # word_ids() gives, per token, the index of the word it came from
        # (None for special tokens).
        for word in encoded.word_ids(batch_index=row):
            starts_new_word = word is not None and word != last_word
            row_labels.append(word_tags[word] if starts_new_word else -100)
            last_word = word
        aligned.append(row_labels)

    encoded["labels"] = aligned
    return encoded

In [13]:
# Tokenize and label-align the dataset; batched=True hands
# tokenize_and_align_labels a batch of rows per call instead of a single row.
tokenized_dataset = dataset.map(tokenize_and_align_labels, batched=True)

Map:   0%|          | 0/865328 [00:00<?, ? examples/s]

In [28]:
# Peek at the first few tokenized training rows only. Indexing the whole
# "input_ids" column would print every one of the 865,328 training sequences
# into the notebook output.
tokenized_dataset["train"][:3]["input_ids"]

[[101,
  1045,
  1005,
  1049,
  5825,
  2264,
  1012,
  1045,
  12731,
  14343,
  20554,
  2100,
  2031,
  4090,
  3867,
  12992,
  1012,
  1045,
  1005,
  1049,
  2025,
  1999,
  1996,
  2250,
  1012,
  1045,
  1005,
  1049,
  1999,
  29371,
  1018,
  1012,
  1045,
  1005,
  1049,
  2055,
  2000,
  2735,
  2157,
  1998,
  1045,
  1005,
  1049,
  8932,
  19390,
  1012,
  1045,
  1005,
  1049,
  8932,
  8574,
  2581,
  2661,
  2566,
  3178,
  1012,
  102],
 [101,
  1045,
  1005,
  1049,
  8932,
  13412,
  2581,
  2661,
  2566,
  3178,
  1012,
  1045,
  1005,
  1049,
  3810,
  2187,
  1998,
  1045,
  1005,
  1049,
  3048,
  19390,
  1012,
  1045,
  1005,
  1049,
  1999,
  29371,
  1017,
  1012,
  1045,
  1005,
  1049,
  2747,
  8932,
  4643,
  1012,
  1045,
  1005,
  1049,
  2006,
  1996,
  2598,
  1012,
  1045,
  2031,
  28043,
  1012,
  102],
 [101,
  1045,
  1005,
  1049,
  2025,
  1999,
  1996,
  2250,
  1012,
  2026,
  2783,
  12992,
  2003,
  2260,
  1012,
  1045,
  1005,
  1049,


In [15]:
# Batch collator for token classification, built from the same tokenizer; it is handed to the Trainer below.
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [16]:
# Translate the sample row's per-word label ids into their tag names.
labels = [label_list[tag_id] for tag_id in example["ner_ids"]]

In [17]:
# Show the tag names of the sample row, one per word.
labels

['O',
 'L-DIR',
 'V-DIR',
 'O',
 'O',
 'L-BA',
 'O',
 'V-BA',
 'O',
 'L-BA',
 'O',
 'O',
 'L-GROUND',
 'O',
 'O',
 'L-GROUND',
 'O',
 'O',
 'O',
 'L-POS',
 'L-POS',
 'O',
 'O',
 'O',
 'O',
 'L-STEER',
 'V-STEER',
 'O',
 'O',
 'L-THROTTLE',
 'V-THROTTLE',
 'O',
 'O',
 'O',
 'V-SPEED',
 'L-SPEED',
 'L-SPEED',
 'L-SPEED',
 'O']

In [18]:

# Load the "seqeval" metric through the evaluate library; compute_metrics calls its compute() method.
seqeval = evaluate.load("seqeval")


Downloading builder script:   0%|          | 0.00/6.34k [00:00<?, ?B/s]

In [19]:

def compute_metrics(p):
    """Score a batch of predictions with seqeval.

    Args:
        p: A ``(predictions, labels)`` pair. ``predictions`` holds per-class
            scores with the class dimension on axis 2; ``labels`` holds the
            reference label ids, with -100 marking positions to skip.

    Returns:
        A dict with the keys "precision", "recall", "f1" and "accuracy",
        taken from seqeval's overall results.
    """
    scores, reference_ids = p
    predicted_ids = np.argmax(scores, axis=2)

    true_predictions = []
    true_labels = []
    for predicted_row, reference_row in zip(predicted_ids, reference_ids):
        # Keep only the positions that carry a real label (not -100).
        kept = [
            (predicted, reference)
            for predicted, reference in zip(predicted_row, reference_row)
            if reference != -100
        ]
        true_predictions.append([label_list[predicted] for predicted, _ in kept])
        true_labels.append([label_list[reference] for _, reference in kept])

    results = seqeval.compute(predictions=true_predictions, references=true_labels)

    summary = {}
    summary["precision"] = results["overall_precision"]
    summary["recall"] = results["overall_recall"]
    summary["f1"] = results["overall_f1"]
    summary["accuracy"] = results["overall_accuracy"]
    return summary

In [20]:
# Load the pretrained "distilbert-base-uncased" checkpoint with a token
# classification head. The number of classes is taken from the id2label
# mapping instead of a hard-coded 17, so the head size cannot drift from the
# tag vocabulary if a tag is added or removed.
model = AutoModelForTokenClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=len(id2label),
    id2label=id2label,
    label2id=label2id,
)

Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [21]:
# Training configuration: one epoch, evaluating and checkpointing at the end of
# each epoch, and reloading the best checkpoint when training finishes.
training_args = TrainingArguments(
    output_dir="nl-ner",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=1,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    # The recorded run of this notebook ended in a CUDA out-of-memory error on
    # a GPU with about 4 GiB. Per the TrainingArguments documentation, when
    # this option is unset all evaluation outputs are accumulated on the
    # accelerator before being moved to the CPU; setting it moves them over
    # every N prediction steps instead.
    # NOTE(review): it is not confirmed that evaluation was the phase that ran
    # out of memory -- if training itself does, reduce the batch size as well.
    eval_accumulation_steps=100,
)



In [22]:
# Fine-tune on the train split and evaluate on the validation split.
# The test split is deliberately not used here: training_args sets
# load_best_model_at_end=True, so the evaluation set takes part in checkpoint
# selection, and evaluating on test would leak it into model selection.
# Keep tokenized_dataset["test"] for a final, one-off evaluation.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [23]:
# Run fine-tuning with the Trainer configured above.
trainer.train()

Epoch,Training Loss,Validation Loss


OutOfMemoryError: CUDA out of memory. Tried to allocate 26.00 MiB. GPU 0 has a total capacity of 3.94 GiB of which 17.69 MiB is free. Including non-PyTorch memory, this process has 3.49 GiB memory in use. Of the allocated memory 3.22 GiB is allocated by PyTorch, and 214.99 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)