In [1]:
! pip3 install numpy torch torchvision torchaudio transformers datasets evaluate



In [2]:
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments
from transformers import Trainer, DataCollatorForTokenClassification
from datasets import load_dataset
import numpy as np
from evaluate import load


In [3]:
label_list = [
    "O",
    "LDEMO",
    "LB",
    "VBA",
    "LGROUND",
    "LBALL",
    "LSP",
    "VSPEED",
    "LD",
    "VDIR",
    "LBRAKE",
    "LSTE",
    "VSTEER",
    "LTHROT",
    "VTHROTTLE",
    "LBOOST",
    "LPOS"
]

In [4]:
dataset = load_dataset("cw1521/nl-st")
# dataset["train"] = dataset["train"].shard(10, 0)
# dataset["validation"] = dataset["validation"].shard(10, 0)
# dataset["test"] = dataset["test"].shard(10, 0)

In [5]:
dataset

DatasetDict({
    train: Dataset({
        features: ['ner_ids', 'ner_sentence', 'ner_tags', 'sentence', 'state', 'tokens'],
        num_rows: 865328
    })
    validation: Dataset({
        features: ['ner_ids', 'ner_sentence', 'ner_tags', 'sentence', 'state', 'tokens'],
        num_rows: 247238
    })
    test: Dataset({
        features: ['ner_ids', 'ner_sentence', 'ner_tags', 'sentence', 'state', 'tokens'],
        num_rows: 123619
    })
})

In [6]:
example = dataset["train"][0]

In [7]:
model_name = "distilbert-base-uncased"

In [8]:
tokenizer = AutoTokenizer.from_pretrained(model_name)



In [9]:
example = dataset["train"][0]

tokenized_input = tokenizer(example["tokens"], is_split_into_words=True)

tokens = tokenizer.convert_ids_to_tokens(tokenized_input["input_ids"])

tokens

['[CLS]',
 'i',
 "'",
 'm',
 'heading',
 'east',
 '.',
 'i',
 'cu',
 '##rre',
 '##nl',
 '##y',
 'have',
 '34',
 'percent',
 'boost',
 '.',
 'i',
 "'",
 'm',
 'not',
 'in',
 'the',
 'air',
 '.',
 'i',
 "'",
 'm',
 'in',
 'quadrant',
 '4',
 '.',
 'i',
 "'",
 'm',
 'about',
 'to',
 'turn',
 'right',
 'and',
 'i',
 "'",
 'm',
 'travelling',
 'forwards',
 '.',
 'i',
 "'",
 'm',
 'travelling',
 '140',
 '##7',
 'miles',
 'per',
 'hour',
 '.',
 '[SEP]']

In [10]:
tokenized_input.word_ids()

[None,
 0,
 0,
 0,
 1,
 2,
 3,
 4,
 5,
 5,
 5,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 11,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 17,
 17,
 18,
 19,
 20,
 21,
 22,
 22,
 22,
 23,
 24,
 25,
 26,
 27,
 28,
 28,
 28,
 29,
 30,
 31,
 32,
 32,
 32,
 33,
 34,
 34,
 35,
 36,
 37,
 38,
 None]

In [11]:
words = dataset["train"][0]["tokens"]
labels = dataset["train"][0]["ner_ids"]
line1 = ""
line2 = ""
for word, label in zip(words, labels):
    full_label = label_list[label]
    max_length = max(len(word), len(full_label))
    line1 += word + " " * (max_length - len(word) + 1)
    line2 += full_label + " " * (max_length - len(full_label) + 1)
print(line1)
print(line2)

I'm heading east . I currenly have 34  percent boost . I'm not     in the air     . I'm in quadrant 4    . I'm about to turn right  and I'm travelling forwards  . I'm travelling 1407   miles per hour . 
O   LD      VDIR O O LB       O    VBA O       LB    O O   LGROUND O  O   LGROUND O O   O  LPOS     LPOS O O   O     O  LSTE VSTEER O   O   LTHROT     VTHROTTLE O O   O          VSPEED LSP   LSP LSP  O 


In [12]:
def align_labels_with_tokens(labels, word_ids):
    new_labels = []
    current_word = None
    for word_id in word_ids:
        if word_id != current_word:
            # Start of a new word!
            current_word = word_id
            label = -100 if word_id is None else labels[word_id]
            new_labels.append(label)
        elif word_id is None:
            # Special token
            new_labels.append(-100)
        else:
            # Same word as previous token
            label = labels[word_id]

    return new_labels

In [13]:
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples["tokens"], truncation=True, is_split_into_words=True
    )
    all_labels = examples["ner_ids"]
    new_labels = []
    for i, labels in enumerate(all_labels):
        word_ids = tokenized_inputs.word_ids(i)
        new_labels.append(align_labels_with_tokens(labels, word_ids))

    tokenized_inputs["labels"] = new_labels
    return tokenized_inputs

In [14]:
tokenized_dataset = dataset.map(tokenize_and_align_labels, batched=True, remove_columns=dataset["train"].column_names)

Map:   0%|          | 0/865328 [00:00<?, ? examples/s]

Map:   0%|          | 0/247238 [00:00<?, ? examples/s]

Map:   0%|          | 0/123619 [00:00<?, ? examples/s]

In [15]:
tokenized_dataset["train"][0]

{'input_ids': [101,
  1045,
  1005,
  1049,
  5825,
  2264,
  1012,
  1045,
  12731,
  14343,
  20554,
  2100,
  2031,
  4090,
  3867,
  12992,
  1012,
  1045,
  1005,
  1049,
  2025,
  1999,
  1996,
  2250,
  1012,
  1045,
  1005,
  1049,
  1999,
  29371,
  1018,
  1012,
  1045,
  1005,
  1049,
  2055,
  2000,
  2735,
  2157,
  1998,
  1045,
  1005,
  1049,
  8932,
  19390,
  1012,
  1045,
  1005,
  1049,
  8932,
  8574,
  2581,
  2661,
  2566,
  3178,
  1012,
  102],
 'attention_mask': [1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1],
 'labels': [-100,
  0,
  8,
  9,
  0,
  0,
  2,
  0,
  3,
  0,
  2,
  0,
  0,
  4,
  0,
  0,
  4,
  0,
  0,
  0,
  16,
  16,
  0,
  0,
  0,
  0,
  11,
  12,
  0,
  0,
  13,
  14,
  0,
  0,
  0,
  7,
  6,
  6,
  6,
  0,
  -

In [16]:
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [17]:
labels = [label_list[i] for i in example[f"ner_ids"]]

In [18]:
labels

['O',
 'LD',
 'VDIR',
 'O',
 'O',
 'LB',
 'O',
 'VBA',
 'O',
 'LB',
 'O',
 'O',
 'LGROUND',
 'O',
 'O',
 'LGROUND',
 'O',
 'O',
 'O',
 'LPOS',
 'LPOS',
 'O',
 'O',
 'O',
 'O',
 'LSTE',
 'VSTEER',
 'O',
 'O',
 'LTHROT',
 'VTHROTTLE',
 'O',
 'O',
 'O',
 'VSPEED',
 'LSP',
 'LSP',
 'LSP',
 'O']

In [19]:
batch = data_collator([tokenized_dataset["train"][i] for i in range(50)])
batch["labels"]

tensor([[-100,    0,    8,  ..., -100, -100, -100],
        [-100,    0,    0,  ..., -100, -100, -100],
        [-100,    0,    4,  ..., -100, -100, -100],
        ...,
        [-100,    0,    0,  ..., -100, -100, -100],
        [-100,    0,    8,  ..., -100, -100, -100],
        [-100,    0,    0,  ..., -100, -100, -100]])

In [20]:
metric = load("seqeval")

In [21]:

predictions = labels.copy()
predictions[2] = "O"
metric.compute(predictions=[predictions], references=[labels], zero_division=0.5)



{'B': {'precision': np.float64(1.0),
  'recall': np.float64(1.0),
  'f1': np.float64(1.0),
  'number': np.int64(2)},
 'BA': {'precision': np.float64(1.0),
  'recall': np.float64(1.0),
  'f1': np.float64(1.0),
  'number': np.int64(1)},
 'D': {'precision': np.float64(1.0),
  'recall': np.float64(1.0),
  'f1': np.float64(1.0),
  'number': np.int64(1)},
 'DIR': {'precision': np.float64(1.0),
  'recall': np.float64(0.0),
  'f1': np.float64(0.0),
  'number': np.int64(1)},
 'GROUND': {'precision': np.float64(1.0),
  'recall': np.float64(1.0),
  'f1': np.float64(1.0),
  'number': np.int64(2)},
 'POS': {'precision': np.float64(1.0),
  'recall': np.float64(1.0),
  'f1': np.float64(1.0),
  'number': np.int64(1)},
 'SP': {'precision': np.float64(1.0),
  'recall': np.float64(1.0),
  'f1': np.float64(1.0),
  'number': np.int64(1)},
 'SPEED': {'precision': np.float64(1.0),
  'recall': np.float64(1.0),
  'f1': np.float64(1.0),
  'number': np.int64(1)},
 'STE': {'precision': np.float64(1.0),
  'recall'

In [22]:

def compute_metrics(p):
    metric = load("seqeval")
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    true_predictions = [[label_list[p] for (p, l) in zip(prediction, label) if l != -100] for prediction, label in zip(predictions, labels)]
    true_labels = [[label_list[l] for (p, l) in zip(prediction, label) if l != -100] for prediction, label in zip(predictions, labels)]

    results = metric.compute(predictions=true_predictions, references=true_labels, zero_division=0.5)
    return {
        "precision": results["overall_precision"], 
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"]
    }

In [23]:
id2label = {i: label for i, label in enumerate(label_list)}
label2id = {v: k for k, v in id2label.items()}

In [24]:
model = AutoModelForTokenClassification.from_pretrained(
    model_name, num_labels=17, id2label=id2label, label2id=label2id
)

Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [28]:
training_args = TrainingArguments(
    output_dir="nl-ner-1",
    learning_rate=2e-5,
    per_device_train_batch_size=512,
    per_device_eval_batch_size=512,
    num_train_epochs=1,
    weight_decay=1e-7,
    save_steps=50,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_steps=50
    # load_best_model_at_end=True
)



In [29]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [30]:
trainer.train()
trainer.save_model()
trainer.save_state()

Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 