In [1]:
!pip install transformers[torch] datasets tokenizers seqeval




[notice] A new release of pip available: 22.2.2 -> 23.1.2
[notice] To update, run: C:\Python310\python.exe -m pip install --upgrade pip





Load Modules

In [2]:
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments
from transformers import DataCollatorForTokenClassification, Trainer
from datasets import load_dataset, load_metric
import numpy as np


Constants

In [3]:
model_checkpoint = "distilbert-base-uncased"
dataset_name = "cw1521/nl-st"
model_name = "nl-ner-sm-10"

Load Dataset

In [4]:
dataset = load_dataset("cw1521/nl-st")

Using custom data configuration cw1521--nl-st-0535027a99994970
Found cached dataset json (C:/Users/school/.cache/huggingface/datasets/cw1521___json/cw1521--nl-st-0535027a99994970/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab)


  0%|          | 0/3 [00:00<?, ?it/s]

Print Structure of Dataset and Number of Elements

In [5]:
print(f"Structure of Dataset:\n{dataset}")
print(f'Total number of elements in dataset: {len(dataset["train"]) + len(dataset["validation"])}')

Structure of Dataset:
DatasetDict({
    train: Dataset({
        features: ['ner_sentence', 'ner_tags', 'sentence', 'state'],
        num_rows: 865320
    })
    test: Dataset({
        features: ['ner_sentence', 'ner_tags', 'sentence', 'state'],
        num_rows: 123615
    })
    validation: Dataset({
        features: ['ner_sentence', 'ner_tags', 'sentence', 'state'],
        num_rows: 247230
    })
})
Total number of elements in dataset: 1112550


Example Data Item

In [6]:
print(f"Example data item:\n{dataset['train'][0]}")

Example data item:
{'ner_sentence': 'speed 1237 currently braking boost 23 direction east quadrant 2 east wall', 'ner_tags': ['O', 'O', 'L-SPEED', 'O', 'V-SPEED', 'O', 'O', 'L-BRAKE', 'L-BRAKE', 'O', 'O', 'L-BA', 'L-BA', 'O', 'V-BA', 'O', 'O', 'L-DIR', 'L-DIR', 'O', 'V-DIR', 'O', 'O', 'O', 'L-POS', 'L-POS', 'O', 'O', 'L-POS', 'L-POS', 'O'], 'sentence': "My current speed is 1237. I'M currently braking. My current boost is 23. My current direction is east. I'm in quadrant 2 near the east wall.", 'state': 'is_demoed False ball_touched False boost_amount 23 position -4048 3552 239 direction 19 speed 1237 on_ground False throttle 0 steer 0 jump 1 boost 0 handbrake 1'}


In [7]:
label_encoding_dict = {
    "O": 0,
    "L-DEMO": 1,
    "L-BA": 2,
    "V-BA": 3,
    "L-GROUND": 4,
    "L-BALL": 5,
    "L-SPEED": 6,
    "V-SPEED": 7,
    "L-DIR": 8,
    "V-DIR": 9,
    "L-BRAKE": 10,
    "L-STEER": 11,
    "V-STEER": 12,
    "L-THROTTLE": 13,
    "V-THROTTLE": 14,
    "L-BOOST": 15,
    "L-POS": 16
}

In [8]:
label_list = [
    "O",
    "L-DEMO",
    "L-BA",
    "V-BA",
    "L-GROUND",
    "L-BALL",
    "L-SPEED",
    "V-SPEED",
    "L-DIR",
    "V-DIR",
    "L-BRAKE",
    "L-STEER",
    "V-STEER",
    "L-THROTTLE",
    "V-THROTTLE",
    "L-BOOST",
    "L-POS"
]

Load Tokenizer, Model, and Data Collator

In [9]:
max_input = 512

tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, model_max_length=max_input)
model = AutoModelForTokenClassification.from_pretrained(model_checkpoint, num_labels=len(label_encoding_dict))
data_collator = DataCollatorForTokenClassification(tokenizer, return_tensors="pt")

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForTokenClassification: ['vocab_layer_norm.weight', 'vocab_transform.bias', 'vocab_projector.bias', 'vocab_transform.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight']
- This IS expected if you are initializing DistilBertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN t

In [10]:
dataset

DatasetDict({
    train: Dataset({
        features: ['ner_sentence', 'ner_tags', 'sentence', 'state'],
        num_rows: 865320
    })
    test: Dataset({
        features: ['ner_sentence', 'ner_tags', 'sentence', 'state'],
        num_rows: 123615
    })
    validation: Dataset({
        features: ['ner_sentence', 'ner_tags', 'sentence', 'state'],
        num_rows: 247230
    })
})

In [11]:
dataset = dataset.remove_columns(["state", "ner_sentence"])

In [12]:

def tokenize_sentence(sentence):
    return sentence.replace(".", " .").replace("!", " !").split(" ")

def tokenize_and_align_labels(examples):
    label_all_tokens = True
    tokenized_inputs = tokenizer(
        list(map(tokenize_sentence, examples["sentence"])),
        truncation=True,
        padding="max_length",
        max_length=tokenizer.model_max_length,
        is_split_into_words=True
    )
    labels = []
    for i, label in enumerate(examples["ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)
            elif label[word_idx] == 'O':
                label_ids.append(0)
            elif word_idx != previous_word_idx:
                label_ids.append(label_encoding_dict[label[word_idx]])
            else:
                label_ids.append(label_encoding_dict[label[word_idx]] if label_all_tokens else -100)
            previous_word_idx = word_idx
        labels.append(label_ids)
        
    tokenized_inputs["labels"] = labels
    return tokenized_inputs

Tokenize Datasets

In [13]:
train = dataset["train"].map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns=["sentence", "ner_tags"]
)

Loading cached processed dataset at C:/Users/school/.cache/huggingface/datasets/cw1521___json/cw1521--nl-st-0535027a99994970/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab\cache-021e10c334392bab.arrow


In [14]:

valid = dataset["validation"].map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns=["sentence", "ner_tags"]
)

  0%|          | 0/248 [00:00<?, ?ba/s]

In [15]:
valid

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 247230
})

In [16]:
# print(len(valid["input_ids"][0]), len(valid["labels"][0]), len(valid["attention_mask"][0]))

In [17]:
train.set_format(
    type="torch",
    columns=["input_ids", "attention_mask", "labels"],
)

In [18]:
valid.set_format(
    type="torch",
    columns=["input_ids", "attention_mask", "labels"],
)

Metrics

In [19]:
metric = load_metric("seqeval")


def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    true_predictions = [[label_list[p] for (p, l) in zip(prediction, label) if l != -100] for prediction, label in zip(predictions, labels)]
    true_labels = [[label_list[l] for (p, l) in zip(prediction, label) if l != -100] for prediction, label in zip(predictions, labels)]

    results = metric.compute(predictions=true_predictions, references=true_labels)
    return {"precision": results["overall_precision"], "recall": results["overall_recall"], "f1": results["overall_f1"], "accuracy": results["overall_accuracy"]}
    

  metric = load_metric("seqeval")


Trainer Arguments and Trainer

In [20]:
def get_training_args(num_epochs):
    batch_size = 8
    args = TrainingArguments(
        model_name,
        save_steps=50,
        evaluation_strategy = "epoch",
        learning_rate=1e-4,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        weight_decay=1e-5,
        save_total_limit=3,
        num_train_epochs=num_epochs,
        logging_dir='./logs',
    	gradient_accumulation_steps=4,
	    fp16=True
    )
    return args


In [21]:
args = get_training_args(1)

trainer = Trainer(
    model,
    args,
    train_dataset=train,
    eval_dataset=valid,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)


Using cuda_amp half precision backend


Train Model

In [22]:
trainer.train()

***** Running training *****
  Num examples = 865320
  Num Epochs = 1
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 4
  Total optimization steps = 27041


  0%|          | 0/27041 [00:00<?, ?it/s]

You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
Saving model checkpoint to nl-ner-sm-10\checkpoint-50
Configuration saved in nl-ner-sm-10\checkpoint-50\config.json
Model weights saved in nl-ner-sm-10\checkpoint-50\pytorch_model.bin
tokenizer config file saved in nl-ner-sm-10\checkpoint-50\tokenizer_config.json
Special tokens file saved in nl-ner-sm-10\checkpoint-50\special_tokens_map.json
Saving model checkpoint to nl-ner-sm-10\checkpoint-100
Configuration saved in nl-ner-sm-10\checkpoint-100\config.json
Model weights saved in nl-ner-sm-10\checkpoint-100\pytorch_model.bin
tokenizer config file saved in nl-ner-sm-10\checkpoint-100\tokenizer_config.json
Special tokens file saved in nl-ner-sm-10\checkpoint-100\special_tokens_map.json


KeyboardInterrupt: 