Load Modules

In [1]:
from json import load
from os import getcwd
from os.path import join

import numpy
import numpy as np
import pandas as pd
from datasets import load_dataset, load_metric
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments
from transformers import DataCollatorForTokenClassification, Trainer


def get_auth_key(path):
    """Read the authentication token stored in a JSON file.

    Args:
        path: Location of a JSON file whose top-level object has an
            ``"auth_key"`` entry.

    Returns:
        The value stored under ``"auth_key"``.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
        KeyError: If the JSON object has no ``"auth_key"`` entry.
    """
    # JSON is UTF-8 by specification; do not rely on the platform's default
    # locale encoding (which is not UTF-8 on many Windows setups).
    with open(path, "r", encoding="utf-8") as key_file:
        credentials = load(key_file)
    return credentials["auth_key"]



Constants

In [2]:
# JSON file with the Hugging Face auth token, one directory above the current
# working directory. Built with os.path.join so the separator matches the
# platform instead of being hard-coded to the Windows backslash.
auth_token_path = join(getcwd(), "..", "auth_key.json")

# Base checkpoint to fine-tune, dataset repository to load, and the name used
# for the fine-tuned model.
model_checkpoint = "distilbert-base-uncased"
dataset_name = "cw1521/en-st-ner-small"
model_name = "nl-ner-sm-10"

# Token passed to `load_dataset` when fetching the dataset.
auth_token = get_auth_key(auth_token_path)
# Directory for this model's artifacts under ./output.
# NOTE(review): not referenced by any other cell visible in this notebook.
output_path = join(getcwd(), "output", model_name)


Conversion Maps

In [3]:
# Lookup from class id (as a string key) to NER tag name; ids follow the
# position of each tag in the tuple below.
ner_id_map = {
    str(tag_id): tag_name
    for tag_id, tag_name in enumerate((
        "O",
        "L-DEMO",
        "L-BA", "V-BA",
        "L-GROUND",
        "L-BALL",
        "L-SPEED", "V-SPEED",
        "L-DIR", "V-DIR",
        "L-BRAKE",
        "L-STEER", "V-STEER",
        "L-THROTTLE", "V-THROTTLE",
        "L-BOOST",
        "L-POS",
    ))
}

In [4]:
# Lookup from NER tag name to its integer class id; ids follow the position
# of each tag in the tuple below.
ner_tag_map = {
    tag_name: tag_id
    for tag_id, tag_name in enumerate((
        "O",
        "L-DEMO",
        "L-BA", "V-BA",
        "L-GROUND",
        "L-BALL",
        "L-SPEED", "V-SPEED",
        "L-DIR", "V-DIR",
        "L-BRAKE",
        "L-STEER", "V-STEER",
        "L-THROTTLE", "V-THROTTLE",
        "L-BOOST",
        "L-POS",
    ))
}

In [5]:
# Tag names ordered by class id (index i holds the name of class i);
# `compute_metrics` indexes into this list to turn ids back into names.
label_list = [
    "O",
    "L-DEMO",
    "L-BA", "V-BA",
    "L-GROUND",
    "L-BALL",
    "L-SPEED", "V-SPEED",
    "L-DIR", "V-DIR",
    "L-BRAKE",
    "L-STEER", "V-STEER",
    "L-THROTTLE", "V-THROTTLE",
    "L-BOOST",
    "L-POS",
]

Load Dataset

In [6]:
def get_datafiles():
    """Return the file names that make up the training and validation splits.

    Returns:
        A ``(train, valid)`` tuple of lists: the ten training shards
        ``oracle-train1.json`` through ``oracle-train10.json`` and the single
        validation file ``oracle-valid.json``.
    """
    train_files = [f'oracle-train{shard}.json' for shard in range(1, 11)]
    valid_files = ['oracle-valid.json']
    return train_files, valid_files



def get_dataset(name):
    """Load the dataset `name` with its 'train' and 'valid' splits.

    The split file names come from `get_datafiles()`, the module-level
    `auth_token` is forwarded as `use_auth_token`, and records are read from
    the "data" field of each JSON file.

    Args:
        name: Dataset identifier handed to `load_dataset`.

    Returns:
        Whatever `load_dataset` returns for the two requested splits.
    """
    train_files, valid_files = get_datafiles()
    split_files = {'train': train_files, 'valid': valid_files}
    return load_dataset(name, data_files=split_files, use_auth_token=auth_token, field="data")

In [7]:
# Keep the raw file-name lists at notebook level, then load both splits.
train, valid = get_datafiles()
dataset = get_dataset(dataset_name)


Using custom data configuration cw1521--en-st-ner-small-5590803a90d98b46
Found cached dataset json (C:/Users/school/.cache/huggingface/datasets/cw1521___json/cw1521--en-st-ner-small-5590803a90d98b46/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab)


  0%|          | 0/2 [00:00<?, ?it/s]

Print Structure of Dataset and Number of Elements

In [11]:
# Show the split layout first, then the combined row count of both splits.
print(f"Structure of Dataset:\n{dataset}")
total_rows = sum(len(dataset[split]) for split in ("train", "valid"))
print(f'Total number of elements in dataset: {total_rows}')

Structure of Dataset:
DatasetDict({
    train: Dataset({
        features: ['ner_sentence', 'ner_tags', 'sentence', 'state'],
        num_rows: 865330
    })
    valid: Dataset({
        features: ['ner_sentence', 'ner_tags', 'sentence', 'state'],
        num_rows: 123620
    })
})
Total number of elements in dataset: 988950


Example Data Item

In [12]:
# Display the first training record to illustrate the available fields.
first_example = dataset['train'][0]
print(f"Example data item:\n{first_example}")

Example data item:
{'ner_sentence': 'quadrant 3 blue goal south wall currenly 48 boost currently travelling north pressed brakes on ground speed 618 miles per hour', 'ner_tags': [0, 0, 16, 16, 0, 0, 16, 16, 0, 0, 0, 16, 16, 0, 0, 2, 0, 3, 0, 2, 0, 0, 8, 8, 9, 0, 0, 10, 0, 10, 0, 0, 0, 0, 4, 0, 4, 0, 0, 0, 6, 0, 7, 6, 6, 6, 0], 'sentence': "I'm in quadrant 3 near the blue goal and near the south wall. I currenly have 48 percent boost. I'm currently travelling north. I pressed the brakes. My car is on the ground. My current speed is 618 miles per hour.", 'state': 'position -188 -4789 17 direction 75 on_ground True is_demoed False ball_touched False boost_amount 48 speed 618 throttle 0 steer 0 jump 1 boost 0 handbrake 1'}


Load Tokenizer, Model, and Data Collator

In [13]:
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
# Register the tag names on the model config so the saved model reports real
# tag names instead of generic LABEL_n placeholders. `ner_id_map` has string
# keys, so they are converted to the integer ids the config expects.
model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    num_labels=len(ner_id_map),
    id2label={int(tag_id): tag_name for tag_id, tag_name in ner_id_map.items()},
    label2id=dict(ner_tag_map),
)
# Collator built from the tokenizer; it is handed to the Trainer further down.
data_collator = DataCollatorForTokenClassification(tokenizer)

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForTokenClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-u

Tokenize Datasets

In [15]:

def _split_words(sentence):
    """Split a sentence into the word-level units that `ner_tags` labels.

    The example record printed earlier in this notebook has one tag per
    whitespace-separated word plus one tag for each sentence-final period
    (41 words + 6 periods = 47 tags), so a trailing "." is peeled off into
    its own unit.
    NOTE(review): inferred from that single example — confirm against the
    dataset card.
    """
    words = []
    for chunk in sentence.split():
        if len(chunk) > 1 and chunk.endswith("."):
            words.extend((chunk[:-1], "."))
        else:
            words.append(chunk)
    return words


def tokenize_and_align_labels(examples):
    """Tokenize a batch of sentences and align the word-level tags to tokens.

    The tokenizer adds special tokens and may split one word into several
    sub-word tokens, so the word-level `ner_tags` cannot be used as token
    labels directly. Each sentence is pre-split into words, tokenized with
    `is_split_into_words=True`, and the tokenizer's `word_ids()` mapping is
    used to place each tag on the first token of its word. Special tokens and
    the remaining sub-word tokens get -100, the value `compute_metrics`
    filters out.

    Args:
        examples: Batch with a "sentence" list of strings and a parallel
            "ner_tags" list of per-word tag-id lists.

    Returns:
        The tokenizer output with an added "labels" entry holding one label
        list per sentence, the same length as that sentence's token list.

    Raises:
        ValueError: If a sentence does not split into as many words as it has
            tags; training on such a record would silently use wrong labels.
    """
    words_batch = [_split_words(sentence) for sentence in examples["sentence"]]
    tokenized_inputs = tokenizer(words_batch, truncation=True, is_split_into_words=True)

    aligned_labels = []
    for row, (words, tags) in enumerate(zip(words_batch, examples["ner_tags"])):
        if len(words) != len(tags):
            raise ValueError(
                f"Cannot align labels for batch item {row}: "
                f"{len(words)} words but {len(tags)} tags"
            )
        previous_word = None
        row_labels = []
        for word_id in tokenized_inputs.word_ids(batch_index=row):
            if word_id is None or word_id == previous_word:
                # Special token, or a continuation piece of the same word.
                row_labels.append(-100)
            else:
                row_labels.append(tags[word_id])
            previous_word = word_id
        aligned_labels.append(row_labels)

    tokenized_inputs["labels"] = aligned_labels
    return tokenized_inputs


# Tokenize both splits in batches, training split first, then validation.
train_tokenized_datasets, valid_tokenized_datasets = (
    dataset[split].map(tokenize_and_align_labels, batched=True)
    for split in ("train", "valid")
)

  0%|          | 0/866 [00:00<?, ?ba/s]

  0%|          | 0/124 [00:00<?, ?ba/s]

Metrics

In [16]:
# Load the "seqeval" metric; `compute_metrics` calls `metric.compute` on it.
# NOTE(review): the cell output echoes this line, which looks like a
# deprecation warning for `load_metric` — confirm against the installed
# `datasets` version.
metric = load_metric("seqeval")


def compute_metrics(p):
    """Score predictions against reference labels with the loaded metric.

    Args:
        p: A ``(predictions, labels)`` pair. Predictions hold per-class scores
            and are reduced with an argmax over axis 2; labels hold class ids,
            with -100 marking positions to ignore.

    Returns:
        Dict with "precision", "recall", "f1" and "accuracy", taken from the
        metric's overall_* results.
    """
    scores, gold_ids = p
    predicted_ids = np.argmax(scores, axis=2)

    true_predictions = []
    true_labels = []
    for predicted_row, gold_row in zip(predicted_ids, gold_ids):
        kept_predictions = []
        kept_labels = []
        for predicted_id, gold_id in zip(predicted_row, gold_row):
            # Positions labelled -100 are excluded from scoring.
            if gold_id == -100:
                continue
            kept_predictions.append(label_list[predicted_id])
            kept_labels.append(label_list[gold_id])
        true_predictions.append(kept_predictions)
        true_labels.append(kept_labels)

    results = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }
    

  metric = load_metric("seqeval")


Trainer Arguments and Trainer

In [38]:
def get_training_args(num_epochs, batch_size=64):
    """Build the `TrainingArguments` used for fine-tuning.

    Args:
        num_epochs: Number of training epochs.
        batch_size: Per-device batch size for both training and evaluation.
            Defaults to 64, the value that was previously hard-coded. With the
            4 gradient-accumulation steps set below, each optimizer step
            covers ``4 * batch_size`` examples per device.

    Returns:
        A `TrainingArguments` instance whose first positional argument is the
        module-level `model_name`, evaluating once per epoch and keeping at
        most 3 checkpoints.
    """
    return TrainingArguments(
        model_name,
        save_steps=50,
        evaluation_strategy="epoch",
        learning_rate=1e-4,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        weight_decay=1e-5,
        save_total_limit=3,
        num_train_epochs=num_epochs,
        logging_dir='./logs',
        gradient_accumulation_steps=4,
        # NOTE(review): presumably needs hardware with TF32 support — confirm
        # before running on a different machine.
        tf32=True,
    )


In [39]:
# Training configuration for a 10-epoch run.
args = get_training_args(10)

# Wire the model, data, collator, tokenizer and metric callback into a Trainer.
trainer = Trainer(
    model=model,
    args=args,
    data_collator=data_collator,
    train_dataset=train_tokenized_datasets,
    eval_dataset=valid_tokenized_datasets,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)


PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


Train, Evaluate, Save Model

In [40]:
# Run the training loop configured by `args`.
trainer.train()
# Evaluate on the eval dataset given to the Trainer; the returned metrics are
# not stored in a variable here.
trainer.evaluate()
# Save the model; no path is passed, so the Trainer's own output location is
# used — TODO confirm where that resolves to.
trainer.save_model()

The following columns in the training set don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: ner_tags, ner_sentence, state, sentence. If ner_tags, ner_sentence, state, sentence are not expected by `BertForTokenClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 865330
  Num Epochs = 10
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 4
  Total optimization steps = 540830


  0%|          | 0/540830 [00:00<?, ?it/s]

: 

: 