Load Modules

In [1]:
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments
from transformers import DataCollatorForTokenClassification, Trainer
from datasets import load_dataset, load_metric
from os import getcwd



Constants

In [62]:
model_checkpoint = "distilbert-base-uncased"
dataset_name = "cw1521/nl-st"
model_name = "nl-ner-sm-10"

output_path = f"{getcwd()}\\output\\{model_name}"


Load Dataset

In [133]:
dataset = load_dataset("cw1521/nl-st")

Using custom data configuration cw1521--nl-st-0535027a99994970
Found cached dataset json (C:/Users/school/.cache/huggingface/datasets/cw1521___json/cw1521--nl-st-0535027a99994970/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab)


  0%|          | 0/3 [00:00<?, ?it/s]

Print Structure of Dataset and Number of Elements

In [64]:
print(f"Structure of Dataset:\n{dataset}")
print(f'Total number of elements in dataset: {len(dataset["train"]) + len(dataset["validation"])}')

Structure of Dataset:
DatasetDict({
    train: Dataset({
        features: ['ner_sentence', 'ner_tags', 'sentence', 'state'],
        num_rows: 865320
    })
    test: Dataset({
        features: ['ner_sentence', 'ner_tags', 'sentence', 'state'],
        num_rows: 123615
    })
    validation: Dataset({
        features: ['ner_sentence', 'ner_tags', 'sentence', 'state'],
        num_rows: 247230
    })
})
Total number of elements in dataset: 1112550


Example Data Item

In [65]:
print(f"Example data item:\n{dataset['train'][0]}")

Example data item:
{'ner_sentence': 'speed 1237 currently braking boost 23 direction east quadrant 2 east wall', 'ner_tags': ['O', 'O', 'L-SPEED', 'O', 'V-SPEED', 'O', 'O', 'L-BRAKE', 'L-BRAKE', 'O', 'O', 'L-BA', 'L-BA', 'O', 'V-BA', 'O', 'O', 'L-DIR', 'L-DIR', 'O', 'V-DIR', 'O', 'O', 'O', 'L-POS', 'L-POS', 'O', 'O', 'L-POS', 'L-POS', 'O'], 'sentence': "My current speed is 1237. I'M currently braking. My current boost is 23. My current direction is east. I'm in quadrant 2 near the east wall.", 'state': 'is_demoed False ball_touched False boost_amount 23 position -4048 3552 239 direction 19 speed 1237 on_ground False throttle 0 steer 0 jump 1 boost 0 handbrake 1'}


Load Tokenizer, Model, and Data Collator

In [137]:
max_input = 512

tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, model_max_length=max_input)
model = AutoModelForTokenClassification.from_pretrained(model_checkpoint, num_labels=len(ner_id_map))
data_collator = DataCollatorForTokenClassification(tokenizer, return_tensors="pt")

loading configuration file config.json from cache at C:\Users\school/.cache\huggingface\hub\models--distilbert-base-uncased\snapshots\1c4513b2eedbda136f57676a34eea67aba266e5c\config.json
Model config DistilBertConfig {
  "_name_or_path": "distilbert-base-uncased",
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_length": 512,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "transformers_version": "4.22.1",
  "vocab_size": 30522
}

loading file vocab.txt from cache at C:\Users\school/.cache\huggingface\hub\models--distilbert-base-uncased\snapshots\1c4513b2eedbda136f57676a34eea67aba266e5c\vocab.txt
loading file tokenizer.json from cache at C:\Users\school/.cache\hug

In [67]:
dataset

DatasetDict({
    train: Dataset({
        features: ['ner_sentence', 'ner_tags', 'sentence', 'state'],
        num_rows: 865320
    })
    test: Dataset({
        features: ['ner_sentence', 'ner_tags', 'sentence', 'state'],
        num_rows: 123615
    })
    validation: Dataset({
        features: ['ner_sentence', 'ner_tags', 'sentence', 'state'],
        num_rows: 247230
    })
})

In [134]:
dataset = dataset.remove_columns(["state", "ner_sentence"])

In [139]:
label_encoding_dict = {
    "O": 0,
    "L-DEMO": 1,
    "L-BA": 2,
    "V-BA": 3,
    "L-GROUND": 4,
    "L-BALL": 5,
    "L-SPEED": 6,
    "V-SPEED": 7,
    "L-DIR": 8,
    "V-DIR": 9,
    "L-BRAKE": 10,
    "L-STEER": 11,
    "V-STEER": 12,
    "L-THROTTLE": 13,
    "V-THROTTLE": 14,
    "L-BOOST": 15,
    "L-POS": 16
}

In [138]:

def tokenize_sentence(sentence):
    return sentence.replace(".", " .").replace("!", " !").split(" ")

def tokenize_and_align_labels(examples):
    label_all_tokens = True
    tokenized_inputs = tokenizer(
        list(map(tokenize_sentence, examples["sentence"])),
        truncation=True,
        padding="max_length",
        max_length=tokenizer.model_max_length,
        is_split_into_words=True
    )
    labels = []
    for i, label in enumerate(examples["ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)
            elif label[word_idx] == 'O':
                label_ids.append(0)
            elif word_idx != previous_word_idx:
                label_ids.append(label_encoding_dict[label[word_idx]])
            else:
                label_ids.append(label_encoding_dict[label[word_idx]] if label_all_tokens else -100)
            previous_word_idx = word_idx
        labels.append(label_ids)
        
    tokenized_inputs["labels"] = labels
    return tokenized_inputs

Tokenize Datasets

In [140]:
train = dataset["train"].map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns=["sentence", "ner_tags"]
)

  0%|          | 0/866 [00:00<?, ?ba/s]

In [141]:

valid = dataset["validation"].map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns=["sentence", "ner_tags"]
)

  0%|          | 0/248 [00:00<?, ?ba/s]

In [123]:
valid

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 247230
})

In [None]:
# print(len(valid["input_ids"][0]), len(valid["labels"][0]), len(valid["attention_mask"][0]))

In [142]:
train.set_format(
    type="torch",
    columns=["input_ids", "attention_mask", "labels"],
)

In [143]:
valid.set_format(
    type="torch",
    columns=["input_ids", "attention_mask", "labels"],
)

Metrics

In [144]:
metric = load_metric("seqeval")


def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    true_predictions = [[label_list[p] for (p, l) in zip(prediction, label) if l != -100] for prediction, label in zip(predictions, labels)]
    true_labels = [[label_list[l] for (p, l) in zip(prediction, label) if l != -100] for prediction, label in zip(predictions, labels)]

    results = metric.compute(predictions=true_predictions, references=true_labels)
    return {"precision": results["overall_precision"], "recall": results["overall_recall"], "f1": results["overall_f1"], "accuracy": results["overall_accuracy"]}
    

Trainer Arguments and Trainer

In [173]:
def get_training_args(num_epochs):
    batch_size = 16
    args = TrainingArguments(
        model_name,
        save_steps=50,
        evaluation_strategy = "epoch",
        learning_rate=1e-4,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        weight_decay=1e-5,
        save_total_limit=3,
        num_train_epochs=num_epochs,
        logging_dir='./logs',
    	gradient_accumulation_steps=4,
	    fp16=True
    )
    return args


In [174]:
args = get_training_args(1)

trainer = Trainer(
    model,
    args,
    train_dataset=train,
    eval_dataset=valid,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)


PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


Train Model

In [175]:
trainer.train()

***** Running training *****
  Num examples = 865320
  Num Epochs = 1
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 4
  Total optimization steps = 216330


  0%|          | 0/216330 [00:00<?, ?it/s]

RuntimeError: CUDA out of memory. Tried to allocate 20.00 MiB (GPU 0; 4.00 GiB total capacity; 3.47 GiB already allocated; 0 bytes free; 3.47 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF