In [25]:
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments
from transformers import Trainer, DataCollatorForTokenClassification
from datasets import load_dataset, load_metric
import numpy as np


In [26]:
label_list = [
    "O",
    "L-DEMO",
    "L-BA",
    "V-BA",
    "L-GROUND",
    "L-BALL",
    "L-SPEED",
    "V-SPEED",
    "L-DIR",
    "V-DIR",
    "L-BRAKE",
    "L-STEER",
    "V-STEER",
    "L-THROTTLE",
    "V-THROTTLE",
    "L-BOOST",
    "L-POS"
]

In [27]:
model_checkpoint = "distilbert-base-uncased"
dataset_name = "cw1521/nl-st"
model_name = "test-ner-1"
input = "tokens"
target = "ner_ids"
test = ""
num_epochs = 1
 

   

In [28]:
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    num_labels=len(label_list)
)
data_collator = DataCollatorForTokenClassification(
    tokenizer=tokenizer,
    padding='max_length',
    max_length=512
)

loading configuration file config.json from cache at C:\Users\school/.cache\huggingface\hub\models--distilbert-base-uncased\snapshots\1c4513b2eedbda136f57676a34eea67aba266e5c\config.json
Model config DistilBertConfig {
  "_name_or_path": "distilbert-base-uncased",
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "transformers_version": "4.22.1",
  "vocab_size": 30522
}

loading file vocab.txt from cache at C:\Users\school/.cache\huggingface\hub\models--distilbert-base-uncased\snapshots\1c4513b2eedbda136f57676a34eea67aba266e5c\vocab.txt
loading file tokenizer.json from cache at C:\Users\school/.cache\huggingface\hub\models--

In [29]:
dataset = load_dataset(dataset_name)             
dataset["train"] = dataset["train"].shard(10, 0)
dataset["valid"] = dataset["validation"].shard(10, 0)

Using custom data configuration cw1521--nl-st-9e1c1e3c22c3c0a9
Found cached dataset json (C:/Users/school/.cache/huggingface/datasets/cw1521___json/cw1521--nl-st-9e1c1e3c22c3c0a9/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab)


  0%|          | 0/3 [00:00<?, ?it/s]

In [30]:
def get_tokenized_datasets():

    def tokenize_and_align_labels(examples):
        tokenized_inputs = tokenizer(
            examples[input],
            truncation=True,
            is_split_into_words=True,
            padding="max_length",
            max_length=512
        )
        labels = []
        for i, label in enumerate(examples[target]):
            word_ids = tokenized_inputs.word_ids(batch_index=i)
            previous_word_idx = None
            label_ids = []
            for word_idx in word_ids:
                # Special tokens have a word id that is None. We set the label to -100 so they are automatically
                # ignored in the loss function.
                if word_idx is None:
                    label_ids.append(-100)
                # We set the label for the first token of each word.
                elif word_idx != previous_word_idx:
                    label_ids.append(label[word_idx])
                # For the other tokens in a word, we set the label to the current label
                else:
                    label_ids.append(label[word_idx])
                previous_word_idx = word_idx
            labels.append(label_ids)
        tokenized_inputs["labels"] = labels
        return tokenized_inputs

    tokenized_data = dataset.map(tokenize_and_align_labels, batched=True)
    train = tokenized_data["train"]
    valid = tokenized_data["validation"]
    return train, valid

In [31]:
train, valid = get_tokenized_datasets()

  0%|          | 0/87 [00:00<?, ?ba/s]

  0%|          | 0/124 [00:00<?, ?ba/s]

  0%|          | 0/248 [00:00<?, ?ba/s]

  0%|          | 0/25 [00:00<?, ?ba/s]

In [32]:
def compute_metrics(p):
    metric = load_metric("seqeval")
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    true_predictions = [[label_list[p] for (p, l) in zip(prediction, label) if l != -100] for prediction, label in zip(predictions, labels)]
    true_labels = [[label_list[l] for (p, l) in zip(prediction, label) if l != -100] for prediction, label in zip(predictions, labels)]

    results = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"], 
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"]
    }


In [33]:

def get_training_args(num_epochs):
    batch_size = 32
    args = TrainingArguments(
    model_name,
    save_steps=50,
    evaluation_strategy = "epoch",
    learning_rate=1e-4,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=1e-5,
    save_total_limit=3,
    num_train_epochs=num_epochs,
    logging_dir='./logs',
    gradient_accumulation_steps=4,
    fp16=True
    )
    return args

In [34]:

args = get_training_args(num_epochs)
train.set_format(
    type="torch",
    columns=["input_ids", "attention_mask", "labels"],
        )
valid.set_format(
    type="torch",
    columns=["input_ids", "attention_mask", "labels"],
)

trainer = Trainer(
    model,
    args,
    train_dataset=train,
    eval_dataset=valid,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    data_collator=data_collator
)


PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
Using cuda_amp half precision backend


In [36]:
trainer.train()
trainer.save_model()
trainer.save_state()

The following columns in the training set don't have a corresponding argument in `DistilBertForTokenClassification.forward` and have been ignored: ner_ids, ner_tags, state, tokens, sentence, ner_sentence. If ner_ids, ner_tags, state, tokens, sentence, ner_sentence are not expected by `DistilBertForTokenClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 86533
  Num Epochs = 1
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 4
  Total optimization steps = 2704


  0%|          | 0/2704 [00:00<?, ?it/s]

KeyboardInterrupt: 