In [1]:
from pathlib import Path

import torch
from datasets import load_dataset
from transformers import (
    AutoModelForTokenClassification,
    AutoTokenizer,
    DataCollatorForTokenClassification,
    Trainer,
    TrainingArguments,
    pipeline,
)

### Fine-Tuning Using DistilBERT

In [2]:
# Tokenizer matching the pretrained checkpoint that is fine-tuned below.
checkpoint_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint_name)

# WNUT-17 dataset; its examples provide the "tokens" and "ner_tags" columns used below.
wnut = load_dataset("wnut_17")
def tokenize_and_align_labels(examples):
    """Tokenize a batch of pre-split sentences and align word-level tags to tokens.

    Args:
        examples: Batch mapping with a "tokens" entry (one list of words per
            sentence) and a "ner_tags" entry (one list of tag ids per sentence).

    Returns:
        The tokenizer output for the batch with an extra "labels" entry: one
        list per sentence holding the word's tag id on the first token of each
        word and -100 on every other position (tokens without a word id and
        continuation tokens of an already-labelled word).
    """
    encoded = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    def align(sentence_pos, word_tags):
        """Build the token-level label list for one sentence of the batch."""
        aligned = []
        last_word = None
        for current_word in encoded.word_ids(batch_index=sentence_pos):
            # Only the first token of a real word carries that word's tag.
            starts_new_word = current_word is not None and current_word != last_word
            aligned.append(word_tags[current_word] if starts_new_word else -100)
            last_word = current_word
        return aligned

    encoded["labels"] = [
        align(sentence_pos, word_tags)
        for sentence_pos, word_tags in enumerate(examples["ner_tags"])
    ]
    return encoded
# Apply the tokenization/label-alignment function to every split, batch by batch.
tokenized_wnut = wnut.map(tokenize_and_align_labels, batched=True)
# Collator handed to the Trainer to assemble token-classification batches.
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

# Size the classification head from the dataset's own tag list. The previous
# hard-coded value (14) did not match the 13 tag names this dataset exposes,
# which left one output class without a name.
# NOTE(review): this changes the classifier head shape; any code that rebuilds
# the model to load weights saved from it must use the same num_labels.
ner_label_names = wnut["train"].features["ner_tags"].feature.names
model = AutoModelForTokenClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=len(ner_label_names),
)

# Hyperparameters for the run; evaluation happens once per epoch.
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
)

# NOTE(review): per-epoch evaluation runs on the "test" split; confirm that is
# intended rather than a held-out validation split.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_wnut["train"],
    eval_dataset=tokenized_wnut["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
)

# Last expression of the cell, so the returned training summary is displayed.
trainer.train()

Reusing dataset wnut_17 (C:\Users\fengq\.cache\huggingface\datasets\wnut_17\wnut_17\1.0.0\077c7f08b8dbc800692e8c9186cdf3606d5849ab0e7be662e6135bb10eba54f9)


  0%|          | 0/3 [00:00<?, ?it/s]

Loading cached processed dataset at C:\Users\fengq\.cache\huggingface\datasets\wnut_17\wnut_17\1.0.0\077c7f08b8dbc800692e8c9186cdf3606d5849ab0e7be662e6135bb10eba54f9\cache-c2f22eacb5353639.arrow
Loading cached processed dataset at C:\Users\fengq\.cache\huggingface\datasets\wnut_17\wnut_17\1.0.0\077c7f08b8dbc800692e8c9186cdf3606d5849ab0e7be662e6135bb10eba54f9\cache-d00931926d0d0eb0.arrow
Loading cached processed dataset at C:\Users\fengq\.cache\huggingface\datasets\wnut_17\wnut_17\1.0.0\077c7f08b8dbc800692e8c9186cdf3606d5849ab0e7be662e6135bb10eba54f9\cache-a9a2fc2c963fdda0.arrow
Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForTokenClassification: ['vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_transform.weight', 'vocab_transform.bias', 'vocab_projector.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForTokenClassification from the checkpoint of a model trained on another task o

Epoch,Training Loss,Validation Loss
1,No log,0.29376
2,No log,0.274022
3,0.210800,0.277763


The following columns in the evaluation set don't have a corresponding argument in `DistilBertForTokenClassification.forward` and have been ignored: tokens, id, ner_tags. If tokens, id, ner_tags are not expected by `DistilBertForTokenClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 1287
  Batch size = 16
The following columns in the evaluation set don't have a corresponding argument in `DistilBertForTokenClassification.forward` and have been ignored: tokens, id, ner_tags. If tokens, id, ner_tags are not expected by `DistilBertForTokenClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 1287
  Batch size = 16
Saving model checkpoint to ./results\checkpoint-500
Configuration saved in ./results\checkpoint-500\config.json
Model weights saved in ./results\checkpoint-500\pytorch_model.bin
tokenizer config file saved in ./results\checkpoint-500\tokenizer_config.json
Special to

TrainOutput(global_step=639, training_loss=0.1838513047482486, metrics={'train_runtime': 63.4307, 'train_samples_per_second': 160.522, 'train_steps_per_second': 10.074, 'total_flos': 137822474941512.0, 'train_loss': 0.1838513047482486, 'epoch': 3.0})

In [7]:
# Use the first GPU when CUDA is available; otherwise fall back to CPU (-1) so
# this cell also runs on machines without a GPU.
inference_device = 0 if torch.cuda.is_available() else -1
classifier = pipeline("ner", model=model, tokenizer=tokenizer, device=inference_device)

# Tag names of the dataset, indexed by class id.
label_list = wnut["train"].features["ner_tags"].feature.names
print(label_list)

# Sample sentence used for the prediction below.
text = "Isaiah expected Taiwan to return to China"
def predict(classifier, text, label_list):
    """Return the tag name for every prediction the classifier yields for ``text``.

    Args:
        classifier: Callable taking ``text`` and returning an iterable of
            mappings, each holding the predicted tag under the ``'entity'`` key.
        text: Input passed unchanged to ``classifier``.
        label_list: Sequence of tag names indexed by class id.

    Returns:
        list: One tag name per prediction, in the order the classifier
        produced them.

    Raises:
        IndexError: If an entity has no ``_<id>`` part and is not itself a
            member of ``label_list``, or if the parsed id is out of range.
        ValueError: If the part after the first ``_`` is not an integer and
            the entity is not itself a member of ``label_list``.
    """
    result = []
    for prediction in classifier(text):
        entity = prediction['entity']
        try:
            # Placeholder names look like "LABEL_<id>": the id sits right
            # after the first underscore.
            class_id = int(entity.split("_")[1])
        except (IndexError, ValueError):
            # The classifier may already emit real tag names (e.g. when the
            # model config carries them); accept those as-is.
            if entity in label_list:
                result.append(entity)
                continue
            raise
        result.append(label_list[class_id])
    return result
# Tag the sample sentence; as the cell's last expression the result is displayed.
predict(classifier=classifier, text=text, label_list=label_list)

['O', 'B-corporation', 'I-corporation', 'B-creative-work', 'I-creative-work', 'B-group', 'I-group', 'B-location', 'I-location', 'B-person', 'I-person', 'B-product', 'I-product']


['B-person', 'O', 'B-location', 'O', 'O', 'O', 'B-location']

In [6]:
# Persist only the model's weights (its state_dict) for later loading.
checkpoint_file = './app/saved_no_glove_DistillBERT.pth'
# Create the target directory first so the save does not fail when ./app is missing.
Path(checkpoint_file).parent.mkdir(parents=True, exist_ok=True)
torch.save(model.state_dict(), checkpoint_file)