In [1]:
import torch
import pandas as pd
import torch.nn as nn
from torch.utils.data import DataLoader
from transformers import pipeline, AutoTokenizer, DataCollatorForTokenClassification, AutoModelForTokenClassification, TrainingArguments, Trainer
from datasets import load_dataset
from torchtext.vocab import build_vocab_from_iterator, GloVe
from app.utils import getData, iterator, train_and_eval, predict, Data
from app.model import NER, NER_transformer

### Data

In [8]:
# Load the raw CSV and let the project helper split it into parallel
# sentence / label sequences.
data = pd.read_csv("./data/ner_datasetreference.csv", encoding='latin1')
sentences, labels = getData(data)

# Sequential 90/10 split (no shuffling): the leading slice is used for
# training, the trailing slice is held out.
split_idx = int(0.9 * len(sentences))
train, test = sentences[:split_idx], sentences[split_idx:]
train_labels, test_labels = labels[:split_idx], labels[split_idx:]

# Token and tag vocabularies are built from the training slice only.
# Tokens missing from `vocab` resolve to "<unk>" through the default index.
vocab = build_vocab_from_iterator(iterator(train), specials=["<unk>", "<pad>"])
vocab.set_default_index(vocab["<unk>"])
# NOTE(review): labels_dict gets no default index, so a tag that only occurs
# in the held-out slice would presumably fail to resolve -- confirm.
labels_dict = build_vocab_from_iterator(iterator(train_labels), specials=["<pad>"])

# NOTE(review): the GloVe vectors are instantiated even though use_glove is
# False; confirm whether Data / the models need them in that case before
# making this conditional.
use_glove = False
glove_vocab = GloVe(name='42B', dim=300)

# Longest sentence over the whole corpus (training and held-out slices).
max_len = max(map(len, sentences))

train_pipeline = Data(
    train, train_labels, max_len, vocab, glove_vocab, labels_dict, use_glove=use_glove
)
test_pipeline = Data(
    test, test_labels, max_len, vocab, glove_vocab, labels_dict, use_glove=use_glove
)

# Training batches are shuffled mini-batches of 64; the held-out loader keeps
# the DataLoader defaults (batch size 1, no shuffling).
train_loader = DataLoader(train_pipeline, batch_size=64, shuffle=True)
test_loader = DataLoader(test_pipeline)

In [9]:
# Show the longest sentence length in the corpus; this value is what the
# Data wrappers receive as their length argument.
print(max_len)

104


### Hyperparameters

In [10]:
# Sizes derived from the vocabularies built in the data section.
vocab_size = len(vocab)
tag_size = len(labels_dict)

# Architecture settings. hidden_size is only passed to NER (model1) and
# n_head only to NER_transformer (model2).
# NOTE(review): embed_size presumably has to match the GloVe dimension (300)
# when use_glove is True -- confirm against app.model.
embed_size = 300
num_layers = 3
hidden_size = 64
n_head = 5

# Optimisation settings.
n_epochs = 20
learning_rate = 1e-3

# Seed PyTorch's global RNG so that weight initialisation and the shuffled
# training batches are repeatable after "Restart Kernel -> Run All".
seed = 42
torch.manual_seed(seed)

# Prefer the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

### Model

In [11]:
# Both taggers share the vocabulary size, embedding size, depth and tag count;
# NER additionally takes hidden_size, NER_transformer takes n_head.
model1 = NER(
    vocab_size, embed_size, hidden_size, num_layers, tag_size, use_glove=use_glove
)
model1 = model1.to(device=device)

model2 = NER_transformer(
    vocab_size, embed_size, num_layers, tag_size, n_head, use_glove=use_glove
)
model2 = model2.to(device=device)

### Training

In [7]:
# Only model2 (the transformer tagger) is optimised in this cell.
# NOTE(review): the loss uses CrossEntropyLoss defaults (no ignore_index). If
# Data pads every sentence to max_len, padded positions presumably contribute
# to the loss and accuracy -- confirm against app.utils. The recorded
# training loss flattens out around 0.56, which is worth a look.
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params=model2.parameters(), lr=learning_rate)

# One call per epoch; the helper receives the zero-based epoch index.
for epoch in range(n_epochs):
    train_and_eval(
        model2,
        train_loader,
        test_loader,
        optimizer,
        criterion,
        device,
        epoch,
    )

training loss of epoch 1 is 0.63421
test acc after epoch 1 is 0.92500
training loss of epoch 2 is 0.58966
test acc after epoch 2 is 0.93448
training loss of epoch 3 is 0.58042
test acc after epoch 3 is 0.94019
training loss of epoch 4 is 0.57586
test acc after epoch 4 is 0.94348
training loss of epoch 5 is 0.57162
test acc after epoch 5 is 0.94448
training loss of epoch 6 is 0.57040
test acc after epoch 6 is 0.94524
training loss of epoch 7 is 0.56931
test acc after epoch 7 is 0.94630
training loss of epoch 8 is 0.56762
test acc after epoch 8 is 0.94716
training loss of epoch 9 is 0.56699
test acc after epoch 9 is 0.94846
training loss of epoch 10 is 0.56618
test acc after epoch 10 is 0.94791
training loss of epoch 11 is 0.56488
test acc after epoch 11 is 0.94764
training loss of epoch 12 is 0.56379
test acc after epoch 12 is 0.95008
training loss of epoch 13 is 0.56352
test acc after epoch 13 is 0.94977
training loss of epoch 14 is 0.56319
test acc after epoch 14 is 0.95001
training l

### Fine-Tuning Using DistilBERT

In [2]:
# Tokenizer matching the "distilbert-base-uncased" checkpoint fine-tuned below.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
# WNUT-17 dataset from the Hugging Face hub; its "train" and "test" splits are
# used further down.
wnut = load_dataset("wnut_17")
def tokenize_and_align_labels(examples):
    """Tokenize a batch of pre-split sentences and align word tags to tokens.

    Uses the module-level ``tokenizer``. For every sentence in the batch the
    word-level tags in ``examples["ner_tags"]`` are projected onto the
    produced tokens:

    * tokens that map to no word (special tokens) get ``-100``;
    * the first token of each word gets that word's tag;
    * any further token of the same word gets ``-100``.

    Args:
        examples: batch mapping with ``"tokens"`` (list of word lists) and
            ``"ner_tags"`` (list of per-word tag id lists).

    Returns:
        The tokenizer output with an added ``"labels"`` entry holding one
        aligned label list per sentence.
    """
    encoded = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    aligned = []
    for row, word_tags in enumerate(examples["ner_tags"]):
        # Word index behind every token of this sentence (None for specials).
        token_word_ids = list(encoded.word_ids(batch_index=row))
        # Pair each token's word index with the one of the token before it,
        # so "first token of a word" becomes a simple inequality check.
        preceding = [None] + token_word_ids[:-1]
        aligned.append([
            word_tags[current] if current is not None and current != before else -100
            for before, current in zip(preceding, token_word_ids)
        ])

    encoded["labels"] = aligned
    return encoded
# Tokenize every split in batches and attach the aligned "labels" column.
tokenized_wnut = wnut.map(tokenize_and_align_labels, batched=True)
# Collator used by the Trainer to assemble token-classification batches.
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

# Size the classification head from the dataset's own tag inventory rather
# than a hard-coded literal, so the head cannot drift out of sync with the
# label set (the previous literal 14 was not tied to the dataset in any way).
# id2label is deliberately left unset: the inference cell parses the default
# "LABEL_<n>" entity names.
num_ner_labels = len(wnut["train"].features["ner_tags"].feature.names)
model = AutoModelForTokenClassification.from_pretrained("distilbert-base-uncased", num_labels=num_ner_labels)
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
)

# NOTE(review): the per-epoch evaluation runs on the "test" split; consider
# a separate validation split if one is available, so the test set stays
# untouched during model selection.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_wnut["train"],
    eval_dataset=tokenized_wnut["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
)

trainer.train()

Reusing dataset wnut_17 (C:\Users\fengq\.cache\huggingface\datasets\wnut_17\wnut_17\1.0.0\077c7f08b8dbc800692e8c9186cdf3606d5849ab0e7be662e6135bb10eba54f9)


  0%|          | 0/3 [00:00<?, ?it/s]

Loading cached processed dataset at C:\Users\fengq\.cache\huggingface\datasets\wnut_17\wnut_17\1.0.0\077c7f08b8dbc800692e8c9186cdf3606d5849ab0e7be662e6135bb10eba54f9\cache-c2f22eacb5353639.arrow
Loading cached processed dataset at C:\Users\fengq\.cache\huggingface\datasets\wnut_17\wnut_17\1.0.0\077c7f08b8dbc800692e8c9186cdf3606d5849ab0e7be662e6135bb10eba54f9\cache-d00931926d0d0eb0.arrow


  0%|          | 0/2 [00:00<?, ?ba/s]

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForTokenClassification: ['vocab_transform.bias', 'vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_projector.weight', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN t

Epoch,Training Loss,Validation Loss
1,No log,0.270019
2,No log,0.25512
3,0.188900,0.265892


The following columns in the evaluation set don't have a corresponding argument in `DistilBertForTokenClassification.forward` and have been ignored: tokens, id, ner_tags. If tokens, id, ner_tags are not expected by `DistilBertForTokenClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 1287
  Batch size = 16
The following columns in the evaluation set don't have a corresponding argument in `DistilBertForTokenClassification.forward` and have been ignored: tokens, id, ner_tags. If tokens, id, ner_tags are not expected by `DistilBertForTokenClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 1287
  Batch size = 16
Saving model checkpoint to ./results\checkpoint-500
Configuration saved in ./results\checkpoint-500\config.json
Model weights saved in ./results\checkpoint-500\pytorch_model.bin
tokenizer config file saved in ./results\checkpoint-500\tokenizer_config.json
Special to

TrainOutput(global_step=639, training_loss=0.1645464277789813, metrics={'train_runtime': 68.1453, 'train_samples_per_second': 149.416, 'train_steps_per_second': 9.377, 'total_flos': 137822474941512.0, 'train_loss': 0.1645464277789813, 'epoch': 3.0})

In [8]:
# Run the inference pipeline on the first GPU when CUDA is available and on
# the CPU (-1) otherwise, mirroring the fallback used for `device` in the
# hyperparameter section. A hard-coded device=0 fails on CPU-only machines.
pipeline_device = 0 if torch.cuda.is_available() else -1
classifier = pipeline("ner", model=model, tokenizer=tokenizer, device=pipeline_device)
# Tag names of the dataset, indexed by class id.
label_list = wnut["train"].features["ner_tags"].feature.names
text = "Isaiah expected the China to return to Taiwan"
def predict(classifier, text, label_list):
    """Translate the classifier's generic entity names into dataset tag names.

    NOTE(review): this definition rebinds the name ``predict`` and therefore
    hides the helper of the same name imported from ``app.utils`` at the top
    of the notebook.

    Args:
        classifier: callable that takes ``text`` and returns an iterable of
            mappings, each with an ``'entity'`` string of the form
            ``"<prefix>_<index>"``.
        text: input passed unchanged to ``classifier``.
        label_list: sequence of tag names indexed by class id.

    Returns:
        A list with one tag name per item returned by ``classifier``.
    """
    # The class id is the piece that follows the first underscore of the
    # entity name.
    return [
        label_list[int(item['entity'].split("_")[1])]
        for item in classifier(text)
    ]
# Tag the sample sentence with the fine-tuned DistilBERT pipeline; the cell
# displays the resulting list of tag names.
predict(classifier, text, label_list)

['B-person', 'O', 'O', 'B-location', 'O', 'O', 'O', 'B-location']

### Testing

In [22]:
# The DistilBERT section rebinds the name `predict` to a three-argument
# helper, so on a top-to-bottom run the bare name no longer refers to the
# project helper imported from app.utils and this eight-argument call would
# raise TypeError. Re-import the project helper under an unambiguous alias.
from app.utils import predict as predict_sequence_tagger

print(predict_sequence_tagger(model2, "Isaiah expected the China to return to Taiwan", max_len, vocab, glove_vocab, labels_dict, use_glove, device))

['B-per', 'O', 'O', 'B-geo', 'O', 'O', 'O', 'B-geo']


### Saving Model

In [15]:
# Persist only the learned parameters (state_dict) of the transformer tagger.
# The file name encodes the use_glove = False setting from the data section;
# adjust it if that flag is changed.
torch.save(model2.state_dict(), './app/saved_no_glove_transformer.pth')