In [39]:
import os

import evaluate
import numpy as np
import torch
from datasets import load_dataset
from peft import get_peft_config, PeftModel, PeftConfig, get_peft_model, LoraConfig, TaskType
from transformers import (
    AutoModelForTokenClassification,
    AutoTokenizer,
    DataCollatorForTokenClassification,
    TrainingArguments,
    Trainer,
)


# Fine-tuning configuration shared by the cells below.
model_checkpoint = "roberta-large"  # checkpoint name given to the tokenizer and model loaders
num_epochs = 10  # forwarded to TrainingArguments as num_train_epochs
batch_size = 16  # used for both the per-device train and eval batch sizes
lr = 1e-3  # forwarded to TrainingArguments as learning_rate

In [7]:
# Directory holding the local BioNLP 2004 corpus files. The default is the
# original author's machine-specific location; set the BIONLP2004_DIR
# environment variable to run the notebook elsewhere without editing this cell.
bionlp_data_dir = os.environ.get("BIONLP2004_DIR", "/home/bocheng/data/corpus/bionlp2004")

# Load the corpus; the recorded output shows train, test and validation splits
# being generated.
bionlp = load_dataset(path=bionlp_data_dir)

Downloading and preparing dataset json/bionlp2004 to /home/bocheng/.cache/huggingface/datasets/json/bionlp2004-34151569244775e8/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51...


Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Dataset json downloaded and prepared to /home/bocheng/.cache/huggingface/datasets/json/bionlp2004-34151569244775e8/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

In [10]:
# Load the seqeval metric; compute_metrics() calls seqeval.compute() on the
# decoded tag sequences.
seqeval = evaluate.load("seqeval")

In [11]:
# Tag names indexed by class id. compute_metrics() looks names up by position,
# so the order here must stay aligned with the id2label / label2id mappings.
label_list = [
    "O",
    "B-DNA", "I-DNA",
    "B-protein", "I-protein",
    "B-cell_type", "I-cell_type",
    "B-cell_line", "I-cell_line",
    "B-RNA", "I-RNA",
]

In [51]:
def compute_metrics(p):
    """Score predictions with seqeval and return the overall metrics.

    Args:
        p: A pair that unpacks into ``(predictions, labels)``. ``predictions``
            holds per-class scores and is arg-maxed over axis 2; ``labels``
            holds the gold class ids, with -100 marking positions to skip.

    Returns:
        A dict with the keys "precision", "recall", "f1" and "accuracy",
        taken from the corresponding ``overall_*`` entries of the seqeval
        result.
    """
    scores, gold_ids = p
    predicted_ids = np.argmax(scores, axis=2)

    true_predictions = []
    true_labels = []
    for predicted_row, gold_row in zip(predicted_ids, gold_ids):
        kept_predictions = []
        kept_labels = []
        for predicted_id, gold_id in zip(predicted_row, gold_row):
            # Positions labelled -100 are excluded from both sequences.
            if gold_id == -100:
                continue
            kept_predictions.append(label_list[predicted_id])
            kept_labels.append(label_list[gold_id])
        true_predictions.append(kept_predictions)
        true_labels.append(kept_labels)

    results = seqeval.compute(predictions=true_predictions, references=true_labels)

    summary = {}
    summary["precision"] = results["overall_precision"]
    summary["recall"] = results["overall_recall"]
    summary["f1"] = results["overall_f1"]
    summary["accuracy"] = results["overall_accuracy"]
    return summary

In [13]:
# Tokenizer for the chosen checkpoint. It is later called on pre-split words
# (is_split_into_words=True), hence add_prefix_space=True here.
tokenizer = AutoTokenizer.from_pretrained(
    model_checkpoint,
    add_prefix_space=True,
)

Downloading (…)lve/main/config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [22]:
def tokenize_and_align_labels(examples):
    """Tokenize a batch of pre-split sentences and align word tags to sub-tokens.

    Args:
        examples: Batch mapping with a "tokens" entry (one list of words per
            sentence) and a "tags" entry (one list of tag ids per sentence,
            indexed by word position).

    Returns:
        The tokenizer output with an added "labels" entry holding one list per
        sentence. The first sub-token of each word carries that word's tag;
        positions with no word id and continuation sub-tokens are set to -100.
    """
    encoded = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    aligned_labels = []
    for row, word_tags in enumerate(examples["tags"]):
        row_labels = []
        last_word = None
        for word in encoded.word_ids(batch_index=row):
            # Only the first sub-token of a real word gets the word's tag.
            starts_new_word = word is not None and word != last_word
            row_labels.append(word_tags[word] if starts_new_word else -100)
            last_word = word
        aligned_labels.append(row_labels)

    encoded["labels"] = aligned_labels
    return encoded

In [30]:
# Sanity check: run the alignment on the first two training examples and
# display the resulting input_ids / attention_mask / labels.
tokenize_and_align_labels(bionlp['train'][0:2])

{'input_ids': [[0, 1773, 289, 31812, 3586, 29, 703, 2422, 24260, 41, 2485, 11, 1263, 7, 255, 25356, 2156, 8, 289, 176, 673, 176, 45645, 19390, 2620, 12, 134, 2156, 11707, 6078, 189, 1760, 25, 10, 7371, 35387, 2403, 479, 2], [0, 2223, 8242, 2620, 12, 134, 26076, 21, 32512, 2156, 34596, 9, 44174, 7561, 44322, 3175, 36, 6256, 1975, 3892, 179, 4839, 50, 19258, 560, 46719, 221, 12, 13872, 36, 14795, 597, 29451, 102, 4839, 31683, 19390, 2620, 12, 134, 26076, 30, 255, 25356, 2156, 9378, 14, 484, 7371, 12, 20557, 1295, 1743, 32, 963, 11, 63, 5746, 479, 2]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], 'labels': [[-100, 0, 7, -100, -100, -100, 0, 0, -100, 0, -100, 0, 0, 0, 0, -100, 0, 0, 0, -100, -100, -100, 0, 3, -100, -10

In [24]:
# Apply tokenization and label alignment to the dataset in batches; the
# recorded output shows one Map pass per split (three in total).
tokenized_bionlp = bionlp.map(tokenize_and_align_labels, batched=True)

Map:   0%|          | 0/16619 [00:00<?, ? examples/s]

Map:   0%|          | 0/3856 [00:00<?, ? examples/s]

Map:   0%|          | 0/1927 [00:00<?, ? examples/s]

In [31]:
# Batch collator for token classification; it is handed to the Trainer below.
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [33]:
# Class id -> tag name. The order must match label_list, which compute_metrics()
# uses to decode predictions.
id2label = dict(
    enumerate(
        [
            "O",
            "B-DNA",
            "I-DNA",
            "B-protein",
            "I-protein",
            "B-cell_type",
            "I-cell_type",
            "B-cell_line",
            "I-cell_line",
            "B-RNA",
            "I-RNA",
        ]
    )
)
# Tag name -> class id, built as the exact inverse of id2label.
label2id = {tag: class_id for class_id, tag in id2label.items()}

In [37]:
# Load the pretrained checkpoint with a token-classification head. The head
# size is derived from the label mapping instead of a hard-coded 11, so it
# cannot drift from id2label / label2id. The recorded output shows that the
# classifier weights are newly initialized and therefore need training.
model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    num_labels=len(id2label),
    id2label=id2label,
    label2id=label2id,
)

Downloading model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of the model checkpoint at roberta-large were not used when initializing RobertaForTokenClassification: ['lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.bias', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictio

In [40]:
# LoRA configuration used to wrap the base model in the next cell.
peft_config = LoraConfig(
    task_type=TaskType.TOKEN_CLS,
    inference_mode=False,
    r=16,
    lora_alpha=16,
    lora_dropout=0.1,
    bias='all',
)

In [41]:
# Wrap the base model according to peft_config and report how many parameters
# remain trainable.
# NOTE: this rebinds `model`, so re-running the cell would wrap the already
# wrapped model again; re-run the model-loading cell first.
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

trainable params: 1855499 || all params: 355894283 || trainable%: 0.5213624069370061


In [48]:
# Training configuration; the hyperparameters come from the config cell.
training_args = TrainingArguments(
    output_dir='roberta-large-lora-token-classification',
    # Optimisation settings.
    num_train_epochs=num_epochs,
    learning_rate=lr,
    weight_decay=0.01,
    # The same batch size is used for training and evaluation.
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    # Evaluation and checkpointing both run once per epoch.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

In [60]:
# Assemble the Trainer: train on the "train" split, evaluate on "validation",
# and score each evaluation with compute_metrics.
trainer = Trainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    train_dataset=tokenized_bionlp["train"],
    eval_dataset=tokenized_bionlp["validation"],
)


In [61]:
# Run fine-tuning; with the arguments above the model is evaluated and
# checkpointed once per epoch.
# NOTE(review): the recorded run printed "Could not load adapter model, make
# sure to have `peft>=0.3.0` installed", which suggests load_best_model_at_end
# did not restore the best checkpoint - confirm before relying on `model`.
trainer.train()



Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.1714,0.182816,0.741502,0.80533,0.772099,0.943393
2,0.1845,0.162271,0.766342,0.808572,0.786891,0.946482
3,0.1741,0.166993,0.751654,0.838826,0.792851,0.945557
4,0.1592,0.162313,0.783126,0.829101,0.805458,0.948823
5,0.1458,0.160937,0.778176,0.820637,0.798843,0.9478
6,0.1351,0.151958,0.791861,0.82694,0.80902,0.951047
7,0.1226,0.155131,0.794053,0.841527,0.817101,0.951991
8,0.1127,0.162576,0.791179,0.839906,0.814815,0.950732
9,0.1026,0.164145,0.797462,0.837385,0.816936,0.95087
10,0.0927,0.168549,0.796523,0.833243,0.814469,0.950437


Could not load adapter model, make sure to have `peft>=0.3.0` installed


TrainOutput(global_step=10390, training_loss=0.1389817180027746, metrics={'train_runtime': 2021.116, 'train_samples_per_second': 82.227, 'train_steps_per_second': 5.141, 'total_flos': 2.3527334960649196e+16, 'train_loss': 0.1389817180027746, 'epoch': 10.0})

In [62]:
# Sample sentence for a qualitative check of the fine-tuned model.
text = "The activation of IL-2 gene expression and NF-kappa B through CD28 requires reactive oxygen production by 5-lipoxygenase."
# Encode the raw string as PyTorch tensors (return_tensors="pt").
inputs = tokenizer(text, return_tensors="pt")

In [64]:
# Display the encoded sample (input_ids and attention_mask).
inputs

{'input_ids': tensor([[    0,    20, 29997,     9, 11935,    12,   176, 10596,  8151,     8,
         33861,    12,   330, 22181,   163,   149,  7522,  2517,  3441, 34729,
         11747,   931,    30,   195,    12, 33330, 25456,  4138,  3175,     4,
             2]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1]])}

In [65]:
# Tag the sample sentence with the fine-tuned model and print one
# (sub-token, predicted tag) pair per line.
new_model = model.to("cpu")  # run on CPU, where the encoded inputs were created
# Switch to evaluation mode explicitly so dropout (lora_dropout=0.1 is
# configured) is disabled and the predictions do not depend on the state the
# Trainer happened to leave the model in.
new_model.eval()
with torch.no_grad():
    logits = new_model(**inputs).logits

tokens = inputs.tokens()
predictions = torch.argmax(logits, dim=2)

# .tolist() yields plain Python ints, matching the integer keys of id2label.
for token, prediction in zip(tokens, predictions[0].tolist()):
    print((token, model.config.id2label[prediction]))

('<s>', 'O')
('ĠThe', 'O')
('Ġactivation', 'O')
('Ġof', 'O')
('ĠIL', 'B-DNA')
('-', 'I-DNA')
('2', 'I-DNA')
('Ġgene', 'I-DNA')
('Ġexpression', 'O')
('Ġand', 'O')
('ĠNF', 'B-protein')
('-', 'I-protein')
('k', 'I-protein')
('appa', 'I-protein')
('ĠB', 'I-protein')
('Ġthrough', 'O')
('ĠCD', 'B-protein')
('28', 'I-protein')
('Ġrequires', 'O')
('Ġreactive', 'O')
('Ġoxygen', 'O')
('Ġproduction', 'O')
('Ġby', 'O')
('Ġ5', 'B-protein')
('-', 'I-protein')
('lip', 'I-protein')
('oxy', 'I-protein')
('gen', 'I-protein')
('ase', 'I-protein')
('.', 'O')
('</s>', 'O')
