# NER with PEFT

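In this notebook we use PEFT (a LoRA adapter) to fine-tune `distilroberta-base` with a masked-language-modelling objective on the text of the CoNLLpp dataset, a dataset normally used for token classification (NER).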

In [1]:
import json
import wandb
import torch
import random
import numpy as np
import pandas as pd

In [2]:
from datasets import load_dataset, Dataset

import transformers
from transformers import pipeline
from transformers import AutoTokenizer
from transformers import AutoModelForMaskedLM
from transformers import DataCollatorForLanguageModeling
from transformers import TrainingArguments, Trainer

In [3]:
# Seed every random number source used by this notebook with the same value.
seed = 42
for seed_fn in (
    random.seed,
    np.random.seed,
    torch.manual_seed,
    torch.cuda.manual_seed,
    transformers.set_seed,
):
    seed_fn(seed)

In [4]:
# Experiment configuration shared by the rest of the notebook.
project_name = "PEFT-NER"  # wandb project passed to wandb.init
access_tokens_path = "./data/access_tokens.json"  # JSON file that holds the wandb API key
model_ckpt = "distilroberta-base"  # base checkpoint for the tokenizer and the model
data_ckpt = "conllpp"  # dataset identifier passed to load_dataset

# Run name (also used for wandb) and the directory where training outputs are written.
model_name = f"{model_ckpt}-finetuned-{data_ckpt}-mlm-adapters"
model_path = f"./models/{model_name}"

In [5]:
# Load the dataset identified by `data_ckpt` (train, validation and test splits).
raw_datasets = load_dataset(data_ckpt)

Downloading data:   0%|          | 0.00/1.23M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/312k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/279k [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

In [6]:
# Inspect the available splits, their columns and their row counts.
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 14041
    })
    validation: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3250
    })
    test: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3453
    })
})

In [7]:
# Columns that are dropped in the next cell; only `tokens` is kept afterwards.
cols_to_remove = ["id", "pos_tags", "chunk_tags", "ner_tags"]  # used for conll datasets

In [8]:
# Drop the columns listed in `cols_to_remove` from every split.
# NOTE: this rebinds `raw_datasets`, so re-running the cell without reloading the data will fail.
raw_datasets = raw_datasets.remove_columns(cols_to_remove)

In [9]:
# Confirm that only the `tokens` column remains in each split.
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['tokens'],
        num_rows: 14041
    })
    validation: Dataset({
        features: ['tokens'],
        num_rows: 3250
    })
    test: Dataset({
        features: ['tokens'],
        num_rows: 3453
    })
})

In [10]:
# Rebuild every sentence as one space-joined string stored in a new `text`
# column; the columns that existed before the map are removed.
raw_datasets = raw_datasets.map(
    lambda batch: {"text": [" ".join(words) for words in batch["tokens"]]},
    batched=True,
    remove_columns=raw_datasets["train"].column_names,
)

Map:   0%|          | 0/14041 [00:00<?, ? examples/s]

Map:   0%|          | 0/3250 [00:00<?, ? examples/s]

Map:   0%|          | 0/3453 [00:00<?, ? examples/s]

In [11]:
# Look at the first training example to check the joined text.
raw_datasets["train"][0]

{'text': 'EU rejects German call to boycott British lamb .'}

In [12]:
# Load the tokenizer that belongs to the base checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

In [13]:
def tokenize(batch):
    """Tokenize the ``text`` entries of ``batch`` with the notebook-level ``tokenizer``.

    Truncation is enabled; no padding is requested here.

    Args:
        batch: mapping with a ``"text"`` entry, as supplied by ``Dataset.map``.

    Returns:
        The object returned by the tokenizer call.
    """
    return tokenizer(batch["text"], truncation=True)

In [14]:
# Tokenize every split in batches. The pre-existing columns are dropped so that
# only the tokenizer outputs remain.
text_columns = raw_datasets["train"].column_names
tokenized_datasets = raw_datasets.map(tokenize, batched=True, remove_columns=text_columns)

Map:   0%|          | 0/14041 [00:00<?, ? examples/s]

Map:   0%|          | 0/3250 [00:00<?, ? examples/s]

Map:   0%|          | 0/3453 [00:00<?, ? examples/s]

In [15]:
# Check the features produced by tokenization.
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 14041
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 3250
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 3453
    })
})

In [16]:
from adapters import AutoAdapterModel, init, AdapterConfig

In [17]:
# Load the base checkpoint through the adapters library so adapters and heads can be attached.
model = AutoAdapterModel.from_pretrained(model_ckpt)

Some weights of RobertaAdapterModel were not initialized from the model checkpoint at distilroberta-base and are newly initialized: ['heads.default.3.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [18]:
# Print the adapter summary; no adapter has been added at this point.
print(model.adapter_summary())

Name                     Architecture         #Param      %Param  Active   Train
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
Full model                                82,118,400     100.000               1


In [19]:
# Load the "lora" adapter configuration with r=8 and alpha=32.
adapter_config = AdapterConfig.load("lora", r=8, alpha=32)

In [20]:
# Register a new adapter named "conllpp_lora_mlm" that uses `adapter_config`.
model.add_adapter("conllpp_lora_mlm", config=adapter_config)

In [21]:
# Print the summary again; the newly added adapter is listed now.
print(model.adapter_summary())

Name                     Architecture         #Param      %Param  Active   Train
--------------------------------------------------------------------------------
conllpp_lora_mlm         lora                147,456       0.180       0       1
--------------------------------------------------------------------------------
Full model                                82,118,400     100.000               1


In [22]:
# Add a masked-language-modelling head named "conllpp_head_mlm".
model.add_masked_lm_head("conllpp_head_mlm")

In [23]:
# Print the summary after adding the head (the head is not listed as a row here).
print(model.adapter_summary())

Name                     Architecture         #Param      %Param  Active   Train
--------------------------------------------------------------------------------
conllpp_lora_mlm         lora                147,456       0.180       0       1
--------------------------------------------------------------------------------
Full model                                82,118,400     100.000               1


In [24]:
# Activate the LoRA adapter.
model.set_active_adapters(["conllpp_lora_mlm"])

In [25]:
# Prepare the adapter for training; the summary in the next cell reports
# Train = 0 for the full model afterwards.
model.train_adapter(["conllpp_lora_mlm"])

In [26]:
# Check which parts are active and trainable before training starts.
print(model.adapter_summary())

Name                     Architecture         #Param      %Param  Active   Train
--------------------------------------------------------------------------------
conllpp_lora_mlm         lora                147,456       0.180       1       1
--------------------------------------------------------------------------------
Full model                                82,118,400     100.000               0


In [27]:
# Collator that builds masked-language-modelling batches;
# `mlm_probability=0.15` is the masking probability.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm_probability=0.15
)

In [28]:
# Read the Weights & Biases API key from a local JSON file so that the key is
# never hard-coded in the notebook. The key is looked up at ["wandb"]["login"].
# The file is decoded explicitly as UTF-8 (the encoding JSON files are expected
# to use) instead of the platform's default locale encoding, which differs
# between operating systems.
with open(access_tokens_path, encoding="utf-8") as f:
    login_key = json.load(f)["wandb"]["login"]

wandb.login(key=login_key)

[34m[1mwandb[0m: Currently logged in as: [33me_hossam96[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: C:\Users\ehhho\.netrc


True

In [29]:
# Start a wandb run in `project_name`, named after the model.
wandb.init(project=project_name, name=model_name)

VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011111111111111112, max=1.0…

In [30]:
# Training configuration; outputs are written to `model_path`.
args = TrainingArguments(
    model_path,
    overwrite_output_dir=True,
    num_train_epochs=5,
    load_best_model_at_end=True,  # evaluation and save strategies below are both "epoch"
    evaluation_strategy="epoch",
    save_strategy="epoch",
    report_to="wandb",  # metrics go to the wandb run started earlier
    seed=seed,
    data_seed=seed,
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
    learning_rate=1e-4,
    weight_decay=1e-3,
    warmup_ratio=0.05,
)

In [31]:
from adapters import AdapterTrainer

# Build the trainer from the adapters library with the train/validation splits,
# the masked-language-modelling collator and the tokenizer.
trainer = AdapterTrainer(
    model=model,
    args=args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
)

In [32]:
# Run training with the configured arguments.
trainer.train()

You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss
1,No log,4.935576
2,No log,4.034911
3,6.289700,3.641251
4,6.289700,3.583717
5,3.918800,3.531677


Overwriting existing adapter 'conllpp_lora_mlm'.


TrainOutput(global_step=1100, training_loss=4.983563315651634, metrics={'train_runtime': 2760.7222, 'train_samples_per_second': 25.43, 'train_steps_per_second': 0.398, 'total_flos': 1087791338930472.0, 'train_loss': 4.983563315651634, 'epoch': 5.0})

In [33]:
# Evaluate on the test split; the reported metric names carry the "test" prefix.
trainer.evaluate(tokenized_datasets["test"], metric_key_prefix="test")

{'test_loss': 3.8876593112945557,
 'test_runtime': 28.9795,
 'test_samples_per_second': 119.153,
 'test_steps_per_second': 1.863,
 'epoch': 5.0}

In [34]:
# Save the training result to `model_path`; later cells load the adapter,
# the head and the tokenizer from this directory.
trainer.save_model(model_path)

In [35]:
# End the wandb run that was started with wandb.init().
wandb.finish()

VBox(children=(Label(value='0.030 MB of 0.044 MB uploaded (0.003 MB deduped)\r'), FloatProgress(value=0.675803…

0,1
eval/loss,█▄▂▁▁
eval/runtime,▁▃██▁
eval/samples_per_second,█▆▁▁█
eval/steps_per_second,█▆▁▁█
test/loss,▁
test/runtime,▁
test/samples_per_second,▁
test/steps_per_second,▁
train/epoch,▁▃▃▅▆▇███
train/global_step,▁▃▃▄▆▇███

0,1
eval/loss,3.53168
eval/runtime,36.2671
eval/samples_per_second,89.613
eval/steps_per_second,1.406
test/loss,3.88766
test/runtime,28.9795
test/samples_per_second,119.153
test/steps_per_second,1.863
train/epoch,5.0
train/global_step,1100.0


In [36]:
# Re-instantiate the base checkpoint; the following cells load the saved
# adapter and head into it.
# NOTE: this rebinds `model`, replacing the model object that was trained.
model = AutoAdapterModel.from_pretrained(model_ckpt)

Some weights of RobertaAdapterModel were not initialized from the model checkpoint at distilroberta-base and are newly initialized: ['heads.default.3.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [37]:
# Load the saved LoRA adapter from its sub-directory of the output folder.
model.load_adapter(f"{model_path}/conllpp_lora_mlm")

'conllpp_lora_mlm'

In [38]:
# Load the saved masked-language-modelling head from its sub-directory of the output folder.
model.load_head(f"{model_path}/conllpp_head_mlm")

('./models/distilroberta-base-finetuned-conllpp-mlm-adapters/conllpp_head_mlm',
 'conllpp_head_mlm')

In [39]:
# Load the tokenizer from the training output directory.
tokenizer = AutoTokenizer.from_pretrained(model_path)

In [40]:
# Summary right after loading: the adapter is listed but not active yet.
print(model.adapter_summary())

Name                     Architecture         #Param      %Param  Active   Train
--------------------------------------------------------------------------------
conllpp_lora_mlm         lora                147,456       0.180       0       1
--------------------------------------------------------------------------------
Full model                                82,118,400     100.000               1


In [41]:
# Activate the loaded adapter.
model.set_active_adapters(["conllpp_lora_mlm"])

In [42]:
# Confirm that the adapter is now reported as active.
print(model.adapter_summary())

Name                     Architecture         #Param      %Param  Active   Train
--------------------------------------------------------------------------------
conllpp_lora_mlm         lora                147,456       0.180       1       1
--------------------------------------------------------------------------------
Full model                                82,118,400     100.000               1


In [43]:
# Wrap the model and tokenizer in a "fill-mask" pipeline.
# NOTE(review): the variable is named `ner` although the pipeline task is
# "fill-mask"; the next cell relies on this name.
ner = pipeline("fill-mask", model=model, tokenizer=tokenizer)

In [44]:
# Ask the fill-mask pipeline for candidate replacements of the <mask> token.
ner("I love Egypt<mask>")

[{'score': 0.9482322931289673,
  'token': 479,
  'token_str': '.',
  'sequence': 'I love Egypt.'},
 {'score': 0.010403621941804886,
  'token': 1666,
  'token_str': '...',
  'sequence': 'I love Egypt...'},
 {'score': 0.00968374963849783,
  'token': 4832,
  'token_str': ' :',
  'sequence': 'I love Egypt :'},
 {'score': 0.008040284737944603,
  'token': 22,
  'token_str': ' "',
  'sequence': 'I love Egypt "'},
 {'score': 0.004114451352506876,
  'token': 2156,
  'token_str': ',',
  'sequence': 'I love Egypt,'}]