# Setup

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
## Important libs ##
import os
from pathlib import Path
import huggingface_hub
from datasets import load_dataset
from sklearn.metrics import classification_report

os.chdir(Path.cwd().parent)

from src.utils import load_env_file

load_env_file()
api_key = os.getenv("HF_TOKEN")

huggingface_hub.login(api_key)

Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


# Fine-tuning BERT for a NER model

In [3]:
raw_datasets = load_dataset("conll2003", trust_remote_code="true")
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 14041
    })
    validation: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3250
    })
    test: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3453
    })
})

In [4]:
from transformers import AutoTokenizer

model_id = "distilbert-base-uncased"

tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer(raw_datasets["train"][0]["tokens"], is_split_into_words=True)

{'input_ids': [101, 7327, 19164, 2446, 2655, 2000, 17757, 2329, 12559, 1012, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [None]:
from src.model.dataset_configs import tokenize_ner_models

tokenized_datasets = raw_datasets.map(
    tokenize_ner_models,
    batched=True,
    remove_columns=raw_datasets["train"].column_names,
)

tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 14041
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 3250
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 3453
    })
})

In [31]:
from transformers import DataCollatorForTokenClassification

data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [8]:
label_names = raw_datasets["train"].features["ner_tags"].feature.names

words = raw_datasets["train"][4]["tokens"]
labels = raw_datasets["train"][4]["ner_tags"]
line1 = ""
line2 = ""
for word, label in zip(words, labels):
    full_label = label_names[label]
    max_length = max(len(word), len(full_label))
    line1 += word + " " * (max_length - len(word) + 1)
    line2 += full_label + " " * (max_length - len(full_label) + 1)

print(line1)
print(line2)

Germany 's representative to the European Union 's veterinary committee Werner Zwingmann said on Wednesday consumers should buy sheepmeat from countries other than Britain until the scientific advice was clearer . 
B-LOC   O  O              O  O   B-ORG    I-ORG O  O          O         B-PER  I-PER     O    O  O         O         O      O   O         O    O         O     O    B-LOC   O     O   O          O      O   O       O 


## Fine-tuning distilBERT (baseline to compare with LLM later)

In [9]:
from transformers import AutoModelForTokenClassification

id2label = {i: label for i, label in enumerate(label_names)}
label2id = {v: k for k, v in id2label.items()}

model = AutoModelForTokenClassification.from_pretrained(
    model_id,
    id2label=id2label,
    label2id=label2id,
)

Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [25]:
label2id

{'O': 0,
 'B-PER': 1,
 'I-PER': 2,
 'B-ORG': 3,
 'I-ORG': 4,
 'B-LOC': 5,
 'I-LOC': 6,
 'B-MISC': 7,
 'I-MISC': 8}

In [24]:
id2label

{0: 'O',
 1: 'B-PER',
 2: 'I-PER',
 3: 'B-ORG',
 4: 'I-ORG',
 5: 'B-LOC',
 6: 'I-LOC',
 7: 'B-MISC',
 8: 'I-MISC'}

In [30]:
for i in range(5):
    print(len(tokenized_datasets["train"][i]['labels']))

11
4
8
33
39


# Fine-tuning a Language Model

We will use the annotated dataset from the previous step (pretending that we don't have the ground truth labels)

In [10]:
import wandb

project_name = "ner_fine_tuning"
group = "ner_fine_tuning"
# This will open a window so you can login to W&B.
# If that doesn't work, set your W&B API key below
# If you do, remove your key before publishing to GitHub.

# %env WANDB_API_KEY=YOUR_WANDB_API_KEY
#wandb.login()
run = wandb.init(project=project_name, group=group, mode="online")

[34m[1mwandb[0m: Currently logged in as: [33mgabrieldiasmp[0m to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [12]:
tokenized_datasets.set_format(
    type="torch",
    columns=["input_ids", "attention_mask", "labels"]
)


In [14]:
from src.model.dataset_configs import HFTextDataset
from torch.utils.data import DataLoader

In [32]:
train_ds = HFTextDataset(tokenized_datasets["train"])

train_loader = DataLoader(
    dataset=train_ds,
    batch_size=32,
    shuffle=True,
    collate_fn=data_collator
)

In [33]:
val_ds = HFTextDataset(tokenized_datasets["validation"])

val_loader = DataLoader(
    dataset=val_ds,
    batch_size=32,
    shuffle=False,
    collate_fn=data_collator
)

In [None]:
### Without padding

tokenized_datasets["train"]['labels']

[tensor([-100,    3,    0,    7,    0,    0,    0,    7,    0,    0, -100]),
 tensor([-100,    1,    2, -100]),
 tensor([-100,    5,    0,    0,    0,    0,    0, -100]),
 tensor([-100,    0,    3,    4,    0,    0,    0,    0,    0,    0,    7,    0,
            0,    0,    0,    0,    0,    7,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0, -100]),
 tensor([-100,    5,    0,    0,    0,    0,    0,    3,    4,    0,    0,    0,
            0,    1,    2,    2,    2,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    5,    0,    0,    0,    0,    0,
            0,    0, -100]),
 tensor([-100,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    3,    0,    0,    0,    0,    1,    2,    2,    2,    2,    0,
            0,    0,    0,    0, -100]),
 tensor([-100,    0,    0,    0,    0,    0,  

In [18]:
train_last_layers_only = True

if train_last_layers_only:
    for param in model.parameters():
        param.requires_grad = False

    for param in model.classifier.parameters():
        param.requires_grad = True

In [39]:
from src.model.training import HFLightningModel, train_model_lightning
from lightning.pytorch.loggers import WandbLogger

lightning_model = HFLightningModel(
    model=model, label_name="labels", learning_rate=0.05, num_classes=9, task_type="token_classification")

wandb_logger = WandbLogger(log_model="best")

trainer = train_model_lightning(
    lightning_model=lightning_model,
    train_loader=train_loader,
    val_loader=val_loader,
    logger=wandb_logger,
    max_epochs=5,
    project_name=project_name,
    group=group
)

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs


In [None]:
trainer.fit(
    model=lightning_model,
    train_dataloaders=train_loader,
    val_dataloaders=val_loader,
)

In [None]:
wandb.finish()

## Test inference

In [59]:
test_dataset = DatasetDict(
    {
        "test": dataset.select(range(0, 300))
    }
)

In [60]:
test_dataset_tokenized = tokenize_hugging_face(test_dataset, model_str="distilbert-base-uncased")

test_dataset_tokenized.set_format(
    type="torch",
    columns=["input_ids", "attention_mask", "label"]
)

#test_ds = HFTextDataset(test_dataset_tokenized)

test_loader = DataLoader(
    dataset=test_dataset_tokenized['test'],
    batch_size=32,
    shuffle=False,
)

In [61]:
f"{wandb.run.entity}/{project_name}/model-{wandb.run.id}:best"

'gabrieldiasmp/llm_annotation_ft/model-l0zsmva2:best'

In [37]:
# Define checkpoint reference.
checkpoint_reference = f"{wandb.run.entity}/{project_name}/model-{wandb.run.id}:best"

# Download checkpoint locally (if not already cached).
artifact = run.use_artifact(checkpoint_reference, type="model")
artifact_dir = artifact.download()

[34m[1mwandb[0m: Downloading large artifact model-l0zsmva2:best, 255.52MB. 1 files... 
[34m[1mwandb[0m:   1 of 1 files downloaded.  
Done. 0:0:1.1 (239.5MB/s)


In [38]:
# Load checkpoint.
model = HFLightningModel.load_from_checkpoint(str(artifact_dir) + "/model.ckpt")

/home/zeus/miniconda3/envs/cloudspace/lib/python3.10/site-packages/lightning/pytorch/utilities/parsing.py:209: Attribute 'model' is an instance of `nn.Module` and is already saved during checkpointing. It is recommended to ignore them using `self.save_hyperparameters(ignore=['model'])`.


In [47]:
import torch

predicted_labels = batch_outputs = trainer.predict(model=model, dataloaders=test_loader)
logits = torch.cat([batch_output["logits"] for batch_output in batch_outputs])
predicted_labels = torch.argmax(logits, dim=1)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
/home/zeus/miniconda3/envs/cloudspace/lib/python3.10/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:425: The 'predict_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=3` in the `DataLoader` to improve performance.


Predicting: |          | 0/? [00:00<?, ?it/s]

In [55]:
compute_metrics(test_dataset['test']['label'], predicted_labels)

{'0': {'precision': 0.7666666666666667,
  'recall': 0.42592592592592593,
  'f1-score': 0.5476190476190477,
  'support': 54.0},
 '1': {'precision': 0.84375,
  'recall': 0.9529411764705882,
  'f1-score': 0.8950276243093923,
  'support': 170.0},
 '2': {'precision': 0.5769230769230769,
  'recall': 0.5921052631578947,
  'f1-score': 0.5844155844155844,
  'support': 76.0},
 'accuracy': 0.7666666666666667,
 'macro avg': {'precision': 0.7291132478632477,
  'recall': 0.6569907885181362,
  'f1-score': 0.6756874187813414,
  'support': 300.0},
 'weighted avg': {'precision': 0.7622788461538461,
  'recall': 0.7666666666666667,
  'f1-score': 0.7538056970653656,
  'support': 300.0}}

In [49]:
len(predicted_labels)

300

In [52]:
test_dataset['test']['label']

[1,
 1,
 1,
 2,
 1,
 1,
 1,
 1,
 1,
 1,
 2,
 1,
 0,
 2,
 1,
 2,
 1,
 1,
 1,
 2,
 2,
 2,
 0,
 0,
 1,
 1,
 1,
 2,
 2,
 2,
 2,
 1,
 1,
 2,
 1,
 0,
 2,
 2,
 1,
 2,
 1,
 1,
 1,
 2,
 1,
 1,
 2,
 0,
 2,
 0,
 1,
 1,
 2,
 2,
 2,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 2,
 2,
 1,
 2,
 2,
 0,
 1,
 2,
 1,
 2,
 1,
 0,
 1,
 1,
 2,
 0,
 1,
 1,
 1,
 2,
 1,
 1,
 1,
 1,
 0,
 2,
 1,
 0,
 1,
 0,
 1,
 1,
 0,
 2,
 2,
 1,
 0,
 1,
 0,
 1,
 1,
 1,
 1,
 2,
 0,
 2,
 0,
 1,
 0,
 0,
 0,
 1,
 1,
 1,
 0,
 0,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 2,
 0,
 1,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 2,
 1,
 1,
 1,
 2,
 2,
 1,
 2,
 2,
 1,
 1,
 2,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 2,
 2,
 1,
 1,
 1,
 0,
 2,
 2,
 1,
 1,
 0,
 2,
 1,
 1,
 2,
 2,
 2,
 1,
 1,
 1,
 0,
 0,
 1,
 1,
 2,
 2,
 0,
 1,
 1,
 1,
 0,
 0,
 1,
 1,
 0,
 1,
 1,
 1,
 0,
 0,
 1,
 1,
 1,
 1,
 2,
 1,
 0,
 1,
 0,
 2,
 1,
 1,
 0,
 1,
 2,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 2,
 1,
 2,
 2,
 1,
 1,
 1,
 1,
 2,
 1,
 1,
 2,
 0,
 2,
 1,
 1,
 1,
 2,
 2,
