# Token classification (PyTorch)

Original: [HuggingFace Token Classification Fine-tuning Tutorial](https://huggingface.co/learn/nlp-course/en/chapter7/2#defining-the-model)

# Set up

In [1]:
# Disable tokenizers warnings when constructing pipelines
%env TOKENIZERS_PARALLELISM=false
%env MLFLOW_TRACKING_URI=http://localhost:5002

env: TOKENIZERS_PARALLELISM=false
env: MLFLOW_TRACKING_URI=http://localhost:5002


In [2]:
import os
from loguru import logger
import mlflow

In [3]:
mlflow.set_experiment("Cold Embrace - OSS LLM training data")

<Experiment: artifact_location='s3://mlflow/1', creation_time=1720508505405, experiment_id='1', last_update_time=1720508505405, lifecycle_stage='active', name='Cold Embrace - OSS LLM training data', tags={}>

# Load dataset

In [4]:
from datasets import load_dataset

raw_datasets = load_dataset("dvquys/restaurant-reviews-public-sources", token=os.environ.get('HUGGINGFACE_READ_TOKEN'))

In [5]:
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['id', 'text', 'Comments', 'tokens', 'ner_tags'],
        num_rows: 1590
    })
    val: Dataset({
        features: ['id', 'text', 'Comments', 'tokens', 'ner_tags'],
        num_rows: 398
    })
    test: Dataset({
        features: ['id', 'text', 'Comments', 'tokens', 'ner_tags'],
        num_rows: 10
    })
})

In [6]:
raw_datasets["train"][0]["tokens"]

['Good',
 'atmosphere',
 ',',
 'combination',
 'of',
 'all',
 'the',
 'hottest',
 'music',
 'dress',
 'code',
 'is',
 'relatively',
 'strict',
 'except',
 'on',
 'Fridays',
 '.']

In [7]:
raw_datasets["train"][0]["ner_tags"]

[0, 0, 0, 0, 0, 0, 0, 0, 0, 13, 14, 14, 14, 14, 0, 0, 0, 0]

In [8]:
ner_feature = raw_datasets["train"].features["ner_tags"]
ner_feature

Sequence(feature=ClassLabel(names=['O', 'B-AMBIENCE', 'I-AMBIENCE', 'B-BEVERAGE', 'I-BEVERAGE', 'B-FOOD', 'I-FOOD', 'B-LOCATION', 'I-LOCATION', 'B-OVERALL', 'I-OVERALL', 'B-PRICE', 'I-PRICE', 'B-SERVICE', 'I-SERVICE', 'B-STAFF', 'I-STAFF', 'B-VALUE', 'I-VALUE', 'B-VIEW', 'I-VIEW'], id=None), length=-1, id=None)

In [9]:
label_names = ner_feature.feature.names
label_names

['O',
 'B-AMBIENCE',
 'I-AMBIENCE',
 'B-BEVERAGE',
 'I-BEVERAGE',
 'B-FOOD',
 'I-FOOD',
 'B-LOCATION',
 'I-LOCATION',
 'B-OVERALL',
 'I-OVERALL',
 'B-PRICE',
 'I-PRICE',
 'B-SERVICE',
 'I-SERVICE',
 'B-STAFF',
 'I-STAFF',
 'B-VALUE',
 'I-VALUE',
 'B-VIEW',
 'I-VIEW']

In [10]:
words = raw_datasets["train"][1]["tokens"]
labels = raw_datasets["train"][1]["ner_tags"]
line1 = ""
line2 = ""
for word, label in zip(words, labels):
    full_label = label_names[label]
    max_length = max(len(word), len(full_label))
    line1 += word + " " * (max_length - len(word) + 1)
    line2 += full_label + " " * (max_length - len(full_label) + 1)

print(line1)
print(line2)

The lobster sandwich is     good   and the spaghetti with   Scallops and    Shrimp is     great  . 
O   B-FOOD  I-FOOD   I-FOOD I-FOOD O   O   B-FOOD    I-FOOD I-FOOD   I-FOOD I-FOOD I-FOOD I-FOOD O 


# Processing the data

## Tokenize text

In [11]:
from transformers import AutoTokenizer

model_checkpoint = "bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [12]:
tokenizer.is_fast

True

In [13]:
inputs = tokenizer(raw_datasets["train"][1]["tokens"], is_split_into_words=True)
inputs.tokens()

['[CLS]',
 'The',
 'lo',
 '##bs',
 '##ter',
 'sandwich',
 'is',
 'good',
 'and',
 'the',
 'spa',
 '##gh',
 '##etti',
 'with',
 'Sc',
 '##allo',
 '##ps',
 'and',
 'Shri',
 '##mp',
 'is',
 'great',
 '.',
 '[SEP]']

In [14]:
inputs.word_ids()

[None,
 0,
 1,
 1,
 1,
 2,
 3,
 4,
 5,
 6,
 7,
 7,
 7,
 8,
 9,
 9,
 9,
 10,
 11,
 11,
 12,
 13,
 14,
 None]

In [15]:
def align_labels_with_tokens(labels, word_ids):
    new_labels = []
    current_word = None
    for word_id in word_ids:
        if word_id != current_word:
            # Start of a new word!
            current_word = word_id
            label = -100 if word_id is None else labels[word_id]
            new_labels.append(label)
        elif word_id is None:
            # Special token
            new_labels.append(-100)
        else:
            # Same word as previous token
            label = labels[word_id]
            # If the label is B-XXX we change it to I-XXX
            if label % 2 == 1:
                label += 1
            new_labels.append(label)

    return new_labels

In [16]:
labels = raw_datasets["train"][1]["ner_tags"]
word_ids = inputs.word_ids()
print(labels)
print(align_labels_with_tokens(labels, word_ids))

[0, 5, 6, 6, 6, 0, 0, 5, 6, 6, 6, 6, 6, 6, 0]
[-100, 0, 5, 6, 6, 6, 6, 6, 0, 0, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 0, -100]


In [17]:
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples["tokens"], truncation=True, is_split_into_words=True
    )
    all_labels = examples["ner_tags"]
    new_labels = []
    for i, labels in enumerate(all_labels):
        word_ids = tokenized_inputs.word_ids(i)
        new_labels.append(align_labels_with_tokens(labels, word_ids))

    tokenized_inputs["labels"] = new_labels
    return tokenized_inputs

In [18]:
tokenized_datasets = raw_datasets.map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns=raw_datasets["train"].column_names,
)

# Fine Tuning with custom training loop

## Padding the data

In [19]:
from transformers import DataCollatorForTokenClassification

data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [20]:
batch = data_collator([tokenized_datasets["train"][i] for i in range(2)])
batch["labels"]

tensor([[-100,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,   13,
           14,   14,   14,   14,    0,    0,    0,    0,    0, -100, -100, -100],
        [-100,    0,    5,    6,    6,    6,    6,    6,    0,    0,    5,    6,
            6,    6,    6,    6,    6,    6,    6,    6,    6,    6,    0, -100]])

In [21]:
for i in range(2):
    print(tokenized_datasets["train"][i]["labels"])

[-100, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 13, 14, 14, 14, 14, 0, 0, 0, 0, 0, -100]
[-100, 0, 5, 6, 6, 6, 6, 6, 0, 0, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 0, -100]


In [22]:
from torch.utils.data import DataLoader

train_dataloader = DataLoader(
    tokenized_datasets["train"],
    shuffle=True,
    collate_fn=data_collator,
    batch_size=8,
)
eval_dataloader = DataLoader(
    tokenized_datasets["val"], collate_fn=data_collator, batch_size=8
)

## Defining the model

In [23]:
labels = raw_datasets["train"][0]["ner_tags"]
labels = [label_names[i] for i in labels]
labels

['O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'B-SERVICE',
 'I-SERVICE',
 'I-SERVICE',
 'I-SERVICE',
 'I-SERVICE',
 'O',
 'O',
 'O',
 'O']

In [24]:
id2label = {i: label for i, label in enumerate(label_names)}
label2id = {v: k for k, v in id2label.items()}

In [25]:
from transformers import AutoModelForTokenClassification

model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    id2label=id2label,
    label2id=label2id,
)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [26]:
model.config.num_labels

21

In [27]:
from torch.optim import AdamW

optimizer = AdamW(model.parameters(), lr=2e-5)

In [28]:
from accelerate import Accelerator

accelerator = Accelerator()
model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
    model, optimizer, train_dataloader, eval_dataloader
)

In [29]:
from transformers import get_scheduler

num_train_epochs = 3
num_update_steps_per_epoch = len(train_dataloader)
num_training_steps = num_train_epochs * num_update_steps_per_epoch

lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

In [30]:
from huggingface_hub import Repository, get_full_repo_name

model_name = "ner-finetune-restaurant-reviews-aspects"
repo_name = get_full_repo_name(model_name)
repo_name

'dvquys/ner-finetune-restaurant-reviews-aspects'

In [31]:
from huggingface_hub import create_repo, repo_exists
if not repo_exists(repo_name):
    create_repo(repo_name, token=os.environ.get("HUGGINGFACE_WRITE_TOKEN"))

In [32]:
output_dir = model_name
repo = Repository(output_dir, clone_from=repo_name, token=os.environ.get("HUGGINGFACE_WRITE_TOKEN"))
repo.git_pull()

For more details, please read https://huggingface.co/docs/huggingface_hub/concepts/git_vs_http.
/home/dvquys/frostmourne/lets-build-mlsys/notebooks/ner-finetune-restaurant-reviews-aspects is already a clone of https://huggingface.co/dvquys/ner-finetune-restaurant-reviews-aspects. Make sure you pull the latest changes with `repo.git_pull()`.


## Training loop

In [33]:
def postprocess(predictions, labels):
    predictions = predictions.detach().cpu().clone().numpy()
    labels = labels.detach().cpu().clone().numpy()

    # Remove ignored index (special tokens) and convert to labels
    true_labels = [[label_names[l] for l in label if l != -100] for label in labels]
    true_predictions = [
        [label_names[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    return true_labels, true_predictions

import evaluate

metric = evaluate.load("seqeval")

In [34]:
import pandas as pd

def evaluate_on_evalset(model, evalset, metric):
    """
    Params:
        model: Transformers model
        evalset: HuggingFace dataset (train, eval, test) in Data Loader format
        metric: a metric instance initiated by `import evaluate; metric = evaluate.load("seqeval")`
    """
    device = torch.device("cuda")
    model.eval()
    for batch in evalset:
        with torch.no_grad():
            outputs = model(**batch.to(device))

        predictions = outputs.logits.argmax(dim=-1)
        labels = batch["labels"]

        # Necessary to pad predictions and labels for being gathered
        predictions = accelerator.pad_across_processes(predictions, dim=1, pad_index=-100)
        labels = accelerator.pad_across_processes(labels, dim=1, pad_index=-100)

        predictions_gathered = accelerator.gather(predictions)
        labels_gathered = accelerator.gather(labels)

        true_predictions, true_labels = postprocess(predictions_gathered, labels_gathered)
        metric.add_batch(predictions=true_predictions, references=true_labels)

    results = metric.compute()

    return results

def log_evaluation_metrics(results, prefix='eval', to_mlflow=True, step=None):
    results_reformatted = {}
    aggregated = dict()
    for key, value in results.items():
        if key.startswith('overall_'):
            assert isinstance(value, float)
            metric = key.replace('overall_', '')
            metric_key = f"{prefix}_aggregated_{metric}"
            aggregated[metric] = value
            if to_mlflow:
                mlflow.log_metric(metric_key, value, step=step)
        else:
            label = key
            for metric, metric_value in value.items():
                metric_key = f"{prefix}_{key}_{metric}"
                if to_mlflow:
                    mlflow.log_metric(metric_key, metric_value, step=step)
            results_reformatted.update({key: value})
    results_reformatted.update({"aggregated": aggregated})
    results_reformatted_df = pd.DataFrame.from_dict(results_reformatted, orient='index')
    logger.info(f"\n{results_reformatted_df}")
    return results_reformatted

In [35]:
from tqdm.auto import tqdm
import torch

progress_bar = tqdm(range(num_training_steps))

with mlflow.start_run():
    # Log parameters
    mlflow.log_param("num_train_epochs", num_train_epochs)
    mlflow.log_param("num_update_steps_per_epoch", num_update_steps_per_epoch)
    mlflow.log_param("num_training_steps", num_training_steps)
    mlflow.log_param("learning_rate", optimizer.param_groups[0]['lr'])
    
    progress_bar = tqdm(range(num_training_steps))
    
    for epoch in range(num_train_epochs):
        # Training
        model.train()
        for batch in train_dataloader:
            outputs = model(**batch)
            loss = outputs.loss
            accelerator.backward(loss)
    
            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()
            progress_bar.update(1)
    
        mlflow.log_metric("train_loss", loss.item(), step=epoch)
    
        # Evaluation
        results = evaluate_on_evalset(model, eval_dataloader, metric)
        logger.info(
            f"evaluation on eval set at epoch {epoch}:"
        )
        log_evaluation_metrics(results, prefix='eval', step=epoch)
    
        # Save and upload
        accelerator.wait_for_everyone()
        unwrapped_model = accelerator.unwrap_model(model)
        unwrapped_model.save_pretrained(output_dir, save_function=accelerator.save)
        if accelerator.is_main_process:
            tokenizer.save_pretrained(output_dir)
            logger.info(f"Pushing to HuggingFace Hub...")
            repo.push_to_hub(
                commit_message=f"Training in progress epoch {epoch}", blocking=False
            )
    
    test_dataloader = DataLoader(
        tokenized_datasets["test"], collate_fn=data_collator, batch_size=8
    )
    results = evaluate_on_evalset(model, test_dataloader, metric)
    log_evaluation_metrics(results, prefix='test')

  0%|          | 0/597 [00:00<?, ?it/s]

  0%|          | 0/597 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
[32m2024-07-09 16:53:27.143[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m32[0m - [1mevaluation on eval set at epoch 0:[0m
[32m2024-07-09 16:53:28.039[0m | [1mINFO    [0m | [36m__main__[0m:[36mlog_evaluation_metrics[0m:[36m53[0m - [1m
            precision    recall        f1  number  accuracy
AMBIENCE     0.278351  0.281250  0.279793    96.0       NaN
BEVERAGE     0.000000  0.000000  0.000000     6.0       NaN
FOOD         0.398577  0.275862  0.326055   406.0       NaN
LOCATION     0.000000  0.000000  0.000000     0.0       NaN
OVERALL      0.000000  0.000000  0.000000     0.0       NaN
PRICE        0.000000  0.000000  0.000000     0.0       NaN
SERVICE      0.256684  0.155844  0.193939   308.0       NaN
STAFF        0.000000  0.000000  0.000000     0.0       NaN
VALUE        0.000000  0.000000  0.000000     0.0       NaN
VIEW         0.000000  0.000000  0.000000     0.0       NaN
AGGREGATED   0.

# Inference

In [37]:
from transformers import pipeline

In [38]:
# Local model
token_classifier = pipeline(
    "token-classification", model=output_dir, aggregation_strategy="simple", device='cuda'
)
token_classifier('Delicious food friendly staff and one good celebration!')

[{'entity_group': 'SERVICE',
  'score': 0.61868334,
  'word': 'friendly staff',
  'start': 15,
  'end': 29},
 {'entity_group': 'SERVICE',
  'score': 0.28880322,
  'word': 'celebration',
  'start': 43,
  'end': 54}]