## Install Dependencies

In [None]:
!pip install accelerate==0.23.0 datasets evaluate transformers seqeval ipywidgets peft

Collecting accelerate==0.23.0
  Downloading accelerate-0.23.0-py3-none-any.whl (258 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m258.1/258.1 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets
  Downloading datasets-2.14.6-py3-none-any.whl (493 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m493.7/493.7 kB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting evaluate
  Downloading evaluate-0.4.1-py3-none-any.whl (84 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m10.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting transformers
  Downloading transformers-4.34.1-py3-none-any.whl (7.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.7/7.7 MB[0m [31m16.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m6.7 MB/s[0m eta [36m

In [None]:
import torch
# Use the first GPU when CUDA is usable, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda:0"
    # Make cuda:0 the current CUDA device for subsequent allocations.
    torch.cuda.set_device(device)
else:
    device = "cpu"

In [None]:
from datasets import load_dataset
from transformers import (
    AutoModelForTokenClassification,
    AutoTokenizer,
    DataCollatorForTokenClassification,
    TrainingArguments,
    Trainer,
)
from peft import get_peft_config, PeftModel, PeftConfig, get_peft_model, LoraConfig, TaskType
import evaluate
import torch
import numpy as np

## Preprocess CoNLL Input Data

In [None]:
# BIO label inventory. Index 0 is 'O'; after that every entity type contributes
# its B- tag (odd id) immediately followed by its I- tag (even id).
label_names = [
    'O',
    'B-MethodName', 'I-MethodName',
    'B-HyperparameterName', 'I-HyperparameterName',
    'B-HyperparameterValue', 'I-HyperparameterValue',
    'B-MetricName', 'I-MetricName',
    'B-MetricValue', 'I-MetricValue',
    'B-TaskName', 'I-TaskName',
    'B-DatasetName', 'I-DatasetName',
]

# Lookup tables between integer ids and tag strings.
id2label = dict(enumerate(label_names))
label2id = {name: idx for idx, name in enumerate(label_names)}

from datasets import Dataset
import os
def generate_dataset(data_dir):
  """Collect the parsed sentences of every CoNLL file directly inside `data_dir`.

  A directory entry is used when its name contains "conll" and it is a regular
  file. Entries are visited in sorted name order so the resulting row order is
  the same on every run and filesystem.

  Args:
    data_dir: Path of the directory to scan (not recursive).

  Returns:
    A list of {"tokens": [...], "tags": [...]} dicts, one per sentence, as
    produced by parse_conll using the module-level label2id mapping.
  """
  dataset = []

  # os.listdir returns names in arbitrary order; sort for reproducibility.
  for filename in sorted(os.listdir(data_dir)):
    path = os.path.join(data_dir, filename)
    # Skip directories (or other non-files) that merely have "conll" in their name.
    if "conll" in filename and os.path.isfile(path):
      dataset += parse_conll(path, label2id)
  return dataset

def parse_conll(filename, label2id):
    """Parse one CoNLL-style file into a list of sentences.

    Each data line is expected to hold exactly four whitespace-separated
    columns; the first is the token and the last is its tag. Blank (or
    whitespace-only) lines separate sentences. Lines containing "DOCSTART" and
    lines that do not have four columns are skipped.

    Args:
        filename: Path of the file to read.
        label2id: Mapping from tag string to integer id.

    Returns:
        A list of {"tokens": [str, ...], "tags": [int, ...]} dicts, one per
        non-empty sentence, in file order.

    Raises:
        KeyError: If a tag in the file is missing from `label2id`.
    """
    lines_info_list = []
    tokens, tags = [], []

    def flush():
        # Emit the sentence collected so far; empty sentences (e.g. the blank
        # line after a DOCSTART marker) are not emitted.
        if tokens:
            lines_info_list.append(
                {"tokens": list(tokens), "tags": [label2id[tag] for tag in tags]}
            )
        tokens.clear()
        tags.clear()

    with open(filename) as f:
        for line in f:
            if not line.strip():
                flush()
                continue
            if "DOCSTART" in line:
                continue
            fields = line.split()
            if len(fields) != 4:
                # Malformed row: skip it instead of hiding every error behind a bare except.
                continue
            token, _, _, tag = fields
            tokens.append(token)
            tags.append(tag)

    # A file that does not end with a blank line still has a pending sentence.
    flush()
    return lines_info_list

# Parse every CoNLL file found in the current working directory into one list of
# {"tokens", "tags"} records, then wrap that list in a Hugging Face Dataset.
data_list = generate_dataset(".")
raw_dataset = Dataset.from_list(data_list)

In [None]:
raw_dataset

Dataset({
    features: ['tokens', 'tags'],
    num_rows: 6027
})

## Tokenize Dataset

In [None]:
from transformers import AutoTokenizer, AutoModelForTokenClassification, DataCollatorForTokenClassification

"""
Model choices:
- bert-base-cased
- bert-base-uncased
- dslim/bert-base-NER
- dbmdz/bert-large-cased-finetuned-conll03-english
- dslim/bert-large-NER
- QCRI/bert-base-multilingual-cased-pos-english
- Jean-Baptiste/roberta-large-ner-english
"""
# Checkpoint to fine-tune; swap in any entry from the list above.
model_checkpoint = "bert-base-cased"
# Training hyperparameters, consumed by TrainingArguments further down.
lr = 1e-3
batch_size = 8  # used for both the train and the eval per-device batch size
num_epochs = 10

# add_prefix_space is forwarded to the tokenizer. Presumably it is set so the
# byte-level BPE (RoBERTa) choice above accepts pre-split words with
# is_split_into_words=True — TODO confirm it is a no-op for the BERT checkpoints.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, add_prefix_space=True)

# build tokenized dataset
def align_labels_with_tokens(labels, word_ids):
    """Expand word-level label ids to one label id per sub-word token.

    Args:
        labels: Label ids, one per original word.
        word_ids: For each token, the index of the word it came from, or None
            for a special token.

    Returns:
        A list as long as `word_ids`: -100 for special tokens, the word's label
        for its first token, and for continuation tokens the word's label with
        an odd (B-) id bumped to the following even (I-) id.
    """
    aligned = []
    previous_word = None
    for word_id in word_ids:
        starts_new_word = word_id != previous_word
        previous_word = word_id

        # Special tokens never carry a label.
        if word_id is None:
            aligned.append(-100)
            continue

        tag = labels[word_id]
        # Continuation piece of the same word: a B-XXX tag becomes I-XXX.
        if not starts_new_word and tag % 2 == 1:
            tag += 1
        aligned.append(tag)

    return aligned

def tokenize_and_align_labels(examples):
    """Tokenize a batch of pre-split sentences and attach token-level labels.

    Args:
        examples: Batch with "tokens" (list of word lists) and "tags" (list of
            word-level label-id lists).

    Returns:
        The tokenizer output for the batch with an added "labels" entry holding
        one aligned label list per sentence.
    """
    encoded = tokenizer(
        examples["tokens"],
        is_split_into_words=True,
        truncation=True,
    )
    # word_ids(row) maps each token of sentence `row` back to its source word.
    encoded["labels"] = [
        align_labels_with_tokens(word_labels, encoded.word_ids(row))
        for row, word_labels in enumerate(examples["tags"])
    ]
    return encoded

# Tokenize the whole dataset in batches; the raw columns are dropped so only the
# tokenizer outputs and the aligned labels remain.
tokenized_dataset = raw_dataset.map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns=raw_dataset.column_names,
)

# Collator that pads each batch for token classification.
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

# Set to an int for a reproducible shuffle and train/test split; None keeps both random.
shuffle_seed = None

# Compare against None so that a seed of 0 is honoured.
if shuffle_seed is not None:
  tokenized_dataset = tokenized_dataset.shuffle(seed=shuffle_seed)

# train_test_split shuffles on its own, so it must receive the seed as well;
# seed=None is its default and preserves the previous behaviour.
train_dataset, test_dataset = tokenized_dataset.train_test_split(test_size=0.1, seed=shuffle_seed).values()

Map:   0%|          | 0/6027 [00:00<?, ? examples/s]

## Define Metrics

In [None]:
from transformers import AutoModelForTokenClassification, DataCollatorForTokenClassification
import numpy as np
import evaluate

# Load the "seqeval" metric once at module level; compute_metrics below reuses it.
metric = evaluate.load("seqeval")

def compute_metrics(eval_preds):
    """Score predictions with the module-level seqeval metric.

    Args:
        eval_preds: A (logits, labels) pair; positions whose label is -100 are
            excluded from scoring.

    Returns:
        Dict with the overall "precision", "recall", "f1" and "accuracy".
    """
    logits, labels = eval_preds
    predicted_ids = np.argmax(logits, axis=-1)

    # Gold tags: drop ignored positions and map ids to tag strings.
    reference_tags = [
        [label_names[tag_id] for tag_id in row if tag_id != -100]
        for row in labels
    ]

    # Predicted tags, kept only where the gold label is not ignored.
    predicted_tags = []
    for pred_row, label_row in zip(predicted_ids, labels):
        predicted_tags.append(
            [label_names[p] for p, l in zip(pred_row, label_row) if l != -100]
        )

    # TODO: create per-class F1 score / binary cross entropy
    scores = metric.compute(predictions=predicted_tags, references=reference_tags)
    summary = {
        "precision": scores["overall_precision"],
        "recall": scores["overall_recall"],
        "f1": scores["overall_f1"],
        "accuracy": scores["overall_accuracy"],
    }
    return summary


## Initialize Model

In [None]:
from transformers import AutoConfig, AutoModelForTokenClassification

# Build a config for the chosen checkpoint and attach the NER label space.
# The label count is derived from label_names instead of a hard-coded 15, so it
# cannot drift from the label list.
configuration = AutoConfig.from_pretrained(model_checkpoint)
configuration.update({"_num_labels": len(label_names), 'label2id': label2id, 'id2label': id2label})
# from_config instantiates the architecture only; no pretrained weights are loaded.
# NOTE(review): `model` is rebound by the from_pretrained cell further down.
model = AutoModelForTokenClassification.from_config(configuration)


In [None]:
# Re-initialise the classification head.
model._init_weights(model.classifier)

# Freeze every parameter, then unfreeze only the classifier head.
# The attribute autograd reads is `requires_grad`; assigning `require_grad`
# merely creates an unused attribute and freezes nothing.
for param in model.parameters():
  param.requires_grad = False
for param in model.classifier.parameters():
  param.requires_grad = True

In [None]:
# Load the pretrained checkpoint with a token-classification head sized to the
# label list (derived from label_names rather than a hard-coded 15).
model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint, num_labels=len(label_names), id2label=id2label, label2id=label2id
)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
from peft import TaskType, LoraConfig, get_peft_model


# LoRA adapter settings for token classification, configured for training.
peft_config = LoraConfig(
    task_type=TaskType.TOKEN_CLS,
    inference_mode=False,
    r=16,
    lora_alpha=16,
    lora_dropout=0.1,
    bias="all",
)
# Wrap the base model with the adapter configuration; `model` now refers to the PEFT model.
model = get_peft_model(model, peft_config)

# model.bert.print_trainable_parameters()

## Setup Trainer

In [None]:
from transformers import TrainingArguments, Trainer

# Output directory is named after the base checkpoint.
model_output_dir = f"{model_checkpoint}-finetuned-ner"

args = TrainingArguments(
    output_dir=model_output_dir,
    # Optimisation
    learning_rate=lr,
    weight_decay=0.01,
    num_train_epochs=num_epochs,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    # Evaluate and checkpoint once per epoch, then restore the best checkpoint.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    # Logging
    report_to="tensorboard",
)


# Wire the model, data and metrics together. Evaluation runs on the held-out
# split produced by train_test_split above.
trainer = Trainer(
    model=model,
    args=args,
    tokenizer=tokenizer,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
)

## Training

In [None]:
from transformers import TrainerCallback
import torch

class ProfCallback(TrainerCallback):
    """Trainer callback that advances a profiler once per training step."""

    def __init__(self, prof):
        # prof: any object exposing step(); here a torch.profiler.profile instance.
        self.prof = prof

    def on_step_end(self, args, state, control, **kwargs):
        # Tell the profiler that one step has finished so its schedule can advance.
        self.prof.step()


# Always profile the CPU; add CUDA only when a GPU is actually available, matching
# the CPU fallback used when `device` is selected at the top of the notebook.
profiler_activities = [torch.profiler.ProfilerActivity.CPU]
if torch.cuda.is_available():
    profiler_activities.append(torch.profiler.ProfilerActivity.CUDA)

with torch.profiler.profile(activities=profiler_activities,
                            schedule=torch.profiler.schedule(skip_first=3, wait=1, warmup=1, active=2, repeat=2),
                            on_trace_ready=torch.profiler.tensorboard_trace_handler('hf-training-trainer'),
                            profile_memory=True,
                            with_stack=True,
                            record_shapes=True) as prof:

    prof_callback = ProfCallback(prof=prof)
    trainer.add_callback(prof_callback)
    try:
        trainer.train()
    finally:
        # Detach the callback so re-running this cell does not stack callbacks
        # that still point at an already-closed profiler.
        trainer.remove_callback(prof_callback)

# TODO: Record Eval

# TODO: Performance Analysis using torch profiler (tensorboardX)

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.2153,0.119044,0.336245,0.305556,0.320166,0.964778
2,0.1184,0.103856,0.485106,0.452381,0.468172,0.970928
3,0.0865,0.086363,0.481752,0.52381,0.501901,0.971549
4,0.0704,0.085824,0.505017,0.599206,0.548094,0.973599
5,0.0569,0.102814,0.438903,0.698413,0.539051,0.968195
6,0.0413,0.091967,0.561265,0.563492,0.562376,0.974655
7,0.0329,0.081929,0.535354,0.630952,0.579235,0.975711
8,0.0237,0.095676,0.555932,0.650794,0.599634,0.975463
9,0.0191,0.096814,0.558824,0.678571,0.612903,0.97596
10,0.0125,0.098242,0.568562,0.674603,0.61706,0.977078


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## Inspect Profiler Traces in TensorBoard

In [None]:
!pip install torch_tb_profiler



In [None]:
!tensorboard --logdir=./hf-training-trainer

2023-10-24 21:56:30.792817: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2023-10-24 21:56:30.792877: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2023-10-24 21:56:30.792918: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered

NOTE: Using experimental fast data loading logic. To disable, pass
    "--load_fast=false" and report issues on GitHub. More details:
    https://github.com/tensorflow/tensorboard/issues/4784

I1024 21:56:35.335391 137989413910080 plugin.py:429] Monitor runs begin
I1024 21:56:35.335770 137989413910080 plugin.py:444] Find run directory /content/hf-training-tra

KeyboardInterrupt: ignored

In [None]:
from numba import cuda
# NOTE(review): this rebinds `device`, which an earlier cell set to a torch
# device string; anything run afterwards sees the numba device object instead.
device = cuda.get_current_device()
# Reset the current CUDA device through numba — presumably to release GPU memory
# at the end of the session; confirm this is the intent.
device.reset()