## Install Dependencies

In [1]:
!pip install accelerate==0.23.0 datasets evaluate transformers seqeval ipywidgets peft



In [2]:
import torch
device = "cuda:0" if torch.cuda.is_available() else "cpu"
if device != "cpu":
  torch.cuda.set_device(device)

In [3]:
from datasets import load_dataset
from transformers import (
    AutoModelForTokenClassification,
    AutoTokenizer,
    DataCollatorForTokenClassification,
    TrainingArguments,
    Trainer,
)
from peft import get_peft_config, PeftModel, PeftConfig, get_peft_model, LoraConfig, TaskType
import evaluate
import torch
import numpy as np

## Preprocess CONLL data input

In [4]:
# labels
label_names = ['O', 'B-MethodName', 'I-MethodName', 'B-HyperparameterName', 'I-HyperparameterName', 'B-HyperparameterValue', 'I-HyperparameterValue',
               'B-MetricName', 'I-MetricName', 'B-MetricValue', 'I-MetricValue', 'B-TaskName', 'I-TaskName', 'B-DatasetName', 'I-DatasetName']

id2label = {i: label for i, label in enumerate(label_names)}
label2id = {v: k for k, v in id2label.items()}

from datasets import Dataset
import os
def generate_dataset(data_dir):
  dataset = []

  conll_files = os.listdir(data_dir)
  for filename in conll_files:
    if "conll" in filename:
      filename = os.path.join(data_dir, filename)
      dataset += parse_conll(filename, label2id)
  return dataset

def parse_conll(filename, label2id):
    with open(filename) as f:
      lines = f.readlines()
      lines_info_list = []
      line_info = {"tokens": [], "tags": []}
      for line in lines:
        if line == "\n":
          line_info["tags"] = [label2id[tag] for tag in line_info["tags"]]
          lines_info_list.append(line_info)
          line_info = {"tokens": [], "tags": []}
        else:
          if "DOCSTART" in line:
            continue
          try:
            token, _, _, tag = line.rstrip().split()

            line_info["tokens"].append(token)
            line_info["tags"].append(tag)
          except:
            continue
    return lines_info_list

data_list = generate_dataset(".")
raw_dataset = Dataset.from_list(data_list)

In [5]:
raw_dataset

Dataset({
    features: ['tokens', 'tags'],
    num_rows: 6027
})

## Tokenize Dataset

In [6]:
from transformers import AutoTokenizer, AutoModelForTokenClassification, DataCollatorForTokenClassification

"""
Model choices:
- bert-base-cased
- bert-base-uncased
- dslim/bert-base-NER
- dbmdz/bert-large-cased-finetuned-conll03-english
- dslim/bert-large-NER
- QCRI/bert-base-multilingual-cased-pos-english
- Jean-Baptiste/roberta-large-ner-english
- sileod/deberta-v3-base-tasksource-nli
"""
model_checkpoint = "dslim/bert-base-NER"
lr = 1e-5
batch_size = 8
num_epochs = 1

# tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, add_prefix_space=True)

# build tokenized dataset
def align_labels_with_tokens(labels, word_ids):
    new_labels = []
    current_word = None
    for word_id in word_ids:
        if word_id != current_word:
            # Start of a new word!
            current_word = word_id
            label = -100 if word_id is None else labels[word_id]
            new_labels.append(label)
        elif word_id is None:
            # Special token
            new_labels.append(-100)
        else:
            # Same word as previous token
            label = labels[word_id]
            # If the label is B-XXX we change it to I-XXX
            if label % 2 == 1:
                label += 1
            new_labels.append(label)

    return new_labels

def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples["tokens"], truncation=True, is_split_into_words=True
    )
    all_labels = examples["tags"]
    new_labels = []
    for i, labels in enumerate(all_labels):
        word_ids = tokenized_inputs.word_ids(i)
        new_labels.append(align_labels_with_tokens(labels, word_ids))

    tokenized_inputs["labels"] = new_labels
    return tokenized_inputs

tokenized_dataset = raw_dataset.map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns=raw_dataset.column_names,
)

data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

shuffle_seed = None

if shuffle_seed:
  tokenized_dataset = tokenized_dataset.shuffle(seed=shuffle_seed)

train_dataset, test_dataset = tokenized_dataset.train_test_split(test_size=0.1).values()

Map:   0%|          | 0/6027 [00:00<?, ? examples/s]

## Define Metrics

In [7]:
from transformers import AutoModelForTokenClassification, DataCollatorForTokenClassification
import numpy as np
import evaluate

metric = evaluate.load("seqeval")

def compute_metrics(eval_preds):
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)

    # Remove ignored index (special tokens) and convert to labels
    true_labels = [[label_names[l] for l in label if l != -100] for label in labels]
    true_predictions = [
        [label_names[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    # TODO: create per-class F1 score / binary cross entropy
    all_metrics = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": all_metrics["overall_precision"],
        "recall": all_metrics["overall_recall"],
        "f1": all_metrics["overall_f1"],
        "accuracy": all_metrics["overall_accuracy"],
    }


## Initialize Model

In [8]:
from transformers import AutoConfig, AutoModelForTokenClassification

configuration = AutoConfig.from_pretrained(model_checkpoint)
configuration.update({"_num_labels": 15, 'label2id': label2id, 'id2label':id2label})
model = AutoModelForTokenClassification.from_config(configuration)


In [9]:
# model._init_weights(model.classifier)

# for param in model.parameters():
#   param.require_grad = False
# for param in model.classifier.parameters():
#   param.require_grad = True
# model = AutoModelForTokenClassification.from_pretrained(
#     model_checkpoint, num_labels=15, id2label=id2label, label2id=label2id
# )

In [10]:
model = AutoModelForTokenClassification.from_pretrained(model_checkpoint)
model.classifier = torch.nn.Linear(model.classifier.in_features, 15, bias=model.classifier.bias is not None)
model.config.update({'_num_labels': 15, 'label2id': label2id, 'id2label': id2label})
model.num_labels = 15
model.init_weights()

Some weights of the model checkpoint at dslim/bert-base-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [11]:
# from peft import TaskType, LoraConfig, get_peft_model


# peft_config = LoraConfig(
#     task_type=TaskType.TOKEN_CLS, inference_mode=False, r=16, lora_alpha=16, lora_dropout=0.1, bias="all"
# )
# model = get_peft_model(model, peft_config)

# # model.bert.print_trainable_parameters()

## Setup Trainer

In [12]:
from transformers import TrainingArguments, Trainer

model_output_dir = f"{model_checkpoint}-finetuned-ner"

args = TrainingArguments(
    output_dir=model_output_dir,
    # evaluation_strategy="epoch",
    # save_strategy="epoch",
    # learning_rate=2e-5,
    # num_train_epochs=10,
    # weight_decay=0.01,
    # push_to_hub=False,
    learning_rate=lr,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=num_epochs,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    report_to="tensorboard"
)


trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
)

## Training

In [13]:
from transformers import TrainerCallback
import torch

class ProfCallback(TrainerCallback):
    def __init__(self, prof):
        self.prof = prof

    def on_step_end(self, args, state, control, **kwargs):
        self.prof.step()


with torch.profiler.profile(activities=[torch.profiler.ProfilerActivity.CPU,
                                        torch.profiler.ProfilerActivity.CUDA],
                            schedule=torch.profiler.schedule(skip_first=3, wait=1, warmup=1, active=2, repeat=2),
                            on_trace_ready=torch.profiler.tensorboard_trace_handler('hf-training-trainer'),
                            profile_memory=True,
                            with_stack=True,
                            record_shapes=True) as prof:

    trainer.add_callback(ProfCallback(prof=prof))
    trainer.train()

# TODO: Record Eval

# TODO: Performance Analysis using torch profiler (tensorboardX)

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.2628,0.155386,0.397196,0.313653,0.350515,0.963613


[W kineto_shim.cpp:343] Profiler is not initialized: skipping step() invocation
[W kineto_shim.cpp:343] Profiler is not initialized: skipping step() invocation
[W kineto_shim.cpp:343] Profiler is not initialized: skipping step() invocation
INFO:2023-10-27 13:15:16 606808:606808 init.cpp:149] If you see CUPTI_ERROR_INSUFFICIENT_PRIVILEGES, refer to https://developer.nvidia.com/nvidia-development-tools-solutions-err-nvgpuctrperm-cupti
STAGE:2023-10-27 13:15:16 606808:606808 ActivityProfilerController.cpp:311] Completed Stage: Warm Up
STAGE:2023-10-27 13:15:16 606808:606808 ActivityProfilerController.cpp:317] Completed Stage: Collection
STAGE:2023-10-27 13:15:16 606808:606808 ActivityProfilerController.cpp:321] Completed Stage: Post Processing
STAGE:2023-10-27 13:15:18 606808:606808 ActivityProfilerController.cpp:311] Completed Stage: Warm Up
STAGE:2023-10-27 13:15:18 606808:606808 ActivityProfilerController.cpp:317] Completed Stage: Collection
STAGE:2023-10-27 13:15:18 606808:606808 Acti

## Inference

In [None]:
def conll_format(results):
    conll_output = ""
    for result in results:
      for word, label in result:
        conll_output += f"{word}\t{label}\n"
      conll_output += "\n"
    return conll_output

def split_text(text, chunk_size):
    return [text[i:i+chunk_size]
          for i in range(0, len(text), chunk_size)]
  
def inference(input_file, output_file):
    text = []
    with open(input_file, "r") as file:
        for line in file:
            text.append(line.split())
    results = []
    for sentence in text:
        result = []
            
        inputs = tokenizer(sentence, return_tensors='pt', is_split_into_words=True, padding='max_length').to(device)
        word_ids = inputs.word_ids()
        print(len(word_ids))
        
        s = inputs['input_ids'].shape[1]
        with torch.no_grad():
            logits = model(**inputs).logits
        tokens = inputs.tokens()
        
        predictions = torch.argmax(logits, dim=2)
        predictions = predictions[0].cpu().numpy()
        
        prev_word_id = None
        for token, prediction, word_id in zip(tokens, predictions, word_ids):
            if word_id != None and word_id != prev_word_id:
                prev_word_id = word_id
                result.append((sentence[word_id], model.config.id2label[prediction]))
        results.append(result)
    conll_predictions = conll_format(results)

    with open(output_file, "w") as f:
        f.write(conll_predictions)

model_name = model_checkpoint.replace('/', '_')
inference(f"../data/bert.txt", f"../data/bert_output_{model_name}.conll")
inference(f"../data/test.txt", f"../data/test_output_{model_name}.conll")
inference(f"../data/test2.txt", f"../data/test2_output_{model_name}.conll")

NameError: name 'split' is not defined

## PEFT

In [None]:
# !pip install torch_tb_profiler

In [None]:
# !tensorboard --logdir=./hf-training-trainer

In [None]:
# from numba import cuda
# device = cuda.get_current_device()
# device.reset()