In [22]:
# Active Learning from scratch run
# based on random sampling by increasing dataset step by step
# Loop over all al_steps with local evaluation

In [12]:
# Cloud install
!pip install datasets
!pip install transformers
!pip install evaluate
!pip install seqeval



In [13]:
# import data
from datasets import load_dataset

# raw_datasets = load_dataset("conll2003") # attention: conll2003 is already tokenized into words
# Requirements NER corpus; later cells read its "train" and "validation" splits and the
# per-example "tokens" (list of words) and "ner_tags" (integer tag ids) columns.
raw_datasets = load_dataset("limsc/requirements-entity-recognition")

Using custom data configuration limsc--requirements-entity-recognition-bbdaea1faba0d374
Found cached dataset parquet (C:/Users/dekai/.cache/huggingface/datasets/limsc___parquet/limsc--requirements-entity-recognition-bbdaea1faba0d374/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


  0%|          | 0/3 [00:00<?, ?it/s]

In [14]:
# get label names
# "ner_tags" holds integer ids; the feature metadata carries the matching label strings
ner_feature = raw_datasets["train"].features["ner_tags"]
label_names = ner_feature.feature.names  # indexed by tag id, e.g. label_names[0] == 'O'
label_names

['O',
 'B-ACT',
 'I-ACT',
 'B-ATTR',
 'I-ATTR',
 'B-RELOP',
 'I-RELOP',
 'B-QUANT',
 'I-QUANT',
 'B-ENT',
 'I-ENT']

In [15]:
# import transformer
from transformers import AutoTokenizer

# Checkpoint name; reused further down when the token-classification model is loaded
model_config = "bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(model_config)

## Preprocessing

In [16]:
# tokenize pre-tokenized input (each example already stores its sentence as a list of words)
inputs = tokenizer(raw_datasets["train"][0]["tokens"], is_split_into_words=True)
inputs.tokens()
# because the numeric words are split into several tokens ('0', '.', '4' and '0', '.', '6'),
# the word-level labels no longer line up with the tokens and realignment is needed

['[CLS]',
 'The',
 'operating',
 'humidity',
 'shall',
 'be',
 'between',
 '0',
 '.',
 '4',
 'and',
 '0',
 '.',
 '6',
 '[SEP]']

In [17]:
# example before alignment: one label per word vs. one word id per token
print(f'labels:   {raw_datasets["train"][0]["ner_tags"]}')
print(f'word_ids: {inputs.word_ids()}')

labels:   [0, 3, 4, 0, 0, 5, 7, 5, 7]
word_ids: [None, 0, 1, 2, 3, 4, 5, 6, 6, 6, 7, 8, 8, 8, None]


In [18]:
# alignment function
# special tokens get the label -100
# follow-up tokens of a word that has been split get I-XXX where the word's label is B-XXX

def align_labels_with_tokens(labels, word_ids):
    """Expand word-level tag ids to one tag id per tokenizer token.

    Args:
        labels: tag ids, one per word of the sentence.
        word_ids: for every token, the index of the word it belongs to, or
            None for a special token.

    Returns:
        A list with one entry per element of ``word_ids``: -100 for special
        tokens, the word's tag for the first token of a word, and for the
        remaining tokens of that word the same tag with odd ids (B-XXX)
        raised by one (I-XXX).
    """
    aligned = []
    previous_word = None
    for word_id in word_ids:
        if word_id is None:
            # Special token
            tag = -100
        else:
            tag = labels[word_id]
            # Continuation of the previous word: a B-XXX tag becomes I-XXX
            if word_id == previous_word and tag % 2 == 1:
                tag += 1
        previous_word = word_id
        aligned.append(tag)

    return aligned

In [19]:
# test of alignment on the first training sentence
labels = raw_datasets["train"][0]["ner_tags"]
word_ids = inputs.word_ids()
print(labels)  # word-level tags
print(align_labels_with_tokens(labels, word_ids))  # token-level tags, -100 at the special tokens

[0, 3, 4, 0, 0, 5, 7, 5, 7]
[-100, 0, 3, 4, 0, 0, 5, 7, 8, 8, 5, 7, 8, 8, -100]


In [20]:
# scaling align_labels_with_tokens function towards entire datasets by using lists in lists [[]]

def tokenize_and_align_labels(examples):
    """Tokenize a batch of pre-split sentences and attach token-level labels.

    Args:
        examples: batch mapping with "tokens" (one list of words per sentence)
            and "ner_tags" (one list of tag ids per sentence).

    Returns:
        The tokenizer output for the batch with an extra "labels" entry that
        holds, per sentence, the tags aligned to the produced tokens.
    """
    encoded = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    # word_ids(idx) maps the tokens of sentence idx back to its words
    encoded["labels"] = [
        align_labels_with_tokens(sentence_tags, encoded.word_ids(idx))
        for idx, sentence_tags in enumerate(examples["ner_tags"])
    ]
    return encoded

In [21]:
# batched mapping: run tokenization + label alignment over the loaded dataset
tokenized_datasets = raw_datasets.map(
    tokenize_and_align_labels,
    batched=True,
    # drop the raw columns so only the tokenizer outputs and "labels" remain
    remove_columns=raw_datasets['train'].column_names,
)

Loading cached processed dataset at C:/Users/dekai/.cache/huggingface/datasets/limsc___parquet/limsc--requirements-entity-recognition-bbdaea1faba0d374/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec\cache-afbf53fbfadfcca3.arrow


  0%|          | 0/1 [00:00<?, ?ba/s]

Loading cached processed dataset at C:/Users/dekai/.cache/huggingface/datasets/limsc___parquet/limsc--requirements-entity-recognition-bbdaea1faba0d374/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec\cache-4f664c8fa6d601da.arrow


## Dataset slicing

In [22]:
# Slicing the tokenized train split into 10 cumulative subsets, each 60 sentences larger than the previous one
# For shao: 646 tokens -> 10 steps of 60 each

# step -> the first (step + 1) * 60 training examples; sliced_datasets[0] = first 10%
sliced_datasets = {
    step: tokenized_datasets['train'].select(range(0, (step + 1) * 60))
    for step in range(10)
}


## Finetuning

In [23]:
# import DataCollator -> adds padding
from transformers import DataCollatorForTokenClassification

# Batches examples of different lengths; the test cell below shows labels being padded with -100
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [24]:
# test data collator on the first two training examples
batch = data_collator([tokenized_datasets['train'][i] for i in range(2)])
batch["labels"]  # the shorter example is filled up with -100 to the length of the longer one

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


tensor([[-100,    0,    3,    4,    0,    0,    5,    7,    8,    8,    5,    7,
            8,    8, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
         -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
         -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
         -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
         -100, -100, -100, -100],
        [-100,    0,    9,   10,   10,   10,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    3,    4,    4,
            4,    4,    0,    5,    7,    8,    8,    8,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0, -100]])

In [25]:
import evaluate

# seqeval works on lists of string tags and reports per-entity and overall scores (see demo below)
metric = evaluate.load("seqeval")

In [32]:
# seqeval requires inputs in form of String lists -> give labels as string
labels = raw_datasets['train'][10]["ner_tags"]
labels = [label_names[i] for i in labels]
labels  # no-op: only the last expression of a cell is displayed

# Change one position in a copy to see how the metric reacts to a wrong prediction.
# NOTE(review): the recorded output shows overall_accuracy 1.0, which suggests labels[2]
# was already "O" for this sentence, so nothing was actually changed - confirm and
# perturb an entity position instead.
predictions = labels.copy()
predictions[2] = "O"
metric.compute(predictions=[predictions], references=[labels])

{'ACT': {'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'number': 1},
 'ATTR': {'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'number': 1},
 'ENT': {'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'number': 1},
 'QUANT': {'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'number': 1},
 'RELOP': {'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'number': 1},
 'overall_precision': 1.0,
 'overall_recall': 1.0,
 'overall_f1': 1.0,
 'overall_accuracy': 1.0}

In [27]:
# write a function to strip special tokens and convert to strings -> true_labels + true_predictions
# compare true_labels with true_predictions using seqeval

import numpy as np


def compute_metrics(eval_preds):
    """Score token-classification predictions with seqeval.

    Args:
        eval_preds: pair ``(logits, labels)``; the predicted class of each
            token is the argmax over the last axis of ``logits``.

    Returns:
        Dict with the overall "precision", "recall", "f1" and "accuracy".
    """
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)

    # Reference tags as strings, skipping positions marked with the ignored index -100
    true_labels = []
    for gold_row in labels:
        true_labels.append([label_names[tag] for tag in gold_row if tag != -100])

    # Predicted tags as strings, skipping the same -100 positions of the reference
    true_predictions = []
    for pred_row, gold_row in zip(predictions, labels):
        kept = []
        for pred, tag in zip(pred_row, gold_row):
            if tag == -100:
                continue
            kept.append(label_names[pred])
        true_predictions.append(kept)

    scores = metric.compute(predictions=true_predictions, references=true_labels)
    summary = {}
    summary["precision"] = scores["overall_precision"]
    summary["recall"] = scores["overall_recall"]
    summary["f1"] = scores["overall_f1"]
    summary["accuracy"] = scores["overall_accuracy"]
    return summary

## Define the model

In [28]:
# dictionaries for mapping between integer class ids and label strings
# Ids are kept as ints on both sides: with str(i) keys, label2id would map every
# label to a string ("0", "1", ...) instead of the integer class index.
id2label = dict(enumerate(label_names))
label2id = {label: idx for idx, label in id2label.items()}

In [29]:
from transformers import AutoModelForTokenClassification

# Load the checkpoint named by model_config for token classification, passing the label mappings.
# The warning below reports that some BertForTokenClassification weights are not taken
# from the checkpoint, i.e. the model needs fine-tuning before its predictions are useful.
model = AutoModelForTokenClassification.from_pretrained(
    model_config,
    id2label=id2label,
    label2id=label2id,
)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForTokenClassification: ['cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cas

In [1]:
# Train and save models over all al_steps locally

from transformers import TrainingArguments
from transformers import Trainer

# Paths of the saved models, in AL-step order; consumed by the evaluation section
model_path_list = []

for al_step in range(10): # loop over all AL iteration steps [0 = 10%, 1=20%, etc.]
    percent = (al_step + 1) * 10  # al_step = 0 -> 10% data

    # Start every step from the pretrained checkpoint. Reusing one model object for
    # all steps would carry the weights fine-tuned in step k over into step k+1,
    # so the later runs would no longer be trained from scratch on their slice.
    step_model = AutoModelForTokenClassification.from_pretrained(
        model_config,
        id2label=id2label,
        label2id=label2id,
    )

    args = TrainingArguments(
        f"cache/bert-finetuned-ner-{percent}percent",
        evaluation_strategy="epoch",
        save_strategy="no",  # no intermediate checkpoints; the final model is saved below
        learning_rate=2e-5,
        num_train_epochs=3,
        weight_decay=0.01,
        push_to_hub=False
    )

    trainer = Trainer(
        model=step_model,
        args=args,
        train_dataset=sliced_datasets[al_step],
        eval_dataset=tokenized_datasets["validation"],
        data_collator=data_collator,
        compute_metrics=compute_metrics,
        tokenizer=tokenizer,
    )
    trainer.train()

    model_path = f"./bert-finetuned-ner-{percent}percent"
    model_path_list.append(model_path)
    trainer.save_model(model_path)

NameError: name 'al_step' is not defined

## Evaluation of local model

In [1]:
import pandas as pd
from datasets import load_dataset
from evaluate import evaluator
from transformers import pipeline

In [None]:
# evaluate model directly from local path

# dataset_test = load_dataset("conll2003", split="test").select(range(1000)).shuffle()
dataset_test = load_dataset("limsc/requirements-entity-recognition", split="test")
task_evaluator = evaluator("token-classification")

# One result dict per saved model. The loop variable is named `saved_model_path`
# rather than `model` so that the model object bound to the global name `model`
# in the fine-tuning section is not replaced by a path string.
results = [
    task_evaluator.compute(
        model_or_pipeline=saved_model_path, data=dataset_test, metric="seqeval"
    )
    for saved_model_path in model_path_list
]

# One row per model path; show only the headline quality and speed columns
df = pd.DataFrame(results, index=model_path_list)
df[["overall_f1", "overall_accuracy", "total_time_in_seconds", "samples_per_second", "latency_in_seconds"]]

In [None]:
# trainer.push_to_hub(commit_message="Training complete")