In [1]:
# Active Learning from scratch run
# based on uncertainty sampling (sentences with the lowest average certainty score are added first), increasing the dataset step by step
# Loop over all al_steps with local evaluation

In [12]:
# Cloud install
!pip install datasets
!pip install transformers
!pip install evaluate
!pip install seqeval



In [28]:
# Login into huggingface_hub
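# Token: <redacted> -- never commit access tokens; revoke the token that was stored here and log in interactively via notebook_login() or read it from an environment variable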
# from huggingface_hub import notebook_login
# notebook_login()

In [46]:
# import
from datasets import load_dataset
from transformers import AutoTokenizer

# Load the NER dataset by its Hugging Face Hub id.
# NOTE(review): the evaluation cell at the end of the notebook loads
# "dxiao/requirements-ner" (without the "-id" suffix) -- confirm that both ids
# point to the intended data.
raw_datasets = load_dataset("dxiao/requirements-ner-id")
# Checkpoint name shared by the tokenizer here and the model created further down.
model_config = "bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(model_config)

Using custom data configuration dxiao--requirements-ner-a9d27206730c3bd0
Found cached dataset json (C:/Users/dekai/.cache/huggingface/datasets/dxiao___json/dxiao--requirements-ner-a9d27206730c3bd0/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab)


  0%|          | 0/3 [00:00<?, ?it/s]

In [47]:
# get label names
# `ner_tags` is read from the train split's feature schema; the names of its inner
# feature are the label strings, and a label id is used as an index into this list
# (see compute_metrics).
ner_feature = raw_datasets["train"].features["ner_tags"] 
label_names = ner_feature.feature.names

## Preprocessing

In [48]:
# alignment function
# special tokens = -100
# for words that were split into several sub-tokens: keep the word's label on the first sub-token and turn B-XXX into I-XXX on the following ones

def align_labels_with_tokens(labels, word_ids):
    """Expand word-level labels to one label per sub-token.

    Args:
        labels: label ids, one per word.
        word_ids: for every sub-token, the index of the word it belongs to,
            or None for a special token.

    Returns:
        A list with one label id per entry of ``word_ids``. Special tokens get
        -100; the first sub-token of a word gets the word's label; any further
        sub-token of the same word gets the word's label, bumped by one when
        that label is odd (odd ids are treated as B-XXX and id + 1 as the
        matching I-XXX -- TODO confirm this ordering against label_names).
    """
    aligned = []
    previous_word = None
    for word in word_ids:
        if word is None:
            # Special token
            aligned.append(-100)
        elif word != previous_word:
            # First sub-token of a new word keeps the word's label
            aligned.append(labels[word])
        else:
            # Continuation of the previous word: B-XXX becomes I-XXX
            tag = labels[word]
            aligned.append(tag + 1 if tag % 2 == 1 else tag)
        previous_word = word

    return aligned

In [49]:
# scaling align_labels_with_tokens function towards entire datasets by using lists in lists [[]]

def tokenize_and_align_labels(examples):
    """Tokenize a batch of pre-split sentences and attach sub-token level labels.

    Args:
        examples: a batch with a "tokens" entry (one word list per sentence)
            and a "ner_tags" entry (one label-id list per sentence).

    Returns:
        The tokenizer output for the batch with an extra "labels" entry that
        holds, per sentence, the labels aligned to the sub-tokens.
    """
    tokenized_inputs = tokenizer(
        examples["tokens"],
        truncation=True,
        is_split_into_words=True,
    )

    # word_ids(idx) maps every sub-token of sentence idx back to its word index
    tokenized_inputs["labels"] = [
        align_labels_with_tokens(sentence_tags, tokenized_inputs.word_ids(idx))
        for idx, sentence_tags in enumerate(examples["ner_tags"])
    ]
    return tokenized_inputs

In [50]:
# batched mapping function -> combines tokenize_and_align_labels() and align_labels_with_tokens()
def batched_mapping(input_dataset):
    """Apply tokenize_and_align_labels to ``input_dataset`` in batches.

    No columns are removed, so the original columns stay next to the new ones.
    Returns whatever ``input_dataset.map`` returns.
    """
    return input_dataset.map(tokenize_and_align_labels, batched=True)

In [51]:
# batched mapping on train, test and validation
# The result keeps the original columns and adds the tokenizer outputs plus "labels".
tokenized_datasets = batched_mapping(raw_datasets)

Loading cached processed dataset at C:/Users/dekai/.cache/huggingface/datasets/dxiao___json/dxiao--requirements-ner-a9d27206730c3bd0/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab\cache-5992c15d856b607f.arrow
Loading cached processed dataset at C:/Users/dekai/.cache/huggingface/datasets/dxiao___json/dxiao--requirements-ner-a9d27206730c3bd0/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab\cache-e3b73f63a5239f7b.arrow


  0%|          | 0/1 [00:00<?, ?ba/s]

## Dataset slicing

In [52]:
# import for inference
from transformers import AutoModelForTokenClassification
from transformers import pipeline
import pandas as pd
import spacy

In [53]:
# Detokenize dataset
import re
from nltk.tokenize.treebank import TreebankWordDetokenizer as Detok

def detokenize(token_list):
    """Join a list of word tokens back into a single text string.

    Args:
        token_list: the word tokens of one sentence.

    Returns:
        The detokenized text, with every comma normalised to ", " and all
        whitespace around periods removed.
    """
    detokenizer = Detok()
    text = detokenizer.detokenize(token_list)
    # Raw strings: "\s" and "\." are not valid Python string escapes, so the
    # non-raw form triggers an invalid-escape warning on current interpreters.
    text = re.sub(r'\s*,\s*', ', ', text)
    # NOTE(review): this also removes the space after a period inside the text
    # ("a. b" -> "a.b") -- confirm that inputs are always single sentences.
    text = re.sub(r'\s*\.\s*', '.', text)
    return text

In [54]:
# calculate average of all certainty scores in ner_results
# might be calculated faster -> Vectorize with np
def calculate_ACS(text, ner_pipeline=None):
    """Return the average certainty score (ACS) of all entities found in ``text``.

    Args:
        text: the sentence to run inference on.
        ner_pipeline: optional callable that takes the text and returns an
            iterable of dicts with a 'score' entry. When omitted, the
            module-level ``pipe`` is used; it must have been created before
            this function is called (the training loop does this).

    Returns:
        The mean of the 'score' values, or 0 when the pipeline returns no
        entities (such sentences therefore rank as least certain).
    """
    if ner_pipeline is None:
        # Fall back to the notebook-global pipeline for existing callers
        ner_pipeline = pipe
    certainty_scores = [entity['score'] for entity in ner_pipeline(text)]
    if not certainty_scores: # nothing detected
        return 0
    return sum(certainty_scores) / len(certainty_scores)

In [55]:
# dataset input has to be 'train' set

def top_candidates(dataset, STEP_SIZE):
    """Split the rows of ``dataset`` into the least certain ones and the rest.

    Every row is detokenized, scored with calculate_ACS and ranked from lowest
    to highest score.

    Args:
        dataset: dataset with a 'tokens' column; must provide ``to_pandas()``.
        STEP_SIZE: number of rows to pick as candidates.

    Returns:
        (candidate_list, non_candidate_list): DataFrame index labels of the
        STEP_SIZE lowest-scoring rows and of all remaining rows. Together they
        cover every row exactly once. The caller passes them to
        ``dataset.select``, i.e. uses them as row positions.
    """
    df = dataset.to_pandas()
    # detokenize tokens into new text column
    df['text'] = df['tokens'].apply(detokenize)
    # inference calculation of ACS and new ACS column
    df['ACS'] = df['text'].apply(calculate_ACS) # takes ~30s for 600 rows
    # rank from lowest ACS to highest
    df = df.sort_values(by='ACS')
    candidate_list = list(df.index[:STEP_SIZE])
    # Start exactly at STEP_SIZE: the previous STEP_SIZE+1 dropped one row from
    # the pool on every call without ever adding it to the training set.
    non_candidate_list = list(df.index[STEP_SIZE:])
    return candidate_list, non_candidate_list

In [56]:
# function to perform inference_aggregation and batched mapping
# input dataset has to be input_dataset['train']
# i.e.: input_train_dataset = seed dataset (60 sentences)
# i.e.: input_inference_dataset = remaining dataset (540 sentences)

from datasets import concatenate_datasets

def inference_aggregation(input_train_dataset, input_inference_dataset, STEP_SIZE):
    """Move the top candidates from the inference pool into the training set.

    Args:
        input_train_dataset: current training dataset.
        input_inference_dataset: pool of rows not yet used for training.
        STEP_SIZE: number of rows to move.

    Returns:
        (output_train_dataset, output_inference_dataset): the training dataset
        extended by the selected rows, and the pool reduced to the rows that
        top_candidates reported as non-candidates.
    """
    picked_rows, remaining_rows = top_candidates(input_inference_dataset, STEP_SIZE)

    # rows chosen by top_candidates are appended to the training data
    grown_train = concatenate_datasets(
        [input_train_dataset, input_inference_dataset.select(picked_rows)]
    )
    # everything else becomes the new inference pool
    shrunk_pool = input_inference_dataset.select(remaining_rows)

    return grown_train, shrunk_pool

## Finetuning

In [57]:
# import DataCollator -> adds padding
from transformers import DataCollatorForTokenClassification

# Collator handed to the Trainer below; built from the same tokenizer that produced the inputs.
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [58]:
import evaluate

# seqeval metric object; compute_metrics calls metric.compute on label-name sequences.
metric = evaluate.load("seqeval")

In [59]:
# write a function to strip special tokens and convert to strings -> true_labels + true_predictions
# compare true_labels with true_predictions using seqeval

import numpy as np


def compute_metrics(eval_preds):
    """Compute overall seqeval scores for one evaluation pass.

    Args:
        eval_preds: a (logits, labels) pair; the predicted class is the argmax
            of the logits over the last axis.

    Returns:
        Dict with "precision", "recall", "f1" and "accuracy", taken from the
        overall_* entries of the seqeval result.
    """
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)

    # Reference tags: drop positions labelled -100 and convert ids to label names
    true_labels = []
    for gold_row in labels:
        true_labels.append([label_names[gold] for gold in gold_row if gold != -100])

    # Predicted tags: keep only the positions whose reference label is not -100
    true_predictions = []
    for pred_row, gold_row in zip(predictions, labels):
        kept = [label_names[pred] for pred, gold in zip(pred_row, gold_row) if gold != -100]
        true_predictions.append(kept)

    all_metrics = metric.compute(predictions=true_predictions, references=true_labels)
    reported = ("precision", "recall", "f1", "accuracy")
    return {name: all_metrics["overall_" + name] for name in reported}

## Define the model

In [60]:
# dictionaries for mapping between label ids and label names in the model config
# ids are kept as ints (not str) so that label2id maps each name to an integer id
id2label = dict(enumerate(label_names))
label2id = {label: idx for idx, label in id2label.items()}

In [61]:
from transformers import AutoModelForTokenClassification

# Token-classification model built from the base checkpoint, with the label
# mappings stored in its config.
# NOTE(review): this single instance is the one passed to the Trainer in every
# active-learning step below, so each step continues from the weights of the
# previous step -- confirm that this is intended for a "from scratch" run.
model = AutoModelForTokenClassification.from_pretrained(
    model_config,
    id2label=id2label,
    label2id=label2id,
)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForTokenClassification: ['cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cas

In [62]:
# Train and save models over all al_steps locally

from transformers import TrainingArguments
from transformers import Trainer

# Active-learning schedule: every step adds STEP_SIZE sentences to the training
# set. STEP_PERCENT is only used to name the checkpoint saved after each step
# ("2percent", "4percent", ...) -- presumably STEP_SIZE rows are STEP_PERCENT %
# of the train split; TODO confirm when changing either value.
model_path_list = []
STEP_SIZE = 12 # can be changed
STEP_PERCENT = 2
ITERATIONS = 25

for al_step in range(ITERATIONS): # al_step 0 -> "2percent", 1 -> "4percent", ...

    if al_step == 0: # seed init in first iteration
        train_split = tokenized_datasets['train']
        al_train_dataset = train_split.select(range(STEP_SIZE)) # seed train dataset
        # Pool starts exactly at STEP_SIZE: the previous STEP_SIZE+1 left row
        # STEP_SIZE out of both the seed set and the inference pool.
        al_inference_dataset = train_split.select(range(STEP_SIZE, train_split.num_rows)) # seed inference dataset
    else:
        # Inference runs on the model saved at the end of the previous step
        # (hence al_step, not al_step + 1).
        inference_config = "./bert-finetuned-ner-" + str(al_step * STEP_PERCENT) + "percent"
        inference_model = AutoModelForTokenClassification.from_pretrained(inference_config)
        # `pipe` is a notebook-global that calculate_ACS reads while ranking candidates
        pipe = pipeline("ner", model=inference_model, tokenizer=tokenizer, aggregation_strategy="simple")
        al_train_dataset, al_inference_dataset = inference_aggregation(al_train_dataset, al_inference_dataset, STEP_SIZE)

    step_tag = str((al_step + 1) * STEP_PERCENT) + "percent"

    args = TrainingArguments(
        "cache/bert-finetuned-ner-" + step_tag, # Trainer output directory for this step
        evaluation_strategy="epoch",
        save_strategy="no", # the model is saved explicitly after training
        learning_rate=2e-5,
        num_train_epochs=3,
        weight_decay=0.01,
        seed = 2022,
        push_to_hub=False,
    )

    # NOTE(review): `model` is the instance created once above; it is not
    # re-initialised per step, so training continues from the previous step's
    # weights -- confirm that this is intended.
    trainer = Trainer(
        model=model,
        args=args,
        train_dataset=al_train_dataset,
        eval_dataset=tokenized_datasets["validation"],
        data_collator=data_collator,
        compute_metrics=compute_metrics,
        tokenizer=tokenizer,
    )

    trainer.train()

    # Save under the name that the next step (and the evaluation cell) loads from
    model_path = "./bert-finetuned-ner-" + step_tag
    model_path_list.append(model_path)
    trainer.save_model(model_path)

The following columns in the training set don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: id, tokens, ner_tags, tags. If id, tokens, ner_tags, tags are not expected by `BertForTokenClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 60
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 24


  0%|          | 0/24 [00:00<?, ?it/s]

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


KeyboardInterrupt: 

## Evaluation of local model

In [1]:
import pandas as pd
from datasets import load_dataset
from evaluate import evaluator
from transformers import pipeline

In [None]:
# evaluate model directly from local path

# dataset_test = load_dataset("conll2003", split="test").select(range(1000)).shuffle()
# NOTE(review): training loads "dxiao/requirements-ner-id" while this cell loads
# "dxiao/requirements-ner" -- confirm that the test split comes from the intended dataset.
dataset_test = load_dataset("dxiao/requirements-ner", split="test")
task_evaluator = evaluator("token-classification")

# One seqeval result dict per saved checkpoint, in training order.
# `model_path_list` is filled by the training loop, so that cell has to run in
# the same kernel session before this one.
# A comprehension is used so that no loop variable is left behind: the previous
# `for model in ...` overwrote the module-level `model` object with a path string.
results = [
    task_evaluator.compute(
        model_or_pipeline=checkpoint_path, data=dataset_test, metric="seqeval"
    )
    for checkpoint_path in model_path_list
]

# One row per checkpoint; only the headline scores and timing columns are displayed.
df = pd.DataFrame(results, index=model_path_list)
df[["overall_f1", "overall_accuracy", "total_time_in_seconds", "samples_per_second", "latency_in_seconds"]]