### Consolidated Active Learning

In [None]:
# Consolidated script for running entire active learning finetuning pipeline
# This script is meant to be run on a single GPU, evaluated on Google Colab
# Datasets are requested from https://huggingface.co/datasets/dxiao/requirements-ner-id
# Final finetuned model can be accessed from https://huggingface.co/dxiao/bert-finetuned-ner-100percent

In [None]:
# Cloud install for Colab
!pip install datasets
!pip install transformers
!pip install evaluate
!pip install seqeval
!pip install numba

In [1]:
# In case model upload is preferred: log in to huggingface_hub
# Token: *insert own huggingface token*
# from huggingface_hub import notebook_login
# notebook_login()

In [1]:
# Standard library
import shutil

# General packages
import numpy as np
import pandas as pd

# HuggingFace packages
from datasets import load_dataset
from datasets import concatenate_datasets
from transformers import AutoTokenizer
from transformers import Trainer
from transformers import TrainingArguments
from transformers import DataCollatorForTokenClassification
from transformers import AutoModelForTokenClassification
import evaluate
from evaluate import evaluator
from tqdm.notebook import trange

import torch
from numba import jit

In [2]:
# Experiment configuration -- edit here, then re-run the notebook from the top
DATASET_CONFIG = "dxiao/requirements-ner-id"  # dataset from huggingface
MODEL_CONFIG = "bert-base-cased"              # pre-trained model for further finetuning
SEED = 0                                      # seed used when shuffling the datasets
TOTAL_SIZE = 600                              # total size of training dataset
STEP_PERCENT = 2                              # size of one AL step in % of the training dataset

# Selected query strategy, one of:
#   "RS" = random sampling
#   "LC" = least confidence
#   "MA" = margin sampling
#   "EN" = entropy sampling
AL_STRATEGY = 'MA'

# Values derived from the configuration above
iterations = int(100 / STEP_PERCENT)      # number of AL rounds
step_size = int(TOTAL_SIZE / iterations)  # sentences added per AL round

In [3]:
# Load the tokenizer of the base checkpoint and the dataset from huggingface
tokenizer = AutoTokenizer.from_pretrained(MODEL_CONFIG)
raw_datasets = load_dataset(DATASET_CONFIG)

Using custom data configuration dxiao--requirements-ner-a9d27206730c3bd0
Found cached dataset json (C:/Users/dekai/.cache/huggingface/datasets/dxiao___json/dxiao--requirements-ner-a9d27206730c3bd0/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab)


  0%|          | 0/3 [00:00<?, ?it/s]

In [4]:
# Read the label names from the feature schema of the train split
train_features = raw_datasets["train"].features
ner_feature = train_features["ner_tags"]
label_names = ner_feature.feature.names

### Preprocessing

In [5]:
# Alignment function: expands word-level labels to token-level labels
# - special tokens get -100
# - continuation tokens of a split word get the I- label instead of the B- label

def align_labels_with_tokens(labels, word_ids):
    """Expand the word-level ``labels`` of one sentence to token level.

    Args:
        labels: label ids, one per word.
        word_ids: for every token the index of the word it belongs to,
            or None for a special token.

    Returns:
        A list with one label id per entry of ``word_ids``. Special tokens
        get -100. The first token of a word gets the word's label. Every
        further token of the same word gets the word's label, with odd ids
        bumped by one (assumes each B-XXX id is odd and directly followed by
        its I-XXX id -- confirm against the dataset's label list).
    """
    aligned = []
    previous_word = None
    for word_idx in word_ids:
        if word_idx is None:
            # Special token: it also ends the current word
            previous_word = None
            aligned.append(-100)
            continue

        tag = labels[word_idx]
        is_continuation = word_idx == previous_word
        if is_continuation and tag % 2 == 1:
            # B-XXX on a follow-up token of the same word becomes I-XXX
            tag += 1
        previous_word = word_idx
        aligned.append(tag)

    return aligned

In [6]:
# Batch version of align_labels_with_tokens: works on lists of sentences [[...], [...]]

def tokenize_and_align_labels(examples):
    """Tokenize a batch of pre-split sentences and attach token-level labels.

    Args:
        examples: batch with "tokens" (list of word lists) and "ner_tags"
            (list of word-level label lists).

    Returns:
        The tokenizer output for the batch with an additional "labels" entry
        holding one aligned label list per sentence.
    """
    encoded = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    # word_ids(idx) maps the tokens of sentence idx back to its words
    encoded["labels"] = [
        align_labels_with_tokens(sentence_tags, encoded.word_ids(idx))
        for idx, sentence_tags in enumerate(examples["ner_tags"])
    ]
    return encoded

In [7]:
# batched mapping function -> applies tokenize_and_align_labels() batch-wise

def batched_mapping(input_dataset):
    """Return ``input_dataset`` with tokenizer outputs and aligned labels added.

    The original columns are kept (no ``remove_columns``); general_inference()
    reads the 'tokens' column later on.
    """
    return input_dataset.map(tokenize_and_align_labels, batched=True)

In [8]:
# Tokenize and align train, test and validation, then shuffle with the fixed seed

tokenized_datasets = batched_mapping(raw_datasets).shuffle(seed=SEED)

Loading cached processed dataset at C:/Users/dekai/.cache/huggingface/datasets/dxiao___json/dxiao--requirements-ner-a9d27206730c3bd0/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab\cache-5992c15d856b607f.arrow
Loading cached processed dataset at C:/Users/dekai/.cache/huggingface/datasets/dxiao___json/dxiao--requirements-ner-a9d27206730c3bd0/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab\cache-e3b73f63a5239f7b.arrow
Loading cached processed dataset at C:/Users/dekai/.cache/huggingface/datasets/dxiao___json/dxiao--requirements-ner-a9d27206730c3bd0/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab\cache-d01a31e78a4352eb.arrow
Loading cached shuffled indices for dataset at C:/Users/dekai/.cache/huggingface/datasets/dxiao___json/dxiao--requirements-ner-a9d27206730c3bd0/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab\cache-5e0c786d550a9e87.arrow
Loading cached shuffled indices for dataset at C:/Users/d

### Query strategies

#### Random Sampling (RS)

In [9]:
# Growing training subsets for random sampling:
# sliced_datasets[i] holds the first (i + 1) * step_size sentences of the shuffled train split

sliced_datasets = {
    round_idx: tokenized_datasets['train'].select(range(0, (round_idx + 1) * step_size))
    for round_idx in range(iterations)
}


#### Inference Strategies

In [10]:
# Generalizable inference
# Input: dataset + inference_model -> Utilizes inference function
# Output: df (incl. predictions)

def general_inference(dataset, inference_model):
    """Convert ``dataset`` to a DataFrame and add a 'predictions' column.

    Every entry of the 'tokens' column is turned into a plain list and passed
    to inference() together with ``inference_model``.
    """
    def _predict(tokens):
        return inference(tokens.tolist(), inference_model)

    frame = dataset.to_pandas()
    frame['predictions'] = frame['tokens'].apply(_predict)
    return frame

In [11]:
# perform inference on one sentence
# Input: word-tokenized sentence as list + model
# Output: class probabilities [#tokens, #labels] as np.array

def inference(sentence, inference_model):
    """Return the softmax class probabilities of ``inference_model`` for one sentence.

    Args:
        sentence: list of words (already split).
        inference_model: token-classification model whose output has ``.logits``.

    Returns:
        np.array with one row per token emitted by the tokenizer and one
        column per label. Rows are tokens, not words: split words produce
        several rows, and presumably the tokenizer's special tokens are
        included as well -- confirm before treating rows as words.
    """
    encoding = tokenizer(sentence, return_tensors="pt", truncation=True, is_split_into_words=True) #same params as in evalrun
    # Pure inference: no autograd graph is needed, which saves memory and time per sentence
    with torch.no_grad():
        logits = inference_model(**encoding).logits
    probabilities = torch.nn.functional.softmax(logits, dim=-1)[0]
    # .cpu() is a no-op on CPU tensors and keeps .numpy() working if the logits live on another device
    return probabilities.detach().cpu().numpy()

##### Least Confidence (LC)

In [12]:
# Least confidence strategy to obtain candidates
# Input: df (incl. predictions) + step_size -> Utilizes least_confidence_calculation function
# Output: candidate_list, non_candidate_list

def LC_candidates(df, step_size):
    """Split the sentences in ``df`` into the least confident ones and the rest.

    Args:
        df: DataFrame with a 'predictions' column (one probability array per
            sentence). An 'LC' score column is added to it.
        step_size: number of sentences to select.

    Returns:
        (candidate_list, non_candidate_list): index labels of the
        ``step_size`` rows with the lowest 'LC' score, and index labels of
        all remaining rows. Together they cover every row exactly once.
    """
    df['LC'] = df['predictions'].apply(lambda x: least_confidence_calculation(x))
    df = df.sort_values(by='LC')  # ascending: least confident sentences first
    candidate_list = list(df.index[:step_size])
    # Start at step_size (not step_size + 1); otherwise the first non-selected
    # sentence would silently disappear from the pool on every call.
    non_candidate_list = list(df.index[step_size:])
    return candidate_list, non_candidate_list

In [13]:
# calculate least confidence score from predictions of one sentence
# Input: predictions in shape [#words, #labels] as np.array
# Output: min confidence value (lowest top-1 probability within the sentence)

@jit(nopython=True)

def least_confidence_calculation(predictions):
    """Return the smallest top-1 probability found in one sentence.

    ``predictions`` is iterated row by row; for every row the highest
    probability is taken, and the minimum of these values is returned.
    """
    # highest prediction of every row
    top_probabilities = [np.partition(row_probs, -1)[-1] for row_probs in predictions]
    # the least confident row is the weakest link of the sentence
    return min(top_probabilities)

##### Margin Sampling (MA)

In [14]:
# Margin strategy to obtain candidates
# Input: df (incl. predictions) + step_size -> Utilizes margin_calculation function
# Output: candidate_list, non_candidate_list

def MA_candidates(df, step_size):
    """Split the sentences in ``df`` into the ones with the smallest margin and the rest.

    Args:
        df: DataFrame with a 'predictions' column (one probability array per
            sentence). A 'margin' score column is added to it.
        step_size: number of sentences to select.

    Returns:
        (candidate_list, non_candidate_list): index labels of the
        ``step_size`` rows with the lowest 'margin' score, and index labels
        of all remaining rows. Together they cover every row exactly once.
    """
    df['margin'] = df['predictions'].apply(lambda x: margin_calculation(x))
    df = df.sort_values(by='margin')  # ascending: smallest margin first
    candidate_list = list(df.index[:step_size])
    # Start at step_size (not step_size + 1); otherwise the first non-selected
    # sentence would silently disappear from the pool on every call.
    non_candidate_list = list(df.index[step_size:])
    return candidate_list, non_candidate_list

In [15]:
# calculate min margin score from predictions of one sentence
# Input: predictions in shape [#words, #labels] as np.array
# Output: min margin value 

@jit(nopython=True)

def margin_calculation(predictions):
    """Return the smallest top-1 / top-2 probability gap found in one sentence.

    ``predictions`` is iterated row by row; for every row the difference
    between the highest and the second-highest probability is taken, and the
    minimum of these differences is returned.
    """
    margins = []
    for row_probs in predictions:
        # partitioning at -2 puts the second-highest value at [-2] and the highest at [-1]
        ordered = np.partition(row_probs, -2)
        margins.append(ordered[-1] - ordered[-2])
    # the row with the smallest margin is the weakest link of the sentence
    return min(margins)

##### Entropy Sampling (EN)

In [16]:
# Entropy strategy to obtain candidates
# Input: df (incl. predictions) + step_size -> Utilizes max_entropy_calculation function
# Output: candidate_list, non_candidate_list

def EN_candidates(df, step_size):
    """Split the sentences in ``df`` into the ones with the highest entropy and the rest.

    Args:
        df: DataFrame with a 'predictions' column (one probability array per
            sentence). An 'entropy' score column is added to it.
        step_size: number of sentences to select.

    Returns:
        (candidate_list, non_candidate_list): index labels of the
        ``step_size`` rows with the highest 'entropy' score, and index labels
        of all remaining rows. Together they cover every row exactly once.
    """
    df['entropy'] = df['predictions'].apply(lambda x: max_entropy_calculation(x))
    df = df.sort_values(by='entropy',ascending=False)  # descending: highest entropy first
    candidate_list = list(df.index[:step_size])
    # Start at step_size (not step_size + 1); otherwise the first non-selected
    # sentence would silently disappear from the pool on every call.
    non_candidate_list = list(df.index[step_size:])
    return candidate_list, non_candidate_list

In [17]:
# calculate max entropy score from predictions of one sentence
# Input: predictions in shape [#words, #labels] as np.array
# Output: max entropy value 

@jit(nopython=True)
def max_entropy_calculation(predictions):
    """Return the highest per-row entropy found in one sentence.

    For every row of ``predictions`` the entropy -sum(p * log(p)) over its
    label probabilities is computed; the maximum over all rows is returned.
    Probabilities that are exactly 0 contribute nothing to the sum.
    """
    entropy_list = []

    for word in predictions: # row basis
        entropy = 0.0
        for label_prob in word: # label basis
            # p * log(p) tends to 0 for p -> 0; evaluating it literally at
            # p == 0 gives 0 * -inf = NaN, which would poison max() below
            if label_prob > 0:
                entropy -= label_prob * np.log(label_prob)
        entropy_list.append(entropy)

    return max(entropy_list)

##### Dataset shift

In [18]:
# function to shift dataset after inference and batched mapping
# input datasets have to be single splits (e.g. taken from tokenized_datasets['train'])
# i.e.: input_train_dataset = sentences already used for training (seed dataset in the first shift)
# i.e.: input_inference_dataset = remaining sentences that can still be queried


def dataset_shift(input_train_dataset, input_inference_dataset, inference_config, step_size, AL_STRATEGY):
    """Move the ``step_size`` best query candidates from the pool into the training set.

    Args:
        input_train_dataset: current training dataset.
        input_inference_dataset: pool of sentences that can still be queried.
        inference_config: path or name of the model used to score the pool.
        step_size: number of sentences to move.
        AL_STRATEGY: 'LC', 'MA' or 'EN'.

    Returns:
        (output_train_dataset, output_inference_dataset): the training
        dataset extended by the selected sentences, and the pool without them.

    Raises:
        ValueError: if ``AL_STRATEGY`` is not one of 'LC', 'MA', 'EN'.
    """
    # Fail before the model is loaded and inference has run over the whole pool
    if AL_STRATEGY not in ('LC', 'MA', 'EN'):
        raise ValueError("Unknown AL strategy: " + repr(AL_STRATEGY) + " (expected 'LC', 'MA' or 'EN')")

    inference_model = AutoModelForTokenClassification.from_pretrained(inference_config)
    df_inference = general_inference(input_inference_dataset, inference_model)
    if (AL_STRATEGY == 'LC'):
        candidate_list, non_candidate_list = LC_candidates(df_inference, step_size)
    elif (AL_STRATEGY == 'MA'):
        candidate_list, non_candidate_list = MA_candidates(df_inference, step_size)
    else:  # 'EN', guaranteed by the check above
        candidate_list, non_candidate_list = EN_candidates(df_inference, step_size)

    added_inference_dataset = input_inference_dataset.select(candidate_list) # select candidates

    output_train_dataset = concatenate_datasets([input_train_dataset, added_inference_dataset]) # add candidates to train_dataset
    output_inference_dataset = input_inference_dataset.select(non_candidate_list) # remaining rows become new inference_dataset

    return output_train_dataset, output_inference_dataset

### Finetuning metrics

In [19]:
# seqeval metric, used by compute_metrics
metric = evaluate.load("seqeval")
# DataCollator -> adds padding
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [20]:
# Strip ignored positions (-100), convert ids to label strings and
# compare true_labels with true_predictions using seqeval

def compute_metrics(eval_preds):
    """Compute overall precision, recall, f1 and accuracy for one evaluation run.

    Args:
        eval_preds: pair ``(logits, labels)``; the predicted id is the argmax
            of the logits over the last axis.

    Returns:
        dict with the keys "precision", "recall", "f1" and "accuracy", taken
        from the overall_* values of the seqeval metric.
    """
    logits, labels = eval_preds
    predicted_ids = np.argmax(logits, axis=-1)

    true_labels = []
    true_predictions = []
    for predicted_row, label_row in zip(predicted_ids, labels):
        kept_labels = []
        kept_predictions = []
        for predicted_id, label_id in zip(predicted_row, label_row):
            if label_id == -100:
                continue  # ignored index (special tokens)
            kept_labels.append(label_names[label_id])
            kept_predictions.append(label_names[predicted_id])
        true_labels.append(kept_labels)
        true_predictions.append(kept_predictions)

    scores = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": scores["overall_precision"],
        "recall": scores["overall_recall"],
        "f1": scores["overall_f1"],
        "accuracy": scores["overall_accuracy"],
    }

### Model training

In [22]:
# dictionaries for mapping between label ids and label names
# Ids are kept as integers on both sides, so label2id maps a name to an int
# (with string ids, label2id would map to "0", "1", ... instead).

id2label = {i: label for i, label in enumerate(label_names)}
label2id = {label: i for i, label in id2label.items()}

In [28]:
# define model: load the pre-trained checkpoint for token classification with the label mappings

model = AutoModelForTokenClassification.from_pretrained(MODEL_CONFIG, id2label=id2label, label2id=label2id)

NameError: name 'id2label' is not defined

In [None]:
# Train and save models over all al_steps locally
#
# NOTE(review): the same `model` object is handed to every Trainer below, so each
# AL round continues from the weights left by the previous round. Re-create the
# model inside the loop if every round is meant to start from MODEL_CONFIG.
# NOTE(review): the Trainer seed (2022) is independent of SEED, which is only
# used for shuffling the datasets.

model_path_list = []
checkpoint_dir = "cache"  # scratch directory for the Trainer's intermediate checkpoints

for al_step in trange(iterations): # loop over all AL iteration steps [0 = first STEP_PERCENT %, 1 = second, etc.]
    percent_labeled = (al_step+1)*STEP_PERCENT

    # select query strategies
    if(AL_STRATEGY == 'RS'):
        al_train_dataset = sliced_datasets[al_step]

    elif(AL_STRATEGY == 'LC' or AL_STRATEGY == 'MA' or AL_STRATEGY == 'EN'):
        if(al_step == 0): # seed init in first iteration
            al_train_dataset = tokenized_datasets['train'].select(range(0,step_size)) # seed train dataset
            # The pool starts directly after the seed: range(step_size, ...) and not
            # range(step_size+1, ...), which would leave sentence `step_size` unused.
            al_inference_dataset = tokenized_datasets['train'].select(range(step_size,tokenized_datasets['train'].num_rows)) # seed inference dataset
        else:
            inference_config = "./bert-finetuned-ner-" + str((al_step)*STEP_PERCENT) + "percent" #not al_step +1, as inference on last trained model
            al_train_dataset, al_inference_dataset = dataset_shift(al_train_dataset, al_inference_dataset, inference_config, step_size, AL_STRATEGY)

    else:
        print("AL strategy not existent")
        break

    args = TrainingArguments(
        checkpoint_dir + "/bert-finetuned-ner-" + str(percent_labeled) + "percent",
        evaluation_strategy="epoch",
        save_strategy="epoch",
        learning_rate=2e-5,
        num_train_epochs=5, #changed to 5
        weight_decay=0.01,
        per_device_train_batch_size = 12, # added
        per_device_eval_batch_size = 12, # added
        load_best_model_at_end = True,
        seed = 2022,
        push_to_hub=False,
        # report_to="wandb",
        # run_name="AL-" + AL_STRATEGY + "-" + str(percent_labeled) + "percent-seed" + str(SEED)
        # hub_model_id="dxiao/bert-finetuned-ner-" + str(percent_labeled) + "percent"
    )

    trainer = Trainer(
        model=model,
        args=args,
        train_dataset=al_train_dataset,
        eval_dataset=tokenized_datasets["validation"],
        data_collator=data_collator,
        compute_metrics=compute_metrics,
        tokenizer=tokenizer
    )

    trainer.train()

    model_path = "./bert-finetuned-ner-" + str(percent_labeled) + "percent"
    model_path_list.append(model_path)
    trainer.save_model(model_path)
    # Drop the intermediate checkpoints of this round. shutil works on every
    # platform, whereas the %rm alias is not available on Windows.
    shutil.rmtree(checkpoint_dir, ignore_errors=True)

### Model evaluation

In [None]:
# Evaluate model directly from local path

# NOTE(review): this dataset name differs from DATASET_CONFIG used for training -- confirm this is intended
dataset_test = load_dataset("dxiao/requirements-ner", split="test")
task_evaluator = evaluator("token-classification")

# The loop variable must not be called `model`: that would overwrite the model
# object used by the training cell and break a re-run of that cell.
results = [
    task_evaluator.compute(
        model_or_pipeline=saved_model_path, data=dataset_test, metric="seqeval"
    )
    for saved_model_path in model_path_list
]

df = pd.DataFrame(results, index=model_path_list)
df[["overall_f1", "overall_accuracy", "total_time_in_seconds", "samples_per_second", "latency_in_seconds"]]

In [None]:
# Preliminary plotting of F1 learning curve

import matplotlib.pyplot as plt

# x: share of labeled data after every AL round, y: F1 of the model saved in that round
steps_relative = list(np.linspace(0,100,iterations+1))
score_f1 = [0] + list(round(df["overall_f1"],3)) # add 0 at the start of non-tuned model

fig, ax = plt.subplots()
ax.plot(steps_relative, score_f1, label=AL_STRATEGY)
ax.axhline(y=max(score_f1), color='orange', label='Maximum')
ax.set_xlim([0, 100])
ax.set_ylim([0, 1])
ax.set_title(f'Preliminary results: {AL_STRATEGY} strategy with {STEP_PERCENT}% steps')
ax.set_xlabel('Percentage of manually labeled data')
ax.set_xticks(list(np.linspace(0, 100, 11)))
ax.set_ylabel('F1 score')
ax.set_yticks(list(np.linspace(0, 1, 11)))
ax.legend(loc='lower right')
ax.grid(color='grey', linewidth=0.2)
plt.show()

In [None]:
# Show the raw F1 values behind the learning curve (first entry is the 0 added for the non-tuned model)
score_f1