### Entropy Evalrun

In [None]:
# Cloud install for Colab
!pip install datasets
!pip install transformers
!pip install evaluate
!pip install seqeval
!pip install numba

In [17]:
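# Log in to the Hugging Face Hub
# NOTE: never store an access token in the notebook. The token that used to be written here
# must be treated as leaked and revoked; read it from the environment or enter it interactively.
# from huggingface_hub import notebook_login
# notebook_login()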

In [18]:
# General packages
import numpy as np
import pandas as pd
import re
from nltk.tokenize.treebank import TreebankWordDetokenizer as Detok

# HuggingFace packages
from datasets import load_dataset
from datasets import concatenate_datasets
from transformers import AutoTokenizer
from transformers import Trainer
from transformers import TrainingArguments
from transformers import DataCollatorForTokenClassification
from transformers import AutoModelForTokenClassification
from transformers import pipeline
import evaluate
from evaluate import evaluator
from tqdm.notebook import trange

import torch
from numba import jit


In [19]:
# parameters
STEP_PERCENT = 10 # can be changed; should divide 100 evenly
TOTAL_SIZE = 600
DATASET_CONFIG = "dxiao/requirements-ner-id"
MODEL_CONFIG = "bert-base-cased"
AL_STRATEGY = 'EN' # LC or RS or EN

# parameter calculations
# Integer division on purpose: both values are later passed to range(), trange(),
# Dataset.select(range(...)) and np.linspace(num=...), all of which reject floats
# (100/STEP_PERCENT would give 10.0 instead of 10).
iterations = 100 // STEP_PERCENT # number of active-learning rounds
step_size = TOTAL_SIZE // iterations # number of samples added per round

In [20]:
# import from huggingface
# Load the dataset named by DATASET_CONFIG (served from the local cache when already downloaded).
raw_datasets = load_dataset(DATASET_CONFIG)
# Tokenizer of the base checkpoint MODEL_CONFIG; reused for preprocessing, inference and the Trainer.
tokenizer = AutoTokenizer.from_pretrained(MODEL_CONFIG)

Using custom data configuration dxiao--requirements-ner-a9d27206730c3bd0
Found cached dataset json (C:/Users/dekai/.cache/huggingface/datasets/dxiao___json/dxiao--requirements-ner-a9d27206730c3bd0/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab)


  0%|          | 0/3 [00:00<?, ?it/s]

In [21]:
# get label names
# The tag names are read from the inner feature of the "ner_tags" column of the train split.
ner_feature = raw_datasets["train"].features["ner_tags"] 
# label_names is indexed by integer tag id further below (compute_metrics, id2label).
label_names = ner_feature.feature.names

### Preprocessing

In [22]:
# alignment function
# special tokens get the label -100
# continuation tokens of a word that was split keep the word's label, with B-XXX turned into I-XXX

def align_labels_with_tokens(labels, word_ids):
    """Expand word-level labels to token level.

    Args:
        labels: one integer label per word.
        word_ids: for every token, the index of the word it belongs to,
            or None for a special token.

    Returns:
        A list with one label per token. Special tokens get -100. The first
        token of a word gets the word's label. Every further token of the same
        word gets that label too, except that an odd label is bumped by one
        (B-XXX -> I-XXX in this label scheme).
    """
    aligned = []
    previous_word = None
    for word_id in word_ids:
        if word_id is None:
            # Special token: ignored label, and the next word counts as new
            aligned.append(-100)
            previous_word = None
            continue

        word_label = labels[word_id]
        if word_id != previous_word:
            # First token of a new word keeps the label unchanged
            previous_word = word_id
        elif word_label % 2 == 1:
            # Continuation token of a word labelled B-XXX becomes I-XXX
            word_label += 1
        aligned.append(word_label)

    return aligned

In [23]:
# scaling align_labels_with_tokens function towards entire datasets by using lists in lists [[]]

def tokenize_and_align_labels(examples):
    """Tokenize a batch of pre-split sentences and attach token-level labels.

    Args:
        examples: batch with a "tokens" entry (list of word lists) and a
            "ner_tags" entry (list of per-word label lists).

    Returns:
        The tokenizer output for the batch with an added "labels" entry that
        holds, per sentence, the result of align_labels_with_tokens.
    """
    encoded = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    # word_ids(row) maps every token of sentence `row` back to its source word
    encoded["labels"] = [
        align_labels_with_tokens(sentence_tags, encoded.word_ids(row))
        for row, sentence_tags in enumerate(examples["ner_tags"])
    ]
    return encoded

In [24]:
# batched mapping function -> combines tokenize_and_align_labels() and align_labels_with_tokens()

def batched_mapping(input_dataset):
    """Run tokenize_and_align_labels over input_dataset in batches.

    The original columns are kept (no remove_columns), so the mapped dataset
    still carries the columns of the input next to the tokenizer output.

    Args:
        input_dataset: object exposing a datasets-style ``map`` method.

    Returns:
        Whatever ``input_dataset.map`` returns for the batched mapping.
    """
    return input_dataset.map(tokenize_and_align_labels, batched=True)

In [25]:
# apply batched mapping on train, test and validation
# raw_datasets is mapped as a whole, so one call covers every split it contains.

tokenized_datasets = batched_mapping(raw_datasets)

Loading cached processed dataset at C:/Users/dekai/.cache/huggingface/datasets/dxiao___json/dxiao--requirements-ner-a9d27206730c3bd0/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab\cache-5992c15d856b607f.arrow
Loading cached processed dataset at C:/Users/dekai/.cache/huggingface/datasets/dxiao___json/dxiao--requirements-ner-a9d27206730c3bd0/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab\cache-e3b73f63a5239f7b.arrow
Loading cached processed dataset at C:/Users/dekai/.cache/huggingface/datasets/dxiao___json/dxiao--requirements-ner-a9d27206730c3bd0/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab\cache-d01a31e78a4352eb.arrow


### Query strategies

#### Random Sampling

In [None]:
# slicing dataset into chunks
# Cumulative prefixes of the train split for the random sampling (RS) strategy:
# sliced_datasets[i] holds the first (i+1)*step_size rows.
# int() is applied because range() rejects floats, which is what iterations and
# step_size are when they were computed with "/".

sliced_datasets = {}

for i in range(int(iterations)):
    slice_end = (i + 1) * int(step_size)
    sliced_datasets[i] = tokenized_datasets['train'].select(range(0, slice_end)) # sliced_datasets[0] = first step


#### Least Confidence

In [9]:
# Detokenize dataset

def detokenize(token_list):
    """Join a list of word tokens back into one text string.

    The tokens are detokenized with the Treebank detokenizer, then spacing
    around punctuation is normalised: every comma is followed by exactly one
    space and has none before it, and all whitespace around a period is removed.

    Args:
        token_list: sequence of word tokens of one sentence.

    Returns:
        The detokenized sentence as a string.
    """
    detokenizer = Detok()
    text = detokenizer.detokenize(token_list)
    # Raw strings: '\s' and '\.' are invalid escape sequences in normal string
    # literals and trigger a warning on current Python versions.
    text = re.sub(r'\s*,\s*', ', ', text)
    text = re.sub(r'\s*\.\s*', '.', text)
    return text

In [10]:
# calculate average of all certainty scores in ner_results
# might be calculated faster -> Vectorize with np
def calculate_ACS(text):
    """Return the average certainty score of the NER results for ``text``.

    Uses the module-level ``pipe`` (created in the training loop for the LC
    strategy) and averages the 'score' value of every result it returns.

    Args:
        text: sentence to run through ``pipe``.

    Returns:
        The mean of all scores, or 0 when ``pipe`` returns no results.
    """
    scores = [entity['score'] for entity in pipe(text)]
    if scores:
        return sum(scores) / len(scores)
    # No results at all: score 0, which ranks the sentence as least certain
    return 0

In [11]:
# dataset input has to be 'train' set

def top_candidates(dataset, step_size):
    """Split the rows of ``dataset`` into least-confident candidates and the rest.

    Every sentence is detokenized, scored with calculate_ACS and the rows are
    ranked from lowest to highest score.

    Args:
        dataset: dataset with a 'tokens' column and a ``to_pandas`` method.
        step_size: number of rows to pick as candidates.

    Returns:
        (candidate_list, non_candidate_list): row indices of the ``step_size``
        lowest-scored rows, and the indices of all remaining rows.
    """
    step_size = int(step_size) # slicing needs an int
    df = dataset.to_pandas()
    # detokenize tokens into new text column
    df['text'] = df['tokens'].apply(detokenize)
    # inference calculation of ACS and new ACS columns
    df['ACS'] = df['text'].apply(calculate_ACS) # takes ~30s for 600 rows
    # rank from lowest ACS to highest
    df = df.sort_values(by='ACS')
    candidate_list = list(df.index[:step_size])
    # Start exactly at step_size: the previous step_size+1 dropped one row per
    # call, which then was neither trained on nor kept in the pool.
    non_candidate_list = list(df.index[step_size:])
    return candidate_list, non_candidate_list

In [12]:
# function to perform inference_aggregation and batched mapping
# input dataset has to be input_dataset['train']
# i.e.: input_train_dataset = seed dataset (60 sentences)
# i.e.: input_inference_dataset = remaining dataset (540 sentences)


def inference_aggregation(input_train_dataset, input_inference_dataset, step_size):
    """Move the least-confident rows from the inference pool into the train set.

    Args:
        input_train_dataset: current training dataset.
        input_inference_dataset: pool of rows not yet used for training.
        step_size: number of rows to move.

    Returns:
        (output_train_dataset, output_inference_dataset): the training dataset
        extended by the rows chosen by top_candidates, and the pool reduced to
        the rows top_candidates reported as non-candidates.
    """
    picked_rows, remaining_rows = top_candidates(input_inference_dataset, step_size)

    # rows chosen by the query strategy are appended to the training data
    newly_selected = input_inference_dataset.select(picked_rows)
    grown_train = concatenate_datasets([input_train_dataset, newly_selected])

    # everything else forms the pool of the next round
    reduced_pool = input_inference_dataset.select(remaining_rows)
    return grown_train, reduced_pool

### Entropy

In [None]:
# Generalizable inference
# Input: dataset + inference_model -> Utilizes inference function
# Output: df (incl. predictions)

def general_inference(dataset, inference_model):
    """Run ``inference`` on every row of ``dataset``.

    Args:
        dataset: dataset with a 'tokens' column and a ``to_pandas`` method.
        inference_model: model handed through to ``inference``.

    Returns:
        The dataset as a pandas DataFrame with an added 'predictions' column
        holding the ``inference`` result of each row.
    """
    def _predict(tokens):
        # the cell value is converted with .tolist() before it reaches the tokenizer
        return inference(tokens.tolist(), inference_model)

    frame = dataset.to_pandas()
    frame['predictions'] = frame['tokens'].apply(_predict)
    return frame

In [None]:
# perform inference on sentences
# Input: word-tokenized sentence as list + model
# Output: predictions [#tokens, #labels] as np.array (one row per token produced by the tokenizer, not per word)

def inference(sentence, inference_model):
    """Return the per-token label probabilities of one sentence.

    Args:
        sentence: the sentence as a list of words.
        inference_model: callable model whose output exposes ``.logits``.

    Returns:
        np.array with one row per token produced by the tokenizer and one
        column per label; each row is the softmax over the token's logits.
    """
    encoding = tokenizer(sentence, return_tensors="pt", truncation=True, is_split_into_words=True) #same params as in evalrun
    # Pure inference: without no_grad() an autograd graph is built for every
    # sentence, which only costs memory and time.
    with torch.no_grad():
        logits = inference_model(**encoding).logits
        predictions = torch.nn.functional.softmax(logits, dim=-1)[0] # [0]: single sentence in the batch
    return predictions.cpu().numpy()

In [None]:
# Entropy strategy obtain candidates
# Input: df (incl. predictions) + step_size -> Utilizes max_entropy_calculation function
# Output: candidate_list, non_candidate_list

def EN_candidates(df, step_size):
    """Split the rows of ``df`` into highest-entropy candidates and the rest.

    Args:
        df: DataFrame with a 'predictions' column. An 'entropy' column is
            added to it in place.
        step_size: number of rows to pick as candidates.

    Returns:
        (candidate_list, non_candidate_list): index values of the ``step_size``
        rows with the highest entropy, and the index values of all other rows.
    """
    step_size = int(step_size) # slicing needs an int
    df['entropy'] = df['predictions'].apply(max_entropy_calculation)
    # most uncertain sentences first
    df = df.sort_values(by='entropy', ascending=False)
    candidate_list = list(df.index[:step_size])
    # Start exactly at step_size: the previous step_size+1 dropped one row per
    # call, which then was neither trained on nor kept in the pool.
    non_candidate_list = list(df.index[step_size:])
    return candidate_list, non_candidate_list

In [None]:
# calculate max entropy score from predictions of one sentence
# Input: predictions in shape [#words, #labels] as np.array
# Output: max entropy value 

def max_entropy_calculation(predictions):
    """Return the largest per-row entropy of a probability matrix.

    For every row the entropy ``-sum(p * log(p))`` is computed; the maximum
    over all rows is returned. The computation is vectorised with numpy, so no
    JIT compilation is needed.

    Args:
        predictions: array-like of probabilities, one row per token and one
            column per label.

    Returns:
        The maximum row entropy as a Python float.

    Raises:
        ValueError: if ``predictions`` contains no rows.
    """
    probs = np.asarray(predictions, dtype=np.float64)
    # 0 * log(0) counts as 0 for entropy. Substituting 1 for zero probabilities
    # makes log() return 0 there instead of -inf, which would turn the product
    # into NaN and poison the ranking.
    safe_probs = np.where(probs > 0, probs, 1.0)
    row_entropy = -np.sum(probs * np.log(safe_probs), axis=-1)
    return float(np.max(row_entropy))

In [None]:
def EN_inference_aggregation(input_train_dataset, input_inference_dataset, inference_model, step_size):
    """Move the highest-entropy rows from the inference pool into the train set.

    Args:
        input_train_dataset: current training dataset.
        input_inference_dataset: pool of rows not yet used for training.
        inference_model: model used to predict the pool rows.
        step_size: number of rows to move.

    Returns:
        (output_train_dataset, output_inference_dataset): the training dataset
        extended by the rows chosen by EN_candidates, and the pool reduced to
        the rows EN_candidates reported as non-candidates.
    """
    # predict the whole pool, then rank it by entropy
    pool_predictions = general_inference(input_inference_dataset, inference_model)
    picked_rows, remaining_rows = EN_candidates(pool_predictions, step_size)

    # rows chosen by the query strategy are appended to the training data
    newly_selected = input_inference_dataset.select(picked_rows)
    grown_train = concatenate_datasets([input_train_dataset, newly_selected])

    # everything else forms the pool of the next round
    reduced_pool = input_inference_dataset.select(remaining_rows)
    return grown_train, reduced_pool

### Finetuning metrics

In [13]:
# DataCollator -> adds padding
# Passed to every Trainer created in the training loop.
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)
# seqeval metric object; compute_metrics calls metric.compute on label-name sequences.
metric = evaluate.load("seqeval")

In [14]:
# write a function to strip special tokens and convert to strings -> true_labels + true_predictions
# compare true_labels with true_predictions using seqeval

def compute_metrics(eval_preds):
    """Compute overall seqeval scores for one evaluation pass.

    Args:
        eval_preds: pair ``(logits, labels)``; the predicted label id is the
            argmax over the last axis of ``logits``.

    Returns:
        dict with the keys "precision", "recall", "f1" and "accuracy", filled
        from the overall_* values reported by the seqeval metric.
    """
    logits, labels = eval_preds
    predicted_ids = np.argmax(logits, axis=-1)

    true_labels = []
    true_predictions = []
    for predicted_row, label_row in zip(predicted_ids, labels):
        sentence_labels = []
        sentence_predictions = []
        for predicted_id, label_id in zip(predicted_row, label_row):
            # -100 marks ignored positions (special tokens); skip them on both sides
            if label_id == -100:
                continue
            sentence_labels.append(label_names[label_id])
            sentence_predictions.append(label_names[predicted_id])
        true_labels.append(sentence_labels)
        true_predictions.append(sentence_predictions)

    all_metrics = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": all_metrics["overall_precision"],
        "recall": all_metrics["overall_recall"],
        "f1": all_metrics["overall_f1"],
        "accuracy": all_metrics["overall_accuracy"],
    }

### Model training

In [15]:
# dictionaries for better mapping
# Ids are kept as ints so that label2id maps label name -> int id
# (with str(i) keys, label2id ended up mapping names to strings like "0").

id2label = {i: label for i, label in enumerate(label_names)}
label2id = {label: i for i, label in id2label.items()}

In [None]:
# define model
# Token-classification model built on MODEL_CONFIG, with the label mappings attached to its config.
# NOTE(review): this single instance is handed to every Trainer in the training loop, so each
# active-learning step appears to continue from the previous step's weights instead of restarting
# from MODEL_CONFIG - confirm this is intended.

model = AutoModelForTokenClassification.from_pretrained(
    MODEL_CONFIG,
    id2label=id2label,
    label2id=label2id,
)

In [None]:
# Train and save models over all al_steps locally
# For every active-learning step: build the training set with the chosen query
# strategy, fine-tune `model` on it, and save the result under
# ./bert-finetuned-ner-<percent>percent so the next step can load it for inference.

model_path_list = []

# trange(), range() and Dataset.select() need ints; cast in case the parameters
# were computed with "/" and are floats.
num_al_steps = int(iterations)
al_step_size = int(step_size)

for al_step in trange(num_al_steps): # loop over all AL iteration steps [0 = 10%, 1=20%, etc.]

    # select query strategies
    if AL_STRATEGY == 'RS':
        al_train_dataset = sliced_datasets[al_step]

    elif AL_STRATEGY in ('LC', 'EN'):
        if al_step == 0: # seed init in first iteration
            train_split = tokenized_datasets['train']
            al_train_dataset = train_split.select(range(0, al_step_size)) # seed train dataset
            # The pool starts directly after the seed. Starting at al_step_size+1
            # left row al_step_size in neither the seed nor the pool.
            al_inference_dataset = train_split.select(range(al_step_size, train_split.num_rows)) # seed inference dataset
        else:
            inference_config = "./bert-finetuned-ner-" + str((al_step)*STEP_PERCENT) + "percent" #not al_step +1, as inference on last trained model
            inference_model = AutoModelForTokenClassification.from_pretrained(inference_config)
            if AL_STRATEGY == 'LC':
                # calculate_ACS reads this module-level pipeline
                pipe = pipeline("ner", model=inference_model, tokenizer=tokenizer, aggregation_strategy="simple")
                al_train_dataset, al_inference_dataset = inference_aggregation(al_train_dataset, al_inference_dataset, al_step_size)
            else:
                al_train_dataset, al_inference_dataset = EN_inference_aggregation(al_train_dataset, al_inference_dataset, inference_model, al_step_size)

    else:
        print("AL strategy not existent")
        break

    args = TrainingArguments(
        "cache/bert-finetuned-ner-" + str((al_step+1)*STEP_PERCENT) + "percent", # al_step = 0 -> 10% data
        evaluation_strategy="epoch",
        save_strategy="no",
        learning_rate=2e-5,
        num_train_epochs=3,
        weight_decay=0.01,
        seed = 2022,
        push_to_hub=False,
        # hub_model_id="dxiao/bert-finetuned-ner-" + str((al_step+1)*10) + "percent"
    )

    trainer = Trainer(
        model=model,
        args=args,
        train_dataset=al_train_dataset,
        eval_dataset=tokenized_datasets["validation"],
        data_collator=data_collator,
        compute_metrics=compute_metrics,
        tokenizer=tokenizer
    )

    trainer.train()

    # saved path doubles as the inference checkpoint of the next step
    model_path = "./bert-finetuned-ner-" + str((al_step+1)*STEP_PERCENT) + "percent"
    model_path_list.append(model_path)
    trainer.save_model(model_path)

### Model evaluation

In [None]:
# Evaluate model directly from local path
# Every saved checkpoint is scored on the test split with seqeval; the results
# are collected in `df`, one row per checkpoint path.

# NOTE(review): this dataset name is hardcoded and differs from DATASET_CONFIG - confirm it is intended.
dataset_test = load_dataset("dxiao/requirements-ner", split="test")
task_evaluator = evaluator("token-classification")

# The loop variable is called checkpoint_path (not `model`) so that the global
# `model` object used by the training loop is not overwritten with a string.
results = [
    task_evaluator.compute(
        model_or_pipeline=checkpoint_path, data=dataset_test, metric="seqeval"
    )
    for checkpoint_path in model_path_list
]

df = pd.DataFrame(results, index=model_path_list)
df[["overall_f1", "overall_accuracy", "total_time_in_seconds", "samples_per_second", "latency_in_seconds"]]

In [None]:
# Preliminary plotting of F1 learning curve
# x axis: share of labeled data after each AL step, y axis: overall F1 of the checkpoint of that step.

import matplotlib.pyplot as plt

score_f1 = [0] + df["overall_f1"].round(3).tolist() # add 0 at the start of non-tuned model
# np.linspace needs an integer number of points; int() covers a float `iterations`
steps_relative = list(np.linspace(0, 100, int(iterations) + 1))

plt.plot(steps_relative, score_f1, label=AL_STRATEGY)
plt.axhline(y=max(score_f1), color='orange', label='Maximum')
plt.xlim([0, 100])
plt.ylim([0, 1])
plt.title(f'Preliminary results: {AL_STRATEGY} strategy with {STEP_PERCENT}% steps')
plt.xlabel('Percentage of manually labeled data')
plt.xticks(list(np.linspace(0, 100, 11)))
plt.ylabel('F1 score')
plt.yticks(list(np.linspace(0, 1, 11)))
plt.legend(loc='lower right')
plt.grid(color='grey', linewidth=0.2)
plt.show()

In [None]:
# Raw F1 values behind the learning curve; the leading 0 is the placeholder for the non-tuned model.
score_f1