### Least confidence

In [1]:
# ToDos:

# 1. Implement model inference based on finetuned transformer (on cloud) [DONE]
# 2. Aggregate inference of entities in one sentence into aggregated certainty score (ACS) [DONE]
# 3. Detokenize dataset from word tokens into sentence to be labeled (will this worsen results?) [DONE]
# 4. Scale ACS inference to hf dataset and rank by ACS score [DONE]
# 5. Scale to selected dataset slices and return as new input dataset [DONE]

In [2]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline
from datasets import load_dataset
from datasets import Dataset
import pandas as pd
import numpy as np
import spacy
import torch
from numba import jit

In [3]:
# Hub identifiers: fine-tuned NER checkpoint and the requirements dataset.
input_model = "dxiao/bert-finetuned-ner-100percent"
input_dataset = "dxiao/requirements-ner-id"

# The tokenizer is loaded from the base checkpoint, the weights from the
# fine-tuned checkpoint named above.
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
model = AutoModelForTokenClassification.from_pretrained(input_model)

# Loaded dataset; it is indexed by split name ('train') further down.
xiao_data = load_dataset(path=input_dataset)

Using custom data configuration dxiao--requirements-ner-a9d27206730c3bd0
Found cached dataset json (C:/Users/dekai/.cache/huggingface/datasets/dxiao___json/dxiao--requirements-ner-a9d27206730c3bd0/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab)


  0%|          | 0/3 [00:00<?, ?it/s]

In [4]:
# Generalizable inference
# Input: dataset + inference_model -> Utilizes inference function
# Output: df (incl. predictions)

def general_inference(dataset, inference_model):
    """Run `inference` on every row of `dataset` and return the result as a DataFrame.

    Args:
        dataset: object exposing `to_pandas()`; the resulting frame must have a
            'tokens' column whose entries support `.tolist()`.
        inference_model: model forwarded unchanged to `inference`.

    Returns:
        The DataFrame produced by `dataset.to_pandas()` with an extra
        'predictions' column holding the per-row output of `inference`.
    """
    def _predict_row(token_array):
        # `inference` expects a plain Python list of words.
        return inference(token_array.tolist(), inference_model)

    frame = dataset.to_pandas()
    frame['predictions'] = frame['tokens'].apply(_predict_row)
    return frame

In [5]:
# perform inference on sentences
# Input: word-tokenized sentence as list + model
# Output: class probabilities [#tokens, #labels] as np.array (one row per tokenizer output position, not per input word)

def inference(sentence, inference_model):
    """Return per-token class probabilities for one pre-tokenized sentence.

    Args:
        sentence: list of words (the sentence already split into words).
        inference_model: token-classification model; called with the tokenizer
            output as keyword arguments and expected to return an object with
            a `.logits` tensor.

    Returns:
        np.ndarray of softmax probabilities for the first (only) batch element,
        one row per position produced by the tokenizer and one column per label.
        NOTE(review): with a BERT tokenizer the rows presumably include special
        tokens and sub-word pieces, so the row count can exceed len(sentence).
    """
    encoding = tokenizer(sentence, return_tensors="pt", truncation=True, is_split_into_words=True) #same params as in evalrun

    # Keep the inputs on the same device as the model when the model exposes one.
    device = getattr(inference_model, "device", None)
    if device is not None:
        encoding = encoding.to(device)

    # Pure inference: do not record an autograd graph for the forward pass.
    with torch.no_grad():
        logits = inference_model(**encoding).logits
        probabilities = torch.nn.functional.softmax(logits, dim=-1)[0]

    # .cpu() is required before .numpy() when the model runs on an accelerator.
    return probabilities.detach().cpu().numpy()

In [9]:
# Least confidence strategy obtain candidates
# Input: df (incl. predictions) + step_size -> Utilizes least_confidence_calculation function
# Output: candidate_list, non_candidate_list

def LC_candidates(df, step_size):
    """Split the rows of `df` into the least-confident candidates and the rest.

    Args:
        df: DataFrame with a 'predictions' column; each entry is passed to
            `least_confidence_calculation`. A column 'LC' holding the resulting
            score is added to `df` in place.
        step_size: number of rows to select as candidates.

    Returns:
        (candidate_list, non_candidate_list): index labels of the `step_size`
        rows with the lowest LC score, and the index labels of all remaining
        rows, both in ascending LC order. Together they cover every row of
        `df` exactly once.
    """
    df['LC'] = df['predictions'].apply(least_confidence_calculation)
    ranked = df.sort_values(by='LC')  # lowest score (least confident) first
    candidate_list = list(ranked.index[:step_size])
    # Start exactly where the candidates end; `step_size + 1` would silently
    # drop one row from both lists.
    non_candidate_list = list(ranked.index[step_size:])
    return candidate_list, non_candidate_list

In [10]:
# calculate least confidence score from predictions of one sentence
# Input: predictions in shape [#words, #labels] as np.array
# Output: least-confidence score = lowest top-class probability across all rows of the sentence

@jit(nopython=True)
def least_confidence_calculation(predictions):
    """Return the least-confidence score of one sentence.

    For every row of `predictions` the highest class probability is taken;
    the smallest of these per-row maxima is returned, i.e. the sentence is
    scored by its weakest link.

    Args:
        predictions: 2-D array, one row of class probabilities per position.

    Returns:
        The minimum over rows of the per-row maximum probability.
    """
    top_probabilities = [np.max(row) for row in predictions]  # best class per row
    return min(top_probabilities)

In [139]:
# function to perform inference_aggregation and batched mapping
# both inputs have to be slices of the 'train' split, e.g. xiao_data['train']
# i.e.: input_train_dataset = seed dataset (60 sentences)
# i.e.: input_inference_dataset = remaining dataset (540 sentences)

from datasets import concatenate_datasets

def inference_aggregation(input_train_dataset, input_inference_dataset, step_size, inference_model=None):
    """Move the `step_size` least-confident rows from the inference pool to the train set.

    Args:
        input_train_dataset: current training dataset (e.g. the seed slice).
        input_inference_dataset: pool of rows to score and select from.
        step_size: number of rows to move into the training dataset.
        inference_model: model used for scoring; defaults to the notebook-level
            `model` when omitted.

    Returns:
        (output_train_dataset, output_inference_dataset): the training dataset
        extended by the selected candidates, and the pool without them.
    """
    if inference_model is None:
        inference_model = model  # fine-tuned model loaded at the top of the notebook

    # Score every pool row, then rank by least confidence. The frame comes
    # straight from `to_pandas()`, so its index labels are used as row
    # positions for `select` below.
    scored = general_inference(input_inference_dataset, inference_model)
    candidate_list, non_candidate_list = LC_candidates(scored, step_size)

    # Plain Python ints for `select`, independent of the index dtype.
    candidate_rows = [int(i) for i in candidate_list]
    remaining_rows = [int(i) for i in non_candidate_list]

    added_inference_dataset = input_inference_dataset.select(candidate_rows) # select candidates

    output_train_dataset = concatenate_datasets([input_train_dataset, added_inference_dataset]) # add candidates to train_dataset
    output_inference_dataset = input_inference_dataset.select(remaining_rows) # remaining rows become new inference_dataset

    return output_train_dataset, output_inference_dataset

In [140]:
# Seed = first 60 training sentences; everything after them is the inference pool.
seed_size = 60
# NOTE(review): step_size is not defined anywhere above in this notebook;
# 60 (one seed-sized batch per round) is a placeholder - confirm the intended value.
step_size = 60

train_split = xiao_data['train']
x, y = inference_aggregation(
    train_split.select(range(seed_size)),
    # Start at seed_size (not seed_size + 1) so row 60 is not silently dropped.
    train_split.select(range(seed_size, train_split.num_rows)),
    step_size,
)