### Entropy query strategy

In [None]:
# ToDos:

# 1. Display dataframe
# 2. Implement entropy calculation -> logit and prediction (logit through softmax layer) on token level
# 2.1. Get logit score of tokens
# 2.2. Turn logit into predictions with softmax
# 2.3. perform entropy math on predictions

# In general: run the query strategies at token level rather than word level -> consider all tokens, even those tagged 0
# rewrite LC into token level strategy later

In [1]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline
from datasets import load_dataset
from datasets import Dataset
import pandas as pd
import numpy as np
import spacy
import torch
from numba import jit

In [None]:
# TYPE IN EXAMPLE HERE
# Raw requirement sentence kept as a scratch input. A later cell rebinds
# `example` to a pre-split list of words, which is the form the prototype
# pipeline passes to the tokenizer (is_split_into_words=True).
example = "The Payload shall resist an acceleration of at least 60Gs of shocks"

In [11]:
# Identifiers passed to `from_pretrained` / `load_dataset` below.
input_model = "dxiao/bert-finetuned-ner-100percent"
input_dataset = 'dxiao/requirements-ner-id'

# NOTE(review): the tokenizer is loaded from "bert-base-cased", not from
# `input_model`; presumably the fine-tuned checkpoint uses the same
# vocabulary -- confirm before swapping in a different model.
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
model = AutoModelForTokenClassification.from_pretrained(input_model)
xiao_data = load_dataset(input_dataset)

In [4]:
# Word-tokenized example sentences. Only the second assignment takes effect,
# because it rebinds `example` immediately after the first one.
example = ['The', 'Payload', 'shall', 'resist', 'an', 'acceleration', 'of', 'at' ,'least', '60Gs' ,'of', 'shocks']
example = ['The', 'probability', 'of', 'undetected', 'frame', 'error', 'for', 'the', 'COM', 'TCC', 'Terminal', 'uplink', 'shall', 'be', '<','19', '-', 'Oct.']

In [5]:
# Rough prototype pipeline

# Tokenize the pre-split word list and return PyTorch tensors.
encoding = tokenizer(example, return_tensors="pt", truncation=True, is_split_into_words=True) #same params as in evalrun
# Forward pass through the token-classification model; keep the raw scores.
outputs = model(**encoding)
logits = outputs.logits
# Softmax over the last axis (presumably the label axis), then select the
# first entry of the leading axis, i.e. the single sentence that was passed in.
predictions = torch.nn.functional.softmax(logits, dim=-1)[0]

# -> use of datasets directly with list of words. No need for detokenization anymore!
# -> Output of entire entropy strategy should be the id_list of the top candidates 

# TODO: entropy math on predictions -> entropy score for each sentence
# TODO: make fast using numpy
# TODO: scale up with datasets -> leverage batched mapping. to entropy calc of dataset. Turn dataset into df and extract candidate indices
# TODO: instead of combining words into sentence and then tokenizing one by one, try batched_mapping from evalrun

In [70]:
# Generalizable inference
# Input: dataset + inference_model -> Utilizes inference function
# Output: df (incl. predictions)

def general_inference(dataset, inference_model):
    """Run `inference` on every row of `dataset` and return the results as a DataFrame.

    Args:
        dataset: Object exposing `to_pandas()`; the resulting frame must have
            a 'tokens' column whose entries provide `.tolist()`.
        inference_model: Model forwarded unchanged to `inference`.

    Returns:
        The DataFrame produced by `dataset.to_pandas()` with an extra
        'predictions' column holding the per-row output of `inference`.
    """
    def _predict(words):
        # `inference` expects a plain list of words, so convert each entry first.
        return inference(words.tolist(), inference_model)

    frame = dataset.to_pandas()
    frame['predictions'] = frame['tokens'].apply(_predict)
    return frame


In [72]:
# perform inference on sentences
# Input: word-tokenized sentence as list + model
# Output: predictions [#tokens, #labels] as np.array (one row per tokenizer output position, not per input word)

def inference(sentence, inference_model):
    """Return the softmax label probabilities for one word-tokenized sentence.

    Args:
        sentence: List of words; it is passed to the module-level `tokenizer`
            with `is_split_into_words=True`.
        inference_model: Model called with the tokenizer output; its result
            must expose a `.logits` tensor.

    Returns:
        NumPy array with the probabilities of the first (only) batch entry.
        Each row is normalised over the last axis. Rows follow the tokenizer
        output positions, so the row count is generally not the number of
        input words (presumably sub-word pieces and special tokens are
        included -- confirm against the tokenizer in use).
    """
    encoding = tokenizer(sentence, return_tensors="pt", truncation=True, is_split_into_words=True) #same params as in evalrun

    # Pure inference: disable autograd so no computation graph is built and
    # kept alive for every sentence.
    with torch.no_grad():
        logits = inference_model(**encoding).logits
        probabilities = torch.nn.functional.softmax(logits, dim=-1)[0]

    # `.cpu()` is a no-op for CPU tensors and makes the NumPy conversion valid
    # for tensors living on another device.
    return probabilities.detach().cpu().numpy()

In [75]:
# Entropy strategy obtain candidates
# Input: df (incl. predictions) + step_size -> Utilizes max_entropy_calculation function
# Output: candidate_list, non_candidate_list

def EN_candidates(df, step_size):
    """Split the rows of `df` into the highest-entropy candidates and the rest.

    Adds an 'entropy' column to `df` in place (the result of
    `max_entropy_calculation` on each entry of 'predictions'), ranks the rows
    by that score in descending order, and partitions the ranked index.

    Args:
        df: DataFrame with a 'predictions' column.
        step_size: Number of top-ranked rows to return as candidates.

    Returns:
        Tuple `(candidate_list, non_candidate_list)` of index labels. Together
        the two lists contain every row of `df` exactly once.
    """
    df['entropy'] = df['predictions'].apply(max_entropy_calculation)
    ranked_index = df.sort_values(by='entropy', ascending=False).index

    candidate_list = list(ranked_index[:step_size])
    # Start exactly where the candidates end; slicing from step_size + 1 would
    # drop the row at rank `step_size` from both lists.
    non_candidate_list = list(ranked_index[step_size:])
    return candidate_list, non_candidate_list


In [55]:
# calculate max entropy score from predictions of one sentence
# Input: predictions in shape [#words, #labels] as np.array
# Output: max entropy value 

def max_entropy_calculation(predictions):
    """Return the largest per-row Shannon entropy of a probability matrix.

    Each row of `predictions` is treated as one probability distribution over
    labels; its entropy is `-sum(p * log(p))` using the natural logarithm.
    The computation is vectorised with NumPy, so no JIT compilation is needed.

    Args:
        predictions: 2-D array-like of probabilities, one distribution per row.

    Returns:
        The maximum row entropy as a Python float.

    Raises:
        ValueError: If `predictions` contains no values.
    """
    probs = np.asarray(predictions, dtype=np.float64)
    if probs.size == 0:
        raise ValueError("predictions must contain at least one probability row")

    # A probability of exactly 0 (e.g. softmax underflow) would give
    # 0 * log(0) = NaN. Substituting 1.0 inside the log makes that term
    # 0 * log(1) = 0, which matches the convention 0 * log(0) = 0.
    safe_probs = np.where(probs > 0, probs, 1.0)
    row_entropy = -np.sum(probs * np.log(safe_probs), axis=-1)

    return float(np.max(row_entropy))
