### Entropy query strategy

In [None]:
# ToDos:

# 1. Display dataframe
# 2. Implement entropy calculation -> logit and prediction (logit through softmax layer) on token level
# 2.1. Get logit score of tokens
# 2.2. Turn logit into predictions with softmax
# 2.3. perform entropy math on predictions

# In General: perform query strategies based on token level and not on word level -> consider all tokens. even if they are tagged 0
# rewrite LC into token level strategy later

In [1]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline
from datasets import load_dataset
from datasets import Dataset
import pandas as pd
import spacy

In [2]:
# model config

input_model = "dxiao/bert-finetuned-ner-100percent"
input_dataset = 'dxiao/requirements-ner-id'
step_size = 60

tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
model = AutoModelForTokenClassification.from_pretrained(input_model)
# pipe = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")
xiao_data = load_dataset(input_dataset)

Using custom data configuration dxiao--requirements-ner-a9d27206730c3bd0
Found cached dataset json (C:/Users/dekai/.cache/huggingface/datasets/dxiao___json/dxiao--requirements-ner-a9d27206730c3bd0/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab)


  0%|          | 0/3 [00:00<?, ?it/s]

In [None]:
# Detokenize dataset
import re
from nltk.tokenize.treebank import TreebankWordDetokenizer as Detok

def detokenize(token_list):
    detokenizer = Detok()
    text = detokenizer.detokenize(token_list)
    text = re.sub('\s*,\s*', ', ', text)
    text = re.sub('\s*\.\s*', '.', text)
    return text

In [None]:
# calculate average of all certainty scores in ner_results
# might be calculated faster -> Vectorize with np
def calculate_entropy(text):
    ner_results = pipe(text)
    certainty_score_list = []
    for i in ner_results:
        certainty_score_list.append(i['score'])
    if not certainty_score_list: # if list is empty
        aggregated_certainty_score = 0
    else:
        aggregated_certainty_score = sum(certainty_score_list)/len(certainty_score_list)
    return aggregated_certainty_score

In [None]:
# dataset input has to be 'train' set

def top_candidates(dataset, step_size):
    candidate_list = []
    df = dataset.to_pandas()
    # detokenize tokens into new text column
    df['text'] = df['tokens'].apply(lambda x: detokenize(x))
    # inference calculation of ACS and new ACS columns
    df['ACS'] = df['text'].apply(lambda x: calculate_ACS(x)) # takes ~30s for 600 rows
    # rank from lowest ACS to highest
    df = df.sort_values(by='ACS')
    candidate_list = list(df.index[:step_size])
    non_candidate_list = list(df.index[step_size+1:])
    return candidate_list, non_candidate_list

In [None]:
# function to perform inference_aggregation and batched mapping
# input dataset has to be input_dataset['train']
# i.e.: input_train_dataset = seed dataset (60 sentences)
# i.e.: input_inference_dataset = remaining dataset (540 sentences)

from datasets import concatenate_datasets

def inference_aggregation(input_train_dataset, input_inference_dataset, step_size):
    candidate_list, non_candidate_list = top_candidates(input_inference_dataset, step_size)
    added_inference_dataset = input_inference_dataset.select(candidate_list) # select candidates
    
    output_train_dataset = concatenate_datasets([input_train_dataset, added_inference_dataset]) # add candidates to train_dataset
    output_inference_dataset = input_inference_dataset.select(non_candidate_list) # remaining rows become new inference_dataset 

    return output_train_dataset, output_inference_dataset

In [None]:
x, y = inference_aggregation(
    xiao_data['train'].select(range(60)), xiao_data['train'].select(range(61,xiao_data['train'].num_rows)), step_size)

#### end

In [None]:
# TYPE IN EXAMPLE HERE
example = "The Payload shall resist an acceleration of at least 60Gs of shocks"

In [None]:
from spacy import displacy

# displays inference for one sentence
def display_inference(text):
    ner_results = pipe(text)
    spacy_pipe = spacy.blank("en")
    doc = spacy_pipe(text)

    ents = []
    for i in ner_results:
        span = doc.char_span(i['start'], i['end'], label=i['entity_group']) #None if mapping issue
        ents.append(span)
    doc.ents = ents 

    colors = {"ENT": "#C5BDF4", "ACT": "#FFD882", "ATTR": "#D9FBAD", "RELOP": "#FFDAF9", "QUANT": "#C2F2F6"}
    options = {"ents": ['ENT', 'ACT', 'ATTR', 'RELOP', 'QUANT'], "colors": colors}

    displacy.render(doc, style = 'ent', options = options)

    for i in ner_results:
        print(f'{i}')

In [None]:
input_model = "dxiao/bert-finetuned-ner-100percent"
input_dataset = 'dxiao/requirements-ner-id'

tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
model = AutoModelForTokenClassification.from_pretrained(input_model)
pipe = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")

display_inference(example)

In [23]:
example = ['The', 'Payload', 'shall', 'resist', 'an', 'acceleration', 'of', 'at' ,'least', '60Gs' ,'of', 'shocks']
example = ['The', 'probability', 'of', 'undetected', 'frame', 'error', 'for', 'the', 'COM', 'TCC', 'Terminal', 'uplink', 'shall', 'be', '<','19', '-', 'Oct.']

In [24]:
# Rough prototype pipeline

import torch
import numpy as np

encoding = tokenizer(example, return_tensors="pt", truncation=True, is_split_into_words=True) #same params as in evalrun
outputs = model(**encoding)
logits = outputs.logits
predictions = torch.nn.functional.softmax(logits, dim=-1)[0]

# -> use of datasets directly with list of words. No need for detokenization anymore!
# -> Output of entire entropy strategy should be the id_list of the top candidates 

# TODO: entropy math on predictions -> entropy score for each sentence
# TODO: make fast using numpy
# TODO: scale up with datasets -> leverage batched mapping. to entropy calc of dataset. Turn dataset into df and extract candidate indices
# TODO: instead of combining words into sentence and then tokenizing one by one, try batched_mapping from evalrun

In [38]:
import torch
import numpy as np
from numba import jit

# perform inference on sentences
# Input: word-tokenized sentence as list
# Output: predictions [#words, #labels] as np.array

def inference(sentence):
    encoding = tokenizer(sentence, return_tensors="pt", truncation=True, is_split_into_words=True) #same params as in evalrun
    outputs = model(**encoding)
    logits = outputs.logits
    predictions = torch.nn.functional.softmax(logits, dim=-1)[0]
    predictions = predictions.detach().numpy()
    return predictions

In [39]:
# calculate max entropy score from predictions of one sentence
# Input: predictions in shape [#words, #labels] as np.array
# Output: max entropy value 

@jit(nopython=True)
def max_entropy_calculation(predictions):
    entropy_list = []

    for word in predictions: # word basis
        entropy = 0
        for label_prob in word: #label basis    
            added_entropy = -label_prob*np.log(label_prob)
            entropy += added_entropy
        entropy_list.append(entropy)

    return max(entropy_list)


In [41]:
# EN: return stepwise list of top candidates & non-candidates

def top_candidates_EN(dataset, step_size):
    candidate_list = []
    df = dataset.to_pandas()
    df['entropy'] = df['tokens'].apply(lambda x: max_entropy_calculation(inference(x.tolist()))) #entropy calc on predictions
    df = df.sort_values(by='entropy',ascending=False)
    candidate_list = list(df.index[:step_size])
    non_candidate_list = list(df.index[step_size+1:])
    return candidate_list, non_candidate_list

In [29]:
candidate_list

Unnamed: 0,id,tokens,tags,ner_tags,entropy
27,27,"[The, probability, of, undetected, frame, erro...","[O, B-ATTR, I-ATTR, I-ATTR, I-ATTR, I-ATTR, O,...","[0, 3, 4, 4, 4, 4, 0, 0, 9, 10, 10, 10, 0, 3, ...",1.909202
440,440,"[The, probability, of, frame, loss, for, the, ...","[O, B-ATTR, I-ATTR, I-ATTR, I-ATTR, O, O, B-EN...","[0, 3, 4, 4, 4, 0, 0, 9, 10, 10, 10, 10, 1, 5,...",1.903301
614,614,"[The, BER, on, physical, channel, for, the, CO...","[O, B-ATTR, I-ATTR, I-ATTR, I-ATTR, O, O, B-EN...","[0, 3, 4, 4, 4, 0, 0, 9, 10, 10, 10, 0, 1, 5, ...",1.866802
481,481,"[Digitisation, for, each, polarisation, shall,...","[B-ATTR, O, O, B-ENT, O, B-ACT, O, O, O, B-REL...","[3, 0, 0, 9, 0, 1, 0, 0, 0, 5, 6, 0, 0, 0, 0, ...",1.831017
40,40,"[The, algorithm, shall, produce, a, cloud, mas...","[O, O, O, O, O, B-ENT, I-ENT, I-ENT, O, B-ACT,...","[0, 0, 0, 0, 0, 9, 10, 10, 0, 1, 0, 3, 4, 4, 4...",1.797966
...,...,...,...,...,...
2,2,"[Vertical, profiles, shall, be, measured, in, ...","[B-ENT, I-ENT, O, B-ACT, I-ACT, O, O, B-ATTR, ...","[9, 10, 0, 1, 2, 0, 0, 3, 4, 0, 5, 7, 8, 8, 0,...",0.040787
182,182,"[The, fine, frequency, cells, for, the, SKA1_L...","[O, B-ATTR, I-ATTR, I-ATTR, O, O, B-ENT, I-ENT...","[0, 3, 4, 4, 0, 0, 9, 10, 0, 1, 0, 7, 0, 0, 0,...",0.040345
360,360,"[WFOS, ,, in, spectroscopy, mode, ,, shall, ha...","[B-ENT, O, O, O, O, O, O, B-ACT, O, B-ATTR, I-...","[9, 0, 0, 0, 0, 0, 0, 1, 0, 3, 4, 4, 4, 0, 5, ...",0.035466
287,287,"[The, SKA1_Mid, dishes, ,, when, the, band, 2,...","[O, B-ENT, I-ENT, O, O, O, O, O, O, O, O, O, O...","[0, 9, 10, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,...",0.034628


In [26]:
predictions

tensor([[4.3555e-01, 3.6871e-02, 2.8352e-02, 9.7163e-02, 7.8188e-02, 4.0228e-02,
         2.5014e-02, 4.1915e-02, 1.2168e-01, 5.6276e-02, 3.8757e-02],
        [9.9738e-01, 6.1271e-04, 1.1583e-04, 4.8168e-04, 6.9839e-04, 8.0477e-05,
         4.9122e-05, 4.5355e-05, 1.6888e-04, 1.2886e-04, 2.3587e-04],
        [1.5931e-04, 2.1190e-04, 1.5912e-04, 9.9804e-01, 3.0016e-04, 5.1444e-04,
         1.7116e-04, 8.6053e-05, 8.9181e-05, 2.0549e-04, 6.3765e-05],
        [3.2736e-05, 1.7553e-05, 4.6170e-04, 1.7169e-04, 9.9910e-01, 4.0421e-05,
         3.9969e-05, 2.7235e-05, 1.7903e-05, 1.3213e-05, 7.3108e-05],
        [2.7579e-05, 1.9359e-05, 6.3249e-04, 1.1077e-04, 9.9893e-01, 2.6163e-05,
         4.1693e-05, 3.0933e-05, 1.4414e-05, 2.1519e-05, 1.4958e-04],
        [3.2212e-05, 1.9937e-05, 7.7785e-04, 8.1562e-05, 9.9872e-01, 2.8533e-05,
         5.0583e-05, 3.1145e-05, 1.7425e-05, 2.2687e-05, 2.2072e-04],
        [3.1963e-05, 1.8113e-05, 6.9847e-04, 6.4996e-05, 9.9886e-01, 2.4579e-05,
         4.43

In [None]:
encoding

In [None]:
outputs = model(**encoding)

In [None]:
logits = outputs.logits
print(logits.shape)

In [None]:
logits[0][2]

In [None]:
predicted_label_classes = logits.argmax(-1) # use something different than argmax maybe?
print(predicted_label_classes)

In [None]:
predicted_labels = [model.config.id2label[id] for id in predicted_label_classes.squeeze().tolist()]
print(predicted_labels)

In [None]:
import torch

predictions = torch.nn.functional.softmax(logits, dim=-1)
predictions.shape

In [28]:
predictions[0][2]

tensor(0.0284, grad_fn=<SelectBackward0>)

In [None]:
for id, label in zip(encoding.input_ids.squeeze().tolist(), predicted_label_classes[0]):
  print(tokenizer.decode([id]), label)

In [27]:
for id, label in zip(encoding.input_ids.squeeze().tolist(), predicted_labels):
  print(tokenizer.decode([id]), label)

NameError: name 'predicted_labels' is not defined