### Aggregating model inference and scaling to unlabeled data

In [1]:
# ToDos:

# 1. Implement model inference based on finetuned transformer (on cloud) [DONE]
# 2. Aggregate inference of entities in one sentence into aggregated certainty score (ACS) [DONE]
# 3. Detokenize dataset from word tokens into sentence to be labeled (will this worsen results?) [DONE]
# 4. Scale ACS inference to hf dataset and rank by ACS score [DONE]
# 5. Scale to selected dataset slices and return as new input dataset [DONE]

In [2]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline
from datasets import load_dataset
from datasets import Dataset
from spacy import displacy
import pandas as pd
import spacy

In [3]:
# model config: fine-tuned NER checkpoint, dataset id and selection size

input_model = "dxiao/bert-finetuned-ner-100percent"
input_dataset = 'dxiao/requirements-ner-id'
step_size = 60  # number of lowest-ACS rows kept when the ranked dataset is built

# NOTE(review): the tokenizer is loaded from the base "bert-base-cased" checkpoint,
# not from `input_model` — confirm both share the same vocabulary.
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
model = AutoModelForTokenClassification.from_pretrained(input_model)
# NER pipeline used for all inference in this notebook
pipe = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")
xiao_data = load_dataset(input_dataset)

Using custom data configuration dxiao--requirements-ner-a9d27206730c3bd0
Found cached dataset json (C:/Users/dekai/.cache/huggingface/datasets/dxiao___json/dxiao--requirements-ner-a9d27206730c3bd0/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab)


  0%|          | 0/3 [00:00<?, ?it/s]

In [4]:
# Detokenize dataset
import re
from nltk.tokenize.treebank import TreebankWordDetokenizer as Detok

def detokenize(token_list):
    """Join a list of word tokens back into a single text string.

    The tokens are merged with NLTK's Treebank detokenizer. Afterwards the
    spacing around commas is normalised to ", " and any whitespace around
    periods is removed.

    Args:
        token_list: Sequence of word tokens belonging to one sentence.

    Returns:
        The detokenized text.
    """
    detokenizer = Detok()
    text = detokenizer.detokenize(token_list)
    # Raw strings: '\s' and '\.' are invalid escape sequences in a plain
    # string literal and trigger a warning on current Python versions.
    text = re.sub(r'\s*,\s*', ', ', text)
    text = re.sub(r'\s*\.\s*', '.', text)
    return text

In [5]:
# calculate average of all certainty scores in ner_results
# might be calculated faster -> Vectorize with np
def calculate_ACS(text, empty_score=0.0):
    """Compute the aggregated certainty score (ACS) of one text.

    Runs the module-level NER pipeline `pipe` on `text` and averages the
    `score` of every returned entity.

    Args:
        text: Sentence to run inference on.
        empty_score: Value returned when the pipeline finds no entity at all.
            The default of 0.0 treats such a sentence as maximally uncertain;
            pass e.g. ``float('nan')`` to mark it as undefined instead.

    Returns:
        The mean entity score, or `empty_score` if there are no entities.
    """
    ner_results = pipe(text)
    scores = [entity['score'] for entity in ner_results]
    # Without this guard a sentence with zero detected entities would raise
    # ZeroDivisionError and abort the whole DataFrame.apply run.
    if not scores:
        return empty_score
    return sum(scores) / len(scores)

In [6]:
# Build the ranked frame in one chain:
#   1. convert the train split to a DataFrame
#   2. add `text` (detokenized tokens)
#   3. add `ACS` (model inference per text, ~30s for 600 rows)
#   4. order rows from lowest to highest ACS
df_xiao_train = (
    xiao_data['train']
    .to_pandas()
    .assign(text=lambda frame: frame['tokens'].apply(detokenize))
    .assign(ACS=lambda frame: frame['text'].apply(calculate_ACS))
    .sort_values(by='ACS')
)
# Keep the first `step_size` rows, convert back to a Dataset and drop the
# index column that the pandas round trip adds.
xiao_data_train_new = (
    Dataset.from_pandas(df_xiao_train.iloc[:step_size])
    .remove_columns('__index_level_0__')
)
xiao_data_train_new

Dataset({
    features: ['id', 'tokens', 'tags', 'ner_tags', 'text', 'ACS'],
    num_rows: 60
})

#### end

In [7]:
# ids of the `step_size` rows with the lowest ACS, as a list
# (df_xiao_train is already sorted ascending by ACS; use step_size instead of
# a hard-coded 60 so this stays in sync with the config cell)
lowest_ACS = list(df_xiao_train['id'].iloc[:step_size])

In [8]:
# sanity check: number of ids selected
len(lowest_ACS)

60

In [9]:
# filter function for selected ids [id_list] to be mapped onto a dataset
def filter_ids(row, id_list):
    """Predicate telling whether an id belongs to the selected ids.

    Intended to be used as the function passed to a dataset filter.

    Args:
        row: The id value of the row being tested.
        id_list: Collection of ids to keep.

    Returns:
        True if `row` is contained in `id_list`, otherwise False.
    """
    # Return an explicit boolean: returning the id itself made id 0 falsy,
    # so a row with id 0 could never pass the filter.
    return row in id_list

In [10]:
# select the slice with the lowest ACS scores
# NOTE: observed problem — the filter function always cuts off after id 615, no matter the size of the dataset.
# The ids 616-618 are hard-coded here (presumably to reproduce that — confirm before swapping in `lowest_ACS`).
filtered_data = xiao_data['train'].filter(lambda x: filter_ids(x['id'],[616,617,618]))
filtered_data

Loading cached processed dataset at C:/Users/dekai/.cache/huggingface/datasets/dxiao___json/dxiao--requirements-ner-a9d27206730c3bd0/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab\cache-269dcb83466dfa42.arrow


Dataset({
    features: ['id', 'tokens', 'tags', 'ner_tags'],
    num_rows: 0
})

In [11]:
# ids that are in lowest_ACS but missing from filtered_data
# open question: why is there a difference of 3 elements?
list(set(lowest_ACS) - set(filtered_data['id']))

[512,
 257,
 386,
 4,
 134,
 137,
 9,
 267,
 16,
 17,
 280,
 537,
 536,
 27,
 414,
 416,
 163,
 36,
 421,
 35,
 423,
 46,
 47,
 561,
 433,
 434,
 52,
 566,
 183,
 440,
 55,
 446,
 68,
 197,
 198,
 589,
 212,
 349,
 94,
 478,
 481,
 97,
 357,
 614,
 103,
 361,
 235,
 492,
 237,
 496,
 626,
 499,
 628,
 245,
 118,
 632,
 253,
 506,
 509,
 510]

In [12]:
# TYPE IN EXAMPLE HERE — free-text sentence for trying out single-sentence inference
example = "The Payload shall resist an acceleration of at least 60Gs of shocks"

In [13]:
# displays inference for one sentence
def display_inference(text):
    """Render the NER predictions for one sentence and print the raw results.

    Runs the module-level `pipe` on `text`, maps every predicted entity onto
    a blank English spaCy doc through its character offsets, renders the doc
    with displaCy's entity visualizer and finally prints each raw prediction.

    Args:
        text: Sentence to run inference on.
    """
    ner_results = pipe(text)
    spacy_pipe = spacy.blank("en")
    # Build the doc from the argument (not the global `example`) so that the
    # character offsets returned by the pipeline refer to the same string.
    doc = spacy_pipe(text)

    ents = []
    for entity in ner_results:
        span = doc.char_span(entity['start'], entity['end'], label=entity['entity_group'])
        # char_span yields None when the offsets do not line up with spaCy's
        # token boundaries; None cannot be assigned to doc.ents, so such
        # predictions are left out of the rendering (they are still printed below).
        if span is not None:
            ents.append(span)
    doc.ents = ents

    colors = {"ENT": "#C5BDF4", "ACT": "#FFD882", "ATTR": "#D9FBAD", "RELOP": "#FFDAF9", "QUANT": "#C2F2F6"}
    options = {"ents": ['ENT', 'ACT', 'ATTR', 'RELOP', 'QUANT'], "colors": colors}

    displacy.render(doc, style='ent', options=options)

    for entity in ner_results:
        print(f'{entity}')