### Aggregating model inference and scaling to unlabeled data

In [98]:
# ToDos:

# 1. Implement model inference based on finetuned transformer (on cloud) [DONE]
# 2. Aggregate inference of entities in one sentence into aggregated certainty score (ACS) [DONE]
# 3. Detokenize dataset from word tokens into sentence to be labeled (will this worsen results?) [DONE]
# 4. Scale ACS inference to hf dataset and rank by ACS score [DONE]
# 5. Scale to selected dataset slices and return as new input dataset [DONE]

In [99]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline
from datasets import load_dataset
from datasets import Dataset
import pandas as pd
import spacy

In [100]:
# model config
# Everything a reader might tune lives in this cell: the fine-tuned checkpoint,
# the dataset id, and the number of candidates selected per iteration.

input_model = "dxiao/bert-finetuned-ner-10percent"
input_dataset = 'dxiao/requirements-ner-id'
# Number of lowest-ACS sentences moved into the training set per call to
# inference_aggregation.
step_size = 60

# NOTE(review): the tokenizer is loaded from the base checkpoint, not from
# `input_model` — presumably the fine-tuned model shares the bert-base-cased
# vocabulary; confirm.
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
model = AutoModelForTokenClassification.from_pretrained(input_model)
# aggregation_strategy="simple" makes the pipeline return grouped entities;
# the 'entity_group', 'score', 'start' and 'end' keys are read by later cells.
pipe = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")
xiao_data = load_dataset(input_dataset)

Using custom data configuration dxiao--requirements-ner-a9d27206730c3bd0
Found cached dataset json (C:/Users/dekai/.cache/huggingface/datasets/dxiao___json/dxiao--requirements-ner-a9d27206730c3bd0/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab)


  0%|          | 0/3 [00:00<?, ?it/s]

In [101]:
# Detokenize dataset
import re
from nltk.tokenize.treebank import TreebankWordDetokenizer as Detok

def detokenize(token_list):
    """Join a list of word tokens back into a single sentence string.

    The tokens are first merged with NLTK's Treebank detokenizer, then the
    spacing around commas and periods is normalised with two regex passes.

    Args:
        token_list: Iterable of string tokens making up one sentence.

    Returns:
        The detokenized sentence as a ``str``.
    """
    detokenizer = Detok()
    text = detokenizer.detokenize(token_list)
    # Raw strings: '\s' and '\.' are invalid escape sequences in a normal
    # string literal and trigger a SyntaxWarning on recent Python versions.
    # Every comma ends up followed by exactly one space and preceded by none.
    text = re.sub(r'\s*,\s*', ', ', text)
    # All whitespace on either side of a period is removed (so "3 . 5" becomes
    # "3.5"; a space following a sentence-ending period is removed as well).
    text = re.sub(r'\s*\.\s*', '.', text)
    return text

In [102]:
# calculate average of all certainty scores in ner_results
# might be calculated faster -> Vectorize with np
def calculate_ACS(text):
    """Return the aggregated certainty score (ACS) of one sentence.

    The sentence is run through the module-level NER ``pipe`` and the
    ``'score'`` values of all returned entities are averaged.

    Args:
        text: Sentence to run inference on.

    Returns:
        The arithmetic mean of the entity scores, or ``0`` when the pipeline
        returns no entities for the sentence.
    """
    scores = [entity['score'] for entity in pipe(text)]
    if scores:
        return sum(scores) / len(scores)
    # No entities detected: nothing to average.
    return 0

In [124]:
# dataset input has to be 'train' set

def top_candidates(dataset, step_size):
    """Rank the rows of ``dataset`` by ACS and split them into two index lists.

    Each row's ``'tokens'`` are detokenized into a ``'text'`` column, the ACS
    of that text is computed, and the rows are sorted from lowest to highest
    ACS.

    Args:
        dataset: Object exposing ``to_pandas()`` whose frame has a ``'tokens'``
            column (the caller passes a Hugging Face dataset split).
        step_size: Number of lowest-ACS rows to return as candidates.

    Returns:
        A tuple ``(candidate_list, non_candidate_list)`` of DataFrame index
        labels: the ``step_size`` lowest-ACS rows, and all remaining rows in
        ascending ACS order. Together the two lists cover every row exactly
        once.
    """
    df = dataset.to_pandas()
    # detokenize tokens into new text column
    df['text'] = df['tokens'].apply(detokenize)
    # inference calculation of ACS and new ACS column
    df['ACS'] = df['text'].apply(calculate_ACS)  # takes ~30s for 600 rows
    # rank from lowest ACS to highest
    ranked_index = df.sort_values(by='ACS').index
    candidate_list = ranked_index[:step_size].tolist()
    # Start exactly at step_size: the previous `step_size+1` slice dropped the
    # row ranked at position step_size from both lists.
    non_candidate_list = ranked_index[step_size:].tolist()
    return candidate_list, non_candidate_list

In [139]:
# Function to perform inference aggregation and move the selected candidates between datasets.
# Both input datasets have to be slices of input_dataset['train'].
# e.g. input_train_dataset = seed dataset (60 sentences)
# e.g. input_inference_dataset = remaining dataset (540 sentences)

from datasets import concatenate_datasets

def inference_aggregation(input_train_dataset, input_inference_dataset, step_size):
    """Move the lowest-ACS rows from the inference pool into the training set.

    Args:
        input_train_dataset: Current training dataset.
        input_inference_dataset: Pool of rows to rank by ACS.
        step_size: Number of rows to transfer from the pool.

    Returns:
        A tuple ``(output_train_dataset, output_inference_dataset)``: the
        training dataset extended with the selected candidates, and the pool
        restricted to the non-candidate rows.
    """
    picked, remaining = top_candidates(input_inference_dataset, step_size)

    # Append the selected candidates to the training data.
    output_train_dataset = concatenate_datasets(
        [input_train_dataset, input_inference_dataset.select(picked)]
    )
    # Rows that were not selected form the new inference pool.
    output_inference_dataset = input_inference_dataset.select(remaining)

    return output_train_dataset, output_inference_dataset

In [140]:
# Seed the training set with rows 0-59 and use every remaining row as the
# inference pool. The pool must start at row 60: the previous start of 61
# silently dropped row 60 from both datasets.
x, y = inference_aggregation(
    xiao_data['train'].select(range(60)),
    xiao_data['train'].select(range(60, xiao_data['train'].num_rows)),
    step_size,
)

#### end

In [None]:
# Collect the 'id' values of the first 60 rows of df_xiao_train as a list.
# NOTE(review): `df_xiao_train` is not defined in any visible cell; presumably
# it was a DataFrame already sorted by ascending ACS — confirm or remove this cell.
lowest_ACS = list(df_xiao_train['id'][:60])

In [None]:
# TYPE IN EXAMPLE HERE
# Sample requirement sentence. NOTE(review): presumably meant as input for
# display_inference — no visible cell passes it on; confirm.
example = "The Payload shall resist an acceleration of at least 60Gs of shocks"

In [16]:
from spacy import displacy

# displays inference for one sentence
def display_inference(text):
    """Render the NER predictions for one sentence and print the raw results.

    The sentence is run through the module-level ``pipe``; each predicted
    entity is mapped onto a blank English spaCy document by character offsets
    and rendered with displaCy's entity visualiser. Afterwards every raw
    pipeline result is printed, one per line.

    Args:
        text: Sentence to run inference on.

    Returns:
        None. Output is produced via ``displacy.render`` and ``print``.
    """
    ner_results = pipe(text)
    spacy_pipe = spacy.blank("en")
    doc = spacy_pipe(text)

    ents = []
    for result in ner_results:
        span = doc.char_span(result['start'], result['end'], label=result['entity_group'])
        # char_span returns None when the character offsets do not line up
        # with spaCy's token boundaries; assigning None to doc.ents raises, so
        # such entities are left out of the rendering (they are still printed
        # below).
        if span is not None:
            ents.append(span)
    doc.ents = ents

    colors = {"ENT": "#C5BDF4", "ACT": "#FFD882", "ATTR": "#D9FBAD", "RELOP": "#FFDAF9", "QUANT": "#C2F2F6"}
    options = {"ents": ['ENT', 'ACT', 'ATTR', 'RELOP', 'QUANT'], "colors": colors}

    displacy.render(doc, style='ent', options=options)

    for result in ner_results:
        print(f'{result}')

In [1]:
# Scratch cell: prints the integers 0, 1 and 2, one per line.
for i in range(3):
    print(i)

0
1
2
