In [1]:
import datasets, json

RANDOM_SEED = 0

def load_dataset(path, split=[0.9, 0.05, 0.05]):
    '''
    Load the labeled scenarios from a JSON file and split them into
    train / validation / test sets.

    The JSON file is read as a mapping of scenario id -> record. From each
    record, 'words' becomes the token list and 'codes' becomes the NER tags.
    The resulting Dataset has three columns:
        - id: the id of the scenario
        - tokens: tokenized words
        - ner_tags: the NER tags of the tokens (one of O, B-/I-SIM, B-/I-COM,
          B-/I-QUE)

    @ param path: file path to dataset json file.
    @ param split: [train, validation, test] split fractions. Must be three
        values, each strictly between 0 and 1, that sum to 1.00. The default
        list is only read, never mutated.
    @ return: a datasets.DatasetDict with 'train', 'validation' and 'test'.
    @ raises Exception: if split does not have three values or does not sum
        to 1.00.
    @ raises ValueError: if any split fraction is not strictly between 0 and 1.
    '''

    # verify that the split is [train, validation, test] and sums to 1.00
    if len(split) != 3:
        raise Exception("Split parameter requires three values [train, validation, test], but %i found: %s" % (len(split), split))
    if abs(split[0] + split[1] + split[2] - 1.0) > 1e-5:
        raise Exception("Split must sum to 1.00, but sum = %0.4f" % (sum(split)))

    # Both train_test_split calls below take fractional test sizes, so a zero
    # or negative fraction cannot produce a valid split. Reject it here, before
    # any file I/O, with a message that names the real problem.
    if any(not 0.0 < fraction < 1.0 for fraction in split):
        raise ValueError("Each split fraction must be strictly between 0 and 1, but got: %s" % (split,))

    # read and format dataset
    # (explicit UTF-8 so the result does not depend on the platform's locale)
    with open(path, 'r', encoding='utf-8') as f:
        raw_data = json.load(f)

    res = {'id': [], 'tokens': [], 'ner_tags': []}
    for scenario_id, data in raw_data.items():
        res['id'].append(scenario_id)
        res['tokens'].append(data['words'])
        res['ner_tags'].append(data['codes'])

    dataset = datasets.Dataset.from_dict(res, features=datasets.Features({
        "id": datasets.Value("string"),
        "tokens": datasets.Sequence(datasets.Value("string")),
        "ner_tags": datasets.Sequence(
            datasets.features.ClassLabel(
                names=['O', 'B-SIM', 'I-SIM', 'B-COM', 'I-COM', 'B-QUE', 'I-QUE']
            )
        ),
    }))

    # randomize data, then hold out the combined validation + test fraction;
    # the fixed seed keeps the split reproducible between runs
    train_testvalid = dataset.shuffle(seed=RANDOM_SEED).train_test_split(test_size=split[1] + split[2], seed=RANDOM_SEED)

    # divide the held-out part into validation and test, in the proportion
    # requested by split[1] : split[2]
    test_valid = train_testvalid['test'].train_test_split(test_size=split[2]/(split[1] + split[2]), seed=RANDOM_SEED)

    # gather datasets into single DatasetDict
    # (the 'train' half of the second split is the validation set)
    train_test_valid_dataset = datasets.dataset_dict.DatasetDict({
        'train': train_testvalid['train'],
        'validation': test_valid['train'],
        'test': test_valid['test']
    })
    return train_test_valid_dataset

In [11]:
from spacy.training import offsets_to_biluo_tags, biluo_to_iob
import spacy

# Load spaCy's small English pipeline. eval_scenario() runs scenario text
# through it to get the tokens that predicted character spans are aligned to.
nlp_parser = spacy.load("en_core_web_sm")

def eval_scenario(scenario, nlp):
    '''
    Print each word of a scenario next to its true label and predicted label.

    The pipeline's predicted entities (character offsets) are aligned to the
    tokens that the module-level spaCy `nlp_parser` produces for the same
    text, converted to BILUO tags, and then to BIO tags for display.

    Input:
    @ param scenario: A raw scenario record with 'text' (the scenario string),
        'words' (its tokens) and 'codes' (the true label of each token)
    @ param nlp: A pipeline generated from pipeline(); every entity it returns
        must provide 'start', 'end' and 'entity_group'

    Output: nothing is returned; the table is printed to stdout. If the
    number of words, true labels and predicted labels differ, the missing
    cells are shown as '<missing>' and a warning line follows the table.
    '''
    from itertools import zip_longest

    words = scenario['words']

    # setup the expected labels for each word
    true_label = scenario['codes']

    # predict the named entities from the test scenario
    entities = nlp(scenario['text'])
    entity_triples = [
        [entity['start'], entity['end'], entity['entity_group']]
        for entity in entities
    ]

    # convert character-level label spans to BILUO tags
    doc = nlp_parser(scenario['text'])
    biluo_tags = offsets_to_biluo_tags(doc, entity_triples)

    # convert BILUO tags to BIO tags
    pred_label = biluo_to_iob(biluo_tags)

    print(f" {'WORD':<16} {'TRUE LABEL':<16} {'PREDICTION'}")
    print(f"{'-'*48}")

    # The predictions are per spaCy token while the words come from the
    # record, so the sequences are not guaranteed to be the same length.
    # Pad the shorter ones instead of raising IndexError or dropping rows.
    rows = zip_longest(words, true_label, pred_label, fillvalue='<missing>')
    for word, truth, prediction in rows:
        print(f" {word:<16} {truth:<16} {prediction}")

    if not (len(words) == len(true_label) == len(pred_label)):
        print(
            f" [warning] length mismatch: {len(words)} words, "
            f"{len(true_label)} true labels, {len(pred_label)} predictions; "
            f"rows may be misaligned"
        )

In [15]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline

# Path of the labeled-scenario file. It is defined here so this cell runs on a
# fresh kernel, and so the split ids and the raw records below are read from
# the same file.
DATASET_PATH = '../datasets/scenarios-labeled.json'

# load the scenario data to obtain the (seeded) random test set
raw_datasets = load_dataset(DATASET_PATH, [0.8, 0.1, 0.1])
test_set = raw_datasets["test"]
ner_feature = test_set.features["ner_tags"]
label_names = ner_feature.feature.names

# re-read the raw JSON and pick out the full records of the test-set ids;
# eval_scenario() needs the record fields, not just the tokens/tags columns
with open(DATASET_PATH, encoding='utf-8') as f:
    raw_dataset = json.load(f)
test_scenarios = [raw_dataset[scenario_id] for scenario_id in test_set['id']]

# instantiate the tokenizer and model, setup pipeline
model_path = './bert-finetuned-ner/checkpoint-300'
tokenizer = AutoTokenizer.from_pretrained(model_path)
# NOTE(review): ignore_mismatched_sizes=True can hide a mismatch between the
# checkpoint's classification head and the expected labels; confirm it is
# really needed when loading an already fine-tuned checkpoint.
model = AutoModelForTokenClassification.from_pretrained(model_path, ignore_mismatched_sizes=True)
nlp = pipeline('ner', model=model, tokenizer=tokenizer, aggregation_strategy='first')

In [16]:
# here is an example of making a prediction:
# take the first scenario of the test split and print its words alongside
# the true and predicted labels

test_scenario = test_scenarios[0]
eval_scenario(test_scenario, nlp)

 WORD             TRUE LABEL       PREDICTION
------------------------------------------------
 I                O                O
 use              O                O
 this             O                O
 for              O                O
 quick            O                O
 recordings       O                O
 ,                O                O
 mostly           O                O
 when             O                O
 I                O                O
 am               O                O
 writing          O                O
 songs            B-SIM            B-SIM
 or               O                O
 sketching        O                O
 ideas            B-COM            B-SIM
 for              I-COM            O
 future           I-COM            O
 songs            I-COM            B-SIM
 .                O                O
 I                O                O
 open             O                O
 the              O                O
 app              O                O
 ,   