### Software mentions model (inference mode)

In [1]:
import numpy as np
import torch
from transformers import BertForTokenClassification, BertTokenizerFast

Load model 

Instantiate model and tokenizer

In [2]:
# Relative path to the saved checkpoint; both the tokenizer and the model are loaded from it.
trained_model = "./models/scibert_software_version_2"
# do_lower_case=False keeps the original casing of the input text during tokenization.
tokenizer = BertTokenizerFast.from_pretrained(trained_model, do_lower_case=False)
# BERT with a token-classification head; used by get_software_ver_labels() for inference.
model = BertForTokenClassification.from_pretrained(trained_model)

Preprocessing / postprocessing functions

In [3]:
def get_software_ver_labels(data):
    """Predict a software/version tag for every token of ``data``.

    The text is split into sentences on ``". "`` and each sentence is run
    through the notebook-level ``tokenizer`` and ``model`` on its own.
    WordPiece continuation pieces (tokens starting with ``"##"``) are glued
    back onto the preceding token, and the merged token keeps the label
    predicted for its first piece.

    Args:
        data: Input text as a single string.

    Returns:
        A list of ``(token, label)`` tuples covering all sentences in order.
        Labels are looked up in the notebook-level ``tag_values`` list. The
        first and last token of every encoded sentence (the special tokens
        added by ``tokenizer.encode``) are not included.
    """
    max_chars = 512   # character cap per sentence
    max_tokens = 512  # hard cap on ids fed to the model, special tokens included

    all_tokens = []
    all_labels = []
    for sentence in data.split(". "):
        sentence = sentence[:max_chars]
        # A character cap alone does not bound the token count: text that
        # tokenizes to one piece per character would yield max_chars + 2 ids
        # and overflow the model's position embeddings. Truncate at token
        # level as well.
        tokenized_sentence = tokenizer.encode(
            sentence, truncation=True, max_length=max_tokens)
        input_ids = torch.tensor([tokenized_sentence])  # batch of size 1
        with torch.no_grad():
            output = model(input_ids)
        # output[0] holds the per-token class scores; pick the best class id.
        label_indices = np.argmax(output[0].to('cpu').numpy(), axis=2)
        tokens = tokenizer.convert_ids_to_tokens(tokenized_sentence)

        new_tokens, new_labels = [], []
        for token, label_idx in zip(tokens, label_indices[0]):
            if token.startswith("##"):
                # Continuation piece: extend the previous token, keep its label.
                new_tokens[-1] = new_tokens[-1] + token[2:]
            else:
                new_labels.append(tag_values[label_idx])
                new_tokens.append(token)
        # Drop the leading/trailing special tokens of this sentence.
        all_tokens.extend(new_tokens[1:-1])
        all_labels.extend(new_labels[1:-1])
    return list(zip(all_tokens, all_labels))

def collapse(ner_result):
    """Merge BIO-tagged tokens into whole entity mentions.

    Args:
        ner_result: Iterable of ``(token, tag)`` pairs, e.g. the output of
            ``get_software_ver_labels``.

    Returns:
        A list of ``(mention_text, entity_type)`` tuples, where
        ``mention_text`` is the space-joined tokens of one entity and
        ``entity_type`` is the tag without its ``B-`` prefix.

    Notes:
        * ``"O"`` and both ``<version>`` tags are skipped entirely; they do
          not close the entity that is currently being built.
        * An ``I-`` tag that does not match the open entity is ignored.
    """
    skipped_tags = ("O", "I-<version>", "B-<version>")

    mentions = []
    pieces = []
    open_type = None

    for token, tag in ner_result:
        if tag in skipped_tags:
            continue

        if tag.startswith("B-"):
            # A new entity begins: store the one in progress, if any.
            if open_type is not None:
                mentions.append((" ".join(pieces), open_type))
            open_type = tag[2:]
            pieces = [token]
            continue

        # Only a matching I- tag extends the open entity; anything else is dropped.
        if tag == "I-" + str(open_type):
            pieces.append(str(token))

    # Store the entity still open when the input ends.
    if open_type is not None:
        mentions.append((" ".join(pieces), open_type))
    return mentions

Tag values: 

In [4]:
# Label names, indexed by the class id predicted by the model (see get_software_ver_labels).
# NOTE(review): the order presumably has to match the label mapping used at training time — confirm.
tag_values = ['O', 'B-<software>', 'I-<software>', 'B-<version>', 'I-<version>', 'PAD']

Example: 

In [5]:
# Example input used by the demonstration cells that follow.
test_sentence = "I used Python package DBSCAN 1.234 for this analysis"

In [6]:
# Token-level predictions for the example sentence, as (token, label) pairs.
print (get_software_ver_labels(test_sentence))

[('I', 'O'), ('used', 'O'), ('Python', 'B-<software>'), ('package', 'I-<software>'), ('DBSCAN', 'B-<software>'), ('1', 'B-<version>'), ('.', 'I-<version>'), ('234', 'I-<version>'), ('for', 'O'), ('this', 'O'), ('analysis', 'O')]


In [7]:
# Keep the (token, label) pairs so they can be post-processed with collapse().
softw_sent = get_software_ver_labels(test_sentence)

In [8]:
# Inspect the labelled tokens.
softw_sent

[('I', 'O'),
 ('used', 'O'),
 ('Python', 'B-<software>'),
 ('package', 'I-<software>'),
 ('DBSCAN', 'B-<software>'),
 ('1', 'B-<version>'),
 ('.', 'I-<version>'),
 ('234', 'I-<version>'),
 ('for', 'O'),
 ('this', 'O'),
 ('analysis', 'O')]

In [9]:
# Merge B-/I- software tokens into whole mentions; collapse() skips the version tags.
collapsed_software = collapse(softw_sent)

In [10]:
# Each entry is a (mention text, entity type) tuple.
print(collapsed_software)

[('Python package', '<software>'), ('DBSCAN', '<software>')]


In [11]:
# Keep only the mention text, discarding the entity type.
software_list = [x[0] for x in collapsed_software]

In [12]:
# Software mention strings extracted from the example sentence.
software_list

['Python package', 'DBSCAN']

In [13]:
def get_software_mentions(sentence):
    """Return the software names mentioned in ``sentence``.

    Runs the full pipeline: token-level labeling with
    ``get_software_ver_labels``, merging of B-/I- tokens with ``collapse``,
    then extraction of the mention text.

    Args:
        sentence: Text to analyze (str).

    Returns:
        A list of mention strings in order of appearance; empty when nothing
        is tagged as software.
    """
    labels = get_software_ver_labels(sentence)
    collapse_labels = collapse(labels)
    # collapse() yields (mention text, entity type); only the text is returned.
    return [mention_text for mention_text, _entity_type in collapse_labels]

In [14]:
# Run the whole pipeline (labeling -> collapse -> mention names) in one call.
get_software_mentions("I used Python package DBSCAN 1.234 for this analysis")

['Python package', 'DBSCAN']