### Software mentions model (inference mode)

In [2]:
import numpy as np
import torch
from transformers import BertForTokenClassification, BertTokenizerFast

Load pretrained model and tokenizer

In [3]:
trained_model = "./models/scibert_software_sent"
tokenizer = BertTokenizerFast.from_pretrained(trained_model, do_lower_case=False)
model = BertForTokenClassification.from_pretrained(trained_model)

Preprocessing/ Postprocessing functions

In [4]:
def get_software_ver_labels(data):
    all_sent = []
    all_tokens = []
    all_labels = []
    sentences = data.split(". ")
    for sentence in sentences: 
        sentence = sentence[:512]
        tokenized_sentence = tokenizer.encode(sentence)
        input_ids = torch.tensor([tokenized_sentence])#.cuda()
        with torch.no_grad():
            output = model(input_ids)
        label_indices = np.argmax(output[0].to('cpu').numpy(), axis=2)
        tokens = tokenizer.convert_ids_to_tokens(input_ids.to('cpu').numpy()[0])

        new_tokens, new_labels = [], []
        for token, label_idx in zip(tokens, label_indices[0]):
            if token.startswith("##"):
                new_tokens[-1] = new_tokens[-1] + token[2:]
            else:
                new_labels.append(tag_values[label_idx])
                new_tokens.append(token)
        all_tokens.extend(new_tokens[1:-1])
        all_labels.extend(new_labels[1:-1])
    return list(zip(all_tokens, all_labels))

def collapse(ner_result):
    collapsed_list = []
    current_entity_tokens = []
    current_entity = None
    for token, tag in ner_result:
        if tag == "O" or tag == 'I-version' or tag == 'B-version':
            continue
        if tag.startswith("B-"):
            if current_entity is not None:
                collapsed_list.append(
                    (" ".join(current_entity_tokens), current_entity))
            current_entity = tag[2:]
            current_entity_tokens = [token]
        elif tag == "I-" + str(current_entity):
            current_entity_tokens.append(str(token))
        else:
            pass
    if current_entity is not None:
        collapsed_list.append(
            (" ".join(current_entity_tokens), current_entity))
    return collapsed_list

Tag values: 

In [5]:
tag_values = ['I-version', 'O', 'I-software', 'B-version', 'B-software', 'PAD']

Example: 

In [6]:
test_sentence = "I used Python package DBSCAN 1.234 for this analysis"

In [7]:
print (get_software_ver_labels(test_sentence))

[('I', 'O'), ('used', 'O'), ('Python', 'B-software'), ('package', 'O'), ('DBSCAN', 'B-software'), ('1', 'B-version'), ('.', 'B-version'), ('234', 'B-version'), ('for', 'O'), ('this', 'O'), ('analysis', 'O')]


In [8]:
softw_sent = get_software_ver_labels(test_sentence)

In [9]:
softw_sent

[('I', 'O'),
 ('used', 'O'),
 ('Python', 'B-software'),
 ('package', 'O'),
 ('DBSCAN', 'B-software'),
 ('1', 'B-version'),
 ('.', 'B-version'),
 ('234', 'B-version'),
 ('for', 'O'),
 ('this', 'O'),
 ('analysis', 'O')]

In [10]:
collapsed_software = collapse(softw_sent)

In [11]:
print(collapsed_software)

[('Python', 'software'), ('DBSCAN', 'software')]


In [12]:
software_list = [x[0] for x in collapsed_software if x[1]=="software"]

In [13]:
software_list

['Python', 'DBSCAN']

In [14]:
def get_software_mentions(sentence):
    labels = get_software_ver_labels(sentence)
    collapse_labels = collapse(labels)
    software_list = [x[0] for x in collapse_labels if x[1]=="software"]
    return software_list

In [15]:
get_software_mentions("I used Python package DBSCAN 1.234 for this analysis")

['Python', 'DBSCAN']

In [16]:
test1 = "This is the start of the official scanpy twitter account. \
         We'd like to share exciting news today: We just released scanpy 1.7.0! \
         Check out what's new at http://scanpy.readthedocs.io"

In [17]:
get_software_mentions(test1)

['scanpy']

In [18]:
test2 = "Realistic scRNA-seq Generation with Automatic Cell-Type identification using Introspective Variational Autoencoders"

In [19]:
get_software_mentions(test2)

[]

In [20]:
test3 = "New tool: SMILE, Mutual Information Learning for Integration of Single Cell Omics Data"

In [21]:
get_software_mentions(test3)

['SMILE']

In [22]:
test4 = "DeepDRIM is a supervised deep neural network model for predicting GRNs from scRNA-seq"

In [23]:
get_software_mentions(test4)

['DeepDRIM']

In [24]:
test5 = "Update: New CRAN repository for LIGER"

In [25]:
get_software_mentions(test5)

[]

In [26]:
test6 = "R package for differential expression (DE) analysis and gene set testing (GST) for scRNA-seq"

In [27]:
get_software_mentions(test6)

['R package differential expression']

In [28]:
test7 = "scGEAToolbox is a comprehensive Matlab toolbox for scRNA-seq data analysis"

In [29]:
get_software_mentions(test7)

['scGEAToolbox', 'Matlab']

In [30]:
test8 = "JOINT performs probability-based cell-type identification and DEG analysis simultaneously without the need for imputation"

In [31]:
get_software_mentions(test8)

['JOINT']

In [32]:
test9 = "The coronavirus disease 2019 (COVID-19) pandemic has affected millions of people worldwide."

In [33]:
get_software_mentions(test9)

[]

In [34]:
test10 = "This article illustrates the use of Microsoft OneNote as an Electronic Notebook for undergraduate \
biochemistry lab reports as well as student opinions of this use both before and during the online coursework \
shift during the pandemic."

In [35]:
get_software_mentions(test10)

['OneNote']

In [37]:
test11 = "Google trends and COVID-19 in Italy: could we brace for impact?"

In [38]:
get_software_mentions(test11)

[]