### Software Mention model applied to CORD-19 dataset

In [1]:
import numpy as np
import pandas as pd
import torch
import os
from transformers import BertForTokenClassification, BertTokenizerFast
import json
import time

Load trained model

Instantiate model and tokenizer

In [2]:
trained_model = "./models/scibert_software_sent"
tokenizer = BertTokenizerFast.from_pretrained(trained_model, do_lower_case=False)
model = BertForTokenClassification.from_pretrained(trained_model)

Preprocessing/ Postprocessing functions

In [3]:
def get_software_ver_labels(data):
    all_sent = []
    all_tokens = []
    all_labels = []
    sentences = data.split(". ")
    for sentence in sentences: 
        sentence = sentence[:512]
        tokenized_sentence = tokenizer.encode(sentence)
        input_ids = torch.tensor([tokenized_sentence])#.cuda()
        with torch.no_grad():
            output = model(input_ids)
        label_indices = np.argmax(output[0].to('cpu').numpy(), axis=2)
        tokens = tokenizer.convert_ids_to_tokens(input_ids.to('cpu').numpy()[0])

        new_tokens, new_labels = [], []
        for token, label_idx in zip(tokens, label_indices[0]):
            if token.startswith("##"):
                new_tokens[-1] = new_tokens[-1] + token[2:]
            else:
                new_labels.append(tag_values[label_idx])
                new_tokens.append(token)
        all_tokens.extend(new_tokens[1:-1])
        all_labels.extend(new_labels[1:-1])
    return list(zip(all_tokens, all_labels))

def collapse(ner_result):
    collapsed_list = []
    current_entity_tokens = []
    current_entity = None
    for token, tag in ner_result:
        if tag == "O" or tag == 'I-<version>' or tag == 'B-<version>':
            continue
        if tag.startswith("B-"):
            if current_entity is not None:
                collapsed_list.append(
                    (" ".join(current_entity_tokens), current_entity))
            current_entity = tag[2:]
            current_entity_tokens = [token]
        elif tag == "I-" + str(current_entity):
            current_entity_tokens.append(str(token))
        else:
            pass
    if current_entity is not None:
        collapsed_list.append(
            (" ".join(current_entity_tokens), current_entity))
    return collapsed_list

Tag values:

In [4]:
tag_values = ['I-version', 'O', 'I-software', 'B-version', 'B-software', 'PAD']

### Import CORD-19 data

Download from: https://ai2-semanticscholar-cord-19.s3-us-west-2.amazonaws.com/historical_releases.html

Date: 2021-02-08

Size: 7.4GB

In [6]:
cord19_data_loc = "./data/CORD19/"

In [7]:
os.listdir(cord19_data_loc) 

['document_parses',
 '.DS_Store',
 'changelog',
 'metadata.csv',
 'document_parses.tar.gz']

### Parse full text papers and metadata

In [16]:
metadata_csv = pd.read_csv(cord19_data_loc+"metadata.csv", low_memory=False)

metadata_csv_subs = metadata_csv[~metadata_csv['pdf_json_files'].isnull()]
metadata_csv_subs['paper_id'] = metadata_csv_subs['pdf_json_files'].apply(lambda x: x.split("/")[-1].split(".")[0])

all_files = []

for filename in os.listdir(cord19_data_loc+'document_parses/pdf_json'):
    filename = cord19_data_loc+'document_parses/pdf_json/' + filename
    file = json.load(open(filename, 'rb'))
    data_df = pd.DataFrame(file['body_text'])
    data_df = pd.concat([data_df, pd.DataFrame(file['abstract'])])
    data_df['paper_id'] = np.repeat(file['paper_id'], data_df.shape[0])
    all_files.append(data_df)
    
all_files_pdf_df = pd.concat(all_files)

pdfs_preproc = pd.merge(all_files_pdf_df, metadata_csv_subs , on = "paper_id", how = "inner")
pdfs_preproc_covid19 = pdfs_preproc[pdfs_preproc['publish_time']>='2020-01-01']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


### Extract mentions of software from all pdfs in "./document_parses/pdf_json"

Preprocess text: 

In [20]:
pdfs_preproc_covid19['text'] = pdfs_preproc_covid19['text'].astype(str)
pdfs_preproc_covid19_subset = pdfs_preproc_covid19[~pdfs_preproc_covid19['text'].apply(lambda x: "\n\t\t\t\t" in x)]
pdfs_preproc_covid19_subset.reset_index(inplace=True)
pdfs_preproc_covid19_subset['sentences'] = pdfs_preproc_covid19_subset['text'].apply(lambda x: x.split(". "))
pdfs_preproc_covid19_subset_sentences = pdfs_preproc_covid19_subset.explode('sentences')
pdfs_preproc_covid19_subset_sentences['len_sent'] = pdfs_preproc_covid19_subset_sentences['sentences'].apply(lambda x: len(x))
pdfs_preproc_covid19_subset_sentences_subs = pdfs_preproc_covid19_subset_sentences[(pdfs_preproc_covid19_subset_sentences['len_sent']>60)]
pdfs_preproc_covid19_subset_sentences_subs.drop(['index'], axis = 1, inplace = True)

In [26]:
pdfs_preproc_covid19_subset_sentences_subs.reset_index(inplace=True)

Get software mentions:

In [28]:
temp_s = []
for indx in pdfs_preproc_covid19_subset_sentences_subs.index:
    if indx%1000 ==0:
        print (indx)
    temp_s.append([indx, get_software_ver_labels(pdfs_preproc_covid19_subset_sentences_subs.loc[indx, 'sentences'])])

0


In [29]:
software_df = pd.DataFrame(temp_s)

Post-process text:

In [30]:
software_eval = pd.concat([pdfs_preproc_covid19_subset_sentences_subs[0:software_df.shape[0]] , software_df], axis = 1)
software_eval['software'] = software_eval[1].apply(lambda x: collapse(x))
software_eval['software_clean'] = software_eval['software'].apply(lambda x: [i[0] for i in x])

In [32]:
excl_list = ["COVID", "COVID-","19", "Android", "Apple","#",")","(","*", "+","-","scRNA", "Medline","Pubmed", "COVID -", "scRNA - seq", "nCODIV", "scRNAseq", "COVID", "COVID19", "COVID 19", "COVID-19", "COVID - 19" , "SARS", "Covid","MERS", "medRxiv", "bioRxiv", "smartphone", "H1N1", "H1N2"]
companies = ["Google", "Microsoft", "IBM", "Apple", "Twitter", "Facebook", "WhatsApp", "Instagram", "YouTube", "Zoom", "WeChat"]

In [33]:
software_eval['software_clean'] = software_eval['software_clean'].apply(lambda x: [i for i in x if (i not in excl_list and i not in companies)])

In [37]:
software_eval['software_clean'] = software_eval['software_clean'].apply(lambda x: [i for i in x if len(i)>1 or i =="R"])

In [35]:
software_eval[['section',          
               'source_x',            
               'title',
               'doi', 
               'license',         
               'abstract',     
               'publish_time', 
               'journal', 
               'url',  
               'sentences', 
               'software_clean']].head()

Unnamed: 0,section,source_x,title,doi,license,abstract,publish_time,journal,url,sentences,software_clean
0,,Medline; PMC,Clinical and epidemiological characteristics o...,10.1371/journal.pmed.1003130,cc-by,"BACKGROUND: As of April 18, 2020, over 2,000,0...",2020-06-16,PLoS Med,https://doi.org/10.1371/journal.pmed.1003130; ...,a1111111111 a1111111111 a1111111111 a111111111...,[]
1,,Medline; PMC,Clinical and epidemiological characteristics o...,10.1371/journal.pmed.1003130,cc-by,"BACKGROUND: As of April 18, 2020, over 2,000,0...",2020-06-16,PLoS Med,https://doi.org/10.1371/journal.pmed.1003130; ...,"In addition, significant increases in the leve...",[]
2,,Medline; PMC,Clinical and epidemiological characteristics o...,10.1371/journal.pmed.1003130,cc-by,"BACKGROUND: As of April 18, 2020, over 2,000,0...",2020-06-16,PLoS Med,https://doi.org/10.1371/journal.pmed.1003130; ...,Patchy lesions in lobules were detected by che...,[]
3,,Medline; PMC,Clinical and epidemiological characteristics o...,10.1371/journal.pmed.1003130,cc-by,"BACKGROUND: As of April 18, 2020, over 2,000,0...",2020-06-16,PLoS Med,https://doi.org/10.1371/journal.pmed.1003130; ...,"Ground-glass opacities, which were a typical f...",[]
4,,Medline; PMC,Clinical and epidemiological characteristics o...,10.1371/journal.pmed.1003130,cc-by,"BACKGROUND: As of April 18, 2020, over 2,000,0...",2020-06-16,PLoS Med,https://doi.org/10.1371/journal.pmed.1003130; ...,Rapid radiologic progression and a late-onset ...,[]


In [46]:
all_software = software_eval.explode("software_clean")

Mentions per sentence: 

In [67]:
all_software.head()

Unnamed: 0,index,text,cite_spans,ref_spans,section,paper_id,cord_uid,sha,source_x,title,...,pdf_json_files,pmc_json_files,url,s2_id,sentences,len_sent,0,1,software,software_clean
0,0,a1111111111 a1111111111 a1111111111 a111111111...,"[{'start': 899, 'end': 910, 'text': '(8.00-14....",[],,efe13333c69a364cb5d4463ba93815e6fc2d91c6,u9num2o9,efe13333c69a364cb5d4463ba93815e6fc2d91c6,Medline; PMC,Clinical and epidemiological characteristics o...,...,document_parses/pdf_json/efe13333c69a364cb5d44...,document_parses/pmc_json/PMC7297312.xml.json,https://doi.org/10.1371/journal.pmed.1003130; ...,219726388.0,a1111111111 a1111111111 a1111111111 a111111111...,80,0,"[(a1111111111, O), (a1111111111, O), (a1111111...",[],
1,0,a1111111111 a1111111111 a1111111111 a111111111...,"[{'start': 899, 'end': 910, 'text': '(8.00-14....",[],,efe13333c69a364cb5d4463ba93815e6fc2d91c6,u9num2o9,efe13333c69a364cb5d4463ba93815e6fc2d91c6,Medline; PMC,Clinical and epidemiological characteristics o...,...,document_parses/pdf_json/efe13333c69a364cb5d44...,document_parses/pmc_json/PMC7297312.xml.json,https://doi.org/10.1371/journal.pmed.1003130; ...,219726388.0,"In addition, significant increases in the leve...",258,1,"[(In, O), (addition, O), (,, O), (significant,...",[],
2,0,a1111111111 a1111111111 a1111111111 a111111111...,"[{'start': 899, 'end': 910, 'text': '(8.00-14....",[],,efe13333c69a364cb5d4463ba93815e6fc2d91c6,u9num2o9,efe13333c69a364cb5d4463ba93815e6fc2d91c6,Medline; PMC,Clinical and epidemiological characteristics o...,...,document_parses/pdf_json/efe13333c69a364cb5d44...,document_parses/pmc_json/PMC7297312.xml.json,https://doi.org/10.1371/journal.pmed.1003130; ...,219726388.0,Patchy lesions in lobules were detected by che...,96,2,"[(Patchy, O), (lesions, O), (in, O), (lobules,...",[],
3,0,a1111111111 a1111111111 a1111111111 a111111111...,"[{'start': 899, 'end': 910, 'text': '(8.00-14....",[],,efe13333c69a364cb5d4463ba93815e6fc2d91c6,u9num2o9,efe13333c69a364cb5d4463ba93815e6fc2d91c6,Medline; PMC,Clinical and epidemiological characteristics o...,...,document_parses/pdf_json/efe13333c69a364cb5d44...,document_parses/pmc_json/PMC7297312.xml.json,https://doi.org/10.1371/journal.pmed.1003130; ...,219726388.0,"Ground-glass opacities, which were a typical f...",100,3,"[(Ground, O), (-, O), (glass, O), (opacities, ...",[],
4,0,a1111111111 a1111111111 a1111111111 a111111111...,"[{'start': 899, 'end': 910, 'text': '(8.00-14....",[],,efe13333c69a364cb5d4463ba93815e6fc2d91c6,u9num2o9,efe13333c69a364cb5d4463ba93815e6fc2d91c6,Medline; PMC,Clinical and epidemiological characteristics o...,...,document_parses/pdf_json/efe13333c69a364cb5d44...,document_parses/pmc_json/PMC7297312.xml.json,https://doi.org/10.1371/journal.pmed.1003130; ...,219726388.0,Rapid radiologic progression and a late-onset ...,97,4,"[(Rapid, O), (radiologic, O), (progression, O)...",[],


In [52]:
all_software_nonull = all_software[~all_software['software_clean'].isnull()]

In [53]:
all_software_pp = all_software_nonull.groupby(["paper_id"])["software_clean"].apply(lambda x: x.tolist())

In [54]:
all_software_pp

paper_id
190c2ad4da55a54dccf52a4dd83ebe317926fb2d                 [CoronIT, CoronIT, HPzone, R, stats]
2b12eec29eceba760197cbb4ebe2fcdd1d522a22                                                  [R]
a071bea9e05d8d81f97db1c6aca34e07ff145e8c    [Raosoft, SurveyMonkey, Statistical Package fo...
e21fa0c3998bf989ec31046de9ebe3e074ec77ab                            [CXNet, CheXNeXt, MATLAB]
e9f4e223a840bf55a1bcfc2646946612361289eb                                           [SPSS, 25]
efe13333c69a364cb5d4463ba93815e6fc2d91c6                                           [SPSS, 20]
Name: software_clean, dtype: object

In [68]:
all_software_dedup = all_software[['paper_id',
               'source_x',            
               'title',
               'doi', 
               'license',         
               'abstract',     
               'publish_time', 
               'journal', 
               'url']].drop_duplicates()

In [71]:
paper_to_software = pd.merge(all_software_pp, all_software_dedup, how = "left", on = "paper_id")

In [74]:
paper_to_software.columns = ['paper_id', 'software', 'source', 'title', 'doi', 'license',
       'abstract', 'publish_time', 'journal', 'url']

In [75]:
paper_to_software.head()

Unnamed: 0,paper_id,software,source,title,doi,license,abstract,publish_time,journal,url
0,190c2ad4da55a54dccf52a4dd83ebe317926fb2d,"[CoronIT, CoronIT, HPzone, R, stats]",Medline; PMC,Occupation- and age-associated risk of SARS-Co...,10.2807/1560-7917.es.2020.25.50.2001884,cc-by,High coronavirus incidence has prompted the Ne...,2020-12-17,Euro Surveill,https://www.ncbi.nlm.nih.gov/pubmed/33334396/;...
1,2b12eec29eceba760197cbb4ebe2fcdd1d522a22,[R],MedRxiv; WHO,SARS-CoV-2 Infection Hospitalization Rate and ...,10.1101/2020.10.30.20223461,medrxiv,Importance: COVID-19 case fatality and hospita...,2020-11-04,,http://medrxiv.org/cgi/content/short/2020.10.3...
2,a071bea9e05d8d81f97db1c6aca34e07ff145e8c,"[Raosoft, SurveyMonkey, Statistical Package fo...",Medline; PMC,Emergency Healthcare Providers’ Perceptions of...,10.3390/healthcare8040442,cc-by,This study evaluates the perceptions of prepar...,2020-10-29,Healthcare (Basel),https://www.ncbi.nlm.nih.gov/pubmed/33138164/;...
3,e21fa0c3998bf989ec31046de9ebe3e074ec77ab,"[CXNet, CheXNeXt, MATLAB]",Medline; PMC,The investigation of multiresolution approache...,10.1007/s13755-020-00116-6,no-cc,"COVID-19 is a novel virus, which has a fast sp...",2020-09-29,Health Inf Sci Syst,https://doi.org/10.1007/s13755-020-00116-6; ht...
4,e9f4e223a840bf55a1bcfc2646946612361289eb,"[SPSS, 25]",Medline; PMC,"Characteristics, laboratories, and prognosis o...",10.1371/journal.pone.0239644,cc-by,The impact of the COVID-19 pandemic has been i...,2020-09-24,PLoS One,https://www.ncbi.nlm.nih.gov/pubmed/32970757/;...
