## This script adds miscelleneous annotations generated from other algorithms

In [1]:
import os
import json

def find_gen_path():
    tmp_dir = os.getcwd()
    while 'topic_classifier' not in os.listdir(tmp_dir):
        tmp_dir = os.path.dirname(tmp_dir)
    return(tmp_dir)

def fetch_path_dict():
    general_path = find_gen_path()
    topic_folder = os.path.join(general_path,'topic_classifier')
    topic_results = os.path.join(topic_folder,'results')
    alt_path = os.path.join(general_path,'covid_altmetrics')
    alt_results = os.path.join(alt_path,'results')
    preprint_path = os.path.join(general_path,'outbreak_preprint_matcher')
    preprint_results = os.path.join(preprint_path,'results')
    preprint_dumps = os.path.join(preprint_results,'update dumps')
    loe_ann_path = os.path.join(general_path,'covid19_LST_annotations')
    loe_results = os.path.join(loe_ann_path,'results')
    path_dict = {
        'topics_file':os.path.join(topic_results,'topicCats.json'),
        'altmetrics_file':os.path.join(alt_results,'altmetric_annotations.json'),
        'litcovid_updates':os.path.join(preprint_dumps,'litcovid_update_file.json'),
        'preprint_updates':os.path.join(preprint_dumps,'preprint_update_file.json'),
        'loe_annotations':os.path.join(loe_results,'loe_annotations.json')
        }
    return(path_dict)


def fetch_annotation(path_dict,source,outbreak_id):
    with open(path_dict[source],'r') as infile:
        ann_dict = json.load(infile)
    ann_info = [x for x in ann_dict if x["_id"]==outbreak_id]
    try:
        return(ann_info[0])
    except:
        return(ann_info)

    
def add_anns(doc):
    path_dict = fetch_path_dict()
    ## add corrections
    if doc['@type']=='Publication':
        if 'pmid' in doc['_id']:
            ## doc is from litcovid
            corrections = fetch_annotation(path_dict,'litcovid_updates',doc['_id'])
            loe_info = fetch_annotation(path_dict,'loe_annotations',doc['_id'])
        else:
            corrections = fetch_annotation(path_dict,'preprint_updates',doc['_id'])
            loe_info = None
        if corrections != None:
            if 'correction' in doc.keys():  ## check if correction field already used
                try:
                    doc['correction'].append(corrections)
                except:
                    correct_object = doc['correction']
                    doc['correction']=[correct_object,corrections]
            else:
                doc['correction']=corrections
        if loe_info != None:
            doc['evaluations'] = loe_info['evaluations']
            if 'citedBy' in doc.keys():
                doc['citedBy'].append(loe_info['citedBy'])
            else:
                doc['citedBy'] = []
                doc['citedBy'].append(loe_info['citedBy'])
    ## add topic_cats
    topic_cats = fetch_annotation(path_dict,'topics_file',doc['_id'])
    if topic_cats != None:
        doc['topicCategory']=topic_cats
    ## add altmetrics
    altinfo = fetch_annotation(path_dict,'altmetrics_file',doc['_id'])
    if altinfo != None:
        if 'evaluations' in doc.keys():
            try:
                doc['evaluations'].append(altinfo['evaluations'][0])
            except:
                eval_object = doc['evaluations']
                doc['evaluations']=[eval_object,altinfo['evaluations'][0]]
        else:
            doc['evaluations'] = altinfo['evaluations']       
    return(doc)
   

### Test speed when using list comprehension instead of loops to find an id

In [2]:
import requests

In [14]:
#### Old functions (ignore these as they are slower)
            

def fetch_topics(path_dict,outbreak_id):
    with open(path_dict['topics_file']) as infile:
        topics_dict = json.load(infile)
    for i in range(len(topics_dict)):
        if topics_dict[i]['_id']==outbreak_id:
            topicinfo = topics_dict[i]
            return(topicinfo)    

def fetch_preprint_updates(path_dict,preprint_id):
    with open(path_dict['preprint_updates'],'r') as infile:
        preprint_dict = json.load(infile)
    for i in range(len(preprint_dict)):
        if preprint_dict[i]['_id']==preprint_id:
            preprint_info = preprint_dict[i]
            return(preprint_info)

def fetch_reviewed_updates(path_dict,litcovid_id):
    with open(path_dict['litcovid_updates'],'r') as infile:
        litcovid_dict = json.load(infile)
    for i in range(len(litcovid_dict)):
        if litcovid_dict[i]['_id']==litcovid_id:
            litcovid_info = litcovid_dict[i]
            return(litcovid_info)
    
def check_altmetrics(path_dict,outbreak_id):
    with open(path_dict['altmetrics_file']) as infile:
        altmetrics_dict = json.load(infile)
    for i in range(len(altmetrics_dict)):
        if altmetrics_dict[i]['_id']==outbreak_id:
            altinfo = altmetrics_dict[i]
            return(altinfo)

def check_loe_anns(path_dict,outbreak_id):
    with open(path_dict['loe_annotations']) as infile:
        loe_dict = json.load(infile)
    for i in range(len(loe_dict)):
        if loe_dict[i]['_id']==outbreak_id:
            loe_info = loe_dict[i]
            return(loe_info)

def add_anns_old(doc):
    path_dict = fetch_path_dict()
    ## add corrections
    if doc['@type']=='Publication':
        if 'pmid' in doc['_id']:
            ## doc is from litcovid
            corrections = fetch_reviewed_updates(path_dict,doc['_id'])
            loe_info = check_loe_anns(path_dict,doc['_id'])
        else:
            corrections = fetch_preprint_updates(path_dict,doc['_id'])
            loe_info = None
        if corrections != None:
            if 'correction' in doc.keys():  ## check if correction field already used
                try:
                    doc['correction'].append(corrections)
                except:
                    correct_object = doc['correction']
                    doc['correction']=[correct_object,corrections]
            else:
                doc['correction']=corrections
        if loe_info != None:
            doc['evaluations'] = loe_info['evaluations']
            if 'citedBy' in doc.keys():
                doc['citedBy'].append(loe_info['citedBy'])
            else:
                doc['citedBy'] = []
                doc['citedBy'].append(loe_info['citedBy'])
    ## add topic_cats
    topic_cats = fetch_topics(path_dict,doc['_id'])
    if topic_cats != None:
        doc['topicCategory']=topic_cats
    ## add altmetrics
    altinfo = check_altmetrics(path_dict,doc['_id'])
    if altinfo != None:
        if 'evaluations' in doc.keys():
            doc['evaluations'].append(altinfo['evaluations'][0])
        else:
            doc['evaluations'] = altinfo['evaluations']       
    return(doc)
            

#### Testing impact of list comprehension in a single function

In [4]:
path_dict = fetch_path_dict()
litcovid_list = ['pmid32562477','pmid34385356','pmid33582134','pmid32835303','pmid32264791',
                 'pmid32424571','pmid32650645','pmid32302377','pmid32463365','pmid32220655',
                 'pmid32502733','pmid32339844','pmid32428990','pmid32526193','pmid32388471']#,
                 #'pmid39546836','pmid32403007','pmid32526655','pmid32594937','pmid32374400',
                 #'pmid32376627','pmid32658859','pmid32434518','pmid32408453','pmid32547891',
                 #'pmid32234804','pmid32369759','pmid32552016','pmid32627200','pmid32614817',
                 #'pmid32651556','pmid32495918','pmid32344319','pmid32239761','pmid32404476',
                 #'pmid32183920','pmid32234121','pmid39546830','pmid39546831','pmid39546832']
preprint_list = ['2020.04.07.20052340','2020.05.01.20077743','2020.01.28.20019224',
                 '2020.03.24.20043018','2020.05.07.20093674','2020.04.16.20068379',
                 '2020.05.11.20097808','2020.03.26.20040709','2020.01.28.20019224',
                 '2020.05.03.066266','2020.03.17.20037671']


In [5]:
%%timeit
litcovid_results = []
for litcovid_id in litcovid_list:
    litcovid_info = fetch_reviewed_updates(path_dict,litcovid_id)
    litcovid_results.append(litcovid_info)

62.1 ms ± 4.56 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [13]:
%%timeit
litcovid_results = []
for litcovid_id in litcovid_list:
    litcovid_info = fetch_reviewed_updates_list(path_dict,litcovid_id)
    litcovid_results.append(litcovid_info)

58.8 ms ± 662 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


### Testing impact of list comprehension in overall function

In [7]:
%%time
json_docs = []
for each_id in list(set(litcovid_list).union(set(preprint_list))):
    r = requests.get("https://api.outbreak.info/resources/resource/"+each_id)
    doc = json.loads(r.text)
    json_docs.append(doc)

Wall time: 9.58 s


In [9]:
%%timeit
doclist = []
for doc in json_docs:
    jdoc = add_anns_old(doc)
    doclist.append(jdoc)

3.46 s ± 86.8 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [10]:
%%timeit
listdoclist = []
for doc in json_docs:
    jdoc = add_anns(doc)
    listdoclist.append(jdoc)

3.12 s ± 32.5 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


The code using the list comprehension is faster by a bit

### Testing functions or parts of functions

In [None]:
with open(path_dict['litcovid_updates'],'r') as infile:
    preprint_dict = json.load(infile)
for i in range(len(preprint_dict)):
    if preprint_dict[i]['_id']=='pmid32562477':
        print(preprint_dict[i])
        break

print(preprint_dict[0].keys())
if 'correction' in preprint_dict[0].keys():
    print('yes')

In [None]:
path_dict = fetch_path_dict()
litcovid_info = fetch_reviewed_updates(path_dict,'pmid32562477')
check = fetch_preprint_updates(path_dict,'pmid32562477')
print(litcovid_info)
if check == None:
    print('no check')

In [None]:
with open(path_dict['altmetrics_file']) as infile:
    altmetrics_dict = json.load(infile)
print(altmetrics_dict[1])