## Creating training datasets for specific topicCategories from LitCovid data

This was the process used in creating an initial training set of pmids from LitCovid.
1. Pull all LitCovid pmids with keywords field
2. Sort keywords field by frequency
3. Map keywords fields to topicCategories (as specifically as possible)
4. Use mapping to identify pmids for each topicCategory (matching_pmids)
5. For cleaner training data, remove ambiguous entries by filtering out duplicate pmids (filtered_pmids)

To update the training dataset, repeat from step 4.

In [1]:
import os
import pandas as pd
from pandas import read_csv
import requests
import json
import pickle

## Functions for fetching metadata

In [2]:
#### Get the size of the source (to make it easy to figure out when to stop scrolling)
def fetch_src_size(source):
    """Return how many Publication records outbreak.info reports for ``source``.

    Issues a query restricted to ``@type:Publication`` and
    ``curatedBy.name:<source>`` with ``size=0`` and ``aggs=@type``, then reads
    ``facets["@type"]["total"]`` from the JSON reply and returns it as an int.
    """
    count_url = ("https://api.outbreak.info/resources/query?q=((@type:Publication) AND (curatedBy.name:"
                 + source
                 + "))&size=0&aggs=@type")
    reply = json.loads(requests.get(count_url).text)
    return int(reply["facets"]["@type"]["total"])

#### Pull ids from a json file
def get_ids_from_json(jsonfile):
    """Return the distinct ``_id`` values of ``jsonfile["hits"]`` in first-seen order.

    Args:
        jsonfile: parsed API response; must have a ``"hits"`` entry whose
            items each carry an ``"_id"`` key.

    Returns:
        list: every ``_id`` once, ordered by first appearance.

    Raises:
        KeyError: if ``"hits"`` is missing or a hit has no ``"_id"``.
    """
    # dict.fromkeys de-duplicates in a single pass while keeping insertion
    # order, instead of re-scanning the growing list for every hit.
    return list(dict.fromkeys(eachhit["_id"] for eachhit in jsonfile["hits"]))

#### Ping the API and get all the ids for a specific source and scroll through the source until number of ids matches meta
def get_source_ids(source):
    """Collect the ids of every Publication curated by ``source`` on outbreak.info.

    The first request uses ``fetch_all=true``; while fewer ids than
    ``fetch_src_size(source)`` have been collected, further pages are requested
    with the most recent ``_scroll_id``.

    Scrolling is best-effort: a failed request, an unparsable reply, or a reply
    without hits ends the loop and whatever was collected so far is returned.
    An empty page also ends the loop so the function cannot spin forever when
    the reported total is never reached.

    Args:
        source: value matched against ``curatedBy.name``.

    Returns:
        list: unique ids in the order they were first received. (Always a
        list; previously a set was returned whenever scrolling took place.)

    Raises:
        KeyError: if the very first reply contains no ``"hits"``.
    """
    query_url = ("https://api.outbreak.info/resources/query?q=((@type:Publication) AND (curatedBy.name:"
                 + source + "))&fields=_id&fetch_all=true")
    source_size = fetch_src_size(source)
    response = json.loads(requests.get(query_url).text)
    idlist = get_ids_from_json(response)
    seen = set(idlist)
    scroll_id = response.get("_scroll_id")
    while scroll_id is not None and len(idlist) < source_size:
        try:
            page = json.loads(requests.get(query_url + "&scroll_id=" + scroll_id).text)
            page_ids = get_ids_from_json(page)
        except (requests.RequestException, ValueError, KeyError, TypeError) as err:
            # Keep the partial result rather than failing the whole pull.
            print("stopped scrolling:", repr(err))
            break
        if not page_ids:
            # Nothing more to read; retrying the same scroll would never end.
            break
        for eachid in page_ids:
            if eachid not in seen:
                seen.add(eachid)
                idlist.append(eachid)
        try:
            scroll_id = page["_scroll_id"]
        except KeyError:
            # Reuse the previous scroll id for the next request.
            print("no new scroll id")
    return(idlist)


def get_pub_ids(sourceset):
    """Return the de-duplicated ids of all publications in a named set of sources.

    ``sourceset`` must be one of ``"preprint"``, ``"litcovid"``, ``"other"`` or
    ``"all"``; any other value raises ``KeyError``. The ids of each source in
    the set are fetched with ``get_source_ids`` and unioned. The order of the
    returned list is arbitrary (it comes from a set).
    """
    other_sources = ["Figshare","Zenodo","MRC Centre for Global Infectious Disease Analysis"]
    preprint_sources = ["bioRxiv","medRxiv"]
    pub_srcs = {
        "preprint": preprint_sources,
        "litcovid": ["litcovid"],
        "other": other_sources,
        "all": other_sources + preprint_sources + ["litcovid"],
    }
    collected = set()
    for source_name in pub_srcs[sourceset]:
        collected = collected.union(set(get_source_ids(source_name)))
    return list(collected)


#### Get the name, abstract, and keywords for the pmids
#### Note, I've tried batches of 1000, and the post request has failed, so this uses a batch size that's less likely to fail
def batch_fetch_keywords(idlist, batch_size=100):
    """Fetch name, abstract and keywords for ``idlist`` from outbreak.info in batches.

    The ids are POSTed ``batch_size`` at a time (comma-joined, ``scopes=_id``).
    Only rows whose ``_score`` equals 1 are kept from each reply (presumably
    the exact ``_id`` matches — confirm against the API), missing values are
    replaced by ``" "``, and duplicate ``_id`` rows within a batch are dropped.

    Fixes relative to the earlier version:
      * every id is requested exactly once (lists of 50-99 ids used to be
        fetched twice, and many lengths triggered an extra empty request);
      * a reply without a ``keywords`` column gets one empty list per row
        (assigning a bare ``[]`` raises ``ValueError`` on a non-empty frame);
      * the JSON text is wrapped in ``StringIO`` because passing a literal
        JSON string to ``pd.read_json`` is deprecated;
      * batches that do not return HTTP 200 are reported instead of being
        skipped silently.

    Args:
        idlist: iterable of id strings.
        batch_size: ids per POST request; must be positive. Batches of 1000
            have been seen to fail, hence the default of 100.

    Returns:
        pandas.DataFrame with columns ``_id, abstract, name, keywords``. Each
        batch keeps its own row index (the index is not reset). Empty when
        ``idlist`` is empty or every batch failed.
    """
    from io import StringIO  # local import: only needed for pd.read_json below

    columns = ['_id','abstract','name','keywords']
    idlist = list(idlist)
    frames = []
    for start in range(0, len(idlist), batch_size):
        sample_ids = ','.join(idlist[start:start+batch_size])
        ## Get the text-based metadata (abstract, title, keywords) for this batch
        r = requests.post("https://api.outbreak.info/resources/query/", params = {'q': sample_ids, 'scopes': '_id', 'fields': 'name,abstract,keywords'})
        if r.status_code != 200:
            print("batch starting at position", start, "failed with status", r.status_code)
            continue
        rawresult = pd.read_json(StringIO(r.text))
        if 'abstract' not in rawresult.columns:
            rawresult['abstract'] = " "
        if 'keywords' not in rawresult.columns:
            # One distinct empty list per row
            rawresult['keywords'] = [[] for _ in range(len(rawresult))]
        cleanresult = rawresult[['_id','name','abstract','keywords']].loc[rawresult['_score']==1].fillna(" ").copy()
        cleanresult.drop_duplicates(subset='_id',keep="first", inplace=True)
        frames.append(cleanresult)
    if not frames:
        return pd.DataFrame(columns=columns)
    # Re-select to keep the historical column order of the returned frame
    return pd.concat(frames)[columns]
        

In [3]:
# Directory that holds the curated input tables and the pickled intermediate data
DATAPATH = 'data/'
# Parent directory for generated outputs (e.g. the wordfrequencies/ folder)
RESULTSPATH = 'results/'
# NOTE(review): MODELPATH and PREDICTPATH are not referenced in the cells shown here;
# presumably they point at subtopic model artifacts and predictions — confirm.
MODELPATH = 'models/subtopics/'
PREDICTPATH = 'predictions/subtopics/'

In [6]:
%%time
# Step 1: pull the ids of every publication in the "litcovid" source set
litcovid_ids = get_pub_ids("litcovid")

Wall time: 2min 35s


In [7]:
%%time
# Fetch name / abstract / keywords for every LitCovid id
textdf = batch_fetch_keywords(litcovid_ids)

# Keep only rows whose keywords entry has length > 1; this drops the " "
# placeholder that batch_fetch_keywords fills in for missing values
has_keywords = textdf['keywords'].str.len() > 1
keywordsdf = textdf.loc[has_keywords].copy()
print(len(keywordsdf))
print(keywordsdf.head(n=4))

# Cache the keyword table so later steps can reuse it
keywords_pickle = os.path.join(DATAPATH,'keywordsdf.pickle')
with open(keywords_pickle,'wb') as keydumpfile:
    pickle.dump(keywordsdf,keydumpfile)

68895
            _id                                           abstract  \
1  pmid33060058  OBJECTIVE: Encephalopathy is a major neurologi...   
2  pmid32624494  Since the outbreak of COVID-19 in China in Dec...   
3  pmid33192858  Objective: We aim to determine the psychologic...   
4  pmid33583756  OBJECTIVE: To observe the early interventions ...   

                                                name  \
1  Alpha coma EEG pattern in patients with severe...   
2  Improved survival following ward-based non-inv...   
3  Psychological Impact of the Civil War and COVI...   
4  Early therapeutic interventions of traditional...   

                                            keywords  
1  [Ascending reticular formation, Brainstem, Enc...  
2  [assisted ventilation, lung physiology, non in...  
3  [COVID-19, GAD-7, PHQ-9, SARS-CoV-2, anxiety, ...  
4  [Conversion time of viral nucleic acid, Corona...  
Wall time: 19min 42s


56006
            _id                                           abstract  \
0  pmid32529889  Since the emergence of patients with COVID-19 ...   
1  pmid33982129  This correspondence argues that it is not only...   
2  pmid32779341  Severe acute respiratory syndrome coronavirus ...   
5  pmid33645278  INTRODUCTION: The COVID-19 pandemic resulted i...   

                                                name  \
0  Prevention and control strategies in the diagn...   
1              The last rites in a time of pandemic.   
2  Understanding immunopathological fallout of hu...   
5  The hidden dangers of staying home: a London t...   

                                            keywords  
0  [COVID-19, Prevention and control, children, s...  
1             [beliefs, health promotion, mortality]  
2  [COVID-19, autoantibodies, cytokine storm, hyd...  
5  [Accidental injury, COVID-19, Injuries, Lockdo...  


In [12]:
%%time
# Load the curated topicCategory -> search-term table
curated_pmids = read_csv(os.path.join(DATAPATH,'pmids_for_training.tsv'),sep='\t',header=0,index_col=0)

# Lower-cased string form of each keywords entry; it does not depend on the
# search term, so it is built once instead of once per term
lowered_keywords = keywordsdf['keywords'].astype(str).str.lower()

new_curated_pmids = []
for _, curated_row in curated_pmids.iterrows():
    # 'search terms' is a comma-separated string; surrounding whitespace is removed
    search_terms = [term.strip() for term in curated_row['search terms'].split(',')]
    matching_pmids = []
    for eachterm in search_terms:
        # str.contains treats the term as a pattern matched against the lower-cased keywords
        pmids = keywordsdf.loc[lowered_keywords.str.contains(eachterm)]['_id'].unique().tolist()
        matching_pmids = list(set(pmids).union(set(matching_pmids)))
    new_curated_pmids.append({'topicCategory':curated_row['topicCategory'],
                              'category':curated_row['category'],
                              'description':curated_row['description'],
                              'subcategory':curated_row['subcategory'],
                              'search terms':search_terms,
                              'matching_pmids':matching_pmids,
                              'no of samples':len(matching_pmids)})

# One row per curated topicCategory, now carrying the ids matched by its search terms
new_curated_pmids_df = pd.DataFrame(new_curated_pmids)
new_curated_pmids_df.to_csv(os.path.join(DATAPATH,'updated_pmids_for_training.tsv'),sep='\t',header=True)

Wall time: 16.7 s


In [13]:
# Also store the updated mapping (with its matching_pmids lists) as a pickle in DATAPATH
with open(os.path.join(DATAPATH,'updated_pmids_for_training.pickle'),"wb") as dumpfile:
    pickle.dump(new_curated_pmids_df,dumpfile)


### Use keyword-mapped text to identify/expand dataset for additional search terms
The curated dumpfile is based on keyword mapping, i.e., the authors categorized their papers using keywords, and those keywords could readily be mapped to a specific topicCategory. Now, pull the keywords for successfully mapped papers and run a frequency analysis to see if additional keywords can be identified to grow the dataset.

Given the large number of subtopic categories, the keywords for each category will not be inspected exhaustively; rather, the 1% to 10% most frequent keywords for each topic will be inspected for inclusion/exclusion based on specificity/broadness.

In [2]:
# Reload the mapping from updated_pmids_for_training.pickle.
# DATAPATH must already be defined in the session: the NameError recorded below
# shows this cell being run before DATAPATH was assigned.
with open(os.path.join(DATAPATH,'updated_pmids_for_training.pickle'),"rb") as infile:
    new_curated_pmids_df = pickle.load(infile)

NameError: name 'DATAPATH' is not defined

In [36]:
%%time 
FREQPATH = os.path.join(RESULTSPATH,'wordfrequencies/')

# (label used in the output file name, fraction of the distinct keywords to keep)
top_fractions = [('1',0.01),('5',0.05),('10',0.1)]

# NOTE(review): `subcats` is not defined in the cells shown here; presumably it is a
# subset of new_curated_pmids_df (it needs 'topicCategory' and 'matching_pmids') — confirm.
for index,row in subcats.iterrows():
    # Replace '/' so the topicCategory can be used in a file name
    topicCategory = row['topicCategory'].replace(' / ','-').replace('/','-')
    in_category = keywordsdf['_id'].isin(row['matching_pmids'])
    # One row per (paper, keyword) pair
    allkw = keywordsdf.loc[in_category][['_id','keywords']].explode('keywords').copy()
    kwfreq = allkw.groupby('keywords').size().reset_index(name='counts')
    kwfreq = kwfreq.sort_values('counts',ascending=False)
    for label,fraction in top_fractions:
        top_kw = kwfreq.head(n=int(round(len(kwfreq)*fraction,0)))
        top_kw.to_csv(os.path.join(FREQPATH,topicCategory+'_'+label+'_percent_most_freq_kw.txt'),sep='\t',header=True)

Wall time: 570 ms


### Apply search terms to LitCovid and our own API (elastic search) 
Use these searches to determine how well the terms broaden or narrow the training datasets. Use subtopic to litcovid/offtopic mapping and litcovid (to our api results) matching to improve the quality of the training datasets

More advanced searches can be conducted on litcovid to limit the search of a keyword for a specific topicCategory to a particular LitCovid topic

Eg- https://www.ncbi.nlm.nih.gov/research/coronavirus/docsum?text=pathology&filters=topics.Diagnosis&sort=score%20desc&page=1

For certain categories, exclusion criteria may be needed. For example, the broad category 'clinical' should encompass clinical studies, which often (but not always) overlap with 'prevention' and 'treatment' in LitCovid.

Additionally, it may make sense to include certain categories. For example, in LitCovid, 'prevention' encompasses palliative care and intubation (as these two prevent suffering and premature death, respectively); however, we would consider both of these medical care for the treatment of infected individuals.

Litcovid categorical overlaps to consider:
* Prevention and Treatment
* Treatment and Case Descriptions
* Prevention and Case Descriptions
* Prevention and Transmission
* Mechanism and Transmission

In [19]:
# Main topic names; a topic chosen from this list is passed to load_search_terms,
# which reads the search-term files under <DATAPATH>/keywords/<topic>/
maintopics = ['Diagnosis',
              'Epidemiology',
              'Mechanism',
              'Prevention',
              'Transmission',
              'Treatment']


## Note that Clinical is excluded from the main topics, 
## since the subtopics for this one are trained and treated like a broad topic

In [29]:
DATAPATH = 'data/'
topic = maintopics[5]
keyword_dict = load_search_terms(DATAPATH,topic)
# For every search term of the chosen topic, print how many outbreak.info
# publications match the quoted term
for querylist in keyword_dict.values():
    for query in querylist:
        count_url = 'https://api.outbreak.info/resources/query?q=(("'+query+'") AND (@type:Publication))&size=0&aggs=@type'
        pubjson = json.loads(requests.get(count_url).text)
        print(query,": ",int(pubjson["facets"]["@type"]["total"]))
#idcheck = sub_category_id_check(DATAPATH,topic,source='litcovid')
#for topic in maintopics:
#    print(topic)
#    idcheck = sub_category_id_check(DATAPATH,topic)

convalescent plasma :  1145
therapeutic plasma :  61
neutralizing antibodies :  1224
plasmapheresis :  69
antibodies for treating :  2
antibodies used to treat :  4
antibody therapeutics :  42
therapeutic antibodies :  82
antibody therapies :  45
plasma therapy :  315
plasma exchange :  115
immunotherapy :  801
neutralizing antibody :  697
antibody neutralization :  56
passive immunization :  60
tocilizumab :  1065
hospital care :  259
prehospital care :  22
standard of care :  545
medical care :  948
supportive care :  555
palliative care :  743
terminal care :  12
psychological care :  46
icu treatment :  69
intubation :  1485
keywords :  772
drug discovery :  816
high throughput screening :  142
natural products discovery :  1
molecular docking :  1004
molecular dynamics :  949
antiviral virtual screening :  0
drug design :  402
drug development :  575
drug delivery :  250
drug target :  303
natural products :  291
drug screening :  132
drug safety :  130
adverse drug reactions :  1

In [2]:
#### Functions for building the non-litcovid topics training set from keywords
import os
import requests
import pandas as pd
from pandas import read_csv
import time
from datetime import datetime
import json
import pickle

#### Pull ids from a json file
from src.common import get_ids_from_json
from src.fetch_offtopics import *


def get_subpath(DATAPATH,topic):
    """Return the keyword directory for ``topic`` and the names of its entries.

    The directory is ``<DATAPATH>/keywords/<topic>/``. The second element of
    the returned tuple is ``os.listdir`` of that directory, so it raises if
    the directory does not exist.
    """
    SUBPATH = os.path.join(DATAPATH,'keywords/'+topic+'/')
    return(SUBPATH,os.listdir(SUBPATH))


def load_search_terms(DATAPATH,topic):
    """Load the search terms of every category file belonging to ``topic``.

    Each regular file in the directory returned by ``get_subpath`` becomes one
    entry: the key is the file name up to its first ``'.'`` and the value is
    the list of stripped, non-empty lines of the file.

    Directories and hidden entries (names starting with ``'.'``, such as
    editor checkpoint folders) are skipped, and blank lines are ignored so
    that an empty string is never used as a search term.

    Args:
        DATAPATH: base data directory.
        topic: main topic name; selects ``<DATAPATH>/keywords/<topic>/``.

    Returns:
        dict: category name -> list of search terms.
    """
    SUBPATH,SUBFILES = get_subpath(DATAPATH,topic)
    keyword_dict = {}
    for eachfile in SUBFILES:
        filepath = os.path.join(SUBPATH,eachfile)
        if eachfile.startswith('.') or not os.path.isfile(filepath):
            continue
        with open(filepath,'r') as readfile:
            stripped_lines = (eachline.strip() for eachline in readfile)
            keywords = [term for term in stripped_lines if term]
        keyword_dict[eachfile.split('.')[0]] = keywords
    return(keyword_dict)



## Search litcovid for a term and retrieve the pmids
def search_litcovid_ids(searchterm,topic):
    """Search the LitCovid TSV export for a quoted term and return the matching ids.

    When ``topic`` equals ``False`` the search uses an empty filter; otherwise
    it is restricted to that LitCovid topic. The first 34 lines of the reply
    are discarded, as are lines beginning with ``'#'`` or ``'p'``; the first
    tab-separated field of each remaining non-empty line is returned with a
    ``'pmid'`` prefix.
    """
    baseurl = 'https://www.ncbi.nlm.nih.gov/research/coronavirus-api/export/tsv?text="'
    if topic==False:
        litsearchurl = baseurl+searchterm+'"&filters={}'
    else:
        litsearchurl = baseurl+searchterm+'"&filters={"topics":["'+topic+'"]}'
    export_lines = requests.get(litsearchurl).text.split('\n')[34:]
    first_fields = [line.split('\t')[0] for line in export_lines
                    if not line.startswith(('#','p'))]
    return ['pmid'+field for field in first_fields if field != ""]


## load the list of search terms for each topicCategory and push it to either outbreak (default) or litcovid
## returns an id list
def sub_category_id_check(DATAPATH,topic,source='outbreak'):
    """Run every search term of ``topic`` against one source and collect the ids.

    Args:
        DATAPATH: base data directory (search-term files and
            ``litcovidtopics.tsv`` are read from it).
        topic: main topic whose category files are loaded with
            ``load_search_terms``.
        source: ``'litcovid'`` searches LitCovid; any other value (default
            ``'outbreak'``) uses ``get_query_ids``.

    Returns:
        pandas.DataFrame built from one record per (category, search term)
        with keys ``category``, ``searchterm`` and ``ids``.

    LitCovid searches are filtered as follows:
      * ``'Epidemiology'``: no topic filter;
      * topics listed in ``mapped_topics``: the term is searched under each
        mapped LitCovid topic and the results are unioned;
      * any other topic: filtered by the topic itself;
      * category ``'Mechanism of Transmission'``: the term's results are
        unioned with the ids that appear under both 'Mechanism' and
        'Transmission' in ``litcovidtopics.tsv``.

    Fix: the 'Mechanism of Transmission' branch used to loop over *all*
    categories again, which added duplicate rows and mixed those shared ids
    into unrelated categories; it now handles only its own search terms. The
    ``topic`` argument is also no longer reassigned while looping.
    """
    mapped_topics = {'Treatment':['Prevention','Treatment','Case Report'],
                     'Transmission':['Transmission','Prevention'],
                     'Prevention':['Prevention','Case Report']}
    keyword_dict = load_search_terms(DATAPATH,topic)
    allids = []

    if source != 'litcovid':
        for category,keywordlist in keyword_dict.items():
            for eachkey in keywordlist:
                idlist = get_query_ids(eachkey)
                allids.append({'category':category,'searchterm':eachkey,'ids':idlist})
        return(pd.DataFrame(allids))

    # False tells search_litcovid_ids to search without a topic filter
    litcovid_topic = False if topic == 'Epidemiology' else topic
    for category,keywordlist in keyword_dict.items():
        if category == 'Mechanism of Transmission':
            litcovidtopics = read_csv(os.path.join(DATAPATH,'litcovidtopics.tsv'),delimiter='\t',index_col=0,header=0)
            mechtrans = litcovidtopics.loc[(litcovidtopics['topicCategory']=='Mechanism')|
                                           (litcovidtopics['topicCategory']=='Transmission')].copy()
            mechtrans.drop_duplicates(keep='first',inplace=True)
            # After dropping duplicate rows, an _id that still occurs more than once
            # is taken to be tagged with both topics
            freqs = mechtrans.groupby('_id').size().reset_index(name='counts')
            meetsreqs = freqs['_id'].loc[freqs['counts']>1].unique().tolist()
            for eachkey in keywordlist:
                idlist = search_litcovid_ids(eachkey,litcovid_topic)
                totalids = list(set(idlist).union(set(meetsreqs)))
                allids.append({'category':category,'searchterm':eachkey,'ids':totalids})
        elif topic in mapped_topics:
            for eachkey in keywordlist:
                totalids = set()
                for eachtopic in mapped_topics[topic]:
                    totalids.update(search_litcovid_ids(eachkey,eachtopic))
                allids.append({'category':category,'searchterm':eachkey,'ids':list(totalids)})
        else:
            for eachkey in keywordlist:
                idlist = search_litcovid_ids(eachkey,litcovid_topic)
                allids.append({'category':category,'searchterm':eachkey,'ids':idlist})
    return(pd.DataFrame(allids))


## Pull the id lists after search outbreak and litcovid and compare them
## keep only ids in common for training purposes
## Note, this will remove all preprints from the training set since litcovid does not have them
def get_in_common_sub_ids(DATAPATH):
    """Intersect the outbreak and LitCovid id lists of every search term.

    For each main topic, ``sub_category_id_check`` is run once per source and
    the two results are outer-merged on ``category`` and ``searchterm``. Each
    merged row gets ``clean_ids`` (ids present in both lists) plus the lengths
    of the outbreak list (``len_ids_x``), the LitCovid list (``len_ids_y``) and
    the intersection (``len_clean_ids``); the raw id lists are then dropped.
    The per-topic frames are concatenated into one frame, which is returned.
    """
    def shared_ids(row):
        # ids_x comes from the outbreak search, ids_y from the LitCovid search
        return list(set(row['ids_x']).intersection(set(row['ids_y'])))

    maintopics = ['Diagnosis','Epidemiology','Mechanism',
                  'Prevention','Transmission','Treatment']
    result_columns = ['category','searchterm','len_ids_x','len_ids_y','len_clean_ids','clean_ids']
    cleandf = pd.DataFrame(columns=result_columns)
    for topic in maintopics:
        outbreakids = sub_category_id_check(DATAPATH,topic)
        litcovidids = sub_category_id_check(DATAPATH,topic,source='litcovid')
        mergedf = outbreakids.merge(litcovidids,on=(['category','searchterm']),how='outer')
        mergedf['clean_ids'] = mergedf.apply(shared_ids,axis=1)
        for id_column in ('clean_ids','ids_x','ids_y'):
            mergedf['len_'+id_column] = mergedf[id_column].str.len()
        mergedf = mergedf.drop(columns=['ids_x','ids_y'])
        cleandf = pd.concat((cleandf,mergedf),ignore_index=True)
    return(cleandf)


def generate_subtraining_dict(DATAPATH,RESULTSPATH,savefile = False):
    """Build a ``category -> ids`` dictionary from the ids common to both sources.

    The ``clean_ids`` lists of all search terms of a category (as produced by
    ``get_in_common_sub_ids``) are unioned. With ``savefile == False`` the
    dictionary is returned; otherwise it is written to
    ``<RESULTSPATH>/training_dict.json`` and nothing is returned.
    """
    cleandf = get_in_common_sub_ids(DATAPATH)
    training_dict = {}
    for eachcat in cleandf['category'].unique().tolist():
        category_rows = cleandf.loc[cleandf['category']==eachcat]
        allids = []
        for term_ids in category_rows['clean_ids']:
            allids = list(set(allids).union(set(term_ids)))
        training_dict[eachcat]=allids
    if savefile == False:
        return(training_dict)
    with open(os.path.join(RESULTSPATH,"training_dict.json"), "w") as outfile:
        json.dump(training_dict, outfile)


def get_sub_topics(DATAPATH,RESULTSPATH):
    """Generate the subtopic training table and write it to ``<DATAPATH>/subtopics.tsv``.

    The dictionary from ``generate_subtraining_dict`` is passed through
    ``transform_training_dict`` and the result is saved tab-separated with a
    header row. Nothing is returned.
    """
    trainingdf = transform_training_dict(generate_subtraining_dict(DATAPATH,RESULTSPATH))
    outpath = os.path.join(DATAPATH,'subtopics.tsv')
    trainingdf.to_csv(outpath,sep='\t',header=True)
    



In [3]:
%%time
# Build the subtopic training table; get_sub_topics writes it to <DATAPATH>/subtopics.tsv
DATAPATH = 'data/'
RESULTSPATH = 'results/'
get_sub_topics(DATAPATH,RESULTSPATH)

Wall time: 18min 50s
