In [1]:
import pandas as pd
from pandas import read_csv
import requests
import json
import time
import math
import pickle
import os
import ast ## Only needed to convert string to dict where json.loads fails

In [6]:
%%time

## Perform the initial query

#query_url = 'https://api.data.niaid.nih.gov/v1/query?q=_exists_:species&fields=_id,name,species&fetch_all=true'
query_url = 'https://api-staging.data.niaid.nih.gov/v1/query?q=_exists_:species&fields=_id,name,species&fetch_all=true'
r = requests.get(query_url)
cleanr = json.loads(r.text)
hits = cleanr['hits']
#print(len(cleanr['hits']))
df1 = pd.DataFrame(cleanr['hits'])
scroll_id = cleanr['_scroll_id']
total_hits = cleanr['total']
print(total_hits)

201222
CPU times: total: 1.42 s
Wall time: 3.49 s


In [7]:
%%time
## Scroll to get all the results

i = 0
#k = 3 
k = math.ceil(total_hits/1000)
while i < k:
    #r2 = requests.get(f'https://api.data.niaid.nih.gov/v1/query?scroll_id={scroll_id}')
    r2 = requests.get(f'https://api-staging.data.niaid.nih.gov/v1/query?scroll_id={scroll_id}')
    tmp = json.loads(r2.text)
    scroll_id = tmp['_scroll_id']
    tmpdf = pd.DataFrame(tmp['hits'])
    df1 = pd.concat((df1,tmpdf),ignore_index=True)
    #print(len(df1))
    i = i+1
    time.sleep(0.25)

KeyError: '_scroll_id'

In [10]:
## Inspect and save the results of the search

print(len(df1))
print(df1.head(n=3))
with open(os.path.join('data','processed_species_results.pickle'),'wb') as dumpfile:
    pickle.dump(df1,dumpfile)

201222
                    _id  _score  \
0  DDE_0565c31a11705723     1.0   
1  DDE_095ecd25213286dd     1.0   
2  DDE_1058e9acef861126     1.0   

                                                name  \
0  Primary human microvascular endothelial cells ...   
1     Protein-protein interaction map for SARS-CoV-2   
2  genotyping by high throughput sequencing, gene...   

                                             species _ignored  
0  [{'alternateName': ['Human', 'Homo sapiens Lin...      NaN  
1  [{'alternateName': ['Human', 'Homo sapiens Lin...      NaN  
2  [{'alternateName': ['Human', 'Homo sapiens Lin...      NaN  


## To Do
* Perform the same search using the staging data api (which should have the PubTator normalized results
* Run the Text2term tool on the data set to see how it overlaps with PubTator and whether or not it can be used in lieu of PubTator
* Also try using the Biothings Taxonomy API to compare

### Potential options
* Generate a training dataset for TaggerOne
* Process raw text files using TaggerOne

In [2]:
with open(os.path.join('data','processed_species_results.pickle'),'rb') as readfile:
    processdf = pickle.load(readfile)

print(processdf.head(n=2))

                    _id  _score  \
0  DDE_0565c31a11705723     1.0   
1  DDE_095ecd25213286dd     1.0   

                                                name  \
0  Primary human microvascular endothelial cells ...   
1     Protein-protein interaction map for SARS-CoV-2   

                                             species _ignored  
0  [{'alternateName': ['Human', 'Homo sapiens Lin...      NaN  
1  [{'alternateName': ['Human', 'Homo sapiens Lin...      NaN  


In [3]:
processboom = processdf.explode('species')
print(len(processboom))
print(processboom.head(n=2))

201366
                    _id  _score  \
0  DDE_0565c31a11705723     1.0   
1  DDE_095ecd25213286dd     1.0   

                                                name  \
0  Primary human microvascular endothelial cells ...   
1     Protein-protein interaction map for SARS-CoV-2   

                                             species _ignored  
0  {'alternateName': ['Human', 'Homo sapiens Linn...      NaN  
1  {'alternateName': ['Human', 'Homo sapiens Linn...      NaN  


In [4]:
print(processboom.iloc[0]['species'])

{'alternateName': ['Human', 'Homo sapiens Linnaeus, 1758', 'human', 'Home sapiens', 'Homo sampiens', 'Homo sapeins', 'Homo sapian', 'Homo sapians', 'Homo sapien', 'Homo sapience', 'Homo sapiense', 'Homo sapients', 'Homo sapines', 'Homo spaiens', 'Homo spiens', 'Humo sapiens'], 'classification': 'host', 'commonName': 'Human', 'curatedBy': {'dateModified': '2023-10-05', 'name': 'PubTator', 'url': 'https://www.ncbi.nlm.nih.gov/research/pubtator/api.html'}, 'displayName': 'Human | Homo sapiens', 'identifier': '9606', 'inDefinedTermSet': 'UniProt', 'isCurated': True, 'name': 'Homo sapiens', 'originalName': 'homo sapiens', 'url': 'https://www.uniprot.org/taxonomy/9606'}


In [13]:
def parse_id(speciesdict):
    if isinstance(speciesdict,dict):
        tmpdict = speciesdict
    elif isinstance(speciesdict,str):
        try:
            tmpdict = json.loads(speciesdict)
        except:
            tmpdict = {'failed':speciesdict}
    if 'identifier' in tmpdict.keys():
        speciesid = tmpdict['identifier']
        curie = 'NCBITAXON:'+str(speciesid)
    else:
        curie = -1
    return curie

processboom['CURIE'] = processboom.apply(lambda row: parse_id(row['species']),axis=1)
print(processboom.head(n=2))

                    _id  _score  \
0  DDE_0565c31a11705723     1.0   
1  DDE_095ecd25213286dd     1.0   

                                                name  \
0  Primary human microvascular endothelial cells ...   
1     Protein-protein interaction map for SARS-CoV-2   

                                             species _ignored           CURIE  
0  {'alternateName': ['Human', 'Homo sapiens Linn...      NaN  NCBITAXON:9606  
1  {'alternateName': ['Human', 'Homo sapiens Linn...      NaN  NCBITAXON:9606  


In [16]:
flagged = processboom.loc[processboom['CURIE']==-1]
clean_processed = processboom.loc[processboom['CURIE']!=-1].copy()
print(len(flagged))
print(flagged.tail(n=2))

33879
             _id  _score                                               name  \
201115  GSE95614     1.0  Comparative expression analysis of the Arabido...   
201201    GSE886     1.0  Identification and distinct regulation of yeas...   

                                     species       _ignored CURIE  
201115      {'name': 'Arabidopsis thaliana'}  [all.keyword]    -1  
201201  {'name': 'Saccharomyces cerevisiae'}  [all.keyword]    -1  


## Comparison with Text2Term

In [18]:
## load Text2Term results
t2t_results = pd.read_csv(os.path.join('data','t2t_mapped_no_flags.tsv'), delimiter='\t', header=0, index_col=0)
print(t2t_results.head(n=2))

             _id                                               name  \
0  GEO_GSE110840  Sequencing of Caenorhabditis elegans deletion ...   
1  GEO_GSE110842  Sequencing of Caenorhabditis elegans overexpre...   

             species_name  flag_raised                   Label  \
0  Caenorhabditis elegans        False  Caenorhabditis elegans   
1  Caenorhabditis elegans        False  Caenorhabditis elegans   

            CURIE  Mapping Score  
0  NCBITAXON:6239          0.995  
1  NCBITAXON:6239          0.995  


In [22]:
omicsdf = t2t_results.loc[t2t_results['_id'].astype(str).str.contains('OMICSDI')]
print(omicsdf.head(n=2))

Empty DataFrame
Columns: [_id, name, species_name, flag_raised, Label, CURIE, Mapping Score]
Index: []


In [23]:
def fix_id(identifier):
    if 'GEO_GSE' in identifier:
        newid = identifier.replace('GEO_GSE','GSE')
    else:
        newid = identifier
    return newid

t2t_results['new_id'] = t2t_results.apply(lambda row: fix_id(row['_id']),axis=1)
t2t_results.rename(columns={'_id':'old_id','Label':'t2t_label'},inplace=True)
t2t_results.rename(columns={'new_id':'_id'},inplace=True)
print(t2t_results.head(n=2))

          old_id                                               name  \
0  GEO_GSE110840  Sequencing of Caenorhabditis elegans deletion ...   
1  GEO_GSE110842  Sequencing of Caenorhabditis elegans overexpre...   

             species_name  flag_raised                   Label  \
0  Caenorhabditis elegans        False  Caenorhabditis elegans   
1  Caenorhabditis elegans        False  Caenorhabditis elegans   

            CURIE  Mapping Score        _id  
0  NCBITAXON:6239          0.995  GSE110840  
1  NCBITAXON:6239          0.995  GSE110842  


In [25]:
matched_merged = t2t_results.merge(clean_processed,on=['_id','name','CURIE'],how='inner')
print(len(matched_merged))

167014


In [26]:
unmatched = t2t_results.loc[~t2t_results['_id'].isin(matched_merged['_id'].unique().tolist())]
print(len(unmatched))

29547


In [28]:
corresponding_unmatched = clean_processed.loc[clean_processed['_id'].isin(unmatched['_id'].unique().tolist())]
print(len(corresponding_unmatched))
flagged_unmatched = flagged.loc[flagged['_id'].isin(unmatched['_id'].unique().tolist())]
print(len(flagged_unmatched))

64
29157


In [38]:
accounted_ids = list(set(corresponding_unmatched['_id'].unique().tolist()).union(set(flagged_unmatched['_id'].unique().tolist())))
not_found = unmatched.loc[~unmatched['_id'].isin(accounted_ids)]
print(len(not_found))                                                 

0


In [40]:
mismatch = corresponding_unmatched.merge(unmatched,on=['_id','name'],how='inner')
print(mismatch.head(n=2))

t2t_no_id = flagged_unmatched.merge(unmatched,on=['_id','name'],how='inner')
print(t2t_no_id.head(n=2))

         _id  _score                                               name  \
0  GSE123173     1.0  Switching from Random to Imprinted X-inactivat...   
1   GSE28539     1.0                   RNA-seq of Taxus chinensis cells   

                                             species       _ignored  \
0  {'alternateName': ['Mus musculus domesticus x ...  [all.keyword]   
1  {'alternateName': ['Chinese yew', 'Taxus chine...  [all.keyword]   

             CURIE_x         old_id  \
0  NCBITAXON:1266728  GEO_GSE123173   
1    NCBITAXON:29808   GEO_GSE28539   

                                 species_name  flag_raised  \
0  Mus musculus domesticus x M. m. molossinus        False   
1            Taxus wallichiana var. chinensis        False   

                     Label           CURIE_y  Mapping Score  
0  Mus musculus molossinus   NCBITAXON:57486          0.762  
1        Taxus wallichiana  NCBITAXON:147273          0.827  
        _id  _score                                               nam

In [44]:
failed_to_parse = pd.read_csv(os.path.join('data','t2t_mapped_flagged.tsv'), delimiter='\t', header=0, index_col=0)
results = {
    "Successful Text2Term mappings": len(t2t_results),
    "Issues parsing species field for mapping": len(failed_to_parse),
    "species with no identifiers from staging": len(flagged),
    "Species with identifiers from staging": len(clean_processed),
    "Text2Term mapping matched Pubtator": len(matched_merged),
    "Text2Term mapping did NOT match Pubtator": len(corresponding_unmatched),
    "Text2Term mapping matched to species with no id from staging": len(flagged_unmatched),
    "Number of ids with T2T mappings that are unaccounted for": len(not_found)
}

with open(os.path.join('data','t2t_pubtator_compare.json'),'w') as outwrite:
    outwrite.write(json.dumps(results))

mismatch.to_csv(os.path.join('data','t2t_disagree_pubtator.tsv'),sep='\t', header=True)
t2t_no_id.to_csv(os.path.join('data','t2t_no_id.tsv'),sep='\t', header=True)