## Check for limit (Praetorian issue 2023)

In [1]:
import json
import requests
import pandas as pd

In [2]:
%%time
r = requests.get('https://api.data.niaid.nih.gov/v1/query?q=__all__&fetch_all=true')

CPU times: total: 93.8 ms
Wall time: 1.51 s


In [3]:
# Decode the JSON body straight from the response: r.json() determines the UTF
# encoding from the bytes, whereas r.text may rely on a guessed charset.
cleanr = r.json()
# Show the top-level keys of the response payload.
print(cleanr.keys())

dict_keys(['_scroll_id', 'took', 'total', 'max_score', 'hits'])


In [4]:
# Number of records carried in this single response.
n_hits = len(cleanr['hits'])
print(n_hits)

1000


In [4]:
# Persist the parsed response to disk as JSON for offline inspection.
with open('testdump.json', 'w') as dump_file:
    json.dump(cleanr, dump_file)

## Checking for Unicode parsing errors
As seen in: https://github.com/NIAID-Data-Ecosystem/nde-portal/issues/192


In [3]:
%%time
#r = requests.get('https://api.data.niaid.nih.gov/v1/query?q="-?"&fetch_all=true')
r = requests.get('https://api.data.niaid.nih.gov/v1/query?q="Δ"&fetch_all=true')
cleanr = json.loads(r.text)
hits = cleanr['hits']
print(len(cleanr['hits']))

430
CPU times: total: 234 ms
Wall time: 2.15 s


In [6]:
# One row per hit: its URL and identifier, with 'checked' initialised to 'no'.
test_table = [
    {'url': hit['url'], 'id': hit['_id'], 'checked': 'no'}
    for hit in hits
]

# Write the table out as a tab-separated file.
testdf = pd.DataFrame(test_table)
testdf.to_csv('test2.tsv', sep='\t', header=True)

## Checking for comma issues in author names for BD2K-LINCS
See [GH Issue](https://github.com/NIAID-Data-Ecosystem/nde-crawlers/issues/107) for more details.


In [2]:
# Staging query: LINCS records that have an author.name, returning only that field.
requesturl = "https://api-staging.data.niaid.nih.gov/v1/query?&q=_exists_:author.name&extra_filter=(includedInDataCatalog.name:%22LINCS%22)&limit=1000&fields=author.name"
# requests has no default timeout; set one and fail loudly on HTTP errors.
r = requests.get(requesturl, timeout=60)
r.raise_for_status()
# Decode JSON from the raw bytes instead of going through r.text's charset guess.
cleanr = r.json()
hits = cleanr['hits']
print(len(hits))
# Show a sample record, guarding against an empty result set.
if hits:
    print(hits[0])

424
{'_id': 'LINCS_EDS-1001', '_ignored': ['all.keyword'], '_score': 10.600381, 'author': {'name': 'Cyril Benes'}}


In [4]:
# Load the hits into a DataFrame and spot-check one record by its identifier.
tmpdf = pd.DataFrame(cleanr['hits'])
is_target = tmpdf['_id'] == "LINCS_LDS-1404"
print(tmpdf.loc[is_target])

               _id       _ignored     _score                   author
68  LINCS_LDS-1404  [all.keyword]  10.600381  {'name': 'Jia-Ren Lin'}


In [14]:
def _author_name(author):
    """Return the author's name as a string.

    For a dict, this is its 'name' value ('' when the key is absent); any other
    value is returned as its plain string form.
    """
    if isinstance(author, dict):
        return str(author.get('name', ''))
    return str(author)


def _names_containing(frame, needle):
    """Return the rows of `frame` whose author name contains the literal `needle`."""
    names = frame['author'].map(_author_name).astype(str)
    return frame.loc[names.str.contains(needle, regex=False)]


# Split records by the shape of `author`: a list of author objects vs. a single value.
# Check the Python type rather than searching the string repr for ']', so that a
# name which happens to contain ']' is not mistaken for a list.
is_author_list = tmpdf['author'].apply(lambda author: isinstance(author, list))
nonlists = tmpdf.loc[~is_author_list]
lsdf = tmpdf.loc[is_author_list]

# One row per author for the list-valued records.
boomdf = lsdf.explode('author')

# Keep only the rows whose author name itself contains a comma.
commanonlistsdf = _names_containing(nonlists, ",")
commaboomdf = _names_containing(boomdf, ",")
alldf = pd.concat((commanonlistsdf, commaboomdf), ignore_index=True)
print(alldf)
alldf.to_csv('BD2K-LINCS_author_comma.tsv', sep='\t', header=True)

# Of those, the rows whose name also contains a semicolon.
semidf = _names_containing(alldf, ";")
print(semidf)

               _id       _ignored     _score  \
0   LINCS_LDS-1398            NaN  10.600381   
1   LINCS_LDS-1431  [all.keyword]  10.600381   
2   LINCS_LDS-1463  [all.keyword]  10.600381   
3   LINCS_LDS-1473  [all.keyword]  10.600381   
4   LINCS_LDS-1508  [all.keyword]  10.600381   
..             ...            ...        ...   
59  LINCS_LDS-1531  [all.keyword]  10.454861   
60  LINCS_LDS-1546  [all.keyword]  10.454861   
61  LINCS_LDS-1222  [all.keyword]  10.433613   
62  LINCS_LDS-1547  [all.keyword]  10.433613   
63  LINCS_LDS-1580  [all.keyword]  10.433613   

                                               author  
0   {'name': 'Jie Wu, Malcolm Casale, Ryan Lim, Je...  
1   {'name': 'Joe Gray, Gordon Mills, Laura Heiser...  
2   {'name': 'Joe Gray, Gordon Mills, Laura Heiser...  
3   {'name': 'Joe Gray, Gordon Mills, Laura Heiser...  
4   {'name': 'Birtwistle, M.; Iyengar, R.; Sobie, ...  
..                                                ...  
59     {'name': 'Huan (Sharon) 

## Looking for duplicate records

In [None]:
import json
import requests
import pandas as pd

In [None]:
# Fetch every record for the duplicate check. requests has no default timeout,
# so set one, and raise on an HTTP error rather than continuing with a bad response.
r = requests.get('https://api.data.niaid.nih.gov/v1/query?q=__all__&fetch_all=true', timeout=60)
r.raise_for_status()