# Biopython-Entrez
##  Functions for generic queries

This notebook defines helper functions that run Entrez (PubMed) queries and turn the results into records suitable for MongoDB and pandas.

In [3]:
from Bio import Entrez
import json
from bs4 import BeautifulSoup as bs
import lxml
import json

In [4]:
def pretty_print_json(json_):
    """Print a JSON-serialisable object with 4-space indentation and sorted keys."""
    rendered = json.dumps(json_, sort_keys=True, indent=4)
    print(rendered)

## Query

In [46]:
def entrez_search_pubmed(query, records_per_query=10, email="XXX@YYY.com"):
    """Search PubMed through Entrez ESearch and return the parsed result.

    Args:
        query: Search term, sent to ESearch as ``term``.
        records_per_query: Maximum number of ids to return; sent to ESearch
            as ``retmax``.
        email: Contact address assigned to ``Entrez.email`` before the
            request is made.

    Returns:
        The record produced by ``Entrez.read``. The output recorded in this
        notebook shows keys such as 'Count', 'IdList' and 'QueryTranslation'.
    """
    from Bio import Entrez
    Entrez.email = email
    handle = Entrez.esearch(
        db="pubmed",
        term=query,
        idtype="acc",
        # Previously the parameter was accepted but never forwarded, so the
        # server-side default page size was always used.
        retmax=records_per_query,
    )
    try:
        return Entrez.read(handle)
    finally:
        # Release the connection even when parsing raises.
        handle.close()

In [47]:
# Example: run a PubMed search for a free-text query, leaving the other arguments at their defaults.
entrez_search_pubmed(query = "kruse eiken vestergaard",)

DictElement({'Count': '7', 'RetMax': '7', 'RetStart': '0', 'IdList': ['29569152', '28197643', '27848006', '26679436', '26659068', '26223424', '25466529'], 'TranslationSet': [], 'TranslationStack': [DictElement({'Term': 'kruse[All Fields]', 'Field': 'All Fields', 'Count': '6557', 'Explode': 'N'}, attributes={}), DictElement({'Term': 'eiken[All Fields]', 'Field': 'All Fields', 'Count': '1408', 'Explode': 'N'}, attributes={}), 'AND', DictElement({'Term': 'vestergaard[All Fields]', 'Field': 'All Fields', 'Count': '2384', 'Explode': 'N'}, attributes={}), 'AND', 'GROUP'], 'QueryTranslation': 'kruse[All Fields] AND eiken[All Fields] AND vestergaard[All Fields]'}, attributes={})

## Summary

In [50]:
def entrez_fetch_single_summary(uid):
    """Fetch the ESummary document for one PubMed id.

    ``Entrez.email`` is not set here; whatever value is currently configured
    on the ``Bio.Entrez`` module is used.

    Args:
        uid: PubMed id, passed to ESummary as ``id``.

    Returns:
        The record produced by ``Entrez.read``. The output recorded in this
        notebook is a list containing one summary element.
    """
    from Bio import Entrez
    handle = Entrez.esummary(db="pubmed", id=uid)
    try:
        return Entrez.read(handle)
    finally:
        # Close the handle like entrez_search_pubmed does, also on failure.
        handle.close()

In [51]:
# Example: fetch the summary of a single PubMed id.
entrez_fetch_single_summary('29569152')

[DictElement({'Item': [], 'Id': '29569152', 'PubDate': '2018 Jun', 'EPubDate': '2018 Mar 22', 'Source': 'Osteoporos Int', 'AuthorList': ['Kruse C', 'Goemaere S', 'De Buyser S', 'Lapauw B', 'Eiken P', 'Vestergaard P'], 'LastAuthor': 'Vestergaard P', 'Title': 'Predicting mortality and incident immobility in older Belgian men by characteristics related to sarcopenia and frailty.', 'Volume': '29', 'Issue': '6', 'Pages': '1437-1445', 'LangList': ['English'], 'NlmUniqueID': '9100105', 'ISSN': '0937-941X', 'ESSN': '1433-2965', 'PubTypeList': ['Journal Article'], 'RecordStatus': 'PubMed - indexed for MEDLINE', 'PubStatus': 'ppublish+epublish', 'ArticleIds': DictElement({'pubmed': ['29569152'], 'medline': [], 'doi': '10.1007/s00198-018-4467-z', 'pii': '10.1007/s00198-018-4467-z', 'rid': '29569152', 'eid': '29569152'}, attributes={}), 'DOI': '10.1007/s00198-018-4467-z', 'History': DictElement({'pubmed': ['2018/03/24 06:00'], 'medline': ['2019/10/24 06:00'], 'received': '2017/07/16 00:00', 'accep

In [52]:
def entrez_fetch_list_summary(uid_list):
    """Fetch the ESummary document for each PubMed id, one request per id.

    ``Entrez.email`` is not set here; whatever value is currently configured
    on the ``Bio.Entrez`` module is used.

    Args:
        uid_list: Iterable of PubMed ids.

    Returns:
        list: One ``Entrez.read`` result per id, in input order. An empty
        input yields an empty list.
    """
    from Bio import Entrez
    results = []
    for uid in uid_list:
        handle = Entrez.esummary(db="pubmed", id=uid)
        try:
            results.append(Entrez.read(handle))
        finally:
            # Each request opens its own handle; close it before the next one.
            handle.close()
    return results

In [13]:
# Example: fetch summaries for a list of PubMed ids.
entrez_fetch_list_summary(['29569152', '28197643', '27848006', '26679436', '26659068', '26223424', '25466529'])

[[DictElement({'Item': [], 'Id': '29569152', 'PubDate': '2018 Jun', 'EPubDate': '2018 Mar 22', 'Source': 'Osteoporos Int', 'AuthorList': ['Kruse C', 'Goemaere S', 'De Buyser S', 'Lapauw B', 'Eiken P', 'Vestergaard P'], 'LastAuthor': 'Vestergaard P', 'Title': 'Predicting mortality and incident immobility in older Belgian men by characteristics related to sarcopenia and frailty.', 'Volume': '29', 'Issue': '6', 'Pages': '1437-1445', 'LangList': ['English'], 'NlmUniqueID': '9100105', 'ISSN': '0937-941X', 'ESSN': '1433-2965', 'PubTypeList': ['Journal Article'], 'RecordStatus': 'PubMed - indexed for MEDLINE', 'PubStatus': 'ppublish+epublish', 'ArticleIds': DictElement({'medline': [], 'pubmed': ['29569152'], 'doi': '10.1007/s00198-018-4467-z', 'pii': '10.1007/s00198-018-4467-z', 'rid': '29569152', 'eid': '29569152'}, attributes={}), 'DOI': '10.1007/s00198-018-4467-z', 'History': DictElement({'medline': ['2019/10/24 06:00'], 'pubmed': ['2018/03/24 06:00'], 'received': '2017/07/16 00:00', 'acce

## Abstract

In [20]:
def flatten_abstract(abstract_xml):
    """Concatenate the sections of a parsed abstract element into one string.

    Args:
        abstract_xml: Parsed element exposing ``find_all``; its
            'abstracttext' children must expose ``get`` and ``text``
            (BeautifulSoup tags in this notebook).

    Returns:
        str: The section texts joined in document order. A section that has
        a 'label' attribute is prefixed with " LABEL: "; unlabelled sections
        are appended with no separator. Returns '' when there are no
        sections.
    """
    pieces = []
    for section in abstract_xml.find_all('abstracttext'):
        label = section.get('label')
        if label is not None:
            pieces.append(" " + label + ": ")
        pieces.append(section.text)
    # Join once instead of growing a string inside the loop.
    return "".join(pieces)

In [21]:
def entrez_fetch_abstracts(uid):
    """Fetch PubMed records via EFetch and return one flattened abstract per record.

    ``Entrez.email`` is not set here; whatever value is currently configured
    on the ``Bio.Entrez`` module is used.

    Args:
        uid: PubMed id, or collection of ids, forwarded unchanged to
            ``Entrez.efetch`` as ``id``.

    Returns:
        list of str: One entry per article element found in the response, in
        response order. A record without an abstract contributes '' so that
        positions stay aligned with the records. If no article elements are
        found, every abstract in the document is flattened instead.
    """
    from Bio import Entrez
    from bs4 import BeautifulSoup as bs
    handle = Entrez.efetch(db="pubmed", id=uid, rettype='Medline', retmode='xml')
    try:
        # Hand the payload to BeautifulSoup as-is: unlike "".join(readlines()),
        # this works whether the handle yields str or bytes.
        bs_content = bs(handle.read(), "lxml")
    finally:
        handle.close()
    # Tag names are matched in lower case, as with 'abstract' below.
    # NOTE(review): the article element names come from the PubMed EFetch
    # XML format, not from this file -- confirm against a live response.
    articles = bs_content.find_all(['pubmedarticle', 'pubmedbookarticle'])
    if not articles:
        # Unexpected layout: keep the previous behaviour.
        return [flatten_abstract(abstract) for abstract in bs_content.find_all('abstract')]
    results = []
    for article in articles:
        abstract = article.find('abstract')
        # Emit a placeholder instead of skipping, otherwise every later
        # abstract shifts by one relative to the requested ids.
        results.append(flatten_abstract(abstract) if abstract is not None else '')
    return results

In [22]:
def entrez_construct_abstract_dict(uids):
    """Build ``{'uid': ..., 'abstract': ...}`` records for the given ids.

    The abstracts come from a single ``entrez_fetch_abstracts(uids)`` call
    and are paired with ``uids`` by position. The pairing is therefore only
    meaningful when that call returns one abstract per id in the same order;
    surplus items on either side are dropped by ``zip``.
    """
    abstracts = entrez_fetch_abstracts(uids)
    records = []
    for uid, abstract in zip(uids, abstracts):
        records.append({'uid': uid, 'abstract': abstract})
    return records


In [23]:
# Example ids reused by the cells below.
uids = ['29569152', '28197643', '27848006', '26679436', '26659068', '26223424', '25466529']
# Build the uid/abstract records for those ids.
entrez_construct_abstract_dict( uids )

[{'uid': '29569152',
  'abstract': 'There is an increasing awareness of sarcopenia in older people. We applied machine learning principles to predict mortality and incident immobility in older Belgian men through sarcopenia and frailty characteristics. Mortality could be predicted with good accuracy. Serum 25-hydroxyvitamin D and bone mineral density scores were the most important predictors. INTRODUCTION: Machine learning principles were used to predict 5-year mortality and 3-year incident severe immobility in a population of older men by frailty and sarcopenia characteristics. METHODS: Using prospective data from 1997 on 264 older Belgian men (n\u2009=\u2009152 predictors), 29 statistical models were developed and tuned on 75% of data points then validated on the remaining 25%. The model with the highest test area under the curve (AUC) was chosen as the best. From these, ranked predictor importance was extracted. RESULTS: Five-year mortality could be predicted with good accuracy (tes

## Elink

In [24]:
def entrez_fetch_full_text_linkout(uid_list):
    """Query Entrez ELink (``cmd=prlinks``) for the given PubMed ids.

    Args:
        uid_list: Iterable of PubMed ids; they are sent as one
            comma-separated ``id`` parameter.

    Returns:
        dict: The decoded JSON response. The output recorded in this
        notebook has the shape ``{'header': ..., 'linksets': [...]}``.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If no response arrives within the timeout.
    """
    import requests
    response = requests.get(
        "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/elink.fcgi",
        # Let requests build and escape the query string.
        params={
            "dbfrom": "pubmed",
            "id": ",".join(str(uid) for uid in uid_list),
            "cmd": "prlinks",
            "retmode": "json",
        },
        # Without a timeout a stalled connection would block forever.
        timeout=30,
    )
    # Surface HTTP errors directly rather than as a JSON decoding failure.
    response.raise_for_status()
    return response.json()

In [25]:
# Example: fetch the ELink response for the ids in `uids` and display it.
results_Elink = entrez_fetch_full_text_linkout(uids)
results_Elink

{'header': {'type': 'elink', 'version': '0.3'},
 'linksets': [{'dbfrom': 'pubmed',
   'idurllist': [{'id': '29569152',
     'objurls': [{'url': {'value': 'https://doi.org/10.1007/s00198-018-4467-z'},
       'iconurl': {'lng': 'EN',
        'value': '//www.ncbi.nlm.nih.gov/corehtml/query/egifs/http:--production.springer.de-OnlineResources-Logos-springerlink.gif'},
       'subjecttypes': ['publishers/providers'],
       'categories': ['Full Text Sources'],
       'attributes': ['full-text online',
        'publisher of information in url',
        'subscription/membership/fee required'],
       'provider': {'name': 'Springer',
        'nameabbr': 'Springer',
        'id': '3055',
        'url': {'lng': 'EN', 'value': 'http://www.springeronline.com/'}}}]},
    {'id': '28197643',
     'objurls': [{'url': {'value': 'https://dx.doi.org/10.1007/s00223-017-0238-7'},
       'iconurl': {'lng': 'EN',
        'value': '//www.ncbi.nlm.nih.gov/corehtml/query/egifs/http:--production.springer.de-Onlin

In [42]:
def parse_results_Elink(elink_results):
    """Reduce an ELink ``prlinks`` JSON response to uid/url records.

    Args:
        elink_results: Decoded JSON response with a 'linksets' list whose
            items carry an 'idurllist'.

    Returns:
        list of dict: ``{'uid': <id>, 'url': <first listed url>}`` for every
        entry of every linkset, in response order. 'url' is None when an
        entry has no 'objurls'. Returns [] when there are no linksets.
    """
    parsed = []
    for linkset in elink_results.get('linksets', []):
        for entry in linkset.get('idurllist', []):
            objurls = entry.get('objurls') or []
            # Only the first provider link is kept, as before; a missing or
            # empty list no longer raises.
            url = objurls[0]['url']['value'] if objurls else None
            parsed.append({'uid': entry['id'], 'url': url})
    return parsed
# Example: reduce the ELink response stored in `results_Elink` to uid/url records.
parse_results_Elink(results_Elink)

[{'uid': '29569152', 'url': 'https://doi.org/10.1007/s00198-018-4467-z'},
 {'uid': '28197643', 'url': 'https://dx.doi.org/10.1007/s00223-017-0238-7'},
 {'uid': '27848006', 'url': 'https://doi.org/10.1007/s00198-016-3828-8'},
 {'uid': '26679436',
  'url': 'https://linkinghub.elsevier.com/retrieve/pii/S8756-3282(15)00424-X'},
 {'uid': '26659068', 'url': 'https://doi.org/10.1007/s00198-015-3451-0'},
 {'uid': '26223424', 'url': 'https://doi.org/10.1111/joim.12397'},
 {'uid': '25466529', 'url': 'https://doi.org/10.1007/s00198-014-2973-1'}]

# Essentials to pandas dataframe