# Grab Pubmed Abstracts

## Functions

In [29]:
from Bio import Entrez
import json
from bs4 import BeautifulSoup as bs
import lxml
import json
import numpy as np
from tqdm import tqdm
import pandas as pd
import semanticscholar as sch

In [30]:
def entrez_search_pubmed(query,records_per_query=10,email="XXX@YYY.com",retMax=100):
    from Bio import Entrez
    Entrez.email = email
    # Search
    handle = Entrez.esearch(db="pubmed",term=query, idtype="acc", retMax=retMax,mindate='2020/05',maxdate='2020/06')
    record = Entrez.read(handle)
    handle.close()
    return record

In [31]:
def flatten_abstract(abstract_xml):
    abstract = ''
    for abstractText in abstract_xml.find_all('abstracttext'):
        if abstractText.get('label') != None:
            abstract = abstract + " " + abstractText.get('label') + ": "
        abstract = abstract + abstractText.text
    return abstract

In [32]:
def entrez_fetch_abstracts(uid,email):
    from Bio import Entrez
    from bs4 import BeautifulSoup as bs
    Entrez.email = email
    handle = Entrez.efetch(db="pubmed", id=uid, rettype='Medline', retmode='xml')
    result = handle.readlines()
    result = b"".join(result)
    bs_content = bs(result, "lxml")
    abstracts = bs_content.find_all('abstract')
    handle.close()
    # Abstract
    results = [ flatten_abstract(abstract) for abstract in abstracts]
    return results

In [33]:
def entrez_construct_abstract_dict(uids,email):
        results = entrez_fetch_abstracts( uids, email )
        return [{'uid':a, 'abstract':b, 'include':''} for a,b in zip(uids, results)]

In [34]:
def entrez_fetch_list_summary(uid_list):
    from Bio import Entrez
    results = [ Entrez.read( Entrez.esummary(db="pubmed", id=uid) ) for uid in uid_list ]
    return results

## Query

In [35]:
query_body={ "query": "\"COVID-19\"[Mesh]", 'email' : "XXX@YYY.com" }
## Query search
results_query = entrez_search_pubmed(query = query_body['query'], email = query_body['email'], retMax=200)

In [36]:
uids = results_query['IdList']
len(uids)

200

In [37]:
results_query

{'Count': '15599', 'RetMax': '200', 'RetStart': '0', 'IdList': ['32593259', '32593224', '32593209', '32593202', '32593196', '32593195', '32593183', '32593180', '32593144', '32593133', '32593125', '32593124', '32593122', '32593121', '32593120', '32593119', '32593116', '32593067', '32593065', '32593062', '32593060', '32593056', '32593034', '32593033', '32592975', '32592974', '32592969', '32592968', '32592967', '32592966', '32592960', '32592922', '32592919', '32592918', '32592911', '32592908', '32592904', '32592903', '32592884', '32592868', '32592866', '32592843', '32592842', '32592841', '32592840', '32592837', '32592836', '32592817', '32592816', '32592790', '32592788', '32592787', '32592761', '32592752', '32592742', '32592724', '32592716', '32592709', '32592705', '32592704', '32592703', '32592702', '32592672', '32592671', '32592670', '32592661', '32592657', '32592626', '32592625', '32592581', '32592564', '32592550', '32592539', '32592522', '32592519', '32592509', '32592508', '32592507', 

# Citations by DOI

In [38]:
result_summaries = entrez_fetch_list_summary(uids)

In [39]:
def citations_from_doi(doi):
    return sch.paper(doi, timeout=2)['numCitedBy']

In [40]:
def extract_epub_doi(summary):
    try:
        return { 'uid':summary['Id'], 'EPubDate': summary['EPubDate'], 'doi': summary['ArticleIds']['doi'], 'citations' : citations_from_doi(summary['ArticleIds']['doi']) }
    except:
        return {'uid':summary['Id'], 'EPubDate': summary['EPubDate'], 'doi': None, 'citations' : None }

In [41]:
result_citations = [ extract_epub_doi(summary[0]) for summary in result_summaries ]

In [42]:
result_citations = pd.DataFrame(result_citations)
result_citations

Unnamed: 0,uid,EPubDate,doi,citations
0,32593259,,,
1,32593224,2020 Jul 31,,
2,32593209,2020 Jul 16,,
3,32593202,2020 Jul 26,,
4,32593196,,,
...,...,...,...,...
195,32590808,,,
196,32590779,,,
197,32590775,,,
198,32590755,,,


# Abstracts

In [43]:
## Abstract
results_abstracts = entrez_construct_abstract_dict(uids,"XXX@YYY.com")

In [44]:
len(results_abstracts)

105

In [45]:
results_abstracts = pd.DataFrame(results_abstracts)
results_abstracts

Unnamed: 0,uid,abstract,include
0,32593259,"Never before in history, aging was such a sign...",
1,32593224,COVID-19 carries a high risk of severe disease...,
2,32593209,BACKGROUND: The emergence of new SARS-CoV-2 h...,
3,32593202,"COVID- 19, a biomedical disease has serious ph...",
4,32593196,BACKGROUND AND AIMS: Dentistry involves close...,
...,...,...,...
100,32592118,Quantitative studies using validated questionn...,
101,32592114,RATIONALE: Coronavirus disease 2019 (COVID-19...,
102,32592113,To compare clinical and imaging features betwe...,
103,32592104,The aim of this study was to investigate the p...,


# Merge

In [48]:
df_merged = pd.merge(results_abstracts,result_citations)

NameError: name 'citations' is not defined

In [52]:
df_merged.query("citations >= 0").sort_values(["citations"],ascending=False)

Unnamed: 0,uid,abstract,include,EPubDate,doi,citations
79,32592501,Aim This narrative review aims to report on th...,,2020 Sep 30,10.1002/jmv.26232,130.0
96,32592160,"BACKGROUND: Since December 2019, when it firs...",,,10.1007/s10900-020-00870-4,15.0
101,32592114,RATIONALE: Coronavirus disease 2019 (COVID-19...,,2020 Jun 26,10.1007/s11739-020-02416-x,12.0
88,32592396,Final-year medical students in the UK have bee...,,,10.1093/femspd/ftaa031,11.0
77,32592507,Design Case series.Introduction The most commo...,,2020 Jul 20,10.1002/dmrr.3379,6.0
99,32592132,COVID-19 is an emerging infectious disease cap...,,2020 Jun 27,10.1007/s10103-020-03086-z,5.0
87,32592400,"As the COVID-19 pandemic continues to spread, ...",,,10.1093/ajcn/nqaa178,0.0


# To Excel

In [47]:
df_merged.to_excel("COVID_SELECTION_ONE_YEAR.xlsx")