# Grab Pubmed Abstracts

## Functions

In [67]:
from Bio import Entrez
import json
from bs4 import BeautifulSoup as bs
import lxml
import json
import numpy as np
from tqdm import tqdm
import pandas as pd
import semanticscholar as sch

In [32]:
def entrez_search_pubmed(query,records_per_query=10,email="XXX@YYY.com",retMax=100):
    from Bio import Entrez
    Entrez.email = email
    # Search
    handle = Entrez.esearch(db="pubmed",term=query, idtype="acc", retMax=retMax)
    record = Entrez.read(handle)
    handle.close()
    return record

In [33]:
def flatten_abstract(abstract_xml):
    abstract = ''
    for abstractText in abstract_xml.find_all('abstracttext'):
        if abstractText.get('label') != None:
            abstract = abstract + " " + abstractText.get('label') + ": "
        abstract = abstract + abstractText.text
    return abstract

In [34]:
def entrez_fetch_abstracts(uid,email):
    from Bio import Entrez
    from bs4 import BeautifulSoup as bs
    Entrez.email = email
    handle = Entrez.efetch(db="pubmed", id=uid, rettype='Medline', retmode='xml')
    result = handle.readlines()
    result = b"".join(result)
    bs_content = bs(result, "lxml")
    abstracts = bs_content.find_all('abstract')
    handle.close()
    # Abstract
    results = [ flatten_abstract(abstract) for abstract in abstracts]
    return results

In [35]:
def entrez_construct_abstract_dict(uids,email):
        results = entrez_fetch_abstracts( uids, email )
        return [{'uid':a, 'abstract':b, 'include':''} for a,b in zip(uids, results)]

In [46]:
def entrez_fetch_list_summary(uid_list):
    from Bio import Entrez
    results = [ Entrez.read( Entrez.esummary(db="pubmed", id=uid) ) for uid in uid_list ]
    return results

## Query

In [36]:
query_body={ "query": "\"COVID-19\"[Mesh]", 'email' : "XXX@YYY.com" }
## Query search
results_query = entrez_search_pubmed(query = query_body['query'], email = query_body['email'], retMax=200)

In [37]:
uids = results_query['IdList']
len(uids)

200

# Citations by DOI

In [47]:
result_summaries = entrez_fetch_list_summary(uids)

In [75]:
def citations_from_doi(doi):
    return sch.paper('10.1371/journal.pone.0251960', timeout=2)['numCitedBy']

In [94]:
def extract_epub_doi(summary):
    try:
        return { 'uid':summary['Id'], 'EPubDate': summary['EPubDate'], 'doi': summary['ArticleIds']['doi'], 'citations' : citations_from_doi(summary['ArticleIds']['doi']) }
    except:
        return {'uid':summary['Id'], 'EPubDate': summary['EPubDate'], 'doi': None, 'citations' : None }

In [95]:
result_citations = [ extract_epub_doi(summary[0]) for summary in result_summaries ]
result_citations

[{'uid': '34060442', 'EPubDate': '', 'doi': None, 'citations': None},
 {'uid': '34059550',
  'EPubDate': '2021 May 31',
  'doi': None,
  'citations': None},
 {'uid': '34059544',
  'EPubDate': '2021 May 31',
  'doi': None,
  'citations': None},
 {'uid': '34059043',
  'EPubDate': '2021 May 31',
  'doi': None,
  'citations': None},
 {'uid': '34059036',
  'EPubDate': '2021 May 31',
  'doi': None,
  'citations': None},
 {'uid': '34059034',
  'EPubDate': '2021 May 31',
  'doi': None,
  'citations': None},
 {'uid': '34059031', 'EPubDate': '2021 Jun 1', 'doi': None, 'citations': None},
 {'uid': '34058989',
  'EPubDate': '2021 May 31',
  'doi': None,
  'citations': None},
 {'uid': '34055314',
  'EPubDate': '2021 Feb 10',
  'doi': None,
  'citations': None},
 {'uid': '34057973', 'EPubDate': '', 'doi': None, 'citations': None},
 {'uid': '34057958', 'EPubDate': '', 'doi': None, 'citations': None},
 {'uid': '34057361',
  'EPubDate': '2021 Mar 23',
  'doi': None,
  'citations': None},
 {'uid': '3405

In [96]:
result_citations = pd.DataFrame(result_citations)
result_citations

Unnamed: 0,uid,EPubDate,doi,citations
0,34060442,,,
1,34059550,2021 May 31,,
2,34059544,2021 May 31,,
3,34059043,2021 May 31,,
4,34059036,2021 May 31,,
...,...,...,...,...
195,34037733,2021 May 3,,
196,34037731,2021 May 3,,
197,34037598,,,
198,34037551,,,


# Abstracts

In [79]:
## Abstract
results_abstracts = entrez_construct_abstract_dict(uids,"XXX@YYY.com")

In [80]:
len(results_abstracts)

169

In [84]:
results_abstracts = pd.DataFrame(results_abstracts)
results_abstracts

Unnamed: 0,uid,abstract,include
0,34060442,"Objective To predict the epitopes of B cells, ...",
1,34059550,We report a case of COVID-19 in a 29-week pret...,
2,34059544,Immune thrombocytopenia (ITP) has been widely ...,
3,34059043,BACKGROUND: The media play a critical role in...,
4,34059036,OBJECTIVES: The second wave of the coronaviru...,
...,...,...,...
164,34039764,Regular surveillance testing of asymptomatic i...,
165,34039763,SARS-CoV-2 infection in minks has become a ser...,
166,34039654,The COVID-19 pandemic has brought biosafety to...,
167,34039588,Importance: Infection with COVID-19 has been ...,


# Merge

In [101]:
df_merged = pd.merge(results_abstracts,result_citations)
df_merged

Unnamed: 0,uid,abstract,include,EPubDate,doi,citations
0,34060442,"Objective To predict the epitopes of B cells, ...",,,,
1,34059550,We report a case of COVID-19 in a 29-week pret...,,2021 May 31,,
2,34059544,Immune thrombocytopenia (ITP) has been widely ...,,2021 May 31,,
3,34059043,BACKGROUND: The media play a critical role in...,,2021 May 31,,
4,34059036,OBJECTIVES: The second wave of the coronaviru...,,2021 May 31,,
...,...,...,...,...,...,...
164,34039764,Regular surveillance testing of asymptomatic i...,,,,
165,34039763,SARS-CoV-2 infection in minks has become a ser...,,,,
166,34039654,The COVID-19 pandemic has brought biosafety to...,,,,
167,34039588,Importance: Infection with COVID-19 has been ...,,,,


# To Excel

In [102]:
df_merged.to_excel("COVID_SELECTION.xlsx")