In [1]:
#Links
#LINK1: https://www.ncbi.nlm.nih.gov/home/develop/api/
#LINK2: https://colab.research.google.com/drive/1VOuvANFR08twLBROqYwO_TV34pazgF0_
#LINK3: https://www.nlm.nih.gov/pubs/techbull/mj12/mj12_pm_author_ranking.html
#LINK4: https://pubmed.ncbi.nlm.nih.gov/?term=Kumar+V&cauthor_id=32489811
#
#Concerns/notes:
#   Veer probably used LINK1's Entrez as the other stuff looks out of date.
#
#   I found LINK2 which works great if we could modify it to suit our purposes.
#
#   For this code to work, you need to have a computed author id, a number that PubMed calculates in an attempt to distinguish
# authors with the same names. I read LINK3 and from what I understood, there is no guarantee that two authors will be the
# same or different; the search just puts the most-likely-to-be-same author first in the results. Though if we only need a small
# number of studies (10 or 20), this will probably work? It might not be good if there is a relatively unpublished author.
#
#   LINK4 is a random person I used for testing purposes. Something to note is that the PubMed interface from LINK4 displays
# results in a different order than the Entrez esearch. However, I did put more parameters into the Entrez search. I have
# not tested whether this is the cause of the differing orders.
#
#   There might be a problem getting the computed author id, but I have not looked into this much. We are given the name and
# the Clinical Trials study number. Perhaps there will be an easy way to convert the Clinical Trials study number to a PubMed
# one? If so, I think getting the computed author id will be simple.
#
#   There are some error handling statements in LINK2 regarding HTTPS certificates and verification. Will we need these?
#
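#   Encountered an error while trying to scrape data:
# {"error":"API rate limit exceeded","api-key":<key number>,"count":"4","limit":"3"}
# This stopped my search of 5 studies after the 4th study. Could this be a problem or is there a different underlying issue?
# I tried it two more times, both times using print(soup) instead of print(soup.prettify()). The second time everything
# worked fine, but the third time I got the error again.
# The "limit":"3" is consistent with NCBI's cap on E-utilities requests per second, so the intermittent failures are
# most likely caused by sending the per-article requests too quickly rather than by how the result is printed.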

import json
import time
import urllib.parse
import urllib.request#, urllib.parse, urllib.error
#import re
#import ssl
#import calendar

import requests
from bs4 import BeautifulSoup
#import numpy as np
#import pandas as pd

In [2]:
#We have name, email, and group. We need to somehow get the computed author id. Maybe use study itself?

#Base URL shared by the NCBI E-utilities endpoints
base = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/"

#Database to search
db = "pubmed"

#Output format
ret = "json"

#Max number of results
retnum = '5'

#Search query (author name); spaces may be written as ' ' or pre-encoded as '+'
name = "Kumar+V"

#Computed author id
#NOTE(review): cauthor_id comes from the PubMed website URL (LINK4). It does not appear to be one of the documented
# ESearch parameters, so ESearch may silently ignore it -- confirm before relying on it to disambiguate authors.
cauthor = '32489811'

#Assemble the esearch URL. urlencode percent-escapes every value, so names containing characters such as
# apostrophes or accents still produce a valid query string; '+' is kept as-is so pre-encoded names keep working.
query = urllib.parse.urlencode(
    {"db": db, "retmode": ret, "retmax": retnum, "term": name, "cauthor_id": cauthor},
    safe="+",
)
url = base + "esearch.fcgi?" + query

#Test the esearch URL
print(url)

https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=pubmed&retmode=json&retmax=5&term=Kumar+V&cauthor_id=32489811


In [3]:
#Run the esearch query and add the matching PubMed ids to a list.
#The response is opened in a with-block so the HTTP connection is closed as soon as the body has been read.
with urllib.request.urlopen(url) as response:
    webpage = response.read()

#The body is JSON (retmode=json); the ids are stored under esearchresult -> idlist
dict_page = json.loads(webpage)
idlist = dict_page["esearchresult"]["idlist"]

#Test the search results
print(idlist)

['33592128', '33589646', '33589034', '33586721', '33585346']


In [4]:
articles_list = []
testCounter = 0

#efetch endpoint; kept in its own variable so the esearch `url` from the earlier cell is not overwritten
# (overwriting it made re-running the esearch cell fetch an efetch page instead).
efetch_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"

#Pause between requests so the loop stays under the 3-requests-per-second limit reported in the
# "API rate limit exceeded" error ("limit":"3").
request_delay_seconds = 0.34

# Make a new search for each article, scrape the data needed, then add data to a list
for pmid in idlist:
    #No need to wait before the very first request
    if testCounter:
        time.sleep(request_delay_seconds)

    #Let requests build the query string instead of patching a placeholder into the URL
    r = requests.get(efetch_url, params={"db": "pubmed", "retmode": "xml", "id": pmid}, timeout=30)
    #Fail loudly on an HTTP error instead of parsing an error payload as if it were an article
    r.raise_for_status()

    #Note: html.parser lower-cases tag names (e.g. PubmedArticleSet -> pubmedarticleset), so any later
    # lookups on `soup` must use the lower-case names.
    soup = BeautifulSoup(r.content, "html.parser")

    testCounter += 1
    print(testCounter)

    print(soup)

    #article = get_bibliography(soup)
    #articles_list.append(article)

1
<?xml version="1.0" ?>
<!DOCTYPE PubmedArticleSet PUBLIC "-//NLM//DTD PubMedArticle, 1st January 2019//EN" "https://dtd.nlm.nih.gov/ncbi/pubmed/out/pubmed_190101.dtd">

<pubmedarticleset>
<pubmedarticle>
<medlinecitation owner="NLM" status="Publisher">
<pmid version="1">33592128</pmid>
<daterevised>
<year>2021</year>
<month>02</month>
<day>16</day>
</daterevised>
<article pubmodel="Print-Electronic">
<journal>
<issn issntype="Electronic">1096-9098</issn>
<journalissue citedmedium="Internet">
<pubdate>
<year>2021</year>
<month>Feb</month>
<day>16</day>
</pubdate>
</journalissue>
<title>Journal of surgical oncology</title>
<isoabbreviation>J Surg Oncol</isoabbreviation>
</journal>
<articletitle>Impact of COVID-19 pandemic on cancer surgery: Patient's perspective.</articletitle>
<elocationid eidtype="doi" validyn="Y">10.1002/jso.26429</elocationid>
<abstract>
<abstracttext label="BACKGROUND" nlmcategory="BACKGROUND">Coronavirus disease 2019 (COVID-19) has impacted cancer care globally. 

2
<?xml version="1.0" ?>
<!DOCTYPE PubmedArticleSet PUBLIC "-//NLM//DTD PubMedArticle, 1st January 2019//EN" "https://dtd.nlm.nih.gov/ncbi/pubmed/out/pubmed_190101.dtd">

<pubmedarticleset>
<pubmedarticle>
<medlinecitation owner="NLM" status="In-Data-Review">
<pmid version="1">33589646</pmid>
<daterevised>
<year>2021</year>
<month>02</month>
<day>16</day>
</daterevised>
<article pubmodel="Electronic">
<journal>
<issn issntype="Electronic">2056-7189</issn>
<journalissue citedmedium="Internet">
<volume>7</volume>
<issue>1</issue>
<pubdate>
<year>2021</year>
<month>Feb</month>
<day>15</day>
</pubdate>
</journalissue>
<title>NPJ systems biology and applications</title>
<isoabbreviation>NPJ Syst Biol Appl</isoabbreviation>
</journal>
<articletitle>Mapping drug-target interactions and synergy in multi-molecular therapeutics for pressure-overload cardiac hypertrophy.</articletitle>
<pagination>
<medlinepgn>11</medlinepgn>
</pagination>
<elocationid eidtype="doi" validyn="Y">10.1038/s41540-021

3
<?xml version="1.0" ?>
<!DOCTYPE PubmedArticleSet PUBLIC "-//NLM//DTD PubMedArticle, 1st January 2019//EN" "https://dtd.nlm.nih.gov/ncbi/pubmed/out/pubmed_190101.dtd">

<pubmedarticleset>
<pubmedarticle>
<medlinecitation owner="NLM" status="PubMed-not-MEDLINE">
<pmid version="1">33589034</pmid>
<daterevised>
<year>2021</year>
<month>02</month>
<day>17</day>
</daterevised>
<article pubmodel="Print-Electronic">
<journal>
<issn issntype="Electronic">1998-3646</issn>
<journalissue citedmedium="Internet">
<volume>37</volume>
<issue>3</issue>
<pubdate>
<medlinedate>2019 Jul-Sep</medlinedate>
</pubdate>
</journalissue>
<title>Indian journal of medical microbiology</title>
<isoabbreviation>Indian J Med Microbiol</isoabbreviation>
</journal>
<articletitle>Dynamics of the Occurrence of Influenza in Relation to Seasonal Variation in Chennai, Tamil Nadu: A 7 -Year Cumulative Study.</articletitle>
<pagination>
<medlinepgn>401-405</medlinepgn>
</pagination>
<elocationid eidtype="pii" validyn="Y">S