In [None]:
import math
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import numpy as np
import pandas as pd
import plotly
import plotly.express as px
import requests
import urllib.parse
import uuid
import xml.etree.ElementTree as ET
from collections import OrderedDict
from sklearn.cluster import MiniBatchKMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from tqdm import tqdm_notebook as tqdm

# Initialise Plotly's offline notebook mode (explicitly with connected=False).
plotly.offline.init_notebook_mode(connected=False)

Output hidden; open in https://colab.research.google.com to view.

In [None]:
# Mount Google Drive at /content/drive.
# Colab-only: the `google.colab` helper is imported here, so this cell
# will not run outside a Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Switch the working directory to the project folder on the mounted Drive
# and list its contents.
# NOTE: the folder segment 'NAACL 2024 ' is written with a space before the
# following slash; the path echoed by %cd shows the same spelling.
%cd /content/drive/MyDrive/NAACL 2024 /Scientometrics
!ls

/content/drive/MyDrive/NAACL 2024 /Scientometrics
'1. Global Health PMIDs Extraction.ipynb'   EDA.ipynb		   'List of economies.csv'
 Data_extraction.ipynb			    gender-api-output.csv   Preprocessing.ipynb


In [None]:
# E-utilities endpoints: einfo (database info), esearch (search), efetch (records)
BASEURL_INFO = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/einfo.fcgi'
BASEURL_SRCH = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi'
BASEURL_FTCH = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi'

# Search parameters
SOURCE_DB = 'pubmed'

# Title/abstract term search: (language-model terms) AND (health / medical terms).
# TODO: Remove Journal of Planetary Health
TERM = (
    '(Large Language Models[Title/Abstract] OR LLMs[Title/Abstract] OR Language Models[Title/Abstract])'
    ' AND '
    '(Health care[Title/Abstract] OR Healthcare[Title/Abstract] OR Medicine[Title/Abstract]'
    ' OR Medical[Title/Abstract] OR Clinical[Title/Abstract])'
)
# Query extensions that are currently switched off (kept for reference):
#   OR Health[MeSH Terms] OR Medicine[MeSH Terms] OR Healthcare[MeSH Terms] OR Clinical[MeSH Terms]
#   OR Lancet[journal] OR BMJ[journal] OR JAMA[journal] OR NEJM[journal] OR PLOS[journal] OR BMC[journal] OR JMIR[journal] OR Medline[journal] OR Health Informatics[journal] OR Medical Informatics[journal] OR Healthcare Informatics[journal]

# Type of date used to limit the search. Allowed values vary between Entrez
# databases; common ones are 'mdat' (modification date), 'pdat' (publication
# date) and 'edat' (Entrez date). Generally a database allows only two of them.
DATE_TYPE = 'pdat'
MIN_DATE = '2021/01/01'  # yyyy/mm/dd
MAX_DATE = '2024/12/31'  # yyyy/mm/dd

SEP = '|'          # separator used when several values are joined into one field
BATCH_NUM = 1000   # number of records requested per efetch call

# Seed numpy's global random generator
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

In [None]:

def mkquery(base_url, params):
    '''
    make query function

    Build a GET url by appending ``params`` to ``base_url`` as a query string,
    print it, and return it.

    Keys and values are percent-encoded, so characters such as spaces, ``&``,
    ``#``, ``=`` or ``[`` inside a value can no longer break the query string.
    ``/`` is left as is so dates like ``2021/01/01`` stay readable.
    Pass raw (un-encoded) values: a value that is already percent-encoded
    would have its ``%`` escaped a second time.

    base_url: base_url
    params: parameter dictionary
            ex) {key1: value1, key2: value2}

    returns: ``base_url`` followed by ``?key1=value1&key2=value2``; just
             ``base_url`` when ``params`` is empty.
    '''
    query = urllib.parse.urlencode(params, safe='/', quote_via=urllib.parse.quote)
    # No parameters -> no dangling '?' on the url
    url = base_url + '?' + query if query else base_url
    print('request url is: ' + url)
    return url

def getXmlFromURL(base_url, params, timeout=60):
    '''
    getXmlFromURL
    (mkquery wrapper)

    Send a GET request to the url built by ``mkquery`` and parse the reply
    as XML.

    base_url: base_url
    params: parameter dictionary
            ex) {key1: value1, key2: value2}
    timeout: seconds to wait for the server before giving up (default 60)

    returns: root ``xml.etree.ElementTree.Element`` of the response document
    raises: ``requests.HTTPError`` on a 4xx/5xx reply,
            ``requests.Timeout`` when the server does not answer in time,
            ``xml.etree.ElementTree.ParseError`` when the body is not valid XML
    '''
    response = requests.get(mkquery(base_url, params), timeout=timeout)
    # Fail loudly on an HTTP error instead of trying to parse an error page
    response.raise_for_status()
    # Parse the raw bytes so the parser decodes the body itself (using the
    # document's own encoding declaration) rather than requests' guessed charset
    return ET.fromstring(response.content)

def getTextFromNode(root, path, fill='', mode=0, attrib='attribute'):
    '''
    getTextFromNode

    Look up the first element matching ``path`` below ``root`` and return
    either its text or one of its attributes.

    root: Xml root node
    path: XPath
    fill: fill na string; returned when the element is missing, when its text
          is empty (mode 0) or when the attribute is absent (mode 1)
    mode: 0 = text, 1 = attribute
    attrib: attribute name

    returns: the text / attribute value, or ``fill``. In mode 0 the text of
             nested inline elements is included, so a value is not cut off
             at the first child tag.
    raises: ValueError when the element exists and ``mode`` is neither 0 nor 1
    '''
    node = root.find(path)
    if node is None:
        return fill
    if mode == 0:
        # itertext() also walks inline children; for a plain leaf element
        # this is exactly node.text
        text = ''.join(node.itertext())
        return text if text else fill
    if mode == 1:
        value = node.get(attrib)
        return fill if value is None else value
    raise ValueError('mode must be 0 (text) or 1 (attribute), got {!r}'.format(mode))


# Example call: request the einfo document for SOURCE_DB and keep its root element.
rootXml = getXmlFromURL(BASEURL_INFO, {'db': SOURCE_DB})

request url is: https://eutils.ncbi.nlm.nih.gov/entrez/eutils/einfo.fcgi?db=pubmed


In [None]:
# Info API: print the Count and LastUpdate values found under DbInfo
# in the einfo reply for SOURCE_DB, one per line.
rootXml = getXmlFromURL(BASEURL_INFO, dict(db=SOURCE_DB))
print(
    *(rootXml.find('DbInfo').find(tag).text for tag in ('Count', 'LastUpdate')),
    sep='\n',
)

request url is: https://eutils.ncbi.nlm.nih.gov/entrez/eutils/einfo.fcgi?db=pubmed
37350140
2024/06/14 19:19


In [None]:
# ESearch: run TERM against SOURCE_DB, limited to MIN_DATE..MAX_DATE on DATE_TYPE.
# usehistory='y' is sent with the request; the QueryKey / WebEnv elements of
# the reply are read in a later cell.
rootXml = getXmlFromURL(
    BASEURL_SRCH,
    dict(
        db=SOURCE_DB,
        term=TERM,
        usehistory='y',
        datetype=DATE_TYPE,
        mindate=MIN_DATE,
        maxdate=MAX_DATE,
    ),
)

request url is: https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=pubmed&term=(Large Language Models[Title/Abstract] OR LLMs[Title/Abstract] OR Language Models[Title/Abstract]) AND (Health care[Title/Abstract] OR Healthcare[Title/Abstract] OR Medicine[Title/Abstract] OR Medical[Title/Abstract] OR Clinical[Title/Abstract])&usehistory=y&datetype=pdat&mindate=2021/01/01&maxdate=2024/12/31


In [None]:
# Read the hit count and the QueryKey / WebEnv values from the esearch reply
Count, QueryKey, WebEnv = (
    rootXml.find(tag).text for tag in ('Count', 'QueryKey', 'WebEnv')
)
# WebEnv is percent-encoded here because it is placed into a request url later
WebEnv = urllib.parse.quote(WebEnv)

print('total Count: ', Count)
print('QueryKey   : ', QueryKey)
print('WebEnv     : ', WebEnv)

total Count:  1274
QueryKey   :  1
WebEnv     :  MCID_666e16f9b1027e6e904c02b2


In [None]:
# Module-level accumulators filled by pushData (one dict per row)
articleDics = []
authorArticleDics = []
authorAffiliationDics = []

def _joinedText(nodes):
    '''Join the full text (inline child tags included) of every node with single spaces.'''
    parts = (''.join(node.itertext()) for node in nodes)
    return ' '.join(part for part in parts if part)

def pushData(rootXml):
    '''
    pushData

    Walk every ``PubmedArticle`` element below ``rootXml`` and append
      * one row per article            to ``articleDics``
      * one row per listed author      to ``authorArticleDics``
      * one row per author affiliation to ``authorAffiliationDics``

    Only ``PubmedArticle`` elements are visited; any other record element in
    the reply is skipped (NOTE(review): this may be why the article table can
    hold fewer rows than the search Count - confirm).

    rootXml: Xml root node of an efetch reply
    returns: None (the three module-level lists are extended in place)
    '''
    for article in rootXml.iter('PubmedArticle'):
        pmid = getTextFromNode(article, 'MedlineCitation/PMID', '')
        meshHeadings = article.findall('MedlineCitation/MeshHeadingList/')

        # get article info
        articleDic = {
            'PMID'                    : pmid,
            'JournalTitle'            : getTextFromNode(article, 'MedlineCitation/Article/Journal/Title', ''),
            # full text of the title, so inline markup does not cut it short
            'Title'                   : _joinedText(article.findall('MedlineCitation/Article/ArticleTitle')),
            'doi'                     : getTextFromNode(article, 'MedlineCitation/Article/ELocationID[@EIdType="doi"]', ''),
            # an abstract may be split over several AbstractText elements; keep all of them
            'Abstract'                : _joinedText(article.findall('MedlineCitation/Article/Abstract/AbstractText')),
        #    if you want to get data in flat(denormalized), uncomment below. but it's difficult to use for analytics.
        #    'Authors'                 : SEP.join([author.find('ForeName').text + ' ' +  author.find('LastName').text if author.find('CollectiveName') == None else author.find('CollectiveName').text for author in article.findall('MedlineCitation/Article/AuthorList/')]),
        #    'AuthorIdentifiers'       : SEP.join([getTextFromNode(author, 'Identifier', 'None') for author in article.findall('MedlineCitation/Article/AuthorList/')]),
        #    'AuthorIdentifierSources' : SEP.join([getTextFromNode(author, 'Identifier', 'None', 1, 'Source') for author in article.findall('MedlineCitation/Article/AuthorList/')]),
            'Language'                : getTextFromNode(article, 'MedlineCitation/Article/Language', ''),
            'Year_A'                  : getTextFromNode(article, 'MedlineCitation/Article/ArticleDate/Year', ''),
            'Month_A'                 : getTextFromNode(article, 'MedlineCitation/Article/ArticleDate/Month', ''),
            'Day_A'                   : getTextFromNode(article, 'MedlineCitation/Article/ArticleDate/Day', ''),
            'Year_PM'                 : getTextFromNode(article, 'PubmedData/History/PubMedPubDate[@PubStatus="pubmed"]/Year', ''),
            'Month_PM'                : getTextFromNode(article, 'PubmedData/History/PubMedPubDate[@PubStatus="pubmed"]/Month', ''),
            'Day_PM'                  : getTextFromNode(article, 'PubmedData/History/PubMedPubDate[@PubStatus="pubmed"]/Day', ''),
            'Status'                  : getTextFromNode(article, './PubmedData/PublicationStatus', ''),
            # `or ''` keeps SEP.join from failing if a lookup yields None
            'MeSH'                    : SEP.join([getTextFromNode(mesh, 'DescriptorName') or '' for mesh in meshHeadings]),
            'MeSH_UI'                 : SEP.join([getTextFromNode(mesh, 'DescriptorName', '', 1, 'UI') or '' for mesh in meshHeadings]),
            'Keyword'                 : SEP.join([keyword.text or '' for keyword in article.findall('MedlineCitation/KeywordList/')])
        }
        articleDics.append(OrderedDict(articleDic))

        # get author info
        for author in article.findall('MedlineCitation/Article/AuthorList/'):

            # publish author ID
            # * It's only random id. not use for identify author. if you want to identify author, you can use identifier.
            authorId = str(uuid.uuid4())

            # A CollectiveName (group author) takes the place of a personal name
            collective = author.find('CollectiveName')
            if collective is None:
                name = (getTextFromNode(author, 'ForeName') or '') + ' ' + (getTextFromNode(author, 'LastName') or '')
            else:
                name = collective.text

            # author article
            authorArticleDic = {
                'authorId'         : authorId,
                'PMID'             : pmid,
                'name'             : name,
                'identifier'       : getTextFromNode(author, 'Identifier', ''),
                'identifierSource' : getTextFromNode(author, 'Identifier', '', 1, 'Source')
            }
            authorArticleDics.append(OrderedDict(authorArticleDic))

            # author affiliation(author: affiliation = 1 : n)
            # findall returns an empty list when there is none, so no separate existence check is needed
            for affiliation in author.findall('./AffiliationInfo'):
                authorAffiliationDic = {
                    'authorId'          : authorId,
                    'affiliation'       : getTextFromNode(affiliation, 'Affiliation', ''),
                }
                authorAffiliationDics.append(OrderedDict(authorAffiliationDic))

In [None]:
# Number of efetch requests needed to page through every hit (ceil)
iterCount = math.ceil(int(Count) / BATCH_NUM)

# pushData appends to module-level lists; empty them first so that running
# this cell again does not store every record a second time.
articleDics.clear()
authorArticleDics.clear()
authorAffiliationDics.clear()

# get all data: one request per page of BATCH_NUM records, addressed through
# the QueryKey / WebEnv values taken from the esearch reply
for i in tqdm(range(iterCount)):
    rootXml = getXmlFromURL(BASEURL_FTCH, {
        'db': SOURCE_DB,
        'query_key': QueryKey,
        'WebEnv': WebEnv,
        'retstart': i * BATCH_NUM,
        'retmax': BATCH_NUM,
        'retmode': 'xml'})

    pushData(rootXml)


This function will be removed in tqdm==5.0.0
Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`



  0%|          | 0/2 [00:00<?, ?it/s]

request url is: https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pubmed&query_key=1&WebEnv=MCID_666e16f9b1027e6e904c02b2&retstart=0&retmax=1000&retmode=xml
request url is: https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pubmed&query_key=1&WebEnv=MCID_666e16f9b1027e6e904c02b2&retstart=1000&retmax=1000&retmode=xml


In [None]:
# Build the article table: one row per dict collected in articleDics.
df_article = pd.DataFrame(articleDics)
# df_article.head(10)
# Show the frame as the cell output.
df_article

Unnamed: 0,PMID,JournalTitle,Title,doi,Abstract,Language,Year_A,Month_A,Day_A,Year_PM,Month_PM,Day_PM,Status,MeSH,MeSH_UI,Keyword
0,38876802,Regional anesthesia and pain medicine,Artificial intelligence and regional anesthesi...,10.1136/rapm-2024-105522,Artificial intelligence (AI) has demonstrated ...,eng,2024,06,14,2024,6,15,aheadofprint,,,EDUCATION|REGIONAL ANESTHESIA|TECHNOLOGY
1,38876484,JMIR mental health,Crisis prediction among tele-mental health pat...,10.2196/58129,Due to recent advances in artificial intellige...,eng,2024,06,14,2024,6,15,aheadofprint,,,
2,38875696,Journal of medical Internet research,Triage Performance Across Large Language Model...,10.2196/53297,Large language models (LLMs) have demonstrated...,eng,2024,06,14,2024,6,14,epublish,Triage|Humans|Emergency Medicine|Physicians|Em...,D014218|D006801|D004635|D010820|D004636|D00780...,ChatGPT|German|Germany|artificial intelligence...
3,38875562,JMIR AI,"Cost, Usability, Credibility, Fairness, Accoun...",10.2196/51834,The world has witnessed increased adoption of ...,eng,2024,04,23,2024,6,14,epublish,,,AHP|CUC-FATE framework|ChatGPT|LLM|TISM|adopti...
4,38875551,JMIR AI,Online Health Search Via Multidimensional Info...,10.2196/42630,Widespread misinformation in web resources can...,eng,2024,05,02,2024,6,14,epublish,,,deep learning|health misinformation|infodemic|...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1269,33684612,Neural networks : the official journal of the ...,Comparative study using inverse ontology cogen...,10.1016/j.neunet.2021.01.018,This paper introduces inverse ontology cogency...,eng,2021,02,11,2021,3,9,ppublish,Biological Ontologies|Cerebral Cortex|Database...,D064229|D002540|D016208|D006801|D009317|D00932...,Cogent confabulation|Concept recognition|Langu...
1270,33584354,Frontiers in psychiatry,Deep Learning-Based Natural Language Processin...,10.3389/fpsyt.2020.533949,The introduction of pre-trained language model...,eng,2021,01,15,2021,2,16,epublish,,,deep learning|natural language processing|pati...
1271,33441905,Scientific reports,Embeddings from deep learning transfer GO anno...,10.1038/s41598-020-80786-0,Knowing protein function is crucial to advance...,eng,2021,01,13,2021,1,15,epublish,Amino Acid Sequence|Amino Acids|Computational ...,D000595|D000596|D019295|D000077321|D063990|D00...,
1272,33290879,Journal of biomedical informatics,Language models are an effective representatio...,10.1016/j.jbi.2020.103637,Widespread adoption of electronic health recor...,eng,2020,12,05,2020,12,9,ppublish,Electronic Health Records|Humans|Machine Learn...,D057286|D006801|D000069550|D015233|D009323|D01...,Electronic health record|Machine learning|Repr...


In [None]:
# Select the PMID column as a Series and convert it into a plain Python list.
list_of_PMIDs = df_article['PMID'].to_list()
# Show the list as the cell output.
list_of_PMIDs

['38876802',
 '38876484',
 '38875696',
 '38875562',
 '38875551',
 '38875534',
 '38872496',
 '38871125',
 '38866751',
 '38866172',
 '38864738',
 '38860619',
 '38860299',
 '38858069',
 '38857514',
 '38857454',
 '38855241',
 '38854210',
 '38852215',
 '38848553',
 '38847898',
 '38846858',
 '38842929',
 '38841582',
 '38839663',
 '38839458',
 '38838389',
 '38837145',
 '38836893',
 '38836141',
 '38834291',
 '38833694',
 '38833383',
 '38833165',
 '38832874',
 '38832862',
 '38829731',
 '38827102',
 '38827099',
 '38827084',
 '38827077',
 '38827075',
 '38827067',
 '38827064',
 '38827063',
 '38827053',
 '38827047',
 '38826991',
 '38826441',
 '38826372',
 '38826194',
 '38825181',
 '38823673',
 '38823633',
 '38819879',
 '38819655',
 '38819632',
 '38818204',
 '38818116',
 '38817799',
 '38817773',
 '38816721',
 '38814572',
 '38812088',
 '38810498',
 '38810475',
 '38810206',
 '38806945',
 '38806794',
 '38806403',
 '38801706',
 '38801658',
 '38800693',
 '38800264',
 '38798420',
 '38798194',
 '38796352',

In [None]:
# Write the article table to papers_2021_2024.csv in the current working directory.
# index=True also writes the row index as the first, unnamed column.
df_article.to_csv('papers_2021_2024.csv', index=True)