# Extract a dataframe of PubMed articles from efetch XML results

In [1]:
import collections
import itertools
import pathlib
import pandas
from pubmedpy.xml import iter_extract_elems
from pubmedpy.efetch import extract_all

In [2]:
# Stream every <PubmedArticle> element from the efetch XML files,
# visiting the files in sorted path order.
efetch_paths = sorted(pathlib.Path('data/pubmed/efetch').glob('*.xml.xz'))
article_elems = itertools.chain.from_iterable(
    iter_extract_elems(xml_path, tag='PubmedArticle') for xml_path in efetch_paths
)

In [3]:
# Turn each <PubmedArticle> element into one record with extract_all.
# Example efetch XML for <PubmedArticle> at https://github.com/dhimmel/pubmedpy/blob/f554a06e13e24d661dc5ff93ad07179fb3d7f0af/pubmedpy/data/efetch.xml
articles = [extract_all(article_elem) for article_elem in article_elems]
len(articles)

28436

In [4]:
# Build the article table: integer PMIDs, rows ordered by PMID, plus
# summary columns derived from each article's list of author records.
article_df = (
    pandas.DataFrame(articles)
    .assign(pmid=lambda df: df['pmid'].astype(int))
    .sort_values('pmid')
    .assign(
        # The last author's names are None when the author list is empty.
        last_author_fore_name=lambda df: df['authors'].map(
            lambda authors: authors[-1]['fore_name'] if authors else None
        ),
        last_author_last_name=lambda df: df['authors'].map(
            lambda authors: authors[-1]['last_name'] if authors else None
        ),
        n_authors=lambda df: df['authors'].map(len),
    )
)
# Show a reproducible random sample of rows, ordered by PMID.
article_df.sample(n=20, random_state=0).sort_values('pmid')

Unnamed: 0,pmid,pmcid,doi,journal,journal_nlm_id,title,publication_date,authors,last_author_fore_name,last_author_last_name,n_authors
67,12854978,PMC166169,10.1186/1471-2105-4-29,BMC Bioinformatics,100965194,MatGAT: an application that generates similari...,2003-07-10,"[{'fore_name': 'James J', 'last_name': 'Campan...",John,Smalley,3
10332,15073026,,10.1093/bioinformatics/bth174,Bioinformatics,9808944,ArrayNorm: comprehensive normalization and ana...,2004-04-08,"[{'fore_name': 'R', 'last_name': 'Pieler', 'af...",Z,Trajanoski,5
279,15527510,PMC533868,10.1186/1471-2105-5-176,BMC Bioinformatics,100965194,"ESTIMA, a tool for EST management in a multi-p...",2004-11-04,"[{'fore_name': 'Charu G', 'last_name': 'Kumar'...",Lei,Liu,6
330,15673474,PMC548130,10.1186/1471-2105-6-18,BMC Bioinformatics,100965194,Identification of novel prognostic markers in ...,2005-01-26,"[{'fore_name': 'Rifat A', 'last_name': 'Hamoud...",Ming-Qing,Du,3
11502,16303795,,10.1093/bioinformatics/bti793,Bioinformatics,9808944,Describing ancient horizontal gene transfers a...,2005-11-22,"[{'fore_name': 'F', 'last_name': 'Collyn', 'af...",C-A H,Roten,5
1265,17217507,PMC1780126,10.1186/1471-2105-7-s4-s15,BMC Bioinformatics,100965194,Parallelization of multicategory support vecto...,2006-12-12,"[{'fore_name': 'Chaoyang', 'last_name': 'Zhang...",Dequan,Chen,5
1574,17594507,PMC1925121,10.1186/1471-2105-8-223,BMC Bioinformatics,100965194,High-throughput identification of interacting ...,2007-06-27,"[{'fore_name': 'Jo-Lan', 'last_name': 'Chung',...",Philip E,Bourne,3
1736,17931407,PMC2148069,10.1186/1471-2105-8-380,BMC Bioinformatics,100965194,Detailed estimation of bioinformatics predicti...,2007-10-11,"[{'fore_name': 'Oliviero', 'last_name': 'Carug...",Oliviero,Carugo,1
12780,18006552,,10.1093/bioinformatics/btm522,Bioinformatics,9808944,Detecting high-order interactions of single nu...,2007-11-15,"[{'fore_name': 'Robin', 'last_name': 'Nunkesse...",Ingo,Wegener,5
22770,19008937,PMC2570617,10.1371/journal.pcbi.1000221,PLoS Comput Biol,101238922,Polar or apolar--the role of polarity for urea...,2008-11-14,"[{'fore_name': 'Martin C', 'last_name': 'Stump...",Helmut,Grubmüller,2


In [5]:
# Write the selected article columns to a tab-separated file without the index.
# The `authors` and `journal_nlm_id` columns are not exported.
export_columns = [
    'pmid',
    'pmcid',
    'doi',
    'journal',
    'publication_date',
    'n_authors',
    'last_author_fore_name',
    'last_author_last_name',
    'title',
]
article_df[export_columns].to_csv('data/pubmed/articles.tsv.xz', sep='\t', index=False)

In [6]:
# Number of PubMed articles per journal
article_df['journal'].value_counts()

Bioinformatics        12963
BMC Bioinformatics     9124
PLoS Comput Biol       6349
Name: journal, dtype: int64

In [7]:
# Cross-tabulate whether each article has a PMCID (rows) against its journal (columns)
pandas.crosstab(index=article_df['pmcid'].notna(), columns=article_df['journal'])

journal,BMC Bioinformatics,Bioinformatics,PLoS Comput Biol
pmcid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
False,25,8166,52
True,9099,4797,6297


In [8]:
# Rows whose publication_date is missing
article_df.loc[article_df['publication_date'].isna()]

Unnamed: 0,pmid,pmcid,doi,journal,journal_nlm_id,title,publication_date,authors,last_author_fore_name,last_author_last_name,n_authors


In [9]:
# Article counts by publication_date string length, used as a proxy for date precision
# (presumably 10 = year-month-day, 7 = year-month, 4 = year only; confirm against pubmedpy)
article_df['publication_date'].str.len().value_counts()

10    27767
7       510
4       159
Name: publication_date, dtype: int64