# Extract a dataframe of PubMed articles from efetch XML results

In [1]:
import collections
import itertools
import pathlib
import pandas
from pubmedpy.xml import iter_extract_elems
from pubmedpy.efetch import extract_all

In [2]:
# Build a single iterator over the XML PubmedArticle elements of every
# efetch file, visiting the compressed files in sorted path order.
efetch_paths = sorted(pathlib.Path('data/pubmed/efetch').glob('*.xml.xz'))
article_elems = itertools.chain.from_iterable(
    iter_extract_elems(xml_path, tag='PubmedArticle') for xml_path in efetch_paths
)

In [3]:
# Apply extract_all to each <PubmedArticle> element and keep every result.
# Example efetch XML for <PubmedArticle> at https://github.com/dhimmel/pubmedpy/blob/f554a06e13e24d661dc5ff93ad07179fb3d7f0af/pubmedpy/data/efetch.xml
articles = [extract_all(article_elem) for article_elem in article_elems]
len(articles)

29624

In [4]:
# Tabulate the extracted articles, cast PMIDs to integers, and order rows by PMID
article_df = (
    pandas.DataFrame(articles)
    .astype({'pmid': int})
    .sort_values('pmid')
)
# Disabled derived author columns, kept for reference:
#article_df['last_author'] = article_df.authors.map(lambda x: '{fore_name} {last_name}'.format(**x[-1]) if x else None)
# article_df['last_author_fore_name'] = article_df.authors.map(lambda x: x[-1]['fore_name'] if x else None)
# article_df['last_author_last_name'] = article_df.authors.map(lambda x: x[-1]['last_name'] if x else None)
# article_df['n_authors'] = article_df.authors.map(len)
# Preview a fixed-seed random sample of 20 articles, shown in PMID order
article_df.sample(n=20, random_state=0).sort_values('pmid')

Unnamed: 0,pmid,pmcid,doi,journal,journal_nlm_id,title,publication_date,authors
9647,11159308,,10.1093/bioinformatics/16.11.968,Bioinformatics,9808944,Short interrupted palindromes on the extrageni...,2000-11,"[{'fore_name': 'A T', 'last_name': 'Vasconcelo..."
11357,15284100,,10.1093/bioinformatics/bth440,Bioinformatics,9808944,Accurate detection of aneuploidies in array CG...,2004-07-29,"[{'fore_name': 'Chad L', 'last_name': 'Myers',..."
11827,15802286,,10.1093/bioinformatics/bti420,Bioinformatics,9808944,A comprehensive and non-redundant database of ...,2005-03-31,"[{'fore_name': 'Guoying', 'last_name': 'Qi', '..."
23353,17319737,PMC1808025,10.1371/journal.pcbi.0030020,PLoS Comput Biol,101238922,Improving the Caenorhabditis elegans genome an...,2006-12-21,"[{'fore_name': 'Gunnar', 'last_name': 'Rätsch'..."
13614,18056063,,10.1093/bioinformatics/btm592,Bioinformatics,9808944,Model-based deconvolution of genome-wide DNA b...,2007-12-01,"[{'fore_name': 'David J', 'last_name': 'Reiss'..."
2944,19615046,PMC2720391,10.1186/1471-2105-10-222,BMC Bioinformatics,100965194,PCI-SS: MISO dynamic nonlinear protein seconda...,2009-07-17,"[{'fore_name': 'James R', 'last_name': 'Green'..."
14894,19880367,PMC2796817,10.1093/bioinformatics/btp615,Bioinformatics,9808944,inGAP: an integrated next-generation genome an...,2009-10-30,"[{'fore_name': 'Ji', 'last_name': 'Qi', 'affil..."
3444,20175920,PMC3098051,10.1186/1471-2105-11-100,BMC Bioinformatics,100965194,"DraGnET: software for storing, managing and an...",2010-02-22,"[{'fore_name': 'Stacy', 'last_name': 'Duncan',..."
3707,20576157,PMC2911458,10.1186/1471-2105-11-347,BMC Bioinformatics,100965194,Local alignment of generalized k-base encoded ...,2010-06-24,"[{'fore_name': 'Nils', 'last_name': 'Homer', '..."
4320,21612626,PMC3123608,10.1186/1471-2105-12-201,BMC Bioinformatics,100965194,CAPL: an efficient association software packag...,2011-05-25,"[{'fore_name': 'Ren-Hua', 'last_name': 'Chung'..."


In [5]:
# Flatten each article's author list into one row per (article, author).
author_rows = []
for pmid, authors in zip(article_df['pmid'], article_df['authors']):
    n_authors = len(authors)
    for position, author in enumerate(authors, start=1):
        # position counts from the first author (1-based);
        # reverse_position counts from the last author (last author == 1)
        record = {'pmid': pmid, 'position': position, **author}
        record['reverse_position'] = n_authors - position + 1
        # affiliations are not kept in the author table
        del record['affiliations']
        author_rows.append(record)
author_df = pandas.DataFrame(author_rows).sort_values(['pmid', 'position'])
author_df.head()

Unnamed: 0,pmid,position,fore_name,last_name,reverse_position
0,10068688,1,C,Médigue,4
1,10068688,2,F,Rechenmann,3
2,10068688,3,A,Danchin,2
3,10068688,4,A,Viari,1
4,10068689,1,W,Kasprzak,2


In [6]:
# Export a selected subset of article columns as a tab-separated file
# (pandas infers xz compression from the '.xz' suffix of the path)
article_columns = ['pmid', 'pmcid', 'doi', 'journal', 'publication_date', 'title']
article_df[article_columns].to_csv('data/pubmed/articles.tsv.xz', sep='\t', index=False)
# Export the one-row-per-author table the same way
author_df.to_csv('data/pubmed/authors.tsv.xz', sep='\t', index=False)

In [7]:
# Number of PubMed articles per journal (all articles, regardless of PMC status)
article_df['journal'].value_counts()

Bioinformatics        13743
BMC Bioinformatics     9372
PLoS Comput Biol       6509
Name: journal, dtype: int64

In [8]:
# Articles with (True) and without (False) a PMCID, broken down by journal
pandas.crosstab(index=article_df['pmcid'].notna(), columns=article_df['journal'])

journal,BMC Bioinformatics,Bioinformatics,PLoS Comput Biol
pmcid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
False,99,8671,51
True,9273,5072,6458


In [9]:
# Articles whose publication_date is missing
article_df.loc[article_df['publication_date'].isna()]

Unnamed: 0,pmid,pmcid,doi,journal,journal_nlm_id,title,publication_date,authors
9443,10487860,,10.1093/bioinformatics/15.7.528,Bioinformatics,9808944,Evaluation of human-readable annotation in bio...,,"[{'fore_name': 'F', 'last_name': 'Eisenhaber',..."
9444,10487861,,10.1093/bioinformatics/15.7.536,Bioinformatics,9808944,Complete genomes in WWW Entrez: data represent...,,"[{'fore_name': 'T A', 'last_name': 'Tatusova',..."
9445,10487862,,10.1093/bioinformatics/15.7.544,Bioinformatics,9808944,Development of the receptor database (RDB): ap...,,"[{'fore_name': 'K', 'last_name': 'Nakata', 'af..."
9446,10487863,,10.1093/bioinformatics/15.7.553,Bioinformatics,9808944,Regression trees for analysis of mutational sp...,,"[{'fore_name': 'V B', 'last_name': 'Berikov', ..."
9447,10487864,,10.1093/bioinformatics/15.7.563,Bioinformatics,9808944,Identifying DNA and protein patterns with stat...,,"[{'fore_name': 'G Z', 'last_name': 'Hertz', 'a..."
9448,10487865,,10.1093/bioinformatics/15.7.578,Bioinformatics,9808944,Analysis of base-pairing potentials between 16...,,"[{'fore_name': 'Y', 'last_name': 'Osada', 'aff..."
9449,10487866,,10.1093/bioinformatics/15.7.582,Bioinformatics,9808944,Nucleosomal DNA property database.,,"[{'fore_name': 'V G', 'last_name': 'Levitsky',..."
9450,10487867,,10.1093/bioinformatics/15.7.593,Bioinformatics,9808944,Genetic control of flower morphogenesis in Ara...,,"[{'fore_name': 'L', 'last_name': 'Mendoza', 'a..."
9451,10487868,,10.1093/bioinformatics/15.7.607,Bioinformatics,9808944,SCPD: a promoter database of the yeast Sacchar...,,"[{'fore_name': 'J', 'last_name': 'Zhu', 'affil..."
9452,10487869,,10.1093/bioinformatics/15.7.612,Bioinformatics,9808944,GeneBuilder: interactive in silico prediction ...,,"[{'fore_name': 'L', 'last_name': 'Milanesi', '..."


In [10]:
# Article counts by publication-date precision, measured as the date string's length
# (e.g. 10 characters for '2004-07-29', 7 for '2000-11'); missing dates are not counted
article_df['publication_date'].str.len().value_counts()

10.0    28419
7.0       990
4.0       197
Name: publication_date, dtype: int64