# Extract a dataframe of PubMed articles from efetch XML results

In [1]:
import collections
import itertools
import pathlib
import pandas
from pubmedpy.xml import iter_extract_elems
from pubmedpy.efetch import extract_all

In [2]:
# Lazily stream XML PubmedArticle elements from each efetch `*.xml.xz` file,
# visiting the files in sorted path order
efetch_paths = sorted(pathlib.Path('data/pubmed/efetch').glob('*.xml.xz'))
article_elems = itertools.chain.from_iterable(
    iter_extract_elems(efetch_path, tag='PubmedArticle')
    for efetch_path in efetch_paths
)

In [3]:
# Convert every PubmedArticle element into a record using extract_all.
# Example efetch XML for <PubmedArticle> at https://github.com/dhimmel/pubmedpy/blob/f554a06e13e24d661dc5ff93ad07179fb3d7f0af/pubmedpy/data/efetch.xml
articles = [extract_all(article_elem) for article_elem in article_elems]
len(articles)

29755

In [4]:
# Build the article table with integer PMIDs, ordered by PMID
article_df = (
    pandas.DataFrame(articles)
    .assign(pmid=lambda df: df.pmid.astype(int))
    .sort_values('pmid')
)
# Preview a reproducible random sample of 20 articles, displayed in PMID order
article_df.sample(n=20, random_state=0).sort_values('pmid')

Unnamed: 0,pmid,pmcid,doi,journal,journal_nlm_id,title,publication_date,authors
10363,12386006,,10.1093/bioinformatics/18.suppl_2.s219,Bioinformatics,9808944,"BioMiner--modeling, analyzing, and visualizing...",2002,"[{'fore_name': 'M', 'last_name': 'Sirava', 'af..."
11179,14990457,,10.1093/bioinformatics/btg461,Bioinformatics,9808944,Support vector machine classification on the web.,2004-01-22,"[{'fore_name': 'Paul', 'last_name': 'Pavlidis'..."
11522,15284101,,10.1093/bioinformatics/bth439,Bioinformatics,9808944,Primer design and marker clustering for multip...,2004-07-29,"[{'fore_name': 'Anton', 'last_name': 'Yuryev',..."
13972,18434343,,10.1093/bioinformatics/btn200,Bioinformatics,9808944,A global pathway crosstalk network.,2008-04-23,"[{'fore_name': 'Yong', 'last_name': 'Li', 'aff..."
3002,19732451,PMC2746223,10.1186/1471-2105-10-280,BMC Bioinformatics,100965194,EDGE(3): a web-based solution for management a...,2009-09-04,"[{'fore_name': 'Aaron L', 'last_name': 'Vollra..."
3566,20429880,PMC2882390,10.1186/1471-2105-11-215,BMC Bioinformatics,100965194,PoGO: Prediction of Gene Ontology terms for fu...,2010-04-29,"[{'fore_name': 'Jaehee', 'last_name': 'Jung', ..."
15775,21088027,PMC3018811,10.1093/bioinformatics/btq644,Bioinformatics,9808944,GPU-BLAST: using graphics processors to accele...,2010-11-18,"[{'fore_name': 'Panagiotis D', 'last_name': 'V..."
4085,21235786,PMC3025837,10.1186/1471-2105-12-21,BMC Bioinformatics,100965194,NClassG+: A classifier for non-classically sec...,2011-01-14,"[{'fore_name': 'Daniel', 'last_name': 'Restrep..."
4138,21342552,PMC3044277,10.1186/1471-2105-12-s1-s22,BMC Bioinformatics,100965194,Motif-All: discovering all phosphorylation mot...,2011-02-15,"[{'fore_name': 'Zengyou', 'last_name': 'He', '..."
25269,22496629,PMC3320577,10.1371/journal.pcbi.1002443,PLoS Comput Biol,101238922,Replication fork polarity gradients revealed b...,2012-04-05,"[{'fore_name': 'Antoine', 'last_name': 'Baker'..."


In [5]:
# Flatten the per-article author lists into two sets of rows: one row per
# author, and one row per (author, affiliation) pair
author_rows = []
affiliation_rows = []
for pmid, authors in zip(article_df.pmid, article_df.authors):
    n_authors = len(authors)
    for position, author in enumerate(authors, start=1):
        # Keys shared by the author row and each of its affiliation rows
        author_key = {'pmid': pmid, 'position': position}
        row = {**author_key, **author}
        # Position counted from the end of the author list (last author is 1)
        row['reverse_position'] = n_authors - position + 1
        # Affiliations go in their own table, so remove them from the author row
        affiliations = row.pop('affiliations')
        affiliation_rows.extend(
            {**author_key, "affiliation": affiliation}
            for affiliation in affiliations
        )
        author_rows.append(row)
author_df = pandas.DataFrame(author_rows).sort_values(['pmid', 'position'])
author_df.head()

Unnamed: 0,pmid,position,fore_name,last_name,reverse_position
0,9520496,1,B A,Eckman,7
1,9520496,2,J S,Aaronson,6
2,9520496,3,J A,Borkowski,5
3,9520496,4,W J,Bailey,4
4,9520496,5,K O,Elliston,3


In [6]:
# One row per (pmid, author position, affiliation), ordered by article then author
affiliation_df = (
    pandas.DataFrame(affiliation_rows)
    .sort_values(by=['pmid', 'position'])
)
affiliation_df.head()

Unnamed: 0,pmid,position,affiliation
0,9520496,1,"Department of Bioinformatics, Merck Research L..."
1,9520497,1,"Department of Computer Science, University of ..."
2,9520498,1,"GIS Infobiogen, Villejuif, France."
3,9520499,1,Wadsworth Center for Laboratories and Research...
4,9520500,1,"Institute for Biomedical Computing, Washington..."


In [7]:
# Count the distinct affiliation strings
affiliation_df['affiliation'].nunique()

50818

In [8]:
# Export the article table to TSV, writing only the listed columns
article_df.to_csv(
    'data/pubmed/articles.tsv.xz',
    sep='\t',
    index=False,
    columns=['pmid', 'pmcid', 'doi', 'journal', 'publication_date', 'title'],
)

# Export the author table to TSV
author_df.to_csv(
    'data/pubmed/authors.tsv.xz',
    sep='\t',
    index=False,
)

# Export the affiliation table to TSV
affiliation_df.to_csv(
    'data/pubmed/affiliations.tsv.xz',
    sep='\t',
    index=False,
)

In [9]:
# Number of PubMed articles per journal
article_df['journal'].value_counts()

Bioinformatics        13837
BMC Bioinformatics     9409
PLoS Comput Biol       6509
Name: journal, dtype: int64

In [10]:
# PubMed articles in PMC by journal: rows indicate whether a PMCID is present
pandas.crosstab(
    index=article_df['pmcid'].notna(),
    columns=article_df['journal'],
)

journal,BMC Bioinformatics,Bioinformatics,PLoS Comput Biol
pmcid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
False,20,8753,18
True,9389,5084,6491


In [11]:
# Articles whose publication_date is missing
article_df.loc[article_df['publication_date'].isna()]

Unnamed: 0,pmid,pmcid,doi,journal,journal_nlm_id,title,publication_date,authors
9607,10487860,,10.1093/bioinformatics/15.7.528,Bioinformatics,9808944,Evaluation of human-readable annotation in bio...,,"[{'fore_name': 'F', 'last_name': 'Eisenhaber',..."
9608,10487861,,10.1093/bioinformatics/15.7.536,Bioinformatics,9808944,Complete genomes in WWW Entrez: data represent...,,"[{'fore_name': 'T A', 'last_name': 'Tatusova',..."
9609,10487862,,10.1093/bioinformatics/15.7.544,Bioinformatics,9808944,Development of the receptor database (RDB): ap...,,"[{'fore_name': 'K', 'last_name': 'Nakata', 'af..."
9610,10487863,,10.1093/bioinformatics/15.7.553,Bioinformatics,9808944,Regression trees for analysis of mutational sp...,,"[{'fore_name': 'V B', 'last_name': 'Berikov', ..."
9611,10487864,,10.1093/bioinformatics/15.7.563,Bioinformatics,9808944,Identifying DNA and protein patterns with stat...,,"[{'fore_name': 'G Z', 'last_name': 'Hertz', 'a..."
9612,10487865,,10.1093/bioinformatics/15.7.578,Bioinformatics,9808944,Analysis of base-pairing potentials between 16...,,"[{'fore_name': 'Y', 'last_name': 'Osada', 'aff..."
9613,10487866,,10.1093/bioinformatics/15.7.582,Bioinformatics,9808944,Nucleosomal DNA property database.,,"[{'fore_name': 'V G', 'last_name': 'Levitsky',..."
9614,10487867,,10.1093/bioinformatics/15.7.593,Bioinformatics,9808944,Genetic control of flower morphogenesis in Ara...,,"[{'fore_name': 'L', 'last_name': 'Mendoza', 'a..."
9615,10487868,,10.1093/bioinformatics/15.7.607,Bioinformatics,9808944,SCPD: a promoter database of the yeast Sacchar...,,"[{'fore_name': 'J', 'last_name': 'Zhu', 'affil..."
9616,10487869,,10.1093/bioinformatics/15.7.612,Bioinformatics,9808944,GeneBuilder: interactive in silico prediction ...,,"[{'fore_name': 'L', 'last_name': 'Milanesi', '..."


In [12]:
# Article counts by publication-date precision, using string length as a proxy
# (e.g. 4 for '2002', 10 for '2004-01-22'); value_counts omits missing dates
article_df['publication_date'].str.len().value_counts()

10.0    28423
7.0      1002
4.0       312
Name: publication_date, dtype: int64