# Extract a dataframe of PubMed articles from efetch XML results

In [1]:
import collections
import itertools
import pathlib
import pandas
from pubmedpy.xml import iter_extract_elems
from pubmedpy.efetch import extract_all

In [2]:
# Lazily stream every <PubmedArticle> element from the xz-compressed efetch
# XML files, visiting the files in sorted path order.
efetch_paths = sorted(pathlib.Path('data/pubmed/efetch').glob('*.xml.xz'))
article_elems = itertools.chain.from_iterable(
    iter_extract_elems(path, tag='PubmedArticle') for path in efetch_paths
)

In [3]:
# Run extract_all on each <PubmedArticle> element, collecting one result per article.
# Example efetch XML for <PubmedArticle> at https://github.com/dhimmel/pubmedpy/blob/f554a06e13e24d661dc5ff93ad07179fb3d7f0af/pubmedpy/data/efetch.xml
articles = [extract_all(elem) for elem in article_elems]
len(articles)

29795

In [4]:
# Assemble the article table: one row per extracted article, with PMIDs cast
# to integers and rows ordered by PMID.
article_df = (
    pandas.DataFrame(articles)
    .assign(pmid=lambda frame: frame['pmid'].astype(int))
    .sort_values('pmid')
)
# Disabled derivations of last-author and author-count columns, kept for reference:
# article_df['last_author'] = article_df.authors.map(lambda x: '{fore_name} {last_name}'.format(**x[-1]) if x else None)
# article_df['last_author_fore_name'] = article_df.authors.map(lambda x: x[-1]['fore_name'] if x else None)
# article_df['last_author_last_name'] = article_df.authors.map(lambda x: x[-1]['last_name'] if x else None)
# article_df['n_authors'] = article_df.authors.map(len)
# Preview a reproducible (seeded) random sample of 20 articles, shown in PMID order.
article_df.sample(n=20, random_state=0).sort_values('pmid')

Unnamed: 0,pmid,pmcid,doi,journal,journal_nlm_id,title,publication_date,authors
9427,9545447,,10.1093/bioinformatics/14.2.151,Bioinformatics,9808944,A major component approach to presenting conse...,1998,"[{'fore_name': 'D K', 'last_name': 'Smith', 'a..."
11780,15593406,PMC2390821,10.1093/bioinformatics/btg218,Bioinformatics,9808944,FindGDPs: identification of primers for labeli...,2003-09-01,"[{'fore_name': 'Robert J', 'last_name': 'Blick..."
12616,16500942,,10.1093/bioinformatics/btl053,Bioinformatics,9808944,Algorithm to find gene expression profiles of ...,2006-02-24,"[{'fore_name': 'C', 'last_name': 'Prieto', 'af..."
838,16549017,PMC1525208,10.1186/1471-2105-7-160,BMC Bioinformatics,100965194,More robust detection of motifs in coexpressed...,2006-03-20,"[{'fore_name': 'Pieter', 'last_name': 'Monsieu..."
12677,16601005,,10.1093/bioinformatics/btl120,Bioinformatics,9808944,Analysing the ability to retain sidechain hydr...,2006-04-06,"[{'fore_name': 'Alison L', 'last_name': 'Cuff'..."
998,16790041,PMC1559650,10.1186/1471-2105-7-310,BMC Bioinformatics,100965194,Docking protein domains in contact space.,2006-06-21,"[{'fore_name': 'Stefano', 'last_name': 'Lise',..."
14486,19153134,PMC2647831,10.1093/bioinformatics/btp024,Bioinformatics,9808944,FrameDP: sensitive peptide detection on noisy ...,2009-01-19,"[{'fore_name': 'Jérôme', 'last_name': 'Gouzy',..."
15143,20031974,PMC3716225,10.1093/bioinformatics/btp706,Bioinformatics,9808944,BRAT: bisulfite-treated reads analysis tool.,2009-12-22,"[{'fore_name': 'Elena Y', 'last_name': 'Harris..."
15672,20861031,,10.1093/bioinformatics/btq537,Bioinformatics,9808944,MOBI: a web server to define and visualize str...,2010-09-21,"[{'fore_name': 'Alberto J M', 'last_name': 'Ma..."
4618,22151178,PMC3269943,10.1186/1471-2105-12-s8-s8,BMC Bioinformatics,100965194,Benchmarking of the 2010 BioCreative Challenge...,2011-10-03,"[{'fore_name': 'Andrew', 'last_name': 'Chatr-A..."


In [5]:
# Build one row per (article, author) pair. `position` counts from the first
# author (1-based) and `reverse_position` counts from the last author (1 = last).
author_rows = []
for pmid, authors in zip(article_df['pmid'], article_df['authors']):
    n_authors = len(authors)
    for position, author in enumerate(authors, start=1):
        row = {
            'pmid': pmid,
            'position': position,
            **author,
            'reverse_position': n_authors - position + 1,
        }
        # Affiliations are left out of the author table.
        del row['affiliations']
        author_rows.append(row)
author_df = pandas.DataFrame(author_rows).sort_values(['pmid', 'position'])
author_df.head()

Unnamed: 0,pmid,position,fore_name,last_name,reverse_position
0,9520496,1,B A,Eckman,7
1,9520496,2,J S,Aaronson,6
2,9520496,3,J A,Borkowski,5
3,9520496,4,W J,Bailey,4
4,9520496,5,K O,Elliston,3


In [6]:
# Save the article table as a tab-separated file, keeping only the scalar
# columns listed below (the authors and journal_nlm_id columns are not written).
article_columns = ['pmid', 'pmcid', 'doi', 'journal', 'publication_date', 'title']
article_df[article_columns].to_csv('data/pubmed/articles.tsv.xz', sep='\t', index=False)

# Save the per-author table as a tab-separated file.
author_df.to_csv('data/pubmed/authors.tsv.xz', sep='\t', index=False)

In [7]:
# Number of PubMed articles per journal
article_df['journal'].value_counts()

Bioinformatics        13877
BMC Bioinformatics     9409
PLoS Comput Biol       6509
Name: journal, dtype: int64

In [8]:
# Cross-tabulate whether each article has a PMCID (rows) against its journal (columns)
has_pmcid = article_df['pmcid'].notna()
pandas.crosstab(index=has_pmcid, columns=article_df['journal'])

journal,BMC Bioinformatics,Bioinformatics,PLoS Comput Biol
pmcid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
False,20,8794,18
True,9389,5083,6491


In [9]:
# Articles whose publication_date is missing
missing_date = article_df['publication_date'].isna()
article_df.loc[missing_date]

Unnamed: 0,pmid,pmcid,doi,journal,journal_nlm_id,title,publication_date,authors
9607,10487860,,10.1093/bioinformatics/15.7.528,Bioinformatics,9808944,Evaluation of human-readable annotation in bio...,,"[{'fore_name': 'F', 'last_name': 'Eisenhaber',..."
9608,10487861,,10.1093/bioinformatics/15.7.536,Bioinformatics,9808944,Complete genomes in WWW Entrez: data represent...,,"[{'fore_name': 'T A', 'last_name': 'Tatusova',..."
9609,10487862,,10.1093/bioinformatics/15.7.544,Bioinformatics,9808944,Development of the receptor database (RDB): ap...,,"[{'fore_name': 'K', 'last_name': 'Nakata', 'af..."
9610,10487863,,10.1093/bioinformatics/15.7.553,Bioinformatics,9808944,Regression trees for analysis of mutational sp...,,"[{'fore_name': 'V B', 'last_name': 'Berikov', ..."
9611,10487864,,10.1093/bioinformatics/15.7.563,Bioinformatics,9808944,Identifying DNA and protein patterns with stat...,,"[{'fore_name': 'G Z', 'last_name': 'Hertz', 'a..."
9612,10487865,,10.1093/bioinformatics/15.7.578,Bioinformatics,9808944,Analysis of base-pairing potentials between 16...,,"[{'fore_name': 'Y', 'last_name': 'Osada', 'aff..."
9613,10487866,,10.1093/bioinformatics/15.7.582,Bioinformatics,9808944,Nucleosomal DNA property database.,,"[{'fore_name': 'V G', 'last_name': 'Levitsky',..."
9614,10487867,,10.1093/bioinformatics/15.7.593,Bioinformatics,9808944,Genetic control of flower morphogenesis in Ara...,,"[{'fore_name': 'L', 'last_name': 'Mendoza', 'a..."
9615,10487868,,10.1093/bioinformatics/15.7.607,Bioinformatics,9808944,SCPD: a promoter database of the yeast Sacchar...,,"[{'fore_name': 'J', 'last_name': 'Zhu', 'affil..."
9616,10487869,,10.1093/bioinformatics/15.7.612,Bioinformatics,9808944,GeneBuilder: interactive in silico prediction ...,,"[{'fore_name': 'L', 'last_name': 'Milanesi', '..."


In [10]:
# Article counts by publication_date string length, used as a proxy for date
# precision (e.g. 4 characters for a bare year such as '1998', 10 for a full
# date such as '2003-09-01'). Missing dates are not counted.
date_lengths = article_df['publication_date'].str.len()
date_lengths.value_counts()

10.0    28463
7.0      1002
4.0       312
Name: publication_date, dtype: int64