# Extract a dataframe of PubMed articles from efetch XML results

In [1]:
import collections
import itertools
import pathlib
import pandas
from pubmedpy.xml import iter_extract_elems
from pubmedpy.efetch import extract_all

In [2]:
# Lazily stream every <PubmedArticle> element from the selected efetch files.
# Only compbio.xml.xz is read here; glob for '*.xml.xz' instead to process
# every efetch download in the directory.
article_elems = itertools.chain.from_iterable(
    map(
        lambda path: iter_extract_elems(path, tag='PubmedArticle'),
        sorted(pathlib.Path('data/pubmed/efetch').glob('compbio.xml.xz')),
    )
)

In [3]:
# Convert each <PubmedArticle> element into a record via extract_all.
# Example efetch XML for <PubmedArticle> at https://github.com/dhimmel/pubmedpy/blob/f554a06e13e24d661dc5ff93ad07179fb3d7f0af/pubmedpy/data/efetch.xml
articles = [extract_all(elem) for elem in article_elems]
len(articles)

180761

In [4]:
# Build the article table: one row per extracted record, with pmid cast to an
# integer so that sorting is numeric.
article_df = (
    pandas.DataFrame(articles)
    .assign(pmid=lambda frame: frame.pmid.astype(int))
    .sort_values('pmid')
)
# Reproducible random preview (fixed random_state), shown in pmid order
article_df.sample(n=20, random_state=0).sort_values('pmid')

Unnamed: 0,pmid,pmcid,doi,journal,journal_nlm_id,title,publication_date,authors
15852,16026357,,10.1111/j.1365-2052.2005.01324.x,Anim Genet,8605704,Assignment porcine PCK1 and PCK2 genes to SSC1...,2005-08,"[{'fore_name': 'Y', 'last_name': 'Peng', 'affi..."
26654,17418445,,10.1016/j.tig.2007.03.008,Trends Genet,8507085,Positive and negative selection on the mitocho...,2007-04-05,"[{'fore_name': 'Colin D', 'last_name': 'Meikle..."
33022,18334116,,,Ai Zheng,9424852,[Serum proteomic spectra of esophageal squamou...,2008-03,"[{'fore_name': 'Cha-Zhen', 'last_name': 'Liu',..."
33860,18428815,,10.1002/0471142700.nca04as00,Curr Protoc Nucleic Acid Chem,101287865,Useful nucleic acid chemistry web sites.,2001-05,[]
53858,20618438,,10.1111/j.1742-4658.2010.07727.x,FEBS J,101229646,N-glycosylation is important for the correct i...,2010-07-05,"[{'fore_name': 'Lavinia', 'last_name': 'Bhatt'..."
61338,21406205,,10.1016/j.jconrel.2011.03.008,J Control Release,8607908,Following dynamic biological processes through...,2011-03-22,"[{'fore_name': 'Iola F', 'last_name': 'Duarte'..."
61929,21453479,PMC3094196,10.1186/1755-8794-4-28,BMC Med Genomics,101319628,Immunological network signatures of cancer pro...,2011-03-31,"[{'fore_name': 'Trevor', 'last_name': 'Clancy'..."
88241,23806673,,10.1016/j.marpolbul.2013.05.042,Mar Pollut Bull,260231,Genomics in marine monitoring: new opportuniti...,2013-06-24,"[{'fore_name': 'Sarah J', 'last_name': 'Bourla..."
94138,24344193,,10.1093/bioinformatics/btt722,Bioinformatics,9808944,A hierarchical statistical modeling approach t...,2013-12-15,"[{'fore_name': 'Cong', 'last_name': 'Zhou', 'a..."
112634,25974149,,10.1001/jama.2015.5533,JAMA,7501160,Single molecules meet genomics: pinpointing pr...,2015-05-26,"[{'fore_name': 'Xiaoliang Sunney', 'last_name'..."


In [5]:
# Flatten the nested author records into two row lists: one row per
# (article, author) and one row per (article, author, affiliation).
author_rows = []
affiliation_rows = []
for pmid, authors in zip(article_df.pmid, article_df.authors):
    n_authors = len(authors)
    for position, author in enumerate(authors, start=1):
        key = {'pmid': pmid, 'position': position}
        # reverse_position counts from the end: the last author gets 1
        row = {**key, **author, 'reverse_position': n_authors - position + 1}
        # Affiliations are removed from the author row and stored separately,
        # keyed by the same pmid and position
        affiliation_rows.extend(
            {**key, 'affiliation': affiliation}
            for affiliation in row.pop('affiliations')
        )
        author_rows.append(row)
author_df = pandas.DataFrame(author_rows).sort_values(['pmid', 'position'])
author_df.head()

Unnamed: 0,pmid,position,fore_name,last_name,reverse_position
0,7477412,1,A H,Samad,10
1,7477412,2,W W,Cai,9
2,7477412,3,X,Hu,8
3,7477412,4,B,Irvin,7
4,7477412,5,J,Jing,6


In [6]:
# One row per (pmid, author position, affiliation), ordered by article and
# then by author position
affiliation_df = (
    pandas.DataFrame(affiliation_rows)
    .sort_values(by=['pmid', 'position'])
)
affiliation_df.head()

Unnamed: 0,pmid,position,affiliation
0,7477412,1,"Dept. of Pathology, Cornell Medical College, N..."
1,7479891,1,"National Center for Human Genome Research, Nat..."
2,7479895,1,"National Center for Human Genome Research, Nat..."
3,7480790,1,"Computation Center, Institute of Physical and ..."
4,7497116,1,"Sandia National Labs, Albuquerque, New Mexico ..."


In [7]:
# Count of distinct affiliation strings across all authors
affiliation_df['affiliation'].nunique()

386461

In [8]:
# Write the article table to TSV, limited to the listed columns
# (journal_nlm_id and the nested authors column are left out)
article_df.to_csv(
    'data/pubmed/articles.tsv.xz',
    sep='\t',
    index=False,
    columns=['pmid', 'pmcid', 'doi', 'journal', 'publication_date', 'title'],
)

# Write the per-author table to TSV
author_df.to_csv(path_or_buf='data/pubmed/authors.tsv.xz', sep='\t', index=False)

# Write the per-affiliation table to TSV
affiliation_df.to_csv(path_or_buf='data/pubmed/affiliations.tsv.xz', sep='\t', index=False)

In [9]:
# Number of PubMed articles per journal, most frequent first
article_df['journal'].value_counts()

PLoS One                            7755
Methods Mol Biol                    5440
Bioinformatics                      5064
J Proteome Res                      4319
BMC Genomics                        4086
                                    ... 
J Card Surg                            1
Arch Ital Urol Androl                  1
Skinmed                                1
Vasc Med                               1
Arch Dis Child Fetal Neonatal Ed       1
Name: journal, Length: 4547, dtype: int64

In [10]:
# Per-journal article counts, split by whether the article has a PMCID
# (True row) or not (False row)
pandas.crosstab(
    index=article_df['pmcid'].notna(),
    columns=article_df['journal'],
)

journal,A A Case Rep,AACN Adv Crit Care,AACN Clin Issues,AAOHN J,AAPS J,AAPS PharmSci,ABNF J,ACS Appl Mater Interfaces,ACS Chem Biol,ACS Chem Neurosci,...,Zoo Biol,Zool Res,Zoolog Sci,Zoology (Jena),Zoonoses Public Health,Zootaxa,Zygote,eNeuro,mBio,mSphere
pmcid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
False,1,6,1,1,5,1,0,10,89,8,...,2,0,15,1,2,9,2,0,0,0
True,0,0,1,0,24,2,1,1,84,17,...,0,4,0,0,0,0,0,8,204,37


In [11]:
# Articles whose publication_date is missing
article_df.loc[article_df['publication_date'].isna()]

Unnamed: 0,pmid,pmcid,doi,journal,journal_nlm_id,title,publication_date,authors
8,7510179,,,Vestn Ross Akad Med Nauk,9215641,[Molecular diagnosis of genetic diseases in Ru...,,"[{'fore_name': 'V S', 'last_name': 'Baranov', ..."
9,7510181,,,Vestn Ross Akad Med Nauk,9215641,"[Administration, maintenance and expression of...",,"[{'fore_name': 'V I', 'last_name': 'Ivanov', '..."
10,7513103,,,Turk J Pediatr,0417505,"Child health, the genome project and phenylket...",,"[{'fore_name': 'C R', 'last_name': 'Scriver', ..."
40,7764291,,,Australas Biotechnol,9113681,Some ethical issues in genetic medicine.,,"[{'fore_name': 'N A', 'last_name': 'Tonti-Fili..."
41,7764293,,,Australas Biotechnol,9113681,Legal rights and genetic engineering.,,"[{'fore_name': 'N', 'last_name': 'Stoianoff', ..."
...,...,...,...,...,...,...,...,...
180098,31802060,,,Hell J Nucl Med,101257471,Predicting oligonucleotide therapeutic efficac...,,"[{'fore_name': 'Pantazis I', 'last_name': 'The..."
180120,31808361,PMC6900616,10.1177/1533033819892260,Technol Cancer Res Treat,101140941,Bioinformatics Analysis of Expression and Alte...,,"[{'fore_name': 'Yong-Zi', 'last_name': 'Chen',..."
180538,31876272,,10.1134/s0026898419060156,Mol Biol (Mosk),0105454,[3D Genomics].,,"[{'fore_name': 'S V', 'last_name': 'Razin', 'a..."
180662,31898667,,10.4103/jcrt.jcrt_866_18,J Cancer Res Ther,101249598,Regulation of HMGA2 and KRAS genes in epitheli...,,"[{'fore_name': 'Tuba', 'last_name': 'Gunel', '..."


In [12]:
# Article counts by publication_date string length, a proxy for date precision
# (e.g. '2007-04-05' has length 10 and '2005-08' has length 7; length 4 is
# presumably a year-only date)
article_df['publication_date'].str.len().value_counts()

10.0    136245
7.0      27273
4.0      14032
Name: publication_date, dtype: int64