# Extract a dataframe of PubMed articles from efetch & esummary XML results

In [1]:
import collections
import itertools
import pathlib
import pandas
from pubmedpy.xml import iter_extract_elems
from pubmedpy.efetch import extract_all
from pubmedpy.esummary import parse_esummary_article_info

### Load incoming citation counts from esummary records

In [2]:
# Parse every <DocSum> element of the esummary XML into one record per article.
# Example esummary XML for <DocSum> at https://github.com/dhimmel/pubmedpy/blob/f554a06e13e24d661dc5ff93ad07179fb3d7f0af/pubmedpy/data/esummary.xml
# iterable of XML DocSum elements (not PubmedArticle elements: those come from efetch)
docsum_elems = iter_extract_elems('data/pubmed/esummary/compbio-english.xml.xz', tag='DocSum')
docsums = [parse_esummary_article_info(elem) for elem in docsum_elems]
len(docsums)

179656

In [3]:
# Reduce the esummary records to the PubMed ID and the PMC incoming-citation count.
# `pubmed_id` is renamed to `pmid`, the column name the efetch article records use.
citation_columns = ["pmid", "pmc_cited_by_count"]
citation_count_df = pandas.DataFrame(docsums).rename(columns={"pubmed_id": "pmid"})
citation_count_df = citation_count_df.loc[:, citation_columns]
citation_count_df.head(2)

Unnamed: 0,pmid,pmc_cited_by_count
0,7477412,2
1,7479891,4


### Load article information from efetch records

In [4]:
# Example efetch XML for <PubmedArticle> at https://github.com/dhimmel/pubmedpy/blob/f554a06e13e24d661dc5ff93ad07179fb3d7f0af/pubmedpy/data/efetch.xml
# iterable of XML PubmedArticle elements
article_elems = iter_extract_elems('data/pubmed/efetch/compbio-english.xml.xz', tag='PubmedArticle')
# apply extract_all to each element, collecting one record per article
articles = list(map(extract_all, article_elems))
len(articles)

179656

In [5]:
# Build the article table from the efetch records and attach citation counts.
article_df = (
    pandas.DataFrame(articles)
    # cast pmid to int; presumably so it matches the integer pmid in citation_count_df
    .astype({"pmid": int})
    # left join keeps every efetch article even when no esummary record matches;
    # the key is named explicitly so the join cannot silently pick up extra shared columns
    .merge(citation_count_df, on="pmid", how="left")
    .sort_values("pmid")
)
# reproducible sample of rows for display
article_df.sample(n=20, random_state=0).sort_values('pmid')

Unnamed: 0,pmid,pmcid,doi,journal,journal_nlm_id,title,publication_date,authors,pmc_cited_by_count
2211,11448876,,10.1093/bioinformatics/17.7.581,Bioinformatics,9808944,LDB2000: sequence-based integrated maps of the...,2001-07,"[{'fore_name': 'X', 'last_name': 'Ke', 'affili...",6
9983,15266247,PMC1395763,,MedGenMed,100894134,Conference report--genomics: the promise of SN...,2004-04-19,"[{'fore_name': 'Sara M', 'last_name': 'Mariani...",0
15488,16094372,,10.1038/nature03869,Nature,0410462,Identification of JAK/STAT signalling componen...,2005-08-11,"[{'fore_name': 'Patrick', 'last_name': 'Müller...",133
19362,16615769,,10.1021/ac051639u,Anal Chem,0370536,Correlation and convolution analysis of peptid...,2006-04-15,"[{'fore_name': 'Matthew J', 'last_name': 'Snia...",1
51674,20566863,PMC2901460,10.1073/pnas.1003379107,Proc Natl Acad Sci U S A,7505876,Genome sequences of the human body louse and i...,2010-06-21,"[{'fore_name': 'Ewen F', 'last_name': 'Kirknes...",171
61206,21569439,PMC3107185,10.1186/1471-2164-12-237,BMC Genomics,100965258,Genome-level homology and phylogeny of Shewane...,2011-05-12,"[{'fore_name': 'Rebecca B', 'last_name': 'Diko...",16
73946,22761568,PMC3386159,10.1371/journal.pcbi.1002590,PLoS Comput Biol,101238922,Learning with slight forgetting optimizes sens...,2012-06-28,"[{'fore_name': 'Masaya', 'last_name': 'Hirashi...",4
83154,23576030,,10.1002/elps.201200710,Electrophoresis,8204476,Phosphoproteomics--more than meets the eye.,2013-05-14,"[{'fore_name': 'Stefan', 'last_name': 'Loroch'...",8
89965,24187136,PMC3861660,10.1074/jbc.m113.507285,J Biol Chem,2985121R,Inhibition of mitochondrial pyruvate transport...,2013-11-01,"[{'fore_name': 'Jianhai', 'last_name': 'Du', '...",29
92226,24405844,PMC3890554,10.1186/1471-2350-15-6,BMC Med Genet,100968552,A novel MIP gene mutation associated with auto...,2014-01-09,"[{'fore_name': 'Yibo', 'last_name': 'Yu', 'aff...",13


In [6]:
# Flatten the per-article `authors` lists into two record lists:
#   author_rows      - one row per (article, author), affiliations removed
#   affiliation_rows - one row per (article, author position, affiliation)
author_rows = []
affiliation_rows = []
for pmid, authors in zip(article_df['pmid'], article_df['authors']):
    n_authors = len(authors)
    for position, author in enumerate(authors, start=1):
        # identifying fields shared by the author row and its affiliation rows
        key = {'pmid': pmid, 'position': position}
        # reverse_position counts from the end: the last author gets 1
        record = {**key, **author, 'reverse_position': n_authors - position + 1}
        # affiliations go to their own table rather than staying in the author row
        affiliations = record.pop('affiliations')
        affiliation_rows.extend(
            {**key, "affiliation": affiliation} for affiliation in affiliations
        )
        author_rows.append(record)
author_df = pandas.DataFrame(author_rows).sort_values(['pmid', 'position'])
author_df.head()

Unnamed: 0,pmid,position,fore_name,last_name,reverse_position
0,7477412,1,A H,Samad,10
1,7477412,2,W W,Cai,9
2,7477412,3,X,Hu,8
3,7477412,4,B,Irvin,7
4,7477412,5,J,Jing,6


In [7]:
# One row per (pmid, author position, affiliation), ordered by article then author position
affiliation_df = (
    pandas.DataFrame(affiliation_rows)
    .sort_values(['pmid', 'position'])
)
affiliation_df.head()

Unnamed: 0,pmid,position,affiliation
0,7477412,1,"Dept. of Pathology, Cornell Medical College, N..."
1,7479891,1,"National Center for Human Genome Research, Nat..."
2,7479895,1,"National Center for Human Genome Research, Nat..."
3,7497116,1,"Sandia National Labs, Albuquerque, New Mexico ..."
4,7497128,1,"Sandia National Laboratories, Albuquerque, NM ..."


In [8]:
# Count of distinct affiliation strings across all authors
affiliation_df['affiliation'].nunique()

393612

In [9]:
# Export the article, author, and affiliation tables as tab-separated files.
# The article export is limited to the columns listed here; authors and
# affiliations are written in full from their own dataframes.
article_columns = ['pmid', 'pmcid', 'doi', 'journal', 'publication_date', 'pmc_cited_by_count', 'title']
exports = {
    'data/pubmed/articles.tsv.xz': article_df[article_columns],
    'data/pubmed/authors.tsv.xz': author_df,
    'data/pubmed/affiliations.tsv.xz': affiliation_df,
}
for path, frame in exports.items():
    # index=False: the row index is not written to the file
    frame.to_csv(path, sep='\t', index=False)

In [10]:
# Number of PubMed articles per journal (all articles, regardless of PMC availability)
article_df['journal'].value_counts()

PLoS One                           7755
Methods Mol Biol                   5440
Bioinformatics                     5073
J Proteome Res                     4433
BMC Genomics                       4086
                                   ... 
Int J Drug Policy                     1
J Sex Res                             1
Int J Nurs Stud                       1
United European Gastroenterol J       1
Gerodontology                         1
Name: journal, Length: 4300, dtype: int64

In [11]:
# Per-journal article counts, split by whether the article has a PMC ID
# (row True = pmcid present, row False = pmcid missing)
pandas.crosstab(index=article_df['pmcid'].notna(), columns=article_df['journal'])

journal,A A Case Rep,AACN Adv Crit Care,AACN Clin Issues,AAOHN J,AAPS J,AAPS PharmSci,ABNF J,ACS Appl Mater Interfaces,ACS Chem Biol,ACS Chem Neurosci,...,Zoo Biol,Zool Res,Zoolog Sci,Zoology (Jena),Zoonoses Public Health,Zootaxa,Zygote,eNeuro,mBio,mSphere
pmcid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
False,1,6,1,1,5,1,0,11,89,8,...,2,0,15,1,2,9,2,0,0,0
True,0,0,1,0,24,2,1,1,84,17,...,0,4,0,0,0,0,0,8,204,37


In [12]:
# Rows whose publication_date is missing
lacks_publication_date = article_df['publication_date'].isna()
article_df.loc[lacks_publication_date]

Unnamed: 0,pmid,pmcid,doi,journal,journal_nlm_id,title,publication_date,authors,pmc_cited_by_count
7,7513103,,,Turk J Pediatr,0417505,"Child health, the genome project and phenylket...",,"[{'fore_name': 'C R', 'last_name': 'Scriver', ...",0
34,7764291,,,Australas Biotechnol,9113681,Some ethical issues in genetic medicine.,,"[{'fore_name': 'N A', 'last_name': 'Tonti-Fili...",0
35,7764293,,,Australas Biotechnol,9113681,Legal rights and genetic engineering.,,"[{'fore_name': 'N', 'last_name': 'Stoianoff', ...",0
54,7856962,,,Am Nurse,7506499,Survey assesses RN management of genetic infor...,,"[{'fore_name': 'C', 'last_name': 'Scanlon', 'a...",0
62,7965253,,10.1111/j.1552-6909.1994.tb01909.x,J Obstet Gynecol Neonatal Nurs,8503123,The genome project.,,"[{'fore_name': 'B S', 'last_name': 'Raff', 'af...",0
...,...,...,...,...,...,...,...,...,...
178813,31808361,PMC6900616,10.1177/1533033819892260,Technol Cancer Res Treat,101140941,Bioinformatics Analysis of Expression and Alte...,,"[{'fore_name': 'Yong-Zi', 'last_name': 'Chen',...",0
179500,31898667,,10.4103/jcrt.jcrt_866_18,J Cancer Res Ther,101249598,Regulation of HMGA2 and KRAS genes in epitheli...,,"[{'fore_name': 'Tuba', 'last_name': 'Gunel', '...",1
179614,32053766,,,Discov Med,101250006,Translating cancer genomics for precision onco...,,"[{'fore_name': 'Philipp K', 'last_name': 'Habe...",0
179647,32421967,,10.1615/critrevimmunol.2019033126,Crit Rev Immunol,8914819,"Pathophysiology, Etiology, Epidemiology of Typ...",,"[{'fore_name': 'Begum', 'last_name': 'Dariya',...",0


In [13]:
# Article counts (not proportions) by date precision, measured as the length of the
# publication_date string: e.g. 10 characters for YYYY-MM-DD and 7 for YYYY-MM.
# Articles with a missing publication_date are excluded from the counts.
article_df['publication_date'].str.len().value_counts()

10.0    138089
7.0      24915
4.0      13784
Name: publication_date, dtype: int64