# Extract a dataframe of PubMed articles from efetch XML results

In [1]:
import collections
import itertools
import pathlib
import pandas
from pubmedpy.xml import iter_extract_elems
from pubmedpy.efetch import extract_all

In [2]:
# Lazily stream XML <PubmedArticle> elements from the efetch result files.
# Only 'compbio-english.xml.xz' is matched here; widen the glob pattern to
# '*.xml.xz' to process every efetch file in the directory.
efetch_paths = sorted(pathlib.Path('data/pubmed/efetch').glob('compbio-english.xml.xz'))
article_elems = itertools.chain.from_iterable(
    iter_extract_elems(efetch_path, tag='PubmedArticle')
    for efetch_path in efetch_paths
)

In [3]:
# Convert each <PubmedArticle> element into one extracted record.
# Example efetch XML for <PubmedArticle> at https://github.com/dhimmel/pubmedpy/blob/f554a06e13e24d661dc5ff93ad07179fb3d7f0af/pubmedpy/data/efetch.xml
articles = [extract_all(article_elem) for article_elem in article_elems]
len(articles)

177262

In [4]:
# Build the article table with integer PMIDs, ordered by PMID.
article_df = (
    pandas.DataFrame(articles)
    .assign(pmid=lambda df: df.pmid.astype(int))
    .sort_values('pmid')
)
# Reproducible preview of 20 random articles, shown in PMID order
article_df.sample(n=20, random_state=0).sort_values('pmid')

Unnamed: 0,pmid,pmcid,doi,journal,journal_nlm_id,title,publication_date,authors
7138,14580578,,10.1016/j.copbio.2003.08.001,Curr Opin Biotechnol,9100492,Advances in flux balance analysis.,2003-10,"[{'fore_name': 'Kenneth J', 'last_name': 'Kauf..."
8766,15059992,PMC383295,10.1101/gr.1984404,Genome Res,9518021,Patterns of insertions and their covariation w...,2004-04,"[{'fore_name': 'Shan', 'last_name': 'Yang', 'a..."
11513,15562315,PMC532387,10.1371/journal.pbio.0020412,PLoS Biol,101183755,In silico reconstitution of Listeria propulsio...,2004-11-30,"[{'fore_name': 'Jonathan B', 'last_name': 'Alb..."
15511,16097032,,10.1002/pmic.200401215,Proteomics,101092707,Subproteomic analysis of metal-interacting pro...,2005-09,"[{'fore_name': 'Kirsten', 'last_name': 'Heiss'..."
25883,17453953,,10.1080/02713680701215322,Curr Eye Res,8104312,Feasibility of two-dimensional gel electrophor...,2007-04,"[{'fore_name': 'Fan', 'last_name': 'Lu', 'affi..."
27922,17705782,,10.1162/evco.2007.15.3.345,Evol Comput,9513581,A new approach of data clustering using a floc...,2007,"[{'fore_name': 'Fabien', 'last_name': 'Picarou..."
32634,18436738,PMC2518900,10.1182/blood-2007-07-099432,Blood,7603509,Genomic complexity identifies patients with ag...,2008-04-24,"[{'fore_name': 'Lisa', 'last_name': 'Kujawski'..."
56354,21124818,PMC2991263,10.1371/journal.ppat.1001211,PLoS Pathog,101238921,Glycosylation focuses sequence variation in th...,2010-11-24,"[{'fore_name': 'Suman R', 'last_name': 'Das', ..."
59270,21388834,,10.1016/j.concog.2011.02.010,Conscious Cogn,9303140,Towards a computational theory of experience.,2011-03-08,"[{'fore_name': 'Tomer', 'last_name': 'Fekete',..."
72594,22635606,PMC3400962,10.1093/bioinformatics/bts315,Bioinformatics,9808944,Gowinda: unbiased analysis of gene set enrichm...,2012-05-26,"[{'fore_name': 'Robert', 'last_name': 'Kofler'..."


In [5]:
# Flatten the per-article author lists into two long tables:
# one row per (article, author) and one row per (article, author, affiliation).
author_rows = []
affiliation_rows = []
for pmid, authors in zip(article_df['pmid'], article_df['authors']):
    n_authors = len(authors)
    for position, author in enumerate(authors, start=1):
        key = {'pmid': pmid, 'position': position}
        # Merge into a new dict so the author dicts stored in article_df stay untouched
        row = {**key, **author}
        # Counted from the end of the author list: the last author gets 1
        row['reverse_position'] = n_authors - position + 1
        # Affiliations move to their own table, keyed by (pmid, position)
        affiliation_rows.extend(
            {**key, "affiliation": affiliation}
            for affiliation in row.pop('affiliations')
        )
        author_rows.append(row)
author_df = pandas.DataFrame(author_rows).sort_values(['pmid', 'position'])
author_df.head()

Unnamed: 0,pmid,position,fore_name,last_name,reverse_position
0,7477412,1,A H,Samad,10
1,7477412,2,W W,Cai,9
2,7477412,3,X,Hu,8
3,7477412,4,B,Irvin,7
4,7477412,5,J,Jing,6


In [6]:
# One row per (article, author position, affiliation), ordered by PMID then author position
affiliation_df = pandas.DataFrame(affiliation_rows).sort_values(['pmid', 'position'])
affiliation_df.head()

Unnamed: 0,pmid,position,affiliation
0,7477412,1,"Dept. of Pathology, Cornell Medical College, N..."
1,7479891,1,"National Center for Human Genome Research, Nat..."
2,7479895,1,"National Center for Human Genome Research, Nat..."
3,7497116,1,"Sandia National Labs, Albuquerque, New Mexico ..."
4,7497128,1,"Sandia National Laboratories, Albuquerque, NM ..."


In [7]:
# Count of distinct affiliation strings across all authors
affiliation_df['affiliation'].nunique()

383954

In [8]:
# Shared options: tab-separated output without the dataframe index
tsv_kwargs = {'sep': '\t', 'index': False}

# Write article dataframe to TSV (only the scalar columns listed below are exported)
article_df.to_csv(
    'data/pubmed/articles.tsv.xz',
    columns=['pmid', 'pmcid', 'doi', 'journal', 'publication_date', 'title'],
    **tsv_kwargs,
)

# Write author dataframe to TSV
author_df.to_csv('data/pubmed/authors.tsv.xz', **tsv_kwargs)

# Write affiliation dataframe to TSV
affiliation_df.to_csv('data/pubmed/affiliations.tsv.xz', **tsv_kwargs)

In [9]:
# Number of PubMed articles per journal, most frequent first
# (all articles are counted, whether or not they have a PMCID)
article_df.journal.value_counts()

PLoS One                 7755
Methods Mol Biol         5440
Bioinformatics           5073
J Proteome Res           4319
BMC Genomics             4086
                         ... 
Environ Biosafety Res       1
Surg Innov                  1
J Stem Cells                1
Pain Manag                  1
Nurs Sci Q                  1
Name: journal, Length: 4286, dtype: int64

In [10]:
# Per-journal article counts, split by whether the article has a PMCID
# (row True = has a PMCID, row False = no PMCID)
pandas.crosstab(
    index=article_df['pmcid'].notna(),
    columns=article_df['journal'],
)

journal,A A Case Rep,AACN Adv Crit Care,AACN Clin Issues,AAOHN J,AAPS J,AAPS PharmSci,ABNF J,ACS Appl Mater Interfaces,ACS Chem Biol,ACS Chem Neurosci,...,Zoo Biol,Zool Res,Zoolog Sci,Zoology (Jena),Zoonoses Public Health,Zootaxa,Zygote,eNeuro,mBio,mSphere
pmcid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
False,1,6,1,1,5,1,0,10,89,8,...,2,0,15,1,2,9,2,0,0,0
True,0,0,1,0,24,2,1,1,84,17,...,0,4,0,0,0,0,0,8,204,37


In [11]:
# Articles whose publication_date is missing
article_df.loc[article_df['publication_date'].isna()]

Unnamed: 0,pmid,pmcid,doi,journal,journal_nlm_id,title,publication_date,authors
7,7513103,,,Turk J Pediatr,0417505,"Child health, the genome project and phenylket...",,"[{'fore_name': 'C R', 'last_name': 'Scriver', ..."
34,7764291,,,Australas Biotechnol,9113681,Some ethical issues in genetic medicine.,,"[{'fore_name': 'N A', 'last_name': 'Tonti-Fili..."
35,7764293,,,Australas Biotechnol,9113681,Legal rights and genetic engineering.,,"[{'fore_name': 'N', 'last_name': 'Stoianoff', ..."
54,7856962,,,Am Nurse,7506499,Survey assesses RN management of genetic infor...,,"[{'fore_name': 'C', 'last_name': 'Scanlon', 'a..."
62,7965253,,10.1111/j.1552-6909.1994.tb01909.x,J Obstet Gynecol Neonatal Nurs,8503123,The genome project.,,"[{'fore_name': 'B S', 'last_name': 'Raff', 'af..."
...,...,...,...,...,...,...,...,...
175822,31684829,PMC6831968,10.1177/1533033819883633,Technol Cancer Res Treat,101140941,Long Noncoding RNA GM16343 Promotes IL-36β to ...,,"[{'fore_name': 'Deli', 'last_name': 'Mao', 'af..."
176608,31802060,,,Hell J Nucl Med,101257471,Predicting oligonucleotide therapeutic efficac...,,"[{'fore_name': 'Pantazis I', 'last_name': 'The..."
176630,31808361,PMC6900616,10.1177/1533033819892260,Technol Cancer Res Treat,101140941,Bioinformatics Analysis of Expression and Alte...,,"[{'fore_name': 'Yong-Zi', 'last_name': 'Chen',..."
177166,31898667,,10.4103/jcrt.jcrt_866_18,J Cancer Res Ther,101249598,Regulation of HMGA2 and KRAS genes in epitheli...,,"[{'fore_name': 'Tuba', 'last_name': 'Gunel', '..."


In [12]:
# Article counts (not proportions) by publication date precision, measured as
# the length of the date string: 10 = YYYY-MM-DD, 7 = YYYY-MM, 4 = YYYY.
# Articles with a missing publication_date are excluded from the tally.
article_df.publication_date.str.len().value_counts()

10.0    135844
7.0      24805
4.0      13753
Name: publication_date, dtype: int64