# Download articles from PubMed Central

https://www.ncbi.nlm.nih.gov/pmc/tools/oai/

In [1]:
import tqdm

import pandas
from pubmedpy.pmc_oai import (
    download_frontmatter_set,
    get_sets_for_pmcid,
)

## Specify PMC-OAI sets

In [2]:
# For each journal in articles.tsv.xz, show two articles and their corresponding sets
pubmed_df = pandas.read_table('data/pubmed/articles.tsv.xz')
pubmed_df = pubmed_df[['pmcid', 'journal']].dropna()
journal_to_n_articles = pubmed_df.journal.value_counts().to_dict()
journal_df = (
    pubmed_df
    .sample(frac=1, random_state=0)
    .groupby('journal').head(2)
    .sort_values(['journal', 'pmcid'])
)
journal_df["oai_set"] = [
    ', '.join(get_sets_for_pmcid(pmcid))
    for pmcid in journal_df.pmcid
]
journal_df

IdDoesNotExist: The value of the identifier argument is unknown or illegal in this repository.

In [3]:
# Define journals to download article frontmatter for
journal_to_oai_set = {
    'BMC Bioinformatics': 'bmcbioi',
    'Bioinformatics': 'bioinfo',
    'PLoS Comput Biol': 'ploscomp',
}

## Dowload sets

In [4]:
for journal, oai_set in journal_to_oai_set.items():
    path = f'data/pmc/oai/pmc_fm/{oai_set}.zip'
    download_frontmatter_set(
        oai_set,
        path,
        tqdm=tqdm.tqdm_notebook,
        n_records=journal_to_n_articles[journal],
    )

HBox(children=(IntProgress(value=0, description='bmcbioi', max=9389, style=ProgressStyle(description_width='in…




HBox(children=(IntProgress(value=0, description='bioinfo', max=5083, style=ProgressStyle(description_width='in…




HBox(children=(IntProgress(value=0, description='ploscomp', max=6491, style=ProgressStyle(description_width='i…




In [5]:
! ls --size --human-readable data/pmc/oai/pmc_fm

total 56M
11M bioinfo.zip  24M bmcbioi.zip  22M ploscomp.zip
