# Process PubMed journal catalog

Download and process the PubMed/NLM [journal catalog](http://www.ncbi.nlm.nih.gov/books/NBK3827/table/pubmedhelp.T.journal_lists/).

In [1]:
import os
import re

import pandas

In [7]:
# Download PubMed Journals
# The catalog is fetched with the wget command-line tool through IPython's
# `!` shell escape; `{url}` is interpolated from the Python variable below.
url = 'ftp://ftp.ncbi.nih.gov/pubmed/J_Medline.txt'
# --directory-prefix saves the file under ./download, and --timestamping
# skips the transfer when the local copy is not older than the remote one.
! wget --no-verbose --directory-prefix download --timestamping {url}

2021-08-20 12:28:40 URL: ftp://ftp.ncbi.nih.gov/pubmed/J_Medline.txt [1127] -> "download/.listing" [1]
2021-08-20 12:28:49 URL: ftp://ftp.ncbi.nih.gov/pubmed/J_Medline.txt [8112777] -> "download/J_Medline.txt" [1]


In [8]:
# Read PubMed journals
# Load the whole downloaded catalog into memory as a single string (`text`).
path = os.path.join('download', 'J_Medline.txt')
# Decode explicitly instead of relying on the platform's default locale
# encoding, so the notebook reads the same text on every machine.
# NOTE(review): assumes J_Medline.txt is UTF-8 (ASCII is a subset) -- confirm.
with open(path, encoding='utf-8') as read_file:
    text = read_file.read()

In [9]:
# Create a dataframe of PubMed journals
# The catalog text is split into stanzas on lines made up only of dashes;
# every non-empty stanza becomes one row built from its "Key: value" lines.
rows = list()
pattern = re.compile('^-+$', re.MULTILINE)
for stanza in re.split(pattern, text):
    stanza = stanza.strip()
    if not stanza:
        # Text before the first / after the last separator is empty.
        continue
    row = dict()
    for line in stanza.split('\n'):
        if not line.strip():
            # Blank lines carry no field.
            continue
        # Partition on the first colon instead of splitting on ': ' so that a
        # field with an empty value still parses when its trailing space has
        # been stripped (e.g. "ISSN (Online):" at the end of a stanza).
        key, separator, value = line.partition(':')
        if not separator:
            raise ValueError(f'Cannot parse journal field from line: {line!r}')
        # Empty values are stored as None so pandas treats them as missing.
        row[key.strip()] = value.strip() or None
    rows.append(row)

journal_df = pandas.DataFrame(rows)
# NlmId values are kept as strings, so this is a lexicographic sort.
journal_df = journal_df.sort_values(by='NlmId')

In [10]:
# Order columns by percent missing
# Fraction of null cells per column, ascending (most complete columns first).
missing_pct = journal_df.isna().mean().sort_values()
# Reorder the dataframe's columns to follow that ranking.
journal_df = journal_df.loc[:, missing_pct.index]
# Display the per-column missing fractions as the cell output.
missing_pct

JrId             0.000000
JournalTitle     0.000000
NlmId            0.000000
MedAbbr          0.000029
IsoAbbr          0.000029
ISSN (Print)     0.243476
ISSN (Online)    0.534139
dtype: float64

In [11]:
# Save journal dataframe as a TSV
path = 'data/pubmed-journals.tsv'
# Create the output directory if it is missing so that the export does not
# fail on a fresh checkout; an existing directory is left untouched.
os.makedirs(os.path.dirname(path), exist_ok=True)
# Tab-separated, without the dataframe index column.
journal_df.to_csv(path, sep='\t', index=False)