# Download from PMC FTP server

In [None]:
import os
import sys
from ftplib import FTP
from fnmatch import fnmatch
from urllib.parse import urlparse
from tqdm import tqdm

# For a progress bar rendered as a Jupyter widget, use the notebook variant
# instead (it may require extra setup, e.g. ipywidgets):
#from tqdm.notebook import tqdm

# Configuration

In [None]:
# Remote side: the FTP directory to list and the glob selecting the
# archives to fetch from it.
ftp_url = 'ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_bulk/'
file_pattern = '*.xml.tar.gz'

# Or you could start by just downloading one file
#file_pattern = 'non_comm_use.A-B.xml.tar.gz'

# Local side: downloads are stored in a sub-directory of the data folder.
data_path = 'data'
output_dir = os.path.join(data_path, 'FTP_PUBMED_papers')

# Download

In [None]:
def download_ftp_file(ftp_client, filename, output_filename):
    """Download a single file over FTP, showing a progress bar.

    The data is streamed into ``output_filename + '.part'`` and only moved
    to ``output_filename`` once the transfer has finished, so a file at
    ``output_filename`` is always a complete download. A file that already
    exists at ``output_filename`` is left untouched until the new download
    has completed, and is then replaced.

    Args:
        ftp_client: Connected ``ftplib.FTP`` client; ``filename`` is
            resolved relative to its current working directory.
        filename: Name of the remote file to retrieve.
        output_filename: Local path at which the finished download is stored.

    Raises:
        Any exception raised by the FTP commands or by local file I/O is
        propagated unchanged. If the transfer fails part-way, the ``.part``
        file is left behind and ``output_filename`` is not modified.
    """
    # use temp files so we know the file is not complete
    temp_output_filename = output_filename + '.part'

    # need to change the type to binary for the size command to succeed
    ftp_client.voidcmd('TYPE I')
    total = ftp_client.size(filename)

    with open(temp_output_filename, 'wb') as f:
        with tqdm(total=total, unit='B', unit_scale=True, leave=False, file=sys.stdout) as pbar:
            pbar.set_description(filename)

            def cb(data):
                # Write first so the bar never runs ahead of the data on disk.
                f.write(data)
                pbar.update(len(data))

            ftp_client.retrbinary('RETR {}'.format(filename), cb)

    # os.replace overwrites an existing destination, so a previous copy is
    # only discarded once the new one is complete.
    os.replace(temp_output_filename, output_filename)

def download_files_from_ftp(ftp_url, file_pattern, output_dir):
    """Download every file in an FTP directory whose name matches a pattern.

    Connects to the host named in ``ftp_url``, logs in without credentials,
    changes into the URL's path and lists it. Each listed name matching
    ``file_pattern`` (``fnmatch`` rules) is downloaded into ``output_dir``
    unless a file of that name is already present there.

    Args:
        ftp_url: URL of the remote directory, e.g. ``ftp://host/some/dir/``.
        file_pattern: Shell-style pattern the remote file names must match.
        output_dir: Local directory the files are written to.
    """
    url_parts = urlparse(ftp_url)
    with FTP(url_parts.hostname) as client:
        client.login()
        client.cwd(url_parts.path)
        for remote_name in client.nlst():
            if not fnmatch(remote_name, file_pattern):
                continue
            local_path = os.path.join(output_dir, remote_name)
            # A file at the final path counts as already downloaded.
            if os.path.isfile(local_path):
                print('already downloaded:', local_path)
                continue
            download_ftp_file(client, remote_name, local_path)
            print('downloaded:', local_path)

# Make sure the destination directory exists, then fetch all matching files.
os.makedirs(output_dir, exist_ok=True)
download_files_from_ftp(
    ftp_url=ftp_url,
    file_pattern=file_pattern,
    output_dir=output_dir,
)