In [1]:
import glob
import os
import time
from PyPDF2 import PdfReader
from multiprocessing import Pool, cpu_count, set_start_method, current_process
from pdf2image import convert_from_path


In [2]:
#| export
def batch_process_pdf(pdf_path:str, output_folder:str, batch_size:int=10, debug:bool=False):
    num_pages = get_pdf_length(pdf_path)
    tasks = [(pdf_path, output_folder, start, min(start + batch_size - 1, num_pages))
             for start in range(1, num_pages + 1, batch_size)]

    def debug_print(message):
        if debug:
            print(message)

    with Pool(cpu_count()) as pool:
        start_time = time.time()
        for i, task in enumerate(tasks):
            debug_print(f"Starting batch {i+1}/{len(tasks)}: pages {task[2]} to {task[3]} on process {current_process().name}")
        results = pool.starmap_async(convert_pdf_to_images, tasks)
        while not results.ready():
            remaining = results._number_left
            debug_print(f"{remaining} tasks remaining. Total elapsed time: {time.time() - start_time:.2f} seconds")
            time.sleep(1)  # Sleep briefly to avoid flooding the output
        flat_list = [item for sublist in results.get() for item in sublist]
        debug_print(f"All tasks completed. Total processing time: {time.time() - start_time:.2f} seconds")
    return flat_list

In [3]:
#| export
def convert_pdf_to_images(pdf_path:str, # Path to the PDF file
                          output_folder:str, # Output folder for the images
                          start_page:int=None, # Starting page for conversion
                          end_page:int=None # Ending page for conversion
                          )->list: # List of image paths
    # Extract base filename without extension
    base_filename = os.path.splitext(os.path.basename(pdf_path))[0]

    # Determine conversion parameters based on provided page ranges
    if start_page and end_page:
        # Convert a specific range of pages
        images = convert_from_path(pdf_path, first_page=start_page, last_page=end_page, dpi=300, fmt='png')
        start_index = start_page
    elif start_page:
        # Convert from start_page to the end of the PDF
        images = convert_from_path(pdf_path, first_page=start_page, dpi=300, fmt='png')
        start_index = start_page
    elif end_page:
        # Convert from the beginning of the PDF to end_page
        images = convert_from_path(pdf_path, last_page=end_page, dpi=300, fmt='png')
        start_index = 1
    else:
        # Convert the entire PDF
        images = convert_from_path(pdf_path, dpi=300, fmt='png')
        start_index = 1

    # List to store image paths
    image_paths = []

    # Save each page as an image and collect the paths
    for i, image in enumerate(images, start=start_index):
        image_path = os.path.join(output_folder, f'{base_filename}_page_{i}.png')
        image.save(image_path, 'PNG')
        image_paths.append(image_path)

    return image_paths


In [4]:
#| export
def get_pdf_length(pdf_path:str # Path to the PDF file
                   )->int: # Number of pages in the PDF file
    """ Get the number of pages in a PDF file"""
    with open(pdf_path, "rb") as file:
        pdf = PdfReader(file)
        num_pages = len(pdf.pages)
    return num_pages

In [5]:
#| export
input_dir = '../samples'
image_dir = '../samples/images'
output_dir = '../samples/output'
# Use glob to find all PDF files in the directory
all_pdf_files = glob.glob(os.path.join(input_dir, '*.pdf'))

# Filter out files containing 'sample' in the filename
full_pdf_files = [file for file in all_pdf_files if 'sample' not in os.path.basename(file).lower()]

In [6]:
[print(f"{i} - {file}:{get_pdf_length(file)}") for i,file in enumerate(full_pdf_files)]

0 - ../samples/Hall et al., 1911_only data tables.pdf:16
1 - ../samples/Lipincott, 1905.pdf:110
2 - ../samples/simpson et al., 1929.pdf:323
3 - ../samples/BEACH Slides copy.pdf:18


[None, None, None, None]

In [7]:
#| export
set_start_method('spawn')

In [8]:
print(cpu_count())

16


In [9]:
image_paths = batch_process_pdf(full_pdf_files[0], image_dir, batch_size=10, debug=True)

Starting batch 1/2: pages 1 to 10 on process MainProcess
Starting batch 2/2: pages 11 to 16 on process MainProcess
2 tasks remaining. Total elapsed time: 0.00 seconds


Process SpawnPoolWorker-1:
Process SpawnPoolWorker-2:
Traceback (most recent call last):
Traceback (most recent call last):
  File "/Users/kellycaylor/mambaforge/envs/usgs_extract/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap
    self.run()
  File "/Users/kellycaylor/mambaforge/envs/usgs_extract/lib/python3.10/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/Users/kellycaylor/mambaforge/envs/usgs_extract/lib/python3.10/multiprocessing/pool.py", line 114, in worker
    task = get()
  File "/Users/kellycaylor/mambaforge/envs/usgs_extract/lib/python3.10/multiprocessing/queues.py", line 367, in get
    return _ForkingPickler.loads(res)
AttributeError: Can't get attribute 'convert_pdf_to_images' on <module '__main__' (built-in)>
  File "/Users/kellycaylor/mambaforge/envs/usgs_extract/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap
    self.run()
  File "/Users/kellycaylor/mambaforge/envs/usgs_extra

2 tasks remaining. Total elapsed time: 1.01 seconds
2 tasks remaining. Total elapsed time: 2.01 seconds
2 tasks remaining. Total elapsed time: 3.01 seconds
2 tasks remaining. Total elapsed time: 4.02 seconds
2 tasks remaining. Total elapsed time: 5.03 seconds
2 tasks remaining. Total elapsed time: 6.03 seconds
2 tasks remaining. Total elapsed time: 7.03 seconds
2 tasks remaining. Total elapsed time: 8.03 seconds
2 tasks remaining. Total elapsed time: 9.04 seconds
2 tasks remaining. Total elapsed time: 10.04 seconds


KeyboardInterrupt: 

In [None]:
image_paths = batch_process_pdf(full_pdf_files[0], image_dir, 10)
