In [1]:
import numpy as np
import os
import fitz
import io
from PIL import Image


# Get the root_path for this jupyter notebook repo.
repo_path = os.path.dirname(os.path.abspath(os.getcwd()))

path_pdfs = os.path.join(
    repo_path, 'files', 'pdfs'
)
path_images_from_pdfs = os.path.join(
    repo_path, 'files', 'images-from-pdfs'
)

file_paths = []
for dirpath, _, filenames in os.walk(path_pdfs):
    for filename in filenames:
        if filename.endswith(':Zone.Identifier'):
            # A convenience hack for Windows subsystem for linux
            continue
        if filename.startswith('.'):
            # A convenience hack for Windows subsystem for linux
            continue
        if not filename.endswith('.pdf'):
            continue
        file_path = os.path.join(dirpath, filename)
        file_paths.append(file_path)

        
num_orig = len(file_paths)
print(f'We have {num_orig} files to process')


We have 22 files to process


In [2]:
output_format = 'png'
i = 0
for file_path in file_paths:
    i += 1
    file_dir, filename = os.path.split(file_path)
    # print(filename)
    # print(file_dir)
    dir_for_file = filename.lower().replace(' ', '_').replace('.pdf', '').replace('copy_of_', '')
    images_dir = os.path.join(path_images_from_pdfs, dir_for_file)
    if not os.path.exists(images_dir):
        os.makedirs(images_dir)
    print(f'[{i} of {num_orig}] working on {file_path}')
    pdf_file = fitz.open(file_path)
    pdf_image_list = []
    for page_index in range(len(pdf_file)):
        # get the page itself
        page = pdf_file[page_index]
        pdf_image_list += page.get_images(full=True)
    if not pdf_image_list:
        continue
    print(f'[{i} of {num_orig}] {file_path} has {len(pdf_image_list)} images!! ')
    img_i = 0
    for img in pdf_image_list:
        img_i += 1
        xref = img[0]
        base_image = pdf_file.extract_image(xref)
        image_bytes = base_image["image"]
        image_ext = base_image["ext"]
        print(f'Image was a {image_ext}')
        image = Image.open(io.BytesIO(image_bytes))
        if image.mode != 'RGB':
            image = image.convert('RGB')
        new_file_name = f'{dir_for_file}_image_{img_i:02d}.png'
        new_file_path = os.path.join(images_dir, new_file_name)
        image.save(
                open(new_file_path, "wb"),
                format=output_format.upper(),
        )
        print(f'Saved: {new_file_path}')
        
    

[1 of 22] working on /home/ekansa/github/open-context-jupyter/files/pdfs/Copy of Stop and Smell Obs Form.pdf
[1 of 22] /home/ekansa/github/open-context-jupyter/files/pdfs/Copy of Stop and Smell Obs Form.pdf has 2 images!! 
Image was a jpeg
Saved: /home/ekansa/github/open-context-jupyter/files/images-from-pdfs/stop_and_smell_obs_form/stop_and_smell_obs_form_image_01.png
Image was a jpeg
Saved: /home/ekansa/github/open-context-jupyter/files/images-from-pdfs/stop_and_smell_obs_form/stop_and_smell_obs_form_image_02.png
[2 of 22] working on /home/ekansa/github/open-context-jupyter/files/pdfs/Copy of A Pun Goes Here reorg.pdf
[2 of 22] /home/ekansa/github/open-context-jupyter/files/pdfs/Copy of A Pun Goes Here reorg.pdf has 9 images!! 
Image was a jpeg
Saved: /home/ekansa/github/open-context-jupyter/files/images-from-pdfs/a_pun_goes_here_reorg/a_pun_goes_here_reorg_image_01.png
Image was a jpeg
Saved: /home/ekansa/github/open-context-jupyter/files/images-from-pdfs/a_pun_goes_here_reorg/a_pun