In [1]:
import numpy as np
import os
import fitz
import io
from PIL import Image


# The repository root is taken to be the parent of the current working directory.
repo_path = os.path.dirname(os.path.abspath(os.getcwd()))

# Source PDFs are looked up under <repo>/files/pdfs.
path_pdfs = os.path.join(repo_path, 'files', 'pdfs')

# Recursively gather every visible '*.pdf' file below path_pdfs.
# Skipped on purpose:
#   * names ending in ':Zone.Identifier' (convenience hack for Windows
#     subsystem for linux),
#   * names starting with '.',
#   * anything whose name does not end in '.pdf'.
file_paths = [
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk(path_pdfs)
    for name in names
    if name.endswith('.pdf')
    and not name.startswith('.')
    and not name.endswith(':Zone.Identifier')
]

num_orig = len(file_paths)
print(f'We have {num_orig} files to process')


We have 1421 files to process


In [2]:
# Convert each PDF in `file_paths` into a single image file: the first embedded
# image found in the PDF is extracted and re-encoded as `output_format` under a
# parallel 'images-from-pdfs' directory tree. PDFs whose output file already
# exists are skipped, so the cell can be re-run to resume an interrupted pass.
output_format = 'png'

for i, file_path in enumerate(file_paths, start=1):
    # Mirror the source tree: '/files/pdfs/' becomes '/files/images-from-pdfs/'.
    new_path = file_path.replace('/files/pdfs/', '/files/images-from-pdfs/')
    new_dir = os.path.dirname(new_path)
    # Swap only the trailing extension. A plain str.replace('.pdf', ...) would
    # also rewrite a '.pdf' appearing earlier in the path (e.g. in a directory
    # name), making new_file point outside the directory created below.
    new_file = f'{os.path.splitext(new_path)[0]}.{output_format}'
    os.makedirs(new_dir, exist_ok=True)
    if os.path.exists(new_file):
        # Already converted on a previous run.
        continue
    print(f'[{i} of {num_orig}] working on {file_path}')

    # The context manager closes the document on every exit path, including
    # the `continue` below for PDFs that embed no images.
    with fitz.open(file_path) as pdf_file:
        pdf_image_list = []
        for page_index in range(len(pdf_file)):
            page = pdf_file[page_index]
            pdf_image_list += page.get_images(full=True)
        if not pdf_image_list:
            continue
        if len(pdf_image_list) > 1:
            print(
                f'[{i} of {num_orig}] {file_path} has {len(pdf_image_list)} images; '
                'only the first one is saved!! '
            )
        # Only the first image is exported; its first tuple item is the xref.
        xref = pdf_image_list[0][0]
        base_image = pdf_file.extract_image(xref)

    image_bytes = base_image["image"]
    image_ext = base_image["ext"]
    print(f'Image was a {image_ext}')

    # Write to a temporary name and rename once the save has completed. An
    # interrupted run therefore never leaves a truncated file at new_file,
    # which the exists-check above would otherwise treat as already done.
    tmp_file = f'{new_file}.part'
    try:
        with Image.open(io.BytesIO(image_bytes)) as image:
            image.save(tmp_file, format=output_format.upper())
        os.replace(tmp_file, new_file)
    finally:
        # Clean up the partial file if the save or rename did not finish.
        if os.path.exists(tmp_file):
            os.remove(tmp_file)
        
    

[1 of 1421] working on /home/ekansa/github/open-context-jupyter/files/pdfs/small-finds-1972-absolute-1-656/small-finds-1972-absolute-loose-paper-side-1.pdf
Image was a png
[2 of 1421] working on /home/ekansa/github/open-context-jupyter/files/pdfs/small-finds-1972-absolute-1-656/small-finds-1972-absolute-245-276.pdf
Image was a png
[3 of 1421] working on /home/ekansa/github/open-context-jupyter/files/pdfs/small-finds-1972-absolute-1-656/small-finds-1972-absolute-597-628-page-20.pdf
Image was a png
[4 of 1421] working on /home/ekansa/github/open-context-jupyter/files/pdfs/small-finds-1972-absolute-1-656/small-finds-1972-absolute-001-031.pdf
Image was a png
[5 of 1421] working on /home/ekansa/github/open-context-jupyter/files/pdfs/small-finds-1972-absolute-1-656/small-finds-1972-absolute-181-212-page-7.pdf
Image was a png
[6 of 1421] working on /home/ekansa/github/open-context-jupyter/files/pdfs/small-finds-1972-absolute-1-656/small-finds-1972-absolute-150-180.pdf
Image was a png
[7 of 14

[51 of 1421] working on /home/ekansa/github/open-context-jupyter/files/pdfs/small-finds-1972-absolute-1-656/small-finds-1972-absolute-337-368-page-12.pdf
Image was a png
[52 of 1421] working on /home/ekansa/github/open-context-jupyter/files/pdfs/trench-c-september-1972/trench-c-1972-page-47.pdf
Image was a png
[53 of 1421] working on /home/ekansa/github/open-context-jupyter/files/pdfs/trench-c-september-1972/trench-c-1972-page-7.pdf
Image was a png
[54 of 1421] working on /home/ekansa/github/open-context-jupyter/files/pdfs/trench-c-september-1972/trench-c-1972-page-25.pdf
Image was a png
[55 of 1421] working on /home/ekansa/github/open-context-jupyter/files/pdfs/trench-c-september-1972/trench-c-1972-page-11.pdf
Image was a png
[56 of 1421] working on /home/ekansa/github/open-context-jupyter/files/pdfs/trench-c-september-1972/trench-c-1972-page-54.pdf
Image was a png
[57 of 1421] working on /home/ekansa/github/open-context-jupyter/files/pdfs/trench-c-september-1972/trench-c-1972-inside-

[108 of 1421] working on /home/ekansa/github/open-context-jupyter/files/pdfs/trench-c-september-1972/trench-c-1972-completed-journal.pdf
Image was a png
Image was a png
Image was a png
Image was a png
Image was a png
Image was a png
Image was a png
Image was a png
Image was a png
Image was a png
Image was a png
Image was a png
Image was a png
Image was a png
Image was a png
Image was a png
Image was a png
Image was a png
Image was a png
Image was a png
Image was a png
Image was a png
Image was a png
Image was a png
Image was a png
Image was a png
Image was a png
Image was a png
Image was a png
Image was a png
Image was a png
Image was a png
Image was a png
Image was a png
Image was a png
Image was a png
Image was a png
Image was a png
Image was a png
Image was a png
Image was a png
Image was a png
Image was a png
Image was a png
Image was a png
Image was a png
Image was a png
Image was a png
Image was a png
Image was a png
Image was a png
Image was a png
Image was a png
Image was a png

KeyboardInterrupt: 