In [None]:
!pip install pdf2image

Collecting pdf2image
  Downloading pdf2image-1.17.0-py3-none-any.whl.metadata (6.2 kB)
Downloading pdf2image-1.17.0-py3-none-any.whl (11 kB)
Installing collected packages: pdf2image
Successfully installed pdf2image-1.17.0


In [None]:
# pdf2image relies on the poppler command-line tools (e.g. pdftoppm) to rasterise PDFs.
# Install them, then confirm the pdftoppm binary is on PATH and executable.
!apt-get install -y poppler-utils
!which pdftoppm
!ls -la /usr/bin/pdftoppm

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following NEW packages will be installed:
  poppler-utils
0 upgraded, 1 newly installed, 0 to remove and 35 not upgraded.
Need to get 186 kB of archives.
After this operation, 697 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy-updates/main amd64 poppler-utils amd64 22.02.0-2ubuntu0.9 [186 kB]
Fetched 186 kB in 1s (269 kB/s)
Selecting previously unselected package poppler-utils.
(Reading database ... 126380 files and directories currently installed.)
Preparing to unpack .../poppler-utils_22.02.0-2ubuntu0.9_amd64.deb ...
Unpacking poppler-utils (22.02.0-2ubuntu0.9) ...
Setting up poppler-utils (22.02.0-2ubuntu0.9) ...
Processing triggers for man-db (2.10.2-1) ...
/usr/bin/pdftoppm
-rwxr-xr-x 1 root root 35240 Jul 25 15:21 /usr/bin/pdftoppm


In [None]:
import os
import cv2
import numpy as np
from pdf2image import convert_from_path
from PIL import Image
import re
from pathlib import Path
import shutil
from typing import List, Tuple

In [None]:
# Shared parent folder of the catalog data.
CATALOG_ROOT = "/content/drive/MyDrive/Miki_class/Project/Catalog"

# Folder scanned for source PDFs, and root under which one image folder per issue is created.
PDF_PATH = f"{CATALOG_ROOT}/PDFs"
IMAGES_PATH = f"{CATALOG_ROOT}/images"

In [None]:
# PDF conversion settings (tune these here rather than inside the conversion code)
# Rasterisation resolution, in dots per inch, used when rendering PDF pages.
PDF_DPI = 200  # High quality for OCR and detection
# File extension intended for the saved page images.
IMAGE_FORMAT = "jpg"  # Consistent format
# Target JPEG quality setting for the saved page images.
IMAGE_QUALITY = 95  # High quality JPEG

In [None]:
def convert_pdf_to_images(pdf_path: str, issue_number: str) -> Tuple[bool, int]:
    """Render every page of one PDF to an image file in that issue's folder.

    Pages are rasterised at ``PDF_DPI`` and written to
    ``IMAGES_PATH/<issue_number>/page_NN.<IMAGE_FORMAT>`` (1-based page
    numbers, zero-padded to at least two digits) using ``IMAGE_QUALITY``.

    Args:
        pdf_path: Path of the PDF file to convert.
        issue_number: Name of the sub-folder of ``IMAGES_PATH`` that receives
            the page images; created if it does not exist.

    Returns:
        ``(True, n_saved)`` when all pages were written, otherwise
        ``(False, 0)`` (empty PDF, or any error during folder creation,
        conversion or saving). Errors are printed, not raised.
    """
    issue_folder = os.path.join(IMAGES_PATH, issue_number)

    try:
        # Inside the try block so a folder-creation failure is reported through
        # the same (False, 0) contract as every other conversion error.
        os.makedirs(issue_folder, exist_ok=True)

        print(f"   Converting PDF to images...")

        # Convert PDF to images (one PIL image per page)
        pages = convert_from_path(pdf_path, dpi=PDF_DPI)

        if not pages:
            print(f"   No pages found in PDF")
            return False, 0

        page_count = len(pages)
        print(f"   Processing {page_count} pages...")

        # Save each page
        saved_pages = 0
        for page_number, page in enumerate(pages, start=1):
            # Extension comes from the config cell instead of being hard-coded.
            filename = f"page_{page_number:02d}.{IMAGE_FORMAT}"
            page_path = os.path.join(issue_folder, filename)

            # Pass the configured quality explicitly: without it the JPEG
            # encoder falls back to its (lower) default quality.
            page.save(page_path, quality=IMAGE_QUALITY)
            saved_pages += 1

            if page_number % 5 == 0:  # Progress update every 5 pages
                print(f"      Saved {page_number}/{page_count} pages...")

        print(f"   Successfully saved {saved_pages} pages to Issue {issue_number}")
        return True, saved_pages

    except Exception as e:
        # Best-effort batch job: report the failure and let the caller continue
        # with the next PDF.
        print(f"   Error converting PDF: {str(e)}")
        return False, 0

In [None]:
# Locate the PDFs to convert.
# Raise instead of calling exit(): inside a notebook kernel exit() requests a
# kernel shutdown rather than cleanly stopping the cell, whereas an exception
# halts this cell (and a "Run All") with a clear message.
if not os.path.isdir(PDF_PATH):
    raise FileNotFoundError(f"PDF directory not found: {PDF_PATH}")

# Match the extension case-insensitively (".PDF" files are PDFs too) and sort,
# because os.listdir returns entries in arbitrary order.
pdf_files = sorted(f for f in os.listdir(PDF_PATH) if f.lower().endswith('.pdf'))

if not pdf_files:
    raise FileNotFoundError(f"No PDF files found in: {PDF_PATH}")

print(f"Found {len(pdf_files)} PDF files to convert")
print()

Found 18 PDF files to convert



In [None]:
# Per-PDF outcome records plus running totals for the whole batch
successful_conversions = []
failed_conversions = []
total_converted = 0
total_pages = 0

pdf_count = len(pdf_files)

# Convert each PDF in turn; a failure is recorded and the batch carries on
for i, pdf_file in enumerate(pdf_files, 1):
    # The issue number is the filename without its extension
    issue_number = os.path.splitext(pdf_file)[0]
    pdf_path = os.path.join(PDF_PATH, pdf_file)

    print(f"[{i}/{pdf_count}] Converting: {pdf_file}")
    print(f"   Target: Issue {issue_number}")

    success, pages_converted = convert_pdf_to_images(pdf_path, issue_number)

    # Fields common to both outcomes; the outcome-specific key is added below
    record = {'pdf_file': pdf_file, 'issue_number': issue_number}

    if not success:
        record['error'] = 'Conversion failed'
        failed_conversions.append(record)
        print(f"   Failed to convert")
    else:
        record['pages'] = pages_converted
        successful_conversions.append(record)
        total_converted += 1
        total_pages += pages_converted
        print(f"   Success: {pages_converted} pages converted")

    print()

[1/18] Converting: 1193.pdf
   Target: Issue 1193
   Converting PDF to images...
   Processing 208 pages...
      Saved 5/208 pages...
      Saved 10/208 pages...
      Saved 15/208 pages...
      Saved 20/208 pages...
      Saved 25/208 pages...
      Saved 30/208 pages...
      Saved 35/208 pages...
      Saved 40/208 pages...
      Saved 45/208 pages...
      Saved 50/208 pages...
      Saved 55/208 pages...
      Saved 60/208 pages...
      Saved 65/208 pages...
      Saved 70/208 pages...
      Saved 75/208 pages...
      Saved 80/208 pages...
      Saved 85/208 pages...
      Saved 90/208 pages...
      Saved 95/208 pages...
      Saved 100/208 pages...
      Saved 105/208 pages...
      Saved 110/208 pages...
      Saved 115/208 pages...
      Saved 120/208 pages...
      Saved 125/208 pages...
      Saved 130/208 pages...
      Saved 135/208 pages...
      Saved 140/208 pages...
      Saved 145/208 pages...
      Saved 150/208 pages...
      Saved 155/208 pages...
      Saved 1

In [None]:
# Summarise what is on disk under IMAGES_PATH: issue folders and page images
if not os.path.exists(IMAGES_PATH):
    print("Images directory not found after conversion")
else:
    # Every sub-directory of IMAGES_PATH counts as one issue folder
    all_issue_folders = [
        entry for entry in os.listdir(IMAGES_PATH)
        if os.path.isdir(os.path.join(IMAGES_PATH, entry))
    ]

    print(f"Total issue folders: {len(all_issue_folders)}")

    # Count files with an image extension (case-insensitive) across all issue folders
    image_suffixes = ('.jpg', '.jpeg', '.png')
    total_images = sum(
        1
        for folder in all_issue_folders
        for name in os.listdir(os.path.join(IMAGES_PATH, folder))
        if name.lower().endswith(image_suffixes)
    )

    print(f"Total images across all issues: {total_images}")

    print(f"\nCONVERSION COMPLETE!")
    print(f"   All PDFs converted to high-quality images")
    print(f"   Images saved to: {IMAGES_PATH}")

Total issue folders: 31
Total images across all issues: 5435

CONVERSION COMPLETE!
   All PDFs converted to high-quality images
   Images saved to: /content/drive/MyDrive/Miki_class/Project/Catalog/images
