In [8]:
# v0.2.1 (tqdm progress bar, directory changes, archive directory creation and auto-move)

import pdfplumber
import easyocr
from PIL import Image
import io
import os
import shutil
from tqdm import tqdm

# Shared EasyOCR reader, built once and reused for every page that needs OCR.
# Add more language codes to the list to recognise text in other languages.
reader = easyocr.Reader(
    ['en'],         # English only for now
    gpu=False,      # GPU use explicitly switched off
    verbose=False,  # EasyOCR's verbose output switched off
)

In [9]:
# Locations used by the pipeline: PDFs are read from input_dir, text files are written
# to output_dir, and archive_dir is a sub-folder of input_dir.
input_dir = './input'
output_dir = './output'
archive_dir = os.path.join(input_dir, 'archive_converted')

# Make sure all three folders exist (a no-op for any that are already there)
for _folder in (input_dir, output_dir, archive_dir):
    os.makedirs(_folder, exist_ok=True)

In [11]:
%%time

# Collect the PDFs waiting in the input directory. Only regular files are kept (a
# directory whose name ends in ".pdf" cannot be opened as a PDF), and the list is
# sorted so that every run processes the files in the same order.
pdf_files = sorted(
    f for f in os.listdir(input_dir)
    if f.lower().endswith('.pdf') and os.path.isfile(os.path.join(input_dir, f))
)

# Convert each PDF to a text file, then move the PDF into the archive directory
for filename in tqdm(pdf_files, desc="Processing PDFs"):
    pdf_path = os.path.join(input_dir, filename)

    # One entry per page, each prefixed with its "--- PAGE n ---" marker
    all_text = []

    with pdfplumber.open(pdf_path) as pdf:
        for page_num, page in enumerate(pdf.pages, start=1):
            # First try extracting using pdfplumber (text layer)
            text = page.extract_text()

            # Log through tqdm.write rather than print so the messages do not
            # break up the progress bar.
            if text and text.strip():
                tqdm.write(f"Text layer found on page {page_num} of {filename}, using pdfplumber.")
            else:
                tqdm.write(f"No text layer on page {page_num} of {filename}, using OCR fallback.")

                # Render the page (resolution=300) as an RGB PIL image
                pil_image = page.to_image(resolution=300).original.convert("RGB")
                # EasyOCR is handed the image as in-memory PNG bytes
                png_buffer = io.BytesIO()
                pil_image.save(png_buffer, format='PNG')

                # Run OCR; the recognised strings are joined one per line
                result = reader.readtext(png_buffer.getvalue(), detail=0, paragraph=True)
                text = "\n".join(result)

            all_text.append(f"\n\n--- PAGE {page_num} ---\n\n{text}")

    # Join all pages into a single string
    final_text = "\n".join(all_text)

    # The text file shares the PDF's base name, with a .txt extension
    base_name, extension = os.path.splitext(filename)
    output_filename = f"{base_name}.txt"
    output_path = os.path.join(output_dir, output_filename)

    # Save to file
    with open(output_path, "w", encoding="utf-8") as f:
        f.write(final_text)

    # Move the processed PDF to the archive directory. If a PDF with the same name was
    # archived by an earlier run, add a numeric suffix instead of letting shutil.move
    # replace the archived copy (or fail, depending on the platform).
    archive_path = os.path.join(archive_dir, filename)
    duplicate_index = 1
    while os.path.exists(archive_path):
        archive_path = os.path.join(archive_dir, f"{base_name}_{duplicate_index}{extension}")
        duplicate_index += 1
    shutil.move(pdf_path, archive_path)

print("All PDFs processed and archived.")

Processing PDFs:   0%|                                    | 0/1 [00:00<?, ?it/s]

No text layer on page 1 of fxd_Chapter 18 - Collection Management(1).pdf, using OCR fallback.
No text layer on page 2 of fxd_Chapter 18 - Collection Management(1).pdf, using OCR fallback.
No text layer on page 3 of fxd_Chapter 18 - Collection Management(1).pdf, using OCR fallback.
No text layer on page 4 of fxd_Chapter 18 - Collection Management(1).pdf, using OCR fallback.
No text layer on page 5 of fxd_Chapter 18 - Collection Management(1).pdf, using OCR fallback.
No text layer on page 6 of fxd_Chapter 18 - Collection Management(1).pdf, using OCR fallback.
No text layer on page 7 of fxd_Chapter 18 - Collection Management(1).pdf, using OCR fallback.
No text layer on page 8 of fxd_Chapter 18 - Collection Management(1).pdf, using OCR fallback.


Processing PDFs: 100%|███████████████████████████| 1/1 [03:42<00:00, 222.76s/it]

All PDFs processed and archived.
CPU times: user 21min 25s, sys: 2min 36s, total: 24min 2s
Wall time: 3min 42s





In [5]:
import pdfplumber
import easyocr
from PIL import Image
import io
import os

# Shared EasyOCR reader, built once and reused for every page that needs OCR.
# Add more language codes to the list to recognise text in other languages.
reader = easyocr.Reader(
    ['en'],         # English only for now
    gpu=False,      # GPU use explicitly switched off
    verbose=False,  # EasyOCR's verbose output switched off
)

In [6]:
# Input and output directories
input_dir = './input'
output_dir = './output'

# Create both directories if they don't exist. The input directory is created as well
# so that listing it on a fresh checkout yields an empty batch instead of raising
# FileNotFoundError.
os.makedirs(input_dir, exist_ok=True)
os.makedirs(output_dir, exist_ok=True)

In [7]:
%%time
# Convert every PDF in the input directory to a text file in the output directory
for filename in os.listdir(input_dir):
    # Skip anything whose name does not end in ".pdf" (case-insensitive)
    if not filename.lower().endswith('.pdf'):
        continue

    pdf_path = os.path.join(input_dir, filename)

    # Per-page text, each entry already carrying its "--- PAGE n ---" header
    page_texts = []

    with pdfplumber.open(pdf_path) as pdf:
        for page_num, page in enumerate(pdf.pages):
            # The embedded text layer is tried first; OCR is only the fallback
            text = page.extract_text()
            needs_ocr = not (text and text.strip())

            if needs_ocr:
                print(f"No text layer on page {page_num+1} of {filename}, using OCR fallback.")

                # Render the page (resolution=300) as RGB and PNG-encode it in memory
                rendered = page.to_image(resolution=300).original.convert("RGB")
                png_buffer = io.BytesIO()
                rendered.save(png_buffer, format='PNG')

                # OCR the PNG bytes; the recognised strings are joined one per line
                ocr_lines = reader.readtext(png_buffer.getvalue(), detail=0, paragraph=True)
                text = "\n".join(ocr_lines)
            else:
                print(f"Text layer found on page {page_num+1} of {filename}, using pdfplumber.")

            page_texts.append(f"\n\n--- PAGE {page_num+1} ---\n\n{text}")

    # The text file shares the PDF's base name, with a .txt extension
    base_name = os.path.splitext(filename)[0]
    output_filename = f"{base_name}.txt"
    output_path = os.path.join(output_dir, output_filename)

    # Page entries are joined with a newline and written out as UTF-8
    with open(output_path, "w", encoding="utf-8") as f:
        f.write("\n".join(page_texts))

    print(f"Saved: {output_filename}")

print("All PDFs processed.")

Text layer found on page 1 of 608_s2_Goleman Leadership That Gets Results.pdf, using pdfplumber.
No text layer on page 2 of 608_s2_Goleman Leadership That Gets Results.pdf, using OCR fallback.
Text layer found on page 3 of 608_s2_Goleman Leadership That Gets Results.pdf, using pdfplumber.
Text layer found on page 4 of 608_s2_Goleman Leadership That Gets Results.pdf, using pdfplumber.
Text layer found on page 5 of 608_s2_Goleman Leadership That Gets Results.pdf, using pdfplumber.
Text layer found on page 6 of 608_s2_Goleman Leadership That Gets Results.pdf, using pdfplumber.
Text layer found on page 7 of 608_s2_Goleman Leadership That Gets Results.pdf, using pdfplumber.
Text layer found on page 8 of 608_s2_Goleman Leadership That Gets Results.pdf, using pdfplumber.
Text layer found on page 9 of 608_s2_Goleman Leadership That Gets Results.pdf, using pdfplumber.
Text layer found on page 10 of 608_s2_Goleman Leadership That Gets Results.pdf, using pdfplumber.
Text layer found on page 11 of

In [None]:
# v0.1.2 (tqdm progress bar added)

import pdfplumber
import easyocr
from PIL import Image
import io
import os
from tqdm import tqdm  # Import tqdm

# Shared EasyOCR reader, built once and reused for every page that needs OCR.
# Add more language codes to the list to recognise text in other languages.
# No gpu argument is passed here, so EasyOCR's own default applies.
reader = easyocr.Reader(
    ['en'],         # English only for now
    verbose=False,  # EasyOCR's verbose output switched off
)

# Input and output directories
input_dir = './input'
output_dir = './output'

# Create both directories if they don't exist. The input directory is created as well
# so that listing it on a fresh checkout yields an empty batch instead of raising
# FileNotFoundError.
os.makedirs(input_dir, exist_ok=True)
os.makedirs(output_dir, exist_ok=True)

# Collect the PDFs in the input directory. Only regular files are kept (a directory
# whose name ends in ".pdf" cannot be opened as a PDF), and the list is sorted so
# that every run processes the files in the same order.
pdf_files = sorted(
    f for f in os.listdir(input_dir)
    if f.lower().endswith('.pdf') and os.path.isfile(os.path.join(input_dir, f))
)

# Convert each PDF to a text file, showing overall progress with tqdm
for filename in tqdm(pdf_files, desc="Processing PDFs"):
    pdf_path = os.path.join(input_dir, filename)

    # One entry per page, each prefixed with its "--- PAGE n ---" marker
    all_text = []

    with pdfplumber.open(pdf_path) as pdf:
        for page_num, page in enumerate(pdf.pages, start=1):
            # First try extracting using pdfplumber (text layer)
            text = page.extract_text()

            # Log through tqdm.write rather than print so the messages do not
            # break up the progress bar.
            if text and text.strip():
                tqdm.write(f"Text layer found on page {page_num} of {filename}, using pdfplumber.")
            else:
                tqdm.write(f"No text layer on page {page_num} of {filename}, using OCR fallback.")

                # Render the page (resolution=300) as an RGB PIL image
                pil_image = page.to_image(resolution=300).original.convert("RGB")
                # EasyOCR is handed the image as in-memory PNG bytes
                png_buffer = io.BytesIO()
                pil_image.save(png_buffer, format='PNG')

                # Run OCR; the recognised strings are joined one per line
                result = reader.readtext(png_buffer.getvalue(), detail=0, paragraph=True)
                text = "\n".join(result)

            all_text.append(f"\n\n--- PAGE {page_num} ---\n\n{text}")

    # Join all pages into a single string
    final_text = "\n".join(all_text)

    # The text file shares the PDF's base name, with a .txt extension
    base_name = os.path.splitext(filename)[0]
    output_filename = f"{base_name}.txt"
    output_path = os.path.join(output_dir, output_filename)

    # Save to file
    with open(output_path, "w", encoding="utf-8") as f:
        f.write(final_text)

print("All PDFs processed.")