This is a test for free PDF OCR using PyMuPDF (fitz), tesseract, pytesseract.  
For a paid and admittedly much better outcome, you can use Google Vision (https://cloud.google.com/vision?hl=en) or other well-known LLMs.


pip install pytesseract  

In [1]:
# https://tesseract-ocr.github.io/tessdoc/Installation.html
# Install Tesseract from here: https://github.com/UB-Mannheim/tesseract/wiki
# https://github.com/tesseract-ocr/tessdoc/blob/main/Data-Files.md
# Best data (slower but more accurate) from here: https://github.com/tesseract-ocr/tessdata_best/tree/main
# Get both ell.traineddata and script/Greek.traineddata
import fitz  # PyMuPDF
import io
from pathlib import Path
from PIL import Image
import cv2
import numpy as np

import pytesseract
import os

https://www.greek-language.gr/greekLang/files/document/modern_greek/grammatiki.triantafyllidi.pdf  

In [None]:
# Base name of the PDF to process, without the ".pdf" extension.
pdf_name= 'grammatiki.triantafyllidi'

In [5]:
def create_folder(folder_path):
    """Create a directory (and any missing parents) and return it as a Path.

    Args:
        folder_path: Directory to create, as a ``str`` or ``pathlib.Path``.

    Returns:
        pathlib.Path: ``folder_path`` as a ``Path``; the directory exists on return.

    Raises:
        FileExistsError: If ``folder_path`` exists but is not a directory.
    """
    path = Path(folder_path)
    # exist_ok=True already makes the call idempotent, so a separate exists()
    # check is unnecessary (and would be racy). It also means a regular file
    # at this location is reported instead of being returned as a "folder".
    path.mkdir(parents=True, exist_ok=True)
    return path


In [16]:
def my_zfill(number, max_number):
    """Zero-pad ``number`` to the digit count of ``max_number``.

    Args:
        number: Value to format.
        max_number: Value whose string length sets the padded width.

    Returns:
        str: ``str(number)`` left-padded with zeros to ``len(str(max_number))``
        characters; returned unchanged if it is already that long.
    """
    return str(number).zfill(len(str(max_number)))


In [None]:
def get_image_files(folder, img_formats=('*.png', '*.jpg', '*.jpeg', '*.gif', '*.bmp')):
    """Collect the files in ``folder`` that match any of the glob patterns.

    Args:
        folder: ``pathlib.Path`` of the directory to search (not recursive).
        img_formats: Glob patterns to try, in order.

    Returns:
        list: Matching paths, grouped by pattern in the order given.
    """
    matches = []
    for pattern in img_formats:
        matches.extend(folder.glob(pattern))
    return matches


In [None]:
# All locations are resolved relative to the current working directory.
path = Path.cwd()
# Input PDF: <cwd>/data/pdfs/<pdf_name>.pdf
pdf_path = path / 'data' / 'pdfs' / f'{pdf_name}.pdf'
# Per-document folders for the extracted page images and the OCR output.
images_path = create_folder(path / 'data' / 'images' / pdf_name)
output_path = create_folder(path / 'output' / pdf_name)
# Show the resolved paths as the cell output.
path, pdf_path, images_path, output_path

In [7]:
def add_tesseract_to_path(tesseract_path):
    """Append the Tesseract directory to ``PATH`` unless it is already listed.

    The check compares whole ``PATH`` entries (normalised with
    ``os.path.normpath``/``os.path.normcase``) rather than searching for a
    substring, so a directory such as ``...\\Tesseract-OCR-old`` no longer
    hides a missing ``...\\Tesseract-OCR`` entry.

    Args:
        tesseract_path: Directory that contains the Tesseract executable,
            as a ``str`` or path-like object.

    Returns:
        None. ``os.environ['PATH']`` is modified in place for this process.
    """
    tesseract_path = os.fspath(tesseract_path)
    # PATH may be unset in stripped-down environments; treat that as empty.
    current = os.environ.get('PATH', '')

    def _normalise(entry):
        return os.path.normcase(os.path.normpath(entry))

    wanted = _normalise(tesseract_path)
    entries = [entry for entry in current.split(os.pathsep) if entry]
    if any(_normalise(entry) == wanted for entry in entries):
        return

    # Avoid a leading separator when PATH was empty or unset.
    os.environ['PATH'] = current + os.pathsep + tesseract_path if current else tesseract_path

# Set the Tesseract directory.
# NOTE(review): hard-coded Windows location — presumably the installer's default; adjust per machine.
tesseract_path = r'C:\Program Files\Tesseract-OCR'

# Add Tesseract to PATH if it's not already there (affects this process's os.environ['PATH'] only)
add_tesseract_to_path(tesseract_path)


### Inspect Images

In [26]:
def inspect_images_in_pdf(pdf_path, n_pages=1):
    """Print basic metadata for every image embedded in the first pages of a PDF.

    Args:
        pdf_path: Path of the PDF file, passed to ``fitz.open``.
        n_pages: Number of leading pages to inspect; capped at the page count.

    Returns:
        None. The report is written to stdout.
    """
    # The context manager closes the document even if extraction raises;
    # previously the file handle was never released.
    with fitz.open(pdf_path) as pdf_document:
        pages = min(n_pages, len(pdf_document))

        for page_number in range(pages):
            page = pdf_document.load_page(page_number)
            image_list = page.get_images(full=True)

            # Pad the 1-based page number to the width of the last inspected page.
            zpage = my_zfill(page_number + 1, pages)

            print(f"[INFO] Page {zpage} contains {len(image_list)} images")

            for image_index, img in enumerate(image_list):
                # The first field of each entry is used as the image's xref.
                xref = img[0]
                base_image = pdf_document.extract_image(xref)

                image_ext = base_image["ext"]
                width = base_image["width"]
                height = base_image["height"]
                bpc = base_image["bpc"]  # bits per component
                colorspace = base_image.get("colorspace", "unknown")
                # NOTE(review): the recorded output always shows "unknown" here, so
                # extract_image() presumably does not return a "filter" key — confirm.
                filter_type = base_image.get("filter", "unknown")

                print(f"Image {image_index + 1}:")
                print(f"  - Format: {image_ext}")
                print(f"  - Dimensions: {width} x {height}")
                print(f"  - Bits per component: {bpc}")
                print(f"  - Colorspace: {colorspace}")
                print(f"  - Compression: {filter_type}")
                print()


In [31]:
# Report the images embedded in the first two pages of the PDF.
inspect_images_in_pdf(pdf_path, n_pages=2)

[INFO] Page 1 contains 1 images
Image 1:
  - Format: jpeg
  - Dimensions: 1054 x 1487
  - Bits per component: 8
  - Colorspace: 3
  - Compression: unknown

[INFO] Page 2 contains 1 images
Image 1:
  - Format: png
  - Dimensions: 2512 x 2365
  - Bits per component: 1
  - Colorspace: 1
  - Compression: unknown



### Get text from PDF (test)

In [4]:
def extract_text_from_pdf(pdf_path):
    """Return the text of all pages of a PDF, concatenated in page order.

    Args:
        pdf_path: Path of the PDF file, passed to ``fitz.open``.

    Returns:
        str: The result of ``page.get_text()`` for every page, joined without
        any separator.
    """
    # The context manager closes the document once the text is collected;
    # previously the file handle was never released.
    with fitz.open(pdf_path) as pdf_document:
        # join() avoids rebuilding the string once per page.
        return "".join(
            pdf_document.load_page(page_num).get_text()
            for page_num in range(len(pdf_document))
        )


In [5]:
# Try the text that PyMuPDF can read directly from the PDF before resorting to OCR.
# NOTE(review): no output was recorded for this cell — presumably the PDF is image-only; confirm.
extracted_text = extract_text_from_pdf(pdf_path)
print(extracted_text)




### Extract Images Function

In [15]:
def extract_images_from_pdf(pdf_path, n_pages=1, output_folder=images_path):
    """Save the images embedded in the first pages of a PDF to a folder.

    Each image is written unchanged, in the format reported by PyMuPDF, as
    ``page<NN>_image<M>.<ext>`` with zero-padded page and image numbers.

    Args:
        pdf_path: Path of the PDF file, passed to ``fitz.open``.
        n_pages: Number of leading pages to process; ``0`` means every page.
            Capped at the page count.
        output_folder: Destination directory (``str`` or ``Path``); created
            if missing. The default is the value ``images_path`` had when
            this function was defined.

    Returns:
        None. Files are written to ``output_folder`` and progress is printed.
    """
    # The context manager closes the document even if a write fails;
    # previously the file handle was never released.
    with fitz.open(pdf_path) as pdf_document:
        total_pages = len(pdf_document)
        if n_pages == 0:
            n_pages = total_pages
        pages = min(n_pages, total_pages)

        # Accept plain strings too, and create the folder without a racy
        # exists() pre-check.
        output_folder = Path(output_folder)
        output_folder.mkdir(parents=True, exist_ok=True)

        for page_number in range(pages):
            page = pdf_document.load_page(page_number)
            image_list = page.get_images(full=True)

            # Page label is the same for every image on the page.
            zpage = my_zfill(page_number + 1, pages)

            for image_index, img in enumerate(image_list):
                # The first field of each entry is used as the image's xref.
                xref = img[0]
                base_image = pdf_document.extract_image(xref)
                image_bytes = base_image["image"]
                image_ext = base_image["ext"]

                zimage = my_zfill(image_index + 1, len(image_list))

                # Write the raw image bytes using the extension PyMuPDF reports.
                image_path = output_folder / f"page{zpage}_image{zimage}.{image_ext}"
                with open(image_path, 'wb') as f:
                    f.write(image_bytes)

                print(f"[INFO] Converted and saved image {zimage} on page {zpage} as {image_path}")



### Extract Images

In [None]:
# n_pages=0 extracts the images from every page into the default images folder.
extract_images_from_pdf(pdf_path, 0)

### Preprocess Images

In [18]:
def denoise(image):
    """Return ``image`` after OpenCV's non-local-means denoising for colour images.

    Args:
        image: Colour image array as accepted by
            ``cv2.fastNlMeansDenoisingColored``.

    Returns:
        The denoised image produced by OpenCV.
    """
    return cv2.fastNlMeansDenoisingColored(image, None, 10, 10, 7, 21)

def gaussian_blur(image):
    """Return ``image`` smoothed with a 5x5 Gaussian kernel.

    Args:
        image: Image array as accepted by ``cv2.GaussianBlur``.

    Returns:
        The blurred image produced by OpenCV.
    """
    kernel_size = (5, 5)
    return cv2.GaussianBlur(image, kernel_size, 0)

def adaptive_thresholding(image):
    """Convert a BGR image to grayscale and binarise it adaptively.

    Args:
        image: BGR colour image (converted with ``cv2.COLOR_BGR2GRAY``).

    Returns:
        The thresholded single-channel image, with 255 as the maximum value.
    """
    grayscale = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    # Gaussian-weighted adaptive threshold: block size 9, constant 2.
    return cv2.adaptiveThreshold(
        grayscale,
        255,
        cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
        cv2.THRESH_BINARY,
        9,
        2,
    )

def equalize(image):
    """Return ``image`` after histogram equalisation with ``cv2.equalizeHist``."""
    return cv2.equalizeHist(image)

def convert_to_binary(image):
    """Binarise ``image`` with Otsu's threshold and wrap it as a PIL image.

    Args:
        image: Image array as accepted by ``cv2.threshold``.

    Returns:
        PIL.Image.Image: The thresholded image (values 0 or 255).
    """
    threshold_flags = cv2.THRESH_BINARY + cv2.THRESH_OTSU
    # cv2.threshold returns (threshold_used, image); only the image is needed.
    thresholded = cv2.threshold(image, 0, 255, threshold_flags)[1]
    return Image.fromarray(thresholded)

def preprocess_image(image):
    """Prepare a BGR image for OCR.

    Applies, in order: adaptive thresholding, histogram equalisation and
    Otsu binarisation. ``denoise`` and ``gaussian_blur`` are available as
    optional first steps but are currently disabled.

    Args:
        image: BGR colour image.

    Returns:
        PIL.Image.Image: The binarised image returned by ``convert_to_binary``.
    """
    # Optional steps, currently switched off:
    # image = denoise(image)
    # image = gaussian_blur(image)
    thresholded = adaptive_thresholding(image)
    equalized = equalize(thresholded)
    return convert_to_binary(equalized)

### Extract Text from Images Function

In [135]:
import re

def remove_line_breaks(text):
    """Tidy OCR output by trimming lines and re-joining wrapped sentences.

    Steps, in order:
      1. Strip leading/trailing spaces and tabs from every line. The line
         break itself is kept, so neighbouring lines are no longer glued
         together without a separator.
      2. Collapse runs of line breaks into one. This runs after step 1 so
         whitespace-only lines disappear as well.
      3. Join words hyphenated across a line break.
      4. Join a line ending in a comma with the next one, keeping the comma.
      5. Join a line break followed by a lowercase Latin or Greek letter
         with a space.

    Args:
        text: Raw text to clean.

    Returns:
        str: The cleaned text.
    """
    # MULTILINE makes ^ and $ match at every line boundary, so only the
    # blanks are removed, not the adjacent newline.
    text = re.sub(r'^[\t ]+|[\t ]+$', '', text, flags=re.MULTILINE)
    # Replace consecutive line breaks with a single line break
    text = re.sub(r'\n+', '\n', text)

    # Delete dash and line break
    text = re.sub(r'-\n', '', text)
    # Replace the line break after a comma with a space (the comma is preserved)
    text = re.sub(r',\n', ', ', text)

    # If the next line's first letter is lowercase, replace the line break with a space
    text = re.sub(r'\n(?=[a-z\u03AC-\u03CE])', ' ', text)

    return text

# Tesseract tends to emit older, polytonic accented characters.
def replace_text(text, replacements):
    """Apply a sequence of literal substring replacements to ``text``.

    Args:
        text: String to transform.
        replacements: Mapping of substring -> replacement, applied one after
            another in the mapping's iteration order.

    Returns:
        str: ``text`` after all replacements.
    """
    for old in replacements:
        text = text.replace(old, replacements[old])
    return text

# Substitutions applied to the OCR text by replace_text(), in this order:
# each key found in the text is replaced by the monotonic form given as its value.
# NOTE(review): 'ὃ' and 'ὅ' map to delta ('δ'), not to an accented omicron —
# presumably compensating for an OCR confusion; confirm.
# NOTE(review): the 'έ' key and value look identical here — presumably different
# code points for the same glyph; verify.
replacements = {'ᾶ':'ά','ἄ':'ά','ἆ':'ά','ᾱ':'ά',
                'ἶ':'ί','ἰ':'ί',
                'καὶ':'και',
                'ὃ':'δ','ὅ':'δ',
                'ῃ':'η','ῄ':'ή','ῆ':'ή',
                'ὦ':'ώ','ῶ':'ώ','ὤ':'ώ','ὣ':'ώ',
                'ἔ':'έ','έ':'έ',
                'ὗ':'ύ','ὕ':'ύ'}

def extract_text_from_images(folder_path, num_images=1, img_format='png', output_file='output.txt', language='eng', **kwargs):
    """OCR the images in a folder with Tesseract and write the text to a file.

    Images are processed in sorted file-name order, so zero-padded page
    names are read in page order and the rotation indices are stable across
    platforms.

    Args:
        folder_path: Folder that contains the page images.
        num_images: How many images to process; ``0`` means all of them.
        img_format: File extension to match; ``'*'`` matches any extension.
        output_file: Path of the text file to (over)write, encoded as UTF-8.
        language: Tesseract language code(s), e.g. ``'ell'`` or ``'ell+eng'``.
        **kwargs: Optional rotation settings for pages scanned sideways:
            indices: 1-based positions (in processing order) of the images
                to rotate before OCR.
            rotate: cv2 rotation code such as ``cv2.ROTATE_90_CLOCKWISE``,
                either bare or wrapped as ``{'rotate': code}``.

    Returns:
        None. The recognised text of each image is written to ``output_file``
        followed by a blank line.

    Raises:
        ValueError: If ``indices`` is given without a rotation code.
    """
    # A set gives constant-time membership tests inside the loop.
    indices = set(kwargs.get('indices') or ())
    rotate = kwargs.get('rotate')
    if isinstance(rotate, dict):
        rotate = rotate.get('rotate')
    # Compare with None explicitly: cv2.ROTATE_90_CLOCKWISE is 0 and therefore falsy.
    if indices and rotate is None:
        raise ValueError("'indices' was given without a 'rotate' code")

    folder = Path(folder_path)
    # glob() order is OS-dependent; sort so page order and indices are deterministic.
    images = sorted(folder.glob('*.' + img_format))
    if num_images != 0:
        images = images[:num_images]

    # Explicit UTF-8: the platform default encoding may not be able to
    # represent Greek text (e.g. legacy Windows code pages).
    with open(output_file, 'w', encoding='utf-8') as f:
        for idx, image_path in enumerate(images, start=1):

            image = cv2.imread(str(image_path))
            if image is None:
                # cv2.imread signals an unreadable/non-image file by returning None.
                print(f"[WARN] Skipping unreadable image {image_path}")
                continue

            # Some images may be rotated. Inspect the extracted images and pass
            # their 1-based positions plus the rotate argument, e.g. cv2.ROTATE_90_CLOCKWISE.
            if idx in indices:
                image = cv2.rotate(image, rotate)

            image = preprocess_image(image)

            # Restricting the output via `-c tessedit_char_whitelist=...`
            # (Greek + Latin letters and digits) was tried without success,
            # so no custom config is passed.
            text = pytesseract.image_to_string(image, lang=language)
            text = replace_text(text, replacements)
            text = remove_line_breaks(text)
            f.write(f"{text}\n\n")


### Tests

In [21]:
# Scratch cell: preview the preprocessing result for one extracted page.
# pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'
# image = Image.open(images_path/'page01_image1.png')
# NOTE: the first imread result is overwritten by the second, so only page 2 is previewed.
image = cv2.imread(str(images_path/'page001_image1.jpeg'))
image = cv2.imread(str(images_path/'page002_image1.png'))
image = preprocess_image(image)
# preprocess_image returns a PIL image; show() opens it in an external viewer.
image.show()

# mytext = pytesseract.image_to_string(image, lang='ell+eng')
# mytext

In [41]:
# Scratch cell: same steps as preprocess_image, but with an adaptive-threshold
# block size of 5 instead of 9.
# NOTE(review): cv2.imread returns None for a missing file, and other cells use
# 3-digit names such as 'page001_image1.jpeg' — confirm 'page01_image1.png' exists.
image = cv2.imread(str(images_path/'page01_image1.png'))
gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
image = cv2.adaptiveThreshold(gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 5, 2)
image = equalize(image)
image = convert_to_binary(image)
image.show()

### Extract Text from Images

In [137]:
# extract_text_from_images(images_path, num_images=3, img_format='*', output_file='output.txt', language='ell+eng')
# 1-based positions (in processing order) of the images to rotate before OCR.
image_ind_to_rotate = [105, 113, 145] + list(range(150, 158)) + [160, 161]
# num_images=0 processes every image; img_format='*' matches any file extension.
extract_text_from_images(images_path, num_images=0, img_format='*', output_file=output_path/'output.txt', language='ell', 
                         indices=image_ind_to_rotate, rotate={'rotate': cv2.ROTATE_90_CLOCKWISE})
