# OCR and image to EPUB "Papá o el diario de Alicia Mir"

This is the first version of the OCR pipeline. It does the following:

- Detects the text of the book page by page and reports an OCR confidence score for each page.
- Saves all pages together in a single txt file.

Missing:
- The txt still requires manual revision before it can be turned into an EPUB.
- General accuracy analytics and revision rules to simplify reviewing the complete text.
- A front-end.

## Part 1: OCR Tests

This section is deprecated, but it is still useful for validating the OCR on this book.

In [1]:
import fitz  # PyMuPDF
from PIL import Image
import pytesseract
import io
import cv2
import numpy as np
import re
import language_tool_python

# Initialize LanguageTool for Spanish; `tool.check(...)` is used further down
# to spell- and grammar-check each page.
# NOTE(review): the output recorded for this cell is a CalledProcessError from
# `/usr/bin/java -version`, so this call appears to need a working Java
# runtime — confirm before running.
tool = language_tool_python.LanguageTool('es')

# Path to your PDF file (relative to the notebook's working directory)
pdf_path = 'book.pdf'

def preprocess_with_opencv(img):
    """
    Turn a page image into a binarized array ready for OCR.

    The image is converted to grayscale, smoothed with a 5x5 Gaussian
    kernel and binarized with Otsu's threshold. No deskewing is performed.
    """
    gray = cv2.cvtColor(np.array(img), cv2.COLOR_RGB2GRAY)
    smoothed = cv2.GaussianBlur(gray, (5, 5), 0)
    otsu_flags = cv2.THRESH_BINARY + cv2.THRESH_OTSU
    # cv2.threshold returns a (value, image) pair; only the image is needed
    return cv2.threshold(smoothed, 0, 255, otsu_flags)[1]

def detect_page_number(text):
    """
    Collect the lines of *text* that look like page numbers.

    A line qualifies when, ignoring surrounding whitespace and letter case,
    it is exactly "Page N", "N / M" or a bare number. The matching lines are
    returned unstripped, in their original order.
    """
    page_number_pattern = r"^(page\s*\d+|\d+\s*/\s*\d+|\d+)$"
    return [
        candidate
        for candidate in text.splitlines()
        if re.match(page_number_pattern, candidate.strip(), re.IGNORECASE)
    ]

def clean_text(text):
    """
    Clean the extracted OCR text line by line, preserving meaningful punctuation.

    Each line is stripped and then blanked (its line break is kept) when it:
    - is made up of less than 50% alphanumeric characters,
    - is shorter than 3 characters, or
    - starts with the same character repeated four or more times.
    Surviving lines have every character removed that is not a word
    character, whitespace, or common Spanish punctuation.

    :param text: The OCR text to clean.
    :return: The cleaned text with the same number of lines and the same
        line terminators as the input.
    """
    cleaned_parts = []

    # Every line is rewritten individually. A global str.replace() of the
    # original line would also alter other lines that merely contain it.
    for raw_line in text.splitlines(keepends=True):
        content = raw_line.splitlines()[0]
        terminator = raw_line[len(content):]
        cleaned_line = content.strip()

        # str.isalnum() is Unicode-aware, so accented Spanish letters
        # (á, é, ñ, ...) count as letters rather than as noise.
        alnum_count = sum(ch.isalnum() for ch in cleaned_line)

        if alnum_count / max(len(cleaned_line), 1) < 0.5:
            # Mostly symbols
            cleaned_line = ""
        elif len(cleaned_line) < 3:
            # Too short (e.g., isolated symbols)
            cleaned_line = ""
        elif re.match(r"(.)\1{3,}", cleaned_line):
            # Starts with a run of repeated characters (e.g., "aaaaaa" or "!!!!")
            cleaned_line = ""
        else:
            # Keep meaningful punctuation while removing stray symbols
            cleaned_line = re.sub(r"[^\w\s,.!?¿¡:\"'()-]", "", cleaned_line)

        cleaned_parts.append(cleaned_line + terminator)

    return "".join(cleaned_parts)

def normalize_newlines(text):
    """
    Collapse line breaks so that only paragraph breaks remain.

    - A lone newline (one with no newline on either side) becomes a space.
    - Any run of two or more newlines becomes exactly two newlines.
    """
    lone_newline = r"(?<!\n)\n(?!\n)"
    newline_run = r"\n{2,}"
    unwrapped = re.sub(lone_newline, " ", text)
    return re.sub(newline_run, "\n\n", unwrapped)

def remove_short_lines_in_place(text):
    """
    Trim every line of the text and re-join words hyphenated across a line break.

    Lines are delimited by '\\n' only. Each line is stripped of surrounding
    whitespace. When a line ends with "-" and the following line is not
    blank, the hyphen is dropped and the following line is appended directly
    to it. Despite the function's name, no line is removed.

    :param text: The input text.
    :return: The text with trimmed lines and hyphenated line breaks merged.
    """
    pieces = text.split("\n")
    # A trailing '\n' (or empty input) leaves a final empty piece that is not
    # a real line; dropping it keeps the line count unchanged.
    if pieces[-1] == "":
        pieces.pop()

    result = []
    for piece in pieces:
        line = piece.strip()
        # Only merge into an existing, hyphen-terminated line; guarding on
        # `result` avoids indexing an empty list for the first line.
        # A blank line is a paragraph break, so the hyphen is left alone.
        if line and result and result[-1].endswith("-"):
            # Remove the hyphen and concatenate with this line
            result[-1] = result[-1][:-1] + line
        else:
            result.append(line)

    # Reassemble the text from the processed lines
    return "\n".join(result)

# Initialize an empty list to hold the text for each page; the processing loop
# below appends the corrected text of every page that passes is_meaningful_text.
pages_text = []

def is_meaningful_text(text, min_length=10, min_alpha_ratio=0.5):
    """
    Decide whether a piece of text is worth keeping.

    The text is judged after stripping surrounding whitespace.
    :param text: The text to check.
    :param min_length: Texts shorter than this many characters are rejected.
    :param min_alpha_ratio: Texts whose share of alphanumeric characters is
        below this value are rejected.
    :return: True if the text passes both checks, False otherwise.
    """
    candidate = text.strip()
    size = len(candidate)

    # Too short to be meaningful
    if size < min_length:
        return False

    # Share of letters/digits among all characters (guarding against size 0)
    alnum_total = sum(1 for ch in candidate if ch.isalnum())
    return not (alnum_total / max(size, 1) < min_alpha_ratio)

# Open the PDF file and OCR it page by page. For each page the raw OCR text is
# cleaned, spell-checked with LanguageTool, printed for inspection and, when it
# passes is_meaningful_text, appended to pages_text.
with fitz.open(pdf_path) as pdf_document:
    for page_num in range(pdf_document.page_count):
        # Get each page as a pixmap (image)
        page = pdf_document.load_page(page_num)
        pix = page.get_pixmap(dpi=300)

        # Convert pixmap to PIL image
        img = Image.open(io.BytesIO(pix.tobytes("png")))

        # Preprocess the image with OpenCV
        preprocessed_img = preprocess_with_opencv(img)

        # Convert the processed OpenCV image back to a PIL image for Tesseract
        preprocessed_pil_img = Image.fromarray(preprocessed_img)

        # Perform OCR on the preprocessed image using Spanish language
        page_text = pytesseract.image_to_string(preprocessed_pil_img, lang='spa').strip()

        # Detect and blank out page-number lines. Whole lines are matched
        # instead of using str.replace(), which would also delete the same
        # digits wherever they occur inside ordinary sentences (e.g. a page
        # number "12" would turn "1912" into "19").
        page_number_lines = set(detect_page_number(page_text))
        if page_number_lines:
            page_text = "\n".join(
                "" if line in page_number_lines else line
                for line in page_text.splitlines()
            )

        # Clean nonsensical text
        page_text = clean_text(page_text)
        page_text = normalize_newlines(page_text)
        page_text = remove_short_lines_in_place(page_text)
        # Drop 1-3 character fragments that sit alone between blank lines,
        # then collapse the newline runs this leaves behind
        page_text = re.sub(r"(\n\n)(.{1,3})(\n\n)", r"\1\3", page_text, flags=re.DOTALL)
        page_text = re.sub(r"\n{3,}", "\n\n", page_text)

        # Correct spelling and grammar
        matches = tool.check(page_text)
        corrected_text = language_tool_python.utils.correct(page_text, matches)
        # Removes every colon that is followed by another character on the
        # same line. NOTE(review): the reason is not evident from this cell —
        # confirm it is still wanted.
        corrected_text = re.sub(r":(.)", r"\1", corrected_text)
        if is_meaningful_text(corrected_text):
            pages_text.append(corrected_text)
        print(f"\n------------------------\n------------------------\nOriginal:\n{page_text}\n------------------------\n------------------------\nFinal:\n{corrected_text}\n##############################\n##############################\n\n")

pages_text

CalledProcessError: Command '['/usr/bin/java', '-version']' returned non-zero exit status 1.

## Using the OCR

This section saves the complete book.

In [3]:
import os
import fitz  # PyMuPDF
from PIL import Image
import pytesseract
import io
import cv2
import numpy as np
import re
import language_tool_python

# Initialize LanguageTool for Spanish; `tool.check(...)` is used below to
# spell- and grammar-check each page.
# NOTE(review): the output recorded for this cell is a CalledProcessError from
# `/usr/bin/java -version`, so this call appears to need a working Java
# runtime — confirm before running.
tool = language_tool_python.LanguageTool('es')

# Path to your PDF file (relative to the notebook's working directory)
pdf_path = 'data/raw/book.pdf'

# Create output folder (no error if it already exists)
output_folder = "output"
os.makedirs(output_folder, exist_ok=True)

def preprocess_with_opencv(img):
    """Return a grayscale, blurred and Otsu-binarized copy of the page image."""
    pixels = np.array(img)
    grayscale = cv2.cvtColor(pixels, cv2.COLOR_RGB2GRAY)
    blurred = cv2.GaussianBlur(grayscale, (5, 5), 0)
    threshold_mode = cv2.THRESH_BINARY + cv2.THRESH_OTSU
    # The first element of the returned pair is a threshold value; it is not needed
    binarized = cv2.threshold(blurred, 0, 255, threshold_mode)[1]
    return binarized

def clean_text(text):
    """
    Clean the OCR text to remove nonsensical lines.

    Each line is stripped and then dropped entirely when it:
    - is made up of less than 50% alphanumeric characters,
    - is shorter than 3 characters, or
    - starts with the same character repeated four or more times.
    Remaining lines have every character removed that is not a word
    character, whitespace, or one of , . ! ? ¿ ¡ : " ' ( ) ” “ – -

    :param text: The OCR text to clean.
    :return: The kept lines joined with '\\n'.
    """
    cleaned_lines = []

    for line in text.splitlines():
        cleaned_line = line.strip()

        # str.isalnum() is Unicode-aware, so accented Spanish letters
        # (á, é, ñ, ...) count as letters rather than as noise.
        alnum_count = sum(ch.isalnum() for ch in cleaned_line)
        if alnum_count / max(len(cleaned_line), 1) < 0.5:
            continue
        if len(cleaned_line) < 3:
            continue
        if re.match(r"(.)\1{3,}", cleaned_line):
            continue

        # The hyphen must be the last character of the class. Written as
        # ")-”" it forms a range from ")" to "”" that whitelists almost
        # every ASCII and Latin-1 symbol, so stray characters survive.
        cleaned_line = re.sub(r"[^\w\s,.!?¿¡:\"'()”“–-]", "", cleaned_line)
        cleaned_lines.append(cleaned_line)

    return "\n".join(cleaned_lines)

def normalize_newlines(text):
    """Replace lone newlines with spaces and squeeze longer newline runs to two."""
    # Inner call: a newline with no newline neighbour becomes a space.
    # Outer call: any run of 2+ newlines becomes exactly two.
    return re.sub(r"\n{2,}", "\n\n", re.sub(r"(?<!\n)\n(?!\n)", " ", text))
# Accumulates the formatted output of every page processed so far
final_text = ""
output_file = os.path.join(output_folder, "book.txt")

# Process each page. The output file is opened once; every page is written and
# flushed as soon as it is ready, so partial results are on disk even if a
# later page fails, without rewriting the whole file for each page.
with fitz.open(pdf_path) as pdf_document, open(output_file, "w", encoding="utf-8") as out_file:
    for page_num in range(pdf_document.page_count):
        # Render the page at 300 dpi and binarize it for OCR
        page = pdf_document.load_page(page_num)
        pix = page.get_pixmap(dpi=300)
        img = Image.open(io.BytesIO(pix.tobytes("png")))
        preprocessed_img = preprocess_with_opencv(img)
        preprocessed_pil_img = Image.fromarray(preprocessed_img)

        # Perform OCR with Tesseract
        ocr_result = pytesseract.image_to_data(preprocessed_pil_img, lang='spa', output_type=pytesseract.Output.DICT)
        text = " ".join(ocr_result['text']).strip()

        # Average confidence over recognized words only. Tesseract uses -1 as
        # the confidence of rows that are not words (page/block/line entries),
        # which would otherwise drag the average down. float() also accepts
        # the string-valued confidences some pytesseract versions return.
        word_confidences = [float(c) for c in ocr_result['conf'] if float(c) >= 0]
        confidence = sum(word_confidences) / max(len(word_confidences), 1)

        # Clean and correct text
        text = clean_text(text)
        text = normalize_newlines(text)
        matches = tool.check(text)
        corrected_text = language_tool_python.utils.correct(text, matches)
        # Runs of two or more spaces become a line break plus a 3-space indent
        corrected_text = re.sub(r" {2,}", "\n   ", corrected_text)

        # Format the output
        output_text = f"Page Number: {page_num +1}\n\nConfidence: {confidence:.2f}\n\nText:\n{corrected_text}\n\n"
        final_text += output_text

        # Save this page to the file
        out_file.write(output_text)
        out_file.flush()

        print(f"Saved page {page_num + 1} to {output_file}.", end='\r')

CalledProcessError: Command '['/usr/bin/java', '-version']' returned non-zero exit status 1.

## Creating the EPUB

This section creates the EPUB file.