# Image Processing Pipeline Notebook

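This notebook walks through the image-processing and OCR pipeline step by step: segmenting a preprocessed page image into blocks, optionally splitting each block into lines, and running OCR on the resulting blocks or lines.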


In [3]:
# Setup and imports
%reload_ext autoreload
%autoreload 1
%aimport core.ocr
%aimport core.image
%aimport core.text

import cv2
from core import ocr, image, text
from pathlib import Path
from PIL import Image
import sys
sys.path.append('src')
from main import Pipeline

# Initialize pipeline
pipeline = Pipeline()

## Block Segmentation Demo


In [14]:

def process_blocks(preprocess_path: Path, output_dir: Path, year: str, page: str):
    """Segment a preprocessed page image into blocks.

    Calls ``core.Image(preprocess_path, output_dir).block_segmentation()`` and
    pairs every returned path with the block index parsed from its filename.

    Args:
        preprocess_path: Path of the preprocessed page image.
        output_dir: Directory handed to ``core.Image`` together with the image.
        year: Not used by this function.
        page: Not used by this function.

    Returns:
        A list of ``(block_path, block_num)`` tuples, where ``block_path`` is a
        ``Path`` and ``block_num`` is the integer following the last ``-b`` in
        the file stem (e.g. ``block_y1922-p028-b0.png`` -> 0).

    Raises:
        ValueError: If the text after the last ``-b`` in a stem is not an integer.
    """
    segmented = core.Image(preprocess_path, output_dir).block_segmentation()
    # Normalise each entry to a Path, then read the index off the end of the stem
    return [
        (path, int(path.stem.rsplit('-b', 1)[-1]))
        for path in map(Path, segmented)
    ]

def process_lines(block_path: Path, output_dir: Path, year: str, page: str, block_num: int):
    """Segment a block image into text lines.

    Calls ``core.Image(block_path, output_dir).line_segmentation()`` and pairs
    every returned path with the line index parsed from its filename.

    Args:
        block_path: Path of the block image to split into lines.
        output_dir: Directory handed to ``core.Image`` together with the image.
        year: Not used by this function.
        page: Not used by this function.
        block_num: Not used by this function.

    Returns:
        A list of ``(line_path, line_num, height)`` tuples. ``line_num`` is the
        integer following the last ``-r`` in the file stem
        (e.g. ``line_y1922-p028-b0-r5.png`` -> 5). ``height`` is always 0.

    Raises:
        ValueError: If the text after the last ``-r`` in a stem is not an integer.
    """
    # The filename carries no height information, so a fixed default is reported for now
    default_height = 0
    segmented = core.Image(block_path, output_dir).line_segmentation()
    return [
        (path, int(path.stem.rsplit('-r', 1)[-1]), default_height)
        for path in map(Path, segmented)
    ]

def process_block_text(block_path: Path, year: str, page: str, block_num: int):
    """Run OCR over a whole block image.

    Args:
        block_path: Path of the block image passed to ``core.OCR``.
        year: Not used by this function.
        page: Not used by this function.
        block_num: Not used by this function.

    Returns:
        Whatever ``core.OCR(block_path).block_to_string()`` returns.
    """
    reader = core.OCR(block_path)
    return reader.block_to_string()

def process_line_text(line_path: Path, year: str, page: str, block_num: int, line_num: int, height: int):
    """Run OCR over a single line image.

    Args:
        line_path: Path of the line image passed to ``core.OCR``.
        year: Not used by this function.
        page: Not used by this function.
        block_num: Not used by this function.
        line_num: Not used by this function.
        height: Not used by this function.

    Returns:
        Whatever ``core.OCR(line_path).line_to_string()`` returns.
    """
    reader = core.OCR(line_path)
    return reader.line_to_string()

# Process blocks
# The page identifiers are defined once and reused for both the input filename
# and the call arguments, so switching pages is a single edit.
YEAR = '1922'
PAGE = '028'

blocks = process_blocks(
    preprocess_path=pipeline.io.PATH_PREPROCESS / f'preprocess_y{YEAR}-p{PAGE}.png',
    output_dir=pipeline.io.PATH_BLOCK,
    year=YEAR,
    page=PAGE,
)


2024-12-01 16:49:44,539 - INFO -  [1mStarting - Line                    segmentation in /Users/floriancochard/Documents/extract_data_from_paper/src/data/output/block/block_y1922-p028-b0.png file. [0m
2024-12-01 16:49:44,626 - DEBUG - 	 > preprocessing block
2024-12-01 16:49:44,626 - DEBUG - 	 > grayscale
2024-12-01 16:49:44,628 - DEBUG - 	 > binarize
2024-12-01 16:49:44,635 - DEBUG - 	 > segment lines
2024-12-01 16:49:44,858 - INFO - 		 > 0-th line                        considered (area = 9748)
2024-12-01 16:49:44,863 - INFO - 		 > 1-th line                        considered (area = 135271)
2024-12-01 16:49:44,875 - INFO - 		 > 2-th line                        considered (area = 6290)
2024-12-01 16:49:44,877 - INFO - 		 > 5-th line                        considered (area = 342418)
2024-12-01 16:49:44,902 - INFO - 		 > 6-th line                        considered (area = 21207)
2024-12-01 16:49:44,907 - INFO - 		 > 7-th line                        considered (area = 712737)
2024-12-01

## Pipeline Execution


In [12]:

# Process method-specific segmentation
# Collects one OCR string per block (method "BLOCK") or per line (method "LINE").
# NOTE(review): the loops below unpack each entry of `blocks` as
# (block_path, block_num). An unpacking error here presumably means `blocks`
# is left over from an earlier kernel state -- re-run the block segmentation
# cell before this one.
method = pipeline.params.METHOD
strings = []

try:
    if method == "BLOCK":
        for block_path, block_num in blocks:
            string = process_block_text(block_path, '1922', '028', block_num)
            strings.append(string)

    elif method == "LINE":
        for block_path, block_num in blocks:
            lines = process_lines(block_path, pipeline.io.PATH_LINE, '1922', '028', block_num)
            for line_path, line_num, height in lines:
                string = process_line_text(line_path, '1922', '028', block_num, line_num, height)
                strings.append(string)
    else:
        raise ValueError(f"Unsupported method: {method}")
except Exception as e:
    # Stay best-effort (the notebook keeps running), but also show the full
    # traceback: the one-line message alone does not say which statement failed.
    import traceback

    print(f"Error processing images: {str(e)}")
    traceback.print_exc()

Error processing images: too many values to unpack (expected 2)
