In [1]:
!pip install pytesseract
!apt-get install tesseract-ocr tesseract-ocr-heb poppler-utils

Collecting pytesseract
  Downloading pytesseract-0.3.13-py3-none-any.whl.metadata (11 kB)
Downloading pytesseract-0.3.13-py3-none-any.whl (14 kB)
Installing collected packages: pytesseract
Successfully installed pytesseract-0.3.13
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
tesseract-ocr is already the newest version (4.1.1-2.1build1).
The following NEW packages will be installed:
  poppler-utils tesseract-ocr-heb
0 upgraded, 2 newly installed, 0 to remove and 35 not upgraded.
Need to get 618 kB of archives.
After this operation, 1,673 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy-updates/main amd64 poppler-utils amd64 22.02.0-2ubuntu0.10 [186 kB]
Get:2 http://archive.ubuntu.com/ubuntu jammy/universe amd64 tesseract-ocr-heb all 1:4.00~git30-7274cfa-1.1 [432 kB]
Fetched 618 kB in 0s (2,728 kB/s)
Selecting previously unselected package poppler-utils.
(Reading database ... 126371 files and direct

In [2]:
import subprocess
import sys

def install_hebrew_ocr():
    """Install Hebrew language pack for tesseract"""
    try:
        print("📦 Installing Hebrew OCR support...")

        # Install Hebrew language pack
        result = subprocess.run(['apt-get', 'update'], capture_output=True, text=True)
        result = subprocess.run(['apt-get', 'install', '-y', 'tesseract-ocr-heb'], capture_output=True, text=True)

        if result.returncode == 0:
            print("✅ Hebrew language pack installed successfully!")

            # Test Hebrew support
            import pytesseract
            langs = pytesseract.get_languages()
            if 'heb' in langs:
                print("✅ Hebrew (heb) language confirmed available")
                print(f"📋 Available languages: {sorted(langs)}")
                return True
            else:
                print("⚠️ Hebrew not found in available languages")
                print(f"📋 Available languages: {sorted(langs)}")
                return False
        else:
            print(f"❌ Installation failed: {result.stderr}")
            return False

    except Exception as e:
        print(f"❌ Error installing Hebrew support: {e}")
        return False

# Install Hebrew support
hebrew_installed = install_hebrew_ocr()

📦 Installing Hebrew OCR support...
✅ Hebrew language pack installed successfully!
✅ Hebrew (heb) language confirmed available
📋 Available languages: ['eng', 'heb', 'osd']


In [3]:
import cv2
import numpy as np
import os
from pathlib import Path
import matplotlib.pyplot as plt
from PIL import Image
import json
from typing import List, Dict, Tuple
import pytesseract
import re

pytesseract.pytesseract.tesseract_cmd = '/usr/bin/tesseract'

In [4]:
BASE_PATH = "/content/drive/MyDrive/Miki_class/Project/Catalog"
SINGLE_AD_INPUT = os.path.join(BASE_PATH, "classified_images", "single_ads")
SINGLE_AD_OUTPUT = os.path.join(BASE_PATH, "processed_ads", "single_ads")

In [5]:
# Processing parameters
CROP_MARGINS = True     # Whether to crop newspaper margins/headers
TOP_MARGIN = 50         # Pixels to crop from top (for headers)
AUTO_DETECT_FOOTER = True  # Automatically detect and remove footers
FOOTER_SCAN_HEIGHT = 180   # Height from bottom to scan for footer (pixels)
MANUAL_BOTTOM_MARGIN = 30  # Fallback bottom margin if no footer detected
LEFT_MARGIN = 20        # Pixels to crop from left
RIGHT_MARGIN = 20       # Pixels to crop from right

# Footer detection parameters
FOOTER_TEXT_PATTERNS = [
    "כח הפרסום",           # Main footer text
    r"הפרסום\s+\d+",       # "הפרסום" + space + any numbers
    r"כח הפרסום\s+\d+",    # "כח הפרסום" + space + any numbers
    r"\d+\s+הפרסום",       # Numbers + space + "הפרסום" (reverse order)
    "הפרסום",
]

# Multiple OCR strategies
OCR_STRATEGIES = [
    ("Hebrew + English", r'--oem 3 --psm 6 -l heb+eng'),
    ("Hebrew only", r'--oem 3 --psm 6 -l heb'),
    ("Single text line Hebrew", r'--oem 3 --psm 7 -l heb'),
    ("Single word Hebrew", r'--oem 3 --psm 8 -l heb'),
    ("Sparse text Hebrew", r'--oem 3 --psm 11 -l heb'),
] if hebrew_installed else [
    ("English fallback", r'--oem 3 --psm 6 -l eng'),
    ("Auto detect", r'--oem 3 --psm 6'),
]

# Quality settings
MIN_IMAGE_SIZE = (200, 200)  # Minimum size for output ads
JPEG_QUALITY = 90           # Output quality (1-100)

In [6]:
def extract_page_number(filename: str) -> int:
    try:
        # Remove extension
        name_without_ext = os.path.splitext(filename)[0]

        if name_without_ext.startswith('page_'):
            # Format: page_05.jpg
            return int(name_without_ext.split('_')[1])
        else:
            # Format: 5.jpg
            return int(name_without_ext)
    except:
        return 0  # Return 0 if can't parse

In [7]:
def get_single_ad_images(input_base_path: str) -> List[Dict]:
    """Get all images from single_ad_pages folders"""
    image_extensions = {'.jpg', '.jpeg', '.png', '.bmp', '.tiff'}
    all_images = []

    if not os.path.exists(input_base_path):
        print(f"❌ Input path does not exist: {input_base_path}")
        return []

    # Go through each issue folder
    for issue_folder in os.listdir(input_base_path):
        issue_path = os.path.join(input_base_path, issue_folder)

        if not os.path.isdir(issue_path):
            continue

        print(f"📁 Found issue folder: {issue_folder}")

        # Get all image files in this issue
        issue_images = []
        for filename in os.listdir(issue_path):
            if any(filename.lower().endswith(ext) for ext in image_extensions):
                page_num = extract_page_number(filename)
                if page_num > 0:  # Only include files we can parse
                    image_info = {
                        'issue': issue_folder,
                        'page_num': page_num,
                        'filename': filename,
                        'full_path': os.path.join(issue_path, filename)
                    }
                    issue_images.append(image_info)

        # Sort by page number
        issue_images.sort(key=lambda x: x['page_num'])
        all_images.extend(issue_images)

        print(f"  ✓ Found {len(issue_images)} images")
        for img in issue_images:
            print(f"    - Page {img['page_num']}: {img['filename']}")

    return all_images

In [8]:
def try_visual_detection(footer_region: np.ndarray, scan_height: int) -> Tuple[bool, int]:
    """More conservative visual footer detection"""

    gray = cv2.cvtColor(footer_region, cv2.COLOR_BGR2GRAY)
    height, width = gray.shape

    # Look for text regions in BOTTOM portion only (last 30% of scan region)
    bottom_region = gray[int(height * 0.7):, :]

    # Threshold to find dark text on light background
    _, thresh = cv2.threshold(bottom_region, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)

    # Count dark pixels (potential text)
    dark_pixel_ratio = np.sum(thresh == 255) / (bottom_region.shape[0] * bottom_region.shape[1])

    # More conservative thresholds to reduce false positives
    if 0.01 < dark_pixel_ratio < 0.15:  # Between 1% and 15% dark pixels
        # Look for horizontal text patterns
        horizontal_projection = np.sum(thresh, axis=1)
        text_lines = np.where(horizontal_projection > width * 0.1)[0]  # Lines with significant text

        if len(text_lines) > 0:
            print(f"  👁️ Visual detection: Found text patterns (dark ratio: {dark_pixel_ratio:.3f})")
            # Crop from where text starts, plus padding
            first_text_line = text_lines[0]
            crop_from_bottom = height - int(height * 0.7) - first_text_line + 20
            return True, max(40, min(crop_from_bottom, scan_height // 2))

    print(f"  👁️ Visual detection: No footer pattern (dark ratio: {dark_pixel_ratio:.3f})")
    return False, 0

In [9]:
def has_footer_characteristics(footer_region: np.ndarray) -> bool:
    """Check if region has footer-like characteristics"""

    gray = cv2.cvtColor(footer_region, cv2.COLOR_BGR2GRAY)
    height, width = gray.shape

    # Check for consistent background in top portion
    top_portion = gray[:height//3, :]
    top_std = np.std(top_portion)

    # Check for text-like patterns in bottom portion
    bottom_portion = gray[height*2//3:, :]
    edges = cv2.Canny(bottom_portion, 50, 150)
    edge_density = np.sum(edges > 0) / (bottom_portion.shape[0] * bottom_portion.shape[1])

    # Footer characteristics: uniform top, some complexity at bottom
    if top_std < 20 and edge_density > 0.01:
        return True

    return False

In [10]:
def estimate_crop_position(text: str, scan_height: int) -> int:
    """Estimate where to crop based on detected text"""
    # Simple heuristic: crop about 1/3 of scan region if we found footer text
    return min(80, scan_height // 3)

In [11]:
def try_ocr_detection(footer_region: np.ndarray, scan_height: int, scan_start_y: int) -> Tuple[bool, int, str]:
    """Try OCR-based footer detection with precise positioning"""

    # Preprocess image for better OCR
    preprocessing_methods = [
        ("original", footer_region),
        ("contrast", cv2.convertScaleAbs(cv2.cvtColor(footer_region, cv2.COLOR_BGR2GRAY), alpha=2.0, beta=0)),
        ("threshold", cv2.threshold(cv2.cvtColor(footer_region, cv2.COLOR_BGR2GRAY), 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)[1]),
        ("grayscale", cv2.cvtColor(footer_region, cv2.COLOR_BGR2GRAY)),
    ]

    best_text = ""
    best_method = ""

    for preprocess_name, processed_img in preprocessing_methods:
        # Convert to PIL for tesseract
        if len(processed_img.shape) == 3:
            pil_img = Image.fromarray(cv2.cvtColor(processed_img, cv2.COLOR_BGR2RGB))
        else:
            pil_img = Image.fromarray(processed_img)

        for strategy_name, config in OCR_STRATEGIES:
            try:
                extracted_text = pytesseract.image_to_string(pil_img, config=config)
                extracted_text = extracted_text.strip()

                if len(extracted_text) > len(best_text):
                    best_text = extracted_text
                    best_method = f"{preprocess_name} + {strategy_name}"

                # Check for footer patterns (stricter matching)
                for pattern in FOOTER_TEXT_PATTERNS:
                    if pattern in extracted_text:
                        print(f"  ✅ OCR found pattern '{pattern}' in text: '{extracted_text[:50]}...'")
                        print(f"  🔧 Using method: {preprocess_name} + {strategy_name}")

                        # Try to get precise positioning with bounding boxes
                        crop_pos = get_precise_footer_position(pil_img, config, pattern, scan_height)
                        return True, crop_pos, best_method

            except Exception as e:
                continue

    if best_text:
        print(f"  📝 Best OCR text found: '{best_text[:100]}...'")
        print(f"  ❌ But no footer patterns matched")
    else:
        print(f"  ❌ No readable text found in footer region")

    return False, 0, best_method

In [12]:
def detect_footer_location(image: np.ndarray, scan_height: int = 150) -> Tuple[bool, int, str]:
    """
    footer detection with correct cropping calculation

    Returns:
        (footer_found: bool, crop_position: int, detection_method: str)
    """
    height, width = image.shape[:2]

    # Define the bottom region to scan
    scan_region_start = max(0, height - scan_height)
    footer_region = image[scan_region_start:height, :]

    print(f"  🔍 Scanning footer region: {width}x{scan_height} pixels")
    print(f"  📍 Scan region: y={scan_region_start} to y={height}")

    # Strategy 1: OCR-based detection with precise cropping
    footer_found, crop_pos, method = try_ocr_detection(footer_region, scan_height, scan_region_start)
    if footer_found:
        print(f"  ✅ OCR detected footer, will crop {crop_pos} pixels from bottom")
        return True, crop_pos, f"OCR: {method}"

    # Strategy 2: Visual pattern detection (more conservative)
    footer_found, crop_pos = try_visual_detection(footer_region, scan_height)
    if footer_found:
        print(f"  👁️ Visual detected footer, will crop {crop_pos} pixels from bottom")
        return True, crop_pos, "Visual pattern"

    print(f"  ❌ No footer detected - using minimal crop")
    return False, 30, "None"  # Minimal fallback crop



In [13]:
def get_precise_footer_position(pil_img: Image, config: str, found_pattern: str, scan_height: int) -> int:
    """Get precise footer position using OCR bounding boxes"""
    try:
        # Get detailed OCR data with bounding boxes
        data = pytesseract.image_to_data(pil_img, config=config, output_type=pytesseract.Output.DICT)

        # Find text that contains our pattern
        min_y = scan_height  # Start from bottom of scan region

        for i, text in enumerate(data['text']):
            if text.strip() and int(data['conf'][i]) > 30:  # Confidence threshold
                if found_pattern in text or any(part in text for part in found_pattern.split()):
                    y_position = int(data['top'][i])
                    min_y = min(min_y, y_position)
                    print(f"    📍 Found footer text '{text}' at y={y_position}")

        if min_y < scan_height:
            # Add padding above the footer text
            crop_position = scan_height - min_y + 20  # 20px padding above text
            print(f"    📏 Calculated crop position: {crop_position} pixels from bottom")
            return max(50, min(crop_position, scan_height - 20))  # Ensure reasonable bounds
        else:
            print(f"    ⚠️ Could not locate footer text precisely, using default")
            return 80  # Default footer crop

    except Exception as e:
        print(f"    ⚠️ Precise positioning failed: {e}")
        return 80  # Default footer crop

In [14]:
def crop_margins(image: np.ndarray, top: int, left: int, right: int,
                      auto_footer: bool = True, scan_height: int = 150) -> Tuple[np.ndarray, Dict]:
    """
    smart crop with detailed debug output
    """
    height, width = image.shape[:2]
    print(f"  📐 Input image size: {width}x{height}")

    # Top and side margins
    y_start = min(top, height // 4)
    x_start = min(left, width // 4)
    x_end = max(width - right, width * 3 // 4)

    print(f"  ✂️ Side/top crops: left={x_start}, right={width-x_end}, top={y_start}")

    # Bottom margin with smart footer detection
    if auto_footer:
        footer_found, bottom_crop, method = detect_footer_location(image, scan_height)
    else:
        footer_found = False
        bottom_crop = MANUAL_BOTTOM_MARGIN
        method = "Manual"

    y_end = height - bottom_crop

    print(f"  📏 Bottom crop: {bottom_crop} pixels (method: {method})")
    print(f"  📏 Final crop coordinates: x={x_start}:{x_end}, y={y_start}:{y_end}")

    # Ensure valid coordinates
    y_start = max(0, y_start)
    y_end = min(height, max(y_end, height * 3 // 4))  # Don't crop more than 1/4
    x_start = max(0, x_start)
    x_end = min(width, max(x_end, width * 3 // 4))

    if y_end <= y_start or x_end <= x_start:
        print("  ⚠️ Warning: Invalid crop coordinates, returning original image")
        return image, {'error': 'Invalid crop coordinates'}

    cropped = image[y_start:y_end, x_start:x_end]
    final_height, final_width = cropped.shape[:2]

    print(f"  ✅ Final size: {final_width}x{final_height}")
    print(f"  📊 Cropped: {width-final_width}x{height-final_height} pixels total")

    crop_info = {
        'footer_detected': footer_found,
        'detection_method': method,
        'top_cropped': y_start,
        'bottom_cropped': bottom_crop,
        'left_cropped': x_start,
        'right_cropped': width - x_end,
        'original_size': (width, height),
        'cropped_size': (final_width, final_height)
    }

    return cropped, crop_info

In [15]:
def process_single_ad(image_info: Dict, output_dir: str) -> Dict:
    """Process a single ad page with smart footer detection"""
    issue = image_info['issue']
    page_num = image_info['page_num']
    filename = image_info['filename']
    input_path = image_info['full_path']

    print(f"\n📄 Processing Issue {issue}, Page {page_num} ({filename})")

    try:
        # Read image
        image = cv2.imread(input_path)
        if image is None:
            print(f"❌ Could not load image: {input_path}")
            return {'success': False, 'error': 'Could not load image'}

        original_height, original_width = image.shape[:2]
        print(f"  📐 Original size: {original_width}x{original_height}")

        # Smart crop margins if enabled
        crop_info = {}
        if CROP_MARGINS:
            image, crop_info = crop_margins(
                image, TOP_MARGIN, LEFT_MARGIN, RIGHT_MARGIN,
                AUTO_DETECT_FOOTER, FOOTER_SCAN_HEIGHT
            )

            if 'error' in crop_info:
                print(f"  ❌ Cropping failed: {crop_info['error']}")
                return {'success': False, 'error': crop_info['error']}

            new_height, new_width = image.shape[:2]
            print(f"  ✂️  After smart cropping: {new_width}x{new_height}")
            if crop_info.get('footer_detected', False):
                print(f"  📄 Footer auto-detected and removed")
            else:
                print(f"  📄 No footer detected, minimal bottom crop applied")
        else:
            new_height, new_width = original_height, original_width
            crop_info = {'footer_detected': False}

        # Check minimum size
        if new_width < MIN_IMAGE_SIZE[0] or new_height < MIN_IMAGE_SIZE[1]:
            print(f"⚠️  Warning: Image too small after cropping ({new_width}x{new_height})")

        # Create output filename
        output_filename = f"issue_{issue}_page_{page_num}_single_ad.jpg"
        output_path = os.path.join(output_dir, output_filename)

        # Save processed image
        success = cv2.imwrite(output_path, image, [cv2.IMWRITE_JPEG_QUALITY, JPEG_QUALITY])

        if success:
            print(f"  ✅ Saved: {output_filename}")
            result = {
                'success': True,
                'issue': issue,
                'page_num': page_num,
                'original_filename': filename,
                'output_filename': output_filename,
                'output_path': output_path,
                'original_size': (original_width, original_height),
                'final_size': (new_width, new_height),
                'cropped': CROP_MARGINS,
                'footer_detected': crop_info.get('footer_detected', False),
                'crop_details': crop_info
            }
        else:
            print(f"❌ Failed to save image")
            result = {'success': False, 'error': 'Failed to save image'}

        return result

    except Exception as e:
        print(f"❌ Error processing image: {e}")
        return {'success': False, 'error': str(e)}

In [16]:
print("🔍 Discovering single ad images...")
single_ad_images = get_single_ad_images(SINGLE_AD_INPUT)

if single_ad_images:
    print(f"\n📋 FOUND {len(single_ad_images)} SINGLE AD IMAGES:")
    print("="*50)

    current_issue = None
    for img in single_ad_images:
        if img['issue'] != current_issue:
            current_issue = img['issue']
            print(f"\n📖 Issue {current_issue}:")
        print(f"  - Page {img['page_num']}: {img['filename']}")
else:
    print("❌ No single ad images found. Check your folder structure!")

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
    - Page 122: 122.jpg
    - Page 123: 123.jpg
    - Page 125: 125.jpg
    - Page 126: 126.jpg
    - Page 127: 127.jpg
    - Page 129: 129.jpg
    - Page 131: 131.jpg
    - Page 153: 153.jpg
    - Page 157: 157.jpg
    - Page 159: 159.jpg
    - Page 160: 160.jpg
    - Page 161: 161.jpg
    - Page 162: 162.jpg
    - Page 163: 163.jpg
    - Page 164: 164.jpg
    - Page 165: 165.jpg
    - Page 166: 166.jpg
    - Page 167: 167.jpg
    - Page 168: 168.jpg
📁 Found issue folder: 1228
  ✓ Found 123 images
    - Page 3: 3.jpg
    - Page 5: 5.jpg
    - Page 6: 6.jpg
    - Page 7: 7.jpg
    - Page 8: 8.jpg
    - Page 9: 9.jpg
    - Page 10: 10.jpg
    - Page 11: 11.jpg
    - Page 13: 13.jpg
    - Page 15: 15.jpg
    - Page 16: 16.jpg
    - Page 17: 17.jpg
    - Page 18: 18.jpg
    - Page 19: 19.jpg
    - Page 20: 20.jpg
    - Page 21: 21.jpg
    - Page 22: 22.jpg
    - Page 23: 23.jpg
    - Page 24: 24.jpg
    - Page 25: 25.jpg
   

In [17]:
if single_ad_images:
    print(f"\n🚀 Processing {len(single_ad_images)} single ad images...")

    # Create output directory
    os.makedirs(SINGLE_AD_OUTPUT, exist_ok=True)

    # Process each image
    results = {
        'total_processed': 0,
        'successful': 0,
        'failed': 0,
        'details': []
    }

    for image_info in single_ad_images:
        result = process_single_ad(image_info, SINGLE_AD_OUTPUT)
        results['details'].append(result)
        results['total_processed'] += 1

        if result['success']:
            results['successful'] += 1
        else:
            results['failed'] += 1

    # Print summary with footer detection stats
    print(f"\n📊 PROCESSING COMPLETE:")
    print("="*40)
    print(f"Total images processed: {results['total_processed']}")
    print(f"Successful: {results['successful']}")
    print(f"Failed: {results['failed']}")

    if results['successful'] > 0:
        footer_detected_count = sum(1 for detail in results['details']
                                  if detail.get('success', False) and detail.get('footer_detected', False))
        print(f"Footer auto-detected: {footer_detected_count}/{results['successful']}")

    if results['failed'] > 0:
        print(f"\n❌ Failed images:")
        for detail in results['details']:
            if not detail['success']:
                print(f"  - Error: {detail.get('error', 'Unknown error')}")

    # Save results
    results_file = os.path.join(SINGLE_AD_OUTPUT, "processing_results.json")
    with open(results_file, 'w') as f:
        json.dump(results, f, indent=2, default=str)
    print(f"\n💾 Results saved to: {results_file}")

    # Show output directory contents
    if os.path.exists(SINGLE_AD_OUTPUT):
        output_files = [f for f in os.listdir(SINGLE_AD_OUTPUT) if f.endswith('.jpg')]
        print(f"\n📁 Output directory contains {len(output_files)} ad files:")
        for file in sorted(output_files)[:10]:  # Show first 10
            print(f"  - {file}")
        if len(output_files) > 10:
            print(f"  ... and {len(output_files) - 10} more")

else:
    print("❌ No images to process")

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
  🔧 Using method: original + Sparse text Hebrew
    ⚠️ Could not locate footer text precisely, using default
  ✅ OCR detected footer, will crop 80 pixels from bottom
  📏 Bottom crop: 80 pixels (method: OCR: original + Hebrew + English)
  📏 Final crop coordinates: x=20:1477, y=50:2259
  ✅ Final size: 1457x2209
  📊 Cropped: 40x130 pixels total
  ✂️  After smart cropping: 1457x2209
  📄 Footer auto-detected and removed
  ✅ Saved: issue_1229_page_109_single_ad.jpg

📄 Processing Issue 1229, Page 113 (page_113.jpg)
  📐 Original size: 1497x2339
  📐 Input image size: 1497x2339
  ✂️ Side/top crops: left=20, right=20, top=50
  🔍 Scanning footer region: 1497x180 pixels
  📍 Scan region: y=2159 to y=2339
  📝 Best OCR text found: 'הר...'
  ❌ But no footer patterns matched
  👁️ Visual detection: No footer pattern (dark ratio: 0.709)
  ❌ No footer detected - using minimal crop
  📏 Bottom crop: 30 pixels (method: None)
  📏 Final crop coord