In [1]:
!pip install pytesseract
!apt-get install tesseract-ocr tesseract-ocr-heb poppler-utils
!pip install opencv-python

Collecting pytesseract
  Downloading pytesseract-0.3.13-py3-none-any.whl.metadata (11 kB)
Downloading pytesseract-0.3.13-py3-none-any.whl (14 kB)
Installing collected packages: pytesseract
Successfully installed pytesseract-0.3.13
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
tesseract-ocr is already the newest version (4.1.1-2.1build1).
The following NEW packages will be installed:
  poppler-utils tesseract-ocr-heb
0 upgraded, 2 newly installed, 0 to remove and 35 not upgraded.
Need to get 618 kB of archives.
After this operation, 1,673 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy-updates/main amd64 poppler-utils amd64 22.02.0-2ubuntu0.10 [186 kB]
Get:2 http://archive.ubuntu.com/ubuntu jammy/universe amd64 tesseract-ocr-heb all 1:4.00~git30-7274cfa-1.1 [432 kB]
Fetched 618 kB in 1s (573 kB/s)
Selecting previously unselected package poppler-utils.
(Reading database ... 126371 files and director

In [2]:
import cv2
import numpy as np
import os
from pathlib import Path
import matplotlib.pyplot as plt
from PIL import Image
import json
from typing import List, Dict, Tuple

In [3]:
BASE_PATH = "/content/drive/MyDrive/Miki_class/Project/Catalog"
MULTI_AD_INPUT = os.path.join(BASE_PATH, "classified_images", "multi_ads")
MULTI_AD_OUTPUT = os.path.join(BASE_PATH, "processed_ads", "multi_ads")

In [4]:
# Simple parameters
MIN_AD_SIZE = 200  # Minimum width and height for ads
JPEG_QUALITY = 90  # Output quality

In [5]:
def extract_page_number(filename: str) -> int:
    """Extract page number from filename"""
    try:
        name_without_ext = os.path.splitext(filename)[0]
        if name_without_ext.startswith('page_'):
            return int(name_without_ext.split('_')[1])
        else:
            return int(name_without_ext)
    except:
        return 0

In [6]:
def get_multi_ad_images(input_base_path: str) -> List[Dict]:
    """Get all images from multi_ad_pages folders"""
    image_extensions = {'.jpg', '.jpeg', '.png', '.bmp', '.tiff'}
    all_images = []

    if not os.path.exists(input_base_path):
        print(f" Input path does not exist: {input_base_path}")
        return []

    for issue_folder in os.listdir(input_base_path):
        issue_path = os.path.join(input_base_path, issue_folder)

        if not os.path.isdir(issue_path):
            continue

        print(f" Found issue folder: {issue_folder}")

        issue_images = []
        for filename in os.listdir(issue_path):
            if any(filename.lower().endswith(ext) for ext in image_extensions):
                page_num = extract_page_number(filename)
                if page_num > 0:
                    image_info = {
                        'issue': issue_folder,
                        'page_num': page_num,
                        'filename': filename,
                        'full_path': os.path.join(issue_path, filename)
                    }
                    issue_images.append(image_info)

        issue_images.sort(key=lambda x: x['page_num'])
        all_images.extend(issue_images)

        print(f"   Found {len(issue_images)} images")
        for img in issue_images:
            print(f"    - Page {img['page_num']}: {img['filename']}")

    return all_images

In [7]:
def detect_and_crop_ads_simple(image_path: str, output_dir: str, issue: str, page_num: int) -> Dict:
    """
    Improved ad detection - larger size threshold to avoid small text blocks
    """
    print(f"\n Processing: Issue {issue}, Page {page_num}")

    try:
        # Read the image
        image = cv2.imread(image_path)
        if image is None:
            print(f" Could not load image: {image_path}")
            return {'success': False, 'error': 'Could not load image'}

        print(f"   Image size: {image.shape[1]}x{image.shape[0]}")

        # Convert to grayscale
        gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

        # Edge detection with lower thresholds to catch weaker edges
        edges = cv2.Canny(gray, 50, 120)  # More sensitive to weak edges
        contours, _ = cv2.findContours(edges, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

        print(f"   Found {len(contours)} total contours")

        # Calculate dynamic size thresholds based on image size
        image_area = image.shape[0] * image.shape[1]

        # Minimum ad should be at least 1/50th of the page area (less strict)
        min_ad_area = image_area / 50

        # Or use minimum dimensions (less strict)
        min_width = max(250, int(image.shape[1] * 0.12))  # 12% of image width
        min_height = max(250, int(image.shape[0] * 0.12))  # 12% of image height

        print(f"   Thresholds - Min area: {min_ad_area:.0f}, Min size: {min_width}x{min_height}")

        # Process contours with stricter filtering
        saved_ads = []
        ad_count = 0

        for idx, contour in enumerate(contours):
            x, y, w, h = cv2.boundingRect(contour)
            area = w * h

            # Multiple criteria for valid ads
            size_ok = w >= min_width and h >= min_height
            area_ok = area >= min_ad_area
            aspect_ok = 0.2 <= (w/h) <= 5.0  # Very lenient aspect ratio

            if size_ok and area_ok and aspect_ok:
                ad_count += 1

                # Extract ad region immediately
                ad_image = image[y:y + h, x:x + w]

                # Save immediately
                ad_filename = f"issue_{issue}_page_{page_num}_ad_{ad_count}.jpg"
                ad_path = os.path.join(output_dir, ad_filename)

                success = cv2.imwrite(ad_path, ad_image, [cv2.IMWRITE_JPEG_QUALITY, JPEG_QUALITY])

                if success:
                    saved_ads.append(ad_filename)
                    print(f"   Saved ad {ad_count}: {ad_filename} ({w}x{h}, area={area:.0f})")
                else:
                    print(f"   Failed to save ad {ad_count}")
            else:
                print(f"   Rejected: {w}x{h} (area={area:.0f}) - size_ok={size_ok}, area_ok={area_ok}, aspect_ok={aspect_ok}")

        # Fallback: if no ads found, save full page
        if len(saved_ads) == 0:
            print(f"   No ads detected - saving full page")
            full_page_filename = f"issue_{issue}_page_{page_num}_full_page.jpg"
            full_page_path = os.path.join(output_dir, full_page_filename)
            success = cv2.imwrite(full_page_path, image, [cv2.IMWRITE_JPEG_QUALITY, JPEG_QUALITY])

            if success:
                saved_ads.append(full_page_filename)

        return {
            'success': True,
            'issue': issue,
            'page_num': page_num,
            'ads_detected': ad_count,
            'ads_saved': len(saved_ads),
            'output_files': saved_ads,
            'detection_method': 'improved_thresholds'
        }

    except Exception as e:
        print(f" Error processing page: {e}")
        return {'success': False, 'error': str(e)}

In [8]:
def process_multi_ad_page(image_info: Dict, output_dir: str) -> Dict:
    """Process a multi-ad page using simple direct approach"""
    return detect_and_crop_ads_simple(
        image_info['full_path'],
        output_dir,
        image_info['issue'],
        image_info['page_num']
    )

In [9]:
# Get all multi-ad images
multi_ad_images = get_multi_ad_images(MULTI_AD_INPUT)

if multi_ad_images:
    print(f"\n FOUND {len(multi_ad_images)} MULTI-AD IMAGES:")
    print("="*50)

    current_issue = None
    for img in multi_ad_images:
        if img['issue'] != current_issue:
            current_issue = img['issue']
            print(f"\n Issue {current_issue}:")
        print(f"  - Page {img['page_num']}: {img['filename']}")
else:
    print(" No multi-ad images found. Check your folder structure!")

# Process all images
if multi_ad_images:
    print(f"\n Processing {len(multi_ad_images)} multi-ad images...")

    # Create output directory
    os.makedirs(MULTI_AD_OUTPUT, exist_ok=True)

    # Process each image
    results = {
        'total_processed': 0,
        'successful': 0,
        'failed': 0,
        'total_ads_detected': 0,
        'total_ads_saved': 0,
        'details': []
    }

    for image_info in multi_ad_images:
        result = process_multi_ad_page(image_info, MULTI_AD_OUTPUT)
        results['details'].append(result)
        results['total_processed'] += 1

        if result['success']:
            results['successful'] += 1
            results['total_ads_detected'] += result.get('ads_detected', 0)
            results['total_ads_saved'] += result.get('ads_saved', 0)
        else:
            results['failed'] += 1

    # Print summary
    print(f"\n PROCESSING COMPLETE:")
    print("="*40)
    print(f"Total pages processed: {results['total_processed']}")
    print(f"Successful: {results['successful']}")
    print(f"Failed: {results['failed']}")
    print(f"Total ads detected: {results['total_ads_detected']}")
    print(f"Total ads saved: {results['total_ads_saved']}")

    if results['successful'] > 0:
        avg_ads_per_page = results['total_ads_detected'] / results['successful']
        print(f"Average ads per page: {avg_ads_per_page:.1f}")

    if results['failed'] > 0:
        print(f"\n Failed pages:")
        for detail in results['details']:
            if not detail['success']:
                print(f"  - Error: {detail.get('error', 'Unknown error')}")

    # Save results
    results_file = os.path.join(MULTI_AD_OUTPUT, "processing_results.json")
    with open(results_file, 'w') as f:
        json.dump(results, f, indent=2, default=str)
    print(f"\n Results saved to: {results_file}")

    # Show output directory contents
    if os.path.exists(MULTI_AD_OUTPUT):
        output_files = [f for f in os.listdir(MULTI_AD_OUTPUT) if f.endswith('.jpg')]
        print(f"\n Output directory contains {len(output_files)} ad files:")
        for file in sorted(output_files)[:10]:  # Show first 10
            print(f"  - {file}")
        if len(output_files) > 10:
            print(f"  ... and {len(output_files) - 10} more")

else:
    print(" No images to process")

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
   Rejected: 1x1 (area=1) - size_ok=False, area_ok=False, aspect_ok=True
   Rejected: 10x9 (area=90) - size_ok=False, area_ok=False, aspect_ok=True
   Rejected: 9x10 (area=90) - size_ok=False, area_ok=False, aspect_ok=True
   Rejected: 1x1 (area=1) - size_ok=False, area_ok=False, aspect_ok=True
   Rejected: 18x14 (area=252) - size_ok=False, area_ok=False, aspect_ok=True
   Rejected: 1x2 (area=2) - size_ok=False, area_ok=False, aspect_ok=True
   Rejected: 2x1 (area=2) - size_ok=False, area_ok=False, aspect_ok=True
   Rejected: 1x1 (area=1) - size_ok=False, area_ok=False, aspect_ok=True
   Rejected: 2x1 (area=2) - size_ok=False, area_ok=False, aspect_ok=True
   Rejected: 19x29 (area=551) - size_ok=False, area_ok=False, aspect_ok=True
   Rejected: 1x1 (area=1) - size_ok=False, area_ok=False, aspect_ok=True
   Rejected: 1x1 (area=1) - size_ok=False, area_ok=False, aspect_ok=True
   Rejected: 2x1 (area=2) - size_ok=False, area

In [10]:
# def visualize_simple_detection(image_info: List[Dict], max_images: int = 2):
#     """Visualize detection results"""
#     print(f"\n Detection Visualization (showing first {max_images} pages)")

#     count = 0
#     for img_info in image_info:
#         if count >= max_images:
#             break

#         try:
#             # Load image
#             original = cv2.imread(img_info['full_path'])
#             if original is None:
#                 continue

#             # Detect contours for visualization
#             gray = cv2.cvtColor(original, cv2.COLOR_BGR2GRAY)
#             edges = cv2.Canny(gray, 100, 200)
#             contours, _ = cv2.findContours(edges, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

#             # Draw detection boxes
#             vis_image = original.copy()
#             valid_ads = 0
#             for contour in contours:
#                 x, y, w, h = cv2.boundingRect(contour)
#                 if w > MIN_AD_SIZE and h > MIN_AD_SIZE:
#                     valid_ads += 1
#                     cv2.rectangle(vis_image, (x, y), (x + w, y + h), (0, 255, 0), 3)
#                     cv2.putText(vis_image, f'Ad {valid_ads}', (x, y-10),
#                                cv2.FONT_HERSHEY_SIMPLEX, 0.7, (0, 255, 0), 2)

#             # Convert for matplotlib
#             original_rgb = cv2.cvtColor(original, cv2.COLOR_BGR2RGB)
#             vis_rgb = cv2.cvtColor(vis_image, cv2.COLOR_BGR2RGB)

#             # Create visualization
#             fig, axes = plt.subplots(1, 2, figsize=(20, 10))

#             # Original
#             axes[0].imshow(original_rgb)
#             axes[0].set_title(f'Original - Issue {img_info["issue"]}, Page {img_info["page_num"]}\nSize: {original.shape[1]}x{original.shape[0]}', fontsize=12)
#             axes[0].axis('off')

#             # Detection results
#             axes[1].imshow(vis_rgb)
#             axes[1].set_title(f'Simple Detection: {valid_ads} ads found\nMethod: Direct processing', fontsize=12)
#             axes[1].axis('off')

#             plt.tight_layout()
#             plt.show()

#             count += 1

#         except Exception as e:
#             print(f"Error visualizing {img_info.get('filename', 'unknown')}: {e}")

# # Run visualization if we have successful results
# if multi_ad_images and results['successful'] > 0:
#     visualize_simple_detection(multi_ad_images[:2])
# else:
#     print(" No successful results to visualize")