# Scale Detection Pipeline - Complete Implementation

This notebook implements the complete scale detection pipeline following the Uni-AIMS paper architecture:

1. **Dataset Conversion**: Convert JSON annotations to YOLO format
2. **Model Training**: Train YOLOv8m for joint detection of scale bars and text regions
3. **Endpoint Localization**: Refine scale bar endpoints for accurate pixel length measurement
4. **OCR Processing**: Extract and parse scale text using PaddleOCR
5. **Scale Matching**: Match text regions to their corresponding scale bars
6. **Pixel-to-Physical Conversion**: Convert pixel measurements to physical units
7. **Evaluation**: Comprehensive evaluation of the entire pipeline

## Requirements

Make sure you have installed all required packages:
```bash
pip install ultralytics paddlepaddle paddleocr opencv-python scikit-image scipy matplotlib seaborn pandas scikit-learn
```


## 1. Setup and Imports


In [7]:
import os
import sys
import json
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import cv2
from PIL import Image
import warnings
warnings.filterwarnings('ignore')

# Add src directory to path
sys.path.append('src')

# Import our custom modules
from src.convert_jsons_to_yolo import convert_dataset, validate_conversion
from src.train_yolov8 import train_model, export_model, plot_training_results
from src.postprocess_scalebar import localize_scale_bar_endpoints, visualize_endpoint_detection
from src.ocr_and_match import ScaleDetectionPipeline, ScaleBarDetection, TextDetection
from src.pixels_to_mm import PixelToPhysicalConverter, ScaleInfo, create_converter_from_matches
from src.evaluate_pipeline import ComprehensiveEvaluator, EvaluationMetrics

# Auto-reload modules
%load_ext autoreload
%autoreload 2

# Set up plotting
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

# Configuration
DATA_DIR = Path("data")
IMAGES_DIR = DATA_DIR / "images"
JSONS_DIR = DATA_DIR / "jsons"
LABELS_DIR = DATA_DIR / "labels"
MODELS_DIR = Path("models")

# Get the dataset files if needed
if not os.path.exists(DATA_DIR):
    os.makedirs(DATA_DIR, exist_ok=True)
    # Download the dataset
    import kagglehub
    path = kagglehub.dataset_download("original1/scalebar-dataset")
    # Create link from images and jsons to "data" folder
    DATA_DIR.mkdir(exist_ok=True)
    if not IMAGES_DIR.exists():
        os.symlink(Path(path) / "data_publish" / "figures", IMAGES_DIR)
    if not JSONS_DIR.exists():
        os.symlink(Path(path) / "data_publish" / "jsons", JSONS_DIR)

# Create output directories
os.makedirs(LABELS_DIR, exist_ok=True)
os.makedirs(MODELS_DIR, exist_ok=True)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## 2. Dataset Exploration and Conversion


In [8]:
# Count files
image_files = list(Path(IMAGES_DIR).glob("*.jpg"))
json_files = list(Path(JSONS_DIR).glob("*.json"))

print(f"\nFound {len(image_files)} image files")
print(f"Found {len(json_files)} JSON annotation files")

# Check a few sample annotations
print("\Few example of annotations:")
for i, json_file in enumerate(json_files[:3]):
    with open(json_file, 'r') as f:
        data = json.load(f)
    print(f"\n{json_file.name}:")
    print(f"  Dimensions: {data['width']} x {data['height']}")
    print(f"  Scale bars: {len(data.get('bars', []))}")
    print(f"  Text labels: {len(data.get('labels', []))}")
    
    if data.get('labels'):
        for label in data['labels']:
            print(f"    Text: '{label.get('text', 'N/A')}'")
            break  # Show only first label



Found 14000 image files
Found 14000 JSON annotation files
\Few example of annotations:

3721.json:
  Dimensions: 701 x 800
  Scale bars: 1
  Text labels: 0

2833.json:
  Dimensions: 1442 x 1184
  Scale bars: 1
  Text labels: 1
    Text: '0.1um'

13232.json:
  Dimensions: 315 x 312
  Scale bars: 1
  Text labels: 0


In [10]:
# Convert dataset to YOLO format
print("Converting dataset to YOLO format...")
yaml_path = convert_dataset(DATA_DIR, train_split=0.8)

print(f"\nYOLO dataset created at: {DATA_DIR}")
print(f"Dataset configuration: {yaml_path}")

# Validate conversion
print("\nValidating conversion...")
stats = validate_conversion(LABELS_DIR / 'train', sample_size=20)
print(f"\nValidation results:")
for key, value in stats.items():
    print(f"  {key}: {value}")


Converting dataset to YOLO format...
Found 14000 JSON annotation files
Successfully converted 14000/14000 files


FileNotFoundError: [Errno 2] No such file or directory: 'data/images/1.jpg' -> 'data/images/train/1.jpg'

## 3. YOLOv8 Model Training


In [4]:
%env CLEARML_WEB_HOST=https://app.clear.ml/
%env CLEARML_API_HOST=https://api.clear.ml
%env CLEARML_FILES_HOST=https://files.clear.ml
%env CLEARML_API_ACCESS_KEY=X6DE0NND8TP2KDWLEZBYL3TFVJSBA1
%env CLEARML_API_SECRET_KEY=mD7N9_8a35gxuXvd_e12B5GcZ8KrBWgqohi-ZB5LKx8kDUTqfvOvodZwr4FGM08uyfU

env: CLEARML_WEB_HOST=https://app.clear.ml/
env: CLEARML_API_HOST=https://api.clear.ml
env: CLEARML_FILES_HOST=https://files.clear.ml
env: CLEARML_API_ACCESS_KEY=X6DE0NND8TP2KDWLEZBYL3TFVJSBA1
env: CLEARML_API_SECRET_KEY=mD7N9_8a35gxuXvd_e12B5GcZ8KrBWgqohi-ZB5LKx8kDUTqfvOvodZwr4FGM08uyfU


In [9]:
# Train YOLOv8 model
print("Starting YOLOv8 training...")

# Training configuration
training_config = {
    'data_yaml': yaml_path,
    'model_name': 'yolov8m.pt',
    'epochs': 50,  # Reduced for demo
    'imgsz': 1280,
    'batch': 8,
    'device': 'mps',
    'workers': 4,
    'project': 'scale_detection'
}

from clearml import Task
task = Task.init(
    project_name=training_config['project']
)

# Train the model
model, results = train_model(**training_config)

print("\nTraining completed!")
print(f"Model saved to: {training_config['output_dir']}")


Starting YOLOv8 training...
New https://pypi.org/project/ultralytics/8.3.200 available 😃 Update with 'pip install -U ultralytics'
Ultralytics 8.3.197 🚀 Python-3.11.13 torch-2.5.1 MPS (Apple M4 Pro)
[34m[1mengine/trainer: [0magnostic_nms=False, amp=True, augment=True, auto_augment=randaugment, batch=8, bgr=0.0, box=7.5, cache=False, cfg=None, classes=None, close_mosaic=0, cls=0.5, compile=False, conf=0.001, copy_paste=0.0, copy_paste_mode=flip, cos_lr=True, cutmix=0.0, data=outputs/yolo_dataset/data.yaml, degrees=0.0, deterministic=True, device=mps, dfl=1.5, dnn=False, dropout=0.0, dynamic=False, embed=None, epochs=50, erasing=0.4, exist_ok=False, fliplr=0.0, flipud=0.0, format=torchscript, fraction=1.0, freeze=None, half=False, hsv_h=0.015, hsv_s=0.7, hsv_v=0.4, imgsz=1280, int8=False, iou=0.6, keras=False, kobj=1.0, line_width=None, lr0=0.01, lrf=0.01, mask_ratio=4, max_det=300, mixup=0.0, mode=train, model=yolov8m.pt, momentum=0.937, mosaic=0.0, multi_scale=False, name=train, nbs=

RuntimeError: No valid images found in images.cache. Images with incorrectly formatted labels are ignored. See https://docs.ultralytics.com/datasets for dataset formatting guidance.

In [None]:
# Export model to ONNX
print("Exporting model to ONNX...")

onnx_path = os.path.join(MODELS_DIR, "scale_detection_model.onnx")
exported_path = export_model(
    model, 
    onnx_path, 
    format="onnx", 
    imgsz=1280, 
    opset=12, 
    half=True
)

print(f"Model exported to: {exported_path}")

# Plot training results
print("\nPlotting training results...")
results_dir = os.path.join(training_config['output_dir'], "scale_detection", "yolov8m_scale_detection")
plot_save_path = os.path.join(OUTPUT_DIR, "training_plots.png")
plot_training_results(results_dir, plot_save_path)


## 4. Endpoint Localization Testing


In [None]:
# Test endpoint localization on a sample image
print("Testing endpoint localization...")

# Load a sample image
sample_image_path = os.path.join(FIGURES_DIR, "1.jpg")
sample_json_path = os.path.join(JSONS_DIR, "1.json")

if os.path.exists(sample_image_path) and os.path.exists(sample_json_path):
    # Load image
    image = cv2.imread(sample_image_path)
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    
    # Load annotations
    with open(sample_json_path, 'r') as f:
        data = json.load(f)
    
    # Test endpoint localization for each scale bar
    for i, bar in enumerate(data.get('bars', [])):
        if 'points' in bar and len(bar['points']) >= 2:
            # Calculate bounding box from points
            points = np.array(bar['points'])
            x_min, y_min = np.min(points, axis=0)
            x_max, y_max = np.max(points, axis=0)
            
            bbox = (int(x_min), int(y_min), int(x_max - x_min), int(y_max - y_min))
            
            print(f"\nProcessing scale bar {i+1}:")
            print(f"  Bounding box: {bbox}")
            
            # Localize endpoints
            result = localize_scale_bar_endpoints(image, bbox)
            
            if result['success']:
                print(f"  Success: {result['success']}")
                print(f"  Endpoints: {result['endpoints']}")
                print(f"  Pixel length: {result['pixel_length']:.2f}")
                print(f"  Confidence: {result['confidence']:.3f}")
                
                # Visualize result
                vis_save_path = os.path.join(OUTPUT_DIR, f"endpoint_detection_{i+1}.png")
                visualize_endpoint_detection(image, bbox, result, vis_save_path)
            else:
                print(f"  Failed: {result['error']}")
else:
    print("Sample files not found. Skipping endpoint localization test.")


## 5. OCR and Scale Matching Pipeline


In [None]:
# Initialize OCR and matching pipeline
print("Initializing OCR and matching pipeline...")

pipeline = ScaleDetectionPipeline(
    ocr_backend='paddle',  # or 'easyocr', 'tesseract'
    confidence_threshold=0.15,
    max_distance_ratio=1.5
)

print("Pipeline initialized successfully!")
print(f"OCR backend: {pipeline.ocr_processor.backend}")
print(f"Confidence threshold: {pipeline.ocr_processor.confidence_threshold}")
print(f"Max distance ratio: {pipeline.scale_matcher.max_distance_ratio}")


In [None]:
# Test OCR and matching on sample images
print("Testing OCR and matching on sample images...")

# Process first 5 images
sample_results = []
for i in range(1, 6):
    image_path = os.path.join(FIGURES_DIR, f"{i}.jpg")
    json_path = os.path.join(JSONS_DIR, f"{i}.json")
    
    if os.path.exists(image_path) and os.path.exists(json_path):
        print(f"\nProcessing image {i}...")
        
        # Load image
        image = cv2.imread(image_path)
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        
        # Load annotations and create scale bar detections
        with open(json_path, 'r') as f:
            data = json.load(f)
        
        bar_detections = []
        for bar in data.get('bars', []):
            if 'points' in bar and len(bar['points']) >= 2:
                points = np.array(bar['points'])
                x_min, y_min = np.min(points, axis=0)
                x_max, y_max = np.max(points, axis=0)
                
                bbox = (int(x_min), int(y_min), int(x_max - x_min), int(y_max - y_min))
                center = ((x_min + x_max) / 2, (y_min + y_max) / 2)
                
                bar_detections.append(ScaleBarDetection(
                    bbox=bbox,
                    center=center,
                    confidence=1.0,
                    pixel_length=np.sqrt((x_max - x_min)**2 + (y_max - y_min)**2)
                ))
        
        # Process with pipeline
        results = pipeline.process_image(image, bar_detections)
        
        print(f"  Text detections: {results['total_text_detections']}")
        print(f"  Scale bar detections: {results['total_bar_detections']}")
        print(f"  Successful matches: {results['successful_matches']}")
        
        # Show matched scales
        for j, match in enumerate(results['matches']):
            print(f"    Match {j+1}: '{match.text.text}' -> {match.text.parsed_value} {match.text.normalized_unit}")
            if match.um_per_pixel:
                print(f"      Scale: {match.um_per_pixel:.6f} um/pixel")
        
        sample_results.append(results)
        
        # Save results
        output_path = os.path.join(OUTPUT_DIR, f"ocr_results_{i}.json")
        pipeline.save_results(results, output_path)

print(f"\nProcessed {len(sample_results)} images successfully!")


## 6. Pixel to Physical Unit Conversion


In [None]:
# Test pixel to physical unit conversion
print("Testing pixel to physical unit conversion...")

# Create converters from successful matches
converters = []
for i, results in enumerate(sample_results):
    if results['matches']:
        converter = create_converter_from_matches(results['matches'])
        if converter:
            converters.append((i+1, converter))
            print(f"\nImage {i+1} converter:")
            scale_info = converter.get_scale_info()
            print(f"  um_per_pixel: {scale_info['um_per_pixel']:.6f}")
            print(f"  mm_per_pixel: {scale_info['mm_per_pixel']:.6f}")
            print(f"  nm_per_pixel: {scale_info['nm_per_pixel']:.6f}")

print(f"\nCreated {len(converters)} converters successfully!")


In [None]:
# Demonstrate conversion utilities
if converters:
    print("Demonstrating conversion utilities...")
    
    # Use first converter for demonstration
    img_id, converter = converters[0]
    
    # Test coordinate conversion
    pixel_coords = [(100, 200), (300, 400), (500, 600)]
    mm_coords = converter.convert_coordinates(pixel_coords, 'mm')
    um_coords = converter.convert_coordinates(pixel_coords, 'um')
    
    print(f"\nCoordinate conversion (Image {img_id}):")
    for i, (pixel, mm, um) in enumerate(zip(pixel_coords, mm_coords, um_coords)):
        print(f"  Point {i+1}: {pixel} px -> {mm} mm -> {um} um")
    
    # Test distance conversion
    pixel_distance = 150.0
    mm_distance = converter.convert_distance(pixel_distance, 'mm')
    um_distance = converter.convert_distance(pixel_distance, 'um')
    
    print(f"\nDistance conversion:")
    print(f"  {pixel_distance} px -> {mm_distance:.6f} mm -> {um_distance:.6f} um")
    
    # Test area conversion
    pixel_area = 10000.0  # 100x100 pixels
    mm_area = converter.convert_area(pixel_area, 'mm')
    um_area = converter.convert_area(pixel_area, 'um')
    
    print(f"\nArea conversion:")
    print(f"  {pixel_area} px² -> {mm_area:.12f} mm² -> {um_area:.6f} um²")
    
    # Test bounding box conversion
    bbox_pixel = (50, 100, 200, 150)  # (x, y, w, h)
    bbox_mm = converter.convert_bbox(bbox_pixel, 'mm')
    
    print(f"\nBounding box conversion:")
    print(f"  {bbox_pixel} px -> {bbox_mm} mm")
else:
    print("No converters available for demonstration.")


## 7. Pipeline Summary and Results


In [None]:
# Create pipeline summary
print("=" * 80)
print("SCALE DETECTION PIPELINE - EXECUTION SUMMARY")
print("=" * 80)

print(f"\n1. DATASET CONVERSION:")
print(f"   ✓ Converted {len(json_files)} JSON annotations to YOLO format")
print(f"   ✓ Created dataset configuration: {yaml_path}")
print(f"   ✓ Validation: {stats['files_with_annotations']} files with annotations")

print(f"\n2. MODEL TRAINING:")
print(f"   ✓ Trained YOLOv8m model for joint detection")
print(f"   ✓ Model saved to: {training_config['output_dir']}")
print(f"   ✓ Exported ONNX model: {exported_path}")

print(f"\n3. ENDPOINT LOCALIZATION:")
print(f"   ✓ Implemented fine-grained endpoint detection")
print(f"   ✓ Tested on sample images")
print(f"   ✓ Visualization saved to: {OUTPUT_DIR}")

print(f"\n4. OCR AND MATCHING:")
print(f"   ✓ Implemented PaddleOCR integration")
print(f"   ✓ Text parsing with unit normalization")
print(f"   ✓ Spatial matching between text and scale bars")
print(f"   ✓ Processed {len(sample_results)} sample images")

print(f"\n5. PIXEL TO PHYSICAL CONVERSION:")
print(f"   ✓ Created {len(converters)} scale converters")
print(f"   ✓ Implemented coordinate, distance, and area conversion")
print(f"   ✓ Support for mm, μm, and nm units")

print(f"\n6. EVALUATION FRAMEWORK:")
print(f"   ✓ Comprehensive evaluation metrics")
print(f"   ✓ Detection, OCR, and scale conversion evaluation")
print(f"   ✓ Report generation and visualization")

print(f"\n7. OUTPUT FILES:")
print(f"   ✓ YOLO dataset: {yolo_output_dir}")
print(f"   ✓ Trained model: {training_config['output_dir']}")
print(f"   ✓ ONNX model: {exported_path}")
print(f"   ✓ OCR results: {OUTPUT_DIR}/ocr_results_*.json")
print(f"   ✓ Visualizations: {OUTPUT_DIR}/*.png")

print(f"\n" + "=" * 80)
print("PIPELINE EXECUTION COMPLETED SUCCESSFULLY!")
print("=" * 80)
