# Create Label Studio Import File - Images Only

This notebook creates a JSON file for Label Studio import that contains only image references, one task per image.
Unlike the original PaddleOCR-based script, it does not perform OCR; it only prepares the images
for manual bounding box labeling within Label Studio.

In [7]:
import json
import os
from urllib.parse import quote

import numpy as np
from PIL import Image

In [8]:
# Configure the path to your image folder.
# This folder is later passed to create_label_studio_json_for_images, which picks up
# the .png/.jpg/.jpeg files directly inside it. A relative path is resolved against
# the kernel's current working directory.
images_folder_path = "../image"

In [9]:
def create_image_url(filename):
    """
    Map an image file name (or relative path) to the URL Label Studio loads it from.

    Label Studio requires image URLs rather than filesystem paths, so this defines
    the mapping from filesystem to URLs. If you use
    ./serve_local_files.sh <my-images-dir>, the image URLs are
    localhost:8082/filename.png. Otherwise you can build links like
    /data/upload/filename.png to refer to the files.

    The name is percent-encoded so that spaces, '#', '?', '%' and non-ASCII
    characters produce a valid URL instead of a broken or truncated one.
    '/' is left untouched so relative sub-paths keep working.

    Args:
        filename: File name, or '/'-separated relative path, of the image.

    Returns:
        The URL string for the image on the local file server.
    """
    # quote() leaves letters, digits, '_', '.', '-', '~' and '/' unchanged, so
    # plain file names map to exactly the same URL as before.
    return f'http://localhost:8082/{quote(filename)}'

In [10]:
def create_label_studio_json_for_images(images_folder_path, output_file="label-studio-bbox-import.json"):
    """
    Creates a Label Studio import file with just image references (no OCR data).
    Each image becomes a task in Label Studio.

    Only regular files directly inside images_folder_path whose names end in
    .png, .jpg or .jpeg (case-insensitive) are included. Files are processed in
    sorted name order so that re-running the notebook produces the same task
    order (os.listdir itself returns entries in arbitrary order).

    Args:
        images_folder_path: Path to the folder containing images
        output_file: Name of the output JSON file

    Returns:
        The list of task dicts that was written to output_file. Each task has
        the form {'data': {'image': <url>}}, plus 'width' and 'height' in
        'data' when the image could be opened.
    """
    image_extensions = ('.png', '.jpg', '.jpeg')
    label_studio_task_list = []

    # Process all image files in the directory, in a reproducible order
    for image_file in sorted(os.listdir(images_folder_path)):
        if not image_file.lower().endswith(image_extensions):
            continue

        img_path = os.path.join(images_folder_path, image_file)
        # A directory named like "scans.png" is not an image; skip it instead of
        # emitting a task that can never load.
        if not os.path.isfile(img_path):
            continue

        print(f"Processing: {image_file}")

        # Create task object with image URL
        task = {
            'data': {
                'image': create_image_url(image_file)
            }
        }

        # Optional: Add image dimensions as metadata.
        # Deliberately best-effort: an unreadable image still becomes a task,
        # just without width/height.
        try:
            with Image.open(img_path) as img:
                width, height = img.size
        except Exception as e:
            print(f"Warning: Could not read image dimensions for {image_file}: {e}")
        else:
            task['data']['width'] = width
            task['data']['height'] = height

        label_studio_task_list.append(task)

    # Save the task list as a JSON file (explicit encoding keeps the output
    # independent of the platform's default locale)
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(label_studio_task_list, f, indent=4)

    print(f"\nProcessing complete! Created {output_file} with {len(label_studio_task_list)} images.")
    print("Import this file into Label Studio to start manual bounding box labeling.")

    return label_studio_task_list

In [11]:
# Run the function to create the Label Studio import file.
# With the default output_file this writes label-studio-bbox-import.json relative to
# the current working directory; the returned task list is kept in `tasks`.
tasks = create_label_studio_json_for_images(images_folder_path)

Processing: CreditcardscomInc_20070810_S_1_EX_10_33_362297_EX_10_33_Affiliate_Agreement_page_1.png
Processing: CreditcardscomInc_20070810_S_1_EX_10_33_362297_EX_10_33_Affiliate_Agreement_page_10.png
Processing: CreditcardscomInc_20070810_S_1_EX_10_33_362297_EX_10_33_Affiliate_Agreement_page_11.png
Processing: CreditcardscomInc_20070810_S_1_EX_10_33_362297_EX_10_33_Affiliate_Agreement_page_12.png
Processing: CreditcardscomInc_20070810_S_1_EX_10_33_362297_EX_10_33_Affiliate_Agreement_page_2.png
Processing: CreditcardscomInc_20070810_S_1_EX_10_33_362297_EX_10_33_Affiliate_Agreement_page_3.png
Processing: CreditcardscomInc_20070810_S_1_EX_10_33_362297_EX_10_33_Affiliate_Agreement_page_4.png
Processing: CreditcardscomInc_20070810_S_1_EX_10_33_362297_EX_10_33_Affiliate_Agreement_page_5.png
Processing: CreditcardscomInc_20070810_S_1_EX_10_33_362297_EX_10_33_Affiliate_Agreement_page_6.png
Processing: CreditcardscomInc_20070810_S_1_EX_10_33_362297_EX_10_33_Affiliate_Agreement_page_7.png
Process

## Label Studio Project Configuration

When creating your Label Studio project, use this configuration for bounding box labeling:

```xml
<View>
  <Image name="image" value="$image" />
  <RectangleLabels name="label" toName="image">
    <Label value="Text" background="green"/>
    <!-- Add more labels as needed -->
  </RectangleLabels>
</View>
```

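After importing the generated JSON file, you'll be able to draw bounding boxes manually on each image. The tasks reference images by URL (`http://localhost:8082/<filename>` by default), so the image folder must be served at that address while you label, for example with Label Studio's `serve_local_files.sh <my-images-dir>`.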

In [12]:
# Function to process bulk directories of images
def bulk_process_images(base_dir, output_file="label-studio-bulk-import.json"):
    """
    Process multiple directories of images and create a single import file.

    Walks base_dir recursively and creates one task per .png/.jpg/.jpeg file
    (case-insensitive). Images in a sub-directory get a URL built from their
    path relative to base_dir and a 'category' equal to the name of the
    directory that directly contains them. Directories and files are visited
    in sorted order so the output is reproducible across runs.

    Args:
        base_dir: Base directory containing image directories
        output_file: Output JSON file name

    Returns:
        The list of task dicts that was written to output_file.
    """
    image_extensions = ('.png', '.jpg', '.jpeg')
    all_tasks = []

    # Walk through all directories
    for root, dirs, files in os.walk(base_dir):
        # Sorting in place makes os.walk descend in a deterministic order.
        dirs.sort()
        image_files = sorted(f for f in files if f.lower().endswith(image_extensions))

        if not image_files:
            continue

        print(f"Processing directory: {root}")

        # Relative location is the same for every file in this directory,
        # so compute it once per directory rather than once per file.
        rel_path = os.path.relpath(root, base_dir)
        in_subdir = rel_path != "."

        if in_subdir:
            # relpath uses the OS separator ('\\' on Windows); URLs need '/'.
            url_prefix = rel_path.replace(os.sep, "/") + "/"
            category = os.path.basename(rel_path)
        else:
            url_prefix = ""
            category = None

        for image_file in image_files:
            # Create task
            task = {
                'data': {
                    'image': create_image_url(f"{url_prefix}{image_file}")
                }
            }

            # Add category based on directory name
            if in_subdir:
                task['data']['category'] = category

            all_tasks.append(task)

    # Save all tasks (explicit encoding keeps the output independent of the
    # platform's default locale)
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(all_tasks, f, indent=4)

    print(f"\nBulk processing complete! Created {output_file} with {len(all_tasks)} images.")
    return all_tasks

In [13]:
# Optional: Run bulk processing (uncomment to use)
# bulk_tasks = bulk_process_images("../path/to/bulk/images")