In [None]:
# Install the EyePop SDK, pinned to version 3.12.0, via the %pip magic.
%pip install eyepop==3.12.0

In [None]:
import getpass

# Credentials are entered interactively instead of being hard-coded in the
# notebook. Leading/trailing whitespace (a common copy/paste artifact) is
# stripped so the values are passed to the SDK exactly as issued.
EYEPOP_ACCOUNT_ID = input("Enter your Account UUID: ").strip()
# getpass keeps the API key from being echoed while it is entered.
EYEPOP_API_KEY = getpass.getpass('Enter your API KEY: ').strip()

In [None]:
# Prefix used to build the ability name
# "<NAMESPACE_PREFIX>.image-describe.OCR-Translate-Image" in the cells below.
NAMESPACE_PREFIX="XXXXXXXXXX" # Change this to your namespace prefix

In [None]:
from eyepop import EyePopSdk
from eyepop.data.data_types import InferRuntimeConfig, VlmAbilityGroupCreate, VlmAbilityCreate, TransformInto
from eyepop.worker.worker_types import CropForward, ForwardComponent, FullForward, InferenceComponent, Pop
import json


# Ability definitions to register. Each entry carries the ability name (scoped
# by NAMESPACE_PREFIX), the worker release to run it on, the text prompt, and
# the runtime configuration.
ability_prototypes = [
    VlmAbilityCreate(
        name=f"{NAMESPACE_PREFIX}.image-describe.OCR-Translate-Image",
        description="Translate the text into English",
        worker_release="qwen3-instruct",
        # The prompt asks for strict JSON only; partially unreadable text is
        # marked with a real ellipsis character ("…").
        text_prompt="""
          You are given a single image that may contain text in any language.
          Your task is to read ALL legible text in the image and translate it into English.
          Return ONLY valid JSON.
          Do not include explanation.
          Do not include markdown.
          Do not include commentary.
          ----------------------------------------
          INSTRUCTIONS:
          1. Extract only text that is clearly readable in the image.
          2. Preserve the original reading order as it appears visually:
            - Top to bottom
            - Left to right
            - Group by regions/blocks (e.g., headings, paragraphs, labels, signs).
          3. Do NOT guess missing characters or words.
          4. If a portion is partially unreadable, include the readable part and use "…" to mark missing content.
          5. Keep numbers, dates, emails, URLs, product codes exactly as shown.
          6. If text is already English, still return it as the translation.
          7. If no readable text is present, return:
          {
            "has_text": false,
            "detected_language": null,
            "blocks": [],
            "full_translation": null
          }
          ----------------------------------------
          RETURN THIS EXACT JSON STRUCTURE:
          {
            "has_text": true,
            "detected_language": null,
            "blocks": [
              {
                "block_id": 1,
                "original_text": null,
                "english_translation": null
              }
            ],
            "full_translation": null
          }
          ----------------------------------------
          FIELD RULES:
          - detected_language: set to the best single language label if clear (e.g., "Spanish", "French", "Japanese"); otherwise null.
          - blocks: include one entry per visually distinct text block (title, label cluster, paragraph, sign panel, etc.).
          - original_text: the exact extracted text for that block (preserve line breaks if visible).
          - english_translation: a faithful English translation of that block.
          - full_translation: concatenate english_translation from all blocks in order, separated by newline characters.
          ----------------------------------------
          STRICT OUTPUT RULES:
          - Output must be strict JSON (double quotes, no trailing commas).
          - Do not include any additional keys.
          - Do not wrap in code fences.
          Return only the JSON object.
        """,
        transform_into=TransformInto(),
        config=InferRuntimeConfig(
            # NOTE(review): the requested JSON repeats every block's text
            # (original, translation, and again in full_translation), so 250
            # new tokens may truncate the output on text-heavy images --
            # confirm this limit against real samples.
            max_new_tokens=250,
            image_size=512
        ),
        is_public=False
    )
]



In [None]:
# Register every prototype: create its ability group, create the ability inside
# that group, publish it under its own name, then add a "latest" tag to that alias.
with EyePopSdk.dataEndpoint(api_key=EYEPOP_API_KEY, account_id=EYEPOP_ACCOUNT_ID) as endpoint:
    for ability_prototype in ability_prototypes:
        # The ability name doubles as the group name and the alias name.
        alias = ability_prototype.name

        group_request = VlmAbilityGroupCreate(
            name=alias,
            description=ability_prototype.description,
            default_alias_name=alias,
        )
        ability_group = endpoint.create_vlm_ability_group(group_request)

        created = endpoint.create_vlm_ability(
            create=ability_prototype,
            vlm_ability_group_uuid=ability_group.uuid,
        )
        published = endpoint.publish_vlm_ability(
            vlm_ability_uuid=created.uuid,
            alias_name=alias,
        )
        ability = endpoint.add_vlm_ability_alias(
            vlm_ability_uuid=published.uuid,
            alias_name=alias,
            tag_name="latest",
        )
        print(f"created ability {ability.uuid} with alias entries {ability.alias_entries}")

### Evaluate on a Single Image

In [None]:
from pathlib import Path


# Pop with a single inference component that runs the ability registered above,
# addressed through its "latest" tag.
ability_alias = f"{NAMESPACE_PREFIX}.image-describe.OCR-Translate-Image:latest"
pop = Pop(components=[InferenceComponent(ability=ability_alias)])

# Image to run the ability on.
sample_img_path = Path("/content/images.jpeg")

with EyePopSdk.workerEndpoint(api_key=EYEPOP_API_KEY) as endpoint:
    endpoint.set_pop(pop)
    job = endpoint.upload(sample_img_path)
    # Print results until predict() returns a falsy value.
    while True:
        result = job.predict()
        if not result:
            break
        print(json.dumps(result, indent=2))

print("Done")

### Evaluation Flow

In [None]:
from pathlib import Path

# Pop with a single inference component that runs the registered ability via
# its "latest" tag.
pop = Pop(components=[
    InferenceComponent(
        ability=f"{NAMESPACE_PREFIX}.image-describe.OCR-Translate-Image:latest"
    )
])

# Maps each processed file name to the list of results returned for it.
all_results = {}

with EyePopSdk.workerEndpoint(api_key=EYEPOP_API_KEY) as endpoint:
    endpoint.set_pop(pop)
    directory_path = Path("/content/")

    # sorted() makes the processing order (and the key order in the output
    # file) deterministic across runs.
    for item in sorted(directory_path.iterdir()):
        # iterdir() also yields sub-directories (e.g. the sample_data folder
        # that receives the output below); only regular files can be uploaded.
        if not item.is_file():
            continue

        job = endpoint.upload(str(item))
        file_results = []
        # Collect results until predict() returns a falsy value.
        while result := job.predict():
            file_results.append(result)

        all_results[item.name] = file_results

output_path = Path("/content/sample_data/output.json")
# Make sure the destination folder exists before writing.
output_path.parent.mkdir(parents=True, exist_ok=True)
with open(output_path, "w") as f:
    json.dump(all_results, f, indent=2)

print("Done")