# Demo for launching an OCR experiment



In [27]:
import os, sys, time, json
import dotenv
from IPython.display import HTML
import numpy as np
import pandas as pd
import requests

dotenv.load_dotenv("../.env")
sys.path.append("..")
from evalap.utils import log_and_raise_for_status

# Service endpoints.
# To target a locally running EvalAP instance, use "http://localhost:8000/v1" instead.
EVALAP_API_URL = "https://evalap.etalab.gouv.fr/v1"
ALBERT_API_URL = "https://albert.api.etalab.gouv.fr/v1"
OPENAI_URL = "https://api.openai.com/v1"

# Credentials are read from the environment; an unset variable yields None.
EVALAP_API_KEY = os.environ.get("EVALAP_API_KEY")
ALBERT_API_KEY = os.environ.get("ALBERT_API_KEY")
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")

# Bearer-token header built from the EvalAP key.
headers = {"Authorization": "Bearer {}".format(EVALAP_API_KEY)}

In [26]:
# Prompt asking the model to return the OCR of an image as a JSON list of
# blocks, each block being an object with a `type` and a markdown `text`.
# Every fenced example inside the prompt must be closed with exactly three
# backticks so the markdown structure seen by the model stays balanced.
# NOTE(review): the schema lists type names such as "Text"/"SectionHeader" while
# the example and guidelines use "Paragraph"/"Header" — confirm which vocabulary
# the `ocr_v1` metric expects before harmonizing them.
system_prompt = """You are tasked with generating a JSON representation from the analysis (OCR) of a given image.
Your goal is to create a JSON object that has the content of the image extracted in a structured format.

The JSON should be structured to represent the content of the image in a way that corresponds to standard markdown primitives. Here's how to approach this task:

The JSON should contain a list of blocks, where each block represents a distinct element in the image, such as headers, paragraphs, or tables.
Here is an example of the expected JSON schema:

Schema:
```json
[
 {
   "type": "string (e.g. Text, Table, Code, SectionHeader, Figure, Equation, Handwriting, PageFooter, PageHeader, Picture, TableOfContents etc)",
   "text": "string (markdown-formatted text)"
 },
 ...
]
```

Example:
```json
[
 {
    "type": "Header", "text": "## I am a level 2 header"
 },
 {
   "type": "Paragraph", "text": "I am a **paragraph**"
 }
]
```

Follow these guidelines when creating the JSON:

1. The main structure should be a list of blocks.
2. Each block is an object containing a `type` and a `text` field. They should correspond to a standard markdown primitive (e.g., Header, Paragraph, Table).
3. Identify headers based on font size, weight, or positioning. These should be represented as "Header" blocks.
4. Group continuous lines of text into "Paragraph" blocks.
5. Identify tabular data and represent it as "Table" blocks. Only create table blocks for actual tabular data, not for text formatting.
6. Do not create separate blocks for inline formatting (bold, italic) or URLs. Keep these within the relevant "Paragraph" block.
7. If you encounter lists, represent them as "List" blocks, with nested items if applicable.
8. For images or diagrams, use an "Image" block and include any available descriptive text.

Remember, the goal is to create a structured representation of the image content that could be easily converted to markdown or used for further processing. Focus on the main structural elements and avoid over-complicating the JSON with minor formatting details.

Do not explain your answer. Just answer with the JSON result directly.
"""

In [28]:
# Experiment design
# --
# Name and description under which the experiment set is registered.
expset_name = "albert_OCR_v2"
expset_readme = "Evaluating OCR capabilities of albert on the marker datasets"

# Parameters shared by every experiment of the set.
common_params = dict(
    dataset="OCR_marker_benchmark",
    model=dict(sampling_params=dict(temperature=0.2)),
    metrics=["ocr_v1", "output_length", "generation_time"],
    with_vision=True,  # use parquet dataset
)

# Parameters that vary across experiments: the same Albert-hosted model is
# listed twice, once receiving the prompt under `system_prompt` and once under
# `prelude_prompt`.
grid_params = dict(
    model=[
        {
            "name": "mistralai/Mistral-Small-3.1-24B-Instruct-2503",
            prompt_field: system_prompt,
            "base_url": ALBERT_API_URL,
            "api_key": ALBERT_API_KEY,
        }
        for prompt_field in ("system_prompt", "prelude_prompt")
    ],
)


# Launching the experiment set
# --
# Payload posted to the EvalAP `experiment_set` endpoint; the `cv` entry bundles
# the common parameters, the grid parameters and a repeat count of 1.
expset = {
    "name": expset_name,
    "readme": expset_readme,
    "cv": {"common_params": common_params, "grid_params": grid_params, "repeat": 1},
}

# An explicit timeout keeps the cell from hanging forever if the API stops
# answering (requests applies no timeout unless one is given).
response = requests.post(
    f"{EVALAP_API_URL}/experiment_set", json=expset, headers=headers, timeout=60
)

try:
    resp = response.json()
except ValueError:
    # Error bodies are not guaranteed to be JSON; keep the raw text for diagnosis.
    resp = {"detail": response.text}

if response.ok and isinstance(resp, dict) and "id" in resp:
    expset_id = resp["id"]
    print(f'Created expset: {resp["name"]} ({resp["id"]})')
else:
    # Show what the API answered, then stop: continuing would leave `expset_id`
    # undefined, or still bound to the value of a previous run in a live kernel.
    print(resp)
    raise RuntimeError(
        f"Could not create experiment set {expset_name!r} (HTTP {response.status_code})"
    )

Created expset: albert_OCR_v2 (60)
