In [14]:
import textwrap
from google import genai
from google.genai import types
from pydantic import BaseModel, Field
from typing import List, Optional
from erc3 import erc3 as dev, ApiException, TaskInfo, ERC3
from dotenv import load_dotenv

# Load variables from a local .env file into the process environment before any
# client is created. NOTE(review): presumably this supplies the API credentials
# used by genai.Client() and ERC3() — confirm which variables are required.
load_dotenv()

True

In [4]:
# Notebook-wide handles and configuration used by the cells below.
# NOTE(review): `client` is not referenced by any other visible cell; MyLLM
# constructs its own genai.Client() in __init__.
client = genai.Client()
# Benchmark API handle: sessions, tasks and LLM-usage logging all go through it.
core = ERC3()
# Model name handed to run_agent() for every task.
MODEL_ID = "gemini-2.5-flash"

In [6]:
# Open a new benchmark session; `res.session_id` is reused by the status and
# submit cells further down.
res = core.start_session(
    benchmark="demo",
    workspace="dev",
    # Plain string literal: the previous f-string had no placeholders.
    name="First manual tests",
    architecture="manual",
    # can also set to compete_budget, compete_speed and/or compete_local
    flags=["compete_accuracy"],
)

res.session_id

In [9]:
# Fetch the current state of the session started above. `status.tasks` is the
# list that the run-all-tasks loop iterates over.
status = core.session_status(res.session_id)
print(f"Session has {len(status.tasks)} tasks")

In [11]:
# Display the task list (spec id, task id, task text, status, score) for inspection.
status.tasks

[TaskInfo(spec_id='spec1', task_id='tsk-42iBQ5Kaaq3s3CHpsgdRkD', num=0, task_text='Return secret', status='new', benchmark='demo', score=-1.0, error_message=None),
 TaskInfo(spec_id='spec2', task_id='tsk-42iBQ5KpxcndUL2E7dA4Lz', num=1, task_text='Return secret backwards', status='new', benchmark='demo', score=-1.0, error_message=None),
 TaskInfo(spec_id='spec3', task_id='tsk-42iBQ5L2k19taAxpNKV6Dw', num=2, task_text='Close task without doing anything!', status='new', benchmark='demo', score=-1.0, error_message=None),
 TaskInfo(spec_id='spec 4', task_id='tsk-42iBQ5LEPi1BAQfhBkqX6A', num=3, task_text='Return secret number 1 from the list', status='new', benchmark='demo', score=-1.0, error_message=None)]

In [None]:
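# Disabled: start and run one specific task instead of the whole session.
# Kept commented out because run_agent is only defined further down in the notebook.
# task = core.start_new_task("erc3-test", "project_check_by_member")
# run_agent(MODEL_ID, core, task)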

## LLM abstraction

In [15]:
import time
from typing import List, Type, TypeVar

from erc3 import ERC3, TaskInfo
from pydantic import BaseModel

# Type variable standing for the pydantic model class a caller passes as
# `response_format`; it lets MyLLM.query declare that it returns an instance of
# that same class.
T = TypeVar('T', bound=BaseModel)

class MyLLM:
    """Structured-output wrapper around a Gemini client that logs usage per task.

    Every call to :meth:`query` sends one prompt to the model, asks for a JSON
    answer matching a pydantic schema, reports timing and token usage through
    ``api.log_llm`` for the bound task, and returns the validated model.
    """

    # Gemini client used for generation; created in __init__.
    client: genai.Client
    # Benchmark API handle; only `log_llm` is called on it here.
    api: ERC3
    # Task whose `task_id` is attached to every usage log entry.
    task: TaskInfo
    # Default model name, used when `query` is not given an override.
    model: str

    def __init__(self, api: ERC3, model: str, task: TaskInfo) -> None:
        """Bind the wrapper to a benchmark API handle, a default model and a task."""
        self.api = api
        self.model = model
        self.task = task
        self.client = genai.Client()

    def query(self, prompt: str, response_format: Type[T], model: Optional[str] = None) -> T:
        """Run one prompt and return the answer parsed into ``response_format``.

        Args:
            prompt: Text sent as the request contents.
            response_format: Pydantic model class used both as the response
                schema for the request and to validate the returned JSON.
            model: Optional model name overriding the instance default.

        Returns:
            An instance of ``response_format`` built from the model's JSON text.

        Raises:
            ValueError: If the response carries no text (usage is still logged
                first). JSON that does not match the schema raises whatever
                ``response_format.model_validate_json`` raises.
        """
        model_name = model or self.model
        # perf_counter is monotonic, so the duration cannot be skewed by
        # wall-clock adjustments during the request.
        started = time.perf_counter()

        resp = self.client.models.generate_content(
            model=model_name,
            contents=prompt,
            config=types.GenerateContentConfig(
                response_mime_type="application/json",
                response_schema=response_format,
                temperature=0,
            ),
        )
        duration_sec = time.perf_counter() - started

        # The response text may be missing (e.g. no text part returned);
        # normalise to "" so logging and validation both receive a str.
        completion = resp.text or ""

        # Usage metadata and each of its counters are optional; report 0 for
        # anything missing, as was already done for the cached-token count.
        usage = resp.usage_metadata
        prompt_tokens = (usage.prompt_token_count or 0) if usage else 0
        completion_tokens = (usage.candidates_token_count or 0) if usage else 0
        cached_prompt_tokens = (usage.cached_content_token_count or 0) if usage else 0

        self.api.log_llm(
            task_id=self.task.task_id,
            model=model_name,
            duration_sec=duration_sec,
            completion=completion,
            prompt_tokens=prompt_tokens,
            completion_tokens=completion_tokens,
            cached_prompt_tokens=cached_prompt_tokens,
        )

        if not completion:
            raise ValueError(f"Model {model_name!r} returned no text to parse")

        return response_format.model_validate_json(completion)

In [None]:
def run_agent(model: str, api: ERC3, task: TaskInfo):
    """Prepare the per-task handles for an agent run.

    Obtains the task-scoped client from ``api``, calls its ``who_am_i()``, and
    builds a ``MyLLM`` bound to the same api, model and task.

    NOTE(review): nothing is done with these handles yet and the function
    returns None — the agent logic itself is still to be written.
    """
    task_client = api.get_erc_client(task)
    identity = task_client.who_am_i()
    agent_llm = MyLLM(api=api, model=model, task=task)



## Run all tasks

In [13]:

# Run the agent on every task of the session, then close the task and print
# its evaluation when one is returned.
for task in status.tasks:
    print("="*40)
    print(f"Starting Task: {task.task_id} ({task.spec_id}): {task.task_text}")
    # start the task
    core.start_task(task)
    try:
        run_agent(MODEL_ID, core, task)
    except Exception as e:
        # Best-effort on purpose: a failing agent must not stop the remaining
        # tasks. Report the exception type and the task it belongs to, since
        # str(e) alone is often uninformative (e.g. a KeyError prints only the key).
        print(f"Task {task.task_id} failed: {type(e).__name__}: {e}")
    # Always close the task, even after an agent failure, so it gets evaluated.
    result = core.complete_task(task)
    if result.eval:
        explain = textwrap.indent(result.eval.logs, "  ")
        print(f"\nSCORE: {result.eval.score}\n{explain}\n")

'ssn-42iBQ5KK48VytKZT5WiHSY'

In [16]:
# Submit the whole session for scoring. The recorded output of this cell shows
# the API rejecting the call with "session has unfinished tasks", so run this
# only after the run-all-tasks loop has completed every task of the session.
core.submit_session(res.session_id)

ApiException: session has unfinished tasks