In [1]:
import base64
import kaggle_benchmarks as kbench
from kaggle_benchmarks.content_types import images


@kbench.task(name="what_time_is_it")
def what_time_is_it(llm):
    image_path = "/kaggle/input/datasets/gpreda/images-with-details/what_hour_it_is.png"

    try:
        with open(image_path, "rb") as image_file:
            image_b64 = base64.b64encode(image_file.read()).decode("utf-8")
    except FileNotFoundError:
        kbench.assertions.assert_fail(
            f"Image file not found at {image_path}. \
            Ensure the images dataset is attached."
        )
        return

    clock_image = images.from_base64(image_b64, format="jpeg")

    prompt = """
        You are an expert image analyser.
        From the image of the clock, answer to the questions:
        - What time is it (hour and minutes)?
        - How many seconds show the clock?
        - What brand is the clock?
    """

    response = llm.prompt(prompt, image=clock_image)

    assessment = kbench.assertions.assess_response_with_judge(
        criteria=[
            "The response correctly identifies that the time is 10 and 8 minutes",
            "The response correctly identifies that the clock shows 43 seconds",
            "The response correctly identifies that the clock brand is Seiko",
        ],
        response_text=response,
        judge_llm=kbench.judge_llm
    )
    
    if assessment is None:
        kbench.assertions.assert_fail(
            expectation="Judge LLM failed to provide an assessment."
        )
    else:
        total = len(assessment.results)
        passed = sum(1 for r in assessment.results if r.passed)
    
        score = passed / total  # gives 0, 1/3, 2/3, 3/3
    
        print(f"Score: {passed}/{total} = {score}")
    
        # Optional: assert at least something passed
        kbench.assertions.assert_true(
            passed > 0,
            expectation="At least one criterion should be satisfied."
        )
    
        return float(score)

what_time_is_it.run(kbench.llm)

Score: 1/3 = 0.3333333333333333


Wrong return type <class 'float'>. Expected None | kaggle_benchmarks.results.Unknown. This may need to lead to unexpected task behavior.


BokehModel(combine_events=True, render_bundle={'docs_json': {'51649919-425f-4575-9a60-02b83a5f5058': {'versionâ€¦