In [1]:
import base64
import kaggle_benchmarks as kbench
from kaggle_benchmarks.content_types import images


@kbench.task(name="what_is_in_the_scene_at_the_park")
def what_is_in_the_scene_at_the_park(llm):
    """Ask ``llm`` four questions about the park-scene image and score the reply.

    The image is read from the attached dataset, sent to ``llm`` together with
    the questions, and the reply is graded by ``kbench.judge_llm`` against one
    criterion per question.

    Args:
        llm: Model under test; it is called as ``llm.prompt(text, image=...)``.

    Returns:
        The fraction of judge criteria that passed (``passed / total``), or
        ``None`` when the image file is missing or the judge yields no usable
        assessment.
    """
    image_path = "/kaggle/input/jigsaw-puzzles/scene at the park.jpg"

    try:
        with open(image_path, "rb") as image_file:
            image_b64 = base64.b64encode(image_file.read()).decode("utf-8")
    except FileNotFoundError:
        # Adjacent string literals are used instead of a backslash continuation
        # so the source indentation does not end up inside the message.
        kbench.assertions.assert_fail(
            expectation=(
                f"Image file not found at {image_path}. "
                "Ensure the images dataset is attached."
            )
        )
        return None

    park_image = images.from_base64(image_b64, format="jpeg")

    prompt = """
        You are an expert image analyser.
        From the image of the park, answer to the questions:
        - What color has the hair of the girl with a large dog?
        - How many people are playing basketball?
        - Which animal is drinking water from the water fountain?
        - How many children with roller skates are sitting on the bench?
    """

    response = llm.prompt(prompt, image=park_image)

    # One criterion per question asked in the prompt, in the same order.
    assessment = kbench.assertions.assess_response_with_judge(
        criteria=[
            "The response correctly identifies that the girl with a large dog has orange hair",
            "The response correctly identifies there are 3 people playing basketball",
            "The response correctly identifies that a squirrel drinks water from the water fountain",
            "The response correctly identifies that 2 children with roller skates sits on the bench",
        ],
        response_text=response,
        judge_llm=kbench.judge_llm,
    )

    if assessment is None:
        kbench.assertions.assert_fail(
            expectation="Judge LLM failed to provide an assessment."
        )
        return None

    total = len(assessment.results)
    if total == 0:
        # Without any per-criterion results the ratio below would divide by zero.
        kbench.assertions.assert_fail(
            expectation="Judge LLM returned an assessment with no results."
        )
        return None

    passed = sum(1 for r in assessment.results if r.passed)
    score = passed / total  # gives 0, 1/4, 2/4, 3/4, 4/4

    print(f"Score: {passed}/{total} = {score}")

    # Optional: assert at least something passed
    kbench.assertions.assert_true(
        passed > 0,
        expectation="At least one criterion should be satisfied."
    )

    return score

# Run the task defined above, passing kbench.llm as the model under test.
# Kept as a bare expression: it is the last statement of the notebook cell.
what_is_in_the_scene_at_the_park.run(kbench.llm)

BokehModel(combine_events=True, render_bundle={'docs_json': {'dbfba000-00b8-4a1c-8f94-2941302efcdd': {'version…