In [1]:
import base64
import kaggle_benchmarks as kbench
from kaggle_benchmarks.content_types import images


@kbench.task(name="decode_the_room")
def decode_the_room(llm):
    image_path = "/kaggle/input/datasets/gpreda/images-with-details/messy_bedroom.png"

    try:
        with open(image_path, "rb") as image_file:
            image_b64 = base64.b64encode(image_file.read()).decode("utf-8")
    except FileNotFoundError:
        kbench.assertions.assert_fail(
            f"Image file not found at {image_path}. \
            Ensure the images dataset is attached."
        )
        return

    park_image = images.from_base64(image_b64, format="jpeg")

    prompt = """
        You are an expert image analyser.
        From the image of the messy bedroom, answer to the questions:
        - What color is the cat?
        - Where is the cat hiding?
        - What is in the cristal globe placed on the table in the left side?
        - How many candles are in the birthday cake on the same table?
        - What color has the dog siting on the floor?
    """

    response = llm.prompt(prompt, image=park_image)

    assessment = kbench.assertions.assess_response_with_judge(
        criteria=[
            "The response correctly identifies that the cat is white",
            "The response correctly identifies that the cat is hiding in the drawer",
            "The response correctly identifies that cristal globe contains a snowman",
            "The response correctly identifies that there are 12 candles in the birthday cake",
            "The response correctly identifies that the dog is brown",
        ],
        response_text=response,
        judge_llm=kbench.judge_llm
    )
    
    if assessment is None:
        kbench.assertions.assert_fail(
            expectation="Judge LLM failed to provide an assessment."
        )
    else:
        total = len(assessment.results)
        passed = sum(1 for r in assessment.results if r.passed)
    
        score = passed / total  # gives 0, 1/5, 2/5, 3/5, 4/5, 5/5
    
        print(f"Score: {passed}/{total} = {score}")
    
        # Optional: assert at least something passed
        kbench.assertions.assert_true(
            passed > 0,
            expectation="At least one criterion should be satisfied."
        )
    
        return float(score)

decode_the_room.run(kbench.llm)

Wrong return type <class 'float'>. Expected None | kaggle_benchmarks.results.Unknown. This may need to lead to unexpected task behavior.


Score: 2/5 = 0.4


BokehModel(combine_events=True, render_bundle={'docs_json': {'e9032010-86f7-4c83-a1a6-3ad6fe24b802': {'versionâ€¦