In [1]:
import kaggle_benchmarks as kbench
import pandas as pd

# 1. Load the validation split of the MedQA-Ro dataset from the Kaggle input
#    directory and keep only its first 50 rows as a reduced sample.
data_df = pd.read_csv(
    "/kaggle/input/datasets/gpreda/medqaro-benchmark-for-medical-q-and-a-in-romanian/romedqa_val_dataset.csv"
).head(50)

# 1.1. Rename the four columns; the last three match the parameter names of
#      the per-row task defined below.
data_df.columns = ["Nr", "Epicrisis", "Question", "Answer"]

# 1.2. Keep only the payload columns (the "Nr" column is dropped).
data_df = data_df.loc[:, ["Epicrisis", "Question", "Answer"]]


# 2. Define a task that evaluates a single question-answer pair.
# This task will be called for each row in the DataFrame.
@kbench.task(name="single_romanian_med_qa", store_task=False)
def single_romanian_med_qa(llm, Epicrisis: str, Question: str, Answer: str) -> dict:
    """Evaluate the model on one Romanian medical question given its context.

    The model is prompted (in Romanian) with the epicrisis and the question,
    and its reply is scored with a case-insensitive substring match against
    the gold answer.

    Args:
        llm: Model handle; only its ``prompt`` method is used here.
        Epicrisis: Medical context shown to the model.
        Question: Question the model must answer from that context.
        Answer: Ground-truth answer used for scoring.

    Returns:
        A dict with ``is_correct`` (bool), ``predicted_answer`` (the raw model
        response) and ``gold_answer`` (``Answer``, unmodified).
    """
    prompt = f"""
    Citește următorul context medical (Epicrisis) și răspunde la întrebare.

    Context:
    {Epicrisis}

    Întrebare:
    {Question}
    """
    response = llm.prompt(prompt)

    # Normalise the gold answer before matching. Surrounding whitespace from
    # the CSV would otherwise prevent a match, and a non-string value (pandas
    # reads empty cells as NaN) has no ``.lower()`` and would raise.
    gold = Answer.strip().lower() if isinstance(Answer, str) else ""

    # An empty string is a substring of every response, so an empty gold
    # answer must never be scored as correct.
    is_correct = bool(gold) and gold in response.lower()

    return {
        "is_correct": is_correct,
        "predicted_answer": response,
        "gold_answer": Answer,
    }

# 3. Define the main task to evaluate the entire dataset.
@kbench.task(name="evaluate_med_qa_dataset")
def evaluate_med_qa_dataset(llm, df: pd.DataFrame) -> tuple[int, int]:
    """Score the model on every row of ``df`` and report the tally.

    Args:
        llm: Model handle forwarded to the per-row task.
        df: One row per question; passed as the evaluation data.

    Returns:
        ``(correct, total)`` where ``total`` is ``len(df)``. ``correct`` is 0
        when the evaluation produced no rows or no ``result`` column.
    """
    # Fan the per-row task out over the DataFrame (two parallel jobs) inside
    # the client's cache context.
    with kbench.client.enable_cache():
        sweep = single_romanian_med_qa.evaluate(
            llm=[llm],
            evaluation_data=df,
            n_jobs=2,
        )

    results = sweep.as_dataframe()
    n_questions = len(df)

    # Nothing to score if the sweep came back empty or without results.
    has_results = not results.empty and "result" in results.columns
    if not has_results:
        return 0, n_questions

    # Each entry of the 'result' column is the dict returned by the per-row
    # task; pull out its 'is_correct' flag and add the flags up.
    correct_flags = results["result"].str.get("is_correct")
    return int(correct_flags.sum()), n_questions

# 4. Run the dataset-level evaluation task on the reduced DataFrame prepared
#    in step 1. `kbench.llm` is passed as the model under test (presumably the
#    default model configured for this notebook — confirm against the
#    kaggle_benchmarks documentation).
evaluate_med_qa_dataset.run(kbench.llm, df=data_df)

[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.


[Parallel(n_jobs=2)]: Done   4 tasks      | elapsed:   20.5s


[Parallel(n_jobs=2)]: Done  28 tasks      | elapsed:  1.8min


[Parallel(n_jobs=2)]: Done  50 out of  50 | elapsed:  3.2min finished


BokehModel(combine_events=True, render_bundle={'docs_json': {'fdf6d17a-d8e9-4a35-b629-7e5372b42690': {'version…