In [2]:
import json
import glob

# Function to count total assistant messages and "i don't know" responses
def count_idontknow_responses(jsonl_files):
    total_assistant_lines = 0
    idontknow_count = 0
    
    for file in jsonl_files:
        with open(file, 'r') as f:
            for line in f:
                try:
                    data = json.loads(line)
                    messages = data.get("messages", [])
                    for msg in messages:
                        if msg.get("role") == "assistant":
                            total_assistant_lines += 1
                            content = msg.get("content", "").strip().lower()
                            if "i don't know" in content or "i dont know" in content:
                                idontknow_count += 1
                except json.JSONDecodeError:
                    print(f"Skipping invalid JSON line in {file}")

    return total_assistant_lines, idontknow_count


In [17]:
# Function to count "I don't know" in ground truth answers
def count_ground_truth_idontknow(dataset):
    total_gt_answers = 0
    idontknow_gt_count = 0

    for sample in dataset:
        answers = sample.get("answers", {}).get("ans_full", [])
        if isinstance(answers, list):
            for ans in answers:
                if isinstance(ans, str):
                    total_gt_answers += 1
                    if "i don't know" in ans.lower() or "i dont know" in ans.lower():
                        idontknow_gt_count += 1

    return total_gt_answers, idontknow_gt_count


In [20]:
jsonl_files = glob.glob('sft_caption_data_case_0_zero_shot_30tokens.jsonl')
total, idk = count_idontknow_responses(jsonl_files)
print(f"\n{jsonl_files}\ntotal: {total}")
print(f"'I don't know' in responses: {idk}\nratio: {idk / total if total > 0 else 0:.2f}")

jsonl_files = glob.glob('sft_caption_data_case_0_zero_shot_65tokens.jsonl')
total, idk = count_idontknow_responses(jsonl_files)
print(f"\n{jsonl_files}\ntotal: {total}")
print(f"'I don't know' in responses: {idk}\nratio: {idk / total if total > 0 else 0:.2f}")

jsonl_files = glob.glob('sft_response_data_case_1.jsonl')
total, idk = count_idontknow_responses(jsonl_files)
print(f"\n{jsonl_files}\ntotal: {total}")
print(f"'I don't know' in responses: {idk}\nratio: {idk / total if total > 0 else 0:.2f}")

jsonl_files = glob.glob('sft_response_data_case_2_web_search_only.jsonl')
total, idk = count_idontknow_responses(jsonl_files)
print(f"\n{jsonl_files}\ntotal: {total}")
print(f"'I don't know' in responses: {idk}\nratio: {idk / total if total > 0 else 0:.2f}")

jsonl_files = glob.glob('sft_response_data_case_3_image_search_only.jsonl')
total, idk = count_idontknow_responses(jsonl_files)
print(f"\n{jsonl_files}\ntotal: {total}")
print(f"'I don't know' in responses: {idk}\nratio: {idk / total if total > 0 else 0:.2f}")

jsonl_files = glob.glob('sft_response_data_case_4_image_web.jsonl')
total, idk = count_idontknow_responses(jsonl_files)
print(f"\n{jsonl_files}\ntotal: {total}")
print(f"'I don't know' in responses: {idk}\nratio: {idk / total if total > 0 else 0:.2f}")

jsonl_files = glob.glob('sft_response_data_case_5_web_search_rephrase.jsonl')
total, idk = count_idontknow_responses(jsonl_files)
print(f"\n{jsonl_files}\ntotal: {total}")
print(f"'I don't know' in responses: {idk}\nratio: {idk / total if total > 0 else 0:.2f}")

jsonl_files = glob.glob('sft_response_data_case_5_web_search_rephrase.jsonl')
total, idk = count_idontknow_responses(jsonl_files)
print(f"\n{jsonl_files}\ntotal: {total}")
print(f"'I don't know' in responses: {idk}\nratio: {idk / total if total > 0 else 0:.2f}")



['sft_caption_data_case_0_zero_shot_30tokens.jsonl']
total: 1326
'I don't know' in responses: 965
ratio: 0.73

['sft_caption_data_case_0_zero_shot_65tokens.jsonl']
total: 1326
'I don't know' in responses: 965
ratio: 0.73

['sft_response_data_case_1.jsonl']
total: 1326
'I don't know' in responses: 648
ratio: 0.49

['sft_response_data_case_2_web_search_only.jsonl']
total: 1326
'I don't know' in responses: 433
ratio: 0.33

['sft_response_data_case_3_image_search_only.jsonl']
total: 1326
'I don't know' in responses: 651
ratio: 0.49

['sft_response_data_case_4_image_web.jsonl']
total: 1326
'I don't know' in responses: 424
ratio: 0.32

['sft_response_data_case_5_web_search_rephrase.jsonl']
total: 1326
'I don't know' in responses: 434
ratio: 0.33

['sft_response_data_case_5_web_search_rephrase.jsonl']
total: 1326
'I don't know' in responses: 434
ratio: 0.33


In [None]:
from datasets import load_dataset
dataset = load_dataset("crag-mm-2025/crag-mm-single-turn-public", split="validation")

  from .autonotebook import tqdm as notebook_tqdm
Downloading readme: 100%|██████████| 8.42k/8.42k [00:00<00:00, 10.5MB/s]
Downloading data: 100%|██████████| 467M/467M [03:03<00:00, 2.54MB/s] 
Downloading data: 100%|██████████| 458M/458M [03:47<00:00, 2.02MB/s] 
Downloading data:  37%|███▋      | 220M/594M [01:55<02:52, 2.16MB/s] Error while downloading from https://huggingface.co/datasets/crag-mm-2025/crag-mm-single-turn-public/resolve/711dd84fa2f1611975d476261afcb07292151923/data/validation-00002-of-00005.parquet: HTTPSConnectionPool(host='cdn-lfs-us-1.hf.co', port=443): Read timed out.
Trying to resume download...
Downloading data: 100%|██████████| 594M/594M [05:09<00:00, 1.92MB/s]
Downloading data:  70%|██████▉   | 231M/330M [04:00<00:43, 2.30MB/s] Error while downloading from https://huggingface.co/datasets/crag-mm-2025/crag-mm-single-turn-public/resolve/711dd84fa2f1611975d476261afcb07292151923/data/validation-00003-of-00005.parquet: HTTPSConnectionPool(host='cdn-lfs-us-1.hf.co', 

In [26]:
# ONLY GROUND TRUTH ANSWERS SAVED
# Save to a local file (one JSON object per line)
# dataset.to_json("crag_gt.jsonl", orient="records", lines=True)

import json
from datasets import load_dataset

output_path = "crag_gt.jsonl"

with open(output_path, "w", encoding="utf-8", errors="replace") as f:
    for session_id, answer in zip(dataset["session_id"], dataset["answers"]):
        obj = {
            "session_id": session_id,
            "ans_full": answer.get("ans_full", [])
        }
        f.write(json.dumps(obj, ensure_ascii=False) + "\n")

print(f"Saved {len(dataset)} entries to {output_path}")

Saved 1938 entries to crag_gt.jsonl


In [18]:
count_ground_truth_idontknow = count_ground_truth_idontknow(dataset)
print(f"\nGround truth answers total: {count_ground_truth_idontknow[0]}")
print(f"'I don't know' in ground truth answers: {count_ground_truth_idontknow[1]}")
print(f"Ratio: {count_ground_truth_idontknow[1] / count_ground_truth_idontknow[0] if count_ground_truth_idontknow[0] > 0 else 0:.2f}")


Ground truth answers total: 1938
'I don't know' in ground truth answers: 0
Ratio: 0.00


In [19]:
def find_ground_truth_from_session_id(session_id):

    target_session_id = session_id
    # Find the matching example
    match = next((item for item in dataset if item["session_id"] == target_session_id), None)

    # Step 3: Load and display the image
    if match:
        image = match["image"]
        print("Session ID:", match["session_id"])
        print("Question:", match["turns"])
        print("Answer:", match["answers"]["ans_full"]) # HERE
        print("PIL Image Object:", image)
        print("Image size:", image.size)
        print("Image mode:", image.mode)
        image.show() # HERE
    else:
        print("No matching session_id found.")

In [None]:
# what needs to be done
# compare ground truth vs. our responses in scenarios
# generate a [0, 1, -1, ...]
# 0 as "I don't know" in our responses, 1 as correct answer as ground truth, -1 as our response try to answer but not correct, and so on

# ultimate goal is to make the model make the most use of the found search results
# with good reasoning, knowing when to answer, when to say "I don't know"
# 


# currently image search results are not in at all. To update
# zero-shot - be a helpful assistant

In [None]:
# METHOD 2: Launch from JSONL file (ground truth and session id only in the file)

import json
from evaluation import CRAGEvaluator

# === Config
PREDICTIONS_FILE = "sft_caption_data_case_0_zero_shot_65tokens.jsonl"
GROUND_TRUTH_FILE = "crag_gt.jsonl"  # ✅ your local GT .jsonl file
LINE_NUM = 2  # ✅ set to 0, 1, ... to evaluate one line; or None to run all

# === Load evaluator using local GT
evaluator = CRAGEvaluator.from_jsonl(GROUND_TRUTH_FILE)

# === Evaluate predictions
with open(PREDICTIONS_FILE, 'r') as f:
    for i, line in enumerate(f):
        if LINE_NUM is not None and i != LINE_NUM:
            continue
        sample = json.loads(line)
        result = evaluator.evaluate_one_jsonl_line(sample)
        print(f"[Line {i}] →", result)
        if LINE_NUM is not None:
            break

# self-quick EVALUATION
# run_eval.py


# # METHOD 1: Launch from dataset
# import json
# from datasets import load_dataset
# from evaluation import CRAGEvaluator

# # === Config ===
# JSONL_FILE = "sft_caption_data_case_0_zero_shot_65tokens.jsonl"
# LINE_NUM = 1  # ✅ Set to 0-based line index, e.g. 0 or 5, or keep as None to run all

# # === Load dataset ===
# dataset = load_dataset("crag-mm-2025/crag-mm-single-turn-public", split="validation")
# evaluator = CRAGEvaluator(dataset)

# # === Evaluate
# with open(JSONL_FILE, 'r') as f:
#     if LINE_NUM is not None:
#         for i, line in enumerate(f):
#             if i == LINE_NUM:
#                 sample = json.loads(line)
#                 result = evaluator.evaluate_one_jsonl_line(sample)
#                 print(f"[Line {i}] →", result)
#                 break
#     else:
#         for i, line in enumerate(f):
#             sample = json.loads(line)
#             result = evaluator.evaluate_one_jsonl_line(sample)
#             print(f"[Line {i}] →", result)

AttributeError: type object 'CRAGEvaluator' has no attribute 'from_jsonl'