In [2]:
import json
import glob

# Function to count total assistant messages and "i don't know" responses
def count_idontknow_responses(jsonl_files):
    total_assistant_lines = 0
    idontknow_count = 0
    
    for file in jsonl_files:
        with open(file, 'r') as f:
            for line in f:
                try:
                    data = json.loads(line)
                    messages = data.get("messages", [])
                    for msg in messages:
                        if msg.get("role") == "assistant":
                            total_assistant_lines += 1
                            content = msg.get("content", "").strip().lower()
                            if "i don't know" in content or "i dont know" in content:
                                idontknow_count += 1
                except json.JSONDecodeError:
                    print(f"Skipping invalid JSON line in {file}")

    return total_assistant_lines, idontknow_count


In [17]:
# Function to count "I don't know" in ground truth answers
def count_ground_truth_idontknow(dataset):
    total_gt_answers = 0
    idontknow_gt_count = 0

    for sample in dataset:
        answers = sample.get("answers", {}).get("ans_full", [])
        if isinstance(answers, list):
            for ans in answers:
                if isinstance(ans, str):
                    total_gt_answers += 1
                    if "i don't know" in ans.lower() or "i dont know" in ans.lower():
                        idontknow_gt_count += 1

    return total_gt_answers, idontknow_gt_count


In [20]:
jsonl_files = glob.glob('sft_caption_data_case_0_zero_shot_30tokens.jsonl')
total, idk = count_idontknow_responses(jsonl_files)
print(f"\n{jsonl_files}\ntotal: {total}")
print(f"'I don't know' in responses: {idk}\nratio: {idk / total if total > 0 else 0:.2f}")

jsonl_files = glob.glob('sft_caption_data_case_0_zero_shot_65tokens.jsonl')
total, idk = count_idontknow_responses(jsonl_files)
print(f"\n{jsonl_files}\ntotal: {total}")
print(f"'I don't know' in responses: {idk}\nratio: {idk / total if total > 0 else 0:.2f}")

jsonl_files = glob.glob('sft_response_data_case_1.jsonl')
total, idk = count_idontknow_responses(jsonl_files)
print(f"\n{jsonl_files}\ntotal: {total}")
print(f"'I don't know' in responses: {idk}\nratio: {idk / total if total > 0 else 0:.2f}")

jsonl_files = glob.glob('sft_response_data_case_2_web_search_only.jsonl')
total, idk = count_idontknow_responses(jsonl_files)
print(f"\n{jsonl_files}\ntotal: {total}")
print(f"'I don't know' in responses: {idk}\nratio: {idk / total if total > 0 else 0:.2f}")

jsonl_files = glob.glob('sft_response_data_case_3_image_search_only.jsonl')
total, idk = count_idontknow_responses(jsonl_files)
print(f"\n{jsonl_files}\ntotal: {total}")
print(f"'I don't know' in responses: {idk}\nratio: {idk / total if total > 0 else 0:.2f}")

jsonl_files = glob.glob('sft_response_data_case_4_image_web.jsonl')
total, idk = count_idontknow_responses(jsonl_files)
print(f"\n{jsonl_files}\ntotal: {total}")
print(f"'I don't know' in responses: {idk}\nratio: {idk / total if total > 0 else 0:.2f}")

jsonl_files = glob.glob('sft_response_data_case_5_web_search_rephrase.jsonl')
total, idk = count_idontknow_responses(jsonl_files)
print(f"\n{jsonl_files}\ntotal: {total}")
print(f"'I don't know' in responses: {idk}\nratio: {idk / total if total > 0 else 0:.2f}")

jsonl_files = glob.glob('sft_response_data_case_5_web_search_rephrase.jsonl')
total, idk = count_idontknow_responses(jsonl_files)
print(f"\n{jsonl_files}\ntotal: {total}")
print(f"'I don't know' in responses: {idk}\nratio: {idk / total if total > 0 else 0:.2f}")



['sft_caption_data_case_0_zero_shot_30tokens.jsonl']
total: 1326
'I don't know' in responses: 965
ratio: 0.73

['sft_caption_data_case_0_zero_shot_65tokens.jsonl']
total: 1326
'I don't know' in responses: 965
ratio: 0.73

['sft_response_data_case_1.jsonl']
total: 1326
'I don't know' in responses: 648
ratio: 0.49

['sft_response_data_case_2_web_search_only.jsonl']
total: 1326
'I don't know' in responses: 433
ratio: 0.33

['sft_response_data_case_3_image_search_only.jsonl']
total: 1326
'I don't know' in responses: 651
ratio: 0.49

['sft_response_data_case_4_image_web.jsonl']
total: 1326
'I don't know' in responses: 424
ratio: 0.32

['sft_response_data_case_5_web_search_rephrase.jsonl']
total: 1326
'I don't know' in responses: 434
ratio: 0.33

['sft_response_data_case_5_web_search_rephrase.jsonl']
total: 1326
'I don't know' in responses: 434
ratio: 0.33


In [None]:
from datasets import load_dataset
dataset = load_dataset("crag-mm-2025/crag-mm-single-turn-public", split="validation")

  from .autonotebook import tqdm as notebook_tqdm
Downloading readme: 100%|██████████| 8.42k/8.42k [00:00<00:00, 10.5MB/s]
Downloading data: 100%|██████████| 467M/467M [03:03<00:00, 2.54MB/s] 
Downloading data: 100%|██████████| 458M/458M [03:47<00:00, 2.02MB/s] 
Downloading data:  37%|███▋      | 220M/594M [01:55<02:52, 2.16MB/s] Error while downloading from https://huggingface.co/datasets/crag-mm-2025/crag-mm-single-turn-public/resolve/711dd84fa2f1611975d476261afcb07292151923/data/validation-00002-of-00005.parquet: HTTPSConnectionPool(host='cdn-lfs-us-1.hf.co', port=443): Read timed out.
Trying to resume download...
Downloading data: 100%|██████████| 594M/594M [05:09<00:00, 1.92MB/s]
Downloading data:  70%|██████▉   | 231M/330M [04:00<00:43, 2.30MB/s] Error while downloading from https://huggingface.co/datasets/crag-mm-2025/crag-mm-single-turn-public/resolve/711dd84fa2f1611975d476261afcb07292151923/data/validation-00003-of-00005.parquet: HTTPSConnectionPool(host='cdn-lfs-us-1.hf.co', 

In [26]:
# ONLY GROUND TRUTH ANSWERS SAVED
# Save to a local file (one JSON object per line)
# dataset.to_json("crag_gt.jsonl", orient="records", lines=True)

import json
from datasets import load_dataset

output_path = "crag_gt.jsonl"

with open(output_path, "w", encoding="utf-8", errors="replace") as f:
    for session_id, answer in zip(dataset["session_id"], dataset["answers"]):
        obj = {
            "session_id": session_id,
            "ans_full": answer.get("ans_full", [])
        }
        f.write(json.dumps(obj, ensure_ascii=False) + "\n")

print(f"Saved {len(dataset)} entries to {output_path}")

Saved 1938 entries to crag_gt.jsonl


In [18]:
count_ground_truth_idontknow = count_ground_truth_idontknow(dataset)
print(f"\nGround truth answers total: {count_ground_truth_idontknow[0]}")
print(f"'I don't know' in ground truth answers: {count_ground_truth_idontknow[1]}")
print(f"Ratio: {count_ground_truth_idontknow[1] / count_ground_truth_idontknow[0] if count_ground_truth_idontknow[0] > 0 else 0:.2f}")


Ground truth answers total: 1938
'I don't know' in ground truth answers: 0
Ratio: 0.00


In [19]:
def find_ground_truth_from_session_id(session_id):

    target_session_id = session_id
    # Find the matching example
    match = next((item for item in dataset if item["session_id"] == target_session_id), None)

    # Step 3: Load and display the image
    if match:
        image = match["image"]
        print("Session ID:", match["session_id"])
        print("Question:", match["turns"])
        print("Answer:", match["answers"]["ans_full"]) # HERE
        print("PIL Image Object:", image)
        print("Image size:", image.size)
        print("Image mode:", image.mode)
        image.show() # HERE
    else:
        print("No matching session_id found.")

In [None]:
# what needs to be done
# compare ground truth vs. our responses in scenarios
# generate a [0, 1, -1, ...]
# 0 as "I don't know" in our responses, 1 as correct answer as ground truth, -1 as our response try to answer but not correct, and so on

# ultimate goal is to make the model make the most use of the found search results
# with good reasoning, knowing when to answer, when to say "I don't know"
# 


# currently image search results are not in at all. To update
# zero-shot - be a helpful assistant

In [2]:
# METHOD 2: Launch from JSONL file (ground truth and session id only in the file)

import os
import json
from evaluation import CRAGEvaluator
import time

# === Config
PREDICTIONS_FILE = "sft_response_data_case_5_web_search_rephrase.jsonl"
GROUND_TRUTH_FILE = "crag_gt.jsonl"
OUTPUT_FILE = "results_summary.jsonl"
MAX_LINES = None  # set to an int to limit number of evaluated lines

# ✅ Derive key from JSONL filename (without extension)
RESPONSE_NAME = os.path.splitext(os.path.basename(PREDICTIONS_FILE))[0]

# === Load evaluator
evaluator = CRAGEvaluator.from_jsonl(GROUND_TRUTH_FILE)

# === Evaluate predictions and collect results
results = []
with open(PREDICTIONS_FILE, "r") as f:
    for i, line in enumerate(f):
        if MAX_LINES is not None and i >= MAX_LINES:
            break
        sample = json.loads(line)
        result = evaluator.evaluate_one_jsonl_line(sample)
        results.append(result["accuracy"])
        # time.sleep(0.5)

# === Format as {"filename_key": [accuracies]} and append
record = {RESPONSE_NAME: results}

with open(OUTPUT_FILE, "a", encoding="utf-8") as out:
    out.write(json.dumps(record, ensure_ascii=False) + "\n")

print(f"✅ Appended: {RESPONSE_NAME} → {len(results)} evaluations to {OUTPUT_FILE}")

2025-06-11 17:00:29,159 - INFO - [Eval] session=fd93d7e6-9cdf-4334-a5be-c2ad13880d0b
Q: 
GT: no, the subaru wrx is a compact car with a total passenger capacity of 5 people.
Pred: No, the Subaru WRX is not designed to transport seven passengers at once, as stated in the image caption.
Grok: {
    "accuracy": 1
}
2025-06-11 17:00:29,705 - INFO - [Eval] session=eaab8630-64ed-4feb-84e3-a20e0db97258
Q: 
GT: a 2023 chevrolet trailblazer with a 1.3l awd engine can travel from washington, dc to baltimore on 5 gallons of gas, with a distance of 40–45 miles between the cities. with an average fuel efficiency of 28 mpg, it can travel up to 140 miles on 5 gallons.
Pred: I don't know. The provided information does not include the exact fuel tank capacity of the 2023 Chevrolet Trailblazer AWD with a 1.3L engine, which is necessary to calculate the distance it can travel on 5 gallons of gas.
Grok: {
    "accuracy": 0
}
2025-06-11 17:00:32,190 - INFO - [Eval] session=4cd79b63-6e4b-400f-aee0-f917dc144

In [3]:
import json
from collections import defaultdict

def analyze_results_summary(results_file):
    with open(results_file, 'r') as f:
        for line in f:
            data = json.loads(line)
            for filename, results in data.items():
                # Initialize default counters
                counts = {
                    1: 0,    # Correct
                    0: 0,    # "I don't know"
                    -1: 0,   # Incorrect
                    -0.5: 0  # Uncertain
                }
                others = defaultdict(int)

                # Count accuracy values
                for result in results:
                    if result in counts:
                        counts[result] += 1
                    else:
                        others[result] += 1

                total = sum(counts.values()) + sum(others.values())

                # Print results
                summary = (
                    f"{filename}: 1: {counts[1]}; "
                    f"0: {counts[0]}, -1: {counts[-1]}, -0.5: {counts[-0.5]}"
                )
                for val, cnt in others.items():
                    summary += f", others:{val}: {cnt}"
                summary += f", total: {total}"
                print(summary)

if __name__ == "__main__":
    analyze_results_summary("results_summary.jsonl")


sft_caption_data_case_0_zero_shot_65tokens: 1: 149; 0: 968, -1: 207, -0.5: 2, total: 1326
sft_response_data_case_5_web_search_rephrase: 1: 295; 0: 437, -1: 589, -0.5: 5, total: 1326


In [8]:
# how many of 589 -1 includes i don't know
import json

def count_uncertain_answers_with_accuracy_filter(jsonl_path):
    keywords = ["i don't know", "not sure", "unsure"]
    uncertain_count = 0
    total_checked = 0

    with open(jsonl_path, 'r') as f:
        for line in f:
            data = json.loads(line)
            if data.get("accuracy") != -1:
                continue  # Skip if not accuracy = -1

            total_checked += 1
            for message in data.get("messages", []):
                if message.get("role") == "assistant":
                    content = message.get("content", "").lower()
                    if any(keyword in content for keyword in keywords):
                        uncertain_count += 1
                    break  # Only check the first assistant response

    print(f"Total with accuracy=-1: {total_checked}")
    print(f"Uncertain responses: {uncertain_count}")
    print(f"Percentage: {uncertain_count / total_checked:.2f}%")
    return uncertain_count

# Example usage
count_uncertain_answers_with_accuracy_filter("selected_pipeline_finetune_data.jsonl")


Total with accuracy=-1: 589
Uncertain responses: 3
Percentage: 0.01%


3

In [2]:
from filter_wrong_answers import enrich_and_append

INPUT_FILE = "selected_pipeline_finetune_data.jsonl"
OUTPUT_FILE = "selected_pipeline_finetune_data_wrong_answers_with_flag.jsonl"

if __name__ == "__main__":
    enrich_and_append(INPUT_FILE, OUTPUT_FILE)
    print(f"✅ Done enriching {INPUT_FILE} → {OUTPUT_FILE}")


✅ Done enriching selected_pipeline_finetune_data.jsonl → selected_pipeline_finetune_data_wrong_answers_with_flag.jsonl


In [4]:
# how many wrong answers (accuracy = -1) are info sufficient so we can give ground truth?
import json

def count_info_sufficiency(file_path: str):
    total_neg1 = 0
    sufficient_count = 0

    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            data = json.loads(line)
            if data.get("accuracy") == -1:
                total_neg1 += 1
                if data.get("if_-1_is_info_enough_for_truth") == 1:
                    sufficient_count += 1

    print(f"Total accuracy = -1 entries: {total_neg1}")
    print(f"Of those, info_sufficient = 1: {sufficient_count}")

count_info_sufficiency("selected_pipeline_finetune_data_wrong_answers_with_flag.jsonl")

Total accuracy = -1 entries: 589
Of those, info_sufficient = 1: 244


In [5]:
import json

def replace_output_with_ground_truth(input_file: str, output_file: str):
    with open(input_file, "r", encoding="utf-8") as infile, \
         open(output_file, "w", encoding="utf-8") as outfile:
        for line in infile:
            data = json.loads(line)

            # Check condition and replace output if needed
            if data.get("accuracy") == -1 and data.get("if_-1_is_info_enough_for_truth") == 1:
                data["finetune_output"] = data.get("ground_truth", "")

            # Write modified (or unmodified) line
            outfile.write(json.dumps(data, ensure_ascii=False) + "\n")

    print(f"✅ Updated dataset saved to: {output_file}")

# Example usage
if __name__ == "__main__":
    replace_output_with_ground_truth(
        input_file="selected_pipeline_finetune_data_wrong_answers_with_flag.jsonl",
        output_file="selected_pipeline_finetune_data_final.jsonl"
    )


✅ Updated dataset saved to: selected_pipeline_finetune_data_final.jsonl


In [3]:
import json
import pandas as pd
from datasets import load_dataset, Dataset, Features, Value, Image
from huggingface_hub import login
import os

# === (Optional) Login using Hugging Face token ===
# login(token="your_hf_token")  # or make sure you've run `huggingface-cli login`

# === Step 1: Load Hugging Face image dataset ===
print("🔄 Loading HF image dataset...")

hf_dataset = load_dataset("crag-mm-2025/crag-mm-single-turn-public", split="validation")
session_to_image = {
    example["session_id"]: example["image"] for example in hf_dataset
}
print("✅ HF dataset loaded.")

🔄 Loading HF image dataset...
✅ HF dataset loaded.


In [4]:
import json
import pandas as pd
from datasets import load_dataset
from PIL import Image as PILImage
from io import BytesIO
import pickle

# === Step 1: Load and filter JSONL ===
jsonl_path = "selected_pipeline_finetune_data_final.jsonl"
local_df = pd.read_json(jsonl_path, lines=True)
local_df["session_id"] = local_df["session_id"].astype(str)

# ✅ Filter to keep only accuracy == -1
local_df = local_df[local_df["accuracy"].isin([-1])].reset_index(drop=True) #[1, -1]

# === Step 3: Combine into message format ===
result_data = []

for example in local_df.to_dict(orient="records"):
    session_id = example["session_id"]
    image = session_to_image.get(session_id)
    if image is None:
        continue

    try:
        messages = example["messages"]
        user_msg = next(m for m in messages if m["role"] == "user")
        context_text = user_msg["content"]
    except Exception as e:
        print(f"⚠️ Skipping malformed message: {session_id}")
        continue

    result_data.append({
        "messages": [
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": context_text},
                    {"type": "image", "image": image}
                ]
            },
            {
                "role": "assistant",
                "content": [{"type": "text", "text": example["finetune_output"]}]
            }
        ]
    })

# === Step 4: Save ===
with open("train_conv.pkl", "wb") as f:
    pickle.dump(result_data, f)

print(f"✅ Saved {len(result_data)} filtered examples to train_conv.pkl")

✅ Saved 589 filtered examples to train_conv.pkl


In [1]:
import pickle

try:
    with open("train_conv.pkl", "rb") as f:
        train_conv = pickle.load(f)

    print(f"✅ Loaded {len(train_conv)} examples from train_conv.pkl")

    # Print the 5th item (index 4) if it exists
    if len(train_conv) >= 5:
        print("🔍 Example #5:")
        print(train_conv[4])
    else:
        print("⚠️ Less than 5 examples in the file.")

except Exception as e:
    print(f"❌ Failed to load pickle file: {e}")


✅ Loaded 589 examples from train_conv.pkl
🔍 Example #5:
{'messages': [{'role': 'user', 'content': [{'type': 'text', 'text': 'Given the context below and the image, answer the question truthfully in one line. Use context to support your answer explicitly. If insufficient information is available, say so.\n\n##Image Caption: The object in the image is a Nissan 300ZX, which is an American-assembled vehicle.\n##Some Context: \n##Question: can i purchase an american assembled vehicle of this model?\n##Answer:'}, {'type': 'image', 'image': <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=3024x4032 at 0x110A21840>}]}, {'role': 'assistant', 'content': [{'type': 'text', 'text': "I don't know"}]}]}


In [None]:
# eval after first finetuning - 805 1 / -1, 0612 1230AM
jsonl_files = glob.glob('sft_caption_data_case_0_zero_shot_30tokens.jsonl')
total, idk = count_idontknow_responses(jsonl_files)
print(f"\n{jsonl_files}\ntotal: {total}")
print(f"'I don't know' in responses: {idk}\nratio: {idk / total if total > 0 else 0:.2f}")