# data_caption_qa

In [None]:
import json
import random

# 读取图片ID
with open("./travel/data_sameimage_diffqa/data_metainfo.json", "r", encoding="utf-8") as f:
    meta = json.load(f)
image_ids = meta["image_ids"]

# 读取分析结果
with open("./travel/analysiss_all.json", "r", encoding="utf-8") as f:
    analysiss = json.load(f)

# 构建 image_name -> detailed_description 映射
image2desc = {}
for item in analysiss["results"]:
    image2desc[item["image"]] = item["analysis"]["detailed_description"]

# 问题模板（可自定义多个，随机选一个）
question_templates = [
    "请根据图片内容写一段详细描述。",
    "这张图片的详细介绍是什么？",
    "请为这张图片生成一段说明。",
    "请描述这张图片的内容。",
    "请写出这张图片的详细信息。",
]

caption_qa = []
for img_path in image_ids:
    img_name = img_path.split("/")[-1]
    if img_name in image2desc:
        q = random.choice(question_templates)
        a = image2desc[img_name]
        qa_item = {"messages": [{"role": "user", "content": f"<image>{q}"}, {"role": "assistant", "content": a}], "images": [img_path]}
        caption_qa.append(qa_item)
    else:
        print(f"图片 {img_name} 未在 analysiss_all.json 中找到 detailed_description，已跳过。")

# 保存
with open("./travel/caption_qa.json", "w", encoding="utf-8") as f:
    json.dump(caption_qa, f, ensure_ascii=False, indent=2)

print(f"已生成 {len(caption_qa)} 条 caption_qa 数据")

已生成 10 条 caption_qa 数据


# data_sameimage_diffqa

In [7]:
import json
from collections import defaultdict

# 读取原始数据
with open("./travel/dataset.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# 按图片分组
img2qas = defaultdict(list)
for item in data:
    img = item["images"][0]
    img2qas[img].append(item)

# 只取前10张图片
selected_imgs = list(img2qas.keys())[:10]

train, test = [], []
image_ids = []

for img in selected_imgs:
    qas = img2qas[img]
    image_ids.append(img)
    if len(qas) == 1:
        # 只有一个QA对时，训练和测试集都放同一个
        train.append(qas[0])
        test.append(qas[0])
    else:
        # 测试集只保留最后一条，其余都放训练集
        train.extend(qas[:-1])
        test.append(qas[-1])


# 保存
with open("data_part1_60.json", "w", encoding="utf-8") as f:
    json.dump(train, f, ensure_ascii=False, indent=2)
with open("data_part2_30.json", "w", encoding="utf-8") as f:
    json.dump(test, f, ensure_ascii=False, indent=2)

# 保存图片id和统计信息
meta_info = {"image_ids": image_ids, "train_count": len(train), "test_count": len(test), "image_count": len(image_ids)}
with open("data_metainfo.json", "w", encoding="utf-8") as f:
    json.dump(meta_info, f, ensure_ascii=False, indent=2)
print(f"Train: {len(train)}, Test: {len(test)}, Images: {len(image_ids)}")

Train: 13, Test: 10, Images: 10


# data_sameimage_simqa

In [None]:
import openai

client = openai.OpenAI(base_url="http://localhost:11434/v1", api_key="ollama")  # Ollama 默认本地端口

tools = [
    {
        "type": "function",
        "function": {
            "name": "get_weather",
            "description": "获取城市天气",
            "parameters": {"type": "object", "properties": {"city": {"type": "string"}}, "required": ["city"]},
        },
    }
]

messages = [{"role": "user", "content": "北京天气怎么样？"}]

response = client.chat.completions.create(
    model="qwen3:1.7b",
    messages=messages,
    tools=tools,
    tool_choice={"type": "function", "function": {"name": "get_weather"}},
    extra_body={"chat_template_kwargs": {"enable_thinking": False}},
)  # 只要是带 tools 标签的模型都可以

print(response)

ChatCompletion(id='chatcmpl-582', choices=[Choice(finish_reason='tool_calls', index=0, logprobs=None, message=ChatCompletionMessage(content='', refusal=None, role='assistant', audio=None, function_call=None, tool_calls=[ChatCompletionMessageToolCall(id='call_szwxh4gt', function=Function(arguments='{"city":"北京"}', name='get_weather'), type='function', index=0)]))], created=1747546079, model='qwen3:1.7b', object='chat.completion', service_tier=None, system_fingerprint='fp_ollama', usage=CompletionUsage(completion_tokens=94, prompt_tokens=138, total_tokens=232, completion_tokens_details=None, prompt_tokens_details=None))


In [18]:
from openai import OpenAI
import json
from collections import defaultdict


def generate_similar_question(question):
    """用 function-calling 方式生成与输入问题意思相近但表达不同的问题（保持原语言）。"""

    tools = [
        {
            "type": "function",
            "function": {
                "name": "generate_question",
                "description": "生成与输入问题意思相近但表达不同的问题",
                "parameters": {
                    "type": "object",
                    "properties": {"question": {"type": "string", "description": "与输入意思相近但表达不同的问题"}},
                    "required": ["question"],
                },
            },
        }
    ]

    prompt = f"""你是一个只会通过工具（function call）返回答案的助手，不能直接输出答案。
请帮我生成一个与下列问题意思相近但表达完全不同的问题（保持原语言）：
{question}
你必须只通过function call调用"generate_question"返回结果，不要直接输出问题本身或其他内容。"""

    messages = [{"role": "user", "content": prompt}]
    client = OpenAI(base_url="http://localhost:11434/v1", api_key="ollama")
    response = client.chat.completions.create(
        model="qwen3:1.7b",
        messages=messages,
        max_tokens=512,
        temperature=0.0,
        tools=tools,
        tool_choice={"type": "function", "function": {"name": "generate_question"}},
        extra_body={"chat_template_kwargs": {"enable_thinking": False}},
    )
    print(response)

    msg = response.choices[0].message
    func_args = msg.tool_calls[0].function.arguments
    args = json.loads(func_args)
    return args["question"].strip()


# 读取原始数据
with open("./travel/dataset.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# 按图片分组
img2qas = defaultdict(list)
for item in data:
    img = item["images"][0]
    img2qas[img].append(item)

# 只取前10张图片
selected_imgs = list(img2qas.keys())[:10]

train, test = [], []
image_ids = []

similar_questions = defaultdict(list)
for img in selected_imgs:
    qas = img2qas[img]
    image_ids.append(img)
    for qa in qas:
        orig_question = qa["messages"][0]["content"].replace("<image>", "").strip()
        try:
            similar_question = generate_similar_question(orig_question)
            print(f"生成相似问题成功，图片: {img}, 问题: {orig_question}, 相似问题: {similar_question}")
            # 记录原始和相似问题对
            similar_questions[img].append({"original": orig_question, "similar": similar_question})
        except Exception as e:
            print(f"生成相似问题失败，图片: {img}, 问题: {orig_question}, 错误: {e}")
            continue
        # 训练集用原始问题
        train_qa = {"messages": [{"role": "user", "content": f"<image>{orig_question}"}, qa["messages"][1]], "images": qa["images"]}
        # 测试集用相似问题
        test_qa = {"messages": [{"role": "user", "content": f"<image>{similar_question}"}, qa["messages"][1]], "images": qa["images"]}
        train.append(train_qa)
        test.append(test_qa)

# 保存
with open("data_part1_60.json", "w", encoding="utf-8") as f:
    json.dump(train, f, ensure_ascii=False, indent=2)
with open("data_part2_30.json", "w", encoding="utf-8") as f:
    json.dump(test, f, ensure_ascii=False, indent=2)
# 保存图片id和统计信息
meta_info = {
    "image_ids": image_ids,
    "train_count": len(train),
    "test_count": len(test),
    "image_count": len(image_ids),
    "similar_questions": similar_questions,
}
with open("data_metainfo.json", "w", encoding="utf-8") as f:
    json.dump(meta_info, f, ensure_ascii=False, indent=2)

print(f"Train: {len(train)}, Test: {len(test)}, Images: {len(image_ids)}")

ChatCompletion(id='chatcmpl-775', choices=[Choice(finish_reason='tool_calls', index=0, logprobs=None, message=ChatCompletionMessage(content='', refusal=None, role='assistant', audio=None, function_call=None, tool_calls=[ChatCompletionMessageToolCall(id='call_8fd57ba6', function=Function(arguments='{"question":"这张图片展示的是哪个地点？"}', name='generate_question'), type='function', index=0)]))], created=1747546752, model='qwen3:1.7b', object='chat.completion', service_tier=None, system_fingerprint='fp_ollama', usage=CompletionUsage(completion_tokens=157, prompt_tokens=217, total_tokens=374, completion_tokens_details=None, prompt_tokens_details=None))
生成相似问题成功，图片: images/030d76e031414d5303768e3e08bc6d173dc7383325fcb3746d47f3f382b9870e.jpg, 问题: 这张图片显示的是什么地方？, 相似问题: 这张图片展示的是哪个地点？
ChatCompletion(id='chatcmpl-669', choices=[Choice(finish_reason='tool_calls', index=0, logprobs=None, message=ChatCompletionMessage(content='', refusal=None, role='assistant', audio=None, function_call=None, tool_calls=[Cha