In [None]:
import json
import logging
import os
import re

import requests

# Configure logging: records at INFO and above go to the console and to a file.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.StreamHandler(),  # echo to the console / notebook output
        # mode='w' truncates the log file every time this cell is executed
        logging.FileHandler("evaluation_log.txt", mode='w', encoding='utf-8')
    ]
)
logger = logging.getLogger()

# Chat-completions endpoint used for the judge model.
# Alternative endpoint kept for reference:
# base_url = "https://api.siliconflow.cn/v1/chat/completions"
base_url = "https://dashscope.aliyuncs.com/compatible-mode/v1/chat/completions"

# The API key is read from the environment so it never has to be pasted into
# (and accidentally committed with) the notebook. When the variable is unset
# the Authorization header stays empty, exactly as it was before.
_api_key = os.environ.get("DASHSCOPE_API_KEY", "").strip()
HEADERS = {
    "Content-Type": "application/json",
    "Authorization": f"Bearer {_api_key}" if _api_key else ""
}
MODEL = "qwen-max-2025-01-25"

In [2]:
def read_query(query_file, x):
    """Return the text of question number ``x`` stored in ``query_file``.

    The file is scanned line by line for the marker ``问题{x}``. The first line
    that carries the marker is returned with the ``问题{x}：`` prefix removed
    and surrounding whitespace stripped.

    Args:
        query_file: Path of a UTF-8 text file.
        x: Question number to look for (anything ``str()`` can render).

    Returns:
        The question text, or ``None`` when no line carries the marker.
    """
    with open(query_file, "r", encoding="utf-8") as f:
        content = f.read()

    label = re.escape(str(x))
    # (?!\d) stops 问题1 from also matching 问题10, 问题11, ...
    marker = re.compile(rf"问题{label}(?!\d)")
    # The marker together with its colon; full-width and ASCII colons are both accepted.
    prefix = re.compile(rf"问题{label}[：:]")

    for line in content.splitlines():
        if marker.search(line):
            return prefix.sub("", line).strip()

    return None

def read_answer(answer_file):
    """Extract the first answer section from ``answer_file``.

    Collection starts at the first line beginning with ``答案：`` (the marker
    itself is dropped) and continues until a line beginning with ``问题：`` is
    met or the file ends. Lines before the first answer marker are ignored.

    Args:
        answer_file: Path of a UTF-8 text file.

    Returns:
        The collected lines joined with newlines and stripped of surrounding
        whitespace, or ``None`` when nothing was collected.
    """
    question_tag = "问题："
    answer_tag = "答案："
    collected = []
    capturing = False

    with open(answer_file, "r", encoding="utf-8") as handle:
        for raw in handle:
            text = raw.strip('\n')  # drop only the newline, keep in-line spaces

            if text.startswith(question_tag):
                if capturing:
                    break  # the next question block ends the answer
                continue  # question header seen before any answer: skip it

            if text.startswith(answer_tag):
                capturing = True
                text = text.replace(answer_tag, "", 1)  # remove only the leading marker
            elif not capturing:
                continue  # still in front of the answer section

            collected.append(text)

    joined = '\n'.join(collected).strip()
    return joined or None

def read_raw_answer(answer_file, x):
    """Return the text of reference answer number ``x`` stored in ``answer_file``.

    The file is scanned line by line for the marker ``答案{x}``. The first line
    that carries the marker is returned with the ``答案{x}：`` prefix removed
    and surrounding whitespace stripped.

    Args:
        answer_file: Path of a UTF-8 text file.
        x: Answer number to look for (anything ``str()`` can render).

    Returns:
        The answer text, or ``None`` when no line carries the marker.
    """
    with open(answer_file, "r", encoding="utf-8") as f:
        content = f.read()

    label = re.escape(str(x))
    # (?!\d) stops 答案1 from also matching 答案10, 答案11, ...
    marker = re.compile(rf"答案{label}(?!\d)")
    # The marker together with its colon; full-width and ASCII colons are both accepted.
    prefix = re.compile(rf"答案{label}[：:]")

    for line in content.splitlines():
        if marker.search(line):
            return prefix.sub("", line).strip()

    return None

In [3]:
def _strip_code_fence(text):
    """Return ``text`` without a surrounding Markdown code fence, if it has one."""
    cleaned = text.strip()
    if not cleaned.startswith("```"):
        return cleaned

    cleaned = cleaned[3:]
    if cleaned[:4].lower() == "json":  # optional language tag after the opening fence
        cleaned = cleaned[4:]

    cleaned = cleaned.rstrip()
    # Only cut the closing fence when it is really there; slicing blindly
    # would chop the last characters of the JSON document.
    if cleaned.endswith("```"):
        cleaned = cleaned[:-3]
    return cleaned.strip()


def evaluate(query_file, answer1_file, answer2_file, output_file, x, timeout=300):
    """Ask the judge model to compare two answers and save its verdict as JSON.

    Question ``x`` and reference answer ``x`` are read from ``query_file``, the
    two candidate answers from ``answer1_file`` and ``answer2_file``. They are
    placed in a prompt that is posted to ``base_url`` with ``HEADERS`` and
    ``MODEL``. The reply must be a JSON object containing the four verdict
    sections; it is written to ``output_file``.

    Every failure is logged and the function returns without raising.

    Args:
        query_file: Path of the file holding the question and reference answer.
        answer1_file: Path of the file holding the answer shown as "答案1".
        answer2_file: Path of the file holding the answer shown as "答案2".
        output_file: Path of the JSON file to write; its directory is created
            when missing.
        x: Number of the question / reference answer inside ``query_file``.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        None.
    """
    query = read_query(query_file, x)
    raw_answer = read_raw_answer(query_file, x)
    answer1 = read_answer(answer1_file)
    answer2 = read_answer(answer2_file)
    logger.info(f"Evaluating {query_file}...")
    logger.debug(f"Query: {query}")

    # A missing question or candidate would be sent to the judge as the literal
    # text "None" and yield a meaningless verdict, so skip before the API call.
    missing_inputs = [
        name
        for name, value in (("query", query), ("answer1", answer1), ("answer2", answer2))
        if value is None
    ]
    if missing_inputs:
        logger.error(f"Skipping {query_file}: could not read {', '.join(missing_inputs)}")
        return
    if raw_answer is None:
        # Kept non-fatal: the comparison can still run without a reference answer.
        logger.warning(f"No reference answer {x} found in {query_file}")

    sys_prompt = """
    ---Role---
    你是一位经验丰富的金融领域专家，精通宏观经济、行业分析、公司财务分析和投资策略。你已经阅读了大量金融研报，能够基于数据驱动的分析方法，提供深入的市场解读和投资建议。你的回答严谨、逻辑清晰，并符合专业金融研究的标准。现在，你负责根据以下三个标准评估针对同一问题的两个答案：**全面性**、**多样性**和**实用性**。
    """
    
    prompt = f"""
    你将根据以下三个标准评估针对同一问题的两个答案：**全面性**、**多样性**和**实用性**。
    
    - **全面性**：答案是否全面覆盖金融问题的核心要素？需包括市场动态、政策解读、关键数据指标、风险因素，并提供数据支持或案例佐证。
    - **多样性**：答案是否呈现多元视角？需包含宏观经济趋势、行业细分差异、投资策略、监管政策，并整合多方观点。
    - **实用性**：答案是否具备决策支持价值？需提供可操作建议、风险评估模型、历史规律总结，或明确政策影响传导路径。

    评估要求：
    请基于问题的原始正确答案，严格对比答案1和答案2，针对每个标准选择更优者，并详细解释原因。
    请注意，必须在答案1或答案2中选择一个更优的，不可判定为平局。
    最后，三个标准的综合考量下，选出一个总体优胜者，并提供总结性的评估理由。  

    以下是问题：
    {query}

    以下是问题的原始正确答案：
    {raw_answer}
    
    以下是两个答案：
    
    **答案1：**
    {answer1}
    
    **答案2：**
    {answer2}
    
    请根据上述三个标准评估这两个答案，并为每个标准提供详细的解释。
    
    请以以下JSON格式输出您的评估结果,输出内容只需要包含JSON格式的评估结果，不要包含其他内容：
    
    {{
        "全面性": {{
            "Winner": "答案1或答案2",
            "Explanation": "请在此提供解释"
        }},
        "多样性": {{
            "Winner": "答案1或答案2",
            "Explanation": "请在此提供解释"
        }},
        "实用性": {{
            "Winner": "答案1或答案2",
            "Explanation": "请在此提供解释"
        }},
        "总体优胜者": {{
            "Winner": "答案1或答案2",
            "Explanation": "综合以上三个标准总结为何该答案胜出"
        }}
    }}
    """
    
    # Bound before the try block so the KeyError handler can always refer to it.
    response_data = None

    try:
        request_body = {
            "model": MODEL,
            "messages": [
                {"role": "system", "content": sys_prompt.strip()},
                {"role": "user", "content": prompt.strip()}
            ],
            "stream": False,
        }
        # Without a timeout a stalled connection blocks the whole batch forever
        # and the Timeout handler below can never fire.
        response = requests.post(
            base_url,
            headers=HEADERS,
            json=request_body,
            timeout=timeout,
        )

        # Check the HTTP status code
        if response.status_code != 200:
            logger.error(f"API请求失败，状态码：{response.status_code}，错误信息：{response.text}")
            return

        # Decode the response body
        response_data = response.json()
        logger.debug(f"原始响应数据：{json.dumps(response_data, indent=2)}")

        eval_result = None

        # Shape 1: OpenAI-compatible payload with a "choices" list
        if "choices" in response_data:
            if len(response_data["choices"]) == 0:
                logger.error("响应格式1中缺少choices数组内容")
                return

            choice = response_data["choices"][0]
            message = choice.get("message", {})
            eval_result = message.get("content")

            # Log the model's reasoning trace when the payload carries one
            if "reasoning_content" in message:
                logger.debug(f"模型推理过程：{message['reasoning_content']}")

        # Shape 2: simplified payload with a top-level "message" (ollama style)
        elif "message" in response_data:
            message = response_data["message"]
            if "content" not in message:
                logger.error("响应格式2中缺少content字段")
                return
            eval_result = message["content"]

        else:
            logger.error("未知的响应格式结构")
            logger.debug(f"完整响应结构：{json.dumps(response_data, indent=2)}")
            return

        # Common validity check for both shapes
        if not eval_result:
            logger.error("未能提取有效评估内容")
            return

        # The model may wrap its JSON in a Markdown code fence
        json_str = _strip_code_fence(eval_result)

        # Parse the JSON verdict
        try:
            eval_data = json.loads(json_str)
        except json.JSONDecodeError as e:
            logger.error(f"JSON解析失败：{str(e)}")
            logger.error(f"原始内容：\n{json_str}")
            return

        # A valid JSON document that is not an object cannot hold the verdict sections
        if not isinstance(eval_data, dict):
            logger.error(f"Expected a JSON object but got {type(eval_data).__name__}")
            return

        # Make sure every verdict section is present
        required_fields = ["全面性", "多样性", "实用性", "总体优胜者"]
        if not all(field in eval_data for field in required_fields):
            missing = set(required_fields) - set(eval_data.keys())
            logger.error(f"响应缺少必要字段：{missing}")
            return

        # Create the target directory so a paid-for result is not lost at write time
        output_dir = os.path.dirname(output_file)
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)

        # Persist the verdict
        with open(output_file, "w", encoding="utf-8") as f:
            json.dump(eval_data, f, ensure_ascii=False, indent=4)
            f.write("\n")

        logger.info(f"评估结果已保存至：{output_file}")

    except requests.exceptions.Timeout:
        logger.error("API请求超时")
    except KeyError as e:
        logger.error(f"响应字段缺失：{str(e)}")
        if response_data is not None:
            logger.debug(f"完整响应结构：{json.dumps(response_data, indent=2)}")
    except Exception as e:
        logger.error(f"处理过程中发生未预期错误：{str(e)}")
    
    logger.info("\n")

In [4]:
def eval_1_hop(
    query_path="./FinRep/fin_queries",
    answer1_path="./FinRep/FinRAG/hybrid/1_hop",
    answer2_path="./FinRep/CRAG/1_hop",
    output_path="./FinRep/Eval/FinRAG_CRAG/1_hop",
    num_queries=150,
    question_index=1,
):
    """Judge the 1_hop answers, FinRAG hybrid as answer 1 against CRAG as answer 2.

    For every index ``i`` in ``range(num_queries)`` the files ``query{i:03d}.txt``
    and ``answer{i:03d}.txt`` are looked up; when all three exist ``evaluate`` is
    called and is given ``eval{i:03d}.json`` under ``output_path`` as its target.
    Indices with a missing file are skipped with a warning.

    Args:
        query_path: Directory holding the query files.
        answer1_path: Directory holding the answers passed as answer 1.
        answer2_path: Directory holding the answers passed as answer 2.
        output_path: Directory receiving the verdict files; created if missing.
        num_queries: Number of indices to visit, starting at 0.
        question_index: Question number inside each query file.
    """
    # Create the output directory up front; otherwise every verdict would be
    # requested from the API and then lost when the file cannot be opened.
    os.makedirs(output_path, exist_ok=True)

    for i in range(num_queries):
        query_file = os.path.join(query_path, f"query{i:03d}.txt")
        answer1_file = os.path.join(answer1_path, f"answer{i:03d}.txt")
        answer2_file = os.path.join(answer2_path, f"answer{i:03d}.txt")
        output_file = os.path.join(output_path, f"eval{i:03d}.json")

        required = (query_file, answer1_file, answer2_file)
        if all(os.path.exists(path) for path in required):
            evaluate(query_file, answer1_file, answer2_file, output_file, question_index)
        else:
            logger.warning(f"Skipping {i:03d} due to missing files.")


eval_1_hop()

2025-03-17 13:44:13,627 - INFO - Evaluating ./FinRep/fin_queries/query000.txt...
2025-03-17 13:44:33,987 - INFO - 评估结果已保存至：./FinRep/Eval/FinRAG_CRAG/1_hop/eval000.json
2025-03-17 13:44:33,987 - INFO - 

2025-03-17 13:44:33,989 - INFO - Evaluating ./FinRep/fin_queries/query001.txt...
2025-03-17 13:44:58,460 - INFO - 评估结果已保存至：./FinRep/Eval/FinRAG_CRAG/1_hop/eval001.json
2025-03-17 13:44:58,461 - INFO - 

2025-03-17 13:44:58,462 - INFO - Evaluating ./FinRep/fin_queries/query002.txt...
2025-03-17 13:45:28,582 - INFO - 评估结果已保存至：./FinRep/Eval/FinRAG_CRAG/1_hop/eval002.json
2025-03-17 13:45:28,583 - INFO - 

2025-03-17 13:45:28,584 - INFO - Evaluating ./FinRep/fin_queries/query003.txt...
2025-03-17 13:45:48,357 - INFO - 评估结果已保存至：./FinRep/Eval/FinRAG_CRAG/1_hop/eval003.json
2025-03-17 13:45:48,358 - INFO - 

2025-03-17 13:45:48,360 - INFO - Evaluating ./FinRep/fin_queries/query004.txt...
2025-03-17 13:46:05,472 - INFO - 评估结果已保存至：./FinRep/Eval/FinRAG_CRAG/1_hop/eval004.json
2025-03-17 13:46:05,

In [5]:
def eval_N_hop(
    query_path="./FinRep/fin_queries",
    answer1_path="./FinRep/FinRAG/hybrid/N_hop",
    answer2_path="./FinRep/CRAG/N_hop",
    output_path="./FinRep/Eval/FinRAG_CRAG/N_hop",
    num_queries=150,
    question_index=2,
):
    """Judge the N_hop answers, FinRAG hybrid as answer 1 against CRAG as answer 2.

    For every index ``i`` in ``range(num_queries)`` the files ``query{i:03d}.txt``
    and ``answer{i:03d}.txt`` are looked up; when all three exist ``evaluate`` is
    called and is given ``eval{i:03d}.json`` under ``output_path`` as its target.
    Indices with a missing file are skipped with a warning.

    Args:
        query_path: Directory holding the query files.
        answer1_path: Directory holding the answers passed as answer 1.
        answer2_path: Directory holding the answers passed as answer 2.
        output_path: Directory receiving the verdict files; created if missing.
        num_queries: Number of indices to visit, starting at 0.
        question_index: Question number inside each query file.
    """
    # Create the output directory up front; otherwise every verdict would be
    # requested from the API and then lost when the file cannot be opened.
    os.makedirs(output_path, exist_ok=True)

    for i in range(num_queries):
        query_file = os.path.join(query_path, f"query{i:03d}.txt")
        answer1_file = os.path.join(answer1_path, f"answer{i:03d}.txt")
        answer2_file = os.path.join(answer2_path, f"answer{i:03d}.txt")
        output_file = os.path.join(output_path, f"eval{i:03d}.json")

        required = (query_file, answer1_file, answer2_file)
        if all(os.path.exists(path) for path in required):
            evaluate(query_file, answer1_file, answer2_file, output_file, question_index)
        else:
            logger.warning(f"Skipping {i:03d} due to missing files.")


eval_N_hop()

2025-03-17 14:44:06,165 - INFO - Evaluating ./FinRep/fin_queries/query000.txt...
2025-03-17 14:44:32,105 - INFO - 评估结果已保存至：./FinRep/Eval/FinRAG_CRAG/N_hop/eval000.json
2025-03-17 14:44:32,106 - INFO - 

2025-03-17 14:44:32,107 - INFO - Evaluating ./FinRep/fin_queries/query001.txt...
2025-03-17 14:44:57,036 - INFO - 评估结果已保存至：./FinRep/Eval/FinRAG_CRAG/N_hop/eval001.json
2025-03-17 14:44:57,037 - INFO - 

2025-03-17 14:44:57,038 - INFO - Evaluating ./FinRep/fin_queries/query002.txt...
2025-03-17 14:45:22,800 - INFO - 评估结果已保存至：./FinRep/Eval/FinRAG_CRAG/N_hop/eval002.json
2025-03-17 14:45:22,801 - INFO - 

2025-03-17 14:45:22,802 - INFO - Evaluating ./FinRep/fin_queries/query003.txt...
2025-03-17 14:45:48,045 - INFO - 评估结果已保存至：./FinRep/Eval/FinRAG_CRAG/N_hop/eval003.json
2025-03-17 14:45:48,045 - INFO - 

2025-03-17 14:45:48,047 - INFO - Evaluating ./FinRep/fin_queries/query004.txt...
2025-03-17 14:46:15,084 - INFO - 评估结果已保存至：./FinRep/Eval/FinRAG_CRAG/N_hop/eval004.json
2025-03-17 14:46:15,

In [6]:
def eval_open(
    query_path="./FinRep/fin_queries",
    answer1_path="./FinRep/FinRAG/hybrid/open",
    answer2_path="./FinRep/CRAG/open",
    output_path="./FinRep/Eval/FinRAG_CRAG/open",
    num_queries=150,
    question_index=3,
):
    """Judge the open answers, FinRAG hybrid as answer 1 against CRAG as answer 2.

    For every index ``i`` in ``range(num_queries)`` the files ``query{i:03d}.txt``
    and ``answer{i:03d}.txt`` are looked up; when all three exist ``evaluate`` is
    called and is given ``eval{i:03d}.json`` under ``output_path`` as its target.
    Indices with a missing file are skipped with a warning.

    Args:
        query_path: Directory holding the query files.
        answer1_path: Directory holding the answers passed as answer 1.
        answer2_path: Directory holding the answers passed as answer 2.
        output_path: Directory receiving the verdict files; created if missing.
        num_queries: Number of indices to visit, starting at 0.
        question_index: Question number inside each query file.
    """
    # Create the output directory up front; otherwise every verdict would be
    # requested from the API and then lost when the file cannot be opened.
    os.makedirs(output_path, exist_ok=True)

    for i in range(num_queries):
        query_file = os.path.join(query_path, f"query{i:03d}.txt")
        answer1_file = os.path.join(answer1_path, f"answer{i:03d}.txt")
        answer2_file = os.path.join(answer2_path, f"answer{i:03d}.txt")
        output_file = os.path.join(output_path, f"eval{i:03d}.json")

        required = (query_file, answer1_file, answer2_file)
        if all(os.path.exists(path) for path in required):
            evaluate(query_file, answer1_file, answer2_file, output_file, question_index)
        else:
            logger.warning(f"Skipping {i:03d} due to missing files.")


eval_open()

2025-03-17 15:49:12,366 - INFO - Evaluating ./FinRep/fin_queries/query000.txt...
2025-03-17 15:49:44,736 - INFO - 评估结果已保存至：./FinRep/Eval/FinRAG_CRAG/open/eval000.json
2025-03-17 15:49:44,737 - INFO - 

2025-03-17 15:49:44,739 - INFO - Evaluating ./FinRep/fin_queries/query001.txt...
2025-03-17 15:50:09,505 - INFO - 评估结果已保存至：./FinRep/Eval/FinRAG_CRAG/open/eval001.json
2025-03-17 15:50:09,506 - INFO - 

2025-03-17 15:50:09,507 - INFO - Evaluating ./FinRep/fin_queries/query002.txt...
2025-03-17 15:50:37,999 - INFO - 评估结果已保存至：./FinRep/Eval/FinRAG_CRAG/open/eval002.json
2025-03-17 15:50:38,000 - INFO - 

2025-03-17 15:50:38,002 - INFO - Evaluating ./FinRep/fin_queries/query003.txt...
2025-03-17 15:51:11,805 - INFO - 评估结果已保存至：./FinRep/Eval/FinRAG_CRAG/open/eval003.json
2025-03-17 15:51:11,806 - INFO - 

2025-03-17 15:51:11,807 - INFO - Evaluating ./FinRep/fin_queries/query004.txt...
2025-03-17 15:51:44,490 - INFO - 评估结果已保存至：./FinRep/Eval/FinRAG_CRAG/open/eval004.json
2025-03-17 15:51:44,491 -

In [7]:
def eval_1_hop_reverse(
    query_path="./FinRep/fin_queries",
    answer1_path="./FinRep/CRAG/1_hop",
    answer2_path="./FinRep/FinRAG/hybrid/1_hop",
    output_path="./FinRep/Eval/FinRAG_CRAG/reverse/1_hop",
    num_queries=150,
    question_index=1,
):
    """Judge the 1_hop answers with CRAG as answer 1 and FinRAG hybrid as answer 2.

    The two systems are presented in the opposite order to ``eval_1_hop``
    (presumably to check the judge for position bias - confirm with the author).
    For every index ``i`` in ``range(num_queries)`` the files ``query{i:03d}.txt``
    and ``answer{i:03d}.txt`` are looked up; when all three exist ``evaluate`` is
    called and is given ``eval{i:03d}.json`` under ``output_path`` as its target.
    Indices with a missing file are skipped with a warning.

    Args:
        query_path: Directory holding the query files.
        answer1_path: Directory holding the answers passed as answer 1.
        answer2_path: Directory holding the answers passed as answer 2.
        output_path: Directory receiving the verdict files; created if missing.
        num_queries: Number of indices to visit, starting at 0.
        question_index: Question number inside each query file.
    """
    # Create the output directory up front; otherwise every verdict would be
    # requested from the API and then lost when the file cannot be opened.
    os.makedirs(output_path, exist_ok=True)

    for i in range(num_queries):
        query_file = os.path.join(query_path, f"query{i:03d}.txt")
        answer1_file = os.path.join(answer1_path, f"answer{i:03d}.txt")
        answer2_file = os.path.join(answer2_path, f"answer{i:03d}.txt")
        output_file = os.path.join(output_path, f"eval{i:03d}.json")

        required = (query_file, answer1_file, answer2_file)
        if all(os.path.exists(path) for path in required):
            evaluate(query_file, answer1_file, answer2_file, output_file, question_index)
        else:
            logger.warning(f"Skipping {i:03d} due to missing files.")


eval_1_hop_reverse()

2025-03-17 16:57:14,063 - INFO - Evaluating ./FinRep/fin_queries/query000.txt...
2025-03-17 16:57:32,912 - INFO - 评估结果已保存至：./FinRep/Eval/FinRAG_CRAG/reverse/1_hop/eval000.json
2025-03-17 16:57:32,913 - INFO - 

2025-03-17 16:57:32,915 - INFO - Evaluating ./FinRep/fin_queries/query001.txt...
2025-03-17 16:58:06,042 - INFO - 评估结果已保存至：./FinRep/Eval/FinRAG_CRAG/reverse/1_hop/eval001.json
2025-03-17 16:58:06,043 - INFO - 

2025-03-17 16:58:06,045 - INFO - Evaluating ./FinRep/fin_queries/query002.txt...
2025-03-17 16:58:34,329 - INFO - 评估结果已保存至：./FinRep/Eval/FinRAG_CRAG/reverse/1_hop/eval002.json
2025-03-17 16:58:34,330 - INFO - 

2025-03-17 16:58:34,331 - INFO - Evaluating ./FinRep/fin_queries/query003.txt...
2025-03-17 16:58:57,404 - INFO - 评估结果已保存至：./FinRep/Eval/FinRAG_CRAG/reverse/1_hop/eval003.json
2025-03-17 16:58:57,405 - INFO - 

2025-03-17 16:58:57,406 - INFO - Evaluating ./FinRep/fin_queries/query004.txt...
2025-03-17 16:59:15,269 - INFO - 评估结果已保存至：./FinRep/Eval/FinRAG_CRAG/reverse

In [8]:
def eval_N_hop_reverse(
    query_path="./FinRep/fin_queries",
    answer1_path="./FinRep/CRAG/N_hop",
    answer2_path="./FinRep/FinRAG/hybrid/N_hop",
    output_path="./FinRep/Eval/FinRAG_CRAG/reverse/N_hop",
    num_queries=150,
    question_index=2,
):
    """Judge the N_hop answers with CRAG as answer 1 and FinRAG hybrid as answer 2.

    The two systems are presented in the opposite order to ``eval_N_hop``
    (presumably to check the judge for position bias - confirm with the author).
    For every index ``i`` in ``range(num_queries)`` the files ``query{i:03d}.txt``
    and ``answer{i:03d}.txt`` are looked up; when all three exist ``evaluate`` is
    called and is given ``eval{i:03d}.json`` under ``output_path`` as its target.
    Indices with a missing file are skipped with a warning.

    Args:
        query_path: Directory holding the query files.
        answer1_path: Directory holding the answers passed as answer 1.
        answer2_path: Directory holding the answers passed as answer 2.
        output_path: Directory receiving the verdict files; created if missing.
        num_queries: Number of indices to visit, starting at 0.
        question_index: Question number inside each query file.
    """
    # Create the output directory up front; otherwise every verdict would be
    # requested from the API and then lost when the file cannot be opened.
    os.makedirs(output_path, exist_ok=True)

    for i in range(num_queries):
        query_file = os.path.join(query_path, f"query{i:03d}.txt")
        answer1_file = os.path.join(answer1_path, f"answer{i:03d}.txt")
        answer2_file = os.path.join(answer2_path, f"answer{i:03d}.txt")
        output_file = os.path.join(output_path, f"eval{i:03d}.json")

        required = (query_file, answer1_file, answer2_file)
        if all(os.path.exists(path) for path in required):
            evaluate(query_file, answer1_file, answer2_file, output_file, question_index)
        else:
            logger.warning(f"Skipping {i:03d} due to missing files.")


eval_N_hop_reverse()

2025-03-17 17:58:05,132 - INFO - Evaluating ./FinRep/fin_queries/query000.txt...
2025-03-17 17:58:28,368 - INFO - 评估结果已保存至：./FinRep/Eval/FinRAG_CRAG/reverse/N_hop/eval000.json
2025-03-17 17:58:28,369 - INFO - 

2025-03-17 17:58:28,371 - INFO - Evaluating ./FinRep/fin_queries/query001.txt...
2025-03-17 17:58:46,396 - INFO - 评估结果已保存至：./FinRep/Eval/FinRAG_CRAG/reverse/N_hop/eval001.json
2025-03-17 17:58:46,396 - INFO - 

2025-03-17 17:58:46,398 - INFO - Evaluating ./FinRep/fin_queries/query002.txt...
2025-03-17 17:59:07,168 - INFO - 评估结果已保存至：./FinRep/Eval/FinRAG_CRAG/reverse/N_hop/eval002.json
2025-03-17 17:59:07,168 - INFO - 

2025-03-17 17:59:07,171 - INFO - Evaluating ./FinRep/fin_queries/query003.txt...
2025-03-17 17:59:20,858 - INFO - 评估结果已保存至：./FinRep/Eval/FinRAG_CRAG/reverse/N_hop/eval003.json
2025-03-17 17:59:20,858 - INFO - 

2025-03-17 17:59:20,860 - INFO - Evaluating ./FinRep/fin_queries/query004.txt...
2025-03-17 17:59:32,522 - INFO - 评估结果已保存至：./FinRep/Eval/FinRAG_CRAG/reverse

In [9]:
def eval_open_reverse(
    query_path="./FinRep/fin_queries",
    answer1_path="./FinRep/CRAG/open",
    answer2_path="./FinRep/FinRAG/hybrid/open",
    output_path="./FinRep/Eval/FinRAG_CRAG/reverse/open",
    num_queries=150,
    question_index=3,
):
    """Judge the open answers with CRAG as answer 1 and FinRAG hybrid as answer 2.

    The two systems are presented in the opposite order to ``eval_open``
    (presumably to check the judge for position bias - confirm with the author).
    For every index ``i`` in ``range(num_queries)`` the files ``query{i:03d}.txt``
    and ``answer{i:03d}.txt`` are looked up; when all three exist ``evaluate`` is
    called and is given ``eval{i:03d}.json`` under ``output_path`` as its target.
    Indices with a missing file are skipped with a warning.

    Args:
        query_path: Directory holding the query files.
        answer1_path: Directory holding the answers passed as answer 1.
        answer2_path: Directory holding the answers passed as answer 2.
        output_path: Directory receiving the verdict files; created if missing.
        num_queries: Number of indices to visit, starting at 0.
        question_index: Question number inside each query file.
    """
    # Create the output directory up front; otherwise every verdict would be
    # requested from the API and then lost when the file cannot be opened.
    os.makedirs(output_path, exist_ok=True)

    for i in range(num_queries):
        query_file = os.path.join(query_path, f"query{i:03d}.txt")
        answer1_file = os.path.join(answer1_path, f"answer{i:03d}.txt")
        answer2_file = os.path.join(answer2_path, f"answer{i:03d}.txt")
        output_file = os.path.join(output_path, f"eval{i:03d}.json")

        required = (query_file, answer1_file, answer2_file)
        if all(os.path.exists(path) for path in required):
            evaluate(query_file, answer1_file, answer2_file, output_file, question_index)
        else:
            logger.warning(f"Skipping {i:03d} due to missing files.")


eval_open_reverse()

2025-03-17 18:43:13,477 - INFO - Evaluating ./FinRep/fin_queries/query000.txt...
2025-03-17 18:43:37,968 - INFO - 评估结果已保存至：./FinRep/Eval/FinRAG_CRAG/reverse/open/eval000.json
2025-03-17 18:43:37,969 - INFO - 

2025-03-17 18:43:37,970 - INFO - Evaluating ./FinRep/fin_queries/query001.txt...
2025-03-17 18:43:53,214 - INFO - 评估结果已保存至：./FinRep/Eval/FinRAG_CRAG/reverse/open/eval001.json
2025-03-17 18:43:53,215 - INFO - 

2025-03-17 18:43:53,216 - INFO - Evaluating ./FinRep/fin_queries/query002.txt...
2025-03-17 18:44:08,761 - INFO - 评估结果已保存至：./FinRep/Eval/FinRAG_CRAG/reverse/open/eval002.json
2025-03-17 18:44:08,762 - INFO - 

2025-03-17 18:44:08,763 - INFO - Evaluating ./FinRep/fin_queries/query003.txt...
2025-03-17 18:44:30,939 - INFO - 评估结果已保存至：./FinRep/Eval/FinRAG_CRAG/reverse/open/eval003.json
2025-03-17 18:44:30,939 - INFO - 

2025-03-17 18:44:30,941 - INFO - Evaluating ./FinRep/fin_queries/query004.txt...
2025-03-17 18:44:45,045 - INFO - 评估结果已保存至：./FinRep/Eval/FinRAG_CRAG/reverse/ope