In [None]:
import json
import logging
import os
import re

import requests

# Configure the root logger so every message goes both to the notebook output
# and to a log file on disk.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.StreamHandler(),  # echo to the console / cell output
        logging.FileHandler("evaluation_log.txt", mode='w', encoding='utf-8')  # mode='w' truncates the file on every run
    ]
)
logger = logging.getLogger()
# base_url = "https://api.siliconflow.cn/v1/chat/completions"
base_url = "https://dashscope.aliyuncs.com/compatible-mode/v1/chat/completions"

# Take the API key from the environment instead of embedding it in the
# notebook. When the variable is unset the header stays empty, exactly as
# before, so HEADERS["Authorization"] can still be assigned by hand.
API_KEY = os.environ.get("DASHSCOPE_API_KEY", "").strip()
HEADERS = {
    "Content-Type": "application/json",
    "Authorization": f"Bearer {API_KEY}" if API_KEY else ""
}
MODEL = "qwen-max-2025-01-25"

In [6]:
def read_query(query_file, x):
    """Return the text of question number ``x`` from a query file.

    The file is scanned line by line for the marker ``问题{x}``; the first
    matching line is returned with the ``问题{x}：`` prefix removed and
    surrounding whitespace stripped.

    Args:
        query_file: Path to a UTF-8 text file containing numbered questions.
        x: Question number (anything that formats as the number, e.g. ``1``).

    Returns:
        The question text, or ``None`` when no line carries the marker.
    """
    with open(query_file, "r", encoding="utf-8") as f:
        content = f.read()

    # The negative lookahead stops "问题1" from matching inside "问题10",
    # "问题11", ... which a plain substring test would do.
    marker = re.compile(rf"问题{re.escape(str(x))}(?!\d)")
    for line in content.splitlines():
        if marker.search(line):
            return line.replace(f"问题{x}：", "").strip()

    return None

def read_answer(answer_file):
    """Extract the first answer section from an answer file.

    Collection starts at the first line beginning with ``答案：`` (the marker
    itself is dropped) and continues until a line beginning with ``问题：`` is
    met or the file ends. ``问题：`` lines seen before any answer are skipped.

    Args:
        answer_file: Path to a UTF-8 text file.

    Returns:
        The collected lines joined with newlines and stripped, or ``None``
        when nothing (or only whitespace) was collected.
    """
    question_tag = "问题："
    answer_tag = "答案："

    with open(answer_file, "r", encoding="utf-8") as handle:
        raw_lines = list(handle)

    collected = []
    collecting = False
    for raw in raw_lines:
        # Drop only the line terminator; in-line spacing is kept as written.
        text = raw.rstrip('\n')

        if text.startswith(question_tag):
            if collecting:
                # A new question block closes the answer being collected.
                break
            continue

        if text.startswith(answer_tag):
            collecting = True
            # Keep whatever follows the leading marker on the same line.
            collected.append(text[len(answer_tag):])
        elif collecting:
            collected.append(text)

    # Trim blank lines / whitespace at both ends of the assembled answer.
    body = '\n'.join(collected).strip()
    return body or None

def read_raw_answer(answer_file, x):
    """Return the reference answer number ``x`` from a file.

    The file is scanned line by line for the marker ``答案{x}``; the first
    matching line is returned with the ``答案{x}：`` prefix removed and
    surrounding whitespace stripped.

    Args:
        answer_file: Path to a UTF-8 text file containing numbered answers.
        x: Answer number (anything that formats as the number, e.g. ``1``).

    Returns:
        The answer text, or ``None`` when no line carries the marker.
    """
    with open(answer_file, "r", encoding="utf-8") as f:
        content = f.read()

    # The negative lookahead stops "答案1" from matching inside "答案10",
    # "答案11", ... which a plain substring test would do.
    marker = re.compile(rf"答案{re.escape(str(x))}(?!\d)")
    for line in content.splitlines():
        if marker.search(line):
            return line.replace(f"答案{x}：", "").strip()

    return None

In [7]:
def _extract_json_text(raw_text):
    """Isolate the JSON object embedded in a model reply.

    Replies may wrap the requested JSON in a Markdown code fence or add text
    before/after it. Slicing from the first "{" to the last "}" covers those
    cases; when no braces are found the stripped text is returned unchanged so
    that the caller's JSON parser reports the problem.
    """
    text = raw_text.strip()
    start = text.find("{")
    end = text.rfind("}")
    if start != -1 and end > start:
        return text[start:end + 1]
    return text


def evaluate(query_file, answer1_file, answer2_file, output_file, x, timeout=120):
    """Ask the judge model to compare two answers and save its verdict as JSON.

    Question ``x`` and its reference answer are read from ``query_file``; the
    two candidate answers are read from ``answer1_file`` and ``answer2_file``.
    A prompt is sent to ``base_url`` with ``MODEL``/``HEADERS``, the reply is
    parsed as JSON, checked for the four required keys and written to
    ``output_file``. All failures after the inputs are read are logged, not
    raised.

    Args:
        query_file: Path to the file holding ``问题{x}`` / ``答案{x}`` lines.
        answer1_file: Path to the file holding the first candidate answer.
        answer2_file: Path to the file holding the second candidate answer.
        output_file: Destination path for the JSON verdict; its parent
            directory is created when missing.
        x: Question number to look up in ``query_file``.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        None. The verdict is written to ``output_file`` on success.
    """
    query = read_query(query_file, x)
    raw_answer = read_raw_answer(query_file, x)
    answer1 = read_answer(answer1_file)
    answer2 = read_answer(answer2_file)
    logger.info(f"Evaluating {query_file}...")
    logger.debug(f"Query: {query}")

    # Without the question or either candidate the prompt would contain the
    # literal text "None" and the verdict would be meaningless, so do not
    # spend an API call on it.
    missing_inputs = [
        label
        for label, value in (("query", query), ("answer1", answer1), ("answer2", answer2))
        if not value
    ]
    if missing_inputs:
        logger.error(f"Skipping {query_file}: could not extract {', '.join(missing_inputs)}")
        return
    if not raw_answer:
        logger.warning(f"No reference answer {x} found in {query_file}; evaluating without it")

    sys_prompt = """
    ---Role---
    你是一位经验丰富的金融领域专家，精通宏观经济、行业分析、公司财务分析和投资策略。你已经阅读了大量金融研报，能够基于数据驱动的分析方法，提供深入的市场解读和投资建议。你的回答严谨、逻辑清晰，并符合专业金融研究的标准。现在，你负责根据以下三个标准评估针对同一问题的两个答案：**全面性**、**多样性**和**实用性**。
    """
    
    prompt = f"""
    你将根据以下三个标准评估针对同一问题的两个答案：**全面性**、**多样性**和**实用性**。
    
    - **全面性**：答案是否全面覆盖金融问题的核心要素？需包括市场动态、政策解读、关键数据指标、风险因素，并提供数据支持或案例佐证。
    - **多样性**：答案是否呈现多元视角？需包含宏观经济趋势、行业细分差异、投资策略、监管政策，并整合多方观点。
    - **实用性**：答案是否具备决策支持价值？需提供可操作建议、风险评估模型、历史规律总结，或明确政策影响传导路径。

    评估要求：
    请基于问题的原始正确答案，严格对比答案1和答案2，针对每个标准选择更优者，并详细解释原因。
    请注意，必须在答案1或答案2中选择一个更优的，不可判定为平局。
    最后，三个标准的综合考量下，选出一个总体优胜者，并提供总结性的评估理由。  

    以下是问题：
    {query}

    以下是问题的原始正确答案：
    {raw_answer}
    
    以下是两个答案：
    
    **答案1：**
    {answer1}
    
    **答案2：**
    {answer2}
    
    请根据上述三个标准评估这两个答案，并为每个标准提供详细的解释。
    
    请以以下JSON格式输出您的评估结果,输出内容只需要包含JSON格式的评估结果，不要包含其他内容：
    
    {{
        "全面性": {{
            "Winner": "答案1或答案2",
            "Explanation": "请在此提供解释"
        }},
        "多样性": {{
            "Winner": "答案1或答案2",
            "Explanation": "请在此提供解释"
        }},
        "实用性": {{
            "Winner": "答案1或答案2",
            "Explanation": "请在此提供解释"
        }},
        "总体优胜者": {{
            "Winner": "答案1或答案2",
            "Explanation": "综合以上三个标准总结为何该答案胜出"
        }}
    }}
    """
    
    # Bound before the try so the KeyError handler can always log it.
    response_data = None
    try:
        request_body = {
            "model": MODEL,
            "messages": [
                {"role": "system", "content": sys_prompt.strip()},
                {"role": "user", "content": prompt.strip()}
            ],
            "stream": False,
        }
        # An explicit timeout is required: requests waits forever by default,
        # which would stall the whole batch on a single stuck connection.
        response = requests.post(
            base_url,
            headers=HEADERS,
            json=request_body,
            timeout=timeout,
        )

        # Anything but HTTP 200 is treated as a failed request.
        if response.status_code != 200:
            logger.error(f"API请求失败，状态码：{response.status_code}，错误信息：{response.text}")
            return

        # Decode the response body.
        response_data = response.json()
        logger.debug(f"原始响应数据：{json.dumps(response_data, indent=2)}")

        eval_result = None

        # Shape 1: OpenAI-compatible ("choices" list).
        if "choices" in response_data:
            if len(response_data["choices"]) == 0:
                logger.error("响应格式1中缺少choices数组内容")
                return

            choice = response_data["choices"][0]
            message = choice.get("message", {})
            eval_result = message.get("content")

            # Log the model's reasoning trace when the API returns one.
            if "reasoning_content" in message:
                logger.debug(f"模型推理过程：{message['reasoning_content']}")

        # Shape 2: simplified Ollama-style (top-level "message").
        elif "message" in response_data:
            message = response_data["message"]
            if "content" not in message:
                logger.error("响应格式2中缺少content字段")
                return
            eval_result = message["content"]

        else:
            logger.error("未知的响应格式结构")
            logger.debug(f"完整响应结构：{json.dumps(response_data, indent=2)}")
            return

        # Common validity check for both shapes.
        if not eval_result:
            logger.error("未能提取有效评估内容")
            return

        # Cut the JSON object out of the reply (code fences, stray prose).
        json_str = _extract_json_text(eval_result)

        # Parse the verdict.
        try:
            eval_data = json.loads(json_str)
        except json.JSONDecodeError as e:
            logger.error(f"JSON解析失败：{str(e)}")
            logger.error(f"原始内容：\n{json_str}")
            return

        # The verdict must be an object carrying all four sections.
        required_fields = ["全面性", "多样性", "实用性", "总体优胜者"]
        if not isinstance(eval_data, dict):
            logger.error(f"响应缺少必要字段：{set(required_fields)}")
            return
        if not all(field in eval_data for field in required_fields):
            missing = set(required_fields) - set(eval_data.keys())
            logger.error(f"响应缺少必要字段：{missing}")
            return

        # Create the destination directory so a paid-for verdict is not lost
        # to a missing folder.
        output_dir = os.path.dirname(output_file)
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)

        # Persist the verdict.
        with open(output_file, "w", encoding="utf-8") as f:
            json.dump(eval_data, f, ensure_ascii=False, indent=4)
            f.write("\n")

        logger.info(f"评估结果已保存至：{output_file}")

    except requests.exceptions.Timeout:
        logger.error("API请求超时")
    except KeyError as e:
        logger.error(f"响应字段缺失：{str(e)}")
        logger.debug(f"完整响应结构：{json.dumps(response_data, indent=2)}")
    except Exception as e:
        logger.error(f"处理过程中发生未预期错误：{str(e)}")
    
    logger.info("\n")

In [8]:
def eval_1_hop(start=99, stop=150):
    """Evaluate the 1-hop answers, hybrid as answer 1 and naive as answer 2.

    For each index ``i`` in ``range(start, stop)`` the files ``query{i:03d}.txt``
    and both ``answer{i:03d}.txt`` are located; when all three exist,
    ``evaluate`` is called with question number 1 and the verdict is written
    to ``eval{i:03d}.json`` in the output directory. Missing files are logged
    and skipped.

    Args:
        start: First index to process. NOTE(review): the default of 99 is the
            value that was hard-coded here and looks like a resume offset;
            pass ``start=0`` for a full run.
        stop: One past the last index to process.
    """
    query_path = "./FinRep/fin_queries"
    answer1_path = "./FinRep/FinRAG/hybrid/1_hop"
    answer2_path = "./FinRep/FinRAG/naive/1_hop"
    output_path = "./FinRep/Eval/FinRAG_NaiveRAG/1_hop"

    # The verdict files are opened for writing further down the call chain;
    # make sure the directory is there before any API call is made.
    os.makedirs(output_path, exist_ok=True)

    for i in range(start, stop):
        query_file = os.path.join(query_path, f"query{i:03d}.txt")
        answer1_file = os.path.join(answer1_path, f"answer{i:03d}.txt")
        answer2_file = os.path.join(answer2_path, f"answer{i:03d}.txt")
        output_file = os.path.join(output_path, f"eval{i:03d}.json")
        
        if os.path.exists(query_file) and os.path.exists(answer1_file) and os.path.exists(answer2_file):
            evaluate(query_file, answer1_file, answer2_file, output_file, 1)
        else:
            logger.warning(f"Skipping {i:03d} due to missing files.")


# Run the 1-hop comparison defined above; progress is reported through the logger.
eval_1_hop()

2025-03-19 18:17:50,527 - INFO - Evaluating ./FinRep/fin_queries/query099.txt...
2025-03-19 18:18:08,119 - INFO - 评估结果已保存至：./FinRep/Eval/FinRAG_NaiveRAG/1_hop/eval099.json
2025-03-19 18:18:08,120 - INFO - 

2025-03-19 18:18:08,121 - INFO - Evaluating ./FinRep/fin_queries/query100.txt...
2025-03-19 18:18:23,651 - INFO - 评估结果已保存至：./FinRep/Eval/FinRAG_NaiveRAG/1_hop/eval100.json
2025-03-19 18:18:23,651 - INFO - 

2025-03-19 18:18:23,652 - INFO - Evaluating ./FinRep/fin_queries/query101.txt...
2025-03-19 18:18:37,595 - INFO - 评估结果已保存至：./FinRep/Eval/FinRAG_NaiveRAG/1_hop/eval101.json
2025-03-19 18:18:37,596 - INFO - 

2025-03-19 18:18:37,597 - INFO - Evaluating ./FinRep/fin_queries/query102.txt...
2025-03-19 18:18:55,991 - INFO - 评估结果已保存至：./FinRep/Eval/FinRAG_NaiveRAG/1_hop/eval102.json
2025-03-19 18:18:55,991 - INFO - 

2025-03-19 18:18:55,992 - INFO - Evaluating ./FinRep/fin_queries/query103.txt...
2025-03-19 18:19:12,683 - INFO - 评估结果已保存至：./FinRep/Eval/FinRAG_NaiveRAG/1_hop/eval103.json


In [9]:
def eval_1_hop_reverse(start=0, stop=150):
    """Evaluate the 1-hop answers with the order swapped: naive as answer 1, hybrid as answer 2.

    For each index ``i`` in ``range(start, stop)`` the files ``query{i:03d}.txt``
    and both ``answer{i:03d}.txt`` are located; when all three exist,
    ``evaluate`` is called with question number 1 and the verdict is written
    to ``eval{i:03d}.json`` in the output directory. Missing files are logged
    and skipped.

    Args:
        start: First index to process.
        stop: One past the last index to process.
    """
    query_path = "./FinRep/fin_queries"
    answer2_path = "./FinRep/FinRAG/hybrid/1_hop"
    answer1_path = "./FinRep/FinRAG/naive/1_hop"
    output_path = "./FinRep/Eval/FinRAG_NaiveRAG/reverse/1_hop"

    # The verdict files are opened for writing further down the call chain;
    # make sure the directory is there before any API call is made.
    os.makedirs(output_path, exist_ok=True)

    for i in range(start, stop):
        query_file = os.path.join(query_path, f"query{i:03d}.txt")
        answer1_file = os.path.join(answer1_path, f"answer{i:03d}.txt")
        answer2_file = os.path.join(answer2_path, f"answer{i:03d}.txt")
        output_file = os.path.join(output_path, f"eval{i:03d}.json")
         
        if os.path.exists(query_file) and os.path.exists(answer1_file) and os.path.exists(answer2_file):
            evaluate(query_file, answer1_file, answer2_file, output_file, 1)
        else:
            logger.warning(f"Skipping {i:03d} due to missing files.")


# Run the order-swapped 1-hop comparison defined above; progress is reported through the logger.
eval_1_hop_reverse()

2025-03-19 18:30:57,943 - INFO - Evaluating ./FinRep/fin_queries/query000.txt...
2025-03-19 18:31:09,547 - INFO - 评估结果已保存至：./FinRep/Eval/FinRAG_NaiveRAG/reverse/1_hop/eval000.json
2025-03-19 18:31:09,548 - INFO - 

2025-03-19 18:31:09,551 - INFO - Evaluating ./FinRep/fin_queries/query001.txt...
2025-03-19 18:31:26,480 - INFO - 评估结果已保存至：./FinRep/Eval/FinRAG_NaiveRAG/reverse/1_hop/eval001.json
2025-03-19 18:31:26,481 - INFO - 

2025-03-19 18:31:26,483 - INFO - Evaluating ./FinRep/fin_queries/query002.txt...
2025-03-19 18:31:42,384 - INFO - 评估结果已保存至：./FinRep/Eval/FinRAG_NaiveRAG/reverse/1_hop/eval002.json
2025-03-19 18:31:42,385 - INFO - 

2025-03-19 18:31:42,387 - INFO - Evaluating ./FinRep/fin_queries/query003.txt...
2025-03-19 18:31:55,205 - INFO - 评估结果已保存至：./FinRep/Eval/FinRAG_NaiveRAG/reverse/1_hop/eval003.json
2025-03-19 18:31:55,206 - INFO - 

2025-03-19 18:31:55,207 - INFO - Evaluating ./FinRep/fin_queries/query004.txt...
2025-03-19 18:32:10,513 - INFO - 评估结果已保存至：./FinRep/Eval/Fin

In [10]:
def eval_N_hop(start=0, stop=150):
    """Evaluate the N-hop answers, hybrid as answer 1 and naive as answer 2.

    For each index ``i`` in ``range(start, stop)`` the files ``query{i:03d}.txt``
    and both ``answer{i:03d}.txt`` are located; when all three exist,
    ``evaluate`` is called with question number 2 and the verdict is written
    to ``eval{i:03d}.json`` in the output directory. Missing files are logged
    and skipped.

    Args:
        start: First index to process.
        stop: One past the last index to process.
    """
    query_path = "./FinRep/fin_queries"
    answer1_path = "./FinRep/FinRAG/hybrid/N_hop"
    answer2_path = "./FinRep/FinRAG/naive/N_hop"
    output_path = "./FinRep/Eval/FinRAG_NaiveRAG/N_hop"

    # The verdict files are opened for writing further down the call chain;
    # make sure the directory is there before any API call is made.
    os.makedirs(output_path, exist_ok=True)

    for i in range(start, stop):
        query_file = os.path.join(query_path, f"query{i:03d}.txt")
        answer1_file = os.path.join(answer1_path, f"answer{i:03d}.txt")
        answer2_file = os.path.join(answer2_path, f"answer{i:03d}.txt")
        output_file = os.path.join(output_path, f"eval{i:03d}.json")
        
        if os.path.exists(query_file) and os.path.exists(answer1_file) and os.path.exists(answer2_file):
            evaluate(query_file, answer1_file, answer2_file, output_file, 2)
        else:
            logger.warning(f"Skipping {i:03d} due to missing files.")


# Run the N-hop comparison defined above; progress is reported through the logger.
eval_N_hop()

2025-03-19 19:12:43,341 - INFO - Evaluating ./FinRep/fin_queries/query000.txt...
2025-03-19 19:13:05,580 - INFO - 评估结果已保存至：./FinRep/Eval/FinRAG_NaiveRAG/N_hop/eval000.json
2025-03-19 19:13:05,583 - INFO - 

2025-03-19 19:13:05,585 - INFO - Evaluating ./FinRep/fin_queries/query001.txt...
2025-03-19 19:13:24,307 - INFO - 评估结果已保存至：./FinRep/Eval/FinRAG_NaiveRAG/N_hop/eval001.json
2025-03-19 19:13:24,310 - INFO - 

2025-03-19 19:13:24,317 - INFO - Evaluating ./FinRep/fin_queries/query002.txt...
2025-03-19 19:13:43,760 - INFO - 评估结果已保存至：./FinRep/Eval/FinRAG_NaiveRAG/N_hop/eval002.json
2025-03-19 19:13:43,763 - INFO - 

2025-03-19 19:13:43,765 - INFO - Evaluating ./FinRep/fin_queries/query003.txt...
2025-03-19 19:13:58,420 - INFO - 评估结果已保存至：./FinRep/Eval/FinRAG_NaiveRAG/N_hop/eval003.json
2025-03-19 19:13:58,424 - INFO - 

2025-03-19 19:13:58,429 - INFO - Evaluating ./FinRep/fin_queries/query005.txt...
2025-03-19 19:14:16,265 - INFO - 评估结果已保存至：./FinRep/Eval/FinRAG_NaiveRAG/N_hop/eval005.json


In [11]:
def eval_N_hop_reverse(start=0, stop=150):
    """Evaluate the N-hop answers with the order swapped: naive as answer 1, hybrid as answer 2.

    For each index ``i`` in ``range(start, stop)`` the files ``query{i:03d}.txt``
    and both ``answer{i:03d}.txt`` are located; when all three exist,
    ``evaluate`` is called with question number 2 and the verdict is written
    to ``eval{i:03d}.json`` in the output directory. Missing files are logged
    and skipped.

    Args:
        start: First index to process.
        stop: One past the last index to process.
    """
    query_path = "./FinRep/fin_queries"
    answer2_path = "./FinRep/FinRAG/hybrid/N_hop"
    answer1_path = "./FinRep/FinRAG/naive/N_hop"
    output_path = "./FinRep/Eval/FinRAG_NaiveRAG/reverse/N_hop"

    # The verdict files are opened for writing further down the call chain;
    # make sure the directory is there before any API call is made.
    os.makedirs(output_path, exist_ok=True)

    for i in range(start, stop):
        query_file = os.path.join(query_path, f"query{i:03d}.txt")
        answer1_file = os.path.join(answer1_path, f"answer{i:03d}.txt")
        answer2_file = os.path.join(answer2_path, f"answer{i:03d}.txt")
        output_file = os.path.join(output_path, f"eval{i:03d}.json")
        
        if os.path.exists(query_file) and os.path.exists(answer1_file) and os.path.exists(answer2_file):
            evaluate(query_file, answer1_file, answer2_file, output_file, 2)
        else:
            logger.warning(f"Skipping {i:03d} due to missing files.")


# Run the order-swapped N-hop comparison defined above; progress is reported through the logger.
eval_N_hop_reverse()

2025-03-19 19:56:30,963 - INFO - Evaluating ./FinRep/fin_queries/query000.txt...
2025-03-19 19:56:49,364 - INFO - 评估结果已保存至：./FinRep/Eval/FinRAG_NaiveRAG/reverse/N_hop/eval000.json
2025-03-19 19:56:49,366 - INFO - 

2025-03-19 19:56:49,368 - INFO - Evaluating ./FinRep/fin_queries/query001.txt...
2025-03-19 19:57:06,053 - INFO - 评估结果已保存至：./FinRep/Eval/FinRAG_NaiveRAG/reverse/N_hop/eval001.json
2025-03-19 19:57:06,055 - INFO - 

2025-03-19 19:57:06,060 - INFO - Evaluating ./FinRep/fin_queries/query002.txt...
2025-03-19 19:57:28,677 - INFO - 评估结果已保存至：./FinRep/Eval/FinRAG_NaiveRAG/reverse/N_hop/eval002.json
2025-03-19 19:57:28,679 - INFO - 

2025-03-19 19:57:28,685 - INFO - Evaluating ./FinRep/fin_queries/query003.txt...
2025-03-19 19:57:44,510 - INFO - 评估结果已保存至：./FinRep/Eval/FinRAG_NaiveRAG/reverse/N_hop/eval003.json
2025-03-19 19:57:44,512 - INFO - 

2025-03-19 19:57:44,516 - INFO - Evaluating ./FinRep/fin_queries/query005.txt...
2025-03-19 19:58:00,216 - INFO - 评估结果已保存至：./FinRep/Eval/Fin

In [12]:
def eval_open(start=0, stop=150):
    """Evaluate the open-question answers, hybrid as answer 1 and naive as answer 2.

    For each index ``i`` in ``range(start, stop)`` the files ``query{i:03d}.txt``
    and both ``answer{i:03d}.txt`` are located; when all three exist,
    ``evaluate`` is called with question number 3 and the verdict is written
    to ``eval{i:03d}.json`` in the output directory. Missing files are logged
    and skipped.

    Args:
        start: First index to process.
        stop: One past the last index to process.
    """
    query_path = "./FinRep/fin_queries"
    answer1_path = "./FinRep/FinRAG/hybrid/open"
    answer2_path = "./FinRep/FinRAG/naive/open"
    output_path = "./FinRep/Eval/FinRAG_NaiveRAG/open"

    # The verdict files are opened for writing further down the call chain;
    # make sure the directory is there before any API call is made.
    os.makedirs(output_path, exist_ok=True)

    for i in range(start, stop):
        query_file = os.path.join(query_path, f"query{i:03d}.txt")
        answer1_file = os.path.join(answer1_path, f"answer{i:03d}.txt")
        answer2_file = os.path.join(answer2_path, f"answer{i:03d}.txt")
        output_file = os.path.join(output_path, f"eval{i:03d}.json")
        
        if os.path.exists(query_file) and os.path.exists(answer1_file) and os.path.exists(answer2_file):
            evaluate(query_file, answer1_file, answer2_file, output_file, 3)
        else:
            logger.warning(f"Skipping {i:03d} due to missing files.")


# Run the open-question comparison defined above; progress is reported through the logger.
eval_open()

2025-03-19 20:41:10,382 - INFO - Evaluating ./FinRep/fin_queries/query000.txt...
2025-03-19 20:41:29,531 - INFO - 评估结果已保存至：./FinRep/Eval/FinRAG_NaiveRAG/open/eval000.json
2025-03-19 20:41:29,531 - INFO - 

2025-03-19 20:41:29,533 - INFO - Evaluating ./FinRep/fin_queries/query001.txt...
2025-03-19 20:41:45,579 - INFO - 评估结果已保存至：./FinRep/Eval/FinRAG_NaiveRAG/open/eval001.json
2025-03-19 20:41:45,580 - INFO - 

2025-03-19 20:41:45,583 - INFO - Evaluating ./FinRep/fin_queries/query002.txt...
2025-03-19 20:42:05,333 - INFO - 评估结果已保存至：./FinRep/Eval/FinRAG_NaiveRAG/open/eval002.json
2025-03-19 20:42:05,334 - INFO - 

2025-03-19 20:42:05,336 - INFO - Evaluating ./FinRep/fin_queries/query003.txt...
2025-03-19 20:42:17,998 - INFO - 评估结果已保存至：./FinRep/Eval/FinRAG_NaiveRAG/open/eval003.json
2025-03-19 20:42:17,999 - INFO - 

2025-03-19 20:42:18,001 - INFO - Evaluating ./FinRep/fin_queries/query004.txt...
2025-03-19 20:42:36,196 - INFO - 评估结果已保存至：./FinRep/Eval/FinRAG_NaiveRAG/open/eval004.json
2025-

In [13]:
def eval_open_reverse(start=0, stop=150):
    """Evaluate the open-question answers with the order swapped: naive as answer 1, hybrid as answer 2.

    For each index ``i`` in ``range(start, stop)`` the files ``query{i:03d}.txt``
    and both ``answer{i:03d}.txt`` are located; when all three exist,
    ``evaluate`` is called with question number 3 and the verdict is written
    to ``eval{i:03d}.json`` in the output directory. Missing files are logged
    and skipped.

    Args:
        start: First index to process.
        stop: One past the last index to process.
    """
    query_path = "./FinRep/fin_queries"
    answer2_path = "./FinRep/FinRAG/hybrid/open"
    answer1_path = "./FinRep/FinRAG/naive/open"
    output_path = "./FinRep/Eval/FinRAG_NaiveRAG/reverse/open"

    # The verdict files are opened for writing further down the call chain;
    # make sure the directory is there before any API call is made.
    os.makedirs(output_path, exist_ok=True)

    for i in range(start, stop):
        query_file = os.path.join(query_path, f"query{i:03d}.txt")
        answer1_file = os.path.join(answer1_path, f"answer{i:03d}.txt")
        answer2_file = os.path.join(answer2_path, f"answer{i:03d}.txt")
        output_file = os.path.join(output_path, f"eval{i:03d}.json")
        
        if os.path.exists(query_file) and os.path.exists(answer1_file) and os.path.exists(answer2_file):
            evaluate(query_file, answer1_file, answer2_file, output_file, 3)
        else:
            logger.warning(f"Skipping {i:03d} due to missing files.")


# Run the order-swapped open-question comparison defined above; progress is reported through the logger.
eval_open_reverse()

2025-03-19 21:25:23,102 - INFO - Evaluating ./FinRep/fin_queries/query000.txt...
2025-03-19 21:25:40,390 - INFO - 评估结果已保存至：./FinRep/Eval/FinRAG_NaiveRAG/reverse/open/eval000.json
2025-03-19 21:25:40,391 - INFO - 

2025-03-19 21:25:40,392 - INFO - Evaluating ./FinRep/fin_queries/query001.txt...
2025-03-19 21:25:55,186 - INFO - 评估结果已保存至：./FinRep/Eval/FinRAG_NaiveRAG/reverse/open/eval001.json
2025-03-19 21:25:55,187 - INFO - 

2025-03-19 21:25:55,188 - INFO - Evaluating ./FinRep/fin_queries/query002.txt...
2025-03-19 21:26:10,343 - INFO - 评估结果已保存至：./FinRep/Eval/FinRAG_NaiveRAG/reverse/open/eval002.json
2025-03-19 21:26:10,344 - INFO - 

2025-03-19 21:26:10,346 - INFO - Evaluating ./FinRep/fin_queries/query003.txt...
2025-03-19 21:26:24,445 - INFO - 评估结果已保存至：./FinRep/Eval/FinRAG_NaiveRAG/reverse/open/eval003.json
2025-03-19 21:26:24,446 - INFO - 

2025-03-19 21:26:24,447 - INFO - Evaluating ./FinRep/fin_queries/query004.txt...
2025-03-19 21:26:39,949 - INFO - 评估结果已保存至：./FinRep/Eval/FinRAG_

KeyboardInterrupt: 