In [None]:
import openai
import json
import pandas as pd
from tqdm import tqdm
import re
import time
from datetime import datetime
import os
from dotenv import load_dotenv
from string import Template

In [None]:
# Load variables from a local .env file (if one exists) into the process environment.
load_dotenv()

# Credentials and endpoint are taken from the environment; a missing variable
# raises KeyError right here rather than failing later during a request.
openai.api_key = os.environ["OPENAI_API_KEY"]
openai.api_base = os.environ["OPENAI_API_BASE"]
# Module-level client settings used by the openai.ChatCompletion calls below.
openai.api_type = "openai"
openai.api_version = None

In [None]:
def _story_with_ac(story, ac):
    """Return the story followed by its acceptance criteria, if there are any.

    An empty spreadsheet cell is read by pandas as NaN; interpolating it would
    put the literal text "nan" into the prompt, so a missing `ac` (None/NaN)
    yields the story alone.
    """
    if ac is None or (not isinstance(ac, str) and pd.isna(ac)):
        return f"{story}"
    return f"{story}\n\n{ac}"


# Maps a criterion name to the function that builds the user message from the
# story text and its acceptance criteria (AC). The criteria listed explicitly
# are evaluated on the story alone.
criteria_config = {
    "well-formed": lambda story, ac: story,
    "atomic": lambda story, ac: story,
    "minimal": lambda story, ac: story,
    "problem-oriented": lambda story, ac: story,
    "internal-consistency": lambda story, ac: story,
    # Default case: story + AC
    "_default": _story_with_ac
}


def build_user_content(criteria: str, story: str, ac: str) -> str:
    """Build the user message sent to the model for one story.

    Args:
        criteria: Name of the quality criterion; names without an entry in
            ``criteria_config`` use the ``"_default"`` builder.
        story: The user story text.
        ac: The acceptance criteria text; may be None/NaN when the cell is empty.

    Returns:
        The story alone for story-only criteria or when `ac` is missing,
        otherwise the story and `ac` separated by a blank line.
    """
    func = criteria_config.get(criteria, criteria_config["_default"])
    return func(story, ac)

# Model configuration: "name" is the short label used in output column names
# and the output file name; "model" is the identifier sent to the API.
models = [
    #{"name": "deepseek", "model": "deepseek-v3"},
    #{"name": "qwen", "model": "qwen2.5-72b-instruct"}
    {"name":"qwen-plus", "model": "qwen-plus"}
]

# Criterion names noted by the author:
# well-formed, minimal, atomic, unambiguous, complete, conceptually-sound, problem-oriented
# Criteria to evaluate in this run (the prompt type is set separately in `prompt_type`).
criterias = ['conceptually-sound']
sc = 3  # Number of evaluation runs per model for each story

In [None]:
def load_from_file(path: str) -> str:
    """Return the stripped UTF-8 contents of `path`, or "" if it cannot be read.

    Failures are reported on stdout instead of being raised, so callers must
    treat an empty string as "nothing loaded".
    """
    text = ""
    try:
        with open(path, mode="r", encoding="utf-8") as handle:
            text = handle.read().strip()
    except FileNotFoundError:
        print(f"❌ 错误：文件 {path} 未找到")
    except Exception as err:
        # Any other read problem (permissions, decoding, ...) is also non-fatal.
        print(f"❌ 错误读取文件：{err}")
    return text

def clean_json_text(text: str) -> str:
    """Strip Markdown code fences from a model reply so it can go to ``json.loads``.

    Handles a reply that is already bare JSON, a fenced block (optionally
    tagged ``json``) with or without prose before/after it, and a reply with
    only a single leading or trailing fence.

    Args:
        text: Raw reply text from the model.

    Returns:
        The reply with fences and surrounding whitespace removed.
    """
    stripped = text.strip()

    # Already bare JSON: return untouched so backticks inside string values
    # are never mistaken for fences.
    if stripped[:1] in ("{", "[") and stripped[-1:] in ("}", "]"):
        return stripped

    # Take everything between the first opening fence and the LAST closing
    # fence; greedy matching keeps any backticks that occur inside the payload.
    fenced = re.search(r"```(?:json)?(.*)```", stripped, flags=re.IGNORECASE | re.DOTALL)
    if fenced:
        return fenced.group(1).strip()

    # Fewer than two fences: drop a single fence at the start or end.
    return re.sub(r"^```(json)?|```$", "", stripped, flags=re.IGNORECASE).strip()

def evaluate_user_story(model_name, system_prompt, user_content : str, max_retries: int = 2, retry_interval: int = 2) -> dict:
    """Ask an LLM to evaluate a user story and return its parsed JSON reply.

    Args:
        model_name: Model identifier passed to the chat completion endpoint.
        system_prompt: System message sent with the request.
        user_content: User message; it is converted to ``str`` before sending.
        max_retries: Total number of attempts (not the number of extra retries).
        retry_interval: Seconds to sleep after a failed attempt before retrying.

    Returns:
        The parsed JSON reply on success. After the last failed attempt, a
        dict with an ``"error"`` key: ``{"error": "Invalid JSON", "raw": <reply>}``
        if the reply could not be parsed, otherwise ``{"error": <message>}``.
        An error dict is also returned when ``max_retries`` is below 1.
    """
    for attempt in range(1, max_retries + 1):
        # Reset per attempt so the JSON error handler can never see an
        # unbound name or the reply of a previous attempt.
        content = ""
        try:
            response = openai.ChatCompletion.create(
                model=model_name,
                messages=[
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": str(user_content)}
                ],
                temperature=0
            )

            content = response.choices[0].message.content.strip()
            cleaned = clean_json_text(content)

            # Try to parse the reply as JSON
            return json.loads(cleaned)

        except json.JSONDecodeError:
            print(f"⚠️ 第 {attempt} 次解析失败：返回内容非 JSON，有效格式问题。")
            print("响应内容：", content)
            if attempt == max_retries:
                return {"error": "Invalid JSON", "raw": content}
            time.sleep(retry_interval)

        except Exception as e:
            print(f"⚠️ 第 {attempt} 次调用出错：{e}")
            if attempt == max_retries:
                return {"error": str(e)}
            time.sleep(retry_interval)

    # Only reachable when max_retries < 1; keep the documented dict contract
    # instead of implicitly returning None.
    return {"error": f"No attempt made (max_retries={max_retries})"}


In [None]:
# Shared prompt skeleton; the main loop fills its `quality_criteria`
# placeholder via Template.substitute. load_from_file returns "" on failure.
template = load_from_file("./prompt/template")
prompt_template = Template(template)
# Workbook read by the main loop for story text, AC, background and expert labels.
stories_file_path = './data/us/stories.xlsx'
# File name of the per-criterion prompt, loaded from ./prompt/<criteria>/.
prompt_type = "fewshot-exp"

In [None]:
# === Main loop ===
# For every configured criterion: build the system prompt, load the stories
# selected for evaluation, query each model `sc` times per story, and write one
# Excel workbook holding the expert label next to every model verdict.

def _violation_to_score(result) -> int:
    """Map a parsed model reply to a score: 1 = no violation, 0 = violation, -1 = unusable."""
    if not isinstance(result, dict):
        return -1
    val = result.get("violation")
    if val is None:
        return -1
    # Some models emit the flag as a string; "false" is a non-empty string and
    # would otherwise be counted as a violation.
    if isinstance(val, str) and val.strip().lower() in ("false", "no", "0"):
        return 1
    return 0 if val else 1


output_dir = "evaluation"
# Create the output folder up front so a missing directory cannot discard the
# results after all API calls have already been made.
os.makedirs(output_dir, exist_ok=True)
# Name the workbook after every evaluated model rather than a variable leaked
# from the inner loop (same file name as before when one model is configured).
models_tag = "-".join(cfg["name"] for cfg in models)

for criteria in criterias:
    quality_criteria = load_from_file(f"./prompt/{criteria}/{prompt_type}")
    # load_from_file returns "" on failure. Check the criteria text before
    # substituting: a blank section would still produce a non-empty prompt.
    prompt = prompt_template.substitute(quality_criteria=quality_criteria) if quality_criteria else ""
    if not prompt:
        print("❌ Prompt 加载失败，退出程序")
        continue

    # Expert labels live in a column named after the criterion, e.g. "Atomic".
    criteria_col = criteria.capitalize()
    us_df = pd.read_excel(stories_file_path, usecols=["Issue key", "story", "ac", "bg", criteria_col])
    eval_keys = pd.read_excel(f"data/us1/{criteria}.xlsx", usecols=["Issue key"])
    # Left merge keeps every requested key; keys absent from the stories
    # workbook come back with NaN in all story columns.
    df_eval = eval_keys.merge(us_df, on="Issue key", how="left")

    all_results = []

    for _, row in tqdm(df_eval.iterrows(), total=len(df_eval), desc=f"Evaluating [{criteria}]"):
        issue_key = row["Issue key"]
        story = row["story"]
        ac = row["ac"]
        bg = row["bg"]
        expert = row[criteria_col]
        result_row = {"Issue key": issue_key, "BG": bg, "Story": story, "AC": ac, "Expert": expert}
        story_missing = pd.isna(story)

        for model_cfg in models:
            model_name = model_cfg["model"]
            model_short = model_cfg["name"]

            for t in range(1, sc + 1):
                col_score = f"{model_short}_{t}"
                col_result = f"{model_short}_result_{t}"

                if story_missing:
                    # Nothing to evaluate: do not send the text "nan" to the model.
                    result_row[col_score] = -1
                    result_row[col_result] = json.dumps({"error": "Missing story text"})
                    continue

                try:
                    user_content = build_user_content(criteria, story, ac)
                    result = evaluate_user_story(model_name, prompt, user_content, 3, 1)
                    time.sleep(1)  # pause between requests

                    result_row[col_score] = _violation_to_score(result)
                    result_row[col_result] = json.dumps(result, indent=2, ensure_ascii=False)

                except Exception as e:
                    result_row[col_score] = -1
                    result_row[col_result] = json.dumps({"error": str(e)})

        all_results.append(result_row)

    current_time = datetime.now().strftime("%Y%m%d%H%M")
    out_df = pd.DataFrame(all_results)
    output_path = f"{output_dir}/{models_tag}-{criteria}-{current_time}.xlsx"
    out_df.to_excel(output_path, index=False)
    print(f"✅ 保存至：{output_path}")