In [None]:
import openai
import json
import pandas as pd
from tqdm import tqdm
import re
import time
from datetime import datetime
import os
from dotenv import load_dotenv

In [None]:
# Load environment variables via python-dotenv (by default from a local `.env`
# file) so the credentials below do not have to be hard-coded in the notebook.
load_dotenv()

# Configure the module-level client of the legacy (pre-1.0 style) openai API.
# os.environ[...] raises KeyError when a variable is missing, so a misconfigured
# environment fails here rather than on the first request.
openai.api_key = os.environ["OPENAI_API_KEY"]
openai.api_base = os.environ["OPENAI_API_BASE"]
openai.api_type = "openai"
openai.api_version = None

In [None]:
# Workbook that holds every user story together with the expert labels.
stories_file_path = "data/us/stories.xlsx"

# Model configuration.
# `name` is the short label used for result column names and the output file
# name; `model` is the identifier passed to the chat-completion API.
# Currently disabled alternatives:
#   {"name": "deepseek", "model": "deepseek-v3"}
#   {"name": "qwen", "model": "qwen2.5-72b-instruct"}
models = [
    dict(name="qwen2.5-32b", model="qwen2.5-32b-instruct"),
]

# Criteria to evaluate. Each entry selects the prompt template
# `template/<criterion>.txt`, the key list `data/us1/<criterion>.xlsx` and the
# expert-label column `<criterion>.capitalize()` in the stories workbook.
# Known criteria: well-formed, minimal, atomic, unambiguous, complete,
# conceptually-sound, problem-oriented
criterias = ["well-formed"]

# Number of evaluation runs per model and story.
sc = 3

In [None]:
def clean_text(text: str) -> str:
    """Remove a Markdown code fence wrapped around ``text``.

    Args:
        text: Cell content to clean. Non-string values (``None``, or the float
            NaN that pandas yields for an empty spreadsheet cell) are accepted.

    Returns:
        The text without its leading/trailing ``` fence and without surrounding
        whitespace. Empty or non-string input is returned unchanged.
    """
    # Empty spreadsheet cells arrive as float NaN, which is truthy and has no
    # ``.strip()``; pass anything that is not a non-empty string straight through
    # instead of raising AttributeError.
    if not isinstance(text, str) or not text:
        return text

    text = text.strip()
    if text.startswith("```"):
        # Multi-line: drop the whole opening fence line (including a language
        # tag). Single-line: there is no fence line, so just remove the markers.
        text = text.split("\n", 1)[1] if "\n" in text else text.replace("```", "")
    if text.endswith("```"):
        text = text[:-3]
    return text.strip()

In [None]:
def load_prompt_from_file(prompt_criteria: str) -> str:
    """Read the prompt template for one criterion.

    Args:
        prompt_criteria: Base name of the template; the file read is
            ``template/<prompt_criteria>.txt`` (UTF-8).

    Returns:
        The file content with surrounding whitespace removed, or an empty
        string if the file could not be read (the failure is printed, not
        raised).
    """
    try:
        template_path = f"template/{prompt_criteria}.txt"
        with open(template_path, "r", encoding="utf-8") as handle:
            prompt_text = handle.read()
    except Exception as err:
        # Best effort by design: callers treat "" as "template unavailable".
        print(f"❌ 读取 prompt 文件失败: {err}")
        prompt_text = ""
    return prompt_text.strip()

def clean_json_text(text: str) -> str:
    """Strip Markdown fence markers from the edges of an LLM reply.

    After trimming surrounding whitespace, a leading ``` (optionally followed
    by "json", case-insensitive) and a trailing ``` are removed. Whitespace
    left next to the removed markers is kept.
    """
    fence_pattern = re.compile(r"^```(json)?|```$", re.IGNORECASE)
    trimmed = text.strip()
    return fence_pattern.sub("", trimmed)

def evaluate_user_story(model_name, system_prompt, story, ac, bg: str, max_retries: int = 2, retry_interval: int = 2) -> dict:
    """Ask an LLM to evaluate one user story and return its parsed JSON reply.

    The request uses ``system_prompt`` as the system message and ``story`` as
    the only user message, with ``temperature=0``. The reply is passed through
    ``clean_json_text`` and parsed with ``json.loads``. Failed attempts are
    retried after ``retry_interval`` seconds.

    Args:
        model_name: Model identifier passed to the chat-completion API.
        system_prompt: Evaluation instructions sent as the system message.
        story: User story text sent as the user message.
        ac: Accepted for interface compatibility; not sent to the model.
        bg: Accepted for interface compatibility; not sent to the model.
        max_retries: Total number of attempts (not additional retries).
        retry_interval: Seconds to wait between attempts.

    Returns:
        The parsed JSON value on success. After the last failed attempt:
        ``{"error": "Invalid JSON", "raw": <reply>}`` if the reply could not be
        parsed, or ``{"error": <message>}`` for any other failure. If
        ``max_retries`` is below 1 no request is made and an error dict is
        returned.
    """
    if max_retries < 1:
        # Without this guard the loop body never runs and the function would
        # fall through and return None, contradicting the declared dict return.
        return {"error": "No attempts made: max_retries must be >= 1"}

    for attempt in range(1, max_retries + 1):
        # Reset per attempt so the JSON error handler never sees an unbound or
        # stale reply if the decode error is raised before the reply is read.
        content = None
        try:
            response = openai.ChatCompletion.create(
                model=model_name,
                messages=[
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": f"{story}"}
                ],
                temperature=0
            )

            content = response.choices[0].message.content.strip()
            cleaned = clean_json_text(content)

            # Try to parse the reply as JSON.
            return json.loads(cleaned)

        except json.JSONDecodeError:
            print(f"⚠️ 第 {attempt} 次解析失败：返回内容非 JSON，有效格式问题。")
            print("响应内容：", content)
            if attempt == max_retries:
                return {"error": "Invalid JSON", "raw": content}
            time.sleep(retry_interval)

        except Exception as e:
            # Any other failure (network, API error, missing message content).
            print(f"⚠️ 第 {attempt} 次调用出错：{e}")
            if attempt == max_retries:
                return {"error": str(e)}
            time.sleep(retry_interval)


In [None]:
# === Main loop ===
# For each criterion: load its prompt template, select the stories listed for
# that criterion, query every configured model `sc` times per story, and save
# one spreadsheet containing the scores and the raw replies.
for criteria in criterias:
    prompt_file = f"{criteria}"
    prompt = load_prompt_from_file(prompt_file)
    if not prompt:
        print(f"❌ Prompt `{prompt_file}` 加载失败，跳过")
        continue

    # Expert labels live in a column named after the criterion with its first
    # letter upper-cased (e.g. "Well-formed").
    criteria_col = criteria.capitalize()
    us_df = pd.read_excel(stories_file_path, usecols=["Issue key", "story", "ac", "bg", criteria_col])
    # NOTE(review): keys are read from "data/us1/" while the stories come from
    # "data/us/" — confirm both directories are intended.
    eval_keys = pd.read_excel(f"data/us1/{criteria}.xlsx", usecols=["Issue key"])
    # Left join keeps every requested key; keys absent from the stories workbook
    # end up with NaN in the story/ac/bg/expert columns.
    df_eval = eval_keys.merge(us_df, on="Issue key", how="left")

    all_results = []

    for _, row in tqdm(df_eval.iterrows(), total=len(df_eval), desc=f"Evaluating [{criteria}]"):
        issue_key = row["Issue key"]
        story = row["story"]
        raw_ac = row["ac"]
        # Empty cells arrive as float NaN; only real strings can carry a code
        # fence, so anything else is passed through instead of crashing the run.
        ac = clean_text(raw_ac) if isinstance(raw_ac, str) else raw_ac
        bg = row["bg"]
        expert = row[criteria_col]
        result_row = {"Issue key": issue_key, "BG": bg, "Story": story, "AC": ac, "Expert": expert}

        for model_cfg in models:
            model_name = model_cfg["model"]
            model_short = model_cfg["name"]

            for t in range(1, sc + 1):
                # Computed up front so the success and failure paths write to
                # the same columns.
                col_prefix = f"{model_short}_{t}"
                col_result = f"{model_short}_result_{t}"
                # -1 marks "no usable verdict" (error reply or missing field).
                agent_score = -1
                try:
                    result = evaluate_user_story(model_name, prompt, story, ac, bg, 3, 1)
                    time.sleep(1)

                    if isinstance(result, dict):
                        val = result.get("violation")
                        if val is not None:
                            # A reported violation scores 0, no violation scores 1.
                            agent_score = 0 if val else 1

                    result_row[col_prefix] = agent_score
                    result_row[col_result] = json.dumps(result, indent=2, ensure_ascii=False)

                except Exception as e:
                    result_row[col_prefix] = -1
                    result_row[col_result] = json.dumps({"error": str(e)})

        all_results.append(result_row)

    current_time = datetime.now().strftime("%Y%m%d%H%M")
    out_df = pd.DataFrame(all_results)
    # Name the file after every configured model rather than the variable leaked
    # from the inner loop, which is undefined when there were no rows or models
    # and names only the last model otherwise.
    models_tag = "_".join(cfg["name"] for cfg in models) or "no-model"
    # Create the output directory first so a missing folder cannot discard the
    # results of the whole (slow, paid) evaluation run.
    os.makedirs("evaluation", exist_ok=True)
    output_path = f"evaluation/{models_tag}-{criteria}-{current_time}.xlsx"
    out_df.to_excel(output_path, index=False)
    print(f"✅ 保存至：{output_path}")