In [None]:
import pandas as pd
import numpy as np
import json
import random
import math
from sklearn.model_selection import train_test_split
from tqdm.notebook import tqdm
import os
import sys
import asyncio

# --- Project path setup ---
# The custom modules under backend/ (e.g. the Doubao LLM client) can only be
# imported once the project's backend directory is on sys.path.
def setup_project_path():
    """
    Locate the 'yl_data_process' project root and put its backend dir on sys.path.

    The search starts at the current working directory (a notebook has no
    __file__) and walks up the directory tree until a directory named
    'yl_data_process' is found.

    Returns:
        (project_root, backend_path) on success, or (None, None) when the
        filesystem root is reached without finding the project directory.
    """
    project_root = os.getcwd()

    # Walk upwards until the directory itself is called 'yl_data_process'.
    while os.path.basename(project_root) != 'yl_data_process':
        parent_path = os.path.dirname(project_root)
        if parent_path == project_root:  # reached the filesystem root
            print("错误：无法找到项目根目录 'yl_data_process'。")
            print("请确保你的 notebook 文件位于 'yl_data_process' 项目文件夹内。")
            return None, None
        project_root = parent_path

    backend_path = os.path.join(project_root, 'backend')
    if backend_path not in sys.path:
        sys.path.append(backend_path)
        print(f"成功将 '{backend_path}' 添加到系统路径。")
    else:
        print(f"'{backend_path}' 已在系统路径中。")

    return project_root, backend_path

PROJECT_ROOT, BACKEND_PATH = setup_project_path()

# --- Import custom modules ---
# llm_client stays None unless both the import and the construction of the
# Doubao client succeed.
llm_client = None
if not BACKEND_PATH:
    print("因未能设置项目路径，无法导入自定义模块。")
else:
    try:
        from llms.doubao import DouBaoLLM
        print("成功导入 DouBaoLLM。")
        # The client is created here; the Doubao API key is expected to be
        # available beforehand through the environment or a config file,
        # e.g. os.environ['DOUBAO_API_KEY'] = "YOUR_KEY"
        llm_client = DouBaoLLM()
    except ImportError as import_err:
        print(f"导入 DouBaoLLM 失败: {import_err}")
        print("请检查 backend/llms/doubao.py 文件是否存在且无误。")
    except Exception as init_err:
        print(f"初始化 DouBaoLLM 时出错: {init_err}")
        print("请检查 API Key 是否已正确配置。")


: 

In [None]:
# --- 2. Data loading ---
# Loads the same CSV tables as data_analysis.ipynb.
print("="*20, "2. 数据加载", "="*20)

# The data directory is resolved from the detected project root, so it does
# not depend on where inside the project this notebook lives.
DATA_PATH = os.path.join(PROJECT_ROOT, 'backend/Agent4Edu/SelfDataProcess/data/')

if os.path.exists(DATA_PATH):
    print(f"从以下路径加载数据: {DATA_PATH}")
    try:
        # Read every CSV table used by the experiment.
        questions_df = pd.read_csv(os.path.join(DATA_PATH, "Questions.csv"))
        question_choices_df = pd.read_csv(os.path.join(DATA_PATH, "Question_Choices.csv"))
        kcs_df = pd.read_csv(os.path.join(DATA_PATH, "KCs.csv"))
        kc_relationships_df = pd.read_csv(os.path.join(DATA_PATH, "KC_Relationships.csv"))
        question_kc_relationships_df = pd.read_csv(os.path.join(DATA_PATH, "Question_KC_Relationships.csv"))
        transactions_df = pd.read_csv(os.path.join(DATA_PATH, "Transaction.csv"))
    except FileNotFoundError as load_err:
        print(f"加载文件时出错: {load_err}")
        print("请检查 data 文件夹中是否包含所有必需的 .csv 文件。")
    else:
        # Only reached when all six tables were read successfully.
        print("所有数据文件加载成功！")
        print(f" - Questions: {questions_df.shape}")
        print(f" - Question_Choices: {question_choices_df.shape}")
        print(f" - KCs: {kcs_df.shape}")
        print(f" - KC_Relationships: {kc_relationships_df.shape}")
        print(f" - Question_KC_Relationships: {question_kc_relationships_df.shape}")
        print(f" - Transaction: {transactions_df.shape}")
else:
    print(f"错误: 数据路径不存在 -> {DATA_PATH}")
    print("请确认 'PROJECT_ROOT' 是否设置正确，以及数据文件是否已解压到指定位置。")


In [None]:
# --- 3. Data preprocessing ---
# Join the separate tables into one "student practice log" DataFrame that
# carries everything the experiment needs, then split it per student.
print("="*20, "3. 数据预处理", "="*20)

# Step 1: attach question details to every practice transaction.
print("步骤 1/3: 合并练习记录和问题内容...")
merged_df = pd.merge(
    transactions_df,
    questions_df[['id', 'question_text', 'difficulty']],
    left_on='question_id',
    right_on='id',
    how='left'
)
# Rename to the column names used by the agent code and drop the duplicated
# question id. NOTE(review): assumes transactions_df has its own 'id' column,
# so the merge produces 'id_x' / 'id_y' - confirm against Transaction.csv.
merged_df = merged_df.rename(columns={
    'question_text': 'exer_content',
    'answer_state': 'score',
    'id_x': 'log_id'  # original transaction id
}).drop(columns=['id_y'])

print(f"合并后 Shape: {merged_df.shape}")


# Step 2: attach one knowledge concept (KC) to every question.
# A question may be linked to several KCs; as a simplification only the
# first linked KC is kept.
print("\n步骤 2/3: 合并知识点信息...")

# question -> KC id: keep the first row per question. The explicit copy makes
# this an independent frame, so adding a column below does not trigger
# pandas' chained-assignment warning.
question_to_kc_map = (
    question_kc_relationships_df
    .drop_duplicates(subset=['question_id'])
    .copy()
)

# KC id -> KC name lookup.
kc_id_to_name_map = kcs_df.set_index('id')['name']

# Resolve the KC name for every question and expose the id as 'know_code'.
question_to_kc_map['know_name'] = question_to_kc_map['knowledgecomponent_id'].map(kc_id_to_name_map)
question_to_kc_map = question_to_kc_map.rename(columns={'knowledgecomponent_id': 'know_code'})

# Bring the KC columns into the main log; questions without a KC get NaN.
student_logs_df = pd.merge(
    merged_df,
    question_to_kc_map[['question_id', 'know_code', 'know_name']],
    on='question_id',
    how='left'
)

# Convert score to 0/1 integers.
# NOTE(review): presumably 'answer_state' is boolean with no missing values -
# astype(int) raises on NaN; verify against the data.
student_logs_df['score'] = student_logs_df['score'].astype(int)

print(f"合并后 Shape: {student_logs_df.shape}")
print("成功为每个练习记录关联了知识点。")


# Step 3: group by student. The result maps student id -> that student's
# records as a DataFrame sorted by start_time.
print("\n步骤 3/3: 按学生ID分组数据...")
all_student_records = {
    student_id: records.sort_values(by='start_time').reset_index(drop=True)
    for student_id, records in tqdm(student_logs_df.groupby('student_id'))
}

# Drop students with too few records so that a train/test split stays possible.
min_records_threshold = 10
original_student_count = len(all_student_records)
all_student_records = {
    sid: recs for sid, recs in all_student_records.items()
    if len(recs) >= min_records_threshold
}
filtered_student_count = len(all_student_records)

print("数据处理完成！")
print(f"原始学生数: {original_student_count}")
print(f"筛选后 (练习记录 >= {min_records_threshold}): {filtered_student_count} 名学生")

# Show the first records of one student as a sample; skipped when the filter
# above removed every student.
if all_student_records:
    sample_student_id = next(iter(all_student_records))
    print(f"\n学生 {sample_student_id} 的练习记录 (前5条):")
    display(all_student_records[sample_student_id].head())
else:
    print("没有学生满足最少练习记录数要求，无样例可展示。")


In [None]:
# --- 4. Agent definition ---
# This cell implements the agent's core modules as described in
# project_desc.md:
# 1. Profile: student profile module
# 2. Memory: memory module
# 3. AgentAction: action module
print("="*20, "4. Agent 核心模块定义", "="*20)


# ------------------- 1. Agent configuration -------------------
# Simulation parameters taken from project_desc.md.
SIM_PARAMS = dict(
    memory_source='real',
    learning_effect='yes',      # enable the learning effect (memory reinforcement)
    forgetting_effect='yes',    # enable the forgetting effect
    reflection_choice='yes',    # enable reflection (simplified here; the focus is prediction)
    sim_strategy='performance',
    gpt_type=0,
    short_term_size=5,          # capacity of the short-term memory
    long_term_thresh=3,         # reinforcement count needed for long-term memory
    forget_lambda=0.95,         # threshold compared with the keep score in Memory.forget
)


# ------------------- 2. 学生画像 (Profile) 模块 -------------------
class Profile:
    """
    Student profile derived from a student's practice history.

    Attributes set on every instance:
        student_id        the id passed to the constructor
        success_rate_val  mean of the 'score' column (0.5 for empty history)
        success_rate      "high" / "medium" / "low" bucket of that mean
        ability           "good" / "common" / "poor" bucket of that mean
        activity          "high" / "medium" / "low" bucket of the record count
        diversity         "high" / "medium" / "low" bucket of KC coverage
        preference        most frequent 'know_name', or "N/A" when unknown
    """

    def __init__(self, student_id, history_df, total_kc_count=None):
        """
        Args:
            student_id: identifier stored on the profile.
            history_df: DataFrame with 'score', 'know_code' and 'know_name'
                columns; an empty frame yields a neutral default profile.
            total_kc_count: total number of knowledge concepts used as the
                denominator of the diversity ratio. When omitted it is read
                from the module-level ``kcs_df``.
        """
        self.student_id = student_id
        if history_df.empty:
            # No history: fall back to neutral defaults.
            self.activity = "medium"
            self.diversity = "medium"
            self.preference = "N/A"
            self.success_rate_val = 0.5
            self.success_rate = "medium"
            self.ability = "common"
        else:
            self._build_profile(history_df, total_kc_count)

    def _build_profile(self, df, total_kc_count=None):
        """Fill every profile attribute from the non-empty history ``df``."""
        # Success rate.
        self.success_rate_val = df['score'].mean()
        if self.success_rate_val > 0.6:
            self.success_rate = "high"
        elif self.success_rate_val > 0.3:
            self.success_rate = "medium"
        else:
            self.success_rate = "low"

        # Ability, bucketed from the same mean with different cut-offs.
        if self.success_rate_val > 0.5:
            self.ability = "good"
        elif self.success_rate_val > 0.4:
            self.ability = "common"
        else:
            self.ability = "poor"

        # Activity, simplified to the number of practice records.
        num_practices = len(df)
        if num_practices > 200:
            self.activity = "high"
        elif num_practices > 50:
            self.activity = "medium"
        else:
            self.activity = "low"

        # Diversity, simplified to the share of all KCs the student touched.
        if total_kc_count is None:
            total_kc_count = kcs_df['id'].nunique()
        # An empty KC table would otherwise divide by zero.
        kc_diversity = df['know_code'].nunique() / total_kc_count if total_kc_count else 0.0
        if kc_diversity > 0.75:
            self.diversity = "high"
        elif kc_diversity > 0.4:
            self.diversity = "medium"
        else:
            self.diversity = "low"

        # Preference: mode() ignores NaN, so it is empty when no record has a
        # concept name; indexing it blindly would raise IndexError.
        name_modes = df['know_name'].mode()
        self.preference = name_modes.iloc[0] if not name_modes.empty else "N/A"

    def build_prompt(self):
        """Return the system prompt that describes this student to the LLM."""
        return (
            f"You are a student with {self.activity} activity, you maintain a {self.activity} level of online exercise activity and practice frequently.\n"
            f"You have {self.diversity} diversity, you explore diverse knowledge categories.\n"
            f"Most practiced concept: {self.preference}.\n"
            f"Success rate: {self.success_rate}.\n"
            f"Problem-solving ability: {self.ability}."
        )

# ------------------- 3. 记忆 (Memory) 模块 -------------------
class Memory:
    """
    Three-layer memory: factual, short-term and long-term.

    A fact is a list ``[content, concept_name, score, reinforcement_count,
    timestamp]``. Short-term memory is the tail of the factual list;
    long-term memory holds facts whose reinforcement count reached the
    configured threshold, plus the set of practiced concept names.
    """

    def __init__(self, KCG, know_name_map, params=None):
        """
        Args:
            KCG: container of (concept_a, concept_b) pairs tested with ``in``
                to decide whether two concepts are related.
            know_name_map: stored on the instance; not read by this class.
            params: mapping with 'long_term_thresh', 'short_term_size' and
                'forget_lambda'. Defaults to the module-level SIM_PARAMS.
        """
        params = SIM_PARAMS if params is None else params
        self.factual = []
        self.long = {
            'significant_facts': [],
            'learning_status': [],
            'practiced_knowledge': set()
        }
        self.KCG = KCG
        self.know_name_map = know_name_map
        self.threshold = params['long_term_thresh']
        self.short_size = params['short_term_size']
        self.forget_lambda = params['forget_lambda']

    def write_factual(self, record, time_step):
        """Append ``record`` = (exer_content, know_name, score) as a new fact.

        The fact starts with a reinforcement count of 1 and its concept is
        added to the practiced-knowledge set.
        """
        self.factual.append([record[0], record[1], record[2], 1, time_step])
        self.long['practiced_knowledge'].add(record[1])

    def retrieve_short(self):
        """Return the most recent ``short_size`` facts."""
        return self.factual[-self.short_size:]

    def retrieve_long(self):
        """Return long-term memory; practiced knowledge is given as a list."""
        return {
            'significant_facts': self.long['significant_facts'],
            'learning_status': self.long['learning_status'],
            'practiced_knowledge': list(self.long['practiced_knowledge'])
        }

    def reinforce(self, new_record, time_step):
        """Store ``new_record`` and reinforce related older facts.

        An older fact counts as related when it has the same concept as the
        new record or when the two concepts form a pair in KCG (either
        direction). Facts whose count reaches the threshold are promoted to
        long-term memory, at most once per (content, concept, score) triple.
        """
        # 1. Add the new fact.
        self.write_factual(new_record, time_step)
        new_concept = new_record[1]

        # 2. Reinforce every older fact (all but the one just added) that is
        #    related to the new concept.
        for fact in self.factual[:-1]:
            old_concept = fact[1]
            if (new_concept == old_concept
                    or (new_concept, old_concept) in self.KCG
                    or (old_concept, new_concept) in self.KCG):
                fact[3] += 1

        # 3. Promote facts that reached the threshold. The key set is updated
        #    as facts are promoted, so two identical facts that cross the
        #    threshold in the same call are not both appended.
        promoted_keys = {tuple(f[:3]) for f in self.long['significant_facts']}
        for fact in self.factual:
            fact_key = tuple(fact[:3])
            if fact[3] >= self.threshold and fact_key not in promoted_keys:
                self.long['significant_facts'].append(fact)
                promoted_keys.add(fact_key)

    def forget(self, current_time_step):
        """Drop long-term facts whose keep score is below ``forget_lambda``.

        The keep score is the logistic function of the fact's age,
        1 / (1 + e^-(current_time_step - timestamp)), so it grows with age:
        a fact is kept only once it is old enough for the score to reach the
        threshold (about 3 steps for 0.95). A dropped fact remains in
        ``factual`` and is promoted again by the next ``reinforce`` call.
        NOTE(review): this removes recent facts rather than old ones -
        confirm that this matches the intended forgetting model.
        """
        kept_facts = []
        for fact in self.long['significant_facts']:
            ts = fact[4]  # time step at which the fact was written
            prob_keep = 1 / (1 + math.exp(-(current_time_step - ts)))
            if prob_keep >= self.forget_lambda:
                kept_facts.append(fact)
        self.long['significant_facts'] = kept_facts


# ------------------- 4. 行为 (Action) 模块 -------------------
class AgentAction:
    """
    Runs one simulated exercise attempt: builds the prompt from the exercise
    and the agent's memories, queries the LLM and parses its four answers.
    """

    # Keys that the parsed answer dict is guaranteed to contain.
    _TASK_KEYS = ('task1', 'task2', 'task3', 'task4')

    def __init__(self, profile, memory, llm_client):
        """Store the profile, memory and LLM client used by simulate_step."""
        self.profile = profile
        self.memory = memory
        self.llm = llm_client

    def _build_prompt(self, practice, short_mem, long_mem):
        """Assemble the user prompt for one exercise.

        Args:
            practice: mapping with 'exer_content' and 'know_name'.
            short_mem: list of facts (content, concept, score, ...).
            long_mem: dict that may hold 'practiced_knowledge' and
                'significant_facts'.

        Returns:
            The prompt text. The Task 2 options are the true concept plus up
            to two other practiced concepts, drawn and shuffled with the
            ``random`` module.
        """
        parts = [
            "Recommended Exercise:\n",
            f"- Textual Content: {practice['exer_content']}\n",
            f"- Knowledge Concept (true): {practice['know_name']}\n",
            # Task 1
            "\nTask 1: Based on your Profile and past experiences, decide if you want to attempt this exercise. If it seems too difficult or you are not confident, output 'No'; otherwise, output 'Yes'.\n",
        ]

        # Short-term memory.
        if short_mem:
            parts.append("\nYour Short-term Memory (most recent exercises):\n")
            for idx, r in enumerate(short_mem, 1):
                correctness = 'Correct' if r[2] == 1 else 'Incorrect'
                # str() keeps non-string content (e.g. NaN from a question
                # without text) from breaking the slice.
                parts.append(f" Record {idx}: Content='{str(r[0])[:50]}...', Concept='{r[1]}', Result={correctness}\n")

        # Task 2
        parts.append("\nTask 2: Identify the primary knowledge concept tested by this exercise from the list below. Output only the concept name.\n")
        options = [practice['know_name']]
        other_practiced = [k for k in long_mem.get('practiced_knowledge', []) if k != practice['know_name']]
        options.extend(random.sample(other_practiced, min(2, len(other_practiced))))
        random.shuffle(options)
        parts.extend(f" - {opt}\n" for opt in options)

        # Long-term memory.
        if long_mem.get('significant_facts'):
            parts.append("\nYour Long-term Memory (important facts you've reinforced):\n")
            for idx, f in enumerate(long_mem['significant_facts'], 1):
                parts.append(f" Fact {idx}: You practiced on concept '{f[1]}' and your answer was {'Correct' if f[2]==1 else 'Incorrect'}. This is a significant memory.\n")

        # Task 3 & 4
        parts.append("\nTask 3: Propose a concise problem-solving idea and then give the final answer.\n")
        parts.append("Task 4: Predict whether you will answer this question correctly ('Yes' or 'No').\n")

        # Output format
        parts.append("\nOutput format must be exactly as follows:\n")
        parts.append("Task1: <Yes/No>\nTask2: <concept_name>\nTask3: <your idea and final answer>\nTask4: <Yes/No>")
        return "".join(parts)

    def _parse_response(self, resp_text):
        """Parse 'key: value' lines of the LLM reply into a dict.

        Keys are lower-cased. Variants such as 'Task 1' or '**Task1**' are
        folded onto the canonical 'task1'..'task4' keys; lines without a
        colon are ignored. A task key that is absent from the reply maps to
        "N/A". ``None`` is treated as an empty reply and any other non-string
        value is converted with ``str``.
        """
        if resp_text is None:
            resp_text = ""
        elif not isinstance(resp_text, str):
            resp_text = str(resp_text)

        ans = {}
        for line in resp_text.strip().split('\n'):
            key, sep, value = line.partition(':')
            if not sep:
                continue  # not a 'key: value' line
            key = key.strip().lower()
            # Drop inner whitespace and surrounding markdown decoration so
            # 'Task 1' and '**Task1**' are recognised as 'task1'.
            compact = "".join(key.split()).strip('*_#->`')
            if compact in self._TASK_KEYS:
                key = compact
            ans[key] = value.strip()

        # Every task gets a value even when parsing found nothing for it.
        for task_key in self._TASK_KEYS:
            ans.setdefault(task_key, "N/A")
        return ans

    async def simulate_step(self, practice, time_step):
        """Ask the LLM to attempt ``practice``.

        Memory is only read here; writing the real outcome back is left to
        the caller. ``time_step`` is accepted but not used by this method.

        Returns:
            (ans, raw_resp): the parsed answer dict and the raw LLM reply. A
            failed LLM call is reported on stdout and yields "N/A" for every
            task.
        """
        # 1. Memory retrieval.
        short_mem = self.memory.retrieve_short()
        long_mem = self.memory.retrieve_long()

        # 2. Prompt construction.
        prompt = self._build_prompt(practice, short_mem, long_mem)
        messages = [
            {'role': 'system', 'content': self.profile.build_prompt()},
            {'role': 'user', 'content': prompt}
        ]

        # 3. LLM call.
        try:
            raw_resp = await self.llm.chat(messages=messages, stream=False)
        except Exception as e:
            print(f"LLM API call failed: {e}")
            raw_resp = "Task1: N/A\nTask2: N/A\nTask3: N/A\nTask4: N/A"  # default failure result

        ans = self._parse_response(raw_resp)
        return ans, raw_resp

print("Agent 核心模块定义完成！")


In [None]:
# --- 5. Experiment setup and main loop ---
print("="*20, "5. 实验设置与主循环", "="*20)

# ------------------- 1. Experiment parameters -------------------
TEST_RATIO = 0.1
# For a quick run only a subset of the students is simulated;
# None runs every eligible student.
NUM_STUDENTS_TO_RUN = 5  # e.g., 5, 10, or None for all
RANDOM_STATE = 42  # for reproducibility

# ------------------- 2. Knowledge concept graph (KCG) -------------------
# The KCG backs the relatedness check in the memory module.
# know_name_map / know_id_map translate between concept ids and names.
know_name_map = kcs_df.set_index('id')['name'].to_dict()
know_id_map = {name: kc_id for kc_id, name in know_name_map.items()}

# Relationship table with the endpoint ids resolved to concept names.
kcg_df = kc_relationships_df.assign(
    from_kc_name=kc_relationships_df['from_knowledgecomponent_id'].map(know_name_map),
    to_kc_name=kc_relationships_df['to_knowledgecomponent_id'].map(know_name_map),
)

# KCG is the set of all (concept_1, concept_2) name pairs that are linked.
KCG = {(src, dst) for src, dst in zip(kcg_df['from_kc_name'], kcg_df['to_kc_name'])}
print(f"KCG 构建完成，包含 {len(KCG)} 条知识点关联。")


# ------------------- 3. 实验主函数 -------------------
def _update_memory(memory, practice, time_step):
    """Write one real practice outcome into ``memory``.

    Reinforces (or, with the learning effect disabled, plainly stores) the
    record and then applies forgetting when that effect is enabled.
    """
    record = (practice['exer_content'], practice['know_name'], practice['score'])
    if SIM_PARAMS['learning_effect'] == 'yes':
        memory.reinforce(record, time_step)
    else:
        memory.write_factual(record, time_step)
    if SIM_PARAMS['forgetting_effect'] == 'yes':
        memory.forget(time_step)


async def run_experiment(student_ids):
    """
    Run the full agent simulation for the given students.

    For each student the records are split into train and test parts. The
    train part builds the profile and fills the memory; each test record is
    then predicted by the agent, after which its real outcome is written to
    memory as well.

    Args:
        student_ids: iterable of keys of ``all_student_records``.

    Returns:
        DataFrame with one row per test record, holding the true values, the
        four parsed predictions and the raw LLM reply.
    """
    all_results = []

    for student_id in tqdm(student_ids, desc="Simulating Students"):
        student_records_df = all_student_records[student_id]

        # 1. Train/test split.
        # NOTE(review): the split shuffles the records, so the memory below is
        # filled in shuffled rather than chronological order - confirm intended.
        train_df, test_df = train_test_split(
            student_records_df,
            test_size=TEST_RATIO,
            random_state=RANDOM_STATE,
            shuffle=True
        )

        # 2. Agent initialisation.
        profile = Profile(student_id, train_df)
        memory = Memory(KCG, know_name_map)
        agent = AgentAction(profile, memory, llm_client)

        # 3. Training phase: fill the agent's memory. Time steps count from 1.
        for time_step, (_, practice) in enumerate(train_df.iterrows(), start=1):
            _update_memory(memory, practice, time_step)

        # 4. Test phase: time steps continue after the training records.
        for time_step, (_, practice) in enumerate(test_df.iterrows(), start=len(train_df) + 1):
            ans, raw_resp = await agent.simulate_step(practice, time_step)

            all_results.append({
                'student_id': student_id,
                'question_id': practice['question_id'],
                'true_know_name': practice['know_name'],
                'true_score': practice['score'],
                'predicted_task1_attempt': ans.get('task1', 'N/A'),
                'predicted_task2_know_name': ans.get('task2', 'N/A'),
                'predicted_task3_answer': ans.get('task3', 'N/A'),
                'predicted_task4_score': ans.get('task4', 'N/A'),
                'llm_raw_response': raw_resp
            })

            # The real outcome is written to memory only after the prediction,
            # to simulate a continuous learning process.
            _update_memory(memory, practice, time_step)

    return pd.DataFrame(all_results)


# ------------------- 4. Run the experiment -------------------
# Pick the students to simulate.
student_ids_to_run = list(all_student_records.keys())
if NUM_STUDENTS_TO_RUN is not None:
    # Random sample of N students; capped at the number of available
    # students because random.sample raises ValueError for a larger sample.
    random.seed(RANDOM_STATE)
    student_ids_to_run = random.sample(
        student_ids_to_run,
        min(NUM_STUDENTS_TO_RUN, len(student_ids_to_run))
    )

print(f"\n准备开始实验，将对 {len(student_ids_to_run)} 名学生进行模拟...")

# run_experiment is a coroutine and the launch line is left commented out so
# that the notebook does not call the LLM API by accident.
# Inside Jupyter use top-level await: asyncio.run() refuses to start while
# another event loop is already running in the same thread.
# The result must be assigned to `results_df`, which the evaluation cell
# looks up.
if llm_client:
    print("LLM客户端已准备就绪，可以开始运行实验。")
    print("请取消下一行代码的注释以启动异步实验。")
    # results_df = await run_experiment(student_ids_to_run)
else:
    print("LLM客户端未初始化，无法运行实验。请检查之前的步骤。")

# In a plain script (no running event loop) start it through asyncio instead:
# results_df = asyncio.run(run_experiment(student_ids_to_run))
# display(results_df.head())


In [None]:
# --- 6. Result evaluation ---
print("="*20, "6. 结果评估", "="*20)

# `results_df` is expected to be the DataFrame returned by run_experiment.
# When the experiment has not been run (or produced no rows) a small demo
# DataFrame is created so that the evaluation logic can still be tried out.
# The globals() check runs first, so `results_df` is never read while
# undefined and no NameError handling is needed.
if 'results_df' not in globals() or results_df.empty:
    print("未找到实验结果 'results_df'。创建一个示例 DataFrame 以进行演示。")
    results_df = pd.DataFrame({
        'student_id': [1, 1, 2, 2],
        'true_know_name': ['Data Model', 'Subset', 'Join', 'Data Model'],
        'true_score': [1, 0, 1, 1],
        'predicted_task1_attempt': ['Yes', 'yes', 'No', 'Yes.'],
        'predicted_task2_know_name': ['Data Model', 'CREATE TABLE', 'Join', 'Data Model'],
        'predicted_task4_score': ['Yes', 'No', 'no', ' Yes '],
    })
    print("示例数据已创建。")


def evaluate_results(df):
    """
    Compute and print the evaluation metrics for an experiment result table.

    Args:
        df: DataFrame with the columns 'student_id', 'true_know_name',
            'true_score', 'predicted_task1_attempt',
            'predicted_task2_know_name' and 'predicted_task4_score'.
            It is not modified.

    Returns:
        None when ``df`` is empty; otherwise a dict with the keys
        'task1_accuracy', 'task2_accuracy', 'task4_accuracy' and
        'overall_accuracy' (equal to the Task 4 accuracy). Predictions that
        cannot be parsed count as wrong in every accuracy.
    """
    if df.empty:
        print("结果DataFrame为空，无法进行评估。")
        return None

    print("--- 开始评估 ---")
    eval_df = df.copy()

    # `display` only exists inside IPython/Jupyter; fall back to print so the
    # function also works in a plain script.
    try:
        show = display
    except NameError:
        show = print

    # 1. Normalisation: map Yes/No answers to 1/0.
    def normalize_yes_no(value):
        """Return 1 for yes, 0 for no and NaN for anything else.

        Case, surrounding whitespace and surrounding punctuation or markdown
        markers are ignored, so 'Yes.', ' yes ' and '**No**' are understood.
        """
        if isinstance(value, str):
            val_lower = value.strip().strip(".,;:!?*\"'`").strip().lower()
            if val_lower == 'yes':
                return 1
            if val_lower == 'no':
                return 0
        return np.nan  # could not be parsed

    eval_df['pred_t1'] = eval_df['predicted_task1_attempt'].apply(normalize_yes_no)
    eval_df['pred_t4'] = eval_df['predicted_task4_score'].apply(normalize_yes_no)

    # 2. Accuracy per task.

    # Task 1: attempt decision. The true value is always 1 because the
    # student actually attempted every logged exercise.
    acc_t1 = (eval_df['pred_t1'] == 1).mean()

    # Task 2: concept identification, compared without regard to case and
    # surrounding whitespace. Rows without a true concept never match.
    pred_kc = eval_df['predicted_task2_know_name'].astype(str).str.strip().str.casefold()
    true_kc = eval_df['true_know_name'].astype(str).str.strip().str.casefold()
    acc_t2 = ((pred_kc == true_kc) & eval_df['true_know_name'].notna()).mean()

    # Task 4: correctness prediction (the core metric).
    acc_t4 = (eval_df['pred_t4'] == eval_df['true_score']).mean()

    # The Task 4 accuracy doubles as the agent's overall accuracy.
    agent_overall_accuracy = acc_t4

    # 3. Report.
    print("\n--- 智能体表现评估报告 ---")
    print(f"总测试样本数: {len(eval_df)}")
    print("-" * 30)
    print(f"任务1 (尝试决策) 准确率: {acc_t1:.2%}")
    print(f"任务2 (知识点识别) 准确率: {acc_t2:.2%}")
    print(f"任务4 (表现预测) 准确率: {acc_t4:.2%}")
    print("-" * 30)
    print(f"==> 智能体综合评估准确率 (基于Task 4): {agent_overall_accuracy:.2%} <==")

    # 4. Side-by-side view of truth and prediction.
    print("\n--- 详细结果对比 (前10条) ---")
    display_cols = [
        'student_id',
        'true_know_name', 'predicted_task2_know_name',
        'true_score', 'pred_t4'
    ]
    show(eval_df[display_cols].head(10))

    # 5. Confusion matrix for Task 4.
    try:
        from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
        import matplotlib.pyplot as plt

        # Unparseable predictions are left out of the matrix.
        cm_df = eval_df[['true_score', 'pred_t4']].dropna().astype(int)

        # Fixing the labels keeps the matrix 2x2 even when only one class
        # occurs, so it always matches the two display labels.
        cm = confusion_matrix(cm_df['true_score'], cm_df['pred_t4'], labels=[0, 1])
        disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=['预测错误', '预测正确'])

        fig, ax = plt.subplots(figsize=(6, 6))
        disp.plot(ax=ax, cmap='Blues')
        ax.set_title('任务4: 表现预测混淆矩阵')
        plt.show()

    except ImportError:
        print("\n请安装 scikit-learn 和 matplotlib 以显示混淆矩阵: pip install scikit-learn matplotlib")
    except Exception as e:
        print(f"\n无法生成混淆矩阵: {e}")

    return {
        'task1_accuracy': acc_t1,
        'task2_accuracy': acc_t2,
        'task4_accuracy': acc_t4,
        'overall_accuracy': agent_overall_accuracy,
    }


# Run the evaluation on the experiment results held in `results_df`
evaluate_results(results_df)
