## 簡介 ##
此代碼用來讓LLM根據表格資料與使用者的提問要求，透過pipline與tree stucture，生成報導或分析資料

此篇研究只需提供
"main.txt"為使用者的大綱與簡短想法
"data_description.txt"為要分析的table columns所代表的意義
就可產生完整報導
(可使用在產生任何報導上不限於羽球)

# STEP 1

刪減不必要的columns

結果保留['rally', 'time', 'roundscore_A', 'roundscore_B', 'player', 'type', 'lose_reason', 'getpoint_player']

In [1]:
import os
api_key = os.getenv("Gemini_API")
if not api_key:
    print("❌ Gemini_API 環境變數未設定")

In [2]:
#正式
import dspy
import json
import re
from typing import List, Dict, Any, Optional, ClassVar
import os
from dataclasses import dataclass
import pandas as pd
from openai import OpenAI

class GeminiOpenAI(dspy.LM):
    def __init__(self, api_key, model_name="gemini-2.0-flash"):
        self.api_key = api_key
        self.model_name = model_name
        # 使用 Google 的 OpenAI 兼容端點
        self.client = OpenAI(
            api_key=api_key,
            base_url="https://generativelanguage.googleapis.com/v1beta/openai/"
        )
        super().__init__(model=model_name)
     
    def __call__(self, messages=None, **kwargs):
        if messages is None:
            raise ValueError("Missing 'messages' argument")
         
        # Convert messages to OpenAI format
        if isinstance(messages, list):
            formatted_messages = []
            for msg in messages:
                if isinstance(msg, dict) and 'content' in msg:
                    role = msg.get('role', 'user')
                    formatted_messages.append({
                        'role': role,
                        'content': msg['content']
                    })
                else:
                    formatted_messages.append({
                        'role': 'user',
                        'content': str(msg)
                    })
        else:
            formatted_messages = [{'role': 'user', 'content': str(messages)}]
         
        try:
            response = self.client.chat.completions.create(
                model=self.model_name,
                messages=formatted_messages,
                **kwargs
            )
            
            if not response.choices or not response.choices[0].message.content:
                raise ValueError("Empty response from Gemini")
            
            return [{
                'text': response.choices[0].message.content,
                'logprobs': None
            }]
        except Exception as e:
            print(f"Error from Gemini model: {e}")
            return [{
                'text': "⚠️ Gemini API 回應失敗,可能已達限額或出現錯誤。",
                'logprobs': None
            }]
     
    def basic_request(self, prompt, **kwargs):
        try:
            response = self.client.chat.completions.create(
                model=self.model_name,
                messages=[{'role': 'user', 'content': prompt}],
                **kwargs
            )
            return response.choices[0].message.content
        except Exception as e:
            print(f"Error from Gemini model: {e}")
            return "⚠️ 無法取得 Gemini 回應"

def setup_gemini_api(api_key, model_name="gemini-2.0-flash"):
    lm = GeminiOpenAI(api_key=api_key, model_name=model_name)
    dspy.settings.configure(lm=lm)
    return lm

def read_text_file(file_path):
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            return file.read()
    except UnicodeDecodeError:
        with open(file_path, 'r', encoding='latin1') as file:
            return file.read()

def parse_list_from_response(response_text: str) -> List[str]:
    """
    Parse a Python list from various response formats including markdown code blocks
    """
    if not response_text or response_text.strip() == "":
        print("⚠️ 回應為空")
        return []
    
    # Remove leading/trailing whitespace
    text = response_text.strip()
    
    # Remove markdown code blocks
    text = re.sub(r'```(?:python|json)?\s*', '', text)
    text = re.sub(r'```\s*', '', text)
    
    # Remove any additional backticks
    text = text.strip('`').strip()
    
    # Try to find a list pattern in the text
    list_match = re.search(r'\[.*?\]', text, re.DOTALL)
    
    if list_match:
        list_text = list_match.group(0)
    else:
        print(f"⚠️ 無法在回應中找到列表格式")
        print(f"完整回應: {text[:200]}...")
        return []
    
    # Clean up the list text
    list_text = list_text.strip()
    
    # Try multiple parsing strategies
    try:
        # Strategy 1: Parse as-is, change to python list
        return json.loads(list_text)
    except json.JSONDecodeError:
        pass
    
    try:
        # Strategy 2: Convert single quotes to double quotes
        list_text_double = list_text.replace("'", '"')
        return json.loads(list_text_double)
    except json.JSONDecodeError:
        pass
    
    try:
        # Strategy 3: Manual parsing for simple cases
        # Remove brackets and split by comma
        content = list_text.strip('[]').strip()
        if not content:
            return []
        
        # Split by comma and clean each item
        items = []
        for item in content.split(','):
            item = item.strip().strip('"').strip("'").strip()
            if item:
                items.append(item)
        
        if items:
            print(f"✓ 使用手動解析成功")
            return items
    except Exception as e:
        print(f"⚠️ 手動解析失敗: {e}")
    
    print(f"❌ 所有解析方法都失敗了")
    print(f"原始文本: {list_text[:200]}")
    return []


def extract_news_relevant_fields(description_path: str, main_path: str, model_name="gemini-2.0-flash"):
    """
    從描述文件和大綱文件中提取相關欄位
    
    Args:
        description_path: 資料欄位描述文件路徑
        main_path: 大綱文件路徑
        model_name: 使用的模型名稱
    
    Returns:
        List[str]: 篩選出的欄位列表
    """
     
    lm = setup_gemini_api(api_key, model_name)
    main_content = read_text_file(main_path)
    description = read_text_file(description_path)
    
    prompt = f"""Using the following outline and list of data column descriptions, select only the columns that are useful for the outline.

## outline
{main_content}

## Data Column Descriptions:
{description}

---

Please return only a Python list of column names, like this:
['player_name', 'match_score', 'duration', ...]

Do not include explanations or any other text. Return only the list."""
     
    result = lm.basic_request(prompt)
    
    print(f"🔍 原始回應:\n{result}\n")
    
    selected_fields = parse_list_from_response(result)
    
    if selected_fields:
        print("✅ 篩選出的欄位:", selected_fields)
    else:
        print("❌ 未能成功解析欄位列表")
    
    return selected_fields

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# 直接調用函式
fields = extract_news_relevant_fields("data_description.txt", "main.txt")
print("最終欄位清單:", fields)


🔍 原始回應:
```python
['roundscore_A', 'roundscore_B', 'player', 'getpoint_player', 'type', 'rally', 'time']
```

✅ 篩選出的欄位: ['roundscore_A', 'roundscore_B', 'player', 'getpoint_player', 'type', 'rally', 'time']
最終欄位清單: ['roundscore_A', 'roundscore_B', 'player', 'getpoint_player', 'type', 'rally', 'time']


In [4]:
df = pd.read_csv("set1.csv")
filtered_df = df[fields]
filtered_df.to_csv("filtered_set1.csv")

將挑選出的欄位及說明寫入filtered_data_description.txt

In [5]:
def extract_descriptions_for_fields(fields: List[str], desc_path: str, output_path: str):
    description_text = read_text_file(desc_path)

    field_desc = {}
    for line in description_text.splitlines():
        for field in fields:
            if line.lower().startswith(field.lower() + ":"):
                field_desc[field] = line.strip()

    try:
        with open(output_path, 'w', encoding='utf-8') as f:
            for field in fields:
                f.write(field_desc.get(field, f"{field}: [Description not found]") + "\n")
        print(f"✅ 已將欄位描述寫入 {output_path}")
    except Exception as e:
        print(f"❌ 寫入失敗: {e}")


extract_descriptions_for_fields(fields, 'data_description.txt', "filtered_data_description.txt")

✅ 已將欄位描述寫入 filtered_data_description.txt


# STEP 2

藉由人為輸入問題與方向提示，給LLM做完整分析問題與方向之規劃

In [6]:
def generate_chain_of_thought_response(main_path: str, desc_path: str, output_path: str, model_name="gemini-2.0-flash"):
    """
    生成 Chain-of-Thought 分析回應
    
    Args:
        main_path: 大綱文件路徑
        desc_path: 資料欄位描述文件路徑
        output_path: 輸出文件路徑
        model_name: 使用的模型名稱
    
    Returns:
        str: 生成的回應內容,如果失敗則返回 None
    """

    lm = setup_gemini_api(api_key, model_name)

    main_content = read_text_file(main_path)
    description = read_text_file(desc_path)

    chain_prompt = f"""
You are a planning assistant.
Analyze the following outline and column descriptions.

## Outline & Ideas:
{main_content}

## Data Column Descriptions:
{description}

---

Step-by-step:
1. Reflect on the structure and meaning of the content.
2. Formulate relevant and meaningful questions or planning strategies.
3. Be explicit and detailed, use Chain-of-Thought reasoning.
4. Output all thoughts and questions in English only.
"""

    result = lm.basic_request(chain_prompt)

    try:
        with open(output_path, 'w', encoding='utf-8') as f:
            f.write(result)
        print(f"✅ Response saved to: {output_path}")
        return result
    except Exception as e:
        print(f"❌ Failed to write output: {e}")
        return None

In [7]:
response = generate_chain_of_thought_response(
    main_path="main.txt",
    desc_path="filtered_data_description.txt",
    output_path="analyze_response.txt"
)

✅ Response saved to: analyze_response.txt


# STEP 3

請LLM根據"analyze_response.txt"思考可以使用的operation並將結果存於 "operations_info.json"

In [9]:
def analyze_operations(analyze_path: str, output_json: str) -> List[str]:
    lm = setup_gemini_api(api_key)
    analysis = read_text_file(analyze_path)

    prompt = f"""
You are a news journalist want to analyze data not forecaster.
Based on the following text analysis, identify multiple useful table operations
and describe the direct meaning of each operation.

## Text Analysis:
{analysis}

---

Please output a numbered list in this format:
1. write: If the table is clear or small enough, generates text based on the tables using the LLM.
2. select_row: Description
3. select_column: Description
4. operation_name: Description
5. operation_name: Description
...

IMPORTANT: operation must contain select_row, select_column, and write in the first three operation.

Give important operations and at most 15 operations.
operation_name should be different and each operation can not be similar.
operation can be apply on many columns is better.
Description just give the original definition of the operation name and give some useful functions name in pandas.
Only include operations and their descriptions. Be concise and clear.
"""

    response = lm.basic_request(prompt)

    operations = []
    operations_dict = {}

    try:
        for line in response.strip().split('\n'):
            if line.strip() == "":
                continue
            if "." in line:
                num, rest = line.split(".", 1)
                if ":" in rest:
                    name, desc = rest.strip().split(":", 1)
                    name = name.strip()
                    desc = desc.strip()
                    operations.append(name)
                    operations_dict[num.strip()] = {"operation": name, "description": desc}

        with open(output_json, 'w', encoding='utf-8') as f:
            json.dump(operations_dict, f, indent=2, ensure_ascii=False)

        print(f"✅ 操作清單與描述已儲存至 {output_json}")
        return operations

    except Exception as e:
        print(f"❌ 回應處理失敗: {e}\n原始回應:\n{response}")
        return []

ops = analyze_operations("analyze_response.txt", "operations_info.json")
print("\n✅ 操作名稱陣列:")
print(ops)

✅ 操作清單與描述已儲存至 operations_info.json

✅ 操作名稱陣列:
['write', 'select_row', 'select_column', '**group_by', '**aggregate', '**sort', '**join', '**calculate', '**pivot_table', '**window_function', '**value_counts', '**crosstab', '**shift', '**correlation', '**query']


# STEP 4

使LLM自動分析table選出合適的operation放入操作池(operations)

In [10]:
class OperationSignature(dspy.Signature):
    """Identify suitable operations for analyzing badminton match data."""
    data_description = dspy.InputField(desc="Overview and sample of the dataset")
    column_descriptions = dspy.InputField(desc="Descriptions of each column in the dataset")
    rules = dspy.InputField(desc="Rules for selecting operations")
    operations_list = dspy.OutputField(desc="A list of suitable operations number (e.g., [1, 2, 3, 4])")

def read_badminton_data(file_path):
    """
    讀取羽球比賽數據 CSV 文件
    
    Args:
        file_path: CSV 文件路徑
    
    Returns:
        pd.DataFrame: 讀取的數據
    """
    try:
        return pd.read_csv(file_path, encoding='utf-8')
    except UnicodeDecodeError:
        return pd.read_csv(file_path, encoding='latin1')


def read_json_file(file_path):
    """
    讀取 JSON 文件
    
    Args:
        file_path: JSON 文件路徑
    
    Returns:
        dict: JSON 數據
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            return json.load(file)
    except UnicodeDecodeError:
        with open(file_path, 'r', encoding='latin1') as file:
            return json.load(file)

def parse_column_descriptions(description_text):
    """
    解析欄位描述文本
    
    Args:
        description_text: 欄位描述文本
    
    Returns:
        dict: 欄位名稱到描述的映射
    """
    descriptions = {}
    pattern = r'''
        ^                # Line start
        (\w+)            # Column name
        :\s+             # Colon and space
        (.+?)            # Description text
        (?=\n\w+:\s+|\Z) # Lookahead for next column or end of file
    '''
    matches = re.findall(pattern, description_text, flags=re.M | re.X)
    for col_name, desc in matches:
        clean_desc = ' '.join(desc.split()).strip()
        descriptions[col_name] = clean_desc
    return descriptions

class BadmintonOperationSelector(dspy.Module):
    def __init__(self, required_operations=None):
        """
        初始化操作選擇器
        
        Args:
            required_operations: 必須包含的操作編號列表 (例如: [1, 2, 3])
        """
        super().__init__()
        self.chain_of_thought = dspy.ChainOfThought(OperationSignature)
        self.required_operations = required_operations or []

    def forward(self, data_description, column_descriptions, rules):
        result = self.chain_of_thought(
            data_description=data_description,
            column_descriptions=str(column_descriptions),
            rules=str(rules)
        )
        operations = self.extract_operations_from_result(result.operations_list)
        
        # 確保必需的操作被包含
        operations = self.ensure_required_operations(operations)
        
        return operations

    def extract_operations_from_result(self, operations_text):
        """
        從回應中提取操作編號列表
        支援多種格式:
        - [1, 2, 3, 4]
        - 1, 2, 3, 4
        - 1 2 3 4
        - Operation 1, Operation 2, etc.
        """
        operations = []
        
        # 移除 markdown 代碼塊標記
        operations_text = re.sub(r'```(?:python|json)?\s*', '', operations_text)
        operations_text = operations_text.strip('`').strip()
        
        # 嘗試解析 JSON 格式 [1, 2, 3]
        try:
            # 尋找方括號中的內容
            list_match = re.search(r'\[([^\]]+)\]', operations_text)
            if list_match:
                list_content = list_match.group(1)
                # 提取所有數字
                numbers = re.findall(r'\d+', list_content)
                operations = [int(num) for num in numbers]
                if operations:
                    return operations
        except:
            pass
        
        # 如果沒有方括號,嘗試直接提取所有數字
        numbers = re.findall(r'\d+', operations_text)
        if numbers:
            operations = [int(num) for num in numbers]
            return operations
        
        # 如果以上都失敗,嘗試逐行處理
        lines = operations_text.split('\n')
        for line in lines:
            line = line.strip()
            if not line:
                continue
            # 提取該行中的所有數字
            line_numbers = re.findall(r'\d+', line)
            operations.extend([int(num) for num in line_numbers])
        
        # 去重並排序
        if operations:
            operations = sorted(list(set(operations)))
        
        return operations
    
    def ensure_required_operations(self, operations):
        """
        確保必需的操作被包含在操作列表中
        
        Args:
            operations: 當前的操作列表
        
        Returns:
            list: 包含必需操作的完整列表
        """
        # 轉換為集合以避免重複
        operations_set = set(operations)
        
        # 添加必需的操作
        for required_op in self.required_operations:
            operations_set.add(required_op)
        
        # 轉換回列表並排序
        return sorted(list(operations_set))


def analyze_badminton_match(data_path, column_desc_path, rules_path, 
                           model_name="gemini-2.0-flash-exp", 
                           required_operations=None):
    """
    分析羽球比賽數據並識別適合的操作
    
    Args:
        data_path: 比賽數據 CSV 文件路徑
        column_desc_path: 欄位描述文件路徑
        rules_path: 操作規則 JSON 文件路徑
        model_name: 使用的模型名稱
        required_operations: 必須包含的操作編號列表 (例如: [1, 2, 3])
    
    Returns:
        list: 識別出的操作編號列表 (整數)
    """
    
    # 設置默認的必需操作為 [1, 2, 3]
    if required_operations is None:
        required_operations = [1, 2, 3]
    
    print("Reading badminton match data...")
    try:
        match_data = read_badminton_data(data_path)
        columns_desc_content = read_text_file(column_desc_path)
        rules = read_json_file(rules_path)
    except Exception as e:
        print(f"❌ Error reading files: {e}")
        return []

    column_descriptions = parse_column_descriptions(columns_desc_content)
    setup_gemini_api(api_key, model_name)

    data_sample = match_data.head().to_string()
    data_description = f"""
    one match data:
    {data_sample}

    Data shape: {match_data.shape[0]} rows, {match_data.shape[1]} columns
    Columns: {', '.join(match_data.columns)}
    """

    selector = BadmintonOperationSelector(required_operations=required_operations)
    operations = selector.forward(data_description, column_descriptions, rules)

    print(f"✅ Identified {len(operations)} suitable operations:")
    print(f"   Required operations: {required_operations}")
    for i, op in enumerate(operations, 1):
        required_marker = " (Required)" if op in required_operations else ""
        print(f"{i}. Operation {op}{required_marker}")

    return operations

In [11]:
operations = analyze_badminton_match(
    data_path="filtered_set1.csv",
    column_desc_path="filtered_data_description.txt",
    rules_path="operations_info.json"
)
print("\nFinal operations array:", operations)

Reading badminton match data...
✅ Identified 8 suitable operations:
   Required operations: [1, 2, 3]
1. Operation 1 (Required)
2. Operation 2 (Required)
3. Operation 3 (Required)
4. Operation 4
5. Operation 5
6. Operation 8
7. Operation 11
8. Operation 12

Final operations array: [1, 2, 3, 4, 5, 8, 11, 12]


將所挑選出來的操作寫入"operations.json"

In [17]:
import json

# 從 JSON 檔案讀取 operations
original_operations_dict = read_json_file("operations_info.json")

# 你想要挑選的 operation 編號（根據實際需求修改這個 list）
selected_numbers = operations

def clean_operation_name(operation_text):
    """
    清理操作名稱，只保留英文字母、數字和底線
    移除所有特殊字符如 **, -, 等
    
    Args:
        operation_text: 原始操作名稱
    
    Returns:
        str: 清理後的操作名稱
    """
    # 移除所有非字母、數字、底線的字符
    cleaned = re.sub(r'[^a-zA-Z0-9_]', '', operation_text)
    return cleaned

filtered_operations = []
for new_number, original_number in enumerate(selected_numbers, start=1):
    # 將數字轉換為字串鍵來查找
    key = str(original_number)
    if key in original_operations_dict:
        op_data = original_operations_dict[key]
        
        # 清理 operation 名稱
        cleaned_operation = clean_operation_name(op_data["operation"])
        
        filtered_operations.append({
            "number": new_number,
            "operation": cleaned_operation,
            "description": op_data["description"]
        })

# 新的 JSON 結構
output_json = {
    "description": "Selected operations for badminton data analysis.",
    "requirements": [
        "The output must be based on the input data; do not hallucinate.",
        "Give me the list of numbers."
    ],
    "operations": filtered_operations
}

# 寫入 JSON 檔案
with open("operations.json", "w", encoding="utf-8") as f:
    json.dump(output_json, f, ensure_ascii=False, indent=2)

print(f"✅ operations.json has been created with {len(filtered_operations)} operations.")
print(f"Selected operations: {selected_numbers}")

✅ operations.json has been created with 8 operations.
Selected operations: [1, 2, 3, 4, 5, 8, 11, 12]


# STEP 5

根據真實table將操作重要性排序，若為排序後30%且非三種重要操作，則替除，保留'write' 'select_col' 'select_row'三個重要操作，到'selected_operations.json'

操作提取已完成!!

In [22]:
import os
import re
import json
import pandas as pd
import numpy as np

def load_operations_from_json(json_file_path):
    """
    Load operations from JSON file
    支援兩種格式:
    1. 舊格式: {"1": {"operation": "...", "description": "..."}, ...}
    2. 新格式: {"operations": [{"number": 1, "operation": "...", "description": "..."}, ...]}
    """
    try:
        data = read_json_file(json_file_path)
        
        operations_data = data['operations']
        
        # Create formatted operation strings for LLM processing
        operation_strings = []
        operation_details = []
        
        for op in operations_data:
            number = op.get('number', '')
            name = op.get('operation', '')
            description = op.get('description', '')
            
            # Format as: "number. name: description"
            if number and name and description:
                formatted_op = f"{number}. {name}: {description}"
                operation_strings.append(formatted_op)
                operation_details.append({
                    'number': number,
                    'operation': name,  # 統一使用 'operation' 鍵
                    'description': description,
                    'formatted': formatted_op
                })
        
        print(f"從 {json_file_path} 成功載入 {len(operation_strings)} 個操作")
        return operation_details, operation_strings
        
    except FileNotFoundError:
        print(f"錯誤: 找不到文件 {json_file_path}")
        return [], []
    except json.JSONDecodeError:
        print(f"錯誤: {json_file_path} 不是有效的 JSON 文件")
        return [], []
    except Exception as e:
        print(f"載入操作時發生錯誤: {e}")
        return [], []


def get_data_summary(dataframe):
    """
    Generate a comprehensive summary of the dataset
    """
    summary = f"資料集概要:\n- 總行數: {dataframe.shape[0]}\n- 總列數: {dataframe.shape[1]}\n- 欄位名稱: {', '.join(dataframe.columns)}\n\n各欄位資訊:\n"
    for col in dataframe.columns:
        summary += f"  - {col}: "
        if dataframe[col].dtype in ['object', 'string']:
            unique_vals = dataframe[col].unique()[:10]
            summary += f"類別型資料, 獨特值範例: {', '.join(map(str, unique_vals))}\n"
        else:
            summary += f"數值型資料, 範圍: {dataframe[col].min()} - {dataframe[col].max()}\n"
    return summary


def extract_operation_numbers_from_response(response):
    """
    從回應中提取操作編號列表
    支援多種格式
    """
    # 方法1: 匹配代碼塊中的數組
    pattern1 = r'```\s*\[([\d,\s]+)\]\s*```'
    match = re.search(pattern1, response)
    
    if match:
        array_str = match.group(1)
        operation_list = [int(num) for num in array_str.replace(' ', '').split(',') if num]
        print(f"提取到操作列表: {operation_list}")
        return operation_list
    
    # 方法2: 匹配普通方括號中的數組
    pattern2 = r'\[([\d,\s]+)\]'
    match = re.search(pattern2, response)
    
    if match:
        array_str = match.group(1)
        operation_list = [int(num) for num in array_str.replace(' ', '').split(',') if num]
        print(f"提取到操作列表: {operation_list}")
        return operation_list
    
    # 方法3: 提取所有數字
    numbers = re.findall(r'\b(\d+)\b', response)
    if numbers:
        operation_list = [int(num) for num in numbers]
        print(f"提取到操作列表: {operation_list}")
        return operation_list
    
    print("⚠️ 未找到排序數組")
    return []


def filter_badminton_operations(operation_details, operation_strings, df, api_key, 
                                outline_path='outline.txt', model_name="gemini-2.0-flash", 
                                max_retries=3):
    """
    使用 Gemini 根據重要性排序操作
    
    Args:
        operation_details: 操作詳細資訊列表
        operation_strings: 操作格式化字串列表
        df: 數據框
        api_key: API 金鑰
        outline_path: 大綱文件路徑
        model_name: 模型名稱
        max_retries: 最大重試次數
    
    Returns:
        tuple: (排序後的操作編號列表, 完整回應)
    """
    gemini = GeminiOpenAI(api_key=api_key, model_name=model_name)
    data_summary = get_data_summary(df)
    
    # 限制資料樣本大小
    data_sample = df.head(10).to_string()
    if len(data_sample) > 3000:
        data_sample = data_sample[:3000] + "...\n[資料已截斷]"
    
    outline = read_text_file(outline_path)
    
    print(f"操作數量: {len(operation_strings)}")
    
    prompt = f"""
我有一個撰寫新聞的大綱與比賽的資料集和 {len(operation_strings)} 個分析操作，請依據操作重要性排序(由高到低)。

大綱:
{outline}

資料樣本:
{data_sample}

資料集資訊:
{data_summary}

操作清單:
{chr(10).join(operation_strings)}

請先根據 chain-of-thought 分析，然後將操作編號根據重要性排序，每個編號僅在陣列中出現一次，陣列長度應為 {len(operation_strings)}。

最後請以以下格式輸出排序結果:
[1, 2, 3, ...]"""
    
    # 使用重試邏輯
    response = None
    for attempt in range(max_retries):
        try:
            print(f"嘗試 API 請求 (第 {attempt + 1}/{max_retries} 次)...")
            response = gemini.basic_request(prompt)
            
            # 檢查是否為錯誤回應
            if "⚠️" in response or not response:
                if attempt < max_retries - 1:
                    print(f"⚠️ 請求失敗，{3}秒後重試...")
                    import time
                    time.sleep(3)
                    continue
                else:
                    print(f"❌ API 回應錯誤，已達最大重試次數")
                    return [], response if response else "⚠️ 無法取得 Gemini 回應"
            
            # 成功獲得回應
            print("✅ 成功獲得 API 回應")
            break
            
        except Exception as e:
            print(f"⚠️ 請求發生異常: {e}")
            if attempt < max_retries - 1:
                print(f"3秒後重試...")
                import time
                time.sleep(3)
            else:
                print(f"❌ 已達最大重試次數")
                return [], f"⚠️ API 請求失敗: {e}"
    
    if not response:
        return [], "⚠️ 無法取得 Gemini 回應"
    
    return extract_operation_numbers_from_response(response), response


def create_selected_operations_json(operation_details, sorted_numbers, keep_percentage=0.7, 
                                    force_include=[1, 2, 3], output_path="selected_operations.json"):
    """
    創建選擇的操作 JSON 文件
    
    Args:
        operation_details: 操作詳細資訊列表
        sorted_numbers: 排序後的操作編號列表
        keep_percentage: 保留比例
        force_include: 強制包含的操作編號
        output_path: 輸出文件路徑
    
    Returns:
        list: 選擇的操作列表
    """
    if not sorted_numbers:
        print("⚠️ 警告: sorted_numbers 為空，無法創建操作列表")
        return []
    
    # 計算要保留的操作數量
    keep_count = max(len(force_include), int(keep_percentage * len(sorted_numbers)))
    
    # 選擇前 N 個操作
    selected_numbers = sorted_numbers[:keep_count]
    
    # 確保強制包含的操作在列表中
    selected_numbers = list(set(selected_numbers) | set(force_include))
    
    # 重新排序: 先按照 sorted_numbers 的順序，然後加上 force_include 中未出現的
    final_selected = []
    for num in sorted_numbers:
        if num in selected_numbers and num not in final_selected:
            final_selected.append(num)
    
    for num in force_include:
        if num not in final_selected:
            final_selected.append(num)
    
    print(f"選擇了 {len(final_selected)} 個操作 (保留比例: {keep_percentage*100:.0f}%)")
    print(f"選擇的操作編號: {final_selected}")
    
    # 創建操作編號到詳細資訊的映射
    operation_map = {int(detail['number']): detail for detail in operation_details}
    
    # 創建新的操作列表
    new_operations = []
    missing_operations = []
    
    for new_id, num in enumerate(final_selected, 1):
        if num in operation_map:
            detail = operation_map[num]
            new_operations.append({
                'number': new_id,
                'operation': detail['operation'],
                'description': detail['description']
            })
        else:
            missing_operations.append(num)
            print(f"⚠️ 警告: 找不到操作編號 {num}")
    
    if missing_operations:
        print(f"⚠️ 缺失的操作編號: {missing_operations}")
    
    output_json = {
        "description": "Selected operations for badminton data analysis.",
        "requirements": [
            "The output must be based on the input data; do not hallucinate.",
            "Give me the list of numbers."
        ],
        "operations": new_operations
    }

    # 寫入 JSON 文件
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(output_json, f, indent=2, ensure_ascii=False)
    
    print(f"✅ {output_path} has been created with {len(new_operations)} operations.")
    return new_operations


def read_badminton_data(file_path):
    """
    讀取羽球比賽數據 CSV 文件
    """
    try:
        return pd.read_csv(file_path, encoding='utf-8')
    except UnicodeDecodeError:
        return pd.read_csv(file_path, encoding='latin1')


# ==================== 主程式 ====================

# 載入操作
json_file_path = "operations.json"
operation_details, operation_strings = load_operations_from_json(json_file_path)

if not operation_details:
    print("❌ 無法載入操作，程式終止")
else:
    # 載入數據
    df = read_badminton_data("filtered_set1.csv")
    
    # 獲取 API 金鑰
    api_key = os.getenv("Gemini_API") 
    
    # 排序操作
    sorted_numbers, response = filter_badminton_operations(
        operation_details, 
        operation_strings, 
        df, 
        api_key, 
        outline_path='main.txt'
    )
    
    print(f"\n{'='*60}\n完整回應:\n{response}\n{'='*60}\n")
    print(f"排序後的操作編號: {sorted_numbers}")
    
    # 創建選擇的操作 JSON
    if sorted_numbers:
        selected_ops = create_selected_operations_json(
            operation_details,
            sorted_numbers,
            keep_percentage=0.7,
            force_include=[1, 2, 3],
            output_path="selected_operations.json"
        )
    else:
        print("❌ 無法提取排序結果，跳過創建 selected_operations.json")

從 operations.json 成功載入 8 個操作
操作數量: 8
嘗試 API 請求 (第 1/3 次)...
✅ 成功獲得 API 回應
提取到操作列表: [7, 8, 4, 5, 6, 3, 2, 1]

完整回應:
好的，我們來分析這些操作對於羽球新聞寫作的重要性，並給出排序。

**Chain-of-Thought:**

作為一位資深的羽球新聞記者，我的目標是利用比賽數據，產出有深度、有價值的報導。首先，我需要了解比賽的基本情況，然後深入挖掘數據中的模式和洞見。以下是我對各個操作的評估：

*   **`value_counts` (7):**  這個操作可以快速了解各種類型的球 (type) 出現的頻率，或者球員得分 (getpoint\_player) 的次數。這對於分析球員的打法偏好、戰術選擇，以及找出關鍵得分手段非常有幫助。能快速掌握球員或賽事的初步印象。這是基礎分析的關鍵。
*   **`crosstab` (8):** 這個操作可以幫助我們建立兩個或多個因素之間的關聯性。例如，我們可以比較不同球員在不同情況下使用的球種，或者分析發球後接發球方得分的機率。這種關聯性分析可以揭示更深層次的戰術策略。
*   **`group_by` (4) and `aggregate` (5):**  這兩個操作通常一起使用，可以將數據按照特定條件分組，然後計算各組的統計數據。例如，我們可以按照球員分組，計算他們的平均得分、平均回合數等。或者我們可以按照不同的回合數分組，計算不同回合的得分率等。這個操作可以幫助我們比較不同球員或不同回合之間的差異，找出關鍵因素。在賽事分析中，通常需要分組比較數據，例如比較勝負方的各項數據。
*   **`calculate` (6):** 這個操作可以用來計算新的數據指標，例如得分差、回合持續時間等等。這些指標可以幫助我們更深入地了解比賽的進程和球員的表現。例如，計算「侵略性比率」（殺球次數/總擊球次數）可以反映球員的進攻風格。
*   **`select_column` (3):** 選擇特定的數據列是進行任何分析的基礎。如果我們想分析球員的得分情況，就需要選擇`getpoint_player`這一列。其他操作基本上都要先進行欄位的選擇。
*   **`select_row` (2):** 根據條件篩選數據可以幫助我們

# STEP final

操作生成 (ContentPlanner)、安全執行 DataFrame 操作 (SafeDataFrameOperator)、樹結構追蹤 (TreeNode / TreeOfReport)、以及 文本生成 (TextGenerator)。

In [2]:
import pandas as pd
import json
import google.generativeai as genai
import os
import dspy
import ast
import re
from typing import List, Dict, Any, Optional, Set
import copy
import hashlib
import logging
from datetime import datetime
import sys
import builtins
# 設置日誌
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# ===== 基於參考程式碼的函數 =====
def read_text_file(file_path):
    """讀取文本文件"""
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            return file.read()
    except FileNotFoundError:
        return "No file available"
    except Exception as e:
        logger.error(f"讀取文件錯誤: {e}")
        return "Error reading file"

def read_json_file(file_path):
    """讀取JSON文件"""
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            return json.load(file)
    except UnicodeDecodeError:
        with open(file_path, 'r', encoding='latin1') as file:
            return json.load(file)
    except FileNotFoundError:
        # 返回默認操作集合
        return [
            {"name": "select_column", "description": "選擇特定欄位"},
            {"name": "value_counts", "description": "計算值的頻次"},
            {"name": "groupby", "description": "按欄位分組"},
            {"name": "sort_values", "description": "排序數據"},
            {"name": "filter_rows", "description": "過濾行數據"},
            {"name": "write", "description": "撰寫分析文本"}
        ]

# ===== 改進的樹節點類別 =====
class TreeNode:
    """改進的樹節點類別，增加語意驗證和追蹤功能"""
    def __init__(self, level: int = 0, text: str = "", table: pd.DataFrame = None, operation: str = None):
        self.children: List['TreeNode'] = []
        self.level: int = level
        self.text: str = text
        self.table: pd.DataFrame = table if table is not None else pd.DataFrame()
        self.operation: str = operation
        self.parent: Optional['TreeNode'] = None
        self.operation_history: List[str] = []
        
        # 新增屬性用於改進功能
        self.node_id: str = self._generate_node_id()
        self.created_at: datetime = datetime.now()
        self.validation_errors: List[str] = []
        self.table_hash: str = self._calculate_table_hash()
        self.semantic_score: float = 0.0
        
    def _generate_node_id(self) -> str:
        """生成唯一節點ID"""
        content = f"{self.level}_{self.operation}_{datetime.now().isoformat()}"
        return hashlib.md5(content.encode()).hexdigest()[:8]
        
    def _calculate_table_hash(self) -> str:
        """計算表格內容的哈希值，用於檢測重複"""
        if self.table.empty:
            return ""
        try:
            return hashlib.md5(str(self.table.values.tobytes()).encode()).hexdigest()[:8]
        except:
            return ""
    
    def add_child(self, child: 'TreeNode'):
        """添加子節點並進行驗證"""
        if self._validate_child(child):
            child.parent = self
            self.children.append(child)
            logger.info(f"添加子節點: {child.node_id} to {self.node_id}")
        else:
            logger.warning(f"子節點驗證失敗: {child.validation_errors}")
    
    def _validate_child(self, child: 'TreeNode') -> bool:
        """驗證子節點的合理性"""
        errors = []
        
        # 檢查是否有重複的表格狀態
        if child.table_hash and child.table_hash == self.table_hash:
            if not child.operation.lower().startswith('write'):
                errors.append("表格內容未發生變化但非寫作操作")
        
        # 檢查操作是否邏輯合理
        if self._is_redundant_operation(child.operation):
            errors.append(f"檢測到冗餘操作: {child.operation}")
        
        child.validation_errors = errors
        return len(errors) == 0
    
    def _is_redundant_operation(self, operation: str) -> bool:
        """檢查操作是否冗餘"""
        if len(self.operation_history) < 2:
            return False
            
        # 檢查是否有相同操作在近期歷史中
        recent_ops = self.operation_history[-3:]  # 檢查最近3個操作
        op_name = operation.split('(')[0].lower()
        
        for hist_op in recent_ops:
            if hist_op.split('(')[0].lower() == op_name:
                return True
        return False
    
    def is_leaf(self) -> bool:
        """判斷是否為葉節點"""
        return len(self.children) == 0
    
    def to_dict(self) -> Dict[str, Any]:
        """將節點轉換為字典格式，用於可視化"""
        return {
            "node_id": self.node_id,
            "level": self.level,
            "operation": self.operation,
            "text_preview": self.text[:100] + "..." if len(self.text) > 100 else self.text,
            "table_shape": list(self.table.shape) if not self.table.empty else [0, 0],
            "table_columns": list(self.table.columns) if not self.table.empty else [],
            "children_count": len(self.children),
            "validation_errors": self.validation_errors,
            "semantic_score": self.semantic_score,
            "created_at": self.created_at.isoformat(),
            "table_hash": self.table_hash
        }

# ===== 改進的操作解析器 =====
class OperationParser:
    """專門負責解析和驗證操作的類別"""
    
    def __init__(self, valid_operations: Set[str]):
        """初始化，接受從 JSON 讀取的有效操作集合"""
        self.valid_operations = valid_operations
        logger.info(f"OperationParser 初始化，有效操作: {self.valid_operations}")
        
    def parse_operations(self, response_text: str) -> List[Dict[str, Any]]:
        """改進的操作解析，返回結構化結果"""
        try:
            parsed_operations = []
            
            # 多種解析策略
            operations = self._extract_operations_multiple_strategies(response_text)
            
            for op_str in operations:
                parsed_op = self._parse_single_operation(op_str)
                if parsed_op and self._validate_operation(parsed_op):
                    parsed_operations.append(parsed_op)
                else:
                    logger.warning(f"無效操作被忽略: {op_str}")
            
            return parsed_operations[:5]  # 限制最多5個操作
            
        except Exception as e:
            logger.error(f"解析操作失敗: {e}")
            return []
    
    def _extract_operations_multiple_strategies(self, text: str) -> List[str]:
        """使用多種策略提取操作"""
        operations = []
        
        # 策略1: 尋找方括號內容
        bracket_match = re.search(r'\[(.*?)\]', text, re.DOTALL)
        if bracket_match:
            content = bracket_match.group(1)
            # 使用正則提取函數調用格式
            pattern = r'([a-zA-Z_]+\([^)]*\))'
            ops = re.findall(pattern, content)
            operations.extend(ops)
        
        # 策略2: 逐行解析
        if not operations:
            lines = text.split('\n')
            for line in lines:
                line = line.strip()
                if line and not line.startswith('#') and '(' in line and ')' in line:
                    operations.append(line)
        
        # 策略3: 逗號分割
        if not operations:
            parts = text.replace('[', '').replace(']', '').split(',')
            for part in parts:
                part = part.strip()
                if part and '(' in part:
                    operations.append(part)
        
        return operations
    
    def _parse_single_operation(self, op_str: str) -> Optional[Dict[str, Any]]:
        """解析單個操作字符串"""
        try:
            # 移除多餘的字符
            op_str = op_str.strip().rstrip(',').strip()
            
            # 提取操作名稱和參數
            if '(' not in op_str:
                return {"name": op_str, "args": [], "raw": op_str}
            
            name_part = op_str.split('(')[0].strip()
            args_part = op_str[op_str.find('(')+1:op_str.rfind(')')].strip()
            
            # 解析參數
            args = []
            if args_part:
                # 簡單的參數分割（可以進一步改進）
                for arg in args_part.split(','):
                    arg = arg.strip().strip('\'"')
                    if arg:
                        args.append(arg)
            
            return {
                "name": name_part.lower(),
                "args": args,
                "raw": op_str
            }
            
        except Exception as e:
            logger.error(f"解析操作 '{op_str}' 失敗: {e}")
            return None
    
    def _validate_operation(self, operation: Dict[str, Any]) -> bool:
        """驗證操作的有效性"""
        name = operation.get("name", "").lower()
        
        # 檢查操作名稱是否有效（從 JSON 載入的操作）
        if name not in self.valid_operations:
            logger.warning(f"未知操作: {name}，不在操作池中")
            return False
        
        # 檢查特定操作的參數
        args = operation.get("args", [])
        
        if name in ['select_column', 'sort_values', 'groupby'] and not args:
            logger.warning(f"{name} 操作需要參數")
            return False
        
        return True

# ===== 改進的內容規劃器 =====
class ContentPlanner:
    def __init__(self, api_key, operations_config: Dict[str, Any]):
        self.api_key = api_key
        genai.configure(api_key=api_key)
        self.model = genai.GenerativeModel("gemini-2.0-flash")
        
        # 從配置中提取操作信息
        self.operations_config = operations_config
        valid_ops = self._extract_valid_operations(operations_config)
        self.parser = OperationParser(valid_ops)
        
    def _extract_valid_operations(self, config: Dict[str, Any]) -> Set[str]:
        """從配置中提取有效操作名稱"""
        valid_ops = set()
        if isinstance(config, dict) and "operations" in config:
            for op in config["operations"]:
                if "operation" in op:
                    valid_ops.add(op["operation"].lower())
        elif isinstance(config, list):
            for op in config:
                if isinstance(op, dict) and "operation" in op:
                    valid_ops.add(op["operation"].lower())
        
        logger.info(f"從配置提取的有效操作: {valid_ops}")
        return valid_ops
        
    def generate_operations(self, tables, table_description, operation_description, 
                          operation_history, operation_pool, max_depth=5, max_degree=3, outline_path='main.txt'):
        """
        改進的操作生成，加入重複檢測和語意驗證
        """
        
        # 檢測近期操作，避免重複
        recent_operations = self._extract_recent_operations(operation_history)
        
        # 格式化操作描述（使用 selected_operations.json 的格式）
        formatted_operations = self._format_operations_for_prompt(operation_description)
        
        # 構建改進的提示詞
        prompt = f"""System : You are a content planner for the badminton game report. Please follow the outline. Please select candidate Operations and corresponding Arguments from the Operation Pool based on the input Tables and Operation History. These candidate Operations will be the next Operation in the Operation History.

# Requirements
1. Strictly adhere to the requirements.
2. The output must be in English.
3. The output must be based on the input data; do not hallucinate.
4. The length of Operation History must be less than or equal to {max_depth}.
5. The number of Operations must be less than or equal to {max_degree} and more than zero.
6. Only select Operations from the Operation Pool.
7. Arguments must match the format required by the corresponding Operations.
8. Operations & Arguments must follow this format: [operation_1(argument_1, ...), operation_2(argument_2, ...), operation_3(argument_3, ...), ...]
9. Only output Operations & Arguments!
10. If Table is big or Level is low, it should be more Operations include select_column or groupby not write.
11. If the length of Operation History is short, then more operations or more arguments.
12. Write operations do not need argument.
13. AVOID repeating recent operations: {recent_operations}
14. Prioritize operations that will meaningfully transform the data.
15. Arguments must be valid column names from the table.

# Outline
{read_text_file(outline_path) if os.path.exists(outline_path) else "Generate comprehensive badminton data analysis"}

# Table Description
{table_description}

# Available Operations
{formatted_operations}

User: # Test
## Tables
{tables}

## Operation History
{operation_history}

## Operation Pool
{operation_pool}

## Operations & Arguments"""

        try:
            logger.info("正在向Gemini發送請求...")
            response = self.model.generate_content(prompt)
            
            if response.text:
                logger.info("成功獲得Gemini回應")
                parsed_ops = self.parser.parse_operations(response.text.strip())
                return [op["raw"] for op in parsed_ops]  # 返回原始字符串格式
            else:
                logger.warning("Gemini回應為空")
                return []
                
        except Exception as e:
            logger.error(f"Gemini API請求失敗: {e}")
            return []
    
    def _format_operations_for_prompt(self, operation_description) -> str:
        """格式化操作描述供 LLM 使用"""
        if isinstance(operation_description, dict) and "operations" in operation_description:
            ops = operation_description["operations"]
        elif isinstance(operation_description, list):
            ops = operation_description
        else:
            return str(operation_description)
        
        formatted = []
        for op in ops:
            if isinstance(op, dict):
                num = op.get("number", "")
                name = op.get("operation", "")
                desc = op.get("description", "")
                formatted.append(f"{num}. {name}: {desc}")
        
        return "\n".join(formatted)
    
    def _extract_recent_operations(self, operation_history: List[str]) -> List[str]:
        """提取最近的操作名稱"""
        recent = []
        for op in operation_history[-3:]:  # 最近3個操作
            if '(' in op:
                name = op.split('(')[0].strip()
                recent.append(name)
        return recent

# ===== 安全的DataFrame操作器 =====
class SafeDataFrameOperator:
    """安全的DataFrame操作器，使用AST驗證而非直接exec"""
    
    def __init__(self, api_key):
        self.api_key = api_key
        genai.configure(api_key=api_key)
        self.model = genai.GenerativeModel("gemini-2.0-flash")
        self.allowed_modules = {'pandas', 'numpy', 're'}
        self.allowed_functions = {
            'pd.read_csv', 'pd.DataFrame', 'df.head', 'df.tail', 'df.sort_values',
            'df.groupby', 'df.filter', 'df.select', 'df.drop', 'df.fillna',
            'df.to_csv', 'df.value_counts', 'df.describe', 'df.info'
        }

    def generate_code(self, operation, df_info, df_path="input_tmp.csv"):
        prompt = f"""
        你是一個專業的Python資料分析助手。欄位名稱以資料欄位類型提供為主，根據以下要求生成操作DataFrame的程式碼：

        要執行的操作: {operation}

        CSV數據集: {df_path}

        資料欄位類型:
        {df_info}

        生成要求：
        1. 讀取CSV數據集，並存入DataFrame後，使用要執行的操作後，將修改後的DataFrame存入'tmp.csv'
        2. 只使用pandas基本操作，避免複雜的自定義函數
        3. 確保代碼安全，不包含文件系統操作（除了指定的CSV讀寫）
        4. 撰寫完整python code，包含錯誤處理

        輸出格式：
        ```python
        # 你的程式碼
        ```
        """
        return self._retry_generate(prompt)

    def _retry_generate(self, prompt, max_retries=2):
        """帶重試的生成請求"""
        for attempt in range(max_retries):
            try:
                response = self.model.generate_content(prompt)
                if response.text:
                    return response.text.strip()
            except Exception as e:
                logger.warning(f"生成代碼失敗 (嘗試 {attempt+1}/{max_retries}): {e}")
                if attempt < max_retries - 1:
                    import time
                    time.sleep(1)
        return ""

    def safe_execute(self, code: str, df: pd.DataFrame) -> pd.DataFrame:
        """安全執行生成的代碼"""
        try:
            # 提取代碼塊
            code_block = re.search(r'```python\n(.*?)\n```', code, re.DOTALL)
            #print(f'python code: {code_block}')
            if code_block:
                code = code_block.group(1)

            # AST安全驗證
            if not self._validate_code_safety(code):
                logger.error("代碼安全驗證失敗")
                return df

            # 寫入暫存 CSV 檔案
            df.to_csv("input_tmp.csv", index=False)

            allowed_builtin_names = [
                'int', 'float', 'str', 'bool', 'list', 'dict', 'set', 'tuple',
                'len', 'range', 'enumerate', 'zip', 'min', 'max', 'sum', 'abs',
                'print',
                'Exception', 'TypeError', 'ValueError', 'KeyError', 'IndexError',
                'FileNotFoundError', 'ZeroDivisionError', 'AttributeError', 'ImportError','__import__'
            ]       

            safe_globals = {
                'pd': pd,
                '__name__': '__main__',
                '__builtins__': {name: getattr(builtins, name) for name in allowed_builtin_names}
            }

            safe_locals = {}

            # 執行代碼
            exec(code, safe_globals, safe_locals)

            # 讀取結果
            if os.path.exists("tmp.csv"):
                result_df = pd.read_csv("tmp.csv")
                logger.info(f"操作成功，結果形狀: {result_df.shape}")
                return result_df
            else:
                logger.warning("未生成結果文件，返回原始DataFrame")
                return df

        except Exception as e:
            error_msg = f"執行錯誤: {str(e)}"
            print(error_msg)
            print("錯誤代碼如下：\n" + "-" * 30)
            print(code)  # ✅ 輸出造成錯誤的程式碼
            print("-" * 30)
            logger.error(error_msg)
            sys.exit(1)



    def _validate_code_safety(self, code: str) -> bool:
        """使用AST驗證代碼安全性"""
        try:
            tree = ast.parse(code)
            
            for node in ast.walk(tree):
                # 檢查危險的函數調用
                if isinstance(node, ast.Call):
                    if isinstance(node.func, ast.Name):
                        func_name = node.func.id
                        if func_name in ['exec', 'eval', 'compile', '__import__', 'open']:
                            logger.error(f"檢測到危險函數: {func_name}")
                            return False
                
                # 檢查文件操作（除了允許的CSV操作）
                if isinstance(node, ast.Call) and isinstance(node.func, ast.Attribute):
                    if hasattr(node.func, 'attr'):
                        attr_name = node.func.attr
                        if attr_name in ['system', 'popen', 'subprocess']:
                            logger.error(f"檢測到系統調用: {attr_name}")
                            return False
                
                # 檢查導入語句
                if isinstance(node, ast.Import):
                    for alias in node.names:
                        if alias.name not in self.allowed_modules:
                            logger.error(f"檢測到不允許的模組導入: {alias.name}")
                            return False
            
            return True
            
        except SyntaxError as e:
            logger.error(f"代碼語法錯誤: {e}")
            return False
        except Exception as e:
            logger.error(f"AST驗證失敗: {e}")
            return False

# ===== 文本生成器 =====
import time

class TextGenerator:
    def __init__(self, api_key, table_description=""):
        self.api_key = api_key
        genai.configure(api_key=api_key)
        self.model = genai.GenerativeModel("gemini-2.0-flash")
        self.table_description = table_description

    def extract_highlights_from_table(self, table: pd.DataFrame) -> str:
        try:
            if 'lose_reason' in table.columns:
                top_reason = table['lose_reason'].value_counts().idxmax()
            else:
                top_reason = "無資料"
            if 'getpoint_player' in table.columns:
                top_player = table['getpoint_player'].value_counts().idxmax()
            else:
                top_player = "未知球員"
            return f"最多失分原因為「{top_reason}」，得分最多的是 {top_player}。"
        except:
            return ""

    def extract_table_features(self, table: pd.DataFrame) -> str:
        summary = []
        for col in table.columns:
            dtype = str(table[col].dtype)
            line = f"欄位「{col}」類型：{dtype}"

            # 顯示常見值僅限類別型欄位
            if table[col].nunique() <= 10 or dtype == 'object' or pd.api.types.is_categorical_dtype(table[col]):
                top_values = table[col].value_counts().head(3).to_dict()
                line += f"，常見值：{list(top_values.keys())}"
            summary.append(line)
        return "\n".join(summary)

    def _retry_generate(self, prompt, max_retries=3, delay_seconds=30):
        for attempt in range(max_retries):
            try:
                response = self.model.generate_content(prompt)
                if response.text:
                    return response.text.strip()
            except Exception as e:
                err = str(e)
                logger.error(f"Gemini 回應失敗: {err}")
                if "429" in err:
                    logger.info(f"已達配額限制，等待 {delay_seconds} 秒後重試 ({attempt+1}/{max_retries})...")
                    time.sleep(delay_seconds)
                else:
                    break
        return "⚠️ 寫作請求失敗：API 限制或其他錯誤"

    def generate_text_for_write_operation(self, table: pd.DataFrame, operation_history: List[str]) -> str:
        table_str = table.to_string()
        WRITE_TOKENS = 50
        TABLE_FORMAT = "Pandas DataFrame as plain text"
        highlight_summary = self.extract_highlights_from_table(table)
        table_feature_summary = self.extract_table_features(table)

        prompt = f"""
System :
You are a professional content writer for the badminton game report .
Please write the Report based on the input Table, just pick one or two lightspots.

# Requirements
1. Strictly adhere to the requirements .
2. The output must be in 中文 .
3. The output must be based on the input data ; do not hallucinate .
4. The Table format is {TABLE_FORMAT}.
5. The Report can only describe the content included in the Tables and cannot describe anything not included in the Tables .
6. The Report must consist of only one paragraph .
7. The number of tokens in the Report must be within {WRITE_TOKENS}.
8. 請專注描述得分與失分模式、關鍵欄位趨勢或球員亮點。
9. 請模仿比賽轉播員或教練的語氣描述，句式自然、有節奏感。
10. 請特別觀察球種之間的連續轉換，例如 放小球 接 殺球 等，找出其中有效得分或不尋常的組合並描述。

# Highlights Summary
{highlight_summary}

# Table Features
{table_feature_summary}

# Table Description
{self.table_description}

User :
# Test
## Tables
{table_str}
## Report
"""
        return self._retry_generate(prompt)

    def merge_child_texts(self, child_texts: List[str], parent_operation: str) -> str:
        if not child_texts:
            return ""

        GENERATING_TOKENS = 100
        reports_str = "\n".join([f"- {txt}" for txt in child_texts])
        prompt = f"""
System :
You are a content generator for the badminton game report .
Please merge and rewrite a New Report based on the input Reports .

# Requirements
1. Strictly adhere to the requirements .
2. The output must be in 中文 .
3. The output must be based on the input data ; do not hallucinate .
4. The New Report must include all the content from the input Reports ; do not omit any information .
5. The New Report must follow the order of the input Reports .
6. The number of tokens in the New Report must be within {GENERATING_TOKENS}.
7. 請依序整合每段內容，形成結構清晰的段落，包括亮點、失誤模式與球員貢獻。

User :
# Test
## Reports
{reports_str}
## New Report
"""
        return self._retry_generate(prompt)

# ===== OperationParser._validate_operation 強化參數驗證（補入 df 欄位比對） =====
def validate_operation_with_columns(operation: Dict[str, Any], df_columns: List[str]) -> bool:
    name = operation.get("name", "").lower()
    args = operation.get("args", [])

    # 檢查操作名稱是否有效
    if name not in {
        'select_column', 'select_row', 'sort', 'calculate',
        'group_by', 'value_counts', 'aggregate', 'crosstab', 'pivot_table', 'write'
    }:
        return False

    # 僅針對需參數操作檢查欄位
    if name in ['select_column', 'sort', 'group_by']:
        for arg in args:
            if arg not in df_columns:
                return False

    return True



# ===== 改進的TreeOfReport類別 =====
class TreeOfReport:
    def __init__(self, api_key: str, max_depth: int = 5, max_degree: int = 5):
        self.api_key = api_key
        self.max_depth = max_depth
        self.max_degree = max_degree

        # 載入配置檔案
        self.load_configurations()

        # 初始化改進的組件（傳入 operations_config）
        self.content_planner = ContentPlanner(api_key, self.operation_description)
        self.df_operator = SafeDataFrameOperator(api_key)
        self.text_generator = TextGenerator(api_key, table_description=self.table_description)
        
        # 新增追蹤功能
        self.execution_log: List[Dict[str, Any]] = []
        self.node_registry: Dict[str, TreeNode] = {}

    def load_configurations(self):
        """載入配置文件"""
        self.table_description = read_text_file("filtered_data_description.txt")
        if not self.table_description or self.table_description == "No file available":
            self.table_description = "數據分析表格，包含各種欄位用於分析"

        # 讀取 selected_operations.json
        self.operation_description = read_json_file("selected_operations.json")
        
        # 根據 JSON 結構提取操作池
        if isinstance(self.operation_description, dict) and "operations" in self.operation_description:
            self.operation_pool = [op['operation'] for op in self.operation_description['operations']]
        elif isinstance(self.operation_description, list):
            self.operation_pool = [op['operation'] for op in self.operation_description if 'operation' in op]
        else:
            self.operation_pool = ['value_counts', 'crosstab', 'pivot_table', 'groupby', 'write']

        logger.info(f"載入操作池: {self.operation_pool}")


    def build_tree(self, root_table: pd.DataFrame) -> TreeNode:
        """改進的樹構建，加入完整的追蹤和驗證"""
        root = TreeNode(level=0, text="資料分析報告", table=root_table, operation="root(None)")
        root.operation_history = ['root(None)']
        self.node_registry[root.node_id] = root
        
        queue = [root]
        
        while queue:
            current_node = queue.pop(0)
            
            # 記錄處理日誌
            self._log_node_processing(current_node)

            if current_node.operation.lower().startswith('write'):
                continue

            if current_node.level >= self.max_depth:
                write_node = self.create_child_node(current_node, 'write()')
                if write_node:
                    current_node.add_child(write_node)
                continue

            logger.info(f"處理節點 - Level: {current_node.level}, Operation: {current_node.operation}")

            tables_str = current_node.table.to_string()
            operations = self.content_planner.generate_operations(
                tables=tables_str,
                table_description=self.table_description,
                operation_description=self.operation_description,
                operation_history=current_node.operation_history,
                operation_pool=self.operation_pool,
                max_depth=self.max_depth,
                max_degree=self.max_degree
            )

            logger.info(f"生成操作: {operations}")

            for operation in operations[:self.max_degree]:
                if operation.strip():
                    child_node = self.create_child_node(current_node, operation)
                    if child_node:
                        current_node.add_child(child_node)
                        queue.append(child_node)

        self.generate_all_texts(root)
        return root
    
    def _log_node_processing(self, node: TreeNode):
        """記錄節點處理日誌"""
        log_entry = {
            "timestamp": datetime.now().isoformat(),
            "node_id": node.node_id,
            "level": node.level,
            "operation": node.operation,
            "table_shape": list(node.table.shape) if not node.table.empty else [0, 0],
            "validation_errors": node.validation_errors
        }
        self.execution_log.append(log_entry)
    
    def create_child_node(self, parent: TreeNode, operation: str) -> Optional[TreeNode]:
        """改進的子節點創建，加入完整驗證"""
        try:
            # 建立新的操作歷史
            new_operation_history = parent.operation_history + [operation]
            
            # 檢查是否為 write 操作
            if operation.lower().startswith('write'):
                text = self.text_generator.generate_text_for_write_operation(
                    parent.table,
                    new_operation_history
                )
                child = TreeNode(
                    level=parent.level + 1,
                    text=text,
                    table=parent.table.copy(),
                    operation=operation
                )
                child.operation_history = new_operation_history
                self.node_registry[child.node_id] = child
                logger.info(f"創建 write 節點: {operation}")
                return child
            else:
                # 其他操作：執行數據操作
                df_info = f"Shape: {parent.table.shape}\nColumns: {list(parent.table.columns)}\nData types:\n{parent.table.dtypes.to_string()}"
                code = self.df_operator.generate_code(operation, df_info)
                
                if code:
                    result_df = self.df_operator.safe_execute(code, parent.table)
                    child = TreeNode(
                        level=parent.level + 1,
                        text="",
                        table=result_df,
                        operation=operation
                    )
                    child.operation_history = new_operation_history
                    self.node_registry[child.node_id] = child
                    logger.info(f"創建數據操作節點: {operation}, 結果形狀: {result_df.shape}")
                    return child
                else:
                    logger.warning(f"無法生成操作代碼: {operation}")
                    return None
        
        except Exception as e:
            logger.error(f"創建子節點失敗: {e}")
            return None
    
    def generate_all_texts(self, node: TreeNode):
        """遞歸生成所有節點的文本"""
        for child in node.children:
            self.generate_all_texts(child)
        
        if node.is_leaf() and not node.text and node.operation and not node.operation.lower().startswith('write'):
            node.text = self.text_generator.generate_text_for_write_operation(
                node.table, 
                node.operation_history
            )
            print(f'node table: {node.table}')
        elif node.children:
            child_texts = [child.text for child in node.children if child.text.strip()]
            if child_texts:
                merged_text = self.text_generator.merge_child_texts(
                    child_texts, 
                    node.operation or "root"
                )
                if node.text:
                    node.text = node.text + "\n\n" + merged_text
                else:
                    node.text = merged_text
        logger.info(f'節點 {node.node_id} 文本生成完成')
        print(f'node.table: {node.table}')
        print(f'節點文本: {node.text}')
        
    def export_tree_structure(self, root: TreeNode, output_path: str = "tree_structure.json"):
        """導出樹結構為JSON格式，用於可視化和分析"""
        def node_to_dict(node: TreeNode) -> Dict[str, Any]:
            result = node.to_dict()
            result["children"] = [node_to_dict(child) for child in node.children]
            return result
        
        tree_data = {
            "metadata": {
                "export_time": datetime.now().isoformat(),
                "total_nodes": len(self.node_registry),
                "max_depth": self.max_depth,
                "max_degree": self.max_degree
            },
            "execution_log": self.execution_log,
            "tree": node_to_dict(root)
        }
        
        try:
            with open(output_path, 'w', encoding='utf-8') as f:
                json.dump(tree_data, f, indent=2, ensure_ascii=False)
            logger.info(f"樹結構已導出至: {output_path}")
        except Exception as e:
            logger.error(f"導出樹結構失敗: {e}")
    
    def generate_execution_report(self) -> str:
        """生成執行過程報告"""
        total_nodes = len(self.node_registry)
        error_nodes = sum(1 for node in self.node_registry.values() if node.validation_errors)
        
        report = f"""
# Tree-of-Report 執行報告

## 統計信息
- 總節點數: {total_nodes}
- 錯誤節點數: {error_nodes}
- 樹最大深度: {self.max_depth}
- 最大分支度: {self.max_degree}

## 節點分布
"""
        
        # 按層級統計節點
        level_counts = {}
        for node in self.node_registry.values():
            level = node.level
            level_counts[level] = level_counts.get(level, 0) + 1
        
        for level, count in sorted(level_counts.items()):
            report += f"- Level {level}: {count} 個節點\n"
        
        # 錯誤摘要
        if error_nodes > 0:
            report += "\n## 驗證錯誤摘要\n"
            for node in self.node_registry.values():
                if node.validation_errors:
                    report += f"- 節點 {node.node_id} ({node.operation}): {'; '.join(node.validation_errors)}\n"
        
        return report

    def generate_report(self, node: TreeNode, level: int = 0) -> str:
        """改進的報告生成"""
        if node.level == 0:
            prompt = f"""
            你是一位新聞記者，根據以下分析總結，請撰寫一篇賽事新聞報導，提供全面深入的分析，統整成新聞報導，文辭中過多直接使用欄位名稱與直接次數統計，用player_A與player_B表示兩球員，用生動的文句描述，勿出現累贅的句子，請從分析總結中提取轉換，禁止出現幻覺。
            請用繁體中文撰寫，保持邏輯清晰，資訊準確。

            分析總結:
            {node.text}
            """
            final_text = self.text_generator._retry_generate(prompt)
            
            # 保存多種格式的報告
            with open("tree_of_report.txt", "w", encoding="utf-8") as f:
                f.write(final_text)
            
            # 導出樹結構
            self.export_tree_structure(node)
            
            # 生成執行報告
            exec_report = self.generate_execution_report()
            with open("execution_report.md", "w", encoding="utf-8") as f:
                f.write(exec_report)
                print("finish generate report")
            
            return final_text
        else:
            logger.info(f'generate report from not root')
            indent = "  " * level
            report = f"{indent}{'#' * (level + 1)} {node.operation or 'Root'}\n\n"

            if node.text:
                report += f"{indent}{node.text}\n\n"

            if node.table is not None and not node.table.empty and level < 2:
                report += f"{indent}**資料摘要:** Shape {node.table.shape}\n"
                if len(node.table) <= 10:
                    report += f"{indent}```\n{node.table.to_string()}\n{indent}```\n\n"
                else:
                    report += f"{indent}```\n{node.table.head().to_string()}\n{indent}```\n\n"

            for child in node.children:
                report += self.generate_report(child, level + 1)

            return report


# ===== 主程序 =====
def main():
    """改進的主函數"""
    
    # 設置API密鑰
    api_key = os.getenv("Gemini_API")

    logger.info("Tree-of-Report for Data Analysis (改進版)")
    logger.info("="*50)
    
    logger.info("正在載入數據...")
    
    # 讀取CSV檔案

    TABLES = pd.read_csv('filtered_set1.csv')
    logger.info(f"成功載入CSV: {TABLES.shape[0]} 行, {TABLES.shape[1]} 列")

    
    # 設置參數
    MAX_DEPTH = 3
    MAX_DEGREE = 4
    
    logger.info(f"最大深度: {MAX_DEPTH}")
    logger.info(f"最大分支度: {MAX_DEGREE}")
    
    # 初始化改進的 Tree-of-Report
    tree_report = TreeOfReport(api_key, max_depth=MAX_DEPTH, max_degree=MAX_DEGREE)
    
    # 建構報告樹
    logger.info("開始建構報告樹...")
    start_time = datetime.now()
    
    try:
        root = tree_report.build_tree(TABLES)
        
        # 生成最終報告
        logger.info("生成最終報告...")
        final_report = tree_report.generate_report(root)
        
        # 輸出報告
        logger.info("\n" + "="*50)
        logger.info("TREE-OF-REPORT 最終報告")
        logger.info("="*50)
        print(final_report)
        
        # 儲存報告
        with open('tree_of_report.md', 'w', encoding='utf-8') as f:
            f.write("# Tree-of-Report 數據分析報告 (改進版)\n\n")
            f.write(final_report)
        
        end_time = datetime.now()
        duration = (end_time - start_time).total_seconds()
        
        logger.info(f"報告生成完成，耗時: {duration:.2f} 秒")
        logger.info("生成的文件:")
        logger.info("- tree_of_report.md: 最終報告")
        logger.info("- tree_of_report.txt: 純文本報告")
        logger.info("- tree_structure.json: 樹結構數據")
        logger.info("- execution_report.md: 執行過程報告")
        logger.info("- tree_visualization.html: 可視化頁面")
        
    except Exception as e:
        logger.error(f"程序執行失敗: {e}")
        import traceback
        traceback.print_exc()
    
    finally:
        # 清理暫存檔案
        for temp_file in ['input_tmp.csv', 'tmp.csv']:
            if os.path.exists(temp_file):
                try:
                    os.remove(temp_file)
                    logger.info(f"清理暫存檔案: {temp_file}")
                except:
                    pass

if __name__ == "__main__":
    main()

2025-09-29 15:50:10,531 - INFO - Tree-of-Report for Data Analysis (改進版)
2025-09-29 15:50:10,532 - INFO - 正在載入數據...
2025-09-29 15:50:10,535 - INFO - 成功載入CSV: 315 行, 8 列
2025-09-29 15:50:10,535 - INFO - 最大深度: 3
2025-09-29 15:50:10,536 - INFO - 最大分支度: 4
2025-09-29 15:50:10,537 - INFO - 載入操作池: ['value_counts', 'crosstab', 'group_by', 'aggregate', 'calculate', 'select_column', 'select_row', 'write']
2025-09-29 15:50:10,538 - INFO - 從配置提取的有效操作: {'crosstab', 'select_row', 'aggregate', 'write', 'calculate', 'group_by', 'select_column', 'value_counts'}
2025-09-29 15:50:10,538 - INFO - OperationParser 初始化，有效操作: {'crosstab', 'select_row', 'aggregate', 'write', 'calculate', 'group_by', 'select_column', 'value_counts'}
2025-09-29 15:50:10,539 - INFO - 開始建構報告樹...
2025-09-29 15:50:10,539 - INFO - 處理節點 - Level: 0, Operation: root(None)
2025-09-29 15:50:10,554 - INFO - 正在向Gemini發送請求...
2025-09-29 15:50:12,113 - INFO - 成功獲得Gemini回應
2025-09-29 15:50:12,114 - INFO - 生成操作: ['select_column(type, getpoint_pl

DataFrame 已成功保存到 'tmp.csv'


2025-09-29 15:50:19,100 - INFO - 操作成功，結果形狀: (18, 2)
2025-09-29 15:50:19,100 - INFO - 創建數據操作節點: value_counts(type), 結果形狀: (18, 2)
2025-09-29 15:50:19,101 - INFO - 添加子節點: e67d99b0 to c659592a


value_counts('type') 操作完成，結果已保存到 tmp.csv


2025-09-29 15:50:21,082 - INFO - 操作成功，結果形狀: (2, 2)
2025-09-29 15:50:21,083 - INFO - 創建數據操作節點: value_counts(getpoint_player), 結果形狀: (2, 2)
2025-09-29 15:50:21,083 - INFO - 添加子節點: 5c4a956c to c659592a
2025-09-29 15:50:23,243 - INFO - 操作成功，結果形狀: (2, 2)
2025-09-29 15:50:23,243 - INFO - 創建數據操作節點: value_counts(player), 結果形狀: (2, 2)
2025-09-29 15:50:23,245 - INFO - 添加子節點: fd387980 to c659592a
2025-09-29 15:50:23,245 - INFO - 處理節點 - Level: 1, Operation: select_column(type, getpoint_player, player)
2025-09-29 15:50:23,249 - INFO - 正在向Gemini發送請求...


value_counts('player') 操作完成，結果已保存到 tmp.csv


2025-09-29 15:50:23,770 - INFO - 成功獲得Gemini回應
2025-09-29 15:50:23,772 - INFO - 生成操作: ['value_counts(type)', 'value_counts(getpoint_player)']
2025-09-29 15:50:26,337 - INFO - 操作成功，結果形狀: (0, 2)
2025-09-29 15:50:26,338 - INFO - 創建數據操作節點: value_counts(type), 結果形狀: (0, 2)
2025-09-29 15:50:26,339 - INFO - 添加子節點: e4f7540c to 94f1f6c1


value_counts('type') 操作完成，結果已保存到 tmp.csv


2025-09-29 15:50:28,691 - INFO - 操作成功，結果形狀: (0, 2)
2025-09-29 15:50:28,692 - INFO - 創建數據操作節點: value_counts(getpoint_player), 結果形狀: (0, 2)
2025-09-29 15:50:28,692 - INFO - 添加子節點: 2539f1c4 to 94f1f6c1
2025-09-29 15:50:28,693 - INFO - 處理節點 - Level: 1, Operation: value_counts(type)
2025-09-29 15:50:28,695 - INFO - 正在向Gemini發送請求...
2025-09-29 15:50:29,270 - INFO - 成功獲得Gemini回應
2025-09-29 15:50:29,271 - INFO - 生成操作: ['select_column(type)', 'value_counts(type)']
2025-09-29 15:50:31,871 - INFO - 操作成功，結果形狀: (18, 1)
2025-09-29 15:50:31,872 - INFO - 創建數據操作節點: select_column(type), 結果形狀: (18, 1)
2025-09-29 15:50:31,872 - INFO - 添加子節點: 8e8199c9 to e67d99b0


已成功將結果寫入 tmp.csv


2025-09-29 15:50:33,746 - INFO - 操作成功，結果形狀: (18, 2)
2025-09-29 15:50:33,747 - INFO - 創建數據操作節點: value_counts(type), 結果形狀: (18, 2)
2025-09-29 15:50:33,749 - INFO - 處理節點 - Level: 1, Operation: value_counts(getpoint_player)
2025-09-29 15:50:33,751 - INFO - 正在向Gemini發送請求...


value_counts() 操作已成功執行，結果已儲存到 tmp.csv


2025-09-29 15:50:34,401 - INFO - 成功獲得Gemini回應
2025-09-29 15:50:34,402 - INFO - 生成操作: ['select_column(count)', 'write()']
2025-09-29 15:50:37,482 - INFO - 操作成功，結果形狀: (2, 1)
2025-09-29 15:50:37,483 - INFO - 創建數據操作節點: select_column(count), 結果形狀: (2, 1)
2025-09-29 15:50:37,484 - INFO - 添加子節點: 3770f29b to 5c4a956c


DataFrame successfully processed and saved to tmp.csv


2025-09-29 15:50:38,631 - INFO - 創建 write 節點: write()
2025-09-29 15:50:38,632 - INFO - 添加子節點: 3b4e895a to 5c4a956c
2025-09-29 15:50:38,633 - INFO - 處理節點 - Level: 1, Operation: value_counts(player)
2025-09-29 15:50:38,636 - INFO - 正在向Gemini發送請求...
2025-09-29 15:50:39,218 - INFO - 成功獲得Gemini回應
2025-09-29 15:50:39,219 - INFO - 生成操作: ['select_column(count)', 'write()']
2025-09-29 15:50:41,965 - INFO - 操作成功，結果形狀: (2, 1)
2025-09-29 15:50:41,966 - INFO - 創建數據操作節點: select_column(count), 結果形狀: (2, 1)
2025-09-29 15:50:41,967 - INFO - 添加子節點: 4b781957 to fd387980
2025-09-29 15:50:42,034 - ERROR - Gemini 回應失敗: 429 You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits.
* Quota exceeded for metric: generativelanguage.googleapis.com/generate_content_free_tier_requests, limit: 15
Please retry in 18.42660804s. [violations {
}
, links {
  description: "Learn more about Gemini API quotas"


Successfully selected column 'count' and saved to tmp.csv


2025-09-29 15:51:13,250 - INFO - 創建 write 節點: write()
2025-09-29 15:51:13,253 - INFO - 添加子節點: 6dcccdbd to fd387980
2025-09-29 15:51:13,255 - INFO - 處理節點 - Level: 2, Operation: value_counts(type)
2025-09-29 15:51:13,261 - INFO - 正在向Gemini發送請求...
2025-09-29 15:51:13,961 - INFO - 成功獲得Gemini回應
2025-09-29 15:51:13,962 - INFO - 生成操作: ['write()']
2025-09-29 15:51:15,235 - INFO - 創建 write 節點: write()
2025-09-29 15:51:15,236 - INFO - 添加子節點: ed4afb4a to e4f7540c
2025-09-29 15:51:15,237 - INFO - 處理節點 - Level: 2, Operation: value_counts(getpoint_player)
2025-09-29 15:51:15,239 - INFO - 正在向Gemini發送請求...
2025-09-29 15:51:15,806 - INFO - 成功獲得Gemini回應
2025-09-29 15:51:15,808 - INFO - 生成操作: ['write()']
2025-09-29 15:51:16,623 - INFO - 創建 write 節點: write()
2025-09-29 15:51:16,624 - INFO - 添加子節點: 792193fb to 2539f1c4
2025-09-29 15:51:16,624 - INFO - 處理節點 - Level: 2, Operation: select_column(type)
2025-09-29 15:51:16,626 - INFO - 正在向Gemini發送請求...
2025-09-29 15:51:17,253 - INFO - 成功獲得Gemini回應
2025-09-29 15

value_counts 操作完成，結果已儲存到 tmp.csv


2025-09-29 15:51:22,236 - INFO - 成功獲得Gemini回應
2025-09-29 15:51:22,238 - INFO - 生成操作: ['write()']
2025-09-29 15:51:23,564 - INFO - 創建 write 節點: write()
2025-09-29 15:51:23,565 - INFO - 添加子節點: c88576a2 to 3770f29b
2025-09-29 15:51:23,566 - INFO - 處理節點 - Level: 2, Operation: select_column(count)
2025-09-29 15:51:23,567 - INFO - 正在向Gemini發送請求...
2025-09-29 15:51:24,060 - INFO - 成功獲得Gemini回應
2025-09-29 15:51:24,061 - INFO - 生成操作: ['write()']
2025-09-29 15:51:25,223 - INFO - 創建 write 節點: write()
2025-09-29 15:51:25,226 - INFO - 添加子節點: 7879c583 to 4b781957
2025-09-29 15:51:26,810 - INFO - 創建 write 節點: write()
2025-09-29 15:51:26,812 - INFO - 添加子節點: 4fd755a1 to f32472b3
2025-09-29 15:51:26,813 - INFO - 節點 ed4afb4a 文本生成完成


node.table: Empty DataFrame
Columns: [type, count]
Index: []
節點文本: 本場比賽數據缺失較多，得分與失分模式均不明顯，球員表現也難以評估。


2025-09-29 15:51:27,532 - INFO - 節點 e4f7540c 文本生成完成
2025-09-29 15:51:27,533 - INFO - 節點 792193fb 文本生成完成


node.table: Empty DataFrame
Columns: [type, count]
Index: []
節點文本: 本場比賽數據缺失較多，得分與失分模式均不明顯，球員表現也難以評估。
node.table: Empty DataFrame
Columns: [getpoint_player, count]
Index: []
節點文本: 本場比賽數據缺失，暫無得分模式或球員亮點可供分析。期待後續比賽能有更多精彩數據呈現。


2025-09-29 15:51:28,258 - INFO - 節點 2539f1c4 文本生成完成
2025-09-29 15:51:28,328 - ERROR - Gemini 回應失敗: 429 You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits.
* Quota exceeded for metric: generativelanguage.googleapis.com/generate_content_free_tier_requests, limit: 15
Please retry in 32.137442598s. [violations {
}
, links {
  description: "Learn more about Gemini API quotas"
  url: "https://ai.google.dev/gemini-api/docs/rate-limits"
}
, retry_delay {
  seconds: 32
}
]
2025-09-29 15:51:28,329 - INFO - 已達配額限制，等待 30 秒後重試 (1/3)...


node.table: Empty DataFrame
Columns: [getpoint_player, count]
Index: []
節點文本: 本場比賽數據缺失，暫無得分模式或球員亮點可供分析。期待後續比賽能有更多精彩數據呈現。


2025-09-29 15:51:59,920 - INFO - 節點 94f1f6c1 文本生成完成
2025-09-29 15:51:59,922 - INFO - 節點 39dcc001 文本生成完成


node.table: Empty DataFrame
Columns: [Unnamed: 0, roundscore_A, roundscore_B, player, getpoint_player, type, rally, time]
Index: []
節點文本: 本場比賽數據缺失較多，得分與失分模式均不明顯，球員表現也難以評估，暫無得分模式或球員亮點可供分析。期待後續比賽能有更多精彩數據呈現。
node.table:      type
0      長球
1      殺球
2      挑球
3      切球
4      推球
5     放小球
6     擋小球
7    未知球種
8      勾球
9     發長球
10    發短球
11  後場抽平球
12   過度切球
13   防守回抽
14     撲球
15     點扣
16   防守回挑
17     平球
節點文本: 比賽中，常見球路包括長球、殺球與挑球。值得注意的是，由於「無資料」導致的失分情況較多，而未知球員卻是得分的主要來源，這或許暗示著比賽中存在一些難以預測的變數。


2025-09-29 15:52:01,368 - INFO - 節點 8e8199c9 文本生成完成


node.table:      type
0      長球
1      殺球
2      挑球
3      切球
4      推球
5     放小球
6     擋小球
7    未知球種
8      勾球
9     發長球
10    發短球
11  後場抽平球
12   過度切球
13   防守回抽
14     撲球
15     點扣
16   防守回挑
17     平球
節點文本: 比賽中常見長球、殺球與挑球。值得注意的是，因「無資料」導致失分較多，未知球員卻是主要得分來源，暗示比賽中存在難以預測的變數。


2025-09-29 15:52:02,525 - INFO - 節點 e67d99b0 文本生成完成
2025-09-29 15:52:02,527 - INFO - 節點 c88576a2 文本生成完成


node.table:      type  count
0      長球     55
1      殺球     36
2      挑球     35
3      切球     31
4      推球     31
5     放小球     28
6     擋小球     20
7    未知球種     16
8      勾球     12
9     發長球     10
10    發短球     10
11  後場抽平球      7
12   過度切球      6
13   防守回抽      5
14     撲球      5
15     點扣      4
16   防守回挑      2
17     平球      2
節點文本: 比賽中常見長球、殺球與挑球等技術運用。值得注意的是，因「無資料」導致失分較多，而未知球員卻是主要得分來源，暗示比賽中存在難以預測的變數。
node.table:    count
0     21
1     15
節點文本: 目前比分，A隊21分，B隊15分。A隊持續領先，但仍需小心應對，把握每一次得分機會。


2025-09-29 15:52:03,282 - INFO - 節點 3770f29b 文本生成完成
2025-09-29 15:52:03,285 - INFO - 節點 3b4e895a 文本生成完成


node.table:    count
0     21
1     15
節點文本: 目前比分A隊21分，B隊15分。A隊持續領先，仍需小心應對，把握每一次得分機會，穩住優勢。
node.table:   getpoint_player  count
0               A     21
1               B     15
節點文本: A選手在比賽中得分次數高達21次，相較之下，B選手僅有15次得分。A選手的進攻更具威脅性，是本場比賽的亮點之一。


2025-09-29 15:52:04,390 - INFO - 節點 5c4a956c 文本生成完成
2025-09-29 15:52:04,393 - INFO - 節點 7879c583 文本生成完成


node.table:   getpoint_player  count
0               A     21
1               B     15
節點文本: A隊目前以21:15領先B隊，但仍需謹慎，把握得分機會以穩固優勢。A選手表現亮眼，得分高達21次，進攻更具威脅性，而B選手僅得15分。
node.table:    count
0    158
1    157
節點文本: 本場比賽雙方互有攻守，各有150餘次攻防回合，但具體得分手段尚不明確，仍需更多數據分析才能鎖定勝負關鍵。


2025-09-29 15:52:05,163 - INFO - 節點 4b781957 文本生成完成
2025-09-29 15:52:05,165 - INFO - 節點 6dcccdbd 文本生成完成


node.table:    count
0    158
1    157
節點文本: 本場比賽雙方攻防激烈，共有超過150次攻防回合。然而，具體的得分方式尚不明確，需要進一步的數據分析來確定勝負的關鍵因素。
node.table:   player  count
0      B    158
1      A    157
節點文本: 本場比賽，B選手的擊球次數稍多，共158次，而A選手也不遑多讓，有157次擊球。雙方在場上你來我往，互不相讓。


2025-09-29 15:52:06,239 - INFO - 節點 fd387980 文本生成完成


node.table:   player  count
0      B    158
1      A    157
節點文本: 本場比賽雙方攻防激烈，共有超過150次攻防回合。B選手擊球158次，A選手157次，可見雙方互不相讓。具體得分方式尚不明確，需進一步數據分析以確定勝負關鍵。


2025-09-29 15:52:07,808 - INFO - 節點 c659592a 文本生成完成
2025-09-29 15:52:07,814 - INFO - 生成最終報告...


node.table:      Unnamed: 0  roundscore_A  roundscore_B player getpoint_player  type  \
0             0             1             0      B             NaN   發長球   
1             1             1             0      A             NaN    切球   
2             2             1             0      B             NaN    挑球   
3             3             1             0      A             NaN    長球   
4             4             1             0      B             NaN    殺球   
..          ...           ...           ...    ...             ...   ...   
310         310            21            15      B             NaN  未知球種   
311         311            21            15      A             NaN    切球   
312         312            21            15      B             NaN    挑球   
313         313            21            15      A             NaN    長球   
314         314            21            15      B               A    長球   

     rally      time  
0        1  00:05:47  
1        1  00:05:49  
2     

2025-09-29 15:52:11,708 - INFO - 樹結構已導出至: tree_structure.json
2025-09-29 15:52:11,710 - INFO - 
2025-09-29 15:52:11,710 - INFO - TREE-OF-REPORT 最終報告
2025-09-29 15:52:11,713 - INFO - 報告生成完成，耗時: 121.17 秒
2025-09-29 15:52:11,714 - INFO - 生成的文件:
2025-09-29 15:52:11,715 - INFO - - tree_of_report.md: 最終報告
2025-09-29 15:52:11,715 - INFO - - tree_of_report.txt: 純文本報告
2025-09-29 15:52:11,716 - INFO - - tree_structure.json: 樹結構數據
2025-09-29 15:52:11,717 - INFO - - execution_report.md: 執行過程報告
2025-09-29 15:52:11,718 - INFO - - tree_visualization.html: 可視化頁面
2025-09-29 15:52:11,720 - INFO - 清理暫存檔案: input_tmp.csv
2025-09-29 15:52:11,721 - INFO - 清理暫存檔案: tmp.csv


finish generate report
## 羽球激戰正酣！A隊暫領先，勝負仍存變數

**（本報訊）** 一場激烈的羽球對決正在上演，A隊目前以21:15暫時領先B隊。儘管比賽數據略有缺失，難以完全掌握場上瞬息萬變的局勢，但雙方你來我往的攻防，已讓觀眾看得目不暇給。

從有限的數據來看，這場比賽充滿了長球、殺球和挑球等羽球基本技術的運用。然而，「無資料」失分的情況頻繁出現，暗示場上存在著許多難以預測的變數，也讓比賽更添懸念。令人意外的是，數據中「未知球員」卻是主要得分來源，更凸顯了比賽的複雜性。

A隊選手表現突出，以精湛的球技和更具威脅性的進攻，獨攬21分，成為球隊領先的關鍵人物。反觀B隊選手，雖奮力追趕，但目前僅拿下15分。

可以肯定的是，這場比賽的攻防極為激烈，雙方共計進行了超過150次攻防回合，可見戰況之膠著。B隊選手全場共擊球158次，A隊選手也不甘示弱，擊球157次，充分展現了雙方互不相讓的決心。

雖然A隊目前握有領先優勢，但比賽尚未結束，勝負仍充滿變數。A隊必須保持謹慎，牢牢把握每一個得分機會，才能穩固優勢，最終贏得勝利。而B隊若能找出突破點，將有機會逆轉局勢。究竟鹿死誰手，讓我們拭目以待！

**（後續報導將持續關注比賽進展，並嘗試取得更詳盡的數據，以深入分析雙方的得分模式與勝負關鍵。）**


In [84]:
import os
import pandas as pd
import numpy as np
import google.generativeai as genai
import time
from datetime import datetime

# === 寫作風格詞彙 ===
BADMINTON_TERMS = {
    'net': '網前失誤', 'out': '出界', 'long': '過底線', 'smash': '殺球',
    'clear': '高遠球', 'drop': '切球', 'drive': '平抽球', 'serve': '發球', 'return': '回球'
}
ACTION_VERBS = ['展現', '發揮', '掌握', '運用', '施展', '控制', '主導', '壓制', '突破', '創造', '締造', '奠定', '確立', '鞏固', '扭轉', '逆轉']
TECHNICAL_TERMS = ['lose_reason', 'getpoint_player', 'type', 'column', 'row']

# === Gemini 模型初始化 ===
def init_model(api_key: str):
    genai.configure(api_key=api_key)
    return genai.GenerativeModel("gemini-2.0-flash")

# === 品質評估 ===
def assess_text_quality(text: str) -> float:
    score = 0.0
    if 30 <= len(text) <= 120:
        score += 0.2
    score += min(0.2, sum(1 for t in BADMINTON_TERMS.values() if t in text) * 0.1)
    score += min(0.2, sum(1 for v in ACTION_VERBS if v in text) * 0.05)
    if not any(t in text for t in TECHNICAL_TERMS):
        score += 0.2
    if '，' in text or '。' in text:
        score += 0.2
    return round(min(score, 1.0), 2)

# === 主流程：重複3次生成並評估 ===
def generate_best_of_three(df: pd.DataFrame, api_key: str):
    model = init_model(api_key)
    table_str = df.to_string(index=False)

    prompt_template = f"""
你是一位專業體育新聞記者，擅長撰寫羽球比賽報導。
請根據以下數據表格撰寫賽事描述，使用繁體中文，避免出現技術欄位名稱。

# 賽事數據表格：
{table_str}

請撰寫描述：
"""

    results = []
    for i in range(3):
        try:
            print(f"⏳ 第 {i+1}/3 次生成...")
            response = model.generate_content(prompt_template)
            time.sleep(1)
            text = response.text.strip() if response.text else "⚠️ 無內容"
        except Exception as e:
            text = f"⚠️ 生成錯誤: {e}"
        score = assess_text_quality(text)
        results.append({'index': i+1, 'text': text, 'score': score})

    # 選出最佳結果
    best = max(results, key=lambda x: x['score'])

    # 輸出到檔案
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    file_name = f"best_of_three_report_{timestamp}.txt"
    with open(file_name, "w", encoding="utf-8") as f:
        for r in results:
            f.write(f"[版本 {r['index']}] 品質分數: {r['score']}\n{r['text']}\n\n")
        f.write(f"🏆 最佳版本為第 {best['index']} 次，分數: {best['score']}\n")
        f.write(best['text'])

    print("\n✅ 所有版本已生成")
    for r in results:
        print(f"[{r['index']}] 分數: {r['score']} → {r['text']}")
    print(f"\n🏆 最佳版本是第 {best['index']} 次：{best['text']}")
    print(f"✔️ 已儲存至：{file_name}")
    return best

# === 測試入口 ===
if __name__ == "__main__":
    api_key = os.getenv("Gemini_API")
    if not api_key:
        raise RuntimeError("請設置 Gemini_API 環境變數")

    df = pd.read_csv("filtered_set1.csv")
    

    generate_best_of_three(df, api_key)


⏳ 第 1/3 次生成...
⏳ 第 2/3 次生成...
⏳ 第 3/3 次生成...

✅ 所有版本已生成
[1] 分數: 0.75 → 這場羽球賽事可謂高潮迭起，雙方你來我往，互不相讓。從比賽伊始，雙方便展開了激烈的攻防轉換，發球、過渡球、到進攻，每個回合都充滿了變數。可以看到球員A率先取得領先，一路將比分拉開，一度取得11:6的優勢。然而，球員B並沒有輕易放棄，展現了頑強的韌性，逐漸將比分追趕上來。

比賽中，雙方選手都力圖在前場尋找機會，短球的運用頻繁，小球與挑球的搭配也考驗著雙方的技術。一些回合的拉鋸非常長，球員們不斷地進行攻防轉換，後場的強力擊球與前場的精巧控制相互交織，呈現出精彩的對抗場面。失誤也偶爾出現，掛網、出界等情況讓比賽更具懸念。

比賽後半段，球員B逐漸找到狀態，憑藉積極的跑動和抓住機會的能力，將比分反超，最終以21:15的比分贏得了勝利。整場比賽節奏緊湊，雙方都展現了高超的羽球技藝和頑強的鬥志，是一場值得回味的精彩對決。
[2] 分數: 0.6 → 這場羽球賽事戰況膠著，雙方你來我往，互不相讓。比賽初段，雙方都以試探性的發球開局，隨後球路變化多端，有時是輕巧的網前小球，有時是力道十足的後場重擊，看得出雙方選手都在積極尋找對方的破綻。

比賽中，選手A一度取得領先，但選手B韌性十足，緊咬比分。在多拍來回中，雙方都展現了極佳的防守能力，多次將看似必殺的球路化解。網前的細膩手法和後場的強力進攻交織，讓觀眾看得目不暇給。

在關鍵時刻，選手A利用一次精準的判斷，讓對手措手不及，成功得分。然而，選手B也毫不示弱，隨即以一記漂亮的落地得分還以顏色。比分交替上升，比賽氣氛也越發緊張。

最終，選手A穩住陣腳，憑藉著穩定的發揮和關鍵時刻的果斷進攻，成功拿下分數。但選手B的表現也同樣精彩，雖敗猶榮。整場比賽高潮迭起，充分展現了羽球運動的魅力。觀眾們也為這場精彩的對決獻上了熱烈的掌聲。
[3] 分數: 0.75 → 這場羽球賽事可謂高潮迭起，雙方選手你來我往，攻防轉換節奏快速。開局雙方互有領先，比分交替上升，首局前半段A選手稍佔優勢，一度將比分拉開至2:1，但B選手隨即展開反擊，利用精準的落點控制和強勢的進攻，將比分追平。

比賽中，我們可以看到多回合的精采對決。例如第三分，雙方選手經過多次的短球、挑球、長球、抽球、切球等戰術運用，足足來回了17拍才由A選手