## 0. Cuda Test

In [1]:
!pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118

%pip install numpy

Looking in indexes: https://download.pytorch.org/whl/cu118
Note: you may need to restart the kernel to use updated packages.


In [2]:
import torch
print(torch.__version__)
print(torch.cuda.is_available())

import numpy
print(numpy.__version__)

2.7.0+cu118
True
2.3.1


## 1. Setup

### 1.1 Install packages

In [3]:
%pip install git+https://github.com/brandonstarxel/chunking_evaluation.git --quiet
%pip install hf_xet --quiet
%pip install bitsandbytes --quiet
%pip install --upgrade transformers --quiet
%pip install --upgrade chromadb --quiet

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


### 1.2 Import Required Modules

In [4]:
import os
import sys

# 取得目前筆記本檔案的絕對路徑
notebook_path = os.path.abspath('chunking_exp.ipynb')

# 取得筆記本所在的目錄（即專案根目錄）
project_root = os.path.dirname(notebook_path)

# 將專案根目錄（包含你的本地 chunking_evaluation 資料夾）
# 插入到 sys.path 的最前面，確保優先載入本地版本
sys.path.insert(0, project_root)
print(f"已將專案根目錄 '{project_root}' 優先加入到 Python 模組搜尋路徑。")


from chunking_evaluation.chunking import FixedTokenChunker, RecursiveTokenChunker
from chunking_evaluation import GeneralEvaluation, SyntheticEvaluation
from chunking_evaluation.utils import bge_m3_token_count, get_bge_m3_embedding_function
from chunking_evaluation.evaluation_framework.general_evaluation_data.DatasetAnalyzer import DatasetAnalyzer
import pandas as pd
from IPython.display import display, clear_output
import http.client
import json


已將專案根目錄 'd:\ArtificialIntelligenceCustomerService\code\exp' 優先加入到 Python 模組搜尋路徑。


## 2. Create Chunkers

### 2.1 RecursiveTokenChunker & FixedTokenChunker

In [5]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
chunkers = [
    # chunk_size = 512
    RecursiveCharacterTextSplitter(
        chunk_size=512,
        chunk_overlap=50,
        length_function=len,
        separators=["\n\n", "\n", " ", "", "。", "！", "？", "，", "."]
    ),
    RecursiveCharacterTextSplitter(
        chunk_size=1024,
        chunk_overlap=50,
        length_function=len,
        separators=["\n\n", "\n", " ", "", "。", "！", "？", "，", "."]
    ),
]


## 3. Prepare Corpora

In [6]:
'''
def download_text(book_id, file_name, directory):
    conn = http.client.HTTPSConnection("www.gutenberg.org")
    url = f"/files/{book_id}/{book_id}-0.txt"

    conn.request("GET", url)
    response = conn.getresponse()

    if response.status == 200:
        text = response.read().decode('utf-8')

        # Create directory if it does not exist
        os.makedirs(directory, exist_ok=True)

        # Save the text to the specified file within the directory
        file_path = os.path.join(directory, file_name)
        with open(file_path, "w", encoding="utf-8") as file:
            file.write(text)
        print(f"Book '{file_name}' downloaded and saved successfully in '{directory}'.")
    else:
        print(f"Failed to download the book. Status code: {response.status}")


# Define the directory to save the books
directory = "corpora"

def download_example_texts(download=False, directory="./corpora"):
    if download:
        books = {
            1661: "the_adventures_of_sherlock_holmes.txt",
            1342: "pride_and_prejudice.txt", 
            174: "the_picture_of_dorian_gray.txt"
        }
        print("開始下載範例文本...")

        for book_id, file_name in books.items():
            try:
                download_text(book_id, file_name, directory)
                print(f"✅ 成功下載: {file_name}")
            except Exception as e:
                print(f"❌ 下載失敗 {file_name}: {e}")
        
        print("範例文本下載完成！")
    else:
        print("跳過下載範例文本。")

download_example_texts(download=False)
'''

'\ndef download_text(book_id, file_name, directory):\n    conn = http.client.HTTPSConnection("www.gutenberg.org")\n    url = f"/files/{book_id}/{book_id}-0.txt"\n\n    conn.request("GET", url)\n    response = conn.getresponse()\n\n    if response.status == 200:\n        text = response.read().decode(\'utf-8\')\n\n        # Create directory if it does not exist\n        os.makedirs(directory, exist_ok=True)\n\n        # Save the text to the specified file within the directory\n        file_path = os.path.join(directory, file_name)\n        with open(file_path, "w", encoding="utf-8") as file:\n            file.write(text)\n        print(f"Book \'{file_name}\' downloaded and saved successfully in \'{directory}\'.")\n    else:\n        print(f"Failed to download the book. Status code: {response.status}")\n\n\n# Define the directory to save the books\ndirectory = "corpora"\n\ndef download_example_texts(download=False, directory="./corpora"):\n    if download:\n        books = {\n           

## 4. Initialize the Evaluation Environment

### 4.1 Setup Path

In [7]:
import os
import glob
import pandas as pd
from typing import List, Dict

# 確保你的 chunking_evaluation 模組已經載入
from chunking_evaluation import GeneralEvaluation, SyntheticEvaluation
from chunking_evaluation.evaluation_framework.general_evaluation_data.DatasetAnalyzer import DatasetAnalyzer

# 設定路徑
corpora_directory = "./chunking_evaluation/evaluation_framework/general_evaluation_data/corpora"
queries_csv_path = "./chunking_evaluation/evaluation_framework/general_evaluation_data/generated_queries_and_excerpts.csv"


### 4.2 Already have corpora & queries csv

In [8]:

# 檢查 CSV 檔案是否存在
if os.path.exists(queries_csv_path):
    print(f"✅ 偵測到評估資料集 CSV 檔案 '{queries_csv_path}' 已存在。")
    print("將使用 GeneralEvaluation 進行評估。")
    evaluation = GeneralEvaluation()
else:
    print(f"🚧 未偵測到評估資料集 CSV 檔案，將使用 SyntheticEvaluation 進行生成。")

🚧 未偵測到評估資料集 CSV 檔案，將使用 SyntheticEvaluation 進行生成。


### 4.3 Generate queries if haven't

In [9]:
%pip install --upgrade ipywidgets --quiet
%pip install accelerate --quiet

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [10]:

# 檢查 CSV 檔案是否存在
if os.path.exists(queries_csv_path):
    print(f"✅ 偵測到評估資料集 CSV 檔案 '{queries_csv_path}' 已存在。")
    print("將使用 GeneralEvaluation 進行評估。")
else:
    print(f"🚧 未偵測到評估資料集 CSV 檔案，將使用 SyntheticEvaluation 進行生成。")

    # 進行資料取樣以生成評估資料集
    analyzer = DatasetAnalyzer(corpora_directory)
    analyzer.analyze_folder()
    analyzer.generate_report()
    sample = analyzer.get_stratified_sample(100)
    
    # 建立檔案路徑清單
    corpora_paths_list = [os.path.normpath(file_info['file_path']).replace('\\\\', '/') for file_info in sample]
    corpora_id_paths = {
        os.path.splitext(os.path.basename(path))[0]: path
        for path in corpora_paths_list
    }

    print(f"\n🎯 分層取樣完成，選擇了 {len(sample)} 個檔案。")
    print("將使用 SyntheticEvaluation 生成評估資料集並進行評估。")

    # 初始化 SyntheticEvaluation
    evaluation = SyntheticEvaluation(corpora_id_paths, queries_csv_path)

    # 生成評估資料集
    print(f"🚀 正在生成查詢與摘錄...")
    evaluation.generate_queries_and_excerpts(approximate_excerpts=True, num_rounds=1, queries_per_corpus=1)
    print(f"✔️ 查詢生成完成，已儲存至 '{queries_csv_path}'")
    #evaluation.repair_csv_references()
    #evaluation.validate_csv_integrity()

🚧 未偵測到評估資料集 CSV 檔案，將使用 SyntheticEvaluation 進行生成。
🔍 開始分析資料夾: ./chunking_evaluation/evaluation_framework/general_evaluation_data/corpora
📁 找到 1194 個檔案

📄 [1/1194] 處理: 113_2_人文學院_運算思維與程式設計[101005]_孟淑慧.html
📄 [11/1194] 處理: 113_2_國語文學系_兒童文學概論[122088]_陳昭吟.html
📄 [21/1194] 處理: 113_2_國語文學系_女性文學(二)[122103]_陳昭吟.html
📄 [31/1194] 處理: 113_2_國語文學系_書學通論[122104]_莊千慧.html
📄 [41/1194] 處理: 113_2_國語文學系_現代華文文學(二)[122124]_龔韻蘅.html
📊 已處理 50/1194 個檔案 (4.2%)
📄 [51/1194] 處理: 113_2_國語文學系_詞曲選(二)[122091]_林慧真.html
📄 [61/1194] 處理: 113_2_國語文學系_閱讀教學專題研究(二)[222004]_陳光明.html
📄 [71/1194] 處理: 113_2_國語文學系國語文教學與應用碩士班_民間文學專題研究(二)[122067]_林登順.html
📄 [81/1194] 處理: 113_2_師資培育中心_國民小學教學實習[107038]_黃振恭.html
📄 [91/1194] 處理: 113_2_師資培育中心_國民小學體育教材教法(雙語教學課程)[107052]_陳瑩璟.html
📊 已處理 100/1194 個檔案 (8.4%)
📄 [101/1194] 處理: 113_2_師資培育中心_教育行政[107011]_張正平.html
📄 [111/1194] 處理: 113_2_師資培育中心_綜合活動領域教材教法[107043]_陳志賢.html
📄 [121/1194] 處理: 113_2_師資培育中心_輔導原理與實務[107018]_鄧守娟.html
📄 [131/1194] 處理: 113_2_幼兒教育學系_幼兒園行政與法規[170045]_陳怡靖.html
📄 [141/1194] 處理: 1

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

根據 max_memory={0: '12GiB', 'cpu': '8GiB'} 預估裝置分佈中...
預估的裝置分佈: OrderedDict({'model.embed_tokens': 0, 'model.layers.0': 0, 'model.layers.1': 0, 'model.layers.2': 0, 'model.layers.3': 0, 'model.layers.4': 0, 'model.layers.5': 0, 'model.layers.6': 0, 'model.layers.7': 0, 'model.layers.8': 0, 'model.layers.9': 0, 'model.layers.10': 0, 'model.layers.11': 0, 'model.layers.12': 0, 'model.layers.13': 0, 'model.layers.14': 0, 'model.layers.15': 0, 'model.layers.16': 0, 'model.layers.17': 0, 'model.layers.18': 0, 'model.layers.19': 0, 'model.layers.20': 0, 'model.layers.21': 0, 'model.layers.22': 0, 'model.layers.23': 0, 'model.layers.24': 'cpu', 'model.layers.25': 'cpu', 'model.layers.26': 'cpu', 'model.layers.27': 'cpu', 'model.layers.28': 'cpu', 'model.layers.29': 'cpu', 'model.layers.30': 'cpu', 'model.layers.31': 'cpu', 'model.norm': 'cpu', 'model.rotary_emb': 'cpu', 'lm_head': 'cpu'})
正在載入 meta-llama/Llama-3.1-8B-Instruct 模型...


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Some parameters are on the meta device because they were offloaded to the cpu.


meta-llama/Llama-3.1-8B-Instruct 模型已成功載入到裝置: {'model.embed_tokens': 0, 'model.layers.0': 0, 'model.layers.1': 0, 'model.layers.2': 0, 'model.layers.3': 0, 'model.layers.4': 0, 'model.layers.5': 0, 'model.layers.6': 0, 'model.layers.7': 0, 'model.layers.8': 0, 'model.layers.9': 0, 'model.layers.10': 0, 'model.layers.11': 0, 'model.layers.12': 0, 'model.layers.13': 0, 'model.layers.14': 0, 'model.layers.15': 0, 'model.layers.16': 0, 'model.layers.17': 0, 'model.layers.18': 0, 'model.layers.19': 0, 'model.layers.20': 0, 'model.layers.21': 0, 'model.layers.22': 0, 'model.layers.23': 0, 'model.layers.24': 'cpu', 'model.layers.25': 'cpu', 'model.layers.26': 'cpu', 'model.layers.27': 'cpu', 'model.layers.28': 'cpu', 'model.layers.29': 'cpu', 'model.layers.30': 'cpu', 'model.layers.31': 'cpu', 'model.norm': 'cpu', 'model.rotary_emb': 'cpu', 'lm_head': 'cpu'}
🚀 正在生成查詢與摘錄...
讀取檔案失敗 113_2_通識教育中心_療癒書法[106138]_簡月娟: [Errno 2] No such file or directory: '113_2_通識教育中心_療癒書法[106138]_簡月娟'
文件 113_2_通識教育中心

In [11]:
# Apply filter to remove queries with poor excerpts
#evaluation.filter_poor_excerpts(threshold=0.36)

# Apply filter to remove duplicates
#evaluation.filter_duplicates(threshold=0.6)

### 4.4 Finished Initialization

In [12]:
print("\n🎉 評估環境初始化完成。")


🎉 評估環境初始化完成。


## 5. Run the evaluation

In [13]:
import torch
print(f"PyTorch 版本: {torch.__version__}")
print(f"CUDA 是否可用: {torch.cuda.is_available()}")
print(f"CUDA 裝置數量: {torch.cuda.device_count()}")
if torch.cuda.is_available():
    print(f"當前使用的 CUDA 裝置: {torch.cuda.current_device()}")
    print(f"CUDA 裝置名稱: {torch.cuda.get_device_name(0)}")

PyTorch 版本: 2.7.0+cu118
CUDA 是否可用: True
CUDA 裝置數量: 1
當前使用的 CUDA 裝置: 0
CUDA 裝置名稱: NVIDIA GeForce RTX 4070 Ti SUPER


In [14]:
import shutil
import pandas as pd
from IPython.display import display

# 刪除舊的 Chroma DB，以確保每次評估都是從頭開始
chroma_db_path = "./chunking_evaluation/evaluation_framework/general_evaluation_data/questions_db"
if os.path.exists(chroma_db_path):
    print(f"🔄 正在刪除舊的 Chroma DB: {chroma_db_path}")
    shutil.rmtree(chroma_db_path)

# ====== 將載入嵌入函式的程式碼移到迴圈外面 ======
print("🚀 準備載入嵌入模型，此過程僅執行一次...")
embedding_function = get_bge_m3_embedding_function()
print("✅ 嵌入模型載入完成！")
print("--------------------------------------------------")
# ======================================================

results = []

for chunker in chunkers:
    print(f"⚙️ 正在使用 {chunker.__class__.__name__} 進行評估...")

    # 執行評估時，直接使用已經載入好的 embedding_function
    result = evaluation.run(
        chunker,
        embedding_function,  # 這裡使用已經載入好的函式
        retrieve=5,
        db_to_save_chunks=chroma_db_path
    )

    # 清理結果並新增 chunker 資訊
    chunk_size = chunker._chunk_size if hasattr(chunker, '_chunk_size') else 0
    chunk_overlap = chunker._chunk_overlap if hasattr(chunker, '_chunk_overlap') else 0
    result['chunker'] = chunker.__class__.__name__ + f"_{chunk_size}_{chunk_overlap}"

    results.append(result)

    # 在每次迭代後，顯示當前 chunker 的結果
    current_df = pd.DataFrame([result])
    print("\n--- 當前 Chunking 策略結果 ---")
    display(current_df)
    print("----------------------------\n")

print("\n✅ 所有分塊器評估完成！")

# 顯示包含所有結果的總結 DataFrame
final_df = pd.DataFrame(results)
print("=== 最終總結報告 ===")
display(final_df)

🚀 準備載入嵌入模型，此過程僅執行一次...
正在載入 BGE-M3 模型到裝置: cuda
✅ 嵌入模型載入完成！
--------------------------------------------------
⚙️ 正在使用 RecursiveCharacterTextSplitter 進行評估...
Created collection:  BGE_M3_EmbeddingFunction_RecursiveCharacterTextSplitter_512_50


✅ 正在處理語料庫檔案並切割: 0it [00:00, ?it/s]


❌ 錯誤：沒有找到可處理的區塊。
Collection 'auto_chunk' does not exist, proceeding to create.
New collection 'auto_chunk' created.


✅ 正在處理語料庫檔案並切割: 0it [00:00, ?it/s]

❌ 錯誤：沒有找到可處理的區塊。
❌ 錯誤：無法獲取區塊中繼資料，評估將無法進行。

--- 當前 Chunking 策略結果 ---





Unnamed: 0,chunker
0,RecursiveCharacterTextSplitter_512_50


----------------------------

⚙️ 正在使用 RecursiveCharacterTextSplitter 進行評估...
Created collection:  BGE_M3_EmbeddingFunction_RecursiveCharacterTextSplitter_1024_50


✅ 正在處理語料庫檔案並切割: 0it [00:00, ?it/s]


❌ 錯誤：沒有找到可處理的區塊。
Existing collection 'auto_chunk' deleted.
New collection 'auto_chunk' created.


✅ 正在處理語料庫檔案並切割: 0it [00:00, ?it/s]

❌ 錯誤：沒有找到可處理的區塊。
❌ 錯誤：無法獲取區塊中繼資料，評估將無法進行。

--- 當前 Chunking 策略結果 ---





Unnamed: 0,chunker
0,RecursiveCharacterTextSplitter_1024_50


----------------------------


✅ 所有分塊器評估完成！
=== 最終總結報告 ===


Unnamed: 0,chunker
0,RecursiveCharacterTextSplitter_512_50
1,RecursiveCharacterTextSplitter_1024_50
