## 0. CUDA Test

In [None]:
!pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118

%pip install numpy

In [None]:
import torch

# Show the installed PyTorch version, then whether CUDA is usable from it.
print(torch.__version__, torch.cuda.is_available(), sep="\n")

import numpy

# Show the installed NumPy version.
print(numpy.__version__)

## 1. Setup

### 1.1 Install packages

In [None]:
# Install the chunking_evaluation package straight from its GitHub repository,
# then hf_xet and bitsandbytes, and upgrade transformers and chromadb to their
# latest releases. `--quiet` keeps pip's output short.
%pip install git+https://github.com/brandonstarxel/chunking_evaluation.git --quiet
%pip install hf_xet --quiet
%pip install bitsandbytes --quiet
%pip install --upgrade transformers --quiet
%pip install --upgrade chromadb --quiet

### 1.2 Import Required Modules

In [None]:
import os
import sys

# Absolute path of this notebook file. os.path.abspath resolves the relative
# name against the current working directory, so this assumes the kernel was
# started in the folder that contains the notebook.
notebook_path = os.path.abspath('chunking_exp.ipynb')

# Directory that contains the notebook (i.e. the project root).
project_root = os.path.dirname(notebook_path)

# Put the project root (which contains the local chunking_evaluation folder)
# at the very front of sys.path so the local copy is imported first.
# Existing occurrences are dropped beforehand so that re-running this cell
# does not pile up duplicate entries.
sys.path[:] = [entry for entry in sys.path if entry != project_root]
sys.path.insert(0, project_root)
print(f"已將專案根目錄 '{project_root}' 優先加入到 Python 模組搜尋路徑。")


from chunking_evaluation.chunking import FixedTokenChunker, RecursiveTokenChunker
from chunking_evaluation import GeneralEvaluation, SyntheticEvaluation
from chunking_evaluation.utils import bge_m3_token_count, get_bge_m3_embedding_function
from chunking_evaluation.evaluation_framework.general_evaluation_data.DatasetAnalyzer import DatasetAnalyzer
import pandas as pd
from IPython.display import display, clear_output
import http.client
import json


### 1.3 Set Up Embedding Function

## 2. Create Chunkers

### 2.1 RecursiveTokenChunker & FixedTokenChunker

In [None]:
# Chunk sizes (in tokens) to evaluate. Only 512 is enabled; add 1024, 2048
# and/or 4096 here to sweep the larger configurations as well.
# (These used to be "disabled" with a triple-quoted string placed inside the
# list literal, which silently added a plain `str` as the last list element.)
CHUNK_SIZES = [512]

# Chunk overlaps (in tokens) evaluated for every chunk size.
CHUNK_OVERLAPS = [50, 100, 150, 200]

# For each (size, overlap) pair: one RecursiveTokenChunker followed by one
# FixedTokenChunker, in the same order as the hand-written list had them.
chunkers = [
    chunker
    for size in CHUNK_SIZES
    for overlap in CHUNK_OVERLAPS
    for chunker in (
        RecursiveTokenChunker(chunk_size=size, chunk_overlap=overlap, length_function=bge_m3_token_count),
        FixedTokenChunker(chunk_size=size, chunk_overlap=overlap),
    )
]

## 3. Prepare Corpora

In [None]:
def download_text(book_id, file_name, directory):
    """Download one Project Gutenberg book and save it as a UTF-8 text file.

    The book is requested from ``/files/{book_id}/{book_id}-0.txt`` on
    www.gutenberg.org.

    Args:
        book_id: Gutenberg book id used to build the request path.
        file_name: Name of the file to create inside ``directory``.
        directory: Target folder; created if it does not exist.

    Returns:
        True when the server answered 200 and the file was written,
        False when any other status code was returned (nothing is written).

    Raises:
        Whatever ``http.client``, decoding or file I/O raise; these errors are
        not handled here.
    """
    conn = http.client.HTTPSConnection("www.gutenberg.org")
    url = f"/files/{book_id}/{book_id}-0.txt"

    # Always release the connection, including on errors and non-200 replies.
    try:
        conn.request("GET", url)
        response = conn.getresponse()

        if response.status != 200:
            print(f"Failed to download the book. Status code: {response.status}")
            return False

        text = response.read().decode('utf-8')
    finally:
        conn.close()

    # Create directory if it does not exist
    os.makedirs(directory, exist_ok=True)

    # Save the text to the specified file within the directory
    file_path = os.path.join(directory, file_name)
    with open(file_path, "w", encoding="utf-8") as file:
        file.write(text)
    print(f"Book '{file_name}' downloaded and saved successfully in '{directory}'.")
    return True


# Define the directory to save the books
directory = "corpora"

def download_example_texts(download=False, directory="./corpora"):
    """Optionally download the three example books into ``directory``.

    Args:
        download: When False (the default) nothing is downloaded and only a
            "skipped" message is printed.
        directory: Folder passed on to ``download_text`` for each book.

    A book is reported as successful only if ``download_text`` returned True;
    a False return or a raised exception is reported as a failure, and the
    remaining books are still attempted.
    """
    if not download:
        print("跳過下載範例文本。")
        return

    books = {
        1661: "the_adventures_of_sherlock_holmes.txt",
        1342: "pride_and_prejudice.txt",
        174: "the_picture_of_dorian_gray.txt"
    }
    print("開始下載範例文本...")

    for book_id, file_name in books.items():
        try:
            succeeded = download_text(book_id, file_name, directory)
        except Exception as e:
            print(f"❌ 下載失敗 {file_name}: {e}")
            continue

        if succeeded:
            print(f"✅ 成功下載: {file_name}")
        else:
            # download_text has already printed the HTTP status code.
            print(f"❌ 下載失敗 {file_name}")

    print("範例文本下載完成！")

download_example_texts(download=False)

## 4. Initialize the Evaluation Environment

In [None]:
import os
import glob
import pandas as pd
from typing import List, Dict

# Folder holding the corpus files that can be sampled
corpora_directory = "./chunking_evaluation/evaluation_framework/general_evaluation_data/corpora"
# Backup CSV of earlier generated queries; its presence decides which branch runs below
queries_backup_path = "./chunking_evaluation/evaluation_framework/general_evaluation_data/generated_queries_and_excerpts_backup.csv"
# CSV that the generated queries are saved to
queries_csv_path = "./chunking_evaluation/evaluation_framework/general_evaluation_data/generated_queries_and_excerpts.csv"


def _as_forward_slash_path(raw_path):
    """Normalize ``raw_path`` and replace every backslash with a forward slash."""
    return os.path.normpath(raw_path).replace('\\', '/')


corpora_paths_list = []

if not os.path.exists(queries_backup_path):
    # No backup yet: analyze the corpus folder and draw a stratified sample.
    print(f"🚧 Backup file '{queries_backup_path}' not found, starting sampling.")

    analyzer = DatasetAnalyzer(corpora_directory)
    analyzer.analyze_folder()
    analyzer.generate_report()
    sample = analyzer.get_stratified_sample(100)

    # Keep only the (standardized) file path of every sampled entry.
    corpora_paths_list = [_as_forward_slash_path(file_info['file_path']) for file_info in sample]

    print(f"\n🎯 Stratified sampling complete, selected {len(sample)} files:")
    for i, file_info in enumerate(sample, 1):
        print(f"  {i:2d}. {file_info['category']} - {file_info['filename']}")
else:
    # Backup present: reuse exactly the corpora listed in its 'corpus_id' column.
    print(f"✅ Backup file '{queries_backup_path}' already exists.")
    print("📖 Reading file paths from the backup file.")

    backup_df = pd.read_csv(queries_backup_path)
    corpora_paths_list = list(map(_as_forward_slash_path, backup_df['corpus_id'].unique()))

# Summary shown regardless of which branch produced the list.
print("\n✅ The file paths to be processed have been saved in corpora_paths_list")
print(f"📁 The generated queries will be saved to: {queries_csv_path}")

## 5. Generate Queries and Excerpts

### 5.1 Memory Management

In [None]:
# Upgrade ipywidgets and install accelerate into the kernel's environment;
# `--quiet` keeps pip's output short.
%pip install --upgrade ipywidgets --quiet
%pip install accelerate --quiet

In [None]:
print("corpora_paths_list:", corpora_paths_list)

# Map each corpus id (the file name without its extension) to the file's path.
corpora_id_paths = {}
for corpus_path in corpora_paths_list:
    corpus_id, _extension = os.path.splitext(os.path.basename(corpus_path))
    corpora_id_paths[corpus_id] = corpus_path

print("Corpora ID Paths:", corpora_id_paths)

# Create the evaluation object from the id -> path mapping and the queries CSV path
evaluation = SyntheticEvaluation(corpora_id_paths, queries_csv_path)



### 5.2 Generate dataset

In [None]:
# Generate queries and excerpts only when no backup CSV exists yet; the
# generated data is saved to CSV and then repaired and validated.
if not os.path.exists(queries_backup_path):
    print("🚀 正在生成查詢與摘錄...")
    evaluation.generate_queries_and_excerpts(
        approximate_excerpts=True,
        num_rounds=1,
        queries_per_corpus=1,
    )
    print(f"✔️ 查詢生成完成，已儲存至 '{queries_csv_path}'")
    evaluation.repair_csv_references()
    evaluation.validate_csv_integrity()
else:
    print(f"✅ 備份檔案 '{queries_backup_path}' 已存在，跳過查詢生成步驟。")
#evaluation.debug_full_output("./corpora/113_2_資訊工程學系_微積分(二)[159040]_陳榮銘.html", use_approx=True, save_to_file=True)


In [None]:
# Apply filter to remove queries with poor excerpts
#evaluation.filter_poor_excerpts(threshold=0.36)

# Apply filter to remove duplicates
#evaluation.filter_duplicates(threshold=0.6)

## 6. Run the evaluation

In [None]:
import shutil

# Path handed to evaluation.run as `db_to_save_chunks`
chroma_db_path = "./chunking_evaluation/evaluation_framework/general_evaluation_data/questions_db"


# Run the evaluation for the first configured chunker only, retrieving 5
# results per query with the BGE-M3 embedding function, and print the result.
first_chunker = chunkers[0]
first_result = evaluation.run(first_chunker, get_bge_m3_embedding_function(), retrieve=5, db_to_save_chunks=chroma_db_path)
print(first_result)


# --- Disabled: evaluate every chunker and show a live results table ---------
# Kept as real comments instead of a triple-quoted string: a bare string
# literal is an expression, and as the last expression of the cell Jupyter
# would echo the whole block as the cell output.
#
# # Initialize evaluation
# evaluation = GeneralEvaluation()
#
# results = []
#
# # Initialize an empty DataFrame
# df = pd.DataFrame()
#
# # Display the DataFrame
# display_handle = display(df, display_id=True)
#
# for chunker in chunkers:
#     result = evaluation.run(chunker, get_bge_m3_embedding_function(), retrieve=5, db_to_save_chunks=chroma_db_path)
#     del result['corpora_scores']
#     chunk_size = chunker._chunk_size if hasattr(chunker, '_chunk_size') else 0
#     chunk_overlap = chunker._chunk_overlap if hasattr(chunker, '_chunk_overlap') else 0
#     result['chunker'] = chunker.__class__.__name__ + f"_{chunk_size}_{chunk_overlap}"
#     results.append(result)
#
#     # Update the DataFrame
#     df = pd.DataFrame(results)
#     clear_output(wait=True)
#     display_handle.update(df)