In [4]:
import re
import uuid
import logging
import json
import os

# 1. Configure logging.
# In Jupyter, stream=sys.stdout makes log records show up in the cell's output
# area; force=True replaces any handlers already attached to the root logger,
# so re-running this cell does not stack duplicate handlers.
import sys
logging.basicConfig(
    stream=sys.stdout,
    format='%(levelname)s: %(message)s',
    level=logging.INFO,
    force=True,
)

# 2. Load book.md into memory.
file_path = 'book.md'

if os.path.exists(file_path):
    with open(file_path, mode='r', encoding='utf-8') as book_file:
        book_content = book_file.read()
    logging.info(f"成功读取文件: {file_path}, 总字符数: {len(book_content)}")
else:
    # book_content is deliberately left unbound here; the chunking cell checks
    # for its existence before running.
    logging.error(f"找不到文件: {file_path}，请确认文件名和路径是否正确。")

INFO: 成功读取文件: book.md, 总字符数: 86910


In [5]:
def debug_book_chunking(md_text, max_para_chars=3000):
    """Split Markdown into paragraph-level child chunks tagged with their parent section.

    The text is first cut into sections at every line that starts with one or
    more ``#`` followed by a space. Each section is then cut into paragraphs on
    blank lines. Every non-heading paragraph becomes one chunk whose metadata
    carries the full text of the section it came from.

    Args:
        md_text: The Markdown document as a single string.
        max_para_chars: Paragraphs longer than this many characters trigger a
            warning (with a 100-character preview). They are still emitted.

    Returns:
        A list of dicts, each with:
            ``chunk_index``     - position of the chunk in the returned list,
            ``child_paragraph`` - the paragraph text,
            ``metadata``        - ``{"parent_section": <full section text>}``.
    """
    # Split on a newline that is followed by a heading marker. The lookahead
    # keeps the heading line itself at the start of the section it introduces.
    sections = re.split(r'\n(?=#+ )', md_text)

    final_chunks = []
    logging.info(f"--- 调试开始：共检测到 {len(sections)} 个章节 ---")

    for i, section in enumerate(sections):
        section = section.strip()
        if not section:
            continue

        # The parent context is the complete raw text of this section.
        parent_context = section

        # Paragraphs are separated by a blank line (double newline).
        paragraphs = [p.strip() for p in section.split('\n\n') if p.strip()]

        # Log a short overview of the section (first line, capped at 50 chars).
        title_line = section.split('\n')[0][:50]
        logging.info(f"正在处理章节 [{i+1}]: {title_line}... (包含 {len(paragraphs)} 段)")

        for p_idx, para in enumerate(paragraphs):
            # Heading lines are not embedded as children. Markdown allows body
            # text to follow a heading without a blank line, in which case the
            # "paragraph" is heading + body: drop only the leading '#' line(s)
            # and keep whatever body text remains instead of discarding it all.
            while para.startswith('#'):
                para = para.partition('\n')[2].strip()
            if not para:
                continue

            # Length check: per the original author's note, oversized
            # paragraphs are what triggered the 413 errors being debugged.
            char_len = len(para)
            if char_len > max_para_chars:
                logging.warning(f"  ⚠️ 发现超长段落! [第 {i+1} 章, 第 {p_idx} 段] 长度: {char_len} 字符")
                # Preview the first 100 characters to help locate the paragraph.
                logging.warning(f"     预览: {para[:100]}...")

            final_chunks.append({
                "chunk_index": len(final_chunks),
                "child_paragraph": para,
                "metadata": {
                    # Full section text, meant to be handed to the LLM after
                    # the child paragraph is retrieved.
                    "parent_section": parent_context
                }
            })

    logging.info(f"--- 调试完成：最终生成 {len(final_chunks)} 个有效分块 ---")
    return final_chunks

# Run the debug pass only when the read cell actually defined book_content.
if 'book_content' in locals():
    all_chunks = debug_book_chunking(book_content)
else:
    # Always bind all_chunks: later cells test `if all_chunks:` and would
    # otherwise raise NameError (or silently reuse a stale list left over
    # from an earlier run of this kernel).
    logging.warning("未找到 book_content，已跳过分块，all_chunks 置为空列表。")
    all_chunks = []

INFO: --- 调试开始：共检测到 79 个章节 ---
INFO: 正在处理章节 [1]: # 5... (包含 1 段)
INFO: 正在处理章节 [2]: # Requirements and Architecture for AI Pipelines... (包含 6 段)
INFO: 正在处理章节 [3]: # Development pipelines... (包含 14 段)
INFO: 正在处理章节 [4]: # Data store requirements... (包含 2 段)
INFO: 正在处理章节 [5]: # Data volume and velocity... (包含 2 段)
INFO: 正在处理章节 [6]: # Data formats and processing approaches... (包含 2 段)
INFO: 正在处理章节 [7]: # Timeliness and technology selection... (包含 2 段)
INFO: 正在处理章节 [8]: # Non-functional requirements and governance... (包含 2 段)
INFO: 正在处理章节 [9]: # Support operations and specialized stores... (包含 5 段)
INFO: 正在处理章节 [10]: # Algorithmic development components... (包含 2 段)
INFO: 正在处理章节 [11]: # Data quality checks... (包含 3 段)
INFO: 正在处理章节 [12]: # Data transforms... (包含 3 段)
INFO: 正在处理章节 [13]: # Data summary... (包含 3 段)
INFO: 正在处理章节 [14]: # Model building, tuning, and verification... (包含 6 段)
INFO: 正在处理章节 [15]: # Configuration control... (包含 2 段)
INFO: 正在处理章节 [16]: # Machine learning performance... (包

In [7]:
# Inspect the detailed structure of a single chunk (the one at test_idx).
if all_chunks:
    test_idx = 0
    preview_chunk = all_chunks[test_idx]
    # Assemble the report first, then emit it with one print call; the text
    # written to stdout is the same as printing each line separately.
    report_lines = [
        "\n" + "=" * 60,
        f"【分块验证 - Index {test_idx}】",
        f"Child (去 Embedding 的文本):\n{preview_chunk['child_paragraph']}",
        "-" * 30,
        # Only the first 200 characters of the parent section are shown.
        f"Parent Metadata (喂给 LLM 的上下文 - 截取前200字):\n{preview_chunk['metadata']['parent_section'][:200]}...",
        "=" * 60,
    ]
    print("\n".join(report_lines))


【分块验证 - Index 0】
Child (去 Embedding 的文本):
Machine learning model development fundamentally differs from traditional software engineering in its experimental and iterative nature. While software engineers typically design systems based on well-defined specifications, data scientists must navigate the inherent uncertainties of data characteristics, feature relevance, and model behavior. This necessitates a systematic yet flexible approach to model creation, optimization, and validation that accommodates the unique challenges of AI development.
------------------------------
Parent Metadata (喂给 LLM 的上下文 - 截取前200字):
# Requirements and Architecture for AI Pipelines

Machine learning model development fundamentally differs from traditional software engineering in its experimental and iterative nature. While softwar...


In [8]:
output_filename = "debug_chunks.json"

try:
    # indent=4 pretty-prints the JSON so it is easy to read by eye;
    # ensure_ascii=False writes non-ASCII characters as-is rather than as
    # \u escape sequences.
    with open(output_filename, mode="w", encoding="utf-8") as json_file:
        json.dump(all_chunks, json_file, ensure_ascii=False, indent=4)

    logging.info(f"✅ 调试结果已保存至: {os.path.abspath(output_filename)}")
    logging.info(f"您可以现在打开该文件查看 'child_paragraph' 与 'parent_section' 的映射关系。")
except Exception as err:
    # Best-effort save: report the failure in the log instead of stopping
    # the notebook.
    logging.error(f"❌ 保存 JSON 失败: {err}")

INFO: ✅ 调试结果已保存至: c:\Users\RONGZHEN CHEN\Desktop\Projects\multimodual-rag\rag_project\debug_chunks.json
INFO: 您可以现在打开该文件查看 'child_paragraph' 与 'parent_section' 的映射关系。


In [9]:
import hashlib
import json
import re
import logging

def get_hash(text):
    """Return the MD5 hex digest of ``text`` (encoded as UTF-8).

    The digest is used as an identifier for a section of text; identical
    input strings always produce the same 32-character hexadecimal string.
    """
    hasher = hashlib.md5()
    hasher.update(text.encode('utf-8'))
    return hasher.hexdigest()

def optimize_chunking_with_hashing(md_text):
    """Chunk Markdown into child paragraphs that reference their section by hash.

    The text is split into sections at every line starting with one or more
    ``#`` followed by a space, and each section into paragraphs on blank lines.
    Instead of copying the full section text into every chunk, each chunk
    stores only the section's hash; the section text is kept once in a
    separate lookup map.

    Args:
        md_text: The Markdown document as a single string.

    Returns:
        A ``(vector_data, parent_map)`` tuple where
            ``vector_data`` is a list of
                ``{"child_text": <paragraph>, "parent_hash": <section hash>}``
                (intended for vector-store ingestion per the original notes),
            ``parent_map`` maps each section hash to the full section text.
            Sections with identical text share a single entry.
    """
    # The lookahead keeps each heading line at the start of its own section.
    sections = re.split(r'\n(?=#+ )', md_text)

    vector_data = []      # Child records (small): paragraph + parent hash
    parent_map = {}       # Lookup table: section hash -> full section text

    for section in sections:
        section = section.strip()
        if not section:
            continue

        # 1. Hash the section and register its full text once.
        section_hash = get_hash(section)
        parent_map[section_hash] = section

        # 2. Split into paragraphs on blank lines.
        for para in section.split('\n\n'):
            para = para.strip()

            # Headings are not stored as children. Markdown allows body text
            # to follow a heading without a blank line; in that case drop only
            # the leading '#' line(s) and keep the body rather than losing it.
            while para.startswith('#'):
                para = para.partition('\n')[2].strip()
            if not para:
                continue

            vector_data.append({
                "child_text": para,
                "parent_hash": section_hash  # Only the hash is stored here.
            })

    return vector_data, parent_map

# --- Run the optimization ---
with open('book.md', mode='r', encoding='utf-8') as book_file:
    content = book_file.read()

vector_list, doc_map = optimize_chunking_with_hashing(content)

# Write both artifacts with the same JSON settings, in this order:
#   vector_ingest.json - the child records (small),
#   parent_store.json  - the hash -> section lookup (the "database").
for json_path, payload in (
    ('vector_ingest.json', vector_list),
    ('parent_store.json', doc_map),
):
    with open(json_path, mode='w', encoding='utf-8') as out_file:
        json.dump(payload, out_file, indent=4, ensure_ascii=False)

print(f"Vector items: {len(vector_list)}")
print(f"Unique Parent sections: {len(doc_map)}")

Vector items: 223
Unique Parent sections: 79
