## 基础配置

In [23]:
# 从.env读取API Token，获取思源笔记的笔记本列表
import requests
import json
from pathlib import Path
from dotenv import load_dotenv
import os

# Pull the variables declared in the .env file into the process environment.
load_dotenv()

# SiYuan API settings: the URL falls back to the local address when unset,
# while the token has no fallback and is None when missing.
SIYUAN_API_URL = os.environ.get("SIYUAN_API_URL", "http://127.0.0.1:6806")
SIYUAN_API_TOKEN = os.environ.get("SIYUAN_API_TOKEN")

# Headers sent with every API request.
headers = {"Authorization": f"Token {SIYUAN_API_TOKEN}"}
headers["Content-Type"] = "application/json"

def call_siyuan_api(api_path, payload=None):
    """
    POST a JSON payload to a SiYuan API endpoint and unwrap the response.

    Every failure (transport error, non-zero ``code`` in the response, or
    any other exception) is reported on stdout and turned into ``None``.

    :param api_path: API path appended to SIYUAN_API_URL
    :param payload: request body (optional); an empty dict is sent when omitted
    :return: the ``data`` field of the response, or None on any failure
    """
    body = {} if payload is None else payload

    try:
        reply = requests.post(
            f"{SIYUAN_API_URL}{api_path}",
            json=body,
            headers=headers,
            timeout=30,
        )
        reply.raise_for_status()

        json_response = reply.json()
        if json_response.get("code") == 0:
            return json_response.get("data")

        # The server answered, but flagged the call as failed.
        print(f"API调用错误: {json_response.get('msg')}")
    except requests.exceptions.RequestException as e:
        print(f"请求失败: {e}")
    except Exception as e:
        print(f"未知错误: {e}")

    return None


## 获取笔记本列表

In [24]:

def get_notebooks():
    """
    Fetch the notebook list from the API and print a summary of each notebook.

    :return: the list of notebook dicts from the response, or None when the
             API token is not configured, the request fails, or the response
             has no "notebooks" entry
    """
    print("正在获取笔记本列表...")

    # Refuse to call the API without a token.
    if not SIYUAN_API_TOKEN:
        print("错误: 未找到SIYUAN_API_TOKEN，请检查.env文件")
        return None

    # Show only the head and tail of a long token. A short token is hidden
    # completely: echoing its first 10 characters plus a tail would expose
    # most or all of the secret.
    if len(SIYUAN_API_TOKEN) > 14:
        masked_token = f"{SIYUAN_API_TOKEN[:10]}...{SIYUAN_API_TOKEN[-4:]}"
    else:
        masked_token = "***"

    print(f"API地址: {SIYUAN_API_URL}")
    print(f"API Token: {masked_token}")
    print()

    data = call_siyuan_api("/api/notebook/lsNotebooks")

    if data and "notebooks" in data:
        notebooks = data["notebooks"]
        print(f"成功获取到 {len(notebooks)} 个笔记本:")
        print("-" * 60)

        for notebook in notebooks:
            status = "已关闭" if notebook.get("closed", False) else "已打开"
            print(f"📖 {notebook['name']}")
            print(f"   ID: {notebook['id']}")
            print(f"   状态: {status}")
            print(f"   排序: {notebook.get('sort', 0)}")
            print("-" * 60)

        return notebooks

    print("获取笔记本列表失败")
    return None

# Fetch the notebook list. `notebooks` is None when get_notebooks() reports a
# missing token or a failed request, so later cells must not assume a list.
notebooks = get_notebooks()


正在获取笔记本列表...
API地址: http://127.0.0.1:6806
API Token: 6bc792eg8b...x1p3

成功获取到 2 个笔记本:
------------------------------------------------------------
📖 QuincyZou
   ID: 20250614224007-27h8mrb
   状态: 已打开
   排序: 0
------------------------------------------------------------
📖 剪藏笔记本
   ID: 20250618232716-bvztpaf
   状态: 已打开
   排序: 0
------------------------------------------------------------


## 获取笔记本包含的文档列表

In [30]:
# List the documents inside one notebook.
# NOTE(review): despite the name, index 1 selects the SECOND notebook in the
# list; this line raises if `notebooks` is None or has fewer than two entries.
first_notebook = notebooks[1]

# Configuration: maximum depth of the document tree to process (-1 = unlimited)
MAX_DEPTH = 3  # change this value to control the depth; -1 processes every level

print(f"正在获取笔记本 '{first_notebook['name']}' 下的文档列表...")
print(f"笔记本ID: {first_notebook['id']}")
print(f"最大层级深度: {'无限制' if MAX_DEPTH == -1 else MAX_DEPTH}")
print("-" * 60)

# Ask the API for the document tree, starting at the notebook root
doc_tree_data = call_siyuan_api("/api/filetree/listDocTree", {
    "notebook": first_notebook['id'],
    "path": "/"
})

# A falsy response or one without a "tree" key falls through to the SQL
# fallback in the `else` branch at the end of this cell.
if doc_tree_data and "tree" in doc_tree_data:
    tree = doc_tree_data["tree"]
    print(f"找到 {len(tree)} 个根文档/文件夹:")
    print()

    # Collects one info dict per document; filled by print_doc_tree below
    all_docs_info = []

    # 首先收集指定深度内的所有文档ID
    def collect_doc_ids_with_depth(nodes, current_depth=0):
        """Gather the IDs of all nodes reachable within the MAX_DEPTH limit.

        :param nodes: list of tree nodes at the current level
        :param current_depth: depth of ``nodes``; 0 for the root level
        :return: flat list of non-empty IDs, each parent before its children
        """
        gathered = []
        # Whether children of this level may be visited is the same for every node.
        may_descend = MAX_DEPTH == -1 or current_depth < MAX_DEPTH

        for entry in nodes:
            entry_id = entry.get("id", "")
            if entry_id:
                gathered.append(entry_id)

            if may_descend and "children" in entry:
                gathered += collect_doc_ids_with_depth(entry["children"], current_depth + 1)

        return gathered

    print("正在收集指定深度内的文档ID...")
    all_doc_ids = collect_doc_ids_with_depth(tree)
    print(f"收集到 {len(all_doc_ids)} 个文档ID")

    # Resolve the human-readable path of every collected document.
    # This issues one API request per document ID.
    print("正在批量获取文档路径...")
    doc_paths = {}

    for doc_id in all_doc_ids:
        result = call_siyuan_api("/api/filetree/getHPathByID", {"id": doc_id})
        if result:
            doc_paths[doc_id] = result
        else:
            # Failed lookups still get an entry, holding a placeholder path
            doc_paths[doc_id] = "未知路径"

    # Because failures are stored too, this count includes placeholder entries
    print(f"成功获取 {len(doc_paths)} 个文档的路径")

    def parse_doc_name_from_path(hpath):
        """Return the last segment of a human-readable path as the document name.

        Falls back to "未知文档" when the path is empty, is the "未知路径"
        placeholder, or has no non-empty final segment.
        """
        if hpath and hpath != "未知路径":
            # Everything after the last '/' (ignoring leading/trailing slashes).
            leaf = hpath.strip('/').rpartition('/')[2]
            if leaf:
                return leaf

        return "未知文档"

    def get_doc_info(doc_id):
        """Look up a document's path in ``doc_paths`` and derive its name.

        :param doc_id: document ID
        :return: ``(name, hpath)``; hpath is "未知路径" for an unknown ID
        """
        path = doc_paths.get(doc_id, "未知路径")
        return parse_doc_name_from_path(path), path

    def print_doc_tree(nodes, level=0):
        """Print the document tree recursively and record every printed node.

        :param nodes: list of tree nodes at the current level
        :param level: depth of ``nodes``; 0 for the root level

        Each printed node is also appended to ``all_docs_info``. Levels deeper
        than MAX_DEPTH are skipped unless MAX_DEPTH is -1.
        """
        unlimited = MAX_DEPTH == -1
        if not unlimited and level > MAX_DEPTH:
            return

        # The indentation depends only on the level, so compute it once per call.
        if level == 0:
            indent, tree_symbol = "", ""
        else:
            indent, tree_symbol = "│   " * (level - 1), "├── "

        for node in nodes:
            doc_id = node.get("id", "")
            doc_title, doc_hpath = get_doc_info(doc_id)

            # A node with a non-empty "children" list is shown as a folder.
            has_children = "children" in node and len(node["children"]) > 0
            folder_icon = "📄" if not has_children else "📁"

            print(f"{indent}{tree_symbol}{folder_icon} {doc_title}")
            print(f"{indent}{'    ' if level > 0 else ''}   📍 ID: {doc_id}")
            print(f"{indent}{'    ' if level > 0 else ''}   📂 路径: {doc_hpath}")

            # The parent path is everything before the last path segment.
            if doc_hpath == "未知路径":
                parent_path = "root"
            else:
                parent_path = doc_hpath.strip('/').rpartition('/')[0]

            all_docs_info.append({
                "id": doc_id,
                "name": doc_title,
                "level": level,
                "has_children": has_children,
                "hpath": doc_hpath,
                "parent_path": parent_path,
            })

            # Descend only while the depth limit allows it.
            if has_children and (unlimited or level < MAX_DEPTH):
                print_doc_tree(node["children"], level + 1)

            print()

    # Print the tree; as a side effect this call fills all_docs_info.
    print_doc_tree(tree)

    # 统计文档数量
    def count_docs(nodes):
        """Count every node in the tree, descendants included (no depth limit)."""
        return sum(
            1 + (count_docs(node["children"]) if "children" in node else 0)
            for node in nodes
        )

    # total_docs covers the whole tree, while all_docs_info only holds the
    # nodes print_doc_tree visited, so the two differ when MAX_DEPTH cuts the tree.
    total_docs = count_docs(tree)
    print(f"总计文档数量: {total_docs}")
    print(f"记录的文档信息数量: {len(all_docs_info)}")

    # Print how many recorded documents sit on each level
    print("\n=== 文档统计信息 ===")
    level_counts = {}
    for doc in all_docs_info:
        level = doc['level']
        level_counts[level] = level_counts.get(level, 0) + 1

    for level in sorted(level_counts.keys()):
        print(f"第{level}层文档: {level_counts[level]}个")

    # Print the name of every recorded document, indented by its level
    print("\n=== 所有文档名称列表 ===")
    for i, doc in enumerate(all_docs_info, 1):
        indent = "  " * doc['level']
        print(f"{i:3d}. {indent}{doc['name']}")

    # Save the results as files under ./output
    # NOTE(review): the notebook name is used verbatim in the file names below;
    # a name containing a path separator would presumably break them — confirm.
    output_dir = Path("output")
    output_dir.mkdir(exist_ok=True)

    # Save the raw API response
    output_file = output_dir / f"doc_tree_{first_notebook['name']}.json"
    with output_file.open('w', encoding='utf-8') as f:
        json.dump(doc_tree_data, f, ensure_ascii=False, indent=2)

    # Save the collected per-document info
    docs_info_file = output_dir / f"docs_info_{first_notebook['name']}.json"
    with docs_info_file.open('w', encoding='utf-8') as f:
        json.dump(all_docs_info, f, ensure_ascii=False, indent=2)

    # Target file for the text rendering of the tree (written further below)
    visual_file = output_dir / f"visual_tree_{first_notebook['name']}.txt"

    def save_visual_tree(nodes, f, level=0):
        """Write the document tree recursively to an open text file.

        :param nodes: list of tree nodes at the current level
        :param f: writable text file object
        :param level: depth of ``nodes``; 0 for the root level

        Levels deeper than MAX_DEPTH are skipped unless MAX_DEPTH is -1.
        """
        unlimited = MAX_DEPTH == -1
        if not unlimited and level > MAX_DEPTH:
            return

        # The indentation depends only on the level, so compute it once per call.
        if level == 0:
            indent, tree_symbol = "", ""
        else:
            indent, tree_symbol = "│   " * (level - 1), "├── "

        for node in nodes:
            doc_id = node.get("id", "")
            doc_title, doc_hpath = get_doc_info(doc_id)

            # A node with a non-empty "children" list is shown as a folder.
            has_children = "children" in node and len(node["children"]) > 0
            folder_icon = "📄" if not has_children else "📁"

            f.write(f"{indent}{tree_symbol}{folder_icon} {doc_title}\n")
            f.write(f"{indent}{'    ' if level > 0 else ''}   📍 ID: {doc_id}\n")
            f.write(f"{indent}{'    ' if level > 0 else ''}   📂 路径: {doc_hpath}\n")

            # Descend only while the depth limit allows it.
            if has_children and (unlimited or level < MAX_DEPTH):
                save_visual_tree(node["children"], f, level + 1)

            f.write("\n")

    # doc_paths stores a "未知路径" placeholder for every failed lookup, so its
    # length never reflects failures; count the real successes instead.
    resolved_path_count = sum(
        1 for doc_id in all_doc_ids
        if doc_paths.get(doc_id, "未知路径") != "未知路径"
    )

    # Write the text rendering of the tree, preceded by a short header
    with visual_file.open('w', encoding='utf-8') as f:
        f.write(f"笔记本: {first_notebook['name']}\n")
        f.write(f"笔记本ID: {first_notebook['id']}\n")
        f.write(f"最大层级深度: {'无限制' if MAX_DEPTH == -1 else MAX_DEPTH}\n")
        f.write(f"总文档数: {len(all_docs_info)}\n")
        f.write(f"成功获取路径数: {resolved_path_count}\n")
        f.write("=" * 60 + "\n\n")

        save_visual_tree(tree, f)

    # Save the document-name list in the same layout that is printed above.
    # The summary at the end has always announced this file, but nothing wrote
    # it and `names_file` was undefined, so the cell ended with a NameError.
    names_file = output_dir / f"doc_names_{first_notebook['name']}.txt"
    with names_file.open('w', encoding='utf-8') as f:
        for i, doc in enumerate(all_docs_info, 1):
            indent = "  " * doc['level']
            f.write(f"{i:3d}. {indent}{doc['name']}\n")

    # Save the document-ID -> path mapping (placeholders included)
    paths_file = output_dir / f"doc_paths_{first_notebook['name']}.json"
    with paths_file.open('w', encoding='utf-8') as f:
        json.dump(doc_paths, f, ensure_ascii=False, indent=2)

    # Path-resolution statistics
    print("\n=== 路径解析统计 ===")
    print(f"总文档数: {len(all_doc_ids)}")
    print(f"成功获取路径的文档数: {resolved_path_count}")
    print(f"未获取到路径的文档数: {len(all_doc_ids) - resolved_path_count}")

    # Distribution of recorded documents over tree levels
    depth_distribution = {}
    for doc in all_docs_info:
        depth = doc['level']
        depth_distribution[depth] = depth_distribution.get(depth, 0) + 1

    print("\n=== 层级深度分布 ===")
    for depth in sorted(depth_distribution.keys()):
        print(f"第{depth}层: {depth_distribution[depth]}个文档")

    print(f"\n文档树结构已保存到: {output_file}")
    print(f"文档信息已保存到: {docs_info_file}")
    print(f"可视化文档树已保存到: {visual_file}")
    print(f"文档名称列表已保存到: {names_file}")
    print(f"路径解析结果已保存到: {paths_file}")

else:
    # Reached when the tree request returned nothing usable.
    print("获取文档树失败，尝试使用SQL查询...")

    # Fallback: list the notebook's document blocks (type 'd') through the SQL API.
    # The notebook ID is interpolated straight into the statement text.
    sql_query = f"SELECT id, content, hpath FROM blocks WHERE box = '{first_notebook['id']}' AND type = 'd' ORDER BY created"

    sql_result = call_siyuan_api("/api/query/sql", {"stmt": sql_query})

    if sql_result:
        print(f"通过SQL查询找到 {len(sql_result)} 个文档:")
        print("-" * 60)

        for doc in sql_result:
            print(f"📄 {doc.get('content', '无标题')}")
            print(f"   ID: {doc.get('id', '')}")
            print(f"   路径: {doc.get('hpath', '')}")
            print("-" * 40)

        # Save the SQL result under ./output
        output_dir = Path("output")
        output_dir.mkdir(exist_ok=True)
        sql_output_file = output_dir / f"docs_sql_{first_notebook['name']}.json"
        with sql_output_file.open('w', encoding='utf-8') as f:
            json.dump(sql_result, f, ensure_ascii=False, indent=2)

        print(f"SQL查询结果已保存到: {sql_output_file}")
    else:
        # An empty result and a failed request both end up here
        print("SQL查询也失败了")


正在获取笔记本 '剪藏笔记本' 下的文档列表...
笔记本ID: 20250618232716-bvztpaf
最大层级深度: 3
------------------------------------------------------------
找到 3 个根文档/文件夹:

正在收集指定深度内的文档ID...
收集到 6945 个文档ID
正在批量获取文档路径...
成功获取 6945 个文档的路径
📁 Web收集箱
   📍 ID: 20250614230655-rw72vei
   📂 路径: /Web收集箱
├── 📄 PromptPilot：提示词优化工程终结者？
       📍 ID: 20250615144650-fneylg8
       📂 路径: /Web收集箱/PromptPilot：提示词优化工程终结者？

├── 📁 思源笔记使用技巧
       📍 ID: 20250621071829-zjnz7iu
       📂 路径: /Web收集箱/思源笔记使用技巧
│   ├── 📄 想省大钱？思源笔记第三方同步 S3 手把手教程（使用七牛云对象存储 Kodo）（2024.4.25）
│          📍 ID: 20250615083750-vl9nn96
│          📂 路径: /Web收集箱/思源笔记使用技巧/想省大钱？思源笔记第三方同步 S3 手把手教程（使用七牛云对象存储 Kodo）（2024.4.25）

│   ├── 📄 SQL 小助手
│          📍 ID: 20250615093833-odybz7u
│          📂 路径: /Web收集箱/思源笔记使用技巧/SQL 小助手

│   ├── 📄 阶梯式动态数据库：SuperRef 与动态数据库应用初探
│          📍 ID: 20250615094744-4laa9uh
│          📂 路径: /Web收集箱/思源笔记使用技巧/阶梯式动态数据库：SuperRef 与动态数据库应用初探

│   ├── 📄 插件开发 Quick Start
│          📍 ID: 20250615094808-r6lumo0
│          📂 路径: /Web收集箱/思源笔记使用技巧/插件开发 Quick

## 获取文档的属性以及包含的块及内容

In [32]:
# 获取文档的属性以及包含的块及内容

def get_document_blocks(doc_id):
    """
    Fetch all blocks that belong to one document.

    :param doc_id: document ID
    :return: list of block rows; an empty list when nothing was found or the
             request failed
    """
    print(f"正在获取文档 {doc_id} 的所有块...")

    # The statement is sent as one plain string, so the ID ends up inside a
    # quoted SQL literal. Doubling single quotes keeps a stray quote in the
    # ID from terminating that literal early.
    safe_doc_id = str(doc_id).replace("'", "''")

    # Select every block whose root is the document
    sql_query = f"""
    SELECT id, type, subtype, content, markdown, parent_id, sort, created, updated
    FROM blocks
    WHERE root_id = '{safe_doc_id}'
    ORDER BY sort
    """

    data = call_siyuan_api("/api/query/sql", {"stmt": sql_query})

    if data:
        print(f"找到 {len(data)} 个块")
        return data

    print("未找到任何块")
    return []

def display_blocks(blocks):
    """
    Print a formatted overview of a list of blocks.

    Content and Markdown are each cut to 200 characters; the Markdown line is
    left out when it is empty or equal to the content.

    :param blocks: list of block dicts
    """
    if not blocks:
        print("没有块可显示")
        return

    # Icon for each block type; unknown types fall back to the document icon.
    type_icons = {
        'd': '📄',  # document
        'p': '📝',  # paragraph
        'h': '📑',  # heading
        'l': '📋',  # list
        'i': '🖼️',  # list item
        'b': '📦',  # blockquote
        'c': '💾',  # code block
        's': '🔄',  # super block
        't': '📊',  # table
        'm': '🔢',  # math formula
        'html': '🌐',  # HTML block
        'widget': '🔧',  # widget
        'iframe': '🖼️',  # iframe
        'video': '🎥',  # video
        'audio': '🔊',  # audio
    }

    def clip(text):
        """Shorten text to 200 characters, marking the cut with an ellipsis."""
        if len(text) <= 200:
            return text
        return text[:200] + "..."

    banner = "=" * 80
    print(banner)
    print(f"文档块结构 (共 {len(blocks)} 个块)")
    print(banner)

    for i, block in enumerate(blocks, 1):
        # Read every field up front, before anything is printed for the block.
        block_type = block.get('type', 'unknown')
        subtype = block.get('subtype', '')
        content = block.get('content', '').strip()
        markdown = block.get('markdown', '').strip()
        parent_id = block.get('parent_id', '')
        block_id = block.get('id', '')
        created = block.get('created', '')
        updated = block.get('updated', '')

        icon = type_icons.get(block_type, '📄')
        type_desc = f"{block_type}" + (f"/{subtype}" if subtype else "")

        print(f"\n{i:3d}. {icon} 块类型: {type_desc}")
        print(f"     📍 ID: {block_id}")
        print(f"     👆 父块: {parent_id}")
        print(f"     📅 创建: {created}")
        print(f"     🔄 更新: {updated}")

        if content:
            display_content = clip(content)
            print(f"     💬 内容: {display_content}")

        if markdown and markdown != content:
            display_markdown = clip(markdown)
            print(f"     📝 Markdown: {display_markdown}")

        print("-" * 60)

def get_block_attributes(block_id):
    """
    Fetch the attributes of one block.

    :param block_id: block ID
    :return: whatever the attribute endpoint returns as data, or None when
             the call fails
    """
    return call_siyuan_api("/api/attr/getBlockAttrs", {"id": block_id})

def save_blocks_to_file(blocks, doc_id, output_dir="output"):
    """
    Save a document's blocks as a JSON file and as a readable text file.

    Writes ``blocks_<doc_id>.json`` and ``blocks_<doc_id>.txt`` into
    ``output_dir`` and prints both paths.

    :param blocks: list of block dicts
    :param doc_id: document ID, used in the file names
    :param output_dir: target directory; created, including missing parents,
                       when it does not exist
    """
    output_path = Path(output_dir)
    # parents=True lets callers pass a nested directory that does not exist yet.
    output_path.mkdir(parents=True, exist_ok=True)

    # Full block data as JSON
    json_file = output_path / f"blocks_{doc_id}.json"
    with json_file.open('w', encoding='utf-8') as f:
        json.dump(blocks, f, ensure_ascii=False, indent=2)

    # Readable summary as plain text
    txt_file = output_path / f"blocks_{doc_id}.txt"
    with txt_file.open('w', encoding='utf-8') as f:
        f.write(f"文档 {doc_id} 的块结构\n")
        f.write("=" * 60 + "\n\n")

        for i, block in enumerate(blocks, 1):
            # `or ''` also covers a key that is present but holds None.
            content = (block.get('content') or '').strip()
            markdown = block.get('markdown')

            f.write(f"{i:3d}. 块类型: {block.get('type', 'unknown')}\n")
            f.write(f"     ID: {block.get('id', '')}\n")
            f.write(f"     父块: {block.get('parent_id', '')}\n")
            f.write(f"     内容: {content}\n")
            if markdown:
                f.write(f"     Markdown: {markdown.strip()}\n")
            f.write("-" * 40 + "\n")

    print("\n文件已保存:")
    print(f"  JSON格式: {json_file}")
    print(f"  文本格式: {txt_file}")

# Example: the ID of the document to inspect.
# Change this value to the document you want to look at.
DOC_ID = "20250618232924-ed8wf0w"  # sample document ID, adjust as needed

print(f"开始获取文档 {DOC_ID} 的所有块内容...")
print()

# Fetch the blocks (an empty list when nothing was found or the call failed)
blocks = get_document_blocks(DOC_ID)

if blocks:
    # Print the blocks
    display_blocks(blocks)

    # Fetch and print the attributes of the document itself
    attributes = get_block_attributes(DOC_ID)
    print(attributes)
    # Save the blocks to files
    save_blocks_to_file(blocks, DOC_ID)

    # Count the blocks per type
    type_counts = {}
    for block in blocks:
        block_type = block.get('type', 'unknown')
        type_counts[block_type] = type_counts.get(block_type, 0) + 1

    print(f"\n📊 块类型统计:")
    for block_type, count in sorted(type_counts.items()):
        print(f"  {block_type}: {count} 个")
else:
    print("未能获取到块内容")


开始获取文档 20250618232924-ed8wf0w 的所有块内容...

正在获取文档 20250618232924-ed8wf0w 的所有块...
找到 45 个块
文档块结构 (共 45 个块)

  1. 📄 块类型: d
     📍 ID: 20250618232924-ed8wf0w
     👆 父块: 
     📅 创建: 20250618232924
     🔄 更新: 20250618232924
     💬 内容: 这个大模型，真的治好了我的论文阅读障碍
------------------------------------------------------------

  2. 🌐 块类型: html
     📍 ID: 20250618232925-ksxmn77
     👆 父块: 20250618232924-ed8wf0w
     📅 创建: 20250618232925
     🔄 更新: 20250618232925
     💬 内容: <div>
<!DOCTYPE html>
</div>
------------------------------------------------------------

  3. 🌐 块类型: html
     📍 ID: 20250618232925-lcpavs3
     👆 父块: 20250618232924-ed8wf0w
     📅 创建: 20250618232925
     🔄 更新: 20250618232925
     💬 内容: <div>
<html>
<head>
    <meta charset="utf-8">
    <title>这个大模型，真的治好了我的论文阅读障碍</title>
</head>
<body>
<!doctype html>
<html>
  <head>
    <meta charset="utf-8">
    <head></head>
  </head>
  <body>
   ...
------------------------------------------------------------

  4. 📝 块类型: p
     📍 ID: 202506182329

In [29]:
# Fetch the kramdown source of one document block.
# `data` is None when the API call fails.
doc_id = "20250621081403-hx0zl8u"
payload = {"id": doc_id}
data = call_siyuan_api("/api/block/getBlockKramdown", payload)

def print_dict_structure(data, indent=0):
    """Print nested dicts and lists, indenting two spaces per nesting level.

    Dict entries are labelled ``key:`` and list items ``[index]:``; a value
    that is neither a dict nor a list is printed on the line below its label.
    Anything else passed as ``data`` is printed as a single indented line.
    """
    pad = "  " * indent

    if isinstance(data, dict):
        labelled = ((f"{key}:", value) for key, value in data.items())
    elif isinstance(data, list):
        labelled = ((f"[{i}]:", item) for i, item in enumerate(data))
    else:
        print(pad + str(data))
        return

    for label, child in labelled:
        print(pad + label)
        if isinstance(child, (dict, list)):
            print_dict_structure(child, indent + 1)
        else:
            print("  " * (indent + 1) + str(child))

# Print the response structure; prints "None" when the API call above failed.
print_dict_structure(data)


id:
  20250621081403-hx0zl8u
kramdown:
  > 为知笔记迁移文档自定义属性：
> abstract: <nil>
> url: [https://www.toutiao.com/article/7517174713618104851/?log_from=396a8c9db2481_1750471891848](https://www.toutiao.com/article/7517174713618104851/?log_from=396a8c9db2481_1750471891848)
> created: 2025-06-09 23:04:19 +0000 UTC
> accessed: 2025-06-12 12:31:36 +0000 UTC
> modified: <nil>
> {: id="20250621135634-nggyytp" updated="20250621135634"}
>
{: id="20250621135634-nu0eacg" updated="20250621135634"}

# 苹果重磅论文翻车！被指测试方法有大问题……网友：Cook 该裁员了！
{: updated="20250621105403" id="20250621081403-clxqrix"}

2025-06-09 17:43·[人工智能学家](https://www.toutiao.com/c/user/token/MS4wLjABAAAAopm85qB5EHed92CocYuWSjILG4hacNLOMeKOxKXTHGk/?source=tuwen_detail)
{: id="20250621081403-8gvt7q6" updated="20250615183338"}

来源：AGI Hunt
{: id="20250621081403-tgzjxng" updated="20250615183338"}

**苹果的AI 研究团队这次真的翻车了！**
{: id="20250621081403-ghxk2dy" updated="20250615183338"}

他们最近发布的一篇论文引发了AI 圈的集体吐槽，原因竟然是**测试方法出了大问题**。
{: id="20250621081403-gba