diff --git a/CHANGES.md b/CHANGES.md index eac05b9..f36d3ea 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -7,6 +7,10 @@ `{CREATE,ALTER} [FOREIGN] TABLE [AS]` and `COPY {FROM,TO}` commands. - Inventory: Added information about SQL data types and about how to import example datasets using CrateDB Toolkit. +- Bundle: Fixed semantics of `llms.txt` vs. `llms-full.txt`, see [ABOUT-39]. +- Bundle: Generated `outline.html` for improved inspection by humans. + +[ABOUT-39]: https://github.com/crate/about/issues/39 ## v0.0.4 - 2025-05-16 - Outline: `cratedb-about outline` now understands `--url` option to use diff --git a/src/cratedb_about/bundle/llmstxt.py b/src/cratedb_about/bundle/llmstxt.py index fb6d60f..21cb791 100644 --- a/src/cratedb_about/bundle/llmstxt.py +++ b/src/cratedb_about/bundle/llmstxt.py @@ -2,11 +2,13 @@ import logging import shutil from importlib import resources +from importlib.abc import Traversable from pathlib import Path from markdown import markdown from cratedb_about import CrateDbKnowledgeOutline +from cratedb_about.outline import OutlineDocument from cratedb_about.util import get_hostname, get_now logger = logging.getLogger(__name__) @@ -15,11 +17,17 @@ @dataclasses.dataclass class LllmsTxtBuilder: """ - Build llms.txt files for CrateDB. + Generate llms.txt files. + + This is a base class intended to be subclassed. The non-init fields + (outline, readme_md, outline_yaml) should be initialized by subclasses. """ outline_url: str outdir: Path + outline: OutlineDocument = dataclasses.field(init=False) + readme_md: Traversable = dataclasses.field(init=False) + outline_yaml: Traversable = dataclasses.field(init=False) def run(self): logger.info(f"Creating bundle. Format: llms-txt. 
Output directory: {self.outdir}") @@ -29,19 +37,25 @@ def run(self): self.copy_readme() self.copy_sources() - outline = CrateDbKnowledgeOutline.load(self.outline_url) - Path(self.outdir / "outline.md").write_text(outline.to_markdown()) - # TODO: Explore how to optimize this procedure that both steps do not need - # to acquire and process data redundantly. - Path(self.outdir / "llms.txt").write_text(outline.to_llms_txt()) - Path(self.outdir / "llms-full.txt").write_text(outline.to_llms_txt(optional=True)) + # Generate llms-txt resources. + # See also https://github.com/crate/about/issues/39 + # + # - The `llms.txt` is just a Markdown file, unexpanded. It is essentially a sitemap, + # listing all the pages in the documentation. + # - The `llms-full.txt` contains the entire documentation, expanded from the `llms.txt` + # file. Note this may exceed the context window of your LLM. + Path(self.outdir / "llms.txt").write_text(self.outline.to_markdown()) + Path(self.outdir / "llms-full.txt").write_text(self.outline.to_llms_txt(optional=True)) return self def copy_readme(self): + """ + Provide README / "About" information to the bundle, in Markdown and HTML formats. + """ readme_md = self.outdir / "readme.md" shutil.copy( - str(resources.files("cratedb_about.bundle") / "readme.md"), + str(self.readme_md), readme_md, ) try: @@ -52,7 +66,28 @@ def copy_readme(self): logger.warning(f"Failed to generate HTML readme: {e}") def copy_sources(self): + """ + Provide the source document in the original YAML format, but also converted to HTML. + The intermediary Markdown format is already covered by the `llms.txt` file itself. 
+ """ shutil.copy( - str(resources.files("cratedb_about.outline") / "cratedb-outline.yaml"), + str(self.outline_yaml), self.outdir / "outline.yaml", ) + try: + Path(self.outdir / "outline.html").write_text(self.outline.to_html()) + except Exception as e: + logger.warning(f"Failed to generate HTML outline: {e}") + + +@dataclasses.dataclass +class CrateDbLllmsTxtBuilder(LllmsTxtBuilder): + """ + Generate llms.txt files for CrateDB. + """ + + readme_md: Traversable = resources.files("cratedb_about.bundle") / "readme.md" + outline_yaml: Traversable = resources.files("cratedb_about.outline") / "cratedb-outline.yaml" + + def __post_init__(self): + self.outline = CrateDbKnowledgeOutline.load(self.outline_url) diff --git a/src/cratedb_about/cli.py b/src/cratedb_about/cli.py index 9c28d89..97fc5f8 100644 --- a/src/cratedb_about/cli.py +++ b/src/cratedb_about/cli.py @@ -5,7 +5,7 @@ import click from pueblo.util.cli import boot_click -from cratedb_about.bundle.llmstxt import LllmsTxtBuilder +from cratedb_about.bundle.llmstxt import CrateDbLllmsTxtBuilder from cratedb_about.outline import CrateDbKnowledgeOutline from cratedb_about.query.core import CrateDbKnowledgeConversation from cratedb_about.query.model import Example @@ -95,7 +95,7 @@ def bundle(ctx: click.Context, url: str, format_: str, outdir: Path) -> None: """ if format_ != "llm": raise click.BadOptionUsage("format", f"Invalid output format: {format_}", ctx=ctx) - LllmsTxtBuilder(outline_url=url, outdir=outdir).run() + CrateDbLllmsTxtBuilder(outline_url=url, outdir=outdir).run() logger.info("Ready.") diff --git a/src/cratedb_about/outline/model.py b/src/cratedb_about/outline/model.py index 681d719..42f0689 100644 --- a/src/cratedb_about/outline/model.py +++ b/src/cratedb_about/outline/model.py @@ -4,6 +4,7 @@ from attr import Factory from attrs import define +from markdown import markdown from cratedb_about.util import DictTools, Dumpable, Metadata, get_cache_client @@ -79,6 +80,12 @@ def to_markdown(self) -> str: 
buffer.write("\n") return buffer.getvalue().strip() + def to_html(self) -> str: + """ + Convert outline into HTML format using Markdown as an intermediate step. + """ + return markdown(self.to_markdown()) + def to_llms_txt(self, optional: bool = False) -> str: """ Convert this outline into the llms.txt format. diff --git a/tests/test_cli.py b/tests/test_cli.py index 1a28837..7d36b0e 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -65,7 +65,7 @@ def test_cli_bundle_success(caplog, tmp_path): assert (tmp_path / "readme.md").exists() assert (tmp_path / "readme.html").exists() assert (tmp_path / "outline.yaml").exists() - assert (tmp_path / "outline.md").exists() + assert (tmp_path / "outline.html").exists() assert (tmp_path / "llms.txt").exists() assert (tmp_path / "llms-full.txt").exists()