Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions CHANGES.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,10 @@
`{CREATE,ALTER} [FOREIGN] TABLE [AS]` and `COPY {FROM,TO}` commands.
- Inventory: Added information about SQL data types and about how to import
example datasets using CrateDB Toolkit.
- Bundle: Fixed semantics of `llms.txt` vs. `llms-full.txt`, see [ABOUT-39].
- Bundle: Generated `outline.html` for improved inspection by humans.

[ABOUT-39]: https://github.com/crate/about/issues/39

## v0.0.4 - 2025-05-16
- Outline: `cratedb-about outline` now understands `--url` option to use
Expand Down
53 changes: 44 additions & 9 deletions src/cratedb_about/bundle/llmstxt.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,13 @@
import logging
import shutil
from importlib import resources
from importlib.abc import Traversable
from pathlib import Path

from markdown import markdown

from cratedb_about import CrateDbKnowledgeOutline
from cratedb_about.outline import OutlineDocument
from cratedb_about.util import get_hostname, get_now

logger = logging.getLogger(__name__)
Expand All @@ -15,11 +17,17 @@
@dataclasses.dataclass
class LllmsTxtBuilder:
"""
Build llms.txt files for CrateDB.
Generate llms.txt files.

This is a base class intended to be subclassed. The non-init fields
(outline, readme_md, outline_yaml) should be initialized by subclasses.
"""

outline_url: str
outdir: Path
outline: OutlineDocument = dataclasses.field(init=False)
readme_md: Traversable = dataclasses.field(init=False)
outline_yaml: Traversable = dataclasses.field(init=False)

def run(self):
logger.info(f"Creating bundle. Format: llms-txt. Output directory: {self.outdir}")
Expand All @@ -29,19 +37,25 @@
self.copy_readme()
self.copy_sources()

outline = CrateDbKnowledgeOutline.load(self.outline_url)
Path(self.outdir / "outline.md").write_text(outline.to_markdown())
# TODO: Explore how to optimize this procedure that both steps do not need
# to acquire and process data redundantly.
Path(self.outdir / "llms.txt").write_text(outline.to_llms_txt())
Path(self.outdir / "llms-full.txt").write_text(outline.to_llms_txt(optional=True))
# Generate llms-txt resources.
# See also https://github.com/crate/about/issues/39
#
# - The `llms.txt` is just a Markdown file, unexpanded. It is essentially a sitemap,
# listing all the pages in the documentation.
# - The `llms-full.txt` contains the entire documentation, expanded from the `llms.txt`
# file. Note this may exceed the context window of your LLM.
Path(self.outdir / "llms.txt").write_text(self.outline.to_markdown())
Path(self.outdir / "llms-full.txt").write_text(self.outline.to_llms_txt(optional=True))

return self

def copy_readme(self):
"""
Provide README / "About" information to the bundle, in Markdown and HTML formats.
"""
readme_md = self.outdir / "readme.md"
shutil.copy(
str(resources.files("cratedb_about.bundle") / "readme.md"),
str(self.readme_md),
readme_md,
)
try:
Expand All @@ -52,7 +66,28 @@
logger.warning(f"Failed to generate HTML readme: {e}")

def copy_sources(self):
    """
    Provide the source document in the original YAML format, but also converted to HTML.
    The intermediary Markdown format is already covered by the `llms.txt` file itself.

    Writes two files into `self.outdir`:

    - `outline.yaml`: a copy of the file referenced by `self.outline_yaml`.
    - `outline.html`: the result of `self.outline.to_html()`.

    The YAML copy is not guarded, so any error raised by `shutil.copy`
    propagates to the caller. The HTML rendering is best-effort: a failure
    is logged as a warning and does not abort the bundle.
    """
    # `shutil.copy` takes exactly one source and one destination. The source
    # is the instance-configured `outline_yaml`, so that subclasses decide
    # which outline file gets bundled.
    # NOTE(review): `str()` on the resource presumably yields a real
    # filesystem path; confirm for packages that are not unpacked on disk.
    shutil.copy(
        str(self.outline_yaml),
        self.outdir / "outline.yaml",
    )
    try:
        Path(self.outdir / "outline.html").write_text(self.outline.to_html())
    except Exception as e:
        # Deliberately broad: the HTML file is a convenience artifact only.
        logger.warning(f"Failed to generate HTML outline: {e}")

Check warning on line 80 in src/cratedb_about/bundle/llmstxt.py

View check run for this annotation

Codecov / codecov/patch

src/cratedb_about/bundle/llmstxt.py#L79-L80

Added lines #L79 - L80 were not covered by tests


@dataclasses.dataclass
class CrateDbLllmsTxtBuilder(LllmsTxtBuilder):
    """
    Generate llms.txt files for CrateDB.

    Supplies the CrateDB-specific resources to the generic builder: the
    readme and outline files shipped as package data, and the outline
    document loaded from `outline_url`.
    """

    # Markdown readme located in the `cratedb_about.bundle` package.
    readme_md: Traversable = resources.files("cratedb_about.bundle").joinpath("readme.md")

    # YAML outline located in the `cratedb_about.outline` package.
    outline_yaml: Traversable = resources.files("cratedb_about.outline").joinpath(
        "cratedb-outline.yaml"
    )

    def __post_init__(self):
        # The outline is loaded once per instance, right after the dataclass
        # `__init__` has assigned `outline_url`.
        self.outline = CrateDbKnowledgeOutline.load(self.outline_url)
4 changes: 2 additions & 2 deletions src/cratedb_about/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
import click
from pueblo.util.cli import boot_click

from cratedb_about.bundle.llmstxt import LllmsTxtBuilder
from cratedb_about.bundle.llmstxt import CrateDbLllmsTxtBuilder
from cratedb_about.outline import CrateDbKnowledgeOutline
from cratedb_about.query.core import CrateDbKnowledgeConversation
from cratedb_about.query.model import Example
Expand Down Expand Up @@ -95,7 +95,7 @@ def bundle(ctx: click.Context, url: str, format_: str, outdir: Path) -> None:
"""
if format_ != "llm":
raise click.BadOptionUsage("format", f"Invalid output format: {format_}", ctx=ctx)
LllmsTxtBuilder(outline_url=url, outdir=outdir).run()
CrateDbLllmsTxtBuilder(outline_url=url, outdir=outdir).run()
logger.info("Ready.")


Expand Down
7 changes: 7 additions & 0 deletions src/cratedb_about/outline/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

from attr import Factory
from attrs import define
from markdown import markdown

from cratedb_about.util import DictTools, Dumpable, Metadata, get_cache_client

Expand Down Expand Up @@ -79,6 +80,12 @@ def to_markdown(self) -> str:
buffer.write("\n")
return buffer.getvalue().strip()

def to_html(self) -> str:
    """
    Render the outline as HTML.

    The outline is serialized with `to_markdown()` first, and that Markdown
    text is then passed to `markdown()` to produce the HTML string.
    """
    markdown_text = self.to_markdown()
    return markdown(markdown_text)

def to_llms_txt(self, optional: bool = False) -> str:
"""
Convert this outline into the llms.txt format.
Expand Down
2 changes: 1 addition & 1 deletion tests/test_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@ def test_cli_bundle_success(caplog, tmp_path):
assert (tmp_path / "readme.md").exists()
assert (tmp_path / "readme.html").exists()
assert (tmp_path / "outline.yaml").exists()
assert (tmp_path / "outline.md").exists()
assert (tmp_path / "outline.html").exists()
assert (tmp_path / "llms.txt").exists()
assert (tmp_path / "llms-full.txt").exists()

Expand Down