diff --git a/.github/dependabot.yml b/.github/dependabot.yml new file mode 100644 index 0000000..ac401ef --- /dev/null +++ b/.github/dependabot.yml @@ -0,0 +1,16 @@ +# To get started with Dependabot version updates, you'll need to specify which +# package ecosystems to update and where the package manifests are located. +# Please see the documentation for all configuration options: +# https://docs.github.com/github/administering-a-repository/configuration-options-for-dependency-updates + +version: 2 +updates: + - package-ecosystem: "pip" + directory: "/" + schedule: + interval: "daily" + + - package-ecosystem: "github-actions" + directory: "/" + schedule: + interval: "monthly" diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml new file mode 100644 index 0000000..29edb96 --- /dev/null +++ b/.github/workflows/tests.yml @@ -0,0 +1,65 @@ +name: "Tests" + +on: + push: + branches: [ main ] + pull_request: + workflow_dispatch: + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + +jobs: + + test: + name: " + Python ${{ matrix.python-version }} + " + runs-on: ${{ matrix.os }} + strategy: + fail-fast: false + matrix: + os: ['ubuntu-latest'] + python-version: [ + '3.9', + '3.13', + ] + + env: + OS: ${{ matrix.os }} + PYTHON: ${{ matrix.python-version }} + UV_SYSTEM_PYTHON: true + + steps: + + - name: Acquire sources + uses: actions/checkout@v4 + + - name: Install `sponge` + run: sudo apt-get --yes install moreutils + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + + - name: Set up uv + uses: astral-sh/setup-uv@v5 + with: + cache-dependency-glob: | + pyproject.toml + cache-suffix: ${{ matrix.python-version }} + enable-cache: true + version: "latest" + + - name: Set up project + run: | + uv pip install --editable='.[develop,test]' + + - name: Run linter and software tests + run: | + poe check + poe build + cratedb-about --version + cratedb-about 
list-questions diff --git a/.gitignore b/.gitignore index 28a49b5..937df2d 100644 --- a/.gitignore +++ b/.gitignore @@ -2,3 +2,5 @@ .venv* *.egg-info *.lock +bdist.* +__pycache__ diff --git a/CHANGES.md b/CHANGES.md index 1f312bc..408d460 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -4,3 +4,5 @@ - Established project layout - Added source files (`cratedb-overview.md`), generator program wrapper (`uv run poe build`), and build artifacts (`llms-ctx.txt` and `llms-ctx-full.txt`) +- Added CLI program `cratedb-about` with subcommands `ask` and `list-questions` + for ad hoc conversations about CrateDB diff --git a/README.md b/README.md index 51be41b..766f528 100644 --- a/README.md +++ b/README.md @@ -19,13 +19,20 @@ to relevant resources in the spirit of a curated knowledge backbone. ## Usage -To rebuild the `llms.txt` files, acquire the sources of the repository, -and invoke the build command. +Install `cratedb-about` package. +```shell +uv tool install --upgrade 'cratedb-about @ git+https://github.com/crate/about' +``` + +Ask questions about CrateDB. +```shell +export OPENAI_API_KEY= +cratedb-about ask "CrateDB does not seem to provide an AUTOINCREMENT feature?" +``` +If you are running out of questions, get inspired by the standard library. ```shell -git clone https://github.com/crate/about cratedb-about -cd cratedb-about -uv run poe build +cratedb-about list-questions ``` diff --git a/docs/backlog.md b/docs/backlog.md new file mode 100644 index 0000000..ab82719 --- /dev/null +++ b/docs/backlog.md @@ -0,0 +1,10 @@ +# Backlog + +## Iteration +1 +- JSON/YAML/Markdown output + +## Iteration +2 +- Unlock Discourse: https://community.cratedb.com/raw/1015 +- Unlock HTML resources: https://www.urltoany.com/url-to-markdown. + => Find the best standalone program. 
+- Unlock GitHub projects: https://github.com/mattduck/gh2md diff --git a/docs/sandbox.md b/docs/sandbox.md new file mode 100644 index 0000000..8b32e98 --- /dev/null +++ b/docs/sandbox.md @@ -0,0 +1,22 @@ +# Sandbox + +Acquire the source code repository. +```shell +git clone https://github.com/crate/about cratedb-about +cd cratedb-about +``` + +Rebuild all the `llms.txt` files. +```shell +uv run poe build +``` + +Ask questions about CrateDB. +```shell +uvx --with-editable=. cratedb-about ask "CrateDB does not seem to provide an AUTOINCREMENT feature?" +``` + +If you are running out of questions, get inspired by the standard library. +```shell +uvx --with-editable=. cratedb-about list-questions +``` diff --git a/pyproject.toml b/pyproject.toml index 4f1569a..5b1b24f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -69,9 +69,12 @@ dynamic = [ "version", ] dependencies = [ + "claudette", "click<9", "llms-txt==0.0.4", + "openai", "poethepoet<1", + "requests<3", ] optional-dependencies.develop = [ "mypy<1.16", @@ -83,6 +86,8 @@ urls.Changelog = "https://github.com/crate/about/blob/main/CHANGES.md" urls.Issues = "https://github.com/crate/about/issues" urls.Repository = "https://github.com/crate/about" +scripts.cratedb-about = "cratedb_about.cli:cli" + [tool.ruff] line-length = 100 @@ -118,7 +123,10 @@ lint.select = [ ] [tool.mypy] -exclude = [ ] +mypy_path = "src" +packages = [ + "cratedb_about", +] check_untyped_defs = true ignore_missing_imports = true implicit_optional = true diff --git a/src/cratedb_about/cli.py b/src/cratedb_about/cli.py new file mode 100644 index 0000000..8c87e34 --- /dev/null +++ b/src/cratedb_about/cli.py @@ -0,0 +1,47 @@ +import typing as t + +import click + +from cratedb_about.core import CrateDBConversation +from cratedb_about.model import Example + + +@click.group() +@click.version_option() +@click.pass_context +def cli(ctx: click.Context) -> None: + pass + + +@cli.command() +@click.argument("question", type=str, required=False) 
+@click.option("--backend", type=click.Choice(["openai", "claude"]), default="openai") +def ask(question: t.Optional[str], backend: t.Literal["claude", "openai"]): + """ + Ask questions about CrateDB. + + Requires: + - OpenAI backend: Set OPENAI_API_KEY environment variable + - Claude backend: Set ANTHROPIC_API_KEY environment variable + """ + wizard = CrateDBConversation( + backend=backend, + use_knowledge=True, + ) + if not question: + # Use the AUTOINCREMENT question or fall back to the first question if not found + default_question = next( + (q for q in Example.questions if "AUTOINCREMENT" in q), + Example.questions[0] if Example.questions else "What is CrateDB?", + ) + question = default_question + click.echo(f"Question: {question}\nAnswer:\n") + click.echo(wizard.ask(question)) + + +@cli.command() +def list_questions(): + """ + List a few example questions about CrateDB. + """ + click.echo("\n".join(Example.questions)) diff --git a/src/cratedb_about/core.py b/src/cratedb_about/core.py new file mode 100644 index 0000000..6f82984 --- /dev/null +++ b/src/cratedb_about/core.py @@ -0,0 +1,148 @@ +# Derived from: https://llmstxt.org/domains.html +import dataclasses +import os +import sys +import typing as t + +from cratedb_about.model import Settings + +# Import backends conditionally to avoid errors if dependencies are missing +CLAUDE_AVAILABLE = False +OPENAI_AVAILABLE = False + +try: + from claudette import Chat, contents, models + + CLAUDE_AVAILABLE = True +except ImportError: + pass + +try: + from openai import OpenAI + from openai.types.responses import ResponseInputTextParam + from openai.types.responses.response_input_param import Message + from openai.types.shared_params import Reasoning + + OPENAI_AVAILABLE = True +except ImportError: + pass + + +@dataclasses.dataclass +class CrateDBConversation: + """ + Manage conversations about CrateDB.
+ + Requires: + - OPENAI_API_KEY environment variable when using "openai" backend + - ANTHROPIC_API_KEY environment variable when using "claude" backend + """ + + backend: t.Literal["claude", "openai"] = "openai" + use_knowledge: bool = True + + def __post_init__(self): + """Validate configuration.""" + if self.backend == "openai" and not OPENAI_AVAILABLE: + raise ImportError("The 'openai' package is required when using the OpenAI backend") + if self.backend == "claude" and not CLAUDE_AVAILABLE: + raise ImportError("The 'claudette' package is required when using the Claude backend") + if self.backend == "openai" and not os.environ.get("OPENAI_API_KEY"): + raise ValueError( + "OPENAI_API_KEY environment variable is required when using 'openai' backend" + ) + if self.backend == "claude" and not os.environ.get("ANTHROPIC_API_KEY"): + raise ValueError( + "ANTHROPIC_API_KEY environment variable is required when using 'claude' backend" + ) + + def ask(self, question: str) -> str: + """ + Ask a question about CrateDB using the configured LLM backend. + + Args: + question: The question to ask about CrateDB + + Returns: + str: The response from the LLM + + Raises: + NotImplementedError: If an unsupported backend is specified + ValueError: If required environment variables are missing + RuntimeError: If there's an error communicating with the LLM API + """ + if self.backend == "openai": + return self.ask_gpt(question) + if self.backend == "claude": + return self.ask_claude(question) + raise NotImplementedError("Please select an available LLM backend") + + def ask_claude(self, question: str) -> str: + # FIXME: API does not provide lookup by name. 
+ model = models[1] # Sonnet 3.5 + chat = Chat(model, sp=Settings.instructions) + if self.use_knowledge: + try: + chat(Settings.get_prompt()) + except Exception as e: + print(f"Warning: Failed to load knowledge context: {e}", file=sys.stderr) # noqa: T201 + try: + result = chat(question) + return contents(result) + except Exception as e: + raise RuntimeError(f"Claude API error: {e}") from e + + def ask_gpt(self, question: str) -> str: + """ + Ask the machine, enriched with CrateDB context, catalyzed through OpenAI's GPT. + + Models like o3 and o4-mini are reasoning models. + https://platform.openai.com/docs/guides/reasoning + + The OpenAI API provides different kinds of roles for messages. Let's use the + `developer` role to relay information on top of the user's question. + + - https://community.openai.com/t/the-system-role-how-it-influences-the-chat-behavior/87353 + - https://community.openai.com/t/understanding-role-management-in-openais-api-two-methods-compared/253289 + - https://community.openai.com/t/how-is-developer-message-better-than-system-prompt/1062784 + """ + + client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY")) + + input_messages: t.List[Message] = [] + if self.use_knowledge: + try: + prompt = Settings.get_prompt() + if prompt: + input_messages.append( + Message( + content=[ResponseInputTextParam(text=prompt, type="input_text")], + role="developer", + status="completed", + type="message", + ) + ) + except Exception as e: + print(f"Warning: Failed to load knowledge context: {e}", file=sys.stderr) # noqa: T201 + # Always add the user question + input_messages.append( + Message( + content=[ResponseInputTextParam(text=question, type="input_text")], + role="user", + status="completed", + type="message", + ) + ) + + response = client.responses.create( + # model="gpt-4o", # noqa: ERA001 + model="o4-mini", + reasoning=Reasoning( + effort="medium", + # Your organization must be verified to generate reasoning summaries + # summary="detailed", # noqa: 
ERA001 + ), + instructions=Settings.instructions, + input=input_messages, # type: ignore[arg-type] + ) + return response.output_text diff --git a/src/cratedb_about/model.py b/src/cratedb_about/model.py new file mode 100644 index 0000000..22153b4 --- /dev/null +++ b/src/cratedb_about/model.py @@ -0,0 +1,57 @@ +import os +import sys + +import requests + + +class Example: + """ + A few example questions to ask about CrateDB. + """ + + questions = [ + "What are the benefits of CrateDB?", + "Tell me about why CrateDB is different.", + "Tell me about CrateDB Cloud.", + "How to use sequences with CrateDB?", + "CrateDB does not seem to provide an AUTOINCREMENT feature?", + "How do I apply sharding properly?", + "How much data can CrateDB store?", + "Please tell me how CrateDB stores data.", + "Does CrateDB support SQLAlchemy and pandas?", + ] + + +class Settings: + """ + Configure the language model to support conversations about CrateDB. + """ + + default_context = ( + "CrateDB is a distributed SQL database that makes it simple to" + "store and analyze massive amounts of data in real-time." + ) + + llms_txt_url = os.getenv( + "CRATEDB_CONTEXT_URL", + "https://raw.githubusercontent.com/crate/about/6876fedee57f59b693f37996e04f53c6446f2ad6/build/llm/llms-ctx.txt", + ) + instructions = "You are a helpful and concise assistant." + llms_txt = None + prompt = None + + @classmethod + def get_prompt(cls): + if cls.llms_txt is None: + try: + cls.llms_txt = requests.get(cls.llms_txt_url, timeout=10).text + cls.prompt = ( + cls.llms_txt + "\n\nThe above is necessary context for the conversation." + ) + except requests.RequestException as e: + print(f"Error fetching context: {e}", file=sys.stderr) # noqa: T201 + # Provide minimal fallback context. + cls.llms_txt = cls.default_context + cls.prompt = cls.llms_txt + "\n\nThe above is minimal context for the conversation." + + return cls.prompt