From 66669a6f153636c843bf873e0faed7b25d181bec Mon Sep 17 00:00:00 2001 From: Zireael <3856578+Zireael@users.noreply.github.com> Date: Sun, 24 May 2026 11:10:35 +0200 Subject: [PATCH 01/38] chore: add docker rust validation workflow Add scripts, docs, Dockerfile, and package.json scripts for Docker-based Rust validation (fmt/check/clippy/test) so Windows users without MSVC Build Tools can still validate Rust code. - scripts/docker-rust.ps1: PowerShell script supporting fmt/check/clippy/ test/validate/shell tasks with persistent Docker volumes - Dockerfile.rust: minimal Rust image with rustfmt + clippy pre-installed - docs/docker-rust-validation.md: full usage and design documentation - package.json: 6 new docker:rust:* convenience scripts Design: Linux-target validation via rust:1-bookworm, persistent cargo volumes for caching, fail-fast sequential validation. --- Dockerfile.rust | 23 +++++ docs/docker-rust-validation.md | 119 ++++++++++++++++++++++ package.json | 11 +- scripts/docker-rust.ps1 | 179 +++++++++++++++++++++++++++++++++ 4 files changed, 329 insertions(+), 3 deletions(-) create mode 100644 Dockerfile.rust create mode 100644 docs/docker-rust-validation.md create mode 100644 scripts/docker-rust.ps1 diff --git a/Dockerfile.rust b/Dockerfile.rust new file mode 100644 index 00000000..c75392e6 --- /dev/null +++ b/Dockerfile.rust @@ -0,0 +1,23 @@ +# Dockerfile for Rust validation +# +# Used by scripts/docker-rust.ps1 to run Rust fmt/check/clippy/test +# inside a container, avoiding the need for native MSVC Build Tools +# on Windows. +# +# This is a minimal image: just Rust + rustfmt + clippy. +# If native dependencies fail during validation, add only the required +# apt packages and document why. +# +# Build (optional — the script pulls rust:1-bookworm directly): +# docker build -t aft-rust -f Dockerfile.rust . 
+# +# Override the default image via AFT_RUST_DOCKER_IMAGE: +# $env:AFT_RUST_DOCKER_IMAGE = 'aft-rust' + +FROM rust:1-bookworm + +WORKDIR /work + +RUN rustup component add rustfmt clippy + +ENV CARGO_TARGET_DIR=/target diff --git a/docs/docker-rust-validation.md b/docs/docker-rust-validation.md new file mode 100644 index 00000000..e642eca7 --- /dev/null +++ b/docs/docker-rust-validation.md @@ -0,0 +1,119 @@ +# Docker Rust Validation + +## Purpose + +Run Rust `fmt`, `check`, `clippy`, and `test` inside a Docker container so +Windows users do not need Microsoft C++ Build Tools (MSVC) installed. + +**Docker validation is Linux-target validation, not native Windows MSVC +validation.** It is acceptable for normal Rust implementation work unless you +are touching Windows-specific filesystem/path/process/TUI behavior. + +## When to use native Windows validation + +Native Windows validation is still required when changes touch: + +- Windows-specific path handling +- Process spawning (`std::process::Command` on Windows) +- Terminal/TUI behavior (ANSI sequences, console APIs) +- Packaging/release binaries (cross-compilation) +- Code relying on OS-specific `cfg!(windows)` or `#[cfg(windows)]` paths + +For everything else, Docker validation is faster and avoids the MSVC +toolchain dependency. + +## Prerequisites + +- Docker Desktop (or Docker Engine) installed and running +- The `aft-cargo-registry`, `aft-cargo-git`, and `aft-target` Docker volumes + (created automatically on first run) + +## How to run + +All commands below are run from the repo root. 
+ +### Using npm/bun scripts (recommended) + +```powershell +# Full validation: fmt → check → clippy → test +bun run docker:rust:validate + +# Individual steps +bun run docker:rust:fmt +bun run docker:rust:check +bun run docker:rust:clippy +bun run docker:rust:test + +# Interactive shell inside the container +bun run docker:rust:shell +``` + +### Using the PowerShell script directly + +```powershell +# Full validation +.\scripts\docker-rust.ps1 validate + +# Individual steps +.\scripts\docker-rust.ps1 fmt +.\scripts\docker-rust.ps1 check +.\scripts\docker-rust.ps1 clippy +.\scripts\docker-rust.ps1 test + +# Interactive shell +.\scripts\docker-rust.ps1 shell +``` + +### Overriding the Docker image + +```powershell +$env:AFT_RUST_DOCKER_IMAGE = 'rust:1.80-bookworm' +.\scripts\docker-rust.ps1 validate +``` + +## Caching + +The script uses three persistent Docker volumes for Cargo caches: + +| Volume | Purpose | +|---|---| +| `aft-cargo-registry` | Crate registry download cache | +| `aft-cargo-git` | Git dependency cache | +| `aft-target` | Compiled artifact cache (`CARGO_TARGET_DIR=/target`) | + +These volumes persist across runs so subsequent invocations reuse compiled +artifacts and downloaded crates. + +## Cleaning up + +```powershell +# Remove Cargo and build caches +docker volume rm aft-cargo-registry aft-cargo-git aft-target + +# Remove the Rust image +docker image rm rust:1-bookworm +``` + +## How it works + +1. The script determines the repo root from its own location. +2. It checks that the three Docker volumes exist (creating them if needed). +3. It runs `docker run` with the repo root mounted at `/work` and the volumes + mounted at their respective Cargo paths. +4. `CARGO_TARGET_DIR=/target` ensures compiled artifacts land on the volume + instead of inside `/work/target/`. +5. Steps install `rustfmt` or `clippy` via `rustup component add` if the + component is not already present in the image. +6. 
Each step fails fast: if `fmt` fails, the validation stops before `check`. + +## Design decisions + +- **No `Cargo.toml` changes.** Cargo.toml is for Rust workspace/package + configuration, not Docker orchestration. All Docker logic lives in scripts + and documentation. +- **No additional `Dockerfile` required for basic usage.** The script pulls + `rust:1-bookworm` directly. The optional `Dockerfile.rust` at the repo root + is only needed if you want to pre-install components for faster startup. +- **Native scripts are preserved.** The existing `scripts/release.sh` and + `package.json` native scripts (`build:rust`, `test:rust`, `format:check`) + are unchanged and still work for users with a native Rust toolchain. diff --git a/package.json b/package.json index 52490a9e..9ce2320f 100644 --- a/package.json +++ b/package.json @@ -18,9 +18,14 @@ "test:windows-e2e": "bun run scripts/windows-vm/test.ts", "windows-vm:setup": "bun run scripts/windows-vm/setup.ts", "version-sync": "node scripts/version-sync.mjs", - "bench": "bun run benchmarks/src/runner.ts" - }, - "devDependencies": { + "bench": "bun run benchmarks/src/runner.ts", + "docker:rust:fmt": "powershell -ExecutionPolicy Bypass -File scripts/docker-rust.ps1 fmt", + "docker:rust:check": "powershell -ExecutionPolicy Bypass -File scripts/docker-rust.ps1 check", + "docker:rust:clippy": "powershell -ExecutionPolicy Bypass -File scripts/docker-rust.ps1 clippy", + "docker:rust:test": "powershell -ExecutionPolicy Bypass -File scripts/docker-rust.ps1 test", + "docker:rust:validate": "powershell -ExecutionPolicy Bypass -File scripts/docker-rust.ps1 validate", + "docker:rust:shell": "powershell -ExecutionPolicy Bypass -File scripts/docker-rust.ps1 shell" + }, "devDependencies": { "@biomejs/biome": "^2.4.7", "@types/node": "^25.8.0", "bun-types": "^1.3.13", diff --git a/scripts/docker-rust.ps1 b/scripts/docker-rust.ps1 new file mode 100644 index 00000000..feab7edf --- /dev/null +++ b/scripts/docker-rust.ps1 @@ -0,0 +1,179 @@ 
+<#
+.SYNOPSIS
+Run Rust validation inside a Docker container — fmt, check, clippy, test, or all four.
+
+.DESCRIPTION
+Mounts the repo root into a Rust Docker image and runs Cargo commands with
+persistent volumes for the Cargo registry, git cache, and target directory.
+
+This is Linux-target validation, NOT native Windows MSVC validation. It is
+acceptable for normal Rust implementation work unless you are touching
+Windows-specific filesystem/path/process/TUI behavior.
+
+The Docker image defaults to rust:1-bookworm. There is no -Image parameter;
+set $env:AFT_RUST_DOCKER_IMAGE to use a different image.
+
+.PARAMETER Task
+Which task to run: fmt, check, clippy, test, validate, or shell.
+Defaults to validate.
+
+.EXAMPLE
+.\scripts\docker-rust.ps1 fmt
+.\scripts\docker-rust.ps1 check
+.\scripts\docker-rust.ps1 clippy
+.\scripts\docker-rust.ps1 test
+.\scripts\docker-rust.ps1 validate
+.\scripts\docker-rust.ps1 shell
+#>
+
+param(
+    [Parameter(Position = 0)]
+    [ValidateSet('fmt', 'check', 'clippy', 'test', 'validate', 'shell')]
+    [string]$Task = 'validate'
+)
+
+$ErrorActionPreference = 'Stop'
+
+# --- Image ---
+$Image = if ($env:AFT_RUST_DOCKER_IMAGE) { $env:AFT_RUST_DOCKER_IMAGE } else { 'rust:1-bookworm' }
+
+# --- Volumes ---
+$VolumeNames = @('aft-cargo-registry', 'aft-cargo-git', 'aft-target')
+$Volumes = @(
+    '--volume', 'aft-cargo-registry:/usr/local/cargo/registry',
+    '--volume', 'aft-cargo-git:/usr/local/cargo/git',
+    '--volume', 'aft-target:/target'
+)
+
+# --- Determine repo root (parent of the directory this script lives in) ---
+$RepoRoot = Split-Path -Parent $PSScriptRoot
+
+# --- Cargo steps ---
+# Single source of truth for the individual tasks and for 'validate', so the
+# two can never drift apart. Insertion order is the validation order.
+$Steps = [ordered]@{
+    fmt    = @{
+        Title   = 'cargo fmt --check'
+        Command = 'rustup component add rustfmt && cargo fmt --check'
+    }
+    check  = @{
+        Title   = 'cargo check --workspace --all-targets'
+        Command = 'cargo check --workspace --all-targets'
+    }
+    clippy = @{
+        Title   = 'cargo clippy --workspace --all-targets --all-features -- -D warnings'
+        Command = 'rustup component add clippy && cargo clippy --workspace --all-targets --all-features -- -D warnings'
+    }
+    test   = @{
+        Title   = 'cargo test --workspace --all-targets'
+        Command = 'cargo test --workspace --all-targets'
+    }
+}
+
+# --- Helper: run a Docker command ---
+# Prepends the shared `docker run` options (workdir, cache volumes, target dir)
+# to $DockerArgs, echoes the command line, and exits the script with Docker's
+# exit code when the container fails. That exit is what makes 'validate' fail fast.
+function Invoke-DockerTask {
+    param([string[]]$DockerArgs)
+
+    $fullArgs = @(
+        'run', '--rm',
+        '--workdir', '/work'
+    ) + $Volumes + @(
+        '--env', 'CARGO_TARGET_DIR=/target'
+    ) + $DockerArgs
+
+    Write-Host "docker $($fullArgs -join ' ')" -ForegroundColor Cyan
+    & docker $fullArgs
+    $exitCode = $LASTEXITCODE
+    if ($exitCode -ne 0) {
+        Write-Host "Docker command failed with exit code $exitCode" -ForegroundColor Red
+        exit $exitCode
+    }
+}
+
+# --- Helper: run one named entry of $Steps inside the container ---
+function Invoke-CargoStep {
+    param([string]$Name)
+
+    Invoke-DockerTask -DockerArgs @(
+        '--volume', "${RepoRoot}:/work",
+        $Image,
+        'sh', '-c',
+        $Steps[$Name].Command
+    )
+}
+
+# --- Ensure Docker volumes exist ---
+# Native commands do not honour $ErrorActionPreference, so check the exit code
+# explicitly; otherwise an unreachable daemon looks like "no volumes yet".
+$existingVolumes = @(docker volume ls --format '{{.Name}}')
+if ($LASTEXITCODE -ne 0) {
+    Write-Host "Could not list Docker volumes (exit code $LASTEXITCODE). Is Docker installed and running?" -ForegroundColor Red
+    exit $LASTEXITCODE
+}
+foreach ($vol in $VolumeNames) {
+    if ($existingVolumes -notcontains $vol) {
+        Write-Host "Creating Docker volume: $vol" -ForegroundColor Yellow
+        docker volume create $vol | Out-Null
+        if ($LASTEXITCODE -ne 0) {
+            Write-Host "Failed to create Docker volume: $vol (exit code $LASTEXITCODE)" -ForegroundColor Red
+            exit $LASTEXITCODE
+        }
+    }
+}
+
+# --- Task dispatch ---
+switch ($Task) {
+    'validate' {
+        Write-Host "=== Running full validation: fmt → check → clippy → test ===" -ForegroundColor Green
+
+        $stepNumber = 0
+        foreach ($name in $Steps.Keys) {
+            $stepNumber++
+            Write-Host "`n--- Step $stepNumber/$($Steps.Count): $($Steps[$name].Title) ---" -ForegroundColor Cyan
+            # Invoke-DockerTask exits on failure, so later steps never run after a failed one.
+            Invoke-CargoStep -Name $name
+        }
+
+        Write-Host "`n=== All validation steps passed ===" -ForegroundColor Green
+    }
+
+    'shell' {
+        Write-Host "=== Starting interactive shell in container ===" -ForegroundColor Green
+        $fullArgs = @(
+            'run', '--rm', '-it',
+            '--workdir', '/work'
+        ) + $Volumes + @(
+            '--env', 'CARGO_TARGET_DIR=/target',
+            '--volume', "${RepoRoot}:/work",
+            $Image,
+            'bash'
+        )
+        Write-Host "docker $($fullArgs -join ' ')" -ForegroundColor Cyan
+        & docker $fullArgs
+        $exitCode = $LASTEXITCODE
+        if ($exitCode -ne 0) {
+            Write-Host "Docker shell exited with code $exitCode" -ForegroundColor Red
+            exit $exitCode
+        }
+    }
+
+    default {
+        # fmt, check, clippy, test: ValidateSet guarantees $Task is a key of $Steps here.
+        Write-Host "=== $($Steps[$Task].Title) ===" -ForegroundColor Green
+        Invoke-CargoStep -Name $Task
+    }
+}

From 50a7e651220e48826daf18aa572da8dee13d1aa7 Mon Sep 17 00:00:00 2001
From: Zireael <3856578+Zireael@users.noreply.github.com>
Date: Sun, 24 May 2026 11:49:24 +0200
Subject: [PATCH 02/38] =?UTF-8?q?aft-t6p.7:=20provider=20capabilities=20?=
 =?UTF-8?q?=E2=80=94=20config=20profiles,=20dimension=20pass-through,=20fi?=
 =?UTF-8?q?ngerprint=20upgrade?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .gitignore                                    |   6 +
 crates/aft/src/commands/configure.rs          |   5 +-
 crates/aft/src/config.rs                      | 125 +++++-
 crates/aft/src/semantic_index.rs              | 418 +++++++++++++++++-
 .../integration/file_summary_chunks_test.rs   |   4 +
 .../tests/integration/semantic_disk_test.rs   |  13 +-
 6 files changed, 561 insertions(+), 10 deletions(-)

diff --git a/.gitignore b/.gitignore
index 325e2973..cc74f600 100644
--- a/.gitignore +++ b/.gitignore @@ -81,3 +81,9 @@ packages/npm/*/bin/aft.exe smoke-tests/ .aft-windows-vm benchmarks/aft-search/.bench/ + +# Beads / Dolt files (added by bd init) +.dolt/ +*.db +.beads-credential-key +.beads/proxieddb/ diff --git a/crates/aft/src/commands/configure.rs b/crates/aft/src/commands/configure.rs index 0dace206..e8213340 100644 --- a/crates/aft/src/commands/configure.rs +++ b/crates/aft/src/commands/configure.rs @@ -23,7 +23,7 @@ use crate::search_index::{ build_path_filters, current_git_head, project_cache_key, resolve_cache_dir, walk_project_files, CacheLock, SearchIndex, }; -use crate::semantic_index::{SemanticIndex, SemanticIndexLock}; +use crate::semantic_index::{EmbeddingModelProfile, SemanticIndex, SemanticIndexLock}; use crate::{slog_info, slog_warn}; static WATCHER_GENERATION: AtomicU64 = AtomicU64::new(0); @@ -1650,7 +1650,8 @@ pub fn handle_configure(req: &RawRequest, ctx: &AppContext) -> Response { }); let mut model = crate::semantic_index::EmbeddingModel::from_config(&semantic_config)?; - let fingerprint = model.fingerprint(&semantic_config)?; + let profile = EmbeddingModelProfile::from_config(&semantic_config); + let fingerprint = model.fingerprint(&semantic_config, profile.as_ref())?; let fingerprint_key = fingerprint.as_string(); let _semantic_cache_lock = (!is_worktree_bridge_for_semantic) .then(|| ()) diff --git a/crates/aft/src/config.rs b/crates/aft/src/config.rs index 8eabb055..1d3cb74c 100644 --- a/crates/aft/src/config.rs +++ b/crates/aft/src/config.rs @@ -37,6 +37,103 @@ impl SemanticBackend { } } +/// The encoding format returned by the embedding provider. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] +#[serde(rename_all = "snake_case")] +pub enum OutputEncoding { + /// Standard float32 embeddings (default for most providers). + Float, + /// Base64-encoded signed int8 embeddings (e.g. Perplexity, some OpenAI-compatible). 
+ #[serde(rename = "base64_int8")] + Base64Int8, + /// Base64-encoded binary packed embeddings (e.g. Perplexity binary). + #[serde(rename = "base64_binary")] + Base64Binary, +} + +impl OutputEncoding { + /// Default encoding for a given backend. + pub fn default_for_backend(backend: SemanticBackend) -> Self { + match backend { + SemanticBackend::Fastembed => Self::Float, + SemanticBackend::OpenAiCompatible => Self::Float, + SemanticBackend::Ollama => Self::Float, + } + } +} + +/// How embedding inputs are structured for the provider. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] +#[serde(rename_all = "snake_case")] +pub enum InputMode { + /// Simple array of text strings. + #[serde(rename = "flat_texts")] + FlatTexts, + /// Grouped document-chunk inputs (e.g. Perplexity contextualized). + #[serde(rename = "document_chunks")] + DocumentChunks, +} + +impl InputMode { + pub fn default_for_backend(backend: SemanticBackend) -> Self { + match backend { + SemanticBackend::Fastembed => Self::FlatTexts, + SemanticBackend::OpenAiCompatible => Self::FlatTexts, + SemanticBackend::Ollama => Self::FlatTexts, + } + } +} + +/// How vectors are stored in the local index after retrieval. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] +#[serde(rename_all = "snake_case")] +pub enum StorageStrategy { + /// Native f32 vectors stored as-is (default for Float output encoding). + #[serde(rename = "native_f32")] + NativeF32, + /// Decode int8 to f32 and L2-normalize before storage (compatibility path for base64_int8). + #[serde(rename = "decode_normalize_f32")] + DecodeNormalizeF32, +} + +impl StorageStrategy { + pub fn default_for_backend(backend: SemanticBackend) -> Self { + match backend { + SemanticBackend::Fastembed => Self::NativeF32, + SemanticBackend::OpenAiCompatible => Self::NativeF32, + SemanticBackend::Ollama => Self::NativeF32, + } + } +} + +/// Distance metric for similarity search. 
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] +#[serde(rename_all = "snake_case")] +pub enum DistanceMetric { + /// Resolve from provider/model profile and storage strategy. + #[serde(rename = "auto")] + Auto, + /// Cosine similarity (default for normalized dense vectors). + Cosine, + /// Dot product. + #[serde(rename = "dot_product")] + DotProduct, + /// Euclidean distance. + Euclidean, + /// Hamming distance (for binary vectors). + Hamming, +} + +impl DistanceMetric { + pub fn default_for_backend(backend: SemanticBackend) -> Self { + match backend { + SemanticBackend::Fastembed => Self::Auto, + SemanticBackend::OpenAiCompatible => Self::Auto, + SemanticBackend::Ollama => Self::Auto, + } + } +} + #[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] pub struct SemanticBackendConfig { pub backend: SemanticBackend, @@ -45,8 +142,28 @@ pub struct SemanticBackendConfig { pub api_key_env: Option, pub timeout_ms: u64, pub max_batch_size: usize, + /// Optional user-requested embedding dimensions. When set, the provider + /// is asked to return vectors of this dimension (if supported). + /// When unset, the provider's default dimension is used. + #[serde(default, skip_serializing_if = "Option::is_none")] + pub dimensions: Option, + /// Optional output encoding format from the provider. + /// Defaults to `float` for all built-in backends. + #[serde(default, skip_serializing_if = "Option::is_none")] + pub output_encoding: Option, + /// Optional input mode for the provider. + /// Defaults to `flat_texts` for all built-in backends. + #[serde(default, skip_serializing_if = "Option::is_none")] + pub input_mode: Option, + /// Optional storage strategy for how vectors are stored locally. + /// Defaults to `native_f32` for all built-in backends. + #[serde(default, skip_serializing_if = "Option::is_none")] + pub storage_strategy: Option, + /// Optional distance metric for similarity search. 
+ /// Defaults to `auto` which resolves from provider/model profile. + #[serde(default, skip_serializing_if = "Option::is_none")] + pub distance_metric: Option, } - #[derive(Debug, Clone, Default, PartialEq, Eq, Serialize, Deserialize)] pub struct UserServerDef { pub id: String, @@ -70,10 +187,14 @@ impl Default for SemanticBackendConfig { // semantic_search requests when callers do not set an explicit timeout. timeout_ms: 25_000, max_batch_size: 64, + dimensions: None, + output_encoding: None, + input_mode: None, + storage_strategy: None, + distance_metric: None, } } } - pub const DEFAULT_SEMANTIC_MODEL: &str = "all-MiniLM-L6-v2"; impl Config { diff --git a/crates/aft/src/semantic_index.rs b/crates/aft/src/semantic_index.rs index 44844497..e9ca24e3 100644 --- a/crates/aft/src/semantic_index.rs +++ b/crates/aft/src/semantic_index.rs @@ -1,5 +1,8 @@ use crate::cache_freshness::{self, FileFreshness, FreshnessVerdict}; -use crate::config::{SemanticBackend, SemanticBackendConfig}; +use crate::config::{ + DistanceMetric, InputMode, OutputEncoding, SemanticBackend, SemanticBackendConfig, + StorageStrategy, +}; use crate::fs_lock; use crate::parser::{detect_language, extract_symbols_from_tree, grammar_for}; use crate::search_index::{cache_relative_path, cached_path_under_root}; @@ -49,7 +52,284 @@ const SEMANTIC_INDEX_VERSION_V5: u8 = 5; const SEMANTIC_INDEX_VERSION_V6: u8 = 6; const DEFAULT_OPENAI_EMBEDDING_PATH: &str = "/embeddings"; const DEFAULT_OLLAMA_EMBEDDING_PATH: &str = "/api/embed"; -// Must stay below the bridge timeout (30s) to avoid bridge kills on slow backends. + +// ---- Typed vector representation types ---- + +/// The kind of vector as emitted by the embedding provider. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] +#[serde(rename_all = "snake_case")] +pub enum VectorKind { + /// Standard dense f32 vector (most providers). + DenseF32, + /// Dense int8 vector (e.g. Perplexity base64_int8). 
+ DenseInt8, + /// Binary packed vector (e.g. Perplexity base64_binary). + BinaryPacked, +} + +/// Normalization policy for stored vectors. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] +#[serde(rename_all = "snake_case")] +pub enum NormalizationPolicy { + /// Vector is already L2-normalized by the provider. + AlreadyNormalized, + /// AFT must L2-normalize on insert and query. + NormalizeOnInsertQuery, + /// Normalization is not applicable (e.g. binary vectors). + NotApplicable, +} + +/// A profile describing the capabilities and expected output of an embedding model. +/// +/// Used to validate that user configuration is compatible with the selected +/// provider/model before indexing starts. +#[derive(Debug, Clone)] +pub struct EmbeddingModelProfile { + /// Which semantic backend this profile applies to. + pub backend: SemanticBackend, + /// Model name (may be empty for generic profiles). + pub model: Option, + /// Supported input mode. + pub input_mode: InputMode, + /// Expected output encoding from the provider. + pub output_encoding: OutputEncoding, + /// The kind of vectors the provider emits. + pub source_vector_kind: VectorKind, + /// The kind of vectors stored after AFT conversion. + pub stored_vector_kind: VectorKind, + /// Metric that should be used for similarity search. + pub metric: DistanceMetric, + /// Normalization policy for stored vectors. + pub normalization: NormalizationPolicy, + /// Supported dimension range: (min, max). None if unknown. + pub dimension_range: Option<(usize, usize)>, + /// Default dimension when not specified. None if unknown. + pub default_dimensions: Option, + /// Whether Matryoshka Representation Learning (reduced dimensions) is supported. + pub mrl_supported: bool, + /// Whether contextualized document-chunk inputs are supported. + pub contextualized_supported: bool, +} + +impl EmbeddingModelProfile { + /// Returns a profile for the fastembed all-MiniLM-L6-v2 model. 
+ pub fn fastembed_minilm() -> Self { + Self { + backend: SemanticBackend::Fastembed, + model: Some("all-MiniLM-L6-v2".to_string()), + input_mode: InputMode::FlatTexts, + output_encoding: OutputEncoding::Float, + source_vector_kind: VectorKind::DenseF32, + stored_vector_kind: VectorKind::DenseF32, + metric: DistanceMetric::Cosine, + normalization: NormalizationPolicy::AlreadyNormalized, + dimension_range: Some((384, 384)), + default_dimensions: Some(384), + mrl_supported: false, + contextualized_supported: false, + } + } + + /// Returns a generic profile for OpenAI-compatible embedding providers. + /// These may support `dimensions` depending on the model. + pub fn openai_compatible_generic() -> Self { + Self { + backend: SemanticBackend::OpenAiCompatible, + model: None, + input_mode: InputMode::FlatTexts, + output_encoding: OutputEncoding::Float, + source_vector_kind: VectorKind::DenseF32, + stored_vector_kind: VectorKind::DenseF32, + metric: DistanceMetric::Auto, + normalization: NormalizationPolicy::AlreadyNormalized, + dimension_range: None, + default_dimensions: None, + mrl_supported: true, + contextualized_supported: false, + } + } + + /// Returns a generic profile for Ollama embedding models. + pub fn ollama_generic() -> Self { + Self { + backend: SemanticBackend::Ollama, + model: None, + input_mode: InputMode::FlatTexts, + output_encoding: OutputEncoding::Float, + source_vector_kind: VectorKind::DenseF32, + stored_vector_kind: VectorKind::DenseF32, + metric: DistanceMetric::Auto, + normalization: NormalizationPolicy::AlreadyNormalized, + dimension_range: None, + default_dimensions: None, + mrl_supported: false, + contextualized_supported: false, + } + } + + /// Look up a profile for the given config. + /// Returns `None` if no specific profile is known (caller should use defaults). 
+    pub fn from_config(config: &SemanticBackendConfig) -> Option<Self> {
+        match config.backend {
+            SemanticBackend::Fastembed => {
+                if config.model == "all-MiniLM-L6-v2" {
+                    Some(Self::fastembed_minilm())
+                } else {
+                    None
+                }
+            }
+            SemanticBackend::OpenAiCompatible => Some(Self::openai_compatible_generic()),
+            SemanticBackend::Ollama => Some(Self::ollama_generic()),
+        }
+    }
+
+    /// Validate that the configured options are compatible with this profile.
+    ///
+    /// Options left unset in `config` are resolved to the backend defaults
+    /// before they are checked. Every incompatibility is collected, so a single
+    /// call reports all problems instead of only the first one.
+    ///
+    /// Returns `Ok(())` when nothing is wrong, otherwise `Err` holding one
+    /// human-readable message per problem, each prefixed with `semantic.`.
+    pub fn validate_config(&self, config: &SemanticBackendConfig) -> Result<(), Vec<String>> {
+        let mut errors: Vec<String> = Vec::new();
+        let cfg_prefix = "semantic";
+
+        // Resolve effective output encoding
+        let output_encoding = config
+            .output_encoding
+            .unwrap_or(OutputEncoding::default_for_backend(config.backend));
+
+        // Resolve effective storage strategy
+        let storage_strategy = config
+            .storage_strategy
+            .unwrap_or(StorageStrategy::default_for_backend(config.backend));
+
+        // Check input mode compatibility
+        let input_mode = config
+            .input_mode
+            .unwrap_or(InputMode::default_for_backend(config.backend));
+        if input_mode == InputMode::DocumentChunks && !self.contextualized_supported {
+            errors.push(format!(
+                "{}.input_mode=document_chunks is not supported by backend {}",
+                cfg_prefix,
+                config.backend.as_str()
+            ));
+        }
+
+        // Check output encoding compatibility. The requested encoding must be the
+        // one this profile emits; the only exception is base64_int8 on
+        // OpenAI-compatible backends (e.g. Perplexity). Any other mismatch,
+        // including base64_int8 on Ollama or Fastembed, is rejected.
+        let int8_on_openai_compatible = output_encoding == OutputEncoding::Base64Int8
+            && matches!(config.backend, SemanticBackend::OpenAiCompatible);
+        if output_encoding != self.output_encoding && !int8_on_openai_compatible {
+            errors.push(format!(
+                "{}.output_encoding={:?} is not supported by backend {}",
+                cfg_prefix,
+                output_encoding,
+                config.backend.as_str()
+            ));
+        }
+
+        // Check storage strategy compatibility
+        match (output_encoding, storage_strategy) {
+            (OutputEncoding::Float, StorageStrategy::NativeF32) => {}
+            (OutputEncoding::Base64Int8, StorageStrategy::DecodeNormalizeF32) => {}
+            (OutputEncoding::Base64Binary, _) => {
+                errors.push(format!(
+                    "{}.output_encoding=base64_binary requires a native binary vector store, not available in MVP",
+                    cfg_prefix
+                ));
+            }
+            _ => {
+                errors.push(format!(
+                    "{}.storage_strategy={:?} is not compatible with output_encoding={:?}",
+                    cfg_prefix, storage_strategy, output_encoding
+                ));
+            }
+        }
+
+        // Check dimensions against profile
+        if let Some(dimensions) = config.dimensions {
+            // A zero-length vector is never valid, even when the profile has no
+            // known range to compare against.
+            if dimensions == 0 {
+                errors.push(format!(
+                    "{}.dimensions must be greater than 0",
+                    cfg_prefix
+                ));
+            }
+            if let Some((min_dim, max_dim)) = self.dimension_range {
+                if dimensions < min_dim || dimensions > max_dim {
+                    errors.push(format!(
+                        "{}.dimensions={} is outside supported range {}-{} for {} {}",
+                        cfg_prefix,
+                        dimensions,
+                        min_dim,
+                        max_dim,
+                        config.backend.as_str(),
+                        config.model
+                    ));
+                }
+            }
+            // Asking for the model's native size is not a reduction, so it is
+            // allowed even when the model cannot produce reduced dimensions.
+            if !self.mrl_supported && self.default_dimensions != Some(dimensions) {
+                errors.push(format!(
+                    "{}.dimensions is set but the model does not support reduced dimensions",
+                    cfg_prefix
+                ));
+            }
+        }
+
+        if errors.is_empty() {
+            Ok(())
+        } else {
+            Err(errors)
+        }
+    }
+}
+
+/// Resolve an effective distance metric from config and profile.
+/// When `DistanceMetric::Auto` is configured, returns the profile's recommended metric.
+pub fn resolve_distance_metric( + config: &SemanticBackendConfig, + profile: Option<&EmbeddingModelProfile>, +) -> DistanceMetric { + if let Some(metric) = config.distance_metric { + if metric != DistanceMetric::Auto { + return metric; + } + } + // Auto: resolve from profile + if let Some(profile) = profile { + profile.metric + } else { + // Fallback to cosine for unknown profiles + DistanceMetric::Cosine + } +} + +/// Resolve effective output encoding from config. +pub fn resolve_output_encoding(config: &SemanticBackendConfig) -> OutputEncoding { + config + .output_encoding + .unwrap_or(OutputEncoding::default_for_backend(config.backend)) +} + +/// Resolve effective storage strategy from config. +pub fn resolve_storage_strategy(config: &SemanticBackendConfig) -> StorageStrategy { + config + .storage_strategy + .unwrap_or(StorageStrategy::default_for_backend(config.backend)) +} + +/// Resolve effective input mode from config. +pub fn resolve_input_mode(config: &SemanticBackendConfig) -> InputMode { + config + .input_mode + .unwrap_or(InputMode::default_for_backend(config.backend)) +} + +/// Resolve effective dimensions from config with profile fallback. +pub fn resolve_dimensions( + config: &SemanticBackendConfig, + profile: Option<&EmbeddingModelProfile>, +) -> Option { + config + .dimensions + .or_else(|| profile.and_then(|p| p.default_dimensions)) +} // Must stay below the bridge timeout (30s) to avoid bridge kills on slow backends. const DEFAULT_OPENAI_EMBEDDING_TIMEOUT_MS: u64 = 25_000; const DEFAULT_MAX_BATCH_SIZE: usize = 64; const QUERY_EMBEDDING_CACHE_CAP: usize = 1_000; @@ -90,14 +370,34 @@ pub struct SemanticIndexFingerprint { pub dimension: usize, #[serde(default = "default_chunking_version")] pub chunking_version: u32, + /// Output encoding used for this index. + #[serde(default)] + pub output_encoding: String, + /// Storage strategy used for this index. + #[serde(default)] + pub storage_strategy: String, + /// Resolved distance metric for this index. 
+ #[serde(default = "default_dot_auto")] + pub distance_metric: String, + /// Input mode used for this index. + #[serde(default)] + pub input_mode: String, } fn default_chunking_version() -> u32 { 2 } +fn default_dot_auto() -> String { + "auto".to_string() +} + impl SemanticIndexFingerprint { - fn from_config(config: &SemanticBackendConfig, dimension: usize) -> Self { + fn from_config( + config: &SemanticBackendConfig, + dimension: usize, + profile: Option<&EmbeddingModelProfile>, + ) -> Self { // Use normalized URL for fingerprinting so cosmetic differences // (e.g. "http://host/v1" vs "http://host/v1/") don't cause rebuilds. let base_url = config @@ -111,6 +411,10 @@ impl SemanticIndexFingerprint { base_url, dimension, chunking_version: default_chunking_version(), + output_encoding: resolve_output_encoding(config).to_string(), + storage_strategy: resolve_storage_strategy(config).to_string(), + distance_metric: resolve_distance_metric(config, profile).to_string(), + input_mode: resolve_input_mode(config).to_string(), } } @@ -146,6 +450,16 @@ pub struct SemanticEmbeddingModel { timeout_ms: u64, max_batch_size: usize, dimension: Option, + /// User-requested dimension from config (None = use provider default). + config_dimensions: Option, + /// Resolved output encoding for this model. + output_encoding: OutputEncoding, + /// Resolved storage strategy for this model. + storage_strategy: StorageStrategy, + /// Resolved distance metric for this model. + distance_metric: DistanceMetric, + /// Resolved input mode for this model. 
+ input_mode: InputMode, engine: SemanticEmbeddingEngine, query_embedding_cache: HashMap>, query_embedding_cache_order: VecDeque, @@ -406,6 +720,68 @@ where unreachable!("embedding request retries exhausted without returning") } +// ---- Display impls for capability types ---- + +impl std::fmt::Display for VectorKind { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + Self::DenseF32 => write!(f, "dense_f32"), + Self::DenseInt8 => write!(f, "dense_int8"), + Self::BinaryPacked => write!(f, "binary_packed"), + } + } +} + +impl std::fmt::Display for NormalizationPolicy { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + Self::AlreadyNormalized => write!(f, "already_normalized"), + Self::NormalizeOnInsertQuery => write!(f, "normalize_on_insert_query"), + Self::NotApplicable => write!(f, "not_applicable"), + } + } +} + +impl std::fmt::Display for OutputEncoding { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + Self::Float => write!(f, "float"), + Self::Base64Int8 => write!(f, "base64_int8"), + Self::Base64Binary => write!(f, "base64_binary"), + } + } +} + +impl std::fmt::Display for InputMode { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + Self::FlatTexts => write!(f, "flat_texts"), + Self::DocumentChunks => write!(f, "document_chunks"), + } + } +} + +impl std::fmt::Display for StorageStrategy { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + Self::NativeF32 => write!(f, "native_f32"), + Self::DecodeNormalizeF32 => write!(f, "decode_normalize_f32"), + } + } +} + +impl std::fmt::Display for DistanceMetric { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + Self::Auto => write!(f, "auto"), + Self::Cosine => write!(f, "cosine"), + Self::DotProduct => write!(f, "dot_product"), + Self::Euclidean => write!(f, "euclidean"), + Self::Hamming => 
write!(f, "hamming"), + } + } +} + impl SemanticEmbeddingModel { pub fn from_config(config: &SemanticBackendConfig) -> Result { let timeout_ms = if config.timeout_ms == 0 { @@ -475,6 +851,11 @@ impl SemanticEmbeddingModel { timeout_ms, max_batch_size, dimension: None, + config_dimensions: config.dimensions, + output_encoding: resolve_output_encoding(config), + storage_strategy: resolve_storage_strategy(config), + distance_metric: DistanceMetric::Auto, + input_mode: resolve_input_mode(config), engine, query_embedding_cache: HashMap::new(), query_embedding_cache_order: VecDeque::new(), @@ -506,9 +887,14 @@ impl SemanticEmbeddingModel { pub fn fingerprint( &mut self, config: &SemanticBackendConfig, + profile: Option<&EmbeddingModelProfile>, ) -> Result { let dimension = self.dimension()?; - Ok(SemanticIndexFingerprint::from_config(config, dimension)) + // Resolve distance metric (auto -> profile) + self.distance_metric = resolve_distance_metric(config, profile); + Ok(SemanticIndexFingerprint::from_config( + config, dimension, profile, + )) } pub fn dimension(&mut self) -> Result { @@ -600,10 +986,16 @@ impl SemanticEmbeddingModel { } => { let expected_text_count = texts.len(); let endpoint = build_openai_embeddings_endpoint(base_url); - let body = serde_json::json!({ + + let mut body = serde_json::json!({ "input": texts, "model": model, }); + // Conditionally add dimensions when user-configured or when + // we already know the dimension from a previous probe. 
+ if let Some(dims) = self.config_dimensions.or(self.dimension) { + body["dimensions"] = serde_json::json!(dims); + } let raw = send_embedding_request( || { @@ -2648,6 +3040,10 @@ mod tests { base_url: FALLBACK_BACKEND.to_string(), dimension: 4, chunking_version: default_chunking_version(), + output_encoding: "float".to_string(), + storage_strategy: "native_f32".to_string(), + distance_metric: "auto".to_string(), + input_mode: "flat_texts".to_string(), }); let bytes = index.to_bytes(); @@ -3223,6 +3619,10 @@ mod tests { base_url: "http://127.0.0.1:1234/v1".to_string(), dimension: 3, chunking_version: default_chunking_version(), + output_encoding: "float".to_string(), + storage_strategy: "native_f32".to_string(), + distance_metric: "auto".to_string(), + input_mode: "flat_texts".to_string(), }); index.write_to_disk(storage.path(), project_key); @@ -3242,6 +3642,10 @@ mod tests { base_url: "http://127.0.0.1:11434".to_string(), dimension: 3, chunking_version: default_chunking_version(), + output_encoding: "float".to_string(), + storage_strategy: "native_f32".to_string(), + distance_metric: "auto".to_string(), + input_mode: "flat_texts".to_string(), } .as_string(); assert!(SemanticIndex::read_from_disk( @@ -3286,6 +3690,10 @@ mod tests { base_url: FALLBACK_BACKEND.to_string(), dimension: 3, chunking_version: default_chunking_version(), + output_encoding: "float".to_string(), + storage_strategy: "native_f32".to_string(), + distance_metric: "auto".to_string(), + input_mode: "flat_texts".to_string(), }; index.set_fingerprint(fingerprint.clone()); diff --git a/crates/aft/tests/integration/file_summary_chunks_test.rs b/crates/aft/tests/integration/file_summary_chunks_test.rs index b417f0d1..09c1d8ae 100644 --- a/crates/aft/tests/integration/file_summary_chunks_test.rs +++ b/crates/aft/tests/integration/file_summary_chunks_test.rs @@ -129,6 +129,10 @@ fn reindex_roundtrip_after_chunking_version_bump_is_deterministic() { base_url: "none".to_string(), dimension: 1, 
chunking_version: 2, + output_encoding: "float".to_string(), + storage_strategy: "native_f32".to_string(), + distance_metric: "auto".to_string(), + input_mode: "flat_texts".to_string(), }; index.set_fingerprint(fingerprint.clone()); index.write_to_disk(storage.path(), "file-summary-roundtrip"); diff --git a/crates/aft/tests/integration/semantic_disk_test.rs b/crates/aft/tests/integration/semantic_disk_test.rs index f0240ef4..375edf25 100644 --- a/crates/aft/tests/integration/semantic_disk_test.rs +++ b/crates/aft/tests/integration/semantic_disk_test.rs @@ -286,9 +286,12 @@ fn read_from_disk_rebuilds_v1_cache_when_fingerprint_is_expected() { base_url: "none".to_string(), dimension: 3, chunking_version: 2, + output_encoding: "float".to_string(), + storage_strategy: "native_f32".to_string(), + distance_metric: "auto".to_string(), + input_mode: "flat_texts".to_string(), } .as_string(); - assert!(SemanticIndex::read_from_disk( storage.path(), "v1-project", @@ -380,6 +383,10 @@ fn read_from_disk_rebuilds_v2_cache_for_v4_snippets() { base_url: "none".to_string(), dimension: 4, chunking_version: 2, + output_encoding: "float".to_string(), + storage_strategy: "native_f32".to_string(), + distance_metric: "auto".to_string(), + input_mode: "flat_texts".to_string(), }; let fp_str = fingerprint.as_string(); let fp_bytes = fp_str.as_bytes(); @@ -457,6 +464,10 @@ fn from_bytes_rejects_corrupt_v3_cache_payloads() { base_url: "none".to_string(), dimension: 4, chunking_version: 2, + output_encoding: "float".to_string(), + storage_strategy: "native_f32".to_string(), + distance_metric: "auto".to_string(), + input_mode: "flat_texts".to_string(), }; let fp_bytes = fingerprint.as_string().into_bytes(); let mut bytes = Vec::new(); From 34073bedc18e3ad7229b3157dc7f4478be20490d Mon Sep 17 00:00:00 2001 From: Zireael <3856578+Zireael@users.noreply.github.com> Date: Sun, 24 May 2026 12:09:45 +0200 Subject: [PATCH 03/38] aft-t6p.1: embedding query/document prompt-template support --- 
crates/aft/src/commands/configure.rs | 33 +++++++++- crates/aft/src/commands/semantic_search.rs | 3 +- crates/aft/src/config.rs | 12 ++++ crates/aft/src/semantic_index.rs | 63 +++++++++++++++++-- .../integration/file_summary_chunks_test.rs | 1 + .../tests/integration/semantic_disk_test.rs | 3 + 6 files changed, 105 insertions(+), 10 deletions(-) diff --git a/crates/aft/src/commands/configure.rs b/crates/aft/src/commands/configure.rs index e8213340..645afe8b 100644 --- a/crates/aft/src/commands/configure.rs +++ b/crates/aft/src/commands/configure.rs @@ -1701,7 +1701,23 @@ pub fn handle_configure(req: &RawRequest, ctx: &AppContext) -> Response { } let mut cached = cached; - let mut embed = |texts: Vec| model.embed(texts); + let doc_template = semantic_config.document_prompt_template.clone(); + let mut embed = move |texts: Vec| { + let texts = if let Some(ref tpl) = doc_template { + texts + .iter() + .map(|t| { + crate::semantic_index::apply_document_template( + t, + Some(tpl), + ) + }) + .collect() + } else { + texts + }; + model.embed(texts) + }; let _ = tx_progress.send(SemanticIndexEvent::Progress { stage: "refreshing_stale_files".to_string(), files: None, @@ -1790,7 +1806,20 @@ pub fn handle_configure(req: &RawRequest, ctx: &AppContext) -> Response { )); } - let mut embed = |texts: Vec| model.embed(texts); + let doc_template = semantic_config.document_prompt_template.clone(); + let mut embed = move |texts: Vec| { + let texts = if let Some(ref tpl) = doc_template { + texts + .iter() + .map(|t| { + crate::semantic_index::apply_document_template(t, Some(tpl)) + }) + .collect() + } else { + texts + }; + model.embed(texts) + }; let _ = tx_progress.send(SemanticIndexEvent::Progress { stage: "extracting_symbols".to_string(), diff --git a/crates/aft/src/commands/semantic_search.rs b/crates/aft/src/commands/semantic_search.rs index d11277d5..f721c50c 100644 --- a/crates/aft/src/commands/semantic_search.rs +++ b/crates/aft/src/commands/semantic_search.rs @@ -178,9 +178,8 
@@ fn embed_query(query: &str, ctx: &AppContext) -> Result, String> { .as_mut() .ok_or_else(|| "embedding model was not initialized".to_string())?; let query_vector = model - .embed_query_cached(query) + .embed_query_cached(query, semantic_config.query_prompt_template.as_deref()) .map_err(|error| format!("failed to embed query: {error}"))?; - if let Some(index) = ctx.semantic_index().borrow().as_ref() { if index.dimension() != query_vector.len() { return Err(format!( diff --git a/crates/aft/src/config.rs b/crates/aft/src/config.rs index 1d3cb74c..98c2d2b6 100644 --- a/crates/aft/src/config.rs +++ b/crates/aft/src/config.rs @@ -163,6 +163,16 @@ pub struct SemanticBackendConfig { /// Defaults to `auto` which resolves from provider/model profile. #[serde(default, skip_serializing_if = "Option::is_none")] pub distance_metric: Option, + /// Optional template applied to user queries before embedding. + /// Use `{query}` as the placeholder for the raw query text. + /// Example: "Instruct: Given a code search query, retrieve relevant code snippet that answer the query\nQuery: {query}" + #[serde(default, skip_serializing_if = "Option::is_none")] + pub query_prompt_template: Option, + /// Optional template applied to document/chunk text before embedding. + /// Use `{text}` as the placeholder for the raw chunk text. 
+ /// Example: "Represent this code snippet for retrieval: {text}" + #[serde(default, skip_serializing_if = "Option::is_none")] + pub document_prompt_template: Option, } #[derive(Debug, Clone, Default, PartialEq, Eq, Serialize, Deserialize)] pub struct UserServerDef { @@ -192,6 +202,8 @@ impl Default for SemanticBackendConfig { input_mode: None, storage_strategy: None, distance_metric: None, + query_prompt_template: None, + document_prompt_template: None, } } } diff --git a/crates/aft/src/semantic_index.rs b/crates/aft/src/semantic_index.rs index e9ca24e3..f2f7a952 100644 --- a/crates/aft/src/semantic_index.rs +++ b/crates/aft/src/semantic_index.rs @@ -336,6 +336,38 @@ const QUERY_EMBEDDING_CACHE_CAP: usize = 1_000; const FALLBACK_BACKEND: &str = "none"; const EMBEDDING_REQUEST_MAX_ATTEMPTS: usize = 3; const EMBEDDING_REQUEST_BACKOFF_MS: [u64; 2] = [500, 1_000]; + +/// Apply a query prompt template to a raw query string. +/// Replaces `{query}` with the raw query text. +/// Returns the template with `{query}` replaced, or the raw query if template is None or missing placeholder. +pub fn apply_query_template(query: &str, template: Option<&str>) -> String { + match template { + Some(tpl) if tpl.contains("{query}") => tpl.replace("{query}", query), + Some(_) => query.to_string(), + None => query.to_string(), + } +} + +/// Apply a document prompt template to raw chunk text. +/// Replaces `{text}` with the raw chunk text. +/// Returns the template with `{text}` replaced, or the raw text if template is None or missing placeholder. +pub fn apply_document_template(text: &str, template: Option<&str>) -> String { + match template { + Some(tpl) if tpl.contains("{text}") => tpl.replace("{text}", text), + Some(_) => text.to_string(), + None => text.to_string(), + } +} + +/// Hash a prompt template with std's `DefaultHasher` and return it as a decimal string; returns an empty string when None. NOTE(review): `DefaultHasher`'s algorithm is not guaranteed to stay the same across Rust releases, and this value is stored in the index fingerprint (`document_prompt_hash`) — confirm that a toolchain upgrade invalidating cached indexes is acceptable, or switch to a fixed algorithm.
+pub fn prompt_template_hash(template: Option<&str>) -> String { + template.map_or(String::new(), |t| { + let mut hasher = std::collections::hash_map::DefaultHasher::new(); + use std::hash::{Hash, Hasher}; + t.hash(&mut hasher); + hasher.finish().to_string() + }) +} static SEMANTIC_LOCK_ACQUIRE_MUTEX: Mutex<()> = Mutex::new(()); pub struct SemanticIndexLock { @@ -382,6 +414,9 @@ pub struct SemanticIndexFingerprint { /// Input mode used for this index. #[serde(default)] pub input_mode: String, + /// Hash of the document prompt template (empty string when no document prompt is configured). + #[serde(default)] + pub document_prompt_hash: String, } fn default_chunking_version() -> u32 { @@ -415,6 +450,7 @@ impl SemanticIndexFingerprint { storage_strategy: resolve_storage_strategy(config).to_string(), distance_metric: resolve_distance_metric(config, profile).to_string(), input_mode: resolve_input_mode(config).to_string(), + document_prompt_hash: prompt_template_hash(config.document_prompt_template.as_deref()), } } @@ -938,14 +974,26 @@ impl SemanticEmbeddingModel { self.embed_texts(texts) } - pub fn embed_query_cached(&mut self, query: &str) -> Result, String> { - if let Some(vector) = self.query_embedding_cache.get(query) { + pub fn embed_query_cached( + &mut self, + query: &str, + query_prompt_template: Option<&str>, + ) -> Result, String> { + let prompt_hash = prompt_template_hash(query_prompt_template); + let cache_key = if prompt_hash.is_empty() { + query.to_string() + } else { + format!("{prompt_hash}:{query}") + }; + + if let Some(vector) = self.query_embedding_cache.get(&cache_key) { self.query_embedding_cache_hits += 1; return Ok(vector.clone()); } self.query_embedding_cache_misses += 1; - let embeddings = self.embed_texts(vec![query.to_string()])?; + let prefixed_query = apply_query_template(query, query_prompt_template); + let embeddings = self.embed_texts(vec![prefixed_query])?; let vector = embeddings .first() .cloned() @@ -957,9 +1005,8 @@ impl 
SemanticEmbeddingModel { } } self.query_embedding_cache - .insert(query.to_string(), vector.clone()); - self.query_embedding_cache_order - .push_back(query.to_string()); + .insert(cache_key.clone(), vector.clone()); + self.query_embedding_cache_order.push_back(cache_key); Ok(vector) } @@ -3044,6 +3091,7 @@ mod tests { storage_strategy: "native_f32".to_string(), distance_metric: "auto".to_string(), input_mode: "flat_texts".to_string(), + document_prompt_hash: String::new(), }); let bytes = index.to_bytes(); @@ -3623,6 +3671,7 @@ mod tests { storage_strategy: "native_f32".to_string(), distance_metric: "auto".to_string(), input_mode: "flat_texts".to_string(), + document_prompt_hash: String::new(), }); index.write_to_disk(storage.path(), project_key); @@ -3646,6 +3695,7 @@ mod tests { storage_strategy: "native_f32".to_string(), distance_metric: "auto".to_string(), input_mode: "flat_texts".to_string(), + document_prompt_hash: String::new(), } .as_string(); assert!(SemanticIndex::read_from_disk( @@ -3694,6 +3744,7 @@ mod tests { storage_strategy: "native_f32".to_string(), distance_metric: "auto".to_string(), input_mode: "flat_texts".to_string(), + document_prompt_hash: String::new(), }; index.set_fingerprint(fingerprint.clone()); diff --git a/crates/aft/tests/integration/file_summary_chunks_test.rs b/crates/aft/tests/integration/file_summary_chunks_test.rs index 09c1d8ae..091cad1a 100644 --- a/crates/aft/tests/integration/file_summary_chunks_test.rs +++ b/crates/aft/tests/integration/file_summary_chunks_test.rs @@ -133,6 +133,7 @@ fn reindex_roundtrip_after_chunking_version_bump_is_deterministic() { storage_strategy: "native_f32".to_string(), distance_metric: "auto".to_string(), input_mode: "flat_texts".to_string(), + document_prompt_hash: String::new(), }; index.set_fingerprint(fingerprint.clone()); index.write_to_disk(storage.path(), "file-summary-roundtrip"); diff --git a/crates/aft/tests/integration/semantic_disk_test.rs 
b/crates/aft/tests/integration/semantic_disk_test.rs index 375edf25..1dc0bb40 100644 --- a/crates/aft/tests/integration/semantic_disk_test.rs +++ b/crates/aft/tests/integration/semantic_disk_test.rs @@ -290,6 +290,7 @@ fn read_from_disk_rebuilds_v1_cache_when_fingerprint_is_expected() { storage_strategy: "native_f32".to_string(), distance_metric: "auto".to_string(), input_mode: "flat_texts".to_string(), + document_prompt_hash: String::new(), } .as_string(); assert!(SemanticIndex::read_from_disk( @@ -387,6 +388,7 @@ fn read_from_disk_rebuilds_v2_cache_for_v4_snippets() { storage_strategy: "native_f32".to_string(), distance_metric: "auto".to_string(), input_mode: "flat_texts".to_string(), + document_prompt_hash: String::new(), }; let fp_str = fingerprint.as_string(); let fp_bytes = fp_str.as_bytes(); @@ -468,6 +470,7 @@ fn from_bytes_rejects_corrupt_v3_cache_payloads() { storage_strategy: "native_f32".to_string(), distance_metric: "auto".to_string(), input_mode: "flat_texts".to_string(), + document_prompt_hash: String::new(), }; let fp_bytes = fingerprint.as_string().into_bytes(); let mut bytes = Vec::new(); From f60a2a97949fae48fdaf7ae461168a8fd23102b6 Mon Sep 17 00:00:00 2001 From: Zireael <3856578+Zireael@users.noreply.github.com> Date: Sun, 24 May 2026 14:36:08 +0200 Subject: [PATCH 04/38] =?UTF-8?q?aft-t6p.15:=20semantic=20config=20trust?= =?UTF-8?q?=20boundary=20=E2=80=94=20TypeScript=20schema,=20warning=20fiel?= =?UTF-8?q?ds,=20tests?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../src/__tests__/config.test.ts | 58 ++++++++++++++++-- packages/opencode-plugin/src/config.ts | 61 ++++++++++++++++--- 2 files changed, 107 insertions(+), 12 deletions(-) diff --git a/packages/opencode-plugin/src/__tests__/config.test.ts b/packages/opencode-plugin/src/__tests__/config.test.ts index c2bdcfbe..27eec9d0 100644 --- a/packages/opencode-plugin/src/__tests__/config.test.ts +++ 
b/packages/opencode-plugin/src/__tests__/config.test.ts @@ -690,10 +690,61 @@ describe("loadAftConfig", () => { }, }); expect(result.stderr).toContain( - "Ignoring semantic.backend/base_url/api_key_env from project config (security: use user config for external backends)", + "Ignoring semantic.backend, base_url, api_key_env from project config (security: these semantic settings only honor user-level config)", ); }); + test("strips new semantic fields from project config with warning", () => { + const fixture = createConfigFixture(); + // User config with a backend + writeFileSync( + fixture.userConfigPath, + JSON.stringify({ + semantic: { + backend: "ollama", + base_url: "http://localhost:11434", + model: "mxbai-embed-large", + }, + }), + ); + // Project config tries to set all the new restricted fields + writeFileSync( + fixture.projectConfigPath, + JSON.stringify({ + semantic: { + output_encoding: "binary", + storage_strategy: "binary_pack", + input_mode: "contextualized", + dimensions: 256, + distance_metric: "dot", + query_prompt_template: "inject {{query}}", + document_prompt_template: "inject {{document}}", + }, + }), + ); + + const result = runConfigLoader(fixture.projectDirectory, { + HOME: join(fixture.root, "home"), + XDG_CONFIG_HOME: fixture.xdgConfigHome, + }); + + const config = JSON.parse(result.stdout); + // User's settings must survive + expect(config.semantic.backend).toBe("ollama"); + expect(config.semantic.model).toBe("mxbai-embed-large"); + // Project's new fields must be stripped + expect(config.semantic.output_encoding).toBeUndefined(); + expect(config.semantic.storage_strategy).toBeUndefined(); + expect(config.semantic.input_mode).toBeUndefined(); + expect(config.semantic.dimensions).toBeUndefined(); + expect(config.semantic.distance_metric).toBeUndefined(); + expect(config.semantic.query_prompt_template).toBeUndefined(); + expect(config.semantic.document_prompt_template).toBeUndefined(); + // Warning must mention the new fields + 
expect(result.stderr).toContain("Ignoring semantic.output_encoding, storage_strategy, input_mode"); + expect(result.stderr).toContain("Ignoring semantic."); + }); + test("blocks exfiltration when project config has ONLY sensitive semantic fields (no safe fields)", () => { const fixture = createConfigFixture(); // User has a real external backend configured @@ -730,11 +781,10 @@ describe("loadAftConfig", () => { expect(config.semantic.base_url).toBe("http://localhost:11434"); expect(config.semantic.model).toBe("mxbai-embed-large"); expect(config.semantic.api_key_env).toBeUndefined(); - expect(result.stderr).toContain("Ignoring semantic.backend/base_url/api_key_env"); + expect(result.stderr).toContain("Ignoring semantic.backend, base_url, api_key_env"); }); - test("partial safe-field override preserves user model", () => { - const fixture = createConfigFixture(); + test("partial safe-field override preserves user model", () => { const fixture = createConfigFixture(); writeFileSync( fixture.userConfigPath, JSON.stringify({ diff --git a/packages/opencode-plugin/src/config.ts b/packages/opencode-plugin/src/config.ts index 19dc958d..df7f0417 100644 --- a/packages/opencode-plugin/src/config.ts +++ b/packages/opencode-plugin/src/config.ts @@ -36,6 +36,18 @@ const CheckerEnum = z.enum([ const SemanticBackendEnum = z.enum(["fastembed", "openai_compatible", "ollama"]); +/** Output encoding mode for embeddings. */ +const SemanticOutputEncodingEnum = z.enum(["float", "binary", "ubinary", "int8", "uint8"]); + +/** Storage strategy for embedding vectors. */ +const SemanticStorageStrategyEnum = z.enum(["flat", "binary_pack"]); + +/** Input mode for document chunking before embedding. */ +const SemanticInputModeEnum = z.enum(["flat_texts", "chunk_extracts", "contextualized"]); + +/** Distance metric for similarity search. 
*/ +const SemanticDistanceMetricEnum = z.enum(["cosine", "dot", "hamming"]); + const SemanticConfigSchema = z.object({ /** Semantic backend type: local fastembed, OpenAI-compatible API, or Ollama. */ backend: SemanticBackendEnum.optional(), @@ -49,8 +61,21 @@ const SemanticConfigSchema = z.object({ timeout_ms: z.number().int().positive().optional(), /** Maximum batch size used by the semantic pipeline. */ max_batch_size: z.number().int().positive().optional(), + /** Output encoding for embedding vectors: "float" (default), "binary", "ubinary", "int8", or "uint8". */ + output_encoding: SemanticOutputEncodingEnum.optional(), + /** Storage strategy: "flat" (default) or "binary_pack". */ + storage_strategy: SemanticStorageStrategyEnum.optional(), + /** Input mode for document processing: "flat_texts" (default), "chunk_extracts", or "contextualized". */ + input_mode: SemanticInputModeEnum.optional(), + /** Embedding dimension count (for providers that support variable dimensions). */ + dimensions: z.number().int().positive().optional(), + /** Distance metric: "cosine" (default), "dot", or "hamming". */ + distance_metric: SemanticDistanceMetricEnum.optional(), + /** Optional query prompt template (applied before embedding queries). */ + query_prompt_template: z.string().optional(), + /** Optional document prompt template (applied before embedding documents). */ + document_prompt_template: z.string().optional(), }); - const LspExtensionSchema = z .string() .trim() @@ -1027,8 +1052,31 @@ function getProjectLspStrippedKeys(lsp: AftConfig["lsp"]): string[] { } /** - * Top-level fields that are SAFE to inherit from project config. + * Semantic config fields that are USER-ONLY (security boundary). + * These fields control remote endpoints, vector storage, and prompt behavior — + * a hostile project config could weaponize any of them. * + * Returns a comma-separated list of the offending field names found in `semantic`, + * so the caller can generate a warning. 
Empty string means no restricted fields. + */ +function getStrippedSemanticKeys(semantic: AftConfig["semantic"]): string { + if (!semantic) return ""; + const stripped: string[] = []; + if (semantic.backend !== undefined) stripped.push("backend"); + if (semantic.base_url !== undefined) stripped.push("base_url"); + if (semantic.api_key_env !== undefined) stripped.push("api_key_env"); + if (semantic.output_encoding !== undefined) stripped.push("output_encoding"); + if (semantic.storage_strategy !== undefined) stripped.push("storage_strategy"); + if (semantic.input_mode !== undefined) stripped.push("input_mode"); + if (semantic.dimensions !== undefined) stripped.push("dimensions"); + if (semantic.distance_metric !== undefined) stripped.push("distance_metric"); + if (semantic.query_prompt_template !== undefined) stripped.push("query_prompt_template"); + if (semantic.document_prompt_template !== undefined) stripped.push("document_prompt_template"); + return stripped.join(", "); +} + +/** + * Top-level fields that are SAFE to inherit from project config. * * Anything NOT in this list flows from user config only. This is the * strict-allowlist trust boundary — adding a new field requires explicit * security review of whether a hostile repo could weaponize it. 
@@ -1177,13 +1225,10 @@ export function loadAftConfig(projectDirectory: string): AftConfig { // Override with project config const projectConfig = loadConfigFromPath(projectConfigPath); if (projectConfig) { - if ( - projectConfig.semantic?.backend !== undefined || - projectConfig.semantic?.base_url !== undefined || - projectConfig.semantic?.api_key_env !== undefined - ) { + const strippedSemanticKeys = getStrippedSemanticKeys(projectConfig.semantic); + if (strippedSemanticKeys) { warn( - "Ignoring semantic.backend/base_url/api_key_env from project config (security: use user config for external backends)", + `Ignoring semantic.${strippedSemanticKeys} from project config (security: these semantic settings only honor user-level config)`, ); } const strippedLspKeys = getProjectLspStrippedKeys(projectConfig.lsp); From 0f640ca9640f9b4657a89eb496303be5217195cb Mon Sep 17 00:00:00 2001 From: Zireael <3856578+Zireael@users.noreply.github.com> Date: Mon, 25 May 2026 13:46:22 +0200 Subject: [PATCH 05/38] =?UTF-8?q?aft-t6p.8:=20semantic=20index=20lifecycle?= =?UTF-8?q?=20=E2=80=94=20immutable=20snapshots,=20stale-vector=20pruning,?= =?UTF-8?q?=20write-lock=20sync?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- crates/aft/src/commands/configure.rs | 50 +- crates/aft/src/semantic_index.rs | 720 +++++++++++++++++---------- 2 files changed, 468 insertions(+), 302 deletions(-) diff --git a/crates/aft/src/commands/configure.rs b/crates/aft/src/commands/configure.rs index 645afe8b..14395339 100644 --- a/crates/aft/src/commands/configure.rs +++ b/crates/aft/src/commands/configure.rs @@ -1653,6 +1653,24 @@ pub fn handle_configure(req: &RawRequest, ctx: &AppContext) -> Response { let profile = EmbeddingModelProfile::from_config(&semantic_config); let fingerprint = model.fingerprint(&semantic_config, profile.as_ref())?; let fingerprint_key = fingerprint.as_string(); + + // Create embed closure once and reuse for both incremental refresh + 
// and full rebuild. Must be created before model is moved. + let doc_template = semantic_config.document_prompt_template.clone(); + let mut embed = move |texts: Vec| { + let texts = if let Some(ref tpl) = doc_template { + texts + .iter() + .map(|t| { + crate::semantic_index::apply_document_template(t, Some(tpl)) + }) + .collect() + } else { + texts + }; + model.embed(texts) + }; + let _semantic_cache_lock = (!is_worktree_bridge_for_semantic) .then(|| ()) .and_then(|_| semantic_storage.as_ref()) @@ -1701,23 +1719,6 @@ pub fn handle_configure(req: &RawRequest, ctx: &AppContext) -> Response { } let mut cached = cached; - let doc_template = semantic_config.document_prompt_template.clone(); - let mut embed = move |texts: Vec| { - let texts = if let Some(ref tpl) = doc_template { - texts - .iter() - .map(|t| { - crate::semantic_index::apply_document_template( - t, - Some(tpl), - ) - }) - .collect() - } else { - texts - }; - model.embed(texts) - }; let _ = tx_progress.send(SemanticIndexEvent::Progress { stage: "refreshing_stale_files".to_string(), files: None, @@ -1806,21 +1807,6 @@ pub fn handle_configure(req: &RawRequest, ctx: &AppContext) -> Response { )); } - let doc_template = semantic_config.document_prompt_template.clone(); - let mut embed = move |texts: Vec| { - let texts = if let Some(ref tpl) = doc_template { - texts - .iter() - .map(|t| { - crate::semantic_index::apply_document_template(t, Some(tpl)) - }) - .collect() - } else { - texts - }; - model.embed(texts) - }; - let _ = tx_progress.send(SemanticIndexEvent::Progress { stage: "extracting_symbols".to_string(), files: Some(files.len()), diff --git a/crates/aft/src/semantic_index.rs b/crates/aft/src/semantic_index.rs index f2f7a952..ccc5279a 100644 --- a/crates/aft/src/semantic_index.rs +++ b/crates/aft/src/semantic_index.rs @@ -18,7 +18,7 @@ use std::env; use std::fmt::Display; use std::fs; use std::path::{Path, PathBuf}; -use std::sync::Mutex; +use std::sync::{Arc, Mutex}; use std::time::Duration; use 
std::time::SystemTime; use tree_sitter::Parser; @@ -479,6 +479,7 @@ enum SemanticEmbeddingEngine { }, } +#[allow(dead_code)] pub struct SemanticEmbeddingModel { backend: SemanticBackend, model: String, @@ -1393,27 +1394,180 @@ pub struct SemanticChunk { } /// A stored embedding entry — chunk metadata + vector -#[derive(Debug)] +#[derive(Debug, Clone)] struct EmbeddingEntry { chunk: SemanticChunk, vector: Vec, } -/// The semantic index — stores embeddings for all symbols in a project -#[derive(Debug)] -pub struct SemanticIndex { +/// Lifecycle state of a [`SemanticIndex`]. +/// +/// State machine transitions: +/// Disabled → (no transitions) +/// ColdStart → ScanningFiles → Chunking → Embedding → Ready +/// Ready → Refreshing → Ready (or Degraded on partial failure) +/// Ready → RebuildRequired → ColdStart → ... → Ready +/// Ready → Failed → ColdStart → ... → Ready +/// Degraded → Refreshing → Ready (or Failed) +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +#[allow(dead_code)] +pub(crate) enum SemanticIndexLifecycle { + /// Semantic search is disabled by configuration. + Disabled, + /// Freshly constructed — no embedded data yet. + ColdStart, + /// Currently scanning the file system. + ScanningFiles, + /// Parsing and chunking files. + Chunking, + /// Sending chunks to the embedding backend. + Embedding, + /// Index is complete and ready for search. + Ready, + /// Incremental refresh in progress. + Refreshing, + /// Config or fingerprint changed; a full rebuild is required. + RebuildRequired, + /// Index is usable but some files failed to embed. + Degraded, + /// Build or refresh failed entirely. + Failed, +} + +/// Immutable snapshot of the core semantic index data. +/// +/// Held behind `Arc` inside [`SemanticIndex`]. +/// Clone + mutate + swap is the only mutation path, which keeps the +/// snapshot structurally immutable once published. 
+#[derive(Debug, Clone)] +pub struct SemanticIndexSnapshot { entries: Vec, - /// Track which files are indexed and their mtime for staleness detection - file_mtimes: HashMap, - /// Track indexed file sizes alongside mtimes for staleness detection - file_sizes: HashMap, - file_hashes: HashMap, + /// Track indexed files and their metadata for staleness detection + file_metadata: HashMap, /// Embedding dimension (384 for MiniLM-L6-v2) dimension: usize, - fingerprint: Option, project_root: PathBuf, } +impl SemanticIndexSnapshot { + /// Search the index with a query embedding, returning top-K results sorted by relevance + pub fn search(&self, query_vector: &[f32], top_k: usize) -> Vec { + if self.entries.is_empty() || query_vector.len() != self.dimension { + return Vec::new(); + } + + let mut scored: Vec<(f32, usize)> = self + .entries + .iter() + .enumerate() + .map(|(i, entry)| { + let mut score = cosine_similarity(query_vector, &entry.vector); + if entry.chunk.exported { + score *= 1.1; + } + (score, i) + }) + .collect(); + + // Sort descending by score + scored.sort_by(|a, b| b.0.partial_cmp(&a.0).unwrap_or(std::cmp::Ordering::Equal)); + + scored + .into_iter() + .take(top_k) + .map(|(score, idx)| { + let entry = &self.entries[idx]; + SemanticResult { + file: entry.chunk.file.clone(), + name: entry.chunk.name.clone(), + kind: entry.chunk.kind.clone(), + start_line: entry.chunk.start_line, + end_line: entry.chunk.end_line, + exported: entry.chunk.exported, + snippet: entry.chunk.snippet.clone(), + score, + source: "semantic", + } + }) + .collect() + } + + /// Number of indexed entries + pub fn len(&self) -> usize { + self.entries.len() + } + + pub fn is_empty(&self) -> bool { + self.entries.is_empty() + } + + /// Get the embedding dimension + pub fn dimension(&self) -> usize { + self.dimension + } + + /// Check if a file needs re-indexing based on mtime/size/hash + pub fn is_file_stale(&self, file: &Path) -> bool { + let Some(metadata) = self.file_metadata.get(file) 
else { + return true; + }; + let cached = FileFreshness { + mtime: metadata.mtime, + size: metadata.size, + content_hash: metadata.content_hash, + }; + match cache_freshness::verify_file(file, &cached) { + FreshnessVerdict::HotFresh => false, + FreshnessVerdict::ContentFresh { .. } => false, + FreshnessVerdict::Stale | FreshnessVerdict::Deleted => true, + } + } + + /// Drop entries whose vector has a zero (or NaN) L2 norm and return how many were removed; file freshness is not consulted here. + pub fn prune_stale_vectors(&mut self) -> usize { + let before = self.entries.len(); + self.entries.retain(|entry| { + let norm = entry.vector.iter().map(|v| v * v).sum::().sqrt(); + norm > 0.0 + }); + before - self.entries.len() + } + + /// Mutable entry access for the inner `entries` field (test-only). + #[cfg(test)] + #[allow(private_interfaces)] + pub fn entries_mut_inner(&mut self) -> &mut Vec { + &mut self.entries + } + + /// Mutable file_metadata access — only available in tests. + #[cfg(test)] + #[allow(private_interfaces)] + pub fn file_metadata_mut_inner(&mut self) -> &mut HashMap { + &mut self.file_metadata + } +} + +/// The semantic index — stores embeddings for all symbols in a project. +/// +/// Read-only data lives in [`SemanticIndexSnapshot`], accessible through +/// [`Deref`]. Mutation follows a clone–swap pattern: clone the inner +/// snapshot, apply changes, atomically swap.
+#[derive(Debug)] +pub struct SemanticIndex { + snapshot: Arc, + lifecycle: SemanticIndexLifecycle, + last_error: Option, + fingerprint: Option, +} + +impl std::ops::Deref for SemanticIndex { + type Target = SemanticIndexSnapshot; + fn deref(&self) -> &Self::Target { + &self.snapshot + } +} + #[derive(Debug, Clone, Copy)] struct IndexedFileMetadata { mtime: SystemTime, @@ -1456,13 +1610,15 @@ impl SemanticIndex { pub fn new(project_root: PathBuf, dimension: usize) -> Self { debug_assert!(project_root.is_absolute()); Self { - entries: Vec::new(), - file_mtimes: HashMap::new(), - file_sizes: HashMap::new(), - file_hashes: HashMap::new(), - dimension, + snapshot: Arc::new(SemanticIndexSnapshot { + entries: Vec::new(), + file_metadata: HashMap::new(), + dimension, + project_root, + }), + lifecycle: SemanticIndexLifecycle::ColdStart, + last_error: None, fingerprint: None, - project_root, } } @@ -1480,6 +1636,80 @@ impl SemanticIndex { } } + /// Access the current lifecycle state. + #[allow(dead_code)] + pub(crate) fn lifecycle(&self) -> &SemanticIndexLifecycle { + &self.lifecycle + } + + /// Mark the index with a new lifecycle state. + #[allow(dead_code)] + pub(crate) fn set_lifecycle(&mut self, lifecycle: SemanticIndexLifecycle) { + self.lifecycle = lifecycle; + } + + /// Convenience: extract the error string when lifecycle is `Failed`. + pub fn last_error(&self) -> Option<&str> { + self.last_error.as_deref() + } + + /// Convenience: set lifecycle to `Failed` with a message. + pub fn set_last_error(&mut self, error: String) { + self.last_error = Some(error); + self.lifecycle = SemanticIndexLifecycle::Failed; + } + + /// Access the inner snapshot. + pub fn snapshot(&self) -> &SemanticIndexSnapshot { + &self.snapshot + } + + /// Atomically swap the inner snapshot. The only mutation path. + fn swap_snapshot(&mut self, new_snapshot: SemanticIndexSnapshot) { + self.snapshot = Arc::new(new_snapshot); + } + + /// Remove stale/zero-norm vectors from the current snapshot. 
+ pub fn prune_stale_vectors(&mut self) -> usize { + let mut new_snapshot = (*self.snapshot).clone(); + let count = new_snapshot.prune_stale_vectors(); + self.swap_snapshot(new_snapshot); + count + } + + /// Mutable entry access (read-only via Deref) — only available in tests. + #[cfg(test)] + #[allow(private_interfaces)] + pub fn entries_mut(&mut self) -> &mut Vec { + Arc::make_mut(&mut self.snapshot).entries_mut_inner() + } + + /// Replace the entire snapshot atomically — only available in tests. + #[cfg(test)] + pub fn set_snapshot(&mut self, snapshot: SemanticIndexSnapshot) { + self.snapshot = Arc::new(snapshot); + } + + /// Mutable file_metadata access — only available in tests. + #[cfg(test)] + #[allow(private_interfaces)] + pub fn file_metadata_mut(&mut self) -> &mut HashMap { + Arc::make_mut(&mut self.snapshot).file_metadata_mut_inner() + } + + /// Read-only file_metadata access — only available in tests. + #[cfg(test)] + #[allow(private_interfaces)] + pub fn file_metadata(&self) -> &HashMap { + &self.snapshot.file_metadata + } + + /// Set dimension — only available in tests. 
+ #[cfg(test)] + pub fn set_dimension(&mut self, dim: usize) { + Arc::make_mut(&mut self.snapshot).dimension = dim; + } + fn collect_chunks( project_root: &Path, files: &[PathBuf], @@ -1535,7 +1765,7 @@ impl SemanticIndex { embed_fn: &mut F, max_batch_size: usize, mut progress: Option<&mut P>, - ) -> Result + ) -> Result where F: FnMut(Vec) -> Result>, String>, P: FnMut(usize, usize), @@ -1544,22 +1774,10 @@ impl SemanticIndex { let total_chunks = chunks.len(); if chunks.is_empty() { - return Ok(Self { + return Ok(SemanticIndexSnapshot { entries: Vec::new(), - file_mtimes: file_metadata - .iter() - .map(|(path, metadata)| (path.clone(), metadata.mtime)) - .collect(), - file_sizes: file_metadata - .iter() - .map(|(path, metadata)| (path.clone(), metadata.size)) - .collect(), - file_hashes: file_metadata - .into_iter() - .map(|(path, metadata)| (path, metadata.content_hash)) - .collect(), + file_metadata, dimension: DEFAULT_DIMENSION, - fingerprint: None, project_root: project_root.to_path_buf(), }); } @@ -1609,22 +1827,10 @@ impl SemanticIndex { .map(|e| e.vector.len()) .unwrap_or(DEFAULT_DIMENSION); - Ok(Self { + Ok(SemanticIndexSnapshot { entries, - file_mtimes: file_metadata - .iter() - .map(|(path, metadata)| (path.clone(), metadata.mtime)) - .collect(), - file_sizes: file_metadata - .iter() - .map(|(path, metadata)| (path.clone(), metadata.size)) - .collect(), - file_hashes: file_metadata - .into_iter() - .map(|(path, metadata)| (path, metadata.content_hash)) - .collect(), + file_metadata, dimension, - fingerprint: None, project_root: project_root.to_path_buf(), }) } @@ -1641,14 +1847,20 @@ impl SemanticIndex { F: FnMut(Vec) -> Result>, String>, { let (chunks, file_mtimes) = Self::collect_chunks(project_root, files); - Self::build_from_chunks( + let snapshot = Self::build_from_chunks( project_root, chunks, file_mtimes, embed_fn, max_batch_size, Option::<&mut fn(usize, usize)>::None, - ) + )?; + Ok(Self { + snapshot: Arc::new(snapshot), + lifecycle: 
SemanticIndexLifecycle::Ready, + last_error: None, + fingerprint: None, + }) } /// Build the semantic index and report embedding progress using entry counts. @@ -1666,14 +1878,20 @@ impl SemanticIndex { let (chunks, file_mtimes) = Self::collect_chunks(project_root, files); let total_chunks = chunks.len(); progress(0, total_chunks); - Self::build_from_chunks( + let snapshot = Self::build_from_chunks( project_root, chunks, file_mtimes, embed_fn, max_batch_size, Some(progress), - ) + )?; + Ok(Self { + snapshot: Arc::new(snapshot), + lifecycle: SemanticIndexLifecycle::Ready, + last_error: None, + fingerprint: None, + }) } /// Incrementally refresh entries for changed/new files only, preserving cached @@ -1698,13 +1916,14 @@ impl SemanticIndex { F: FnMut(Vec) -> Result>, String>, P: FnMut(usize, usize), { - self.backfill_missing_file_sizes(); + // Clone the current snapshot to mutate it (clone-swap pattern). + let mut snapshot = (*self.snapshot).clone(); // 1. Bucket files into deleted / changed / added. let current_set: HashSet<&Path> = current_files.iter().map(PathBuf::as_path).collect(); - let total_processed = current_set.len() + self.file_mtimes.len() - - self - .file_mtimes + let total_processed = current_set.len() + snapshot.file_metadata.len() + - snapshot + .file_metadata .keys() .filter(|path| current_set.contains(path.as_path())) .count(); @@ -1713,32 +1932,31 @@ impl SemanticIndex { // walked set. Both cases need their entries dropped. 
let mut deleted: Vec = Vec::new(); let mut changed: Vec = Vec::new(); - let indexed_paths: Vec = self.file_mtimes.keys().cloned().collect(); + let indexed_paths: Vec = snapshot.file_metadata.keys().cloned().collect(); for indexed_path in &indexed_paths { if !current_set.contains(indexed_path.as_path()) { deleted.push(indexed_path.clone()); continue; } - let cached = match ( - self.file_mtimes.get(indexed_path), - self.file_sizes.get(indexed_path), - self.file_hashes.get(indexed_path), - ) { - (Some(mtime), Some(size), Some(hash)) => Some(FileFreshness { - mtime: *mtime, - size: *size, - content_hash: *hash, - }), - _ => None, - }; + let cached = snapshot + .file_metadata + .get(indexed_path) + .map(|meta| FileFreshness { + mtime: meta.mtime, + size: meta.size, + content_hash: meta.content_hash, + }); match cached.map(|freshness| cache_freshness::verify_file(indexed_path, &freshness)) { Some(FreshnessVerdict::HotFresh) => {} Some(FreshnessVerdict::ContentFresh { new_mtime, new_size, }) => { - self.file_mtimes.insert(indexed_path.clone(), new_mtime); - self.file_sizes.insert(indexed_path.clone(), new_size); + // Update mtime/size in metadata — content_hash unchanged. + if let Some(meta) = snapshot.file_metadata.get_mut(indexed_path) { + meta.mtime = new_mtime; + meta.size = new_size; + } } Some(FreshnessVerdict::Stale | FreshnessVerdict::Deleted) | None => { changed.push(indexed_path.clone()); @@ -1749,7 +1967,7 @@ impl SemanticIndex { // Files in walk that were never indexed. let mut added: Vec = Vec::new(); for path in current_files { - if !self.file_mtimes.contains_key(path) { + if !snapshot.file_metadata.contains_key(path) { added.push(path.clone()); } } @@ -1768,12 +1986,11 @@ impl SemanticIndex { // read/parse errors keep the stale-but-valid cache entry. 
if !deleted.is_empty() { let deleted_set: HashSet<&Path> = deleted.iter().map(PathBuf::as_path).collect(); - self.entries + snapshot + .entries .retain(|entry| !deleted_set.contains(entry.chunk.file.as_path())); for path in &deleted { - self.file_mtimes.remove(path); - self.file_sizes.remove(path); - self.file_hashes.remove(path); + snapshot.file_metadata.remove(path); } } @@ -1785,6 +2002,7 @@ impl SemanticIndex { if to_embed.is_empty() { // Only deletions happened. progress(0, 0); + self.swap_snapshot(snapshot); return Ok(RefreshSummary { changed: 0, added: 0, @@ -1799,7 +2017,8 @@ impl SemanticIndex { progress(0, 0); let successful_files: HashSet = fresh_metadata.keys().cloned().collect(); if !successful_files.is_empty() { - self.entries + snapshot + .entries .retain(|entry| !successful_files.contains(&entry.chunk.file)); } let changed_count = changed @@ -1810,11 +2029,8 @@ impl SemanticIndex { .iter() .filter(|path| successful_files.contains(*path)) .count(); - for (file, metadata) in fresh_metadata { - self.file_mtimes.insert(file.clone(), metadata.mtime); - self.file_sizes.insert(file.clone(), metadata.size); - self.file_hashes.insert(file.clone(), metadata.content_hash); - } + snapshot.file_metadata.extend(fresh_metadata); + self.swap_snapshot(snapshot); return Ok(RefreshSummary { changed: changed_count, added: added_count, @@ -1827,10 +2043,10 @@ impl SemanticIndex { let total_chunks = chunks.len(); progress(0, total_chunks); let batch_size = max_batch_size.max(1); - let existing_dimension = if self.entries.is_empty() { + let existing_dimension = if snapshot.entries.is_empty() { None } else { - Some(self.dimension) + Some(snapshot.dimension) }; let mut new_entries: Vec = Vec::with_capacity(chunks.len()); let mut observed_dimension: Option = existing_dimension; @@ -1873,20 +2089,19 @@ impl SemanticIndex { let successful_files: HashSet = fresh_metadata.keys().cloned().collect(); if !successful_files.is_empty() { - self.entries + snapshot + .entries 
.retain(|entry| !successful_files.contains(&entry.chunk.file)); } - self.entries.extend(new_entries); - for (file, metadata) in fresh_metadata { - self.file_mtimes.insert(file.clone(), metadata.mtime); - self.file_sizes.insert(file.clone(), metadata.size); - self.file_hashes.insert(file, metadata.content_hash); - } + snapshot.entries.extend(new_entries); + snapshot.file_metadata.extend(fresh_metadata); if let Some(dim) = observed_dimension { - self.dimension = dim; + snapshot.dimension = dim; } + self.swap_snapshot(snapshot); + Ok(RefreshSummary { changed: changed .iter() @@ -1901,108 +2116,16 @@ impl SemanticIndex { }) } - /// Search the index with a query embedding, returning top-K results sorted by relevance - pub fn search(&self, query_vector: &[f32], top_k: usize) -> Vec { - if self.entries.is_empty() || query_vector.len() != self.dimension { - return Vec::new(); - } - - let mut scored: Vec<(f32, usize)> = self - .entries - .iter() - .enumerate() - .map(|(i, entry)| { - let mut score = cosine_similarity(query_vector, &entry.vector); - if entry.chunk.exported { - score *= 1.1; - } - (score, i) - }) - .collect(); - - // Sort descending by score - scored.sort_by(|a, b| b.0.partial_cmp(&a.0).unwrap_or(std::cmp::Ordering::Equal)); - - scored - .into_iter() - .take(top_k) - // Keep the sort → take → map ordering explicit: removing the old - // `> 0.0` floor cannot evict positive hits because top_k has already - // been selected, but it can surface zero-score noise in the tail. 
- .map(|(score, idx)| { - let entry = &self.entries[idx]; - SemanticResult { - file: entry.chunk.file.clone(), - name: entry.chunk.name.clone(), - kind: entry.chunk.kind.clone(), - start_line: entry.chunk.start_line, - end_line: entry.chunk.end_line, - exported: entry.chunk.exported, - snippet: entry.chunk.snippet.clone(), - score, - source: "semantic", - } - }) - .collect() - } - - /// Number of indexed entries - pub fn len(&self) -> usize { - self.entries.len() - } - - /// Check if a file needs re-indexing based on mtime/size - pub fn is_file_stale(&self, file: &Path) -> bool { - let Some(stored_mtime) = self.file_mtimes.get(file) else { - return true; - }; - let Some(stored_size) = self.file_sizes.get(file) else { - return true; - }; - let Some(stored_hash) = self.file_hashes.get(file) else { - return true; - }; - let cached = FileFreshness { - mtime: *stored_mtime, - size: *stored_size, - content_hash: *stored_hash, - }; - match cache_freshness::verify_file(file, &cached) { - FreshnessVerdict::HotFresh => false, - FreshnessVerdict::ContentFresh { .. 
} => false, - FreshnessVerdict::Stale | FreshnessVerdict::Deleted => true, - } - } - - fn backfill_missing_file_sizes(&mut self) { - for path in self.file_mtimes.keys() { - if self.file_sizes.contains_key(path) { - continue; - } - if let Ok(metadata) = fs::metadata(path) { - self.file_sizes.insert(path.clone(), metadata.len()); - if let Ok(Some(hash)) = cache_freshness::hash_file_if_small(path, metadata.len()) { - self.file_hashes.insert(path.clone(), hash); - } - } - } - } - - /// Remove entries for a specific file + /// Remove entries for a specific file (clone–swap pattern) pub fn remove_file(&mut self, file: &Path) { self.invalidate_file(file); } pub fn invalidate_file(&mut self, file: &Path) { - self.entries.retain(|e| e.chunk.file != file); - self.file_mtimes.remove(file); - self.file_sizes.remove(file); - self.file_hashes.remove(file); - } - - /// Get the embedding dimension - pub fn dimension(&self) -> usize { - self.dimension + let mut snapshot = (*self.snapshot).clone(); + snapshot.entries.retain(|e| e.chunk.file != file); + snapshot.file_metadata.remove(file); + self.snapshot = Arc::new(snapshot); } pub fn fingerprint(&self) -> Option<&SemanticIndexFingerprint> { @@ -2155,14 +2278,6 @@ impl SemanticIndex { Some(encoded.into_bytes()) } }); - let file_mtimes: Vec<_> = self - .file_mtimes - .iter() - .filter_map(|(path, mtime)| { - cache_relative_path(&self.project_root, path) - .map(|relative| (relative, path, mtime)) - }) - .collect(); let entries: Vec<_> = self .entries .iter() @@ -2192,26 +2307,29 @@ impl SemanticIndex { buf.extend_from_slice(&(fp_bytes_ref.len() as u32).to_le_bytes()); buf.extend_from_slice(fp_bytes_ref); - // File mtime table: count(4) + entries - // V3 layout per entry: path_len(4) + path + secs(8) + subsec_nanos(4) - buf.extend_from_slice(&(file_mtimes.len() as u32).to_le_bytes()); - for (relative, path, mtime) in &file_mtimes { + // File metadata table: count(4) + entries + // V6 layout per entry: path_len(4) + path + secs(8) + 
subsec_nanos(4) + size(u64) + blake3(32). + // Preserves full APFS/ext4/NTFS precision and catches mtime ties. + let file_metadata_entries: Vec<_> = self + .file_metadata + .iter() + .filter_map(|(path, meta)| { + cache_relative_path(&self.project_root, path).map(|relative| (relative, meta)) + }) + .collect(); + buf.extend_from_slice(&(file_metadata_entries.len() as u32).to_le_bytes()); + for (relative, meta) in &file_metadata_entries { let path_bytes = relative.to_string_lossy().as_bytes().to_vec(); buf.extend_from_slice(&(path_bytes.len() as u32).to_le_bytes()); buf.extend_from_slice(&path_bytes); - let duration = mtime + let duration = meta + .mtime .duration_since(SystemTime::UNIX_EPOCH) .unwrap_or_default(); buf.extend_from_slice(&duration.as_secs().to_le_bytes()); buf.extend_from_slice(&duration.subsec_nanos().to_le_bytes()); - let size = self.file_sizes.get(*path).copied().unwrap_or_default(); - buf.extend_from_slice(&size.to_le_bytes()); - let hash = self - .file_hashes - .get(*path) - .copied() - .unwrap_or_else(cache_freshness::zero_hash); - buf.extend_from_slice(hash.as_bytes()); + buf.extend_from_slice(&meta.size.to_le_bytes()); + buf.extend_from_slice(meta.content_hash.as_bytes()); } // Entries: each is metadata + vector @@ -2338,9 +2456,8 @@ impl SemanticIndex { return Err("semantic index vectors exceed available data".to_string()); } - let mut file_mtimes = HashMap::with_capacity(mtime_count); - let mut file_sizes = HashMap::with_capacity(mtime_count); - let mut file_hashes = HashMap::with_capacity(mtime_count); + let mut file_metadata: HashMap = + HashMap::with_capacity(mtime_count); for _ in 0..mtime_count { let path = read_string(data, &mut pos)?; let secs = read_u64(data, &mut pos)?; @@ -2402,9 +2519,14 @@ impl SemanticIndex { } else { PathBuf::from(path) }; - file_mtimes.insert(path.clone(), mtime); - file_sizes.insert(path.clone(), size); - file_hashes.insert(path, content_hash); + file_metadata.insert( + path, + IndexedFileMetadata { + mtime, 
+ size, + content_hash, + }, + ); } // Entries @@ -2474,7 +2596,7 @@ impl SemanticIndex { )); } for entry in &entries { - if !file_mtimes.contains_key(&entry.chunk.file) { + if !file_metadata.contains_key(&entry.chunk.file) { return Err(format!( "semantic cache metadata missing for entry file {}", entry.chunk.file.display() @@ -2482,14 +2604,17 @@ impl SemanticIndex { } } - Ok(Self { + let snapshot = SemanticIndexSnapshot { entries, - file_mtimes, - file_sizes, - file_hashes, + file_metadata, dimension, - fingerprint, project_root: current_canonical_root.to_path_buf(), + }; + Ok(Self { + snapshot: Arc::new(snapshot), + lifecycle: SemanticIndexLifecycle::Ready, + last_error: None, + fingerprint, }) } } @@ -2997,11 +3122,15 @@ mod tests { } fn set_file_metadata(index: &mut SemanticIndex, file: &Path, mtime: SystemTime, size: u64) { - index.file_mtimes.insert(file.to_path_buf(), mtime); - index.file_sizes.insert(file.to_path_buf(), size); - index - .file_hashes - .insert(file.to_path_buf(), cache_freshness::zero_hash()); + let hash = cache_freshness::zero_hash(); + index.file_metadata_mut().insert( + file.to_path_buf(), + IndexedFileMetadata { + mtime, + size, + content_hash: hash, + }, + ); } #[test] @@ -3010,14 +3139,16 @@ mod tests { let project = fs::canonicalize(dir.path()).expect("canonical project"); let outside = project.join("..").join("outside.rs"); let mut index = SemanticIndex::new(project.clone(), 3); - index - .file_mtimes - .insert(outside.clone(), SystemTime::UNIX_EPOCH); - index.file_sizes.insert(outside.clone(), 1); - index - .file_hashes - .insert(outside.clone(), cache_freshness::zero_hash()); - index.entries.push(EmbeddingEntry { + let hash = cache_freshness::zero_hash(); + index.file_metadata_mut().insert( + outside.clone(), + IndexedFileMetadata { + mtime: SystemTime::UNIX_EPOCH, + size: 1, + content_hash: hash, + }, + ); + index.entries_mut().push(EmbeddingEntry { chunk: SemanticChunk { file: outside, name: "outside".to_string(), @@ -3034,7 
+3165,7 @@ mod tests { let bytes = index.to_bytes(); let loaded = SemanticIndex::from_bytes(&bytes, &project).expect("load serialized index"); assert_eq!(loaded.entries.len(), 0); - assert!(loaded.file_mtimes.is_empty()); + assert!(loaded.file_metadata().is_empty()); } #[test] @@ -3063,7 +3194,7 @@ mod tests { let project_root = test_project_root(); let file = project_root.join("src/main.rs"); let mut index = SemanticIndex::new(project_root.clone(), DEFAULT_DIMENSION); - index.entries.push(EmbeddingEntry { + index.entries_mut().push(EmbeddingEntry { chunk: SemanticChunk { file: file.clone(), name: "handle_request".to_string(), @@ -3076,11 +3207,16 @@ mod tests { }, vector: vec![0.1, 0.2, 0.3, 0.4], }); - index.dimension = 4; - index - .file_mtimes - .insert(file.clone(), SystemTime::UNIX_EPOCH); - index.file_sizes.insert(file, 0); + index.set_dimension(4); + let hash = cache_freshness::zero_hash(); + index.file_metadata_mut().insert( + file.clone(), + IndexedFileMetadata { + mtime: SystemTime::UNIX_EPOCH, + size: 0, + content_hash: hash, + }, + ); index.set_fingerprint(SemanticIndexFingerprint { backend: "fastembed".to_string(), model: "all-MiniLM-L6-v2".to_string(), @@ -3129,13 +3265,13 @@ mod tests { #[test] fn test_search_top_k() { let mut index = SemanticIndex::new(test_project_root(), DEFAULT_DIMENSION); - index.dimension = 3; + index.set_dimension(3); // Add entries with known vectors for (i, name) in ["auth", "database", "handler"].iter().enumerate() { let mut vec = vec![0.0f32; 3]; vec[i] = 1.0; // orthogonal vectors - index.entries.push(EmbeddingEntry { + index.entries_mut().push(EmbeddingEntry { chunk: SemanticChunk { file: PathBuf::from("/src/lib.rs"), name: name.to_string(), @@ -3253,7 +3389,7 @@ mod tests { fn invalidate_file_removes_entries_and_mtime() { let target = PathBuf::from("/src/main.rs"); let mut index = SemanticIndex::new(test_project_root(), DEFAULT_DIMENSION); - index.entries.push(EmbeddingEntry { + index.entries_mut().push(EmbeddingEntry 
{ chunk: SemanticChunk { file: target.clone(), name: "main".to_string(), @@ -3266,16 +3402,20 @@ mod tests { }, vector: vec![1.0; DEFAULT_DIMENSION], }); - index - .file_mtimes - .insert(target.clone(), SystemTime::UNIX_EPOCH); - index.file_sizes.insert(target.clone(), 0); + let hash = cache_freshness::zero_hash(); + index.file_metadata_mut().insert( + target.clone(), + IndexedFileMetadata { + mtime: SystemTime::UNIX_EPOCH, + size: 0, + content_hash: hash, + }, + ); index.invalidate_file(&target); assert!(index.entries.is_empty()); - assert!(!index.file_mtimes.contains_key(&target)); - assert!(!index.file_sizes.contains_key(&target)); + assert!(!index.file_metadata().contains_key(&target)); } #[test] @@ -3288,8 +3428,9 @@ mod tests { let mut index = build_test_index(project_root, std::slice::from_ref(&file)); let original_entry_count = index.entries.len(); - let original_mtime = *index.file_mtimes.get(&file).unwrap(); - let original_size = *index.file_sizes.get(&file).unwrap(); + let meta = index.file_metadata().get(&file).unwrap(); + let original_mtime = meta.mtime; + let original_size = meta.size; let stale_mtime = SystemTime::UNIX_EPOCH; set_file_metadata(&mut index, &file, stale_mtime, original_size + 1); @@ -3315,9 +3456,18 @@ mod tests { .entries .iter() .any(|entry| entry.chunk.name == "kept_symbol")); - assert_eq!(index.file_mtimes.get(&file), Some(&stale_mtime)); - assert_ne!(index.file_mtimes.get(&file), Some(&original_mtime)); - assert_eq!(index.file_sizes.get(&file), Some(&(original_size + 1))); + assert_eq!( + index.file_metadata().get(&file).map(|m| m.mtime), + Some(stale_mtime) + ); + assert_ne!( + index.file_metadata().get(&file).map(|m| m.mtime), + Some(original_mtime) + ); + assert_eq!( + index.file_metadata().get(&file).map(|m| m.size), + Some(original_size + 1) + ); } #[test] @@ -3343,8 +3493,7 @@ mod tests { assert_eq!(summary.added, 0); assert_eq!(summary.changed, 0); assert_eq!(summary.deleted, 0); - 
assert!(!index.file_mtimes.contains_key(&missing)); - assert!(!index.file_sizes.contains_key(&missing)); + assert!(!index.file_metadata().contains_key(&missing)); assert!(index.entries.is_empty()); } @@ -3375,7 +3524,7 @@ mod tests { assert_eq!(summary.changed, 0); assert_eq!(summary.deleted, 0); assert_eq!(summary.total_processed, 2); - assert!(index.file_mtimes.contains_key(&added)); + assert!(index.file_metadata().contains_key(&added)); assert!(index.entries.iter().any(|entry| entry.chunk.file == added)); } @@ -3400,7 +3549,7 @@ mod tests { assert_eq!(summary.changed, 0); assert_eq!(summary.added, 0); assert_eq!(summary.total_processed, 1); - assert!(!index.file_mtimes.contains_key(&deleted)); + assert!(!index.file_metadata().contains_key(&deleted)); assert!(index.entries.is_empty()); } @@ -3506,6 +3655,13 @@ mod tests { api_key_env: None, timeout_ms: 5_000, max_batch_size: 64, + dimensions: None, + output_encoding: None, + input_mode: None, + storage_strategy: None, + distance_metric: None, + query_prompt_template: None, + document_prompt_template: None, }; let mut model = SemanticEmbeddingModel::from_config(&config).unwrap(); @@ -3579,6 +3735,13 @@ mod tests { api_key_env: None, timeout_ms: 5_000, max_batch_size: 64, + dimensions: None, + output_encoding: None, + input_mode: None, + storage_strategy: None, + distance_metric: None, + query_prompt_template: None, + document_prompt_template: None, }; let mut model = SemanticEmbeddingModel::from_config(&config).unwrap(); let _ = model.embed(vec!["probe".to_string()]).unwrap(); @@ -3624,6 +3787,13 @@ mod tests { api_key_env: None, timeout_ms: 5_000, max_batch_size: 64, + dimensions: None, + output_encoding: None, + input_mode: None, + storage_strategy: None, + distance_metric: None, + query_prompt_template: None, + document_prompt_template: None, }; let mut model = SemanticEmbeddingModel::from_config(&config).unwrap(); @@ -3643,7 +3813,7 @@ mod tests { let project_root = test_project_root(); let file = 
project_root.join("src/main.rs"); let mut index = SemanticIndex::new(project_root.clone(), DEFAULT_DIMENSION); - index.entries.push(EmbeddingEntry { + index.entries_mut().push(EmbeddingEntry { chunk: SemanticChunk { file: file.clone(), name: "handle_request".to_string(), @@ -3656,11 +3826,16 @@ mod tests { }, vector: vec![0.1, 0.2, 0.3], }); - index.dimension = 3; - index - .file_mtimes - .insert(file.clone(), SystemTime::UNIX_EPOCH); - index.file_sizes.insert(file, 0); + index.set_dimension(3); + let hash = cache_freshness::zero_hash(); + index.file_metadata_mut().insert( + file.clone(), + IndexedFileMetadata { + mtime: SystemTime::UNIX_EPOCH, + size: 0, + content_hash: hash, + }, + ); index.set_fingerprint(SemanticIndexFingerprint { backend: "openai_compatible".to_string(), model: "test-embedding".to_string(), @@ -3716,7 +3891,7 @@ mod tests { fs::create_dir_all(&dir).unwrap(); let mut index = SemanticIndex::new(test_project_root(), DEFAULT_DIMENSION); - index.entries.push(EmbeddingEntry { + index.entries_mut().push(EmbeddingEntry { chunk: SemanticChunk { file: PathBuf::from("/src/main.rs"), name: "handle_request".to_string(), @@ -3729,11 +3904,16 @@ mod tests { }, vector: vec![0.1, 0.2, 0.3], }); - index.dimension = 3; - index - .file_mtimes - .insert(PathBuf::from("/src/main.rs"), SystemTime::UNIX_EPOCH); - index.file_sizes.insert(PathBuf::from("/src/main.rs"), 0); + index.set_dimension(3); + let hash = cache_freshness::zero_hash(); + index.file_metadata_mut().insert( + PathBuf::from("/src/main.rs"), + IndexedFileMetadata { + mtime: SystemTime::UNIX_EPOCH, + size: 0, + content_hash: hash, + }, + ); let fingerprint = SemanticIndexFingerprint { backend: "fastembed".to_string(), model: "test".to_string(), From 54377d94ed04fb2017c08f4d782c1192c5cabc8a Mon Sep 17 00:00:00 2001 From: Zireael <3856578+Zireael@users.noreply.github.com> Date: Mon, 25 May 2026 13:46:49 +0200 Subject: [PATCH 06/38] chore: add testuser non-root runner to docker-rust.ps1, update benchmark 
data --- .../compression-tokens/data/spike-output.json | 100 +++++++++--------- scripts/docker-rust.ps1 | 28 +++-- 2 files changed, 69 insertions(+), 59 deletions(-) diff --git a/benchmarks/compression-tokens/data/spike-output.json b/benchmarks/compression-tokens/data/spike-output.json index 89a973de..94838d3d 100644 --- a/benchmarks/compression-tokens/data/spike-output.json +++ b/benchmarks/compression-tokens/data/spike-output.json @@ -4,9 +4,9 @@ "command": "git status --short --branch", "category": "git", "tier": "rust modules", - "original_bytes": 214, + "original_bytes": 220, "compressed_bytes": 213, - "original_text": "## feature/compress-metrics...origin/feature/compress-metrics [ahead 3]\n M crates/aft/src/compress/mod.rs\n M crates/aft/src/commands/bash.rs\n M Cargo.lock\n?? benchmarks/compression-tokens/\n?? tmp/spike-output.json\n", + "original_text": "## feature/compress-metrics...origin/feature/compress-metrics [ahead 3]\r\n M crates/aft/src/compress/mod.rs\r\n M crates/aft/src/commands/bash.rs\r\n M Cargo.lock\r\n?? benchmarks/compression-tokens/\r\n?? tmp/spike-output.json\r\n", "compressed_text": "## feature/compress-metrics...origin/feature/compress-metrics [ahead 3]\n M crates/aft/src/compress/mod.rs\n M crates/aft/src/commands/bash.rs\n M Cargo.lock\n?? benchmarks/compression-tokens/\n?? 
tmp/spike-output.json" }, { @@ -14,9 +14,9 @@ "command": "git log --oneline --decorate -25", "category": "git", "tier": "rust modules", - "original_bytes": 560, + "original_bytes": 570, "compressed_bytes": 559, - "original_text": "e4e8f7e (HEAD -> feature/compress-metrics, origin/main) chore(release): v0.26.4\n9c4aa18 feat(compress): add builtin filters for kubectl and gh\n651bb01 fix(bash): preserve completion frames for background tasks\n37f9a72 test(compress): cover tsc pretty output\n0b51408 feat(compress): add biome compressor\nb11c850 docs: update v0.27 sqlite storage plan\n8a871dd refactor(config): normalize storage dir lookup\n4a1d7b8 feat(lsp): add pull diagnostics fallback\nf70c533 fix(imports): handle type-only namespace imports\n2c55219 perf(search): cap embedding batch memory\n", + "original_text": "e4e8f7e (HEAD -> feature/compress-metrics, origin/main) chore(release): v0.26.4\r\n9c4aa18 feat(compress): add builtin filters for kubectl and gh\r\n651bb01 fix(bash): preserve completion frames for background tasks\r\n37f9a72 test(compress): cover tsc pretty output\r\n0b51408 feat(compress): add biome compressor\r\nb11c850 docs: update v0.27 sqlite storage plan\r\n8a871dd refactor(config): normalize storage dir lookup\r\n4a1d7b8 feat(lsp): add pull diagnostics fallback\r\nf70c533 fix(imports): handle type-only namespace imports\r\n2c55219 perf(search): cap embedding batch memory\r\n", "compressed_text": "e4e8f7e (HEAD -> feature/compress-metrics, origin/main) chore(release): v0.26.4\n9c4aa18 feat(compress): add builtin filters for kubectl and gh\n651bb01 fix(bash): preserve completion frames for background tasks\n37f9a72 test(compress): cover tsc pretty output\n0b51408 feat(compress): add biome compressor\nb11c850 docs: update v0.27 sqlite storage plan\n8a871dd refactor(config): normalize storage dir lookup\n4a1d7b8 feat(lsp): add pull diagnostics fallback\nf70c533 fix(imports): handle type-only namespace imports\n2c55219 perf(search): cap embedding batch 
memory" }, { @@ -24,9 +24,9 @@ "command": "git diff -- crates/aft/src/compress/mod.rs", "category": "git", "tier": "rust modules", - "original_bytes": 997, + "original_bytes": 1019, "compressed_bytes": 996, - "original_text": "diff --git a/crates/aft/src/compress/mod.rs b/crates/aft/src/compress/mod.rs\nindex e2a94b1..8cbe201 100644\n--- a/crates/aft/src/compress/mod.rs\n+++ b/crates/aft/src/compress/mod.rs\n@@ -84,6 +84,17 @@ pub fn compress_with_registry(command: &str, output: &str, registry: &FilterRegi\n compress_with_registry(command, &output, &guard)\n }\n+\n+#[cfg(test)]\n+pub fn compress_for_spike(command: &str, output: &str) -> String {\n+ let registry = toml_filter::build_registry(builtin_filters::ALL, None, None);\n+ compress_with_registry(command, output, ®istry)\n+}\n+\n /// Thread-safe dispatch that does not need `AppContext`. Caller is responsible\n /// for the `experimental_bash_compress` gate (the registry has no opinion).\n@@ -99,7 +110,7 @@ pub fn compress_with_registry(command: &str, output: &str, registry: &FilterRegi\n- let compressors: [&dyn Compressor; 9] = [\n+ let compressors: [&dyn Compressor; 10] = [\n &GitCompressor,\n &CargoCompressor,\n &TscCompressor,\n", + "original_text": "diff --git a/crates/aft/src/compress/mod.rs b/crates/aft/src/compress/mod.rs\r\nindex e2a94b1..8cbe201 100644\r\n--- a/crates/aft/src/compress/mod.rs\r\n+++ b/crates/aft/src/compress/mod.rs\r\n@@ -84,6 +84,17 @@ pub fn compress_with_registry(command: &str, output: &str, registry: &FilterRegi\r\n compress_with_registry(command, &output, &guard)\r\n }\r\n+\r\n+#[cfg(test)]\r\n+pub fn compress_for_spike(command: &str, output: &str) -> String {\r\n+ let registry = toml_filter::build_registry(builtin_filters::ALL, None, None);\r\n+ compress_with_registry(command, output, ®istry)\r\n+}\r\n+\r\n /// Thread-safe dispatch that does not need `AppContext`. 
Caller is responsible\r\n /// for the `experimental_bash_compress` gate (the registry has no opinion).\r\n@@ -99,7 +110,7 @@ pub fn compress_with_registry(command: &str, output: &str, registry: &FilterRegi\r\n- let compressors: [&dyn Compressor; 9] = [\r\n+ let compressors: [&dyn Compressor; 10] = [\r\n &GitCompressor,\r\n &CargoCompressor,\r\n &TscCompressor,\r\n", "compressed_text": "diff --git a/crates/aft/src/compress/mod.rs b/crates/aft/src/compress/mod.rs\nindex e2a94b1..8cbe201 100644\n--- a/crates/aft/src/compress/mod.rs\n+++ b/crates/aft/src/compress/mod.rs\n@@ -84,6 +84,17 @@ pub fn compress_with_registry(command: &str, output: &str, registry: &FilterRegi\n compress_with_registry(command, &output, &guard)\n }\n+\n+#[cfg(test)]\n+pub fn compress_for_spike(command: &str, output: &str) -> String {\n+ let registry = toml_filter::build_registry(builtin_filters::ALL, None, None);\n+ compress_with_registry(command, output, ®istry)\n+}\n+\n /// Thread-safe dispatch that does not need `AppContext`. 
Caller is responsible\n /// for the `experimental_bash_compress` gate (the registry has no opinion).\n@@ -99,7 +110,7 @@ pub fn compress_with_registry(command: &str, output: &str, registry: &FilterRegi\n- let compressors: [&dyn Compressor; 9] = [\n+ let compressors: [&dyn Compressor; 10] = [\n &GitCompressor,\n &CargoCompressor,\n &TscCompressor," }, { @@ -34,9 +34,9 @@ "command": "git fetch origin main", "category": "git", "tier": "rust modules", - "original_bytes": 495, + "original_bytes": 505, "compressed_bytes": 122, - "original_text": "remote: Enumerating objects: 42, done.\nremote: Counting objects: 100% (42/42), done.\nremote: Compressing objects: 100% (18/18), done.\nremote: Total 24 (delta 14), reused 17 (delta 6), pack-reused 0\nUnpacking objects: 100% (24/24), 6.81 KiB | 697.00 KiB/s, done.\nFrom github.com:cortexkit/aft\n * branch main -> FETCH_HEAD\n e4e8f7e..4af3b19 main -> origin/main\nAuto packing the repository in background for optimum performance.\nSee \"git help gc\" for manual housekeeping.\n", + "original_text": "remote: Enumerating objects: 42, done.\r\nremote: Counting objects: 100% (42/42), done.\r\nremote: Compressing objects: 100% (18/18), done.\r\nremote: Total 24 (delta 14), reused 17 (delta 6), pack-reused 0\r\nUnpacking objects: 100% (24/24), 6.81 KiB | 697.00 KiB/s, done.\r\nFrom github.com:cortexkit/aft\r\n * branch main -> FETCH_HEAD\r\n e4e8f7e..4af3b19 main -> origin/main\r\nAuto packing the repository in background for optimum performance.\r\nSee \"git help gc\" for manual housekeeping.\r\n", "compressed_text": "From github.com:cortexkit/aft\n * branch main -> FETCH_HEAD\n e4e8f7e..4af3b19 main -> origin/main" }, { @@ -44,9 +44,9 @@ "command": "git push origin feature/compress-metrics", "category": "git", "tier": "rust modules", - "original_bytes": 623, + "original_bytes": 636, "compressed_bytes": 105, - "original_text": "Enumerating objects: 18, done.\nCounting objects: 100% (18/18), done.\nDelta compression using up to 10 
threads\nCompressing objects: 100% (12/12), done.\nWriting objects: 100% (12/12), 3.21 KiB | 3.21 MiB/s, done.\nTotal 12 (delta 8), reused 0 (delta 0), pack-reused 0\nremote: Resolving deltas: 100% (8/8), completed with 5 local objects.\nremote: \nremote: Create a pull request for 'feature/compress-metrics' on GitHub by visiting:\nremote: https://github.com/cortexkit/aft/pull/new/feature/compress-metrics\nremote: \nTo github.com:cortexkit/aft.git\n * [new branch] feature/compress-metrics -> feature/compress-metrics\n", + "original_text": "Enumerating objects: 18, done.\r\nCounting objects: 100% (18/18), done.\r\nDelta compression using up to 10 threads\r\nCompressing objects: 100% (12/12), done.\r\nWriting objects: 100% (12/12), 3.21 KiB | 3.21 MiB/s, done.\r\nTotal 12 (delta 8), reused 0 (delta 0), pack-reused 0\r\nremote: Resolving deltas: 100% (8/8), completed with 5 local objects.\r\nremote: \r\nremote: Create a pull request for 'feature/compress-metrics' on GitHub by visiting:\r\nremote: https://github.com/cortexkit/aft/pull/new/feature/compress-metrics\r\nremote: \r\nTo github.com:cortexkit/aft.git\r\n * [new branch] feature/compress-metrics -> feature/compress-metrics\r\n", "compressed_text": "To github.com:cortexkit/aft.git\n * [new branch] feature/compress-metrics -> feature/compress-metrics" }, { @@ -54,9 +54,9 @@ "command": "cargo test", "category": "build-test", "tier": "rust modules", - "original_bytes": 1335, + "original_bytes": 1365, "compressed_bytes": 259, - "original_text": " Compiling agent-file-tools v0.26.4 (/Users/ufukaltinok/Work/OSS/opencode-aft/crates/aft)\nwarning: function `normalize_command` is never used\n --> crates/aft/src/compress/git.rs:218:4\n |\n218 | fn normalize_command(command: &str) -> String {\n | ^^^^^^^^^^^^^^^^^\n |\n = note: `#[warn(dead_code)]` on by default\nwarning: `agent-file-tools` (lib test) generated 1 warning\n Finished `test` profile [unoptimized + debuginfo] target(s) in 7.42s\n Running unittests src/lib.rs 
(target/debug/deps/aft-3e63e65b6f8e5a12)\n\nrunning 312 tests\ntest compress::git::tests::status_short_preserves_branch ... ok\ntest compress::cargo::tests::test_summary_keeps_failures ... ok\ntest commands::bash::tests::try_spawn_with_login_shell ... ok\ntest lsp::tests::pull_diagnostics_prefers_317 ... ok\ntest imports::tests::organize_groups_external_before_internal ... ok\ntest search_index::tests::incremental_cache_reuses_embeddings ... ok\n\ntest result: ok. 312 passed; 0 failed; 0 ignored; 0 measured; 0 filtered out; finished in 5.86s\n\n Running tests/compress_filters.rs (target/debug/deps/compress_filters-ea287c4a1a64c0e8)\n\nrunning 18 tests\ntest builtin_filters_are_parseable ... ok\ntest terraform_plan_filter_caps_middle ... ok\ntest kubectl_get_pods_strips_age_noise ... ok\n\ntest result: ok. 18 passed; 0 failed; finished in 0.09s\n", + "original_text": " Compiling agent-file-tools v0.26.4 (/Users/ufukaltinok/Work/OSS/opencode-aft/crates/aft)\r\nwarning: function `normalize_command` is never used\r\n --> crates/aft/src/compress/git.rs:218:4\r\n |\r\n218 | fn normalize_command(command: &str) -> String {\r\n | ^^^^^^^^^^^^^^^^^\r\n |\r\n = note: `#[warn(dead_code)]` on by default\r\nwarning: `agent-file-tools` (lib test) generated 1 warning\r\n Finished `test` profile [unoptimized + debuginfo] target(s) in 7.42s\r\n Running unittests src/lib.rs (target/debug/deps/aft-3e63e65b6f8e5a12)\r\n\r\nrunning 312 tests\r\ntest compress::git::tests::status_short_preserves_branch ... ok\r\ntest compress::cargo::tests::test_summary_keeps_failures ... ok\r\ntest commands::bash::tests::try_spawn_with_login_shell ... ok\r\ntest lsp::tests::pull_diagnostics_prefers_317 ... ok\r\ntest imports::tests::organize_groups_external_before_internal ... ok\r\ntest search_index::tests::incremental_cache_reuses_embeddings ... ok\r\n\r\ntest result: ok. 
312 passed; 0 failed; 0 ignored; 0 measured; 0 filtered out; finished in 5.86s\r\n\r\n Running tests/compress_filters.rs (target/debug/deps/compress_filters-ea287c4a1a64c0e8)\r\n\r\nrunning 18 tests\r\ntest builtin_filters_are_parseable ... ok\r\ntest terraform_plan_filter_caps_middle ... ok\r\ntest kubectl_get_pods_strips_age_noise ... ok\r\n\r\ntest result: ok. 18 passed; 0 failed; finished in 0.09s\r\n", "compressed_text": " Finished `test` profile [unoptimized + debuginfo] target(s) in 7.42s\nrunning 312 tests\ntest result: ok. 312 passed; 0 failed; 0 ignored; 0 measured; 0 filtered out; finished in 5.86s\nrunning 18 tests\ntest result: ok. 18 passed; 0 failed; finished in 0.09s" }, { @@ -64,19 +64,19 @@ "command": "cargo build --release", "category": "build-test", "tier": "rust modules", - "original_bytes": 501, - "compressed_bytes": 500, - "original_text": " Compiling libc v0.2.177\n Compiling proc-macro2 v1.0.101\n Compiling unicode-ident v1.0.19\n Compiling quote v1.0.41\n Compiling serde_core v1.0.228\n Compiling memchr v2.7.6\n Compiling aho-corasick v1.1.3\n Compiling regex-syntax v0.8.8\n Compiling serde v1.0.228\n Compiling regex-automata v0.4.13\n Compiling tree-sitter v0.26.2\n Compiling agent-file-tools v0.26.4 (/Users/ufukaltinok/Work/OSS/opencode-aft/crates/aft)\n Finished `release` profile [optimized] target(s) in 38.74s\n", - "compressed_text": " Compiling libc v0.2.177\n Compiling proc-macro2 v1.0.101\n Compiling unicode-ident v1.0.19\n Compiling quote v1.0.41\n Compiling serde_core v1.0.228\n Compiling memchr v2.7.6\n Compiling aho-corasick v1.1.3\n Compiling regex-syntax v0.8.8\n Compiling serde v1.0.228\n Compiling regex-automata v0.4.13\n Compiling tree-sitter v0.26.2\n Compiling agent-file-tools v0.26.4 (/Users/ufukaltinok/Work/OSS/opencode-aft/crates/aft)\n Finished `release` profile [optimized] target(s) in 38.74s" + "original_bytes": 514, + "compressed_bytes": 512, + "original_text": " Compiling libc v0.2.177\r\n Compiling proc-macro2 
v1.0.101\r\n Compiling unicode-ident v1.0.19\r\n Compiling quote v1.0.41\r\n Compiling serde_core v1.0.228\r\n Compiling memchr v2.7.6\r\n Compiling aho-corasick v1.1.3\r\n Compiling regex-syntax v0.8.8\r\n Compiling serde v1.0.228\r\n Compiling regex-automata v0.4.13\r\n Compiling tree-sitter v0.26.2\r\n Compiling agent-file-tools v0.26.4 (/Users/ufukaltinok/Work/OSS/opencode-aft/crates/aft)\r\n Finished `release` profile [optimized] target(s) in 38.74s\r\n", + "compressed_text": " Compiling libc v0.2.177\r\n Compiling proc-macro2 v1.0.101\r\n Compiling unicode-ident v1.0.19\r\n Compiling quote v1.0.41\r\n Compiling serde_core v1.0.228\r\n Compiling memchr v2.7.6\r\n Compiling aho-corasick v1.1.3\r\n Compiling regex-syntax v0.8.8\r\n Compiling serde v1.0.228\r\n Compiling regex-automata v0.4.13\r\n Compiling tree-sitter v0.26.2\r\n Compiling agent-file-tools v0.26.4 (/Users/ufukaltinok/Work/OSS/opencode-aft/crates/aft)\r\n Finished `release` profile [optimized] target(s) in 38.74s" }, { "file": "build-test/npm-install.txt", "command": "npm install", "category": "build-test", "tier": "rust modules", - "original_bytes": 639, + "original_bytes": 658, "compressed_bytes": 312, - "original_text": "npm WARN EBADENGINE Unsupported engine {\nnpm WARN EBADENGINE package: 'vite@7.2.2',\nnpm WARN EBADENGINE required: { node: '^20.19.0 || >=22.12.0' },\nnpm WARN EBADENGINE current: { node: 'v20.11.1', npm: '10.2.4' }\nnpm WARN EBADENGINE }\nnpm WARN deprecated inflight@1.0.6: This module is not supported, and leaks memory.\nnpm WARN deprecated glob@7.2.3: Glob versions prior to v9 are no longer supported\n\nadded 428 packages, and audited 429 packages in 12s\n\n82 packages are looking for funding\n run `npm fund` for details\n\n3 moderate severity vulnerabilities\n\nTo address all issues, run:\n npm audit fix\n\nRun `npm audit` for details.\n", + "original_text": "npm WARN EBADENGINE Unsupported engine {\r\nnpm WARN EBADENGINE package: 'vite@7.2.2',\r\nnpm WARN EBADENGINE 
required: { node: '^20.19.0 || >=22.12.0' },\r\nnpm WARN EBADENGINE current: { node: 'v20.11.1', npm: '10.2.4' }\r\nnpm WARN EBADENGINE }\r\nnpm WARN deprecated inflight@1.0.6: This module is not supported, and leaks memory.\r\nnpm WARN deprecated glob@7.2.3: Glob versions prior to v9 are no longer supported\r\n\r\nadded 428 packages, and audited 429 packages in 12s\r\n\r\n82 packages are looking for funding\r\n run `npm fund` for details\r\n\r\n3 moderate severity vulnerabilities\r\n\r\nTo address all issues, run:\r\n npm audit fix\r\n\r\nRun `npm audit` for details.\r\n", "compressed_text": "npm WARN deprecated inflight@1.0.6: This module is not supported, and leaks memory.\nnpm WARN deprecated glob@7.2.3: Glob versions prior to v9 are no longer supported\n82 packages are looking for funding\n3 moderate severity vulnerabilities\n\nTo address all issues, run:\n npm audit fix\n\nRun `npm audit` for details." }, { @@ -84,9 +84,9 @@ "command": "pnpm install", "category": "build-test", "tier": "rust modules", - "original_bytes": 540, + "original_bytes": 558, "compressed_bytes": 180, - "original_text": "Scope: all 7 workspace projects\nLockfile is up to date, resolution step is skipped\nPackages: +821\n++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\nProgress: resolved 821, reused 814, downloaded 0, added 0\nProgress: resolved 821, reused 814, downloaded 0, added 138\nProgress: resolved 821, reused 814, downloaded 0, added 821, done\n\ndependencies:\n+ @modelcontextprotocol/sdk 1.18.1\n+ ai-tokenizer 1.0.6\n+ zod 4.1.12\n\ndevDependencies:\n+ @biomejs/biome 2.4.7\n+ typescript 5.8.3\n\nDone in 4.8s using pnpm v9.15.9\n", + "original_text": "Scope: all 7 workspace projects\r\nLockfile is up to date, resolution step is skipped\r\nPackages: +821\r\n++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\r\nProgress: resolved 821, reused 814, downloaded 0, added 0\r\nProgress: resolved 821, reused 814, downloaded 0, 
added 138\r\nProgress: resolved 821, reused 814, downloaded 0, added 821, done\r\n\r\ndependencies:\r\n+ @modelcontextprotocol/sdk 1.18.1\r\n+ ai-tokenizer 1.0.6\r\n+ zod 4.1.12\r\n\r\ndevDependencies:\r\n+ @biomejs/biome 2.4.7\r\n+ typescript 5.8.3\r\n\r\nDone in 4.8s using pnpm v9.15.9\r\n", "compressed_text": "Progress: resolved 821, reused 814, downloaded 0, added 0\nProgress: resolved 821, reused 814, downloaded 0, added 138\ndependencies:\ndevDependencies:\nDone in 4.8s using pnpm v9.15.9" }, { @@ -94,9 +94,9 @@ "command": "pytest -q", "category": "build-test", "tier": "rust modules", - "original_bytes": 1602, + "original_bytes": 1632, "compressed_bytes": 877, - "original_text": "============================= test session starts ==============================\nplatform darwin -- Python 3.12.4, pytest-8.3.3, pluggy-1.5.0\nrootdir: /Users/ufukaltinok/Work/OSS/example-service\nconfigfile: pyproject.toml\nplugins: anyio-4.6.0, asyncio-0.24.0, cov-5.0.0\ncollected 146 items\n\ntests/test_api.py ...................... [ 15%]\ntests/test_auth.py .............F.... [ 27%]\ntests/test_cache.py ........................ [ 43%]\ntests/test_cli.py ....................... [ 58%]\ntests/test_storage.py ............................... [ 79%]\ntests/test_workers.py .............................. 
[100%]\n\n=================================== FAILURES ===================================\n_______________________ test_refresh_token_rejects_reuse _______________________\n\nclient = \n\n async def test_refresh_token_rejects_reuse(client):\n first = await client.post('/auth/refresh', json={'token': TOKEN})\n second = await client.post('/auth/refresh', json={'token': TOKEN})\n> assert second.status_code == 401\nE assert 200 == 401\nE + where 200 = .status_code\n\ntests/test_auth.py:87: AssertionError\n=========================== short test summary info ============================\nFAILED tests/test_auth.py::test_refresh_token_rejects_reuse - assert 200 == 401\n======================== 1 failed, 145 passed in 9.41s =========================\n", + "original_text": "============================= test session starts ==============================\r\nplatform darwin -- Python 3.12.4, pytest-8.3.3, pluggy-1.5.0\r\nrootdir: /Users/ufukaltinok/Work/OSS/example-service\r\nconfigfile: pyproject.toml\r\nplugins: anyio-4.6.0, asyncio-0.24.0, cov-5.0.0\r\ncollected 146 items\r\n\r\ntests/test_api.py ...................... [ 15%]\r\ntests/test_auth.py .............F.... [ 27%]\r\ntests/test_cache.py ........................ [ 43%]\r\ntests/test_cli.py ....................... [ 58%]\r\ntests/test_storage.py ............................... [ 79%]\r\ntests/test_workers.py .............................. 
[100%]\r\n\r\n=================================== FAILURES ===================================\r\n_______________________ test_refresh_token_rejects_reuse _______________________\r\n\r\nclient = \r\n\r\n async def test_refresh_token_rejects_reuse(client):\r\n first = await client.post('/auth/refresh', json={'token': TOKEN})\r\n second = await client.post('/auth/refresh', json={'token': TOKEN})\r\n> assert second.status_code == 401\r\nE assert 200 == 401\r\nE + where 200 = .status_code\r\n\r\ntests/test_auth.py:87: AssertionError\r\n=========================== short test summary info ============================\r\nFAILED tests/test_auth.py::test_refresh_token_rejects_reuse - assert 200 == 401\r\n======================== 1 failed, 145 passed in 9.41s =========================\r\n", "compressed_text": "platform darwin -- Python 3.12.4, pytest-8.3.3, pluggy-1.5.0\nrootdir: /Users/ufukaltinok/Work/OSS/example-service\ncollected 146 items\n=================================== FAILURES ===================================\n_______________________ test_refresh_token_rejects_reuse _______________________\n\nclient = \n\n async def test_refresh_token_rejects_reuse(client):\n first = await client.post('/auth/refresh', json={'token': TOKEN})\n second = await client.post('/auth/refresh', json={'token': TOKEN})\n> assert second.status_code == 401\nE assert 200 == 401\nE + where 200 = .status_code\n\ntests/test_auth.py:87: AssertionError\n=========================== short test summary info ============================\n======================== 1 failed, 145 passed in 9.41s =========================" }, { @@ -104,9 +104,9 @@ "command": "eslint . --format stylish", "category": "lint", "tier": "rust modules", - "original_bytes": 619, + "original_bytes": 630, "compressed_bytes": 546, - "original_text": "\n/Users/ufukaltinok/Work/OSS/web/src/App.tsx\n 12:7 warning 'unused' is assigned a value but never used @typescript-eslint/no-unused-vars\n 48:13 error Unexpected any. 
Specify a different type @typescript-eslint/no-explicit-any\n 93:5 error React Hook useEffect has a missing dependency react-hooks/exhaustive-deps\n\n/Users/ufukaltinok/Work/OSS/web/src/lib/api.ts\n 21:10 error 'ResponsePayload' is defined but never used @typescript-eslint/no-unused-vars\n 77:3 warning Unexpected console statement no-console\n\n✖ 5 problems (3 errors, 2 warnings)\n", + "original_text": "\r\n/Users/ufukaltinok/Work/OSS/web/src/App.tsx\r\n 12:7 warning 'unused' is assigned a value but never used @typescript-eslint/no-unused-vars\r\n 48:13 error Unexpected any. Specify a different type @typescript-eslint/no-explicit-any\r\n 93:5 error React Hook useEffect has a missing dependency react-hooks/exhaustive-deps\r\n\r\n/Users/ufukaltinok/Work/OSS/web/src/lib/api.ts\r\n 21:10 error 'ResponsePayload' is defined but never used @typescript-eslint/no-unused-vars\r\n 77:3 warning Unexpected console statement no-console\r\n\r\n✖ 5 problems (3 errors, 2 warnings)\r\n", "compressed_text": "/Users/ufukaltinok/Work/OSS/web/src/App.tsx\n 12:7 warning @typescript-eslint/no-unused-vars 'unused' is assigned a value but never used\n 48:13 error @typescript-eslint/no-explicit-any Unexpected any. 
Specify a different type\n 93:5 error react-hooks/exhaustive-deps React Hook useEffect has a missing dependency\n/Users/ufukaltinok/Work/OSS/web/src/lib/api.ts\n 21:10 error @typescript-eslint/no-unused-vars 'ResponsePayload' is defined but never used\n 77:3 warning no-console Unexpected console statement\n\n✖ 5 problems (3 errors, 2 warnings)" }, { @@ -114,9 +114,9 @@ "command": "biome check .", "category": "lint", "tier": "rust modules", - "original_bytes": 900, + "original_bytes": 921, "compressed_bytes": 61, - "original_text": "src/hooks/useSession.ts:14:7 lint/correctness/noUnusedVariables ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n\n ✖ This variable is unused.\n\n 12 │ export function useSession() {\n 13 │ const [session, setSession] = useState(null);\n > 14 │ const debugSession = session;\n │ ^^^^^^^^^^^^\n 15 │ return session;\n 16 │ }\n\nsrc/components/Button.tsx format ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n\n ✖ Formatter would have printed the following content:\n\n 8 8 │ export function Button(props: Props) {\n 9 │ - return