From b337d6f98a74b7a39b84c00503a7f469095316a6 Mon Sep 17 00:00:00 2001 From: Daniel Ellison Date: Thu, 16 Apr 2026 13:49:39 -0400 Subject: [PATCH 1/5] Seed existing MEMORY.md knowledge into Mem0 (#311) --- src/kai/main.py | 49 ++++ src/kai/memory.py | 333 ++++++++++++++++++++++++ tests/test_main.py | 160 +++++++++++- tests/test_memory.py | 584 +++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 1123 insertions(+), 3 deletions(-) diff --git a/src/kai/main.py b/src/kai/main.py index fd6e229c..4f237173 100644 --- a/src/kai/main.py +++ b/src/kai/main.py @@ -305,6 +305,55 @@ async def _init_and_run() -> None: except Exception: logging.warning("Could not initialize semantic memory", exc_info=True) + # Seed Mem0 with existing MEMORY.md topic-file content on first startup + # after memory system install. Per-user flag: each user_id gets its own + # flag so later-added users trigger their own seed on next startup. + # Import is_enabled fresh here rather than relying on memory_is_enabled + # from the try block above, which is undefined if the import failed. + try: + from kai.memory import is_enabled as _memory_ready + from kai.memory import seed_from_memory_md + + if config.memory_enabled and _memory_ready(): + # Collect user_ids that still need seeding. Skip any user whose + # flag is already set (prior successful run for that user). + user_ids_to_seed: list[str] = [] + for user_id_int in sorted(config.allowed_user_ids): + flag_key = f"memory_seeded:{user_id_int}" + if await sessions.get_setting(flag_key) is None: + user_ids_to_seed.append(str(user_id_int)) + + if user_ids_to_seed: + # Run the seed in a thread executor: the seed does synchronous + # Qdrant I/O per entry and should not block the event loop for + # the minute or two the first-run migration takes. Subsequent + # runs are no-ops (dedup short-circuits), but we still offload + # to keep the startup path non-blocking. 
+ loop = asyncio.get_running_loop() + counts = await loop.run_in_executor( + None, + lambda: seed_from_memory_md(user_ids=user_ids_to_seed), + ) + # Set the per-user flag ONLY for users whose seed had no + # failures. Users with partial failures will retry on the + # next startup, and dedup will skip the already-seeded entries. + for user_id_str, user_counts in counts.items(): + if user_counts["failed"] == 0: + flag_key = f"memory_seeded:{user_id_str}" + await sessions.set_setting(flag_key, "1") + logging.info("Memory seed flag set for user_id=%s", user_id_str) + else: + logging.warning( + "Memory seed for user_id=%s had %d failures; flag NOT set, will retry on next startup", + user_id_str, + user_counts["failed"], + ) + except Exception: + # Migration failure is non-fatal. The bot runs without seed data; + # Track 1 exchange ingestion continues to populate the store. + # Also catches ImportError if mem0 is not installed. + logging.warning("Memory seed migration failed", exc_info=True) + try: # Retry initialization if the network isn't ready yet (e.g. after a # power outage where DNS may take a while to come back). diff --git a/src/kai/memory.py b/src/kai/memory.py index c885c427..5ef180d0 100644 --- a/src/kai/memory.py +++ b/src/kai/memory.py @@ -21,6 +21,7 @@ import logging import os from dataclasses import dataclass +from pathlib import Path from kai.config import DATA_DIR, Config @@ -370,6 +371,338 @@ async def add_exchange( log.warning("Memory ingestion failed", exc_info=True) +# ── Structured ingestion (Track 2 primitive) ────────────────────── + + +def add_structured( + content: str, + *, + user_id: str, + memory_type: str = "fact", + tags: list[str] | None = None, + metadata: dict | None = None, +) -> str | None: + """ + Store a single structured memory with explicit type and metadata. + + This is the Track 2 primitive: the caller pre-extracts the content + (no LLM call inside Mem0). 
Used by the seed migration here, and by + the REST /api/memory/add endpoint in a later PR (#308). + + Args: + content: The memory text to store. Must be non-empty after stripping. + user_id: Telegram chat_id as a string. Mem0 isolates memories per user. + memory_type: Free-form type tag. Current callers use "fact" or + "preference". Future callers may use "episode" or "self_assessment". + Stored in metadata["type"]; no validation is performed so future + types do not require a code change here. + tags: Optional list of topic tags. Stored in metadata["tags"]. + metadata: Optional additional key/value pairs. Merged into the final + metadata dict; the keys "type" and "tags" are reserved and will + be overwritten by memory_type and tags arguments. + + Returns: + The Mem0 memory ID as a string on success. None if memory is + disabled or the store call failed. Mem0 v2.0.0's add() return + shape is not strictly typed; this function unwraps the common + shapes (dict with "results" list, bare dict, None) and returns + the first memory ID found or None if none is present. + """ + # Memory disabled or init failed: no-op. Matches add_exchange() behavior. + if _memory is None: + return None + + # Reject empty content. Mem0 will silently no-op on empty strings but + # the caller will think storage succeeded. Caller bug, not our problem, + # but cheap to catch here. + if not content.strip(): + return None + + # Build the metadata dict. Caller-provided metadata comes first so + # the reserved keys (type, tags) can override caller values. + final_metadata: dict = dict(metadata) if metadata else {} + final_metadata["type"] = memory_type + if tags is not None: + final_metadata["tags"] = tags + + try: + # infer=False means no LLM call; Mem0 only embeds + stores. + # This is the entire point of the Track 2 primitive. 
+ raw = _memory.add( + content, + user_id=user_id, + infer=False, + metadata=final_metadata, + ) + except Exception: + log.warning("add_structured failed (user_id=%s)", user_id, exc_info=True) + return None + + # Mem0 v2.0.0 returns either {"results": [{"id": ..., ...}]} or a + # bare dict in some code paths. Normalize to return the first id. + if isinstance(raw, dict): + results = raw.get("results") + if isinstance(results, list) and results: + first = results[0] + if isinstance(first, dict): + return first.get("id") + # Bare dict fallthrough (some Mem0 versions return the memory directly) + return raw.get("id") + return None + + +# ── Migration: seed from MEMORY.md topic files ──────────────────── + + +def _classify_source_file(filename: str) -> str | None: + """ + Map a topic file name to its memory type, or None to skip. + + Classification is deterministic by file name. The topic file structure + under /var/lib/kai/memory/ is already a hand-curated taxonomy, so file + name IS the type. New topic files added in the future must be added + to this mapping explicitly (do not default to "fact" for unknowns). + + Returns: + "fact" or "preference" for a file that should be seeded. + None for the MEMORY.md index, api-reference.md, or any unknown file. + """ + mapping = { + "preferences.md": "preference", + "hard-lessons.md": "preference", + "user.md": "fact", + "projects.md": "fact", + "notes.md": "fact", + "planned-features.md": "fact", + } + # Explicit skip list. api-reference.md is already in the system prompt, + # so seeding it would create duplicate matches on every scheduling query. + # MEMORY.md is the index file: pointers, not content. + if filename in ("MEMORY.md", "api-reference.md"): + return None + return mapping.get(filename) + + +def _parse_topic_file(path: Path) -> list[dict]: + """ + Parse a markdown topic file into memory candidates. + + Grammar: + - Lines beginning with "- " (after optional indent) are bullet items. 
+ Each bullet becomes one memory candidate. Indented continuation + lines under a bullet are NOT merged; they fall through to the + paragraph accumulator and become their own candidate. No current + topic file relies on bullet continuations, so the simpler single- + line bullet rule is sufficient. + - Lines beginning with "#" are headings. Headings are NOT seeded as + memories; they are used as context prefixes. The most recent heading + before a bullet/paragraph is stored in the candidate's "heading" key. + - Non-empty, non-heading, non-bullet lines are paragraph text. + Consecutive paragraph lines are joined with spaces and become a + single memory candidate when a blank line or heading or bullet + terminates the paragraph block. + - Code blocks (fenced with ```) are skipped entirely. They are + reference syntax for humans, not facts to embed. + + Args: + path: Absolute path to a markdown topic file. + + Returns: + List of dicts shaped {"content": str, "heading": str (optional)}. + Empty list if the file has no memory-worthy content. + + Raises: + OSError: If the file cannot be read (caller catches and counts + as one failure, then continues with the next file). + """ + text = path.read_text(encoding="utf-8") + lines = text.splitlines() + + candidates: list[dict] = [] + current_heading: str = "" + paragraph_buffer: list[str] = [] + in_code_block = False + + def flush_paragraph() -> None: + # Join the buffered paragraph lines into one candidate and clear + # the buffer. Called at blank lines, headings, bullets, and EOF. + if paragraph_buffer: + joined = " ".join(paragraph_buffer).strip() + if joined: + para_entry: dict = {"content": joined} + if current_heading: + para_entry["heading"] = current_heading + candidates.append(para_entry) + paragraph_buffer.clear() + + for raw in lines: + # Toggle code-block state on fence lines. Everything inside is + # treated as skip-worthy text (not seeded). 
+ stripped = raw.strip() + if stripped.startswith("```"): + in_code_block = not in_code_block + flush_paragraph() + continue + if in_code_block: + continue + + # Blank line terminates any buffered paragraph. + if not stripped: + flush_paragraph() + continue + + # Heading line: record as current_heading; do not seed as content. + if stripped.startswith("#"): + flush_paragraph() + # Strip leading # characters and whitespace to get the heading text. + current_heading = stripped.lstrip("#").strip() + continue + + # Bullet line: flush any paragraph, then add this bullet as its own + # candidate. Bullet content is the text after "- ". + if stripped.startswith("- "): + flush_paragraph() + bullet_text = stripped[2:].strip() + if bullet_text: + bullet_entry: dict = {"content": bullet_text} + if current_heading: + bullet_entry["heading"] = current_heading + candidates.append(bullet_entry) + continue + + # Otherwise: paragraph line. Accumulate into the paragraph buffer. + paragraph_buffer.append(stripped) + + # EOF: flush any remaining paragraph. + flush_paragraph() + return candidates + + +def _is_duplicate(content: str, *, user_id: str, threshold: float = 0.9) -> bool: + """ + Check whether a memory with near-identical content already exists. + + Runs a top-1 semantic search against the user's memory space; returns + True if the best match's score exceeds the threshold. This lets reruns + and partial-failure recoveries skip already-seeded content rather than + duplicating it. + + Args: + content: The candidate memory text. + user_id: Telegram chat_id as string. + threshold: Minimum score to be considered a duplicate. 0.9 is + intentionally high so that genuinely different content does + not get skipped, at the cost of tolerating some near-duplicates. + + Returns: + True if a duplicate exists; False otherwise (including if the + store is empty, memory is disabled, or the search itself failed). 
+ """ + results = search(content, user_id=user_id, limit=1) + if not results: + return False + return results[0].score >= threshold + + +def seed_from_memory_md( + *, + user_ids: list[str], + memory_dir: Path | None = None, +) -> dict[str, dict[str, int]]: + """ + One-time migration: seed Mem0 with content from topic files in + DATA_DIR/memory/. + + Iterates over user_ids, parses each topic file, classifies each entry + by source file (see _classify_source_file), and calls add_structured() + per entry. Deduplicates via pre-insert search so reruns and partial + failures are safe. Does NOT set any settings flag; the caller + (main.py) owns flag management so per-user completion can be tracked + atomically with the insert loop. + + Args: + user_ids: List of Telegram chat_ids as strings. Each user_id gets + its own copy of the seeded content (Mem0 partitions by user_id). + memory_dir: Override the memory directory path (for tests). + Defaults to DATA_DIR / "memory". + + Returns: + Per-user counts: {user_id: {"seeded": N, "skipped": M, "failed": K}}. + "seeded" is the number of memories newly added. "skipped" is the + number deduplicated against existing memories. "failed" is the + number of parse or add exceptions (counted per candidate entry). + """ + # If memory is disabled, return empty per-user counts so the caller + # does not treat this as a successful migration. + if _memory is None: + return {uid: {"seeded": 0, "skipped": 0, "failed": 0} for uid in user_ids} + + target_dir = memory_dir if memory_dir is not None else DATA_DIR / "memory" + + # Collect topic files to process, in a stable order so test output is + # deterministic. _classify_source_file returns None for files we skip + # (MEMORY.md index, api-reference.md, unknown files). 
+ topic_files: list[tuple[Path, str]] = [] + for path in sorted(target_dir.glob("*.md")): + memory_type = _classify_source_file(path.name) + if memory_type is not None: + topic_files.append((path, memory_type)) + + per_user_counts: dict[str, dict[str, int]] = {} + for user_id in user_ids: + counts = {"seeded": 0, "skipped": 0, "failed": 0} + for path, memory_type in topic_files: + # Parse errors surface as empty entries lists; we still count + # the individual parse failures via _parse_topic_file. + try: + entries = _parse_topic_file(path) + except OSError: + # File unreadable; count as one failure and move on to the + # next topic file. Do not abort the whole migration. + log.warning("Could not read %s during seed", path, exc_info=True) + counts["failed"] += 1 + continue + + for entry in entries: + # Pre-insert dedup: skip if an existing memory for this + # user already scores > 0.9 against the candidate content. + if _is_duplicate(entry["content"], user_id=user_id): + counts["skipped"] += 1 + continue + + # Build the metadata for this entry. source_file lets #310 + # (/memory Telegram command) show provenance later. + meta = { + "source": "memory_md_migration", + "source_file": path.name, + } + if "heading" in entry: + meta["heading"] = entry["heading"] + + memory_id = add_structured( + entry["content"], + user_id=user_id, + memory_type=memory_type, + tags=[path.stem], # e.g. ["preferences"] from preferences.md + metadata=meta, + ) + if memory_id is None: + counts["failed"] += 1 + else: + counts["seeded"] += 1 + + per_user_counts[user_id] = counts + log.info( + "Seed complete for user_id=%s: %d seeded, %d skipped, %d failed", + user_id, + counts["seeded"], + counts["skipped"], + counts["failed"], + ) + + return per_user_counts + + def get_all(*, user_id: str) -> list[MemoryResult]: """ Get all memories for a user. 
diff --git a/tests/test_main.py b/tests/test_main.py index 64462ee9..c5ed649c 100644 --- a/tests/test_main.py +++ b/tests/test_main.py @@ -1,9 +1,11 @@ """ -Tests for main.py - setup_logging(), _bootstrap_memory(), and _file_age/_file_cleanup_loop. +Tests for main.py - setup_logging(), _bootstrap_memory(), _file_age/_file_cleanup_loop, +and memory seed migration integration. The main() and _init_and_run() functions orchestrate the full application lifecycle and are impractical to unit test. The helper functions are -testable in isolation. +testable in isolation. The memory seed tests verify the integration logic +(flag checks, flag setting) using mocked memory and session modules. """ import asyncio @@ -11,7 +13,7 @@ from datetime import UTC, datetime from logging.handlers import TimedRotatingFileHandler from pathlib import Path -from unittest.mock import patch +from unittest.mock import AsyncMock, MagicMock, patch import pytest @@ -385,3 +387,155 @@ async def mock_sleep(duration): # Error was logged mock_log.assert_called() # Should not raise - error is counted, not propagated + + +# ── Memory seed migration integration ────────────────────────────── + + +class TestMemorySeedIntegration: + """ + Tests for the memory seed migration block in _init_and_run(). + + These tests verify the flag-check/flag-set logic by directly + exercising the seed integration path with mocked memory and session + modules. The actual seed_from_memory_md() behavior is thoroughly + tested in test_memory.py; these tests focus on the orchestration. 
+ """ + + async def test_seed_runs_when_flag_absent(self): + """When no memory_seeded flag exists, seed_from_memory_md is called.""" + mock_seed = MagicMock(return_value={"123": {"seeded": 10, "skipped": 0, "failed": 0}}) + mock_is_enabled = MagicMock(return_value=True) + mock_get_setting = AsyncMock(return_value=None) + mock_set_setting = AsyncMock() + + with ( + patch("kai.memory.is_enabled", mock_is_enabled), + patch("kai.memory.seed_from_memory_md", mock_seed), + patch("kai.sessions.get_setting", mock_get_setting), + patch("kai.sessions.set_setting", mock_set_setting), + ): + # Import fresh to get the patched modules + from kai.memory import is_enabled as _memory_ready + from kai.memory import seed_from_memory_md + + # Simulate the seed block from main.py + memory_enabled = True + allowed_user_ids = {123} + + if memory_enabled and _memory_ready(): + from kai import sessions + + user_ids_to_seed: list[str] = [] + for user_id_int in sorted(allowed_user_ids): + flag_key = f"memory_seeded:{user_id_int}" + if await sessions.get_setting(flag_key) is None: + user_ids_to_seed.append(str(user_id_int)) + + if user_ids_to_seed: + loop = asyncio.get_running_loop() + counts = await loop.run_in_executor( + None, + lambda: seed_from_memory_md(user_ids=user_ids_to_seed), + ) + for user_id_str, user_counts in counts.items(): + if user_counts["failed"] == 0: + flag_key = f"memory_seeded:{user_id_str}" + await sessions.set_setting(flag_key, "1") + + mock_seed.assert_called_once_with(user_ids=["123"]) + mock_set_setting.assert_called_once_with("memory_seeded:123", "1") + + async def test_seed_skipped_when_flag_present(self): + """When memory_seeded flag exists, seed_from_memory_md is NOT called.""" + mock_seed = MagicMock() + mock_is_enabled = MagicMock(return_value=True) + # Flag already set - return a non-None value + mock_get_setting = AsyncMock(return_value="1") + + with ( + patch("kai.memory.is_enabled", mock_is_enabled), + patch("kai.memory.seed_from_memory_md", mock_seed), 
+ patch("kai.sessions.get_setting", mock_get_setting), + ): + from kai.memory import is_enabled as _memory_ready + from kai.memory import seed_from_memory_md + + memory_enabled = True + allowed_user_ids = {123} + + if memory_enabled and _memory_ready(): + from kai import sessions + + user_ids_to_seed: list[str] = [] + for user_id_int in sorted(allowed_user_ids): + flag_key = f"memory_seeded:{user_id_int}" + if await sessions.get_setting(flag_key) is None: + user_ids_to_seed.append(str(user_id_int)) + + if user_ids_to_seed: + seed_from_memory_md(user_ids=user_ids_to_seed) + + mock_seed.assert_not_called() + + async def test_seed_flag_not_set_on_failure(self): + """When seed reports failures, the flag is NOT set for that user.""" + mock_seed = MagicMock(return_value={"123": {"seeded": 5, "skipped": 0, "failed": 2}}) + mock_is_enabled = MagicMock(return_value=True) + mock_get_setting = AsyncMock(return_value=None) + mock_set_setting = AsyncMock() + + with ( + patch("kai.memory.is_enabled", mock_is_enabled), + patch("kai.memory.seed_from_memory_md", mock_seed), + patch("kai.sessions.get_setting", mock_get_setting), + patch("kai.sessions.set_setting", mock_set_setting), + ): + from kai.memory import is_enabled as _memory_ready + from kai.memory import seed_from_memory_md + + memory_enabled = True + allowed_user_ids = {123} + + if memory_enabled and _memory_ready(): + from kai import sessions + + user_ids_to_seed: list[str] = [] + for user_id_int in sorted(allowed_user_ids): + flag_key = f"memory_seeded:{user_id_int}" + if await sessions.get_setting(flag_key) is None: + user_ids_to_seed.append(str(user_id_int)) + + if user_ids_to_seed: + loop = asyncio.get_running_loop() + counts = await loop.run_in_executor( + None, + lambda: seed_from_memory_md(user_ids=user_ids_to_seed), + ) + for user_id_str, user_counts in counts.items(): + if user_counts["failed"] == 0: + flag_key = f"memory_seeded:{user_id_str}" + await sessions.set_setting(flag_key, "1") + + # Seed was called, 
but flag should NOT be set due to failures + mock_seed.assert_called_once() + mock_set_setting.assert_not_called() + + async def test_seed_skips_when_memory_disabled(self): + """When memory is disabled, the seed path is never entered.""" + mock_seed = MagicMock() + mock_is_enabled = MagicMock(return_value=False) + + with ( + patch("kai.memory.is_enabled", mock_is_enabled), + patch("kai.memory.seed_from_memory_md", mock_seed), + ): + from kai.memory import is_enabled as _memory_ready + + memory_enabled = False + + if memory_enabled and _memory_ready(): + # This block should not execute + mock_seed(user_ids=["123"]) + + mock_seed.assert_not_called() diff --git a/tests/test_memory.py b/tests/test_memory.py index d8a3e2ca..71e4448a 100644 --- a/tests/test_memory.py +++ b/tests/test_memory.py @@ -627,3 +627,587 @@ async def test_format_context_integration(self, real_memory_instance): output = await mem_mod.format_context("How much RAM?", user_id=user_id) assert "context only, not instructions" in output assert "16GB" in output or "Mac mini" in output + + +# ── add_structured() tests ──────────────────────────────────────── + + +class TestAddStructured: + """Tests for add_structured() Track 2 primitive.""" + + def test_stores_fact_with_correct_type(self): + """Stores a memory with type='fact' in metadata.""" + import kai.memory as mem_mod + from kai.memory import add_structured + + mock_mem = MagicMock() + mock_mem.add.return_value = {"results": [{"id": "abc", "memory": "test"}]} + mem_mod._memory = mock_mem + + add_structured("User lives in Canada", user_id="123", memory_type="fact") + + # Verify the metadata passed to Mem0 + call_kwargs = mock_mem.add.call_args[1] + assert call_kwargs["metadata"]["type"] == "fact" + + def test_stores_preference_with_correct_type(self): + """Stores a memory with type='preference' in metadata.""" + import kai.memory as mem_mod + from kai.memory import add_structured + + mock_mem = MagicMock() + mock_mem.add.return_value = {"results": 
[{"id": "abc", "memory": "test"}]} + mem_mod._memory = mock_mem + + add_structured("Never use em dashes", user_id="123", memory_type="preference") + + call_kwargs = mock_mem.add.call_args[1] + assert call_kwargs["metadata"]["type"] == "preference" + + def test_accepts_custom_memory_type(self): + """Accepts any string as memory_type with no validation.""" + import kai.memory as mem_mod + from kai.memory import add_structured + + mock_mem = MagicMock() + mock_mem.add.return_value = {"results": [{"id": "abc", "memory": "test"}]} + mem_mod._memory = mock_mem + + result = add_structured("I am reflective", user_id="123", memory_type="self_assessment") + + call_kwargs = mock_mem.add.call_args[1] + assert call_kwargs["metadata"]["type"] == "self_assessment" + assert result == "abc" + + def test_merges_metadata(self): + """Caller-provided metadata is merged with type and tags.""" + import kai.memory as mem_mod + from kai.memory import add_structured + + mock_mem = MagicMock() + mock_mem.add.return_value = {"results": [{"id": "abc", "memory": "test"}]} + mem_mod._memory = mock_mem + + add_structured("test", user_id="123", metadata={"foo": "bar"}) + + call_kwargs = mock_mem.add.call_args[1] + assert call_kwargs["metadata"]["foo"] == "bar" + assert call_kwargs["metadata"]["type"] == "fact" + + def test_reserved_keys_override(self): + """Reserved keys (type, tags) override caller-provided metadata.""" + import kai.memory as mem_mod + from kai.memory import add_structured + + mock_mem = MagicMock() + mock_mem.add.return_value = {"results": [{"id": "abc", "memory": "test"}]} + mem_mod._memory = mock_mem + + add_structured( + "test", + user_id="123", + memory_type="preference", + metadata={"type": "spoof"}, + ) + + call_kwargs = mock_mem.add.call_args[1] + assert call_kwargs["metadata"]["type"] == "preference" + + def test_stores_tags(self): + """Tags are stored in metadata['tags'].""" + import kai.memory as mem_mod + from kai.memory import add_structured + + mock_mem = 
MagicMock() + mock_mem.add.return_value = {"results": [{"id": "abc", "memory": "test"}]} + mem_mod._memory = mock_mem + + add_structured("test", user_id="123", tags=["a", "b"]) + + call_kwargs = mock_mem.add.call_args[1] + assert call_kwargs["metadata"]["tags"] == ["a", "b"] + + def test_empty_content_returns_none(self): + """Empty or whitespace-only content returns None without calling Mem0.""" + import kai.memory as mem_mod + from kai.memory import add_structured + + mock_mem = MagicMock() + mem_mod._memory = mock_mem + + assert add_structured("", user_id="123") is None + assert add_structured(" ", user_id="123") is None + mock_mem.add.assert_not_called() + + def test_disabled_returns_none(self): + """Returns None when memory is not initialized.""" + from kai.memory import add_structured + + assert add_structured("test", user_id="123") is None + + def test_returns_id_string(self): + """Returns the Mem0 memory ID as a string on success.""" + import kai.memory as mem_mod + from kai.memory import add_structured + + mock_mem = MagicMock() + mock_mem.add.return_value = {"results": [{"id": "mem-uuid-123", "memory": "test"}]} + mem_mod._memory = mock_mem + + result = add_structured("test", user_id="123") + assert result == "mem-uuid-123" + assert isinstance(result, str) + + def test_mem0_failure_returns_none_and_logs(self, caplog): + """Mem0 add() exceptions are caught, logged, and return None.""" + import kai.memory as mem_mod + from kai.memory import add_structured + + mock_mem = MagicMock() + mock_mem.add.side_effect = RuntimeError("disk full") + mem_mod._memory = mock_mem + + with caplog.at_level("WARNING", logger="kai.memory"): + result = add_structured("test", user_id="123") + + assert result is None + assert "add_structured failed" in caplog.text + + +# ── _parse_topic_file() tests ───────────────────────────────────── + + +class TestParseTopicFile: + """Tests for _parse_topic_file() markdown parser.""" + + def test_bullets_become_candidates(self, tmp_path): + 
"""Each bullet line becomes one memory candidate.""" + from kai.memory import _parse_topic_file + + f = tmp_path / "test.md" + f.write_text("# Heading\n\n- First item\n- Second item\n- Third item\n") + + result = _parse_topic_file(f) + assert len(result) == 3 + assert result[0]["content"] == "First item" + assert result[1]["content"] == "Second item" + + def test_headings_stored_as_context(self, tmp_path): + """Heading text is stored in the 'heading' key, not as content.""" + from kai.memory import _parse_topic_file + + f = tmp_path / "test.md" + f.write_text("# Main\n\n## Communication\n\n- Be concise\n") + + result = _parse_topic_file(f) + assert len(result) == 1 + assert result[0]["content"] == "Be concise" + assert result[0]["heading"] == "Communication" + + def test_paragraphs_become_candidates(self, tmp_path): + """Non-bullet, non-heading text is joined into paragraph candidates.""" + from kai.memory import _parse_topic_file + + f = tmp_path / "test.md" + f.write_text("# Notes\n\nFirst line of paragraph.\nSecond line of paragraph.\n\nAnother paragraph.\n") + + result = _parse_topic_file(f) + assert len(result) == 2 + assert result[0]["content"] == "First line of paragraph. Second line of paragraph." + assert result[1]["content"] == "Another paragraph." 
+ + def test_code_blocks_skipped(self, tmp_path): + """Content inside fenced code blocks is not seeded.""" + from kai.memory import _parse_topic_file + + f = tmp_path / "test.md" + f.write_text( + "# Reference\n\n- Real memory\n\n```\n- Not a memory\nAlso not a memory\n```\n\n- Another real one\n" + ) + + result = _parse_topic_file(f) + contents = [r["content"] for r in result] + assert "Real memory" in contents + assert "Another real one" in contents + assert "Not a memory" not in contents + assert "Also not a memory" not in contents + + def test_empty_file_returns_empty(self, tmp_path): + """An empty file produces no candidates.""" + from kai.memory import _parse_topic_file + + f = tmp_path / "test.md" + f.write_text("") + + assert _parse_topic_file(f) == [] + + def test_heading_only_file_returns_empty(self, tmp_path): + """A file with only headings and no content produces no candidates.""" + from kai.memory import _parse_topic_file + + f = tmp_path / "test.md" + f.write_text("# Title\n\n## Section\n\n### Subsection\n") + + assert _parse_topic_file(f) == [] + + +# ── _classify_source_file() tests ───────────────────────────────── + + +class TestClassifySourceFile: + """Tests for _classify_source_file() file-to-type mapping.""" + + def test_known_files(self): + """Known files map to their expected types.""" + from kai.memory import _classify_source_file + + assert _classify_source_file("preferences.md") == "preference" + assert _classify_source_file("hard-lessons.md") == "preference" + assert _classify_source_file("user.md") == "fact" + assert _classify_source_file("projects.md") == "fact" + assert _classify_source_file("notes.md") == "fact" + assert _classify_source_file("planned-features.md") == "fact" + + def test_skip_files(self): + """MEMORY.md and api-reference.md return None (skip).""" + from kai.memory import _classify_source_file + + assert _classify_source_file("MEMORY.md") is None + assert _classify_source_file("api-reference.md") is None + + def 
test_unknown_files_return_none(self): + """Unknown files default to None (skip), not 'fact'.""" + from kai.memory import _classify_source_file + + assert _classify_source_file("random.md") is None + assert _classify_source_file("todo.md") is None + + +# ── seed_from_memory_md() tests ─────────────────────────────────── + + +class TestSeedFromMemoryMd: + """Tests for seed_from_memory_md() one-time migration.""" + + def test_parses_preferences_file_as_preference(self, tmp_path): + """Preferences file bullets are seeded with type='preference'.""" + import kai.memory as mem_mod + from kai.memory import seed_from_memory_md + + mock_mem = MagicMock() + mock_mem.add.return_value = {"results": [{"id": "abc", "memory": "test"}]} + mock_mem.search.return_value = {"results": []} + mem_mod._memory = mock_mem + + memory_dir = tmp_path / "memory" + memory_dir.mkdir() + (memory_dir / "preferences.md").write_text("# Preferences\n\n- Item A\n- Item B\n- Item C\n") + + counts = seed_from_memory_md(user_ids=["123"], memory_dir=memory_dir) + + assert counts["123"]["seeded"] == 3 + # Verify all calls used memory_type="preference" via metadata + for call in mock_mem.add.call_args_list: + assert call[1]["metadata"]["type"] == "preference" + + def test_parses_user_file_as_fact(self, tmp_path): + """User file bullets are seeded with type='fact'.""" + import kai.memory as mem_mod + from kai.memory import seed_from_memory_md + + mock_mem = MagicMock() + mock_mem.add.return_value = {"results": [{"id": "abc", "memory": "test"}]} + mock_mem.search.return_value = {"results": []} + mem_mod._memory = mock_mem + + memory_dir = tmp_path / "memory" + memory_dir.mkdir() + (memory_dir / "user.md").write_text("# User\n\n- Location: Canada\n- Timezone: EST\n") + + counts = seed_from_memory_md(user_ids=["123"], memory_dir=memory_dir) + + assert counts["123"]["seeded"] == 2 + for call in mock_mem.add.call_args_list: + assert call[1]["metadata"]["type"] == "fact" + + def test_skips_api_reference_file(self, 
tmp_path): + """api-reference.md is not seeded even when present.""" + import kai.memory as mem_mod + from kai.memory import seed_from_memory_md + + mock_mem = MagicMock() + mock_mem.add.return_value = {"results": [{"id": "abc", "memory": "test"}]} + mock_mem.search.return_value = {"results": []} + mem_mod._memory = mock_mem + + memory_dir = tmp_path / "memory" + memory_dir.mkdir() + (memory_dir / "api-reference.md").write_text("# API\n\n- Endpoint A\n") + (memory_dir / "user.md").write_text("# User\n\n- Location: Canada\n") + + counts = seed_from_memory_md(user_ids=["123"], memory_dir=memory_dir) + + # Only user.md should be seeded, not api-reference.md + assert counts["123"]["seeded"] == 1 + for call in mock_mem.add.call_args_list: + assert call[1]["metadata"]["source_file"] != "api-reference.md" + + def test_skips_memory_md_index(self, tmp_path): + """MEMORY.md index file is not seeded.""" + import kai.memory as mem_mod + from kai.memory import seed_from_memory_md + + mock_mem = MagicMock() + mock_mem.add.return_value = {"results": [{"id": "abc", "memory": "test"}]} + mock_mem.search.return_value = {"results": []} + mem_mod._memory = mock_mem + + memory_dir = tmp_path / "memory" + memory_dir.mkdir() + (memory_dir / "MEMORY.md").write_text("# Memory\n\n- [User](user.md)\n") + (memory_dir / "user.md").write_text("# User\n\n- Location: Canada\n") + + counts = seed_from_memory_md(user_ids=["123"], memory_dir=memory_dir) + + assert counts["123"]["seeded"] == 1 + + def test_skips_unknown_files(self, tmp_path): + """Files not in the classification mapping are ignored.""" + import kai.memory as mem_mod + from kai.memory import seed_from_memory_md + + mock_mem = MagicMock() + mock_mem.add.return_value = {"results": [{"id": "abc", "memory": "test"}]} + mock_mem.search.return_value = {"results": []} + mem_mod._memory = mock_mem + + memory_dir = tmp_path / "memory" + memory_dir.mkdir() + (memory_dir / "random.md").write_text("# Random\n\n- Should be ignored\n") + + counts = 
seed_from_memory_md(user_ids=["123"], memory_dir=memory_dir) + + assert counts["123"]["seeded"] == 0 + mock_mem.add.assert_not_called() + + def test_is_idempotent_on_rerun(self, tmp_path): + """Second run skips all entries via dedup (skipped == first run's seeded).""" + import kai.memory as mem_mod + from kai.memory import seed_from_memory_md + + # Track stored memories to simulate search returning them on second run + stored: list[dict] = [] + call_count = 0 + + def mock_add(content, **kwargs): + nonlocal call_count + call_count += 1 + mem_id = f"id-{call_count}" + stored.append({"id": mem_id, "memory": content, "score": 0.95, "metadata": kwargs.get("metadata", {})}) + return {"results": [{"id": mem_id, "memory": content}]} + + def mock_search(query, **kwargs): + # Return the best match from stored memories (simulate high score for exact match) + for s in stored: + if s["memory"] == query: + return { + "results": [{"id": s["id"], "memory": s["memory"], "score": 0.95, "metadata": s["metadata"]}] + } + return {"results": []} + + mock_mem = MagicMock() + mock_mem.add.side_effect = mock_add + mock_mem.search.side_effect = mock_search + mem_mod._memory = mock_mem + # search() requires _config to be set (returns [] otherwise) + mem_mod._config = _make_config() + + memory_dir = tmp_path / "memory" + memory_dir.mkdir() + (memory_dir / "user.md").write_text("# User\n\n- Fact A\n- Fact B\n") + + # First run: seeds everything + counts1 = seed_from_memory_md(user_ids=["123"], memory_dir=memory_dir) + assert counts1["123"]["seeded"] == 2 + + # Second run: everything should be skipped via dedup + counts2 = seed_from_memory_md(user_ids=["123"], memory_dir=memory_dir) + assert counts2["123"]["skipped"] == 2 + assert counts2["123"]["seeded"] == 0 + + def test_multi_user_isolation(self, tmp_path): + """Each user_id gets their own copy of the seeded content.""" + import kai.memory as mem_mod + from kai.memory import seed_from_memory_md + + mock_mem = MagicMock() + 
mock_mem.add.return_value = {"results": [{"id": "abc", "memory": "test"}]} + mock_mem.search.return_value = {"results": []} + mem_mod._memory = mock_mem + + memory_dir = tmp_path / "memory" + memory_dir.mkdir() + (memory_dir / "user.md").write_text("# User\n\n- Fact A\n") + + counts = seed_from_memory_md(user_ids=["111", "222"], memory_dir=memory_dir) + + assert counts["111"]["seeded"] == 1 + assert counts["222"]["seeded"] == 1 + # Two calls total - one per user + assert mock_mem.add.call_count == 2 + + def test_partial_failure_counts_failures(self, tmp_path): + """File read errors are counted as failures; other files still seed.""" + import kai.memory as mem_mod + from kai.memory import seed_from_memory_md + + mock_mem = MagicMock() + mock_mem.add.return_value = {"results": [{"id": "abc", "memory": "test"}]} + mock_mem.search.return_value = {"results": []} + mem_mod._memory = mock_mem + + memory_dir = tmp_path / "memory" + memory_dir.mkdir() + (memory_dir / "user.md").write_text("# User\n\n- Fact A\n") + + # Create a notes.md that will fail to read by making it a directory + # (reading a directory raises OSError/IsADirectoryError) + (memory_dir / "notes.md").mkdir() + + counts = seed_from_memory_md(user_ids=["123"], memory_dir=memory_dir) + + assert counts["123"]["seeded"] == 1 # user.md succeeded + assert counts["123"]["failed"] == 1 # notes.md failed + + def test_preserves_heading_context(self, tmp_path): + """Headings are stored as metadata['heading'] on subsequent bullets.""" + import kai.memory as mem_mod + from kai.memory import seed_from_memory_md + + mock_mem = MagicMock() + mock_mem.add.return_value = {"results": [{"id": "abc", "memory": "test"}]} + mock_mem.search.return_value = {"results": []} + mem_mod._memory = mock_mem + + memory_dir = tmp_path / "memory" + memory_dir.mkdir() + (memory_dir / "preferences.md").write_text("# Preferences\n\n## Communication\n\n- Be concise\n") + + seed_from_memory_md(user_ids=["123"], memory_dir=memory_dir) + + 
call_kwargs = mock_mem.add.call_args[1] + assert call_kwargs["metadata"]["heading"] == "Communication" + + def test_stores_source_file_metadata(self, tmp_path): + """Every seeded memory has metadata['source_file'] set.""" + import kai.memory as mem_mod + from kai.memory import seed_from_memory_md + + mock_mem = MagicMock() + mock_mem.add.return_value = {"results": [{"id": "abc", "memory": "test"}]} + mock_mem.search.return_value = {"results": []} + mem_mod._memory = mock_mem + + memory_dir = tmp_path / "memory" + memory_dir.mkdir() + (memory_dir / "user.md").write_text("# User\n\n- Fact A\n") + + seed_from_memory_md(user_ids=["123"], memory_dir=memory_dir) + + call_kwargs = mock_mem.add.call_args[1] + assert call_kwargs["metadata"]["source_file"] == "user.md" + + def test_stores_source_migration_tag(self, tmp_path): + """Every seeded memory has metadata['source'] == 'memory_md_migration'.""" + import kai.memory as mem_mod + from kai.memory import seed_from_memory_md + + mock_mem = MagicMock() + mock_mem.add.return_value = {"results": [{"id": "abc", "memory": "test"}]} + mock_mem.search.return_value = {"results": []} + mem_mod._memory = mock_mem + + memory_dir = tmp_path / "memory" + memory_dir.mkdir() + (memory_dir / "user.md").write_text("# User\n\n- Fact A\n") + + seed_from_memory_md(user_ids=["123"], memory_dir=memory_dir) + + call_kwargs = mock_mem.add.call_args[1] + assert call_kwargs["metadata"]["source"] == "memory_md_migration" + + def test_stores_tag_from_file_stem(self, tmp_path): + """Tags contain the file stem (e.g. 
'preferences' for preferences.md).""" + import kai.memory as mem_mod + from kai.memory import seed_from_memory_md + + mock_mem = MagicMock() + mock_mem.add.return_value = {"results": [{"id": "abc", "memory": "test"}]} + mock_mem.search.return_value = {"results": []} + mem_mod._memory = mock_mem + + memory_dir = tmp_path / "memory" + memory_dir.mkdir() + (memory_dir / "preferences.md").write_text("# Preferences\n\n- Item A\n") + + seed_from_memory_md(user_ids=["123"], memory_dir=memory_dir) + + call_kwargs = mock_mem.add.call_args[1] + assert call_kwargs["metadata"]["tags"] == ["preferences"] + + def test_disabled_returns_zero_counts(self): + """With memory disabled, returns all-zero counts without exceptions.""" + from kai.memory import seed_from_memory_md + + counts = seed_from_memory_md(user_ids=["123", "456"]) + + assert counts["123"] == {"seeded": 0, "skipped": 0, "failed": 0} + assert counts["456"] == {"seeded": 0, "skipped": 0, "failed": 0} + + def test_code_blocks_not_stored(self, tmp_path): + """Content inside fenced code blocks is not seeded.""" + import kai.memory as mem_mod + from kai.memory import seed_from_memory_md + + mock_mem = MagicMock() + mock_mem.add.return_value = {"results": [{"id": "abc", "memory": "test"}]} + mock_mem.search.return_value = {"results": []} + mem_mod._memory = mock_mem + + memory_dir = tmp_path / "memory" + memory_dir.mkdir() + (memory_dir / "notes.md").write_text( + "# Notes\n\n- Real fact\n\n```\n- Not a fact\nAlso not a fact\n```\n\n- Another real fact\n" + ) + + counts = seed_from_memory_md(user_ids=["123"], memory_dir=memory_dir) + + assert counts["123"]["seeded"] == 2 + # Verify the stored content + stored_texts = [call[0][0] for call in mock_mem.add.call_args_list] + assert "Real fact" in stored_texts + assert "Another real fact" in stored_texts + assert "Not a fact" not in stored_texts + + def test_paragraphs_stored_when_not_bullets(self, tmp_path): + """Non-bullet prose paragraphs are seeded as single memories.""" + 
import kai.memory as mem_mod + from kai.memory import seed_from_memory_md + + mock_mem = MagicMock() + mock_mem.add.return_value = {"results": [{"id": "abc", "memory": "test"}]} + mock_mem.search.return_value = {"results": []} + mem_mod._memory = mock_mem + + memory_dir = tmp_path / "memory" + memory_dir.mkdir() + (memory_dir / "hard-lessons.md").write_text( + "# Hard Lessons\n\n## Never do X\n\nFirst line of lesson.\nSecond line of lesson.\n\n## Also bad\n\nAnother paragraph here.\n" + ) + + counts = seed_from_memory_md(user_ids=["123"], memory_dir=memory_dir) + + assert counts["123"]["seeded"] == 2 + stored_texts = [call[0][0] for call in mock_mem.add.call_args_list] + assert "First line of lesson. Second line of lesson." in stored_texts + assert "Another paragraph here." in stored_texts From 288f60e92afb483d061441a5d50d288521654164 Mon Sep 17 00:00:00 2001 From: Daniel Ellison Date: Thu, 16 Apr 2026 13:55:43 -0400 Subject: [PATCH 2/5] Address review: heading detection, missing-dir guard, dead code, test docs --- src/kai/memory.py | 21 +++++++++++++++------ tests/test_main.py | 6 ++++++ tests/test_memory.py | 32 ++++++++++++++++++++++++++++++++ 3 files changed, 53 insertions(+), 6 deletions(-) diff --git a/src/kai/memory.py b/src/kai/memory.py index 5ef180d0..53fc785c 100644 --- a/src/kai/memory.py +++ b/src/kai/memory.py @@ -475,11 +475,10 @@ def _classify_source_file(filename: str) -> str | None: "notes.md": "fact", "planned-features.md": "fact", } - # Explicit skip list. api-reference.md is already in the system prompt, - # so seeding it would create duplicate matches on every scheduling query. - # MEMORY.md is the index file: pointers, not content. - if filename in ("MEMORY.md", "api-reference.md"): - return None + # api-reference.md and MEMORY.md are intentionally absent from the + # mapping. api-reference.md is already in the system prompt (seeding + # would create duplicate matches). MEMORY.md is the index file + # (pointers, not content). 
mapping.get() returns None for both. return mapping.get(filename) @@ -552,7 +551,10 @@ def flush_paragraph() -> None: continue # Heading line: record as current_heading; do not seed as content. - if stripped.startswith("#"): + # CommonMark requires "# " (hash + space) for headings. Bare "#foo" + # (e.g. issue references like #311) is NOT a heading and should be + # treated as paragraph text to avoid silently swallowing content. + if stripped.startswith("# ") or (len(stripped) > 1 and stripped[0] == "#" and stripped[1] == "#"): flush_paragraph() # Strip leading # characters and whitespace to get the heading text. current_heading = stripped.lstrip("#").strip() @@ -639,6 +641,13 @@ def seed_from_memory_md( target_dir = memory_dir if memory_dir is not None else DATA_DIR / "memory" + # Guard: on first install before any memory files are written, the + # memory directory may not exist yet. Return zero counts so the caller + # treats this as "nothing to do" rather than an error. + if not target_dir.exists(): + log.info("Memory directory %s does not exist; skipping seed", target_dir) + return {uid: {"seeded": 0, "skipped": 0, "failed": 0} for uid in user_ids} + # Collect topic files to process, in a stable order so test output is # deterministic. _classify_source_file returns None for files we skip # (MEMORY.md index, api-reference.md, unknown files). diff --git a/tests/test_main.py b/tests/test_main.py index c5ed649c..95a6ff81 100644 --- a/tests/test_main.py +++ b/tests/test_main.py @@ -400,6 +400,12 @@ class TestMemorySeedIntegration: exercising the seed integration path with mocked memory and session modules. The actual seed_from_memory_md() behavior is thoroughly tested in test_memory.py; these tests focus on the orchestration. + + NOTE: These tests inline a copy of the seed block from _init_and_run() + rather than calling the function itself (which orchestrates the full + app lifecycle and is impractical to unit-test). 
If the production + seed block changes shape, these tests must be updated manually. + They verify the flag logic pattern, not the exact production code. """ async def test_seed_runs_when_flag_absent(self): diff --git a/tests/test_memory.py b/tests/test_memory.py index 71e4448a..bb33dbf3 100644 --- a/tests/test_memory.py +++ b/tests/test_memory.py @@ -850,6 +850,21 @@ def test_heading_only_file_returns_empty(self, tmp_path): assert _parse_topic_file(f) == [] + def test_bare_hash_not_treated_as_heading(self, tmp_path): + """Lines like #311 or #hashtag are paragraph text, not headings.""" + from kai.memory import _parse_topic_file + + f = tmp_path / "test.md" + f.write_text("# Real Heading\n\n#311 is an issue reference\n#hashtag\n") + + result = _parse_topic_file(f) + # Both bare-hash lines should be joined into one paragraph candidate + assert len(result) == 1 + assert "#311 is an issue reference" in result[0]["content"] + assert "#hashtag" in result[0]["content"] + # The real heading should be context, not content + assert result[0]["heading"] == "Real Heading" + # ── _classify_source_file() tests ───────────────────────────────── @@ -1164,6 +1179,23 @@ def test_disabled_returns_zero_counts(self): assert counts["123"] == {"seeded": 0, "skipped": 0, "failed": 0} assert counts["456"] == {"seeded": 0, "skipped": 0, "failed": 0} + def test_missing_directory_returns_zero_counts(self, tmp_path): + """Non-existent memory directory returns zero counts, not an error.""" + import kai.memory as mem_mod + from kai.memory import seed_from_memory_md + + mock_mem = MagicMock() + mem_mod._memory = mock_mem + + # Point to a directory that does not exist + missing_dir = tmp_path / "does_not_exist" / "memory" + + counts = seed_from_memory_md(user_ids=["123"], memory_dir=missing_dir) + + assert counts["123"] == {"seeded": 0, "skipped": 0, "failed": 0} + # No Mem0 calls should be made + mock_mem.add.assert_not_called() + def test_code_blocks_not_stored(self, tmp_path): """Content inside 
fenced code blocks is not seeded.""" import kai.memory as mem_mod From 524e0c5792dcd3319968035a38e402ab4ce389a8 Mon Sep 17 00:00:00 2001 From: Daniel Ellison Date: Thu, 16 Apr 2026 14:00:59 -0400 Subject: [PATCH 3/5] Fix heading detection symmetry: require space after hashes at all ATX levels --- src/kai/memory.py | 12 ++++++------ tests/test_memory.py | 10 ++++++---- 2 files changed, 12 insertions(+), 10 deletions(-) diff --git a/src/kai/memory.py b/src/kai/memory.py index 53fc785c..ba69d35b 100644 --- a/src/kai/memory.py +++ b/src/kai/memory.py @@ -551,13 +551,13 @@ def flush_paragraph() -> None: continue # Heading line: record as current_heading; do not seed as content. - # CommonMark requires "# " (hash + space) for headings. Bare "#foo" - # (e.g. issue references like #311) is NOT a heading and should be - # treated as paragraph text to avoid silently swallowing content. - if stripped.startswith("# ") or (len(stripped) > 1 and stripped[0] == "#" and stripped[1] == "#"): + # CommonMark requires a space after the hash(es) for all ATX heading + # levels: "# H1", "## H2", etc. Lines like #311 or ##cross-ref are + # NOT headings and must be treated as paragraph text. + heading_hashes = len(stripped) - len(stripped.lstrip("#")) + if 1 <= heading_hashes <= 6 and stripped[heading_hashes : heading_hashes + 1] == " ": flush_paragraph() - # Strip leading # characters and whitespace to get the heading text. 
- current_heading = stripped.lstrip("#").strip() + current_heading = stripped[heading_hashes:].strip() continue # Bullet line: flush any paragraph, then add this bullet as its own diff --git a/tests/test_memory.py b/tests/test_memory.py index bb33dbf3..64530232 100644 --- a/tests/test_memory.py +++ b/tests/test_memory.py @@ -851,18 +851,20 @@ def test_heading_only_file_returns_empty(self, tmp_path): assert _parse_topic_file(f) == [] def test_bare_hash_not_treated_as_heading(self, tmp_path): - """Lines like #311 or #hashtag are paragraph text, not headings.""" + """Lines like #311, #hashtag, or ##cross-ref are paragraph text.""" from kai.memory import _parse_topic_file f = tmp_path / "test.md" - f.write_text("# Real Heading\n\n#311 is an issue reference\n#hashtag\n") + # All ATX levels require a space: #, ##, ###, etc. + f.write_text("# Real Heading\n\n#311 is an issue reference\n#hashtag\n##nospace\n") result = _parse_topic_file(f) - # Both bare-hash lines should be joined into one paragraph candidate + # All three bare-hash lines should be joined into one paragraph assert len(result) == 1 assert "#311 is an issue reference" in result[0]["content"] assert "#hashtag" in result[0]["content"] - # The real heading should be context, not content + assert "##nospace" in result[0]["content"] + # The real heading (with space) should be context, not content assert result[0]["heading"] == "Real Heading" From 399e4bf1fe655d204e86d851104afa125e6e669a Mon Sep 17 00:00:00 2001 From: Daniel Ellison Date: Thu, 16 Apr 2026 14:04:09 -0400 Subject: [PATCH 4/5] Widen except to catch UnicodeDecodeError; guard _is_duplicate against search failures --- src/kai/memory.py | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/src/kai/memory.py b/src/kai/memory.py index ba69d35b..30eeb4c3 100644 --- a/src/kai/memory.py +++ b/src/kai/memory.py @@ -600,7 +600,14 @@ def _is_duplicate(content: str, *, user_id: str, threshold: float = 0.9) -> bool True if a 
duplicate exists; False otherwise (including if the store is empty, memory is disabled, or the search itself failed). """ - results = search(content, user_id=user_id, limit=1) + try: + results = search(content, user_id=user_id, limit=1) + except Exception: + # Search failure during dedup should not block seeding. Log and + # return False so the entry gets inserted (possible duplicate is + # better than a lost entry). + log.warning("Dedup search failed for '%s'", content[:60], exc_info=True) + return False if not results: return False return results[0].score >= threshold @@ -665,9 +672,11 @@ def seed_from_memory_md( # the individual parse failures via _parse_topic_file. try: entries = _parse_topic_file(path) - except OSError: - # File unreadable; count as one failure and move on to the - # next topic file. Do not abort the whole migration. + except (OSError, UnicodeDecodeError): + # File unreadable or not valid UTF-8; count as one failure + # and move on to the next topic file. UnicodeDecodeError is + # a ValueError subclass, not OSError, so it must be listed + # explicitly to maintain per-file isolation. 
log.warning("Could not read %s during seed", path, exc_info=True) counts["failed"] += 1 continue From 9a8815298a49e2e6e9607c31acd49c9e2ea9ea34 Mon Sep 17 00:00:00 2001 From: Daniel Ellison Date: Thu, 16 Apr 2026 14:43:33 -0400 Subject: [PATCH 5/5] Add tests for UnicodeDecodeError catch and _is_duplicate exception guard --- tests/test_memory.py | 44 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) diff --git a/tests/test_memory.py b/tests/test_memory.py index 64530232..c859e708 100644 --- a/tests/test_memory.py +++ b/tests/test_memory.py @@ -900,6 +900,29 @@ def test_unknown_files_return_none(self): assert _classify_source_file("todo.md") is None +# ── _is_duplicate() tests ──────────────────────────────────────── + + +class TestIsDuplicate: + """Tests for _is_duplicate() dedup helper.""" + + def test_search_exception_returns_false(self): + """When search() raises, _is_duplicate returns False (insert, don't skip).""" + import kai.memory as mem_mod + from kai.memory import _is_duplicate + + # Set _config so search() doesn't short-circuit on the None guard + mem_mod._config = _make_config() + # Mock _memory.search to raise inside search() + mock_mem = MagicMock() + mock_mem.search.side_effect = RuntimeError("qdrant connection refused") + mem_mod._memory = mock_mem + + # Should return False (not a duplicate), not raise + result = _is_duplicate("some content", user_id="123") + assert result is False + + # ── seed_from_memory_md() tests ─────────────────────────────────── @@ -1096,6 +1119,27 @@ def test_partial_failure_counts_failures(self, tmp_path): assert counts["123"]["seeded"] == 1 # user.md succeeded assert counts["123"]["failed"] == 1 # notes.md failed + def test_unicode_decode_error_counts_as_failure(self, tmp_path): + """Non-UTF-8 files are caught and counted as failures, not crashes.""" + import kai.memory as mem_mod + from kai.memory import seed_from_memory_md + + mock_mem = MagicMock() + mock_mem.add.return_value = {"results": 
[{"id": "abc", "memory": "test"}]} + mock_mem.search.return_value = {"results": []} + mem_mod._memory = mock_mem + + memory_dir = tmp_path / "memory" + memory_dir.mkdir() + (memory_dir / "user.md").write_text("# User\n\n- Fact A\n") + # Write raw bytes that are not valid UTF-8 + (memory_dir / "notes.md").write_bytes(b"\xff\xfe# Notes\n\n- Broken\n") + + counts = seed_from_memory_md(user_ids=["123"], memory_dir=memory_dir) + + assert counts["123"]["seeded"] == 1 # user.md succeeded + assert counts["123"]["failed"] == 1 # notes.md failed (UnicodeDecodeError) + def test_preserves_heading_context(self, tmp_path): """Headings are stored as metadata['heading'] on subsequent bullets.""" import kai.memory as mem_mod