diff --git a/src/datajoint/__init__.py b/src/datajoint/__init__.py index 05813e6ac..b1dba84e1 100644 --- a/src/datajoint/__init__.py +++ b/src/datajoint/__init__.py @@ -275,6 +275,9 @@ def FreeTable(conn_or_name, full_table_name: str | None = None) -> _FreeTable: "diagram": (".diagram", None), # Return the module itself # cli imports click "cli": (".cli", "cli"), + # gc — exposed lazily so `dj.gc.scan(...)` works as documented in gc.py + # and in the user docs (how-to/garbage-collection.md). + "gc": (".gc", None), # Return the module itself } diff --git a/src/datajoint/gc.py b/src/datajoint/gc.py index 7f083416b..8c87efd84 100644 --- a/src/datajoint/gc.py +++ b/src/datajoint/gc.py @@ -229,11 +229,14 @@ def scan_hash_references( if verbose: logger.info(f" Scanning {table_name}.{attr_name}") - # Fetch all values for this attribute + # Read raw JSON metadata via cursor — bypasses decode_attribute + # so we get the stored dict (PostgreSQL/JSONB) or JSON string + # (MySQL), not the decoded codec output. _extract_hash_refs + # handles both shapes. try: - values = table.to_arrays(attr_name) - for value in values: - for path, ref_store in _extract_hash_refs(value): + cursor = table.proj(attr_name).cursor(as_dict=True) + for row in cursor: + for path, ref_store in _extract_hash_refs(row[attr_name]): # Filter by store if specified if store_name is None or ref_store == store_name: referenced.add(path) @@ -291,11 +294,14 @@ def scan_schema_references( if verbose: logger.info(f" Scanning {table_name}.{attr_name}") - # Fetch all values for this attribute + # Read raw JSON metadata via cursor — bypasses decode_attribute + # so we get the stored dict (PostgreSQL/JSONB) or JSON string + # (MySQL), not the decoded codec output. _extract_schema_refs + # handles both shapes. try: - values = table.to_arrays(attr_name) - for value in values: - for path, ref_store in _extract_schema_refs(value): + cursor = table.proj(attr_name).cursor(as_dict=True) + for row in cursor: + for path, ref_store in _extract_schema_refs(row[attr_name]): # Filter by store if specified if store_name is None or ref_store == store_name: referenced.add(path) diff --git a/tests/integration/test_gc.py b/tests/integration/test_gc.py index 47ca0a96d..c9ea741bd 100644 --- a/tests/integration/test_gc.py +++ b/tests/integration/test_gc.py @@ -4,12 +4,43 @@ from unittest.mock import MagicMock, patch +import numpy as np import pytest +import datajoint as dj from datajoint import gc from datajoint.errors import DataJointError +# Tables used by TestScanWithLiveData. Defined at module scope so dj.Schema's +# context resolution can find them by class name; bound to a schema inside +# each fixture (see schema(...) calls below). + + +class GcBlobTest(dj.Manual): + definition = """ + rid : int + --- + payload : + """ + + +class GcNpyTest(dj.Manual): + definition = """ + rid : int + --- + waveform : + """ + + +class GcObjectTest(dj.Manual): + definition = """ + rid : int + --- + results : + """ + + class TestUsesHashStorage: """Tests for _uses_hash_storage helper function.""" @@ -347,3 +378,90 @@ def test_formats_collect_stats_actual(self): assert "Schema paths: 1" in result assert "2.00 MB" in result assert "Errors: 2" in result + + +class TestScanWithLiveData: + """End-to-end tests for gc.scan() against real schemas with external storage. + + Exercises the full production path: + scan_*_references → table.proj(attr).cursor() → raw JSON metadata. + + These are the regression tests that would have caught issue #1442 + (silent type mismatch when scan helpers iterated decoded codec outputs + instead of raw stored metadata). + """ + + @pytest.fixture + def schema_blob(self, connection_test, prefix, mock_stores): + schema_name = f"{prefix}_test_gc_e2e_blob" + schema = dj.Schema( + schema_name, + context={"GcBlobTest": GcBlobTest}, + connection=connection_test, + ) + schema(GcBlobTest) + yield schema + schema.drop() + + @pytest.fixture + def schema_npy(self, connection_test, prefix, mock_stores): + schema_name = f"{prefix}_test_gc_e2e_npy" + schema = dj.Schema( + schema_name, + context={"GcNpyTest": GcNpyTest}, + connection=connection_test, + ) + schema(GcNpyTest) + yield schema + schema.drop() + + @pytest.fixture + def schema_object(self, connection_test, prefix, mock_stores): + schema_name = f"{prefix}_test_gc_e2e_object" + schema = dj.Schema( + schema_name, + context={"GcObjectTest": GcObjectTest}, + connection=connection_test, + ) + schema(GcObjectTest) + yield schema + schema.drop() + + def test_scan_finds_active_blob_reference(self, schema_blob): + """scan() must report hash_referenced >= 1 for a populated column. + + Decoded value type returned by BlobCodec.decode is numpy.ndarray, which + does not satisfy `_extract_hash_refs`'s dict/JSON-string check — this + test fails before the cursor-based fix in scan_hash_references. + """ + GcBlobTest.insert1({"rid": 1, "payload": np.arange(64, dtype="uint8")}) + + stats = gc.scan(schema_blob, store_name="local") + + assert stats["hash_referenced"] >= 1, f"scan should find the active reference; got {stats}" + + def test_scan_finds_active_npy_reference(self, schema_npy): + """scan() must report schema_paths_referenced >= 1 for a populated column. + + Decoded value type returned by NpyCodec.decode is NpyRef (lazy handle), + which does not satisfy `_extract_schema_refs`'s dict check — this test + fails before the cursor-based fix in scan_schema_references. + """ + GcNpyTest.insert1({"rid": 1, "waveform": np.arange(64, dtype="float32")}) + + stats = gc.scan(schema_npy, store_name="local") + + assert stats["schema_paths_referenced"] >= 1, f"scan should find the active reference; got {stats}" + + def test_scan_finds_active_object_reference(self, schema_object): + """scan() must report schema_paths_referenced >= 1 for a populated column. + + Decoded value type returned by ObjectCodec.decode is ObjectRef (lazy + handle), which does not satisfy `_extract_schema_refs`'s dict check — + this test fails before the cursor-based fix in scan_schema_references. + """ + GcObjectTest.insert1({"rid": 1, "results": b"hello-gc-test"}) + + stats = gc.scan(schema_object, store_name="local") + + assert stats["schema_paths_referenced"] >= 1, f"scan should find the active reference; got {stats}"