diff --git a/.env.example b/.env.example index 4e70037..3fc27bc 100644 --- a/.env.example +++ b/.env.example @@ -7,19 +7,46 @@ MAX_REQUEST_BODY_MB=50 LOG_LEVEL=INFO # --- Datastore engine --- -# Selects the storage backend: +# Selects the storage backend (must match a folder under +# `datastore/infrastructure/engines/`): # bigquery — real BigQuery adapter (placeholder while being built). -# ducklake — Future planned +# ducklake — Future planned. DATASTORE_ENGINE=bigquery -BQ_PROJECT= +BIGQUERY_PROJECT= +BIGQUERY_DATASET= +BIGQUERY_CREDENTIALS= +BIGQUERY_CREDENTIALS_RO= +# Use BigQuery's built-in 24h query-results cache on read paths +# (datastore_search / datastore_search_sql / datastore_info). Identical +# SELECTs return free + fast on cache hits. False = force fresh scan. +BIGQUERY_USE_QUERY_CACHE=true SQL_FUNCTIONS_ALLOW_FILE= -# --- CKAN auth gate --- -# Set AUTH_ENABLED=false for local dev / CI without a CKAN instance. -AUTH_ENABLED=true +# --- Auth --- +# AUTH_TYPE selects the provider (folder under `datastore/auth/`): +# ckan — CKAN /api/3/action/datastore_authorize (uses CKAN_URL below). +# jwt — verify JWT against JWT_SECRET or JWT_PUBLIC_KEY (see below). +# anonymous — always allow, no identity. Use for local dev / CI. +AUTH_TYPE=ckan +AUTH_CACHE_TTL=10 + +# CKAN — required when AUTH_TYPE=ckan. CKAN_URL= HTTP_TIMEOUT_SECONDS=10 -AUTH_CACHE_TTL=10 + +# JWT — required when AUTH_TYPE=jwt. HS* uses JWT_SECRET; RS*/ES* uses JWT_PUBLIC_KEY. +JWT_ALGORITHM=HS256 +JWT_SECRET= +JWT_PUBLIC_KEY= +JWT_AUDIENCE= +JWT_ISSUER= + +# --- System columns + search --- +# Toggle the per-row `_updated_at` TIMESTAMP system column. False = `_id` only. +INCLUDE_UPDATED_AT=true +# Hard cap on `datastore_search` / `datastore_search_sql` `limit`. Requests +# above this return 400. Raise this only when downstream clients can stream. +SEARCH_RESULT_ROWS_MAX=32000 # --- Cache --- # Empty REDIS_URL keeps the in-process InMemoryCache (single-pod only). 
diff --git a/.gitignore b/.gitignore index e9acc69..11585df 100644 --- a/.gitignore +++ b/.gitignore @@ -36,3 +36,8 @@ Thumbs.db # Environment files .env .env.local + +# Local-only test engine — synthetic data for local dev; never push. +# Pair with DATASTORE_ENGINE=bigquery_test (which is NOT in the committed +# Config Literal, so flip to a non-Literal locally if you need to run it). +datastore/infrastructure/engines/bigquery_test/ diff --git a/CLAUDE.md b/CLAUDE.md index 7bad866..8dc3485 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -7,10 +7,11 @@ storage backend (BigQuery Datastore or Ducklake as future support). ## 1. Goals - CKAN-compatible request/response shapes for `/api/3/action/datastore_*`. -- Pluggable backend selected by `DATASTORE_ENGINE` env var (`bigquery` or `ducklake`). -- Streaming responses for search (peak memory ≈ 1 row). _Planned, not yet implemented._ -- Strict request validation, structured error responses. -- CKAN-based auth gate with TTL-cached decisions (InMemory by default; Redis when `REDIS_URL` is set). +- **Pluggable storage backend** selected by `DATASTORE_ENGINE` (`bigquery` today; `ducklake` planned). +- **Pluggable auth** selected by `AUTH_TYPE` (`ckan` / `jwt` / `anonymous`). Each provider lives in `datastore/auth/<type>/`; only the CKAN provider touches the network, and its TTL cache is local to that provider. +- **Standalone-capable** — runs without an upstream CKAN under `AUTH_TYPE=anonymous` or `AUTH_TYPE=jwt`. CKAN is only required when `AUTH_TYPE=ckan`. +- **Streaming search responses** (peak memory ≈ 1 row) for `datastore_search` / `datastore_search_sql`. +- Strict request validation, structured CKAN-shaped error envelopes. ## 2. Technology Stack @@ -22,16 +23,18 @@ storage backend (BigQuery Datastore or Ducklake as future support). 
| Validation | **Pydantic v2** (request only) | Strict shape validation, no per-row cost | | JSON | **orjson** | 5–10× stdlib `json`, returns bytes, datetime-aware | | Datastore backend | **google-cloud-bigquery** | Managed, cached, scalable | -| HTTP client | **httpx** (`AsyncClient`) | Connection-pooled CKAN auth calls | -| Cache / auth store | **redis** + `hiredis` | TTL cache for auth decisions | +| HTTP client | **httpx** (`AsyncClient`) | Connection-pooled CKAN calls | +| Cache | **redis** + `hiredis` | TTL cache for CKAN auth decisions | | Schema validation | **frictionless** | Field schema validation on `datastore_create` | +| SQL parsing | **sqlglot** | Parse `datastore_search_sql` — pull table + function names for the auth + allow-list gates | +| JWT auth | **PyJWT** | HS*/RS*/ES* signature + `aud`/`iss`/`exp` validation for the JWT provider | -`pyproject.toml` dependencies (target): +`pyproject.toml` dependencies (live): ```toml [project] dependencies = [ - "fastapi[standard]>=0.113,<0.114", + "fastapi[standard]>=0.115,<0.116", "pydantic>=2.7,<3", "pydantic-settings>=2.3", "orjson>=3.10", @@ -41,6 +44,8 @@ dependencies = [ "frictionless>=5.18", "uvloop>=0.21", "httptools>=0.6", + "sqlglot>=25.0", + "pyjwt>=2.8,<3", ] [tool.ruff.lint] @@ -71,15 +76,17 @@ home in the tree: ``` api ──▶ services ──▶ infrastructure - ▲ │ │ - │ └──▶ schemas ◀────┘ (schemas = Pydantic models, plain data) - │ ▲ - └─ uses schemas for request/response shapes only + │ │ ▲ + ├──▶ auth ─────────────────┤ + │ │ + └──▶ schemas ◀──────────────┘ (schemas = Pydantic models, plain data) ``` -One-way dependency arrow. `infrastructure/` never imports from `api/` or -`services/`. `services/` never imports from `api/`. `api/` is the only layer -that knows about FastAPI/Starlette. +One-way dependencies. `infrastructure/` never imports from `api/`, +`services/`, or `auth/`. `services/` and `auth/` never import from +`api/`. `api/` is the only layer that knows about FastAPI/Starlette. 
+`auth/` may use `infrastructure/` adapters (the CKAN provider needs +`CKANClient`, all providers may use `CachePort`). ### Tree @@ -106,10 +113,15 @@ datastore-api/ │ ├── api/ │ │ ├── __init__.py │ │ ├── routes.py # Top-level APIRouter; mounts endpoints/ -│ │ ├── context.py # RequestContext + AuthContext + ContextDep -│ │ │ # (per-request handles: config, auth, ckan) -│ │ ├── auth.py # CKAN `datastore_authorize` with TTL cache — -│ │ │ # pure async functions; `AuthContext` wraps state +│ │ ├── context.py # RequestContext + ContextDep — per-request +│ │ │ # handles (config, api_key, auth_provider, ckan) +│ │ │ # with an `.authorize()` method that delegates +│ │ │ # to auth.py +│ │ ├── auth.py # Provider-agnostic boundary policy: permission +│ │ │ # whitelist, resource_id XOR package_id rule, +│ │ │ # anonymous-read rule. Delegates to the active +│ │ │ # AuthProvider; no caching here (CKAN caches +│ │ │ # internally — see datastore/auth/ckan/). │ │ ├── responses.py # CKAN envelope helpers (_success_response / _error_response) │ │ │ # + orjson-backed ORJSONResponse │ │ ├── error_handlers.py # APIError / HTTPException / RequestValidationError @@ -118,10 +130,26 @@ datastore-api/ │ │ └── endpoints/ # One module per resource group │ │ ├── __init__.py │ │ ├── health.py # /, /health, /ready (CKAN-shaped envelopes) -│ │ └── datastore.py # /api/3/action/datastore_* (6 routes; -│ │ # 1 implemented, 5 return HTTP 501) +│ │ └── datastore.py # /api/3/action/datastore_* │ │ -│ │ ── 2. CORE (cross-cutting, framework-agnostic) ────── +│ │ ── 2. 
AUTH PROVIDERS ─────────────────────── (one subpackage per AUTH_TYPE) +│ ├── auth/ +│ │ ├── __init__.py +│ │ ├── base.py # AuthProvider Protocol + Decision dataclass + +│ │ │ # default_key_id (JWT jti / sha256 cache-key helper) +│ │ ├── registry.py # get_auth_provider(config, **extras) — +│ │ │ # importlib dispatch by AUTH_TYPE +│ │ ├── ckan/ # AUTH_TYPE=ckan +│ │ │ ├── __init__.py # exports `Provider = CKANAuthProvider` +│ │ │ └── provider.py # datastore_authorize via CKANClient + TTL cache +│ │ ├── jwt/ # AUTH_TYPE=jwt +│ │ │ ├── __init__.py # exports `Provider = JWTAuthProvider` +│ │ │ └── provider.py # PyJWT verify (HS*/RS*/ES* + aud/iss/exp) +│ │ └── anonymous/ # AUTH_TYPE=anonymous +│ │ ├── __init__.py # exports `Provider = AnonymousAuthProvider` +│ │ └── provider.py # always allows; no identity +│ │ +│ │ ── 3. CORE (cross-cutting, framework-agnostic) ────── │ ├── core/ │ │ ├── __init__.py │ │ ├── config.py # Pydantic-Settings `Config` (env-driven) + @@ -133,7 +161,7 @@ datastore-api/ │ │ │ # HTTP_STATUS_TO_TYPE_LABEL map │ │ └── helper.py # Pure helpers (parse_authorization_header, …) │ │ -│ │ ── 3. SCHEMAS (Pydantic — boundary validation only) ── +│ │ ── 4. SCHEMAS (Pydantic — boundary validation only) ── │ ├── schemas/ # Inbound request bodies + outbound response │ │ ├── __init__.py # types. Never passed between services or │ │ ├── request.py # returned from engines. @@ -148,17 +176,20 @@ datastore-api/ │ │ └── validators.py # validators.py – FieldSpec, StringOrList, │ │ # PostgresType, helper fns │ │ -│ │ ── 4. SERVICES (business logic, plain Python) ────── +│ │ ── 5. SERVICES (business logic, plain Python) ────── │ ├── services/ # Orchestration: validate → call engine → │ │ ├── __init__.py # shape result. Inputs: plain types or │ │ ├── write.py # validated schemas. Outputs: typed response │ │ │ # models. No FastAPI, no raw SQL. 
-│ │ │ # write.py – create_datastore (real; -│ │ │ # upsert/delete pending) -│ │ └── read.py # read.py – placeholder (search / -│ │ # search_sql / info pending) +│ │ │ # write.py – create / upsert / delete +│ │ ├── read.py # read.py – search / search_sql / info +│ │ │ # (engine call, format +│ │ │ # dispatch, pagination links, +│ │ │ # function allow-list) +│ │ └── streaming.py # streaming.py – byte-yielding writers +│ │ # (objects/lists/csv/tsv) │ │ -│ │ ── 5. INFRASTRUCTURE (adapters to the outside world) ─ +│ │ ── 6. INFRASTRUCTURE (adapters to the outside world) ─ │ └── infrastructure/ │ ├── __init__.py │ ├── cache.py # CachePort (Protocol) + InMemoryCache + @@ -172,31 +203,47 @@ datastore-api/ │ ├── registry.py # get_datastore_engine + get_allowed_sql_functions; │ │ # dynamic `importlib` dispatch keyed on │ │ # context.config.DATASTORE_ENGINE -│ ├──bigquery/ # Engine package (one folder per backend). -│ | ├── __init__.py # Re-exports `BigQueryBackend` -│ | ├── backend.py # google-cloud-bigquery adapter (placeholder) -│ | ├── lib.py # Backend-specific helpers (optional) -│ | └── allowed_functions.txt # Per-engine datastore_search_sql -│ | # function allow-list — one name per -│ | # line, `#` comments allowed. +│ ├── bigquery/ # Engine package (one folder per backend). +│ | ├── __init__.py # Exports `Backend = BigQueryBackend` — +│ | | # registry imports `Backend`, so the +│ | | # concrete class name is engine-private. +│ | ├── backend.py # DatastoreBackend subclass (placeholder) +│ | ├── client.py # google-cloud-bigquery `Client` construction +│ | ├── lib.py # Backend-specific helpers (optional) +│ | └── allowed_functions.txt # Per-engine datastore_search_sql +│ | # function allow-list — one name per +│ | # line, `#` comments allowed. 
│ └── ducklake/ # Future planned engine └── tests/ ├── __init__.py ├── conftest.py # FakeCKAN, InMemoryCache, TestClient fixture; + │ # autouse _isolate_bigquery_env clears BQ envs; │ # CKAN pytest plugin disabled via pyproject - ├── test_datastore_create.py # End-to-end HTTP suite (TestClient) - └── test_write_service.py # Service-level units with a fake context + ├── test_health.py + ├── test_datastore_*.py # End-to-end per endpoint (TestClient) + ├── test_read_service.py # Direct service calls — no HTTP + ├── test_write_service.py + ├── auth/ # One folder per auth provider, mirrors datastore/auth/ + │ ├── test_base.py # Decision + default_key_id + │ ├── test_registry.py # AUTH_TYPE dispatch + │ ├── test_orchestration.py # api/auth.py boundary policy + │ ├── ckan/test_provider.py # CKAN provider + TTL cache + │ ├── jwt/test_provider.py + │ └── anonymous/test_provider.py + └── engines/ + ├── bigquery/test_*.py # Real BigQuery backend, fully mocked + └── ducklake/ # (placeholder for future engine) ``` -**Adding a new engine** — drop a sibling folder with the same four files -(`__init__.py` re-exports `BigQueryBackend`; `backend.py` is the -`DatastoreBackend` subclass; `lib.py` is optional helpers; -`allowed_functions.txt` lists allowed SQL functions). No edit to -`registry.py` or `config.py` is required — `DATASTORE_ENGINE` validates -against the set of engine subdirectories that exist at startup, and the -factory dispatches via `importlib.import_module`. The `ducklake.py` -adapter from the original plan will live at -`infrastructure/engines/ducklake/` when it lands. +**Adding a new engine** — drop a sibling folder with the same layout +(`__init__.py` exports `Backend = <BackendClass>`; `backend.py` is the +`DatastoreBackend` subclass; `client.py` / `lib.py` for backend-specific +construction + helpers, both optional; `allowed_functions.txt` lists +allowed SQL functions). 
No edit to `registry.py` or `config.py` is +required — `DATASTORE_ENGINE` validates against the set of engine +subdirectories that exist at startup, and the factory dispatches via +`importlib.import_module` keyed off the `Backend` alias. The `ducklake` +adapter will live at `infrastructure/engines/ducklake/` when it lands. `scripts/` and `docs/` are intentionally absent today. Add them when there's a concrete need (seed scripts, operational runbooks). Until then the README + this file are the docs. @@ -205,30 +252,34 @@ adapter from the original plan will live at | Folder | Put here | Do NOT put here | |---|---|---| -| `datastore/main.py` | App factory, lifespan, middleware order, handler registration | Routes, business logic | +| `datastore/main.py` | App factory, lifespan (httpx, cache, auth provider, engines), middleware order, handler registration | Routes, business logic | | `datastore/api/endpoints/` | Route declarations, request parsing, response building | SQL, engine calls, validation rules — delegate to services | -| `datastore/api/context.py` | `RequestContext`, `AuthContext`, `ContextDep`, `get_context` (per-request DI bundle) | The logic those handles invoke — that lives in `services/` / `infrastructure/` | -| `datastore/api/auth.py` | CKAN `datastore_authorize` orchestration with TTL cache (pure async functions) | Raw CKAN HTTP plumbing — call `CKANClient` | +| `datastore/api/context.py` | `RequestContext`, `ContextDep`, `get_context`, `get_auth_provider`, `get_ckan_client` (per-request DI bundle) | The logic those handles invoke — that lives in `services/` / `auth/` / `infrastructure/` | +| `datastore/api/auth.py` | Provider-agnostic boundary policy (permission whitelist, anonymous-read rule, resource_id XOR package_id) | Concrete provider behaviour — CKAN/JWT/anonymous logic lives in `datastore/auth/<type>/` | | `datastore/api/responses.py` | CKAN envelope helpers, `ORJSONResponse` | Anything that needs DB access | | `datastore/api/error_handlers.py` | 
Exception → CKAN error envelope mapping | Business rules — raise `APIError` from wherever the rule lives | +| `datastore/auth/<type>/` | Concrete `AuthProvider` implementation: `__init__.py` exports `Provider = <ProviderClass>`; `provider.py` implements `authorize` + `key_id`. CKAN provider holds its own TTL cache. | Cross-provider policy (that's `api/auth.py`); FastAPI imports | +| `datastore/auth/base.py` | `AuthProvider` Protocol, `Decision` dataclass, `default_key_id` helper | Provider implementations | +| `datastore/auth/registry.py` | importlib factory keyed on `AUTH_TYPE` | Instance caching — the lifespan builds once and stashes on `app.state` | | `datastore/core/` | Config (`Config`), exceptions, constants, pure helpers | I/O, FastAPI imports, business orchestration | | `datastore/schemas/` | Pydantic `BaseModel` request / response / validator types | Methods that do work — schemas are data shapes only | | `datastore/services/` | Validation that needs cross-input context, calls to engines/cache/CKAN, result shaping | `fastapi`/`starlette` imports, raw SQL strings, HTTP clients (call adapters) | -| `datastore/infrastructure/` | Adapters: cache (Redis / in-memory), CKAN HTTP client, storage engines (BigQuery / DuckLake) | Business rules, FastAPI types, orchestration | -| `tests/` | Test code only | Fixtures that reach into production internals through back doors — go through the public API | +| `datastore/infrastructure/` | Adapters: cache (Redis / in-memory), CKAN HTTP client, storage engines (BigQuery / DuckLake) | Business rules, FastAPI types, orchestration, auth providers (those are at `datastore/auth/`) | +| `tests/` | Test code only — `tests/auth/<type>/` mirrors `datastore/auth/<type>/`; `tests/engines/<engine>/` mirrors `datastore/infrastructure/engines/<engine>/` | Fixtures that reach into production internals through back doors — go through the public API | ### Hard rules 1. 
**Only `datastore/api/` and `datastore/main.py` may import from `fastapi` or `starlette`.** - Greppable invariant: `rg "from (fastapi|starlette)" datastore/services datastore/infrastructure datastore/core` must return nothing. + Greppable invariant: `rg "from (fastapi|starlette)" datastore/services datastore/infrastructure datastore/core datastore/auth` must return nothing. 2. **Only `datastore/schemas/` and `datastore/core/config.py` may import from `pydantic` / `pydantic_settings`.** - Engines and services pass plain dicts, tuples, and dataclasses. + Engines, services, and auth providers pass plain dicts, tuples, and dataclasses. 3. **Engines return a lazy row iterator of tuples, never `list[dict]`.** Streaming peak memory ≈ 1 row regardless of result size. 4. **Pydantic validates at the boundary; orjson serialises out.** Don't use `model.model_dump()` on hot paths — build dicts inline and `orjson.dumps()`. -5. **No `container.py` / DI framework.** FastAPI's `Depends` plus the engine - `registry.py` factory are the only wiring mechanisms. +5. **Auth providers and storage engines are plugins, not registries to edit.** Drop a folder under `datastore/auth/<type>/` or `datastore/infrastructure/engines/<engine>/` with `__init__.py` exporting `Provider` / `Backend`; `AUTH_TYPE` / `DATASTORE_ENGINE` are auto-validated against directories on disk. No `registry.py` or `config.py` edit required to add either. +6. **Auth caching is provider-private.** The only "auth cache" in the codebase is the TTL cache inside `datastore/auth/ckan/provider.py` (network round trip; worth caching). JWT and anonymous are local and never cache. +7. **No `container.py` / DI framework.** FastAPI's `Depends` plus the two `registry.py` factories (auth + engines) are the only wiring mechanisms. --- @@ -256,7 +307,7 @@ flowchart TB Redis[("Redis
StatefulSet or managed
auth + query cache")] end - CKAN["CKAN<br/>/api/3/action/datastore_authorize"] + CKAN["CKAN<br/>/api/3/action/datastore_authorize<br/>(only when AUTH_TYPE=ckan)"] BQ["BigQuery API
datastore backend"] Client -->|HTTPS| Ingress @@ -268,8 +319,8 @@ flowchart TB Pod1 -.reads.-> Config Pod1 -.reads.-> Secret - Pod1 -->|auth cache| Redis - Pod1 -->|on cache miss| CKAN + Pod1 -->|"auth cache (CKAN provider only)"| Redis + Pod1 -.->|on cache miss| CKAN Pod1 -->|queries| BQ classDef ext fill:#fff5e6,stroke:#d97706,color:#7c2d12 @@ -288,13 +339,14 @@ flowchart LR Uvicorn --> MW["api/middleware.py\nbody-size + GZip"] MW --> Ctx["api/context.py\nget_context → RequestContext"] Ctx --> Routes["api/endpoints/\ndatastore.py + health.py"] - Routes --> Auth["api/auth.py\nauthorize (TTL cache)"] - Auth -->|cache hit/miss| Cache[("infrastructure/cache.py\nInMemory or Redis")] - Auth -->|cache miss| CKANSvc["CKAN\n/api/3/action/datastore_authorize"] - Routes --> Svc["services/\nwrite.py (read.py pending)"] + Routes --> Auth["api/auth.py\nboundary policy"] + Auth --> Provider["auth/{type}/provider.py\n(ckan / jwt / anonymous)"] + Provider -->|CKAN provider only| Cache[("infrastructure/cache.py\nInMemory or Redis")] + Provider -.->|CKAN provider, on miss| CKANSvc["CKAN\n/api/3/action/datastore_authorize"] + Routes --> Svc["services/\nwrite.py + read.py + streaming.py"] Svc --> Eng["infrastructure/engines/\nregistry.get_datastore_engine"] - Eng --> BQ["bigquery/backend.py (placeholder)"] - Eng --> DL[("ducklake.py (planned)")] + Eng --> BQ["bigquery/backend.py"] + Eng --> DL[("ducklake/ (planned)")] Routes --> Resp["api/responses.py\n_success_response / _error_response"] Resp --> Schema["schemas/responses.py\nResponseModel + Result"] @@ -309,21 +361,24 @@ flowchart LR | Layer | Lives in | Knows about | |---|---|---| | HTTP | `api/endpoints/`, `api/routes.py`, `api/middleware.py` | Request parsing, status codes, FastAPI | -| Request bundle | `api/context.py` (+ `api/auth.py` for the auth method) | Per-request handles: config, ckan client (bound), auth-with-cache | +| Request bundle | `api/context.py` | Per-request handles: config, api_key, auth_provider, ckan (Optional). 
`.authorize()` method delegates to `api/auth.py` | +| Auth boundary policy | `api/auth.py` | Permission whitelist, anonymous-read rule, validation — provider-agnostic | +| Auth providers | `auth/<type>/` | One per `AUTH_TYPE`. CKAN (network + TTL cache), JWT (PyJWT verify), anonymous (no-op) | | Response | `api/responses.py`, `schemas/responses.py` | CKAN envelope shape, orjson, typed result models | | Errors | `api/error_handlers.py`, `core/exceptions.py` | APIError taxonomy → status code + `__type` label | | Business logic | `services/` | Orchestration — no FastAPI, no raw SQL, no HTTP plumbing | -| Storage | `infrastructure/engines/` | Backend ABC + concrete adapters; SQL dialect, connection management, row iterators (when implemented) | +| Storage | `infrastructure/engines/` | Backend ABC + concrete adapters; SQL dialect, connection management, row iterators | | External adapters | `infrastructure/cache.py`, `infrastructure/ckan_client.py` | TTL cache (InMemory / Redis), httpx-based CKAN client | | Cross-cutting | `core/` | Config, constants, exceptions, pure helpers | **Key design rules** -- Endpoints call services; services call engines / CKAN client via `context.ckan`. Endpoints never touch SQL. -- `services/write.py` owns cross-cutting validation that requires context from multiple inputs (e.g., resolving `primary_key` against declared fields once that lands). -- Engines (when implemented) return `SearchResult` with a **lazy row iterator of tuples** — never `list[dict]`. Peak memory ≈ 1 row regardless of result size. +- Endpoints call `context.authorize(...)` then services; services call engines. Endpoints never touch SQL. +- The `datastore_create` path in `services/write.py` is the only one that uses `context.ckan` (for `resource_create` on the dict-resource branch); the endpoint gates that branch on `AUTH_TYPE=ckan`. All other endpoints work standalone. +- Engines return `SearchResult` with a **lazy row iterator of tuples** — never `list[dict]`. 
Peak memory ≈ 1 row regardless of result size. - Pydantic validates inbound (`schemas/request.py`) and documents outbound (`schemas/responses.py`). Outbound serialisation goes through `_success_response` → `ORJSONResponse` → orjson. -- Per-request CKAN client binding happens once in `get_context` (`api/context.py`): the long-lived `httpx.AsyncClient` is owned by the lifespan; each request gets a shallow copy with `api_key` bound. -- No DI container. FastAPI's `Depends` + the engine `registry.py` factory are the only wiring mechanisms. +- The CKAN client is built once in the lifespan **only when `AUTH_TYPE=ckan`**; `get_context` binds the caller's `api_key` per request (a shallow `.bind(api_key)` copy). Under non-CKAN auth `app.state.ckan` is `None` and the per-request bound client is `None`. +- The auth provider is built once in the lifespan (with cache + cache_ttl + ckan client passed as kwargs); registry returns a fresh instance on every call so the lifespan owns instance reuse. +- No DI container. FastAPI's `Depends` + the two `registry.py` factories (auth + engines) are the only wiring mechanisms. **Pod-level shape** - One container per pod: the FastAPI app. Sidecars only for observability (e.g., OpenTelemetry collector). @@ -355,22 +410,27 @@ All three return the CKAN envelope shape `{help, success, result: {...}}`. |---|---|---|---| | GET | `/` | implemented | `{"message": APP_MESSAGE}` | | GET | `/health` | implemented | `{"status": "ok"}` — liveness; always 200 if process is up | -| GET | `/ready` | implemented (stub result) | `{"status": "ready"}` — should become 503 when backend `healthcheck()` fails (planned) | +| GET | `/ready` | implemented | `{"status": "ready"}` — calls `engine.healthcheck()` for rw + ro; 503 with a `Service Unavailable` envelope if either fails | ### 5.2 Datastore endpoints -Each endpoint takes a single `ContextDep`. The handler calls `context.auth.authorize(...)` and delegates to a service in `services/`. 
+Each endpoint takes a single `ContextDep`. The handler calls `context.authorize(...)` (which runs the boundary policy + delegates to the active `AuthProvider`) and then delegates to a service in `services/`. | Method | Path | Status | Body / Params | Response model | |---|---|---|---|---| | POST | `/api/3/action/datastore_create` | **implemented** | `DatastoreCreateRequest` | `DatastoreCreateResponse` | -| POST | `/api/3/action/datastore_upsert` | _501 stub_ | `DatastoreUpsertRequest` (TBD) | `DatastoreUpsertResponse` (TBD) | -| POST | `/api/3/action/datastore_delete` | _501 stub_ | `DatastoreDeleteRequest` (TBD) | `DatastoreDeleteResponse` (TBD) | -| GET | `/api/3/action/datastore_search` | _501 stub_ | query params | streaming JSON / CSV / TSV (planned) | -| GET | `/api/3/action/datastore_search_sql` | _501 stub_ | `sql`, `limit` | streaming JSON (planned) | -| GET | `/api/3/action/datastore_info` | _501 stub_ | `resource_id` | `DatastoreInfoResponse` (TBD) | +| POST | `/api/3/action/datastore_upsert` | **implemented** | `DatastoreUpsertRequest` | `DatastoreUpsertResponse` | +| POST | `/api/3/action/datastore_delete` | **implemented** | `DatastoreDeleteRequest` | `DatastoreDeleteResponse` | +| GET | `/api/3/action/datastore_search` | **implemented** (streaming) | `DatastoreSearchRequest` | `DatastoreSearchResponse` | +| GET | `/api/3/action/datastore_search_sql` | **implemented** (streaming) | `DatastoreSearchSQLRequest` | `DatastoreSearchResponse` | +| GET | `/api/3/action/datastore_info` | **implemented** | `DatastoreInfoRequest` | `DatastoreInfoResponse` | + +The BigQuery engine is wired end-to-end: DDL, MERGE-based upsert, DML delete, parameterised search, `_table_metadata` for Frictionless schema + unique_key round-trip, and a row-count fast path via `INFORMATION_SCHEMA.TABLE_STORAGE`. The DuckLake engine is the next concrete adapter — see §7. 
+ +`datastore_create` accepts two shapes: -Stub endpoints raise `HTTPException(status_code=501, …)`; the error handler in `api/error_handlers.py` converts that to a CKAN envelope with `__type: "Not Implemented"`. +- `resource_id` — table name only. Works under any `AUTH_TYPE`. +- `resource` (dict) — calls `ckan.resource_create(...)` first to materialise a CKAN resource, then writes the datastore table. **Only valid under `AUTH_TYPE=ckan`**; the endpoint rejects this shape with a `Validation Error` under JWT / anonymous since there's no CKAN to land it. --- @@ -798,32 +858,31 @@ The original phase plan that used to live here has mostly shipped. This section ### Done -- [x] **Foundation** — `pyproject.toml`, `Dockerfile`, `Makefile`, `.env.example`, `docker-compose.yml`. App factory + lifespan in [datastore/main.py](datastore/main.py). Body-size middleware in [datastore/api/middleware.py](datastore/api/middleware.py). -- [x] **CKAN API surface** — `/api/3/action/datastore_*` mounted via [datastore/api/routes.py](datastore/api/routes.py). `datastore_create` implemented end-to-end; the other five datastore actions return `HTTP 501` (mapped to CKAN envelope `__type: "Not Implemented"`). Health endpoints (`/`, `/health`, `/ready`) return the CKAN envelope shape. -- [x] **Request validation** — `DatastoreCreateRequest` in [datastore/schemas/request.py](datastore/schemas/request.py) with strict `extra="forbid"`, exactly-one `resource_id` / `resource` invariant, and `FieldSpec` validators in [validators.py](datastore/schemas/validators.py). Pydantic errors → `RequestValidationError` → CKAN error envelope with a `fields` map. -- [x] **Response models** — [datastore/schemas/responses.py](datastore/schemas/responses.py) defines `ResponseModel` (`help` + `success`) and per-endpoint envelopes with a nested `Result` class. Routes declare `response_model=...` for OpenAPI; services return the typed inner `Result`. 
-- [x] **Error envelope** — handlers in [datastore/api/error_handlers.py](datastore/api/error_handlers.py); taxonomy in [datastore/core/exceptions.py](datastore/core/exceptions.py) (`ValidationError`, `AuthorizationError`, `NotFoundError`, `ConflictError`, `ServerError` + `HTTP_STATUS_TO_TYPE_LABEL`). -- [x] **Auth gate** — `context.auth.authorize(resource_id=…, package_id=…)` in [datastore/api/auth.py](datastore/api/auth.py); calls CKAN `datastore_authorize` on cache miss. Cache uses the `CachePort` Protocol so `InMemoryCache` and `RedisCache` are interchangeable based on `REDIS_URL`. TTL is `AUTH_CACHE_TTL`. The raw api_key never reaches the cache — it's hashed via `_key_id` (JWT `jti` or sha256 prefix). -- [x] **Request context** — `RequestContext` + `AuthContext` + `ContextDep` in [datastore/api/context.py](datastore/api/context.py). The CKAN client is bound to the caller's `api_key` once per request via `ckan.bind(api_key)`. -- [x] **Service layer** — [datastore/services/write.py](datastore/services/write.py)'s `create_datastore` orchestrates the new-vs-existing-resource flow, calls CKAN `resource_create` when the resource has no `id`, and hands off to the storage engine. -- [x] **Engine abstraction** — `DatastoreBackend` ABC + `SearchResult` / `WriteResult` dataclasses in [base.py](datastore/infrastructure/engines/base.py). Factory in [registry.py](datastore/infrastructure/engines/registry.py) dispatches dynamically via `importlib`; valid `DATASTORE_ENGINE` values are auto-discovered from `infrastructure/engines/*/` directories at process start. `BigQueryBackend` placeholder in [bigquery/backend.py](datastore/infrastructure/engines/bigquery/backend.py). -- [x] **Tests** — end-to-end TestClient suite in [tests/test_datastore_create.py](tests/test_datastore_create.py); service-level units with a fake context in [tests/test_write_service.py](tests/test_write_service.py). 
CKAN pytest plugin disabled via `addopts = "-p no:ckan -p no:ckan_fixtures"` in `pyproject.toml`. +- [x] **Foundation** — `pyproject.toml`, `Dockerfile`, `Makefile`, `.env.example`, `docker-compose.yml`. App factory + lifespan in [datastore/main.py](datastore/main.py); body-size middleware in [datastore/api/middleware.py](datastore/api/middleware.py); startup log line via `uvicorn.error` showing the active engine + auth provider + cache backend. +- [x] **All six `datastore_*` actions wired** — `create`, `upsert`, `delete`, `search`, `search_sql`, `info` mounted via [datastore/api/routes.py](datastore/api/routes.py). Every endpoint authorizes via `context.authorize(...)` and delegates to a service. +- [x] **Real BigQuery backend** — [datastore/infrastructure/engines/bigquery/](datastore/infrastructure/engines/bigquery/) implements DDL, parameterised `search`, MERGE-based `upsert` (`method=upsert` / `insert` / `update`), DML `delete` (whole-table drop, row delete, column drop), parameterised `search_sql`, and `info`. Frictionless schema + `unique_key` round-trip via the `_table_metadata` table. Row counts use the cheap `INFORMATION_SCHEMA.TABLE_STORAGE` fast path when filters don't apply. +- [x] **Streaming search** — [datastore/services/streaming.py](datastore/services/streaming.py) yields the CKAN envelope chunk-by-chunk for all four `records_format` values (`objects`, `lists`, `csv`, `tsv`); CSV/TSV ride the same JSON envelope (records is a multi-line string). Peak memory ≈ 1 row regardless of N. `_links.start` / `_links.next` carry full scheme + host with all non-`offset` params preserved. +- [x] **`datastore_search_sql` SQL safety** — schema rejects non-SELECT / multi-statement / unparseable SQL (sqlglot). 
[datastore/schemas/validators.py](datastore/schemas/validators.py)'s `parse_sql_references` pulls table + function names; endpoint authorizes each table as a `resource_id`; service rejects functions outside the engine's allow-list at `engines//allowed_functions.txt` (overridable via `SQL_FUNCTIONS_ALLOW_FILE`). +- [x] **Request validation** — Pydantic models in [datastore/schemas/request.py](datastore/schemas/request.py) with `extra="forbid"`. `datastore_info` / `datastore_delete` accept `resource_id` or `id` (normalised). Pydantic errors → CKAN error envelope with a `fields` map. +- [x] **Response models** — [datastore/schemas/responses.py](datastore/schemas/responses.py) — one envelope per endpoint with a nested `Result` class. Routes declare `response_model=...` for OpenAPI; services return the typed inner `Result`. +- [x] **Error envelope** — handlers in [datastore/api/error_handlers.py](datastore/api/error_handlers.py); taxonomy in [datastore/core/exceptions.py](datastore/core/exceptions.py). +- [x] **Pluggable auth providers** — `AUTH_TYPE` selects a folder under [datastore/auth/](datastore/auth/). Built-in: `ckan` (delegates to `datastore_authorize` with a provider-local TTL cache), `jwt` (PyJWT verify HS*/RS*/ES* + `aud`/`iss`/`exp`), `anonymous` (allow-all). Boundary policy in [datastore/api/auth.py](datastore/api/auth.py) is provider-agnostic. Adding a new provider = drop a folder; no registry / config edit. +- [x] **Standalone capability** — `CKANClient` is only constructed when `AUTH_TYPE=ckan`; `RequestContext.ckan` is `CKANClient | None`. `Config` validator rejects `AUTH_TYPE=ckan` + empty `CKAN_URL` at startup. `datastore_create` `resource` dict path is gated on CKAN auth; everything else runs without an upstream CKAN. +- [x] **`/ready` healthcheck** — lifespan builds rw + ro engine instances and stashes on `app.state`; `/ready` calls `engine.healthcheck()` on both and returns 503 + `Service Unavailable` envelope if either fails. 
+- [x] **Request context** — `RequestContext` + `ContextDep` in [datastore/api/context.py](datastore/api/context.py); CKAN client bound to the caller's `api_key` per request (or `None` under non-CKAN auth). `.authorize()` method delegates to `api/auth.py` policy + active provider. +- [x] **Engine + auth registries** — `DatastoreBackend` ABC + result dataclasses in [engines/base.py](datastore/infrastructure/engines/base.py); `AuthProvider` Protocol + `Decision` in [auth/base.py](datastore/auth/base.py). Each subpackage exports `Backend` / `Provider`; `DATASTORE_ENGINE` / `AUTH_TYPE` are validated against directories on disk at startup; registries dispatch via `importlib`. +- [x] **Postman collection** — [postman/collection.json](postman/collection.json) auto-generated from `example_payload/` by `postman/generate_postman.py`; covers every endpoint with a worked example. +- [x] **Tests** — ~290 tests across endpoint, service, auth provider, and engine layers. CKAN pytest plugin disabled via `addopts` in `pyproject.toml`. ### Next Rough priority order. Tick each box as the change set lands. -- [ ] **Wire the remaining datastore endpoints.** For each of `datastore_upsert`, `datastore_delete`, `datastore_search`, `datastore_search_sql`, `datastore_info`: - - [ ] Add the request schema to `schemas/request.py` (or a query-param dataclass for GETs). - - [ ] Add the response envelope to `schemas/responses.py` (subclass `ResponseModel`, define inner `Result`). - - [ ] Add the service function in `services/{read,write}.py` returning the inner `Result` model. - - [ ] Replace the `HTTPException(501)` in `endpoints/datastore.py` with the real handler + `response_model=...`. -- [ ] **Real BigQuery backend.** Replace the stub in `infrastructure/engines/bigquery/backend.py`. 
Initialise `bigquery.Client(project=BQ_PROJECT)` once in the lifespan; store `unique_key` + per-field `info` in the table description (JSON, 16 KB cap); implement parameterised `search` / `upsert` (MERGE) / `delete` (DML) / `search_sql` / `info` / `healthcheck`. Type map per §6.1. -- [ ] **Streaming search.** Once `datastore_search` is wired, add `stream_search` / `stream_csv` / `stream_tsv` helpers in `api/responses.py`. Engines must return `SearchResult` with a **lazy row iterator of tuples** — never `list[dict]`. The route returns `StreamingResponse` with `media_type` chosen by `records_format`. Target: peak memory ≈ 1 row regardless of result size. -- [ ] **Real `/ready` healthcheck.** Construct read/write engine instances in the lifespan (the current placeholder doesn't open a connection). Stash on `app.state`. `/ready` calls `engine.healthcheck()` for both and returns 503 if either fails. `terminationGracePeriodSeconds: 30` in the k8s manifest so streams drain. -- [ ] **DuckLake backend.** Second concrete engine implementing the same ABC. Single-replica `StatefulSet` + `PersistentVolumeClaim` in k8s. Local mode reads `DUCKDB_PATH`; DuckLake mode reads a catalog URL. -- [ ] **Observability.** JSON structured logger in `core/logging.py`; per-request middleware in `api/middleware.py` injects a `request_id` and logs `method`, `path`, `status`, `duration_ms`. The `log.debug` lines already in `auth.py` and `error_handlers.py` light up under `LOG_LEVEL=DEBUG`. -- [ ] **Opt-in query cache.** Auth decisions already cache via the existing `CachePort`. A separate query-result cache (small/hot SELECTs) was in the old plan but isn't on the critical path — defer until BigQuery + streaming land. +- [ ] **DuckLake backend.** Second concrete engine implementing `DatastoreBackend`. Single-replica `StatefulSet` + `PersistentVolumeClaim` in k8s. Local mode reads `DUCKDB_PATH`; DuckLake mode reads a catalog URL. 
+- [ ] **Observability.** JSON structured logger in `core/logging.py`; per-request middleware in `api/middleware.py` injects a `request_id` and logs `method`, `path`, `status`, `duration_ms`. The existing `log.debug` lines in auth + error handlers + the CKAN provider light up under `LOG_LEVEL=DEBUG`. +- [ ] **Per-table SQL auth for `datastore_search_sql`** — today the endpoint authorizes each table the schema extracts via the active provider, but CKAN's `datastore_search_sql_authorize` is a separate action that takes the SQL string. Wire it through `context.ckan` for the CKAN provider as a tighter check; JWT / anonymous providers stay table-by-table. +- [ ] **Opt-in query-result cache.** The CKAN auth provider already caches its own decisions. A separate cache for small / hot SELECTs would ride on the existing `CachePort`. Not on the critical path — defer until there's a workload that needs it. +- [ ] **`terminationGracePeriodSeconds: 30`** in the k8s manifest so streaming responses drain on SIGTERM. ### Guardrails diff --git a/README.md b/README.md index 8133ace..701a82d 100644 --- a/README.md +++ b/README.md @@ -1,15 +1,20 @@ # Datastore API -A CKAN datastore like API for tabular data storage and querying, -built on the FastAPI framework with a pluggable storage engine -(BigQuery today; DuckLake on the roadmap). Exposes -`/api/3/action/datastore_*` action endpoints. - -Each request is authorised against an upstream CKAN instance via -`datastore_authorize` and TTL-cached (in-process by default; Redis when -`REDIS_URL` is set), so the heavy datastore work lives in this service -while CKAN remains the single source of truth for users, packages, -resources, and permissions. +A CKAN-shaped action API for tabular data storage and querying, built +on FastAPI with **two pluggable axes**: + +- **Storage engine** — `DATASTORE_ENGINE` selects a folder under + `datastore/infrastructure/engines/` (BigQuery today; DuckLake planned). 
+- **Auth provider** — `AUTH_TYPE` selects a folder under `datastore/auth/`. + Built-in: `ckan` (delegates to an upstream CKAN, TTL-cached), + `jwt` (verifies signature + claims locally), `anonymous` (allow-all, + for local dev / CI). + +Exposes `/api/3/action/datastore_*` endpoints. Runs **standalone** +under `AUTH_TYPE=anonymous` or `AUTH_TYPE=jwt` — no CKAN required — +or as a satellite to CKAN under `AUTH_TYPE=ckan`, in which case CKAN +remains the single source of truth for users, packages, resources, +and permissions, and the heavy datastore work lives here. ## Project structure @@ -19,8 +24,10 @@ datastore/ │ ├── api/ # HTTP layer — only layer that imports fastapi / starlette │ ├── routes.py # Top-level APIRouter; aggregates endpoints/ -│ ├── context.py # RequestContext (per-request DI bundle) -│ ├── auth.py # CKAN datastore_authorize with TTL cache +│ ├── context.py # RequestContext (per-request DI bundle: config, +│ │ # api_key, auth_provider, ckan); .authorize() method +│ ├── auth.py # Boundary policy (permission whitelist + anonymous-read +│ │ # rule); delegates to the active AuthProvider │ ├── middleware.py # ASGI middleware (e.g. 
BodySizeLimitMiddleware) │ ├── responses.py # Envelope response helpers (_success_response / _error_response) │ ├── error_handlers.py # Exception handlers (APIError → CKAN error envelope) @@ -28,6 +35,16 @@ datastore/ │ ├── health.py # /, /health, /ready │ └── datastore.py # /api/3/action/datastore_* │ +├── auth/ # Pluggable auth providers — one subpackage per type +│ ├── base.py # AuthProvider Protocol + Decision dataclass + +│ │ # default_key_id (JWT jti / sha256 helper) +│ ├── registry.py # get_auth_provider(config, **extras) — importlib dispatch +│ ├── ckan/ # AUTH_TYPE=ckan: calls /api/3/action/datastore_authorize +│ │ # via CKANClient; holds its own TTL cache (the only +│ │ # network-bound provider) so we don't hit CKAN per request +│ ├── jwt/ # AUTH_TYPE=jwt: verifies HS*/RS*/ES* signature + aud/iss +│ └── anonymous/ # AUTH_TYPE=anonymous: always allows; no identity +│ ├── core/ # Cross-cutting helpers — no I/O, no fastapi │ ├── config.py # Pydantic-Settings `Config` (env-driven) + get_config() │ ├── constants.py # Shared constants (type maps, defaults, …) @@ -47,26 +64,52 @@ datastore/ │ └── infrastructure/ # Adapters to outside systems ├── cache.py # InMemoryCache + RedisCache (CachePort protocol) - ├── ckan_client.py # CKAN action API client (httpx-backed) + ├── ckan_client.py # CKAN action API client (httpx-backed). Built in + │ # lifespan only when AUTH_TYPE=ckan; otherwise None. └── engines/ # Storage backends — one subpackage per engine ├── base.py # DatastoreBackend ABC + result dataclasses ├── registry.py # get_datastore_engine + get_allowed_sql_functions; │ # dynamic importlib dispatch keyed on │ # context.config.DATASTORE_ENGINE - ├──bigquery/ # Engine package (one folder per backend). 
- | ├── __init__.py # Re-exports `BigQueryBackend` - | ├── backend.py # google-cloud-bigquery adapter (placeholder) - | ├── lib.py # Backend-specific helpers (optional) - | └── allowed_functions.txt # Per-engine datastore_search_sql - | # function allow-list — one name per - | # line, `#` comments allowed. - └── ducklake/ # Future planned engine + ├── bigquery/ # Engine package (one folder per backend). + | ├── __init__.py # Exports `Backend = BigQueryBackend` — + | | # the registry imports `Backend`, so the + | | # concrete class name is engine-private. + | ├── backend.py # DatastoreBackend subclass + | ├── client.py # google-cloud-bigquery `Client` construction + | ├── lib.py # Backend-specific helpers + | ├── metadata.py # _table_metadata table — Frictionless schema + unique_key + | ├── search.py # SQL builder for datastore_search + | ├── types.py # Frictionless → BigQuery type map + | └── allowed_functions.txt # Per-engine datastore_search_sql + | # function allow-list — one name per + | # line, `#` comments allowed. + └── ducklake/ # Future planned engine + +postman/ # Importable Postman collection +├── collection.json # Auto-generated from example_payload/ +└── generate_postman.py # Generator script (regenerate after edits) ``` -To add a new engine (e.g. `ducklake`), drop a sibling folder with the -same four files. `DATASTORE_ENGINE` is validated against the set of -engine subdirectories that exist at process start, and the factory -dispatches via `importlib` — no `registry.py` / `config.py` edits. +To add a new engine (e.g. `ducklake`), drop a sibling folder following +the same layout (`__init__.py` exports `Backend = <YourBackend>`, +`backend.py` subclasses `DatastoreBackend`, plus an `allowed_functions.txt`). +`DATASTORE_ENGINE` is validated against the set of engine subdirectories +that exist at process start, and the factory imports each engine's +`Backend` via `importlib` — no `registry.py` / `config.py` edits.
+ +## Column definitions + +**Goal:** make Frictionless schema the native column shape while staying +drop-in compatible with existing CKAN clients during migration. + +`datastore_create` accepts one of two input shapes: + +| Shape | Keys | Status | +|---|---|---| +| Frictionless `schema` | `schema` — [Frictionless Table Schema](https://specs.frictionlessdata.io/table-schema/) | Recommended | +| Legacy CKAN `fields` | `fields`, `primary_key` | Deprecated; emits a `warnings` entry | + ## Roadmap @@ -75,38 +118,53 @@ What's shipped and what's next. Tick each box as the change set lands. ### Done - [x] Foundation (app factory, lifespan, middleware, Dockerfile, Makefile, env config) -- [x] CKAN API surface mounted at `/api/3/action/datastore_*` (`datastore_create` live; 5 others return 501) -- [x] Health endpoints `/`, `/health`, `/ready` returning the CKAN envelope shape -- [x] Strict request validation (`DatastoreCreateRequest` + `FieldSpec`) -- [x] CKAN error envelope mapping (`APIError` taxonomy + handlers) +- [x] All six `datastore_*` actions wired end-to-end: + - `datastore_create`, `datastore_upsert`, `datastore_delete` + - `datastore_search` (streaming JSON / CSV / TSV; CKAN `_links` pagination) + - `datastore_search_sql` (sqlglot parses tables + functions; per-table + CKAN authorize; per-engine function allow-list) + - `datastore_info` (column schema + free-form `meta` dict) +- [x] Health endpoints `/`, `/health`, `/ready` returning the CKAN envelope shape. + `/ready` builds the rw + ro engine instances during lifespan and probes + `engine.healthcheck()` on each — 503 with a `Service Unavailable` envelope + if either fails (so k8s pulls the pod from the Service). 
+- [x] Strict request validation (Pydantic) + structured error envelopes - [x] CKAN auth gate with TTL cache (InMemory by default; Redis when `REDIS_URL` is set) - [x] Request context bundle (`RequestContext` / `ContextDep` / bound `CKANClient`) -- [x] Service-layer separation (`create_datastore`) -- [x] Engine abstraction + factory (`DatastoreBackend` ABC + `registry.py`) -- [x] Pydantic response models with nested `Result` per endpoint -- [x] End-to-end TestClient suite + service-level unit tests +- [x] Service / engine / streaming layer separation +- [x] Engine-agnostic registry — drop a folder under `infrastructure/engines/<name>/` + exporting `Backend`; `DATASTORE_ENGINE` is validated against engine directories + on disk, no registry / config edit required. +- [x] Real BigQuery backend (`infrastructure/engines/bigquery/`) ### Next - -- [ ] Wire the remaining datastore endpoints (`upsert`, `delete`, `search`, `search_sql`, `info`) -- [ ] Real BigQuery backend (replace the placeholder in `infrastructure/engines/bigquery/backend.py`) -- [ ] Streaming search responses (JSON / CSV / TSV; ≈ 1-row peak memory) -- [ ] Real `/ready` healthcheck — wire engine instances through the lifespan -- [ ] DuckLake backend (second concrete engine implementing the same ABC) - [ ] Observability — JSON structured logs + request-id middleware -- [ ] Opt-in query-result cache (deferred until BigQuery + streaming land) +- [ ] Opt-in query-result cache (deferred until a workload needs it) +- [ ] DuckLake backend (future planned engine) -## CKAN-side requirement -This service does not implement its own user / permission model. -Every request is gated by a call to CKAN's `datastore_authorize` -action, which is **not part of stock CKAN** — it ships in the -[`ckanext-datastore-authz`](https://github.com/datopian/ckanext-datastore-authz) -extension.
+## Auth -Before pointing this service at a CKAN instance, install the extension -on the CKAN side and confirm the action is reachable: +`AUTH_TYPE` selects the provider; each lives at `datastore/auth/<type>/`. + +| AUTH_TYPE | What it does | Required env | +|---|---|---| +| `ckan` (default) | Calls CKAN `/api/3/action/datastore_authorize` on a cache miss. Decisions are TTL-cached inside the provider so we don't hit CKAN on every request. | `CKAN_URL` | +| `jwt` | Verifies the bearer JWT signature + optional `aud` / `iss`. No external service. | `JWT_SECRET` (HS*) or `JWT_PUBLIC_KEY` (RS*/ES*) | +| `anonymous` | Allows every call; no identity. Local dev / CI without auth. | _(none)_ | + +The orchestration in `datastore/api/auth.py` is provider-agnostic — it +owns only the boundary policy (permission whitelist, `resource_id` XOR +`package_id` rule, and the anonymous-read rule: `permission=read` calls +are forwarded to the provider even without a credential; everything else +hard-fails when the `Authorization` header is missing). + +**CKAN provider.** Uses the `datastore_authorize` action, which is **not +part of stock CKAN** — it ships in the +[`ckanext-datastore-authz`](https://github.com/datopian/ckanext-datastore-authz) +extension. Before pointing this service at a CKAN instance, install +the extension and confirm the action is reachable: ```sh curl -s "$CKAN_URL/api/3/action/datastore_authorize" \ @@ -115,12 +173,21 @@ curl -s "$CKAN_URL/api/3/action/datastore_authorize" \ -d '{"resource_id": ""}' | jq ``` -If that returns a CKAN envelope with `success: true` and a -`result.{package, resource}` body, you're set. If you get 404, the -extension isn't installed or isn't enabled in CKAN's `ckan.plugins`. +A CKAN envelope with `success: true` and a `result.{package, resource}` +body means you're set. 404 means the extension isn't enabled in +`ckan.plugins`. -For local dev without a CKAN at all, set `AUTH_ENABLED=false` in `.env` -— the auth gate returns a stub decision and every request passes.
+**Adding a new provider.** Drop `datastore/auth/<name>/` with an +`__init__.py` exporting `Provider = <YourProvider>` and a `provider.py` +implementing the `AuthProvider` Protocol (`base.py`). No registry edit +required — `AUTH_TYPE` is validated against the directories on disk at +startup, same auto-discovery as `DATASTORE_ENGINE`. + +**Standalone caveat.** `datastore_create` accepts two shapes: +`resource_id` (table name only) and `resource` (a CKAN resource dict — +the service calls `ckan.resource_create(...)` first, then writes the +datastore table). The dict form is only valid under `AUTH_TYPE=ckan`; +under JWT / anonymous it's rejected with a clear validation error. @@ -153,12 +220,21 @@ Every entry below maps 1:1 to a field on `datastore.core.config.Config`. See [.e | `MAX_REQUEST_BODY_MB` | `50` | Reject request bodies larger than this (MB) | | `DATASTORE_ENGINE` | `bigquery` | Storage backend — must match a folder under `infrastructure/engines/`; validated at startup | | `SQL_FUNCTIONS_ALLOW_FILE` | _(empty)_ | Override path to the `datastore_search_sql` function allow-list; defaults to `/allowed_functions.txt` | -| `BQ_PROJECT` | _(empty)_ | Google Cloud project ID for the BigQuery backend | +| `BIGQUERY_PROJECT` | _(empty)_ | Google Cloud project ID. Required when `DATASTORE_ENGINE=bigquery`; unset → `/ready` returns 503 with a clear warning. | +| `BIGQUERY_DATASET` | _(empty)_ | BigQuery dataset that holds per-resource tables + the engine-managed `_table_metadata`. Required when `DATASTORE_ENGINE=bigquery`; unset → metadata store is disabled and writes fall through to placeholder mode. | +| `BIGQUERY_CREDENTIALS` | _(empty)_ | Read-write service-account creds. Accepts a JSON blob (leading `{`), a path to a service-account JSON file, or empty (→ Application Default Credentials). | +| `BIGQUERY_CREDENTIALS_RO` | _(empty)_ | Read-only service-account creds (same format). Empty → falls back to `BIGQUERY_CREDENTIALS` so single-credential deployments work.
| +| `BIGQUERY_USE_QUERY_CACHE` | `true` | Use BigQuery's 24h query-results cache on `datastore_search` / `datastore_search_sql` / `datastore_info`. Identical SELECTs return free + fast on cache hits. Set `false` to force a fresh scan. | | `REDIS_URL` | _(empty)_ | Redis URL for cache; empty → in-process `InMemoryCache` | -| `CKAN_URL` | _(empty)_ | Base URL of the CKAN instance (required when `AUTH_ENABLED=true`) | +| `CKAN_URL` | _(empty)_ | Base URL of the CKAN instance (required when `AUTH_TYPE=ckan`) | | `HTTP_TIMEOUT_SECONDS` | `10` | Timeout for outbound CKAN calls (seconds) | -| `AUTH_ENABLED` | `true` | CKAN auth gate; set to `false` for local dev / CI without a CKAN | -| `AUTH_CACHE_TTL` | `10` | TTL for cached `datastore_authorize` decisions (seconds) | +| `AUTH_TYPE` | `ckan` | Auth provider — must match a folder under `datastore/auth/`. Built-in: `ckan`, `jwt`, `anonymous` | +| `AUTH_CACHE_TTL` | `10` | TTL for cached auth decisions (seconds) | +| `JWT_ALGORITHM` | `HS256` | JWT signing algorithm. HS* uses `JWT_SECRET`; RS*/ES* uses `JWT_PUBLIC_KEY` | +| `JWT_SECRET` | _(empty)_ | HS* shared secret. Required when `AUTH_TYPE=jwt` and `JWT_ALGORITHM=HS*` | +| `JWT_PUBLIC_KEY` | _(empty)_ | RS*/ES* PEM-encoded public key. Required for RS*/ES* | +| `JWT_AUDIENCE` | _(empty)_ | Expected `aud` claim. Empty = skip audience check | +| `JWT_ISSUER` | _(empty)_ | Expected `iss` claim. Empty = skip issuer check | | `LOG_LEVEL` | `INFO` | Stdlib logging level (`DEBUG` / `INFO` / `WARNING` / `ERROR` / `CRITICAL`) | ## API Documentation @@ -175,7 +251,8 @@ Handler in `datastore/api/endpoints/.py` (parse → call service → r ### Request context -Each endpoint takes a single `Context` that bundles the per-request handles (`auth`, `ckan`, `config`, and more as we grow). The bundle wires them together so handlers stay one-liner. +Each endpoint takes a single `Context` that bundles the per-request +handles. The bundle wires them together so handlers stay one-liner. 
```python from datastore.api.context import Context @@ -186,21 +263,33 @@ async def datastore_create( payload: DatastoreCreateRequest, context: Context, ): - # Authorize against CKAN. Pass `resource_id` (existing resource) - # or `package_id` (new resource under that package) — exactly one. - data_dict = await context.auth.authorize( + # Run policy + delegate to the active AuthProvider (CKAN / JWT / + # anonymous). Pass `resource_id` (existing) or `package_id` (new) — + # exactly one. + data_dict = await context.authorize( resource_id=payload.resource_id, permission="create", # read | create | update | delete | patch ) - # The service does the actual work (CKAN resource_create, engine.create, …). + # The service does the actual work (engine.create; CKAN resource_create + # when AUTH_TYPE=ckan and the request supplies a `resource` dict). result = await create_datastore(context, data_dict) return _success_response(request, result) ``` -- `context.auth` — `AuthContext`: cached `datastore_authorize` permission check. Holds the bound `api_key`, the cache, the TTL, and the CKAN client it delegates to. -- `context.ckan` — `CKANClient` already bound to the caller's `api_key`. Call `resource_create` / `resource_patch` / `datastore_authorize` directly; the api_key travels with the client. -- `context.config` — the loaded `Config` instance. +- `context.authorize(...)` — runs the boundary policy and delegates to + the active `AuthProvider`. Returns the `data_dict` shape + `{"resource": , "package": }` ready to merge + with the request payload. +- `context.ckan` — `CKANClient | None`, already bound to the caller's + `api_key`. `None` under non-CKAN auth (standalone). Code paths that + need CKAN must guard for `None`. +- `context.api_key` — the raw bearer string (parsed from the + `Authorization` header). Provider-internal use; endpoints rarely + touch it. 
+- `context.auth_provider` — the active provider instance (built once + in the lifespan, stored on `app.state.auth_provider`). +- `context.config` — the loaded `Config`. @@ -213,8 +302,17 @@ class DatastoreCreateResponse(ResponseModel): class Result(BaseModel): resource_id: str package_id: str | None = None - fields: list[FieldSpec] - primary_key: list[str] = Field(default_factory=list) + # Canonical Frictionless Table Schema (carries `primaryKey` inside). + schema: dict[str, Any] + # Legacy mirror — marked deprecated in OpenAPI / IDE tooltips. + fields: Annotated[ + list[FieldSpec], + Field(deprecated="use 'schema' (Frictionless Table Schema) instead"), + ] + primary_key: Annotated[ + list[str], + Field(deprecated="use 'schema.primaryKey' instead"), + ] records: list[dict[str, Any]] | None = None # when include_records=True total: int | None = None # when include_total=True @@ -257,13 +355,33 @@ raise NotFoundError(f"resource '{rid}' not found") ### Testing -Two layers of tests live in [tests/](tests/): - -- **End-to-end** ([test_datastore_create.py](tests/test_datastore_create.py)) — uses the `client` fixture in [tests/conftest.py](tests/conftest.py), which wires up `FakeCKAN` (in-memory CKAN stand-in) and `InMemoryCache` via `app.dependency_overrides`. No real network calls. -- **Service-level** ([test_write_service.py](tests/test_write_service.py)) — calls `create_datastore` directly with a fake context. Fast, no HTTP, isolates orchestration from FastAPI plumbing. +Tests live in [tests/](tests/), organised by what they exercise: -`FakeCKAN` exposes `add_resource(...)`, `add_package(...)`, `deny(api_key)` to set up scenarios, and an `authorize_calls` counter to assert cache behaviour. 
+``` +tests/ +├── conftest.py # FakeCKAN + InMemoryCache + TestClient fixture +├── test_health.py # /, /health, /ready +├── test_datastore_*.py # End-to-end per endpoint (TestClient) +├── test_read_service.py # Direct service calls — no HTTP +├── test_write_service.py +│ +├── auth/ # Auth layer — one folder per provider +│ ├── test_base.py # Decision + default_key_id +│ ├── test_registry.py # AUTH_TYPE dispatch +│ ├── test_orchestration.py # api/auth.py boundary policy +│ ├── ckan/test_provider.py # CKAN provider + TTL cache +│ ├── jwt/test_provider.py # JWT signature / aud / iss / exp +│ └── anonymous/test_provider.py +│ +└── engines/ + ├── bigquery/test_*.py # Real BigQuery backend, fully mocked + └── ducklake/ # (placeholder for future engine) +``` -Mark slow / network-bound tests with `@pytest.mark.integration` so they can be skipped in CI by default. +The `client` fixture in `conftest.py` wires up `FakeCKAN` (in-memory +CKAN stand-in) and an `InMemoryCache` via `app.dependency_overrides`, +and installs a `CKANAuthProvider` backed by the fake. No real network +calls. `FakeCKAN` exposes `add_resource(...)`, `add_package(...)`, +`deny(api_key)` and an `authorize_calls` counter to assert cache +behaviour. -The CKAN pytest plugin auto-installed system-wide is disabled for this project via `addopts = "-p no:ckan -p no:ckan_fixtures"` in `pyproject.toml` — otherwise it tries to load a CKAN `.ini` we don't have. diff --git a/datastore/api/auth.py b/datastore/api/auth.py index bb0ea54..059bad8 100644 --- a/datastore/api/auth.py +++ b/datastore/api/auth.py @@ -1,43 +1,45 @@ -"""CKAN authorization — pure async functions. No state, no FastAPI. +"""Auth orchestration — boundary validation + anonymous-read policy. -`AuthContext` (in `app/api/context.py`) wraps these into a per-request -object: it holds the state (api_key, cache, ttl, enabled) and exposes -methods that delegate here. +Provider-agnostic. 
Owns only the pieces that apply to every provider: + - the anonymous-read policy (some permissions skip the credential check), + - validation of `permission` and the `resource_id` XOR `package_id` rule. + +Caching is a provider concern (network-bound providers cache; local ones +don't). Today only the CKAN provider caches — see `auth/ckan/provider.py`. + +`RequestContext.authorize(...)` (in `api/context.py`) is the public seam +endpoints use; it delegates here. """ from __future__ import annotations -import base64 -import hashlib -import logging from typing import Any, Literal, get_args -import orjson - +from datastore.auth.base import AuthProvider from datastore.core.exceptions import AuthorizationError, ValidationError -from datastore.infrastructure.cache import CachePort -from datastore.infrastructure.ckan_client import CKANClient - -log = logging.getLogger(__name__) Permission = Literal["read", "create", "update", "delete", "patch"] ALLOWED_PERMISSIONS: frozenset[str] = frozenset(get_args(Permission)) +# Permissions an unauthenticated caller is allowed to attempt. For these +# we forward to the provider with `credential=None`; the provider decides +# (e.g. CKAN checks resource visibility). Anything outside this set +# hard-fails on missing credentials before the provider is called. +ANONYMOUS_PERMISSIONS: frozenset[str] = frozenset({"read"}) + -# --- public ------------------------------------------------------------------ async def authorize( *, api_key: str | None, - cache: CachePort, - cache_ttl: int, - enabled: bool, - ckan: CKANClient, + provider: AuthProvider, resource_id: str | None, package_id: str | None, permission: Permission | None = None, ) -> dict[str, Any]: - """CKAN `datastore_authorize` with TTL cache. + """Run policy checks, delegate to the provider, return endpoint data_dict. 
+ Endpoints merge the returned dict into their `data_dict`: + `{"resource": , "package": }` """ if bool(resource_id) == bool(package_id): raise ValidationError("exactly one of resource_id or package_id required") @@ -47,112 +49,15 @@ async def authorize( f"permission must be one of {sorted(ALLOWED_PERMISSIONS)}" ) - if not enabled: - log.debug("auth disabled; returning stub for resource_id=%s package_id=%s", - resource_id, package_id) - return _disabled_stub(resource_id, package_id) - - if not api_key: - raise AuthorizationError("Access denied: Action requires an authenticated user") - - # Adapter enforces TTL: `cache.set(..., ttl=cache_ttl)` writes an entry - # that expires `cache_ttl` seconds after this write. - scope, target = ("res", resource_id) if resource_id else ("pkg", package_id) - assert target is not None # narrowed by the validation above - cache_key = _cache_key(api_key, scope, target, permission) - - cached = await _safe_get(cache, cache_key) - if cached is not None: - log.debug("auth cache HIT scope=%s target=%s perm=%s", scope, target, permission) - return _decode(cached) + if not api_key and permission not in ANONYMOUS_PERMISSIONS: + raise AuthorizationError( + "Access denied: Action requires an authenticated user" + ) - log.debug("auth cache MISS scope=%s target=%s perm=%s -> CKAN", scope, target, permission) - result = await ckan.datastore_authorize( + decision = await provider.authorize( + credential=api_key, resource_id=resource_id, package_id=package_id, permission=permission, ) - await _safe_set(cache, cache_key, orjson.dumps(result), cache_ttl) - log.debug("auth cache STORE scope=%s target=%s perm=%s ttl=%ds", - scope, target, permission, cache_ttl) - return result - - -# --- cache helpers ----------------------------------------------------------- - -def _cache_key( - api_key: str, - scope: str, - identifier: str, - permission: str | None, -) -> str: - return f"auth:{_key_id(api_key)}:{scope}:{identifier}:{permission}" - - -async def 
_safe_get(cache: CachePort, key: str) -> bytes | None: - try: - return await cache.get(key) - except Exception: # noqa: BLE001 — cache failure must not block requests - log.warning("auth cache GET failed; falling back to CKAN", exc_info=True) - return None - - -async def _safe_set(cache: CachePort, key: str, value: bytes, ttl: int) -> None: - try: - await cache.set(key, value, ttl) - except Exception: # noqa: BLE001 — same fail-open policy on writes - log.warning("auth cache SET failed; skipping cache", exc_info=True) - - -# --- pure helpers ------------------------------------------------------------ - - -def _key_id(api_key: str) -> str: - """Stable, non-reversible id for the api_key. - - JWT tokens use their `jti` claim; opaque tokens use a sha256 prefix. - The raw key never reaches the cache. - """ - jti = _jwt_jti(api_key) - if jti: - return f"jti:{jti}" - return "h:" + hashlib.sha256(api_key.encode()).hexdigest()[:16] - - -def _jwt_jti(token: str) -> str | None: - """Extract the `jti` claim from an unverified JWT, or None if not a JWT.""" - parts = token.split(".") - if len(parts) != 3: - return None - try: - segment = parts[1] - padded = segment + "=" * (-len(segment) % 4) - payload = orjson.loads(base64.urlsafe_b64decode(padded)) - except (ValueError, TypeError, orjson.JSONDecodeError): - return None - if not isinstance(payload, dict): - return None - jti = payload.get("jti") - return jti if isinstance(jti, str) and jti else None - - -def _disabled_stub( - resource_id: str | None, package_id: str | None -) -> dict[str, Any]: - """Decision returned when `AUTH_ENABLED=false` (local dev / CI without CKAN).""" - if resource_id is not None: - return { - "package": {"id": None, "_auth_disabled": True}, - "resource": {"id": resource_id, "_auth_disabled": True}, - } - return { - "package": {"id": package_id, "_auth_disabled": True}, - "resource": {"package_id": package_id, "_auth_disabled": True}, - } - - -def _decode(value: bytes) -> dict[str, Any]: - parsed = 
orjson.loads(value) - if not isinstance(parsed, dict): - raise AuthorizationError("cached auth entry is malformed") - return parsed + return {"resource": decision.resource or {}, "package": decision.package or {}} diff --git a/datastore/api/context.py b/datastore/api/context.py index c918030..0882806 100644 --- a/datastore/api/context.py +++ b/datastore/api/context.py @@ -8,41 +8,52 @@ from datastore.api import auth as auth_fns from datastore.api.auth import Permission +from datastore.auth.base import AuthProvider from datastore.core.config import Config, get_config from datastore.core.helper import parse_authorization_header -from datastore.infrastructure.cache import CachePort from datastore.infrastructure.ckan_client import CKANClient -# --- FastAPI dependency seams ------------------------------------------------ ConfigDep = Annotated[Config, Depends(get_config)] -def get_cache(request: Request) -> CachePort: - """Cache adapter installed by the app lifespan in `request.app.state.cache`.""" - cache = getattr(request.app.state, "cache", None) - if cache is None: - raise RuntimeError("cache is not initialised; check the lifespan wiring") - return cache # type: ignore[no-any-return] +def get_ckan_client(request: Request) -> CKANClient | None: + """CKAN client installed by the app lifespan in `request.app.state.ckan`. + + `None` under non-CKAN auth (the lifespan skips construction when + `AUTH_TYPE != "ckan"` — the datastore runs standalone). 
+ """ + return getattr(request.app.state, "ckan", None) -def get_ckan_client(request: Request) -> CKANClient: - """CKAN client installed by the app lifespan in `request.app.state.ckan`.""" - ckan = getattr(request.app.state, "ckan", None) - if ckan is None: - raise RuntimeError("ckan client is not initialised; check the lifespan wiring") - return ckan # type: ignore[no-any-return] +def get_auth_provider(request: Request) -> AuthProvider: + """Auth provider installed by the app lifespan.""" + provider = getattr(request.app.state, "auth_provider", None) + if provider is None: + raise RuntimeError( + "auth provider is not initialised; check the lifespan wiring" + ) + return provider # type: ignore[no-any-return] -# --- AuthContext ------------------------------------------------------------- @dataclass(slots=True) -class AuthContext: - """Per-request auth state. Delegates the real work to `app.api.auth`.""" +class RequestContext: + """Per-request facade — the one dep an endpoint takes. + + `ckan` is None under non-CKAN auth (the datastore runs standalone). + Code paths that need CKAN — today only `datastore_create`'s `resource` + dict branch — must guard for that. + Usage: + async def handler(payload: ..., context: Context): + data_dict = await context.authorize(resource_id=..., permission=...) + if context.ckan is not None: + created = await context.ckan.resource_create(resource=...) 
+ """ + + config: Config api_key: str | None = field(repr=False) - cache: CachePort - cache_ttl: int - enabled: bool - ckan: CKANClient + auth_provider: AuthProvider + ckan: CKANClient | None async def authorize( self, @@ -52,50 +63,26 @@ async def authorize( ) -> dict[str, Any]: return await auth_fns.authorize( api_key=self.api_key, - cache=self.cache, - cache_ttl=self.cache_ttl, - enabled=self.enabled, - ckan=self.ckan, + provider=self.auth_provider, resource_id=resource_id, package_id=package_id, permission=permission, ) -# --- RequestContext ---------------------------------------------------------- -@dataclass(slots=True) -class RequestContext: - """Per-request facade — the one dep an endpoint takes. - - Usage: - async def handler(payload: ..., context: Context): - decision = await ctx.auth.authorize(resource_id=..., permission=...) - created = await ctx.ckan.resource_create(resource=...) - - Add new sub-contexts here as the app grows (e.g. `engine`, `events`). - """ - - config: Config - auth: AuthContext - ckan: CKANClient - - def get_context( config: ConfigDep, - cache: Annotated[CachePort, Depends(get_cache)], - ckan: Annotated[CKANClient, Depends(get_ckan_client)], + ckan: Annotated[CKANClient | None, Depends(get_ckan_client)], + provider: Annotated[AuthProvider, Depends(get_auth_provider)], authorization: Annotated[str | None, Header(alias="Authorization")] = None, ) -> RequestContext: api_key = parse_authorization_header(authorization) - bound_ckan = ckan.bind(api_key) - auth = AuthContext( + return RequestContext( + config=config, api_key=api_key, - cache=cache, - cache_ttl=config.AUTH_CACHE_TTL, - enabled=config.AUTH_ENABLED, - ckan=bound_ckan, + auth_provider=provider, + ckan=ckan.bind(api_key) if ckan is not None else None, ) - return RequestContext(config=config, auth=auth, ckan=bound_ckan) Context = Annotated[RequestContext, Depends(get_context)] diff --git a/datastore/api/endpoints/datastore.py b/datastore/api/endpoints/datastore.py index 
ae78d62..eca9dc0 100644 --- a/datastore/api/endpoints/datastore.py +++ b/datastore/api/endpoints/datastore.py @@ -2,25 +2,38 @@ from typing import Annotated -from fastapi import APIRouter, HTTPException, Query +from fastapi import APIRouter, Query from starlette.requests import Request from starlette.responses import StreamingResponse from datastore.api.context import Context -from datastore.api.responses import ORJSONResponse, _success_response +from datastore.api.responses import _deprecation_warnings, _success_response +from datastore.core.exceptions import ValidationError from datastore.schemas.request import ( DatastoreCreateRequest, + DatastoreDeleteRequest, + DatastoreInfoRequest, DatastoreSearchRequest, DatastoreSearchSQLRequest, DatastoreUpsertRequest, ) from datastore.schemas.responses import ( DatastoreCreateResponse, + DatastoreDeleteResponse, + DatastoreInfoResponse, DatastoreSearchResponse, DatastoreUpsertResponse, ) -from datastore.services.read import search_datastore, search_sql_datastore -from datastore.services.write import create_datastore, upsert_datastore +from datastore.services.read import ( + info_datastore, + search_datastore, + search_sql_datastore, +) +from datastore.services.write import ( + create_datastore, + delete_datastore, + upsert_datastore, +) router = APIRouter(tags=["datastore"]) @@ -33,30 +46,37 @@ async def datastore_create( ): """`POST /api/3/datastore_create` — authorize, then run the create flow.""" + if payload.resource is not None and context.config.AUTH_TYPE != "ckan": + raise ValidationError( + "`resource` dict is only supported for ckan auth; for other auth types," + "use `resource_id` instead" + ) + if payload.resource_id: - data_dict = await context.auth.authorize( + data_dict = await context.authorize( resource_id=payload.resource_id, permission="create", ) else: - data_dict = await context.auth.authorize( + data_dict = await context.authorize( package_id=payload.resource.get("package_id"), permission="create", ) 
- + data_dict.update( { "resource": payload.resource_id or payload.resource, - "fields": payload.fields, + "schema": payload.schema, "records": payload.records, - "primary_key": payload.primary_key, "include_records": payload.include_records, "include_total": payload.include_total, } ) result = await create_datastore(context, data_dict) - return _success_response(request, result) + warnings = _deprecation_warnings(payload) + + return _success_response(request, result, warnings=warnings or None) @router.post("/datastore_upsert", response_model=DatastoreUpsertResponse) @@ -66,7 +86,7 @@ async def datastore_upsert( context: Context, ): """`POST /api/3/datastore_upsert` — authorize, then upsert / insert / update rows.""" - data_dict = await context.auth.authorize( + data_dict = await context.authorize( resource_id=payload.resource_id, permission="update", ) @@ -75,11 +95,6 @@ async def datastore_upsert( return _success_response(request, result) -@router.post("/datastore_delete") -def datastore_delete() -> ORJSONResponse: - raise HTTPException(status_code=501, detail="datastore_delete is not implemented") - - @router.get("/datastore_search", response_model=DatastoreSearchResponse) async def datastore_search( request: Request, @@ -95,14 +110,12 @@ async def datastore_search( iterator in a `StreamingResponse` with a fixed `application/json` media type. """ - data_dict = await context.auth.authorize( + data_dict = await context.authorize( resource_id=params.resource_id, permission="read", ) data_dict.update(params.model_dump()) - body_iter = await search_datastore( - context, data_dict, request_url=str(request.url) - ) + body_iter = await search_datastore(context, data_dict, request_url=str(request.url)) return StreamingResponse(body_iter, media_type="application/json") @@ -113,23 +126,61 @@ async def datastore_search_sql( params: Annotated[DatastoreSearchSQLRequest, Query()], ): """`GET /api/3/datastore_search_sql` — execute a raw SQL SELECT and stream. 
- Accepts a single `sql` query parameter; + Accepts a single `sql` query parameter; """ for resource_id in params.resource_ids: - await context.auth.authorize( - resource_id=resource_id, permission="read" - ) + await context.authorize(resource_id=resource_id, permission="read") data_dict = params.model_dump() | { "function_names": params.function_names, + "limit": params.limit, + "offset": params.offset, } - - body_iter = await search_sql_datastore( - context, data_dict, request_url=str(request.url) - ) + + body_iter = await search_sql_datastore(context, data_dict, request_url=str(request.url)) return StreamingResponse(body_iter, media_type="application/json") -@router.get("/datastore_info") -def datastore_info() -> ORJSONResponse: - raise HTTPException(status_code=501, detail="datastore_info is not implemented") +@router.get("/datastore_info", response_model=DatastoreInfoResponse) +async def datastore_info( + request: Request, + context: Context, + params: Annotated[DatastoreInfoRequest, Query()], +): + """`GET /api/3/datastore_info` — return table metadata. + + Authorizes the caller on `resource_id` (same gate as `datastore_search`), + then asks the read-only engine for its `InfoResult`. The response is + small enough to skip streaming; we go through the standard + `_success_response` envelope. + + Body shape: + result.fields — column schema, list of {"id", "type", ...} + result.meta — free-form dict (engine-specific extras) + """ + await context.authorize(resource_id=params.resource_id, permission="read") + result = await info_datastore(context, params.model_dump()) + return _success_response(request, result) + + +@router.post("/datastore_delete", response_model=DatastoreDeleteResponse) +async def datastore_delete( + request: Request, + payload: DatastoreDeleteRequest, + context: Context, +): + """`POST /api/3/datastore_delete` — delete rows or drop the table. + + Body: + `resource_id` / `id` (one required) — table to delete from. 
+ `filters` (optional dict) — only rows matching every key/value + pair are deleted. Omit → whole table is dropped. + `force` (optional bool) — required to delete from a CKAN + read-only resource. + + Returns the original `filters` echoed back (CKAN convention) so the + caller can confirm what the server actually applied. + """ + await context.authorize(resource_id=payload.resource_id, permission="delete") + result = await delete_datastore(context, payload.model_dump()) + return _success_response(request, result) diff --git a/datastore/api/endpoints/health.py b/datastore/api/endpoints/health.py index fd0d079..6142fae 100644 --- a/datastore/api/endpoints/health.py +++ b/datastore/api/endpoints/health.py @@ -1,27 +1,59 @@ from __future__ import annotations +from types import SimpleNamespace + from fastapi import APIRouter from starlette.requests import Request +from starlette.responses import JSONResponse from datastore.api.responses import _success_response from datastore.core.config import get_config +from datastore.infrastructure.engines.registry import get_datastore_engine from datastore.schemas.responses import StatusResponse, WelcomeResponse -router = APIRouter(tags=["health"]) +welcome_router = APIRouter(tags=["health"]) + +probe_router = APIRouter(tags=["health"]) -@router.get("/", response_model=WelcomeResponse) + +@welcome_router.get("/", response_model=WelcomeResponse) def welcome(request: Request): return _success_response( request, WelcomeResponse.Result(message=get_config().APP_MESSAGE), ) -@router.get("/health", response_model=StatusResponse) + +@probe_router.get("/health", response_model=StatusResponse) def health(request: Request): + """Liveness — always 200 while the process is up.""" return _success_response(request, StatusResponse.Result(status="ok")) -@router.get("/ready", response_model=StatusResponse) +@probe_router.get("/ready", response_model=StatusResponse) def ready(request: Request): + """Readiness — 200 when both rw and ro engines pass 
`healthcheck()`, + 503 otherwise. Probes both modes because the credential split means + one can fail while the other works.""" + ctx = SimpleNamespace(config=get_config()) + + failing: list[str] = [] + for mode in ("rw", "ro"): + try: + engine = get_datastore_engine(ctx, mode=mode) # type: ignore[arg-type] + if not engine.healthcheck(): + failing.append(mode) + except Exception: + failing.append(mode) + + if failing: + return JSONResponse( + status_code=503, + content={ + "help": str(request.url), + "success": False, + "result": {"status": "not_ready"}, + }, + ) return _success_response(request, StatusResponse.Result(status="ready")) diff --git a/datastore/api/responses.py b/datastore/api/responses.py index bce46d3..0269e43 100644 --- a/datastore/api/responses.py +++ b/datastore/api/responses.py @@ -25,18 +25,48 @@ def _help(request: Request) -> str: return str(request.url) +def _deprecation_warnings(payload: BaseModel) -> list[str]: + """Build body-level warnings from `Field(deprecated=...)` metadata. + + For every field the caller explicitly provided (`model_fields_set`) + whose declaration carries a `deprecated` string, emit one warning of + the form ``"'<name>' is deprecated — <message>."``. Pulling the + message off the model keeps the wording in one place — the field's + own declaration — so endpoints never duplicate it. + + `model_fields_set` is used instead of reading the value: it answers + "did the caller send this?" without invoking the field accessor, + which would itself emit a `DeprecationWarning` we don't want at + runtime. 
+ """ + out: list[str] = [] + for name in payload.model_fields_set: + msg = type(payload).model_fields[name].deprecated + if isinstance(msg, str) and msg: + out.append(f"'{name}' is deprecated — {msg}.") + return out + + def _success_response( request: Request, result: BaseModel | dict[str, Any], *, status_code: int = 200, + warnings: list[str] | None = None, ) -> ORJSONResponse: # `result` may be a Pydantic model or a plain dict; orjson's default # handler in `_orjson_default` dumps Pydantic models via `model_dump()`. - return ORJSONResponse( - {"help": _help(request), "success": True, "result": result}, - status_code=status_code, - ) + # `warnings` is non-fatal advisory text (e.g. deprecated-input notices) — + # surfaced at envelope level so any client reading the body sees them + # without having to parse the result block. Omitted when empty. + body: dict[str, Any] = { + "help": _help(request), + "success": True, + "result": result, + } + if warnings: + body["warnings"] = warnings + return ORJSONResponse(body, status_code=status_code) def _error_response( diff --git a/datastore/api/routes.py b/datastore/api/routes.py index 90fa176..da46d21 100644 --- a/datastore/api/routes.py +++ b/datastore/api/routes.py @@ -5,5 +5,7 @@ from datastore.api.endpoints import datastore, health api_router = APIRouter() -api_router.include_router(health.router) -api_router.include_router(datastore.router, prefix="/api/3/action") +api_router.include_router(health.welcome_router) +api_router.include_router(health.probe_router) +api_router.include_router(health.probe_router, prefix="/api/3/action") +api_router.include_router(datastore.router, prefix="/api/3/action") \ No newline at end of file diff --git a/datastore/auth/__init__.py b/datastore/auth/__init__.py new file mode 100644 index 0000000..3f45053 --- /dev/null +++ b/datastore/auth/__init__.py @@ -0,0 +1,7 @@ +"""Auth providers — pluggable authentication/authorization backends. 
+ +One subpackage per provider (`ckan/`, `jwt/`, `anonymous/`); each exports +`Provider = <ProviderClass>` so the registry can `importlib.import_module` +it without listing names statically. Add a new provider by dropping a +sibling folder with the same layout. +""" diff --git a/datastore/auth/anonymous/__init__.py b/datastore/auth/anonymous/__init__.py new file mode 100644 index 0000000..6b5a330 --- /dev/null +++ b/datastore/auth/anonymous/__init__.py @@ -0,0 +1,3 @@ +from datastore.auth.anonymous.provider import AnonymousAuthProvider as Provider + +__all__ = ["Provider"] diff --git a/datastore/auth/anonymous/provider.py b/datastore/auth/anonymous/provider.py new file mode 100644 index 0000000..5b8c612 --- /dev/null +++ b/datastore/auth/anonymous/provider.py @@ -0,0 +1,29 @@ +"""Anonymous provider — always allows, no identity. + +Use for local dev or CI without a real auth backend. Every call returns +an empty `Decision`; no signature, no claims, no resource metadata. +""" + +from __future__ import annotations + +from datastore.auth.base import Decision + + +class AnonymousAuthProvider: + name = "anonymous" + + def __init__(self, **_: object) -> None: + pass + + async def authorize( + self, + *, + credential: str | None, + resource_id: str | None, + package_id: str | None, + permission: str | None, + ) -> Decision: + return Decision() + + def key_id(self, credential: str) -> str: + return "anon" diff --git a/datastore/auth/base.py b/datastore/auth/base.py new file mode 100644 index 0000000..ee44ed2 --- /dev/null +++ b/datastore/auth/base.py @@ -0,0 +1,58 @@ +"""Auth provider contract — `AuthProvider` Protocol + `Decision` dataclass. + +A provider answers: is this credential allowed to do `permission` on +this `resource_id` / `package_id`? + +Providers RAISE `AuthorizationError` to deny; a returned `Decision` +always means allowed. 
`subject` and `claims` carry caller identity (when +known); `resource` and `package` carry CKAN-style metadata (CKAN +provider only — generic providers leave them None). +""" + +from __future__ import annotations + +import hashlib +from dataclasses import dataclass +from typing import Any, Protocol + + +@dataclass(slots=True, frozen=True) +class Decision: + subject: str | None = None + claims: dict[str, Any] | None = None + resource: dict[str, Any] | None = None + package: dict[str, Any] | None = None + + +class AuthProvider(Protocol): + """Auth provider interface. One instance per app, built in lifespan.""" + + name: str + + async def authorize( + self, + *, + credential: str | None, + resource_id: str | None, + package_id: str | None, + permission: str | None, + ) -> Decision: ... + + def key_id(self, credential: str) -> str: + """Stable, non-reversible id for cache keys. Raw credential never stored.""" + ... + + +def default_key_id(credential: str) -> str: + """sha256 prefix of the full credential string. + + Security note: deliberately ignores any embedded JWT `jti` claim. An + unverified `jti` from the token's payload can be forged to collide + with a cached authorization decision for a different (verified) + token — the cache lookup is keyed before signature verification, so + a forged `jti:` lookup would return the cached decision for + the legitimate user with the same `jti`. Hashing the whole + credential keeps the cache identity tied to bytes-on-the-wire and + makes any collision strictly equivalent to a sha256 collision. 
+ """ + return "h:" + hashlib.sha256(credential.encode()).hexdigest()[:16] diff --git a/datastore/auth/ckan/__init__.py b/datastore/auth/ckan/__init__.py new file mode 100644 index 0000000..525559c --- /dev/null +++ b/datastore/auth/ckan/__init__.py @@ -0,0 +1,3 @@ +from datastore.auth.ckan.provider import CKANAuthProvider as Provider + +__all__ = ["Provider"] diff --git a/datastore/auth/ckan/provider.py b/datastore/auth/ckan/provider.py new file mode 100644 index 0000000..b052b7e --- /dev/null +++ b/datastore/auth/ckan/provider.py @@ -0,0 +1,144 @@ +"""CKAN provider — defers to `/api/3/action/datastore_authorize`, with TTL cache. + +Caching is scoped to this provider: CKAN's `datastore_authorize` is a +network round-trip on every call, so wrapping it with a TTL cache cuts +duplicate work. Other providers (JWT signature check, anonymous no-op) +are local and cheap — they don't need caching, so the cache lives here +rather than in the orchestration layer. +""" + +from __future__ import annotations + +import logging +from typing import Any + +import orjson + +from datastore.auth.base import Decision, default_key_id +from datastore.core.exceptions import AuthorizationError +from datastore.infrastructure.cache import CachePort +from datastore.infrastructure.ckan_client import CKANClient + +log = logging.getLogger(__name__) + + +class CKANAuthProvider: + name = "ckan" + + def __init__( + self, + *, + ckan: CKANClient, + cache: CachePort, + cache_ttl: int, + **_: object, + ) -> None: + self._ckan = ckan + self._cache = cache + self._cache_ttl = cache_ttl + + async def authorize( + self, + *, + credential: str | None, + resource_id: str | None, + package_id: str | None, + permission: str | None, + ) -> Decision: + scope, target = ("res", resource_id) if resource_id else ("pkg", package_id) + assert target is not None # orchestration validates one-of upstream + cache_key = self._cache_key(credential, scope, target, permission) + + cached = await _safe_get(self._cache, 
cache_key) + if cached is not None: + try: + decision = _decision_from_bytes(cached) + log.debug( + "ckan auth cache HIT scope=%s target=%s perm=%s", + scope, target, permission, + ) + return decision + except (AuthorizationError, ValueError, TypeError) as e: + # Treat a corrupt cache entry as a miss — fall through + # to CKAN. Blocking auth on a poisoned cache would be a + # self-inflicted outage. + log.warning( + "ckan auth cache entry malformed for scope=%s target=%s: " + "%s — falling back to CKAN", + scope, target, e, + ) + + log.debug( + "ckan auth cache MISS scope=%s target=%s perm=%s -> CKAN", + scope, target, permission, + ) + ckan = self._ckan.bind(credential) + result = await ckan.datastore_authorize( + resource_id=resource_id, + package_id=package_id, + permission=permission, + ) + # `subject` rides through the cache (orjson-serialised). Never + # store the raw credential there — use the same hash we already + # derive for the cache key. + decision = Decision( + subject=self.key_id(credential) if credential else None, + resource=result.get("resource"), + package=result.get("package"), + ) + await _safe_set( + self._cache, cache_key, _decision_to_bytes(decision), self._cache_ttl, + ) + return decision + + def key_id(self, credential: str) -> str: + return default_key_id(credential) + + def _cache_key( + self, + credential: str | None, + scope: str, + target: str, + permission: str | None, + ) -> str: + key_id = self.key_id(credential) if credential else "anon" + return f"auth:ckan:{key_id}:{scope}:{target}:{permission}" + + +# --- cache plumbing ---------------------------------------------------------- +# Fail-open: cache failures must not block the request. We log and fall +# through to CKAN (a slow request is better than a wrong one). 
+ + +async def _safe_get(cache: CachePort, key: str) -> bytes | None: + try: + return await cache.get(key) + except Exception: # noqa: BLE001 + log.warning("ckan auth cache GET failed; falling back to CKAN", exc_info=True) + return None + + +async def _safe_set(cache: CachePort, key: str, value: bytes, ttl: int) -> None: + try: + await cache.set(key, value, ttl) + except Exception: # noqa: BLE001 + log.warning("ckan auth cache SET failed; skipping cache", exc_info=True) + + +def _decision_to_bytes(d: Decision) -> bytes: + return orjson.dumps( + {"subject": d.subject, "claims": d.claims, + "resource": d.resource, "package": d.package}, + ) + + +def _decision_from_bytes(value: bytes) -> Decision: + parsed: Any = orjson.loads(value) + if not isinstance(parsed, dict): + raise AuthorizationError("cached auth entry is malformed") + return Decision( + subject=parsed.get("subject"), + claims=parsed.get("claims"), + resource=parsed.get("resource"), + package=parsed.get("package"), + ) diff --git a/datastore/auth/jwt/__init__.py b/datastore/auth/jwt/__init__.py new file mode 100644 index 0000000..893c60c --- /dev/null +++ b/datastore/auth/jwt/__init__.py @@ -0,0 +1,3 @@ +from datastore.auth.jwt.provider import JWTAuthProvider as Provider + +__all__ = ["Provider"] diff --git a/datastore/auth/jwt/provider.py b/datastore/auth/jwt/provider.py new file mode 100644 index 0000000..d8f518e --- /dev/null +++ b/datastore/auth/jwt/provider.py @@ -0,0 +1,69 @@ +"""JWT provider — verifies signature + optional `aud` / `iss` claims. + +Verifies against the configured key (HS* secret or RS*/ES* PEM public +key). Decoded claims become `Decision.claims`; `sub` becomes `subject`. + +Does NOT contact any external service. Authorization is implicit: a +valid JWT = allowed. Endpoints that need finer-grained policy can +inspect `Decision.claims` themselves. 
+""" + +from __future__ import annotations + +import jwt +from jwt import InvalidTokenError, PyJWTError + +from datastore.auth.base import Decision, default_key_id +from datastore.core.config import Config +from datastore.core.exceptions import AuthorizationError + + +class JWTAuthProvider: + name = "jwt" + + def __init__(self, *, config: Config, **_: object) -> None: + algo = config.JWT_ALGORITHM + self._algorithm = algo + self._audience = config.JWT_AUDIENCE or None + self._issuer = config.JWT_ISSUER or None + if algo.startswith("HS"): + if not config.JWT_SECRET: + raise ValueError( + f"JWT_SECRET required when JWT_ALGORITHM={algo}" + ) + self._key: str = config.JWT_SECRET + else: + if not config.JWT_PUBLIC_KEY: + raise ValueError( + f"JWT_PUBLIC_KEY required when JWT_ALGORITHM={algo}" + ) + self._key = config.JWT_PUBLIC_KEY + + async def authorize( + self, + *, + credential: str | None, + resource_id: str | None, + package_id: str | None, + permission: str | None, + ) -> Decision: + if not credential: + raise AuthorizationError("Access denied: JWT token required") + try: + claims = jwt.decode( + credential, + self._key, + algorithms=[self._algorithm], + audience=self._audience, + issuer=self._issuer, + ) + except InvalidTokenError as exc: + raise AuthorizationError(f"invalid JWT: {exc}") from exc + except PyJWTError as exc: + raise AuthorizationError("JWT verification failed") from exc + sub = claims.get("sub") + subject = sub if isinstance(sub, str) else None + return Decision(subject=subject, claims=claims) + + def key_id(self, credential: str) -> str: + return default_key_id(credential) diff --git a/datastore/auth/registry.py b/datastore/auth/registry.py new file mode 100644 index 0000000..f69a0e0 --- /dev/null +++ b/datastore/auth/registry.py @@ -0,0 +1,29 @@ +"""Provider factory — dispatch by `Config.AUTH_TYPE` via importlib. + +Adding a new provider = drop `datastore/auth/<auth_type>/` with `__init__.py` +exporting `Provider = <ProviderClass>`. No edit here. 
+ +The lifespan calls this once at startup and stores the result on +`app.state.auth_provider`; there's no instance cache here on purpose — +the only cache in the auth path is the CKAN provider's per-decision +TTL cache (see `auth/ckan/provider.py`). +""" + +from __future__ import annotations + +import importlib +from typing import Any + +from datastore.auth.base import AuthProvider +from datastore.core.config import Config + + +def get_auth_provider(config: Config, **extras: Any) -> AuthProvider: + """Construct the provider for `config.AUTH_TYPE`. + + `extras` are forwarded to the provider constructor (e.g. `ckan=`, + `cache=`, `cache_ttl=`). Providers absorb unused kwargs via `**_`. + """ + module = importlib.import_module(f"datastore.auth.{config.AUTH_TYPE}") + provider: AuthProvider = module.Provider(config=config, **extras) + return provider diff --git a/datastore/core/config.py b/datastore/core/config.py index abcc2e8..788f0e6 100644 --- a/datastore/core/config.py +++ b/datastore/core/config.py @@ -4,12 +4,24 @@ from pathlib import Path from typing import Literal -from pydantic import Field, field_validator +from pydantic import Field, field_validator, model_validator from pydantic_settings import BaseSettings, SettingsConfigDict _ENGINES_DIR = ( Path(__file__).resolve().parent.parent / "infrastructure" / "engines" ) +_AUTH_DIR = Path(__file__).resolve().parent.parent / "auth" + + +def _subdirs(root: Path) -> set[str]: + if not root.is_dir(): + return set() + return { + p.name for p in root.iterdir() + if p.is_dir() + and not p.name.startswith(("_", ".")) + and p.name != "__pycache__" + } def _available_engines() -> set[str]: @@ -19,14 +31,12 @@ def _available_engines() -> set[str]: without listing them statically. Backends gitignored for local dev (e.g. a test engine) work locally and stay invisible upstream. 
""" - if not _ENGINES_DIR.is_dir(): - return set() - return { - p.name for p in _ENGINES_DIR.iterdir() - if p.is_dir() - and not p.name.startswith(("_", ".")) - and p.name != "__pycache__" - } + return _subdirs(_ENGINES_DIR) + + +def _available_auth_types() -> set[str]: + """Auth provider names = `datastore/auth//` directories on disk.""" + return _subdirs(_AUTH_DIR) class Config(BaseSettings): @@ -83,10 +93,60 @@ def _check_engine_available(cls, v: str) -> str: ) # BigQuery settings - BQ_PROJECT: str = Field( + BIGQUERY_PROJECT: str = Field( default="", description="Google Cloud project ID for BigQuery", ) + BIGQUERY_DATASET: str = Field( + default="", + description=( + "BigQuery dataset that holds the datastore tables. Both the " + "per-resource data tables and the internal `_table_metadata` " + "table live here. Required when DATASTORE_ENGINE=bigquery." + ), + ) + BIGQUERY_CREDENTIALS: str = Field( + default="", + description=( + "Service-account credentials for the read-write engine. " + "Either JSON blob or path to a service-account JSON file." + ), + ) + BIGQUERY_CREDENTIALS_RO: str = Field( + default="", + description=( + "Service-account credentials for the read-only engine. " + "Either JSON blob or path to a service-account JSON file." + ), + ) + BIGQUERY_USE_QUERY_CACHE: bool = Field( + default=True, + description=( + "Use BigQuery's built-in 24h query-results cache on read paths " + "(datastore_search / datastore_search_sql / datastore_info). " + "Identical, deterministic SELECTs return free + fast on cache " + "hits. Set False for freshness-sensitive deployments or to " + "force a fresh scan in tests." + ), + ) + + # Per-row system columns + INCLUDE_UPDATED_AT: bool = Field( + default=True, + description=( + "Add a `_updated_at` TIMESTAMP system column on each resource tables. " + ), + ) + + # Search + SEARCH_RESULT_ROWS_MAX: int = Field( + default=32000, + ge=1, + description=( + "Hard cap on `datastore_search` / `datastore_search_sql` `limit`. 
" + "Requests above this return 400." + ), + ) # Redis settings REDIS_URL: str = Field( @@ -106,22 +166,73 @@ def _check_engine_available(cls, v: str) -> str: description="Timeout for CKAN API requests in seconds", ) - # Authentication - AUTH_ENABLED: bool = Field( - default=True, - description="Enable CKAN-based authentication", + # Authentication. `AUTH_TYPE` selects the provider package under + # `datastore/auth//`. Drop a sibling folder to add one. + AUTH_TYPE: str = Field( + default="ckan", + description=( + "Auth provider — must match a `datastore/auth//` package. " + "Built-in: `ckan`, `jwt`, `anonymous` (no auth)." + ), ) AUTH_CACHE_TTL: int = Field( default=300, description="TTL for auth cache entries in seconds", ) + @field_validator("AUTH_TYPE") + @classmethod + def _check_auth_type(cls, v: str) -> str: + available = _available_auth_types() + if v not in available: + raise ValueError( + f"AUTH_TYPE={v!r} has no provider package; " + f"available: {sorted(available)}" + ) + return v + + # JWT settings (consumed by `datastore/auth/jwt` only). + JWT_ALGORITHM: Literal[ + "HS256", "HS384", "HS512", "RS256", "RS384", "RS512", "ES256", "ES384" + ] = Field( + default="HS256", + description=( + "JWT signing algorithm. HS* uses JWT_SECRET; " + "RS*/ES* uses JWT_PUBLIC_KEY (PEM)." + ), + ) + JWT_SECRET: str = Field( + default="", + description="HS* shared secret. Required when AUTH_TYPE=jwt and JWT_ALGORITHM=HS*.", + ) + JWT_PUBLIC_KEY: str = Field( + default="", + description="RS*/ES* PEM-encoded public key. Required for RS*/ES*.", + ) + JWT_AUDIENCE: str = Field( + default="", + description="Expected `aud` claim. Empty = skip audience check.", + ) + JWT_ISSUER: str = Field( + default="", + description="Expected `iss` claim. 
Empty = skip issuer check.", + ) + # Logging LOG_LEVEL: Literal["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"] = Field( default="INFO", description="Logging level", ) + @model_validator(mode="after") + def _check_ckan_url_required_for_ckan_auth(self) -> Config: + if self.AUTH_TYPE == "ckan" and not self.CKAN_URL: + raise ValueError( + "CKAN_URL must be set when AUTH_TYPE=ckan " + "(use AUTH_TYPE=anonymous or jwt to run standalone)" + ) + return self + @lru_cache diff --git a/datastore/core/constants.py b/datastore/core/constants.py index f3bc6b3..6a8ceae 100644 --- a/datastore/core/constants.py +++ b/datastore/core/constants.py @@ -1,4 +1,3 @@ - from __future__ import annotations POSTGRES_TYPES: dict[str, str] = { @@ -59,3 +58,77 @@ } +# Canonical Postgres type → Frictionless field type. Used when converting +# the legacy `fields` shape into a Frictionless schema for `datastore_create`. +# Many-to-one on purpose: all width-variants of integer map to `integer`, +# all of timestamp to `datetime`, etc. Anything without a closer match +# falls through to `string`. +POSTGRES_TO_FRICTIONLESS: dict[str, str] = { + "int2": "integer", + "int4": "integer", + "int8": "integer", + "float4": "number", + "float8": "number", + "numeric": "number", + "bool": "boolean", + "text": "string", + "varchar": "string", + "char": "string", + "bytea": "string", + "date": "date", + "time": "time", + "timetz": "time", + "timestamp": "datetime", + "timestamptz": "datetime", + "json": "object", + "jsonb": "object", + "uuid": "string", + "inet": "string", + "cidr": "string", + "macaddr": "string", + "xml": "string", +} + + +# Frictionless field type → canonical Postgres type. Used when deriving the +# legacy `fields` shape from a Frictionless schema. Lossy in the other +# direction (e.g., a Frictionless `integer` could have been any width); we +# pick the widest/most-permissive Postgres type so the column accepts +# anything the Frictionless type implies. 
+FRICTIONLESS_TO_POSTGRES: dict[str, str] = { + "integer": "int8", + "number": "numeric", + "boolean": "bool", + "string": "text", + "date": "date", + "time": "timetz", + "datetime": "timestamptz", + "object": "jsonb", + "array": "jsonb", + "geojson": "jsonb", + "geopoint": "text", + "any": "text", +} + + +# Frictionless field types the datastore accepts. A narrow, opinionated +# subset of the full Frictionless vocabulary — values outside this set +# (`duration`, `year`, `yearmonth`, …) are rejected at the request +# boundary so storage layout stays predictable and engine type maps +# don't need to grow ad-hoc. +ALLOWED_FRICTIONLESS_TYPES: frozenset[str] = frozenset({ + "integer", "number", "boolean", "string", + "date", "time", "datetime", + "object", "array", + "geojson", "geopoint", + "any", +}) + + +# Field names reserved for engine-managed system columns. User schemas +# that try to declare these must be rejected at the request boundary — +# silently dropping them would leave the response advertising a column +# the engine refuses to populate. +RESERVED_SYSTEM_COLUMN_NAMES: frozenset[str] = frozenset({ + "_id", "_updated_at", +}) diff --git a/datastore/infrastructure/engines/base.py b/datastore/infrastructure/engines/base.py index 78b1519..9949ff4 100644 --- a/datastore/infrastructure/engines/base.py +++ b/datastore/infrastructure/engines/base.py @@ -1,6 +1,9 @@ +from __future__ import annotations + from abc import ABC, abstractmethod +from collections.abc import Iterator from dataclasses import dataclass -from typing import Iterator +from typing import Any, Protocol, runtime_checkable @dataclass @@ -13,14 +16,15 @@ class SearchResult: returns. Don't materialise this iterator anywhere except inside the streaming serialiser. """ - fields: list[dict] # [{"id": "col_name", "type": "text"}, ...] 
- records: Iterator[tuple] + + schema: dict[str, Any] # {"fields": [{"name": "col", "type": "string"}, ...]} + records: Iterator[tuple[Any, ...]] total: int | None = None records_truncated: bool = False @property def columns(self) -> list[str]: - return [f["id"] for f in self.fields] + return [f["name"] for f in self.schema.get("fields", [])] @dataclass(slots=True) @@ -29,27 +33,103 @@ class WriteResult: total: int | None = None -class DatastoreBackend(ABC): +@dataclass +class InfoResult: + """Table metadata returned by `datastore_info`. + + + """ + + schema: dict[str, Any] + meta: dict[str, Any] + + +@runtime_checkable +class MetadataStore(Protocol): + """Per-engine storage for table-level metadata. + + Holds one row per `resource_id`, keyed by the resource_id itself. The + canonical column shape is `(resource_id, schema, created_at, + updated_at)` where `schema` is a Frictionless Table Schema dict. + + Each engine subpackage provides a concrete implementation + (e.g. `bigquery/metadata.py: BigQueryMetadataStore`) so the SQL + dialect, connection management, and column types stay engine-private. + The backend constructs its store at startup, calls `initialize()` + once to create the underlying table, and calls `insert` / `update` + from `create()` whenever a caller declares or re-declares a resource. + + Adding a new engine = drop a sibling `metadata.py` implementing this + Protocol; the backend wires it in by holding `self.metadata`. + """ + def initialize(self) -> None: + """Create the metadata table if it doesn't exist. Idempotent.""" + + def insert(self, resource_id: str, schema: dict[str, Any]) -> None: + """Insert a new metadata row for `resource_id`. + + Sets `created_at` and `updated_at` to now. Fails if a row with + the same `resource_id` already exists — that's a real conflict + that callers should surface (a second `datastore_create` for an + already-declared resource). 
+ """ + + def update(self, resource_id: str, schema: dict[str, Any]) -> None: + """Update the metadata row for `resource_id`. + + Replaces `schema` and bumps `updated_at`; `created_at` is + preserved. Keyed on `resource_id`; no-op when the row is absent. + """ + + def get(self, resource_id: str) -> dict[str, Any] | None: + """Return the stored Frictionless schema for `resource_id`, + or `None` when no row exists.""" + + def delete(self, resource_id: str) -> None: + """Remove the metadata row for `resource_id`. No-op when absent.""" + + +class DatastoreBackend(ABC): @abstractmethod def initialize(self) -> None: """Called on app startup to set up connections.""" @abstractmethod - def create(self, resource_id: str, fields: list, unique_keys: list, - records: list | None, include_total: bool) -> WriteResult: + def create( + self, + resource_id: str, + schema: dict[str, Any], + records: list[dict[str, Any]] | None, + include_total: bool, + ) -> WriteResult: """Create/alter table, optionally with bulk insert. + `schema` is a Frictionless Table Schema descriptor — the service + normalises both the legacy `fields` input and a caller-supplied + Frictionless schema down to this shape before dispatch. Engines + read columns from `schema["fields"]` and the unique key from + `schema.get("primaryKey")`. + `include_total=True` → after the insert, recompute and return the total row count via `WriteResult.total`. `False` → leave it `None`. 
""" @abstractmethod - def search(self, resource_id: str, filters: dict | None, - q: str | dict | None, - distinct: bool, plain: bool, language: str, limit: int, - offset: int, fields: list | None, sort: str | None, - include_total: bool) -> SearchResult: + def search( + self, + resource_id: str, + filters: dict[str, Any] | None, + q: str | dict[str, Any] | None, + distinct: bool, + plain: bool, + language: str, + limit: int, + offset: int, + fields: list[str] | None, + sort: str | None, + include_total: bool, + ) -> SearchResult: """Query records. Returns SearchResult with lazy row iterator. `q` is a CKAN-style full-text query: `str` scans every text column, @@ -58,8 +138,13 @@ def search(self, resource_id: str, filters: dict | None, """ @abstractmethod - def upsert(self, resource_id: str, records: list, method: str, - include_total: bool) -> WriteResult: + def upsert( + self, + resource_id: str, + records: list[dict[str, Any]], + method: str, + include_total: bool, + ) -> WriteResult: """Insert / update / upsert records. `include_total=True` → after the write, recompute and return the @@ -71,12 +156,19 @@ def search_sql(self, sql: str, limit: int) -> SearchResult: """Execute raw SQL SELECT. Returns SearchResult with lazy row iterator.""" @abstractmethod - def delete(self, resource_id: str, filters: dict | None) -> WriteResult: - """Delete records (filtered) or drop table (no filters).""" + def delete( + self, + resource_id: str, + filters: dict[str, Any] | None, + fields: list[str] | None = None, + ) -> WriteResult: + """Drop the table (both None), delete rows by `filters`, or + drop columns by `fields`. 
`filters` and `fields` are mutually + exclusive.""" @abstractmethod - def info(self, resource_id: str) -> dict: - """Return table metadata: fields with types, primary_key, row count.""" + def info(self, resource_id: str) -> InfoResult: + """Return table metadata: column schema + free-form `meta` dict.""" @abstractmethod def get_columns(self, resource_id: str) -> list[str]: diff --git a/datastore/infrastructure/engines/bigquery/__init__.py b/datastore/infrastructure/engines/bigquery/__init__.py index 1c48fa3..df36917 100644 --- a/datastore/infrastructure/engines/bigquery/__init__.py +++ b/datastore/infrastructure/engines/bigquery/__init__.py @@ -7,4 +7,6 @@ from datastore.infrastructure.engines.bigquery.backend import BigQueryBackend -__all__ = ["BigQueryBackend"] +Backend = BigQueryBackend + +__all__ = ["Backend", "BigQueryBackend"] diff --git a/datastore/infrastructure/engines/bigquery/backend.py b/datastore/infrastructure/engines/bigquery/backend.py index cb47517..1c6cfa6 100644 --- a/datastore/infrastructure/engines/bigquery/backend.py +++ b/datastore/infrastructure/engines/bigquery/backend.py @@ -1,53 +1,504 @@ +"""BigQuery backend. + +Public surface is `BigQueryBackend` — the BigQuery `DatastoreBackend` implementation. +File layout (top to bottom): + + 1. Lifecycle (`__init__`, `initialize`). + 2. Low-level client wrappers (`_data_table_ref`, `_run_query`) — + the create / write helpers route through `_run_query` so transport / + SQL errors surface as `ServerError` with `resource_id` + operation + name baked in; `search` submits its own jobs and wraps `.result()`. + 3. Create helpers (`_create_data_table`, `_alter_data_table`, + `_insert_records`, and the branch helpers `_apply_new_resource` / + `_apply_existing_resource`). + 4. CKAN action methods (`create`, `upsert`, `search`, `search_sql`, + `delete`, `info`, `get_columns`, `healthcheck`). 
+""" + from __future__ import annotations +import logging from typing import Any +from datastore.core.config import Config +from datastore.core.exceptions import ( + NotFoundError, + ServerError, + ValidationError, +) from datastore.infrastructure.engines.base import ( DatastoreBackend, + InfoResult, + MetadataStore, SearchResult, WriteResult, ) +from datastore.infrastructure.engines.bigquery.lib import ( + SYSTEM_COLUMN_NAMES, + alter_clauses, + column_defs, + delete_sql, + drop_columns_sql, + insert_sql, + merge_sql, + qualify_table_refs, + reject_unsupported_type_changes, + schema_diff, + strip_limit_offset, + unfiltered_table_name, + update_sql, +) + +log = logging.getLogger(__name__) class BigQueryBackend(DatastoreBackend): + # ----- lifecycle ------------------------------------------------------ def __init__( self, *, context: Any = None, + config: Config | None = None, mode: str = "rw", ) -> None: self.mode = mode self.context = context + self.config = config self.client: Any = None + # `metadata` is set in `initialize()` once the client is built. + # Stays `None` in placeholder mode (no BIGQUERY_PROJECT / + # BIGQUERY_DATASET) so the rest of the app can boot — `create()` + # skips the data + metadata writes in that mode rather than crash. + self.metadata: MetadataStore | None = None def initialize(self) -> None: - """Initialize the BigQuery client.""" - pass + """Build the BigQuery client when configured; no-op otherwise. + + Lenient on missing config: if `BIGQUERY_PROJECT` is unset, log a + warning and leave `client=None`. Lets the rest of the app boot + without real GCP creds — `/ready` will return 503 (healthcheck + returns False with no client) so the misconfiguration is loud + enough in production without being fatal at import time. + + When the client is built, also constructs the `MetadataStore` + and runs its `initialize()` so the `_table_metadata` table + exists. 
Only the read-write engine creates DDL — the read-only + engine constructs the store for `get()` but skips `initialize()` + so it doesn't need CREATE privileges. + """ + if self.config is None or not self.config.BIGQUERY_PROJECT.strip(): + log.warning( + "BigQueryBackend: BIGQUERY_PROJECT unset (mode=%s); client " + "not built — /ready will return 503 until configured.", + self.mode, + ) + return + from datastore.infrastructure.engines.bigquery.client import build_client + from datastore.infrastructure.engines.bigquery.metadata import ( + BigQueryMetadataStore, + ) + + self.client = build_client(self.config, self.mode) + log.info( + "BigQuery client initialised: project=%s mode=%s", + self.config.BIGQUERY_PROJECT, self.mode, + ) + + dataset = self.config.BIGQUERY_DATASET.strip() + if not dataset: + log.warning( + "BigQueryBackend: BIGQUERY_DATASET unset (mode=%s); " + "metadata store disabled — `datastore_create` will not " + "record per-resource schemas until configured.", + self.mode, + ) + return + + self.metadata = BigQueryMetadataStore( + client=self.client, + project=self.config.BIGQUERY_PROJECT, + dataset=dataset, + ) + if self.mode == "rw": + self.metadata.initialize() + + # ----- table refs + low-level client wrappers ------------------------ + + @property + def _include_updated_at(self) -> bool: + """Read the `_updated_at` system-column toggle from config. + + Defaults to `True` when no config is attached (test scaffolds + that build the backend directly without `initialize()`). + """ + return getattr(self.config, "INCLUDE_UPDATED_AT", True) + + def _data_table_ref(self, resource_id: str) -> str: + """Backtick-quoted `project.dataset.` for SQL. + + Backticks make resource_ids with hyphens (CKAN UUIDs) parse + without further escaping. 
+ """ + return ( + f"`{self.config.BIGQUERY_PROJECT}" + f".{self.config.BIGQUERY_DATASET}.{resource_id}`" + ) + + def _read_job_config(self, params: list | None = None) -> Any: + """QueryJobConfig for read paths — enables BigQuery's query cache. + + BigQuery caches the result of every deterministic SELECT for + ~24h; an identical query hits the cache and returns free + fast + (no bytes scanned, sub-100ms typically). The flag is on by + default in BigQuery, but every read site builds its config + through this helper so: + - the read-side contract is explicit in the code, + - the `BIGQUERY_USE_QUERY_CACHE` opt-out actually flows + through to the wire (e.g. integration tests that need + a fresh scan can set it to False). + + Write paths (DDL / DML) don't go through this — BigQuery's + cache only applies to SELECT anyway. + """ + from google.cloud import bigquery + return bigquery.QueryJobConfig( + query_parameters=params or [], + use_query_cache=getattr( + self.config, "BIGQUERY_USE_QUERY_CACHE", True, + ), + ) + + def _run_query( + self, + sql: str, + *, + op: str, + resource_id: str, + job_config: Any = None, + ) -> Any: + """Submit `sql`, wait for completion, and return the QueryJob. + + Wraps every `client.query` call so any + `google.api_core` / transport error becomes a CKAN-shaped + `ServerError` carrying the action name (`op`) and target + `resource_id`. Callers never have to know about Google's + exception hierarchy. + + Returning the `QueryJob` (rather than its `.result()` value) + lets callers grab whichever output they need without a second + helper: rows from `job.result()`, DML row counts from + `job.num_dml_affected_rows`. DDL / MERGE callers simply ignore + the return value — the `.result()` call inside has already + waited for completion. 
+ """ + try: + job = self.client.query(sql, job_config=job_config) + job.result() + return job + except Exception as e: + raise ServerError( + f"BigQuery {op} failed for resource {resource_id!r}: {e}" + ) from e + + # ----- create helpers (DDL + records + branch orchestration) -------- + def _create_data_table(self, resource_id: str, schema: dict) -> None: + """`CREATE TABLE IF NOT EXISTS` with columns derived from the + Frictionless schema. Idempotent — a second call on the same + resource is a no-op DDL on the BigQuery side.""" + cols = column_defs(schema, include_updated_at=self._include_updated_at) + if not cols: + log.warning( + "BigQueryBackend.create: schema for %r has no fields; " + "skipping CREATE TABLE.", + resource_id, + ) + return + sql = ( + f"CREATE TABLE IF NOT EXISTS {self._data_table_ref(resource_id)} " + f"({', '.join(cols)})" + ) + self._run_query(sql, op="CREATE TABLE", resource_id=resource_id) + log.info("BigQuery table created: %s", resource_id) + + def _alter_data_table( + self, resource_id: str, old_schema: dict, new_schema: dict + ) -> None: + """Apply the schema diff as DDL. + + Three diff classes: + - **Added columns** → `ALTER TABLE ADD COLUMN IF NOT EXISTS`. + - **Type changes** → `ALTER TABLE ALTER COLUMN SET DATA TYPE` + when BigQuery accepts the transition (`types.can_widen`). + Unsupported transitions raise `ConflictError` BEFORE any + DDL runs so a single bad column can't half-apply the others. + - **Removed columns** → logged and skipped; dropping a column + would lose user data on a metadata edit. + + All ADD / ALTER clauses go in a single `ALTER TABLE` statement + so BigQuery applies them atomically. 
+ """ + added, type_changes, removed = schema_diff(old_schema, new_schema) + reject_unsupported_type_changes(type_changes) + + if removed: + log.info( + "BigQueryBackend.alter: columns %s dropped from schema " + "for %r — keeping BigQuery columns to preserve rows.", + removed, resource_id, + ) + + clauses = alter_clauses(added, type_changes, new_schema) + if not clauses: + return + sql = ( + f"ALTER TABLE {self._data_table_ref(resource_id)} " + f"{', '.join(clauses)}" + ) + self._run_query(sql, op="ALTER TABLE", resource_id=resource_id) + log.info( + "BigQuery table altered: %s (added=%s, type_changes=%s)", + resource_id, added, type_changes, + ) + def _insert_records( + self, resource_id: str, schema: dict, records: list + ) -> None: + """Insert rows via DML `INSERT INTO ... SELECT FROM UNNEST(@rows)`. + + Why DML rather than `Client.insert_rows_json`: the streaming + insert API parks rows in a streaming buffer for 30–90 minutes, + and DML statements (UPDATE / DELETE / MERGE) cannot touch rows + still in that buffer. That makes `datastore_create` + immediate + `datastore_upsert` impossible. DML INSERT writes straight to + table storage, so any follow-up upsert/update on the same + primaryKey works without delay. + + Rows ride as a single JSON-array string parameter `@rows`; + BigQuery unpacks it inside the SQL — one statement regardless + of batch size, no Python-side serialisation pass needed (JSON + columns are handled by `PARSE_JSON(JSON_QUERY(...))` inside + the SELECT). + + Empty `records` is a no-op. SQL/transport errors propagate as + `ServerError` via `_run_query`. 
+ """ + import orjson + + if not records: + return + try: + sql = insert_sql( + self._data_table_ref(resource_id), + schema, + include_updated_at=self._include_updated_at, + ) + except ValueError as e: + raise ValidationError(str(e)) from e + + from google.cloud import bigquery + + # `MAX(_id)` is computed inline in the INSERT SQL — saves a + # separate round-trip per call (the older two-statement form + # cost ~1s of BigQuery job overhead for nothing). + job_config = bigquery.QueryJobConfig( + query_parameters=[ + bigquery.ScalarQueryParameter( + "rows", "STRING", orjson.dumps(records).decode("utf-8") + ), + ] + ) + try: + self._run_query( + sql, op="INSERT", resource_id=resource_id, + job_config=job_config, + ) + except ServerError as e: + raise _translate_bigquery_error( + e, resource_id, "insert" + ) from e + log.info( + "BigQuery rows inserted: %s (%d row(s))", + resource_id, len(records), + ) + + def _merge_records( + self, resource_id: str, schema: dict, records: list + ) -> None: + """Upsert rows via `MERGE` keyed on `schema.primaryKey`. + + Rows whose primary-key columns match an existing row are + UPDATEd; others are INSERTed. The full payload travels as a + single JSON-array string parameter so we issue one statement + regardless of batch size. + + Empty `records` is a no-op. Missing primary key on the stored + schema raises `ValidationError` — upsert can't dedup without + one; the caller can fall back to `method="insert"` or declare + a primaryKey on the resource. + """ + import orjson + + if not records: + return + try: + sql = merge_sql( + self._data_table_ref(resource_id), + schema, + include_updated_at=self._include_updated_at, + ) + except ValueError as e: + raise ValidationError(str(e)) from e + + from google.cloud import bigquery + + # `MAX(_id)` is inlined in the MERGE's WHEN NOT MATCHED clause + # so the upsert is a single round-trip. 
+ job_config = bigquery.QueryJobConfig( + query_parameters=[ + bigquery.ScalarQueryParameter( + "rows", "STRING", orjson.dumps(records).decode("utf-8") + ), + ] + ) + try: + self._run_query( + sql, op="MERGE", resource_id=resource_id, + job_config=job_config, + ) + except ServerError as e: + raise _translate_bigquery_error(e, resource_id, "upsert") from e + log.info( + "BigQuery rows upserted: %s (%d row(s))", + resource_id, len(records), + ) + + def _update_records( + self, resource_id: str, schema: dict, records: list + ) -> None: + """Update existing rows via DML `UPDATE`, keyed on + `schema.primaryKey`. + + Update-only semantics: every row in `records` must match an + existing row by primary key. After the statement runs we + compare `num_dml_affected_rows` against the row count and + raise `NotFoundError` if any row had no matching key — DML + UPDATE itself treats misses as a silent no-op, so the count + check is what gives the caller a real signal. + + Empty `records` is a no-op. Missing primary key or all-PK + schema raises `ValidationError` (via `update_sql`'s + `ValueError` re-raise). 
+ """ + import orjson + + if not records: + return + try: + sql = update_sql( + self._data_table_ref(resource_id), + schema, + include_updated_at=self._include_updated_at, + ) + except ValueError as e: + raise ValidationError(str(e)) from e + + from google.cloud import bigquery + + job_config = bigquery.QueryJobConfig( + query_parameters=[ + bigquery.ScalarQueryParameter( + "rows", "STRING", orjson.dumps(records).decode("utf-8") + ), + ] + ) + try: + job = self._run_query( + sql, op="UPDATE", resource_id=resource_id, + job_config=job_config, + ) + except ServerError as e: + raise _translate_bigquery_error(e, resource_id, "update") from e + affected = job.num_dml_affected_rows or 0 + if affected < len(records): + missing = len(records) - affected + raise NotFoundError( + f"datastore_update: {missing} of {len(records)} row(s) " + f"had no matching primary key in resource {resource_id!r}; " + "use method='upsert' to insert missing rows" + ) + log.info( + "BigQuery rows updated: %s (%d row(s))", resource_id, affected, + ) + + def _apply_new_resource( + self, resource_id: str, schema: dict, records: list + ) -> None: + """First-time declaration: create the table, seed it, record it. + + `metadata.insert` is the final step so any failure earlier + leaves the metadata store untouched and the resource appears + un-declared on retry. + """ + assert self.metadata is not None + self._create_data_table(resource_id, schema) + self._insert_records(resource_id, schema, records) + self.metadata.insert(resource_id, schema) + + def _apply_existing_resource( + self, + resource_id: str, + old_schema: dict, + new_schema: dict, + records: list, + ) -> None: + """Re-declaration on an existing resource: migrate the table, + append rows, then update the metadata row. + + If alter OR the record insert raises, `metadata.update` is + skipped and the metadata stays at the old schema version. 
+ """ + assert self.metadata is not None + self._alter_data_table(resource_id, old_schema, new_schema) + self._insert_records(resource_id, new_schema, records) + self.metadata.update(resource_id, new_schema) + + # ----- CKAN action methods ------------------------------------------- def create( self, resource_id: str, - fields: list, - unique_keys: list, + schema: dict, records: list | None, include_total: bool, ) -> WriteResult: - """Create/alter table, optionally with records insert. + """Declare a resource: DDL → records insert → metadata write. + + The order is load-bearing — see `_apply_new_resource` / + `_apply_existing_resource` for the per-branch sequence. Any + failure short-circuits before the metadata write so the + metadata row never describes a state the actual table doesn't + match. - Placeholder: echoes inputs. Real impl (Phase 8) issues - `CREATE TABLE IF NOT EXISTS`, bulk-inserts records, and runs - `COUNT(*)` when `include_total=True`. + Placeholder mode (no project/dataset) is a no-op echo so the + unit suite can exercise the call path without GCP creds. """ + if self.metadata is not None: + existing = self.metadata.get(resource_id) + rows = records or [] + if existing is None: + self._apply_new_resource(resource_id, schema, rows) + else: + self._apply_existing_resource( + resource_id, existing, schema, rows + ) + return { - "fields": fields, + "schema": schema, "records": records, - "unique_keys": unique_keys, "include_total": include_total, - "total": len(records) if include_total else None, + "total": len(records or []) if include_total else None, } - def upsert( self, resource_id: str, @@ -55,19 +506,64 @@ def upsert( method: str, include_total: bool, ) -> WriteResult: - """Insert / update / upsert records. - Placeholder: echoes inputs so the call path is exercised end-to-end: - - "insert" → `insert_rows_json` (or DML INSERT for large batches) - - "update" → DML `UPDATE ... 
WHERE IN @keys` - - "upsert" → `MERGE` with `UNNEST(@records)` as source - and runs `COUNT(*)` when `include_total=True`. + """Insert / update / upsert records into an existing resource. + + Method dispatch: + - **"upsert"** (default): `MERGE` keyed on `schema.primaryKey`. + Rows that match an existing key are UPDATEd; the rest are + INSERTed. Requires a `primaryKey` on the stored schema. + - **"insert"**: plain DML `INSERT` (no PK check). Faster + than upsert; raises if any row collides with an existing + primary key (BigQuery row-level errors). + - **"update"**: DML `UPDATE` keyed on `schema.primaryKey`. + Every row must match an existing row — otherwise + `NotFoundError` is raised after the statement runs. Requires + a `primaryKey`. + + The resource must have been declared by `datastore_create` + first; the schema (column types + primaryKey) is read from the + metadata store and used to build the SQL. Calling `upsert` on + an undeclared resource raises `NotFoundError`. + + Placeholder mode (no project/dataset) is a no-op echo so the + unit suite can exercise the call path without GCP creds. + """ + if self.metadata is None: + # Placeholder mode — echo (matches the create() pattern). 
+ return { + "resource_id": resource_id, + "records": records, + "method": method, + "include_total": include_total, + "total": len(records or []), + } + + schema = self.metadata.get(resource_id) + if schema is None: + raise NotFoundError( + f"resource {resource_id!r} is not declared; call " + "datastore_create before upsert" + ) + + rows = records or [] + if method == "insert": + self._insert_records(resource_id, schema, rows) + elif method == "upsert": + self._merge_records(resource_id, schema, rows) + elif method == "update": + self._update_records(resource_id, schema, rows) + else: + raise ValidationError( + f"unknown upsert method {method!r}; expected one of " + "'upsert', 'insert', 'update'" + ) + return { "resource_id": resource_id, "records": records, "method": method, "include_total": include_total, - "total": len(records) + "total": len(rows) if include_total else None, } def search( @@ -84,52 +580,617 @@ def search( sort: str | None, include_total: bool, ) -> SearchResult: - """Query records. Returns SearchResult with lazy row iterator. + """Run a parameterised SELECT against the data table. + + Pipeline: + 1. Resolve schema from `_table_metadata` (404 if undeclared). + 2. Build search + (optional) count SQL via `search.py`. + Validation of `fields` / `sort` / `filters` / `q` columns + happens inside the builders so a bad request becomes a + clean 400, never reaches BigQuery. + 3. Submit both queries. When only an unfiltered total is + needed, fall back to `__TABLES__.row_count` — free vs the + COUNT(*) billing. + 4. Return a row iterator that yields tuples in projection + order; memory stays bounded by the RowIterator's page + size, not the result set size. - Placeholder: returns an empty result set so the call path is - exercised end-to-end. 
Real impl (Phase 8) builds a parameterised - SELECT honouring `filters` / `q` / `distinct` / `sort`, optionally - runs `COUNT(*)` when `include_total=True`, and yields tuples - page-by-page from `query_job.result()`. + `plain` and `language` are accepted for CKAN compatibility but + currently have no effect on the BigQuery side — `SEARCH()` + tokenises uniformly regardless of `plain`, and we don't expose + the analyzer arg. + + Placeholder mode (no metadata store) returns an empty result so + the unit suite can exercise the call path without GCP creds. """ - column_metadata: list[dict] = ( - [{"id": c, "type": "any"} for c in fields] if fields else [] + from datastore.infrastructure.engines.bigquery.search import ( + build_count, + build_search, + needs_count_query, ) + + if self.metadata is None: + # Placeholder mode (no GCP creds) — echo the requested + # field shape so the unit suite can exercise the streaming + # writer + envelope plumbing without a real backend. + stub_schema = { + "fields": [ + {"name": c, "type": "any"} for c in (fields or []) + ], + } + return SearchResult( + schema=stub_schema, + records=iter([]), + total=0 if include_total else None, + records_truncated=False, + ) + + schema = self.metadata.get(resource_id) + if schema is None: + raise NotFoundError( + f"resource {resource_id!r} is not declared; call " + "datastore_create first" + ) + + try: + sql, params, projected = build_search( + table_ref=self._data_table_ref(resource_id), + schema=schema, + include_updated_at=self._include_updated_at, + fields=fields, + filters=filters, + q=q, + distinct=distinct, + sort=sort, + limit=limit, + offset=offset, + ) + except ValueError as e: + raise ValidationError(str(e)) from e + + # Read-path configs use the query-results cache (see + # _read_job_config). Identical search params hit a 24h cache + # entry — free + fast on the second call. 
+ job_config = self._read_job_config(params=params) + + # Fire both jobs before waiting on either: BigQuery's + # `client.query()` is non-blocking, so the count and the page + # query run in parallel — wall time ≈ max(both). + count_job = None + if include_total and needs_count_query( + filters=filters, q=q, distinct=distinct, + ): + count_sql, count_params = build_count( + table_ref=self._data_table_ref(resource_id), + schema=schema, + include_updated_at=self._include_updated_at, + fields=fields, + filters=filters, + q=q, + distinct=distinct, + ) + count_cfg = self._read_job_config(params=count_params) + count_job = self.client.query(count_sql, job_config=count_cfg) + + search_job = self.client.query(sql, job_config=job_config) + + try: + row_iter = search_job.result() + except Exception as e: + raise ServerError( + f"BigQuery search failed for resource {resource_id!r}: {e}" + ) from e + + total: int | None = None + if include_total: + if count_job is None: + # Unfiltered + non-distinct → metadata row_count (free). + total = self._count_rows(resource_id) + else: + try: + rows = list(count_job.result()) + except Exception as e: + raise ServerError( + f"BigQuery search COUNT failed for resource " + f"{resource_id!r}: {e}" + ) from e + total = int(rows[0]["n"]) if rows else 0 + return SearchResult( - fields=column_metadata, - records=iter([]), - total=0 if include_total else None, + schema=projected, + records=(tuple(row.values()) for row in row_iter), + total=total, records_truncated=False, ) def search_sql(self, sql: str, limit: int) -> SearchResult: - """Execute raw SQL SELECT. Returns SearchResult with lazy row iterator. + """Execute a vetted SELECT/WITH statement and stream tuples. + + Safety relies on three layers, none of which this method itself + re-checks (validation already happened upstream): + 1. The request schema rejects non-SELECT / multi-statement + / unparseable SQL (`schemas/request.py:DatastoreSearchSQLRequest`). + 2. 
The endpoint authorises every referenced table against + CKAN as a resource_id, and the service rejects function + calls outside the engine's allow-list. + 3. **The load-bearing guard:** this engine is built with the + read-only credential (`mode="ro"` selects `BIGQUERY_CREDENTIALS_RO`), + so BigQuery IAM physically refuses any DML / DDL even if + upstream checks were bypassed. The assertion below catches + the dev mistake of dispatching `search_sql` through the + rw engine. - Placeholder: returns an empty result set. Real impl will call - `client.query(sql, job_config=…)` and yield tuples from - `query_job.result()` page-by-page, setting `records_truncated=True` - if the iterator hit `limit`. + Result schema is read from BigQuery's job schema (column types + come back as BQ types and are mapped to Frictionless via + `frictionless_type_from_bigquery`). Row output is bounded by + `limit` via `itertools.islice` so a runaway SELECT without an + embedded LIMIT can't pin the streaming response open forever. """ + from itertools import islice + + from datastore.infrastructure.engines.bigquery.types import ( + frictionless_type_from_bigquery, + ) + + if self.client is None: + return SearchResult( + schema={"fields": []}, + records=iter([]), + records_truncated=False, + ) + + if self.mode != "ro": + raise ServerError( + "datastore_search_sql must run on a read-only engine; " + "got mode=" + repr(self.mode) + ) + + # User refers to tables by their CKAN resource_id; BigQuery + # needs a fully-qualified `project.dataset.table` reference + # with backticks. The qualifier walks the AST, prepends the + # configured project + dataset to every non-CTE table ref, + # and re-emits as BigQuery dialect. 
+ try: + qualified_sql = qualify_table_refs( + sql, + project=self.config.BIGQUERY_PROJECT, + dataset=self.config.BIGQUERY_DATASET, + ) + except Exception as e: + raise ServerError( + f"failed to qualify table references in SQL: {e}" + ) from e + + # Pick the cheapest viable path for `total`: + # + # 1. Plain `SELECT cols FROM table [LIMIT/OFFSET]` (no + # WHERE/GROUP/JOIN/aggregate) → read `total_rows` from + # `INFORMATION_SCHEMA.TABLE_STORAGE`. Free metadata query, + # no bytes scanned. + # + # 2. Anything that filters, joins, aggregates, or otherwise + # changes row count → wrap the user's SQL (LIMIT/OFFSET + # stripped) in `SELECT COUNT(*) FROM (...)`. Same pattern + # datastore_search uses for filtered/distinct queries. + # + # `RowIterator.total_rows` alone won't do — it's the row count + # of the destination temp table (post-LIMIT page size), so + # building pagination from it would always say "last page". + count_sql: str | None + count_params: list = [] + try: + table = unfiltered_table_name(qualified_sql) + if table is not None: + count_sql = ( + "SELECT total_rows AS n FROM " + f"`{self.config.BIGQUERY_PROJECT}." + f"{self.config.BIGQUERY_DATASET}." + "INFORMATION_SCHEMA.TABLE_STORAGE` " + "WHERE table_name = @table_name" + ) + from google.cloud import bigquery + count_params = [ + bigquery.ScalarQueryParameter( + "table_name", "STRING", table, + ), + ] + else: + inner = strip_limit_offset(qualified_sql) + count_sql = f"SELECT COUNT(*) AS n FROM ({inner})" + except Exception as e: + log.warning( + "search_sql: could not build COUNT query (%s); " + "total will be omitted", + e, + ) + count_sql = None + + # Submit COUNT first (non-blocking) so it runs in parallel with + # the data query. A COUNT failure is non-fatal — log and degrade + # `total` to None; a data-query failure is the user's primary + # request, so it propagates as ServerError. 
+ count_job = None + if count_sql: + try: + count_cfg = self._read_job_config(params=count_params) + count_job = self.client.query(count_sql, job_config=count_cfg) + except Exception as e: + log.warning("search_sql COUNT submit failed: %s", e) + + try: + data_job = self.client.query( + qualified_sql, job_config=self._read_job_config(), + ) + row_iter = data_job.result() + except Exception as e: + raise ServerError(f"BigQuery search_sql failed: {e}") from e + + total: int | None = None + if count_job is not None: + try: + count_rows = list(count_job.result()) + total = int(count_rows[0]["n"]) if count_rows else 0 + except Exception as e: + log.warning("search_sql COUNT failed: %s", e) + + schema_fields = [ + { + "name": field.name, + "type": frictionless_type_from_bigquery(field.field_type), + } + for field in (row_iter.schema or []) + ] + + rows = (tuple(r.values()) for r in islice(row_iter, limit)) return SearchResult( - fields=[], - records=iter([]), + schema={"fields": schema_fields}, + records=rows, + total=total, records_truncated=False, ) - def delete(self, resource_id: str, filters: dict | None) -> WriteResult: - """Delete records (filtered) or drop table (no filters).""" - {} + def delete( + self, + resource_id: str, + filters: dict[str, Any] | None, + fields: list[str] | None = None, + ) -> WriteResult: + """Drop the table (both None), delete rows by `filters`, or + drop columns by `fields`. 
Schema layer enforces mutual + exclusivity.""" + if self.metadata is None: + return WriteResult() + + schema = self.metadata.get(resource_id) + if schema is None: + raise NotFoundError( + f"resource {resource_id!r} is not declared; nothing to delete" + ) + + if fields is not None: + self._drop_columns(resource_id, schema, fields) + return WriteResult() + + if filters is None: + self._drop_data_table(resource_id) + self.metadata.delete(resource_id) + return WriteResult() + + self._delete_rows(resource_id, schema, filters) + return WriteResult() + + def _drop_data_table(self, resource_id: str) -> None: + """`DROP TABLE IF EXISTS` for the resource's data table.""" + sql = f"DROP TABLE IF EXISTS {self._data_table_ref(resource_id)}" + self._run_query(sql, op="DROP TABLE", resource_id=resource_id) + log.info("BigQuery table dropped: %s", resource_id) + + def _delete_rows( + self, + resource_id: str, + schema: dict, + filters: dict[str, Any], + ) -> None: + """Parameterised ``DELETE FROM … WHERE …`` from the filter map.""" + from google.cloud import bigquery + try: + sql, params = delete_sql( + self._data_table_ref(resource_id), schema, filters, + ) + except ValueError as e: + raise ValidationError(str(e)) from e + + job_config = bigquery.QueryJobConfig(query_parameters=params) + try: + self._run_query( + sql, op="DELETE", resource_id=resource_id, + job_config=job_config, + ) + except ServerError as e: + raise _translate_bigquery_error(e, resource_id, "delete") from e + log.info( + "BigQuery rows deleted: %s (filters=%s)", + resource_id, sorted(filters.keys()) or "", + ) + + def _drop_columns( + self, + resource_id: str, + schema: dict[str, Any], + fields: list[str], + ) -> None: + """``ALTER TABLE DROP COLUMN …`` + rewrite the stored schema. 
+ Rejects system columns, unknown columns, and PK columns.""" + assert self.metadata is not None + + existing = { + f["name"] + for f in schema.get("fields", []) + if f.get("name") + } + pk_raw = schema.get("primaryKey") + pk: set[str] = ( + {pk_raw} if isinstance(pk_raw, str) + else set(pk_raw or []) + ) + + # System-column check first: `_id` / `_updated_at` aren't in + # the stored schema, so the unknown-column check would shadow + # them with a less specific error. + reserved = [c for c in fields if c in SYSTEM_COLUMN_NAMES] + if reserved: + raise ValidationError( + f"cannot drop engine-reserved system column(s): " + f"{sorted(reserved)}" + ) + unknown = [c for c in fields if c not in existing] + if unknown: + raise ValidationError( + f"cannot drop unknown column(s): {sorted(unknown)}" + ) + pk_violations = [c for c in fields if c in pk] + if pk_violations: + raise ValidationError( + f"cannot drop primary-key column(s): " + f"{sorted(pk_violations)}; re-create the resource with " + "a new primaryKey instead" + ) + + sql = drop_columns_sql(self._data_table_ref(resource_id), fields) + self._run_query(sql, op="ALTER DROP COLUMN", resource_id=resource_id) + + drop_set = set(fields) + new_schema: dict[str, Any] = { + **schema, + "fields": [ + f for f in schema.get("fields", []) + if f.get("name") not in drop_set + ], + } + self.metadata.update(resource_id, new_schema) + log.info( + "BigQuery columns dropped: %s (%s)", resource_id, sorted(fields), + ) + + def info(self, resource_id: str) -> InfoResult: + """Return the table schema + row stats for a resource. + + Reads `schema` from the engine-managed `_table_metadata` (not + BigQuery's `INFORMATION_SCHEMA`) so the `primaryKey` and per- + field `info` data dictionary round-trip exactly as declared at + `datastore_create`. Row count comes from a `COUNT(*)` on the + data table. + + Placeholder mode (no metadata store) returns a stub so the unit + suite can exercise the call path without GCP creds. 
+ """ + if self.metadata is None: + return InfoResult( + schema={"fields": []}, + meta={"resource_id": resource_id, "total": 0}, + ) - def info(self, resource_id: str) -> dict: - """Return table metadata: fields with types, primary_key, row count.""" - {} + schema = self.metadata.get(resource_id) + if schema is None: + raise NotFoundError( + f"resource {resource_id!r} is not declared; call " + "datastore_create first" + ) + + total = self._count_rows(resource_id) + + pk_raw = schema.get("primaryKey") + pk: list[str] = ( + [pk_raw] if isinstance(pk_raw, str) else list(pk_raw or []) + ) + + return InfoResult( + schema=schema, + meta={ + "resource_id": resource_id, + "total": total, + "primary_key": pk, + }, + ) + + def _count_rows(self, resource_id: str) -> int: + """`COUNT(*)` against the data table; returns 0 on missing table. + + A missing data table while metadata exists is an inconsistent + state (manual cleanup, partial drop). Logging it as a warning + and returning 0 keeps `datastore_info` informative rather than + 500-ing the whole call. + """ + sql = ( + f"SELECT COUNT(*) AS n FROM " + f"{self._data_table_ref(resource_id)}" + ) + try: + job = self._run_query( + sql, op="COUNT", resource_id=resource_id, + job_config=self._read_job_config(), + ) + rows = list(job.result()) + except ServerError as e: + log.warning( + "COUNT(*) failed for resource %r; reporting total=0: %s", + resource_id, e, + ) + return 0 + if not rows: + return 0 + return int(rows[0]["n"]) def get_columns(self, resource_id: str) -> list[str]: - """Return column names for a table.""" - {} + """Return column names for a table. + + Placeholder — replaced when real `search` lands. Empty list + keeps callers from crashing on the dead code path. + """ + return [] def healthcheck(self) -> bool: - """Return True if backend is reachable. Called by /ready probe.""" - { - "status": "ok", - } + """Probe the BigQuery client with `SELECT 1`. 
Returns False on + any failure so `/ready` can return 503 instead of crashing. + """ + if self.client is None: + return False + if ( + self.config is not None + and self.config.BIGQUERY_PROJECT.strip() + and self.metadata is None + ): + log.warning( + "BigQuery healthcheck failed (mode=%s): metadata store " + "unavailable — set BIGQUERY_DATASET.", + self.mode, + ) + return False + try: + self.client.query("SELECT 1").result() + return True + except Exception as e: + log.warning( + "BigQuery healthcheck failed (mode=%s): %s", self.mode, e + ) + return False + + +def _translate_bigquery_error( + exc: ServerError, resource_id: str, action: str +) -> Exception: + """Map known BigQuery error signatures (raised on INSERT / MERGE / + UPDATE against the JSON-array source) to clear `ValidationError`s. + + BigQuery's raw messages are technically accurate but unhelpful — + e.g. *"Scalar subquery produced more than one element"* really + means "your records have duplicate primary keys" and *"Bad double + value: jk"* means "you sent the string 'jk' for a `number` + column". Both surface as 400 ValidationError with a message that + names the actual problem. + + Patterns handled: + - duplicate primaryKey rows in the batch; + - per-column type mismatches (`Bad <type> value: …`, + `Could not cast …`, `Could not parse …`); + - out-of-range numeric values (`Value out of range …`); + - bad date / time / timestamp literals (`Invalid <kind>: …`). + + Other errors pass through unchanged so the caller can re-raise as + a generic `ServerError`. + """ + import re + + from datastore.core.exceptions import ValidationError + + msg = str(exc) + + if "Scalar subquery produced more than one element" in msg: + return ValidationError( + "Found duplicated rows with the same primary key. " + f"Deduplicate the input batch and retry the {action} operation." + ) + + # `Bad int64 value: <value>` etc. — type-coercion failure on CAST(JSON_VALUE).
+ m = re.search( + r"Bad (int64|double|bool|numeric|bignumeric) value: (.+?)(?:;|\\n|$)", + msg, + re.IGNORECASE, + ) + if m: + bq_type, bad_value = m.group(1).lower(), m.group(2).strip() + return ValidationError( + f"Value {bad_value!r} is not a valid " + f"{_FRIENDLY_BQ_TYPE.get(bq_type, bq_type)}. " + "Check that each record's column values match the resource " + "schema's declared types." + ) + + # `Could not cast literal "<value>" to type <type>` / + # `Could not parse '<value>' as <type>` — alternative phrasings for + # the same coercion failure, depending on BigQuery version / path. + m = re.search( + r"Could not (?:cast literal|parse) ['\"](.+?)['\"] " + r"(?:to type|as) (\w+)", + msg, + ) + if m: + bad_value, bq_type = m.group(1), m.group(2).lower() + return ValidationError( + f"Value {bad_value!r} is not a valid " + f"{_FRIENDLY_BQ_TYPE.get(bq_type, bq_type)}. " + "Check that each record's column values match the resource " + "schema's declared types." + ) + + # `Value out of range for INT64: <value>` / `Numeric value … out of range` — + # the value parsed but doesn't fit the column type's range. + m = re.search( + r"(?:Value )?out of range(?: for (\w+))?:? (.+?)(?:;|\\n|$)", + msg, + re.IGNORECASE, + ) + if m: + bq_type = (m.group(1) or "").lower() + bad_value = m.group(2).strip() + friendly = _FRIENDLY_BQ_TYPE.get(bq_type, bq_type or "the column type") + return ValidationError( + f"Value {bad_value!r} is out of range for {friendly}. " + "Use a wider type or check the input range." + ) + + # `Invalid date: <value>`, `Invalid timestamp: <value>`, `Invalid time: <value>` — + # date/time literal that couldn't be parsed. + m = re.search( + r"Invalid (date|timestamp|datetime|time)(?: value)?: (.+?)(?:;|\\n|$)", + msg, + re.IGNORECASE, + ) + if m: + friendly, bad_value = m.group(1).lower(), m.group(2).strip() + return ValidationError( + f"Value {bad_value!r} is not a valid {friendly}. " + "Check that each record's column values match the resource " + "schema's declared types."
+ ) + + return exc + + +# BigQuery column-type name → Frictionless / user-friendly name. +_FRIENDLY_BQ_TYPE: dict[str, str] = { + "int64": "integer", + "double": "number", + "float64": "number", + "numeric": "number", + "bignumeric": "number", + "bool": "boolean", + "string": "string", + "date": "date", + "datetime": "datetime", + "timestamp": "timestamp", + "time": "time", + "json": "object", + "bytes": "string", +} + diff --git a/datastore/infrastructure/engines/bigquery/client.py b/datastore/infrastructure/engines/bigquery/client.py new file mode 100644 index 0000000..46d2d90 --- /dev/null +++ b/datastore/infrastructure/engines/bigquery/client.py @@ -0,0 +1,52 @@ +"""BigQuery `Client` construction. + +""" + +from __future__ import annotations + +import json +from typing import TYPE_CHECKING, Literal + +if TYPE_CHECKING: + from google.cloud import bigquery + + from datastore.core.config import Config + +Mode = Literal["rw", "ro"] + + +def build_client(config: Config, mode: Mode) -> bigquery.Client: + from google.cloud import bigquery + + project = config.BIGQUERY_PROJECT.strip() + if not project: + raise RuntimeError( + "BIGQUERY_PROJECT is required when DATASTORE_ENGINE=bigquery" + ) + + # Each mode reads its own credential variable independently. An + # empty RO credential falls through to ADC (Application Default + # Credentials) — never to the RW key, since that would silently + # give read paths write privileges and defeat the credential + # split. 
+ creds_raw = ( + config.BIGQUERY_CREDENTIALS_RO + if mode == "ro" + else config.BIGQUERY_CREDENTIALS + ).strip() + + if not creds_raw: + return bigquery.Client(project=project) + return bigquery.Client( + project=project, credentials=_credentials_from_raw(creds_raw) + ) + + +def _credentials_from_raw(raw: str): + from google.oauth2 import service_account + + if raw.startswith("{"): + return service_account.Credentials.from_service_account_info( + json.loads(raw) + ) + return service_account.Credentials.from_service_account_file(raw) diff --git a/datastore/infrastructure/engines/bigquery/lib.py b/datastore/infrastructure/engines/bigquery/lib.py index e69de29..216478f 100644 --- a/datastore/infrastructure/engines/bigquery/lib.py +++ b/datastore/infrastructure/engines/bigquery/lib.py @@ -0,0 +1,470 @@ +"""Side-effect-free helpers for the BigQuery backend: schema diffs, +DDL clause rendering, DML statement builders, JSON extractors.""" + +from __future__ import annotations + +from datastore.core.exceptions import ConflictError +from datastore.infrastructure.engines.bigquery.types import ( + bigquery_type, + can_widen, +) + +# Frictionless types that map to BigQuery `JSON`. +JSON_FRICTIONLESS_TYPES = frozenset({"object", "array", "geojson"}) + +# Engine-managed columns. `_id` always present; `_updated_at` opt-in +# via `Config.INCLUDE_UPDATED_AT`. Same-named user fields are dropped. 
+SYSTEM_COLUMN_NAMES: frozenset[str] = frozenset({"_id", "_updated_at"}) + + +def _system_col_defs(include_updated_at: bool) -> tuple[str, ...]: + return ( + ("`_id` INT64", "`_updated_at` TIMESTAMP") + if include_updated_at + else ("`_id` INT64",) + ) + + +def _system_col_insert_list(include_updated_at: bool) -> str: + return "`_id`, `_updated_at`" if include_updated_at else "`_id`" + + +def column_defs(schema: dict, *, include_updated_at: bool = True) -> list[str]: + """Render `schema.fields` as ``\\`name\\` TYPE`` for `CREATE TABLE`, + prepending system columns and skipping any field that collides.""" + cols: list[str] = list(_system_col_defs(include_updated_at)) + for f in schema.get("fields", []): + name = f.get("name") + if not name or name in SYSTEM_COLUMN_NAMES: + continue + cols.append(f"`{name}` {bigquery_type(f.get('type'))}") + return cols + + +def schema_diff( + old_schema: dict, new_schema: dict +) -> tuple[list[str], list[tuple[str, str | None, str | None]], list[str]]: + """Return `(added, type_changes, removed)` between two schemas. 
+ Types are raw Frictionless values; dialect mapping is the caller's job.""" + old_by_name = { + f["name"]: f for f in old_schema.get("fields", []) if f.get("name") + } + new_by_name = { + f["name"]: f for f in new_schema.get("fields", []) if f.get("name") + } + + added = [n for n in new_by_name if n not in old_by_name] + type_changes = [ + (n, old_by_name[n].get("type"), new_by_name[n].get("type")) + for n in new_by_name + if n in old_by_name + and old_by_name[n].get("type") != new_by_name[n].get("type") + ] + removed = [n for n in old_by_name if n not in new_by_name] + return added, type_changes, removed + + +def reject_unsupported_type_changes( + type_changes: list[tuple[str, str | None, str | None]], +) -> None: + """Raise `ConflictError` if any transition isn't a BigQuery widening.""" + unsupported = [ + f"'{name}' ({old_t} → {new_t})" + for name, old_t, new_t in type_changes + if not can_widen(bigquery_type(old_t), bigquery_type(new_t)) + ] + if not unsupported: + return + head = ( + f"Cannot change column type for {unsupported[0]}" + if len(unsupported) == 1 + else f"Cannot change column types: {', '.join(unsupported)}" + ) + raise ConflictError( + f"{head}. BigQuery does not support this conversion in place. " + "To apply, recreate the resource with the new schema." 
+ ) + + +def alter_clauses( + added: list[str], + type_changes: list[tuple[str, str | None, str | None]], + new_schema: dict, +) -> list[str]: + """Per-column clauses for a single `ALTER TABLE`.""" + new_by_name = { + f["name"]: f for f in new_schema.get("fields", []) if f.get("name") + } + clauses: list[str] = [] + for name in added: + clauses.append( + f"ADD COLUMN IF NOT EXISTS `{name}` " + f"{bigquery_type(new_by_name[name].get('type'))}" + ) + for name, _, new_t in type_changes: + clauses.append( + f"ALTER COLUMN `{name}` SET DATA TYPE {bigquery_type(new_t)}" + ) + return clauses + + +def insert_sql( + table_ref: str, schema: dict, *, include_updated_at: bool = True +) -> str: + """Render `INSERT INTO ... SELECT FROM UNNEST(JSON_QUERY_ARRAY(@rows))`. + + DML (not streaming) so follow-up MERGE/UPDATE stays consistent. + `_id` = `(SELECT IFNULL(MAX(_id), 0) FROM tbl) + ROW_NUMBER() OVER ()` + — one MAX baseline per batch, ROW_NUMBER per row. + """ + fields = [ + f for f in schema.get("fields", []) + if f.get("name") and f["name"] not in SYSTEM_COLUMN_NAMES + ] + if not fields: + raise ValueError("schema has no user fields; cannot INSERT") + + data_cols = ", ".join(f"`{f['name']}`" for f in fields) + data_extractors = ", ".join(_json_extract(f) for f in fields) + sys_cols = _system_col_insert_list(include_updated_at) + id_expr = ( + f"(SELECT IFNULL(MAX(`_id`), 0) FROM {table_ref}) " + f"+ ROW_NUMBER() OVER ()" + ) + sys_vals = ( + f"{id_expr}, CURRENT_TIMESTAMP()" + if include_updated_at + else id_expr + ) + return ( + f"INSERT INTO {table_ref} ({sys_cols}, {data_cols}) " + f"SELECT {sys_vals}, {data_extractors} " + f"FROM UNNEST(JSON_QUERY_ARRAY(@rows)) AS r" + ) + + +def merge_sql( + table_ref: str, schema: dict, *, include_updated_at: bool = True +) -> str: + """Render `MERGE` keyed by `schema.primaryKey`. + + Matched rows update only if a non-PK column differs (so + `_updated_at` advances only on real changes). 
Unmatched rows + insert with `_id` = `(SELECT MAX(_id) FROM tbl) + _rn`. + """ + fields = [ + f for f in schema.get("fields", []) + if f.get("name") and f["name"] not in SYSTEM_COLUMN_NAMES + ] + pk_raw = schema.get("primaryKey") + pk: list[str] = ( + [pk_raw] if isinstance(pk_raw, str) else list(pk_raw or []) + ) + if not pk: + raise ValueError( + "schema has no 'primaryKey'; upsert requires one to " + "match existing rows" + ) + + pk_set = set(pk) + non_pk = [f for f in fields if f["name"] not in pk_set] + + using_cols = ", ".join( + f"{_json_extract(f)} AS `{f['name']}`" for f in fields + ) + on_clause = " AND ".join(f"T.`{n}` = S.`{n}`" for n in pk) + insert_cols = ", ".join(f"`{f['name']}`" for f in fields) + insert_vals = ", ".join(f"S.`{f['name']}`" for f in fields) + + parts = [ + f"MERGE {table_ref} T", + f"USING (SELECT {using_cols}, ROW_NUMBER() OVER () AS _rn " + f"FROM UNNEST(JSON_QUERY_ARRAY(@rows)) AS r) S", + f"ON {on_clause}", + ] + if non_pk: + diff_predicate = " OR ".join(_diff_expr(f) for f in non_pk) + matched_assignments = [ + f"T.`{f['name']}` = S.`{f['name']}`" for f in non_pk + ] + if include_updated_at: + matched_assignments.append( + "T.`_updated_at` = CURRENT_TIMESTAMP()" + ) + parts.append( + f"WHEN MATCHED AND ({diff_predicate}) " + f"THEN UPDATE SET {', '.join(matched_assignments)}" + ) + + sys_cols = _system_col_insert_list(include_updated_at) + id_value = f"(SELECT IFNULL(MAX(`_id`), 0) FROM {table_ref}) + S._rn" + sys_vals = ( + f"{id_value}, CURRENT_TIMESTAMP()" + if include_updated_at + else id_value + ) + parts.append( + f"WHEN NOT MATCHED THEN INSERT " + f"({sys_cols}, {insert_cols}) " + f"VALUES ({sys_vals}, {insert_vals})" + ) + return " ".join(parts) + + +def update_sql( + table_ref: str, schema: dict, *, include_updated_at: bool = True +) -> str: + """Render `UPDATE T SET ... FROM (SELECT ... FROM UNNEST(@rows)) S + WHERE `. 
Caller must compare affected rows to input size + and raise `NotFoundError` for unmatched keys.""" + fields = [ + f for f in schema.get("fields", []) + if f.get("name") and f["name"] not in SYSTEM_COLUMN_NAMES + ] + pk_raw = schema.get("primaryKey") + pk: list[str] = ( + [pk_raw] if isinstance(pk_raw, str) else list(pk_raw or []) + ) + if not pk: + raise ValueError( + "schema has no 'primaryKey'; update requires one to " + "match existing rows" + ) + + pk_set = set(pk) + non_pk = [f for f in fields if f["name"] not in pk_set] + if not non_pk and not include_updated_at: + raise ValueError( + "schema has no non-key columns to update and the " + "`_updated_at` system column is disabled; nothing to SET" + ) + + using_cols = ", ".join( + f"{_json_extract(f)} AS `{f['name']}`" for f in fields + ) + set_parts = [ + f"T.`{f['name']}` = S.`{f['name']}`" for f in non_pk + ] + if include_updated_at: + set_parts.append("T.`_updated_at` = CURRENT_TIMESTAMP()") + set_clause = ", ".join(set_parts) + where_clause = " AND ".join(f"T.`{n}` = S.`{n}`" for n in pk) + + return ( + f"UPDATE {table_ref} T " + f"SET {set_clause} " + f"FROM (SELECT {using_cols} " + f"FROM UNNEST(JSON_QUERY_ARRAY(@rows)) AS r) S " + f"WHERE {where_clause}" + ) + + +def _diff_expr(field: dict) -> str: + """NULL-safe inequality between `T.<name>` and `S.<name>`.
JSON + columns are canonicalised via `TO_JSON_STRING` first.""" + name = field["name"] + if field.get("type") in JSON_FRICTIONLESS_TYPES: + return ( + f"TO_JSON_STRING(T.`{name}`) IS DISTINCT FROM " + f"TO_JSON_STRING(S.`{name}`)" + ) + return f"T.`{name}` IS DISTINCT FROM S.`{name}`" + + +def _json_extract(field: dict) -> str: + """Typed extraction of a field from JSON row variable `r`.""" + name = field["name"] + fr_type = field.get("type") + bq_type = bigquery_type(fr_type) + path = f"'$.{name}'" + if fr_type in JSON_FRICTIONLESS_TYPES: + return f"PARSE_JSON(JSON_QUERY(r, {path}))" + if bq_type == "STRING": + return f"JSON_VALUE(r, {path})" + return f"CAST(JSON_VALUE(r, {path}) AS {bq_type})" + + +# Frictionless type → BigQuery scalar parameter type for filter values. +# JSON / array / geojson absent — equality on those is rejected. +_FILTER_PARAM_TYPE: dict[str, str] = { + "integer": "INT64", + "number": "FLOAT64", + "boolean": "BOOL", + "string": "STRING", + "date": "DATE", + "datetime": "TIMESTAMP", + "time": "TIME", + "any": "STRING", +} + + +def drop_columns_sql(table_ref: str, columns: list[str]) -> str: + """Render ``ALTER TABLE <table> DROP COLUMN …``. Caller must + validate column names against the schema first (identifiers can't + be parameterised).""" + if not columns: + raise ValueError("drop_columns_sql requires at least one column") + clauses = ", ".join(f"DROP COLUMN `{c}`" for c in columns) + return f"ALTER TABLE {table_ref} {clauses}" + + +def delete_sql( + table_ref: str, + schema: dict, + filters: dict, +) -> tuple[str, list]: + """Render parameterised ``DELETE FROM <table> WHERE …``. Empty + ``filters`` yields ``WHERE TRUE`` (BigQuery requires a WHERE on + every DELETE).
Returns ``(sql, query_parameters)``.""" + from google.cloud import bigquery + + if filters is None or not isinstance(filters, dict): + raise ValueError( + "delete filters must be a dict; use the DROP path when no " + "filter is intended" + ) + + type_map: dict[str, str] = {} + for f in schema.get("fields", []): + name = f.get("name") + if name and name not in SYSTEM_COLUMN_NAMES: + type_map[name] = f.get("type") or "string" + # System columns are always filterable. + type_map["_id"] = "integer" + type_map["_updated_at"] = "datetime" + + params: list = [] + clauses: list[str] = [] + for col, value in filters.items(): + if col not in type_map: + raise ValueError( + f"filters references unknown column {col!r}" + ) + ftype = type_map[col] + if ftype in JSON_FRICTIONLESS_TYPES: + raise ValueError( + f"filters cannot target JSON/array/geojson column " + f"{col!r}; use datastore_search_sql for structural matches" + ) + bq_type = _FILTER_PARAM_TYPE.get(ftype, "STRING") + name = f"f{len(params)}" + if isinstance(value, list): + params.append( + bigquery.ArrayQueryParameter(name, bq_type, value) + ) + clauses.append(f"`{col}` IN UNNEST(@{name})") + elif value is None: + clauses.append(f"`{col}` IS NULL") + else: + params.append( + bigquery.ScalarQueryParameter(name, bq_type, value) + ) + clauses.append(f"`{col}` = @{name}") + + where = " AND ".join(clauses) if clauses else "TRUE" + return f"DELETE FROM {table_ref} WHERE {where}", params + + +def unfiltered_table_name( + sql: str, *, dialect: str = "bigquery", +) -> str | None: + """Return the single source table name when `sql` is a plain + `SELECT cols FROM <table> [LIMIT/OFFSET]` — i.e. the result row + count equals the source table's row count. + + Returns None whenever any clause could change the row count: + WHERE, GROUP BY, HAVING, JOIN, DISTINCT, QUALIFY, aggregate + functions, set ops (UNION/EXCEPT/INTERSECT), subqueries, or more + than one source table. The caller falls back to a real + `COUNT(*) FROM (<sql>)` in those cases.
+ + Used by `datastore_search_sql` to route the unfiltered total + through `INFORMATION_SCHEMA.TABLE_STORAGE` — free metadata read, + no bytes scanned — instead of a full table scan via COUNT(*). + """ + import sqlglot + from sqlglot import expressions as exp + + try: + tree = sqlglot.parse_one(sql, dialect=dialect) + except Exception: + return None + + if not isinstance(tree, exp.Select): + return None + + blockers = ("where", "group", "having", "joins", "distinct", "qualify") + if any(tree.args.get(k) for k in blockers): + return None + + # Nested SELECTs (subqueries) can reduce / expand rows in ways + # the surrounding clauses don't reveal — bail. + if sum(1 for _ in tree.find_all(exp.Select)) > 1: + return None + + aggregates = (exp.Count, exp.Sum, exp.Avg, exp.Min, exp.Max) + if next(tree.find_all(*aggregates), None) is not None: + return None + + cte_aliases = { + c.alias_or_name for c in tree.find_all(exp.CTE) + if c.alias_or_name + } + tables = [ + t for t in tree.find_all(exp.Table) + if t.name and t.name not in cte_aliases + ] + if len(tables) != 1: + return None + return tables[0].name + + +def strip_limit_offset(sql: str, *, dialect: str = "bigquery") -> str: + """Return `sql` with its LIMIT and OFFSET clauses removed. + + Used by `datastore_search_sql` to wrap the user's filtered query in + a `SELECT COUNT(*) FROM (...)` for the total — the count has to + ignore the page size or it would just report the current page's + row count, breaking `total_pages` / `next` links. + """ + import sqlglot + + tree = sqlglot.parse_one(sql, dialect=dialect) + tree.set("limit", None) + tree.set("offset", None) + return tree.sql(dialect=dialect) + + +def qualify_table_refs(sql: str, project: str, dataset: str) -> str: + """Rewrite every non-CTE table reference to its fully-qualified + BigQuery form and re-serialise the SQL in BigQuery dialect. + + Users pass `datastore_search_sql` SQL with table refs that look + like CKAN resource_ids (`FROM "uuid"` or `FROM uuid`). 
BigQuery + needs `project.dataset.uuid` with backticked identifiers — so the + backend parses the user's SQL (postgres dialect, which accepts + double-quoted identifiers), tags each unqualified table with the + configured project + dataset, and serialises out as BigQuery SQL. + + Tables that already carry a `catalog` (project) are left alone — + callers who fully-qualify their refs win against the auto-prefix. + CTE aliases are also skipped (they're defined inline, not external + tables). + """ + import sqlglot + from sqlglot import expressions as exp + + tree = sqlglot.parse_one(sql, dialect="postgres") + cte_aliases = { + cte.alias_or_name for cte in tree.find_all(exp.CTE) + if cte.alias_or_name + } + for table in tree.find_all(exp.Table): + name = table.name + if not name or name in cte_aliases: + continue + if table.args.get("catalog") is not None: + continue + table.set("catalog", exp.to_identifier(project, quoted=True)) + table.set("db", exp.to_identifier(dataset, quoted=True)) + return tree.sql(dialect="bigquery") diff --git a/datastore/infrastructure/engines/bigquery/metadata.py b/datastore/infrastructure/engines/bigquery/metadata.py new file mode 100644 index 0000000..8c47069 --- /dev/null +++ b/datastore/infrastructure/engines/bigquery/metadata.py @@ -0,0 +1,220 @@ +"""BigQuery implementation of the `MetadataStore` Protocol. + +Stores one row per `resource_id` in a hidden `_table_metadata` table +that lives alongside the user data tables in `BIGQUERY_DATASET`. The +row carries the Frictionless schema declared at `datastore_create` +time plus `created_at` / `updated_at` timestamps so callers can +reconstruct the column declaration without re-parsing user tables. + +The table is created on engine startup (`initialize()`) and updated via +parameterised `MERGE` from `create()`. 
Other engines (DuckLake, +Postgres, …) provide their own implementation of the same Protocol — +the backend layer only depends on the methods declared in +`engines/base.py:MetadataStore`. +""" + +from __future__ import annotations + +import logging +from typing import TYPE_CHECKING, Any + +import orjson + +from datastore.core.exceptions import ServerError + +if TYPE_CHECKING: + from google.cloud import bigquery + +log = logging.getLogger(__name__) + +# Hidden by convention: BigQuery treats leading-underscore tables as +# internal, hiding them from default list / autocomplete in most UIs. +METADATA_TABLE_NAME = "_table_metadata" + + +class BigQueryMetadataStore: + """`MetadataStore` backed by a BigQuery table. + + Schema (DDL applied by `initialize`): + + resource_id STRING NOT NULL + schema JSON NOT NULL + created_at TIMESTAMP NOT NULL + updated_at TIMESTAMP NOT NULL + + The table is keyed on `resource_id` at the application layer + (BigQuery has no enforced PK / unique constraints); there is no + `MERGE` — callers choose `insert()` or `update()` per row. + """ + + def __init__( + self, + *, + client: bigquery.Client, + project: str, + dataset: str, + table_name: str = METADATA_TABLE_NAME, + ) -> None: + self.client = client + self.project = project + self.dataset = dataset + self.table_name = table_name + + @property + def table_ref(self) -> str: + """Fully-qualified `project.dataset.table` reference for SQL.""" + return f"`{self.project}.{self.dataset}.{self.table_name}`" + + def initialize(self) -> None: + """Create the metadata table if it doesn't exist. Idempotent. + + Uses `CREATE TABLE IF NOT EXISTS` so concurrent pods racing to + start up don't trip over each other. The dataset itself is + assumed to exist — creating datasets is an out-of-band ops task, + not something the application does at request time. 
+ """ + ddl = f""" + CREATE TABLE IF NOT EXISTS {self.table_ref} ( + resource_id STRING NOT NULL, + schema JSON NOT NULL, + created_at TIMESTAMP NOT NULL, + updated_at TIMESTAMP NOT NULL + ) + """ + self._run(ddl, op="metadata CREATE TABLE", resource_id=None) + log.info( + "BigQuery metadata table ready: %s.%s.%s", + self.project, self.dataset, self.table_name, + ) + + def insert(self, resource_id: str, schema: dict) -> None: + """Insert a new metadata row for `resource_id`. + + Sets `created_at` / `updated_at` to now. Does not check for an + existing row (BigQuery enforces no uniqueness), so a duplicate + `datastore_create` has to be caught by the caller beforehand. + """ + sql = f""" + INSERT INTO {self.table_ref} + (resource_id, schema, created_at, updated_at) + VALUES ( + @resource_id, + PARSE_JSON(@schema), + CURRENT_TIMESTAMP(), + CURRENT_TIMESTAMP() + ) + """ + self._run( + sql, + op="metadata INSERT", + resource_id=resource_id, + job_config=self._schema_params(resource_id, schema), + ) + + def update(self, resource_id: str, schema: dict) -> None: + """Update the metadata row keyed by `resource_id`. + + Replaces `schema` and bumps `updated_at`; `created_at` is + preserved. Plain `UPDATE` — no MERGE, no insert fallback. When + no row matches the predicate the statement is a no-op. + """ + sql = f""" + UPDATE {self.table_ref} + SET schema = PARSE_JSON(@schema), + updated_at = CURRENT_TIMESTAMP() + WHERE resource_id = @resource_id + """ + self._run( + sql, + op="metadata UPDATE", + resource_id=resource_id, + job_config=self._schema_params(resource_id, schema), + ) + + def _schema_params( + self, resource_id: str, schema: dict + ) -> "bigquery.QueryJobConfig": + """Build the `(resource_id, schema)` parameter set shared by + `insert` and `update`. 
Keeps the SQL strings free of inline + values and the marshalling rule in one place.""" + from google.cloud import bigquery + + return bigquery.QueryJobConfig( + query_parameters=[ + bigquery.ScalarQueryParameter("resource_id", "STRING", resource_id), + bigquery.ScalarQueryParameter( + "schema", "STRING", orjson.dumps(schema).decode("utf-8") + ), + ] + ) + + def get(self, resource_id: str) -> dict | None: + """Return the stored Frictionless schema for `resource_id`, + or `None` when no row exists.""" + sql = f""" + SELECT TO_JSON_STRING(schema) AS schema_json + FROM {self.table_ref} + WHERE resource_id = @resource_id + LIMIT 1 + """ + rows = list( + self._run( + sql, + op="metadata SELECT", + resource_id=resource_id, + job_config=self._resource_id_params(resource_id), + ) + ) + if not rows: + return None + raw = rows[0]["schema_json"] + parsed: Any = orjson.loads(raw) + return parsed if isinstance(parsed, dict) else None + + def delete(self, resource_id: str) -> None: + """Remove the metadata row for `resource_id`. No-op when absent.""" + sql = f"DELETE FROM {self.table_ref} WHERE resource_id = @resource_id" + self._run( + sql, + op="metadata DELETE", + resource_id=resource_id, + job_config=self._resource_id_params(resource_id), + ) + + def _run( + self, + sql: str, + *, + op: str, + resource_id: str | None, + job_config: "bigquery.QueryJobConfig | None" = None, + ) -> Any: + """Run a metadata SQL statement and wait for completion. + + Wraps every `client.query` so transport / SQL failures arrive at + callers as `ServerError` with the operation name + the + `resource_id` being touched (or `` for `initialize`), + rather than raw `google.api_core` exceptions. 
+ """ + try: + return self.client.query(sql, job_config=job_config).result() + except Exception as e: + target = resource_id if resource_id is not None else "" + raise ServerError( + f"BigQuery {op} failed for resource {target!r}: {e}" + ) from e + + def _resource_id_params( + self, resource_id: str + ) -> "bigquery.QueryJobConfig": + """Job config carrying just the `resource_id` parameter (for + `get` and `delete` which don't bind a schema).""" + from google.cloud import bigquery + + return bigquery.QueryJobConfig( + query_parameters=[ + bigquery.ScalarQueryParameter( + "resource_id", "STRING", resource_id + ), + ] + ) diff --git a/datastore/infrastructure/engines/bigquery/search.py b/datastore/infrastructure/engines/bigquery/search.py new file mode 100644 index 0000000..ee1b21b --- /dev/null +++ b/datastore/infrastructure/engines/bigquery/search.py @@ -0,0 +1,347 @@ +"""SQL builders for `datastore_search` on the BigQuery backend. + +Pure helpers — no I/O. Each builder returns the SQL string and a +``QueryJobConfig`` carrying the parameter bindings; the backend submits +both via ``client.query`` and wraps errors. Splitting filters / sort / +projection / count into this module keeps ``backend.py`` focused on +orchestration and makes the builders trivially unit-testable. +""" + +from __future__ import annotations + +from typing import TYPE_CHECKING, Any + +from datastore.infrastructure.engines.bigquery.lib import ( + JSON_FRICTIONLESS_TYPES, + SYSTEM_COLUMN_NAMES, +) + +if TYPE_CHECKING: + from google.cloud import bigquery + +# Frictionless type → BigQuery scalar parameter type. JSON/array/geojson +# deliberately absent — filter equality against those is rejected (see +# `build_search`). Anything else falls back to STRING. 
+_PARAM_TYPE: dict[str, str] = { + "integer": "INT64", + "number": "FLOAT64", + "boolean": "BOOL", + "string": "STRING", + "date": "DATE", + "datetime": "TIMESTAMP", + "time": "TIME", + "any": "STRING", +} + +# System columns are always available for projection / filter / sort. +_SYSTEM_FIELD_DEFS: dict[str, dict] = { + "_id": {"name": "_id", "type": "integer"}, + "_updated_at": {"name": "_updated_at", "type": "datetime"}, +} + + +def _column_type_map(schema: dict, *, include_updated_at: bool) -> dict[str, str]: + """Map column name → Frictionless type, including system columns.""" + out: dict[str, str] = {} + for f in schema.get("fields", []): + name = f.get("name") + if name and name not in SYSTEM_COLUMN_NAMES: + out[name] = f.get("type") or "string" + out["_id"] = "integer" + if include_updated_at: + out["_updated_at"] = "datetime" + return out + + +def _ordered_columns(schema: dict, *, include_updated_at: bool) -> list[str]: + """Default projection order: `_id`, user fields, then `_updated_at`. + + `_id` first matches CKAN convention (it's the row identifier). + `_updated_at` trails so user columns stay together in tabular UIs. + """ + user = [ + f["name"] + for f in schema.get("fields", []) + if f.get("name") and f["name"] not in SYSTEM_COLUMN_NAMES + ] + cols = ["_id", *user] + if include_updated_at: + cols.append("_updated_at") + return cols + + +def parse_sort(sort_str: str, allowed: set[str]) -> list[tuple[str, str]]: + """Parse a CKAN-style sort string into validated `(col, dir)` pairs. + + Format: ``"col1 asc, col2 desc"`` — direction defaults to ``ASC`` + when omitted. Validates every column against `allowed`; raises + `ValueError` on unknown columns or non-`asc`/`desc` direction + tokens. Validation is what makes it safe to inline the column + name into the generated SQL (no parameter binding for identifiers). 
+ """ + out: list[tuple[str, str]] = [] + for part in sort_str.split(","): + part = part.strip() + if not part: + continue + tokens = part.split() + if len(tokens) > 2: + raise ValueError( + f"sort entry {part!r} has too many tokens; expected " + "'' or ' asc|desc'" + ) + col = tokens[0] + direction = (tokens[1].upper() if len(tokens) == 2 else "ASC") + if col not in allowed: + raise ValueError( + f"sort references unknown column {col!r}" + ) + if direction not in ("ASC", "DESC"): + raise ValueError( + f"sort direction for {col!r} must be ASC or DESC, " + f"got {direction!r}" + ) + out.append((col, direction)) + return out + + +def project_schema(schema: dict, projected_cols: list[str]) -> dict: + """Filter the table schema to just the projected columns, preserving + order. System columns get synthesised entries when projected. + """ + by_name = { + f["name"]: f + for f in schema.get("fields", []) + if f.get("name") + } + out_fields: list[dict] = [] + for col in projected_cols: + if col in by_name: + out_fields.append(by_name[col]) + elif col in _SYSTEM_FIELD_DEFS: + out_fields.append(_SYSTEM_FIELD_DEFS[col]) + out = {"fields": out_fields} + if "primaryKey" in schema: + out["primaryKey"] = schema["primaryKey"] + return out + + +def _make_param(name: str, fr_type: str, value: Any) -> "bigquery.ScalarQueryParameter": + from google.cloud import bigquery + bq_type = _PARAM_TYPE.get(fr_type, "STRING") + return bigquery.ScalarQueryParameter(name, bq_type, value) + + +def _make_array_param(name: str, fr_type: str, values: list[Any]) -> "bigquery.ArrayQueryParameter": + from google.cloud import bigquery + bq_type = _PARAM_TYPE.get(fr_type, "STRING") + return bigquery.ArrayQueryParameter(name, bq_type, values) + + +def _build_where( + *, + filters: dict | None, + q: str | dict | None, + type_map: dict[str, str], + table_alias: str, + params: list, +) -> str: + """Render the `WHERE` clause and append parameters in place. 
+ + Filter values bind with the column's typed Frictionless → BigQuery + parameter type so INTEGER columns get `INT64` params, etc. Lists + become `IN UNNEST(@p)`. Full-text uses BigQuery's native `SEARCH()` + (tokenised, leverages search indexes when present); string `q` + searches the whole row, dict `q` searches per column. + + Raises `ValueError` for unknown columns or filters against + JSON/array/geojson columns (no clean equality semantics in BQ). + The backend converts this to `ValidationError`. + """ + clauses: list[str] = [] + + if filters: + for col, value in filters.items(): + if col not in type_map: + raise ValueError( + f"filters references unknown column {col!r}" + ) + ftype = type_map[col] + if ftype in JSON_FRICTIONLESS_TYPES: + raise ValueError( + f"filters cannot target JSON/array/geojson column " + f"{col!r}; use datastore_search_sql for structural " + "matches" + ) + name = f"f{len(params)}" + if isinstance(value, list): + params.append(_make_array_param(name, ftype, value)) + clauses.append(f"`{col}` IN UNNEST(@{name})") + elif value is None: + clauses.append(f"`{col}` IS NULL") + else: + params.append(_make_param(name, ftype, value)) + clauses.append(f"`{col}` = @{name}") + + if isinstance(q, str): + name = f"f{len(params)}" + params.append(_make_param(name, "string", q)) + # `SEARCH(table_alias, @q)` matches against every searchable column + # of the row — BigQuery's native full-text. Honours search + # indexes when defined; falls back to a tokenised scan otherwise. 
+ clauses.append(f"SEARCH({table_alias}, @{name})") + elif isinstance(q, dict): + for col, term in q.items(): + if col not in type_map: + raise ValueError( + f"q references unknown column {col!r}" + ) + name = f"f{len(params)}" + params.append(_make_param(name, "string", str(term))) + clauses.append(f"SEARCH(`{col}`, @{name})") + + return " AND ".join(clauses) + + +def build_search( + *, + table_ref: str, + schema: dict, + include_updated_at: bool, + fields: list[str] | None, + filters: dict | None, + q: str | dict | None, + distinct: bool, + sort: str | None, + limit: int, + offset: int, +) -> tuple[str, list, dict]: + """Build a parameterised SELECT. + + Returns `(sql, parameters, result_schema)` where `result_schema` is + the Frictionless schema of the projected columns (used by the + streaming writer for column ordering + types). Parameters are a + list of `ScalarQueryParameter` / `ArrayQueryParameter` ready to + drop into a `QueryJobConfig`. + + Layout: ``SELECT [DISTINCT] cols FROM target AS t [WHERE ...] + [ORDER BY ...] LIMIT N OFFSET M``. Sort defaults to `_id ASC` when + `_id` is projected (CKAN's row-id ordering convention); otherwise + no default sort — caller must specify or accept BigQuery's order + (undefined). + + Raises `ValueError` for unknown columns in `fields` / `sort` / + `filters` / `q`. The backend converts to `ValidationError` so the + caller sees a clean 400 instead of a 500. 
+ """ + type_map = _column_type_map(schema, include_updated_at=include_updated_at) + all_cols = set(type_map) + default_cols = _ordered_columns(schema, include_updated_at=include_updated_at) + + if fields is None: + projected = list(default_cols) + else: + for f in fields: + if f not in all_cols: + raise ValueError( + f"fields references unknown column {f!r}" + ) + projected = list(fields) + if not projected: + raise ValueError("`fields` must select at least one column") + + sort_pairs: list[tuple[str, str]] + if sort: + sort_pairs = parse_sort(sort, all_cols) + elif "_id" in projected: + sort_pairs = [("_id", "ASC")] + else: + sort_pairs = [] + + params: list = [] + where = _build_where( + filters=filters, q=q, type_map=type_map, + table_alias="t", params=params, + ) + + parts: list[str] = [] + projection = ", ".join(f"`{c}`" for c in projected) + parts.append( + f"SELECT {'DISTINCT ' if distinct else ''}{projection} " + f"FROM {table_ref} AS t" + ) + if where: + parts.append(f"WHERE {where}") + if sort_pairs: + parts.append( + "ORDER BY " + ", ".join(f"`{c}` {d}" for c, d in sort_pairs) + ) + parts.append(f"LIMIT {int(limit)} OFFSET {int(offset)}") + + sql = " ".join(parts) + return sql, params, project_schema(schema, projected) + + +def build_count( + *, + table_ref: str, + schema: dict, + include_updated_at: bool, + fields: list[str] | None, + filters: dict | None, + q: str | dict | None, + distinct: bool, +) -> tuple[str, list]: + """Build a parameterised `COUNT(*)` for the same row set. + + Wraps a `SELECT [DISTINCT] cols FROM target WHERE ...` so the + count matches the projection / dedup of the data query. Doesn't + apply LIMIT/OFFSET — total is independent of paging. + + Raises `ValueError` on the same conditions as `build_search` (the + backend should run validation once via `build_search` first; this + builder re-runs it as defense in depth). 
+ """ + type_map = _column_type_map(schema, include_updated_at=include_updated_at) + all_cols = set(type_map) + default_cols = _ordered_columns(schema, include_updated_at=include_updated_at) + + if fields is None: + projected = list(default_cols) + else: + for f in fields: + if f not in all_cols: + raise ValueError( + f"fields references unknown column {f!r}" + ) + projected = list(fields) + + params: list = [] + where = _build_where( + filters=filters, q=q, type_map=type_map, + table_alias="t", params=params, + ) + + projection = ", ".join(f"`{c}`" for c in projected) + inner_parts = [ + f"SELECT {'DISTINCT ' if distinct else ''}{projection} " + f"FROM {table_ref} AS t" + ] + if where: + inner_parts.append(f"WHERE {where}") + inner = " ".join(inner_parts) + + sql = f"SELECT COUNT(*) AS n FROM ({inner})" + return sql, params + + +def needs_count_query( + *, + filters: dict | None, + q: str | dict | None, + distinct: bool, +) -> bool: + """`True` when total must come from a real COUNT — anything that + narrows / dedupes the result set. `False` lets the backend take + the cheap `__TABLES__.row_count` path.""" + return bool(filters) or bool(q) or distinct diff --git a/datastore/infrastructure/engines/bigquery/types.py b/datastore/infrastructure/engines/bigquery/types.py new file mode 100644 index 0000000..e4c6529 --- /dev/null +++ b/datastore/infrastructure/engines/bigquery/types.py @@ -0,0 +1,116 @@ +"""Frictionless ↔ BigQuery type mapping. + +One module per engine owns the dialect translation between the +canonical Frictionless Table Schema vocabulary the rest of the app +speaks and the storage engine's native types. Keeping it isolated +makes it easy to add a new engine (DuckLake / Postgres / …) without +touching anything outside its own subpackage. + +The mapping is intentionally permissive: unknown Frictionless types +fall through to `STRING` rather than raising, so a slightly newer +schema spec (or a custom type) still loads. 
Strict validation lives +upstream in `schemas/validators.py:validate_frictionless_schema`. +""" + +from __future__ import annotations + +# Many-to-one on purpose: every Frictionless type maps to the widest +# BigQuery type that can hold its values. `year` → INT64 keeps it +# arithmetically useful; `yearmonth` / `duration` stay STRING because +# BigQuery has no native equivalent that round-trips losslessly. +FRICTIONLESS_TO_BIGQUERY: dict[str, str] = { + "integer": "INT64", + "number": "FLOAT64", + "string": "STRING", + "boolean": "BOOL", + "date": "DATE", + "time": "TIME", + "datetime": "TIMESTAMP", + "duration": "STRING", + "object": "JSON", + "array": "JSON", + "geojson": "JSON", + "geopoint": "STRING", + "year": "INT64", + "yearmonth": "STRING", + "any": "STRING", +} + +_DEFAULT_BIGQUERY_TYPE = "STRING" + + +def bigquery_type(frictionless_type: str | None) -> str: + """Resolve a Frictionless field type to a BigQuery column type. + + Returns `STRING` for unknown or absent types so a new Frictionless + spec (or a custom dialect extension) doesn't break table creation. + Strict validation of the schema descriptor itself happens upstream + at the request boundary. + """ + if not frictionless_type: + return _DEFAULT_BIGQUERY_TYPE + return FRICTIONLESS_TO_BIGQUERY.get( + frictionless_type, _DEFAULT_BIGQUERY_TYPE + ) + + +# BigQuery's `ALTER TABLE ... ALTER COLUMN ... SET DATA TYPE` only +# supports a narrow set of widening transitions. Keys are the current +# BigQuery type; values are the set of types the column may be altered +# to without rewriting the table. Anything outside this map needs a +# planned rebuild and is rejected at the request boundary. +# +# Source: BigQuery DDL docs — INT64/NUMERIC may widen to wider numeric +# types; DATE may widen to DATETIME/TIMESTAMP. No string/JSON/bool +# transitions are supported. 
+BIGQUERY_ALLOWED_TYPE_CHANGES: dict[str, set[str]] = { + "INT64": {"NUMERIC", "BIGNUMERIC", "FLOAT64"}, + "NUMERIC": {"BIGNUMERIC", "FLOAT64"}, + "DATE": {"DATETIME", "TIMESTAMP"}, +} + + +def can_widen(old_bq_type: str, new_bq_type: str) -> bool: + """Return True iff BigQuery accepts an in-place `ALTER COLUMN SET + DATA TYPE` from `old_bq_type` to `new_bq_type`. + + Identity (no change) is trivially allowed. + """ + if old_bq_type == new_bq_type: + return True + return new_bq_type in BIGQUERY_ALLOWED_TYPE_CHANGES.get(old_bq_type, set()) + + +# Inverse of `FRICTIONLESS_TO_BIGQUERY` — used when reading a result +# schema back from BigQuery (e.g. `datastore_search_sql`) and surfacing +# it to clients as Frictionless types. Many-to-one collapses some BQ +# precision distinctions (NUMERIC / BIGNUMERIC / FLOAT64 → number). +BIGQUERY_TO_FRICTIONLESS: dict[str, str] = { + "INT64": "integer", + "INTEGER": "integer", + "FLOAT64": "number", + "FLOAT": "number", + "NUMERIC": "number", + "BIGNUMERIC": "number", + "BOOL": "boolean", + "BOOLEAN": "boolean", + "STRING": "string", + "BYTES": "string", + "DATE": "date", + "TIME": "time", + "DATETIME": "datetime", + "TIMESTAMP": "datetime", + "JSON": "object", +} + + +def frictionless_type_from_bigquery(bq_type: str | None) -> str: + """Map a BigQuery column type back to a Frictionless type name. + + Unknown or absent types collapse to `string` so a newer BigQuery + type (e.g. `RANGE`) is still surfaced as a usable column instead + of breaking the response. 
+ """ + if not bq_type: + return "string" + return BIGQUERY_TO_FRICTIONLESS.get(bq_type.upper(), "string") diff --git a/datastore/infrastructure/engines/registry.py b/datastore/infrastructure/engines/registry.py index 22a582b..9eb489e 100644 --- a/datastore/infrastructure/engines/registry.py +++ b/datastore/infrastructure/engines/registry.py @@ -16,6 +16,7 @@ if TYPE_CHECKING: # type-only — no runtime import from api/ from datastore.api.context import RequestContext + from datastore.core.config import Config Mode = Literal["rw", "ro"] @@ -61,6 +62,56 @@ def get_allowed_sql_functions( return frozenset(names) + +_INSTANCES: dict[tuple[str, Mode], DatastoreBackend] = {} + +def _build_engine( + engine: str, + mode: Mode, + *, + config: Config, + context: RequestContext | None = None, +) -> DatastoreBackend: + """Import the engine package and instantiate its `Backend` class. + + Engine packages expose a `Backend` symbol pointing at their concrete + `DatastoreBackend` subclass (e.g. `BigQueryBackend`, future + `DucklakeBackend`, `PostgresBackend`). Decoupling the registry from + any specific class name keeps the dispatch engine-agnostic — adding + a new backend is a folder drop with a `Backend = …` re-export. + """ + try: + module = importlib.import_module( + f"datastore.infrastructure.engines.{engine}" + ) + except ImportError as e: + raise NotImplementedError( + f"engine package not available: {engine!r}" + ) from e + + backend_cls = getattr(module, "Backend", None) + if backend_cls is None: + raise NotImplementedError( + f"engine {engine!r} has no `Backend` export — engine packages " + ) + backend = backend_cls(context=context, config=config, mode=mode) + backend.initialize() + return backend + + +def warmup_engines(config: Config) -> None: + """Build + initialise rw and ro engine instances. 
Called from the + FastAPI lifespan so credential errors surface at startup.""" + engine = config.DATASTORE_ENGINE + for mode in ("rw", "ro"): + _INSTANCES[(engine, mode)] = _build_engine(engine, mode, config=config) + + +def reset_engine_cache() -> None: + """Drop cached instances. Used by lifespan teardown + test fixtures.""" + _INSTANCES.clear() + + def get_datastore_engine( context: RequestContext, *, @@ -83,25 +134,11 @@ def get_datastore_engine( without re-resolving them. The factory itself only reads `context.config.DATASTORE_ENGINE` to pick a class. - Dispatch is dynamic: it imports `datastore.infrastructure.engines.` - and uses the package's exported `BigQueryBackend` (legacy name kept - across engines for convention). Adding a new engine = drop in a new - package; no edit here required. Engines that exist locally but are - gitignored (e.g. a test variant) work the same way. """ engine = context.config.DATASTORE_ENGINE - try: - module = importlib.import_module( - f"datastore.infrastructure.engines.{engine}" - ) - except ImportError as e: - raise NotImplementedError( - f"engine package not available: {engine!r}" - ) from e - - backend_cls = getattr(module, "BigQueryBackend", None) - if backend_cls is None: - raise NotImplementedError( - f"engine {engine!r} has no `BigQueryBackend` export" + key = (engine, mode) + if key not in _INSTANCES: + _INSTANCES[key] = _build_engine( + engine, mode, config=context.config, context=context ) - return backend_cls(context=context, mode=mode) + return _INSTANCES[key] diff --git a/datastore/main.py b/datastore/main.py index ea0a71b..1bbb6f0 100644 --- a/datastore/main.py +++ b/datastore/main.py @@ -12,9 +12,14 @@ from datastore.api.middleware import BodySizeLimitMiddleware from datastore.api.responses import ORJSONResponse from datastore.api.routes import api_router +from datastore.auth.registry import get_auth_provider from datastore.core.config import get_config from datastore.infrastructure.cache import InMemoryCache, 
RedisCache from datastore.infrastructure.ckan_client import CKANClient +from datastore.infrastructure.engines.registry import ( + reset_engine_cache, + warmup_engines, +) log = logging.getLogger("uvicorn.error") @@ -30,22 +35,32 @@ async def lifespan(app: FastAPI) -> AsyncIterator[None]: httpx.AsyncClient(timeout=config.HTTP_TIMEOUT_SECONDS) ) app.state.http = http - app.state.ckan = CKANClient(base_url=config.CKAN_URL, http=http) + ckan: CKANClient | None = ( + CKANClient(base_url=config.CKAN_URL, http=http) + if config.AUTH_TYPE == "ckan" + else None + ) + app.state.ckan = ckan cache = RedisCache(config.REDIS_URL) if config.REDIS_URL else InMemoryCache() if hasattr(cache, "close"): stack.push_async_callback(cache.close) app.state.cache = cache + + app.state.auth_provider = get_auth_provider( + config, ckan=ckan, cache=cache, cache_ttl=config.AUTH_CACHE_TTL, + ) + + # Build + initialise rw/ro engines once; surface credential + # errors at startup, not on the first request. + warmup_engines(config) + stack.callback(reset_engine_cache) - # One-line startup summary so operators can see the active engine - # and the related toggles at a glance. Goes through Python's - # `logging`, inheriting uvicorn's INFO-level root config. 
log.info( - "datastore ready: engine=%r auth=%s cache=%s sql_allow_file=%s", + "datastore ready: Engine=%r Auth=%r Cache=%s", config.DATASTORE_ENGINE, - "on" if config.AUTH_ENABLED else "off", + config.AUTH_TYPE, "redis" if config.REDIS_URL else "memory", - config.SQL_FUNCTIONS_ALLOW_FILE or "default", ) yield diff --git a/datastore/schemas/request.py b/datastore/schemas/request.py index d88575a..b14bc24 100644 --- a/datastore/schemas/request.py +++ b/datastore/schemas/request.py @@ -1,7 +1,7 @@ from __future__ import annotations import re -from typing import Any, Literal +from typing import Annotated, Any, Literal from pydantic import ( BaseModel, @@ -15,9 +15,12 @@ from datastore.schemas.validators import ( FieldSpec, StringOrList, + fields_to_frictionless_schema, + parse_sql_pagination, parse_sql_references, to_json_object, to_str_or_json_object, + validate_frictionless_schema, ) UpsertMethod = Literal["upsert", "insert", "update"] @@ -26,19 +29,39 @@ class DatastoreCreateRequest(BaseModel): """Request body for `POST /api/3/datastore_create`. + + Column definitions: provide either the legacy `fields` shape (a list of + `FieldSpec` objects) **or** a Frictionless Table Schema via `schema`, + never both. The Frictionless form is the native shape; `fields` is kept + as a back-compat input and will be deprecated. + + When `schema` is supplied, `primary_key` must not be — the schema's + `primaryKey` is the single source of truth for the unique key. """ model_config = ConfigDict(extra="forbid") resource_id: str | None = None resource: dict[str, Any] | None = None - fields: list[FieldSpec] = Field(min_length=1) - primary_key: StringOrList = None + # `deprecated=` must ride on `Annotated` metadata, not as a `Field()` + # default — Pydantic silently drops it on union- / Annotated-aliased + # fields when supplied via `Field(default=..., deprecated=...)`. 
+ fields: Annotated[ + list[FieldSpec] | None, + Field(deprecated="use 'schema' (Frictionless Table Schema) instead"), + ] = None + schema: dict[str, Any] | None = None + primary_key: Annotated[ + StringOrList, + Field(deprecated="use 'schema.primaryKey' instead"), + ] = None records: list[dict[str, Any]] | None = None include_records: bool = False include_total: bool = False force: bool | None = None + _check_schema = field_validator("schema")(validate_frictionless_schema) + @model_validator(mode="after") def _require_resource_id_or_resource(self) -> DatastoreCreateRequest: if self.resource_id is None and self.resource is None: @@ -47,10 +70,48 @@ def _require_resource_id_or_resource(self) -> DatastoreCreateRequest: raise ValueError("provide either 'resource_id' or 'resource', not both") return self + @model_validator(mode="after") + def _require_fields_or_schema(self) -> DatastoreCreateRequest: + # Read deprecated fields via __dict__ so we don't trip our own + # `Field(deprecated=...)` DeprecationWarning during validation. + fields_val = self.__dict__.get("fields") + primary_key_val = self.__dict__.get("primary_key") + has_fields = fields_val is not None + has_schema = self.schema is not None + if has_fields and has_schema: + raise ValueError( + "provide either 'fields' (legacy) or 'schema' (frictionless), not both" + ) + if not has_fields and not has_schema: + raise ValueError("either 'fields' or 'schema' is required") + if has_fields and len(fields_val or []) == 0: + raise ValueError("'fields' must not be empty") + if has_schema and primary_key_val: + raise ValueError( + "'primary_key' is not allowed with 'schema'; use the schema's 'primaryKey' instead" + ) + return self + + @model_validator(mode="after") + def _build_canonical_schema(self) -> DatastoreCreateRequest: + """Fold legacy `fields` + `primary_key` into the canonical `schema`. 
+ + After this validator `self.schema` is always populated, so the + endpoint / service only ever read the Frictionless shape — the + legacy inputs exist purely as a boundary back-compat surface. + Runs after `_require_fields_or_schema`, so we know exactly one + of {fields, schema} is set. + """ + if self.schema is not None: + return self + fields_val = self.__dict__.get("fields") or [] + primary_key_val = self.__dict__.get("primary_key") or [] + self.__dict__["schema"] = fields_to_frictionless_schema(fields_val, primary_key_val) + return self + class DatastoreUpsertRequest(BaseModel): - """Request body for `POST /api/3/datastore_upsert`. - """ + """Request body for `POST /api/3/datastore_upsert`.""" model_config = ConfigDict(extra="forbid") @@ -89,7 +150,9 @@ class DatastoreSearchRequest(BaseModel): distinct: bool = False plain: bool = True language: str = "english" - limit: int = Field(default=100, ge=0, le=32000) + # Engine enforces `Config.SEARCH_RESULT_ROWS_MAX` (default 32000). + # No `le` here so ops can lift the cap via env without a schema change. + limit: int = Field(default=100, ge=0) offset: int = Field(default=0, ge=0) fields: str | None = None sort: str | None = None @@ -142,6 +205,8 @@ class DatastoreSearchSQLRequest(BaseModel): # the read-only properties below give callers a clean attribute. 
_resource_ids: list[str] = PrivateAttr(default_factory=list) _function_names: list[str] = PrivateAttr(default_factory=list) + _limit: int = PrivateAttr(default=0) + _offset: int = PrivateAttr(default=0) @property def resource_ids(self) -> list[str]: @@ -155,6 +220,16 @@ def function_names(self) -> list[str]: against `core.constants.ALLOWED_SQL_FUNCTIONS`.""" return self._function_names + @property + def limit(self) -> int: + """`LIMIT` literal parsed from the SQL — required.""" + return self._limit + + @property + def offset(self) -> int: + """`OFFSET` literal parsed from the SQL (0 when absent).""" + return self._offset + @field_validator("sql") @classmethod def _check_sql_is_select(cls, v: str) -> str: @@ -179,9 +254,7 @@ def _check_sql_is_select(cls, v: str) -> str: head = _SQL_COMMENT_RE.sub("", stripped).strip() if not _SQL_LEAD_RE.match(head): - raise ValueError( - "only SELECT / WITH statements are allowed" - ) + raise ValueError("only SELECT / WITH statements are allowed") cleaned = stripped.rstrip(";").rstrip() if ";" in cleaned: @@ -194,12 +267,88 @@ def _check_sql_is_select(cls, v: str) -> str: @model_validator(mode="after") def _extract_sql_references(self) -> DatastoreSearchSQLRequest: - """Parse `sql` via sqlglot and stash table + function names. + """Parse `sql` via sqlglot and stash table + function names + + the LIMIT/OFFSET literals. Runs after `_check_sql_is_select`, so we know we have a single SELECT / WITH. CTE aliases are excluded from `_resource_ids` - (they're defined inline, not external tables). + (they're defined inline, not external tables). LIMIT is + required — the service uses it to build pagination links and + to cap the streaming response; missing LIMIT raises a clean + ValidationError up front. 
""" self._resource_ids, self._function_names = parse_sql_references(self.sql) + self._limit, self._offset = parse_sql_pagination(self.sql) return self + +class DatastoreInfoRequest(BaseModel): + """Query parameters for `GET /api/3/datastore_info`. + + Accepts either `resource_id` or `id` — they're aliases for the same + thing (CKAN's `id` is historical; `resource_id` is what the rest of + this API uses). Exactly one must be provided. The model_validator + normalises `id` → `resource_id` so downstream code only reads one + field. `extra="forbid"` so unknown params surface as 400s. + """ + + model_config = ConfigDict(extra="forbid") + + resource_id: str | None = None + id: str | None = None + + @model_validator(mode="after") + def _require_resource_id_or_id(self) -> DatastoreInfoRequest: + if self.resource_id is None and self.id is None: + raise ValueError("either 'resource_id' or 'id' is required") + if ( + self.resource_id is not None + and self.id is not None + and self.resource_id != self.id + ): + raise ValueError( + "'resource_id' and 'id' both provided with different " + "values; send exactly one" + ) + if self.resource_id is None: + self.resource_id = self.id + return self + + +class DatastoreDeleteRequest(BaseModel): + """Request body for `POST /api/3/datastore_delete`. Drops the + whole table when both `filters` and `fields` are omitted; row + delete when `filters` is set; column drop when `fields` is set. 
+ `filters` and `fields` are mutually exclusive.""" + + model_config = ConfigDict(extra="forbid") + + resource_id: str | None = None + id: str | None = None + filters: dict[str, Any] | None = None + fields: list[str] | None = None + force: bool = False + + @model_validator(mode="after") + def _require_resource_id_or_id(self) -> DatastoreDeleteRequest: + if self.resource_id is None and self.id is None: + raise ValueError("either 'resource_id' or 'id' is required") + if ( + self.resource_id is not None + and self.id is not None + and self.resource_id != self.id + ): + raise ValueError( + "'resource_id' and 'id' both provided with different " + "values; send exactly one" + ) + if self.resource_id is None: + self.resource_id = self.id + if self.filters is not None and self.fields: + raise ValueError( + "'filters' and 'fields' are mutually exclusive — " + "rows and columns are separate delete operations" + ) + if self.fields is not None and not self.fields: + raise ValueError("'fields' must list at least one column") + return self diff --git a/datastore/schemas/responses.py b/datastore/schemas/responses.py index 602794b..a7f5108 100644 --- a/datastore/schemas/responses.py +++ b/datastore/schemas/responses.py @@ -11,7 +11,7 @@ from __future__ import annotations -from typing import Any +from typing import Annotated, Any from pydantic import BaseModel, ConfigDict, Field @@ -48,14 +48,32 @@ class Result(BaseModel): # --- datastore -------------------------------------------------------------- + class DatastoreCreateResponse(ResponseModel): - """Response for `POST /api/3/datastore_create`.""" + """Response for `POST /api/3/datastore_create`. + + Returns both column shapes so clients on either side of the migration + see the form they expect: + - `fields` is the legacy `{id, type, info}` shape. + - `schema` is the Frictionless Table Schema (`{fields, + primaryKey, ...}`). + Both describe the same columns; they're derived from whichever the + caller supplied. 
Legacy `fields` will be removed once callers move + over to `schema`. + """ class Result(BaseModel): resource_id: str package_id: str | None = None - fields: list[FieldSpec] - primary_key: list[str] = Field(default_factory=list) + fields: Annotated[ + list[FieldSpec], + Field(deprecated="use 'schema' (Frictionless Table Schema) instead"), + ] + schema: dict[str, Any] + primary_key: Annotated[ + list[str], + Field(deprecated="use 'schema.primaryKey' (Frictionless Table Schema) instead"), + ] # Echoed input rows when the request set `include_records=True`. records: list[dict[str, Any]] | None = None # Total row count after the write — set only when `include_total=True`. @@ -76,8 +94,20 @@ class Result(BaseModel): result: Result +class DatastoreDeleteResponse(ResponseModel): + """Response for `POST /api/3/datastore_delete`.""" + + class Result(BaseModel): + resource_id: str + filters: dict[str, Any] | None = None + fields: list[str] | None = None + + result: Result + + class DatastoreSearchResponse(ResponseModel): - """Response for `GET /api/3/datastore_search`.""" + """Response for `GET /api/3/datastore_search` + """ class Result(BaseModel): # `_links` starts with an underscore, which pydantic treats as a @@ -85,13 +115,46 @@ class Result(BaseModel): model_config = ConfigDict(populate_by_name=True) resource_id: str - fields: list[dict[str, Any]] + # Only set for `datastore_search_sql`: the original SQL string + # echoed back so callers can confirm what ran (especially after + # `_links.next` rewrites the OFFSET). 
+ sql: str | None = None + schema: dict[str, Any] + fields: Annotated[ + list[dict[str, Any]], + Field(deprecated="use 'schema' (Frictionless Table Schema) instead"), + ] records: list[dict[str, Any]] limit: int offset: int total: int | None = None - links: dict[str, str] = Field( - alias="_links", default_factory=dict - ) + # Carries URL strings (`start` / `prev` / `next`) plus integer + # page counters (`page` / `total_pages`); typed as `Any` for + # OpenAPI accuracy. + links: dict[str, Any] = Field(alias="_links", default_factory=dict) + + result: Result + + +class DatastoreInfoResponse(ResponseModel): + """Response for `GET /api/3/datastore_info`. + + Returns column metadata in both shapes so clients on either side of + the migration see what they expect: + - `schema` is the canonical Frictionless Table Schema. + - `fields` is the legacy `{id, type, info}` list (marked + `deprecated`). + `meta` is a free-form dict that engines populate with whatever extras + they expose (row count, table size, last-modified, …) — piped + through verbatim so adding a new key doesn't need a schema change. + """ + + class Result(BaseModel): + meta: dict[str, Any] + schema: dict[str, Any] + fields: Annotated[ + list[dict[str, Any]], + Field(deprecated="use 'schema' (Frictionless Table Schema) instead"), + ] result: Result diff --git a/datastore/schemas/validators.py b/datastore/schemas/validators.py index 5f9f76c..b04f8aa 100644 --- a/datastore/schemas/validators.py +++ b/datastore/schemas/validators.py @@ -1,14 +1,17 @@ -"""Reusable Pydantic validators and schema parts. 
-""" +"""Reusable Pydantic validators and schema parts.""" from __future__ import annotations import json from typing import Annotated, Any -from pydantic import BaseModel, BeforeValidator, ConfigDict +from pydantic import BaseModel, BeforeValidator, ConfigDict, field_validator -from datastore.core.constants import POSTGRES_TYPES +from datastore.core.constants import ( + FRICTIONLESS_TO_POSTGRES, + POSTGRES_TO_FRICTIONLESS, + POSTGRES_TYPES, +) # --- validator functions ----------------------------------------------------- @@ -41,8 +44,7 @@ def check_postgres_type(value: Any) -> str | None: if canonical is None: canonicals = sorted(set(POSTGRES_TYPES.values())) raise ValueError( - f"unknown field type '{value}'; " - f"expected one of {canonicals} or a PostgreSQL alias" + f"unknown field type '{value}'; expected one of {canonicals} or a PostgreSQL alias" ) return canonical @@ -89,9 +91,155 @@ def to_csv_list(value: Any) -> list[str] | None: raise ValueError("must be a comma-separated string or list of strings") -def parse_sql_references( - sql: str, *, dialect: str = "postgres" -) -> tuple[list[str], list[str]]: +def fields_to_frictionless_schema( + fields: list[Any], primary_key: list[str] | None = None +) -> dict[str, Any]: + """Convert the legacy `fields` + `primary_key` shape into a Frictionless + Table Schema descriptor. + + Each `FieldSpec` becomes a Frictionless field: + - `id` → `name` + - `type` (Postgres canonical) → `type` (Frictionless), via + `POSTGRES_TO_FRICTIONLESS`. Unknown types fall through to `string`. + - `info` is unpacked: recognised keys (`title`, `description`) move + to top-level Frictionless properties; the rest stays nested + under a custom `info` key so `frictionless_schema_to_fields` + can round-trip the data dictionary intact (previously these + extras were spread onto the field and silently lost on the + reverse path). + + `primary_key` becomes the schema's `primaryKey` (Frictionless naming). 
+ """ + fr_fields: list[dict[str, Any]] = [] + for f in fields: + spec = f.model_dump(exclude_none=True) if hasattr(f, "model_dump") else dict(f) + fr: dict[str, Any] = {"name": spec["id"]} + pg_type = spec.get("type") + if pg_type: + fr["type"] = POSTGRES_TO_FRICTIONLESS.get(pg_type, "string") + info = spec.get("info") or {} + extra: dict[str, Any] = {} + for k, v in info.items(): + # `info.type` is treated as a hint and dropped — the outer + # canonical type already lives on `fr["type"]`, and letting an + # info-side `type` ride along would only duplicate or contradict + # it inside the nested `info` extras. + if k == "type": + continue + if k in ("title", "description") and isinstance(v, str): + fr[k] = v + else: + extra[k] = v + if extra: + fr["info"] = extra  # nested (not spread) so frictionless_schema_to_fields can restore it and `name`/`type` can't be overwritten + fr_fields.append(fr) + + schema: dict[str, Any] = {"fields": fr_fields} + if primary_key: + schema["primaryKey"] = list(primary_key) + return schema + + +def frictionless_schema_to_fields( + schema: dict[str, Any], +) -> tuple[list[dict[str, Any]], list[str]]: + """Inverse of `fields_to_frictionless_schema`. + + Returns `(fields, primary_key)` where `fields` matches the legacy + `{id, type, info}` shape. Frictionless `name` → `id`; the field's + Frictionless type is mapped back to Postgres via + `FRICTIONLESS_TO_POSTGRES` (defaults to `text`). `title` / + `description` on the field move into `info`; any extras saved + under `info` are merged back in. + + `primaryKey` may be a string or list of strings in Frictionless; + normalised to `list[str]`. 
+ """ + fields_out: list[dict[str, Any]] = [] + for fr in schema.get("fields", []): + name = fr.get("name") + if not name: + continue + out: dict[str, Any] = {"id": name} + fr_type = fr.get("type") + if fr_type: + out["type"] = FRICTIONLESS_TO_POSTGRES.get(fr_type, "text") + info: dict[str, Any] = {} + for k in ("title", "description"): + v = fr.get(k) + if isinstance(v, str): + info[k] = v + extra = fr.get("info") + if isinstance(extra, dict): + info.update(extra) + if info: + out["info"] = info + fields_out.append(out) + + pk = schema.get("primaryKey") + if isinstance(pk, str): + primary_key = [pk] + elif isinstance(pk, list): + primary_key = [str(x) for x in pk] + else: + primary_key = [] + return fields_out, primary_key + + +def validate_frictionless_schema(value: Any) -> dict[str, Any] | None: + """Validate a Frictionless Table Schema descriptor against this + repo's stricter contract. + + Pass-through `None`. Otherwise: + 1. `frictionless.Schema.from_descriptor` validates the descriptor + shape (raises on missing `fields`, unknown field type, etc.). + 2. Field types must be in `ALLOWED_FRICTIONLESS_TYPES` — wider + Frictionless vocabulary (e.g. `duration`, `year`, `yearmonth`) + is rejected here so storage layout stays predictable and the + engine type maps don't grow ad-hoc. + 3. Field names must not collide with engine-reserved system + columns (`_id`, `_updated_at`). Silently dropping them would + leave the response advertising a column the engine won't + populate. + + Any failure raises `ValueError` so Pydantic surfaces it through + the standard CKAN error envelope. 
+ """ + if value is None: + return None + if not isinstance(value, dict): + raise ValueError("schema must be a JSON object") + + from frictionless import Schema + from frictionless.exception import FrictionlessException + + from datastore.core.constants import ( + ALLOWED_FRICTIONLESS_TYPES, + RESERVED_SYSTEM_COLUMN_NAMES, + ) + + try: + Schema.from_descriptor(value) + except FrictionlessException as exc: + raise ValueError(str(exc)) from exc + + for f in value.get("fields", []) or []: + name = f.get("name") + if name in RESERVED_SYSTEM_COLUMN_NAMES: + raise ValueError( + f"field name {name!r} is reserved for engine-managed " + "system columns; rename the field" + ) + ftype = f.get("type") + if ftype is not None and ftype not in ALLOWED_FRICTIONLESS_TYPES: + raise ValueError( + f"field {name!r} has unsupported type {ftype!r}; " + f"allowed: {sorted(ALLOWED_FRICTIONLESS_TYPES)}" + ) + return value + + +def parse_sql_references(sql: str, *, dialect: str = "postgres") -> tuple[list[str], list[str]]: """Parse `sql` and return (table_names, function_names). Used by `datastore_search_sql` to: @@ -123,15 +271,8 @@ def parse_sql_references( # CTE aliases (e.g. `WITH t AS (...) SELECT * FROM t`) parse as # `exp.Table` nodes even though they're defined inline — exclude them # so auth isn't called for non-external table refs. 
- cte_aliases = { - cte.alias_or_name - for cte in tree.find_all(exp.CTE) - if cte.alias_or_name - } - tables = { - t.name for t in tree.find_all(exp.Table) - if t.name and t.name not in cte_aliases - } + cte_aliases = {cte.alias_or_name for cte in tree.find_all(exp.CTE) if cte.alias_or_name} + tables = {t.name for t in tree.find_all(exp.Table) if t.name and t.name not in cte_aliases} functions: set[str] = set() for f in tree.find_all(exp.Func): @@ -149,6 +290,81 @@ def parse_sql_references( return sorted(tables), sorted(functions) +def parse_sql_pagination( + sql: str, *, dialect: str = "postgres", +) -> tuple[int, int]: + """Extract `(limit, offset)` from a SELECT. LIMIT is required. + + `datastore_search_sql` lets callers ship raw SQL but the API still + wants page metadata + links — so we parse the LIMIT/OFFSET out of + the user's statement and use those for `result.limit`, `offset`, + `page`, `total_pages`, and the prev/next links. Missing LIMIT + raises `ValueError`; callers should be explicit so the server can + paginate properly and so an unbounded SELECT can't lock streaming + open. + + OFFSET defaults to 0 when absent. Non-integer LIMIT/OFFSET + expressions (e.g. `LIMIT @x`) raise too — pagination needs a + constant. + """ + import sqlglot + from sqlglot import expressions as exp + + try: + tree = sqlglot.parse_one(sql, dialect=dialect) + except Exception as e: + raise ValueError(f"could not parse SQL: {e}") from e + + limit_node = tree.args.get("limit") + if limit_node is None: + raise ValueError( + "SQL must include a LIMIT clause (e.g. 
'LIMIT 100'); " + "an explicit page size is required for pagination links " + "and to prevent unbounded SELECTs" + ) + limit_expr = limit_node.expression if isinstance(limit_node, exp.Limit) else None + if not isinstance(limit_expr, exp.Literal) or not limit_expr.is_int: + raise ValueError( + "LIMIT must be a constant integer literal" + ) + limit = int(limit_expr.this) + if limit < 0: + raise ValueError("LIMIT must be >= 0") + + offset = 0 + offset_node = tree.args.get("offset") + if offset_node is not None: + offset_expr = ( + offset_node.expression + if isinstance(offset_node, exp.Offset) else None + ) + if not isinstance(offset_expr, exp.Literal) or not offset_expr.is_int: + raise ValueError( + "OFFSET must be a constant integer literal" + ) + offset = int(offset_expr.this) + if offset < 0: + raise ValueError("OFFSET must be >= 0") + + return limit, offset + + +def rewrite_sql_offset( + sql: str, new_offset: int, *, dialect: str = "postgres", +) -> str: + """Return `sql` with its OFFSET replaced (or inserted) at `new_offset`. + + Used by the search_sql link builder to produce prev / next URLs + without asking the caller to rebuild the SQL themselves. 
+ """ + import sqlglot + from sqlglot import expressions as exp + + tree = sqlglot.parse_one(sql, dialect=dialect) + tree.set("offset", exp.Offset(expression=exp.Literal.number(new_offset))) + return tree.sql(dialect=dialect) + + # --- reusable Annotated types ------------------------------------------------ # The parser functions above (`to_json_object`, `to_str_or_json_object`, # `to_csv_list`) are invoked directly at the service boundary; they don't @@ -173,3 +389,14 @@ class FieldSpec(BaseModel): id: str type: PostgresType = None info: dict[str, Any] | None = None + + @field_validator("id") + @classmethod + def _check_not_reserved(cls, v: str) -> str: + from datastore.core.constants import RESERVED_SYSTEM_COLUMN_NAMES + if v in RESERVED_SYSTEM_COLUMN_NAMES: + raise ValueError( + f"field id {v!r} is reserved for engine-managed system " + "columns; rename the field" + ) + return v diff --git a/datastore/services/read.py b/datastore/services/read.py index 23e8d73..186bf7c 100644 --- a/datastore/services/read.py +++ b/datastore/services/read.py @@ -7,7 +7,10 @@ from datastore.core.exceptions import ValidationError from datastore.infrastructure.engines import get_datastore_engine from datastore.infrastructure.engines.registry import get_allowed_sql_functions +from datastore.schemas.responses import DatastoreInfoResponse from datastore.schemas.validators import ( + frictionless_schema_to_fields, + rewrite_sql_offset, to_csv_list, to_json_object, to_str_or_json_object, @@ -53,6 +56,14 @@ async def search_datastore( The returned iterator pulls rows from the engine one at a time; peak memory ≈ 1 row regardless of result size. 
""" + max_limit = context.config.SEARCH_RESULT_ROWS_MAX + if data_dict["limit"] > max_limit: + raise ValidationError( + f"limit greater than {max_limit} is not allowed; " + "paginate with `offset` to fetch more rows", + fields={"limit": [f"must be <= {max_limit}"]}, + ) + engine = get_datastore_engine(context, mode="ro") result = engine.search( resource_id=data_dict["resource_id"], @@ -68,10 +79,13 @@ async def search_datastore( include_total=data_dict["include_total"], ) + fields, _ = frictionless_schema_to_fields(result.schema) + envelope_kwargs = dict( help_url=request_url, resource_id=data_dict["resource_id"], - fields=result.fields, + schema=result.schema, + fields=fields, records=result.records, limit=data_dict["limit"], offset=data_dict["offset"], @@ -81,73 +95,140 @@ async def search_datastore( request_url, limit=data_dict["limit"], offset=data_dict["offset"], + total=result.total, ), ) return _WRITERS[data_dict["records_format"]](**envelope_kwargs) -_SQL_DEFAULT_LIMIT = 32000 - - async def search_sql_datastore( context: RequestContext, data_dict: dict[str, Any], *, request_url: str, ) -> Iterator[bytes]: - """Run a raw SQL SELECT and stream the result. - - Reuses the `datastore_search` writer + envelope so the response shape - is identical to `datastore_search`. Pagination is the caller's job - (edit the SQL); the envelope's `_links` / `limit` / `offset` / - `resource_id` fields are kept for shape parity, with no-op defaults. - - `data_dict` carries `{"sql": ..., "function_names": [...]}`. The - endpoint already handles per-table CKAN authorize (using the schema's - `resource_ids`); this layer handles the engine-specific function - allow-list — `mode="ro"` selects read-only credentials so writes - can't happen even if a function slips through. + """Run a vetted SELECT and stream the result. 
+ + `data_dict` carries `sql` + `function_names` + the `limit` / + `offset` parsed out of the SQL itself (LIMIT is required by the + request schema; OFFSET defaults to 0). The service: + + - rejects function calls outside the engine's allow-list, + - rejects any LIMIT above `Config.SEARCH_RESULT_ROWS_MAX`, + - dispatches to the read-only engine (mode="ro" — RO credentials + are the load-bearing safety), + - builds CKAN-style pagination links by rewriting the SQL's + OFFSET so callers can follow `_links.next` / `prev` without + re-editing their SQL. """ allowed = get_allowed_sql_functions( context.config.DATASTORE_ENGINE, override_path=context.config.SQL_FUNCTIONS_ALLOW_FILE, ) disallowed = sorted(set(data_dict.get("function_names", [])) - allowed) - if disallowed: raise ValidationError( f"sql uses disallowed function(s): {', '.join(disallowed)}", fields={"sql": [f"disallowed: {', '.join(disallowed)}"]}, ) + limit = data_dict["limit"] + offset = data_dict["offset"] + max_limit = context.config.SEARCH_RESULT_ROWS_MAX + if limit > max_limit: + raise ValidationError( + f"LIMIT greater than {max_limit} is not allowed; " + "paginate with OFFSET to fetch more rows", + fields={"sql": [f"LIMIT must be <= {max_limit}"]}, + ) + engine = get_datastore_engine(context, mode="ro") - result = engine.search_sql( - sql=data_dict["sql"], limit=_SQL_DEFAULT_LIMIT - ) + result = engine.search_sql(sql=data_dict["sql"], limit=limit) + fields, _ = frictionless_schema_to_fields(result.schema) return stream_objects( help_url=request_url, resource_id="", - fields=result.fields, + schema=result.schema, + fields=fields, records=result.records, - limit=_SQL_DEFAULT_LIMIT, - offset=0, - total=None, - include_total=False, - links=_build_pagination_links( - request_url, limit=_SQL_DEFAULT_LIMIT, offset=0 + limit=limit, + offset=offset, + total=result.total, + include_total=result.total is not None, + links=_build_sql_pagination_links( + request_url, + sql=data_dict["sql"], + limit=limit, + 
offset=offset, + total=result.total, ), + # Echo the original SQL on the response so callers can confirm + # what actually ran (especially after `_links.next` rewrites + # the OFFSET on follow-up requests). + sql=data_dict["sql"], + ) + + +async def info_datastore( + context: RequestContext, data_dict: dict[str, Any] +) -> DatastoreInfoResponse.Result: + """Look up table metadata for a single `resource_id`. + + Endpoint authorizes the caller first (same gate as `search`). This + service just asks the read-only engine for its `InfoResult` and + re-shapes it as the response's typed `Result`. No streaming — + `info` responses are small enough for the standard `_success_response` + path. + """ + engine = get_datastore_engine(context, mode="ro") + result = engine.info(resource_id=data_dict["resource_id"]) + fields, _ = frictionless_schema_to_fields(result.schema) + return DatastoreInfoResponse.Result( + meta=result.meta, + schema=result.schema, + fields=fields, ) def _build_pagination_links( - url: str, *, limit: int, offset: int -) -> dict[str, str]: - """CKAN-style pagination links. + url: str, + *, + limit: int, + offset: int, + total: int | None = None, +) -> dict[str, Any]: + """CKAN-style pagination links + page counters. + + URL keys: + - ``start`` — always emitted, with ``offset`` stripped (defaults to 0). + - ``prev`` — only when a previous page exists (``offset > 0``); + lands at ``max(0, offset - limit)`` so paging back from a partial + first page clamps to 0 rather than going negative. + - ``next`` — only when a next page exists (``total`` known and + ``offset + limit < total``). When ``total`` is None (caller + didn't ask for `include_total`, or this is a raw-SQL call) we + can't tell, so ``next`` is omitted; the client detects end-of- + data via an empty `records` array. 
+ + Counter keys (added alongside the URL keys, 1-indexed): + - ``page_size`` — rows per page = ``limit``; emitted whenever + ``limit > 0`` (a UI can always render it, even on empty pages). + - ``page`` — current page = ``offset // limit + 1``. + - ``total_pages`` — ``ceil(total / limit)``; omitted when total + is unknown. - `start` strips `offset` (it defaults to 0). `next` appends - `offset = offset + limit`. All other params ride along on both - links so the caller can paginate without re-assembling the URL. + ``page`` and ``total_pages`` are dropped whenever the current page + has no rows — either because the resource is empty + (``total == 0``) or because the caller paged past the end + (``offset >= total > 0``). Reporting ``page=5 / total_pages=4`` + would be incoherent (no such page exists); the absence + the + empty `records` array + `prev` are what let a UI recover. When + ``total`` is unknown (caller didn't request `include_total`) we + keep ``page`` since position is still meaningful for single-page + pickers. + + All non-`offset` query params ride along on every emitted URL. 
Scheme + host are preserved from the input URL when present, so `http://host/path?x=1` produces `http://host/path?...` links and @@ -157,8 +238,7 @@ def _build_pagination_links( """ parsed = urlparse(url) pairs = parse_qsl(parsed.query, keep_blank_values=True) - start_pairs = [(k, v) for k, v in pairs if k != "offset"] - next_pairs = start_pairs + [("offset", str(offset + limit))] + base_pairs = [(k, v) for k, v in pairs if k != "offset"] def _qs(pairs: list[tuple[str, str]]) -> str: return urlunparse(( @@ -166,4 +246,73 @@ def _qs(pairs: list[tuple[str, str]]) -> str: "", urlencode(pairs), "", )) - return {"start": _qs(start_pairs), "next": _qs(next_pairs)} + out: dict[str, Any] = {"start": _qs(base_pairs)} + if offset > 0: + prev_offset = max(0, offset - limit) + out["prev"] = _qs(base_pairs + [("offset", str(prev_offset))]) + has_next = ( + limit > 0 and total is not None and offset + limit < total + ) + if has_next: + out["next"] = _qs(base_pairs + [("offset", str(offset + limit))]) + if limit > 0: + out["page_size"] = limit + # Drop `page` / `total_pages` whenever the current page has no + # rows: empty resource or past-end pagination. `total is None` + # means "unknown → assume there might be rows" so we still emit + # `page`. + has_rows_on_page = total is None or (total > 0 and offset < total) + if limit > 0 and has_rows_on_page: + out["page"] = offset // limit + 1 + if total is not None: + # ceil division without importing math + out["total_pages"] = (total + limit - 1) // limit + return out + + +def _build_sql_pagination_links( + url: str, + *, + sql: str, + limit: int, + offset: int, + total: int | None, +) -> dict[str, Any]: + """Pagination links for `datastore_search_sql`. + + Same presence rules as `_build_pagination_links`, but the LIMIT / + OFFSET live inside the user's SQL — so we can't just bump the + `offset` query param. Each emitted URL carries a rewritten copy + of `sql` with a new OFFSET literal (LIMIT is preserved exactly). 
+ """ + parsed = urlparse(url) + base_pairs = [ + (k, v) + for k, v in parse_qsl(parsed.query, keep_blank_values=True) + if k != "sql" + ] + + def _link_for(target_offset: int) -> str: + new_sql = rewrite_sql_offset(sql, target_offset) + pairs = base_pairs + [("sql", new_sql)] + return urlunparse(( + parsed.scheme, parsed.netloc, parsed.path, + "", urlencode(pairs), "", + )) + + out: dict[str, Any] = {"start": _link_for(0)} + if offset > 0: + out["prev"] = _link_for(max(0, offset - limit)) + has_next = ( + limit > 0 and total is not None and offset + limit < total + ) + if has_next: + out["next"] = _link_for(offset + limit) + if limit > 0: + out["page_size"] = limit + has_rows_on_page = total is None or (total > 0 and offset < total) + if limit > 0 and has_rows_on_page: + out["page"] = offset // limit + 1 + if total is not None: + out["total_pages"] = (total + limit - 1) // limit + return out diff --git a/datastore/services/streaming.py b/datastore/services/streaming.py index b8595c3..620fb53 100644 --- a/datastore/services/streaming.py +++ b/datastore/services/streaming.py @@ -41,6 +41,7 @@ def stream_objects( *, help_url: str, resource_id: str, + schema: dict[str, Any], fields: list[dict[str, Any]], records: Iterator[tuple], limit: int, @@ -48,12 +49,14 @@ def stream_objects( total: int | None, include_total: bool, links: dict[str, str], + sql: str | None = None, ) -> Iterator[bytes]: """`records_format=objects` — `records` is a JSON array of `{col: value}`.""" columns = [f["id"] for f in fields] return _stream_envelope( help_url=help_url, resource_id=resource_id, + schema=schema, fields=fields, records_chunks=_records_object_array(columns, records), limit=limit, @@ -61,6 +64,7 @@ def stream_objects( total=total, include_total=include_total, links=links, + sql=sql, ) @@ -68,6 +72,7 @@ def stream_lists( *, help_url: str, resource_id: str, + schema: dict[str, Any], fields: list[dict[str, Any]], records: Iterator[tuple], limit: int, @@ -75,11 +80,13 @@ def 
stream_lists( total: int | None, include_total: bool, links: dict[str, str], + sql: str | None = None, ) -> Iterator[bytes]: """`records_format=lists` — `records` is a JSON array of `[v1, v2, ...]`.""" return _stream_envelope( help_url=help_url, resource_id=resource_id, + schema=schema, fields=fields, records_chunks=_records_array_array(records), limit=limit, @@ -87,6 +94,7 @@ def stream_lists( total=total, include_total=include_total, links=links, + sql=sql, ) @@ -94,6 +102,7 @@ def stream_csv( *, help_url: str, resource_id: str, + schema: dict[str, Any], fields: list[dict[str, Any]], records: Iterator[tuple], limit: int, @@ -101,12 +110,14 @@ def stream_csv( total: int | None, include_total: bool, links: dict[str, str], + sql: str | None = None, ) -> Iterator[bytes]: """`records_format=csv` — `records` is a JSON string of CSV text.""" columns = [f["id"] for f in fields] return _stream_envelope( help_url=help_url, resource_id=resource_id, + schema=schema, fields=fields, records_chunks=_records_delimited_string(columns, records, delimiter=","), limit=limit, @@ -114,6 +125,7 @@ def stream_csv( total=total, include_total=include_total, links=links, + sql=sql, ) @@ -121,6 +133,7 @@ def stream_tsv( *, help_url: str, resource_id: str, + schema: dict[str, Any], fields: list[dict[str, Any]], records: Iterator[tuple], limit: int, @@ -128,12 +141,14 @@ def stream_tsv( total: int | None, include_total: bool, links: dict[str, str], + sql: str | None = None, ) -> Iterator[bytes]: """`records_format=tsv` — `records` is a JSON string of TSV text.""" columns = [f["id"] for f in fields] return _stream_envelope( help_url=help_url, resource_id=resource_id, + schema=schema, fields=fields, records_chunks=_records_delimited_string(columns, records, delimiter="\t"), limit=limit, @@ -141,6 +156,7 @@ def stream_tsv( total=total, include_total=include_total, links=links, + sql=sql, ) @@ -148,6 +164,7 @@ def _stream_envelope( *, help_url: str, resource_id: str, + schema: dict[str, Any], 
fields: list[dict[str, Any]], records_chunks: Iterator[bytes], limit: int, @@ -155,15 +172,26 @@ def _stream_envelope( total: int | None, include_total: bool, links: dict[str, str], + sql: str | None = None, ) -> Iterator[bytes]: """CKAN envelope skeleton. Each format passes its own `records_chunks` iterator that emits the JSON value for the `records` field — either a JSON array (objects / lists) or a JSON string (csv / tsv). + + Column metadata is emitted in both shapes: `schema` (canonical + Frictionless) and `fields` (legacy `{id, type}` list, deprecated). + `sql` is emitted only when supplied (i.e. for `datastore_search_sql`); + `datastore_search` leaves it out. """ yield b'{"help":' yield orjson.dumps(help_url) yield b',"success":true,"result":{"resource_id":' yield orjson.dumps(resource_id) + if sql is not None: + yield b',"sql":' + yield orjson.dumps(sql) + yield b',"schema":' + yield orjson.dumps(schema) yield b',"fields":' yield orjson.dumps(fields) yield b',"records":' diff --git a/datastore/services/write.py b/datastore/services/write.py index fb3f675..5ccfdf3 100644 --- a/datastore/services/write.py +++ b/datastore/services/write.py @@ -5,8 +5,10 @@ from datastore.infrastructure.engines import get_datastore_engine from datastore.schemas.responses import ( DatastoreCreateResponse, + DatastoreDeleteResponse, DatastoreUpsertResponse, ) +from datastore.schemas.validators import frictionless_schema_to_fields if TYPE_CHECKING: # type-only — no runtime import from api/ from datastore.api.context import RequestContext @@ -17,14 +19,19 @@ async def create_datastore( ) -> DatastoreCreateResponse.Result: package = data_dict.get("package") or {} resource = data_dict.get("resource") or {} - fields = data_dict.get("fields") or [] + schema = data_dict["schema"] records = data_dict.get("records") or [] - primary_key = data_dict.get("primary_key") or [] include_records = bool(data_dict.get("include_records", False)) include_total = bool(data_dict.get("include_total", 
False)) - is_new_resource = isinstance(resource, dict) - if is_new_resource: + fields, primary_key = frictionless_schema_to_fields(schema) + + if isinstance(resource, dict): + # Endpoint gates this branch on AUTH_TYPE=ckan, so context.ckan is + # non-None here in practice; the assert keeps the type checker honest. + assert context.ckan is not None, ( + "datastore_create `resource` dict path requires AUTH_TYPE=ckan" + ) resource = await context.ckan.resource_create(resource=resource) resource_id = resource["id"] else: @@ -34,8 +41,7 @@ async def create_datastore( engine = get_datastore_engine(context, mode="rw") write_result = engine.create( resource_id=resource_id, - fields=fields, - unique_keys=primary_key, + schema=schema, records=records, include_total=include_total, ) @@ -44,6 +50,7 @@ async def create_datastore( resource_id=resource_id, package_id=package.get("id"), fields=fields, + schema=schema, primary_key=primary_key, records=records if include_records else None, total=write_result.get("total") if include_total else None, @@ -75,3 +82,23 @@ async def upsert_datastore( records=records if include_records else None, total=write_result.get("total") if include_total else None, ) + + +async def delete_datastore( + context: RequestContext, data_dict: dict[str, Any] +) -> DatastoreDeleteResponse.Result: + """Drop the table, delete rows, or drop columns. 
`filters` and + `fields` are passed through verbatim — schema layer enforces + mutual exclusivity.""" + resource_id = data_dict["resource_id"] + filters = data_dict.get("filters") + fields = data_dict.get("fields") + + engine = get_datastore_engine(context, mode="rw") + engine.delete(resource_id=resource_id, filters=filters, fields=fields) + + return DatastoreDeleteResponse.Result( + resource_id=resource_id, + filters=filters, + fields=fields, + ) diff --git a/example_payload/README.md b/example_payload/README.md deleted file mode 100644 index 5f94f65..0000000 --- a/example_payload/README.md +++ /dev/null @@ -1,87 +0,0 @@ -# Request payload examples - -Hand-written JSON bodies for the CKAN datastore endpoints. Useful as: - -- copy-paste fixtures when smoke-testing a running dev server, -- canonical references when documenting clients, -- starting points when adding new tests. - -## Layout - -One subdirectory per endpoint; one file per **distinct scenario** (not per -field combination — keep it useful, not exhaustive). - -``` -example_payload/ -├── datastore_create/ -│ ├── with_resource_id.json # existing-resource flow -│ └── with_resource.json # new-resource flow (resource dict with package_id) -├── datastore_search/ -│ ├── basic.json # minimal — just resource_id -│ ├── with_filters.json # narrow by column values (e.g. product_code, accepted) -│ ├── with_full_text.json # `q` full-text query, plain + language -│ ├── paginated_sorted.json # fields + sort + limit + offset + include_total -│ └── response.json # sample RESPONSE for paginated_sorted.json, -│ # showing the CKAN envelope + `_links` -└── datastore_upsert/ - ├── upsert.json # default — corrects one row + adds a new one - ├── insert.json # method=insert; new rows only - └── update.json # method=update; patches existing rows by unique_key -``` - -## How to add a new example - -Three steps: - -1. **Pick the right subdirectory.** If you're adding the first example for a - new endpoint (e.g. 
`datastore_delete`), create the directory with the - endpoint's action name (`example_payload/datastore_delete/`). - -2. **Name the file after the scenario.** Short, lowercase, snake_case. - Examples: `by_filters.json`, `whole_table.json`, `empty_records.json`. - The filename is the only label the reader sees — make it tell the story. - -3. **Match the request schema.** Each endpoint has a Pydantic model in - [datastore/schemas/request.py](../datastore/schemas/request.py). - The payload must validate against it. (Files named `response*.json` - are sample server responses — skip this step for those.) Quick check: - - ```sh - python -c " - import json - from datastore.schemas.request import DatastoreUpsertRequest - DatastoreUpsertRequest.model_validate( - json.load(open('example_payload/datastore_upsert/upsert.json')) - ) - print('OK') - " - ``` - - Swap the model name to match the endpoint (`DatastoreCreateRequest`, - `DatastoreUpsertRequest`, …). - -## Smoke-test against a running server - -```sh -# Start the dev server in another shell: -# uvicorn datastore.main:app --reload - -curl -s -X POST http://localhost:8000/api/3/action/datastore_upsert \ - -H 'Content-Type: application/json' \ - -H 'Authorization: ' \ - -d @example_payload/datastore_upsert/upsert.json | jq -``` - -Set `AUTH_ENABLED=false` in `.env` for local runs without a CKAN instance — -auth is bypassed and a stub decision is returned. - -## Conventions - -- **Realistic values.** Use the running balancing-market example (auctions, - products, prices) so the files read as a coherent dataset across endpoints. -- **No PII, no secrets.** Treat these as public. -- **Stable resource IDs.** Reuse the same `resource_id` across files in a - scenario chain (e.g. create → upsert → search) so a reader can follow the - flow end-to-end. -- **One concept per file.** If you're tempted to demonstrate two unrelated - features in one payload, split it into two files. 
diff --git a/example_payload/datastore_create/with_resource.json b/example_payload/datastore_create/with_resource.json deleted file mode 100644 index 6ef2f4a..0000000 --- a/example_payload/datastore_create/with_resource.json +++ /dev/null @@ -1,51 +0,0 @@ -{ - "resource": { - "id": "balancing_auction_results_2025", - "name": "Balancing Auction Results 2025", - "package_id": "pkg-balancing-2025", - "format": "datastore", - "mimetype": "application/json", - "description": "Cleared bids from the balancing market, one row per auction × product." - }, - "fields": [ - { - "id": "auction_id", - "type": "int4", - "info": {"title": "Auction ID", "description": "Unique auction identifier."} - }, - {"id": "product_code", "type": "text"}, - { - "id": "delivery_start", - "type": "timestamptz", - "info": {"title": "Delivery Start (UTC)"} - }, - {"id": "duration_minutes", "type": "int4", "info": {"unit": "minutes"}}, - {"id": "clearing_price_gbp_per_mwh", "type": "float8", "info": {"unit": "GBP/MWh"}}, - {"id": "volume_mwh", "type": "float8", "info": {"unit": "MWh"}}, - {"id": "accepted", "type": "bool"}, - {"id": "bidder_metadata", "type": "jsonb"} - ], - "primary_key": ["auction_id", "product_code"], - "records": [ - { - "auction_id": 144, - "product_code": "DCL", - "delivery_start": "2025-11-04T16:00:00Z", - "duration_minutes": 30, - "clearing_price_gbp_per_mwh": 47.82, - "volume_mwh": 120.0, - "accepted": true, - "bidder_metadata": {"unit_id": "DRAX-1", "submission_lag_ms": 412} - }, - { - "auction_id": 144, - "product_code": "DCH", - "delivery_start": "2025-11-04T16:00:00Z", - "duration_minutes": 30, - "clearing_price_gbp_per_mwh": 51.10, - "volume_mwh": 75.5, - "accepted": true, - "bidder_metadata": {"unit_id": "EDF-COTT-2", "submission_lag_ms": 280} - } - ] -} diff --git a/example_payload/datastore_create/with_resource_id.json b/example_payload/datastore_create/with_resource_id.json deleted file mode 100644 index 63fb3db..0000000 --- 
a/example_payload/datastore_create/with_resource_id.json +++ /dev/null @@ -1,44 +0,0 @@ -{ - "resource_id": "balancing_auction_results_2025", - "fields": [ - { - "id": "auction_id", - "type": "int4", - "info": {"title": "Auction ID", "description": "Unique auction identifier."} - }, - {"id": "product_code", "type": "text"}, - { - "id": "delivery_start", - "type": "timestamptz", - "info": {"title": "Delivery Start (UTC)"} - }, - {"id": "duration_minutes", "type": "int4", "info": {"unit": "minutes"}}, - {"id": "clearing_price_gbp_per_mwh", "type": "float8", "info": {"unit": "GBP/MWh"}}, - {"id": "volume_mwh", "type": "float8", "info": {"unit": "MWh"}}, - {"id": "accepted", "type": "bool"}, - {"id": "bidder_metadata", "type": "jsonb"} - ], - "primary_key": ["auction_id", "product_code"], - "records": [ - { - "auction_id": 144, - "product_code": "DCL", - "delivery_start": "2025-11-04T16:00:00Z", - "duration_minutes": 30, - "clearing_price_gbp_per_mwh": 47.82, - "volume_mwh": 120.0, - "accepted": true, - "bidder_metadata": {"unit_id": "DRAX-1", "submission_lag_ms": 412} - }, - { - "auction_id": 144, - "product_code": "DCH", - "delivery_start": "2025-11-04T16:00:00Z", - "duration_minutes": 30, - "clearing_price_gbp_per_mwh": 51.10, - "volume_mwh": 75.5, - "accepted": true, - "bidder_metadata": {"unit_id": "EDF-COTT-2", "submission_lag_ms": 280} - } - ] -} diff --git a/example_payload/datastore_search/basic.json b/example_payload/datastore_search/basic.json deleted file mode 100644 index a36936f..0000000 --- a/example_payload/datastore_search/basic.json +++ /dev/null @@ -1,3 +0,0 @@ -{ - "resource_id": "balancing_auction_results_2025" -} diff --git a/example_payload/datastore_search/paginated_sorted.json b/example_payload/datastore_search/paginated_sorted.json deleted file mode 100644 index a41bd01..0000000 --- a/example_payload/datastore_search/paginated_sorted.json +++ /dev/null @@ -1,14 +0,0 @@ -{ - "resource_id": "balancing_auction_results_2025", - "fields": [ - 
"auction_id", - "product_code", - "delivery_start", - "clearing_price_gbp_per_mwh", - "volume_mwh" - ], - "sort": "delivery_start desc, clearing_price_gbp_per_mwh asc", - "limit": 100, - "offset": 0, - "include_total": true -} diff --git a/example_payload/datastore_search/response.json b/example_payload/datastore_search/response.json deleted file mode 100644 index dc0f773..0000000 --- a/example_payload/datastore_search/response.json +++ /dev/null @@ -1,26 +0,0 @@ -{ - "help": "http://localhost:8000/api/3/action/datastore_search?resource_id=balancing_auction_results_2025&fields=auction_id,product_code,delivery_start,clearing_price_gbp_per_mwh,volume_mwh&sort=delivery_start+desc,+clearing_price_gbp_per_mwh+asc&limit=100&include_total=true", - "success": true, - "result": { - "resource_id": "balancing_auction_results_2025", - "fields": [ - {"id": "auction_id", "type": "integer"}, - {"id": "product_code", "type": "string"}, - {"id": "delivery_start", "type": "datetime"}, - {"id": "clearing_price_gbp_per_mwh", "type": "number"}, - {"id": "volume_mwh", "type": "number"} - ], - "records": [ - {"auction_id": 152, "product_code": "DCL", "delivery_start": "2025-11-05T18:30:00Z", "clearing_price_gbp_per_mwh": 39.40, "volume_mwh": 95.0}, - {"auction_id": 151, "product_code": "DCH", "delivery_start": "2025-11-05T18:30:00Z", "clearing_price_gbp_per_mwh": 52.10, "volume_mwh": 60.0}, - {"auction_id": 144, "product_code": "DCL", "delivery_start": "2025-11-04T16:00:00Z", "clearing_price_gbp_per_mwh": 47.82, "volume_mwh": 120.0} - ], - "limit": 100, - "offset": 0, - "total": 3, - "_links": { - "start": "http://localhost:8000/api/3/action/datastore_search?resource_id=balancing_auction_results_2025&fields=auction_id,product_code,delivery_start,clearing_price_gbp_per_mwh,volume_mwh&sort=delivery_start+desc,+clearing_price_gbp_per_mwh+asc&limit=100&include_total=true", - "next": 
"http://localhost:8000/api/3/action/datastore_search?resource_id=balancing_auction_results_2025&fields=auction_id,product_code,delivery_start,clearing_price_gbp_per_mwh,volume_mwh&sort=delivery_start+desc,+clearing_price_gbp_per_mwh+asc&limit=100&include_total=true&offset=100" - } - } -} diff --git a/example_payload/datastore_search/with_filters.json b/example_payload/datastore_search/with_filters.json deleted file mode 100644 index 9106668..0000000 --- a/example_payload/datastore_search/with_filters.json +++ /dev/null @@ -1,8 +0,0 @@ -{ - "resource_id": "balancing_auction_results_2025", - "filters": { - "product_code": "DCL", - "accepted": true - }, - "limit": 100 -} diff --git a/example_payload/datastore_search/with_full_text.json b/example_payload/datastore_search/with_full_text.json deleted file mode 100644 index a5f8cc4..0000000 --- a/example_payload/datastore_search/with_full_text.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "resource_id": "balancing_auction_results_2025", - "q": "DRAX", - "plain": true, - "language": "english", - "limit": 50 -} diff --git a/example_payload/datastore_upsert/insert.json b/example_payload/datastore_upsert/insert.json deleted file mode 100644 index fac9823..0000000 --- a/example_payload/datastore_upsert/insert.json +++ /dev/null @@ -1,26 +0,0 @@ -{ - "resource_id": "c6153a74-43cb-4edf-8bdf-bb664feca937", - "method": "insert", - "records": [ - { - "auction_id": 200, - "product_code": "DCL", - "delivery_start": "2025-11-06T08:00:00Z", - "duration_minutes": 30, - "clearing_price_gbp_per_mwh": 38.90, - "volume_mwh": 95.0, - "accepted": true, - "bidder_metadata": {"unit_id": "DRAX-2", "submission_lag_ms": 350} - }, - { - "auction_id": 201, - "product_code": "DCH", - "delivery_start": "2025-11-06T08:00:00Z", - "duration_minutes": 30, - "clearing_price_gbp_per_mwh": 44.20, - "volume_mwh": 110.0, - "accepted": true, - "bidder_metadata": {"unit_id": "EDF-COTT-3", "submission_lag_ms": 198} - } - ] -} diff --git 
a/example_payload/datastore_upsert/update.json b/example_payload/datastore_upsert/update.json deleted file mode 100644 index 995646a..0000000 --- a/example_payload/datastore_upsert/update.json +++ /dev/null @@ -1,17 +0,0 @@ -{ - "resource_id": "c6153a74-43cb-4edf-8bdf-bb664feca937", - "method": "update", - "records": [ - { - "auction_id": 144, - "product_code": "DCL", - "accepted": false, - "bidder_metadata": {"unit_id": "DRAX-1", "rejection_reason": "settlement_dispute"} - }, - { - "auction_id": 144, - "product_code": "DCH", - "clearing_price_gbp_per_mwh": 50.85 - } - ] -} diff --git a/example_payload/datastore_upsert/upsert.json b/example_payload/datastore_upsert/upsert.json deleted file mode 100644 index 861f502..0000000 --- a/example_payload/datastore_upsert/upsert.json +++ /dev/null @@ -1,29 +0,0 @@ -{ - "resource_id": "c6153a74-43cb-4edf-8bdf-bb664feca937", - "method": "upsert", - "records": [ - { - "auction_id": 144, - "product_code": "DCL", - "delivery_start": "2025-11-04T16:00:00Z", - "duration_minutes": 30, - "clearing_price_gbp_per_mwh": 48.05, - "volume_mwh": 120.0, - "accepted": true, - "bidder_metadata": {"unit_id": "DRAX-1", "submission_lag_ms": 412, "revision": 2} - }, - { - "auction_id": 153, - "product_code": "FFR", - "delivery_start": "2025-11-05T19:00:00Z", - "duration_minutes": 60, - "clearing_price_gbp_per_mwh": 32.40, - "volume_mwh": 200.0, - "accepted": false, - "bidder_metadata": {"unit_id": "SSE-PEH-3", "rejection_reason": "above_cap"} - } - ], - "include_total": false, - "force": false, - "include_records": true -} diff --git a/postman/README.md b/postman/README.md new file mode 100644 index 0000000..52915c6 --- /dev/null +++ b/postman/README.md @@ -0,0 +1,47 @@ +# Postman collection + +Postman v2.1.0 collection covering every Datastore API endpoint. +Auto-generated from [`example_payload/`](../example_payload/) by +[`generate_postman.py`](generate_postman.py). + +## Import + +In Postman: **File → Import** → `collection.json`. 
Seven folders appear: +`health`, `datastore_create`, `datastore_upsert`, `datastore_info`, +`datastore_search`, `datastore_search_sql`, `datastore_delete`. + +## Variables + +| Variable | Default | Notes | +|--------------|----------------------------------|------------------------------------------| +| `baseUrl` | `http://localhost:8000` | Datastore API root. | +| `apiKey` | (empty) | CKAN API key — required for writes. | +| `resourceId` | `balancing_auction_results_2025` | Resource to create / write / query. | + +Collection auth sends `Authorization: {{apiKey}}` on every request. + +## Walkthrough + +Run folders top-to-bottom on a fresh resource: + +1. **`datastore_create`** — seeds **110 rows** (auctions `1..55` × `DCL`/`DCH`). +2. **`datastore_upsert`** — upsert 2 → insert 10 (`100..109`) → update 2. Total: 121. +3. **`datastore_info`** — confirm schema + row count. +4. **`datastore_search`** — filter / full-text / paginated. +5. **`datastore_search_sql`** — raw SQL; `LIMIT` required. JOIN/UNION variants need a second resource `balancing_auction_results_2024`. +6. **`datastore_delete`** — row delete (`auction_id=1`) → drop column (`bidder_metadata`) → drop table. + +`health` is independent — hit any time to check the server. + +## Regenerate + +```sh +python postman/generate_postman.py +``` + +Drop new files under `example_payload/<endpoint>/<scenario>.json` to add requests. + +## Auth + +- Reads can run anonymously (CKAN decides by resource visibility). +- Writes need `apiKey`, or set `AUTH_TYPE=anonymous` in `.env` for local dev. diff --git a/postman/collection.json b/postman/collection.json new file mode 100644 index 0000000..bb9db26 --- /dev/null +++ b/postman/collection.json @@ -0,0 +1,861 @@ +{ + "info": { + "_postman_id": "b96f5ccc-fdd3-4a22-8911-51a5719b753e", + "name": "Datastore API", + "description": "CKAN-compatible datastore API \u2014 auto-generated from `example_payload/`. 
Set `baseUrl` to your server, `apiKey` to a CKAN API key (anonymous reads are allowed; writes require a key), and `resourceId` to the table you want to hit.", + "schema": "https://schema.getpostman.com/json/collection/v2.1.0/collection.json" + }, + "variable": [ + { + "key": "baseUrl", + "value": "http://localhost:8000", + "type": "string" + }, + { + "key": "apiKey", + "value": "", + "type": "string" + }, + { + "key": "resourceId", + "value": "balancing_auction_results_2025", + "type": "string" + } + ], + "auth": { + "type": "apikey", + "apikey": [ + { + "key": "key", + "value": "Authorization", + "type": "string" + }, + { + "key": "value", + "value": "{{apiKey}}", + "type": "string" + }, + { + "key": "in", + "value": "header", + "type": "string" + } + ] + }, + "item": [ + { + "name": "health", + "description": "Health endpoints live at the root and also under `/api/3/action/` so k8s probes and CKAN clients can both reach them. Listed here at the root.", + "item": [ + { + "name": "Welcome", + "request": { + "method": "GET", + "header": [], + "url": { + "raw": "{{baseUrl}}/", + "host": [ + "{{baseUrl}}" + ], + "path": [ + "" + ] + }, + "description": "Banner / root endpoint. Echoes `APP_MESSAGE`." + }, + "response": [] + }, + { + "name": "Health", + "request": { + "method": "GET", + "header": [], + "url": { + "raw": "{{baseUrl}}/health", + "host": [ + "{{baseUrl}}" + ], + "path": [ + "health" + ] + }, + "description": "Liveness probe \u2014 always 200 while the process is up." + }, + "response": [] + }, + { + "name": "Ready", + "request": { + "method": "GET", + "header": [], + "url": { + "raw": "{{baseUrl}}/ready", + "host": [ + "{{baseUrl}}" + ], + "path": [ + "ready" + ] + }, + "description": "Readiness probe \u2014 200 only when both engines pass healthcheck." + }, + "response": [] + } + ] + }, + { + "name": "datastore_create", + "description": "Declare a resource and optionally seed it with rows. Run this first. 
Accepts either the canonical Frictionless `schema` or the legacy `fields` + `primary_key` shape.", + "item": [ + { + "name": "Create - Frictionless schema (recommended)", + "request": { + "method": "POST", + "header": [ + { + "key": "Content-Type", + "value": "application/json" + } + ], + "body": { + "mode": "raw", + "raw": "{\n \"resource_id\": \"balancing_auction_results_2025\",\n \"schema\": {\n \"fields\": [\n {\n \"name\": \"auction_id\",\n \"type\": \"integer\",\n \"title\": \"Auction ID\",\n \"description\": \"Unique auction identifier.\"\n },\n {\n \"name\": \"product_code\",\n \"type\": \"string\",\n \"title\": \"Product Code\"\n },\n {\n \"name\": \"delivery_start\",\n \"type\": \"datetime\",\n \"title\": \"Delivery Start (UTC)\"\n },\n {\n \"name\": \"duration_minutes\",\n \"type\": \"integer\",\n \"info\": {\n \"unit\": \"minutes\"\n }\n },\n {\n \"name\": \"clearing_price_gbp_per_mwh\",\n \"type\": \"number\",\n \"title\": \"Clearing Price\",\n \"info\": {\n \"unit\": \"GBP/MWh\"\n }\n },\n {\n \"name\": \"volume_mwh\",\n \"type\": \"number\",\n \"info\": {\n \"unit\": \"MWh\"\n }\n },\n {\n \"name\": \"accepted\",\n \"type\": \"boolean\"\n },\n {\n \"name\": \"bidder_metadata\",\n \"type\": \"object\"\n }\n ],\n \"primaryKey\": [\n \"auction_id\",\n \"product_code\"\n ]\n },\n \"records\": [\n {\n \"auction_id\": 1,\n \"product_code\": \"DCL\",\n \"delivery_start\": \"2025-11-01T08:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 50.99,\n \"volume_mwh\": 52.5,\n \"accepted\": false,\n \"bidder_metadata\": {\n \"unit_id\": \"DRAX-2\",\n \"submission_lag_ms\": 171\n }\n },\n {\n \"auction_id\": 1,\n \"product_code\": \"DCH\",\n \"delivery_start\": \"2025-11-01T08:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 53.41,\n \"volume_mwh\": 117.7,\n \"accepted\": true,\n \"bidder_metadata\": {\n \"unit_id\": \"DRAX-1\",\n \"submission_lag_ms\": 402\n }\n },\n {\n \"auction_id\": 2,\n \"product_code\": \"DCL\",\n 
\"delivery_start\": \"2025-11-01T12:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 45.55,\n \"volume_mwh\": 53.0,\n \"accepted\": false,\n \"bidder_metadata\": {\n \"unit_id\": \"SSE-PEH-3\",\n \"submission_lag_ms\": 408\n }\n },\n {\n \"auction_id\": 2,\n \"product_code\": \"DCH\",\n \"delivery_start\": \"2025-11-01T12:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 35.66,\n \"volume_mwh\": 69.9,\n \"accepted\": true,\n \"bidder_metadata\": {\n \"unit_id\": \"SSE-PEH-3\",\n \"submission_lag_ms\": 314\n }\n },\n {\n \"auction_id\": 3,\n \"product_code\": \"DCL\",\n \"delivery_start\": \"2025-11-01T16:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 40.51,\n \"volume_mwh\": 108.9,\n \"accepted\": true,\n \"bidder_metadata\": {\n \"unit_id\": \"DRAX-1\",\n \"submission_lag_ms\": 488\n }\n },\n {\n \"auction_id\": 3,\n \"product_code\": \"DCH\",\n \"delivery_start\": \"2025-11-01T16:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 55.15,\n \"volume_mwh\": 119.8,\n \"accepted\": true,\n \"bidder_metadata\": {\n \"unit_id\": \"DRAX-2\",\n \"submission_lag_ms\": 210\n }\n },\n {\n \"auction_id\": 4,\n \"product_code\": \"DCL\",\n \"delivery_start\": \"2025-11-01T20:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 58.93,\n \"volume_mwh\": 83.7,\n \"accepted\": false,\n \"bidder_metadata\": {\n \"unit_id\": \"DRAX-1\",\n \"submission_lag_ms\": 283\n }\n },\n {\n \"auction_id\": 4,\n \"product_code\": \"DCH\",\n \"delivery_start\": \"2025-11-01T20:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 56.19,\n \"volume_mwh\": 110.4,\n \"accepted\": true,\n \"bidder_metadata\": {\n \"unit_id\": \"EDF-COTT-3\",\n \"submission_lag_ms\": 374\n }\n },\n {\n \"auction_id\": 5,\n \"product_code\": \"DCL\",\n \"delivery_start\": \"2025-11-02T00:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 38.12,\n \"volume_mwh\": 142.2,\n 
\"accepted\": false,\n \"bidder_metadata\": {\n \"unit_id\": \"EDF-COTT-2\",\n \"submission_lag_ms\": 524\n }\n },\n {\n \"auction_id\": 5,\n \"product_code\": \"DCH\",\n \"delivery_start\": \"2025-11-02T00:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 50.72,\n \"volume_mwh\": 138.5,\n \"accepted\": true,\n \"bidder_metadata\": {\n \"unit_id\": \"DRAX-2\",\n \"submission_lag_ms\": 460\n }\n },\n {\n \"auction_id\": 6,\n \"product_code\": \"DCL\",\n \"delivery_start\": \"2025-11-02T04:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 36.74,\n \"volume_mwh\": 116.1,\n \"accepted\": true,\n \"bidder_metadata\": {\n \"unit_id\": \"DRAX-1\",\n \"submission_lag_ms\": 537\n }\n },\n {\n \"auction_id\": 6,\n \"product_code\": \"DCH\",\n \"delivery_start\": \"2025-11-02T04:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 40.82,\n \"volume_mwh\": 60.1,\n \"accepted\": false,\n \"bidder_metadata\": {\n \"unit_id\": \"EDF-COTT-2\",\n \"submission_lag_ms\": 183\n }\n },\n {\n \"auction_id\": 7,\n \"product_code\": \"DCL\",\n \"delivery_start\": \"2025-11-02T08:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 44.25,\n \"volume_mwh\": 71.0,\n \"accepted\": false,\n \"bidder_metadata\": {\n \"unit_id\": \"DRAX-1\",\n \"submission_lag_ms\": 411\n }\n },\n {\n \"auction_id\": 7,\n \"product_code\": \"DCH\",\n \"delivery_start\": \"2025-11-02T08:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 50.87,\n \"volume_mwh\": 103.4,\n \"accepted\": false,\n \"bidder_metadata\": {\n \"unit_id\": \"EDF-COTT-3\",\n \"submission_lag_ms\": 294\n }\n },\n {\n \"auction_id\": 8,\n \"product_code\": \"DCL\",\n \"delivery_start\": \"2025-11-02T12:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 41.75,\n \"volume_mwh\": 142.5,\n \"accepted\": true,\n \"bidder_metadata\": {\n \"unit_id\": \"DRAX-2\",\n \"submission_lag_ms\": 450\n }\n },\n {\n \"auction_id\": 8,\n 
\"product_code\": \"DCH\",\n \"delivery_start\": \"2025-11-02T12:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 43.11,\n \"volume_mwh\": 126.8,\n \"accepted\": false,\n \"bidder_metadata\": {\n \"unit_id\": \"DRAX-1\",\n \"submission_lag_ms\": 512\n }\n },\n {\n \"auction_id\": 9,\n \"product_code\": \"DCL\",\n \"delivery_start\": \"2025-11-02T16:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 42.89,\n \"volume_mwh\": 76.8,\n \"accepted\": false,\n \"bidder_metadata\": {\n \"unit_id\": \"SSE-PEH-3\",\n \"submission_lag_ms\": 548\n }\n },\n {\n \"auction_id\": 9,\n \"product_code\": \"DCH\",\n \"delivery_start\": \"2025-11-02T16:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 52.95,\n \"volume_mwh\": 71.3,\n \"accepted\": true,\n \"bidder_metadata\": {\n \"unit_id\": \"EDF-COTT-3\",\n \"submission_lag_ms\": 173\n }\n },\n {\n \"auction_id\": 10,\n \"product_code\": \"DCL\",\n \"delivery_start\": \"2025-11-02T20:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 41.62,\n \"volume_mwh\": 74.7,\n \"accepted\": true,\n \"bidder_metadata\": {\n \"unit_id\": \"EDF-COTT-2\",\n \"submission_lag_ms\": 482\n }\n },\n {\n \"auction_id\": 10,\n \"product_code\": \"DCH\",\n \"delivery_start\": \"2025-11-02T20:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 49.61,\n \"volume_mwh\": 139.8,\n \"accepted\": true,\n \"bidder_metadata\": {\n \"unit_id\": \"DRAX-2\",\n \"submission_lag_ms\": 170\n }\n },\n {\n \"auction_id\": 11,\n \"product_code\": \"DCL\",\n \"delivery_start\": \"2025-11-03T00:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 47.74,\n \"volume_mwh\": 59.1,\n \"accepted\": false,\n \"bidder_metadata\": {\n \"unit_id\": \"DRAX-1\",\n \"submission_lag_ms\": 178\n }\n },\n {\n \"auction_id\": 11,\n \"product_code\": \"DCH\",\n \"delivery_start\": \"2025-11-03T00:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 
50.69,\n \"volume_mwh\": 129.2,\n \"accepted\": true,\n \"bidder_metadata\": {\n \"unit_id\": \"DRAX-1\",\n \"submission_lag_ms\": 297\n }\n },\n {\n \"auction_id\": 12,\n \"product_code\": \"DCL\",\n \"delivery_start\": \"2025-11-03T04:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 44.54,\n \"volume_mwh\": 149.6,\n \"accepted\": true,\n \"bidder_metadata\": {\n \"unit_id\": \"SSE-PEH-3\",\n \"submission_lag_ms\": 540\n }\n },\n {\n \"auction_id\": 12,\n \"product_code\": \"DCH\",\n \"delivery_start\": \"2025-11-03T04:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 58.56,\n \"volume_mwh\": 118.0,\n \"accepted\": false,\n \"bidder_metadata\": {\n \"unit_id\": \"SSE-PEH-3\",\n \"submission_lag_ms\": 484\n }\n },\n {\n \"auction_id\": 13,\n \"product_code\": \"DCL\",\n \"delivery_start\": \"2025-11-03T08:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 41.67,\n \"volume_mwh\": 114.1,\n \"accepted\": false,\n \"bidder_metadata\": {\n \"unit_id\": \"EDF-COTT-3\",\n \"submission_lag_ms\": 180\n }\n },\n {\n \"auction_id\": 13,\n \"product_code\": \"DCH\",\n \"delivery_start\": \"2025-11-03T08:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 46.34,\n \"volume_mwh\": 145.4,\n \"accepted\": true,\n \"bidder_metadata\": {\n \"unit_id\": \"EDF-COTT-2\",\n \"submission_lag_ms\": 597\n }\n },\n {\n \"auction_id\": 14,\n \"product_code\": \"DCL\",\n \"delivery_start\": \"2025-11-03T12:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 47.51,\n \"volume_mwh\": 67.9,\n \"accepted\": true,\n \"bidder_metadata\": {\n \"unit_id\": \"EDF-COTT-2\",\n \"submission_lag_ms\": 530\n }\n },\n {\n \"auction_id\": 14,\n \"product_code\": \"DCH\",\n \"delivery_start\": \"2025-11-03T12:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 50.97,\n \"volume_mwh\": 110.9,\n \"accepted\": false,\n \"bidder_metadata\": {\n \"unit_id\": \"DRAX-2\",\n 
\"submission_lag_ms\": 376\n }\n },\n {\n \"auction_id\": 15,\n \"product_code\": \"DCL\",\n \"delivery_start\": \"2025-11-03T16:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 58.84,\n \"volume_mwh\": 142.2,\n \"accepted\": true,\n \"bidder_metadata\": {\n \"unit_id\": \"SSE-PEH-3\",\n \"submission_lag_ms\": 265\n }\n },\n {\n \"auction_id\": 15,\n \"product_code\": \"DCH\",\n \"delivery_start\": \"2025-11-03T16:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 47.22,\n \"volume_mwh\": 61.2,\n \"accepted\": true,\n \"bidder_metadata\": {\n \"unit_id\": \"EDF-COTT-2\",\n \"submission_lag_ms\": 222\n }\n },\n {\n \"auction_id\": 16,\n \"product_code\": \"DCL\",\n \"delivery_start\": \"2025-11-03T20:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 36.45,\n \"volume_mwh\": 137.8,\n \"accepted\": true,\n \"bidder_metadata\": {\n \"unit_id\": \"DRAX-1\",\n \"submission_lag_ms\": 474\n }\n },\n {\n \"auction_id\": 16,\n \"product_code\": \"DCH\",\n \"delivery_start\": \"2025-11-03T20:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 47.15,\n \"volume_mwh\": 56.9,\n \"accepted\": true,\n \"bidder_metadata\": {\n \"unit_id\": \"DRAX-2\",\n \"submission_lag_ms\": 165\n }\n },\n {\n \"auction_id\": 17,\n \"product_code\": \"DCL\",\n \"delivery_start\": \"2025-11-04T00:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 51.49,\n \"volume_mwh\": 144.7,\n \"accepted\": false,\n \"bidder_metadata\": {\n \"unit_id\": \"SSE-PEH-3\",\n \"submission_lag_ms\": 546\n }\n },\n {\n \"auction_id\": 17,\n \"product_code\": \"DCH\",\n \"delivery_start\": \"2025-11-04T00:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 50.16,\n \"volume_mwh\": 146.4,\n \"accepted\": true,\n \"bidder_metadata\": {\n \"unit_id\": \"DRAX-2\",\n \"submission_lag_ms\": 465\n }\n },\n {\n \"auction_id\": 18,\n \"product_code\": \"DCL\",\n \"delivery_start\": 
\"2025-11-04T04:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 42.79,\n \"volume_mwh\": 149.5,\n \"accepted\": true,\n \"bidder_metadata\": {\n \"unit_id\": \"EDF-COTT-3\",\n \"submission_lag_ms\": 560\n }\n },\n {\n \"auction_id\": 18,\n \"product_code\": \"DCH\",\n \"delivery_start\": \"2025-11-04T04:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 47.94,\n \"volume_mwh\": 62.1,\n \"accepted\": false,\n \"bidder_metadata\": {\n \"unit_id\": \"EDF-COTT-2\",\n \"submission_lag_ms\": 110\n }\n },\n {\n \"auction_id\": 19,\n \"product_code\": \"DCL\",\n \"delivery_start\": \"2025-11-04T08:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 49.71,\n \"volume_mwh\": 73.0,\n \"accepted\": false,\n \"bidder_metadata\": {\n \"unit_id\": \"DRAX-1\",\n \"submission_lag_ms\": 462\n }\n },\n {\n \"auction_id\": 19,\n \"product_code\": \"DCH\",\n \"delivery_start\": \"2025-11-04T08:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 50.78,\n \"volume_mwh\": 72.9,\n \"accepted\": true,\n \"bidder_metadata\": {\n \"unit_id\": \"EDF-COTT-2\",\n \"submission_lag_ms\": 136\n }\n },\n {\n \"auction_id\": 20,\n \"product_code\": \"DCL\",\n \"delivery_start\": \"2025-11-04T12:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 47.85,\n \"volume_mwh\": 77.8,\n \"accepted\": true,\n \"bidder_metadata\": {\n \"unit_id\": \"SSE-PEH-3\",\n \"submission_lag_ms\": 167\n }\n },\n {\n \"auction_id\": 20,\n \"product_code\": \"DCH\",\n \"delivery_start\": \"2025-11-04T12:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 53.08,\n \"volume_mwh\": 138.2,\n \"accepted\": true,\n \"bidder_metadata\": {\n \"unit_id\": \"DRAX-2\",\n \"submission_lag_ms\": 501\n }\n },\n {\n \"auction_id\": 21,\n \"product_code\": \"DCL\",\n \"delivery_start\": \"2025-11-04T16:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 46.82,\n \"volume_mwh\": 90.7,\n 
\"accepted\": false,\n \"bidder_metadata\": {\n \"unit_id\": \"EDF-COTT-3\",\n \"submission_lag_ms\": 281\n }\n },\n {\n \"auction_id\": 21,\n \"product_code\": \"DCH\",\n \"delivery_start\": \"2025-11-04T16:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 45.59,\n \"volume_mwh\": 96.7,\n \"accepted\": true,\n \"bidder_metadata\": {\n \"unit_id\": \"DRAX-1\",\n \"submission_lag_ms\": 131\n }\n },\n {\n \"auction_id\": 22,\n \"product_code\": \"DCL\",\n \"delivery_start\": \"2025-11-04T20:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 45.07,\n \"volume_mwh\": 83.9,\n \"accepted\": true,\n \"bidder_metadata\": {\n \"unit_id\": \"DRAX-2\",\n \"submission_lag_ms\": 198\n }\n },\n {\n \"auction_id\": 22,\n \"product_code\": \"DCH\",\n \"delivery_start\": \"2025-11-04T20:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 39.76,\n \"volume_mwh\": 94.9,\n \"accepted\": true,\n \"bidder_metadata\": {\n \"unit_id\": \"EDF-COTT-2\",\n \"submission_lag_ms\": 336\n }\n },\n {\n \"auction_id\": 23,\n \"product_code\": \"DCL\",\n \"delivery_start\": \"2025-11-05T00:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 41.25,\n \"volume_mwh\": 142.3,\n \"accepted\": true,\n \"bidder_metadata\": {\n \"unit_id\": \"SSE-PEH-3\",\n \"submission_lag_ms\": 150\n }\n },\n {\n \"auction_id\": 23,\n \"product_code\": \"DCH\",\n \"delivery_start\": \"2025-11-05T00:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 36.26,\n \"volume_mwh\": 149.9,\n \"accepted\": true,\n \"bidder_metadata\": {\n \"unit_id\": \"DRAX-1\",\n \"submission_lag_ms\": 574\n }\n },\n {\n \"auction_id\": 24,\n \"product_code\": \"DCL\",\n \"delivery_start\": \"2025-11-05T04:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 53.84,\n \"volume_mwh\": 73.6,\n \"accepted\": true,\n \"bidder_metadata\": {\n \"unit_id\": \"EDF-COTT-3\",\n \"submission_lag_ms\": 209\n }\n },\n {\n \"auction_id\": 
24,\n \"product_code\": \"DCH\",\n \"delivery_start\": \"2025-11-05T04:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 56.62,\n \"volume_mwh\": 140.2,\n \"accepted\": false,\n \"bidder_metadata\": {\n \"unit_id\": \"DRAX-1\",\n \"submission_lag_ms\": 299\n }\n },\n {\n \"auction_id\": 25,\n \"product_code\": \"DCL\",\n \"delivery_start\": \"2025-11-05T08:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 41.63,\n \"volume_mwh\": 128.4,\n \"accepted\": true,\n \"bidder_metadata\": {\n \"unit_id\": \"EDF-COTT-3\",\n \"submission_lag_ms\": 456\n }\n },\n {\n \"auction_id\": 25,\n \"product_code\": \"DCH\",\n \"delivery_start\": \"2025-11-05T08:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 58.93,\n \"volume_mwh\": 149.5,\n \"accepted\": true,\n \"bidder_metadata\": {\n \"unit_id\": \"EDF-COTT-3\",\n \"submission_lag_ms\": 179\n }\n },\n {\n \"auction_id\": 26,\n \"product_code\": \"DCL\",\n \"delivery_start\": \"2025-11-05T12:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 39.75,\n \"volume_mwh\": 71.8,\n \"accepted\": false,\n \"bidder_metadata\": {\n \"unit_id\": \"SSE-PEH-3\",\n \"submission_lag_ms\": 131\n }\n },\n {\n \"auction_id\": 26,\n \"product_code\": \"DCH\",\n \"delivery_start\": \"2025-11-05T12:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 53.7,\n \"volume_mwh\": 55.7,\n \"accepted\": true,\n \"bidder_metadata\": {\n \"unit_id\": \"SSE-PEH-3\",\n \"submission_lag_ms\": 570\n }\n },\n {\n \"auction_id\": 27,\n \"product_code\": \"DCL\",\n \"delivery_start\": \"2025-11-05T16:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 56.32,\n \"volume_mwh\": 65.7,\n \"accepted\": true,\n \"bidder_metadata\": {\n \"unit_id\": \"DRAX-1\",\n \"submission_lag_ms\": 535\n }\n },\n {\n \"auction_id\": 27,\n \"product_code\": \"DCH\",\n \"delivery_start\": \"2025-11-05T16:00:00Z\",\n \"duration_minutes\": 30,\n 
\"clearing_price_gbp_per_mwh\": 39.65,\n \"volume_mwh\": 109.5,\n \"accepted\": true,\n \"bidder_metadata\": {\n \"unit_id\": \"DRAX-2\",\n \"submission_lag_ms\": 306\n }\n },\n {\n \"auction_id\": 28,\n \"product_code\": \"DCL\",\n \"delivery_start\": \"2025-11-05T20:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 38.0,\n \"volume_mwh\": 139.0,\n \"accepted\": false,\n \"bidder_metadata\": {\n \"unit_id\": \"SSE-PEH-3\",\n \"submission_lag_ms\": 120\n }\n },\n {\n \"auction_id\": 28,\n \"product_code\": \"DCH\",\n \"delivery_start\": \"2025-11-05T20:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 50.48,\n \"volume_mwh\": 91.9,\n \"accepted\": true,\n \"bidder_metadata\": {\n \"unit_id\": \"SSE-PEH-3\",\n \"submission_lag_ms\": 261\n }\n },\n {\n \"auction_id\": 29,\n \"product_code\": \"DCL\",\n \"delivery_start\": \"2025-11-06T00:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 58.37,\n \"volume_mwh\": 70.4,\n \"accepted\": true,\n \"bidder_metadata\": {\n \"unit_id\": \"DRAX-2\",\n \"submission_lag_ms\": 235\n }\n },\n {\n \"auction_id\": 29,\n \"product_code\": \"DCH\",\n \"delivery_start\": \"2025-11-06T00:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 44.89,\n \"volume_mwh\": 117.2,\n \"accepted\": false,\n \"bidder_metadata\": {\n \"unit_id\": \"EDF-COTT-2\",\n \"submission_lag_ms\": 575\n }\n },\n {\n \"auction_id\": 30,\n \"product_code\": \"DCL\",\n \"delivery_start\": \"2025-11-06T04:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 53.8,\n \"volume_mwh\": 57.3,\n \"accepted\": true,\n \"bidder_metadata\": {\n \"unit_id\": \"SSE-PEH-3\",\n \"submission_lag_ms\": 151\n }\n },\n {\n \"auction_id\": 30,\n \"product_code\": \"DCH\",\n \"delivery_start\": \"2025-11-06T04:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 36.83,\n \"volume_mwh\": 71.3,\n \"accepted\": false,\n \"bidder_metadata\": {\n \"unit_id\": 
\"EDF-COTT-2\",\n \"submission_lag_ms\": 551\n }\n },\n {\n \"auction_id\": 31,\n \"product_code\": \"DCL\",\n \"delivery_start\": \"2025-11-06T08:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 36.72,\n \"volume_mwh\": 74.4,\n \"accepted\": false,\n \"bidder_metadata\": {\n \"unit_id\": \"EDF-COTT-3\",\n \"submission_lag_ms\": 526\n }\n },\n {\n \"auction_id\": 31,\n \"product_code\": \"DCH\",\n \"delivery_start\": \"2025-11-06T08:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 48.58,\n \"volume_mwh\": 80.3,\n \"accepted\": true,\n \"bidder_metadata\": {\n \"unit_id\": \"SSE-PEH-3\",\n \"submission_lag_ms\": 104\n }\n },\n {\n \"auction_id\": 32,\n \"product_code\": \"DCL\",\n \"delivery_start\": \"2025-11-06T12:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 51.7,\n \"volume_mwh\": 105.5,\n \"accepted\": true,\n \"bidder_metadata\": {\n \"unit_id\": \"DRAX-1\",\n \"submission_lag_ms\": 580\n }\n },\n {\n \"auction_id\": 32,\n \"product_code\": \"DCH\",\n \"delivery_start\": \"2025-11-06T12:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 56.95,\n \"volume_mwh\": 76.4,\n \"accepted\": true,\n \"bidder_metadata\": {\n \"unit_id\": \"SSE-PEH-3\",\n \"submission_lag_ms\": 179\n }\n },\n {\n \"auction_id\": 33,\n \"product_code\": \"DCL\",\n \"delivery_start\": \"2025-11-06T16:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 41.81,\n \"volume_mwh\": 110.5,\n \"accepted\": true,\n \"bidder_metadata\": {\n \"unit_id\": \"DRAX-2\",\n \"submission_lag_ms\": 451\n }\n },\n {\n \"auction_id\": 33,\n \"product_code\": \"DCH\",\n \"delivery_start\": \"2025-11-06T16:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 50.86,\n \"volume_mwh\": 76.4,\n \"accepted\": true,\n \"bidder_metadata\": {\n \"unit_id\": \"DRAX-1\",\n \"submission_lag_ms\": 147\n }\n },\n {\n \"auction_id\": 34,\n \"product_code\": \"DCL\",\n \"delivery_start\": 
\"2025-11-06T20:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 50.86,\n \"volume_mwh\": 132.9,\n \"accepted\": false,\n \"bidder_metadata\": {\n \"unit_id\": \"EDF-COTT-2\",\n \"submission_lag_ms\": 494\n }\n },\n {\n \"auction_id\": 34,\n \"product_code\": \"DCH\",\n \"delivery_start\": \"2025-11-06T20:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 38.27,\n \"volume_mwh\": 148.0,\n \"accepted\": false,\n \"bidder_metadata\": {\n \"unit_id\": \"EDF-COTT-3\",\n \"submission_lag_ms\": 382\n }\n },\n {\n \"auction_id\": 35,\n \"product_code\": \"DCL\",\n \"delivery_start\": \"2025-11-07T00:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 52.64,\n \"volume_mwh\": 106.1,\n \"accepted\": false,\n \"bidder_metadata\": {\n \"unit_id\": \"DRAX-2\",\n \"submission_lag_ms\": 379\n }\n },\n {\n \"auction_id\": 35,\n \"product_code\": \"DCH\",\n \"delivery_start\": \"2025-11-07T00:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 35.9,\n \"volume_mwh\": 86.9,\n \"accepted\": true,\n \"bidder_metadata\": {\n \"unit_id\": \"EDF-COTT-3\",\n \"submission_lag_ms\": 165\n }\n },\n {\n \"auction_id\": 36,\n \"product_code\": \"DCL\",\n \"delivery_start\": \"2025-11-07T04:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 36.05,\n \"volume_mwh\": 86.5,\n \"accepted\": true,\n \"bidder_metadata\": {\n \"unit_id\": \"DRAX-1\",\n \"submission_lag_ms\": 560\n }\n },\n {\n \"auction_id\": 36,\n \"product_code\": \"DCH\",\n \"delivery_start\": \"2025-11-07T04:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 43.95,\n \"volume_mwh\": 118.2,\n \"accepted\": true,\n \"bidder_metadata\": {\n \"unit_id\": \"EDF-COTT-2\",\n \"submission_lag_ms\": 499\n }\n },\n {\n \"auction_id\": 37,\n \"product_code\": \"DCL\",\n \"delivery_start\": \"2025-11-07T08:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 49.0,\n \"volume_mwh\": 137.5,\n 
\"accepted\": true,\n \"bidder_metadata\": {\n \"unit_id\": \"DRAX-2\",\n \"submission_lag_ms\": 573\n }\n },\n {\n \"auction_id\": 37,\n \"product_code\": \"DCH\",\n \"delivery_start\": \"2025-11-07T08:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 58.25,\n \"volume_mwh\": 136.5,\n \"accepted\": true,\n \"bidder_metadata\": {\n \"unit_id\": \"DRAX-2\",\n \"submission_lag_ms\": 551\n }\n },\n {\n \"auction_id\": 38,\n \"product_code\": \"DCL\",\n \"delivery_start\": \"2025-11-07T12:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 45.31,\n \"volume_mwh\": 67.9,\n \"accepted\": true,\n \"bidder_metadata\": {\n \"unit_id\": \"EDF-COTT-3\",\n \"submission_lag_ms\": 510\n }\n },\n {\n \"auction_id\": 38,\n \"product_code\": \"DCH\",\n \"delivery_start\": \"2025-11-07T12:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 51.75,\n \"volume_mwh\": 123.5,\n \"accepted\": false,\n \"bidder_metadata\": {\n \"unit_id\": \"DRAX-2\",\n \"submission_lag_ms\": 503\n }\n },\n {\n \"auction_id\": 39,\n \"product_code\": \"DCL\",\n \"delivery_start\": \"2025-11-07T16:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 52.53,\n \"volume_mwh\": 88.3,\n \"accepted\": false,\n \"bidder_metadata\": {\n \"unit_id\": \"EDF-COTT-3\",\n \"submission_lag_ms\": 213\n }\n },\n {\n \"auction_id\": 39,\n \"product_code\": \"DCH\",\n \"delivery_start\": \"2025-11-07T16:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 39.99,\n \"volume_mwh\": 141.8,\n \"accepted\": true,\n \"bidder_metadata\": {\n \"unit_id\": \"DRAX-2\",\n \"submission_lag_ms\": 214\n }\n },\n {\n \"auction_id\": 40,\n \"product_code\": \"DCL\",\n \"delivery_start\": \"2025-11-07T20:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 35.59,\n \"volume_mwh\": 69.3,\n \"accepted\": true,\n \"bidder_metadata\": {\n \"unit_id\": \"DRAX-1\",\n \"submission_lag_ms\": 595\n }\n },\n {\n \"auction_id\": 40,\n 
\"product_code\": \"DCH\",\n \"delivery_start\": \"2025-11-07T20:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 54.33,\n \"volume_mwh\": 85.1,\n \"accepted\": true,\n \"bidder_metadata\": {\n \"unit_id\": \"SSE-PEH-3\",\n \"submission_lag_ms\": 269\n }\n },\n {\n \"auction_id\": 41,\n \"product_code\": \"DCL\",\n \"delivery_start\": \"2025-11-08T00:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 58.48,\n \"volume_mwh\": 61.5,\n \"accepted\": true,\n \"bidder_metadata\": {\n \"unit_id\": \"DRAX-2\",\n \"submission_lag_ms\": 397\n }\n },\n {\n \"auction_id\": 41,\n \"product_code\": \"DCH\",\n \"delivery_start\": \"2025-11-08T00:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 59.06,\n \"volume_mwh\": 76.5,\n \"accepted\": false,\n \"bidder_metadata\": {\n \"unit_id\": \"EDF-COTT-3\",\n \"submission_lag_ms\": 276\n }\n },\n {\n \"auction_id\": 42,\n \"product_code\": \"DCL\",\n \"delivery_start\": \"2025-11-08T04:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 53.21,\n \"volume_mwh\": 81.4,\n \"accepted\": true,\n \"bidder_metadata\": {\n \"unit_id\": \"SSE-PEH-3\",\n \"submission_lag_ms\": 159\n }\n },\n {\n \"auction_id\": 42,\n \"product_code\": \"DCH\",\n \"delivery_start\": \"2025-11-08T04:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 44.63,\n \"volume_mwh\": 107.7,\n \"accepted\": false,\n \"bidder_metadata\": {\n \"unit_id\": \"EDF-COTT-3\",\n \"submission_lag_ms\": 100\n }\n },\n {\n \"auction_id\": 43,\n \"product_code\": \"DCL\",\n \"delivery_start\": \"2025-11-08T08:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 48.0,\n \"volume_mwh\": 130.7,\n \"accepted\": true,\n \"bidder_metadata\": {\n \"unit_id\": \"DRAX-2\",\n \"submission_lag_ms\": 286\n }\n },\n {\n \"auction_id\": 43,\n \"product_code\": \"DCH\",\n \"delivery_start\": \"2025-11-08T08:00:00Z\",\n \"duration_minutes\": 30,\n 
\"clearing_price_gbp_per_mwh\": 45.78,\n \"volume_mwh\": 144.9,\n \"accepted\": true,\n \"bidder_metadata\": {\n \"unit_id\": \"SSE-PEH-3\",\n \"submission_lag_ms\": 260\n }\n },\n {\n \"auction_id\": 44,\n \"product_code\": \"DCL\",\n \"delivery_start\": \"2025-11-08T12:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 51.58,\n \"volume_mwh\": 62.5,\n \"accepted\": true,\n \"bidder_metadata\": {\n \"unit_id\": \"SSE-PEH-3\",\n \"submission_lag_ms\": 258\n }\n },\n {\n \"auction_id\": 44,\n \"product_code\": \"DCH\",\n \"delivery_start\": \"2025-11-08T12:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 51.67,\n \"volume_mwh\": 82.6,\n \"accepted\": true,\n \"bidder_metadata\": {\n \"unit_id\": \"SSE-PEH-3\",\n \"submission_lag_ms\": 165\n }\n },\n {\n \"auction_id\": 45,\n \"product_code\": \"DCL\",\n \"delivery_start\": \"2025-11-08T16:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 39.8,\n \"volume_mwh\": 116.5,\n \"accepted\": true,\n \"bidder_metadata\": {\n \"unit_id\": \"DRAX-2\",\n \"submission_lag_ms\": 415\n }\n },\n {\n \"auction_id\": 45,\n \"product_code\": \"DCH\",\n \"delivery_start\": \"2025-11-08T16:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 49.23,\n \"volume_mwh\": 90.6,\n \"accepted\": true,\n \"bidder_metadata\": {\n \"unit_id\": \"EDF-COTT-2\",\n \"submission_lag_ms\": 246\n }\n },\n {\n \"auction_id\": 46,\n \"product_code\": \"DCL\",\n \"delivery_start\": \"2025-11-08T20:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 40.25,\n \"volume_mwh\": 128.6,\n \"accepted\": true,\n \"bidder_metadata\": {\n \"unit_id\": \"EDF-COTT-2\",\n \"submission_lag_ms\": 338\n }\n },\n {\n \"auction_id\": 46,\n \"product_code\": \"DCH\",\n \"delivery_start\": \"2025-11-08T20:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 46.04,\n \"volume_mwh\": 117.6,\n \"accepted\": true,\n \"bidder_metadata\": {\n \"unit_id\": 
\"DRAX-2\",\n \"submission_lag_ms\": 437\n }\n },\n {\n \"auction_id\": 47,\n \"product_code\": \"DCL\",\n \"delivery_start\": \"2025-11-09T00:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 37.12,\n \"volume_mwh\": 101.5,\n \"accepted\": true,\n \"bidder_metadata\": {\n \"unit_id\": \"EDF-COTT-2\",\n \"submission_lag_ms\": 147\n }\n },\n {\n \"auction_id\": 47,\n \"product_code\": \"DCH\",\n \"delivery_start\": \"2025-11-09T00:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 55.46,\n \"volume_mwh\": 125.1,\n \"accepted\": true,\n \"bidder_metadata\": {\n \"unit_id\": \"DRAX-2\",\n \"submission_lag_ms\": 512\n }\n },\n {\n \"auction_id\": 48,\n \"product_code\": \"DCL\",\n \"delivery_start\": \"2025-11-09T04:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 39.98,\n \"volume_mwh\": 52.4,\n \"accepted\": false,\n \"bidder_metadata\": {\n \"unit_id\": \"EDF-COTT-3\",\n \"submission_lag_ms\": 412\n }\n },\n {\n \"auction_id\": 48,\n \"product_code\": \"DCH\",\n \"delivery_start\": \"2025-11-09T04:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 56.24,\n \"volume_mwh\": 57.3,\n \"accepted\": true,\n \"bidder_metadata\": {\n \"unit_id\": \"SSE-PEH-3\",\n \"submission_lag_ms\": 199\n }\n },\n {\n \"auction_id\": 49,\n \"product_code\": \"DCL\",\n \"delivery_start\": \"2025-11-09T08:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 52.96,\n \"volume_mwh\": 88.4,\n \"accepted\": true,\n \"bidder_metadata\": {\n \"unit_id\": \"DRAX-2\",\n \"submission_lag_ms\": 435\n }\n },\n {\n \"auction_id\": 49,\n \"product_code\": \"DCH\",\n \"delivery_start\": \"2025-11-09T08:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 52.19,\n \"volume_mwh\": 139.3,\n \"accepted\": true,\n \"bidder_metadata\": {\n \"unit_id\": \"DRAX-1\",\n \"submission_lag_ms\": 498\n }\n },\n {\n \"auction_id\": 50,\n \"product_code\": \"DCL\",\n \"delivery_start\": 
\"2025-11-09T12:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 45.63,\n \"volume_mwh\": 67.6,\n \"accepted\": true,\n \"bidder_metadata\": {\n \"unit_id\": \"SSE-PEH-3\",\n \"submission_lag_ms\": 337\n }\n },\n {\n \"auction_id\": 50,\n \"product_code\": \"DCH\",\n \"delivery_start\": \"2025-11-09T12:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 36.26,\n \"volume_mwh\": 74.9,\n \"accepted\": true,\n \"bidder_metadata\": {\n \"unit_id\": \"EDF-COTT-3\",\n \"submission_lag_ms\": 168\n }\n },\n {\n \"auction_id\": 51,\n \"product_code\": \"DCL\",\n \"delivery_start\": \"2025-11-09T16:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 55.04,\n \"volume_mwh\": 116.8,\n \"accepted\": true,\n \"bidder_metadata\": {\n \"unit_id\": \"SSE-PEH-3\",\n \"submission_lag_ms\": 262\n }\n },\n {\n \"auction_id\": 51,\n \"product_code\": \"DCH\",\n \"delivery_start\": \"2025-11-09T16:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 58.75,\n \"volume_mwh\": 139.1,\n \"accepted\": true,\n \"bidder_metadata\": {\n \"unit_id\": \"SSE-PEH-3\",\n \"submission_lag_ms\": 318\n }\n },\n {\n \"auction_id\": 52,\n \"product_code\": \"DCL\",\n \"delivery_start\": \"2025-11-09T20:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 55.76,\n \"volume_mwh\": 104.8,\n \"accepted\": true,\n \"bidder_metadata\": {\n \"unit_id\": \"EDF-COTT-3\",\n \"submission_lag_ms\": 330\n }\n },\n {\n \"auction_id\": 52,\n \"product_code\": \"DCH\",\n \"delivery_start\": \"2025-11-09T20:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 41.48,\n \"volume_mwh\": 74.7,\n \"accepted\": true,\n \"bidder_metadata\": {\n \"unit_id\": \"SSE-PEH-3\",\n \"submission_lag_ms\": 348\n }\n },\n {\n \"auction_id\": 53,\n \"product_code\": \"DCL\",\n \"delivery_start\": \"2025-11-10T00:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 50.67,\n \"volume_mwh\": 77.5,\n 
\"accepted\": false,\n \"bidder_metadata\": {\n \"unit_id\": \"EDF-COTT-2\",\n \"submission_lag_ms\": 220\n }\n },\n {\n \"auction_id\": 53,\n \"product_code\": \"DCH\",\n \"delivery_start\": \"2025-11-10T00:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 41.79,\n \"volume_mwh\": 82.0,\n \"accepted\": true,\n \"bidder_metadata\": {\n \"unit_id\": \"DRAX-2\",\n \"submission_lag_ms\": 177\n }\n },\n {\n \"auction_id\": 54,\n \"product_code\": \"DCL\",\n \"delivery_start\": \"2025-11-10T04:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 40.78,\n \"volume_mwh\": 119.4,\n \"accepted\": true,\n \"bidder_metadata\": {\n \"unit_id\": \"DRAX-1\",\n \"submission_lag_ms\": 312\n }\n },\n {\n \"auction_id\": 54,\n \"product_code\": \"DCH\",\n \"delivery_start\": \"2025-11-10T04:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 45.19,\n \"volume_mwh\": 104.3,\n \"accepted\": true,\n \"bidder_metadata\": {\n \"unit_id\": \"DRAX-2\",\n \"submission_lag_ms\": 526\n }\n },\n {\n \"auction_id\": 55,\n \"product_code\": \"DCL\",\n \"delivery_start\": \"2025-11-10T08:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 45.5,\n \"volume_mwh\": 140.5,\n \"accepted\": true,\n \"bidder_metadata\": {\n \"unit_id\": \"DRAX-1\",\n \"submission_lag_ms\": 538\n }\n },\n {\n \"auction_id\": 55,\n \"product_code\": \"DCH\",\n \"delivery_start\": \"2025-11-10T08:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 57.02,\n \"volume_mwh\": 107.6,\n \"accepted\": true,\n \"bidder_metadata\": {\n \"unit_id\": \"EDF-COTT-2\",\n \"submission_lag_ms\": 252\n }\n }\n ]\n}", + "options": { + "raw": { + "language": "json" + } + } + }, + "url": { + "raw": "{{baseUrl}}/api/3/action/datastore_create", + "host": [ + "{{baseUrl}}" + ], + "path": [ + "api", + "3", + "action", + "datastore_create" + ] + }, + "description": "Canonical input: pass a Frictionless Table Schema under `schema` and (optionally) 
seed rows under `records`." + }, + "response": [] + }, + { + "name": "Create - legacy fields, existing resource", + "request": { + "method": "POST", + "header": [ + { + "key": "Content-Type", + "value": "application/json" + } + ], + "body": { + "mode": "raw", + "raw": "{\n \"resource_id\": \"balancing_auction_results_2025\",\n \"fields\": [\n {\n \"id\": \"auction_id\",\n \"type\": \"int4\",\n \"info\": {\n \"title\": \"Auction ID\",\n \"description\": \"Unique auction identifier.\"\n }\n },\n {\n \"id\": \"product_code\",\n \"type\": \"text\"\n },\n {\n \"id\": \"delivery_start\",\n \"type\": \"timestamptz\",\n \"info\": {\n \"title\": \"Delivery Start (UTC)\"\n }\n },\n {\n \"id\": \"duration_minutes\",\n \"type\": \"int4\",\n \"info\": {\n \"unit\": \"minutes\"\n }\n },\n {\n \"id\": \"clearing_price_gbp_per_mwh\",\n \"type\": \"float8\",\n \"info\": {\n \"unit\": \"GBP/MWh\"\n }\n },\n {\n \"id\": \"volume_mwh\",\n \"type\": \"float8\",\n \"info\": {\n \"unit\": \"MWh\"\n }\n },\n {\n \"id\": \"accepted\",\n \"type\": \"bool\"\n },\n {\n \"id\": \"bidder_metadata\",\n \"type\": \"jsonb\"\n }\n ],\n \"primary_key\": [\n \"auction_id\",\n \"product_code\"\n ],\n \"records\": [\n {\n \"auction_id\": 1,\n \"product_code\": \"DCL\",\n \"delivery_start\": \"2025-11-01T08:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 50.99,\n \"volume_mwh\": 52.5,\n \"accepted\": false,\n \"bidder_metadata\": {\n \"unit_id\": \"DRAX-2\",\n \"submission_lag_ms\": 171\n }\n },\n {\n \"auction_id\": 1,\n \"product_code\": \"DCH\",\n \"delivery_start\": \"2025-11-01T08:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 53.41,\n \"volume_mwh\": 117.7,\n \"accepted\": true,\n \"bidder_metadata\": {\n \"unit_id\": \"DRAX-1\",\n \"submission_lag_ms\": 402\n }\n },\n {\n \"auction_id\": 2,\n \"product_code\": \"DCL\",\n \"delivery_start\": \"2025-11-01T12:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 45.55,\n 
\"volume_mwh\": 53.0,\n \"accepted\": false,\n \"bidder_metadata\": {\n \"unit_id\": \"SSE-PEH-3\",\n \"submission_lag_ms\": 408\n }\n },\n {\n \"auction_id\": 2,\n \"product_code\": \"DCH\",\n \"delivery_start\": \"2025-11-01T12:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 35.66,\n \"volume_mwh\": 69.9,\n \"accepted\": true,\n \"bidder_metadata\": {\n \"unit_id\": \"SSE-PEH-3\",\n \"submission_lag_ms\": 314\n }\n },\n {\n \"auction_id\": 3,\n \"product_code\": \"DCL\",\n \"delivery_start\": \"2025-11-01T16:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 40.51,\n \"volume_mwh\": 108.9,\n \"accepted\": true,\n \"bidder_metadata\": {\n \"unit_id\": \"DRAX-1\",\n \"submission_lag_ms\": 488\n }\n },\n {\n \"auction_id\": 3,\n \"product_code\": \"DCH\",\n \"delivery_start\": \"2025-11-01T16:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 55.15,\n \"volume_mwh\": 119.8,\n \"accepted\": true,\n \"bidder_metadata\": {\n \"unit_id\": \"DRAX-2\",\n \"submission_lag_ms\": 210\n }\n },\n {\n \"auction_id\": 4,\n \"product_code\": \"DCL\",\n \"delivery_start\": \"2025-11-01T20:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 58.93,\n \"volume_mwh\": 83.7,\n \"accepted\": false,\n \"bidder_metadata\": {\n \"unit_id\": \"DRAX-1\",\n \"submission_lag_ms\": 283\n }\n },\n {\n \"auction_id\": 4,\n \"product_code\": \"DCH\",\n \"delivery_start\": \"2025-11-01T20:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 56.19,\n \"volume_mwh\": 110.4,\n \"accepted\": true,\n \"bidder_metadata\": {\n \"unit_id\": \"EDF-COTT-3\",\n \"submission_lag_ms\": 374\n }\n },\n {\n \"auction_id\": 5,\n \"product_code\": \"DCL\",\n \"delivery_start\": \"2025-11-02T00:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 38.12,\n \"volume_mwh\": 142.2,\n \"accepted\": false,\n \"bidder_metadata\": {\n \"unit_id\": \"EDF-COTT-2\",\n \"submission_lag_ms\": 524\n }\n },\n 
{\n \"auction_id\": 5,\n \"product_code\": \"DCH\",\n \"delivery_start\": \"2025-11-02T00:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 50.72,\n \"volume_mwh\": 138.5,\n \"accepted\": true,\n \"bidder_metadata\": {\n \"unit_id\": \"DRAX-2\",\n \"submission_lag_ms\": 460\n }\n },\n {\n \"auction_id\": 6,\n \"product_code\": \"DCL\",\n \"delivery_start\": \"2025-11-02T04:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 36.74,\n \"volume_mwh\": 116.1,\n \"accepted\": true,\n \"bidder_metadata\": {\n \"unit_id\": \"DRAX-1\",\n \"submission_lag_ms\": 537\n }\n },\n {\n \"auction_id\": 6,\n \"product_code\": \"DCH\",\n \"delivery_start\": \"2025-11-02T04:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 40.82,\n \"volume_mwh\": 60.1,\n \"accepted\": false,\n \"bidder_metadata\": {\n \"unit_id\": \"EDF-COTT-2\",\n \"submission_lag_ms\": 183\n }\n },\n {\n \"auction_id\": 7,\n \"product_code\": \"DCL\",\n \"delivery_start\": \"2025-11-02T08:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 44.25,\n \"volume_mwh\": 71.0,\n \"accepted\": false,\n \"bidder_metadata\": {\n \"unit_id\": \"DRAX-1\",\n \"submission_lag_ms\": 411\n }\n },\n {\n \"auction_id\": 7,\n \"product_code\": \"DCH\",\n \"delivery_start\": \"2025-11-02T08:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 50.87,\n \"volume_mwh\": 103.4,\n \"accepted\": false,\n \"bidder_metadata\": {\n \"unit_id\": \"EDF-COTT-3\",\n \"submission_lag_ms\": 294\n }\n },\n {\n \"auction_id\": 8,\n \"product_code\": \"DCL\",\n \"delivery_start\": \"2025-11-02T12:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 41.75,\n \"volume_mwh\": 142.5,\n \"accepted\": true,\n \"bidder_metadata\": {\n \"unit_id\": \"DRAX-2\",\n \"submission_lag_ms\": 450\n }\n },\n {\n \"auction_id\": 8,\n \"product_code\": \"DCH\",\n \"delivery_start\": \"2025-11-02T12:00:00Z\",\n \"duration_minutes\": 30,\n 
\"clearing_price_gbp_per_mwh\": 43.11,\n \"volume_mwh\": 126.8,\n \"accepted\": false,\n \"bidder_metadata\": {\n \"unit_id\": \"DRAX-1\",\n \"submission_lag_ms\": 512\n }\n },\n {\n \"auction_id\": 9,\n \"product_code\": \"DCL\",\n \"delivery_start\": \"2025-11-02T16:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 42.89,\n \"volume_mwh\": 76.8,\n \"accepted\": false,\n \"bidder_metadata\": {\n \"unit_id\": \"SSE-PEH-3\",\n \"submission_lag_ms\": 548\n }\n },\n {\n \"auction_id\": 9,\n \"product_code\": \"DCH\",\n \"delivery_start\": \"2025-11-02T16:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 52.95,\n \"volume_mwh\": 71.3,\n \"accepted\": true,\n \"bidder_metadata\": {\n \"unit_id\": \"EDF-COTT-3\",\n \"submission_lag_ms\": 173\n }\n },\n {\n \"auction_id\": 10,\n \"product_code\": \"DCL\",\n \"delivery_start\": \"2025-11-02T20:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 41.62,\n \"volume_mwh\": 74.7,\n \"accepted\": true,\n \"bidder_metadata\": {\n \"unit_id\": \"EDF-COTT-2\",\n \"submission_lag_ms\": 482\n }\n },\n {\n \"auction_id\": 10,\n \"product_code\": \"DCH\",\n \"delivery_start\": \"2025-11-02T20:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 49.61,\n \"volume_mwh\": 139.8,\n \"accepted\": true,\n \"bidder_metadata\": {\n \"unit_id\": \"DRAX-2\",\n \"submission_lag_ms\": 170\n }\n },\n {\n \"auction_id\": 11,\n \"product_code\": \"DCL\",\n \"delivery_start\": \"2025-11-03T00:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 47.74,\n \"volume_mwh\": 59.1,\n \"accepted\": false,\n \"bidder_metadata\": {\n \"unit_id\": \"DRAX-1\",\n \"submission_lag_ms\": 178\n }\n },\n {\n \"auction_id\": 11,\n \"product_code\": \"DCH\",\n \"delivery_start\": \"2025-11-03T00:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 50.69,\n \"volume_mwh\": 129.2,\n \"accepted\": true,\n \"bidder_metadata\": {\n \"unit_id\": 
\"DRAX-1\",\n \"submission_lag_ms\": 297\n }\n },\n {\n \"auction_id\": 12,\n \"product_code\": \"DCL\",\n \"delivery_start\": \"2025-11-03T04:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 44.54,\n \"volume_mwh\": 149.6,\n \"accepted\": true,\n \"bidder_metadata\": {\n \"unit_id\": \"SSE-PEH-3\",\n \"submission_lag_ms\": 540\n }\n },\n {\n \"auction_id\": 12,\n \"product_code\": \"DCH\",\n \"delivery_start\": \"2025-11-03T04:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 58.56,\n \"volume_mwh\": 118.0,\n \"accepted\": false,\n \"bidder_metadata\": {\n \"unit_id\": \"SSE-PEH-3\",\n \"submission_lag_ms\": 484\n }\n },\n {\n \"auction_id\": 13,\n \"product_code\": \"DCL\",\n \"delivery_start\": \"2025-11-03T08:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 41.67,\n \"volume_mwh\": 114.1,\n \"accepted\": false,\n \"bidder_metadata\": {\n \"unit_id\": \"EDF-COTT-3\",\n \"submission_lag_ms\": 180\n }\n },\n {\n \"auction_id\": 13,\n \"product_code\": \"DCH\",\n \"delivery_start\": \"2025-11-03T08:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 46.34,\n \"volume_mwh\": 145.4,\n \"accepted\": true,\n \"bidder_metadata\": {\n \"unit_id\": \"EDF-COTT-2\",\n \"submission_lag_ms\": 597\n }\n },\n {\n \"auction_id\": 14,\n \"product_code\": \"DCL\",\n \"delivery_start\": \"2025-11-03T12:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 47.51,\n \"volume_mwh\": 67.9,\n \"accepted\": true,\n \"bidder_metadata\": {\n \"unit_id\": \"EDF-COTT-2\",\n \"submission_lag_ms\": 530\n }\n },\n {\n \"auction_id\": 14,\n \"product_code\": \"DCH\",\n \"delivery_start\": \"2025-11-03T12:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 50.97,\n \"volume_mwh\": 110.9,\n \"accepted\": false,\n \"bidder_metadata\": {\n \"unit_id\": \"DRAX-2\",\n \"submission_lag_ms\": 376\n }\n },\n {\n \"auction_id\": 15,\n \"product_code\": \"DCL\",\n 
\"delivery_start\": \"2025-11-03T16:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 58.84,\n \"volume_mwh\": 142.2,\n \"accepted\": true,\n \"bidder_metadata\": {\n \"unit_id\": \"SSE-PEH-3\",\n \"submission_lag_ms\": 265\n }\n },\n {\n \"auction_id\": 15,\n \"product_code\": \"DCH\",\n \"delivery_start\": \"2025-11-03T16:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 47.22,\n \"volume_mwh\": 61.2,\n \"accepted\": true,\n \"bidder_metadata\": {\n \"unit_id\": \"EDF-COTT-2\",\n \"submission_lag_ms\": 222\n }\n },\n {\n \"auction_id\": 16,\n \"product_code\": \"DCL\",\n \"delivery_start\": \"2025-11-03T20:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 36.45,\n \"volume_mwh\": 137.8,\n \"accepted\": true,\n \"bidder_metadata\": {\n \"unit_id\": \"DRAX-1\",\n \"submission_lag_ms\": 474\n }\n },\n {\n \"auction_id\": 16,\n \"product_code\": \"DCH\",\n \"delivery_start\": \"2025-11-03T20:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 47.15,\n \"volume_mwh\": 56.9,\n \"accepted\": true,\n \"bidder_metadata\": {\n \"unit_id\": \"DRAX-2\",\n \"submission_lag_ms\": 165\n }\n },\n {\n \"auction_id\": 17,\n \"product_code\": \"DCL\",\n \"delivery_start\": \"2025-11-04T00:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 51.49,\n \"volume_mwh\": 144.7,\n \"accepted\": false,\n \"bidder_metadata\": {\n \"unit_id\": \"SSE-PEH-3\",\n \"submission_lag_ms\": 546\n }\n },\n {\n \"auction_id\": 17,\n \"product_code\": \"DCH\",\n \"delivery_start\": \"2025-11-04T00:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 50.16,\n \"volume_mwh\": 146.4,\n \"accepted\": true,\n \"bidder_metadata\": {\n \"unit_id\": \"DRAX-2\",\n \"submission_lag_ms\": 465\n }\n },\n {\n \"auction_id\": 18,\n \"product_code\": \"DCL\",\n \"delivery_start\": \"2025-11-04T04:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 42.79,\n \"volume_mwh\": 
149.5,\n \"accepted\": true,\n \"bidder_metadata\": {\n \"unit_id\": \"EDF-COTT-3\",\n \"submission_lag_ms\": 560\n }\n },\n {\n \"auction_id\": 18,\n \"product_code\": \"DCH\",\n \"delivery_start\": \"2025-11-04T04:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 47.94,\n \"volume_mwh\": 62.1,\n \"accepted\": false,\n \"bidder_metadata\": {\n \"unit_id\": \"EDF-COTT-2\",\n \"submission_lag_ms\": 110\n }\n },\n {\n \"auction_id\": 19,\n \"product_code\": \"DCL\",\n \"delivery_start\": \"2025-11-04T08:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 49.71,\n \"volume_mwh\": 73.0,\n \"accepted\": false,\n \"bidder_metadata\": {\n \"unit_id\": \"DRAX-1\",\n \"submission_lag_ms\": 462\n }\n },\n {\n \"auction_id\": 19,\n \"product_code\": \"DCH\",\n \"delivery_start\": \"2025-11-04T08:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 50.78,\n \"volume_mwh\": 72.9,\n \"accepted\": true,\n \"bidder_metadata\": {\n \"unit_id\": \"EDF-COTT-2\",\n \"submission_lag_ms\": 136\n }\n },\n {\n \"auction_id\": 20,\n \"product_code\": \"DCL\",\n \"delivery_start\": \"2025-11-04T12:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 47.85,\n \"volume_mwh\": 77.8,\n \"accepted\": true,\n \"bidder_metadata\": {\n \"unit_id\": \"SSE-PEH-3\",\n \"submission_lag_ms\": 167\n }\n },\n {\n \"auction_id\": 20,\n \"product_code\": \"DCH\",\n \"delivery_start\": \"2025-11-04T12:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 53.08,\n \"volume_mwh\": 138.2,\n \"accepted\": true,\n \"bidder_metadata\": {\n \"unit_id\": \"DRAX-2\",\n \"submission_lag_ms\": 501\n }\n },\n {\n \"auction_id\": 21,\n \"product_code\": \"DCL\",\n \"delivery_start\": \"2025-11-04T16:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 46.82,\n \"volume_mwh\": 90.7,\n \"accepted\": false,\n \"bidder_metadata\": {\n \"unit_id\": \"EDF-COTT-3\",\n \"submission_lag_ms\": 281\n }\n },\n {\n 
\"auction_id\": 21,\n \"product_code\": \"DCH\",\n \"delivery_start\": \"2025-11-04T16:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 45.59,\n \"volume_mwh\": 96.7,\n \"accepted\": true,\n \"bidder_metadata\": {\n \"unit_id\": \"DRAX-1\",\n \"submission_lag_ms\": 131\n }\n },\n {\n \"auction_id\": 22,\n \"product_code\": \"DCL\",\n \"delivery_start\": \"2025-11-04T20:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 45.07,\n \"volume_mwh\": 83.9,\n \"accepted\": true,\n \"bidder_metadata\": {\n \"unit_id\": \"DRAX-2\",\n \"submission_lag_ms\": 198\n }\n },\n {\n \"auction_id\": 22,\n \"product_code\": \"DCH\",\n \"delivery_start\": \"2025-11-04T20:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 39.76,\n \"volume_mwh\": 94.9,\n \"accepted\": true,\n \"bidder_metadata\": {\n \"unit_id\": \"EDF-COTT-2\",\n \"submission_lag_ms\": 336\n }\n },\n {\n \"auction_id\": 23,\n \"product_code\": \"DCL\",\n \"delivery_start\": \"2025-11-05T00:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 41.25,\n \"volume_mwh\": 142.3,\n \"accepted\": true,\n \"bidder_metadata\": {\n \"unit_id\": \"SSE-PEH-3\",\n \"submission_lag_ms\": 150\n }\n },\n {\n \"auction_id\": 23,\n \"product_code\": \"DCH\",\n \"delivery_start\": \"2025-11-05T00:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 36.26,\n \"volume_mwh\": 149.9,\n \"accepted\": true,\n \"bidder_metadata\": {\n \"unit_id\": \"DRAX-1\",\n \"submission_lag_ms\": 574\n }\n },\n {\n \"auction_id\": 24,\n \"product_code\": \"DCL\",\n \"delivery_start\": \"2025-11-05T04:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 53.84,\n \"volume_mwh\": 73.6,\n \"accepted\": true,\n \"bidder_metadata\": {\n \"unit_id\": \"EDF-COTT-3\",\n \"submission_lag_ms\": 209\n }\n },\n {\n \"auction_id\": 24,\n \"product_code\": \"DCH\",\n \"delivery_start\": \"2025-11-05T04:00:00Z\",\n \"duration_minutes\": 30,\n 
\"clearing_price_gbp_per_mwh\": 56.62,\n \"volume_mwh\": 140.2,\n \"accepted\": false,\n \"bidder_metadata\": {\n \"unit_id\": \"DRAX-1\",\n \"submission_lag_ms\": 299\n }\n },\n {\n \"auction_id\": 25,\n \"product_code\": \"DCL\",\n \"delivery_start\": \"2025-11-05T08:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 41.63,\n \"volume_mwh\": 128.4,\n \"accepted\": true,\n \"bidder_metadata\": {\n \"unit_id\": \"EDF-COTT-3\",\n \"submission_lag_ms\": 456\n }\n },\n {\n \"auction_id\": 25,\n \"product_code\": \"DCH\",\n \"delivery_start\": \"2025-11-05T08:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 58.93,\n \"volume_mwh\": 149.5,\n \"accepted\": true,\n \"bidder_metadata\": {\n \"unit_id\": \"EDF-COTT-3\",\n \"submission_lag_ms\": 179\n }\n },\n {\n \"auction_id\": 26,\n \"product_code\": \"DCL\",\n \"delivery_start\": \"2025-11-05T12:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 39.75,\n \"volume_mwh\": 71.8,\n \"accepted\": false,\n \"bidder_metadata\": {\n \"unit_id\": \"SSE-PEH-3\",\n \"submission_lag_ms\": 131\n }\n },\n {\n \"auction_id\": 26,\n \"product_code\": \"DCH\",\n \"delivery_start\": \"2025-11-05T12:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 53.7,\n \"volume_mwh\": 55.7,\n \"accepted\": true,\n \"bidder_metadata\": {\n \"unit_id\": \"SSE-PEH-3\",\n \"submission_lag_ms\": 570\n }\n },\n {\n \"auction_id\": 27,\n \"product_code\": \"DCL\",\n \"delivery_start\": \"2025-11-05T16:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 56.32,\n \"volume_mwh\": 65.7,\n \"accepted\": true,\n \"bidder_metadata\": {\n \"unit_id\": \"DRAX-1\",\n \"submission_lag_ms\": 535\n }\n },\n {\n \"auction_id\": 27,\n \"product_code\": \"DCH\",\n \"delivery_start\": \"2025-11-05T16:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 39.65,\n \"volume_mwh\": 109.5,\n \"accepted\": true,\n \"bidder_metadata\": {\n \"unit_id\": 
\"DRAX-2\",\n \"submission_lag_ms\": 306\n }\n },\n {\n \"auction_id\": 28,\n \"product_code\": \"DCL\",\n \"delivery_start\": \"2025-11-05T20:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 38.0,\n \"volume_mwh\": 139.0,\n \"accepted\": false,\n \"bidder_metadata\": {\n \"unit_id\": \"SSE-PEH-3\",\n \"submission_lag_ms\": 120\n }\n },\n {\n \"auction_id\": 28,\n \"product_code\": \"DCH\",\n \"delivery_start\": \"2025-11-05T20:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 50.48,\n \"volume_mwh\": 91.9,\n \"accepted\": true,\n \"bidder_metadata\": {\n \"unit_id\": \"SSE-PEH-3\",\n \"submission_lag_ms\": 261\n }\n },\n {\n \"auction_id\": 29,\n \"product_code\": \"DCL\",\n \"delivery_start\": \"2025-11-06T00:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 58.37,\n \"volume_mwh\": 70.4,\n \"accepted\": true,\n \"bidder_metadata\": {\n \"unit_id\": \"DRAX-2\",\n \"submission_lag_ms\": 235\n }\n },\n {\n \"auction_id\": 29,\n \"product_code\": \"DCH\",\n \"delivery_start\": \"2025-11-06T00:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 44.89,\n \"volume_mwh\": 117.2,\n \"accepted\": false,\n \"bidder_metadata\": {\n \"unit_id\": \"EDF-COTT-2\",\n \"submission_lag_ms\": 575\n }\n },\n {\n \"auction_id\": 30,\n \"product_code\": \"DCL\",\n \"delivery_start\": \"2025-11-06T04:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 53.8,\n \"volume_mwh\": 57.3,\n \"accepted\": true,\n \"bidder_metadata\": {\n \"unit_id\": \"SSE-PEH-3\",\n \"submission_lag_ms\": 151\n }\n },\n {\n \"auction_id\": 30,\n \"product_code\": \"DCH\",\n \"delivery_start\": \"2025-11-06T04:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 36.83,\n \"volume_mwh\": 71.3,\n \"accepted\": false,\n \"bidder_metadata\": {\n \"unit_id\": \"EDF-COTT-2\",\n \"submission_lag_ms\": 551\n }\n },\n {\n \"auction_id\": 31,\n \"product_code\": \"DCL\",\n \"delivery_start\": 
\"2025-11-06T08:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 36.72,\n \"volume_mwh\": 74.4,\n \"accepted\": false,\n \"bidder_metadata\": {\n \"unit_id\": \"EDF-COTT-3\",\n \"submission_lag_ms\": 526\n }\n },\n {\n \"auction_id\": 31,\n \"product_code\": \"DCH\",\n \"delivery_start\": \"2025-11-06T08:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 48.58,\n \"volume_mwh\": 80.3,\n \"accepted\": true,\n \"bidder_metadata\": {\n \"unit_id\": \"SSE-PEH-3\",\n \"submission_lag_ms\": 104\n }\n },\n {\n \"auction_id\": 32,\n \"product_code\": \"DCL\",\n \"delivery_start\": \"2025-11-06T12:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 51.7,\n \"volume_mwh\": 105.5,\n \"accepted\": true,\n \"bidder_metadata\": {\n \"unit_id\": \"DRAX-1\",\n \"submission_lag_ms\": 580\n }\n },\n {\n \"auction_id\": 32,\n \"product_code\": \"DCH\",\n \"delivery_start\": \"2025-11-06T12:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 56.95,\n \"volume_mwh\": 76.4,\n \"accepted\": true,\n \"bidder_metadata\": {\n \"unit_id\": \"SSE-PEH-3\",\n \"submission_lag_ms\": 179\n }\n },\n {\n \"auction_id\": 33,\n \"product_code\": \"DCL\",\n \"delivery_start\": \"2025-11-06T16:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 41.81,\n \"volume_mwh\": 110.5,\n \"accepted\": true,\n \"bidder_metadata\": {\n \"unit_id\": \"DRAX-2\",\n \"submission_lag_ms\": 451\n }\n },\n {\n \"auction_id\": 33,\n \"product_code\": \"DCH\",\n \"delivery_start\": \"2025-11-06T16:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 50.86,\n \"volume_mwh\": 76.4,\n \"accepted\": true,\n \"bidder_metadata\": {\n \"unit_id\": \"DRAX-1\",\n \"submission_lag_ms\": 147\n }\n },\n {\n \"auction_id\": 34,\n \"product_code\": \"DCL\",\n \"delivery_start\": \"2025-11-06T20:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 50.86,\n \"volume_mwh\": 132.9,\n \"accepted\": 
false,\n \"bidder_metadata\": {\n \"unit_id\": \"EDF-COTT-2\",\n \"submission_lag_ms\": 494\n }\n },\n {\n \"auction_id\": 34,\n \"product_code\": \"DCH\",\n \"delivery_start\": \"2025-11-06T20:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 38.27,\n \"volume_mwh\": 148.0,\n \"accepted\": false,\n \"bidder_metadata\": {\n \"unit_id\": \"EDF-COTT-3\",\n \"submission_lag_ms\": 382\n }\n },\n {\n \"auction_id\": 35,\n \"product_code\": \"DCL\",\n \"delivery_start\": \"2025-11-07T00:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 52.64,\n \"volume_mwh\": 106.1,\n \"accepted\": false,\n \"bidder_metadata\": {\n \"unit_id\": \"DRAX-2\",\n \"submission_lag_ms\": 379\n }\n },\n {\n \"auction_id\": 35,\n \"product_code\": \"DCH\",\n \"delivery_start\": \"2025-11-07T00:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 35.9,\n \"volume_mwh\": 86.9,\n \"accepted\": true,\n \"bidder_metadata\": {\n \"unit_id\": \"EDF-COTT-3\",\n \"submission_lag_ms\": 165\n }\n },\n {\n \"auction_id\": 36,\n \"product_code\": \"DCL\",\n \"delivery_start\": \"2025-11-07T04:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 36.05,\n \"volume_mwh\": 86.5,\n \"accepted\": true,\n \"bidder_metadata\": {\n \"unit_id\": \"DRAX-1\",\n \"submission_lag_ms\": 560\n }\n },\n {\n \"auction_id\": 36,\n \"product_code\": \"DCH\",\n \"delivery_start\": \"2025-11-07T04:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 43.95,\n \"volume_mwh\": 118.2,\n \"accepted\": true,\n \"bidder_metadata\": {\n \"unit_id\": \"EDF-COTT-2\",\n \"submission_lag_ms\": 499\n }\n },\n {\n \"auction_id\": 37,\n \"product_code\": \"DCL\",\n \"delivery_start\": \"2025-11-07T08:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 49.0,\n \"volume_mwh\": 137.5,\n \"accepted\": true,\n \"bidder_metadata\": {\n \"unit_id\": \"DRAX-2\",\n \"submission_lag_ms\": 573\n }\n },\n {\n \"auction_id\": 37,\n 
\"product_code\": \"DCH\",\n \"delivery_start\": \"2025-11-07T08:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 58.25,\n \"volume_mwh\": 136.5,\n \"accepted\": true,\n \"bidder_metadata\": {\n \"unit_id\": \"DRAX-2\",\n \"submission_lag_ms\": 551\n }\n },\n {\n \"auction_id\": 38,\n \"product_code\": \"DCL\",\n \"delivery_start\": \"2025-11-07T12:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 45.31,\n \"volume_mwh\": 67.9,\n \"accepted\": true,\n \"bidder_metadata\": {\n \"unit_id\": \"EDF-COTT-3\",\n \"submission_lag_ms\": 510\n }\n },\n {\n \"auction_id\": 38,\n \"product_code\": \"DCH\",\n \"delivery_start\": \"2025-11-07T12:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 51.75,\n \"volume_mwh\": 123.5,\n \"accepted\": false,\n \"bidder_metadata\": {\n \"unit_id\": \"DRAX-2\",\n \"submission_lag_ms\": 503\n }\n },\n {\n \"auction_id\": 39,\n \"product_code\": \"DCL\",\n \"delivery_start\": \"2025-11-07T16:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 52.53,\n \"volume_mwh\": 88.3,\n \"accepted\": false,\n \"bidder_metadata\": {\n \"unit_id\": \"EDF-COTT-3\",\n \"submission_lag_ms\": 213\n }\n },\n {\n \"auction_id\": 39,\n \"product_code\": \"DCH\",\n \"delivery_start\": \"2025-11-07T16:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 39.99,\n \"volume_mwh\": 141.8,\n \"accepted\": true,\n \"bidder_metadata\": {\n \"unit_id\": \"DRAX-2\",\n \"submission_lag_ms\": 214\n }\n },\n {\n \"auction_id\": 40,\n \"product_code\": \"DCL\",\n \"delivery_start\": \"2025-11-07T20:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 35.59,\n \"volume_mwh\": 69.3,\n \"accepted\": true,\n \"bidder_metadata\": {\n \"unit_id\": \"DRAX-1\",\n \"submission_lag_ms\": 595\n }\n },\n {\n \"auction_id\": 40,\n \"product_code\": \"DCH\",\n \"delivery_start\": \"2025-11-07T20:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 
54.33,\n \"volume_mwh\": 85.1,\n \"accepted\": true,\n \"bidder_metadata\": {\n \"unit_id\": \"SSE-PEH-3\",\n \"submission_lag_ms\": 269\n }\n },\n {\n \"auction_id\": 41,\n \"product_code\": \"DCL\",\n \"delivery_start\": \"2025-11-08T00:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 58.48,\n \"volume_mwh\": 61.5,\n \"accepted\": true,\n \"bidder_metadata\": {\n \"unit_id\": \"DRAX-2\",\n \"submission_lag_ms\": 397\n }\n },\n {\n \"auction_id\": 41,\n \"product_code\": \"DCH\",\n \"delivery_start\": \"2025-11-08T00:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 59.06,\n \"volume_mwh\": 76.5,\n \"accepted\": false,\n \"bidder_metadata\": {\n \"unit_id\": \"EDF-COTT-3\",\n \"submission_lag_ms\": 276\n }\n },\n {\n \"auction_id\": 42,\n \"product_code\": \"DCL\",\n \"delivery_start\": \"2025-11-08T04:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 53.21,\n \"volume_mwh\": 81.4,\n \"accepted\": true,\n \"bidder_metadata\": {\n \"unit_id\": \"SSE-PEH-3\",\n \"submission_lag_ms\": 159\n }\n },\n {\n \"auction_id\": 42,\n \"product_code\": \"DCH\",\n \"delivery_start\": \"2025-11-08T04:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 44.63,\n \"volume_mwh\": 107.7,\n \"accepted\": false,\n \"bidder_metadata\": {\n \"unit_id\": \"EDF-COTT-3\",\n \"submission_lag_ms\": 100\n }\n },\n {\n \"auction_id\": 43,\n \"product_code\": \"DCL\",\n \"delivery_start\": \"2025-11-08T08:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 48.0,\n \"volume_mwh\": 130.7,\n \"accepted\": true,\n \"bidder_metadata\": {\n \"unit_id\": \"DRAX-2\",\n \"submission_lag_ms\": 286\n }\n },\n {\n \"auction_id\": 43,\n \"product_code\": \"DCH\",\n \"delivery_start\": \"2025-11-08T08:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 45.78,\n \"volume_mwh\": 144.9,\n \"accepted\": true,\n \"bidder_metadata\": {\n \"unit_id\": \"SSE-PEH-3\",\n \"submission_lag_ms\": 
260\n }\n },\n {\n \"auction_id\": 44,\n \"product_code\": \"DCL\",\n \"delivery_start\": \"2025-11-08T12:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 51.58,\n \"volume_mwh\": 62.5,\n \"accepted\": true,\n \"bidder_metadata\": {\n \"unit_id\": \"SSE-PEH-3\",\n \"submission_lag_ms\": 258\n }\n },\n {\n \"auction_id\": 44,\n \"product_code\": \"DCH\",\n \"delivery_start\": \"2025-11-08T12:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 51.67,\n \"volume_mwh\": 82.6,\n \"accepted\": true,\n \"bidder_metadata\": {\n \"unit_id\": \"SSE-PEH-3\",\n \"submission_lag_ms\": 165\n }\n },\n {\n \"auction_id\": 45,\n \"product_code\": \"DCL\",\n \"delivery_start\": \"2025-11-08T16:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 39.8,\n \"volume_mwh\": 116.5,\n \"accepted\": true,\n \"bidder_metadata\": {\n \"unit_id\": \"DRAX-2\",\n \"submission_lag_ms\": 415\n }\n },\n {\n \"auction_id\": 45,\n \"product_code\": \"DCH\",\n \"delivery_start\": \"2025-11-08T16:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 49.23,\n \"volume_mwh\": 90.6,\n \"accepted\": true,\n \"bidder_metadata\": {\n \"unit_id\": \"EDF-COTT-2\",\n \"submission_lag_ms\": 246\n }\n },\n {\n \"auction_id\": 46,\n \"product_code\": \"DCL\",\n \"delivery_start\": \"2025-11-08T20:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 40.25,\n \"volume_mwh\": 128.6,\n \"accepted\": true,\n \"bidder_metadata\": {\n \"unit_id\": \"EDF-COTT-2\",\n \"submission_lag_ms\": 338\n }\n },\n {\n \"auction_id\": 46,\n \"product_code\": \"DCH\",\n \"delivery_start\": \"2025-11-08T20:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 46.04,\n \"volume_mwh\": 117.6,\n \"accepted\": true,\n \"bidder_metadata\": {\n \"unit_id\": \"DRAX-2\",\n \"submission_lag_ms\": 437\n }\n },\n {\n \"auction_id\": 47,\n \"product_code\": \"DCL\",\n \"delivery_start\": \"2025-11-09T00:00:00Z\",\n 
\"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 37.12,\n \"volume_mwh\": 101.5,\n \"accepted\": true,\n \"bidder_metadata\": {\n \"unit_id\": \"EDF-COTT-2\",\n \"submission_lag_ms\": 147\n }\n },\n {\n \"auction_id\": 47,\n \"product_code\": \"DCH\",\n \"delivery_start\": \"2025-11-09T00:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 55.46,\n \"volume_mwh\": 125.1,\n \"accepted\": true,\n \"bidder_metadata\": {\n \"unit_id\": \"DRAX-2\",\n \"submission_lag_ms\": 512\n }\n },\n {\n \"auction_id\": 48,\n \"product_code\": \"DCL\",\n \"delivery_start\": \"2025-11-09T04:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 39.98,\n \"volume_mwh\": 52.4,\n \"accepted\": false,\n \"bidder_metadata\": {\n \"unit_id\": \"EDF-COTT-3\",\n \"submission_lag_ms\": 412\n }\n },\n {\n \"auction_id\": 48,\n \"product_code\": \"DCH\",\n \"delivery_start\": \"2025-11-09T04:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 56.24,\n \"volume_mwh\": 57.3,\n \"accepted\": true,\n \"bidder_metadata\": {\n \"unit_id\": \"SSE-PEH-3\",\n \"submission_lag_ms\": 199\n }\n },\n {\n \"auction_id\": 49,\n \"product_code\": \"DCL\",\n \"delivery_start\": \"2025-11-09T08:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 52.96,\n \"volume_mwh\": 88.4,\n \"accepted\": true,\n \"bidder_metadata\": {\n \"unit_id\": \"DRAX-2\",\n \"submission_lag_ms\": 435\n }\n },\n {\n \"auction_id\": 49,\n \"product_code\": \"DCH\",\n \"delivery_start\": \"2025-11-09T08:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 52.19,\n \"volume_mwh\": 139.3,\n \"accepted\": true,\n \"bidder_metadata\": {\n \"unit_id\": \"DRAX-1\",\n \"submission_lag_ms\": 498\n }\n },\n {\n \"auction_id\": 50,\n \"product_code\": \"DCL\",\n \"delivery_start\": \"2025-11-09T12:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 45.63,\n \"volume_mwh\": 67.6,\n \"accepted\": true,\n \"bidder_metadata\": 
{\n \"unit_id\": \"SSE-PEH-3\",\n \"submission_lag_ms\": 337\n }\n },\n {\n \"auction_id\": 50,\n \"product_code\": \"DCH\",\n \"delivery_start\": \"2025-11-09T12:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 36.26,\n \"volume_mwh\": 74.9,\n \"accepted\": true,\n \"bidder_metadata\": {\n \"unit_id\": \"EDF-COTT-3\",\n \"submission_lag_ms\": 168\n }\n },\n {\n \"auction_id\": 51,\n \"product_code\": \"DCL\",\n \"delivery_start\": \"2025-11-09T16:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 55.04,\n \"volume_mwh\": 116.8,\n \"accepted\": true,\n \"bidder_metadata\": {\n \"unit_id\": \"SSE-PEH-3\",\n \"submission_lag_ms\": 262\n }\n },\n {\n \"auction_id\": 51,\n \"product_code\": \"DCH\",\n \"delivery_start\": \"2025-11-09T16:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 58.75,\n \"volume_mwh\": 139.1,\n \"accepted\": true,\n \"bidder_metadata\": {\n \"unit_id\": \"SSE-PEH-3\",\n \"submission_lag_ms\": 318\n }\n },\n {\n \"auction_id\": 52,\n \"product_code\": \"DCL\",\n \"delivery_start\": \"2025-11-09T20:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 55.76,\n \"volume_mwh\": 104.8,\n \"accepted\": true,\n \"bidder_metadata\": {\n \"unit_id\": \"EDF-COTT-3\",\n \"submission_lag_ms\": 330\n }\n },\n {\n \"auction_id\": 52,\n \"product_code\": \"DCH\",\n \"delivery_start\": \"2025-11-09T20:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 41.48,\n \"volume_mwh\": 74.7,\n \"accepted\": true,\n \"bidder_metadata\": {\n \"unit_id\": \"SSE-PEH-3\",\n \"submission_lag_ms\": 348\n }\n },\n {\n \"auction_id\": 53,\n \"product_code\": \"DCL\",\n \"delivery_start\": \"2025-11-10T00:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 50.67,\n \"volume_mwh\": 77.5,\n \"accepted\": false,\n \"bidder_metadata\": {\n \"unit_id\": \"EDF-COTT-2\",\n \"submission_lag_ms\": 220\n }\n },\n {\n \"auction_id\": 53,\n \"product_code\": \"DCH\",\n 
\"delivery_start\": \"2025-11-10T00:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 41.79,\n \"volume_mwh\": 82.0,\n \"accepted\": true,\n \"bidder_metadata\": {\n \"unit_id\": \"DRAX-2\",\n \"submission_lag_ms\": 177\n }\n },\n {\n \"auction_id\": 54,\n \"product_code\": \"DCL\",\n \"delivery_start\": \"2025-11-10T04:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 40.78,\n \"volume_mwh\": 119.4,\n \"accepted\": true,\n \"bidder_metadata\": {\n \"unit_id\": \"DRAX-1\",\n \"submission_lag_ms\": 312\n }\n },\n {\n \"auction_id\": 54,\n \"product_code\": \"DCH\",\n \"delivery_start\": \"2025-11-10T04:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 45.19,\n \"volume_mwh\": 104.3,\n \"accepted\": true,\n \"bidder_metadata\": {\n \"unit_id\": \"DRAX-2\",\n \"submission_lag_ms\": 526\n }\n },\n {\n \"auction_id\": 55,\n \"product_code\": \"DCL\",\n \"delivery_start\": \"2025-11-10T08:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 45.5,\n \"volume_mwh\": 140.5,\n \"accepted\": true,\n \"bidder_metadata\": {\n \"unit_id\": \"DRAX-1\",\n \"submission_lag_ms\": 538\n }\n },\n {\n \"auction_id\": 55,\n \"product_code\": \"DCH\",\n \"delivery_start\": \"2025-11-10T08:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 57.02,\n \"volume_mwh\": 107.6,\n \"accepted\": true,\n \"bidder_metadata\": {\n \"unit_id\": \"EDF-COTT-2\",\n \"submission_lag_ms\": 252\n }\n }\n ]\n}", + "options": { + "raw": { + "language": "json" + } + } + }, + "url": { + "raw": "{{baseUrl}}/api/3/action/datastore_create", + "host": [ + "{{baseUrl}}" + ], + "path": [ + "api", + "3", + "action", + "datastore_create" + ] + }, + "description": "Back-compat path: the legacy `fields` + `primary_key` shape against a resource id that already exists in CKAN." 
+ }, + "response": [] + }, + { + "name": "Create - legacy fields, new resource", + "request": { + "method": "POST", + "header": [ + { + "key": "Content-Type", + "value": "application/json" + } + ], + "body": { + "mode": "raw", + "raw": "{\n \"resource\": {\n \"id\": \"balancing_auction_results_2025\",\n \"name\": \"Balancing Auction Results 2025\",\n \"package_id\": \"pkg-balancing-2025\",\n \"format\": \"datastore\",\n \"mimetype\": \"application/json\",\n \"description\": \"Cleared bids from the balancing market, one row per auction \\u00d7 product.\"\n },\n \"fields\": [\n {\n \"id\": \"auction_id\",\n \"type\": \"int4\",\n \"info\": {\n \"title\": \"Auction ID\",\n \"description\": \"Unique auction identifier.\"\n }\n },\n {\n \"id\": \"product_code\",\n \"type\": \"text\"\n },\n {\n \"id\": \"delivery_start\",\n \"type\": \"timestamptz\",\n \"info\": {\n \"title\": \"Delivery Start (UTC)\"\n }\n },\n {\n \"id\": \"duration_minutes\",\n \"type\": \"int4\",\n \"info\": {\n \"unit\": \"minutes\"\n }\n },\n {\n \"id\": \"clearing_price_gbp_per_mwh\",\n \"type\": \"float8\",\n \"info\": {\n \"unit\": \"GBP/MWh\"\n }\n },\n {\n \"id\": \"volume_mwh\",\n \"type\": \"float8\",\n \"info\": {\n \"unit\": \"MWh\"\n }\n },\n {\n \"id\": \"accepted\",\n \"type\": \"bool\"\n },\n {\n \"id\": \"bidder_metadata\",\n \"type\": \"jsonb\"\n }\n ],\n \"primary_key\": [\n \"auction_id\",\n \"product_code\"\n ],\n \"records\": [\n {\n \"auction_id\": 1,\n \"product_code\": \"DCL\",\n \"delivery_start\": \"2025-11-01T08:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 50.99,\n \"volume_mwh\": 52.5,\n \"accepted\": false,\n \"bidder_metadata\": {\n \"unit_id\": \"DRAX-2\",\n \"submission_lag_ms\": 171\n }\n },\n {\n \"auction_id\": 1,\n \"product_code\": \"DCH\",\n \"delivery_start\": \"2025-11-01T08:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 53.41,\n \"volume_mwh\": 117.7,\n \"accepted\": true,\n \"bidder_metadata\": {\n 
\"unit_id\": \"DRAX-1\",\n \"submission_lag_ms\": 402\n }\n },\n {\n \"auction_id\": 2,\n \"product_code\": \"DCL\",\n \"delivery_start\": \"2025-11-01T12:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 45.55,\n \"volume_mwh\": 53.0,\n \"accepted\": false,\n \"bidder_metadata\": {\n \"unit_id\": \"SSE-PEH-3\",\n \"submission_lag_ms\": 408\n }\n },\n {\n \"auction_id\": 2,\n \"product_code\": \"DCH\",\n \"delivery_start\": \"2025-11-01T12:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 35.66,\n \"volume_mwh\": 69.9,\n \"accepted\": true,\n \"bidder_metadata\": {\n \"unit_id\": \"SSE-PEH-3\",\n \"submission_lag_ms\": 314\n }\n },\n {\n \"auction_id\": 3,\n \"product_code\": \"DCL\",\n \"delivery_start\": \"2025-11-01T16:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 40.51,\n \"volume_mwh\": 108.9,\n \"accepted\": true,\n \"bidder_metadata\": {\n \"unit_id\": \"DRAX-1\",\n \"submission_lag_ms\": 488\n }\n },\n {\n \"auction_id\": 3,\n \"product_code\": \"DCH\",\n \"delivery_start\": \"2025-11-01T16:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 55.15,\n \"volume_mwh\": 119.8,\n \"accepted\": true,\n \"bidder_metadata\": {\n \"unit_id\": \"DRAX-2\",\n \"submission_lag_ms\": 210\n }\n },\n {\n \"auction_id\": 4,\n \"product_code\": \"DCL\",\n \"delivery_start\": \"2025-11-01T20:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 58.93,\n \"volume_mwh\": 83.7,\n \"accepted\": false,\n \"bidder_metadata\": {\n \"unit_id\": \"DRAX-1\",\n \"submission_lag_ms\": 283\n }\n },\n {\n \"auction_id\": 4,\n \"product_code\": \"DCH\",\n \"delivery_start\": \"2025-11-01T20:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 56.19,\n \"volume_mwh\": 110.4,\n \"accepted\": true,\n \"bidder_metadata\": {\n \"unit_id\": \"EDF-COTT-3\",\n \"submission_lag_ms\": 374\n }\n },\n {\n \"auction_id\": 5,\n \"product_code\": \"DCL\",\n \"delivery_start\": 
\"2025-11-02T00:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 38.12,\n \"volume_mwh\": 142.2,\n \"accepted\": false,\n \"bidder_metadata\": {\n \"unit_id\": \"EDF-COTT-2\",\n \"submission_lag_ms\": 524\n }\n },\n {\n \"auction_id\": 5,\n \"product_code\": \"DCH\",\n \"delivery_start\": \"2025-11-02T00:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 50.72,\n \"volume_mwh\": 138.5,\n \"accepted\": true,\n \"bidder_metadata\": {\n \"unit_id\": \"DRAX-2\",\n \"submission_lag_ms\": 460\n }\n },\n {\n \"auction_id\": 6,\n \"product_code\": \"DCL\",\n \"delivery_start\": \"2025-11-02T04:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 36.74,\n \"volume_mwh\": 116.1,\n \"accepted\": true,\n \"bidder_metadata\": {\n \"unit_id\": \"DRAX-1\",\n \"submission_lag_ms\": 537\n }\n },\n {\n \"auction_id\": 6,\n \"product_code\": \"DCH\",\n \"delivery_start\": \"2025-11-02T04:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 40.82,\n \"volume_mwh\": 60.1,\n \"accepted\": false,\n \"bidder_metadata\": {\n \"unit_id\": \"EDF-COTT-2\",\n \"submission_lag_ms\": 183\n }\n },\n {\n \"auction_id\": 7,\n \"product_code\": \"DCL\",\n \"delivery_start\": \"2025-11-02T08:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 44.25,\n \"volume_mwh\": 71.0,\n \"accepted\": false,\n \"bidder_metadata\": {\n \"unit_id\": \"DRAX-1\",\n \"submission_lag_ms\": 411\n }\n },\n {\n \"auction_id\": 7,\n \"product_code\": \"DCH\",\n \"delivery_start\": \"2025-11-02T08:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 50.87,\n \"volume_mwh\": 103.4,\n \"accepted\": false,\n \"bidder_metadata\": {\n \"unit_id\": \"EDF-COTT-3\",\n \"submission_lag_ms\": 294\n }\n },\n {\n \"auction_id\": 8,\n \"product_code\": \"DCL\",\n \"delivery_start\": \"2025-11-02T12:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 41.75,\n \"volume_mwh\": 142.5,\n \"accepted\": 
true,\n \"bidder_metadata\": {\n \"unit_id\": \"DRAX-2\",\n \"submission_lag_ms\": 450\n }\n },\n {\n \"auction_id\": 8,\n \"product_code\": \"DCH\",\n \"delivery_start\": \"2025-11-02T12:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 43.11,\n \"volume_mwh\": 126.8,\n \"accepted\": false,\n \"bidder_metadata\": {\n \"unit_id\": \"DRAX-1\",\n \"submission_lag_ms\": 512\n }\n },\n {\n \"auction_id\": 9,\n \"product_code\": \"DCL\",\n \"delivery_start\": \"2025-11-02T16:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 42.89,\n \"volume_mwh\": 76.8,\n \"accepted\": false,\n \"bidder_metadata\": {\n \"unit_id\": \"SSE-PEH-3\",\n \"submission_lag_ms\": 548\n }\n },\n {\n \"auction_id\": 9,\n \"product_code\": \"DCH\",\n \"delivery_start\": \"2025-11-02T16:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 52.95,\n \"volume_mwh\": 71.3,\n \"accepted\": true,\n \"bidder_metadata\": {\n \"unit_id\": \"EDF-COTT-3\",\n \"submission_lag_ms\": 173\n }\n },\n {\n \"auction_id\": 10,\n \"product_code\": \"DCL\",\n \"delivery_start\": \"2025-11-02T20:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 41.62,\n \"volume_mwh\": 74.7,\n \"accepted\": true,\n \"bidder_metadata\": {\n \"unit_id\": \"EDF-COTT-2\",\n \"submission_lag_ms\": 482\n }\n },\n {\n \"auction_id\": 10,\n \"product_code\": \"DCH\",\n \"delivery_start\": \"2025-11-02T20:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 49.61,\n \"volume_mwh\": 139.8,\n \"accepted\": true,\n \"bidder_metadata\": {\n \"unit_id\": \"DRAX-2\",\n \"submission_lag_ms\": 170\n }\n },\n {\n \"auction_id\": 11,\n \"product_code\": \"DCL\",\n \"delivery_start\": \"2025-11-03T00:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 47.74,\n \"volume_mwh\": 59.1,\n \"accepted\": false,\n \"bidder_metadata\": {\n \"unit_id\": \"DRAX-1\",\n \"submission_lag_ms\": 178\n }\n },\n {\n \"auction_id\": 11,\n 
\"product_code\": \"DCH\",\n \"delivery_start\": \"2025-11-03T00:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 50.69,\n \"volume_mwh\": 129.2,\n \"accepted\": true,\n \"bidder_metadata\": {\n \"unit_id\": \"DRAX-1\",\n \"submission_lag_ms\": 297\n }\n },\n {\n \"auction_id\": 12,\n \"product_code\": \"DCL\",\n \"delivery_start\": \"2025-11-03T04:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 44.54,\n \"volume_mwh\": 149.6,\n \"accepted\": true,\n \"bidder_metadata\": {\n \"unit_id\": \"SSE-PEH-3\",\n \"submission_lag_ms\": 540\n }\n },\n {\n \"auction_id\": 12,\n \"product_code\": \"DCH\",\n \"delivery_start\": \"2025-11-03T04:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 58.56,\n \"volume_mwh\": 118.0,\n \"accepted\": false,\n \"bidder_metadata\": {\n \"unit_id\": \"SSE-PEH-3\",\n \"submission_lag_ms\": 484\n }\n },\n {\n \"auction_id\": 13,\n \"product_code\": \"DCL\",\n \"delivery_start\": \"2025-11-03T08:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 41.67,\n \"volume_mwh\": 114.1,\n \"accepted\": false,\n \"bidder_metadata\": {\n \"unit_id\": \"EDF-COTT-3\",\n \"submission_lag_ms\": 180\n }\n },\n {\n \"auction_id\": 13,\n \"product_code\": \"DCH\",\n \"delivery_start\": \"2025-11-03T08:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 46.34,\n \"volume_mwh\": 145.4,\n \"accepted\": true,\n \"bidder_metadata\": {\n \"unit_id\": \"EDF-COTT-2\",\n \"submission_lag_ms\": 597\n }\n },\n {\n \"auction_id\": 14,\n \"product_code\": \"DCL\",\n \"delivery_start\": \"2025-11-03T12:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 47.51,\n \"volume_mwh\": 67.9,\n \"accepted\": true,\n \"bidder_metadata\": {\n \"unit_id\": \"EDF-COTT-2\",\n \"submission_lag_ms\": 530\n }\n },\n {\n \"auction_id\": 14,\n \"product_code\": \"DCH\",\n \"delivery_start\": \"2025-11-03T12:00:00Z\",\n \"duration_minutes\": 30,\n 
\"clearing_price_gbp_per_mwh\": 50.97,\n \"volume_mwh\": 110.9,\n \"accepted\": false,\n \"bidder_metadata\": {\n \"unit_id\": \"DRAX-2\",\n \"submission_lag_ms\": 376\n }\n },\n {\n \"auction_id\": 15,\n \"product_code\": \"DCL\",\n \"delivery_start\": \"2025-11-03T16:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 58.84,\n \"volume_mwh\": 142.2,\n \"accepted\": true,\n \"bidder_metadata\": {\n \"unit_id\": \"SSE-PEH-3\",\n \"submission_lag_ms\": 265\n }\n },\n {\n \"auction_id\": 15,\n \"product_code\": \"DCH\",\n \"delivery_start\": \"2025-11-03T16:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 47.22,\n \"volume_mwh\": 61.2,\n \"accepted\": true,\n \"bidder_metadata\": {\n \"unit_id\": \"EDF-COTT-2\",\n \"submission_lag_ms\": 222\n }\n },\n {\n \"auction_id\": 16,\n \"product_code\": \"DCL\",\n \"delivery_start\": \"2025-11-03T20:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 36.45,\n \"volume_mwh\": 137.8,\n \"accepted\": true,\n \"bidder_metadata\": {\n \"unit_id\": \"DRAX-1\",\n \"submission_lag_ms\": 474\n }\n },\n {\n \"auction_id\": 16,\n \"product_code\": \"DCH\",\n \"delivery_start\": \"2025-11-03T20:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 47.15,\n \"volume_mwh\": 56.9,\n \"accepted\": true,\n \"bidder_metadata\": {\n \"unit_id\": \"DRAX-2\",\n \"submission_lag_ms\": 165\n }\n },\n {\n \"auction_id\": 17,\n \"product_code\": \"DCL\",\n \"delivery_start\": \"2025-11-04T00:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 51.49,\n \"volume_mwh\": 144.7,\n \"accepted\": false,\n \"bidder_metadata\": {\n \"unit_id\": \"SSE-PEH-3\",\n \"submission_lag_ms\": 546\n }\n },\n {\n \"auction_id\": 17,\n \"product_code\": \"DCH\",\n \"delivery_start\": \"2025-11-04T00:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 50.16,\n \"volume_mwh\": 146.4,\n \"accepted\": true,\n \"bidder_metadata\": {\n \"unit_id\": 
\"DRAX-2\",\n \"submission_lag_ms\": 465\n }\n },\n {\n \"auction_id\": 18,\n \"product_code\": \"DCL\",\n \"delivery_start\": \"2025-11-04T04:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 42.79,\n \"volume_mwh\": 149.5,\n \"accepted\": true,\n \"bidder_metadata\": {\n \"unit_id\": \"EDF-COTT-3\",\n \"submission_lag_ms\": 560\n }\n },\n {\n \"auction_id\": 18,\n \"product_code\": \"DCH\",\n \"delivery_start\": \"2025-11-04T04:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 47.94,\n \"volume_mwh\": 62.1,\n \"accepted\": false,\n \"bidder_metadata\": {\n \"unit_id\": \"EDF-COTT-2\",\n \"submission_lag_ms\": 110\n }\n },\n {\n \"auction_id\": 19,\n \"product_code\": \"DCL\",\n \"delivery_start\": \"2025-11-04T08:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 49.71,\n \"volume_mwh\": 73.0,\n \"accepted\": false,\n \"bidder_metadata\": {\n \"unit_id\": \"DRAX-1\",\n \"submission_lag_ms\": 462\n }\n },\n {\n \"auction_id\": 19,\n \"product_code\": \"DCH\",\n \"delivery_start\": \"2025-11-04T08:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 50.78,\n \"volume_mwh\": 72.9,\n \"accepted\": true,\n \"bidder_metadata\": {\n \"unit_id\": \"EDF-COTT-2\",\n \"submission_lag_ms\": 136\n }\n },\n {\n \"auction_id\": 20,\n \"product_code\": \"DCL\",\n \"delivery_start\": \"2025-11-04T12:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 47.85,\n \"volume_mwh\": 77.8,\n \"accepted\": true,\n \"bidder_metadata\": {\n \"unit_id\": \"SSE-PEH-3\",\n \"submission_lag_ms\": 167\n }\n },\n {\n \"auction_id\": 20,\n \"product_code\": \"DCH\",\n \"delivery_start\": \"2025-11-04T12:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 53.08,\n \"volume_mwh\": 138.2,\n \"accepted\": true,\n \"bidder_metadata\": {\n \"unit_id\": \"DRAX-2\",\n \"submission_lag_ms\": 501\n }\n },\n {\n \"auction_id\": 21,\n \"product_code\": \"DCL\",\n \"delivery_start\": 
\"2025-11-04T16:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 46.82,\n \"volume_mwh\": 90.7,\n \"accepted\": false,\n \"bidder_metadata\": {\n \"unit_id\": \"EDF-COTT-3\",\n \"submission_lag_ms\": 281\n }\n },\n {\n \"auction_id\": 21,\n \"product_code\": \"DCH\",\n \"delivery_start\": \"2025-11-04T16:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 45.59,\n \"volume_mwh\": 96.7,\n \"accepted\": true,\n \"bidder_metadata\": {\n \"unit_id\": \"DRAX-1\",\n \"submission_lag_ms\": 131\n }\n },\n {\n \"auction_id\": 22,\n \"product_code\": \"DCL\",\n \"delivery_start\": \"2025-11-04T20:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 45.07,\n \"volume_mwh\": 83.9,\n \"accepted\": true,\n \"bidder_metadata\": {\n \"unit_id\": \"DRAX-2\",\n \"submission_lag_ms\": 198\n }\n },\n {\n \"auction_id\": 22,\n \"product_code\": \"DCH\",\n \"delivery_start\": \"2025-11-04T20:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 39.76,\n \"volume_mwh\": 94.9,\n \"accepted\": true,\n \"bidder_metadata\": {\n \"unit_id\": \"EDF-COTT-2\",\n \"submission_lag_ms\": 336\n }\n },\n {\n \"auction_id\": 23,\n \"product_code\": \"DCL\",\n \"delivery_start\": \"2025-11-05T00:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 41.25,\n \"volume_mwh\": 142.3,\n \"accepted\": true,\n \"bidder_metadata\": {\n \"unit_id\": \"SSE-PEH-3\",\n \"submission_lag_ms\": 150\n }\n },\n {\n \"auction_id\": 23,\n \"product_code\": \"DCH\",\n \"delivery_start\": \"2025-11-05T00:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 36.26,\n \"volume_mwh\": 149.9,\n \"accepted\": true,\n \"bidder_metadata\": {\n \"unit_id\": \"DRAX-1\",\n \"submission_lag_ms\": 574\n }\n },\n {\n \"auction_id\": 24,\n \"product_code\": \"DCL\",\n \"delivery_start\": \"2025-11-05T04:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 53.84,\n \"volume_mwh\": 73.6,\n \"accepted\": 
true,\n \"bidder_metadata\": {\n \"unit_id\": \"EDF-COTT-3\",\n \"submission_lag_ms\": 209\n }\n },\n {\n \"auction_id\": 24,\n \"product_code\": \"DCH\",\n \"delivery_start\": \"2025-11-05T04:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 56.62,\n \"volume_mwh\": 140.2,\n \"accepted\": false,\n \"bidder_metadata\": {\n \"unit_id\": \"DRAX-1\",\n \"submission_lag_ms\": 299\n }\n },\n {\n \"auction_id\": 25,\n \"product_code\": \"DCL\",\n \"delivery_start\": \"2025-11-05T08:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 41.63,\n \"volume_mwh\": 128.4,\n \"accepted\": true,\n \"bidder_metadata\": {\n \"unit_id\": \"EDF-COTT-3\",\n \"submission_lag_ms\": 456\n }\n },\n {\n \"auction_id\": 25,\n \"product_code\": \"DCH\",\n \"delivery_start\": \"2025-11-05T08:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 58.93,\n \"volume_mwh\": 149.5,\n \"accepted\": true,\n \"bidder_metadata\": {\n \"unit_id\": \"EDF-COTT-3\",\n \"submission_lag_ms\": 179\n }\n },\n {\n \"auction_id\": 26,\n \"product_code\": \"DCL\",\n \"delivery_start\": \"2025-11-05T12:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 39.75,\n \"volume_mwh\": 71.8,\n \"accepted\": false,\n \"bidder_metadata\": {\n \"unit_id\": \"SSE-PEH-3\",\n \"submission_lag_ms\": 131\n }\n },\n {\n \"auction_id\": 26,\n \"product_code\": \"DCH\",\n \"delivery_start\": \"2025-11-05T12:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 53.7,\n \"volume_mwh\": 55.7,\n \"accepted\": true,\n \"bidder_metadata\": {\n \"unit_id\": \"SSE-PEH-3\",\n \"submission_lag_ms\": 570\n }\n },\n {\n \"auction_id\": 27,\n \"product_code\": \"DCL\",\n \"delivery_start\": \"2025-11-05T16:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 56.32,\n \"volume_mwh\": 65.7,\n \"accepted\": true,\n \"bidder_metadata\": {\n \"unit_id\": \"DRAX-1\",\n \"submission_lag_ms\": 535\n }\n },\n {\n \"auction_id\": 27,\n 
\"product_code\": \"DCH\",\n \"delivery_start\": \"2025-11-05T16:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 39.65,\n \"volume_mwh\": 109.5,\n \"accepted\": true,\n \"bidder_metadata\": {\n \"unit_id\": \"DRAX-2\",\n \"submission_lag_ms\": 306\n }\n },\n {\n \"auction_id\": 28,\n \"product_code\": \"DCL\",\n \"delivery_start\": \"2025-11-05T20:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 38.0,\n \"volume_mwh\": 139.0,\n \"accepted\": false,\n \"bidder_metadata\": {\n \"unit_id\": \"SSE-PEH-3\",\n \"submission_lag_ms\": 120\n }\n },\n {\n \"auction_id\": 28,\n \"product_code\": \"DCH\",\n \"delivery_start\": \"2025-11-05T20:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 50.48,\n \"volume_mwh\": 91.9,\n \"accepted\": true,\n \"bidder_metadata\": {\n \"unit_id\": \"SSE-PEH-3\",\n \"submission_lag_ms\": 261\n }\n },\n {\n \"auction_id\": 29,\n \"product_code\": \"DCL\",\n \"delivery_start\": \"2025-11-06T00:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 58.37,\n \"volume_mwh\": 70.4,\n \"accepted\": true,\n \"bidder_metadata\": {\n \"unit_id\": \"DRAX-2\",\n \"submission_lag_ms\": 235\n }\n },\n {\n \"auction_id\": 29,\n \"product_code\": \"DCH\",\n \"delivery_start\": \"2025-11-06T00:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 44.89,\n \"volume_mwh\": 117.2,\n \"accepted\": false,\n \"bidder_metadata\": {\n \"unit_id\": \"EDF-COTT-2\",\n \"submission_lag_ms\": 575\n }\n },\n {\n \"auction_id\": 30,\n \"product_code\": \"DCL\",\n \"delivery_start\": \"2025-11-06T04:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 53.8,\n \"volume_mwh\": 57.3,\n \"accepted\": true,\n \"bidder_metadata\": {\n \"unit_id\": \"SSE-PEH-3\",\n \"submission_lag_ms\": 151\n }\n },\n {\n \"auction_id\": 30,\n \"product_code\": \"DCH\",\n \"delivery_start\": \"2025-11-06T04:00:00Z\",\n \"duration_minutes\": 30,\n 
\"clearing_price_gbp_per_mwh\": 36.83,\n \"volume_mwh\": 71.3,\n \"accepted\": false,\n \"bidder_metadata\": {\n \"unit_id\": \"EDF-COTT-2\",\n \"submission_lag_ms\": 551\n }\n },\n {\n \"auction_id\": 31,\n \"product_code\": \"DCL\",\n \"delivery_start\": \"2025-11-06T08:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 36.72,\n \"volume_mwh\": 74.4,\n \"accepted\": false,\n \"bidder_metadata\": {\n \"unit_id\": \"EDF-COTT-3\",\n \"submission_lag_ms\": 526\n }\n },\n {\n \"auction_id\": 31,\n \"product_code\": \"DCH\",\n \"delivery_start\": \"2025-11-06T08:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 48.58,\n \"volume_mwh\": 80.3,\n \"accepted\": true,\n \"bidder_metadata\": {\n \"unit_id\": \"SSE-PEH-3\",\n \"submission_lag_ms\": 104\n }\n },\n {\n \"auction_id\": 32,\n \"product_code\": \"DCL\",\n \"delivery_start\": \"2025-11-06T12:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 51.7,\n \"volume_mwh\": 105.5,\n \"accepted\": true,\n \"bidder_metadata\": {\n \"unit_id\": \"DRAX-1\",\n \"submission_lag_ms\": 580\n }\n },\n {\n \"auction_id\": 32,\n \"product_code\": \"DCH\",\n \"delivery_start\": \"2025-11-06T12:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 56.95,\n \"volume_mwh\": 76.4,\n \"accepted\": true,\n \"bidder_metadata\": {\n \"unit_id\": \"SSE-PEH-3\",\n \"submission_lag_ms\": 179\n }\n },\n {\n \"auction_id\": 33,\n \"product_code\": \"DCL\",\n \"delivery_start\": \"2025-11-06T16:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 41.81,\n \"volume_mwh\": 110.5,\n \"accepted\": true,\n \"bidder_metadata\": {\n \"unit_id\": \"DRAX-2\",\n \"submission_lag_ms\": 451\n }\n },\n {\n \"auction_id\": 33,\n \"product_code\": \"DCH\",\n \"delivery_start\": \"2025-11-06T16:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 50.86,\n \"volume_mwh\": 76.4,\n \"accepted\": true,\n \"bidder_metadata\": {\n \"unit_id\": 
\"DRAX-1\",\n \"submission_lag_ms\": 147\n }\n },\n {\n \"auction_id\": 34,\n \"product_code\": \"DCL\",\n \"delivery_start\": \"2025-11-06T20:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 50.86,\n \"volume_mwh\": 132.9,\n \"accepted\": false,\n \"bidder_metadata\": {\n \"unit_id\": \"EDF-COTT-2\",\n \"submission_lag_ms\": 494\n }\n },\n {\n \"auction_id\": 34,\n \"product_code\": \"DCH\",\n \"delivery_start\": \"2025-11-06T20:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 38.27,\n \"volume_mwh\": 148.0,\n \"accepted\": false,\n \"bidder_metadata\": {\n \"unit_id\": \"EDF-COTT-3\",\n \"submission_lag_ms\": 382\n }\n },\n {\n \"auction_id\": 35,\n \"product_code\": \"DCL\",\n \"delivery_start\": \"2025-11-07T00:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 52.64,\n \"volume_mwh\": 106.1,\n \"accepted\": false,\n \"bidder_metadata\": {\n \"unit_id\": \"DRAX-2\",\n \"submission_lag_ms\": 379\n }\n },\n {\n \"auction_id\": 35,\n \"product_code\": \"DCH\",\n \"delivery_start\": \"2025-11-07T00:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 35.9,\n \"volume_mwh\": 86.9,\n \"accepted\": true,\n \"bidder_metadata\": {\n \"unit_id\": \"EDF-COTT-3\",\n \"submission_lag_ms\": 165\n }\n },\n {\n \"auction_id\": 36,\n \"product_code\": \"DCL\",\n \"delivery_start\": \"2025-11-07T04:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 36.05,\n \"volume_mwh\": 86.5,\n \"accepted\": true,\n \"bidder_metadata\": {\n \"unit_id\": \"DRAX-1\",\n \"submission_lag_ms\": 560\n }\n },\n {\n \"auction_id\": 36,\n \"product_code\": \"DCH\",\n \"delivery_start\": \"2025-11-07T04:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 43.95,\n \"volume_mwh\": 118.2,\n \"accepted\": true,\n \"bidder_metadata\": {\n \"unit_id\": \"EDF-COTT-2\",\n \"submission_lag_ms\": 499\n }\n },\n {\n \"auction_id\": 37,\n \"product_code\": \"DCL\",\n \"delivery_start\": 
\"2025-11-07T08:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 49.0,\n \"volume_mwh\": 137.5,\n \"accepted\": true,\n \"bidder_metadata\": {\n \"unit_id\": \"DRAX-2\",\n \"submission_lag_ms\": 573\n }\n },\n {\n \"auction_id\": 37,\n \"product_code\": \"DCH\",\n \"delivery_start\": \"2025-11-07T08:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 58.25,\n \"volume_mwh\": 136.5,\n \"accepted\": true,\n \"bidder_metadata\": {\n \"unit_id\": \"DRAX-2\",\n \"submission_lag_ms\": 551\n }\n },\n {\n \"auction_id\": 38,\n \"product_code\": \"DCL\",\n \"delivery_start\": \"2025-11-07T12:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 45.31,\n \"volume_mwh\": 67.9,\n \"accepted\": true,\n \"bidder_metadata\": {\n \"unit_id\": \"EDF-COTT-3\",\n \"submission_lag_ms\": 510\n }\n },\n {\n \"auction_id\": 38,\n \"product_code\": \"DCH\",\n \"delivery_start\": \"2025-11-07T12:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 51.75,\n \"volume_mwh\": 123.5,\n \"accepted\": false,\n \"bidder_metadata\": {\n \"unit_id\": \"DRAX-2\",\n \"submission_lag_ms\": 503\n }\n },\n {\n \"auction_id\": 39,\n \"product_code\": \"DCL\",\n \"delivery_start\": \"2025-11-07T16:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 52.53,\n \"volume_mwh\": 88.3,\n \"accepted\": false,\n \"bidder_metadata\": {\n \"unit_id\": \"EDF-COTT-3\",\n \"submission_lag_ms\": 213\n }\n },\n {\n \"auction_id\": 39,\n \"product_code\": \"DCH\",\n \"delivery_start\": \"2025-11-07T16:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 39.99,\n \"volume_mwh\": 141.8,\n \"accepted\": true,\n \"bidder_metadata\": {\n \"unit_id\": \"DRAX-2\",\n \"submission_lag_ms\": 214\n }\n },\n {\n \"auction_id\": 40,\n \"product_code\": \"DCL\",\n \"delivery_start\": \"2025-11-07T20:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 35.59,\n \"volume_mwh\": 69.3,\n \"accepted\": 
true,\n \"bidder_metadata\": {\n \"unit_id\": \"DRAX-1\",\n \"submission_lag_ms\": 595\n }\n },\n {\n \"auction_id\": 40,\n \"product_code\": \"DCH\",\n \"delivery_start\": \"2025-11-07T20:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 54.33,\n \"volume_mwh\": 85.1,\n \"accepted\": true,\n \"bidder_metadata\": {\n \"unit_id\": \"SSE-PEH-3\",\n \"submission_lag_ms\": 269\n }\n },\n {\n \"auction_id\": 41,\n \"product_code\": \"DCL\",\n \"delivery_start\": \"2025-11-08T00:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 58.48,\n \"volume_mwh\": 61.5,\n \"accepted\": true,\n \"bidder_metadata\": {\n \"unit_id\": \"DRAX-2\",\n \"submission_lag_ms\": 397\n }\n },\n {\n \"auction_id\": 41,\n \"product_code\": \"DCH\",\n \"delivery_start\": \"2025-11-08T00:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 59.06,\n \"volume_mwh\": 76.5,\n \"accepted\": false,\n \"bidder_metadata\": {\n \"unit_id\": \"EDF-COTT-3\",\n \"submission_lag_ms\": 276\n }\n },\n {\n \"auction_id\": 42,\n \"product_code\": \"DCL\",\n \"delivery_start\": \"2025-11-08T04:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 53.21,\n \"volume_mwh\": 81.4,\n \"accepted\": true,\n \"bidder_metadata\": {\n \"unit_id\": \"SSE-PEH-3\",\n \"submission_lag_ms\": 159\n }\n },\n {\n \"auction_id\": 42,\n \"product_code\": \"DCH\",\n \"delivery_start\": \"2025-11-08T04:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 44.63,\n \"volume_mwh\": 107.7,\n \"accepted\": false,\n \"bidder_metadata\": {\n \"unit_id\": \"EDF-COTT-3\",\n \"submission_lag_ms\": 100\n }\n },\n {\n \"auction_id\": 43,\n \"product_code\": \"DCL\",\n \"delivery_start\": \"2025-11-08T08:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 48.0,\n \"volume_mwh\": 130.7,\n \"accepted\": true,\n \"bidder_metadata\": {\n \"unit_id\": \"DRAX-2\",\n \"submission_lag_ms\": 286\n }\n },\n {\n \"auction_id\": 43,\n 
\"product_code\": \"DCH\",\n \"delivery_start\": \"2025-11-08T08:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 45.78,\n \"volume_mwh\": 144.9,\n \"accepted\": true,\n \"bidder_metadata\": {\n \"unit_id\": \"SSE-PEH-3\",\n \"submission_lag_ms\": 260\n }\n },\n {\n \"auction_id\": 44,\n \"product_code\": \"DCL\",\n \"delivery_start\": \"2025-11-08T12:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 51.58,\n \"volume_mwh\": 62.5,\n \"accepted\": true,\n \"bidder_metadata\": {\n \"unit_id\": \"SSE-PEH-3\",\n \"submission_lag_ms\": 258\n }\n },\n {\n \"auction_id\": 44,\n \"product_code\": \"DCH\",\n \"delivery_start\": \"2025-11-08T12:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 51.67,\n \"volume_mwh\": 82.6,\n \"accepted\": true,\n \"bidder_metadata\": {\n \"unit_id\": \"SSE-PEH-3\",\n \"submission_lag_ms\": 165\n }\n },\n {\n \"auction_id\": 45,\n \"product_code\": \"DCL\",\n \"delivery_start\": \"2025-11-08T16:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 39.8,\n \"volume_mwh\": 116.5,\n \"accepted\": true,\n \"bidder_metadata\": {\n \"unit_id\": \"DRAX-2\",\n \"submission_lag_ms\": 415\n }\n },\n {\n \"auction_id\": 45,\n \"product_code\": \"DCH\",\n \"delivery_start\": \"2025-11-08T16:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 49.23,\n \"volume_mwh\": 90.6,\n \"accepted\": true,\n \"bidder_metadata\": {\n \"unit_id\": \"EDF-COTT-2\",\n \"submission_lag_ms\": 246\n }\n },\n {\n \"auction_id\": 46,\n \"product_code\": \"DCL\",\n \"delivery_start\": \"2025-11-08T20:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 40.25,\n \"volume_mwh\": 128.6,\n \"accepted\": true,\n \"bidder_metadata\": {\n \"unit_id\": \"EDF-COTT-2\",\n \"submission_lag_ms\": 338\n }\n },\n {\n \"auction_id\": 46,\n \"product_code\": \"DCH\",\n \"delivery_start\": \"2025-11-08T20:00:00Z\",\n \"duration_minutes\": 30,\n 
\"clearing_price_gbp_per_mwh\": 46.04,\n \"volume_mwh\": 117.6,\n \"accepted\": true,\n \"bidder_metadata\": {\n \"unit_id\": \"DRAX-2\",\n \"submission_lag_ms\": 437\n }\n },\n {\n \"auction_id\": 47,\n \"product_code\": \"DCL\",\n \"delivery_start\": \"2025-11-09T00:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 37.12,\n \"volume_mwh\": 101.5,\n \"accepted\": true,\n \"bidder_metadata\": {\n \"unit_id\": \"EDF-COTT-2\",\n \"submission_lag_ms\": 147\n }\n },\n {\n \"auction_id\": 47,\n \"product_code\": \"DCH\",\n \"delivery_start\": \"2025-11-09T00:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 55.46,\n \"volume_mwh\": 125.1,\n \"accepted\": true,\n \"bidder_metadata\": {\n \"unit_id\": \"DRAX-2\",\n \"submission_lag_ms\": 512\n }\n },\n {\n \"auction_id\": 48,\n \"product_code\": \"DCL\",\n \"delivery_start\": \"2025-11-09T04:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 39.98,\n \"volume_mwh\": 52.4,\n \"accepted\": false,\n \"bidder_metadata\": {\n \"unit_id\": \"EDF-COTT-3\",\n \"submission_lag_ms\": 412\n }\n },\n {\n \"auction_id\": 48,\n \"product_code\": \"DCH\",\n \"delivery_start\": \"2025-11-09T04:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 56.24,\n \"volume_mwh\": 57.3,\n \"accepted\": true,\n \"bidder_metadata\": {\n \"unit_id\": \"SSE-PEH-3\",\n \"submission_lag_ms\": 199\n }\n },\n {\n \"auction_id\": 49,\n \"product_code\": \"DCL\",\n \"delivery_start\": \"2025-11-09T08:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 52.96,\n \"volume_mwh\": 88.4,\n \"accepted\": true,\n \"bidder_metadata\": {\n \"unit_id\": \"DRAX-2\",\n \"submission_lag_ms\": 435\n }\n },\n {\n \"auction_id\": 49,\n \"product_code\": \"DCH\",\n \"delivery_start\": \"2025-11-09T08:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 52.19,\n \"volume_mwh\": 139.3,\n \"accepted\": true,\n \"bidder_metadata\": {\n \"unit_id\": 
\"DRAX-1\",\n \"submission_lag_ms\": 498\n }\n },\n {\n \"auction_id\": 50,\n \"product_code\": \"DCL\",\n \"delivery_start\": \"2025-11-09T12:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 45.63,\n \"volume_mwh\": 67.6,\n \"accepted\": true,\n \"bidder_metadata\": {\n \"unit_id\": \"SSE-PEH-3\",\n \"submission_lag_ms\": 337\n }\n },\n {\n \"auction_id\": 50,\n \"product_code\": \"DCH\",\n \"delivery_start\": \"2025-11-09T12:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 36.26,\n \"volume_mwh\": 74.9,\n \"accepted\": true,\n \"bidder_metadata\": {\n \"unit_id\": \"EDF-COTT-3\",\n \"submission_lag_ms\": 168\n }\n },\n {\n \"auction_id\": 51,\n \"product_code\": \"DCL\",\n \"delivery_start\": \"2025-11-09T16:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 55.04,\n \"volume_mwh\": 116.8,\n \"accepted\": true,\n \"bidder_metadata\": {\n \"unit_id\": \"SSE-PEH-3\",\n \"submission_lag_ms\": 262\n }\n },\n {\n \"auction_id\": 51,\n \"product_code\": \"DCH\",\n \"delivery_start\": \"2025-11-09T16:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 58.75,\n \"volume_mwh\": 139.1,\n \"accepted\": true,\n \"bidder_metadata\": {\n \"unit_id\": \"SSE-PEH-3\",\n \"submission_lag_ms\": 318\n }\n },\n {\n \"auction_id\": 52,\n \"product_code\": \"DCL\",\n \"delivery_start\": \"2025-11-09T20:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 55.76,\n \"volume_mwh\": 104.8,\n \"accepted\": true,\n \"bidder_metadata\": {\n \"unit_id\": \"EDF-COTT-3\",\n \"submission_lag_ms\": 330\n }\n },\n {\n \"auction_id\": 52,\n \"product_code\": \"DCH\",\n \"delivery_start\": \"2025-11-09T20:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 41.48,\n \"volume_mwh\": 74.7,\n \"accepted\": true,\n \"bidder_metadata\": {\n \"unit_id\": \"SSE-PEH-3\",\n \"submission_lag_ms\": 348\n }\n },\n {\n \"auction_id\": 53,\n \"product_code\": \"DCL\",\n \"delivery_start\": 
\"2025-11-10T00:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 50.67,\n \"volume_mwh\": 77.5,\n \"accepted\": false,\n \"bidder_metadata\": {\n \"unit_id\": \"EDF-COTT-2\",\n \"submission_lag_ms\": 220\n }\n },\n {\n \"auction_id\": 53,\n \"product_code\": \"DCH\",\n \"delivery_start\": \"2025-11-10T00:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 41.79,\n \"volume_mwh\": 82.0,\n \"accepted\": true,\n \"bidder_metadata\": {\n \"unit_id\": \"DRAX-2\",\n \"submission_lag_ms\": 177\n }\n },\n {\n \"auction_id\": 54,\n \"product_code\": \"DCL\",\n \"delivery_start\": \"2025-11-10T04:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 40.78,\n \"volume_mwh\": 119.4,\n \"accepted\": true,\n \"bidder_metadata\": {\n \"unit_id\": \"DRAX-1\",\n \"submission_lag_ms\": 312\n }\n },\n {\n \"auction_id\": 54,\n \"product_code\": \"DCH\",\n \"delivery_start\": \"2025-11-10T04:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 45.19,\n \"volume_mwh\": 104.3,\n \"accepted\": true,\n \"bidder_metadata\": {\n \"unit_id\": \"DRAX-2\",\n \"submission_lag_ms\": 526\n }\n },\n {\n \"auction_id\": 55,\n \"product_code\": \"DCL\",\n \"delivery_start\": \"2025-11-10T08:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 45.5,\n \"volume_mwh\": 140.5,\n \"accepted\": true,\n \"bidder_metadata\": {\n \"unit_id\": \"DRAX-1\",\n \"submission_lag_ms\": 538\n }\n },\n {\n \"auction_id\": 55,\n \"product_code\": \"DCH\",\n \"delivery_start\": \"2025-11-10T08:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 57.02,\n \"volume_mwh\": 107.6,\n \"accepted\": true,\n \"bidder_metadata\": {\n \"unit_id\": \"EDF-COTT-2\",\n \"submission_lag_ms\": 252\n }\n }\n ]\n}", + "options": { + "raw": { + "language": "json" + } + } + }, + "url": { + "raw": "{{baseUrl}}/api/3/action/datastore_create", + "host": [ + "{{baseUrl}}" + ], + "path": [ + "api", + "3", + "action", + 
"datastore_create" + ] + }, + "description": "Back-compat path: declare a brand-new CKAN resource inline with `resource: {\u2026}` plus legacy `fields` / `primary_key`." + }, + "response": [] + } + ] + }, + { + "name": "datastore_upsert", + "description": "Write rows. `method` picks the strategy: `upsert` (default \u2014 match by primaryKey, insert new), `insert` (fail on duplicate), or `update` (fail on miss). Run after `datastore_create`.", + "item": [ + { + "name": "Upsert - method=upsert (default)", + "request": { + "method": "POST", + "header": [ + { + "key": "Content-Type", + "value": "application/json" + } + ], + "body": { + "mode": "raw", + "raw": "{\n \"resource_id\": \"balancing_auction_results_2025\",\n \"method\": \"upsert\",\n \"records\": [\n {\n \"auction_id\": 1,\n \"product_code\": \"DCL\",\n \"delivery_start\": \"2025-11-01T08:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 41.2,\n \"volume_mwh\": 95.0,\n \"accepted\": false,\n \"bidder_metadata\": {\n \"unit_id\": \"DRAX-1\",\n \"submission_lag_ms\": 412,\n \"revision\": 2\n }\n },\n {\n \"auction_id\": 200,\n \"product_code\": \"DCL\",\n \"delivery_start\": \"2025-12-01T08:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 39.5,\n \"volume_mwh\": 110.0,\n \"accepted\": true,\n \"bidder_metadata\": {\n \"unit_id\": \"DRAX-2\",\n \"submission_lag_ms\": 350\n }\n }\n ],\n \"include_records\": true,\n \"include_total\": false,\n \"force\": false\n}", + "options": { + "raw": { + "language": "json" + } + } + }, + "url": { + "raw": "{{baseUrl}}/api/3/action/datastore_upsert", + "host": [ + "{{baseUrl}}" + ], + "path": [ + "api", + "3", + "action", + "datastore_upsert" + ] + }, + "description": "Match each row by stored `primaryKey`; update on hit, insert on miss. Updates only bump `_updated_at` when a non-PK column actually changes." 
+ }, + "response": [] + }, + { + "name": "Upsert - method=insert", + "request": { + "method": "POST", + "header": [ + { + "key": "Content-Type", + "value": "application/json" + } + ], + "body": { + "mode": "raw", + "raw": "{\n \"resource_id\": \"balancing_auction_results_2025\",\n \"method\": \"insert\",\n \"records\": [\n {\n \"auction_id\": 100,\n \"product_code\": \"DCL\",\n \"delivery_start\": \"2025-11-17T20:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 53.84,\n \"volume_mwh\": 135.3,\n \"accepted\": true,\n \"bidder_metadata\": {\n \"unit_id\": \"EDF-COTT-3\",\n \"submission_lag_ms\": 375\n }\n },\n {\n \"auction_id\": 101,\n \"product_code\": \"DCL\",\n \"delivery_start\": \"2025-11-18T00:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 53.69,\n \"volume_mwh\": 104.6,\n \"accepted\": true,\n \"bidder_metadata\": {\n \"unit_id\": \"DRAX-2\",\n \"submission_lag_ms\": 349\n }\n },\n {\n \"auction_id\": 102,\n \"product_code\": \"DCL\",\n \"delivery_start\": \"2025-11-18T04:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 40.49,\n \"volume_mwh\": 93.6,\n \"accepted\": false,\n \"bidder_metadata\": {\n \"unit_id\": \"EDF-COTT-2\",\n \"submission_lag_ms\": 442\n }\n },\n {\n \"auction_id\": 103,\n \"product_code\": \"DCL\",\n \"delivery_start\": \"2025-11-18T08:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 51.98,\n \"volume_mwh\": 90.4,\n \"accepted\": false,\n \"bidder_metadata\": {\n \"unit_id\": \"EDF-COTT-3\",\n \"submission_lag_ms\": 570\n }\n },\n {\n \"auction_id\": 104,\n \"product_code\": \"DCL\",\n \"delivery_start\": \"2025-11-18T12:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 38.19,\n \"volume_mwh\": 112.2,\n \"accepted\": false,\n \"bidder_metadata\": {\n \"unit_id\": \"EDF-COTT-3\",\n \"submission_lag_ms\": 403\n }\n },\n {\n \"auction_id\": 105,\n \"product_code\": \"DCL\",\n \"delivery_start\": \"2025-11-18T16:00:00Z\",\n 
\"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 49.11,\n \"volume_mwh\": 52.7,\n \"accepted\": true,\n \"bidder_metadata\": {\n \"unit_id\": \"DRAX-2\",\n \"submission_lag_ms\": 543\n }\n },\n {\n \"auction_id\": 106,\n \"product_code\": \"DCL\",\n \"delivery_start\": \"2025-11-18T20:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 46.54,\n \"volume_mwh\": 55.0,\n \"accepted\": true,\n \"bidder_metadata\": {\n \"unit_id\": \"DRAX-2\",\n \"submission_lag_ms\": 332\n }\n },\n {\n \"auction_id\": 107,\n \"product_code\": \"DCL\",\n \"delivery_start\": \"2025-11-19T00:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 43.17,\n \"volume_mwh\": 126.1,\n \"accepted\": true,\n \"bidder_metadata\": {\n \"unit_id\": \"EDF-COTT-3\",\n \"submission_lag_ms\": 229\n }\n },\n {\n \"auction_id\": 108,\n \"product_code\": \"DCL\",\n \"delivery_start\": \"2025-11-19T04:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 55.87,\n \"volume_mwh\": 97.0,\n \"accepted\": true,\n \"bidder_metadata\": {\n \"unit_id\": \"DRAX-1\",\n \"submission_lag_ms\": 587\n }\n },\n {\n \"auction_id\": 109,\n \"product_code\": \"DCL\",\n \"delivery_start\": \"2025-11-19T08:00:00Z\",\n \"duration_minutes\": 30,\n \"clearing_price_gbp_per_mwh\": 43.75,\n \"volume_mwh\": 115.0,\n \"accepted\": true,\n \"bidder_metadata\": {\n \"unit_id\": \"DRAX-1\",\n \"submission_lag_ms\": 486\n }\n }\n ]\n}", + "options": { + "raw": { + "language": "json" + } + } + }, + "url": { + "raw": "{{baseUrl}}/api/3/action/datastore_upsert", + "host": [ + "{{baseUrl}}" + ], + "path": [ + "api", + "3", + "action", + "datastore_upsert" + ] + }, + "description": "Insert only; duplicate primary key surfaces as a clean ValidationError." 
+ }, + "response": [] + }, + { + "name": "Upsert - method=update", + "request": { + "method": "POST", + "header": [ + { + "key": "Content-Type", + "value": "application/json" + } + ], + "body": { + "mode": "raw", + "raw": "{\n \"resource_id\": \"balancing_auction_results_2025\",\n \"method\": \"update\",\n \"records\": [\n {\n \"auction_id\": 1,\n \"product_code\": \"DCL\",\n \"accepted\": false,\n \"bidder_metadata\": {\n \"unit_id\": \"DRAX-1\",\n \"rejection_reason\": \"settlement_dispute\"\n }\n },\n {\n \"auction_id\": 1,\n \"product_code\": \"DCH\",\n \"clearing_price_gbp_per_mwh\": 50.85\n }\n ]\n}", + "options": { + "raw": { + "language": "json" + } + } + }, + "url": { + "raw": "{{baseUrl}}/api/3/action/datastore_upsert", + "host": [ + "{{baseUrl}}" + ], + "path": [ + "api", + "3", + "action", + "datastore_upsert" + ] + }, + "description": "Update only; any row whose `primaryKey` doesn't match an existing row raises NotFoundError." + }, + "response": [] + } + ] + }, + { + "name": "datastore_info", + "description": "Read the resource's column schema + row count. Useful for confirming the table exists and verifying writes landed.", + "item": [ + { + "name": "Info - by resource_id", + "request": { + "method": "GET", + "header": [], + "url": { + "raw": "{{baseUrl}}/api/3/action/datastore_info?resource_id=balancing_auction_results_2025", + "host": [ + "{{baseUrl}}" + ], + "path": [ + "api", + "3", + "action", + "datastore_info" + ], + "query": [ + { + "key": "resource_id", + "value": "balancing_auction_results_2025" + } + ] + }, + "description": "Return the stored Frictionless schema + row count via `INFORMATION_SCHEMA.TABLE_STORAGE`." 
+ }, + "response": [] + }, + { + "name": "Info - `id` alias", + "request": { + "method": "GET", + "header": [], + "url": { + "raw": "{{baseUrl}}/api/3/action/datastore_info?id=balancing_auction_results_2025", + "host": [ + "{{baseUrl}}" + ], + "path": [ + "api", + "3", + "action", + "datastore_info" + ], + "query": [ + { + "key": "id", + "value": "balancing_auction_results_2025" + } + ] + }, + "description": "`id` is accepted as a legacy CKAN alias for `resource_id`." + }, + "response": [] + } + ] + }, + { + "name": "datastore_search", + "description": "Stream rows matching `filters` / `q`. Pagination via `limit` + `offset`; response carries `_links.next` / `prev` / `page_size` / `page` / `total_pages`.", + "item": [ + { + "name": "Search - minimal (just resource_id)", + "request": { + "method": "GET", + "header": [], + "url": { + "raw": "{{baseUrl}}/api/3/action/datastore_search?resource_id=balancing_auction_results_2025", + "host": [ + "{{baseUrl}}" + ], + "path": [ + "api", + "3", + "action", + "datastore_search" + ], + "query": [ + { + "key": "resource_id", + "value": "balancing_auction_results_2025" + } + ] + }, + "description": "Default page (limit=100, offset=0, include_total=true)." + }, + "response": [] + }, + { + "name": "Search - with filters", + "request": { + "method": "GET", + "header": [], + "url": { + "raw": "{{baseUrl}}/api/3/action/datastore_search?resource_id=balancing_auction_results_2025&filters={\"product_code\": \"DCL\", \"accepted\": true}&limit=100", + "host": [ + "{{baseUrl}}" + ], + "path": [ + "api", + "3", + "action", + "datastore_search" + ], + "query": [ + { + "key": "resource_id", + "value": "balancing_auction_results_2025" + }, + { + "key": "filters", + "value": "{\"product_code\": \"DCL\", \"accepted\": true}" + }, + { + "key": "limit", + "value": "100" + } + ] + }, + "description": "JSON-encoded filters on the URL. Value matches must respect column types (no JSON-column equality)." 
+ }, + "response": [] + }, + { + "name": "Search - `q` full-text", + "request": { + "method": "GET", + "header": [], + "url": { + "raw": "{{baseUrl}}/api/3/action/datastore_search?resource_id=balancing_auction_results_2025&q=DRAX&plain=true&language=english&limit=50", + "host": [ + "{{baseUrl}}" + ], + "path": [ + "api", + "3", + "action", + "datastore_search" + ], + "query": [ + { + "key": "resource_id", + "value": "balancing_auction_results_2025" + }, + { + "key": "q", + "value": "DRAX" + }, + { + "key": "plain", + "value": "true" + }, + { + "key": "language", + "value": "english" + }, + { + "key": "limit", + "value": "50" + } + ] + }, + "description": "BigQuery `SEARCH(row, @q)` against every text column." + }, + "response": [] + }, + { + "name": "Search - paginated + sorted", + "request": { + "method": "GET", + "header": [], + "url": { + "raw": "{{baseUrl}}/api/3/action/datastore_search?resource_id=balancing_auction_results_2025&fields=auction_id,product_code,delivery_start,clearing_price_gbp_per_mwh,volume_mwh&sort=delivery_start desc, clearing_price_gbp_per_mwh asc&limit=100&offset=0&include_total=true", + "host": [ + "{{baseUrl}}" + ], + "path": [ + "api", + "3", + "action", + "datastore_search" + ], + "query": [ + { + "key": "resource_id", + "value": "balancing_auction_results_2025" + }, + { + "key": "fields", + "value": "auction_id,product_code,delivery_start,clearing_price_gbp_per_mwh,volume_mwh" + }, + { + "key": "sort", + "value": "delivery_start desc, clearing_price_gbp_per_mwh asc" + }, + { + "key": "limit", + "value": "100" + }, + { + "key": "offset", + "value": "0" + }, + { + "key": "include_total", + "value": "true" + } + ] + }, + "description": "Custom projection (CSV), multi-column sort, explicit limit / offset / include_total. Drives `_links.next`." + }, + "response": [] + } + ] + }, + { + "name": "datastore_search_sql", + "description": "Run a vetted `SELECT` / `WITH` statement. 
`LIMIT` is required (parsed from the SQL); pagination links rewrite the SQL's `OFFSET` so callers can follow `next` without editing.", + "item": [ + { + "name": "SQL - basic SELECT", + "request": { + "method": "GET", + "header": [], + "url": { + "raw": "{{baseUrl}}/api/3/action/datastore_search_sql?sql=SELECT auction_id, product_code, clearing_price_gbp_per_mwh FROM \"balancing_auction_results_2025\" WHERE accepted = true LIMIT 100", + "host": [ + "{{baseUrl}}" + ], + "path": [ + "api", + "3", + "action", + "datastore_search_sql" + ], + "query": [ + { + "key": "sql", + "value": "SELECT auction_id, product_code, clearing_price_gbp_per_mwh FROM \"balancing_auction_results_2025\" WHERE accepted = true LIMIT 100" + } + ] + }, + "description": "Plain SELECT with WHERE + LIMIT. Total comes from `INFORMATION_SCHEMA.TABLE_STORAGE` since there's no aggregate." + }, + "response": [] + }, + { + "name": "SQL - GROUP BY + aggregates", + "request": { + "method": "GET", + "header": [], + "url": { + "raw": "{{baseUrl}}/api/3/action/datastore_search_sql?sql=SELECT product_code, AVG(clearing_price_gbp_per_mwh) AS avg_price, SUM(volume_mwh) AS total_volume FROM \"balancing_auction_results_2025\" WHERE accepted = true GROUP BY product_code ORDER BY product_code LIMIT 50", + "host": [ + "{{baseUrl}}" + ], + "path": [ + "api", + "3", + "action", + "datastore_search_sql" + ], + "query": [ + { + "key": "sql", + "value": "SELECT product_code, AVG(clearing_price_gbp_per_mwh) AS avg_price, SUM(volume_mwh) AS total_volume FROM \"balancing_auction_results_2025\" WHERE accepted = true GROUP BY product_code ORDER BY product_code LIMIT 50" + } + ] + }, + "description": "Aggregates collapse rows, so total goes through the `COUNT(*) FROM ()` path instead of the metadata shortcut." 
+ }, + "response": [] + }, + { + "name": "SQL - WITH (CTE)", + "request": { + "method": "GET", + "header": [], + "url": { + "raw": "{{baseUrl}}/api/3/action/datastore_search_sql?sql=WITH daily AS (SELECT DATE(delivery_start) AS d, product_code, AVG(clearing_price_gbp_per_mwh) AS price FROM \"balancing_auction_results_2025\" WHERE accepted = true GROUP BY d, product_code) SELECT * FROM daily ORDER BY d DESC, product_code LIMIT 50", + "host": [ + "{{baseUrl}}" + ], + "path": [ + "api", + "3", + "action", + "datastore_search_sql" + ], + "query": [ + { + "key": "sql", + "value": "WITH daily AS (SELECT DATE(delivery_start) AS d, product_code, AVG(clearing_price_gbp_per_mwh) AS price FROM \"balancing_auction_results_2025\" WHERE accepted = true GROUP BY d, product_code) SELECT * FROM daily ORDER BY d DESC, product_code LIMIT 50" + } + ] + }, + "description": "Common table expression. CTE aliases are excluded from auth + qualification (they're inline, not external tables)." + }, + "response": [] + }, + { + "name": "SQL - LIMIT + OFFSET", + "request": { + "method": "GET", + "header": [], + "url": { + "raw": "{{baseUrl}}/api/3/action/datastore_search_sql?sql=SELECT _id, auction_id, product_code, delivery_start FROM \"balancing_auction_results_2025\" ORDER BY _id ASC LIMIT 50 OFFSET 100", + "host": [ + "{{baseUrl}}" + ], + "path": [ + "api", + "3", + "action", + "datastore_search_sql" + ], + "query": [ + { + "key": "sql", + "value": "SELECT _id, auction_id, product_code, delivery_start FROM \"balancing_auction_results_2025\" ORDER BY _id ASC LIMIT 50 OFFSET 100" + } + ] + }, + "description": "Pagination via OFFSET. `_links.next` will rewrite the OFFSET in the SQL string so the caller can follow without editing." 
+ }, + "response": [] + }, + { + "name": "SQL - JOIN two resources", + "request": { + "method": "GET", + "header": [], + "url": { + "raw": "{{baseUrl}}/api/3/action/datastore_search_sql?sql=SELECT a._id AS a_id, b._id AS b_id, a.auction_id, a.product_code FROM \"balancing_auction_results_2025\" a JOIN \"balancing_auction_results_2024\" b ON a.auction_id = b.auction_id WHERE a.product_code = 'DCL' ORDER BY a.auction_id ASC LIMIT 50", + "host": [ + "{{baseUrl}}" + ], + "path": [ + "api", + "3", + "action", + "datastore_search_sql" + ], + "query": [ + { + "key": "sql", + "value": "SELECT a._id AS a_id, b._id AS b_id, a.auction_id, a.product_code FROM \"balancing_auction_results_2025\" a JOIN \"balancing_auction_results_2024\" b ON a.auction_id = b.auction_id WHERE a.product_code = 'DCL' ORDER BY a.auction_id ASC LIMIT 50" + } + ] + }, + "description": "JOIN across two resource_ids. Each table is authorised independently via CKAN; both get the `project.dataset` prefix." + }, + "response": [] + }, + { + "name": "SQL - UNION ALL two resources", + "request": { + "method": "GET", + "header": [], + "url": { + "raw": "{{baseUrl}}/api/3/action/datastore_search_sql?sql=SELECT '2025' AS source, auction_id, product_code, delivery_start FROM \"balancing_auction_results_2025\" UNION ALL SELECT '2024' AS source, auction_id, product_code, delivery_start FROM \"balancing_auction_results_2024\" ORDER BY delivery_start DESC LIMIT 100", + "host": [ + "{{baseUrl}}" + ], + "path": [ + "api", + "3", + "action", + "datastore_search_sql" + ], + "query": [ + { + "key": "sql", + "value": "SELECT '2025' AS source, auction_id, product_code, delivery_start FROM \"balancing_auction_results_2025\" UNION ALL SELECT '2024' AS source, auction_id, product_code, delivery_start FROM \"balancing_auction_results_2024\" ORDER BY delivery_start DESC LIMIT 100" + } + ] + }, + "description": "UNION ALL across two resource_ids \u2014 handy for combined reports over time-partitioned tables." 
+ }, + "response": [] + } + ] + }, + { + "name": "datastore_delete", + "description": "Cleanup. Drop the table (no filters/fields), delete rows (`filters`), or drop columns (`fields`). `filters` and `fields` are mutually exclusive.", + "item": [ + { + "name": "Delete - drop the whole table", + "request": { + "method": "POST", + "header": [ + { + "key": "Content-Type", + "value": "application/json" + } + ], + "body": { + "mode": "raw", + "raw": "{\n \"resource_id\": \"balancing_auction_results_2025\"\n}", + "options": { + "raw": { + "language": "json" + } + } + }, + "url": { + "raw": "{{baseUrl}}/api/3/action/datastore_delete", + "host": [ + "{{baseUrl}}" + ], + "path": [ + "api", + "3", + "action", + "datastore_delete" + ] + }, + "description": "No `filters`, no `fields` \u2192 DROP TABLE + delete metadata row. Resource disappears entirely." + }, + "response": [] + }, + { + "name": "Delete - narrow row delete", + "request": { + "method": "POST", + "header": [ + { + "key": "Content-Type", + "value": "application/json" + } + ], + "body": { + "mode": "raw", + "raw": "{\n \"resource_id\": \"balancing_auction_results_2025\",\n \"filters\": {\n \"auction_id\": 1\n }\n}", + "options": { + "raw": { + "language": "json" + } + } + }, + "url": { + "raw": "{{baseUrl}}/api/3/action/datastore_delete", + "host": [ + "{{baseUrl}}" + ], + "path": [ + "api", + "3", + "action", + "datastore_delete" + ] + }, + "description": "Parameterised `DELETE FROM \u2026 WHERE \u2026` against the filter columns. JSON-column equality rejected at the boundary." 
+ }, + "response": [] + }, + { + "name": "Delete - drop columns", + "request": { + "method": "POST", + "header": [ + { + "key": "Content-Type", + "value": "application/json" + } + ], + "body": { + "mode": "raw", + "raw": "{\n \"resource_id\": \"balancing_auction_results_2025\",\n \"fields\": [\n \"bidder_metadata\"\n ]\n}", + "options": { + "raw": { + "language": "json" + } + } + }, + "url": { + "raw": "{{baseUrl}}/api/3/action/datastore_delete", + "host": [ + "{{baseUrl}}" + ], + "path": [ + "api", + "3", + "action", + "datastore_delete" + ] + }, + "description": "`ALTER TABLE DROP COLUMN \u2026` + rewrite the stored schema. System columns (`_id`, `_updated_at`) and PK columns are protected." + }, + "response": [] + }, + { + "name": "Delete - force read-only resource", + "request": { + "method": "POST", + "header": [ + { + "key": "Content-Type", + "value": "application/json" + } + ], + "body": { + "mode": "raw", + "raw": "{\n \"resource_id\": \"balancing_auction_results_2025\",\n \"filters\": {\n \"product_code\": \"FFR\"\n },\n \"force\": true\n}", + "options": { + "raw": { + "language": "json" + } + } + }, + "url": { + "raw": "{{baseUrl}}/api/3/action/datastore_delete", + "host": [ + "{{baseUrl}}" + ], + "path": [ + "api", + "3", + "action", + "datastore_delete" + ] + }, + "description": "`force=true` bypasses the CKAN read-only guard." + }, + "response": [] + } + ] + } + ] +} diff --git a/postman/generate_postman.py b/postman/generate_postman.py new file mode 100644 index 0000000..aca9144 --- /dev/null +++ b/postman/generate_postman.py @@ -0,0 +1,397 @@ +"""Build a Postman v2.1.0 collection from `example_payload/`. + +Walks every `example_payload/<endpoint>/*.json`, decides POST-vs-GET +from the endpoint name, and emits a request per example. POST bodies +carry the JSON verbatim; GET endpoints unfold the top-level keys into +URL query params (matching how the API actually consumes them).
+ +Run from the repo root: + + python postman/generate_postman.py + +Output: `postman/collection.json` — import into Postman / Insomnia and +set the `apiKey` collection variable. +""" + +from __future__ import annotations + +import json +import uuid +from pathlib import Path +from typing import Any +from urllib.parse import quote + +REPO = Path(__file__).resolve().parent.parent +SOURCE_DIR = REPO / "example_payload" +OUT_FILE = REPO / "postman" / "collection.json" + +# Each endpoint's HTTP method + folder description. Order here matches +# the walkthrough flow: declare → write → inspect → query → cleanup. +# Run requests top-to-bottom on a fresh resource to see the full chain. +ENDPOINTS: list[tuple[str, str, str]] = [ + ( + "datastore_create", "POST", + "Declare a resource and optionally seed it with rows. Run this " + "first. Accepts either the canonical Frictionless `schema` or " + "the legacy `fields` + `primary_key` shape.", + ), + ( + "datastore_upsert", "POST", + "Write rows. `method` picks the strategy: `upsert` (default — " + "match by primaryKey, insert new), `insert` (fail on duplicate), " + "or `update` (fail on miss). Run after `datastore_create`.", + ), + ( + "datastore_info", "GET", + "Read the resource's column schema + row count. Useful for " + "confirming the table exists and verifying writes landed.", + ), + ( + "datastore_search", "GET", + "Stream rows matching `filters` / `q`. Pagination via " + "`limit` + `offset`; response carries `_links.next` / `prev` / " + "`page_size` / `page` / `total_pages`.", + ), + ( + "datastore_search_sql", "GET", + "Run a vetted `SELECT` / `WITH` statement. `LIMIT` is required " + "(parsed from the SQL); pagination links rewrite the SQL's " + "`OFFSET` so callers can follow `next` without editing.", + ), + ( + "datastore_delete", "POST", + "Cleanup. Drop the table (no filters/fields), delete rows " + "(`filters`), or drop columns (`fields`). 
`filters` and " + "`fields` are mutually exclusive.", + ), +] + +HEALTH_REQUESTS: list[tuple[str, str, str]] = [ + ("Welcome", "", "Banner / root endpoint. Echoes `APP_MESSAGE`."), + ("Health", "health", "Liveness probe — always 200 while the process is up."), + ("Ready", "ready", "Readiness probe — 200 only when both engines pass " + "healthcheck."), +] + + +def _request_url(path: str, query: list[dict[str, str]] | None = None) -> dict[str, Any]: + """Postman v2.1 structured URL — lets the Postman UI edit params.""" + parts = path.strip("/").split("/") + # Values can be JSON-encoded (e.g. `filters={"col":"v"}`) or contain + # spaces / `=` / `&`; percent-encode so the `raw` URL parses cleanly. + url: dict[str, Any] = { + "raw": "{{baseUrl}}/" + "/".join(parts) + ( + "?" + "&".join( + f"{quote(q['key'], safe='')}={quote(q['value'], safe='')}" + for q in query + ) + if query else "" + ), + "host": ["{{baseUrl}}"], + "path": parts, + } + if query: + url["query"] = query + return url + + +def _post_request(action: str, body: dict[str, Any], description: str) -> dict[str, Any]: + return { + "method": "POST", + "header": [{"key": "Content-Type", "value": "application/json"}], + "body": { + "mode": "raw", + "raw": json.dumps(body, indent=2), + "options": {"raw": {"language": "json"}}, + }, + "url": _request_url(f"api/3/action/{action}"), + "description": description, + } + + +def _get_request(action: str, body: dict[str, Any], description: str) -> dict[str, Any]: + """Each top-level key of the JSON becomes a query-string param.""" + query: list[dict[str, str]] = [] + for key, value in body.items(): + if value is None: + continue + if isinstance(value, (dict, list)): + # GET endpoints accept these as JSON-encoded strings on the + # URL — see `to_json_object` / `to_csv_list` in validators. 
+ string_value = json.dumps(value, separators=(",", ":")) + else: + string_value = str(value).lower() if isinstance(value, bool) else str(value) + query.append({"key": key, "value": string_value}) + return { + "method": "GET", + "header": [], + "url": _request_url(f"api/3/action/{action}", query=query), + "description": description, + } + + +# Friendly request names + descriptions for each example payload. Falls +# back to a generated string when a file isn't listed here. Keys are +# `<endpoint>/<stem>` so the same stem can mean different things across +# endpoints (e.g. `with_filters` in search vs delete). +SCENARIOS: dict[str, tuple[str, str]] = { + "datastore_create/with_schema": ( + "Create - Frictionless schema (recommended)", + "Canonical input: pass a Frictionless Table Schema under " + "`schema` and (optionally) seed rows under `records`.", + ), + "datastore_create/with_resource_id": ( + "Create - legacy fields, existing resource", + "Back-compat path: the legacy `fields` + `primary_key` shape " + "against a resource id that already exists in CKAN.", + ), + "datastore_create/with_resource": ( + "Create - legacy fields, new resource", + "Back-compat path: declare a brand-new CKAN resource inline " + "with `resource: {…}` plus legacy `fields` / `primary_key`.", + ), + "datastore_upsert/upsert": ( + "Upsert - method=upsert (default)", + "Match each row by stored `primaryKey`; update on hit, insert " + "on miss.
Updates only bump `_updated_at` when a non-PK column " + "actually changes.", + ), + "datastore_upsert/insert": ( + "Upsert - method=insert", + "Insert only; duplicate primary key surfaces as a clean " + "ValidationError.", + ), + "datastore_upsert/update": ( + "Upsert - method=update", + "Update only; any row whose `primaryKey` doesn't match an " + "existing row raises NotFoundError.", + ), + "datastore_info/basic": ( + "Info - by resource_id", + "Return the stored Frictionless schema + row count via " + "`INFORMATION_SCHEMA.TABLE_STORAGE`.", + ), + "datastore_info/with_id_alias": ( + "Info - `id` alias", + "`id` is accepted as a legacy CKAN alias for `resource_id`.", + ), + "datastore_search/basic": ( + "Search - minimal (just resource_id)", + "Default page (limit=100, offset=0, include_total=true).", + ), + "datastore_search/with_filters": ( + "Search - with filters", + "JSON-encoded filters on the URL. Value matches must respect " + "column types (no JSON-column equality).", + ), + "datastore_search/with_full_text": ( + "Search - `q` full-text", + "BigQuery `SEARCH(row, @q)` against every text column.", + ), + "datastore_search/paginated_sorted": ( + "Search - paginated + sorted", + "Custom projection (CSV), multi-column sort, explicit limit / " + "offset / include_total. Drives `_links.next`.", + ), + "datastore_search_sql/basic": ( + "SQL - basic SELECT", + "Plain SELECT with WHERE + LIMIT. Total comes from " + "`INFORMATION_SCHEMA.TABLE_STORAGE` since there's no aggregate.", + ), + "datastore_search_sql/aggregate": ( + "SQL - GROUP BY + aggregates", + "Aggregates collapse rows, so total goes through the " + "`COUNT(*) FROM ()` path instead of the metadata shortcut.", + ), + "datastore_search_sql/with_cte": ( + "SQL - WITH (CTE)", + "Common table expression. CTE aliases are excluded from auth + " + "qualification (they're inline, not external tables).", + ), + "datastore_search_sql/paginated": ( + "SQL - LIMIT + OFFSET", + "Pagination via OFFSET. 
`_links.next` will rewrite the OFFSET " + "in the SQL string so the caller can follow without editing.", + ), + "datastore_search_sql/join": ( + "SQL - JOIN two resources", + "JOIN across two resource_ids. Each table is authorised " + "independently via CKAN; both get the `project.dataset` prefix.", + ), + "datastore_search_sql/union": ( + "SQL - UNION ALL two resources", + "UNION ALL across two resource_ids — handy for combined " + "reports over time-partitioned tables.", + ), + "datastore_delete/whole_table": ( + "Delete - drop the whole table", + "No `filters`, no `fields` → DROP TABLE + delete metadata row. " + "Resource disappears entirely.", + ), + "datastore_delete/with_filters": ( + "Delete - narrow row delete", + "Parameterised `DELETE FROM … WHERE …` against the filter " + "columns. JSON-column equality rejected at the boundary.", + ), + "datastore_delete/with_fields": ( + "Delete - drop columns", + "`ALTER TABLE DROP COLUMN …` + rewrite the stored schema. " + "System columns (`_id`, `_updated_at`) and PK columns are " + "protected.", + ), + "datastore_delete/force_readonly": ( + "Delete - force read-only resource", + "`force=true` bypasses the CKAN read-only guard.", + ), +} + + +# Preferred request order within each folder. Items not listed here +# fall in at the end, alphabetically. 
+SCENARIO_ORDER: dict[str, list[str]] = { + "datastore_create": [ + "with_schema", "with_resource_id", "with_resource", + ], + "datastore_upsert": [ + "upsert", "insert", "update", + ], + "datastore_info": [ + "basic", "with_id_alias", + ], + "datastore_search": [ + "basic", "with_filters", "with_full_text", "paginated_sorted", + ], + "datastore_search_sql": [ + "basic", "aggregate", "with_cte", + "paginated", "join", "union", + ], + "datastore_delete": [ + "whole_table", "with_filters", "with_fields", "force_readonly", + ], +} + + +def _sorted_payloads(action: str, dir_path: Path) -> list[Path]: + """Order example files by `SCENARIO_ORDER` (intro → advanced); fall + back to filename for anything not pre-listed.""" + preferred = SCENARIO_ORDER.get(action, []) + rank = {stem: i for i, stem in enumerate(preferred)} + files = [p for p in dir_path.iterdir() if p.suffix == ".json"] + files.sort(key=lambda p: (rank.get(p.stem, 10_000), p.stem)) + return files + + +def _scenario(action: str, payload_file: Path) -> tuple[str, str]: + """Friendly name + description for a request.""" + key = f"{action}/{payload_file.stem}" + if key in SCENARIOS: + return SCENARIOS[key] + # Fallback for new examples not yet in the lookup. + name = payload_file.stem.replace("_", " ") + rel = payload_file.relative_to(REPO).as_posix() + return name, f"Scenario from {rel}." + + +def _build_endpoint_folder( + action: str, method: str, description: str, +) -> dict[str, Any]: + folder_items: list[dict[str, Any]] = [] + dir_path = SOURCE_DIR / action + if not dir_path.is_dir(): + return {"name": action, "description": description, "item": []} + for payload_file in _sorted_payloads(action, dir_path): + if "response" in payload_file.stem: + # Sample server responses live next to requests for docs; + # they don't belong in a Postman *request* collection. 
+ continue + with payload_file.open() as f: + body = json.load(f) + name, scenario_desc = _scenario(action, payload_file) + builder = _post_request if method == "POST" else _get_request + folder_items.append({ + "name": name, + "request": builder(action, body, scenario_desc), + "response": [], + }) + return { + "name": action, + "description": description, + "item": folder_items, + } + + +def _build_health_folder() -> dict[str, Any]: + items = [] + for name, sub, desc in HEALTH_REQUESTS: + path = sub if sub else "" + items.append({ + "name": name, + "request": { + "method": "GET", + "header": [], + "url": _request_url(path), + "description": desc, + }, + "response": [], + }) + return { + "name": "health", + "description": ( + "Health endpoints live at the root and also under " + "`/api/3/action/` so k8s probes and CKAN clients can both " + "reach them. Listed here at the root." + ), + "item": items, + } + + +def build_collection() -> dict[str, Any]: + folders: list[dict[str, Any]] = [_build_health_folder()] + for action, method, description in ENDPOINTS: + folders.append(_build_endpoint_folder(action, method, description)) + return { + "info": { + "_postman_id": str(uuid.uuid4()), + "name": "Datastore API", + "description": ( + "CKAN-compatible datastore API — auto-generated from " + "`example_payload/`. Set `baseUrl` to your server, " + "`apiKey` to a CKAN API key (anonymous reads are " + "allowed; writes require a key), and `resourceId` to " + "the table you want to hit." 
+ ), + "schema": ( + "https://schema.getpostman.com/json/collection/" + "v2.1.0/collection.json" + ), + }, + "variable": [ + {"key": "baseUrl", "value": "http://localhost:8000", "type": "string"}, + {"key": "apiKey", "value": "", "type": "string"}, + {"key": "resourceId", "value": "balancing_auction_results_2025", + "type": "string"}, + ], + "auth": { + "type": "apikey", + "apikey": [ + {"key": "key", "value": "Authorization", "type": "string"}, + {"key": "value", "value": "{{apiKey}}", "type": "string"}, + {"key": "in", "value": "header", "type": "string"}, + ], + }, + "item": folders, + } + + +def main() -> None: + collection = build_collection() + OUT_FILE.parent.mkdir(exist_ok=True) + with OUT_FILE.open("w") as f: + json.dump(collection, f, indent=2) + f.write("\n") + request_count = sum(len(folder["item"]) for folder in collection["item"]) + print(f"Wrote {OUT_FILE.relative_to(REPO)} with {request_count} requests.") + + +if __name__ == "__main__": + main() diff --git a/pyproject.toml b/pyproject.toml index 5b73216..8f2d5c2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -20,6 +20,7 @@ dependencies = [ "uvloop>=0.21", "httptools>=0.6", "sqlglot>=25.0", + "pyjwt>=2.8,<3", ] [project.optional-dependencies] diff --git a/tests/auth/__init__.py b/tests/auth/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/auth/anonymous/__init__.py b/tests/auth/anonymous/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/auth/anonymous/test_provider.py b/tests/auth/anonymous/test_provider.py new file mode 100644 index 0000000..42eaf89 --- /dev/null +++ b/tests/auth/anonymous/test_provider.py @@ -0,0 +1,48 @@ +"""Anonymous provider — every call returns an empty `Decision`.""" + +from __future__ import annotations + +import asyncio + +from datastore.auth.anonymous import Provider as AnonymousProvider +from datastore.auth.base import Decision + + +def test_authorize_returns_empty_decision_regardless_of_inputs() -> None: + provider = 
AnonymousProvider() + decision = asyncio.run(provider.authorize( + credential=None, + resource_id="any", + package_id=None, + permission="read", + )) + assert decision == Decision() + + +def test_authorize_does_not_care_about_credential() -> None: + provider = AnonymousProvider() + # Same result whether or not a token is presented. + a = asyncio.run(provider.authorize( + credential="token-1", resource_id="r", package_id=None, permission="read", + )) + b = asyncio.run(provider.authorize( + credential=None, resource_id="r", package_id=None, permission="read", + )) + assert a == b == Decision() + + +def test_key_id_is_constant_anon_string() -> None: + # Stable across credentials — the provider has no notion of identity. + provider = AnonymousProvider() + assert provider.key_id("anything") == "anon" + assert provider.key_id("") == "anon" + + +def test_provider_name_is_anonymous() -> None: + assert AnonymousProvider().name == "anonymous" + + +def test_constructor_absorbs_unused_kwargs() -> None: + # Lifespan passes `config=` / `ckan=` for all providers uniformly; + # the anonymous one must ignore them without error. + AnonymousProvider(config=object(), ckan=object()) diff --git a/tests/auth/ckan/__init__.py b/tests/auth/ckan/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/auth/ckan/test_provider.py b/tests/auth/ckan/test_provider.py new file mode 100644 index 0000000..37e3853 --- /dev/null +++ b/tests/auth/ckan/test_provider.py @@ -0,0 +1,265 @@ +"""CKAN provider — binds api_key per call, maps result to Decision, caches. + +The provider holds an unbound `CKANClient` + a TTL cache; each `authorize()` +call clones the client with the caller's credential and wraps the round +trip in the cache. Tests use a small fake CKAN to pin both the binding / +mapping and the cache hit/miss/fail-open behaviour. 
+""" + +from __future__ import annotations + +import asyncio +from typing import Any + +import jwt +import orjson +import pytest +from datastore.auth.ckan import Provider as CKANAuthProvider +from datastore.core.exceptions import AuthorizationError +from datastore.infrastructure.cache import InMemoryCache + + +class FakeCKAN: + """Minimal stand-in for `CKANClient` — records the bound key + call args.""" + + def __init__(self, result: dict[str, Any] | None = None) -> None: + self._bound_key: str | None = None + self._result = result or { + "package": {"id": "pkg-1"}, + "resource": {"id": "res-1", "package_id": "pkg-1"}, + } + self.calls: list[dict[str, Any]] = [] + self.raise_on_authorize: Exception | None = None + + def bind(self, api_key: str | None) -> "FakeCKAN": + clone = FakeCKAN(self._result) + clone._bound_key = api_key + clone.calls = self.calls # share so test sees calls regardless of clone + clone.raise_on_authorize = self.raise_on_authorize + return clone + + async def datastore_authorize( + self, + *, + resource_id: str | None, + package_id: str | None, + permission: str | None = None, + ) -> dict[str, Any]: + self.calls.append( + { + "bound_key": self._bound_key, + "resource_id": resource_id, + "package_id": package_id, + "permission": permission, + } + ) + if self.raise_on_authorize is not None: + raise self.raise_on_authorize + return self._result + + +class ExplodingCache: + """CachePort stand-in — every op raises. 
Verifies fail-open behaviour.""" + + async def get(self, key: str) -> bytes | None: + raise RuntimeError("cache down") + + async def set(self, key: str, value: bytes, ttl: int) -> None: + raise RuntimeError("cache down") + + +def _provider( + ckan: FakeCKAN | None = None, + cache: InMemoryCache | ExplodingCache | None = None, + cache_ttl: int = 60, +) -> CKANAuthProvider: + return CKANAuthProvider( + ckan=ckan or FakeCKAN(), + cache=cache or InMemoryCache(), + cache_ttl=cache_ttl, + ) + + +# --- mapping + binding ------------------------------------------------------ + + +def test_authorize_binds_credential_and_maps_response_to_decision() -> None: + ckan = FakeCKAN() + provider = _provider(ckan=ckan) + + decision = asyncio.run(provider.authorize( + credential="token-xyz", + resource_id="res-1", + package_id=None, + permission="read", + )) + + assert ckan.calls == [ + { + "bound_key": "token-xyz", + "resource_id": "res-1", + "package_id": None, + "permission": "read", + } + ] + # `subject` carries a hash of the credential (raw key never leaves + # this provider). Same shape as `key_id`. + assert decision.subject == provider.key_id("token-xyz") + assert decision.resource == {"id": "res-1", "package_id": "pkg-1"} + assert decision.package == {"id": "pkg-1"} + assert decision.claims is None + + +def test_authorize_propagates_ckan_authorization_error() -> None: + ckan = FakeCKAN() + ckan.raise_on_authorize = AuthorizationError("denied") + provider = _provider(ckan=ckan) + + with pytest.raises(AuthorizationError, match="denied"): + asyncio.run(provider.authorize( + credential="t", resource_id="r", package_id=None, permission="read", + )) + + +def test_authorize_handles_missing_metadata_fields() -> None: + # CKAN's package-scoped flow returns no `resource` dict; mapping + # must tolerate that (Decision.resource just stays None). 
+ ckan = FakeCKAN(result={"package": {"id": "pkg-1"}}) + provider = _provider(ckan=ckan) + + decision = asyncio.run(provider.authorize( + credential="t", resource_id=None, package_id="pkg-1", permission="create", + )) + assert decision.package == {"id": "pkg-1"} + assert decision.resource is None + + +# --- cache hit / miss / errors ---------------------------------------------- + + +def test_cache_hit_skips_ckan_on_second_call() -> None: + ckan = FakeCKAN() + cache = InMemoryCache() + provider = _provider(ckan=ckan, cache=cache) + + asyncio.run(provider.authorize( + credential="tok", resource_id="res-1", package_id=None, permission="read", + )) + asyncio.run(provider.authorize( + credential="tok", resource_id="res-1", package_id=None, permission="read", + )) + + # CKAN called exactly once across both authorizations. + assert len(ckan.calls) == 1 + + +def test_cache_key_uses_anon_marker_when_no_credential() -> None: + ckan = FakeCKAN() + cache = InMemoryCache() + provider = _provider(ckan=ckan, cache=cache) + + asyncio.run(provider.authorize( + credential=None, resource_id="res-1", package_id=None, permission="read", + )) + + # Verify by hitting again with the same shape — second call must be cached. + asyncio.run(provider.authorize( + credential=None, resource_id="res-1", package_id=None, permission="read", + )) + assert len(ckan.calls) == 1 + + +def test_separate_credentials_get_separate_cache_entries() -> None: + ckan = FakeCKAN() + cache = InMemoryCache() + provider = _provider(ckan=ckan, cache=cache) + + asyncio.run(provider.authorize( + credential="user-a", resource_id="r", package_id=None, permission="read", + )) + asyncio.run(provider.authorize( + credential="user-b", resource_id="r", package_id=None, permission="read", + )) + + # Two distinct cache entries → two CKAN calls. 
+ assert len(ckan.calls) == 2 + + +def test_package_scoped_call_uses_pkg_cache_namespace() -> None: + ckan = FakeCKAN() + cache = InMemoryCache() + provider = _provider(ckan=ckan, cache=cache) + + # res-scoped and pkg-scoped calls share neither key nor cache entry. + asyncio.run(provider.authorize( + credential="tok", resource_id="x", package_id=None, permission="read", + )) + asyncio.run(provider.authorize( + credential="tok", resource_id=None, package_id="x", permission="create", + )) + assert len(ckan.calls) == 2 + + +def test_cache_failure_falls_through_to_ckan() -> None: + ckan = FakeCKAN() + provider = _provider(ckan=ckan, cache=ExplodingCache()) + + # Fail-open: a broken cache must not break the request. + decision = asyncio.run(provider.authorize( + credential="tok", resource_id="res-1", package_id=None, permission="read", + )) + + assert decision.resource == {"id": "res-1", "package_id": "pkg-1"} + assert len(ckan.calls) == 1 + + +def test_malformed_cache_entry_falls_through_to_ckan() -> None: + # A poisoned cache value (not a JSON dict) is treated as a miss. + # Blocking auth on a corrupt cache entry would be a self-inflicted + # outage; we log + re-query CKAN instead. + ckan = FakeCKAN() + cache = InMemoryCache() + provider = _provider(ckan=ckan, cache=cache) + cache_key = ( + f"auth:ckan:{provider.key_id('tok')}:res:res-1:read" + ) + asyncio.run(cache.set(cache_key, orjson.dumps("not-a-dict"), 60)) + + decision = asyncio.run(provider.authorize( + credential="tok", resource_id="res-1", package_id=None, permission="read", + )) + + # Fell back to CKAN and got the canned decision. + assert decision.resource == {"id": "res-1", "package_id": "pkg-1"} + assert len(ckan.calls) == 1 + + +def test_subject_in_cached_decision_is_hashed_not_raw_credential() -> None: + # Security: the raw credential must never end up in the cache. + # `Decision.subject` is what gets serialised — store the hash. 
+ ckan = FakeCKAN() + provider = _provider(ckan=ckan, cache=InMemoryCache()) + + decision = asyncio.run(provider.authorize( + credential="raw-api-key-do-not-leak", + resource_id="res-1", package_id=None, permission="read", + )) + + assert decision.subject is not None + assert "raw-api-key-do-not-leak" not in decision.subject + assert decision.subject.startswith("h:") + + +# --- key derivation + name -------------------------------------------------- + + +def test_key_id_hashes_credentials_regardless_of_jwt_shape() -> None: + # JWTs and opaque tokens both go through sha256 — never trust an + # unverified JWT claim for cache identity. + jwt_tok = jwt.encode({"sub": "u", "jti": "tok-42"}, "k", algorithm="HS256") + assert _provider().key_id(jwt_tok).startswith("h:") + assert _provider().key_id("opaque-api-key").startswith("h:") + + +def test_provider_name_is_ckan() -> None: + assert _provider().name == "ckan" diff --git a/tests/auth/jwt/__init__.py b/tests/auth/jwt/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/auth/jwt/test_provider.py b/tests/auth/jwt/test_provider.py new file mode 100644 index 0000000..b415556 --- /dev/null +++ b/tests/auth/jwt/test_provider.py @@ -0,0 +1,117 @@ +"""JWT provider — verifies signature + optional `aud` / `iss` claims. + +PyJWT does the heavy lifting; tests pin the provider's wrapping: + - what becomes `Decision.subject` / `Decision.claims`, + - which failure modes translate to `AuthorizationError`, + - HS* vs RS* key wiring at construction time. 
+""" + +from __future__ import annotations + +import asyncio +import datetime as dt +from typing import Any + +import jwt +import pytest +from datastore.auth.jwt import Provider as JWTAuthProvider +from datastore.core.config import Config +from datastore.core.exceptions import AuthorizationError + +SECRET = "topsecret" + + +def _hs256_config(**overrides: Any) -> Config: + base = { + "AUTH_TYPE": "jwt", + "JWT_ALGORITHM": "HS256", + "JWT_SECRET": SECRET, + "CKAN_URL": "", + } + base.update(overrides) + return Config(**base) + + +def _provider(**overrides: Any) -> JWTAuthProvider: + return JWTAuthProvider(config=_hs256_config(**overrides)) + + +def _authorize(provider: JWTAuthProvider, token: str | None): + return asyncio.run(provider.authorize( + credential=token, + resource_id="r", + package_id=None, + permission="read", + )) + + +def test_valid_token_returns_decision_with_subject_and_claims() -> None: + token = jwt.encode({"sub": "user-1", "role": "admin"}, SECRET, algorithm="HS256") + decision = _authorize(_provider(), token) + assert decision.subject == "user-1" + assert decision.claims == {"sub": "user-1", "role": "admin"} + + +def test_missing_credential_is_rejected_before_jwt_decode() -> None: + with pytest.raises(AuthorizationError, match="JWT token required"): + _authorize(_provider(), None) + + +def test_invalid_signature_raises_authorization_error() -> None: + token = jwt.encode({"sub": "u"}, "wrong-secret", algorithm="HS256") + with pytest.raises(AuthorizationError, match="invalid JWT"): + _authorize(_provider(), token) + + +def test_expired_token_raises_authorization_error() -> None: + past = dt.datetime.now(tz=dt.timezone.utc) - dt.timedelta(seconds=5) + token = jwt.encode({"sub": "u", "exp": past}, SECRET, algorithm="HS256") + with pytest.raises(AuthorizationError, match="invalid JWT"): + _authorize(_provider(), token) + + +def test_audience_mismatch_raises_authorization_error() -> None: + provider = _provider(JWT_AUDIENCE="expected-aud") + token = 
jwt.encode({"sub": "u", "aud": "other-aud"}, SECRET, algorithm="HS256") + with pytest.raises(AuthorizationError, match="invalid JWT"): + _authorize(provider, token) + + +def test_audience_match_passes() -> None: + provider = _provider(JWT_AUDIENCE="expected-aud") + token = jwt.encode( + {"sub": "u", "aud": "expected-aud"}, SECRET, algorithm="HS256" + ) + decision = _authorize(provider, token) + assert decision.subject == "u" + + +def test_issuer_mismatch_raises_authorization_error() -> None: + provider = _provider(JWT_ISSUER="expected-iss") + token = jwt.encode({"sub": "u", "iss": "other-iss"}, SECRET, algorithm="HS256") + with pytest.raises(AuthorizationError, match="invalid JWT"): + _authorize(provider, token) + + +def test_missing_sub_claim_yields_subject_none() -> None: + token = jwt.encode({"role": "guest"}, SECRET, algorithm="HS256") + decision = _authorize(_provider(), token) + assert decision.subject is None + assert decision.claims == {"role": "guest"} + + +def test_garbled_token_raises_authorization_error() -> None: + with pytest.raises(AuthorizationError, match="invalid JWT"): + _authorize(_provider(), "not.a.real.jwt") + + +def test_key_id_hashes_full_token() -> None: + # Cache identity is sha256-of-credential — unverified JWT claims + # (like `jti`) are never used for the cache key. 
+ provider = _provider() + token = jwt.encode({"sub": "u", "jti": "tok-1"}, SECRET, algorithm="HS256") + assert provider.key_id(token).startswith("h:") + + +def test_provider_name_is_jwt() -> None: + assert _provider().name == "jwt" diff --git a/tests/auth/test_base.py b/tests/auth/test_base.py new file mode 100644 index 0000000..36728de --- /dev/null +++ b/tests/auth/test_base.py @@ -0,0 +1,47 @@ +"""`Decision` shape + `default_key_id` always-sha256 behaviour.""" + +from __future__ import annotations + +import hashlib + +import jwt +import pytest +from datastore.auth.base import Decision, default_key_id + + +def test_decision_defaults_are_all_none() -> None: + d = Decision() + assert d.subject is None + assert d.claims is None + assert d.resource is None + assert d.package is None + + +def test_default_key_id_hashes_full_credential_even_for_jwt() -> None: + # Security: never derive cache identity from unverified JWT claims + # (a forged `jti` could collide with a verified user's cache entry). + # The full credential bytes always go through sha256. + token = jwt.encode({"sub": "u", "jti": "abc123"}, "k", algorithm="HS256") + expected = "h:" + hashlib.sha256(token.encode()).hexdigest()[:16] + assert default_key_id(token) == expected + + +def test_default_key_id_hashes_opaque_token() -> None: + token = "opaque-token-no-dots" + expected = "h:" + hashlib.sha256(token.encode()).hexdigest()[:16] + assert default_key_id(token) == expected + + +def test_two_different_jwts_with_same_jti_get_different_cache_keys() -> None: + # The whole point of dropping the jti optimisation: A and B both + # claim `jti=shared` but were signed differently. Their cache keys + # must NOT collide. 
+ a = jwt.encode({"sub": "a", "jti": "shared"}, "key-1", algorithm="HS256") + b = jwt.encode({"sub": "b", "jti": "shared"}, "key-2", algorithm="HS256") + assert default_key_id(a) != default_key_id(b) + + +@pytest.mark.parametrize("token", ["", "a", "a.b", "a.b.c", "a.b.c.d"]) +def test_default_key_id_is_sha256_for_any_input_shape(token: str) -> None: + expected = "h:" + hashlib.sha256(token.encode()).hexdigest()[:16] + assert default_key_id(token) == expected diff --git a/tests/auth/test_orchestration.py b/tests/auth/test_orchestration.py new file mode 100644 index 0000000..03676d9 --- /dev/null +++ b/tests/auth/test_orchestration.py @@ -0,0 +1,147 @@ +"""`api/auth.py` orchestration — validation + anonymous-read policy. + +Provider behaviour is tested per-provider in `tests/auth//`. +Caching is provider-specific (only CKAN caches) and lives in +`tests/auth/ckan/test_provider.py`. Here we pin only the cross-cutting +pieces that apply to every provider: + - permission whitelist + resource_id XOR package_id validation; + - anonymous-read policy (read with no api_key forwards to provider); + - non-read with no api_key hard-fails before any provider call; + - the dict shape returned to endpoints. 
+""" + +from __future__ import annotations + +import asyncio +from typing import Any + +import pytest +from datastore.api.auth import authorize +from datastore.auth.base import Decision +from datastore.core.exceptions import AuthorizationError, ValidationError + + +class FakeProvider: + """Records calls and returns a canned Decision (or raises).""" + + name = "fake" + + def __init__( + self, + decision: Decision | None = None, + raises: Exception | None = None, + ) -> None: + self._decision = decision or Decision( + resource={"id": "res-1"}, package={"id": "pkg-1"} + ) + self._raises = raises + self.calls: list[dict[str, Any]] = [] + + async def authorize(self, **kwargs: Any) -> Decision: + self.calls.append(kwargs) + if self._raises is not None: + raise self._raises + return self._decision + + def key_id(self, credential: str) -> str: + return f"k:{credential}" + + +# --- happy path ------------------------------------------------------------- + + +def test_provider_decision_is_returned_as_endpoint_data_dict_shape() -> None: + provider = FakeProvider() + result = asyncio.run(authorize( + api_key="tok", + provider=provider, + resource_id="res-1", + package_id=None, + permission="read", + )) + + assert result == {"resource": {"id": "res-1"}, "package": {"id": "pkg-1"}} + assert provider.calls == [ + { + "credential": "tok", + "resource_id": "res-1", + "package_id": None, + "permission": "read", + } + ] + + +def test_decision_without_metadata_yields_empty_dicts() -> None: + # Anonymous / JWT providers return Decision() with no resource/package; + # endpoint code reads from the dict so we must substitute empty dicts. 
+ result = asyncio.run(authorize( + api_key="tok", + provider=FakeProvider(decision=Decision()), + resource_id="res-1", + package_id=None, + permission="read", + )) + assert result == {"resource": {}, "package": {}} + + +# --- anonymous-read policy -------------------------------------------------- + + +def test_anonymous_caller_for_read_passes_through_to_provider() -> None: + provider = FakeProvider(decision=Decision()) + asyncio.run(authorize( + api_key=None, provider=provider, + resource_id="res-1", package_id=None, permission="read", + )) + assert provider.calls[0]["credential"] is None + + +@pytest.mark.parametrize("permission", ["create", "update", "delete", "patch"]) +def test_anonymous_caller_rejected_for_non_read_permissions(permission: str) -> None: + provider = FakeProvider() + with pytest.raises(AuthorizationError, match="authenticated user"): + asyncio.run(authorize( + api_key=None, provider=provider, + resource_id="res-1", package_id=None, permission=permission, # type: ignore[arg-type] + )) + # Provider never reached — policy short-circuits first. 
+ assert provider.calls == [] + + +# --- input validation ------------------------------------------------------- + + +def test_must_supply_exactly_one_of_resource_or_package_id() -> None: + provider = FakeProvider() + with pytest.raises(ValidationError, match="resource_id or package_id"): + asyncio.run(authorize( + api_key="tok", provider=provider, + resource_id="res-1", package_id="pkg-1", permission="read", + )) + with pytest.raises(ValidationError, match="resource_id or package_id"): + asyncio.run(authorize( + api_key="tok", provider=provider, + resource_id=None, package_id=None, permission="read", + )) + + +def test_invalid_permission_rejected_at_boundary() -> None: + provider = FakeProvider() + with pytest.raises(ValidationError, match="permission must be one of"): + asyncio.run(authorize( + api_key="tok", provider=provider, + resource_id="res-1", package_id=None, permission="execute", # type: ignore[arg-type] + )) + assert provider.calls == [] + + +# --- failure modes ---------------------------------------------------------- + + +def test_provider_authorization_error_propagates() -> None: + provider = FakeProvider(raises=AuthorizationError("nope")) + with pytest.raises(AuthorizationError, match="nope"): + asyncio.run(authorize( + api_key="tok", provider=provider, + resource_id="res-1", package_id=None, permission="read", + )) diff --git a/tests/auth/test_registry.py b/tests/auth/test_registry.py new file mode 100644 index 0000000..c73445e --- /dev/null +++ b/tests/auth/test_registry.py @@ -0,0 +1,72 @@ +"""Registry dispatch — `AUTH_TYPE` selects the right provider class. + +Verifies the importlib-based factory: each provider package's `Provider` +symbol is what gets returned, kwargs are forwarded, and constructor +errors propagate (so the lifespan can surface them at startup). 
+""" + +from __future__ import annotations + +from unittest.mock import MagicMock + +import pytest +from datastore.auth.anonymous import Provider as AnonymousProvider +from datastore.auth.ckan import Provider as CKANProvider +from datastore.auth.jwt import Provider as JWTProvider +from datastore.auth.registry import get_auth_provider +from datastore.core.config import Config +from datastore.infrastructure.cache import InMemoryCache + + +def test_anonymous_type_returns_anonymous_provider() -> None: + cfg = Config(AUTH_TYPE="anonymous", CKAN_URL="") + provider = get_auth_provider(cfg) + assert isinstance(provider, AnonymousProvider) + assert provider.name == "anonymous" + + +def test_ckan_type_returns_ckan_provider_and_forwards_kwargs() -> None: + cfg = Config(AUTH_TYPE="ckan", CKAN_URL="http://ckan.test") + ckan = MagicMock() + provider = get_auth_provider( + cfg, ckan=ckan, cache=InMemoryCache(), cache_ttl=60, + ) + assert isinstance(provider, CKANProvider) + assert provider.name == "ckan" + + +def test_jwt_type_returns_jwt_provider_with_hs_secret() -> None: + cfg = Config(AUTH_TYPE="jwt", JWT_SECRET="topsecret", CKAN_URL="") + provider = get_auth_provider(cfg) + assert isinstance(provider, JWTProvider) + assert provider.name == "jwt" + + +def test_each_call_returns_a_fresh_instance() -> None: + # The factory doesn't memoize — instance reuse is the caller's job + # (the lifespan builds once and stashes on app.state). + cfg = Config(AUTH_TYPE="anonymous", CKAN_URL="") + assert get_auth_provider(cfg) is not get_auth_provider(cfg) + + +def test_unknown_auth_type_rejected_at_config_validation() -> None: + # The Config validator checks against the directories on disk — + # nothing exotic; just verify the boundary fails fast. 
+ with pytest.raises(ValueError, match="AUTH_TYPE"): + Config(AUTH_TYPE="does_not_exist", CKAN_URL="") + + +def test_jwt_provider_raises_when_hs_secret_missing() -> None: + # Constructor errors must propagate so the lifespan surfaces them + # at startup rather than on the first request. + cfg = Config(AUTH_TYPE="jwt", JWT_ALGORITHM="HS256", JWT_SECRET="", CKAN_URL="") + with pytest.raises(ValueError, match="JWT_SECRET"): + get_auth_provider(cfg) + + +def test_jwt_provider_raises_when_rs_public_key_missing() -> None: + cfg = Config( + AUTH_TYPE="jwt", JWT_ALGORITHM="RS256", JWT_PUBLIC_KEY="", CKAN_URL="" + ) + with pytest.raises(ValueError, match="JWT_PUBLIC_KEY"): + get_auth_provider(cfg) diff --git a/tests/conftest.py b/tests/conftest.py index 6acac69..59d2f72 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -4,13 +4,44 @@ from typing import Any import pytest -from datastore.api.context import get_cache, get_ckan_client +from datastore.api.context import get_auth_provider, get_ckan_client +from datastore.auth.ckan import Provider as CKANAuthProvider from datastore.core.exceptions import AuthorizationError, NotFoundError from datastore.infrastructure.cache import InMemoryCache from datastore.main import create_app from fastapi.testclient import TestClient +@pytest.fixture(autouse=True) +def _isolate_bigquery_env(monkeypatch: pytest.MonkeyPatch) -> None: + """Force the BigQuery engine into placeholder mode for every test. + + The unit suite isn't allowed to contact real BigQuery — engine tests + mock `client.query` / `client.insert_rows_json` directly, and other + layers (write service, action endpoints) rely on the backend's + placeholder-echo branch (active when project/dataset are unset). + + A developer .env that points at a live BQ project would otherwise + flip the engine into real mode, talk to GCP, and either hang the + suite on network calls or fail tests that expect echo semantics. 
+ Clearing the four BQ envs (and resetting the engine cache so a + previously-built live instance doesn't survive between tests) keeps + the suite hermetic. + """ + from datastore.core.config import get_config + from datastore.infrastructure.engines.registry import reset_engine_cache + + for name in ( + "BIGQUERY_PROJECT", "BIGQUERY_DATASET", + "BIGQUERY_CREDENTIALS", "BIGQUERY_CREDENTIALS_RO", + ): + monkeypatch.setenv(name, "") + # `Config` and engine instances are lru-cached / module-level + # singletons; invalidate so the cleared env actually takes effect. + get_config.cache_clear() + reset_engine_cache() + + class FakeCKAN: """In-memory stand-in matching `CKANClient` shape (api_key bound on instance). @@ -46,7 +77,8 @@ async def datastore_authorize( permission: str | None = None, ) -> dict[str, Any]: self.authorize_calls += 1 - self._guard() + if self._api_key and self._api_key in self.deny_keys: + raise AuthorizationError(f"key '{self._api_key}' is not allowed") if resource_id is not None: existing = self.resources.get(resource_id) @@ -116,7 +148,11 @@ def cache() -> InMemoryCache: def client(fake_ckan: FakeCKAN, cache: InMemoryCache) -> Iterator[TestClient]: app = create_app() app.dependency_overrides[get_ckan_client] = lambda: fake_ckan - app.dependency_overrides[get_cache] = lambda: cache + # Auth provider talks to the same FakeCKAN — tests don't go through + # the real HTTP CKAN client. Mirrors what the lifespan would build. 
+ app.dependency_overrides[get_auth_provider] = lambda: CKANAuthProvider( + ckan=fake_ckan, cache=cache, cache_ttl=60, + ) with TestClient(app) as c: c.headers["Authorization"] = "test-token" yield c diff --git a/tests/engines/__init__.py b/tests/engines/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/engines/bigquery/__init__.py b/tests/engines/bigquery/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/engines/bigquery/test_metadata.py b/tests/engines/bigquery/test_metadata.py new file mode 100644 index 0000000..b93bc66 --- /dev/null +++ b/tests/engines/bigquery/test_metadata.py @@ -0,0 +1,235 @@ +"""Unit tests for `BigQueryMetadataStore`. + +The store talks to BigQuery via `client.query(sql, job_config=...)`. We +mock the client so tests can pin: + - what SQL the store issues (DDL on `initialize`, INSERT on `insert`, + UPDATE on `update`, SELECT on `get`, DELETE on `delete`); + - what query parameters travel alongside the SQL. + +No real BigQuery is contacted — these are pure unit tests over the +SQL the store generates. +""" + +from __future__ import annotations + +import json +from typing import Any +from unittest.mock import MagicMock + +import pytest +from datastore.infrastructure.engines.bigquery.metadata import ( + METADATA_TABLE_NAME, + BigQueryMetadataStore, +) + + +@pytest.fixture +def mock_client() -> MagicMock: + """A `bigquery.Client` stand-in that records `.query(...)` calls. + + `client.query(sql, job_config=...)` returns a job whose `.result()` + yields whatever the test arranges via `mock_client.set_rows([...])`. 
+ """ + client = MagicMock() + job = MagicMock() + job.result.return_value = [] + client.query.return_value = job + + def _set_rows(rows: list[dict[str, Any]]) -> None: + job.result.return_value = rows + + client.set_rows = _set_rows # type: ignore[attr-defined] + return client + + +@pytest.fixture +def store(mock_client: MagicMock) -> BigQueryMetadataStore: + return BigQueryMetadataStore( + client=mock_client, + project="proj-1", + dataset="ds-1", + ) + + +# --- initialize ------------------------------------------------------------ + + +def test_initialize_issues_create_table_if_not_exists( + store: BigQueryMetadataStore, mock_client: MagicMock +) -> None: + store.initialize() + + assert mock_client.query.call_count == 1 + sql = mock_client.query.call_args[0][0] + assert "CREATE TABLE IF NOT EXISTS" in sql + assert "`proj-1.ds-1._table_metadata`" in sql + # Schema columns are declared. + for col in ( + "resource_id STRING", + "schema JSON", + "created_at TIMESTAMP", + "updated_at TIMESTAMP", + ): + assert col in sql + + +def test_initialize_is_idempotent( + store: BigQueryMetadataStore, mock_client: MagicMock +) -> None: + """Two `initialize()` calls — both safe because the DDL uses + `IF NOT EXISTS`.""" + store.initialize() + store.initialize() + + assert mock_client.query.call_count == 2 + + +# --- insert ---------------------------------------------------------------- + + +def test_insert_issues_parameterised_insert( + store: BigQueryMetadataStore, mock_client: MagicMock +) -> None: + schema = { + "fields": [{"name": "a", "type": "integer"}], + "primaryKey": ["a"], + } + store.insert("res-1", schema) + + assert mock_client.query.call_count == 1 + sql, kwargs = mock_client.query.call_args + sql_text = sql[0] + assert "INSERT INTO" in sql_text + assert "MERGE" not in sql_text # no upsert semantics + assert "PARSE_JSON(@schema)" in sql_text + assert "CURRENT_TIMESTAMP()" in sql_text + + params = {p.name: p.value for p in kwargs["job_config"].query_parameters} + 
assert params["resource_id"] == "res-1" + assert json.loads(params["schema"]) == schema + + +# --- update ---------------------------------------------------------------- + + +def test_update_issues_parameterised_update( + store: BigQueryMetadataStore, mock_client: MagicMock +) -> None: + schema = {"fields": [{"name": "b", "type": "string"}]} + store.update("res-1", schema) + + assert mock_client.query.call_count == 1 + sql, kwargs = mock_client.query.call_args + sql_text = sql[0] + assert "UPDATE" in sql_text + assert "SET schema = PARSE_JSON(@schema)" in sql_text + assert "WHERE resource_id = @resource_id" in sql_text + # `created_at` must NOT be reassigned by update. + assert "created_at" not in sql_text + + params = {p.name: p.value for p in kwargs["job_config"].query_parameters} + assert params["resource_id"] == "res-1" + assert json.loads(params["schema"]) == schema + + +# --- get ------------------------------------------------------------------- + + +def test_get_returns_parsed_schema( + store: BigQueryMetadataStore, mock_client: MagicMock +) -> None: + schema = {"fields": [{"name": "a", "type": "integer"}]} + mock_client.set_rows([{"schema_json": json.dumps(schema)}]) + + out = store.get("res-1") + + assert out == schema + + +def test_get_returns_none_when_no_row( + store: BigQueryMetadataStore, mock_client: MagicMock +) -> None: + mock_client.set_rows([]) + + assert store.get("does-not-exist") is None + + +# --- delete ---------------------------------------------------------------- + + +def test_delete_issues_parameterised_delete( + store: BigQueryMetadataStore, mock_client: MagicMock +) -> None: + store.delete("res-1") + + assert mock_client.query.call_count == 1 + sql, kwargs = mock_client.query.call_args + assert "DELETE FROM" in sql[0] + assert "WHERE resource_id = @resource_id" in sql[0] + params = {p.name: p.value for p in kwargs["job_config"].query_parameters} + assert params["resource_id"] == "res-1" + + +# --- table reference 
------------------------------------------------------- + +def test_table_ref_format(store: BigQueryMetadataStore) -> None: + assert store.table_ref == "`proj-1.ds-1._table_metadata`" + assert store.table_name == METADATA_TABLE_NAME + + +# --- error wrapping -------------------------------------------------------- + + +def test_insert_wraps_bigquery_errors_as_server_error( + store: BigQueryMetadataStore, mock_client: MagicMock +) -> None: + """Raw BigQuery exceptions are wrapped as `ServerError` carrying + the operation name + resource_id.""" + from datastore.core.exceptions import ServerError + + mock_client.query.return_value.result.side_effect = RuntimeError( + "quota exceeded" + ) + + with pytest.raises(ServerError) as exc: + store.insert("res-1", {"fields": [{"name": "a", "type": "integer"}]}) + + msg = str(exc.value) + assert "metadata INSERT" in msg + assert "'res-1'" in msg + assert "quota exceeded" in msg + + +def test_update_wraps_bigquery_errors_as_server_error( + store: BigQueryMetadataStore, mock_client: MagicMock +) -> None: + from datastore.core.exceptions import ServerError + + mock_client.query.return_value.result.side_effect = RuntimeError( + "bigquery is sad" + ) + + with pytest.raises(ServerError) as exc: + store.update("res-1", {"fields": [{"name": "a", "type": "integer"}]}) + + assert "metadata UPDATE" in str(exc.value) + assert "'res-1'" in str(exc.value) + + +def test_initialize_wraps_bigquery_errors_as_server_error( + store: BigQueryMetadataStore, mock_client: MagicMock +) -> None: + """Init has no resource_id; the error message uses `` so the + target is still labelled.""" + from datastore.core.exceptions import ServerError + + mock_client.query.return_value.result.side_effect = RuntimeError( + "permission denied" + ) + + with pytest.raises(ServerError) as exc: + store.initialize() + + msg = str(exc.value) + assert "metadata CREATE TABLE" in msg + assert "" in msg diff --git a/tests/engines/bigquery/test_tables.py 
b/tests/engines/bigquery/test_tables.py new file mode 100644 index 0000000..06db34e --- /dev/null +++ b/tests/engines/bigquery/test_tables.py @@ -0,0 +1,1814 @@ +"""Unit tests for the BigQuery write paths — DDL, records insert, +MERGE/UPDATE, and the `upsert` action dispatch. + +A mocked `bigquery.Client` is plugged into a backend whose +`initialize()` we skip — no real BigQuery is contacted. The tests +pin SQL shape, parameter binding, error wrapping, and the +metadata/DDL atomicity contract. +""" + +from __future__ import annotations + +import json +from typing import Any +from unittest.mock import MagicMock + +import pytest +from datastore.core.exceptions import ( + ConflictError, + NotFoundError, + ServerError, + ValidationError, +) +from datastore.infrastructure.engines.bigquery.backend import BigQueryBackend +from datastore.infrastructure.engines.bigquery.lib import ( + merge_sql, + update_sql, +) +from datastore.infrastructure.engines.bigquery.types import ( + bigquery_type, + can_widen, +) + +# --- fixtures -------------------------------------------------------------- + + +@pytest.fixture +def mock_client() -> MagicMock: + client = MagicMock() + # Default: every `client.query(...).result()` yields no rows. + client.query.return_value.result.return_value = [] + return client + + +def _backend(client: MagicMock) -> BigQueryBackend: + """Backend wired with a mocked client + config; skips `initialize()`.""" + b = BigQueryBackend(mode="rw") + b.client = client + b.config = MagicMock() + b.config.BIGQUERY_PROJECT = "proj-1" + b.config.BIGQUERY_DATASET = "ds-1" + # Default cache flag — real bool so `QueryJobConfig.use_query_cache` + # is True, not a MagicMock, in tests that inspect the job config. 
+ b.config.BIGQUERY_USE_QUERY_CACHE = True + return b + + +# --- types.py -------------------------------------------------------------- + + +def test_bigquery_type_resolves_canonical_and_falls_back_to_string() -> None: + assert bigquery_type("integer") == "INT64" + assert bigquery_type("datetime") == "TIMESTAMP" + assert bigquery_type("object") == "JSON" + assert bigquery_type("unknown-type") == "STRING" + assert bigquery_type(None) == "STRING" + + +def test_can_widen_allows_supported_and_rejects_others() -> None: + assert can_widen("INT64", "INT64") is True # identity + assert can_widen("INT64", "FLOAT64") is True # supported widening + assert can_widen("DATE", "TIMESTAMP") is True + assert can_widen("INT64", "STRING") is False + assert can_widen("FLOAT64", "INT64") is False # narrowing + + +# --- DDL helpers ----------------------------------------------------------- + + +def test_data_table_ref_uses_backticks(mock_client: MagicMock) -> None: + """Backticks let CKAN UUID-like ids parse without further escaping.""" + assert _backend(mock_client)._data_table_ref("res-abc-123") == ( + "`proj-1.ds-1.res-abc-123`" + ) + + +def test_create_data_table_emits_create_table_if_not_exists( + mock_client: MagicMock, +) -> None: + backend = _backend(mock_client) + backend._create_data_table( + "res-1", + {"fields": [ + {"name": "id", "type": "integer"}, + {"name": "label", "type": "string"}, + ]}, + ) + sql = mock_client.query.call_args[0][0] + assert "CREATE TABLE IF NOT EXISTS `proj-1.ds-1.res-1`" in sql + # User columns. + assert "`id` INT64" in sql + assert "`label` STRING" in sql + # System columns auto-prepended. 
+ assert "`_id` INT64" in sql + assert "`_updated_at` TIMESTAMP" in sql + + +def test_alter_adds_new_columns_and_widens_supported_types( + mock_client: MagicMock, +) -> None: + backend = _backend(mock_client) + old = {"fields": [{"name": "a", "type": "integer"}]} + new = {"fields": [ + {"name": "a", "type": "number"}, # widen INT64 → FLOAT64 + {"name": "b", "type": "string"}, # add + ]} + + backend._alter_data_table("res-1", old, new) + + sql = mock_client.query.call_args[0][0] + assert "ALTER TABLE `proj-1.ds-1.res-1`" in sql + assert "ADD COLUMN IF NOT EXISTS `b` STRING" in sql + assert "ALTER COLUMN `a` SET DATA TYPE FLOAT64" in sql + + +def test_alter_rejects_unsupported_type_change_before_any_ddl( + mock_client: MagicMock, +) -> None: + """`integer` → `string` isn't a BigQuery-allowed widening — raise + ConflictError up front, never issue partial DDL.""" + backend = _backend(mock_client) + with pytest.raises(ConflictError, match="Cannot change column type"): + backend._alter_data_table( + "res-1", + {"fields": [{"name": "a", "type": "integer"}]}, + {"fields": [{"name": "a", "type": "string"}]}, + ) + mock_client.query.assert_not_called() + + +# --- records insert -------------------------------------------------------- + + +def test_insert_records_issues_dml_insert_with_rows_param( + mock_client: MagicMock, +) -> None: + """`_insert_records` runs a DML `INSERT INTO ... SELECT FROM + UNNEST(@rows)` — not the streaming `insert_rows_json` API — so + rows go straight to storage and subsequent MERGE/UPDATE can touch + them immediately.""" + backend = _backend(mock_client) + schema = {"fields": [ + {"name": "auction_id", "type": "integer"}, + {"name": "bidder_metadata", "type": "object"}, + ]} + records = [ + {"auction_id": 144, "bidder_metadata": {"unit_id": "X"}}, + {"auction_id": 145, "bidder_metadata": {"unit_id": "Y"}}, + ] + + backend._insert_records("res-1", schema, records) + + # No streaming insert. 
+ mock_client.insert_rows_json.assert_not_called() + # Single DML statement — `MAX(_id)` is inlined as a scalar + # subquery, so we don't pay a separate round-trip for the probe. + assert mock_client.query.call_count == 1 + sql_arg, kwargs = mock_client.query.call_args + sql = sql_arg[0] + assert sql.startswith("INSERT INTO `proj-1.ds-1.res-1` ") + assert "FROM UNNEST(JSON_QUERY_ARRAY(@rows)) AS r" in sql + # JSON columns extracted via PARSE_JSON inside SQL. + assert "PARSE_JSON(JSON_QUERY(r, '$.bidder_metadata'))" in sql + # System columns auto-injected — `_id` from the inlined MAX subquery + # + ROW_NUMBER(), `_updated_at` from CURRENT_TIMESTAMP(). + assert "`_id`, `_updated_at`" in sql + assert ( + "(SELECT IFNULL(MAX(`_id`), 0) FROM `proj-1.ds-1.res-1`) " + "+ ROW_NUMBER() OVER ()" + ) in sql + assert "CURRENT_TIMESTAMP()" in sql + # Only `@rows` is passed as a parameter now — no separate probe. + params = {p.name: p.value for p in kwargs["job_config"].query_parameters} + assert list(params.keys()) == ["rows"] + assert json.loads(params["rows"]) == records + + +# --- error wrapping -------------------------------------------------------- + + +def test_client_query_errors_surface_as_server_error_with_context( + mock_client: MagicMock, +) -> None: + """Raw BQ exceptions on `client.query` are wrapped as ServerError + carrying op + resource_id — never leak as `RuntimeError`.""" + mock_client.query.return_value.result.side_effect = RuntimeError( + "Insufficient permissions" + ) + backend = _backend(mock_client) + with pytest.raises(ServerError) as exc: + backend._create_data_table( + "res-1", {"fields": [{"name": "a", "type": "integer"}]} + ) + assert "CREATE TABLE" in str(exc.value) + assert "'res-1'" in str(exc.value) + + +# --- create() orchestration ----------------------------------------------- + + +def test_create_new_resource_runs_ddl_records_then_metadata_in_order( + mock_client: MagicMock, +) -> None: + """New resource path: CREATE TABLE → INSERT INTO → 
metadata.insert. + Two BigQuery round-trips (`MAX(_id)` is inlined into the INSERT + statement). Metadata is the last write so any failure earlier + leaves it untouched.""" + backend = _backend(mock_client) + backend.metadata = MagicMock() + backend.metadata.get.return_value = None + + parent = MagicMock() + parent.attach_mock(mock_client.query, "query") + parent.attach_mock(backend.metadata.insert, "metadata_insert") + + backend.create( + "res-1", + schema={"fields": [{"name": "a", "type": "integer"}]}, + records=[{"a": 1}], + include_total=False, + ) + + sql_calls = [c for c in parent.mock_calls if c[0] == "query"] + assert len(sql_calls) == 2 # no separate MAX(_id) probe + assert sql_calls[0].args[0].startswith("CREATE TABLE IF NOT EXISTS") + assert sql_calls[1].args[0].startswith("INSERT INTO ") + # The inlined MAX(_id) subquery rides inside the INSERT itself. + assert "SELECT IFNULL(MAX(`_id`), 0)" in sql_calls[1].args[0] + # metadata.insert came last. + assert parent.mock_calls[-1][0] == "metadata_insert" + + +def test_create_skips_metadata_when_records_insert_fails( + mock_client: MagicMock, +) -> None: + """Atomicity: a DML INSERT failure leaves metadata untouched. + + Create flow: CREATE TABLE → INSERT. Fail the second query (the + INSERT); the first (CREATE TABLE) succeeds. 
+ """ + backend = _backend(mock_client) + backend.metadata = MagicMock() + backend.metadata.get.return_value = None + + success_job = MagicMock() + success_job.result.return_value = [] + fail_job = MagicMock() + fail_job.result.side_effect = RuntimeError("insert failed") + mock_client.query.side_effect = [success_job, fail_job] + + with pytest.raises(ServerError): + backend.create( + "res-1", + schema={"fields": [{"name": "a", "type": "integer"}]}, + records=[{"a": 1}], + include_total=False, + ) + backend.metadata.insert.assert_not_called() + backend.metadata.update.assert_not_called() + + +def test_create_placeholder_mode_skips_everything( + mock_client: MagicMock, +) -> None: + """No metadata store → no DDL, no metadata calls. Lets the unit + suite run without GCP creds.""" + backend = _backend(mock_client) + backend.metadata = None + + backend.create( + "res-1", + schema={"fields": [{"name": "a", "type": "integer"}]}, + records=[{"a": 1}], + include_total=False, + ) + mock_client.query.assert_not_called() + + +# --- merge_sql / update_sql (lib) ------------------------------------------ + + +def test_merge_sql_renders_typed_extractors_on_match_update_no_match_insert() -> None: + sql = merge_sql( + "`p.d.r`", + { + "fields": [ + {"name": "id", "type": "integer"}, + {"name": "label", "type": "string"}, + {"name": "meta", "type": "object"}, + ], + "primaryKey": ["id"], + }, + ) + assert sql.startswith("MERGE `p.d.r` T") + assert "CAST(JSON_VALUE(r, '$.id') AS INT64) AS `id`" in sql + assert "JSON_VALUE(r, '$.label') AS `label`" in sql + assert "PARSE_JSON(JSON_QUERY(r, '$.meta')) AS `meta`" in sql + # USING attaches ROW_NUMBER() as _rn for auto-`_id` on NOT MATCHED. + assert "ROW_NUMBER() OVER () AS _rn" in sql + assert "ON T.`id` = S.`id`" in sql + # WHEN MATCHED only fires when some non-PK column actually differs + # — `_updated_at` advances on real changes, not on no-op upserts. 
+ # Diff predicate uses IS DISTINCT FROM (NULL-safe) for scalars and + # TO_JSON_STRING(...) wrap for JSON columns. + assert ( + "WHEN MATCHED AND (" + "T.`label` IS DISTINCT FROM S.`label` OR " + "TO_JSON_STRING(T.`meta`) IS DISTINCT FROM TO_JSON_STRING(S.`meta`)" + ")" + ) in sql + assert "T.`label` = S.`label`" in sql + assert "T.`_updated_at` = CURRENT_TIMESTAMP()" in sql + # NOT MATCHED inserts system columns + user columns. `_id` is + # `(SELECT MAX(_id) FROM tbl) + S._rn` — inlined to avoid a + # separate probe round-trip. + assert ( + "WHEN NOT MATCHED THEN INSERT (`_id`, `_updated_at`, " + "`id`, `label`, `meta`)" + ) in sql + assert ( + "(SELECT IFNULL(MAX(`_id`), 0) FROM `p.d.r`) + S._rn" + ) in sql + + +def test_update_sql_renders_dml_update_keyed_on_primary_key() -> None: + sql = update_sql( + "`p.d.r`", + { + "fields": [ + {"name": "id", "type": "integer"}, + {"name": "label", "type": "string"}, + ], + "primaryKey": ["id"], + }, + ) + assert sql.startswith("UPDATE `p.d.r` T ") + assert "T.`label` = S.`label`" in sql + # `_updated_at` is always bumped, even when there are non-PK fields. 
+ assert "T.`_updated_at` = CURRENT_TIMESTAMP()" in sql + assert "WHERE T.`id` = S.`id`" in sql + assert "MERGE" not in sql # plain DML, not MERGE + + +def test_merge_and_update_sql_reject_missing_primary_key() -> None: + schema = {"fields": [{"name": "id", "type": "integer"}]} + with pytest.raises(ValueError, match="primaryKey"): + merge_sql("`p.d.r`", schema) + with pytest.raises(ValueError, match="primaryKey"): + update_sql("`p.d.r`", schema) + + +# --- upsert() dispatch ---------------------------------------------------- + + +def _backend_with_schema( + mock_client: MagicMock, schema: dict[str, Any] +) -> BigQueryBackend: + backend = _backend(mock_client) + backend.metadata = MagicMock() + backend.metadata.get.return_value = schema + return backend + + +def test_upsert_method_upsert_issues_merge_with_rows_param( + mock_client: MagicMock, +) -> None: + backend = _backend_with_schema( + mock_client, + { + "fields": [ + {"name": "id", "type": "integer"}, + {"name": "label", "type": "string"}, + ], + "primaryKey": ["id"], + }, + ) + records = [{"id": 1, "label": "x"}, {"id": 2, "label": "y"}] + + backend.upsert("res-1", records, method="upsert", include_total=False) + + sql, kwargs = mock_client.query.call_args + assert "MERGE `proj-1.ds-1.res-1` T" in sql[0] + params = {p.name: p.value for p in kwargs["job_config"].query_parameters} + assert json.loads(params["rows"]) == records + + +def test_upsert_method_insert_issues_dml_insert( + mock_client: MagicMock, +) -> None: + """`method='insert'` runs DML `INSERT INTO ... 
SELECT FROM UNNEST`, + not the streaming insert API — same path as `_insert_records` on + the create flow.""" + backend = _backend_with_schema( + mock_client, + {"fields": [{"name": "id", "type": "integer"}], "primaryKey": ["id"]}, + ) + + backend.upsert("res-1", [{"id": 1}], method="insert", include_total=False) + + mock_client.insert_rows_json.assert_not_called() + sql = mock_client.query.call_args[0][0] + assert sql.startswith("INSERT INTO `proj-1.ds-1.res-1` ") + assert "FROM UNNEST(JSON_QUERY_ARRAY(@rows)) AS r" in sql + + +def test_upsert_method_update_issues_dml_update( + mock_client: MagicMock, +) -> None: + backend = _backend_with_schema( + mock_client, + { + "fields": [ + {"name": "id", "type": "integer"}, + {"name": "label", "type": "string"}, + ], + "primaryKey": ["id"], + }, + ) + mock_client.query.return_value.num_dml_affected_rows = 2 + + backend.upsert( + "res-1", + [{"id": 1, "label": "x"}, {"id": 2, "label": "y"}], + method="update", + include_total=False, + ) + + sql = mock_client.query.call_args[0][0] + assert sql.startswith("UPDATE `proj-1.ds-1.res-1` T ") + assert "WHERE T.`id` = S.`id`" in sql + + +def test_upsert_method_update_raises_not_found_when_pk_missing( + mock_client: MagicMock, +) -> None: + """Affected-row count < input row count → some PKs didn't match. + DML UPDATE silently no-ops on misses; we surface NotFoundError.""" + backend = _backend_with_schema( + mock_client, + { + "fields": [ + {"name": "id", "type": "integer"}, + {"name": "label", "type": "string"}, + ], + "primaryKey": ["id"], + }, + ) + mock_client.query.return_value.num_dml_affected_rows = 1 # 2 missing + + with pytest.raises(NotFoundError, match="2 of 3"): + backend.upsert( + "res-1", + [ + {"id": 1, "label": "x"}, + {"id": 2, "label": "y"}, + {"id": 3, "label": "z"}, + ], + method="update", + include_total=False, + ) + + +def test_upsert_undeclared_resource_raises_not_found( + mock_client: MagicMock, +) -> None: + """`upsert` before `create` → NotFoundError. 
Metadata store is the + source of truth for whether a resource exists.""" + backend = _backend(mock_client) + backend.metadata = MagicMock() + backend.metadata.get.return_value = None + + with pytest.raises(NotFoundError, match="not declared"): + backend.upsert( + "ghost", [{"a": 1}], method="upsert", include_total=False + ) + + +def test_upsert_missing_primary_key_raises_validation( + mock_client: MagicMock, +) -> None: + """`upsert`/`update` need a primaryKey — ValueError from the SQL + helpers is re-raised as ValidationError, never reaches BigQuery.""" + backend = _backend_with_schema( + mock_client, + {"fields": [{"name": "id", "type": "integer"}]}, # no primaryKey + ) + with pytest.raises(ValidationError, match="primaryKey"): + backend.upsert( + "res-1", [{"id": 1}], method="upsert", include_total=False + ) + mock_client.query.assert_not_called() + + +def test_upsert_unknown_method_raises_validation( + mock_client: MagicMock, +) -> None: + backend = _backend_with_schema( + mock_client, + {"fields": [{"name": "id", "type": "integer"}], "primaryKey": ["id"]}, + ) + with pytest.raises(ValidationError, match="unknown upsert method"): + backend.upsert( + "res-1", [], method="merge", include_total=False # bogus + ) + + +def test_upsert_translates_bigquery_scalar_subquery_error_to_duplicate_pk( + mock_client: MagicMock, +) -> None: + """When `records` contain duplicate PK tuples, BigQuery's MERGE + fails with 'Scalar subquery produced more than one element'. 
The + backend translates that into a clear ValidationError naming the + actual cause.""" + mock_client.query.return_value.result.side_effect = RuntimeError( + "400 Scalar subquery produced more than one element; reason: " + "invalidQuery, location: query" + ) + backend = _backend_with_schema( + mock_client, + { + "fields": [ + {"name": "id", "type": "integer"}, + {"name": "label", "type": "string"}, + ], + "primaryKey": ["id"], + }, + ) + + with pytest.raises(ValidationError) as exc: + backend.upsert( + "res-1", + [{"id": 1, "label": "x"}, {"id": 1, "label": "y"}], # dup PK + method="upsert", + include_total=False, + ) + + msg = str(exc.value) + assert "duplicated" in msg.lower() + assert "primary key" in msg.lower() + + +def test_update_translates_bigquery_scalar_subquery_error_to_duplicate_pk( + mock_client: MagicMock, +) -> None: + """Same translation on the DML UPDATE path.""" + mock_client.query.return_value.result.side_effect = RuntimeError( + "400 Scalar subquery produced more than one element" + ) + backend = _backend_with_schema( + mock_client, + { + "fields": [ + {"name": "id", "type": "integer"}, + {"name": "label", "type": "string"}, + ], + "primaryKey": ["id"], + }, + ) + + with pytest.raises(ValidationError, match="duplicate"): + backend.upsert( + "res-1", + [{"id": 1, "label": "x"}, {"id": 1, "label": "y"}], + method="update", + include_total=False, + ) + + +def test_insert_translates_bigquery_bad_double_value_to_type_mismatch( + mock_client: MagicMock, +) -> None: + """BigQuery's `Bad double value: …` (raised when a record sends + a non-numeric string for a `number` column) is translated to a + clear ValidationError naming the bad value and the expected type. + + The create-flow runs CREATE TABLE → INSERT INTO — only the INSERT + (2nd call) should fail. The first (CREATE TABLE) succeeds. 
+ """ + success_job = MagicMock() + success_job.result.return_value = [] + fail_job = MagicMock() + fail_job.result.side_effect = RuntimeError( + "400 Bad double value: jk; reason: invalidQuery, location: query" + ) + mock_client.query.side_effect = [success_job, fail_job] + backend = _backend(mock_client) + backend.metadata = MagicMock() + backend.metadata.get.return_value = None + + with pytest.raises(ValidationError) as exc: + backend.create( + "res-1", + schema={"fields": [{"name": "price", "type": "number"}]}, + records=[{"price": "jk"}], + include_total=False, + ) + msg = str(exc.value) + assert "'jk'" in msg + assert "number" in msg + + +def test_upsert_translates_bigquery_bad_int64_value_to_type_mismatch( + mock_client: MagicMock, +) -> None: + """`Bad int64 value: …` on the MERGE path becomes a ValidationError + that says 'integer'.""" + mock_client.query.return_value.result.side_effect = RuntimeError( + "400 Bad int64 value: not-a-number; reason: invalidQuery" + ) + backend = _backend_with_schema( + mock_client, + {"fields": [{"name": "id", "type": "integer"}], "primaryKey": ["id"]}, + ) + + with pytest.raises(ValidationError) as exc: + backend.upsert( + "res-1", [{"id": "not-a-number"}], method="upsert", + include_total=False, + ) + assert "'not-a-number'" in str(exc.value) + assert "integer" in str(exc.value) + + +def test_translate_invalid_timestamp_value(mock_client: MagicMock) -> None: + """`Invalid timestamp: …` → ValidationError mentioning 'timestamp'.""" + mock_client.query.return_value.result.side_effect = RuntimeError( + "400 Invalid timestamp: 2025-99-99; reason: invalidQuery" + ) + backend = _backend_with_schema( + mock_client, + { + "fields": [ + {"name": "id", "type": "integer"}, + {"name": "ts", "type": "datetime"}, + ], + "primaryKey": ["id"], + }, + ) + + with pytest.raises(ValidationError) as exc: + backend.upsert( + "res-1", [{"id": 1, "ts": "2025-99-99"}], method="upsert", + include_total=False, + ) + assert "'2025-99-99'" in 
str(exc.value) + assert "timestamp" in str(exc.value) + + +def test_translate_could_not_cast_literal_error(mock_client: MagicMock) -> None: + """`Could not cast literal '...' to type ...` — alternative + BigQuery phrasing for the same coercion failure as `Bad ... + value`. Should produce the same friendly message.""" + mock_client.query.return_value.result.side_effect = RuntimeError( + "400 Could not cast literal 'jk' to type INT64; reason: invalidQuery" + ) + backend = _backend_with_schema( + mock_client, + {"fields": [{"name": "id", "type": "integer"}], "primaryKey": ["id"]}, + ) + with pytest.raises(ValidationError) as exc: + backend.upsert( + "res-1", [{"id": "jk"}], method="upsert", include_total=False, + ) + msg = str(exc.value) + assert "'jk'" in msg + assert "integer" in msg + + +def test_translate_could_not_parse_as_type_error( + mock_client: MagicMock, +) -> None: + mock_client.query.return_value.result.side_effect = RuntimeError( + "400 Could not parse 'abc' as FLOAT64; reason: invalidQuery" + ) + backend = _backend_with_schema( + mock_client, + { + "fields": [ + {"name": "id", "type": "integer"}, + {"name": "price", "type": "number"}, + ], + "primaryKey": ["id"], + }, + ) + with pytest.raises(ValidationError) as exc: + backend.upsert( + "res-1", [{"id": 1, "price": "abc"}], + method="upsert", include_total=False, + ) + msg = str(exc.value) + assert "'abc'" in msg + assert "number" in msg + + +def test_translate_value_out_of_range(mock_client: MagicMock) -> None: + """Numeric value that parses but exceeds the column type's range + → ValidationError mentioning out-of-range.""" + mock_client.query.return_value.result.side_effect = RuntimeError( + "400 Value out of range for INT64: 99999999999999999999; " + "reason: invalidQuery" + ) + backend = _backend_with_schema( + mock_client, + {"fields": [{"name": "id", "type": "integer"}], "primaryKey": ["id"]}, + ) + with pytest.raises(ValidationError) as exc: + backend.upsert( + # Use string to avoid orjson's 64-bit int 
limit — the test + # checks the BigQuery-side error, not orjson encoding. + "res-1", [{"id": "99999999999999999999"}], + method="upsert", include_total=False, + ) + msg = str(exc.value) + assert "out of range" in msg + assert "integer" in msg + + +def test_translate_bad_numeric_value(mock_client: MagicMock) -> None: + """NUMERIC / BIGNUMERIC (e.g., after widening INT64 → NUMERIC) get + the same `number` friendly name.""" + mock_client.query.return_value.result.side_effect = RuntimeError( + "400 Bad NUMERIC value: not-a-num; reason: invalidQuery" + ) + backend = _backend_with_schema( + mock_client, + { + "fields": [ + {"name": "id", "type": "integer"}, + {"name": "amount", "type": "number"}, + ], + "primaryKey": ["id"], + }, + ) + with pytest.raises(ValidationError) as exc: + backend.upsert( + "res-1", [{"id": 1, "amount": "not-a-num"}], + method="upsert", include_total=False, + ) + assert "'not-a-num'" in str(exc.value) + assert "number" in str(exc.value) + + +# --- _updated_at toggle --------------------------------------------------- + + +def test_sql_helpers_omit_updated_at_when_flag_disabled() -> None: + """`Config.INCLUDE_UPDATED_AT=False` drops `_updated_at` from every + write path: CREATE TABLE, INSERT, MERGE (both branches), UPDATE.""" + schema = { + "fields": [ + {"name": "id", "type": "integer"}, + {"name": "label", "type": "string"}, + ], + "primaryKey": ["id"], + } + from datastore.infrastructure.engines.bigquery.lib import ( + column_defs, + insert_sql, + merge_sql, + update_sql, + ) + + # CREATE TABLE + cols = column_defs(schema, include_updated_at=False) + assert "`_id` INT64" in cols + assert not any("_updated_at" in c for c in cols) + + # INSERT + ins = insert_sql("`p.d.r`", schema, include_updated_at=False) + assert "`_id`, `id`, `label`" in ins # no _updated_at in col list + assert "CURRENT_TIMESTAMP()" not in ins + assert "_updated_at" not in ins + + # MERGE — neither MATCHED nor NOT MATCHED touches `_updated_at`. 
+ mer = merge_sql("`p.d.r`", schema, include_updated_at=False) + assert "_updated_at" not in mer + assert "CURRENT_TIMESTAMP()" not in mer + # The MATCHED branch still fires on real diffs, just without the + # timestamp bump. + assert "WHEN MATCHED AND (T.`label` IS DISTINCT FROM S.`label`)" in mer + + # UPDATE — SET only carries the user column edit. + upd = update_sql("`p.d.r`", schema, include_updated_at=False) + assert "_updated_at" not in upd + assert "SET T.`label` = S.`label`" in upd + + +def test_update_sql_rejects_all_pk_schema_when_timestamp_disabled() -> None: + """With `_updated_at` disabled, an all-PK schema has nothing to + SET — raise so the backend can surface a clear ValidationError.""" + from datastore.infrastructure.engines.bigquery.lib import update_sql + + schema = { + "fields": [ + {"name": "a", "type": "integer"}, + {"name": "b", "type": "string"}, + ], + "primaryKey": ["a", "b"], + } + with pytest.raises(ValueError, match="nothing to SET"): + update_sql("`p.d.r`", schema, include_updated_at=False) + + +def test_backend_propagates_config_flag_into_ddl( + mock_client: MagicMock, +) -> None: + """`BigQueryBackend._include_updated_at` reads `INCLUDE_UPDATED_AT` + off the attached config; `_create_data_table` honours it.""" + backend = _backend(mock_client) + backend.config.INCLUDE_UPDATED_AT = False + + backend._create_data_table( + "res-1", + {"fields": [{"name": "id", "type": "integer"}]}, + ) + + sql = mock_client.query.call_args[0][0] + assert "`_id` INT64" in sql + assert "_updated_at" not in sql + + +# --- info() --------------------------------------------------------------- + + +def test_info_returns_stored_schema_total_and_primary_key( + mock_client: MagicMock, +) -> None: + """`info()` reads the Frictionless schema from `_table_metadata` + and counts rows via `COUNT(*)` on the data table. 
`meta` exposes + the schema's primaryKey under `primary_key`.""" + schema = { + "fields": [ + {"name": "auction_id", "type": "integer"}, + {"name": "product_code", "type": "string"}, + ], + "primaryKey": ["auction_id", "product_code"], + } + backend = _backend_with_schema(mock_client, schema) + count_row = MagicMock() + count_row.__getitem__.side_effect = lambda k: 18420 if k == "n" else None + mock_client.query.return_value.result.return_value = [count_row] + + result = backend.info("balancing_auction_results_2025") + + sql = mock_client.query.call_args[0][0] + assert sql == ( + "SELECT COUNT(*) AS n FROM " + "`proj-1.ds-1.balancing_auction_results_2025`" + ) + assert result.schema == schema + assert result.meta["resource_id"] == "balancing_auction_results_2025" + assert result.meta["total"] == 18420 + assert result.meta["primary_key"] == ["auction_id", "product_code"] + + +def test_info_raises_not_found_for_undeclared_resource( + mock_client: MagicMock, +) -> None: + """No metadata row → NotFoundError. The data table may exist + out-of-band but the engine treats `_table_metadata` as the + declaration source of truth.""" + backend = _backend(mock_client) + backend.metadata = MagicMock() + backend.metadata.get.return_value = None + + with pytest.raises(NotFoundError, match="not declared"): + backend.info("ghost") + # No COUNT runs when the resource isn't declared. 
+ mock_client.query.assert_not_called() + + +def test_info_returns_total_zero_when_count_fails( + mock_client: MagicMock, +) -> None: + """If the data table is missing while metadata exists (inconsistent + state from manual cleanup), `info` reports total=0 rather than + 500-ing the call — the schema is still informative on its own.""" + schema = { + "fields": [{"name": "id", "type": "integer"}], + "primaryKey": ["id"], + } + backend = _backend_with_schema(mock_client, schema) + mock_client.query.return_value.result.side_effect = RuntimeError( + "404 Not found: Table proj-1:ds-1.res-1" + ) + + result = backend.info("res-1") + + assert result.meta["total"] == 0 + assert result.schema == schema + + +def test_info_placeholder_mode_returns_stub(mock_client: MagicMock) -> None: + """No metadata store → return an empty stub so the unit suite can + exercise the call path without GCP creds.""" + backend = _backend(mock_client) + backend.metadata = None + + result = backend.info("res-1") + + assert result.schema == {"fields": []} + assert result.meta == {"resource_id": "res-1", "total": 0} + mock_client.query.assert_not_called() + + +# --- search() SQL builders ------------------------------------------------- + + +def test_parse_sort_validates_and_defaults_direction() -> None: + from datastore.infrastructure.engines.bigquery.search import parse_sort + + pairs = parse_sort("a, b desc, c asc", {"a", "b", "c"}) + assert pairs == [("a", "ASC"), ("b", "DESC"), ("c", "ASC")] + + with pytest.raises(ValueError, match="unknown column 'ghost'"): + parse_sort("ghost desc", {"a"}) + with pytest.raises(ValueError, match="direction"): + parse_sort("a sideways", {"a"}) + + +def test_build_search_renders_full_param_set() -> None: + """Every CKAN datastore_search param lands in the rendered SQL — + filters bind as parameters (no inlining), `q` becomes a row-wide + `SEARCH`, sort + projection are validated identifiers, and limit / + offset close the statement.""" + from 
datastore.infrastructure.engines.bigquery.search import build_search + + schema = { + "fields": [ + {"name": "auction_id", "type": "integer"}, + {"name": "product_code", "type": "string"}, + {"name": "accepted", "type": "boolean"}, + ], + "primaryKey": ["auction_id"], + } + + sql, params, projected = build_search( + table_ref="`p.d.r`", + schema=schema, + include_updated_at=True, + fields=["auction_id", "product_code"], + filters={"product_code": "DCL", "accepted": True}, + q="apple", + distinct=False, + sort="auction_id desc", + limit=100, + offset=25, + ) + + assert sql.startswith("SELECT `auction_id`, `product_code` FROM `p.d.r` AS t") + assert "WHERE `product_code` = @f0 AND `accepted` = @f1" in sql + assert "SEARCH(t, @f2)" in sql + assert "ORDER BY `auction_id` DESC" in sql + assert sql.rstrip().endswith("LIMIT 100 OFFSET 25") + # Parameter types track the schema (STRING / BOOL / STRING for q). + by_name = {p.name: p for p in params} + assert by_name["f0"].type_ == "STRING" + assert by_name["f0"].value == "DCL" + assert by_name["f1"].type_ == "BOOL" + assert by_name["f1"].value is True + assert by_name["f2"].type_ == "STRING" + assert by_name["f2"].value == "apple" + # Result schema reflects the projection, in user-specified order. 
+ assert [f["name"] for f in projected["fields"]] == [ + "auction_id", "product_code", + ] + + +def test_build_search_in_clause_for_list_filter() -> None: + """Filter value as a list → `col IN UNNEST(@p)` with an ARRAY param.""" + from datastore.infrastructure.engines.bigquery.search import build_search + + schema = {"fields": [{"name": "id", "type": "integer"}]} + sql, params, _ = build_search( + table_ref="`p.d.r`", + schema=schema, + include_updated_at=False, + fields=None, + filters={"id": [1, 2, 3]}, + q=None, + distinct=False, + sort=None, + limit=10, + offset=0, + ) + assert "`id` IN UNNEST(@f0)" in sql + assert params[0].array_type == "INT64" + assert params[0].values == [1, 2, 3] + + +def test_build_search_default_sort_is_id_asc() -> None: + """No `sort` → `_id ASC`. `_id` is always projected by default so + this is well-defined.""" + from datastore.infrastructure.engines.bigquery.search import build_search + + schema = {"fields": [{"name": "x", "type": "integer"}]} + sql, _, _ = build_search( + table_ref="`p.d.r`", + schema=schema, + include_updated_at=False, + fields=None, + filters=None, + q=None, + distinct=False, + sort=None, + limit=10, + offset=0, + ) + assert "ORDER BY `_id` ASC" in sql + + +def test_build_search_rejects_unknown_columns() -> None: + """`fields`, `sort`, `filters`, and dict-`q` all validate column + names against the schema — closing the SQL-injection vector on the + identifier-inlined slots.""" + from datastore.infrastructure.engines.bigquery.search import build_search + + schema = {"fields": [{"name": "a", "type": "integer"}]} + kwargs = dict( + table_ref="`p.d.r`", + schema=schema, + include_updated_at=False, + filters=None, q=None, distinct=False, sort=None, + limit=10, offset=0, + ) + with pytest.raises(ValueError, match="fields references unknown"): + build_search(fields=["ghost"], **kwargs) + with pytest.raises(ValueError, match="sort references unknown"): + build_search(fields=None, **{**kwargs, "sort": "ghost asc"}) + with 
pytest.raises(ValueError, match="filters references unknown"): + build_search(fields=None, **{**kwargs, "filters": {"ghost": 1}}) + with pytest.raises(ValueError, match="q references unknown"): + build_search(fields=None, **{**kwargs, "q": {"ghost": "x"}}) + + +def test_build_search_rejects_filters_on_json_columns() -> None: + """JSON/array/geojson columns have no clean equality in BQ — reject + early so the caller gets a 400 rather than a 500 from BigQuery.""" + from datastore.infrastructure.engines.bigquery.search import build_search + + schema = {"fields": [{"name": "blob", "type": "object"}]} + with pytest.raises(ValueError, match="JSON/array/geojson"): + build_search( + table_ref="`p.d.r`", + schema=schema, + include_updated_at=False, + fields=None, + filters={"blob": {"k": "v"}}, + q=None, distinct=False, sort=None, + limit=10, offset=0, + ) + + +def test_needs_count_query_only_when_filtering_or_distinct() -> None: + """Unfiltered + non-distinct search → backend takes the cheap + `__TABLES__`/`_count_rows` path; otherwise must run a real COUNT.""" + from datastore.infrastructure.engines.bigquery.search import ( + needs_count_query, + ) + + assert needs_count_query(filters=None, q=None, distinct=False) is False + assert needs_count_query(filters={"a": 1}, q=None, distinct=False) is True + assert needs_count_query(filters=None, q="x", distinct=False) is True + assert needs_count_query(filters=None, q=None, distinct=True) is True + + +# --- backend.search() orchestration --------------------------------------- + + +def test_search_returns_projection_schema_and_lazy_rows( + mock_client: MagicMock, +) -> None: + """End-to-end through `BigQueryBackend.search`: builds the SELECT, + submits the search + count jobs (filtered → count is a real query, + not `_count_rows`), yields tuples in projection order, and pipes + the projected schema back to the streaming writer.""" + schema = { + "fields": [ + {"name": "auction_id", "type": "integer"}, + {"name": "product_code", 
"type": "string"}, + ], + "primaryKey": ["auction_id"], + } + backend = _backend_with_schema(mock_client, schema) + + # Distinct jobs for search + count; each yields its own .result(). + search_job = MagicMock() + search_row = MagicMock() + search_row.values.return_value = (1, "DCL") + search_job.result.return_value = iter([search_row]) + + count_job = MagicMock() + count_row = MagicMock() + count_row.__getitem__.side_effect = lambda k: 1 if k == "n" else None + count_job.result.return_value = [count_row] + + mock_client.query.side_effect = [count_job, search_job] + + result = backend.search( + resource_id="res-1", + filters={"product_code": "DCL"}, + q=None, + distinct=False, plain=True, language="english", + limit=100, offset=0, + fields=["auction_id", "product_code"], + sort=None, + include_total=True, + ) + + # Count query fires first (queued before search so both run in + # parallel; this caller awaits search before count). + assert mock_client.query.call_count == 2 + count_sql = mock_client.query.call_args_list[0][0][0] + search_sql = mock_client.query.call_args_list[1][0][0] + assert count_sql.startswith("SELECT COUNT(*) AS n FROM (") + assert search_sql.startswith( + "SELECT `auction_id`, `product_code` FROM `proj-1.ds-1.res-1` AS t" + ) + assert result.total == 1 + # `records` is a generator — assert lazy by exhausting it once. + rows = list(result.records) + assert rows == [(1, "DCL")] + # Projected schema is what the writer needs to label columns. + assert [f["name"] for f in result.schema["fields"]] == [ + "auction_id", "product_code", + ] + + +def test_search_unfiltered_uses_cheap_row_count( + mock_client: MagicMock, +) -> None: + """No filters, no q, no distinct → backend skips the filtered + COUNT subquery and falls back to the cheap row-count helper. Two + `client.query` calls land in this order: (1) the search SELECT, + (2) `_count_rows`'s `SELECT COUNT(*) FROM target`. 
Neither wraps + the data table in a subquery.""" + schema = {"fields": [{"name": "a", "type": "integer"}]} + backend = _backend_with_schema(mock_client, schema) + + search_job = MagicMock() + search_job.result.return_value = iter([]) + count_job = MagicMock() + cnt = MagicMock() + cnt.__getitem__.side_effect = lambda k: 42 if k == "n" else None + count_job.result.return_value = [cnt] + + mock_client.query.side_effect = [search_job, count_job] + + result = backend.search( + resource_id="res-1", + filters=None, q=None, distinct=False, plain=True, + language="english", limit=10, offset=0, + fields=None, sort=None, include_total=True, + ) + + assert mock_client.query.call_count == 2 + sqls = [call.args[0] for call in mock_client.query.call_args_list] + # `_updated_at` rides along in default projection because the + # MagicMock config returns truthy for `INCLUDE_UPDATED_AT`. + assert sqls[0].startswith( + "SELECT `_id`, `a`, `_updated_at` FROM `proj-1.ds-1.res-1` AS t" + ) + assert sqls[1] == ( + "SELECT COUNT(*) AS n FROM `proj-1.ds-1.res-1`" + ) + # No filtered count subquery anywhere. + assert not any("FROM (SELECT" in s for s in sqls) + assert result.total == 42 + + +def test_search_raises_not_found_for_undeclared_resource( + mock_client: MagicMock, +) -> None: + backend = _backend(mock_client) + backend.metadata = MagicMock() + backend.metadata.get.return_value = None + + with pytest.raises(NotFoundError, match="not declared"): + backend.search( + resource_id="ghost", + filters=None, q=None, distinct=False, plain=True, + language="english", limit=10, offset=0, + fields=None, sort=None, include_total=False, + ) + mock_client.query.assert_not_called() + + +def test_search_translates_builder_error_to_validation_error( + mock_client: MagicMock, +) -> None: + """Builder `ValueError` (unknown column, etc.) 
becomes a clean + `ValidationError` — caller gets 400, never reaches BigQuery.""" + schema = {"fields": [{"name": "a", "type": "integer"}]} + backend = _backend_with_schema(mock_client, schema) + + with pytest.raises(ValidationError, match="unknown column"): + backend.search( + resource_id="res-1", + filters=None, q=None, distinct=False, plain=True, + language="english", limit=10, offset=0, + fields=["ghost"], sort=None, include_total=False, + ) + mock_client.query.assert_not_called() + + +# --- delete() ------------------------------------------------------------- + + +def test_delete_with_no_filters_or_fields_drops_table_and_metadata( + mock_client: MagicMock, +) -> None: + """Both `filters` and `fields` omitted → `DROP TABLE` + the + metadata row is removed. Resource disappears entirely.""" + schema = {"fields": [{"name": "id", "type": "integer"}]} + backend = _backend_with_schema(mock_client, schema) + + backend.delete("res-1", filters=None, fields=None) + + sql = mock_client.query.call_args[0][0] + assert sql == "DROP TABLE IF EXISTS `proj-1.ds-1.res-1`" + backend.metadata.delete.assert_called_once_with("res-1") + + +def test_delete_with_empty_filters_deletes_all_rows( + mock_client: MagicMock, +) -> None: + """`filters={}` → `DELETE FROM … WHERE TRUE`. 
Table + metadata + survive; only rows go.""" + schema = {"fields": [{"name": "id", "type": "integer"}]} + backend = _backend_with_schema(mock_client, schema) + + backend.delete("res-1", filters={}, fields=None) + + sql = mock_client.query.call_args[0][0] + assert sql == "DELETE FROM `proj-1.ds-1.res-1` WHERE TRUE" + backend.metadata.delete.assert_not_called() + + +def test_delete_with_filters_binds_typed_parameters( + mock_client: MagicMock, +) -> None: + """Populated `filters` produces parameterised SQL bound to the + column's type from the stored schema (no inlined values, no + string-vs-int confusion at the BQ layer).""" + schema = { + "fields": [ + {"name": "auction_id", "type": "integer"}, + {"name": "accepted", "type": "boolean"}, + ], + "primaryKey": ["auction_id"], + } + backend = _backend_with_schema(mock_client, schema) + + backend.delete( + "res-1", filters={"auction_id": 144, "accepted": False}, fields=None, + ) + + sql_arg, kwargs = mock_client.query.call_args + sql = sql_arg[0] + assert sql.startswith("DELETE FROM `proj-1.ds-1.res-1` WHERE ") + assert "`auction_id` = @f0" in sql + assert "`accepted` = @f1" in sql + params = {p.name: (p.value, p.type_) for p in kwargs["job_config"].query_parameters} + assert params == {"f0": (144, "INT64"), "f1": (False, "BOOL")} + + +def test_delete_with_fields_drops_columns_and_updates_metadata( + mock_client: MagicMock, +) -> None: + """`fields=[…]` → `ALTER TABLE DROP COLUMN …` (one ALTER, multiple + clauses) and the stored schema is rewritten without those fields.""" + schema = { + "fields": [ + {"name": "id", "type": "integer"}, + {"name": "extra", "type": "string"}, + {"name": "obsolete", "type": "string"}, + ], + "primaryKey": ["id"], + } + backend = _backend_with_schema(mock_client, schema) + + backend.delete("res-1", filters=None, fields=["extra", "obsolete"]) + + sql = mock_client.query.call_args[0][0] + assert sql == ( + "ALTER TABLE `proj-1.ds-1.res-1` " + "DROP COLUMN `extra`, DROP COLUMN `obsolete`" + ) 
+ # Stored schema shrinks to just the surviving fields. + new_schema = backend.metadata.update.call_args[0][1] + assert [f["name"] for f in new_schema["fields"]] == ["id"] + assert new_schema["primaryKey"] == ["id"] + + +def test_delete_rejects_dropping_primary_key_columns( + mock_client: MagicMock, +) -> None: + """Dropping a PK column would silently break every subsequent + upsert; the engine refuses up front rather than letting the user + discover this later.""" + schema = { + "fields": [ + {"name": "id", "type": "integer"}, + {"name": "label", "type": "string"}, + ], + "primaryKey": ["id"], + } + backend = _backend_with_schema(mock_client, schema) + + with pytest.raises(ValidationError, match="primary-key"): + backend.delete("res-1", filters=None, fields=["id"]) + mock_client.query.assert_not_called() + + +def test_delete_rejects_dropping_system_columns( + mock_client: MagicMock, +) -> None: + """`_id` / `_updated_at` are engine-managed; the API can't drop + them. Caught as ValidationError before any DDL runs.""" + schema = {"fields": [{"name": "id", "type": "integer"}]} + backend = _backend_with_schema(mock_client, schema) + + with pytest.raises(ValidationError, match="system column"): + backend.delete("res-1", filters=None, fields=["_id"]) + mock_client.query.assert_not_called() + + +def test_delete_rejects_unknown_fields(mock_client: MagicMock) -> None: + schema = {"fields": [{"name": "id", "type": "integer"}]} + backend = _backend_with_schema(mock_client, schema) + + with pytest.raises(ValidationError, match="unknown column"): + backend.delete("res-1", filters=None, fields=["ghost"]) + mock_client.query.assert_not_called() + + +def test_delete_rejects_unknown_filter_columns(mock_client: MagicMock) -> None: + """Filter column validation happens in `lib.delete_sql`; the + ValueError is converted to ValidationError before BigQuery is hit.""" + schema = {"fields": [{"name": "id", "type": "integer"}]} + backend = _backend_with_schema(mock_client, schema) + + 
with pytest.raises(ValidationError, match="unknown column"): + backend.delete("res-1", filters={"ghost": 1}, fields=None) + mock_client.query.assert_not_called() + + +def test_delete_rejects_filters_on_json_columns(mock_client: MagicMock) -> None: + schema = {"fields": [{"name": "blob", "type": "object"}]} + backend = _backend_with_schema(mock_client, schema) + + with pytest.raises(ValidationError, match="JSON/array/geojson"): + backend.delete("res-1", filters={"blob": {"k": "v"}}, fields=None) + + +def test_delete_raises_not_found_for_undeclared_resource( + mock_client: MagicMock, +) -> None: + backend = _backend(mock_client) + backend.metadata = MagicMock() + backend.metadata.get.return_value = None + + with pytest.raises(NotFoundError, match="not declared"): + backend.delete("ghost", filters=None, fields=None) + mock_client.query.assert_not_called() + + +# --- search_sql() --------------------------------------------------------- + + +def _ro_backend(client: MagicMock) -> BigQueryBackend: + """Read-only engine — what `search_sql` should always run against. + The mode-flag check is defense in depth; the load-bearing guard is + the RO credential bound at `build_client`.""" + b = BigQueryBackend(mode="ro") + b.client = client + b.config = MagicMock() + b.config.BIGQUERY_PROJECT = "proj-1" + b.config.BIGQUERY_DATASET = "ds-1" + b.config.BIGQUERY_USE_QUERY_CACHE = True + return b + + +def test_search_sql_streams_rows_with_result_schema( + mock_client: MagicMock, +) -> None: + """Result schema comes from BigQuery's job schema (BQ types mapped + back to Frictionless); rows arrive as tuples in column order.""" + backend = _ro_backend(mock_client) + + bq_schema = [ + MagicMock(name="day", field_type="DATE"), + MagicMock(name="avg", field_type="FLOAT64"), + MagicMock(name="count", field_type="INT64"), + ] + # MagicMock binds `name` as kwarg-to-MagicMock-name, not to attr — + # set it explicitly so `field.name` returns the column name. 
+ for sf, n in zip(bq_schema, ("day", "avg", "count"), strict=True): + sf.name = n + + row_iter = MagicMock() + row_iter.schema = bq_schema + row1 = MagicMock() + row1.values.return_value = ("2025-11-04", 47.82, 2) + row2 = MagicMock() + row2.values.return_value = ("2025-11-05", 51.10, 1) + row_iter.__iter__.return_value = iter([row1, row2]) + + data_job = MagicMock() + data_job.result.return_value = row_iter + # COUNT(*) returns the unbounded filtered total — distinct from + # the data page's row count (which would only be the page size). + count_row = MagicMock() + count_row.__getitem__.side_effect = lambda k: 42 if k == "n" else None + count_job = MagicMock() + count_job.result.return_value = [count_row] + # COUNT is submitted first, then the data query. + mock_client.query.side_effect = [count_job, data_job] + + result = backend.search_sql( + "SELECT day, avg, count FROM x LIMIT 100", limit=100, + ) + + # Two queries fire. For this unfiltered SELECT the total comes + # from `INFORMATION_SCHEMA.TABLE_STORAGE` (free metadata read), not + # a COUNT(*) full-scan. + assert mock_client.query.call_count == 2 + count_sql, data_sql = ( + mock_client.query.call_args_list[0][0][0], + mock_client.query.call_args_list[1][0][0], + ) + assert "INFORMATION_SCHEMA.TABLE_STORAGE" in count_sql + assert "WHERE table_name = @table_name" in count_sql + assert "`proj-1`" in data_sql + assert "`ds-1`" in data_sql + assert "FROM `proj-1`.`ds-1`.x" in data_sql + # Total is the metadata row count (parametrised mock returns 42). + assert result.total == 42 + # Frictionless types come back from the BQ type map. + assert result.schema == { + "fields": [ + {"name": "day", "type": "date"}, + {"name": "avg", "type": "number"}, + {"name": "count", "type": "integer"}, + ], + } + # Records iterator is lazy — exhaust to verify shape. 
+ assert list(result.records) == [ + ("2025-11-04", 47.82, 2), + ("2025-11-05", 51.10, 1), + ] + + +def test_search_sql_bounds_rows_by_limit(mock_client: MagicMock) -> None: + """A SELECT without an embedded LIMIT can't run forever — the + engine caps output at the service-supplied `limit` via islice.""" + backend = _ro_backend(mock_client) + + rows = [] + for i in range(10): + r = MagicMock() + r.values.return_value = (i,) + rows.append(r) + sf = MagicMock(field_type="INT64") + sf.name = "n" + row_iter = MagicMock() + row_iter.schema = [sf] + row_iter.__iter__.return_value = iter(rows) + + data_job = MagicMock() + data_job.result.return_value = row_iter + count_row = MagicMock() + count_row.__getitem__.side_effect = lambda k: 10 if k == "n" else None + count_job = MagicMock() + count_job.result.return_value = [count_row] + mock_client.query.side_effect = [count_job, data_job] + + result = backend.search_sql("SELECT n FROM x", limit=3) + assert list(result.records) == [(0,), (1,), (2,)] + # Filtered total is the unbounded count, not the page size. + assert result.total == 10 + + +def test_search_sql_refuses_to_run_on_rw_engine() -> None: + """Defense-in-depth: if a misconfigured caller routes search_sql to + the rw engine, the engine refuses before submitting the query — + RO credentials are the actual safety, this is the smoke alarm.""" + client = MagicMock() + b = BigQueryBackend(mode="rw") + b.client = client + b.config = MagicMock() + b.config.BIGQUERY_PROJECT = "proj-1" + b.config.BIGQUERY_DATASET = "ds-1" + + with pytest.raises(ServerError, match="read-only"): + b.search_sql("SELECT 1", limit=10) + client.query.assert_not_called() + + +def test_search_sql_translates_bq_error_to_server_error( + mock_client: MagicMock, +) -> None: + """A BQ-side failure (bad SQL, permission denied, …) surfaces as + `ServerError`, never as a raw google.api_core exception. + + Two jobs fire (COUNT + data); both error. 
The data-job error is + the one promoted to ServerError since rows are the primary + deliverable; COUNT failures alone degrade total to None instead. + """ + backend = _ro_backend(mock_client) + mock_client.query.side_effect = RuntimeError("syntax error at line 1") + + with pytest.raises(ServerError, match="search_sql failed"): + backend.search_sql("SELECT bogus", limit=10) + + +def test_search_sql_filtered_uses_count_subquery( + mock_client: MagicMock, +) -> None: + """SQL with a WHERE clause can't take the free metadata path — + fall back to `SELECT COUNT(*) FROM ()` which gives the + actual filtered total.""" + backend = _ro_backend(mock_client) + + sf = MagicMock(field_type="INT64") + sf.name = "n" + row_iter = MagicMock() + row_iter.schema = [sf] + row_iter.__iter__.return_value = iter([]) + + data_job = MagicMock() + data_job.result.return_value = row_iter + count_row = MagicMock() + count_row.__getitem__.side_effect = lambda k: 7 if k == "n" else None + count_job = MagicMock() + count_job.result.return_value = [count_row] + mock_client.query.side_effect = [count_job, data_job] + + result = backend.search_sql( + "SELECT n FROM x WHERE n > 5 LIMIT 10", limit=10, + ) + + count_sql = mock_client.query.call_args_list[0][0][0] + # Filtered query → subquery COUNT, no INFORMATION_SCHEMA path. + assert count_sql.startswith("SELECT COUNT(*) AS n FROM (") + assert "INFORMATION_SCHEMA" not in count_sql + assert result.total == 7 + + +def test_search_sql_aggregate_uses_count_subquery( + mock_client: MagicMock, +) -> None: + """Queries with aggregates (GROUP BY, COUNT(), …) collapse rows — + the free metadata path would give the source table's row count, + not the result row count. 
Use COUNT subquery.""" + backend = _ro_backend(mock_client) + + sf = MagicMock(field_type="INT64") + sf.name = "n" + row_iter = MagicMock() + row_iter.schema = [sf] + row_iter.__iter__.return_value = iter([]) + data_job = MagicMock() + data_job.result.return_value = row_iter + cnt = MagicMock() + cnt.__getitem__.side_effect = lambda k: 2 if k == "n" else None + count_job = MagicMock() + count_job.result.return_value = [cnt] + mock_client.query.side_effect = [count_job, data_job] + + result = backend.search_sql( + "SELECT category, COUNT(*) FROM x GROUP BY category LIMIT 10", + limit=10, + ) + count_sql = mock_client.query.call_args_list[0][0][0] + assert "INFORMATION_SCHEMA" not in count_sql + assert count_sql.startswith("SELECT COUNT(*) AS n FROM (") + assert result.total == 2 + + +def test_search_sql_total_falls_back_to_none_when_count_fails( + mock_client: MagicMock, +) -> None: + """If COUNT errors but the data query succeeds, the user still + gets their rows — `total` degrades to None instead of failing + the whole request. Pagination links then drop `next` / + `total_pages`, matching the "unknown total" rules in + `_build_sql_pagination_links`.""" + backend = _ro_backend(mock_client) + + sf = MagicMock(field_type="INT64") + sf.name = "n" + row_iter = MagicMock() + row_iter.schema = [sf] + row_iter.__iter__.return_value = iter([]) + + data_job = MagicMock() + data_job.result.return_value = row_iter + count_job = MagicMock() + count_job.result.side_effect = RuntimeError("count failed") + mock_client.query.side_effect = [count_job, data_job] + + result = backend.search_sql("SELECT n FROM x LIMIT 10", limit=10) + assert result.total is None + assert list(result.records) == [] + + +def test_search_sql_placeholder_mode_returns_empty( + mock_client: MagicMock, +) -> None: + """No client (no GCP creds) → empty result, no exception. 
Lets the + unit suite drive `search_sql_datastore` without a real backend.""" + backend = BigQueryBackend(mode="ro") + backend.client = None + + result = backend.search_sql("SELECT 1", limit=10) + assert result.schema == {"fields": []} + assert list(result.records) == [] + mock_client.query.assert_not_called() + + +# --- qualify_table_refs() ------------------------------------------------- + + +def test_qualify_table_refs_prepends_project_dataset() -> None: + """User refers to tables by raw resource_id; the qualifier prepends + `project.dataset` and emits BigQuery-dialect SQL with backticks.""" + from datastore.infrastructure.engines.bigquery.lib import ( + qualify_table_refs, + ) + out = qualify_table_refs( + 'SELECT * FROM "c6153a74-43cb-4edf-8bdf-bb664feca937" LIMIT 10', + project="my-project", + dataset="my_dataset", + ) + assert ( + "`my-project`.`my_dataset`.`c6153a74-43cb-4edf-8bdf-bb664feca937`" + in out + ) + assert out.endswith("LIMIT 10") + + +def test_qualify_table_refs_handles_joins() -> None: + """Both sides of a JOIN get qualified independently.""" + from datastore.infrastructure.engines.bigquery.lib import ( + qualify_table_refs, + ) + out = qualify_table_refs( + 'SELECT a.id FROM "tbl_a" a JOIN "tbl_b" b ON a.id = b.id LIMIT 10', + project="p", dataset="d", + ) + assert "`p`.`d`.`tbl_a`" in out + assert "`p`.`d`.`tbl_b`" in out + + +def test_qualify_table_refs_skips_cte_aliases() -> None: + """CTE aliases are inline, not external tables — leave them alone.""" + from datastore.infrastructure.engines.bigquery.lib import ( + qualify_table_refs, + ) + out = qualify_table_refs( + 'WITH t AS (SELECT 1 AS a) SELECT * FROM t LIMIT 10', + project="p", dataset="d", + ) + assert "`p`.`d`.`t`" not in out + # CTE name `t` survives unqualified. 
+ assert " FROM t " in out or " FROM `t` " in out + + +def test_qualify_table_refs_leaves_already_qualified_refs_alone() -> None: + """If a caller fully-qualifies a ref, don't double-prefix.""" + from datastore.infrastructure.engines.bigquery.lib import ( + qualify_table_refs, + ) + out = qualify_table_refs( + "SELECT * FROM other_project.other_dataset.tbl LIMIT 10", + project="p", dataset="d", + ) + assert "`p`.`d`.`other_project`" not in out + assert "other_project" in out and "other_dataset" in out + assert "tbl" in out + + +# --- query-results cache on read paths ------------------------------------ +# +# BigQuery's built-in query cache makes identical, deterministic SELECTs +# free + fast on the second call. The default is on at the API, but every +# read site funnels its `QueryJobConfig` through `_read_job_config` so +# the flag is explicit in the code AND the `BIGQUERY_USE_QUERY_CACHE` +# opt-out actually flows through. These tests pin that contract. + + +def _job_config_kwarg(call) -> Any: + """Pull `job_config` from a `client.query(...)` call regardless of + whether it rode as a kwarg or as the second positional arg.""" + if "job_config" in call.kwargs: + return call.kwargs["job_config"] + return call.args[1] if len(call.args) > 1 else None + + +def test_search_passes_use_query_cache_true_on_data_and_count_jobs( + mock_client: MagicMock, +) -> None: + schema = { + "fields": [ + {"name": "auction_id", "type": "integer"}, + {"name": "product_code", "type": "string"}, + ], + "primaryKey": ["auction_id"], + } + backend = _backend_with_schema(mock_client, schema) + + # Filtered search → triggers both COUNT and SELECT submits. 
+ search_job = MagicMock() + search_job.result.return_value = iter([]) + count_job = MagicMock() + cnt = MagicMock() + cnt.__getitem__.side_effect = lambda k: 0 if k == "n" else None + count_job.result.return_value = [cnt] + mock_client.query.side_effect = [count_job, search_job] + + backend.search( + resource_id="res-1", + filters={"product_code": "DCL"}, + q=None, distinct=False, plain=True, language="english", + limit=10, offset=0, fields=["auction_id"], sort=None, + include_total=True, + ) + + assert mock_client.query.call_count == 2 + for call in mock_client.query.call_args_list: + cfg = _job_config_kwarg(call) + assert cfg is not None, "search must pass a QueryJobConfig" + assert cfg.use_query_cache is True + + +def test_search_sql_passes_use_query_cache_true_on_data_and_count_jobs( + mock_client: MagicMock, +) -> None: + backend = _ro_backend(mock_client) + + # Data result: empty iter with a minimal schema. + sf = MagicMock(field_type="INT64") + sf.name = "n" + row_iter = MagicMock() + row_iter.schema = [sf] + row_iter.__iter__.return_value = iter([]) + data_job = MagicMock() + data_job.result.return_value = row_iter + count_job = MagicMock() + count_job.result.return_value = [] + # COUNT submitted first (non-blocking), then DATA. + mock_client.query.side_effect = [count_job, data_job] + + backend.search_sql( + "SELECT n FROM res1 WHERE n > 0 LIMIT 10", limit=10, + ) + + assert mock_client.query.call_count == 2 + for call in mock_client.query.call_args_list: + cfg = _job_config_kwarg(call) + assert cfg is not None, "search_sql must pass a QueryJobConfig" + assert cfg.use_query_cache is True + + +def test_info_count_rows_passes_use_query_cache_true( + mock_client: MagicMock, +) -> None: + """`datastore_info` calls `_count_rows`, which issues + `SELECT COUNT(*) FROM
`. That SELECT must ride the cache.""" + backend = _backend_with_schema( + mock_client, {"fields": [{"name": "id", "type": "integer"}]}, + ) + count_row = MagicMock() + count_row.__getitem__.side_effect = lambda k: 7 if k == "n" else None + mock_client.query.return_value.result.return_value = [count_row] + + backend.info("res-1") + + # `_count_rows` is the only `client.query` call in `info()`'s path + # (metadata is mocked) — assert its job_config carries the flag. + assert mock_client.query.call_count == 1 + cfg = _job_config_kwarg(mock_client.query.call_args) + assert cfg is not None, "_count_rows must pass a QueryJobConfig" + assert cfg.use_query_cache is True + + +def test_use_query_cache_respects_config_opt_out( + mock_client: MagicMock, +) -> None: + """`BIGQUERY_USE_QUERY_CACHE=False` flows through to the wire so + integration tests / freshness-sensitive deployments can force a + fresh scan.""" + backend = _backend_with_schema( + mock_client, {"fields": [{"name": "id", "type": "integer"}]}, + ) + backend.config.BIGQUERY_USE_QUERY_CACHE = False + count_row = MagicMock() + count_row.__getitem__.side_effect = lambda k: 0 if k == "n" else None + mock_client.query.return_value.result.return_value = [count_row] + + backend.info("res-1") + + cfg = _job_config_kwarg(mock_client.query.call_args) + assert cfg.use_query_cache is False diff --git a/tests/engines/ducklake/__init__.py b/tests/engines/ducklake/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/test_datastore_create.py b/tests/test_datastore_create.py index 9cadc77..91c6f8d 100644 --- a/tests/test_datastore_create.py +++ b/tests/test_datastore_create.py @@ -51,6 +51,7 @@ def _valid_payload_with_resource() -> dict[str, Any]: # 1. 
Correct payload -------------------------------------------------------- + def test_create_with_resource_id_succeeds(client: TestClient) -> None: response = client.post(CREATE_URL, json=_valid_payload_with_resource_id()) @@ -60,6 +61,9 @@ def test_create_with_resource_id_succeeds(client: TestClient) -> None: result = body["result"] assert result["resource_id"] == "balancing_auction_results_2025" assert result["package_id"] == "pkg-balancing-2025" + # Both surfaces — canonical `schema.primaryKey` and deprecated top-level + # `primary_key` — carry the same unique key. + assert result["schema"]["primaryKey"] == ["auction_id", "product_code"] assert result["primary_key"] == ["auction_id", "product_code"] assert [f["id"] for f in result["fields"]] == ["auction_id", "product_code"] @@ -74,9 +78,75 @@ def test_create_with_resource_dict_succeeds(client: TestClient) -> None: assert result["package_id"] == "pkg-balancing-2025" +def test_create_with_resource_dict_rejected_when_auth_type_is_not_ckan( + fake_ckan: FakeCKAN, +) -> None: + """Only the CKAN provider can materialise a fresh resource on the fly + (`ckan.resource_create(...)`). 
Under JWT / anonymous auth the call has + nowhere to land — the endpoint rejects the dict form and tells the + caller to send `resource_id` instead.""" + from datastore.api.context import get_auth_provider, get_ckan_client + from datastore.auth.anonymous import Provider as AnonymousProvider + from datastore.core.config import Config, get_config + from datastore.main import create_app + + app = create_app() + app.dependency_overrides[get_config] = lambda: Config( + AUTH_TYPE="anonymous", CKAN_URL="" + ) + app.dependency_overrides[get_ckan_client] = lambda: fake_ckan + app.dependency_overrides[get_auth_provider] = lambda: AnonymousProvider() + + with TestClient(app) as c: + response = c.post(CREATE_URL, json=_valid_payload_with_resource()) + + assert response.status_code == 400 + body = response.json() + assert body["success"] is False + assert body["error"]["__type"] == "Validation Error" + # Message must point the caller at the alternative (`resource_id`) + # and name ckan as the gate — exact wording is intentionally loose. 
+ msg = body["error"]["message"] + assert "resource_id" in msg + assert "ckan" in msg.lower() + + +def test_create_with_resource_id_succeeds_under_anonymous_auth( + fake_ckan: FakeCKAN, +) -> None: + """The `resource_id` form keeps working without CKAN — no resource + materialisation needed, just the engine create.""" + from datastore.api.context import get_auth_provider, get_ckan_client + from datastore.auth.anonymous import Provider as AnonymousProvider + from datastore.core.config import Config, get_config + from datastore.main import create_app + + app = create_app() + app.dependency_overrides[get_config] = lambda: Config( + AUTH_TYPE="anonymous", CKAN_URL="" + ) + app.dependency_overrides[get_ckan_client] = lambda: fake_ckan + app.dependency_overrides[get_auth_provider] = lambda: AnonymousProvider() + + with TestClient(app) as c: + # Policy still requires *some* credential for non-read permissions; + # the anonymous provider just doesn't check what it is. + c.headers["Authorization"] = "any-token" + response = c.post(CREATE_URL, json=_valid_payload_with_resource_id()) + + assert response.status_code == 200 + assert response.json()["result"]["resource_id"] == ( + "balancing_auction_results_2025" + ) + + # 2. 
Missing required field ------------------------------------------------- -def test_create_missing_fields_returns_validation_error(client: TestClient) -> None: + +def test_create_missing_fields_and_schema_returns_validation_error( + client: TestClient, +) -> None: + """Neither legacy `fields` nor frictionless `schema` provided → 400.""" payload = _valid_payload_with_resource_id() payload.pop("fields") @@ -87,7 +157,7 @@ def test_create_missing_fields_returns_validation_error(client: TestClient) -> N assert body["success"] is False error = body["error"] assert error["__type"] == "Validation Error" - assert "fields" in error["fields"] + assert "either 'fields' or 'schema' is required" in str(error["fields"]) def test_create_empty_fields_returns_validation_error(client: TestClient) -> None: @@ -100,7 +170,7 @@ def test_create_empty_fields_returns_validation_error(client: TestClient) -> Non body = response.json() assert body["success"] is False assert body["error"]["__type"] == "Validation Error" - assert "fields" in body["error"]["fields"] + assert "'fields' must not be empty" in str(body["error"]["fields"]) def test_create_field_missing_id_returns_validation_error(client: TestClient) -> None: @@ -118,6 +188,7 @@ def test_create_field_missing_id_returns_validation_error(client: TestClient) -> # 3. 
Resource not accessible ------------------------------------------------ + def test_create_unknown_resource_id_returns_404(client: TestClient) -> None: payload = _valid_payload_with_resource_id() payload["resource_id"] = "does-not-exist" @@ -144,8 +215,26 @@ def test_create_resource_id_with_denied_key_returns_403( assert body["error"]["__type"] == "Authorization Error" +def test_create_without_api_key_returns_403( + client: TestClient, fake_ckan: FakeCKAN +) -> None: + """Anonymous reads are allowed (CKAN decides on resource visibility), + but writes always require an authenticated user — short-circuit + with 403 before CKAN is even called.""" + before = fake_ckan.authorize_calls + # Drop the default Authorization header the conftest sets — we + # want a real "no header" request, not "header with empty value". + client.headers.pop("Authorization", None) + response = client.post(CREATE_URL, json=_valid_payload_with_resource_id()) + assert response.status_code == 403 + assert response.json()["error"]["__type"] == "Authorization Error" + # CKAN never sees the request — we reject before delegating. + assert fake_ckan.authorize_calls == before + + # 4. Package not accessible ------------------------------------------------- + def test_create_unknown_package_returns_404(client: TestClient) -> None: payload = _valid_payload_with_resource() payload["resource"]["package_id"] = "missing-package" @@ -170,3 +259,103 @@ def test_create_package_with_denied_key_returns_403( body = response.json() assert body["success"] is False assert body["error"]["__type"] == "Authorization Error" + + +# 5. 
Frictionless `schema` path -------------------------------------------- +def _valid_payload_with_schema() -> dict[str, Any]: + return { + "resource_id": "balancing_auction_results_2025", + "schema": { + "fields": [ + {"name": "auction_id", "type": "integer"}, + {"name": "product_code", "type": "string"}, + ], + "primaryKey": ["auction_id", "product_code"], + }, + "records": [ + {"auction_id": 144, "product_code": "DCL"}, + ], + } + + +def test_create_with_schema_succeeds_and_returns_both_shapes( + client: TestClient, +) -> None: + """Frictionless `schema` input → response carries both `fields` and `schema`.""" + response = client.post(CREATE_URL, json=_valid_payload_with_schema()) + + assert response.status_code == 200, response.text + result = response.json()["result"] + # Top-level `primary_key` mirrors `schema.primaryKey`. + assert result["primary_key"] == ["auction_id", "product_code"] + # Legacy `fields` derived from the schema, with Postgres types. + assert [f["id"] for f in result["fields"]] == ["auction_id", "product_code"] + assert result["fields"][0]["type"] == "int8" + assert result["fields"][1]["type"] == "text" + # Schema returned verbatim (Frictionless shape). + assert result["schema"]["primaryKey"] == ["auction_id", "product_code"] + assert [f["name"] for f in result["schema"]["fields"]] == [ + "auction_id", + "product_code", + ] + + +def test_create_with_fields_returns_both_shapes(client: TestClient) -> None: + """Legacy `fields` input → response also includes derived frictionless `schema`.""" + response = client.post(CREATE_URL, json=_valid_payload_with_resource_id()) + + assert response.status_code == 200, response.text + result = response.json()["result"] + schema = result["schema"] + assert [f["name"] for f in schema["fields"]] == [ + "auction_id", + "product_code", + ] + # int4 → integer, text → string when projecting Postgres → Frictionless. 
+ assert schema["fields"][0]["type"] == "integer" + assert schema["fields"][1]["type"] == "string" + assert schema["primaryKey"] == ["auction_id", "product_code"] + + +def test_create_with_fields_and_schema_returns_validation_error( + client: TestClient, +) -> None: + payload = _valid_payload_with_resource_id() + payload["schema"] = {"fields": [{"name": "auction_id", "type": "integer"}]} + + response = client.post(CREATE_URL, json=payload) + + assert response.status_code == 400 + error = response.json()["error"] + assert error["__type"] == "Validation Error" + assert "not both" in str(error["fields"]) + + +def test_create_with_schema_and_primary_key_returns_validation_error( + client: TestClient, +) -> None: + """Top-level `primary_key` is rejected when `schema` is supplied.""" + payload = _valid_payload_with_schema() + payload["primary_key"] = ["auction_id"] + + response = client.post(CREATE_URL, json=payload) + + assert response.status_code == 400 + error = response.json()["error"] + assert error["__type"] == "Validation Error" + assert "primary_key" in str(error["fields"]) + + +def test_create_with_invalid_schema_returns_validation_error( + client: TestClient, +) -> None: + """Malformed frictionless schema is rejected at the boundary.""" + payload = _valid_payload_with_schema() + payload["schema"] = {"fields": [{"name": "auction_id", "type": "not-a-type"}]} + + response = client.post(CREATE_URL, json=payload) + + assert response.status_code == 400 + error = response.json()["error"] + assert error["__type"] == "Validation Error" + assert "schema" in error["fields"] diff --git a/tests/test_datastore_delete.py b/tests/test_datastore_delete.py new file mode 100644 index 0000000..22e3597 --- /dev/null +++ b/tests/test_datastore_delete.py @@ -0,0 +1,168 @@ +"""End-to-end tests for `POST /api/3/action/datastore_delete`. 
+ +Body accepts: + resource_id / id (one required) — table to delete from + filters (optional dict) — narrow the delete; omit → drop table + force (optional bool) — required for read-only resources + +Response echoes back the original `filters` (CKAN convention) under +`result.{resource_id, filters}`. + +Covers: + 1. happy path — filtered + whole-table drop + 2. aliases — `id` accepted, normalised to `resource_id` + 3. validation — missing both, unknown body keys + 4. auth — unknown resource (404), denied key (403) + +Placeholder engine doesn't actually delete anything; these tests pin +shape + routing. +""" + +from __future__ import annotations + +from fastapi.testclient import TestClient + +from tests.conftest import FakeCKAN + +DELETE_URL = "/api/3/action/datastore_delete" +_RESOURCE_ID = "balancing_auction_results_2025" + + +# 1. Happy path ------------------------------------------------------------- + +def test_delete_with_filters_echoes_them(client: TestClient) -> None: + response = client.post(DELETE_URL, json={ + "resource_id": _RESOURCE_ID, + "filters": {"product_code": "DCL", "accepted": False}, + }) + + assert response.status_code == 200 + body = response.json() + assert body["success"] is True + assert body["result"]["resource_id"] == _RESOURCE_ID + assert body["result"]["filters"] == { + "product_code": "DCL", "accepted": False, + } + + +def test_delete_without_filters_drops_whole_table(client: TestClient) -> None: + """Omitting `filters` means drop the entire table; the response + omits `filters` from `result` (exclude_none).""" + response = client.post(DELETE_URL, json={"resource_id": _RESOURCE_ID}) + + assert response.status_code == 200 + result = response.json()["result"] + assert result["resource_id"] == _RESOURCE_ID + assert "filters" not in result + + +def test_force_flag_accepted(client: TestClient) -> None: + """`force=True` is accepted (the placeholder doesn't enforce + read-only; real BigQuery impl will check resource metadata).""" + 
response = client.post(DELETE_URL, json={ + "resource_id": _RESOURCE_ID, + "filters": {"x": 1}, + "force": True, + }) + assert response.status_code == 200 + + +# 2. Aliases ---------------------------------------------------------------- + +def test_id_alias_works(client: TestClient) -> None: + """`id` is normalised to `resource_id` by the schema validator.""" + response = client.post(DELETE_URL, json={"id": _RESOURCE_ID}) + + assert response.status_code == 200 + assert response.json()["result"]["resource_id"] == _RESOURCE_ID + + +def test_same_value_for_resource_id_and_id_accepted(client: TestClient) -> None: + """Same value on both keys is the no-conflict legacy-echo case.""" + response = client.post(DELETE_URL, json={ + "resource_id": _RESOURCE_ID, + "id": _RESOURCE_ID, + }) + assert response.status_code == 200 + assert response.json()["result"]["resource_id"] == _RESOURCE_ID + + +def test_conflicting_resource_id_and_id_rejected(client: TestClient) -> None: + """Different `resource_id` vs `id` → 400. Silently preferring one + would let a typo destroy the wrong resource.""" + response = client.post(DELETE_URL, json={ + "resource_id": _RESOURCE_ID, + "id": "different-value", + }) + assert response.status_code == 400 + body = response.json() + assert body["error"]["__type"] == "Validation Error" + + +# 3. 
Validation ------------------------------------------------------------- + +def test_missing_both_returns_validation_error(client: TestClient) -> None: + response = client.post(DELETE_URL, json={}) + + assert response.status_code == 400 + body = response.json() + assert body["error"]["__type"] == "Validation Error" + + +def test_extra_body_key_rejected(client: TestClient) -> None: + """`extra='forbid'` blocks unknown keys to catch typos.""" + response = client.post(DELETE_URL, json={ + "resource_id": _RESOURCE_ID, + "filterz": {"x": 1}, # typo + }) + + assert response.status_code == 400 + assert response.json()["error"]["__type"] == "Validation Error" + + +def test_filters_and_fields_are_mutually_exclusive(client: TestClient) -> None: + """Row delete (`filters`) and column drop (`fields`) are separate + operations; sending both is ambiguous and rejected up front.""" + response = client.post(DELETE_URL, json={ + "resource_id": _RESOURCE_ID, + "filters": {"id": 1}, + "fields": ["label"], + }) + + assert response.status_code == 400 + body = response.json() + assert body["error"]["__type"] == "Validation Error" + assert "mutually exclusive" in body["error"]["message"] + + +def test_empty_fields_list_rejected(client: TestClient) -> None: + """`fields=[]` is ambiguous (column drop with no columns) — 400 + rather than silently no-op.""" + response = client.post(DELETE_URL, json={ + "resource_id": _RESOURCE_ID, + "fields": [], + }) + + assert response.status_code == 400 + assert response.json()["error"]["__type"] == "Validation Error" + + +# 4. 
Auth ------------------------------------------------------------------- + +def test_unknown_resource_returns_404(client: TestClient) -> None: + response = client.post(DELETE_URL, json={"resource_id": "does-not-exist"}) + + assert response.status_code == 404 + body = response.json() + assert body["error"]["__type"] == "Not Found Error" + + +def test_denied_key_returns_403( + client: TestClient, fake_ckan: FakeCKAN +) -> None: + fake_ckan.deny("test-token") + + response = client.post(DELETE_URL, json={"resource_id": _RESOURCE_ID}) + + assert response.status_code == 403 + assert response.json()["error"]["__type"] == "Authorization Error" diff --git a/tests/test_datastore_info.py b/tests/test_datastore_info.py new file mode 100644 index 0000000..42a3be3 --- /dev/null +++ b/tests/test_datastore_info.py @@ -0,0 +1,132 @@ +"""End-to-end tests for `GET /api/3/action/datastore_info`. + +Single `resource_id` query parameter (alias: `id`); `result` holds `meta` +(free-form dict), `schema` (canonical), and `fields` (legacy mirror). + +Covers: + 1. happy path — known resource_id returns 200 with envelope + 2. validation — missing resource_id, unknown query params + 3. auth — unknown resource_id (404), denied api_key (403) + +The placeholder BigQuery engine returns an empty fields list and a small +meta dict, so these tests pin the request/response shape and routing. +Engine-specific content lives with the real backend. +""" + +from __future__ import annotations + +from fastapi.testclient import TestClient + +from tests.conftest import FakeCKAN + +INFO_URL = "/api/3/action/datastore_info" +_RESOURCE_ID = "balancing_auction_results_2025" + + +# 1. 
Happy path ------------------------------------------------------------- + +def test_basic_info_succeeds(client: TestClient) -> None: + response = client.get(INFO_URL, params={"resource_id": _RESOURCE_ID}) + + assert response.status_code == 200 + body = response.json() + assert body["success"] is True + result = body["result"] + # Placeholder engine returns empty fields + a small meta echoing + # the requested resource_id. + assert "fields" in result + assert "meta" in result + assert isinstance(result["fields"], list) + assert isinstance(result["meta"], dict) + assert result["meta"]["resource_id"] == _RESOURCE_ID + + +def test_response_shape(client: TestClient) -> None: + """`result` carries `meta`, `schema` (canonical), and `fields` + (deprecated legacy mirror).""" + response = client.get(INFO_URL, params={"resource_id": _RESOURCE_ID}) + + body = response.json() + assert set(body) == {"help", "success", "result"} + assert set(body["result"]) == {"meta", "schema", "fields"} + assert isinstance(body["result"]["schema"], dict) + assert "fields" in body["result"]["schema"] + + +# 2. Validation + aliases --------------------------------------------------- + +def test_id_alias_works(client: TestClient) -> None: + """`id` is a CKAN-style alias for `resource_id`; either is accepted.""" + response = client.get(INFO_URL, params={"id": _RESOURCE_ID}) + + assert response.status_code == 200 + result = response.json()["result"] + # Placeholder engine's meta echoes the normalised resource_id. 
+ assert result["meta"]["resource_id"] == _RESOURCE_ID + + +def test_same_value_for_resource_id_and_id_accepted(client: TestClient) -> None: + """Same value on both `resource_id` and `id` is the no-conflict case + (legacy clients echoing both keys); accepted as `resource_id`.""" + response = client.get(INFO_URL, params={ + "resource_id": _RESOURCE_ID, + "id": _RESOURCE_ID, + }) + assert response.status_code == 200 + assert response.json()["result"]["meta"]["resource_id"] == _RESOURCE_ID + + +def test_conflicting_resource_id_and_id_rejected(client: TestClient) -> None: + """Different values for `resource_id` and `id` → 400. Silently + preferring one would let CKAN-style legacy params mask a real client + bug, so the request must be unambiguous.""" + response = client.get(INFO_URL, params={ + "resource_id": _RESOURCE_ID, + "id": "different-value", + }) + assert response.status_code == 400 + body = response.json() + assert body["error"]["__type"] == "Validation Error" + + +def test_missing_both_returns_validation_error(client: TestClient) -> None: + """Neither `resource_id` nor `id` → 400.""" + response = client.get(INFO_URL, params={}) + + assert response.status_code == 400 + body = response.json() + assert body["error"]["__type"] == "Validation Error" + + +def test_extra_query_param_rejected(client: TestClient) -> None: + """`extra='forbid'` — only `resource_id` / `id` are allowed.""" + response = client.get(INFO_URL, params={ + "resource_id": _RESOURCE_ID, + "verbose": "true", + }) + + assert response.status_code == 400 + body = response.json() + assert body["error"]["__type"] == "Validation Error" + + +# 3. 
Auth ------------------------------------------------------------------- + +def test_unknown_resource_returns_404(client: TestClient) -> None: + response = client.get(INFO_URL, params={"resource_id": "does-not-exist"}) + + assert response.status_code == 404 + body = response.json() + assert body["error"]["__type"] == "Not Found Error" + assert "does-not-exist" in body["error"]["message"] + + +def test_denied_key_returns_403( + client: TestClient, fake_ckan: FakeCKAN +) -> None: + fake_ckan.deny("test-token") + + response = client.get(INFO_URL, params={"resource_id": _RESOURCE_ID}) + + assert response.status_code == 403 + assert response.json()["error"]["__type"] == "Authorization Error" diff --git a/tests/test_datastore_search.py b/tests/test_datastore_search.py index 8a1ce80..93a5b36 100644 --- a/tests/test_datastore_search.py +++ b/tests/test_datastore_search.py @@ -57,7 +57,6 @@ def _params(**overrides: Any) -> dict[str, Any]: # 1. Happy path ------------------------------------------------------------- - def test_basic_search_succeeds(client: TestClient) -> None: response = client.get(SEARCH_URL, params=_params()) @@ -82,10 +81,14 @@ def test_default_limit_and_offset_echoed(client: TestClient) -> None: # 2. 
Optional knobs --------------------------------------------------------- + def test_search_with_filters(client: TestClient) -> None: - response = client.get(SEARCH_URL, params=_params( - filters={"product_code": "DCL", "accepted": True}, - )) + response = client.get( + SEARCH_URL, + params=_params( + filters={"product_code": "DCL", "accepted": True}, + ), + ) assert response.status_code == 200 @@ -96,17 +99,23 @@ def test_search_with_q_as_string(client: TestClient) -> None: def test_search_with_q_as_dict(client: TestClient) -> None: """CKAN's `q` accepts a per-column dict; we encode it as JSON in the URL.""" - response = client.get(SEARCH_URL, params=_params( - q={"product_code": "DCL", "bidder_metadata": "DRAX"}, - )) + response = client.get( + SEARCH_URL, + params=_params( + q={"product_code": "DCL", "bidder_metadata": "DRAX"}, + ), + ) assert response.status_code == 200 def test_search_with_fields_and_sort(client: TestClient) -> None: - response = client.get(SEARCH_URL, params=_params( - fields=["auction_id", "product_code", "clearing_price_gbp_per_mwh"], - sort="delivery_start desc, clearing_price_gbp_per_mwh asc", - )) + response = client.get( + SEARCH_URL, + params=_params( + fields=["auction_id", "product_code", "clearing_price_gbp_per_mwh"], + sort="delivery_start desc, clearing_price_gbp_per_mwh asc", + ), + ) assert response.status_code == 200 @@ -117,6 +126,7 @@ def test_search_with_distinct(client: TestClient) -> None: # 3. Pagination + include_total -------------------------------------------- + def test_include_total_returns_total(client: TestClient) -> None: response = client.get(SEARCH_URL, params=_params(include_total=True)) @@ -147,6 +157,7 @@ def test_pagination_echoes_limit_offset(client: TestClient) -> None: # 4. 
Validation ------------------------------------------------------------- + def test_missing_resource_id_returns_validation_error(client: TestClient) -> None: response = client.get(SEARCH_URL, params={}) @@ -183,10 +194,13 @@ def test_filters_malformed_json_returns_validation_error(client: TestClient) -> def test_filters_must_be_object_not_array(client: TestClient) -> None: - response = client.get(SEARCH_URL, params={ - "resource_id": _RESOURCE_ID, - "filters": json.dumps(["not", "an", "object"]), - }) + response = client.get( + SEARCH_URL, + params={ + "resource_id": _RESOURCE_ID, + "filters": json.dumps(["not", "an", "object"]), + }, + ) assert response.status_code == 400 body = response.json() @@ -196,10 +210,13 @@ def test_filters_must_be_object_not_array(client: TestClient) -> None: def test_q_starting_with_brace_must_be_valid_json(client: TestClient) -> None: """A `q` that looks like JSON (leading `{`) must actually parse.""" - response = client.get(SEARCH_URL, params={ - "resource_id": _RESOURCE_ID, - "q": "{not valid", - }) + response = client.get( + SEARCH_URL, + params={ + "resource_id": _RESOURCE_ID, + "q": "{not valid", + }, + ) assert response.status_code == 400 body = response.json() @@ -209,6 +226,7 @@ def test_q_starting_with_brace_must_be_valid_json(client: TestClient) -> None: # 5. 
Auth ------------------------------------------------------------------- + def test_unknown_resource_id_returns_404(client: TestClient) -> None: response = client.get(SEARCH_URL, params=_params(resource_id="does-not-exist")) @@ -218,9 +236,7 @@ def test_unknown_resource_id_returns_404(client: TestClient) -> None: assert "does-not-exist" in body["error"]["message"] -def test_denied_key_returns_403( - client: TestClient, fake_ckan: FakeCKAN -) -> None: +def test_denied_key_returns_403(client: TestClient, fake_ckan: FakeCKAN) -> None: fake_ckan.deny("test-token") response = client.get(SEARCH_URL, params=_params()) @@ -229,8 +245,26 @@ def test_denied_key_returns_403( assert response.json()["error"]["__type"] == "Authorization Error" +def test_anonymous_read_calls_ckan_and_succeeds( + client: TestClient, fake_ckan: FakeCKAN, +) -> None: + """No Authorization header on a read → we still call CKAN's + `datastore_authorize`. CKAN itself decides based on resource + visibility; on the FakeCKAN (no deny-list, no visibility flags) + that succeeds, so the request returns 200.""" + before = fake_ckan.authorize_calls + # Drop the default Authorization header the conftest sets — we + # want a real "no header" request, not "header with empty value". + client.headers.pop("Authorization", None) + response = client.get(SEARCH_URL, params=_params()) + assert response.status_code == 200 + # Confirms the auth path actually reached CKAN (not short-circuited). + assert fake_ckan.authorize_calls - before == 1 + + # 6. records_format --------------------------------------------------------- + def test_default_records_format_is_json_objects(client: TestClient) -> None: """Default `records_format=objects` returns the CKAN JSON envelope.""" response = client.get(SEARCH_URL, params=_params()) @@ -258,10 +292,13 @@ def test_records_format_csv_returns_json_envelope(client: TestClient) -> None: as a CSV-encoded string. 
Content-Type is application/json — clients parse the envelope, then read the records string as CSV. Column names live on `result.fields`, not in the records string.""" - response = client.get(SEARCH_URL, params=_params( - fields=["auction_id", "product_code"], - records_format="csv", - )) + response = client.get( + SEARCH_URL, + params=_params( + fields=["auction_id", "product_code"], + records_format="csv", + ), + ) assert response.status_code == 200 assert response.headers["content-type"].startswith("application/json") @@ -273,10 +310,13 @@ def test_records_format_csv_returns_json_envelope(client: TestClient) -> None: def test_records_format_tsv_returns_json_envelope(client: TestClient) -> None: """`records_format=tsv` — same envelope as csv but tab-separated.""" - response = client.get(SEARCH_URL, params=_params( - fields=["auction_id", "product_code"], - records_format="tsv", - )) + response = client.get( + SEARCH_URL, + params=_params( + fields=["auction_id", "product_code"], + records_format="tsv", + ), + ) assert response.status_code == 200 assert response.headers["content-type"].startswith("application/json") @@ -334,9 +374,11 @@ def lazy_records() -> Iterator[tuple]: consumed.append(r) yield r + schema = {"fields": [{"name": f["id"], "type": f["type"]} for f in fields]} + def fake_search(self: BigQueryBackend, **kwargs: Any) -> SearchResult: return SearchResult( - fields=fields, + schema=schema, records=lazy_records(), total=len(rows), records_truncated=False, @@ -395,9 +437,7 @@ def test_csv_format_streams_data_rows( assert response.headers["content-type"].startswith("application/json") body = response.json() assert body["result"]["records"] == ( - "144,DCL,47.82\n" - "145,DCH,51.1\n" - "146,FFR,32.4\n" + "144,DCL,47.82\n" "145,DCH,51.1\n" "146,FFR,32.4\n" ) @@ -414,9 +454,7 @@ def test_tsv_format_streams_data_rows( assert response.headers["content-type"].startswith("application/json") body = response.json() assert body["result"]["records"] == ( - 
"144\tDCL\t47.82\n" - "145\tDCH\t51.1\n" - "146\tFFR\t32.4\n" + "144\tDCL\t47.82\n" "145\tDCH\t51.1\n" "146\tFFR\t32.4\n" ) @@ -454,58 +492,62 @@ def test_csv_quotes_values_with_special_chars( def test_search_objects_response_includes_links(client: TestClient) -> None: - """`_links.start` (no offset) and `_links.next` (offset=limit) come back - with the same scheme + host as the request — TestClient uses - `http://testserver`.""" + """Empty-table case (placeholder engine, total=0): only `start` is + emitted — `next` / `prev` don't apply and the page counters are + suppressed when there's nothing to page through. Scheme + host + carried from the request (TestClient uses `http://testserver`).""" response = client.get(SEARCH_URL, params=_params()) assert response.status_code == 200 links = response.json()["result"]["_links"] - assert set(links) == {"start", "next"} + assert set(links) == {"start", "page_size"} assert links["start"].startswith("http://testserver/api/3/action/datastore_search") - assert links["next"].startswith("http://testserver/api/3/action/datastore_search") - # start: no offset (defaults to 0); next: offset = 0 + default limit (100) assert "offset" not in links["start"] - assert "offset=100" in links["next"] assert f"resource_id={_RESOURCE_ID}" in links["start"] - assert f"resource_id={_RESOURCE_ID}" in links["next"] + assert links["page_size"] == 100 # default limit -def test_search_links_advance_by_limit(client: TestClient) -> None: - """`next` jumps by exactly `limit`, regardless of the caller's offset.""" +def test_search_links_prev_emitted_on_inner_page(client: TestClient) -> None: + """At `offset > 0`, `prev` lands at `max(0, offset - limit)`. 
No + `next` because placeholder total=0 means we're past the end.""" response = client.get(SEARCH_URL, params=_params(limit=50, offset=25)) assert response.status_code == 200 links = response.json()["result"]["_links"] - # start drops offset, keeps the rest assert "offset" not in links["start"] assert "limit=50" in links["start"] - # next advances offset by limit: 25 + 50 = 75 - assert "offset=75" in links["next"] - assert "limit=50" in links["next"] + # prev clamps to 0 since offset (25) < limit (50). + assert "offset=0" in links["prev"] + assert "limit=50" in links["prev"] + assert "next" not in links def test_search_links_preserve_other_query_params(client: TestClient) -> None: - """Filters / sort / fields ride along on both `start` and `next`.""" - response = client.get(SEARCH_URL, params=_params( - filters={"product_code": "DCL"}, - sort="delivery_start desc", - fields=["auction_id", "product_code"], - )) + """Filters / sort / fields ride along on every emitted link.""" + response = client.get( + SEARCH_URL, + params=_params( + filters={"product_code": "DCL"}, + sort="delivery_start desc", + fields=["auction_id", "product_code"], + ), + ) assert response.status_code == 200 links = response.json()["result"]["_links"] - for link in (links["start"], links["next"]): - assert "filters=" in link - assert "sort=" in link - assert "fields=" in link + for v in links.values(): + if not isinstance(v, str): + continue # `page_size` / `page` / `total_pages` are ints + assert "filters=" in v + assert "sort=" in v + assert "fields=" in v def test_search_lists_format_also_includes_links(client: TestClient) -> None: - """`records_format=lists` is still a JSON envelope, so `_links` is present.""" + """`records_format=lists` is still a JSON envelope, so `_links` is + present (placeholder engine, empty table: `start` + `page_size`).""" response = client.get(SEARCH_URL, params=_params(records_format="lists")) assert response.status_code == 200 links = response.json()["result"]["_links"] - 
assert set(links) == {"start", "next"} - + assert set(links) == {"start", "page_size"} diff --git a/tests/test_datastore_search_sql.py b/tests/test_datastore_search_sql.py index 654d8b2..d21f115 100644 --- a/tests/test_datastore_search_sql.py +++ b/tests/test_datastore_search_sql.py @@ -32,7 +32,7 @@ # 1. Happy path ------------------------------------------------------------- def test_basic_sql_succeeds(client: TestClient) -> None: - response = client.get(SQL_URL, params={"sql": "SELECT 1"}) + response = client.get(SQL_URL, params={"sql": "SELECT 1 LIMIT 10"}) assert response.status_code == 200 body = response.json() @@ -43,44 +43,112 @@ def test_basic_sql_succeeds(client: TestClient) -> None: def test_with_cte_succeeds(client: TestClient) -> None: """`WITH ... SELECT` (CTE) is allowed alongside plain SELECT.""" response = client.get(SQL_URL, params={ - "sql": "WITH t AS (SELECT 1 AS a) SELECT * FROM t" + "sql": "WITH t AS (SELECT 1 AS a) SELECT * FROM t LIMIT 10" }) assert response.status_code == 200 def test_trailing_semicolon_allowed(client: TestClient) -> None: - response = client.get(SQL_URL, params={"sql": "SELECT 1;"}) + response = client.get(SQL_URL, params={"sql": "SELECT 1 LIMIT 10;"}) assert response.status_code == 200 def test_leading_comment_then_select_allowed(client: TestClient) -> None: - response = client.get(SQL_URL, params={"sql": "-- a note\nSELECT 1"}) + response = client.get(SQL_URL, params={ + "sql": "-- a note\nSELECT 1 LIMIT 10" + }) assert response.status_code == 200 +def test_missing_limit_rejected(client: TestClient) -> None: + """LIMIT is required so the server can paginate and so unbounded + SELECTs can't pin the streaming response open.""" + response = client.get(SQL_URL, params={"sql": "SELECT 1"}) + assert response.status_code == 400 + body = response.json() + assert body["error"]["__type"] == "Validation Error" + assert "LIMIT" in body["error"]["message"] + + +def test_limit_above_max_rejected(client: TestClient) -> None: + 
"""LIMIT must be <= `SEARCH_RESULT_ROWS_MAX` (default 32000). + Above the cap → 400 with a 'paginate with OFFSET' hint.""" + response = client.get(SQL_URL, params={ + "sql": "SELECT 1 LIMIT 50000", + }) + assert response.status_code == 400 + body = response.json() + assert body["error"]["__type"] == "Validation Error" + assert "OFFSET" in body["error"]["message"] + + # 2. Response envelope shape ------------------------------------------------ def test_response_shape_matches_datastore_search(client: TestClient) -> None: - """Same envelope as `datastore_search` so clients can share a parser.""" - response = client.get(SQL_URL, params={"sql": "SELECT 1"}) + """Same envelope as `datastore_search` so clients can share a parser. + `limit` / `offset` come from the SQL's LIMIT / OFFSET literals.""" + response = client.get(SQL_URL, params={ + "sql": "SELECT 1 LIMIT 50 OFFSET 100" + }) assert response.status_code == 200 assert response.headers["content-type"].startswith("application/json") result = response.json()["result"] assert set(result) >= { - "resource_id", "fields", "records", "limit", "offset", "_links", + "resource_id", "schema", "fields", "records", "limit", "offset", "_links", } - # Defaults for fields that don't apply to raw SQL. + # Both column shapes are present: canonical `schema` + legacy `fields`. + assert isinstance(result["schema"], dict) + assert "fields" in result["schema"] + # `resource_id` is empty (raw SQL doesn't bind to one resource); + # `limit` / `offset` mirror the SQL literals. assert result["resource_id"] == "" - assert result["offset"] == 0 + assert result["limit"] == 50 + assert result["offset"] == 100 def test_response_includes_pagination_links(client: TestClient) -> None: - """`_links` is emitted for envelope-shape parity (start / next keys).""" - response = client.get(SQL_URL, params={"sql": "SELECT 1"}) + """`_links` carries `start` + page counters. 
Placeholder engine + returns total=0 so `page` / `total_pages` are suppressed (empty + landing page rule); the URLs rewrite the SQL's OFFSET.""" + response = client.get(SQL_URL, params={"sql": "SELECT 1 LIMIT 10"}) links = response.json()["result"]["_links"] - assert set(links) == {"start", "next"} + assert "start" in links + assert links["page_size"] == 10 + # start URL embeds the SQL with OFFSET 0 + assert "OFFSET+0" in links["start"] or "OFFSET%200" in links["start"] + + +def test_response_echoes_original_sql(client: TestClient) -> None: + """`result.sql` echoes the request SQL verbatim. Useful when + `_links.next` rewrites the OFFSET — clients can still see what + actually ran on this page.""" + sql = ( + 'SELECT auction_id FROM "balancing_auction_results_2025" ' + 'LIMIT 5 OFFSET 10' + ) + response = client.get(SQL_URL, params={"sql": sql}) + assert response.status_code == 200 + assert response.json()["result"]["sql"] == sql + + +def test_pagination_links_rewrite_sql_offset(client: TestClient) -> None: + """When the placeholder reports total=0 there's no `next`, but + once the engine reports rows the `next` URL would carry a SQL + string with OFFSET advanced by LIMIT. Verify the URL builder + rewrites OFFSET on the `prev` link: current offset (200) minus + LIMIT (50) gives OFFSET 150.""" + response = client.get(SQL_URL, params={ + "sql": "SELECT 1 LIMIT 50 OFFSET 200" + }) + assert response.status_code == 200 + links = response.json()["result"]["_links"] + # `start` resets to OFFSET 0 — `prev` lands at max(0, 200-50) = 150. + assert "prev" in links + # URL is percent-encoded; the new OFFSET literal is in the `sql` + # query param. Decode-ish: look for the substring after encoding. + assert "OFFSET+150" in links["prev"] or "OFFSET%20150" in links["prev"] # 3. 
SQL validation --------------------------------------------------------- @@ -194,7 +262,7 @@ def test_parse_sql_references_rejects_unparseable() -> None: def test_disallowed_function_returns_validation_error(client: TestClient) -> None: """`pg_read_file` isn't in `ALLOWED_SQL_FUNCTIONS` → 400.""" response = client.get(SQL_URL, params={ - "sql": "SELECT pg_read_file('/etc/passwd')", + "sql": "SELECT pg_read_file('/etc/passwd') LIMIT 1", }) assert response.status_code == 400 body = response.json() @@ -204,7 +272,7 @@ def test_disallowed_function_returns_validation_error(client: TestClient) -> Non def test_allowed_function_succeeds(client: TestClient) -> None: """`COUNT` is in the allow-list — no tables, so no auth call either.""" - response = client.get(SQL_URL, params={"sql": "SELECT COUNT(*)"}) + response = client.get(SQL_URL, params={"sql": "SELECT COUNT(*) LIMIT 1"}) assert response.status_code == 200 @@ -215,7 +283,7 @@ def test_unknown_table_returns_404( ) -> None: """Each referenced table is authorized via CKAN — unknown → 404.""" response = client.get(SQL_URL, params={ - "sql": 'SELECT * FROM "does-not-exist"', + "sql": 'SELECT * FROM "does-not-exist" LIMIT 10', }) assert response.status_code == 404 body = response.json() @@ -227,7 +295,7 @@ def test_existing_table_authorized( ) -> None: """Referenced table that exists in CKAN clears auth → 200.""" response = client.get(SQL_URL, params={ - "sql": 'SELECT * FROM "balancing_auction_results_2025"', + "sql": 'SELECT * FROM "balancing_auction_results_2025" LIMIT 10', }) assert response.status_code == 200 @@ -238,7 +306,7 @@ def test_denied_api_key_returns_403( """Auth gate uses the same path as datastore_search — denial returns 403.""" fake_ckan.deny("test-token") response = client.get(SQL_URL, params={ - "sql": 'SELECT * FROM "balancing_auction_results_2025"', + "sql": 'SELECT * FROM "balancing_auction_results_2025" LIMIT 10', }) assert response.status_code == 403 assert response.json()["error"]["__type"] == 
"Authorization Error" @@ -253,7 +321,7 @@ def test_each_table_authorized_once_for_joins( response = client.get(SQL_URL, params={ "sql": ( 'SELECT a.id FROM "balancing_auction_results_2025" a ' - 'JOIN "other_table" b ON a.id = b.id' + 'JOIN "other_table" b ON a.id = b.id LIMIT 10' ), }) assert response.status_code == 200 diff --git a/tests/test_health.py b/tests/test_health.py new file mode 100644 index 0000000..faffdcf --- /dev/null +++ b/tests/test_health.py @@ -0,0 +1,136 @@ +"""End-to-end tests for `GET /`, `GET /health`, `GET /ready`. + +Covers: + 1. / — welcome envelope + 2. /health — always 200 while the process is up + 3. /ready — 200 when both engines pass healthcheck; 503 with a + Service Unavailable envelope when either fails +""" + +from __future__ import annotations + +from collections.abc import Iterator + +import pytest +from datastore.infrastructure.engines.registry import ( + reset_engine_cache, +) +from fastapi.testclient import TestClient + + +@pytest.fixture(autouse=True) +def _clean_engine_cache() -> Iterator[None]: + reset_engine_cache() + yield + reset_engine_cache() + + +# 1. Welcome ---------------------------------------------------------------- + +def test_welcome_returns_envelope(client: TestClient) -> None: + response = client.get("/") + + assert response.status_code == 200 + body = response.json() + assert body["success"] is True + assert isinstance(body["result"]["message"], str) + + +def test_welcome_not_mounted_under_action_prefix(client: TestClient) -> None: + """Welcome is root-only — `/api/3/action/` is the CKAN action + namespace and shouldn't echo a generic landing message.""" + response = client.get("/api/3/action/") + assert response.status_code == 404 + + +# 2. /health ---------------------------------------------------------------- + +def test_health_returns_ok(client: TestClient) -> None: + """Liveness — always 200. 
Mounted at both root and under + `/api/3/action` so k8s probes and CKAN clients can both reach it.""" + for path in ("/health", "/api/3/action/health"): + response = client.get(path) + assert response.status_code == 200, path + body = response.json() + assert body["success"] is True + assert body["result"]["status"] == "ok" + + +# 3. /ready ----------------------------------------------------------------- + +def test_ready_503_when_engine_unhealthy(client: TestClient) -> None: + """Default test env has `bigquery` engine + no BIGQUERY_PROJECT, so + the client is never built and healthcheck returns False. Both modes + fail → 503 in the StatusResponse envelope shape (`result.status` = + "not_ready"); the HTTP code + `success: false` carry the signal so + mode names don't leak into the response.""" + response = client.get("/ready") + + assert response.status_code == 503 + body = response.json() + assert body["success"] is False + assert body["result"]["status"] == "not_ready" + assert "error" not in body + + +def test_ready_200_when_engines_healthy( + client: TestClient, monkeypatch: pytest.MonkeyPatch +) -> None: + """Force every engine instance's healthcheck to True — the same + pattern other endpoint tests use to swap engine behaviour. /ready + is mounted at both root and `/api/3/action`.""" + from datastore.infrastructure.engines.bigquery.backend import ( + BigQueryBackend, + ) + + monkeypatch.setattr(BigQueryBackend, "healthcheck", lambda self: True) + + for path in ("/ready", "/api/3/action/ready"): + response = client.get(path) + assert response.status_code == 200, path + body = response.json() + assert body["success"] is True + assert body["result"]["status"] == "ready" + + +def test_ready_503_when_only_rw_fails( + client: TestClient, monkeypatch: pytest.MonkeyPatch +) -> None: + """If rw fails but ro passes, /ready still 503s — pod isn't really + 'ready' until both modes are reachable. 
Envelope stays in + StatusResponse shape (`result.status` = "not_ready").""" + from datastore.infrastructure.engines.bigquery.backend import ( + BigQueryBackend, + ) + + def fake_healthcheck(self: BigQueryBackend) -> bool: + return self.mode == "ro" + + monkeypatch.setattr(BigQueryBackend, "healthcheck", fake_healthcheck) + + response = client.get("/ready") + + assert response.status_code == 503 + body = response.json() + assert body["success"] is False + assert body["result"]["status"] == "not_ready" + + +def test_ready_handles_engine_construction_error( + client: TestClient, monkeypatch: pytest.MonkeyPatch +) -> None: + """If building the engine raises (bad credentials, missing module), + /ready returns 503 in StatusResponse shape instead of bubbling a 500.""" + def boom(*args: object, **kwargs: object) -> object: + raise RuntimeError("engine construction failed") + + monkeypatch.setattr( + "datastore.api.endpoints.health.get_datastore_engine", boom + ) + + response = client.get("/ready") + + assert response.status_code == 503 + body = response.json() + assert body["success"] is False + assert body["result"]["status"] == "not_ready" diff --git a/tests/test_read_service.py b/tests/test_read_service.py index 6c72c9a..8de698f 100644 --- a/tests/test_read_service.py +++ b/tests/test_read_service.py @@ -153,64 +153,149 @@ def test_tsv_records_string_is_empty_when_engine_yields_no_rows() -> None: def test_links_present_on_every_format() -> None: + """Placeholder engine returns total=0 → no rows, so `page` and + `total_pages` are suppressed. 
`page_size` is always present when + `limit > 0` since a UI can render it even on an empty page.""" for fmt in ("objects", "lists", "csv", "tsv"): body = _call(data_dict_overrides={"records_format": fmt}) - assert set(body["result"]["_links"]) == {"start", "next"}, fmt + links = body["result"]["_links"] + assert set(links) == {"start", "page_size"}, fmt + assert links["page_size"] == 100 # default limit -def test_next_link_advances_offset_by_limit() -> None: - body = _call( - data_dict_overrides={"limit": 50, "offset": 25}, - request_url="http://test/api/3/action/datastore_search?limit=50&offset=25", - ) - links = body["result"]["_links"] - assert "offset=75" in links["next"] - assert "limit=50" in links["next"] - - -# --- _build_pagination_links: URL surgery ---------------------------------- +# --- _build_pagination_links: URL surgery + presence rules ----------------- def test_links_bare_path_url() -> None: - """Bare path input → bare path output (no scheme/host to preserve).""" + """Bare path input → bare path output (no scheme/host to preserve). 
+ With a known `total > offset + limit`, `next` is emitted.""" links = _build_pagination_links( - "/api/3/action/datastore_search", limit=100, offset=0 + "/api/3/action/datastore_search", + limit=100, offset=0, total=500, ) assert links["start"] == "/api/3/action/datastore_search" assert links["next"] == "/api/3/action/datastore_search?offset=100" def test_links_strip_offset_from_start() -> None: + """`start` always drops `offset` (it defaults to 0); `prev` lands + at `max(0, offset - limit)`; `next` advances by `limit`.""" links = _build_pagination_links( "/api/3/action/datastore_search?resource_id=res-1&offset=50", - limit=10, offset=50, + limit=10, offset=50, total=200, ) assert "offset" not in links["start"] assert "resource_id=res-1" in links["start"] + assert "offset=40" in links["prev"] assert "offset=60" in links["next"] def test_links_preserve_other_query_params() -> None: - """filters, sort, fields ride along on both start and next.""" + """filters, sort, fields ride along on every emitted URL. 
Page + counters travel as ints and don't carry params.""" url = ( "/api/3/action/datastore_search" "?resource_id=res-1&filters=%7B%22a%22%3A1%7D" "&sort=created+desc&fields=a,b" ) - links = _build_pagination_links(url, limit=20, offset=0) - for link in (links["start"], links["next"]): - assert "filters=" in link - assert "sort=" in link - assert "fields=" in link - assert "resource_id=res-1" in link + links = _build_pagination_links(url, limit=20, offset=20, total=100) + assert set(links) == { + "start", "prev", "next", "page_size", "page", "total_pages", + } + for v in links.values(): + if not isinstance(v, str): + continue # `page_size` / `page` / `total_pages` are ints + assert "filters=" in v + assert "sort=" in v + assert "fields=" in v + assert "resource_id=res-1" in v + assert links["page_size"] == 20 + assert links["page"] == 2 + assert links["total_pages"] == 5 def test_links_preserve_scheme_and_host_from_full_url() -> None: """Full URL input → full URL output (scheme + host carried through).""" links = _build_pagination_links( "http://example.com/api/3/action/datastore_search?limit=100", - limit=100, offset=0, + limit=100, offset=0, total=500, ) assert links["start"].startswith("http://example.com/api/3/action/datastore_search") assert links["next"].startswith("http://example.com/api/3/action/datastore_search") assert "offset=100" in links["next"] + + +def test_links_omit_next_when_total_reached() -> None: + """On the last page (`offset + limit >= total`), `next` is dropped.""" + links = _build_pagination_links( + "/path", limit=10, offset=90, total=100, + ) + assert "next" not in links + assert "prev" in links # offset > 0 + + +def test_links_omit_next_when_total_unknown() -> None: + """`include_total=False` → can't tell if a next page exists; drop + `next` and `total_pages` rather than guess. Clients detect end via + an empty `records` array. 
`page` + `page_size` stay since position + is meaningful for single-page pickers.""" + links = _build_pagination_links( + "/path", limit=10, offset=0, total=None, + ) + assert set(links) == {"start", "page_size", "page"} + assert links["page_size"] == 10 + assert links["page"] == 1 + assert "total_pages" not in links + assert "next" not in links + + +def test_links_omit_prev_at_first_page() -> None: + """`offset == 0` → no previous page exists, so `prev` is dropped.""" + links = _build_pagination_links( + "/path", limit=10, offset=0, total=100, + ) + assert "prev" not in links + assert "next" in links + + +def test_links_drop_page_counters_on_empty_resource() -> None: + """No rows on the current page (empty resource) → suppress `page` + and `total_pages`. `page_size` rides along since a UI can still + render it; `start` is the only meaningful nav URL.""" + links = _build_pagination_links( + "/path", limit=10, offset=0, total=0, + ) + assert set(links) == {"start", "page_size"} + assert links["page_size"] == 10 + + +def test_links_drop_page_counters_when_offset_past_total() -> None: + """Caller paged past the end (`offset >= total`) → counters lie + about position, so they're dropped. 
`prev` remains so the UI can + walk back to a real page; the empty `records` array signals the + overshoot.""" + links = _build_pagination_links( + "/path", limit=100, offset=400, total=302, + ) + assert "page" not in links + assert "total_pages" not in links + assert "prev" in links # offset > 0 + assert "next" not in links # nothing past the end + + +def test_links_keep_page_counters_on_real_page() -> None: + """Within total → page + total_pages reflect a real position.""" + links = _build_pagination_links( + "/path", limit=100, offset=200, total=302, + ) + assert links["page"] == 3 + assert links["total_pages"] == 4 + + +def test_links_prev_clamps_to_zero_on_partial_first_page() -> None: + """Paging back from `offset < limit` must land at offset=0, not a + negative offset.""" + links = _build_pagination_links( + "/path", limit=50, offset=20, total=100, + ) + assert "offset=0" in links["prev"] diff --git a/tests/test_write_service.py b/tests/test_write_service.py index f6638be..322185e 100644 --- a/tests/test_write_service.py +++ b/tests/test_write_service.py @@ -30,13 +30,24 @@ def _ctx() -> SimpleNamespace: return SimpleNamespace(config=Config(), ckan=_FakeCKAN()) +def _schema(primary_key: list[str] | None = None) -> dict[str, Any]: + """Minimal canonical frictionless schema for service-level tests. + + The request validator folds legacy `fields`/`primary_key` into this + shape; tests that bypass the boundary build it directly. 
+ """ + schema: dict[str, Any] = {"fields": [{"name": "a", "type": "integer"}]} + if primary_key: + schema["primaryKey"] = primary_key + return schema + + def test_existing_resource_skips_resource_create() -> None: ctx = _ctx() data_dict = { "package": {"id": "pkg-1"}, "resource": "existing-resource-id", # str → existing flow - "fields": [{"id": "a", "type": "int4"}], - "primary_key": ["a"], + "schema": _schema(primary_key=["a"]), "records": [{"a": 1}, {"a": 2}], } @@ -44,7 +55,9 @@ def test_existing_resource_skips_resource_create() -> None: assert result.resource_id == "existing-resource-id" assert result.package_id == "pkg-1" + # Top-level `primary_key` and `schema.primaryKey` carry the same value. assert result.primary_key == ["a"] + assert result.schema["primaryKey"] == ["a"] assert ctx.ckan.created == [] # no CKAN call @@ -53,8 +66,7 @@ def test_new_resource_creates_via_ckan() -> None: data_dict = { "package": {"id": "pkg-1"}, "resource": {"package_id": "pkg-1", "name": "foo"}, # dict → new flow - "fields": [{"id": "a"}], - "primary_key": ["a"], + "schema": _schema(primary_key=["a"]), "records": [], } @@ -72,8 +84,7 @@ def test_missing_records_is_handled() -> None: data_dict = { "package": {"id": "pkg-x"}, "resource": "res-x", - "fields": [{"id": "a"}], - "primary_key": ["a"], + "schema": _schema(primary_key=["a"]), # records absent } @@ -82,18 +93,19 @@ def test_missing_records_is_handled() -> None: assert result.records is None # include_records defaults to False -def test_primary_key_defaults_to_empty_list() -> None: +def test_schema_without_primary_key_is_accepted() -> None: + """A schema with no `primaryKey` is valid — the response just omits it.""" ctx = _ctx() data_dict = { "package": {"id": "pkg-x"}, "resource": "res-x", - "fields": [{"id": "a"}], - # primary_key absent + "schema": _schema(), # no primaryKey "records": [], } result = asyncio.run(create_datastore(ctx, data_dict)) + assert "primaryKey" not in result.schema assert result.primary_key == [] 
@@ -103,8 +115,7 @@ def test_missing_package_returns_none_package_id() -> None: data_dict = { # no "package" key "resource": "res-x", - "fields": [{"id": "a"}], - "primary_key": ["a"], + "schema": _schema(primary_key=["a"]), "records": [{"a": 1}], } @@ -137,11 +148,16 @@ def test_upsert_returns_typed_result() -> None: def test_upsert_default_method_is_upsert() -> None: """`method` is optional; absence resolves to 'upsert' inside the service.""" ctx = _ctx() - result = asyncio.run(upsert_datastore(ctx, { - "resource_id": "res-1", - "records": [{"a": 1}], - # method absent - })) + result = asyncio.run( + upsert_datastore( + ctx, + { + "resource_id": "res-1", + "records": [{"a": 1}], + # method absent + }, + ) + ) assert result.method == "upsert" @@ -149,12 +165,17 @@ def test_upsert_default_method_is_upsert() -> None: def test_upsert_echoes_records_when_include_records() -> None: ctx = _ctx() records = [{"a": 1}, {"a": 2}] - result = asyncio.run(upsert_datastore(ctx, { - "resource_id": "res-1", - "records": records, - "method": "upsert", - "include_records": True, - })) + result = asyncio.run( + upsert_datastore( + ctx, + { + "resource_id": "res-1", + "records": records, + "method": "upsert", + "include_records": True, + }, + ) + ) assert result.records == records @@ -162,12 +183,17 @@ def test_upsert_echoes_records_when_include_records() -> None: def test_upsert_returns_total_when_include_total() -> None: """BigQuery placeholder returns `total=len(records)`; the service lifts it.""" ctx = _ctx() - result = asyncio.run(upsert_datastore(ctx, { - "resource_id": "res-1", - "records": [{"a": 1}, {"a": 2}], - "method": "upsert", - "include_total": True, - })) + result = asyncio.run( + upsert_datastore( + ctx, + { + "resource_id": "res-1", + "records": [{"a": 1}, {"a": 2}], + "method": "upsert", + "include_total": True, + }, + ) + ) assert result.total == 2 @@ -175,12 +201,17 @@ def test_upsert_returns_total_when_include_total() -> None: def 
test_upsert_omits_total_when_include_total_false() -> None: """Even if the engine populates total, the service gates on the request flag.""" ctx = _ctx() - result = asyncio.run(upsert_datastore(ctx, { - "resource_id": "res-1", - "records": [{"a": 1}, {"a": 2}], - "method": "upsert", - "include_total": False, - })) + result = asyncio.run( + upsert_datastore( + ctx, + { + "resource_id": "res-1", + "records": [{"a": 1}, {"a": 2}], + "method": "upsert", + "include_total": False, + }, + ) + ) assert result.total is None @@ -188,11 +219,16 @@ def test_upsert_omits_total_when_include_total_false() -> None: def test_upsert_records_optional() -> None: """`records` may be omitted — service defaults to [] and doesn't crash.""" ctx = _ctx() - result = asyncio.run(upsert_datastore(ctx, { - "resource_id": "res-1", - "method": "upsert", - # records absent - })) + result = asyncio.run( + upsert_datastore( + ctx, + { + "resource_id": "res-1", + "method": "upsert", + # records absent + }, + ) + ) assert result.resource_id == "res-1" assert result.records is None # include_records defaults to False