git-pkgs · larsborn · May 12, 2026 · May 12, 2026 · May 12, 2026 · May 12, 2026
diff --git a/README.md b/README.md
@@ -577,7 +577,7 @@ Recently cached:
 | Endpoint | Description |
 |----------|-------------|
 | `GET /` | Dashboard (web UI) |
-| `GET /health` | Health check (returns "ok" if healthy) |
+| `GET /health` | Health check (JSON; HTTP 200 healthy, 503 unhealthy) |
 | `GET /stats` | Cache statistics (JSON) |
 | `GET /metrics` | Prometheus metrics |
 | `GET /npm/*` | npm registry protocol |
@@ -815,9 +815,28 @@ The proxy exposes Prometheus metrics at `GET /metrics`. All metric names are pre
 | `proxy_storage_operation_duration_seconds` | histogram | `operation` | Storage read/write latency |
 | `proxy_storage_errors_total` | counter | `operation` | Storage read/write failures |
 | `proxy_active_requests` | gauge | | In-flight requests |
+| `proxy_health_probe_failures_total` | counter | `step` | Storage health probe failures by failing step (`write`, `size`, `read`, `verify`, `delete`) |
 
 Cache size and artifact count are refreshed every 60 seconds. The remaining metrics update on each request.
 
+### Health Check
+
+`/health` returns a structured JSON report of subsystem health. It responds with HTTP 200 if all checks pass and 503 if any check fails.
+
+```json
+{
+  "status": "ok",
+  "checks": {
+    "database": {"status": "ok"},
+    "storage":  {"status": "ok"}
+  }
+}
+```
+
+Failing checks include an `"error"` field. Storage failures also include a `"step"` field identifying which probe step failed (`write`, `size`, `read`, `verify`, `delete`).
+
+Storage probe results are cached for `health.storage_probe_interval` (default 30s) to bound the cost of probing remote backends.
+
 Scrape config for Prometheus:
 
 ```yaml

diff --git a/cmd/proxy/main.go b/cmd/proxy/main.go
@@ -77,6 +77,7 @@
 //	PROXY_GRADLE_BUILD_CACHE_MAX_AGE         - Gradle cache max age eviction
 //	PROXY_GRADLE_BUILD_CACHE_MAX_SIZE        - Gradle cache max total size
 //	PROXY_GRADLE_BUILD_CACHE_SWEEP_INTERVAL  - Gradle cache eviction sweep interval
+//	PROXY_HEALTH_STORAGE_PROBE_INTERVAL      - Storage health probe cache interval (default "30s")
 //
 // Example:
 //
@@ -203,6 +204,7 @@ func runServe() {
 		fmt.Fprintf(os.Stderr, "  PROXY_GRADLE_BUILD_CACHE_MAX_AGE         Gradle cache max age eviction\n")
 		fmt.Fprintf(os.Stderr, "  PROXY_GRADLE_BUILD_CACHE_MAX_SIZE        Gradle cache max total size\n")
 		fmt.Fprintf(os.Stderr, "  PROXY_GRADLE_BUILD_CACHE_SWEEP_INTERVAL  Gradle cache eviction sweep interval\n")
+		fmt.Fprintf(os.Stderr, "  PROXY_HEALTH_STORAGE_PROBE_INTERVAL      Storage health probe cache interval\n")
 	}
 
 	_ = fs.Parse(os.Args[1:])

diff --git a/config.example.yaml b/config.example.yaml
@@ -128,6 +128,15 @@ gradle:
     # How often eviction runs when max_age or max_size is set
     sweep_interval: "10m"
 
+# Health endpoint configuration.
+health:
+  # Minimum time between storage backend probes.
+  # The /health endpoint runs a write/read/verify/delete round-trip
+  # against the configured storage backend and caches the result for
+  # this interval. Set to "0" to probe on every request.
+  # Default: "30s".
+  storage_probe_interval: "30s"
+
 # Version cooldown configuration
 # Hides package versions published too recently, giving the community time
 # to spot malicious releases before they're pulled into projects.

diff --git a/docs/architecture.md b/docs/architecture.md
@@ -277,7 +277,7 @@ HTTP server setup, web UI, and API handlers.
 - Web UI: dashboard, package browser, source browser, version comparison
 - Templates are embedded in the binary via `//go:embed`
 - Enrichment API for package metadata, vulnerability scanning, and outdated detection
-- Health, stats, and Prometheus metrics endpoints
+- Health, stats, and Prometheus metrics endpoints. `/health` runs an active write → size → read → verify → delete probe against the storage backend and returns a structured JSON response (`HealthResponse`) with `"ok"` / `"error"` status per subsystem. Probe results are cached (default 30s, configurable via `health.storage_probe_interval`) to avoid overwhelming remote backends.
 
 ### `internal/metrics`
 

diff --git a/docs/swagger/docs.go b/docs/swagger/docs.go
@@ -399,7 +399,7 @@ const docTemplate = `{
         "/health": {
             "get": {
                 "produces": [
-                    "text/plain"
+                    "application/json"
                 ],
                 "tags": [
                     "meta"
@@ -409,13 +409,13 @@ const docTemplate = `{
                     "200": {
                         "description": "OK",
                         "schema": {
-                            "type": "string"
+                            "$ref": "#/definitions/server.HealthResponse"
                         }
                     },
                     "503": {
                         "description": "Service Unavailable",
                         "schema": {
-                            "type": "string"
+                            "$ref": "#/definitions/server.HealthResponse"
                         }
                     }
                 }
@@ -515,6 +515,34 @@ const docTemplate = `{
                 }
             }
         },
+        "server.HealthCheck": {
+            "type": "object",
+            "properties": {
+                "error": {
+                    "type": "string"
+                },
+                "status": {
+                    "type": "string"
+                },
+                "step": {
+                    "type": "string"
+                }
+            }
+        },
+        "server.HealthResponse": {
+            "type": "object",
+            "properties": {
+                "checks": {
+                    "type": "object",
+                    "additionalProperties": {
+                        "$ref": "#/definitions/server.HealthCheck"
+                    }
+                },
+                "status": {
+                    "type": "string"
+                }
+            }
+        },
         "server.OutdatedPackage": {
             "type": "object",
             "properties": {

diff --git a/docs/swagger/swagger.json b/docs/swagger/swagger.json
@@ -392,7 +392,7 @@
         "/health": {
             "get": {
                 "produces": [
-                    "text/plain"
+                    "application/json"
                 ],
                 "tags": [
                     "meta"
@@ -402,13 +402,13 @@
                     "200": {
                         "description": "OK",
                         "schema": {
-                            "type": "string"
+                            "$ref": "#/definitions/server.HealthResponse"
                         }
                     },
                     "503": {
                         "description": "Service Unavailable",
                         "schema": {
-                            "type": "string"
+                            "$ref": "#/definitions/server.HealthResponse"
                         }
                     }
                 }
@@ -508,6 +508,34 @@
                 }
             }
         },
+        "server.HealthCheck": {
+            "type": "object",
+            "properties": {
+                "error": {
+                    "type": "string"
+                },
+                "status": {
+                    "type": "string"
+                },
+                "step": {
+                    "type": "string"
+                }
+            }
+        },
+        "server.HealthResponse": {
+            "type": "object",
+            "properties": {
+                "checks": {
+                    "type": "object",
+                    "additionalProperties": {
+                        "$ref": "#/definitions/server.HealthCheck"
+                    }
+                },
+                "status": {
+                    "type": "string"
+                }
+            }
+        },
         "server.OutdatedPackage": {
             "type": "object",
             "properties": {

diff --git a/internal/config/config.go b/internal/config/config.go
@@ -102,6 +102,9 @@ type Config struct {
 
 	// Gradle configures Gradle HttpBuildCache behavior.
 	Gradle GradleConfig `json:"gradle" yaml:"gradle"`
+
+	// Health configures the /health endpoint behavior.
+	Health HealthConfig `json:"health" yaml:"health"`
 }
 
 // CooldownConfig configures version cooldown periods.
@@ -182,6 +185,14 @@ type GradleBuildCacheConfig struct {
 	SweepInterval string `json:"sweep_interval" yaml:"sweep_interval"`
 }
 
+// HealthConfig configures the /health endpoint (the "health" section of the config file).
+type HealthConfig struct {
+	// StorageProbeInterval is the minimum time between storage backend probes.
+	// Uses Go duration syntax (e.g. "30s", "1m"). Default: "30s". LoadFromEnv overrides it from PROXY_HEALTH_STORAGE_PROBE_INTERVAL when set.
+	// Set to "0" to probe on every /health request (useful for low-traffic deployments).
+	StorageProbeInterval string `json:"storage_probe_interval" yaml:"storage_probe_interval"`
+}
+
 // DatabaseConfig configures the cache database.
 type DatabaseConfig struct {
 	// Driver is the database driver: "sqlite" or "postgres".
@@ -343,6 +354,7 @@ func Load(path string) (*Config, error) {
 //   - PROXY_DATABASE_PATH
 //   - PROXY_LOG_LEVEL
 //   - PROXY_LOG_FORMAT
+//   - PROXY_HEALTH_STORAGE_PROBE_INTERVAL
 func (c *Config) LoadFromEnv() {
 	if v := os.Getenv("PROXY_LISTEN"); v != "" {
 		c.Listen = v
@@ -410,6 +422,9 @@ func (c *Config) LoadFromEnv() {
 	if v := os.Getenv("PROXY_GRADLE_BUILD_CACHE_SWEEP_INTERVAL"); v != "" {
 		c.Gradle.BuildCache.SweepInterval = v
 	}
+	if v := os.Getenv("PROXY_HEALTH_STORAGE_PROBE_INTERVAL"); v != "" {
+		c.Health.StorageProbeInterval = v
+	}
 }
 
 // Validate checks the configuration for errors.

diff --git a/internal/metrics/metrics.go b/internal/metrics/metrics.go
@@ -128,6 +128,14 @@ var (
 		},
 		[]string{"ecosystem"},
 	)
+
+	HealthProbeFailures = prometheus.NewCounterVec(
+		prometheus.CounterOpts{
+			Name: "proxy_health_probe_failures_total",
+			Help: "Total number of storage health probe failures, by step (write|size|read|verify|delete).",
+		},
+		[]string{"step"},
+	)
 )
 
 func init() {
@@ -147,6 +155,7 @@ func init() {
 		StorageErrors,
 		ActiveRequests,
 		IntegrityFailures,
+		HealthProbeFailures,
 	)
 }
 
@@ -192,6 +201,12 @@ func RecordIntegrityFailure(ecosystem string) {
 	IntegrityFailures.WithLabelValues(ecosystem).Inc()
 }
 
+// RecordHealthProbeFailure increments the proxy_health_probe_failures_total counter for one failed probe step.
+// step is used as the "step" label value and is expected to be one of: "write", "size", "read", "verify", "delete".
+func RecordHealthProbeFailure(step string) {
+	HealthProbeFailures.WithLabelValues(step).Inc()
+}
+
 // RecordStorageError increments storage error counter.
 func RecordStorageError(operation string) {
 	StorageErrors.WithLabelValues(operation).Inc()