Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 20 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -577,7 +577,7 @@ Recently cached:
| Endpoint | Description |
|----------|-------------|
| `GET /` | Dashboard (web UI) |
| `GET /health` | Health check (returns "ok" if healthy) |
| `GET /health` | Health check (JSON; HTTP 200 healthy, 503 unhealthy) |
| `GET /stats` | Cache statistics (JSON) |
| `GET /metrics` | Prometheus metrics |
| `GET /npm/*` | npm registry protocol |
Expand Down Expand Up @@ -815,9 +815,28 @@ The proxy exposes Prometheus metrics at `GET /metrics`. All metric names are pre
| `proxy_storage_operation_duration_seconds` | histogram | `operation` | Storage read/write latency |
| `proxy_storage_errors_total` | counter | `operation` | Storage read/write failures |
| `proxy_active_requests` | gauge | | In-flight requests |
| `proxy_health_probe_failures_total` | counter | `step` | Storage health probe failures by failing step (`write`, `size`, `read`, `verify`, `delete`) |

Cache size and artifact count are refreshed every 60 seconds. The remaining metrics update on each request.

### Health Check

`/health` returns a structured JSON report of subsystem health. The response is HTTP 200 if all checks pass and HTTP 503 if any check fails.

```json
{
"status": "ok",
"checks": {
"database": {"status": "ok"},
"storage": {"status": "ok"}
}
}
```

Failing checks include an `"error"` field. Storage failures also include a `"step"` field identifying which probe step failed (`write`, `size`, `read`, `verify`, `delete`).

Storage probe results are cached for `health.storage_probe_interval` (default 30s) to bound the cost of probing remote backends.

Scrape config for Prometheus:

```yaml
Expand Down
2 changes: 2 additions & 0 deletions cmd/proxy/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,7 @@
// PROXY_GRADLE_BUILD_CACHE_MAX_AGE - Gradle cache max age eviction
// PROXY_GRADLE_BUILD_CACHE_MAX_SIZE - Gradle cache max total size
// PROXY_GRADLE_BUILD_CACHE_SWEEP_INTERVAL - Gradle cache eviction sweep interval
// PROXY_HEALTH_STORAGE_PROBE_INTERVAL - Storage health probe cache interval (default "30s")
//
// Example:
//
Expand Down Expand Up @@ -203,6 +204,7 @@ func runServe() {
fmt.Fprintf(os.Stderr, " PROXY_GRADLE_BUILD_CACHE_MAX_AGE Gradle cache max age eviction\n")
fmt.Fprintf(os.Stderr, " PROXY_GRADLE_BUILD_CACHE_MAX_SIZE Gradle cache max total size\n")
fmt.Fprintf(os.Stderr, " PROXY_GRADLE_BUILD_CACHE_SWEEP_INTERVAL Gradle cache eviction sweep interval\n")
fmt.Fprintf(os.Stderr, " PROXY_HEALTH_STORAGE_PROBE_INTERVAL Storage health probe cache interval\n")
}

_ = fs.Parse(os.Args[1:])
Expand Down
9 changes: 9 additions & 0 deletions config.example.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -128,6 +128,15 @@ gradle:
# How often eviction runs when max_age or max_size is set
sweep_interval: "10m"

# Health endpoint configuration.
health:
# Minimum time between storage backend probes.
# The /health endpoint runs a write/size/read/verify/delete round-trip
# against the configured storage backend and caches the result for
# this interval. Set to "0" to probe on every request.
# Default: "30s".
storage_probe_interval: "30s"

# Version cooldown configuration
# Hides package versions published too recently, giving the community time
# to spot malicious releases before they're pulled into projects.
Expand Down
2 changes: 1 addition & 1 deletion docs/architecture.md
Original file line number Diff line number Diff line change
Expand Up @@ -277,7 +277,7 @@ HTTP server setup, web UI, and API handlers.
- Web UI: dashboard, package browser, source browser, version comparison
- Templates are embedded in the binary via `//go:embed`
- Enrichment API for package metadata, vulnerability scanning, and outdated detection
- Health, stats, and Prometheus metrics endpoints
- Health, stats, and Prometheus metrics endpoints. `/health` runs an active write → size → read → verify → delete probe against the storage backend and returns a structured JSON response (`HealthResponse`) with `"ok"` / `"error"` status per subsystem. Probe results are cached (default 30s, configurable via `health.storage_probe_interval`) to avoid overwhelming remote backends.

### `internal/metrics`

Expand Down
34 changes: 31 additions & 3 deletions docs/swagger/docs.go
Original file line number Diff line number Diff line change
Expand Up @@ -399,7 +399,7 @@ const docTemplate = `{
"/health": {
"get": {
"produces": [
"text/plain"
"application/json"
],
"tags": [
"meta"
Expand All @@ -409,13 +409,13 @@ const docTemplate = `{
"200": {
"description": "OK",
"schema": {
"type": "string"
"$ref": "#/definitions/server.HealthResponse"
}
},
"503": {
"description": "Service Unavailable",
"schema": {
"type": "string"
"$ref": "#/definitions/server.HealthResponse"
}
}
}
Expand Down Expand Up @@ -515,6 +515,34 @@ const docTemplate = `{
}
}
},
"server.HealthCheck": {
"type": "object",
"properties": {
"error": {
"type": "string"
},
"status": {
"type": "string"
},
"step": {
"type": "string"
}
}
},
"server.HealthResponse": {
"type": "object",
"properties": {
"checks": {
"type": "object",
"additionalProperties": {
"$ref": "#/definitions/server.HealthCheck"
}
},
"status": {
"type": "string"
}
}
},
"server.OutdatedPackage": {
"type": "object",
"properties": {
Expand Down
34 changes: 31 additions & 3 deletions docs/swagger/swagger.json
Original file line number Diff line number Diff line change
Expand Up @@ -392,7 +392,7 @@
"/health": {
"get": {
"produces": [
"text/plain"
"application/json"
],
"tags": [
"meta"
Expand All @@ -402,13 +402,13 @@
"200": {
"description": "OK",
"schema": {
"type": "string"
"$ref": "#/definitions/server.HealthResponse"
}
},
"503": {
"description": "Service Unavailable",
"schema": {
"type": "string"
"$ref": "#/definitions/server.HealthResponse"
}
}
}
Expand Down Expand Up @@ -508,6 +508,34 @@
}
}
},
"server.HealthCheck": {
"type": "object",
"properties": {
"error": {
"type": "string"
},
"status": {
"type": "string"
},
"step": {
"type": "string"
}
}
},
"server.HealthResponse": {
"type": "object",
"properties": {
"checks": {
"type": "object",
"additionalProperties": {
"$ref": "#/definitions/server.HealthCheck"
}
},
"status": {
"type": "string"
}
}
},
"server.OutdatedPackage": {
"type": "object",
"properties": {
Expand Down
15 changes: 15 additions & 0 deletions internal/config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,9 @@ type Config struct {

// Gradle configures Gradle HttpBuildCache behavior.
Gradle GradleConfig `json:"gradle" yaml:"gradle"`

// Health configures the /health endpoint behavior.
Health HealthConfig `json:"health" yaml:"health"`
}

// CooldownConfig configures version cooldown periods.
Expand Down Expand Up @@ -182,6 +185,14 @@ type GradleBuildCacheConfig struct {
SweepInterval string `json:"sweep_interval" yaml:"sweep_interval"`
}

// HealthConfig holds the settings that control the /health endpoint.
type HealthConfig struct {
	// StorageProbeInterval sets the minimum spacing between probes of the
	// storage backend, written as a Go duration string such as "30s" or "1m".
	// The default is "30s". A value of "0" makes every /health request run a
	// fresh probe, which is useful for low-traffic deployments.
	StorageProbeInterval string `json:"storage_probe_interval" yaml:"storage_probe_interval"`
}

// DatabaseConfig configures the cache database.
type DatabaseConfig struct {
// Driver is the database driver: "sqlite" or "postgres".
Expand Down Expand Up @@ -343,6 +354,7 @@ func Load(path string) (*Config, error) {
// - PROXY_DATABASE_PATH
// - PROXY_LOG_LEVEL
// - PROXY_LOG_FORMAT
// - PROXY_HEALTH_STORAGE_PROBE_INTERVAL
func (c *Config) LoadFromEnv() {
if v := os.Getenv("PROXY_LISTEN"); v != "" {
c.Listen = v
Expand Down Expand Up @@ -410,6 +422,9 @@ func (c *Config) LoadFromEnv() {
if v := os.Getenv("PROXY_GRADLE_BUILD_CACHE_SWEEP_INTERVAL"); v != "" {
c.Gradle.BuildCache.SweepInterval = v
}
if v := os.Getenv("PROXY_HEALTH_STORAGE_PROBE_INTERVAL"); v != "" {
c.Health.StorageProbeInterval = v
}
}

// Validate checks the configuration for errors.
Expand Down
15 changes: 15 additions & 0 deletions internal/metrics/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -128,6 +128,14 @@ var (
},
[]string{"ecosystem"},
)

HealthProbeFailures = prometheus.NewCounterVec(
prometheus.CounterOpts{
Name: "proxy_health_probe_failures_total",
Help: "Total number of storage health probe failures, by step (write|size|read|verify|delete).",
},
[]string{"step"},
)
)

func init() {
Expand All @@ -147,6 +155,7 @@ func init() {
StorageErrors,
ActiveRequests,
IntegrityFailures,
HealthProbeFailures,
)
}

Expand Down Expand Up @@ -192,6 +201,12 @@ func RecordIntegrityFailure(ecosystem string) {
IntegrityFailures.WithLabelValues(ecosystem).Inc()
}

// RecordHealthProbeFailure adds one to the health probe failure counter for
// the given probe step. step names the step that failed and is one of:
// "write", "size", "read", "verify", "delete".
func RecordHealthProbeFailure(step string) {
	failures := HealthProbeFailures.WithLabelValues(step)
	failures.Inc()
}

// RecordStorageError increments storage error counter.
func RecordStorageError(operation string) {
StorageErrors.WithLabelValues(operation).Inc()
Expand Down
Loading
Loading