From c98a8692d93de1ba810c9f2e6be8c7b2502329ed Mon Sep 17 00:00:00 2001 From: rajashish147 Date: Sat, 28 Mar 2026 19:42:04 +0530 Subject: [PATCH 1/3] feat: add Radix UI alert dialog and update dependencies - Added @radix-ui/react-alert-dialog version 1.1.15 to package-lock.json. - Updated dependencies for leaflet and added types for leaflet.markercluster. - Removed unused dependencies related to @radix-ui/react-dialog from package-lock.json. chore: create PowerShell script for API smoke tests - Added run_api_smoke.ps1 to perform health checks on the API after startup. - The script checks the health endpoint and logs output for troubleshooting. feat: add admin audit log and circuit breaker persistence - Created admin_audit_log table for immutable audit trails of admin actions. - Added circuit breaker state persistence to the webhooks table to prevent data loss on Redis restarts. feat: implement webhook DLQ archival and retention - Created webhook_dlq_archive table to store archived DLQ entries for permanent history. - Ensured the schema is write-once to maintain an immutable audit trail. chore: enhance RLS and search path for new tables - Enabled row-level security on admin_audit_log and webhook_dlq_archive tables. - Added explicit policies for service role and authenticated admin access. - Locked search_path for the set_updated_at function to enhance security. 
--- .github/workflows/pr.yml | 25 +- apps/api/src/app.ts | 29 +- apps/api/src/config/env.ts | 48 + .../api/src/modules/admin/audit-log.routes.ts | 72 ++ .../src/modules/admin/system-health.routes.ts | 134 +++ .../src/modules/admin/webhook-dlq.routes.ts | 146 ++++ .../modules/webhooks/webhooks.repository.ts | 9 +- .../src/modules/webhooks/webhooks.service.ts | 9 +- apps/api/src/plugins/prometheus.ts | 27 +- .../src/plugins/security/ratelimit.plugin.ts | 146 +++- apps/api/src/routes/health.ts | 7 +- apps/api/src/server.ts | 15 +- apps/api/src/utils/audit.ts | 43 + apps/api/src/utils/errors.ts | 7 + apps/api/src/utils/hmac.ts | 54 ++ apps/api/src/workers/circuit-breaker.ts | 399 +++++++++ apps/api/src/workers/retry-intents.ts | 4 +- apps/api/src/workers/startup.ts | 15 + apps/api/src/workers/webhook.queue.ts | 277 +++++- apps/api/src/workers/webhook.worker.ts | 224 ++++- .../admin/webhooks.integration.test.ts | 46 +- .../api/tests/unit/utils/webhook.unit.test.ts | 78 +- apps/web/next.config.mjs | 7 +- apps/web/package.json | 3 + .../admin/monitoring/map/EmployeeMap.tsx | 198 +++-- .../(protected)/admin/monitoring/map/page.tsx | 237 ++++- .../app/(protected)/admin/webhooks/page.tsx | 819 ++++++++++++++++++ apps/web/src/app/(protected)/profile/page.tsx | 26 +- apps/web/src/app/globals.css | 3 + apps/web/src/app/providers.tsx | 26 +- apps/web/src/components/layout/Header.tsx | 4 +- apps/web/src/components/layout/Sidebar.tsx | 12 + apps/web/src/components/ui/alert-dialog.tsx | 127 +++ apps/web/src/contexts/AuthContext.tsx | 6 +- apps/web/src/hooks/queries/useAnalytics.ts | 13 +- apps/web/src/hooks/queries/useDashboard.ts | 1 + apps/web/src/hooks/queries/useEmployees.ts | 5 +- apps/web/src/hooks/queries/useExpenses.ts | 7 +- apps/web/src/hooks/queries/useSessions.ts | 7 +- apps/web/src/hooks/queries/useWebhooks.ts | 135 +++ apps/web/src/hooks/useAuth.ts | 10 +- apps/web/src/lib/api/client.ts | 39 +- apps/web/src/lib/api/endpoints.ts | 10 + apps/web/src/lib/auth/role.ts | 
54 ++ apps/web/src/lib/query-client.ts | 35 + apps/web/src/middleware.ts | 25 +- docs/SLO.md | 136 +++ docs/WEBHOOK_SIGNATURES.md | 194 +++++ infra/grafana/dashboards/fieldtrack.json | 134 ++- infra/nginx/fieldtrack.conf | 4 +- infra/prometheus/alerts.yml | 389 ++++++++- package-lock.json | 217 +++-- run_api_smoke.ps1 | 90 ++ .../20260328134113_add_admin_audit_log.sql | 25 + ...0328134130_circuit_breaker_persistence.sql | 31 + .../20260328134140_webhook_dlq_archive.sql | 30 + ..._phase29_hardening_rls_and_search_path.sql | 65 ++ 57 files changed, 4616 insertions(+), 322 deletions(-) create mode 100644 apps/api/src/modules/admin/audit-log.routes.ts create mode 100644 apps/api/src/modules/admin/system-health.routes.ts create mode 100644 apps/api/src/modules/admin/webhook-dlq.routes.ts create mode 100644 apps/api/src/utils/audit.ts create mode 100644 apps/api/src/workers/circuit-breaker.ts create mode 100644 apps/web/src/app/(protected)/admin/webhooks/page.tsx create mode 100644 apps/web/src/components/ui/alert-dialog.tsx create mode 100644 apps/web/src/hooks/queries/useWebhooks.ts create mode 100644 apps/web/src/lib/auth/role.ts create mode 100644 docs/SLO.md create mode 100644 docs/WEBHOOK_SIGNATURES.md create mode 100644 run_api_smoke.ps1 create mode 100644 supabase/migrations/20260328134113_add_admin_audit_log.sql create mode 100644 supabase/migrations/20260328134130_circuit_breaker_persistence.sql create mode 100644 supabase/migrations/20260328134140_webhook_dlq_archive.sql create mode 100644 supabase/migrations/20260328135403_phase29_hardening_rls_and_search_path.sql diff --git a/.github/workflows/pr.yml b/.github/workflows/pr.yml index e91846f..9593662 100644 --- a/.github/workflows/pr.yml +++ b/.github/workflows/pr.yml @@ -71,6 +71,7 @@ jobs: with: node-version: '24' cache: npm + cache-dependency-path: '**/package-lock.json' - run: npm ci if: needs.detect-changes.outputs.backend == 'true' @@ -94,6 +95,7 @@ jobs: if: needs.detect-changes.outputs.backend == 
'true' run: | docker build \ + --target production \ -t fieldtrack-backend:ci-validation \ -f apps/api/Dockerfile \ . @@ -110,15 +112,31 @@ jobs: -e SUPABASE_SERVICE_ROLE_KEY=${{ secrets.SUPABASE_SERVICE_ROLE_KEY_TEST }} \ fieldtrack-backend:ci-validation - sleep 3 + STATUS="000" + for i in $(seq 1 12); do + STATUS=$(curl -s -o /dev/null -w "%{http_code}" http://127.0.0.1:3001/health || echo "000") + if [ "$STATUS" = "200" ]; then break; fi + echo "Health check attempt $i: HTTP $STATUS — waiting..." + sleep 2 + done - STATUS=$(curl -s -o /dev/null -w "%{http_code}" http://127.0.0.1:3001/health || echo "000") if [ "$STATUS" != "200" ]; then - echo "❌ /health returned HTTP $STATUS (expected 200)" + echo "❌ /health returned HTTP $STATUS after 24 s (expected 200)" docker logs fieldtrack-ci-test --tail 50 exit 1 fi + # Smoke tests: admin endpoints must reject unauthenticated requests with 401 + for ENDPOINT in /admin/audit-log /admin/webhook-dlq; do + ECODE=$(curl -s -o /dev/null -w "%{http_code}" "http://127.0.0.1:3001${ENDPOINT}" || echo "000") + if [ "$ECODE" != "401" ]; then + echo "❌ ${ENDPOINT} expected 401 (unauthenticated), got ${ECODE}" + docker logs fieldtrack-ci-test --tail 50 + exit 1 + fi + echo "✓ ${ENDPOINT} → 401 (auth guard verified)" + done + docker rm -f fieldtrack-ci-test docker rmi fieldtrack-backend:ci-validation @@ -163,6 +181,7 @@ jobs: with: node-version: '24' cache: npm + cache-dependency-path: '**/package-lock.json' - run: npm ci if: needs.detect-changes.outputs.frontend == 'true' diff --git a/apps/api/src/app.ts b/apps/api/src/app.ts index bc62682..60f0177 100644 --- a/apps/api/src/app.ts +++ b/apps/api/src/app.ts @@ -98,8 +98,9 @@ export async function buildApp(): Promise { // Performance timing — logs every response with full correlation context: // requestId, method, route, status, elapsed ms, userId, organizationId. 
- // Emits an additional WARN for responses slower than 200 ms so slow endpoints - // are immediately visible in Grafana/Loki without a query. + // Emits WARN for responses slower than 500 ms so slow endpoints are + // immediately visible in Grafana/Loki without a query. + // Emits ERROR for responses slower than 2000 ms — indicates a serious problem. app.addHook("onResponse", async (request, reply) => { const ms = Math.round(reply.elapsedTime); const logPayload = { @@ -112,8 +113,10 @@ export async function buildApp(): Promise { userId: (request as { user?: { sub?: string } }).user?.sub, organizationId: (request as { organizationId?: string }).organizationId, }; - if (ms > 200) { - request.log.warn(logPayload, "slow response"); + if (ms > 2_000) { + request.log.error({ ...logPayload, slow_request: true }, "very_slow_response"); + } else if (ms > 500) { + request.log.warn({ ...logPayload, slow_request: true }, "slow_response"); } else { request.log.info(logPayload, "response"); } @@ -146,6 +149,16 @@ export async function buildApp(): Promise { } request.log.error({ error: error.message, requestId: request.id }, "Unhandled error"); + // Track error rate — emit structured log field so Loki/Grafana can count 5xx per route + request.log.error( + { + error_rate_event: true, + route: request.routeOptions?.url ?? 
request.url, + method: request.method, + requestId: request.id, + }, + "error_rate_event", + ); void reply.status(500).send({ success: false, error: "Internal server error", @@ -181,10 +194,18 @@ export async function buildApp(): Promise { if (shouldStartWorkers()) { const { adminQueuesRoutes } = await import("./modules/admin/queues.routes.js"); const { adminRetryIntentsRoutes } = await import("./modules/admin/retry-intents.routes.js"); + const { systemHealthRoutes } = await import("./modules/admin/system-health.routes.js"); + const { webhookDlqRoutes } = await import("./modules/admin/webhook-dlq.routes.js"); await app.register(adminQueuesRoutes); await app.register(adminRetryIntentsRoutes); + await app.register(systemHealthRoutes); + await app.register(webhookDlqRoutes); } + // Admin audit log — not worker-gated (pure DB, no Redis required). + const { auditLogRoutes } = await import("./modules/admin/audit-log.routes.js"); + await app.register(auditLogRoutes); + // NOTE: Workers and startup recovery are intentionally started in server.ts // after app.listen() resolves. This keeps lifecycle explicit and prevents // accidental starts during app construction or module import. diff --git a/apps/api/src/config/env.ts b/apps/api/src/config/env.ts index 9e0a40b..18c64bf 100644 --- a/apps/api/src/config/env.ts +++ b/apps/api/src/config/env.ts @@ -297,6 +297,39 @@ const envSchema = z */ MAX_QUEUE_DEPTH: z.coerce.number().int().positive().default(1_000), + /** + * Maximum number of jobs that may sit in the webhook DLQ before new + * entries are rejected. When the cap is reached the oldest job is + * archived and evicted to make room for the new entry. + * Default: 10 000 jobs. + */ + WEBHOOK_DLQ_MAX_SIZE: z.coerce.number().int().positive().default(10_000), + + /** + * Number of days to retain webhook DLQ entries before they are purged. + * Jobs older than this are archived to webhook_dlq_archive and removed. + * Range: 14–30 days. Default: 30 days. 
+ */ + WEBHOOK_DLQ_RETENTION_DAYS: z.coerce + .number() + .int() + .min(14, "WEBHOOK_DLQ_RETENTION_DAYS must be at least 14") + .max(30, "WEBHOOK_DLQ_RETENTION_DAYS must be at most 30") + .default(30), + + /** + * Maximum webhook payload size in bytes for outbound delivery. + * Deliveries above this threshold are marked failed and moved to DLQ to + * prevent oversized payload retries from consuming worker capacity. + * Default: 256 KiB. + */ + WEBHOOK_MAX_PAYLOAD_BYTES: z.coerce + .number() + .int() + .min(16_384, "WEBHOOK_MAX_PAYLOAD_BYTES must be at least 16 KiB") + .max(1_000_000, "WEBHOOK_MAX_PAYLOAD_BYTES must be at most 1_000_000 bytes") + .default(262_144), + /** * Maximum GPS point count per session before the recalculation job is * rejected. Guards against pathological data saturating the event loop. @@ -340,6 +373,21 @@ const envSchema = z .max(50, "ANALYTICS_WORKER_CONCURRENCY must be at most 50 (database pressure above this is counterproductive)") .default(5), + /** + * Number of webhook delivery jobs the webhook worker processes + * concurrently per replica. Default 5 provides enough throughput for + * most deployments. Increase with caution — HTTP fan-out can exhaust + * the OS file descriptor limit at high concurrency. + * + * Range: 1–20. + */ + WEBHOOK_WORKER_CONCURRENCY: z.coerce + .number() + .int() + .min(1, "WEBHOOK_WORKER_CONCURRENCY must be at least 1") + .max(20, "WEBHOOK_WORKER_CONCURRENCY must be at most 20") + .default(5), + // ── Infrastructure availability ──────────────────────────────────────── /** diff --git a/apps/api/src/modules/admin/audit-log.routes.ts b/apps/api/src/modules/admin/audit-log.routes.ts new file mode 100644 index 0000000..a909df8 --- /dev/null +++ b/apps/api/src/modules/admin/audit-log.routes.ts @@ -0,0 +1,72 @@ +/** + * audit-log.routes.ts — GET /admin/audit-log + * + * Returns a paginated list of admin audit events from `public.admin_audit_log`. 
+ * Supports cursor-based pagination via `before` (ISO timestamp) and optional + * filtering by `event` type. + * + * Auth: ADMIN role required. + * Not worker-gated — pure DB (does not require Redis / BullMQ). + */ + +import type { FastifyInstance } from "fastify"; +import { z } from "zod"; +import { authenticate } from "../../middleware/auth.js"; +import { requireRole } from "../../middleware/role-guard.js"; +import { supabaseServiceClient as supabase } from "../../config/supabase.js"; +import { handleError } from "../../utils/response.js"; + +export async function auditLogRoutes(app: FastifyInstance): Promise { + app.get( + "/admin/audit-log", + { + schema: { + tags: ["admin"], + description: "Paginated admin audit log — lists privileged actions (ADMIN only).", + querystring: z.object({ + limit: z.coerce.number().int().min(1).max(200).default(50), + before: z.string().datetime({ offset: true }).optional(), + event: z.string().optional(), + }), + }, + preValidation: [authenticate, requireRole("ADMIN")], + }, + async (request, reply) => { + try { + const { limit, before, event } = request.query as { + limit: number; + before?: string; + event?: string; + }; + + let query = supabase + .from("admin_audit_log") + .select("id, event, actor_id, organization_id, resource_type, resource_id, payload, created_at") + .order("created_at", { ascending: false }) + .limit(limit); + + if (before) { + query = query.lt("created_at", before); + } + + if (event) { + query = query.eq("event", event); + } + + const { data, error } = await query; + + if (error) { + throw new Error(`[audit-log] DB query failed: ${error.message}`); + } + + reply.status(200).send({ + success: true, + data: data ?? [], + count: (data ?? 
[]).length, + }); + } catch (error) { + handleError(error, request, reply, "Failed to fetch audit log"); + } + }, + ); +} diff --git a/apps/api/src/modules/admin/system-health.routes.ts b/apps/api/src/modules/admin/system-health.routes.ts new file mode 100644 index 0000000..1018fd7 --- /dev/null +++ b/apps/api/src/modules/admin/system-health.routes.ts @@ -0,0 +1,134 @@ +/** + * system-health.routes.ts — Deep system health endpoint for operators. + * + * GET /admin/system-health + * + * Returns a single-call view of: + * - Worker status (expected 3-of-3: distance, analytics, webhook) + * - Queue backlog per queue (distance: waiting + delayed; analytics: waiting + active) + * - Webhook DLQ depth + * - Webhook delivery stats: success rate, failure count, retry count + * + * Auth: ADMIN only (JWT + role check). + * Queue depths are Redis reads — no heavy DB aggregation in the hot path. + * Webhook stats are computed in memory from at most 500 org-scoped webhook_deliveries rows. + */ + +import type { FastifyInstance } from "fastify"; +import { authenticate } from "../../middleware/auth.js"; +import { requireRole } from "../../middleware/role-guard.js"; +import { areWorkersStarted, getExpectedWorkerCount } from "../../workers/startup.js"; +import { getWebhookQueueDepth, getWebhookDlqDepth } from "../../workers/webhook.queue.js"; +import { getAnalyticsQueueStats } from "../../workers/analytics.queue.js"; +import { distanceQueue } from "../../workers/distance.queue.js"; +import { supabaseServiceClient as supabase } from "../../config/supabase.js"; +import { handleError } from "../../utils/response.js"; + +const EXPECTED_WORKER_COUNT = getExpectedWorkerCount(); // distance + analytics + webhook (driven by WORKER_TYPES) + +export async function systemHealthRoutes(app: FastifyInstance): Promise { + app.get( + "/admin/system-health", + { + schema: { + tags: ["admin"], + description: + "Deep system health: worker status, queue backlogs, DLQ depth, and webhook delivery stats (ADMIN only).", + }, + preValidation: [authenticate, 
requireRole("ADMIN")], + }, + async (request, reply) => { + try { + const orgId = (request as { organizationId?: string }).organizationId; + + // ── Worker status ────────────────────────────────────────────────── + const workersActive = areWorkersStarted() ? EXPECTED_WORKER_COUNT : 0; + + // ── Queue depths (Redis) ─────────────────────────────────────────── + const [ + webhookQueueDepth, + webhookDlqDepth, + analyticsStats, + distanceWaiting, + distanceDelayed, + ] = await Promise.allSettled([ + getWebhookQueueDepth(), + getWebhookDlqDepth(), + getAnalyticsQueueStats(), + distanceQueue.getWaitingCount(), + distanceQueue.getDelayedCount(), + ]); + + const safeNumber = (r: PromiseSettledResult) => + r.status === "fulfilled" ? r.value : -1; + + const analyticsQueueDepth = + analyticsStats.status === "fulfilled" + ? analyticsStats.value.waiting + (analyticsStats.value.active ?? 0) + : -1; + + // ── Webhook delivery stats (DB, org-scoped) ──────────────────────── + let webhookSuccessRate = 0; + let webhookFailureCount = 0; + let webhookRetryCount = 0; + let webhookTotalCount = 0; + + if (orgId) { + const { data: stats } = await supabase + .from("webhook_deliveries") + .select("status, attempt_count") + .eq("organization_id", orgId) + .limit(500); + + if (stats) { + webhookTotalCount = stats.length; + const successes = stats.filter((r) => r.status === "success").length; + webhookFailureCount = stats.filter((r) => r.status === "failed").length; + // Retry count = total attempts beyond the first across all deliveries + webhookRetryCount = stats.reduce( + (sum, r) => sum + Math.max(0, (r.attempt_count ?? 0) - 1), + 0, + ); + webhookSuccessRate = + webhookTotalCount > 0 + ? 
Math.round((successes / webhookTotalCount) * 100) + : 100; // 100% if no deliveries yet + } + } + + reply.status(200).send({ + success: true, + timestamp: new Date().toISOString(), + workers: { + active: workersActive, + expected: EXPECTED_WORKER_COUNT, + healthy: workersActive === EXPECTED_WORKER_COUNT, + }, + queues: { + webhook: { + backlog: safeNumber(webhookQueueDepth), + dlq: safeNumber(webhookDlqDepth), + }, + analytics: { + backlog: analyticsQueueDepth, + }, + distance: { + backlog: + safeNumber(distanceWaiting) >= 0 && safeNumber(distanceDelayed) >= 0 + ? safeNumber(distanceWaiting) + safeNumber(distanceDelayed) + : -1, + }, + }, + webhooks: { + successRatePct: webhookSuccessRate, + failureCount: webhookFailureCount, + retryCount: webhookRetryCount, + totalDeliveries: webhookTotalCount, + }, + }); + } catch (error) { + handleError(error, request, reply, "Failed to fetch system health"); + } + }, + ); +} diff --git a/apps/api/src/modules/admin/webhook-dlq.routes.ts b/apps/api/src/modules/admin/webhook-dlq.routes.ts new file mode 100644 index 0000000..bae65c3 --- /dev/null +++ b/apps/api/src/modules/admin/webhook-dlq.routes.ts @@ -0,0 +1,146 @@ +/** + * webhook-dlq.routes.ts — Admin API for Dead-Letter Queue (DLQ) management. + * + * GET /admin/webhook-dlq — list DLQ jobs pending review + * POST /admin/webhook-dlq/:id/replay — replay a single DLQ job (reset attempt_count) + * + * All routes require ADMIN role (JWT + RBAC). + * Only available when WORKERS_ENABLED=true (registered from app.ts). 
+ * + * Replay semantics: + * - Removes the job from the DLQ + * - Re-enqueues into the main webhook-delivery queue with attempt_number=1 + * - Resets attempt_count in DB to allow full retry schedule + * - Logs a structured audit entry on every replay + */ + +import type { FastifyInstance } from "fastify"; +import { z } from "zod"; +import { authenticate } from "../../middleware/auth.js"; +import { requireRole } from "../../middleware/role-guard.js"; +import { + replayWebhookDlqJob, + listWebhookDlqJobs, + getWebhookDlqDepth, +} from "../../workers/webhook.queue.js"; +import { supabaseServiceClient as supabase } from "../../config/supabase.js"; +import { NotFoundError } from "../../utils/errors.js"; +import { handleError } from "../../utils/response.js"; +import { insertAuditRecord } from "../../utils/audit.js"; + +const DLQ_REPLAY_COOLDOWN_MS = 5_000; +let lastDlqReplayAt = 0; + +export async function webhookDlqRoutes(app: FastifyInstance): Promise { + // ── GET /admin/webhook-dlq ───────────────────────────────────────────────── + app.get( + "/admin/webhook-dlq", + { + schema: { + tags: ["admin", "webhooks"], + description: "List jobs in the webhook Dead-Letter Queue (ADMIN only).", + querystring: z.object({ + limit: z.coerce.number().int().min(1).max(100).default(50), + }), + }, + preValidation: [authenticate, requireRole("ADMIN")], + }, + async (request, reply) => { + try { + const { limit } = request.query as { limit: number }; + const [jobs, depth] = await Promise.all([ + listWebhookDlqJobs(limit), + getWebhookDlqDepth(), + ]); + reply.status(200).send({ + success: true, + dlq_depth: depth, + jobs, + }); + } catch (error) { + handleError(error, request, reply, "Failed to list DLQ jobs"); + } + }, + ); + + // ── POST /admin/webhook-dlq/:id/replay ──────────────────────────────────── + app.post<{ Params: { id: string } }>( + "/admin/webhook-dlq/:id/replay", + { + schema: { + tags: ["admin", "webhooks"], + description: "Replay a DLQ job: re-enqueue with 
attempt_count reset (ADMIN only).", + params: z.object({ id: z.string().uuid() }), + }, + preValidation: [authenticate, requireRole("ADMIN")], + }, + async (request, reply) => { + try { + const { id: deliveryId } = request.params; + const adminId = (request as { user?: { sub?: string } }).user?.sub; + const orgId = (request as { organizationId?: string }).organizationId; + + // Process-wide replay cooldown (one module-level timestamp shared by all admins, not per admin) — prevents accidental mass re-delivery + const now = Date.now(); + const elapsed = now - lastDlqReplayAt; + if (elapsed < DLQ_REPLAY_COOLDOWN_MS) { + reply.status(429).send({ + success: false, + error: `DLQ replay rate-limited. Retry in ${DLQ_REPLAY_COOLDOWN_MS - elapsed}ms.`, + }); + return; + } + lastDlqReplayAt = now; + + const replayed = await replayWebhookDlqJob(deliveryId); + if (!replayed) { + throw new NotFoundError(`DLQ job for delivery ${deliveryId} not found`); + } + + // Reset attempt_count in DB so the full retry schedule applies. NOTE(review): the returned { error } is not checked — confirm a failed update should be ignored + await supabase + .from("webhook_deliveries") + .update({ + status: "pending", + attempt_count: 0, + next_retry_at: new Date().toISOString(), + }) + .eq("id", deliveryId); + + // Structured audit log — queryable in Grafana/Loki + request.log.info( + { + audit: true, + event: "WEBHOOK_DLQ_REPLAY", + deliveryId, + adminId, + organizationId: orgId, + timestamp: new Date().toISOString(), + }, + "webhook-dlq: DLQ job replayed by admin", + ); + + // Persist to DB audit trail for GET /admin/audit-log + await insertAuditRecord({ + event: "WEBHOOK_DLQ_REPLAY", + actor_id: adminId, + organization_id: orgId, + resource_type: "webhook_delivery", + resource_id: deliveryId, + payload: { replayed_at: new Date().toISOString() }, + }); + + reply.status(200).send({ + success: true, + data: { + delivery_id: deliveryId, + replayed_at: new Date().toISOString(), + message: "Job re-queued with attempt_count reset", + }, + }); + } catch (error) { + handleError(error, request, reply, "Failed to replay DLQ job"); + } + }, + ); +} diff --git 
a/apps/api/src/modules/webhooks/webhooks.repository.ts b/apps/api/src/modules/webhooks/webhooks.repository.ts index 860a74f..792f512 100644 --- a/apps/api/src/modules/webhooks/webhooks.repository.ts +++ b/apps/api/src/modules/webhooks/webhooks.repository.ts @@ -18,6 +18,9 @@ import type { DeliveryListQuery, } from "./webhooks.schema.js"; +const WEBHOOK_DELIVERY_COLUMNS = + "id, webhook_id, event_id, organization_id, status, attempt_count, response_status, response_body, last_attempt_at, next_retry_at, created_at"; + // ─── Webhook CRUD ───────────────────────────────────────────────────────────── export const webhooksRepository = { @@ -112,7 +115,7 @@ export const webhooksRepository = { const to = from + query.limit - 1; let q = orgTable(request, "webhook_deliveries") - .select("*", { count: "exact" }) + .select(WEBHOOK_DELIVERY_COLUMNS, { count: "exact" }) .order("created_at", { ascending: false }) .range(from, to); @@ -136,7 +139,7 @@ export const webhooksRepository = { deliveryId: string, ): Promise { const { data, error } = await orgTable(request, "webhook_deliveries") - .select("*") + .select(WEBHOOK_DELIVERY_COLUMNS) .eq("id", deliveryId) .limit(1) .maybeSingle(); @@ -172,7 +175,7 @@ export const webhooksRepository = { const { data, error } = await orgTable(request, "webhook_deliveries") .update({ status: "pending", next_retry_at: nextRetryAt }) .eq("id", deliveryId) - .select("*") + .select(WEBHOOK_DELIVERY_COLUMNS) .single(); if (error) throw new Error(`Failed to reset delivery: ${error.message}`); diff --git a/apps/api/src/modules/webhooks/webhooks.service.ts b/apps/api/src/modules/webhooks/webhooks.service.ts index da7ad3b..2187fb4 100644 --- a/apps/api/src/modules/webhooks/webhooks.service.ts +++ b/apps/api/src/modules/webhooks/webhooks.service.ts @@ -13,8 +13,9 @@ import type { FastifyRequest } from "fastify"; import { webhooksRepository } from "./webhooks.repository.js"; import { validateWebhookUrl, InvalidWebhookUrlError } from 
"../../utils/url-validator.js"; -import { BadRequestError, NotFoundError } from "../../utils/errors.js"; +import { BadRequestError, NotFoundError, ServiceUnavailableError } from "../../utils/errors.js"; import { enqueueWebhookDelivery } from "../../workers/webhook.queue.js"; +import { shouldStartWorkers } from "../../workers/startup.js"; import type { CreateWebhookBody, UpdateWebhookBody, @@ -103,6 +104,12 @@ export const webhooksService = { const delivery = await webhooksRepository.findDeliveryById(request, deliveryId); if (!delivery) throw new NotFoundError("Delivery not found"); + if (!shouldStartWorkers()) { + throw new ServiceUnavailableError( + "Workers not enabled — webhook delivery requires WORKERS_ENABLED=true", + ); + } + if (delivery.status === "pending") { throw new BadRequestError("Delivery is already pending — retry not needed"); } diff --git a/apps/api/src/plugins/prometheus.ts b/apps/api/src/plugins/prometheus.ts index ec70f0f..45044a3 100644 --- a/apps/api/src/plugins/prometheus.ts +++ b/apps/api/src/plugins/prometheus.ts @@ -226,6 +226,13 @@ export const distanceJobsTotal = new client.Counter({ * millions). Each org generates at most O(event_types × statuses) = ~15 series. * Do NOT add high-cardinality labels such as event_id or webhook_id. * + * IMPORTANT: `event_type` MUST be sanitised through `normalizeEventType()` + * (defined in webhook.worker.ts) before being used as a label value. + * Raw values from the DB payload could be arbitrary strings, creating + * unbounded cardinality. `normalizeEventType()` maps unknown values to + * `"other"`. Update KNOWN_EVENT_TYPES whenever a new EventDataMap key is + * added to event-bus.ts. 
+ * + * Not yet wired to the delivery worker — defined here so the metric is * registered in the same process-level registry as all other metrics and * appears in /metrics output from day one (with zero counters until Phase 25 @@ -239,7 +246,7 @@ export const distanceJobsTotal = new client.Counter({ export const webhookDeliveriesTotal = new client.Counter({ name: "webhook_deliveries_total", help: "Total number of webhook delivery attempts", - labelNames: ["event_type", "status", "organization_id"] as const, + labelNames: ["event_type", "status"] as const, registers: [register], }); @@ -260,7 +267,23 @@ export const webhookDeliveriesTotal = new client.Counter({ export const webhookFailuresTotal = new client.Counter({ name: "webhook_failures_total", help: "Total number of webhook deliveries that permanently failed after all retries", - labelNames: ["event_type", "organization_id"] as const, + labelNames: ["event_type"] as const, registers: [register], }); + +/** + * Total webhook delivery retries scheduled. + * Incremented each time a failed attempt is re-queued (attempt 2..MAX). + * + * Usage in the delivery worker: + * webhookRetriesTotal + * .labels({ event_type: "my.event" }) + * .inc(); + */ +export const webhookRetriesTotal = new client.Counter({ + name: "webhook_retries_total", + help: "Total number of webhook delivery retries scheduled", + labelNames: ["event_type"] as const, registers: [register], }); diff --git a/apps/api/src/plugins/security/ratelimit.plugin.ts b/apps/api/src/plugins/security/ratelimit.plugin.ts index cf0cd39..0a8cd07 100644 --- a/apps/api/src/plugins/security/ratelimit.plugin.ts +++ b/apps/api/src/plugins/security/ratelimit.plugin.ts @@ -1,27 +1,30 @@ /** - * Phase 15: Redis-backed Rate Limiting Plugin + * Rate Limiting Plugin — two-tier, Redis-backed, sliding-window. 
* - * Registers @fastify/rate-limit globally with a Redis store so that limits are - * enforced across all container replicas — never in process-memory. + * Tier 1 — Per-user (1 200 req/min): + * @fastify/rate-limit with its Redis store. The plugin already implements + * a sliding-window counter when a Redis store is supplied, replacing the + * vulnerable fixed-window that allows a burst-then-reset exploit at every + * window boundary. * - * Global defaults: 1200 requests / minute per authenticated user (keyed by - * Authorization header). This is intentionally generous — an admin polling - * the dashboard every 5 s consumes only 12 req/min. The strict cap exists to - * block runaway loops, not legitimate clients. + * Tier 2 — Per-org (5 000 req/min): + * Implemented as a Fastify preHandler hook using a Redis sorted-set sliding + * window, executed atomically via a Lua script (EVAL). The set stores the + * request timestamp as the score and a unique string as the member, so each + * request occupies exactly one slot that ages out automatically. * - * Keying by token (not IP) means multiple real users behind the same NAT or - * load-test runner each get their own independent quota. + * Algorithm (runs inside one EVAL call per request — no TOCTOU): + * 1. ZREMRANGEBYSCORE key -∞ (now - window_ms) ← evict expired entries + * 2. ZADD key now_ms "{now_ms}:{random}" ← register this request + * 3. ZCARD key ← count in-window entries + * 4. PEXPIRE key ttl_ms (≈ window_ms * 2, jittered) ← keep key alive + * 5. return count * - * Routes that need stricter limits (e.g. auth) can override via route config: + * If count > max → HTTP 429. The random suffix in the member prevents two + * concurrent requests at exactly the same millisecond from aliasing onto + * the same member and causing an under-count. * - * { - * config: { - * rateLimit: { max: 5, timeWindow: '1 minute' } - * } - * } - * - * Localhost (127.0.0.1 / ::1) is always allow-listed so health checks and - * internal tooling never trigger limits. 
+ * Localhost (127.0.0.1 / ::1) is always allow-listed. */ import fp from "fastify-plugin"; @@ -29,18 +32,52 @@ import type { FastifyInstance, FastifyPluginAsync } from "fastify"; import fastifyRateLimit from "@fastify/rate-limit"; import { shouldStartWorkers } from "../../workers/startup.js"; +// ── Tier-2 constants ───────────────────────────────────────────────────────── + +/** Per-org request ceiling per rolling window. */ +const ORG_RATE_LIMIT_MAX = 5_000; +/** Sliding-window size in milliseconds. */ +const ORG_RATE_LIMIT_WINDOW_MS = 60_000; // 1 minute + +/** + * Lua script: atomic sliding-window check + record using a sorted set. + * + * KEYS[1] → Redis key for this org's rate-limit window + * ARGV[1] → current timestamp in milliseconds (string) + * ARGV[2] → window size in milliseconds (string) + * ARGV[3] → unique member for this request (string: "{now_ms}:{random}") + * ARGV[4] → jittered TTL in milliseconds (string: window_ms * 2 plus 0–10 % of window_ms) + * + * Returns the count of requests inside the current window AFTER recording + * this request (i.e., the value to compare against the cap). + */ +const SLIDING_WINDOW_LUA = ` +local key = KEYS[1] +local now_ms = tonumber(ARGV[1]) +local window_ms = tonumber(ARGV[2]) +local member = ARGV[3] +local ttl_ms = tonumber(ARGV[4]) +local cutoff = now_ms - window_ms + +redis.call('ZREMRANGEBYSCORE', key, '-inf', cutoff) +redis.call('ZADD', key, now_ms, member) +local count = redis.call('ZCARD', key) +redis.call('PEXPIRE', key, ttl_ms) +return count +`; + const rateLimitPlugin: FastifyPluginAsync = async (fastify: FastifyInstance) => { - // Rate limiting requires Redis. Skip when workers/Redis are not provisioned - // (CI, local dev without Redis). In production, WORKERS_ENABLED=true so - // this check always passes and rate limiting is enforced. 
if (!shouldStartWorkers()) { fastify.log.info("security-rate-limit plugin SKIPPED (WORKERS_ENABLED=false — Redis not provisioned)"); return; } - // Lazy import Redis only when needed const { Redis } = await import("ioredis"); const { redisConnectionOptions } = await import("../../config/redis.js"); + + // ── Tier 1: Per-user sliding window (1 200 req/min) ───────────────────── + // @fastify/rate-limit uses a sliding-window counter internally when a + // Redis store is provided — no fixed-window burst vulnerability. const rateLimitRedis = new Redis(redisConnectionOptions); await fastify.register(fastifyRateLimit, { @@ -48,26 +85,12 @@ const rateLimitPlugin: FastifyPluginAsync = async (fastify: FastifyInstance) => hook: "preHandler", max: 1200, timeWindow: "1 minute", - - // Redis store — required for Docker / multi-instance deployments. redis: rateLimitRedis, - - // Key by validated user ID (sub claim from JWT) so each authenticated - // user gets their own quota. This is more secure than keying by the - // raw Authorization header since it uses the verified identity. - // Unauthenticated requests fall back to client IP. keyGenerator: (request) => { const user = (request as { user?: { sub?: string } }).user; - if (user?.sub) { - return `user:${user.sub}`; - } - return `ip:${request.ip}`; + return user?.sub ? `rl:user:${user.sub}` : `rl:ip:${request.ip}`; }, - - // Bypass rate limiting for localhost health checks / internal tooling. allowList: ["127.0.0.1", "::1"], - - // Return a machine-readable error body on 429. 
errorResponseBuilder: (_request, context) => ({ success: false, error: "Too many requests", @@ -75,7 +98,54 @@ const rateLimitPlugin: FastifyPluginAsync = async (fastify: FastifyInstance) => }), }); - fastify.log.info("security-rate-limit plugin registered (Redis-backed, 1200 req/min per token)"); + // ── Tier 2: Per-org sliding window (5 000 req/min) ────────────────────── + const orgRlRedis = new Redis(redisConnectionOptions); + + // Pre-load the Lua script SHA for efficient reuse. + // evalsha is ~10 % faster than eval for hot-path scripts called thousands + // of times per minute because Redis skips the parse/compile step. + const slidingWindowSha = await orgRlRedis.script("LOAD", SLIDING_WINDOW_LUA) as string; + + fastify.addHook("preHandler", async (request, reply) => { + const orgId = (request as { organizationId?: string }).organizationId; + if (!orgId) return; + if (request.ip === "127.0.0.1" || request.ip === "::1") return; + + const nowMs = Date.now(); + // Unique per-request member prevents millisecond aliasing. + const member = `${nowMs}:${Math.random().toString(36).slice(2)}`; + const key = `rl:org:${orgId}`; + // Jitter the key TTL by 0–10% of the window to prevent a synchronized + // mass-expiry storm when many org keys were created at the same time. + const ttlMs = Math.round(ORG_RATE_LIMIT_WINDOW_MS * 2 + ORG_RATE_LIMIT_WINDOW_MS * 0.1 * Math.random()); + + let count: number; + try { + // Run via pre-loaded SHA; fall back to EVAL if the script was + // flushed from script cache (e.g. Redis restart). + count = await orgRlRedis + .evalsha(slidingWindowSha, 1, key, String(nowMs), String(ORG_RATE_LIMIT_WINDOW_MS), member, String(ttlMs)) + .catch(() => + orgRlRedis.eval(SLIDING_WINDOW_LUA, 1, key, String(nowMs), String(ORG_RATE_LIMIT_WINDOW_MS), member, String(ttlMs)), + ) as number; + } catch { + // Non-fatal: if Redis is unavailable let the request through. 
+ return; + } + + if (count > ORG_RATE_LIMIT_MAX) { + // Return a consistent 429 with the standard retryAfter field. + void reply.status(429).send({ + success: false, + error: "Organization rate limit exceeded", + retryAfter: `${Math.ceil(ORG_RATE_LIMIT_WINDOW_MS / 1000)}s`, + }); + } + }); + + fastify.log.info( + "security-rate-limit plugin registered (per-user: 1200 req/min sliding, per-org: 5000 req/min sliding, Redis-backed)", + ); }; export default fp(rateLimitPlugin, { diff --git a/apps/api/src/routes/health.ts b/apps/api/src/routes/health.ts index a0a5d3d..98afbcc 100644 --- a/apps/api/src/routes/health.ts +++ b/apps/api/src/routes/health.ts @@ -1,5 +1,6 @@ import type { FastifyInstance } from "fastify"; import { getConfigHash } from "../config/env.js"; +import { shouldStartWorkers, areWorkersStarted, getExpectedWorkerCount } from "../workers/startup.js"; interface HealthResponse { status: string; @@ -73,7 +74,6 @@ export async function healthRoutes(app: FastifyInstance): Promise { const { supabaseServiceClient } = await import("../config/supabase.js"); const { distanceQueue } = await import("../workers/distance.queue.js"); const { analyticsQueue } = await import("../workers/analytics.queue.js"); - const { shouldStartWorkers, areWorkersStarted } = await import("../workers/startup.js"); const checks: ReadyResponse["checks"] = { redis: "error", @@ -106,11 +106,12 @@ export async function healthRoutes(app: FastifyInstance): Promise { checks.redis = redisResult.status === "fulfilled" ? "ok" : "error"; checks.supabase = supabaseResult.status === "fulfilled" ? "ok" : "error"; checks.bullmq = bullmqResult.status === "fulfilled" ? "ok" : "error"; + const expected = getExpectedWorkerCount(); if (!shouldStartWorkers()) { - checks.workers = { status: "skipped", active: 0, expected: 2 }; + checks.workers = { status: "skipped", active: 0, expected }; } else { const started = areWorkersStarted(); - checks.workers = { status: started ? "ok" : "error", active: started ? 
2 : 0, expected: 2 }; + checks.workers = { status: started ? "ok" : "error", active: started ? expected : 0, expected }; } const ready = checks.redis === "ok" && checks.supabase === "ok" && checks.bullmq === "ok"; diff --git a/apps/api/src/server.ts b/apps/api/src/server.ts index 5f982cc..65182c7 100644 --- a/apps/api/src/server.ts +++ b/apps/api/src/server.ts @@ -1,7 +1,7 @@ import "./tracing.js"; import { env, getConfigHash, getEnv, logStartupConfig } from "./config/env.js"; import { buildApp } from "./app.js"; -import { shouldStartWorkers } from "./workers/startup.js"; +import { shouldStartWorkers, getExpectedWorkerCount } from "./workers/startup.js"; async function start(): Promise { // Force environment validation at process startup so production fails fast. @@ -54,10 +54,21 @@ async function start(): Promise { const { startRetryIntentCleanupJob } = await import("./workers/retry-cleanup.job.js"); await startWorkers(app); - app.log.info({ activeWorkers: 2 }, "[BOOT] workers started"); + app.log.info({ activeWorkers: getExpectedWorkerCount() }, "[BOOT] workers started"); performStartupRecovery(app); void replayPendingRetryIntents(app); startRetryIntentCleanupJob(app); + + // Restore any open circuit-breaker states from DB into Redis so that + // delivery workers respect open circuits after a Redis flush/restart. 
+ const { syncCircuitBreakerState } = await import("./workers/circuit-breaker.js"); + const { getRedisConnectionOptions } = await import("./config/redis.js"); + const { Redis } = await import("ioredis"); + const cbSyncRedis = new Redis(getRedisConnectionOptions()); + cbSyncRedis.on("error", () => { /* non-fatal */ }); + void syncCircuitBreakerState(cbSyncRedis, app.log).finally(() => { + void cbSyncRedis.quit().catch(() => undefined); + }); } else { app.log.info( { diff --git a/apps/api/src/utils/audit.ts b/apps/api/src/utils/audit.ts new file mode 100644 index 0000000..489faab --- /dev/null +++ b/apps/api/src/utils/audit.ts @@ -0,0 +1,43 @@ +/** + * audit.ts — Lightweight structured audit log writer. + * + * Inserts records into `public.admin_audit_log` via the Supabase service + * client. Non-fatal: DB insertion failures are logged but never propagate + * to the caller, so a write error never breaks the primary admin action. + * + * Callers should also use `request.log.info({ audit: true, ... })` for + * structured log correlation in Loki/Grafana alongside DB records. + */ + +import { supabaseServiceClient as supabase } from "../config/supabase.js"; + +export interface AuditEntry { + event: string; + actor_id?: string | null; + organization_id?: string | null; + resource_type?: string; + resource_id?: string; + payload?: Record; +} + +/** + * Insert one record into `admin_audit_log`. + * + * Swallows any DB error and logs it as a warning — audit log failures must + * never interrupt the primary operation. + */ +export async function insertAuditRecord(entry: AuditEntry): Promise { + const { error } = await supabase.from("admin_audit_log").insert({ + event: entry.event, + actor_id: entry.actor_id ?? null, + organization_id: entry.organization_id ?? null, + resource_type: entry.resource_type ?? null, + resource_id: entry.resource_id ?? null, + payload: entry.payload ?? {}, + }); + + if (error) { + // Non-fatal: log but do not throw. 
+ console.warn("[audit] Failed to persist audit record:", error.message, { event: entry.event }); + } +} diff --git a/apps/api/src/utils/errors.ts b/apps/api/src/utils/errors.ts index 17c2309..36cba29 100644 --- a/apps/api/src/utils/errors.ts +++ b/apps/api/src/utils/errors.ts @@ -62,6 +62,13 @@ export class QueueOverloadedError extends AppError { } } +export class ServiceUnavailableError extends AppError { + constructor(message = "Service unavailable") { + super(message, 503, "SERVICE_UNAVAILABLE"); + this.name = "ServiceUnavailableError"; + } +} + // ─── Domain-specific errors ─────────────────────────────────────────────────── export class EmployeeAlreadyCheckedIn extends BadRequestError { diff --git a/apps/api/src/utils/hmac.ts b/apps/api/src/utils/hmac.ts index ee667ab..ac31044 100644 --- a/apps/api/src/utils/hmac.ts +++ b/apps/api/src/utils/hmac.ts @@ -35,6 +35,36 @@ export function generateSignature(secret: string, payload: string): string { return `sha256=${hmac.digest("hex")}`; } +// ─── Replay-Safe Signature ──────────────────────────────────────────────────── + +/** + * Generate a timestamp-bound HMAC-SHA256 signature for outbound webhook delivery. + * + * Signing body: `{timestamp}.{payload}` — this binds the signature to both the + * payload content AND the delivery time, making captured requests non-replayable + * after the tolerance window (receivers should reject timestamps older than ~5 min). + * + * Returns both the Unix timestamp (seconds) used in signing, and the signature + * string. The caller must send `X-FieldTrack-Timestamp: <timestamp>` as a header so + * the receiver can reconstruct the signed string for verification. + * + * @param secret The per-webhook signing secret. + * @param payload The raw request body string. + * @param tsSeconds Unix timestamp in seconds (defaults to `Date.now() / 1000 | 0`).
+ * @returns `{ signature: "sha256=<hex>", timestamp: number }` + */ +export function generateSignatureWithTimestamp( + secret: string, + payload: string, + tsSeconds = (Date.now() / 1000) | 0, +): { signature: string; timestamp: number } { + const signingBody = `${tsSeconds}.${payload}`; + const hmac = createHmac("sha256", secret); + hmac.update(signingBody, "utf8"); + const signature = `sha256=${hmac.digest("hex")}`; + return { signature, timestamp: tsSeconds }; +} + // ─── Signature Verification ─────────────────────────────────────────────────── /** @@ -64,3 +94,27 @@ export function verifySignature( return timingSafeEqual(Buffer.from(expected, "utf8"), Buffer.from(received, "utf8")); } + +/** + * Verify timestamp-bound signature with replay-window enforcement. + * + * Signing input must be `{timestamp}.{payload}` and the timestamp must be + * inside the accepted tolerance window (default ±300 s). + */ +export function verifySignatureWithTimestamp( + secret: string, + payload: string, + received: string, + timestampSeconds: number, + nowSeconds = Math.floor(Date.now() / 1000), + toleranceSeconds = 300, +): boolean { + if (!Number.isInteger(timestampSeconds)) return false; + if (!Number.isInteger(nowSeconds)) return false; + if (!Number.isFinite(toleranceSeconds) || toleranceSeconds < 0) return false; + if (Math.abs(nowSeconds - timestampSeconds) > toleranceSeconds) return false; + + const expected = generateSignature(secret, `${timestampSeconds}.${payload}`); + if (expected.length !== received.length) return false; + return timingSafeEqual(Buffer.from(expected, "utf8"), Buffer.from(received, "utf8")); +} diff --git a/apps/api/src/workers/circuit-breaker.ts b/apps/api/src/workers/circuit-breaker.ts new file mode 100644 index 0000000..c17561c --- /dev/null +++ b/apps/api/src/workers/circuit-breaker.ts @@ -0,0 +1,399 @@ +/** + * circuit-breaker.ts — Per-webhook circuit breaker, Redis + DB persistent.
+ * + * State model + * ─────────── + * CLOSED — normal operation (failure_streak < threshold) + * OPEN — webhook disabled; deliveries are skipped until + * circuit_open_until has elapsed + * HALF-OPEN — cooldown elapsed; next delivery attempt re-enables the webhook + * if it succeeds (or re-opens the circuit if it fails) + * + * Persistence strategy + * ──────────────────── + * Hot path (per delivery): Redis only (sub-ms reads, atomic INCR) + * Cold start / Redis flush: DB is the authoritative source of truth. + * - openCircuit() writes both Redis and DB + * - closeCircuit() clears both Redis and DB + * - syncCircuitBreakerState() is called once at startup to repopulate + * Redis from DB, guaranteeing open circuits survive Redis restarts + * + * Redis keys + * ────────── + * cb:failure_streak:{webhookId} — INCR counter, TTL=24 h + * cb:recovery_cooldown:{webhookId} — EX key, TTL=cooldown seconds + */ + +import type { FastifyBaseLogger } from "fastify"; +import type { Redis as IORedis } from "ioredis"; +import { supabaseServiceClient as supabase } from "../config/supabase.js"; +import { insertAuditRecord } from "../utils/audit.js"; + +// ─── Constants ───────────────────────────────────────────────────────────── + +/** Consecutive failures required to open the circuit. */ +export const CIRCUIT_OPEN_THRESHOLD = 5; + +/** How long a webhook stays disabled before auto-recovery attempt (ms). */ +export const CIRCUIT_RECOVERY_COOLDOWN_MS = 10 * 60_000; // 10 min + +/** How often to scan DB for expired open circuits and re-enable them. */ +export const CIRCUIT_RECOVERY_SCAN_INTERVAL_MS = 60_000; // 1 min + +/** TTL for the Redis streak key — auto-cleans idle webhooks. 
*/ +const STREAK_TTL_SECONDS = 86_400; // 24 h + +// ─── Redis key helpers ────────────────────────────────────────────────────── + +export function streakKey(webhookId: string): string { + return `cb:failure_streak:${webhookId}`; +} + +export function cooldownKey(webhookId: string): string { + return `cb:recovery_cooldown:${webhookId}`; +} + +// ─── Cold-start sync ──────────────────────────────────────────────────────── + +/** + * Re-populate Redis from DB on process start. + * + * Reads every webhook row whose circuit_open_until timestamp is still in the + * future and sets the Redis cooldown key with the remaining TTL. This ensures + * that open circuits survive a Redis flush or process restart. + * + * Call once from server.ts after workers are started. + */ +export async function syncCircuitBreakerState( + redis: IORedis, + log: FastifyBaseLogger, +): Promise { + try { + const now = new Date().toISOString(); + const { data: openWebhooks, error } = await supabase + .from("webhooks") + .select("id, circuit_open_until, failure_streak") + .gt("circuit_open_until", now); + + if (error) { + log.warn({ error: error.message }, "circuit-breaker: startup sync DB query failed"); + return; + } + + if (!openWebhooks?.length) { + log.info("circuit-breaker: startup sync — no open circuits found"); + return; + } + + const pipeline = redis.pipeline(); + for (const wh of openWebhooks) { + const openUntil = new Date(wh.circuit_open_until as string).getTime(); + const remainingMs = openUntil - Date.now(); + if (remainingMs <= 0) continue; + + const remainingSec = Math.ceil(remainingMs / 1000); + pipeline.set(cooldownKey(wh.id as string), "1", "EX", remainingSec); + if ((wh.failure_streak as number) > 0) { + pipeline.set(streakKey(wh.id as string), String(wh.failure_streak)); + pipeline.expire(streakKey(wh.id as string), STREAK_TTL_SECONDS); + } + } + await pipeline.exec(); + + log.info( + { count: openWebhooks.length }, + "circuit-breaker: startup sync — restored open circuits 
from DB", + ); + } catch (err) { + log.warn( + { error: err instanceof Error ? err.message : String(err) }, + "circuit-breaker: startup sync failed (non-fatal)", + ); + } +} + +// ─── Per-delivery operations ──────────────────────────────────────────────── + +/** + * Record a successful delivery — resets failure streak in both Redis and DB. + */ +export async function recordDeliverySuccess( + webhookId: string, + redis: IORedis, + log: FastifyBaseLogger, +): Promise { + try { + const key = streakKey(webhookId); + const prev = await redis.getdel(key); + const prevStreak = prev ? parseInt(prev, 10) : 0; + + if (prevStreak > 0) { + log.info( + { webhookId, previousStreak: prevStreak, circuitBreaker: "reset" }, + "circuit-breaker: streak reset after successful delivery", + ); + // Persist the cleared streak to DB (non-blocking — failure is non-fatal) + supabase + .from("webhooks") + .update({ failure_streak: 0 }) + .eq("id", webhookId) + .then(({ error }) => { + if (error) { + log.warn({ webhookId, error: error.message }, "circuit-breaker: failed to persist streak reset to DB"); + } + }); + } + } catch (err) { + log.warn( + { webhookId, error: err instanceof Error ? err.message : String(err) }, + "circuit-breaker: failed to reset streak (Redis error)", + ); + } +} + +/** + * Record a delivery failure — increments streak in Redis + DB, + * opens the circuit if the threshold is reached. + * + * @returns current streak count + */ +export async function recordDeliveryFailure( + webhookId: string, + redis: IORedis, + log: FastifyBaseLogger, +): Promise { + try { + const key = streakKey(webhookId); + const streak = await redis.incr(key); + await redis.expire(key, STREAK_TTL_SECONDS); + + log.info( + { webhookId, streak, threshold: CIRCUIT_OPEN_THRESHOLD, circuitBreaker: "failure" }, + "circuit-breaker: failure recorded", + ); + + // Persist streak to DB every increment so restarts see the latest value. + // Fire-and-forget — delivery path must not block on DB write. 
+ supabase + .from("webhooks") + .update({ failure_streak: streak }) + .eq("id", webhookId) + .then(({ error }) => { + if (error) { + log.warn({ webhookId, error: error.message }, "circuit-breaker: failed to persist streak to DB"); + } + }); + + if (streak >= CIRCUIT_OPEN_THRESHOLD) { + await openCircuit(webhookId, streak, redis, log); + } + + return streak; + } catch (err) { + log.warn( + { webhookId, error: err instanceof Error ? err.message : String(err) }, + "circuit-breaker: failed to record failure (Redis error)", + ); + return 0; + } +} + +// ─── Circuit open / close ─────────────────────────────────────────────────── + +/** + * Open the circuit — disable the webhook in DB + set Redis cooldown key. + * Writes `circuit_open_until` to the webhooks row for cross-restart persistence. + */ +async function openCircuit( + webhookId: string, + streak: number, + redis: IORedis, + log: FastifyBaseLogger, +): Promise { + const cooldownSeconds = Math.ceil(CIRCUIT_RECOVERY_COOLDOWN_MS / 1000); + const openUntil = new Date(Date.now() + CIRCUIT_RECOVERY_COOLDOWN_MS).toISOString(); + + log.warn( + { + webhookId, + streak, + threshold: CIRCUIT_OPEN_THRESHOLD, + recoveryCooldownMs: CIRCUIT_RECOVERY_COOLDOWN_MS, + circuitBreaker: "open", + }, + "circuit-breaker: OPEN — disabling webhook temporarily", + ); + + // DB write: persist open state + timestamp so it survives restarts. + const { error } = await supabase + .from("webhooks") + .update({ + is_active: false, + failure_streak: streak, + circuit_open_until: openUntil, + }) + .eq("id", webhookId); + + if (error) { + log.error( + { webhookId, error: error.message }, + "circuit-breaker: failed to persist OPEN state to DB", + ); + // Still set Redis key so in-process workers respect the circuit. 
+ } + + // Redis cooldown key: hot-path check + await redis.set(cooldownKey(webhookId), "1", "EX", cooldownSeconds); + + log.warn( + { webhookId, cooldownSeconds, openUntil, circuitBreaker: "open" }, + "circuit-breaker: webhook disabled, auto-recovery scheduled", + ); + + await insertAuditRecord({ + event: "CIRCUIT_BREAKER_OPENED", + resource_type: "webhook", + resource_id: webhookId, + payload: { streak, threshold: CIRCUIT_OPEN_THRESHOLD, cooldown_seconds: cooldownSeconds, open_until: openUntil }, + }); +} + +/** + * Check if a webhook's circuit is ready for auto-recovery. + * Checks Redis first (fast), falls back to DB (resilient). + * + * @returns true if the cooldown has elapsed and circuit can be closed + */ +export async function isCircuitReadyToRecover( + webhookId: string, + redis: IORedis, +): Promise { + // Primary: Redis TTL-based check + try { + const exists = await redis.exists(cooldownKey(webhookId)); + if (exists === 1) return false; // cooldown still active + } catch { + // Redis unavailable — fall through to DB check + } + + // Fallback: DB authoritative check + const { data } = await supabase + .from("webhooks") + .select("circuit_open_until") + .eq("id", webhookId) + .single(); + + if (!data?.circuit_open_until) return true; // no open circuit in DB + return new Date(data.circuit_open_until as string) <= new Date(); +} + +/** + * Close the circuit — re-enable the webhook, clear streak, clear cooldown. + * Writes to both DB and Redis. 
+ */ +export async function closeCircuit( + webhookId: string, + redis: IORedis, + log: FastifyBaseLogger, +): Promise { + log.info( + { webhookId, circuitBreaker: "closed" }, + "circuit-breaker: CLOSED — re-enabling webhook", + ); + + // DB: re-enable and clear circuit state + const { error } = await supabase + .from("webhooks") + .update({ + is_active: true, + failure_streak: 0, + circuit_open_until: null, + }) + .eq("id", webhookId); + + if (error) { + log.error({ webhookId, error: error.message }, "circuit-breaker: failed to persist CLOSED state to DB"); + } + + // Redis: clear streak + cooldown keys + await redis.del(streakKey(webhookId)); + await redis.del(cooldownKey(webhookId)); + + await insertAuditRecord({ + event: "CIRCUIT_BREAKER_CLOSED", + resource_type: "webhook", + resource_id: webhookId, + payload: {}, + }); +} + +/** Interval handle so the recovery scanner is started only once per process. */ +let _circuitRecoveryInterval: ReturnType | undefined; + +/** + * Find all expired open circuits and close them. + * + * This restores webhook activity after cooldown without requiring a new + * delivery attempt to trigger recovery logic. + */ +export async function recoverExpiredCircuits( + redis: IORedis, + log: FastifyBaseLogger, +): Promise { + const now = new Date().toISOString(); + + const { data: recoverable, error } = await supabase + .from("webhooks") + .select("id") + .eq("is_active", false) + .not("circuit_open_until", "is", null) + .lte("circuit_open_until", now) + .limit(500); + + if (error) { + log.warn({ error: error.message }, "circuit-breaker: recovery scan query failed"); + return 0; + } + + if (!recoverable?.length) return 0; + + let recovered = 0; + for (const row of recoverable as Array<{ id: string }>) { + try { + await closeCircuit(row.id, redis, log); + recovered++; + } catch (err) { + log.warn( + { + webhookId: row.id, + error: err instanceof Error ? 
err.message : String(err), + }, + "circuit-breaker: failed to recover expired circuit", + ); + } + } + + if (recovered > 0) { + log.info({ recovered }, "circuit-breaker: recovered expired circuits"); + } + + return recovered; +} + +/** Start periodic scan that closes circuits after cooldown expiration. */ +export function startCircuitRecoveryInterval( + redis: IORedis, + log: FastifyBaseLogger, +): ReturnType { + if (_circuitRecoveryInterval) return _circuitRecoveryInterval; + + void recoverExpiredCircuits(redis, log); + _circuitRecoveryInterval = setInterval( + () => { + void recoverExpiredCircuits(redis, log); + }, + CIRCUIT_RECOVERY_SCAN_INTERVAL_MS, + ); + _circuitRecoveryInterval.unref(); + return _circuitRecoveryInterval; +} diff --git a/apps/api/src/workers/retry-intents.ts b/apps/api/src/workers/retry-intents.ts index 7c3efda..d0c8dd2 100644 --- a/apps/api/src/workers/retry-intents.ts +++ b/apps/api/src/workers/retry-intents.ts @@ -29,7 +29,9 @@ function nextRetryIso(retryCount: number): string { RETRY_MAX_DELAY_SECONDS, RETRY_BASE_DELAY_SECONDS * (2 ** Math.max(0, retryCount - 1)), ); - return new Date(Date.now() + backoffSeconds * 1000).toISOString(); + // Add 0–20% jitter to prevent thundering-herd when many intents retry at once. + const jitterSeconds = backoffSeconds * 0.2 * Math.random(); + return new Date(Date.now() + (backoffSeconds + jitterSeconds) * 1000).toISOString(); } export async function persistRetryIntent( diff --git a/apps/api/src/workers/startup.ts b/apps/api/src/workers/startup.ts index ffe90f4..aa17096 100644 --- a/apps/api/src/workers/startup.ts +++ b/apps/api/src/workers/startup.ts @@ -1,6 +1,21 @@ import type { FastifyInstance } from "fastify"; import { env } from "../config/env.js"; +// ─── Worker registry ────────────────────────────────────────────────────────── + +/** + * Canonical list of all background worker types. 
+ * Adding a new worker here automatically propagates the expected count to + * /ready, /admin/system-health, and all boot logs — no manual number updates. + */ +export const WORKER_TYPES = ["distance", "analytics", "webhook"] as const; +export type WorkerType = (typeof WORKER_TYPES)[number]; + +/** Expected number of background workers in a fully-started process. */ +export function getExpectedWorkerCount(): number { + return WORKER_TYPES.length; +} + /** * Overrides accepted by shouldStartWorkers() for unit-test injection. * Production code always calls shouldStartWorkers() with no arguments. diff --git a/apps/api/src/workers/webhook.queue.ts b/apps/api/src/workers/webhook.queue.ts index 3503298..c4d7557 100644 --- a/apps/api/src/workers/webhook.queue.ts +++ b/apps/api/src/workers/webhook.queue.ts @@ -8,12 +8,23 @@ * * Job payload contains everything the worker needs to sign and deliver * the request without additional DB round-trips in the hot path. + * + * DLQ retention + * ───────────── + * - Max DLQ size: WEBHOOK_DLQ_MAX_SIZE (default 10 000 jobs) + * - Retention window: WEBHOOK_DLQ_RETENTION_DAYS (default 30 days) + * - Jobs older than the window are archived to webhook_dlq_archive (DB) + * then removed from Redis. + * - purgeDlqJobs() is called on process start and every hour by the + * purge interval started in webhook.worker.ts. 
*/ import { Queue } from "bullmq"; import { getRedisConnectionOptions } from "../config/redis.js"; import { env } from "../config/env.js"; import { QueueOverloadedError } from "../utils/errors.js"; +import { supabaseServiceClient as supabase } from "../config/supabase.js"; +import { insertAuditRecord } from "../utils/audit.js"; // ─── Job Payload ────────────────────────────────────────────────────────────── @@ -47,42 +58,41 @@ export const WEBHOOK_QUEUE_NAME = "webhook-delivery" as const; // ─── Retry back-off delays (milliseconds) ──────────────────────────────────── // // Attempt 1 → immediate (delay = 0, handled as first-try in BullMQ) -// Attempt 2 → 30 s -// Attempt 3 → 2 min -// Attempt 4 → 10 min +// Attempt 2 → 1 min +// Attempt 3 → 5 min +// Attempt 4 → 15 min // Attempt 5 → 1 h // -// This matches the spec. BullMQ's built-in exponential backoff is not used -// here because the spec defines specific absolute delays (not a geometric -// series), so we supply a custom `delay` per job via the retry handler. +// Production-grade exponential backoff matching the audit spec. +// After attempt 5 fails, the delivery moves to the Dead-Letter Queue (DLQ). export const WEBHOOK_RETRY_DELAYS_MS: ReadonlyArray = [ - 0, // attempt 1 — immediate - 30_000, // attempt 2 — 30 s - 120_000, // attempt 3 — 2 min - 600_000, // attempt 4 — 10 min - 3_600_000, // attempt 5 — 1 h + 0, // attempt 1 — immediate + 60_000, // attempt 2 — 1 min + 300_000, // attempt 3 — 5 min + 900_000, // attempt 4 — 15 min + 3_600_000, // attempt 5 — 1 h ]; export const WEBHOOK_MAX_ATTEMPTS = WEBHOOK_RETRY_DELAYS_MS.length; /** - * Calculate retry delay with ±10% jitter to prevent thundering herd. + * Calculate retry delay with one-sided 0-20% jitter to prevent thundering herd. * * Without jitter, 100 failed deliveries all retry at the same time, * creating a synchronized spike that can cascade. Jitter spreads retries * across a window, stabilizing the system. 
* - * Example: baseDelay=30s → 27-33s range (±10% jitter) + * Example: baseDelay=60s → 60-72s range (+0-20% jitter) * * @param attemptNumber 1-based attempt number (1=first retry, 2=second, etc.) * @returns delay in milliseconds for this retry */ export function calculateRetryDelay(attemptNumber: number): number { const baseDelay = WEBHOOK_RETRY_DELAYS_MS[attemptNumber - 1]; - // ±10% jitter: add/subtract up to 10% of base delay - const jitterRange = baseDelay * 0.1; - const jitterMs = jitterRange * (Math.random() * 2 - 1); // [-jitterRange, +jitterRange] + if (baseDelay === 0) return 0; // attempt 1 is always immediate — no jitter + // Mandatory formula: delay = base + random(0-20% of base) + const jitterMs = baseDelay * 0.2 * Math.random(); return Math.round(baseDelay + jitterMs); } @@ -141,6 +151,12 @@ export async function enqueueWebhookDelivery( { jobId: `delivery:${data.delivery_id}:${data.attempt_number}`, delay: delayMs, + // Priority ensures fresh first-attempt deliveries are never starved by + // a flood of retry jobs under sustained load. + // BullMQ priority: lower number = higher priority (1 = highest). + // attempt 1 (first delivery) → priority 1 — processed first + // attempt 2+ (retries) → priority 2 — processed after fresh jobs + priority: data.attempt_number === 1 ? 1 : 2, }, ); } @@ -157,3 +173,232 @@ export async function getWebhookQueueDepth(): Promise { ]); return waiting + delayed; } + +// ─── Dead-Letter Queue (DLQ) ───────────────────────────────────────────────── +// +// Jobs that exhaust all retry attempts are moved here for visibility and +// potential manual reprocessing by admins. The DLQ is a separate BullMQ +// queue so it does not pollute the main delivery queue metrics. 
+ +export const WEBHOOK_DLQ_NAME = "webhook-delivery-dlq" as const; + +let _webhookDlq: Queue | undefined; + +function getWebhookDlq(): Queue { + if (_webhookDlq) return _webhookDlq; + + _webhookDlq = new Queue(WEBHOOK_DLQ_NAME, { + connection: getRedisConnectionOptions(), + defaultJobOptions: { + attempts: 1, + removeOnComplete: false, // keep DLQ entries for admin inspection + removeOnFail: false, + }, + }); + + return _webhookDlq; +} + +/** + * Move a permanently failed delivery job to the Dead-Letter Queue. + * + * Enforces DLQ_MAX_SIZE: if the DLQ is at capacity, the oldest job is + * archived and evicted before the new job is added. + */ +export async function enqueueToDlq(data: WebhookDeliveryJobData): Promise { + const dlq = getWebhookDlq(); + + // ── Max-size guard ──────────────────────────────────────────────────────── + const depth = await dlq.getWaitingCount(); + if (depth >= env.WEBHOOK_DLQ_MAX_SIZE) { + // Evict the oldest job to stay within the cap. + const [oldest] = await dlq.getWaiting(0, 0); + if (oldest) { + await _archiveAndRemoveDlqJob(oldest, "max_size_eviction"); + } + } + + await dlq.add( + "dlq-delivery", + data, + { jobId: `dlq:${data.delivery_id}` }, + ); +} + +/** + * Return the current DLQ depth for health monitoring. + */ +export async function getWebhookDlqDepth(): Promise { + const dlq = getWebhookDlq(); + return dlq.getWaitingCount(); +} + +/** + * Replay a single DLQ job by delivery ID. + * + * Called from `POST /admin/webhook-dlq/:deliveryId/replay`. + * Moves the job back to the main delivery queue for re-attempt. + * + * @returns `true` if the job was found and replayed, `false` if not found. + */ +export async function replayWebhookDlqJob(deliveryId: string): Promise { + const dlq = getWebhookDlq(); + const jobId = `dlq:${deliveryId}`; + const job = await dlq.getJob(jobId); + if (!job) return false; + + // Re-enqueue in main queue with attempt_number reset to 1 — fresh start. 
+ const data: WebhookDeliveryJobData = { ...job.data, attempt_number: 1 }; + await enqueueWebhookDelivery(data, 0); + await job.remove(); + return true; +} + +/** + * List all jobs currently in the DLQ (up to `limit`). + * Used by the admin review UI. + */ +export async function listWebhookDlqJobs( + limit = 50, +): Promise<Array<{ jobId: string; data: WebhookDeliveryJobData; failedAt: number }>> { + const dlq = getWebhookDlq(); + const jobs = await dlq.getWaiting(0, limit - 1); + return jobs.map((j) => ({ + jobId: j.id ?? "(unknown)", + data: j.data, + failedAt: j.timestamp, + })); +} + +// ─── DLQ Retention / Purge ──────────────────────────────────────────────────── + +/** + * Archive a single DLQ job to the DB then remove it from Redis. + * Internal helper; exported for testability. Rejects (job is kept) if the archive insert fails. + */ +export async function _archiveAndRemoveDlqJob( + job: { id?: string; data: WebhookDeliveryJobData; timestamp: number }, + reason: string, +): Promise<void> { + const { data } = job; + const failedAt = new Date(job.timestamp).toISOString(); + + await supabase.from("webhook_dlq_archive").insert({ + delivery_id: data.delivery_id, + webhook_id: data.webhook_id, + event_id: data.event_id, + url: data.url, + attempt_number: data.attempt_number, + failed_at: failedAt, + reason, + }).throwOnError(); + + await insertAuditRecord({ + event: "WEBHOOK_DLQ_DELETED", + resource_type: "webhook_delivery", + resource_id: data.delivery_id, + payload: { + webhook_id: data.webhook_id, + event_id: data.event_id, + attempt_number: data.attempt_number, + failed_at: failedAt, + reason, + }, + }); + + // Remove from BullMQ only after the archive write succeeded. + // `.throwOnError()` above turns a failed archive insert into a rejection, so the job is NOT removed — + // the retention policy degrades gracefully to "keep but warn" rather than lose data. + const dlq = getWebhookDlq(); + const liveJob = job.id ? await dlq.getJob(job.id) : undefined; + await liveJob?.remove(); +} + +/** + * Purge DLQ jobs older than WEBHOOK_DLQ_RETENTION_DAYS. + * + * For each expired job: + * 1. Archive payload to webhook_dlq_archive (DB) + * 2.
Remove from BullMQ (Redis) + * + * Also enforces WEBHOOK_DLQ_MAX_SIZE: if depth still exceeds the cap after + * expiry-based purge, continues evicting oldest jobs until under the cap. + * + * Call on startup and then every hour (managed by startDlqPurgeInterval()). + * + * @returns count of jobs archived and removed + */ +export async function purgeDlqJobs(log?: { info: (msg: string, ctx?: object) => void; warn: (msg: string, ctx?: object) => void }): Promise { + const dlq = getWebhookDlq(); + const retentionMs = env.WEBHOOK_DLQ_RETENTION_DAYS * 24 * 3_600_000; + const cutoffMs = Date.now() - retentionMs; + + // Fetch all waiting jobs — DLQ is expected to be small (O(hundreds) max) + const allJobs = await dlq.getWaiting(0, -1); + const expired = allJobs.filter((j) => j.timestamp < cutoffMs); + let purgeCount = 0; + + for (const job of expired) { + try { + await _archiveAndRemoveDlqJob(job, "retention_policy"); + purgeCount++; + } catch (err) { + log?.warn("dlq-purge: failed to archive job", { + jobId: job.id, + error: err instanceof Error ? err.message : String(err), + }); + } + } + + // After expiry purge, enforce max size by evicting oldest remaining jobs. + const remaining = await dlq.getWaiting(0, -1); + const overflow = remaining.length - env.WEBHOOK_DLQ_MAX_SIZE; + if (overflow > 0) { + // Oldest first (lowest timestamp) + const toEvict = remaining + .sort((a, b) => a.timestamp - b.timestamp) + .slice(0, overflow); + + for (const job of toEvict) { + try { + await _archiveAndRemoveDlqJob(job, "max_size_eviction"); + purgeCount++; + } catch (err) { + log?.warn("dlq-purge: failed to evict overflow job", { + jobId: job.id, + error: err instanceof Error ? err.message : String(err), + }); + } + } + } + + if (purgeCount > 0) { + log?.info("dlq-purge: completed", { purgeCount, retentionDays: env.WEBHOOK_DLQ_RETENTION_DAYS }); + } + + return purgeCount; +} + +/** Purge interval handle — stored so the interval can be cleared in tests. 
*/ +let _dlqPurgeInterval: ReturnType<typeof setInterval> | undefined; + +/** + * Start the hourly DLQ purge background interval. + * Returns the interval handle for cleanup. Safe to call multiple times. + */ +export function startDlqPurgeInterval( + log?: { info: (msg: string, ctx?: object) => void; warn: (msg: string, ctx?: object) => void }, +): ReturnType<typeof setInterval> { + if (_dlqPurgeInterval) return _dlqPurgeInterval; + + // Run once immediately on startup, then every hour. A failed run is logged, never left as an unhandled rejection. + void purgeDlqJobs(log).catch((err: unknown) => { log?.warn("dlq-purge: run failed", { error: err instanceof Error ? err.message : String(err) }); }); + + _dlqPurgeInterval = setInterval( + () => { void purgeDlqJobs(log).catch((err: unknown) => { log?.warn("dlq-purge: run failed", { error: err instanceof Error ? err.message : String(err) }); }); }, + 3_600_000, // 1 hour + ); + _dlqPurgeInterval.unref(); // Don't block process exit + + return _dlqPurgeInterval; +} diff --git a/apps/api/src/workers/webhook.worker.ts b/apps/api/src/workers/webhook.worker.ts index e11dd55..2f07f41 100644 --- a/apps/api/src/workers/webhook.worker.ts +++ b/apps/api/src/workers/webhook.worker.ts @@ -4,7 +4,8 @@ * Lifecycle per job: * 1. Fetch the event payload from webhook_events. * 2. Serialize the envelope to a stable JSON string. - * 3. Generate HMAC-SHA256 signature over the raw body. + * 3. Generate timestamp-bound HMAC-SHA256 signature over `timestamp.raw_body`. + * 3b. Reject payloads above WEBHOOK_MAX_PAYLOAD_BYTES (mark failed, move to DLQ). * 4. POST to the webhook URL with a 5 s timeout. * 5. On success → mark delivery as `success`. * 6. On failure → schedule a retry (exponential delays) up to MAX_ATTEMPTS. @@ -14,7 +15,7 @@ * - DNS rebinding defence: The hostname is resolved immediately before the * HTTP request and checked against private IP ranges. * - Request timeout enforced at 5 s. - * - Signature is HMAC-SHA256(secret, rawBody), header: X-FieldTrack-Signature. + * - Signature is HMAC-SHA256(secret, `${timestamp}.${rawBody}`), header: X-FieldTrack-Signature. * * Worker gate: `startWebhookWorker()` is only called when * `shouldStartWorkers()` returns true (WORKERS_ENABLED=true AND not test env).
@@ -24,17 +25,70 @@ import { Worker } from "bullmq"; import type { Job } from "bullmq"; import type { FastifyInstance } from "fastify"; import dns from "node:dns/promises"; -import { redisConnectionOptions } from "../config/redis.js"; +import { Redis } from "ioredis"; +import { redisConnectionOptions, getRedisConnectionOptions } from "../config/redis.js"; import { supabaseServiceClient as supabase } from "../config/supabase.js"; -import { generateSignature } from "../utils/hmac.js"; +import { generateSignatureWithTimestamp } from "../utils/hmac.js"; import { subscribeToEventBus } from "./webhook-event.service.js"; import { WEBHOOK_QUEUE_NAME, WEBHOOK_MAX_ATTEMPTS, enqueueWebhookDelivery, + enqueueToDlq, calculateRetryDelay, + startDlqPurgeInterval, type WebhookDeliveryJobData, } from "./webhook.queue.js"; +import { + recordDeliverySuccess, + recordDeliveryFailure, + startCircuitRecoveryInterval, +} from "./circuit-breaker.js"; +import { + webhookDeliveriesTotal, + webhookFailuresTotal, + webhookRetriesTotal, +} from "../plugins/prometheus.js"; +import { env } from "../config/env.js"; + +// ─── Metrics helpers ────────────────────────────────────────────────────────── + +/** + * Map a raw event_type string to a bounded Prometheus label value. + * + * Prometheus label cardinality must stay bounded. Event types arrive from + * the DB payload and could theoretically be any string (e.g. from a future + * schema migration, a bug, or a bad INSERT). Mapping unknowns to "other" + * keeps the label set finite and prevents cardinality explosion. + * + * Update this set whenever a new EventDataMap key is added to event-bus.ts. + */ +const KNOWN_EVENT_TYPES = new Set([ + "employee.checked_in", + "employee.checked_out", + "expense.created", + "expense.approved", + "expense.rejected", + "employee.created", +]); + +function normalizeEventType(raw: string | undefined): string { + if (!raw) return "unknown"; + return KNOWN_EVENT_TYPES.has(raw) ? 
raw : "other"; +} + +// ─── Shared Redis client for circuit-breaker streak counters ────────────────── +// Lazy-created so tests that never call startWebhookWorker() pay zero cost. +let _cbRedis: Redis | undefined; +function getCbRedis(): Redis { + if (!_cbRedis) { + _cbRedis = new Redis(getRedisConnectionOptions()); + _cbRedis.on("error", () => { + // Swallow — circuit-breaker Redis errors are non-fatal (delivery still proceeds) + }); + } + return _cbRedis; +} // ─── Private IP ranges (DNS rebinding defence) ─────────────────────────────── @@ -58,6 +112,7 @@ function isPrivateAddress(ip: string): boolean { // ─── HTTP delivery ──────────────────────────────────────────────────────────── const DELIVERY_TIMEOUT_MS = 5_000; +const WEBHOOK_PAYLOAD_MAX_BYTES = env.WEBHOOK_MAX_PAYLOAD_BYTES; /** * Perform one HTTP delivery attempt. @@ -71,6 +126,9 @@ async function deliverWebhook( url: string, rawBody: string, signature: string, + eventType: string, + timestamp: number, + deliveryId: string, ): Promise<{ status: number; body: string }> { // ── DNS rebinding defence ────────────────────────────────────────────────── const parsed = new URL(url); @@ -105,12 +163,17 @@ async function deliverWebhook( const response = await fetch(url, { method: "POST", headers: { - "Content-Type": "application/json", - "X-FieldTrack-Signature": signature, - "X-FieldTrack-Event": "webhook-delivery", - "User-Agent": "FieldTrack-Webhooks/1.0", + "Content-Type": "application/json", + "X-FieldTrack-Signature": signature, + "X-FieldTrack-Event": eventType, + "X-FieldTrack-Timestamp": String(timestamp), + "X-FieldTrack-Delivery-Id": deliveryId, + "User-Agent": "FieldTrack-Webhooks/1.0", }, body: rawBody, + // Never follow redirects — a redirect could point to an internal address + // that bypassed the SSRF DNS check performed above. 
+ redirect: "error", signal: controller.signal, }); @@ -208,8 +271,25 @@ async function scheduleRetryOrFail( } else { app.log.warn( { deliveryId, webhookId: webhook_id, attemptNumber }, - "webhook.worker: max attempts reached, delivery permanently failed", + "webhook.worker: max attempts reached, moving delivery to DLQ", ); + // Move to Dead-Letter Queue so the delivery remains visible to admins. + try { + await enqueueToDlq({ + delivery_id: deliveryId, + webhook_id, + event_id, + url, + secret, + attempt_number: attemptNumber, + }); + } catch (dlqErr: unknown) { + const msg = dlqErr instanceof Error ? dlqErr.message : String(dlqErr); + app.log.error( + { deliveryId, webhookId: webhook_id, error: msg }, + "webhook.worker: failed to enqueue to DLQ — delivery already marked failed in DB", + ); + } } } @@ -240,6 +320,25 @@ export function startWebhookWorker(app: FastifyInstance): Worker | null { "webhook.worker: processing delivery job", ); + // ── Idempotency guard ──────────────────────────────────────────────── + // Skip the job when the delivery row is already marked `success` (any other status proceeds). + // Prevents duplicate delivery if BullMQ re-delivers a job (e.g. after an + // ungraceful shutdown) or an admin manually retried while a queued job + // was already in flight.
+ const { data: deliveryCheck } = await supabase + .from("webhook_deliveries") + .select("status") + .eq("id", delivery_id) + .single(); + + if (deliveryCheck && deliveryCheck.status === "success") { + app.log.info( + { deliveryId: delivery_id, webhookId: webhook_id }, + "webhook.worker: delivery already succeeded \u2014 skipping duplicate job", + ); + return; + } + // ── Fetch event payload ────────────────────────────────────────────── const { data: eventRow, error: fetchError } = await supabase .from("webhook_events") @@ -265,20 +364,85 @@ export function startWebhookWorker(app: FastifyInstance): Worker | null { // ── Build and sign the request body ─────────────────────────────────── const rawBody = JSON.stringify(eventRow.payload); - const signature = generateSignature(secret, rawBody); + const { signature, timestamp: deliveryTs } = generateSignatureWithTimestamp(secret, rawBody); + // Extract the event type from the envelope payload for the request header. + // The payload is the full EventEnvelope which always carries a `type` field. + const eventType = + (eventRow.payload as Record).type as string | undefined + ?? 
"webhook-delivery"; + + const payloadBytes = Buffer.byteLength(rawBody, "utf8"); + if (payloadBytes > WEBHOOK_PAYLOAD_MAX_BYTES) { + const message = + `Payload size ${payloadBytes} bytes exceeds cap ${WEBHOOK_PAYLOAD_MAX_BYTES} bytes`; + app.log.error( + { + deliveryId: delivery_id, + webhookId: webhook_id, + eventId: event_id, + payloadBytes, + maxBytes: WEBHOOK_PAYLOAD_MAX_BYTES, + }, + "webhook.worker: payload exceeds size cap, marking failed", + ); + + await supabase + .from("webhook_deliveries") + .update({ + status: "failed", + attempt_count: attempt_number, + response_body: message, + last_attempt_at: new Date().toISOString(), + next_retry_at: null, + }) + .eq("id", delivery_id); + + webhookDeliveriesTotal + .labels({ event_type: normalizeEventType(eventType), status: "failed" }) + .inc(); + webhookFailuresTotal + .labels({ event_type: normalizeEventType(eventType) }) + .inc(); + + try { + await enqueueToDlq({ + delivery_id, + webhook_id, + event_id, + url, + secret, + attempt_number, + }); + } catch (dlqErr: unknown) { + app.log.error( + { + deliveryId: delivery_id, + webhookId: webhook_id, + error: dlqErr instanceof Error ? 
dlqErr.message : String(dlqErr), + }, + "webhook.worker: failed to enqueue oversize payload delivery to DLQ", + ); + } + return; + } // ── Deliver ─────────────────────────────────────────────────────────── try { - const { status, body } = await deliverWebhook(url, rawBody, signature); + const { status, body } = await deliverWebhook(url, rawBody, signature, eventType, deliveryTs, delivery_id); const succeeded = status >= 200 && status < 300; if (succeeded) { await markSuccess(delivery_id, status, body); + await recordDeliverySuccess(webhook_id, getCbRedis(), app.log); + webhookDeliveriesTotal + .labels({ event_type: normalizeEventType(eventType), status: "success" }) + .inc(); app.log.info( { deliveryId: delivery_id, webhookId: webhook_id, responseStatus: status }, "webhook.worker: delivery succeeded", ); } else { + const willRetry = attempt_number + 1 <= WEBHOOK_MAX_ATTEMPTS; app.log.warn( { deliveryId: delivery_id, @@ -288,6 +452,19 @@ export function startWebhookWorker(app: FastifyInstance): Worker | null { }, "webhook.worker: delivery got non-2xx response, scheduling retry", ); + await recordDeliveryFailure(webhook_id, getCbRedis(), app.log); + webhookDeliveriesTotal + .labels({ event_type: normalizeEventType(eventType), status: "failed" }) + .inc(); + if (willRetry) { + webhookRetriesTotal + .labels({ event_type: normalizeEventType(eventType) }) + .inc(); + } else { + webhookFailuresTotal + .labels({ event_type: normalizeEventType(eventType) }) + .inc(); + } await scheduleRetryOrFail( delivery_id, webhook_id, @@ -302,6 +479,7 @@ export function startWebhookWorker(app: FastifyInstance): Worker | null { } } catch (err: unknown) { const message = err instanceof Error ? 
err.message : String(err); + const willRetry = attempt_number + 1 <= WEBHOOK_MAX_ATTEMPTS; app.log.error( { deliveryId: delivery_id, @@ -311,6 +489,19 @@ export function startWebhookWorker(app: FastifyInstance): Worker | null { }, "webhook.worker: delivery attempt threw error, scheduling retry", ); + await recordDeliveryFailure(webhook_id, getCbRedis(), app.log); + webhookDeliveriesTotal + .labels({ event_type: normalizeEventType(eventType), status: "error" }) + .inc(); + if (willRetry) { + webhookRetriesTotal + .labels({ event_type: normalizeEventType(eventType) }) + .inc(); + } else { + webhookFailuresTotal + .labels({ event_type: normalizeEventType(eventType) }) + .inc(); + } await scheduleRetryOrFail( delivery_id, webhook_id, @@ -326,7 +517,7 @@ export function startWebhookWorker(app: FastifyInstance): Worker | null { }, { connection: redisConnectionOptions, - concurrency: 5, + concurrency: env.WEBHOOK_WORKER_CONCURRENCY, lockDuration: 30_000, }, ); @@ -339,6 +530,13 @@ export function startWebhookWorker(app: FastifyInstance): Worker | null { ); }); - app.log.info("webhook.worker: started"); + // Start the hourly DLQ retention purge (archives expired jobs, enforces max size). + startDlqPurgeInterval(app.log); + startCircuitRecoveryInterval(getCbRedis(), app.log); + + app.log.info( + { concurrency: env.WEBHOOK_WORKER_CONCURRENCY }, + "webhook.worker: started", + ); return worker; } diff --git a/apps/api/tests/integration/admin/webhooks.integration.test.ts b/apps/api/tests/integration/admin/webhooks.integration.test.ts index 70a40a0..80de00c 100644 --- a/apps/api/tests/integration/admin/webhooks.integration.test.ts +++ b/apps/api/tests/integration/admin/webhooks.integration.test.ts @@ -21,6 +21,14 @@ vi.mock("../../../src/config/redis.js", () => ({ redisConnectionOptions: {}, })); +// shouldStartWorkers must return true so the retry endpoint does not reject +// with 503 "Workers not enabled" in test context. 
+vi.mock("../../../src/workers/startup.js", () => ({ + shouldStartWorkers: vi.fn().mockReturnValue(true), + areWorkersStarted: vi.fn().mockReturnValue(true), + startWorkers: vi.fn().mockResolvedValue(undefined), +})); + vi.mock("../../../src/workers/distance.queue.js", () => ({ enqueueDistanceJob: vi.fn().mockResolvedValue(undefined), })); @@ -31,10 +39,12 @@ vi.mock("../../../src/workers/analytics.queue.js", () => ({ vi.mock("../../../src/workers/webhook.queue.js", () => ({ enqueueWebhookDelivery: vi.fn().mockResolvedValue(undefined), + enqueueToDlq: vi.fn().mockResolvedValue(undefined), WEBHOOK_QUEUE_NAME: "webhook-delivery", - WEBHOOK_RETRY_DELAYS_MS: [0, 30_000, 120_000, 600_000, 3_600_000], + WEBHOOK_RETRY_DELAYS_MS: [0, 60_000, 300_000, 900_000, 3_600_000], WEBHOOK_MAX_ATTEMPTS: 5, getWebhookQueueDepth: vi.fn().mockResolvedValue(0), + getWebhookDlqDepth: vi.fn().mockResolvedValue(0), })); vi.mock("../../../src/modules/webhooks/webhooks.repository.js", () => ({ @@ -377,6 +387,23 @@ describe("Webhooks Admin API", () => { expect(res.statusCode).toBe(200); }); + + it("returns 403 for EMPLOYEE role", async () => { + const res = await app.inject({ + method: "GET", + url: "/admin/webhook-deliveries", + headers: { authorization: `Bearer ${employeeToken}` }, + }); + expect(res.statusCode).toBe(403); + }); + + it("returns 401 with no token", async () => { + const res = await app.inject({ + method: "GET", + url: "/admin/webhook-deliveries", + }); + expect(res.statusCode).toBe(401); + }); }); // ─── POST /admin/webhook-deliveries/:id/retry ─────────────────────────────── @@ -436,5 +463,22 @@ describe("Webhooks Admin API", () => { expect(res.statusCode).toBe(400); }); + + it("returns 403 for EMPLOYEE role", async () => { + const res = await app.inject({ + method: "POST", + url: `/admin/webhook-deliveries/${DELIVERY_ID}/retry`, + headers: { authorization: `Bearer ${employeeToken}` }, + }); + expect(res.statusCode).toBe(403); + }); + + it("returns 401 with no token", async 
() => { + const res = await app.inject({ + method: "POST", + url: `/admin/webhook-deliveries/${DELIVERY_ID}/retry`, + }); + expect(res.statusCode).toBe(401); + }); }); }); diff --git a/apps/api/tests/unit/utils/webhook.unit.test.ts b/apps/api/tests/unit/utils/webhook.unit.test.ts index 54d8b02..28000b0 100644 --- a/apps/api/tests/unit/utils/webhook.unit.test.ts +++ b/apps/api/tests/unit/utils/webhook.unit.test.ts @@ -12,7 +12,15 @@ * graph can be wired up without vi.doMock / vi.resetModules complications. */ -import { describe, it, expect } from "vitest"; +import { describe, it, expect, vi } from "vitest"; + +// Mock Redis and BullMQ queue creation so importing webhook.queue.ts does not +// attempt a real TCP connection to redis://localhost:6379 in unit-test context. +vi.mock("../../../src/config/redis.js", () => ({ + redisClient: { on: vi.fn(), quit: vi.fn(), disconnect: vi.fn() }, + getRedisConnectionOptions: vi.fn().mockReturnValue({ host: "localhost", port: 6379 }), + redisConnectionOptions: { host: "localhost", port: 6379 }, +})); // ─── hmac.ts ───────────────────────────────────────────────────────────────── @@ -43,6 +51,18 @@ describe("generateSignature", () => { }); }); +describe("generateSignatureWithTimestamp", () => { + it("should sign timestamp.payload and return timestamp", async () => { + const { generateSignature, generateSignatureWithTimestamp } = await import("../../../src/utils/hmac.js"); + const payload = JSON.stringify({ id: "evt_123" }); + const ts = 1_700_000_000; + const { signature, timestamp } = generateSignatureWithTimestamp("secret", payload, ts); + + expect(timestamp).toBe(ts); + expect(signature).toBe(generateSignature("secret", `${ts}.${payload}`)); + }); +}); + describe("verifySignature", () => { it("should return true for a correctly generated signature", async () => { const { generateSignature, verifySignature } = await import("../../../src/utils/hmac.js"); @@ -72,6 +92,34 @@ describe("verifySignature", () => { }); }); 
+describe("verifySignatureWithTimestamp", () => { + it("returns true for valid signature inside tolerance", async () => { + const { generateSignatureWithTimestamp, verifySignatureWithTimestamp } = await import( + "../../../src/utils/hmac.js" + ); + const payload = JSON.stringify({ type: "expense.created" }); + const now = 1_700_000_000; + const { signature, timestamp } = generateSignatureWithTimestamp("secret", payload, now - 60); + + expect( + verifySignatureWithTimestamp("secret", payload, signature, timestamp, now, 300), + ).toBe(true); + }); + + it("returns false when timestamp is outside tolerance", async () => { + const { generateSignatureWithTimestamp, verifySignatureWithTimestamp } = await import( + "../../../src/utils/hmac.js" + ); + const payload = JSON.stringify({ type: "expense.created" }); + const now = 1_700_000_000; + const { signature, timestamp } = generateSignatureWithTimestamp("secret", payload, now - 400); + + expect( + verifySignatureWithTimestamp("secret", payload, signature, timestamp, now, 300), + ).toBe(false); + }); +}); + // ─── url-validator.ts ───────────────────────────────────────────────────────── describe("validateWebhookUrl", () => { @@ -124,11 +172,29 @@ describe("WEBHOOK_RETRY_DELAYS_MS", () => { ); expect(WEBHOOK_MAX_ATTEMPTS).toBe(5); expect(WEBHOOK_RETRY_DELAYS_MS).toHaveLength(5); - expect(WEBHOOK_RETRY_DELAYS_MS[0]).toBe(0); // attempt 1 immediate - expect(WEBHOOK_RETRY_DELAYS_MS[1]).toBe(30_000); // attempt 2 → 30 s - expect(WEBHOOK_RETRY_DELAYS_MS[2]).toBe(120_000); // attempt 3 → 2 min - expect(WEBHOOK_RETRY_DELAYS_MS[3]).toBe(600_000); // attempt 4 → 10 min - expect(WEBHOOK_RETRY_DELAYS_MS[4]).toBe(3_600_000); // attempt 5 → 1 h + expect(WEBHOOK_RETRY_DELAYS_MS[0]).toBe(0); // attempt 1 immediate + expect(WEBHOOK_RETRY_DELAYS_MS[1]).toBe(60_000); // attempt 2 → 1 min + expect(WEBHOOK_RETRY_DELAYS_MS[2]).toBe(300_000); // attempt 3 → 5 min + expect(WEBHOOK_RETRY_DELAYS_MS[3]).toBe(900_000); // attempt 4 → 15 min + 
expect(WEBHOOK_RETRY_DELAYS_MS[4]).toBe(3_600_000); // attempt 5 → 1 h + }); +}); + +describe("calculateRetryDelay", () => { + it("should never return less than base delay", async () => { + const { calculateRetryDelay, WEBHOOK_RETRY_DELAYS_MS } = await import( + "../../../src/workers/webhook.queue.js" + ); + vi.spyOn(Math, "random").mockReturnValue(0); + expect(calculateRetryDelay(2)).toBe(WEBHOOK_RETRY_DELAYS_MS[1]); + vi.restoreAllMocks(); + }); + + it("should cap jitter at +20% when Math.random() is 1", async () => { + const { calculateRetryDelay } = await import("../../../src/workers/webhook.queue.js"); + vi.spyOn(Math, "random").mockReturnValue(1); + expect(calculateRetryDelay(2)).toBe(72_000); // 60_000 + 20% + vi.restoreAllMocks(); }); }); diff --git a/apps/web/next.config.mjs b/apps/web/next.config.mjs index f3daa52..6173a49 100644 --- a/apps/web/next.config.mjs +++ b/apps/web/next.config.mjs @@ -1,5 +1,7 @@ /** @type {import('next').NextConfig} */ +const isDev = process.env.NODE_ENV === 'development'; + // NEXT_PUBLIC_API_BASE_URL controls how the browser reaches the backend. // // Mode A — Direct (recommended for Vercel): @@ -47,6 +49,7 @@ const nextConfig = { "https://*.tiles.mapbox.com", // Mapbox raster / vector tiles "https://api.mapbox.com", // Mapbox geocoding, directions, styles "https://events.mapbox.com", // Mapbox telemetry + "https://*.tile.openstreetmap.org", // Leaflet / OpenStreetMap tiles ]; // Only add the API origin when it is a full URL — same-origin requests // (/api/proxy path) are already covered by 'self' above. @@ -67,7 +70,9 @@ const nextConfig = { key: "Content-Security-Policy", value: [ "default-src 'self'", - "script-src 'self' 'unsafe-inline'", + // In development, Next.js Fast Refresh (HMR) requires 'unsafe-eval'. + // Without it the React event system breaks and forms submit natively. + isDev ? 
"script-src 'self' 'unsafe-inline' 'unsafe-eval'" : "script-src 'self' 'unsafe-inline'", "style-src 'self' 'unsafe-inline'", // blob: required for Mapbox GL sprite / image atlas "img-src 'self' data: blob: https:", diff --git a/apps/web/package.json b/apps/web/package.json index db821bf..19d2cce 100644 --- a/apps/web/package.json +++ b/apps/web/package.json @@ -15,6 +15,7 @@ }, "dependencies": { "@fieldtrack/types": "*", + "@radix-ui/react-alert-dialog": "^1.1.15", "@radix-ui/react-avatar": "^1.1.2", "@radix-ui/react-dialog": "^1.1.4", "@radix-ui/react-dropdown-menu": "^2.1.4", @@ -28,10 +29,12 @@ "@supabase/supabase-js": "^2.46.2", "@tanstack/react-query": "^5.62.7", "@types/leaflet": "^1.9.21", + "@types/leaflet.markercluster": "^1.5.6", "class-variance-authority": "^0.7.1", "clsx": "^2.1.1", "framer-motion": "^12.36.0", "leaflet": "^1.9.4", + "leaflet.markercluster": "^1.5.3", "lucide-react": "^0.468.0", "mapbox-gl": "^3.8.0", "next": "^15.1.3", diff --git a/apps/web/src/app/(protected)/admin/monitoring/map/EmployeeMap.tsx b/apps/web/src/app/(protected)/admin/monitoring/map/EmployeeMap.tsx index 2191dbc..8efbb8b 100644 --- a/apps/web/src/app/(protected)/admin/monitoring/map/EmployeeMap.tsx +++ b/apps/web/src/app/(protected)/admin/monitoring/map/EmployeeMap.tsx @@ -1,61 +1,80 @@ "use client"; /** - * EmployeeMap — Leaflet map component. + * EmployeeMap — Leaflet map with MarkerClusterGroup support. * * Imported dynamically with `ssr: false` from the parent page because Leaflet * accesses `window` at module initialisation time and will crash Next.js SSR. * * Marker colour scheme: - * ACTIVE → green (checked in within the last 2 hours) - * RECENT → orange (checked out, still this calendar day) - * INACTIVE → grey (no session activity today) + * ACTIVE → green (checked in within the last 2 hours) + * RECENT → orange (checked out, still this calendar day) + * INACTIVE → grey (no session activity today) + * + * Selected employee → enlarged SVG + pulsing ring overlay. 
+ * Clustering → nearby markers grouped at low zoom via MarkerClusterGroup. */ import { useEffect, useRef } from "react"; import type { Map as LeafletMap, Marker as LeafletMarker } from "leaflet"; import L from "leaflet"; +import "leaflet.markercluster"; import type { EmployeeMapMarker } from "@/types"; -// ─── Marker icon colours matching status ────────────────────────────────────── +// ─── Marker icon colours ────────────────────────────────────────────────────── const STATUS_COLOURS: Record = { - ACTIVE: "#22c55e", // green-500 - RECENT: "#f97316", // orange-500 - INACTIVE: "#94a3b8", // slate-400 + ACTIVE: "#22c55e", // green-500 + RECENT: "#f97316", // orange-500 + INACTIVE: "#94a3b8", // slate-400 }; -function makeIcon(status: EmployeeMapMarker["status"]) { +function makeIcon( + status: EmployeeMapMarker["status"], + selected = false +) { const colour = STATUS_COLOURS[status]; - // Inline SVG circle marker — avoids the default Leaflet PNG which requires - // webpack file-loader config. Works in all build setups without extra config. + const size = selected ? 32 : 24; + const inner = selected ? 8 : 5; + + const pulse = selected + ? ` + + + ` + : ""; + const svg = ` - - - + + ${pulse} + + `.trim(); return L.divIcon({ html: svg, - className: "", // prevent Leaflet's default white-box class - iconSize: [24, 24], - iconAnchor: [12, 12], - popupAnchor: [0, -14], + className: "", // prevent Leaflet's default white-box class + iconSize: [size, size], + iconAnchor: [size / 2, size / 2], + popupAnchor: [0, -(size / 2 + 2)], }); } -// ─── Popup HTML (pure string — Leaflet renders these) ───────────────────────── +// ─── Popup HTML ─────────────────────────────────────────────────────────────── function buildPopupHtml(m: EmployeeMapMarker): string { - const ts = new Date(m.recordedAt).toLocaleString(); + const ts = new Date(m.recordedAt).toLocaleString(); const code = m.employeeCode ? ` (${m.employeeCode})` : ""; - const statusColour = m.status === "ACTIVE" ? 
"green" : m.status === "RECENT" ? "orange" : "grey"; + const statusColour = + m.status === "ACTIVE" ? "#22c55e" : + m.status === "RECENT" ? "#f97316" : "#94a3b8"; + return ` -
+
${m.employeeName}${code}
- ${m.status}
- Last fix: ${ts} + ${m.status}
+ Last fix: ${ts}
`.trim(); } @@ -63,28 +82,26 @@ function buildPopupHtml(m: EmployeeMapMarker): string { // ─── Component ──────────────────────────────────────────────────────────────── interface Props { - markers: EmployeeMapMarker[]; - isLoading: boolean; + markers: EmployeeMapMarker[]; + isLoading: boolean; + selectedEmployeeId?: string | null; } -export default function EmployeeMap({ markers, isLoading }: Props) { +export default function EmployeeMap({ markers, isLoading, selectedEmployeeId }: Props) { const mapContainerRef = useRef(null); - const mapRef = useRef(null); - const markerLayerRef = useRef([]); + const mapRef = useRef(null); + const clusterGroupRef = useRef(null); + // Track current markers by employeeId → Leaflet marker + const markerMapRef = useRef>(new Map()); - // Initialise Leaflet map once + // ── Initialise Leaflet map once ──────────────────────────────────────────── useEffect(() => { if (!mapContainerRef.current || mapRef.current) return; + const container = mapContainerRef.current; - // Leaflet's default icon path breaks with webpack/Next.js — fix it - // by telling it to use an empty icon. We override icons per-marker anyway. - // @ts-expect-error _getIconUrl is an internal Leaflet method - delete L.Icon.Default.prototype._getIconUrl; - L.Icon.Default.mergeOptions({ iconUrl: "", shadowUrl: "" }); - - const map = L.map(mapContainerRef.current, { - center: [20, 0], // world view until we fit to markers - zoom: 2, + const map = L.map(container, { + center: [20, 0], + zoom: 2, zoomControl: true, }); @@ -94,46 +111,106 @@ export default function EmployeeMap({ markers, isLoading }: Props) { maxZoom: 19, }).addTo(map); + // Marker cluster group with custom cluster icon + const clusterGroup = L.markerClusterGroup({ + maxClusterRadius: 60, + showCoverageOnHover: false, + iconCreateFunction(cluster) { + const count = cluster.getChildCount(); + return L.divIcon({ + html: `
${count}
`, + className: "", + iconSize: [36, 36], + iconAnchor: [18, 18], + }); + }, + }); + map.addLayer(clusterGroup); + clusterGroupRef.current = clusterGroup; mapRef.current = map; + // ResizeObserver → invalidateSize when container dimensions change + const ro = new ResizeObserver(() => map.invalidateSize({ animate: false })); + ro.observe(container); + + const raf = requestAnimationFrame(() => map.invalidateSize({ animate: false })); + return () => { + ro.disconnect(); + cancelAnimationFrame(raf); map.remove(); - mapRef.current = null; + mapRef.current = null; + clusterGroupRef.current = null; + markerMapRef.current.clear(); }; }, []); - // Update markers whenever data changes + // ── Sync markers when data or selection changes ──────────────────────────── useEffect(() => { - const map = mapRef.current; - if (!map) return; - - // Remove old markers - for (const m of markerLayerRef.current) { - m.remove(); + const map = mapRef.current; + const clusterGroup = clusterGroupRef.current; + if (!map || !clusterGroup) return; + + const incoming = new Map(markers.map((m) => [m.employeeId, m])); + const existing = markerMapRef.current; + + // Remove markers no longer in the data set + for (const [id, leafletMarker] of existing) { + if (!incoming.has(id)) { + clusterGroup.removeLayer(leafletMarker); + existing.delete(id); + } } - markerLayerRef.current = []; - - if (markers.length === 0) return; - const newMarkers: LeafletMarker[] = []; const latLngs: [number, number][] = []; + const toAdd: LeafletMarker[] = []; for (const m of markers) { - const icon = makeIcon(m.status); - const marker = L.marker([m.latitude, m.longitude], { icon }) - .addTo(map) - .bindPopup(buildPopupHtml(m)); - newMarkers.push(marker); + const isSelected = selectedEmployeeId === m.employeeId; + const icon = makeIcon(m.status, isSelected); latLngs.push([m.latitude, m.longitude]); + + if (existing.has(m.employeeId)) { + // Update existing marker position + icon (smooth move, no remove/re-add) + const lm = 
existing.get(m.employeeId)!; + lm.setLatLng([m.latitude, m.longitude]); + lm.setIcon(icon); + lm.setPopupContent(buildPopupHtml(m)); + } else { + // New marker + const lm = L.marker([m.latitude, m.longitude], { icon }).bindPopup(buildPopupHtml(m)); + existing.set(m.employeeId, lm); + toAdd.push(lm); + } } - markerLayerRef.current = newMarkers; + if (toAdd.length > 0) { + clusterGroup.addLayers(toAdd); + } - // Fit the map to show all markers (with a small padding) - if (latLngs.length > 0) { + markerMapRef.current = existing; + + // Auto-centre: only on first data load (when no markers existed before) + if (latLngs.length > 0 && existing.size === toAdd.length) { map.fitBounds(L.latLngBounds(latLngs), { padding: [40, 40], maxZoom: 14 }); } - }, [markers]); + + // Pan to selected employee marker if it exists + if (selectedEmployeeId) { + const sel = existing.get(selectedEmployeeId); + if (sel) { + map.setView(sel.getLatLng(), Math.max(map.getZoom(), 13), { animate: true }); + sel.openPopup(); + } + } + }, [markers, selectedEmployeeId]); return (
@@ -142,7 +219,6 @@ export default function EmployeeMap({ markers, isLoading }: Props) { Loading positions…
)} - {/* The map mounts into this div */}
); diff --git a/apps/web/src/app/(protected)/admin/monitoring/map/page.tsx b/apps/web/src/app/(protected)/admin/monitoring/map/page.tsx index e3f15b7..f78477b 100644 --- a/apps/web/src/app/(protected)/admin/monitoring/map/page.tsx +++ b/apps/web/src/app/(protected)/admin/monitoring/map/page.tsx @@ -1,5 +1,6 @@ "use client"; +import { useState } from "react"; import dynamic from "next/dynamic"; import { useEffect } from "react"; import { useRouter } from "next/navigation"; @@ -8,7 +9,9 @@ import { useAdminMap } from "@/hooks/queries/useDashboard"; import { ErrorBanner } from "@/components/ErrorBanner"; import { Badge } from "@/components/ui/badge"; import { Card, CardContent, CardHeader, CardTitle } from "@/components/ui/card"; -import { MapPin, RefreshCw } from "lucide-react"; +import { MapPin, RefreshCw, Users, Search } from "lucide-react"; +import { Input } from "@/components/ui/input"; +import { cn } from "@/lib/utils"; import type { EmployeeMapMarker } from "@/types"; // ─── Dynamic Leaflet import (SSR disabled — Leaflet uses `window`) ──────────── @@ -16,25 +19,79 @@ import type { EmployeeMapMarker } from "@/types"; const EmployeeMap = dynamic(() => import("./EmployeeMap"), { ssr: false, loading: () => ( -
+
Loading map…
), }); -// ─── Status badge helper ────────────────────────────────────────────────────── +// ─── Helpers ────────────────────────────────────────────────────────────────── -const STATUS_VARIANTS: Record< - EmployeeMapMarker["status"], - "default" | "secondary" | "outline" -> = { - ACTIVE: "default", - RECENT: "secondary", - INACTIVE: "outline", +const STATUS_DOT: Record = { + ACTIVE: "bg-emerald-500", + RECENT: "bg-orange-400", + INACTIVE: "bg-slate-400", }; -function statusLabel(status: EmployeeMapMarker["status"]) { - return status.charAt(0) + status.slice(1).toLowerCase(); +const STATUS_LABEL: Record = { + ACTIVE: "Active", + RECENT: "Recent", + INACTIVE: "Inactive", +}; + +// ─── Employee List Item ──────────────────────────────────────────────────────── + +function EmployeeListItem({ + marker, + selected, + onClick, +}: { + marker: EmployeeMapMarker; + selected: boolean; + onClick: () => void; +}) { + const initials = marker.employeeName + .split(" ") + .slice(0, 2) + .map((n) => n[0] ?? 
"") + .join("") + .toUpperCase(); + + return ( + + ); } // ─── Page ───────────────────────────────────────────────────────────────────── @@ -42,6 +99,8 @@ function statusLabel(status: EmployeeMapMarker["status"]) { export default function MonitoringMapPage() { const { permissions } = useAuth(); const router = useRouter(); + const [selectedId, setSelectedId] = useState(null); + const [search, setSearch] = useState(""); useEffect(() => { if (!permissions.viewAnalytics) { @@ -53,67 +112,157 @@ export default function MonitoringMapPage() { if (!permissions.viewAnalytics) return null; - const activeCount = markers.filter((m) => m.status === "ACTIVE").length; - const recentCount = markers.filter((m) => m.status === "RECENT").length; + const activeCount = markers.filter((m) => m.status === "ACTIVE").length; + const recentCount = markers.filter((m) => m.status === "RECENT").length; + const inactiveCount = markers.filter((m) => m.status === "INACTIVE").length; + + const filtered = markers.filter((m) => + search + ? m.employeeName.toLowerCase().includes(search.toLowerCase()) || + (m.employeeCode ?? "").toLowerCase().includes(search.toLowerCase()) + : true + ); + + function handleSelect(id: string) { + setSelectedId((prev) => (prev === id ? null : id)); + } return (
{/* Header */}
-

Live Employee Map

+

Live Employee Map

Showing latest GPS position per employee. Refreshes every 30 s.

- - {dataUpdatedAt - ? `Updated ${new Date(dataUpdatedAt).toLocaleTimeString()}` - : null} - + {dataUpdatedAt ? ( + + Updated {new Date(dataUpdatedAt).toLocaleTimeString()} + + ) : null}
{/* Summary badges */} -
- - +
+ + {activeCount} Active - {recentCount} Recent - {markers.length} Total on map + + + {recentCount} Recent + + + + {markers.length} on map +
{/* Error */} {error ? : null} - {/* Map */} - - - Employee Positions - - -
- -
-
-
+ {/* Main content: map + employee list */} +
+ {/* Map */} + + + Employee Positions + + +
+ +
+
+
+ + {/* Employee sidebar */} + + + + + Employees + + + + {/* Search */} +
+ + setSearch(e.target.value)} + className="pl-8 h-8 text-sm" + /> +
+ + {/* Status summary */} +
+ {activeCount} active + {recentCount} recent + {inactiveCount} inactive +
+ + {/* Scrollable list */} +
+ {isLoading && ( +
+ {Array.from({ length: 5 }).map((_, i) => ( +
+
+
+
+ ))} +
+ )} + + {!isLoading && filtered.length === 0 && ( +

No employees found

+ )} + + {/* Sort: ACTIVE first, then RECENT, then INACTIVE */} + {[...filtered] + .sort((a, b) => { + const order = { ACTIVE: 0, RECENT: 1, INACTIVE: 2 }; + return order[a.status] - order[b.status]; + }) + .map((m) => ( + handleSelect(m.employeeId)} + /> + ))} +
+ + +
{/* Empty state */} - {!isLoading && markers.length === 0 && !error ? ( -

- No employees with GPS data found. Markers appear after employees check in and record a - location point. -

- ) : null} + {!isLoading && markers.length === 0 && !error && ( +
+ +

No GPS data yet

+

+ Markers appear after employees check in and record a location point. +

+
+ )}
); } diff --git a/apps/web/src/app/(protected)/admin/webhooks/page.tsx b/apps/web/src/app/(protected)/admin/webhooks/page.tsx new file mode 100644 index 0000000..7e25147 --- /dev/null +++ b/apps/web/src/app/(protected)/admin/webhooks/page.tsx @@ -0,0 +1,819 @@ +"use client"; + +import { useState } from "react"; +import { motion, AnimatePresence } from "framer-motion"; +import { useToast } from "@/components/ui/use-toast"; +import { Button } from "@/components/ui/button"; +import { Input } from "@/components/ui/input"; +import { Label } from "@/components/ui/label"; +import { Badge } from "@/components/ui/badge"; +import { Skeleton } from "@/components/ui/skeleton"; +import { + Sheet, + SheetContent, + SheetHeader, + SheetTitle, + SheetFooter, +} from "@/components/ui/sheet"; +import { + AlertDialog, + AlertDialogAction, + AlertDialogCancel, + AlertDialogContent, + AlertDialogDescription, + AlertDialogFooter, + AlertDialogHeader, + AlertDialogTitle, +} from "@/components/ui/alert-dialog"; +import { + Webhook, + Plus, + Trash2, + RefreshCw, + ChevronDown, + ChevronUp, + CheckCircle2, + XCircle, + Clock, + ToggleLeft, + ToggleRight, + Eye, + EyeOff, + Copy, + Check, +} from "lucide-react"; +import { cn } from "@/lib/utils"; +import { + useWebhooks, + useWebhookDeliveries, + useCreateWebhook, + useUpdateWebhook, + useDeleteWebhook, + useRetryDelivery, + WEBHOOK_EVENT_TYPES, + type WebhookRecord, + type WebhookDelivery, + type DeliveryStatus, + type CreateWebhookBody, +} from "@/hooks/queries/useWebhooks"; + +// ─── Constants ──────────────────────────────────────────────────────────────── + +const EVENT_LABELS: Record = { + "employee.checked_in": "Check In", + "employee.checked_out": "Check Out", + "expense.created": "Expense Created", + "expense.approved": "Expense Approved", + "expense.rejected": "Expense Rejected", + "employee.created": "Employee Created", +}; + +const STATUS_CONFIG: Record< + DeliveryStatus, + { label: string; icon: React.ElementType; className: 
string } +> = { + success: { label: "Success", icon: CheckCircle2, className: "text-emerald-500" }, + failed: { label: "Failed", icon: XCircle, className: "text-rose-500" }, + pending: { label: "Pending", icon: Clock, className: "text-amber-500" }, +}; + +// ─── Helpers ────────────────────────────────────────────────────────────────── + +function formatRelativeTime(iso: string): string { + const diff = Date.now() - new Date(iso).getTime(); + if (diff < 60_000) return "Just now"; + if (diff < 3_600_000) return `${Math.floor(diff / 60_000)}m ago`; + if (diff < 86_400_000) return `${Math.floor(diff / 3_600_000)}h ago`; + return `${Math.floor(diff / 86_400_000)}d ago`; +} + +// ─── Delivery Status Badge ──────────────────────────────────────────────────── + +function DeliveryStatusBadge({ status }: { status: DeliveryStatus }) { + const { label, icon: Icon, className } = STATUS_CONFIG[status]; + return ( + + + {label} + + ); +} + +// ─── Expandable Payload Row ─────────────────────────────────────────────────── + +function DeliveryRow({ delivery, onRetry, isRetrying }: { + delivery: WebhookDelivery; + onRetry: (id: string) => void; + isRetrying: boolean; +}) { + const [expanded, setExpanded] = useState(false); + + return ( +
+ + )} + {expanded ? ( + + ) : ( + + )} + + + + {expanded && ( + +
+
+

+ Response Body +

+
+                  {delivery.response_body ?? "(no response body)"}
+                
+
+
+ Event: {delivery.event_id.slice(0, 8)}… + Delivery: {delivery.id.slice(0, 8)}… +
+
+
+ )} +
+
+ ); +} + +// ─── Deliveries Panel ───────────────────────────────────────────────────────── + +function DeliveriesPanel({ webhookId }: { webhookId: string | null }) { + const [page, setPage] = useState(1); + const [statusFilter, setStatusFilter] = useState(undefined); + const retryDelivery = useRetryDelivery(); + const { toast } = useToast(); + + const { data, isLoading } = useWebhookDeliveries( + page, + 20, + webhookId ?? undefined, + statusFilter, + ); + + const deliveries = data?.data ?? []; + const total = data?.pagination.total ?? 0; + const hasMore = page * 20 < total; + + function handleRetry(id: string) { + retryDelivery.mutate(id, { + onSuccess: () => toast({ title: "Delivery queued for retry" }), + onError: (e) => toast({ variant: "destructive", title: "Retry failed", description: e.message }), + }); + } + + const FILTERS: { key: DeliveryStatus | undefined; label: string }[] = [ + { key: undefined, label: "All" }, + { key: "pending", label: "Pending" }, + { key: "success", label: "Success" }, + { key: "failed", label: "Failed" }, + ]; + + return ( +
+ {/* Filters */} +
+ {FILTERS.map((f) => ( + + ))} +
+ +
+ {isLoading && ( +
+ {Array.from({ length: 5 }).map((_, i) => ( +
+ + + +
+ ))} +
+ )} + + {!isLoading && deliveries.length === 0 && ( +
+ +

No deliveries yet

+

+ Deliveries appear here when a webhook event is triggered. +

+
+ )} + + {!isLoading && deliveries.length > 0 && ( +
+ {deliveries.map((d) => ( + + ))} +
+ )} +
+ + {(deliveries.length > 0 || page > 1) && ( +
+ {total} total deliveries +
+ + +
+
+ )} +
+ ); +} + +// ─── Webhook Card ───────────────────────────────────────────────────────────── + +function WebhookCard({ + webhook, + onEdit, + onDelete, +}: { + webhook: WebhookRecord; + onEdit: (w: WebhookRecord) => void; + onDelete: (id: string) => void; +}) { + const [showDeliveries, setShowDeliveries] = useState(false); + const [copied, setCopied] = useState(false); + + const updateWebhook = useUpdateWebhook(webhook.id); + const { toast } = useToast(); + + function handleToggleActive() { + updateWebhook.mutate( + { is_active: !webhook.is_active }, + { + onSuccess: () => + toast({ title: `Webhook ${!webhook.is_active ? "enabled" : "disabled"}` }), + onError: (e) => + toast({ variant: "destructive", title: "Update failed", description: e.message }), + } + ); + } + + function copyUrl() { + void navigator.clipboard.writeText(webhook.url); + setCopied(true); + setTimeout(() => setCopied(false), 2000); + } + + return ( + + {/* Header row */} +
+ {/* Status dot */} +
+ {webhook.is_active ? ( + + + + + ) : ( + + )} +
+ + {/* URL + events */} +
+
+ + {webhook.url} + + +
+
+ {webhook.events.map((e) => ( + + {EVENT_LABELS[e] ?? e} + + ))} +
+
+ + {/* Actions */} +
+ + + +
+
+ + {/* Deliveries toggle */} + + + + {showDeliveries && ( + +
+ +
+
+ )} +
+
+ ); +} + +// ─── Create / Edit Sheet ────────────────────────────────────────────────────── + +interface WebhookFormState { + url: string; + secret: string; + events: Set; +} + +function WebhookSheet({ + open, + editing, + onClose, +}: { + open: boolean; + editing: WebhookRecord | null; + onClose: () => void; +}) { + const { toast } = useToast(); + const createWebhook = useCreateWebhook(); + // Always call the hook — pass editing.id when editing, empty string otherwise. + // An empty string never triggers a real request (mutations are on-demand). + const updateWebhook = useUpdateWebhook(editing?.id ?? ""); + const [showSecret, setShowSecret] = useState(false); + const [form, setForm] = useState({ + url: "", + secret: "", + events: new Set(), + }); + + // Sync form when the editing target changes + // eslint-disable-next-line react-hooks/exhaustive-deps + useState(() => { + if (editing) { + setForm({ url: editing.url, secret: "", events: new Set(editing.events) }); + } else { + setForm({ url: "", secret: "", events: new Set() }); + } + }); + + function handleOpen(isOpen: boolean) { + if (!isOpen) { + setForm({ url: "", secret: "", events: new Set() }); + onClose(); + } + } + + function toggleEvent(event: string) { + setForm((f) => { + const next = new Set(f.events); + next.has(event) ? 
next.delete(event) : next.add(event); + return { ...f, events: next }; + }); + } + + function handleSubmit(e: React.FormEvent) { + e.preventDefault(); + + if (form.events.size === 0) { + toast({ variant: "destructive", title: "Select at least one event" }); + return; + } + + if (editing) { + const patch: Parameters[0] = { + url: form.url || editing.url, + events: [...form.events] as CreateWebhookBody["events"], + }; + if (form.secret) patch.secret = form.secret; + + updateWebhook.mutate(patch, { + onSuccess: () => { toast({ title: "Webhook updated" }); onClose(); }, + onError: (err) => toast({ variant: "destructive", title: "Update failed", description: err.message }), + }); + } else { + if (form.url.length < 5) { + toast({ variant: "destructive", title: "Enter a valid URL" }); + return; + } + if (form.secret.length < 16) { + toast({ variant: "destructive", title: "Secret must be ≥ 16 characters" }); + return; + } + createWebhook.mutate( + { + url: form.url, + secret: form.secret, + events: [...form.events] as CreateWebhookBody["events"], + }, + { + onSuccess: () => { toast({ title: "Webhook registered" }); onClose(); }, + onError: (err) => toast({ variant: "destructive", title: "Failed to create webhook", description: err.message }), + } + ); + } + } + + const isPending = createWebhook.isPending || updateWebhook.isPending; + + return ( + + + + + + {editing ? "Edit Webhook" : "Register Webhook"} + + + +
+
+ {/* URL */} +
+ + setForm((f) => ({ ...f, url: e.target.value }))} + required={!editing} + /> +

+ FieldTrack will POST JSON events to this URL. +

+
+ + {/* Secret */} +
+ +
+ setForm((f) => ({ ...f, secret: e.target.value }))} + required={!editing} + className="pr-10" + /> + +
+

+ Used to sign the X-FieldTrack-Signature header. +

+
+ + {/* Events */} +
+ +
+ {WEBHOOK_EVENT_TYPES.map((event) => { + const checked = form.events.has(event); + return ( + + ); + })} +
+
+
+ + + + + +
+
+
+ ); +} + +// ─── Delete Confirm Dialog ──────────────────────────────────────────────────── + +function DeleteWebhookDialog({ + webhookId, + onClose, +}: { + webhookId: string | null; + onClose: () => void; +}) { + const deleteWebhook = useDeleteWebhook(); + const { toast } = useToast(); + + function handleConfirm() { + if (!webhookId) return; + deleteWebhook.mutate(webhookId, { + onSuccess: () => { toast({ title: "Webhook deleted" }); onClose(); }, + onError: (e) => { toast({ variant: "destructive", title: "Delete failed", description: e.message }); onClose(); }, + }); + } + + return ( + !open && onClose()}> + + + Delete webhook? + + This will permanently remove the webhook endpoint and all its delivery + history. This action cannot be undone. + + + + Cancel + + {deleteWebhook.isPending ? "Deleting…" : "Delete"} + + + + + ); +} + +// ─── Page ───────────────────────────────────────────────────────────────────── + +export default function WebhooksPage() { + const { data: webhooks, isLoading, error } = useWebhooks(); + const [sheetOpen, setSheetOpen] = useState(false); + const [editingWebhook, setEditingWebhook] = useState(null); + const [deletingId, setDeletingId] = useState(null); + + function openCreate() { + setEditingWebhook(null); + setSheetOpen(true); + } + + function openEdit(w: WebhookRecord) { + setEditingWebhook(w); + setSheetOpen(true); + } + + function closeSheet() { + setSheetOpen(false); + setEditingWebhook(null); + } + + return ( +
+ {/* Page header */} +
+
+

+ + Webhooks +

+

+ Register HTTP endpoints to receive real-time FieldTrack events. +

+
+ +
+ + {/* Error state */} + {error && ( +
+ Failed to load webhooks: {error.message} +
+ )} + + {/* Loading state */} + {isLoading && ( +
+ {Array.from({ length: 3 }).map((_, i) => ( +
+
+ + + +
+
+ + +
+
+ ))} +
+ )} + + {/* Empty state */} + {!isLoading && !error && (webhooks ?? []).length === 0 && ( +
+
+ +
+
+

No webhooks registered

+

+ Register an HTTP endpoint to receive real-time events like check-ins, + expense submissions, and employee updates. +

+
+ +
+ )} + + {/* Webhook cards */} + {!isLoading && (webhooks ?? []).length > 0 && ( + <> + {/* Summary bar */} +
+ + {webhooks!.length}{" "} + webhook{webhooks!.length !== 1 ? "s" : ""} + + · + + + {webhooks!.filter((w) => w.is_active).length} + {" "} + active + +
+ + +
+ {webhooks!.map((webhook) => ( + + ))} +
+
+ + )} + + {/* Global delivery history — shows all org deliveries when no specific webhook is selected */} + {!isLoading && (webhooks ?? []).length > 0 && ( +
+

+ All Deliveries +

+ +
+ )} + + {/* Sheets + Dialogs */} + + setDeletingId(null)} /> +
+ ); +} diff --git a/apps/web/src/app/(protected)/profile/page.tsx b/apps/web/src/app/(protected)/profile/page.tsx index 7f510ee..0ad3196 100644 --- a/apps/web/src/app/(protected)/profile/page.tsx +++ b/apps/web/src/app/(protected)/profile/page.tsx @@ -2,12 +2,15 @@ import { useMyProfile } from "@/hooks/queries/useProfile"; import { useLeaderboard } from "@/hooks/queries/useAnalytics"; +import { useAuth } from "@/hooks/useAuth"; import { ErrorBanner } from "@/components/ErrorBanner"; import { Skeleton } from "@/components/ui/skeleton"; import { ProfileView } from "@/components/ProfileView"; import { PageTransition } from "@/components/motion"; +import { UserCircle } from "lucide-react"; export default function MyProfilePage() { + const { user, role } = useAuth(); const { data: profile, isLoading: profileLoading, error } = useMyProfile(); const { data: leaderboard } = useLeaderboard("distance", 50); @@ -35,7 +38,27 @@ export default function MyProfilePage() {
) : error ? ( - + role === "ADMIN" ? ( + // Admins typically don't have a field employee profile — show a graceful message +
+
+ +
+
+

{user?.email?.split("@")[0] ?? "Admin"}

+

{user?.email}

+ + {role} + +
+

+ Administrator accounts do not have a field employee profile. Employee performance + metrics, GPS sessions, and attendance data are accessible through the admin dashboard. +

+
+ ) : ( + + ) ) : profile ? ( ) : null} @@ -43,4 +66,3 @@ export default function MyProfilePage() { ); } - diff --git a/apps/web/src/app/globals.css b/apps/web/src/app/globals.css index 54cfe52..7fe4f07 100644 --- a/apps/web/src/app/globals.css +++ b/apps/web/src/app/globals.css @@ -1,4 +1,7 @@ @import "mapbox-gl/dist/mapbox-gl.css"; +@import "leaflet/dist/leaflet.css"; +@import "leaflet.markercluster/dist/MarkerCluster.css"; +@import "leaflet.markercluster/dist/MarkerCluster.Default.css"; @tailwind base; @tailwind components; diff --git a/apps/web/src/app/providers.tsx b/apps/web/src/app/providers.tsx index 1e82833..a59255e 100644 --- a/apps/web/src/app/providers.tsx +++ b/apps/web/src/app/providers.tsx @@ -5,12 +5,12 @@ import { queryClient } from "@/lib/query-client"; import { AuthProvider } from "@/contexts/AuthContext"; import { ThemeProvider } from "@/components/providers/theme-provider"; import { Toaster } from "@/components/ui/toaster"; +import { useToast } from "@/components/ui/use-toast"; import { validateEnv } from "@/lib/env"; import { useEffect } from "react"; function EnvValidator({ children }: { children: React.ReactNode }) { useEffect(() => { - // Log API routing mode on every startup — instant misconfiguration visibility. console.log("[FieldTrack] API mode:", { base: process.env.NEXT_PUBLIC_API_BASE_URL ?? "(not set)", proxy: process.env.API_DESTINATION_URL ?? "(not set — only relevant in proxy mode)", @@ -26,6 +26,29 @@ function EnvValidator({ children }: { children: React.ReactNode }) { return <>{children}; } +/** + * GlobalErrorToast — listens for `fieldtrack:query-error` events emitted by + * the query-client.ts error handler and shows a toast notification. + */ +function GlobalErrorToast() { + const { toast } = useToast(); + + useEffect(() => { + function handler(e: Event) { + const detail = (e as CustomEvent<{ message: string }>).detail; + toast({ + variant: "destructive", + title: "Something went wrong", + description: detail.message ?? 
"An unexpected error occurred.", + }); + } + window.addEventListener("fieldtrack:query-error", handler); + return () => window.removeEventListener("fieldtrack:query-error", handler); + }, [toast]); + + return null; +} + export function Providers({ children }: { children: React.ReactNode }) { return ( @@ -33,6 +56,7 @@ export function Providers({ children }: { children: React.ReactNode }) { {children} + diff --git a/apps/web/src/components/layout/Header.tsx b/apps/web/src/components/layout/Header.tsx index 7260acf..9416e0e 100644 --- a/apps/web/src/components/layout/Header.tsx +++ b/apps/web/src/components/layout/Header.tsx @@ -117,13 +117,13 @@ function AvatarInitials({ name, size = "sm" }: { name: string; size?: "sm" | "md export function Header() { const { user, role, logout } = useAuth(); + const isAdmin = role === "ADMIN"; const { data: profile } = useMyProfile(); const { from, to } = useTodayRange(); - const { data: orgSummary } = useOrgSummary(from, to); + const { data: orgSummary } = useOrgSummary(from, to, isAdmin); const today = useTodayString(); const [searchOpen, setSearchOpen] = useState(false); - const isAdmin = role === "ADMIN"; const displayName = profile?.name ?? user?.email?.split("@")[0] ?? 
"Account"; const firstName = displayName.split(" ")[0]; diff --git a/apps/web/src/components/layout/Sidebar.tsx b/apps/web/src/components/layout/Sidebar.tsx index da7b322..ba8ad0b 100644 --- a/apps/web/src/components/layout/Sidebar.tsx +++ b/apps/web/src/components/layout/Sidebar.tsx @@ -16,6 +16,8 @@ import { ChevronLeft, ChevronRight, Users, + Map, + Webhook, } from "lucide-react"; import { cn } from "@/lib/utils"; import { useAuth } from "@/hooks/useAuth"; @@ -177,6 +179,16 @@ export function SidebarNav({ collapsed = false }: { collapsed?: boolean }) { label: "Employees", icon: , }, + { + href: "/admin/monitoring/map", + label: "Live Map", + icon: , + }, + { + href: "/admin/webhooks", + label: "Webhooks", + icon: , + }, ] : []; diff --git a/apps/web/src/components/ui/alert-dialog.tsx b/apps/web/src/components/ui/alert-dialog.tsx new file mode 100644 index 0000000..a5d0458 --- /dev/null +++ b/apps/web/src/components/ui/alert-dialog.tsx @@ -0,0 +1,127 @@ +"use client"; + +import * as React from "react"; +import * as AlertDialogPrimitive from "@radix-ui/react-alert-dialog"; +import { cn } from "@/lib/utils"; +import { buttonVariants } from "@/components/ui/button"; + +const AlertDialog = AlertDialogPrimitive.Root; +const AlertDialogTrigger = AlertDialogPrimitive.Trigger; +const AlertDialogPortal = AlertDialogPrimitive.Portal; + +const AlertDialogOverlay = React.forwardRef< + React.ElementRef, + React.ComponentPropsWithoutRef +>(({ className, ...props }, ref) => ( + +)); +AlertDialogOverlay.displayName = AlertDialogPrimitive.Overlay.displayName; + +const AlertDialogContent = React.forwardRef< + React.ElementRef, + React.ComponentPropsWithoutRef +>(({ className, ...props }, ref) => ( + + + + +)); +AlertDialogContent.displayName = AlertDialogPrimitive.Content.displayName; + +const AlertDialogHeader = ({ + className, + ...props +}: React.HTMLAttributes) => ( +
+); +AlertDialogHeader.displayName = "AlertDialogHeader"; + +const AlertDialogFooter = ({ + className, + ...props +}: React.HTMLAttributes) => ( +
+); +AlertDialogFooter.displayName = "AlertDialogFooter"; + +const AlertDialogTitle = React.forwardRef< + React.ElementRef, + React.ComponentPropsWithoutRef +>(({ className, ...props }, ref) => ( + +)); +AlertDialogTitle.displayName = AlertDialogPrimitive.Title.displayName; + +const AlertDialogDescription = React.forwardRef< + React.ElementRef, + React.ComponentPropsWithoutRef +>(({ className, ...props }, ref) => ( + +)); +AlertDialogDescription.displayName = AlertDialogPrimitive.Description.displayName; + +const AlertDialogAction = React.forwardRef< + React.ElementRef, + React.ComponentPropsWithoutRef +>(({ className, ...props }, ref) => ( + +)); +AlertDialogAction.displayName = AlertDialogPrimitive.Action.displayName; + +const AlertDialogCancel = React.forwardRef< + React.ElementRef, + React.ComponentPropsWithoutRef +>(({ className, ...props }, ref) => ( + +)); +AlertDialogCancel.displayName = AlertDialogPrimitive.Cancel.displayName; + +export { + AlertDialog, + AlertDialogPortal, + AlertDialogOverlay, + AlertDialogTrigger, + AlertDialogContent, + AlertDialogHeader, + AlertDialogFooter, + AlertDialogTitle, + AlertDialogDescription, + AlertDialogAction, + AlertDialogCancel, +}; diff --git a/apps/web/src/contexts/AuthContext.tsx b/apps/web/src/contexts/AuthContext.tsx index 194c890..df68505 100644 --- a/apps/web/src/contexts/AuthContext.tsx +++ b/apps/web/src/contexts/AuthContext.tsx @@ -4,6 +4,7 @@ import React, { createContext, useContext, useEffect, useState } from "react"; import { Session, User, AuthChangeEvent } from "@supabase/supabase-js"; import { supabase } from "@/lib/supabase"; import { derivePermissions } from "@/lib/permissions"; +import { extractRoleFromSession } from "@/lib/auth/role"; import { UserRole, UserPermissions } from "@/types"; interface AuthContextValue { @@ -40,10 +41,7 @@ export function AuthProvider({ children }: { children: React.ReactNode }) { const [isLoading, setIsLoading] = useState(true); function extractRole(s: Session): 
UserRole { - const metaRole = - (s.user.user_metadata?.role as UserRole | undefined) ?? - (s.user.app_metadata?.role as UserRole | undefined); - return metaRole ?? "EMPLOYEE"; + return extractRoleFromSession(s, { allowUserMetadataFallback: true }); } useEffect(() => { diff --git a/apps/web/src/hooks/queries/useAnalytics.ts b/apps/web/src/hooks/queries/useAnalytics.ts index 8bcfe66..c4ac0a6 100644 --- a/apps/web/src/hooks/queries/useAnalytics.ts +++ b/apps/web/src/hooks/queries/useAnalytics.ts @@ -1,11 +1,11 @@ "use client"; -import { useQuery } from "@tanstack/react-query"; +import { useQuery, keepPreviousData } from "@tanstack/react-query"; import { apiGet } from "@/lib/api/client"; import { API } from "@/lib/api/endpoints"; import { OrgSummaryData, TopPerformerEntry, SessionTrendEntry, LeaderboardEntry } from "@/types"; -export function useOrgSummary(from?: string, to?: string) { +export function useOrgSummary(from?: string, to?: string, enabled = true) { return useQuery({ queryKey: ["orgSummary", from, to], queryFn: () => { @@ -14,6 +14,9 @@ export function useOrgSummary(from?: string, to?: string) { if (to) params["to"] = to; return apiGet(API.orgSummary, params); }, + enabled, + staleTime: 30_000, // dashboard stats: fresh for 30s + placeholderData: keepPreviousData, }); } @@ -32,6 +35,8 @@ export function useTopPerformers( if (to) params["to"] = to; return apiGet(API.topPerformers, params); }, + staleTime: 60_000, // chart data: fresh for 1 min + placeholderData: keepPreviousData, }); } @@ -44,6 +49,8 @@ export function useSessionTrend(from?: string, to?: string) { if (to) params["to"] = to; return apiGet(API.sessionTrend, params); }, + staleTime: 60_000, // trend chart: fresh for 1 min + placeholderData: keepPreviousData, }); } @@ -62,5 +69,7 @@ export function useLeaderboard( if (to) params["to"] = to; return apiGet(API.leaderboard, params); }, + staleTime: 120_000, // ranking: fresh for 2 min (slow-moving) + placeholderData: keepPreviousData, }); } diff 
--git a/apps/web/src/hooks/queries/useDashboard.ts b/apps/web/src/hooks/queries/useDashboard.ts index f39634c..f16a8c2 100644 --- a/apps/web/src/hooks/queries/useDashboard.ts +++ b/apps/web/src/hooks/queries/useDashboard.ts @@ -9,6 +9,7 @@ export function useMyDashboard() { return useQuery({ queryKey: ["myDashboard"], queryFn: () => apiGet(API.myDashboard), + staleTime: 30_000, }); } diff --git a/apps/web/src/hooks/queries/useEmployees.ts b/apps/web/src/hooks/queries/useEmployees.ts index 34c5633..3afe696 100644 --- a/apps/web/src/hooks/queries/useEmployees.ts +++ b/apps/web/src/hooks/queries/useEmployees.ts @@ -1,6 +1,6 @@ "use client"; -import { useQuery, useMutation, useQueryClient } from "@tanstack/react-query"; +import { useQuery, useMutation, useQueryClient, keepPreviousData } from "@tanstack/react-query"; import { apiGet, apiGetPaginated, apiPost, apiPatch } from "@/lib/api/client"; import { API } from "@/lib/api/endpoints"; import type { PaginatedResponse } from "@/types"; @@ -33,6 +33,8 @@ export function useEmployeeList( if (filters?.search) params["search"] = filters.search; return apiGetPaginated(API.listEmployees, params); }, + staleTime: 120_000, // employee roster: fresh for 2 min + placeholderData: keepPreviousData, }); } @@ -41,6 +43,7 @@ export function useEmployee(id: string | null) { queryKey: ["employee", id], enabled: id !== null, queryFn: () => apiGet(API.getEmployee(id!)), + staleTime: 120_000, }); } diff --git a/apps/web/src/hooks/queries/useExpenses.ts b/apps/web/src/hooks/queries/useExpenses.ts index 550930e..6761720 100644 --- a/apps/web/src/hooks/queries/useExpenses.ts +++ b/apps/web/src/hooks/queries/useExpenses.ts @@ -21,6 +21,8 @@ export function useMyExpenses(page: number, limit: number) { page: String(page), limit: String(limit), }), + staleTime: 30_000, + placeholderData: keepPreviousData, }); } @@ -32,6 +34,8 @@ export function useOrgExpenses(page: number, limit: number) { page: String(page), limit: String(limit), }), + 
staleTime: 30_000, + placeholderData: keepPreviousData, }); } @@ -63,7 +67,8 @@ export function useAllOrgExpenses() { queryFn: ({ pageParam }) => apiGetPaginated(API.orgExpenses, { page: String(pageParam), - limit: "1000", + // Backend validates limit <= 100 for /admin/expenses. + limit: "100", }), initialPageParam: 1, getNextPageParam: (lastPage, allPages) => { diff --git a/apps/web/src/hooks/queries/useSessions.ts b/apps/web/src/hooks/queries/useSessions.ts index 15ef35c..f1a599b 100644 --- a/apps/web/src/hooks/queries/useSessions.ts +++ b/apps/web/src/hooks/queries/useSessions.ts @@ -14,6 +14,8 @@ export function useMySessions(page: number, limit: number) { page: String(page), limit: String(limit), }), + staleTime: 30_000, + placeholderData: keepPreviousData, }); } @@ -25,6 +27,8 @@ export function useOrgSessions(page: number, limit: number) { page: String(page), limit: String(limit), }), + staleTime: 30_000, + placeholderData: keepPreviousData, }); } @@ -42,7 +46,8 @@ export function useAllOrgSessions() { queryFn: ({ pageParam }) => apiGetPaginated(API.adminSessions, { page: String(pageParam), - limit: "1000", + // Backend validates limit <= 100 for /admin/sessions. 
+ limit: "100", }), staleTime: 60_000, placeholderData: keepPreviousData, diff --git a/apps/web/src/hooks/queries/useWebhooks.ts b/apps/web/src/hooks/queries/useWebhooks.ts new file mode 100644 index 0000000..65c7ddd --- /dev/null +++ b/apps/web/src/hooks/queries/useWebhooks.ts @@ -0,0 +1,135 @@ +"use client"; + +import { useQuery, useMutation, useQueryClient, keepPreviousData } from "@tanstack/react-query"; +import { apiGet, apiGetPaginated, apiPost, apiPatch, apiDelete } from "@/lib/api/client"; +import { API } from "@/lib/api/endpoints"; +import type { PaginatedResponse } from "@/types"; + +// ─── Types ──────────────────────────────────────────────────────────────────── + +export const WEBHOOK_EVENT_TYPES = [ + "employee.checked_in", + "employee.checked_out", + "expense.created", + "expense.approved", + "expense.rejected", + "employee.created", +] as const; + +export type WebhookEventType = (typeof WEBHOOK_EVENT_TYPES)[number]; + +export interface WebhookRecord { + id: string; + organization_id: string; + url: string; + is_active: boolean; + events: WebhookEventType[]; + created_at: string; + updated_at: string; +} + +export type DeliveryStatus = "pending" | "success" | "failed"; + +export interface WebhookDelivery { + id: string; + webhook_id: string; + event_id: string; + organization_id: string; + status: DeliveryStatus; + attempt_count: number; + response_status: number | null; + response_body: string | null; + last_attempt_at: string | null; + next_retry_at: string | null; + created_at: string; +} + +export interface CreateWebhookBody { + url: string; + events: WebhookEventType[]; + secret: string; +} + +export interface UpdateWebhookBody { + url?: string; + events?: WebhookEventType[]; + is_active?: boolean; + secret?: string; +} + +// ─── Queries ────────────────────────────────────────────────────────────────── + +/** List all webhooks for the org. 
*/ +export function useWebhooks() { + return useQuery({ + queryKey: ["webhooks"], + queryFn: () => apiGet(API.webhooks), + staleTime: 30_000, + }); +} + +/** Paginated delivery history, optionally filtered by webhookId or status. */ +export function useWebhookDeliveries( + page: number, + limit: number, + webhookId?: string, + status?: DeliveryStatus +) { + return useQuery>({ + queryKey: ["webhookDeliveries", page, limit, webhookId, status], + queryFn: () => { + const params: Record = { + page: String(page), + limit: String(limit), + }; + if (webhookId) params["webhook_id"] = webhookId; + if (status) params["status"] = status; + return apiGetPaginated(API.webhookDeliveries, params); + }, + staleTime: 15_000, // deliveries: refresh more frequently + placeholderData: keepPreviousData, + }); +} + +// ─── Mutations ──────────────────────────────────────────────────────────────── + +/** Register a new webhook endpoint. */ +export function useCreateWebhook() { + const client = useQueryClient(); + return useMutation({ + mutationFn: (body) => apiPost(API.webhooks, body), + onSuccess: () => void client.invalidateQueries({ queryKey: ["webhooks"] }), + }); +} + +/** Update a webhook's URL, events, active state, or secret. */ +export function useUpdateWebhook(id: string) { + const client = useQueryClient(); + return useMutation({ + mutationFn: (body) => apiPatch(API.webhookById(id), body), + onSuccess: () => void client.invalidateQueries({ queryKey: ["webhooks"] }), + }); +} + +/** Delete a webhook and all its delivery history. */ +export function useDeleteWebhook() { + const client = useQueryClient(); + return useMutation({ + mutationFn: (id) => apiDelete(API.webhookById(id)), + onSuccess: () => { + void client.invalidateQueries({ queryKey: ["webhooks"] }); + void client.invalidateQueries({ queryKey: ["webhookDeliveries"] }); + }, + }); +} + +/** Manually retry a failed (or succeeded) delivery. 
*/ +export function useRetryDelivery() { + const client = useQueryClient(); + return useMutation({ + mutationFn: (deliveryId) => + apiPost(API.retryDelivery(deliveryId), {}), + onSuccess: () => + void client.invalidateQueries({ queryKey: ["webhookDeliveries"] }), + }); +} diff --git a/apps/web/src/hooks/useAuth.ts b/apps/web/src/hooks/useAuth.ts index a82a146..68fe6e3 100644 --- a/apps/web/src/hooks/useAuth.ts +++ b/apps/web/src/hooks/useAuth.ts @@ -3,6 +3,8 @@ import { useRouter } from "next/navigation"; import { supabase } from "@/lib/supabase"; import { useAuthContext } from "@/contexts/AuthContext"; +import { extractRoleFromSession } from "@/lib/auth/role"; +import { clearAuthTokenCache } from "@/lib/api/client"; import { UserRole } from "@/types"; import { queryClient } from "@/lib/query-client"; @@ -11,15 +13,15 @@ export function useAuth() { const { user, session, role, permissions, isLoading } = useAuthContext(); async function login(email: string, password: string): Promise { + // Prevent stale bearer reuse when switching users (e.g. employee -> admin). + clearAuthTokenCache(); const { data, error } = await supabase.auth.signInWithPassword({ email, password }); if (error) throw error; - const metaRole = - (data.session.user.user_metadata?.role as UserRole | undefined) ?? - (data.session.user.app_metadata?.role as UserRole | undefined); - return metaRole ?? 
"EMPLOYEE"; + return extractRoleFromSession(data.session, { allowUserMetadataFallback: true }); } async function logout(): Promise { + clearAuthTokenCache(); await supabase.auth.signOut(); queryClient.clear(); router.push("/login"); diff --git a/apps/web/src/lib/api/client.ts b/apps/web/src/lib/api/client.ts index 2d325da..66a55a2 100644 --- a/apps/web/src/lib/api/client.ts +++ b/apps/web/src/lib/api/client.ts @@ -6,6 +6,11 @@ import { ApiError, ApiResponse, PaginatedResponse } from "@/types"; let cachedToken: string | null = null; let tokenExpiry: number = 0; +export function clearAuthTokenCache(): void { + cachedToken = null; + tokenExpiry = 0; +} + async function getAuthHeaders(): Promise> { const now = Date.now(); @@ -40,8 +45,7 @@ async function getAuthHeaders(): Promise> { async function handleAuthFailure(): Promise { // Clear cached token - cachedToken = null; - tokenExpiry = 0; + clearAuthTokenCache(); // Sign out and redirect await supabase.auth.signOut(); @@ -281,3 +285,34 @@ export async function apiPatch(path: string, body: unknown): Promise { return handleResponse(response); } + +export async function apiDelete(path: string): Promise { + if (!env.NEXT_PUBLIC_API_BASE_URL) { + throw new ApiError( + "NEXT_PUBLIC_API_BASE_URL is not set.", + 500 + ); + } + const headers = await getAuthHeaders(); + const response = await fetchWithTimeout(`${env.NEXT_PUBLIC_API_BASE_URL}${path}`, { + method: "DELETE", + headers, + }); + + // 204 No Content is success — nothing to parse + if (response.status === 204) return; + + if (response.status === 401) { + await handleAuthFailure(); + throw new ApiError("Unauthorized. 
Please log in again.", 401); + } + + if (!response.ok) { + const text = await response.text(); + throw new ApiError( + `HTTP ${response.status} error from API`, + response.status + ); + } +} + diff --git a/apps/web/src/lib/api/endpoints.ts b/apps/web/src/lib/api/endpoints.ts index 35dd781..aa98db8 100644 --- a/apps/web/src/lib/api/endpoints.ts +++ b/apps/web/src/lib/api/endpoints.ts @@ -60,4 +60,14 @@ export const API = { adminEvents: "/admin/events", /** GPS playback points for a specific session. */ sessionLocations: (id: string) => `/admin/sessions/${id}/locations`, + + // Webhooks + /** List all registered webhooks for the org (secrets omitted). */ + webhooks: "/admin/webhooks", + /** Create / update / delete a webhook (/:id for PATCH+DELETE). */ + webhookById: (id: string) => `/admin/webhooks/${id}`, + /** Paginated delivery attempts. */ + webhookDeliveries: "/admin/webhook-deliveries", + /** Retry a specific delivery. */ + retryDelivery: (id: string) => `/admin/webhook-deliveries/${id}/retry`, } as const; diff --git a/apps/web/src/lib/auth/role.ts b/apps/web/src/lib/auth/role.ts new file mode 100644 index 0000000..5d48068 --- /dev/null +++ b/apps/web/src/lib/auth/role.ts @@ -0,0 +1,54 @@ +import type { Session } from "@supabase/supabase-js"; +import { UserRole } from "@/types"; + +function toUserRole(value: unknown): UserRole | undefined { + return value === "ADMIN" || value === "EMPLOYEE" ? value : undefined; +} + +function decodeBase64Url(input: string): string { + const normalized = input.replace(/-/g, "+").replace(/_/g, "/"); + const padded = normalized.padEnd(Math.ceil(normalized.length / 4) * 4, "="); + + if (typeof atob === "function") { + return atob(padded); + } + + // Next.js middleware runs in an edge-like runtime where atob exists. + // This fallback keeps the helper usable in Node-based contexts too. 
+ const nodeBuffer = (globalThis as { Buffer?: { from: (data: string, enc: string) => { toString: (enc: string) => string } } }).Buffer; + if (!nodeBuffer) { + throw new Error("No base64 decoder available in current runtime"); + } + return nodeBuffer.from(padded, "base64").toString("utf-8"); +} + +export function extractRoleFromAccessToken(accessToken: string | null | undefined): UserRole | undefined { + if (!accessToken) return undefined; + + try { + const parts = accessToken.split("."); + if (parts.length < 2) return undefined; + const payload = JSON.parse(decodeBase64Url(parts[1])) as Record; + return toUserRole(payload.role); + } catch { + return undefined; + } +} + +export function extractRoleFromSession( + session: Session, + options: { allowUserMetadataFallback?: boolean } = {} +): UserRole { + const claimRole = extractRoleFromAccessToken(session.access_token); + if (claimRole) return claimRole; + + const appMetaRole = toUserRole((session.user.app_metadata as Record | undefined)?.role); + if (appMetaRole) return appMetaRole; + + if (options.allowUserMetadataFallback) { + const userMetaRole = toUserRole((session.user.user_metadata as Record | undefined)?.role); + if (userMetaRole) return userMetaRole; + } + + return "EMPLOYEE"; +} diff --git a/apps/web/src/lib/query-client.ts b/apps/web/src/lib/query-client.ts index b40cd7c..f3c6beb 100644 --- a/apps/web/src/lib/query-client.ts +++ b/apps/web/src/lib/query-client.ts @@ -1,5 +1,27 @@ +/** + * query-client.ts — Global React Query client. + * + * Default options prevent refetch storms: + * - staleTime: 60s — data is "fresh" for 1 min (per-hook overrides are more specific) + * - retry: 1 — one retry on network failure + * - refetchOnWindowFocus: false — don't hammer API on tab switch + * + * Global error handler fires a toast for any failed query, providing consistent + * error visibility without each page needing its own error boundary. 
+ */ + import { QueryClient } from "@tanstack/react-query"; +function showErrorToast(message: string) { + // Fires a custom event that the Toaster (in providers.tsx) listens to. + // This avoids importing the toast hook here (hooks can't be used outside React). + if (typeof window !== "undefined") { + window.dispatchEvent( + new CustomEvent("fieldtrack:query-error", { detail: { message } }) + ); + } +} + export const queryClient = new QueryClient({ defaultOptions: { queries: { @@ -7,5 +29,18 @@ export const queryClient = new QueryClient({ retry: 1, refetchOnWindowFocus: false, }, + mutations: { + // Do NOT auto-toast mutations — pages handle mutation errors inline + // (with form validation feedback, toast on onError callback, etc.) + }, }, }); + +// Wire global query error handler after client is constructed +queryClient.getQueryCache().config.onError = (error) => { + const msg = + error instanceof Error ? error.message : "An unexpected error occurred"; + // Suppress 401 errors — auth failures redirect to /login automatically + if (msg.toLowerCase().includes("unauthorized")) return; + showErrorToast(msg); +}; diff --git a/apps/web/src/middleware.ts b/apps/web/src/middleware.ts index 29d496a..e8f3df3 100644 --- a/apps/web/src/middleware.ts +++ b/apps/web/src/middleware.ts @@ -1,6 +1,7 @@ import { createServerClient } from "@supabase/ssr"; import { NextResponse, type NextRequest } from "next/server"; import type { CookieOptions } from "@supabase/ssr"; +import { extractRoleFromSession } from "@/lib/auth/role"; /** * Auth + role middleware. 
@@ -13,7 +14,7 @@ import type { CookieOptions } from "@supabase/ssr"; * /login, /_next/*, /favicon.ico, static assets * * Role-protected routes: - * /admin/** → requires role = "ADMIN" in user_metadata + * /admin/** → requires role = "ADMIN" in JWT claims */ export async function middleware(request: NextRequest) { const response = NextResponse.next({ @@ -51,11 +52,20 @@ export async function middleware(request: NextRequest) { const { pathname } = request.nextUrl; + // Let proxied API requests pass through untouched so the backend can return + // proper JSON errors (401/403/etc.) instead of this middleware redirecting + // fetches to the HTML login page. + if (pathname.startsWith("/api/proxy")) { + return response; + } + // Already on the login page — don't redirect in a loop if (pathname.startsWith("/login")) { - // If user is already authenticated, send them to the right landing page + // If user is already authenticated, send them to their correct landing page if (session) { - return NextResponse.redirect(new URL("/sessions", request.url)); + const role = extractRoleFromSession(session, { allowUserMetadataFallback: false }); + const landing = role === "ADMIN" ? "/admin/sessions" : "/sessions"; + return NextResponse.redirect(new URL(landing, request.url)); } return response; } @@ -68,11 +78,9 @@ export async function middleware(request: NextRequest) { } // Role-based protection for /admin routes. - // The role is embedded in app_metadata by the custom_access_token_hook, which reads - // the authoritative value from public.users.role (server-controlled). - // user_metadata is user-editable and MUST NOT be used for authorization decisions. + // Use JWT/app_metadata-derived claims only; avoid user_metadata for authz. 
if (pathname.startsWith("/admin")) { - const role = (session.user?.app_metadata as Record | undefined)?.role as string | undefined; + const role = extractRoleFromSession(session, { allowUserMetadataFallback: false }); if (role !== "ADMIN") { // Redirect employees and unknown roles away from admin pages. return NextResponse.redirect(new URL("/sessions", request.url)); @@ -86,11 +94,12 @@ export const config = { matcher: [ /* * Match all paths EXCEPT: + * - api/proxy (proxied backend API; backend handles auth/errors) * - _next/static (static files) * - _next/image (image optimisation) * - favicon.ico * - public assets (png, jpg, svg, etc.) */ - "/((?!_next/static|_next/image|favicon\\.ico|.*\\.(?:png|jpg|jpeg|gif|svg|ico|webp|woff2?|ttf|otf|css|js)).*)", + "/((?!api/proxy|_next/static|_next/image|favicon\\.ico|.*\\.(?:png|jpg|jpeg|gif|svg|ico|webp|woff2?|ttf|otf|css|js)).*)", ], }; diff --git a/docs/SLO.md b/docs/SLO.md new file mode 100644 index 0000000..33b7f6e --- /dev/null +++ b/docs/SLO.md @@ -0,0 +1,136 @@ +# FieldTrack Service Level Objectives (SLOs) + +This document defines the service-level objectives for FieldTrack production services. Each SLO has a corresponding error budget and alert rules in `infra/prometheus/alerts.yml`. + +--- + +## Definitions + +| Term | Meaning | +|---|---| +| **SLO** | Service Level Objective — the target reliability level | +| **SLI** | Service Level Indicator — the metric used to measure the objective | +| **Error Budget** | Allowable downtime / failure rate before the SLO is violated | +| **Burn Rate** | How fast the error budget is being consumed relative to normal | + +--- + +## SLO 1 — API Availability + +| | | +|---|---| +| **SLI** | `up{job=~"fieldtrack-backend.*"}` | +| **Target** | 99.9% monthly availability | +| **Error budget** | 43.8 minutes / month | +| **Window** | 30-day rolling | + +### Rationale +Sub-1h monthly downtime budget is appropriate for a B2B scheduling SaaS. 
Breaching this SLO triggers an incident review. + +--- + +## SLO 2 — API Latency + +| | | +|---|---| +| **SLI** | `histogram_quantile(0.95, ...)` over `http_request_duration_seconds_bucket` | +| **Target p95** | < 500 ms | +| **Target p99** | < 2 000 ms | +| **Error budget** | 5% of requests may exceed the p95 threshold | +| **Window** | 5-minute rolling (monitored), 1-hour burn rate (alerting) | + +### Rationale +500 ms p95 ensures interactive response times for the React frontend. The 2 s p99 provides a safety margin for background operations (bulk import, report generation) without breaching the user-visible latency SLO. + +--- + +## SLO 3 — API Error Rate + +| | | +|---|---| +| **SLI** | `rate(http_requests_total{status_code=~"5.."}[5m]) / rate(http_requests_total[5m])` | +| **Target** | < 1% 5xx error rate | +| **Error budget** | 1% of requests may fail with 5xx | +| **Window** | 5-minute rolling | + +### Rationale +1% is tight but achievable given the stateless Fastify API + managed Supabase backend. 4xx errors (client mistakes) are excluded from the SLO. + +--- + +## SLO 4 — Webhook Delivery + +| | | +|---|---| +| **SLI** | Fraction of webhook deliveries that eventually succeed within the retry window | +| **Target** | 99% of deliveries succeed within 1 hour (across all retry attempts) | +| **Error budget** | 1% permanent failure rate | +| **Window** | 1-hour rolling | + +### Retry schedule (for reference) + +| Attempt | Delay from previous | +|---|---| +| 1 | Immediate | +| 2 | ~1 min (±20% jitter) | +| 3 | ~5 min (±20% jitter) | +| 4 | ~15 min (±20% jitter) | +| 5 | ~1 h (±20% jitter) | +| After attempt 5 | Moved to Dead-Letter Queue | + +Delays are cumulative: attempts 1 through 4 complete within ~21 minutes of the first attempt, inside the 1-hour SLO window, whereas attempt 5 fires ~81 minutes after the first attempt and therefore falls outside it, so a delivery that only succeeds on attempt 5 does not count toward the 1-hour target. + +### Rationale +Webhook delivery failures directly affect customer integrations. The DLQ captures permanent failures for manual replay; the SLO tracks the fraction that need manual intervention. 
+ +--- + +## SLO 5 — Dead-Letter Queue Depth + +| | | +|---|---| +| **SLI** | `dlq_size{queue="webhook-delivery-dlq"}` | +| **Target** | DLQ depth stays below 100 jobs | +| **Error budget** | DLQ may transiently spike above 100 for < 30 minutes | +| **Window** | 30-minute sustained | + +### Rationale +A DLQ backlog above 100 indicates a systemic delivery failure (bad endpoint configuration, network partition) requiring operator attention. Transient spikes under 30 minutes are tolerated. + +--- + +## Error Budget Alert Strategy + +The following multi-burn-rate windows are used for the error budget alerts to catch both fast burns (page immediately) and slow burns (ticket within the hour): + +| Window | Burn rate threshold | Severity | Action | +|---|---|---|---| +| 1h/5m | 14× | critical | Page on-call | +| 6h/30m | 6× | warning | Open ticket | +| 1d/2h | 3× | warning | Engineering review | + +--- + +## Alert → SLO Mapping + +| Alert name | SLO | Severity | +|---|---|---| +| `FieldTrackHighErrorRate` | SLO 3 | critical | +| `FieldTrackSloErrorBudgetBurnFast` | SLO 3 | critical | +| `FieldTrackSloErrorBudgetBurnSlow` | SLO 3 | warning | +| `FieldTrackHighLatency` | SLO 2 | warning | +| `FieldTrackLatencyP99High` | SLO 2 p99 | warning | +| `WebhookDeliveryFailureRateHigh` | SLO 4 | critical | +| `WebhookDeliveryFailureRateWarning` | SLO 4 | warning | +| `WebhookDlqGrowing` | SLO 5 | warning | +| `WebhookCircuitBreakerOpened` | SLO 4 | warning | +| `DeploymentFailure` | SLO 1 | critical | +| `ReadinessCheckFailing` | SLO 1 | critical | + +--- + +## Review Cadence + +- **Monthly**: review error budget consumption; adjust SLO thresholds if engineering velocity is affected. +- **Post-incident**: update error budget retroactively; add alert tuning if a regression was missed. +- **Quarterly**: revisit SLO targets vs. customer expectations. 
diff --git a/docs/WEBHOOK_SIGNATURES.md b/docs/WEBHOOK_SIGNATURES.md new file mode 100644 index 0000000..5161c9a --- /dev/null +++ b/docs/WEBHOOK_SIGNATURES.md @@ -0,0 +1,194 @@ +# FieldTrack Webhook Signature Verification + +Every outbound webhook request from FieldTrack includes security headers that allow receivers to verify authenticity and reject replayed requests. + +--- + +## Headers + +| Header | Example value | Purpose | +|---|---|---| +| `X-FieldTrack-Signature` | `sha256=a3f1c8...` | HMAC-SHA256 of the signing body (see below) | +| `X-FieldTrack-Timestamp` | `1711618200` | Unix timestamp **in seconds** at delivery time | +| `X-FieldTrack-Event` | `employee.checked_in` | Logical event type for routing | +| `X-FieldTrack-Delivery-Id` | `1b2f...-uuid` | Unique delivery attempt id for idempotency / replay dedupe | + +--- + +## Signing algorithm + +``` +signing_body = "<timestamp>.<raw_body>" +signature = "sha256=" + hex( HMAC-SHA256( secret, signing_body ) ) +``` + +Where: +- `<timestamp>` is the value of `X-FieldTrack-Timestamp` (decimal string, no padding) +- `<raw_body>` is the **exact** bytes of the HTTP request body (UTF-8 JSON, no re-serialisation) +- `secret` is the **per-webhook signing secret** shown in the FieldTrack webhooks dashboard +- The HMAC key is the raw UTF-8 string of the secret (not Base64-decoded) +- Dot (`.`) is the separator between timestamp and body + +### Why timestamp-bound? + +Including the timestamp in the signing input means the same payload signed at a different time produces a different signature. This prevents _replay attacks_: a valid request captured by a MITM cannot be replayed after the tolerance window expires. + +**Receivers MUST reject requests where `|now - timestamp| > 300 seconds` (5 minutes).** + +--- + +## Verification steps (receiver side) + +1. Extract `X-FieldTrack-Timestamp` → `ts` (integer) +2. Verify `|time.now() - ts| <= 300` — reject with HTTP 400 if stale. +3. Construct `signing_body = ts + "." + request_body_string` +4. 
Compute `expected = "sha256=" + hex(HMAC-SHA256(secret, signing_body))` +5. Compare `expected` to `X-FieldTrack-Signature` using a **timing-safe** equality function. +6. Reject with HTTP 401 if signatures do not match. +7. Optional replay guard: store `X-FieldTrack-Delivery-Id` for 24 h and reject duplicates. + +> ⚠ **Never** use regular string equality (`==`) to compare signatures — it is vulnerable to timing attacks. Always use `hmac.compare_digest` (Python) or `crypto.timingSafeEqual` (Node.js). + +--- + +## Node.js verification example + +```typescript +import { createHmac, timingSafeEqual } from "node:crypto"; +import type { IncomingMessage, ServerResponse } from "node:http"; + +const TOLERANCE_SECONDS = 300; + +function verifyFieldTrackWebhook( + rawBody: string, + secret: string, + receivedSignature: string, + receivedTimestamp: string, +): boolean { + // 1. Validate timestamp within tolerance window + const ts = parseInt(receivedTimestamp, 10); + const now = Math.floor(Date.now() / 1000); + if (isNaN(ts) || Math.abs(now - ts) > TOLERANCE_SECONDS) { + return false; // stale or malformed timestamp + } + + // 2. Reconstruct signing body + const signingBody = `${ts}.${rawBody}`; + + // 3. Compute expected signature + const hmac = createHmac("sha256", secret); + hmac.update(signingBody, "utf8"); + const expected = `sha256=${hmac.digest("hex")}`; + + // 4. Timing-safe comparison + if (expected.length !== receivedSignature.length) return false; + return timingSafeEqual( + Buffer.from(expected, "utf8"), + Buffer.from(receivedSignature, "utf8"), + ); +} + +// ── Express / raw middleware example ───────────────────────────────────────── + +import express from "express"; + +const app = express(); + +// Must use raw body middleware — JSON.parse() changes byte representation. 
+app.use("/webhooks/fieldtrack", express.raw({ type: "application/json" })); + +app.post("/webhooks/fieldtrack", (req: IncomingMessage & { body: Buffer }, res: ServerResponse) => { + const rawBody = (req as express.Request).body.toString("utf8"); + const signature = (req as express.Request).headers["x-fieldtrack-signature"] as string ?? ""; + const timestamp = (req as express.Request).headers["x-fieldtrack-timestamp"] as string ?? ""; + const secret = process.env.FIELDTRACK_WEBHOOK_SECRET ?? ""; + + if (!verifyFieldTrackWebhook(rawBody, secret, signature, timestamp)) { + res.writeHead(401); + res.end("Invalid signature"); + return; + } + + const event = JSON.parse(rawBody); + console.log("Received event:", event.type); + res.writeHead(200); + res.end("OK"); +}); +``` + +--- + +## Python verification example + +```python +import hashlib +import hmac +import time +from flask import Flask, request, abort + +TOLERANCE_SECONDS = 300 +app = Flask(__name__) + + +def verify_fieldtrack_webhook( + raw_body: bytes, + secret: str, + received_signature: str, + received_timestamp: str, +) -> bool: + # 1. Validate timestamp within tolerance window + try: + ts = int(received_timestamp) + except (ValueError, TypeError): + return False + + if abs(time.time() - ts) > TOLERANCE_SECONDS: + return False # stale + + # 2. Reconstruct signing body (bytes) + signing_body = f"{ts}.".encode() + raw_body + + # 3. Compute expected signature + mac = hmac.new(secret.encode("utf-8"), signing_body, hashlib.sha256) + expected = "sha256=" + mac.hexdigest() + + # 4. 
Timing-safe comparison + return hmac.compare_digest(expected, received_signature) + + +@app.route("/webhooks/fieldtrack", methods=["POST"]) +def receive_webhook(): + raw_body = request.get_data() # raw bytes before JSON decode + signature = request.headers.get("X-FieldTrack-Signature", "") + timestamp = request.headers.get("X-FieldTrack-Timestamp", "") + secret = "your-webhook-secret-here" # from FieldTrack dashboard + + if not verify_fieldtrack_webhook(raw_body, secret, signature, timestamp): + abort(401, "Invalid signature") + + event = request.get_json() + print(f"Received event: {event['type']}") + return "", 200 +``` + +--- + +## Common mistakes + +| Mistake | Impact | Fix | +|---|---|---| +| Re-serialising the body before signing (e.g. `json.dumps(json.loads(body))`) | Signature mismatch on any non-canonical JSON | Hash the **raw bytes** received over the wire | +| Skipping the timestamp check | Replay attacks possible indefinitely | Always validate `\|now - ts\| <= 300` | +| Using `==` for signature comparison | Timing oracle leaks partial secret | Use `hmac.compare_digest` / `timingSafeEqual` | +| Decoding the secret from Base64 | Wrong key bytes → signature always fails | Use the secret string as-is (UTF-8) | +| Signing `body` instead of `timestamp.body` | Valid signatures but no replay protection | Always prepend timestamp + dot | + +--- + +## Rotating secrets + +1. Generate a new secret in the FieldTrack webhooks dashboard. +2. Update your receiver to accept **both** the old and new secret during a transition window (check both; accept if either matches). +3. Once all in-flight requests have been delivered, remove the old secret check. + +FieldTrack re-signs all new deliveries with the new secret immediately upon rotation; retries of existing deliveries use the secret active at the time of the original enqueue. 
diff --git a/infra/grafana/dashboards/fieldtrack.json b/infra/grafana/dashboards/fieldtrack.json index 37efa89..83bef40 100644 --- a/infra/grafana/dashboards/fieldtrack.json +++ b/infra/grafana/dashboards/fieldtrack.json @@ -525,6 +525,138 @@ "refId": "B" } ] + }, + { + "title": "API Error Budget Remaining (30d)", + "type": "stat", + "gridPos": { + "h": 8, + "w": 8, + "x": 0, + "y": 36 + }, + "datasource": { + "type": "prometheus", + "uid": "" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "unit": "percentunit", + "thresholds": { + "steps": [ + { + "color": "red", + "value": null + }, + { + "color": "yellow", + "value": 0.5 + }, + { + "color": "green", + "value": 0.9 + } + ] + } + }, + "overrides": [] + }, + "targets": [ + { + "expr": "fieldtrack:api_error_budget_remaining_30d", + "legendFormat": "Remaining", + "refId": "A" + } + ] + }, + { + "title": "API Error Burn Rate (1h / 6h)", + "type": "timeseries", + "gridPos": { + "h": 8, + "w": 8, + "x": 8, + "y": 36 + }, + "datasource": { + "type": "prometheus", + "uid": "" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 15, + "lineWidth": 2 + }, + "unit": "percentunit" + }, + "overrides": [] + }, + "targets": [ + { + "expr": "fieldtrack:api_error_rate_1h", + "legendFormat": "1h", + "refId": "A" + }, + { + "expr": "fieldtrack:api_error_rate_6h", + "legendFormat": "6h", + "refId": "B" + } + ] + }, + { + "title": "Webhook Permanent Failure Rate (5m)", + "type": "stat", + "gridPos": { + "h": 8, + "w": 8, + "x": 16, + "y": 36 + }, + "datasource": { + "type": "prometheus", + "uid": "" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "unit": "percentunit", + "thresholds": { + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 0.1 + }, + { + "color": "red", + "value": 0.3 + } + ] + } + }, + "overrides": [] + }, + "targets": [ + 
{ + "expr": "fieldtrack:webhook_failure_rate_5m", + "legendFormat": "Failure Rate", + "refId": "A" + } + ] } ], "schemaVersion": 39, @@ -544,5 +676,5 @@ "timezone": "browser", "title": "FieldTrack 2.0 — Backend & System", "uid": "fieldtrack-backend", - "version": 1 + "version": 2 } \ No newline at end of file diff --git a/infra/nginx/fieldtrack.conf b/infra/nginx/fieldtrack.conf index 09f97e1..44a66a9 100644 --- a/infra/nginx/fieldtrack.conf +++ b/infra/nginx/fieldtrack.conf @@ -12,7 +12,7 @@ upstream fieldtrack_backend { keepalive 32; } -limit_req_zone $binary_remote_addr zone=fieldtrack_api:10m rate=30r/s; +limit_req_zone $binary_remote_addr zone=fieldtrack_api:10m rate=60r/s; limit_req_zone $binary_remote_addr zone=fieldtrack_health:10m rate=5r/s; # Cloudflare IPs @@ -86,7 +86,7 @@ server { listen 443 ssl; listen [::]:443 ssl; - # ❌ removed invalid http2 directive + http2 on; server_name __API_HOSTNAME__; diff --git a/infra/prometheus/alerts.yml b/infra/prometheus/alerts.yml index abe5b5e..6b9a2e2 100644 --- a/infra/prometheus/alerts.yml +++ b/infra/prometheus/alerts.yml @@ -1,5 +1,46 @@ groups: +# --------------------------------------------------------- +# RECORDING RULES +# --------------------------------------------------------- + +- name: fieldtrack_recording_rules + rules: + - record: fieldtrack:api_requests_rate_5m + expr: sum(rate(http_requests_total{job=~"fieldtrack-backend.*"}[5m])) + + - record: fieldtrack:api_errors_5xx_rate_5m + expr: sum(rate(http_requests_total{job=~"fieldtrack-backend.*",status_code=~"5.."}[5m])) + + - record: fieldtrack:api_error_rate_5m + expr: fieldtrack:api_errors_5xx_rate_5m / clamp_min(fieldtrack:api_requests_rate_5m, 1e-9) + + - record: fieldtrack:api_error_rate_1h + expr: | + sum(rate(http_requests_total{job=~"fieldtrack-backend.*",status_code=~"5.."}[1h])) + / + clamp_min(sum(rate(http_requests_total{job=~"fieldtrack-backend.*"}[1h])), 1e-9) + + - record: fieldtrack:api_error_rate_6h + expr: | + 
sum(rate(http_requests_total{job=~"fieldtrack-backend.*",status_code=~"5.."}[6h])) + / + clamp_min(sum(rate(http_requests_total{job=~"fieldtrack-backend.*"}[6h])), 1e-9) + + - record: fieldtrack:webhook_failure_rate_5m + expr: | + sum(rate(webhook_failures_total[5m])) + / + clamp_min(sum(rate(webhook_deliveries_total[5m])), 1e-9) + + - record: fieldtrack:api_error_budget_remaining_30d + expr: | + 1 - ( + sum(increase(http_requests_total{job=~"fieldtrack-backend.*",status_code=~"5.."}[30d])) + / + clamp_min(sum(increase(http_requests_total{job=~"fieldtrack-backend.*"}[30d])), 1) + ) + # --------------------------------------------------------- # API HEALTH # --------------------------------------------------------- @@ -8,17 +49,23 @@ groups: rules: - alert: FieldTrackHighErrorRate - expr: | - sum(rate(http_requests_total{job=~"fieldtrack-backend.*",status_code=~"5.."}[5m])) - / - sum(rate(http_requests_total{job=~"fieldtrack-backend.*"}[5m])) - > 0.05 + expr: fieldtrack:api_error_rate_5m > 0.05 for: 2m labels: severity: critical annotations: summary: "High API error rate" description: "5xx errors exceed 5%" + runbook_url: "https://github.com/fieldtrack/fieldtrack/blob/main/docs/ROLLBACK_QUICKREF.md" + runbook: | + Cause: Application throwing unhandled errors or DB/dependency failures. + Actions: + 1. Check container logs: docker logs fieldtrack-api --tail 200 + 2. Check /system-health endpoint from VPS + 3. Review recent deployments: git log --oneline -10 + 4. If DB: check Supabase dashboard for connection pool saturation + 5. If memory: check HostMemoryPressure alert and restart container + 6. Rollback if needed: see docs/ROLLBACK_QUICKREF.md - alert: FieldTrackHighLatency expr: | @@ -32,6 +79,15 @@ groups: annotations: summary: "High API latency" description: "p95 latency above 1 second" + runbook_url: "https://github.com/fieldtrack/fieldtrack/blob/main/docs/SLO.md" + runbook: | + Cause: Slow DB queries, queue contention, or upstream dependency latency. 
+ Actions: + 1. Open Grafana latency panel (p95/p99) and identify spike start time + 2. Check slow-response logs in Loki (`slow_response` and `very_slow_response`) + 3. Check DB load and connection saturation in Supabase dashboard + 4. Inspect queue backlogs via GET /admin/system-health + 5. Roll back recent deployment if latency regression started post-release - alert: FieldTrackAvgLatencyHigh expr: | @@ -45,6 +101,15 @@ groups: annotations: summary: "FieldTrack API latency exceeded threshold" description: "Average response time exceeded 500 ms for 5 minutes" + runbook_url: "https://github.com/fieldtrack/fieldtrack/blob/main/docs/SLO.md" + runbook: | + Cause: Sustained performance degradation across many routes. + Actions: + 1. Compare avg latency with p95/p99 to identify broad vs tail issue + 2. Review top routes by request rate and latency in Grafana + 3. Inspect backend logs for DB timeout and retry patterns + 4. Validate Redis and Supabase health via /ready and /system-health + 5. Trigger rollback if regression is tied to latest deploy # --------------------------------------------------------- # WORKER ALERTS @@ -61,6 +126,15 @@ groups: annotations: summary: "Distance worker jobs failing at high rate" description: "More than 3 distance recalculation jobs permanently failed in the last 5 minutes. Check Redis connectivity and the distance-engine queue." + runbook_url: "https://github.com/fieldtrack/fieldtrack/blob/main/docs/OBSERVABILITY_ARCHITECTURE.md" + runbook: | + Cause: Redis connectivity failure, Supabase query errors, or malformed GPS data. + Actions: + 1. Check Redis: redis-cli -u $REDIS_URL ping + 2. Check worker logs: docker logs fieldtrack-api | grep "Distance worker" + 3. Inspect failed queue: GET /admin/system-health (worker section) + 4. Replay stuck sessions via queue_retry_intents if needed + 5. 
Check for GPS point anomalies (MAX_POINTS_PER_SESSION exceeded) - alert: AnalyticsQueueBacklogGrowing expr: analytics_queue_depth > 500 @@ -69,6 +143,16 @@ groups: severity: warning annotations: summary: "Analytics queue backlog high" + description: "Analytics queue depth exceeded 500 for 5 minutes" + runbook_url: "https://github.com/fieldtrack/fieldtrack/blob/main/docs/OBSERVABILITY_ARCHITECTURE.md" + runbook: | + Cause: Worker throughput below enqueue rate or downstream DB contention. + Actions: + 1. Check analytics worker logs for repeated errors/timeouts + 2. Inspect queue depth in GET /admin/system-health + 3. Validate Redis latency and connection health + 4. Check Supabase CPU/connection pressure + 5. Temporarily scale worker concurrency if safe # Phase 22: Fire if more than 5 analytics jobs permanently fail within 5 minutes. # This indicates a systemic problem (bad DB schema change, Supabase outage, etc.) @@ -81,6 +165,15 @@ groups: annotations: summary: "Analytics jobs failing at high rate" description: "More than 5 analytics jobs permanently failed (exhausted all retries) in the last 5 minutes. Check the analytics-failed dead letter queue and worker logs." + runbook_url: "https://github.com/fieldtrack/fieldtrack/blob/main/docs/OBSERVABILITY_ARCHITECTURE.md" + runbook: | + Cause: Supabase schema change, DB connection exhaustion, or analytics aggregation bug. + Actions: + 1. Check worker logs: docker logs fieldtrack-api | grep "analytics" + 2. Inspect dead letter queue via GET /admin/system-health + 3. Verify DB schema: check employee_daily_metrics and org_daily_metrics tables + 4. If transient: failed jobs auto-expire after 72 h; monitor retry_intents_dead metric + 5. 
If persistent: hotfix deployment required — see docs/ROLLBACK_QUICKREF.md # --------------------------------------------------------- # HOST ALERTS @@ -96,6 +189,16 @@ groups: severity: warning annotations: summary: "High CPU usage" + description: "Host CPU usage above 85% for 5 minutes" + runbook_url: "https://github.com/fieldtrack/fieldtrack/blob/main/docs/OBSERVABILITY_ARCHITECTURE.md" + runbook: | + Cause: Traffic surge, runaway process, or expensive query loops. + Actions: + 1. Check top CPU consumers on host (`top`/`htop`) + 2. Correlate with request rate and queue depth in Grafana + 3. Inspect container logs for retry storms or hot loops + 4. Scale out backend replicas or reduce noisy traffic source + 5. Roll back if a recent deploy caused the spike - alert: HostMemoryPressure expr: | @@ -108,6 +211,16 @@ groups: severity: warning annotations: summary: "High memory usage" + description: "Host memory usage above 85% for 5 minutes" + runbook_url: "https://github.com/fieldtrack/fieldtrack/blob/main/docs/ROLLBACK_QUICKREF.md" + runbook: | + Cause: Memory leak, oversized cache, or traffic burst. + Actions: + 1. Inspect container RSS and heap charts in Grafana + 2. Check process logs for OOM warnings and GC pressure + 3. Restart affected container if memory does not recover + 4. If recurring post-deploy, roll back and open incident + 5. Confirm host swap/disk not under pressure simultaneously - alert: DiskAlmostFull expr: | @@ -121,6 +234,16 @@ groups: severity: critical annotations: summary: "Disk usage above 85%" + description: "Root filesystem usage exceeded 85% for 5 minutes" + runbook_url: "https://github.com/fieldtrack/fieldtrack/blob/main/docs/DEPLOYMENT.md" + runbook: | + Cause: Log growth, artifact buildup, or runaway temp files. + Actions: + 1. Identify large directories (`du -sh /*` on host) + 2. Rotate/prune Docker images and logs + 3. Verify Loki/Promtail retention settings + 4. Free space before deployment operations + 5. 
Increase disk capacity if growth trend persists # --------------------------------------------------------- # DEPLOYMENT & INFRASTRUCTURE ALERTS @@ -137,6 +260,15 @@ groups: annotations: summary: "Redis is unreachable" description: "Redis has been down for more than 2 minutes. BullMQ workers, rate limiting, and the auth context cache will all degrade until Redis recovers." + runbook_url: "https://github.com/fieldtrack/fieldtrack/blob/main/docs/ARCHITECTURE.md" + runbook: | + Cause: Redis container crash, OOM kill, or network partition. + Actions: + 1. Check container: docker ps | grep redis; docker logs redis --tail 50 + 2. Restart if crashed: docker restart redis (or docker compose up -d redis) + 3. Verify BullMQ reconnects: check worker logs after Redis recovery + 4. Rate limiting degrades gracefully (requests allowed through) during outage + 5. Circuit-breaker state is DB-backed and survives Redis restart - alert: DeploymentFailure expr: up{job=~"fieldtrack-backend.*"} == 0 @@ -146,6 +278,16 @@ groups: annotations: summary: "Backend container is down" description: "{{ $labels.job }} has been down for more than 2 minutes. Check deployment logs and container status." + runbook_url: "https://github.com/fieldtrack/fieldtrack/blob/main/docs/ROLLBACK_QUICKREF.md" + runbook: | + Cause: Container OOM, crash loop, failed deployment, or host issue. + Actions: + 1. Check status: docker ps -a | grep fieldtrack + 2. Inspect last 100 lines: docker logs fieldtrack-api --tail 100 + 3. Check exit code: docker inspect fieldtrack-api | jq '.[0].State' + 4. Restart if safe: docker restart fieldtrack-api + 5. Rollback if bad deploy: see docs/ROLLBACK_QUICKREF.md + 6. Check host memory/disk: node_memory and node_filesystem alerts - alert: ReadinessCheckFailing expr: probe_success{job="fieldtrack-readiness"} == 0 @@ -155,6 +297,15 @@ groups: annotations: summary: "Readiness check failing" description: "/ready endpoint has been failing for 3 minutes. 
Check DB, Redis, and Supabase connectivity." + runbook_url: "https://github.com/fieldtrack/fieldtrack/blob/main/docs/ROLLBACK_QUICKREF.md" + runbook: | + Cause: One or more hard dependencies unhealthy (Redis/Supabase/BullMQ). + Actions: + 1. Hit /ready and /health manually from VPS + 2. Check Redis ping and Supabase connectivity + 3. Inspect container logs for startup/recovery errors + 4. Check worker state in /admin/system-health + 5. Roll back if issue began immediately after deployment # --------------------------------------------------------- # TLS CERTIFICATE ALERTS @@ -171,6 +322,15 @@ groups: annotations: summary: "TLS certificate expiring within 14 days" description: "Certificate for {{ $labels.instance }} expires in less than 14 days. Renew via certbot." + runbook_url: "https://github.com/fieldtrack/fieldtrack/blob/main/docs/DEPLOYMENT.md" + runbook: | + Cause: Certificate nearing expiry date. + Actions: + 1. Verify expiry date using blackbox panel and `openssl s_client` + 2. Renew certificate (certbot or managed provider) + 3. Reload NGINX and confirm certificate chain + 4. Recheck probe_ssl_earliest_cert_expiry metric + 5. Confirm no stale cert served via CDN edge - alert: TLSCertExpired expr: probe_ssl_earliest_cert_expiry{job="fieldtrack-readiness"} - time() < 0 @@ -179,4 +339,221 @@ groups: severity: critical annotations: summary: "TLS certificate has expired" - description: "Certificate for {{ $labels.instance }} has expired. All HTTPS traffic is failing." \ No newline at end of file + description: "Certificate for {{ $labels.instance }} has expired. All HTTPS traffic is failing." + runbook_url: "https://github.com/fieldtrack/fieldtrack/blob/main/docs/DEPLOYMENT.md" + runbook: | + Cause: Certificate renewal failed or cert not reloaded. + Actions: + 1. Renew certificate immediately + 2. Reload NGINX and verify HTTPS handshake + 3. Validate Cloudflare/full-chain configuration + 4. Confirm /health and /ready are reachable over HTTPS + 5. 
Open incident and track customer impact window + +# --------------------------------------------------------- +# WEBHOOK DELIVERY SLOs (SLO 4 + SLO 5) +# See docs/SLO.md for full SLO definitions and error-budget +# burn-rate strategy. +# --------------------------------------------------------- + +- name: fieldtrack_webhook_slo_alerts + rules: + + # --- SLO 4: Webhook delivery permanent failure rate > 10% for 5 m (warning) -- + - alert: WebhookDeliveryFailureRateWarning + expr: fieldtrack:webhook_failure_rate_5m > 0.10 + for: 5m + labels: + severity: warning + annotations: + summary: "Webhook permanent failure rate above 10%" + description: >- + More than 10% of webhook deliveries are permanently failing (all retries + exhausted) over the last 5 minutes. Check receiver endpoints and circuit + breaker status. DLQ jobs can be replayed via POST /admin/webhook-dlq/:id/replay. + runbook_url: "https://github.com/fieldtrack/fieldtrack/blob/main/docs/WEBHOOK_SIGNATURES.md" + runbook: | + Cause: Elevated webhook failures for one or more receivers. + Actions: + 1. Check webhook worker logs for dominant error patterns + 2. Inspect DLQ depth and recent failed deliveries + 3. Confirm receiver endpoints are reachable and returning 2xx + 4. Check circuit breaker status in webhooks table + 5. Replay DLQ jobs after root cause is fixed + + # --- SLO 4: Webhook delivery permanent failure rate > 30% for 2 m (critical) - + - alert: WebhookDeliveryFailureRateHigh + expr: fieldtrack:webhook_failure_rate_5m > 0.30 + for: 2m + labels: + severity: critical + annotations: + summary: "Webhook permanent failure rate critically high (>30%)" + description: >- + Over 30% of webhook deliveries are permanently failing. This is a + customer-visible outage for all orgs with active webhooks. Investigate + immediately: check DB connectivity, receiver endpoints, and circuit breaker + state. 
+ runbook_url: "https://github.com/fieldtrack/fieldtrack/blob/main/docs/WEBHOOK_SIGNATURES.md" + runbook: | + Cause: Mass endpoint failures, DB outage, or a code bug in the delivery worker. + Actions: + 1. Check worker logs: docker logs fieldtrack-api | grep "webhook.worker" + 2. Inspect DLQ: GET /admin/webhook-dlq (admin token required) + 3. Check circuit breaker state: query webhooks table for circuit_open_until IS NOT NULL + 4. Replay DLQ entries after fixing root cause: POST /admin/webhook-dlq/:id/replay + 5. If DB issue: check Supabase dashboard, verify webhook_deliveries writes + 6. If code bug: rollback deployment — see docs/ROLLBACK_QUICKREF.md + + # --- SLO 5: DLQ depth above 100 for 30 min -------------------------------- + - alert: WebhookDlqGrowing + expr: dlq_size{queue="webhook-delivery-dlq"} > 100 + for: 30m + labels: + severity: warning + annotations: + summary: "Webhook DLQ depth above 100 for 30 minutes" + description: >- + The webhook dead-letter queue has had more than 100 unprocessed jobs for + 30 minutes. This indicates sustained delivery failures that exceed the + normal transient-failure pattern. Review DLQ via GET /admin/webhook-dlq + and replay or purge stale entries. + runbook_url: "https://github.com/fieldtrack/fieldtrack/blob/main/docs/OBSERVABILITY_ARCHITECTURE.md" + runbook: | + Cause: Persistent downstream delivery failures. + Actions: + 1. Review DLQ entries and identify repeated endpoint failures + 2. Confirm webhook receiver health and DNS/TLS validity + 3. Inspect retry/error metrics and circuit breaker audit entries + 4. Purge stale DLQ entries after archival is confirmed + 5. Replay jobs only after receivers are healthy + + # --- Circuit breaker: any webhook circuit opened (leading indicator) ------- + # + # webhook_failures_total counts permanent failures; a sudden spike often + # indicates a circuit breaker tripped. 
A short `for: 0m` (fires immediately) + # gives the earliest possible signal to investigate the affected endpoint. + - alert: WebhookCircuitBreakerOpened + expr: increase(webhook_failures_total[2m]) > 5 + for: 0m + labels: + severity: warning + annotations: + summary: "Webhook failure spike — possible circuit breaker activation" + description: >- + More than 5 permanent webhook failures occurred in the last 2 minutes. + A circuit breaker may have opened, pausing delivery to one or more + endpoints. Check circuit breaker state in webhook_deliveries and the + webhooks.circuit_open_until column. + runbook_url: "https://github.com/fieldtrack/fieldtrack/blob/main/docs/WEBHOOK_SIGNATURES.md" + runbook: | + Cause: Rapid repeated delivery failures triggered circuit breaker protection. + Actions: + 1. Query webhooks with circuit_open_until > now() + 2. Validate receiver status codes and timeout behavior + 3. Confirm auto-recovery scanner is running in worker logs + 4. Check whether failures are payload/size related vs network + 5. Re-enable/replay once endpoint stability is restored + + # --- Rate limit burst spike ----------------------------------------------- + - alert: RateLimitBurstSpike + expr: increase(security_rate_limit_hits_total[5m]) > 500 + for: 2m + labels: + severity: warning + annotations: + summary: "Rate limiter blocking unusually high request volume" + description: >- + More than 500 requests were rate-limited in the last 5 minutes. This + may indicate a misconfigured client, a burst from a single org, or the + start of a DoS attempt. Review the rate-limit logs to identify the + offending org / IP. + runbook_url: "https://github.com/fieldtrack/fieldtrack/blob/main/docs/ARCHITECTURE.md" + runbook: | + Cause: Burst traffic beyond per-user/per-org sliding window limits. + Actions: + 1. Inspect rate-limit logs for top offending keys + 2. Confirm traffic is expected (batch job) vs malicious + 3. Check Redis health to ensure limiter is functioning correctly + 4. 
Apply temporary edge-level mitigation if attack suspected + 5. Tune per-org/per-user thresholds only with incident review + +# --------------------------------------------------------- +# API ERROR BUDGET BURN RATE (SLO 3 multi-window alerting) +# See docs/SLO.md §Error Budget Alert Strategy +# --------------------------------------------------------- + +- name: fieldtrack_slo_error_budget + rules: + + # Fast burn: 1 h window at 14x burn rate (>14% error rate) + # exhausts monthly error budget in ~2 days if sustained. + - alert: FieldTrackSloErrorBudgetBurnFast + expr: fieldtrack:api_error_rate_1h > 0.14 + for: 5m + labels: + severity: critical + annotations: + summary: "API error budget burning fast (14x rate)" + description: >- + The 1-hour error rate exceeds 14% (14x normal budget burn). At this + rate the monthly error budget will be exhausted in under 2 days. + Investigate 5xx errors immediately - check logs, DB connectivity, and + recent deployments. + runbook_url: "https://github.com/fieldtrack/fieldtrack/blob/main/docs/SLO.md" + runbook: | + Cause: Sustained high error rate burning error budget at 14x the normal rate. + Actions: + 1. Identify failing routes: check Grafana → FieldTrack API dashboard + 2. Check container logs for exceptions: docker logs fieldtrack-api --tail 500 + 3. Check DB connectivity: /ready endpoint from VPS + 4. If recent deploy: rollback immediately — see docs/ROLLBACK_QUICKREF.md + 5. Open an incident; notify stakeholders if budget < 50% + + # Slow burn: 6 h window at 6x burn rate (>6% error rate) + # exhausts monthly error budget in ~5 days if sustained. + - alert: FieldTrackSloErrorBudgetBurnSlow + expr: fieldtrack:api_error_rate_6h > 0.06 + for: 15m + labels: + severity: warning + annotations: + summary: "API error budget burning (6x rate over 6 h)" + description: >- + The 6-hour error rate exceeds 6% (6x normal budget burn). Open a ticket + and investigate the root cause before the error budget is exhausted. 
+ runbook_url: "https://github.com/fieldtrack/fieldtrack/blob/main/docs/SLO.md" + runbook: | + Cause: Sustained elevated 5xx errors over a long window. + Actions: + 1. Review error budget remaining metric on Grafana dashboard + 2. Identify top failing routes and error classes + 3. Correlate with deployments and infra incidents + 4. Open reliability ticket and assign owner + 5. Plan mitigations before entering critical burn threshold + + # p99 latency SLO breach - 2 s threshold (SLO 2) + - alert: FieldTrackLatencyP99High + expr: | + histogram_quantile( + 0.99, + sum(rate(http_request_duration_seconds_bucket{job=~"fieldtrack-backend.*"}[10m])) by (le) + ) > 2 + for: 10m + labels: + severity: warning + annotations: + summary: "API p99 latency above 2 s (SLO 2 breach)" + description: >- + The 99th-percentile API response time has been above 2 seconds for 10 + minutes. This breaches the p99 latency SLO defined in docs/SLO.md. + Check slow queries, worker queue depths, and DB connection pool saturation. + runbook_url: "https://github.com/fieldtrack/fieldtrack/blob/main/docs/SLO.md" + runbook: | + Cause: Tail-latency degradation affecting a subset of requests. + Actions: + 1. Inspect p99 panel and compare with p95 for tail amplification + 2. Review very_slow_response logs for route-level concentration + 3. Check DB wait events and queue backlog growth + 4. Reduce load or scale services if saturation detected + 5. 
Roll back if latency regression tracks a release \ No newline at end of file diff --git a/package-lock.json b/package-lock.json index 4bbc840..5a07342 100644 --- a/package-lock.json +++ b/package-lock.json @@ -4068,6 +4068,7 @@ "version": "0.1.0", "dependencies": { "@fieldtrack/types": "*", + "@radix-ui/react-alert-dialog": "^1.1.15", "@radix-ui/react-avatar": "^1.1.2", "@radix-ui/react-dialog": "^1.1.4", "@radix-ui/react-dropdown-menu": "^2.1.4", @@ -4081,10 +4082,12 @@ "@supabase/supabase-js": "^2.46.2", "@tanstack/react-query": "^5.62.7", "@types/leaflet": "^1.9.21", + "@types/leaflet.markercluster": "^1.5.6", "class-variance-authority": "^0.7.1", "clsx": "^2.1.1", "framer-motion": "^12.36.0", "leaflet": "^1.9.4", + "leaflet.markercluster": "^1.5.3", "lucide-react": "^0.468.0", "mapbox-gl": "^3.8.0", "next": "^15.1.3", @@ -4199,92 +4202,6 @@ } } }, - "apps/web/node_modules/@radix-ui/react-dialog": { - "version": "1.1.15", - "license": "MIT", - "dependencies": { - "@radix-ui/primitive": "1.1.3", - "@radix-ui/react-compose-refs": "1.1.2", - "@radix-ui/react-context": "1.1.2", - "@radix-ui/react-dismissable-layer": "1.1.11", - "@radix-ui/react-focus-guards": "1.1.3", - "@radix-ui/react-focus-scope": "1.1.7", - "@radix-ui/react-id": "1.1.1", - "@radix-ui/react-portal": "1.1.9", - "@radix-ui/react-presence": "1.1.5", - "@radix-ui/react-primitive": "2.1.3", - "@radix-ui/react-slot": "1.2.3", - "@radix-ui/react-use-controllable-state": "1.2.2", - "aria-hidden": "^1.2.4", - "react-remove-scroll": "^2.6.3" - }, - "peerDependencies": { - "@types/react": "*", - "@types/react-dom": "*", - "react": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc", - "react-dom": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc" - }, - "peerDependenciesMeta": { - "@types/react": { - "optional": true - }, - "@types/react-dom": { - "optional": true - } - } - }, - "apps/web/node_modules/@radix-ui/react-dialog/node_modules/@radix-ui/react-context": { - "version": "1.1.2", - "license": "MIT", - 
"peerDependencies": { - "@types/react": "*", - "react": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc" - }, - "peerDependenciesMeta": { - "@types/react": { - "optional": true - } - } - }, - "apps/web/node_modules/@radix-ui/react-dialog/node_modules/@radix-ui/react-primitive": { - "version": "2.1.3", - "resolved": "https://registry.npmjs.org/@radix-ui/react-primitive/-/react-primitive-2.1.3.tgz", - "integrity": "sha512-m9gTwRkhy2lvCPe6QJp4d3G1TYEUHn/FzJUtq9MjH46an1wJU+GdoGC5VLof8RX8Ft/DlpshApkhswDLZzHIcQ==", - "license": "MIT", - "dependencies": { - "@radix-ui/react-slot": "1.2.3" - }, - "peerDependencies": { - "@types/react": "*", - "@types/react-dom": "*", - "react": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc", - "react-dom": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc" - }, - "peerDependenciesMeta": { - "@types/react": { - "optional": true - }, - "@types/react-dom": { - "optional": true - } - } - }, - "apps/web/node_modules/@radix-ui/react-dialog/node_modules/@radix-ui/react-slot": { - "version": "1.2.3", - "license": "MIT", - "dependencies": { - "@radix-ui/react-compose-refs": "1.1.2" - }, - "peerDependencies": { - "@types/react": "*", - "react": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc" - }, - "peerDependenciesMeta": { - "@types/react": { - "optional": true - } - } - }, "apps/web/node_modules/@radix-ui/react-dropdown-menu": { "version": "2.1.16", "license": "MIT", @@ -7850,6 +7767,57 @@ "integrity": "sha512-JTF99U/6XIjCBo0wqkU5sK10glYe27MRRsfwoiq5zzOEZLHU3A3KCMa5X/azekYRCJ0HlwI0crAXS/5dEHTzDg==", "license": "MIT" }, + "node_modules/@radix-ui/react-alert-dialog": { + "version": "1.1.15", + "resolved": "https://registry.npmjs.org/@radix-ui/react-alert-dialog/-/react-alert-dialog-1.1.15.tgz", + "integrity": "sha512-oTVLkEw5GpdRe29BqJ0LSDFWI3qu0vR1M0mUkOQWDIUnY/QIkLpgDMWuKxP94c2NAC2LGcgVhG1ImF3jkZ5wXw==", + "license": "MIT", + "dependencies": { + "@radix-ui/primitive": "1.1.3", + "@radix-ui/react-compose-refs": "1.1.2", + 
"@radix-ui/react-context": "1.1.2", + "@radix-ui/react-dialog": "1.1.15", + "@radix-ui/react-primitive": "2.1.3", + "@radix-ui/react-slot": "1.2.3" + }, + "peerDependencies": { + "@types/react": "*", + "@types/react-dom": "*", + "react": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc", + "react-dom": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc" + }, + "peerDependenciesMeta": { + "@types/react": { + "optional": true + }, + "@types/react-dom": { + "optional": true + } + } + }, + "node_modules/@radix-ui/react-alert-dialog/node_modules/@radix-ui/react-primitive": { + "version": "2.1.3", + "resolved": "https://registry.npmjs.org/@radix-ui/react-primitive/-/react-primitive-2.1.3.tgz", + "integrity": "sha512-m9gTwRkhy2lvCPe6QJp4d3G1TYEUHn/FzJUtq9MjH46an1wJU+GdoGC5VLof8RX8Ft/DlpshApkhswDLZzHIcQ==", + "license": "MIT", + "dependencies": { + "@radix-ui/react-slot": "1.2.3" + }, + "peerDependencies": { + "@types/react": "*", + "@types/react-dom": "*", + "react": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc", + "react-dom": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc" + }, + "peerDependenciesMeta": { + "@types/react": { + "optional": true + }, + "@types/react-dom": { + "optional": true + } + } + }, "node_modules/@radix-ui/react-arrow": { "version": "1.1.7", "resolved": "https://registry.npmjs.org/@radix-ui/react-arrow/-/react-arrow-1.1.7.tgz", @@ -7975,6 +7943,65 @@ } } }, + "node_modules/@radix-ui/react-dialog": { + "version": "1.1.15", + "resolved": "https://registry.npmjs.org/@radix-ui/react-dialog/-/react-dialog-1.1.15.tgz", + "integrity": "sha512-TCglVRtzlffRNxRMEyR36DGBLJpeusFcgMVD9PZEzAKnUs1lKCgX5u9BmC2Yg+LL9MgZDugFFs1Vl+Jp4t/PGw==", + "license": "MIT", + "dependencies": { + "@radix-ui/primitive": "1.1.3", + "@radix-ui/react-compose-refs": "1.1.2", + "@radix-ui/react-context": "1.1.2", + "@radix-ui/react-dismissable-layer": "1.1.11", + "@radix-ui/react-focus-guards": "1.1.3", + "@radix-ui/react-focus-scope": "1.1.7", + "@radix-ui/react-id": "1.1.1", + 
"@radix-ui/react-portal": "1.1.9", + "@radix-ui/react-presence": "1.1.5", + "@radix-ui/react-primitive": "2.1.3", + "@radix-ui/react-slot": "1.2.3", + "@radix-ui/react-use-controllable-state": "1.2.2", + "aria-hidden": "^1.2.4", + "react-remove-scroll": "^2.6.3" + }, + "peerDependencies": { + "@types/react": "*", + "@types/react-dom": "*", + "react": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc", + "react-dom": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc" + }, + "peerDependenciesMeta": { + "@types/react": { + "optional": true + }, + "@types/react-dom": { + "optional": true + } + } + }, + "node_modules/@radix-ui/react-dialog/node_modules/@radix-ui/react-primitive": { + "version": "2.1.3", + "resolved": "https://registry.npmjs.org/@radix-ui/react-primitive/-/react-primitive-2.1.3.tgz", + "integrity": "sha512-m9gTwRkhy2lvCPe6QJp4d3G1TYEUHn/FzJUtq9MjH46an1wJU+GdoGC5VLof8RX8Ft/DlpshApkhswDLZzHIcQ==", + "license": "MIT", + "dependencies": { + "@radix-ui/react-slot": "1.2.3" + }, + "peerDependencies": { + "@types/react": "*", + "@types/react-dom": "*", + "react": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc", + "react-dom": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc" + }, + "peerDependenciesMeta": { + "@types/react": { + "optional": true + }, + "@types/react-dom": { + "optional": true + } + } + }, "node_modules/@radix-ui/react-direction": { "version": "1.1.1", "resolved": "https://registry.npmjs.org/@radix-ui/react-direction/-/react-direction-1.1.1.tgz", @@ -9203,6 +9230,15 @@ "@types/geojson": "*" } }, + "node_modules/@types/leaflet.markercluster": { + "version": "1.5.6", + "resolved": "https://registry.npmjs.org/@types/leaflet.markercluster/-/leaflet.markercluster-1.5.6.tgz", + "integrity": "sha512-I7hZjO2+isVXGYWzKxBp8PsCzAYCJBc29qBdFpquOCkS7zFDqUsUvkEOyQHedsk/Cy5tocQzf+Ndorm5W9YKTQ==", + "license": "MIT", + "dependencies": { + "@types/leaflet": "^1.9" + } + }, "node_modules/@types/react": { "version": "19.2.14", "resolved": 
"https://registry.npmjs.org/@types/react/-/react-19.2.14.tgz", @@ -13377,6 +13413,15 @@ "integrity": "sha512-nxS1ynzJOmOlHp+iL3FyWqK89GtNL8U8rvlMOsQdTTssxZwCXh8N2NB3GDQOL+YR3XnWyZAxwQixURb+FA74PA==", "license": "BSD-2-Clause" }, + "node_modules/leaflet.markercluster": { + "version": "1.5.3", + "resolved": "https://registry.npmjs.org/leaflet.markercluster/-/leaflet.markercluster-1.5.3.tgz", + "integrity": "sha512-vPTw/Bndq7eQHjLBVlWpnGeLa3t+3zGiuM7fJwCkiMFq+nmRuG3RI3f7f4N4TDX7T4NpbAXpR2+NTRSEGfCSeA==", + "license": "MIT", + "peerDependencies": { + "leaflet": "^1.3.1" + } + }, "node_modules/levn": { "version": "0.4.1", "resolved": "https://registry.npmjs.org/levn/-/levn-0.4.1.tgz", diff --git a/run_api_smoke.ps1 b/run_api_smoke.ps1 new file mode 100644 index 0000000..fa23f9d --- /dev/null +++ b/run_api_smoke.ps1 @@ -0,0 +1,90 @@ +$ErrorActionPreference='Continue' +$artifact='apps/api/dist/server.js' +if (-not (Test-Path $artifact)) { + Write-Output "ARTIFACT_MISSING:$artifact" + exit 1 +} +$stdout=Join-Path $PWD 'api_smoke_stdout.log' +$stderr=Join-Path $PWD 'api_smoke_stderr.log' +Remove-Item $stdout,$stderr -ErrorAction SilentlyContinue +$proc=Start-Process -FilePath node -ArgumentList $artifact -PassThru -RedirectStandardOutput $stdout -RedirectStandardError $stderr -WorkingDirectory $PWD +Write-Output "STARTED_PID:$($proc.Id)" +$started=$false +for ($i=0; $i -lt 20; $i++) { + Start-Sleep -Milliseconds 500 + $proc.Refresh() + if ($proc.HasExited) { break } + try { + $null=Invoke-WebRequest -Uri 'http://127.0.0.1:3000/health' -UseBasicParsing -TimeoutSec 2 + $started=$true + break + } catch { } +} +$proc.Refresh() +if ($proc.HasExited) { + Write-Output "STARTUP_FAILED:Process exited with code $($proc.ExitCode)" + Write-Output 'STDERR_BEGIN' + if (Test-Path $stderr) { Get-Content $stderr -Tail 200 } + Write-Output 'STDERR_END' + Write-Output 'STDOUT_BEGIN' + if (Test-Path $stdout) { Get-Content $stdout -Tail 200 } + Write-Output 'STDOUT_END' + Write-Output 
'PROBES_SKIPPED:true' + exit 0 +} +if ($started) { Write-Output 'STARTUP_CONFIRMED:true' } else { Write-Output 'STARTUP_UNCONFIRMED:true' } +$urls=@( + 'http://127.0.0.1:3000/health', + 'http://127.0.0.1:3000/ready', + 'http://127.0.0.1:3000/metrics', + 'http://127.0.0.1:3000/admin/system-health' +) +foreach ($u in $urls) { + try { + $resp=Invoke-WebRequest -Uri $u -UseBasicParsing -TimeoutSec 5 -MaximumRedirection 0 + $body=$resp.Content + if ($null -eq $body) { $body='' } + $snippet=($body.Substring(0, [Math]::Min(200, $body.Length)) -replace "`r|`n", ' ') + Write-Output "PROBE:$u STATUS:$([int]$resp.StatusCode) OK:true BODY:$snippet" + } catch { + $ex=$_.Exception + $status='' + $body='' + if ($ex.Response) { + try { $status=[int]$ex.Response.StatusCode } catch { $status='' } + try { + $stream=$ex.Response.GetResponseStream() + if ($stream) { + $reader=New-Object System.IO.StreamReader($stream) + $body=$reader.ReadToEnd() + $reader.Close() + } + } catch { } + } + $msg=($ex.Message -replace "`r|`n", ' ') + $snippet='' + if ($body) { $snippet=($body.Substring(0, [Math]::Min(200, $body.Length)) -replace "`r|`n", ' ') } + Write-Output "PROBE:$u STATUS:$status OK:false ERROR:$msg BODY:$snippet" + } +} +if (-not $proc.HasExited) { + try { + Stop-Process -Id $proc.Id + Start-Sleep -Milliseconds 500 + } catch { + Write-Output "STOP_ERROR:$($_.Exception.Message)" + } +} +$proc.Refresh() +Write-Output "PROCESS_EXITED:$($proc.HasExited)" +if (-not $proc.HasExited) { + try { + $proc.Kill() + Write-Output 'KILLED:true' + } catch { + Write-Output "KILL_ERROR:$($_.Exception.Message)" + } +} +Write-Output 'STDERR_TAIL_BEGIN' +if (Test-Path $stderr) { Get-Content $stderr -Tail 80 } +Write-Output 'STDERR_TAIL_END' diff --git a/supabase/migrations/20260328134113_add_admin_audit_log.sql b/supabase/migrations/20260328134113_add_admin_audit_log.sql new file mode 100644 index 0000000..1a7ad58 --- /dev/null +++ b/supabase/migrations/20260328134113_add_admin_audit_log.sql @@ -0,0 +1,25 
@@ +-- Migration: add_admin_audit_log +-- Creates an immutable audit trail for admin actions (DLQ replays, circuit +-- breaker events, etc.). Written via the service client; RLS is added by the +-- phase29 hardening migration. API-layer auth restricts who can read or trigger events. + +CREATE TABLE IF NOT EXISTS public.admin_audit_log ( + id UUID PRIMARY KEY DEFAULT gen_random_uuid(), + event TEXT NOT NULL, + actor_id UUID, + organization_id UUID, + resource_type TEXT, + resource_id TEXT, + payload JSONB NOT NULL DEFAULT '{}'::jsonb, + created_at TIMESTAMPTZ NOT NULL DEFAULT now() +); + +-- created_at is indexed newest-first (DESC) so page queries (before=) stay fast. +CREATE INDEX IF NOT EXISTS idx_admin_audit_log_org_created + ON public.admin_audit_log (organization_id, created_at DESC); + +CREATE INDEX IF NOT EXISTS idx_admin_audit_log_event_created + ON public.admin_audit_log (event, created_at DESC); + +COMMENT ON TABLE public.admin_audit_log IS + 'Immutable audit trail of privileged admin actions (DLQ replays, circuit breaker state changes, etc.)'; diff --git a/supabase/migrations/20260328134130_circuit_breaker_persistence.sql b/supabase/migrations/20260328134130_circuit_breaker_persistence.sql new file mode 100644 index 0000000..3ff7731 --- /dev/null +++ b/supabase/migrations/20260328134130_circuit_breaker_persistence.sql @@ -0,0 +1,31 @@ +-- Migration: persist circuit-breaker state on the webhooks table. +-- +-- Problem: failure_streak and disabled_until are currently Redis-only. +-- A Redis restart or eviction loses all in-flight streak data, allowing a +-- misbehaving endpoint to reset its consecutive-failure count for free. +-- +-- Solution: +-- failure_streak INT — mirrors cb:failure_streak:{id} in Redis +-- circuit_open_until TIMESTAMPTZ NULL — set when circuit is OPEN, +-- NULL when CLOSED/HALF-OPEN +-- +-- The application layer treats DB as the authoritative source of truth on +-- cold-start; Redis is the hot-path cache.
On each process start, a sync +-- function reads all webhooks with circuit_open_until IS NOT NULL and +-- re-populates the Redis cooldown key so delivery workers respect open +-- circuits even after a Redis flush. + +ALTER TABLE public.webhooks + ADD COLUMN IF NOT EXISTS failure_streak INT NOT NULL DEFAULT 0, + ADD COLUMN IF NOT EXISTS circuit_open_until TIMESTAMPTZ; + +-- Index so the startup sync query can find open circuits quickly. +CREATE INDEX IF NOT EXISTS idx_webhooks_circuit_open + ON public.webhooks (circuit_open_until) + WHERE circuit_open_until IS NOT NULL; + +COMMENT ON COLUMN public.webhooks.failure_streak IS + 'Consecutive delivery failures (Redis-mirrored). Resets on any successful delivery.'; + +COMMENT ON COLUMN public.webhooks.circuit_open_until IS + 'When non-NULL the circuit is OPEN and no deliveries are attempted until this timestamp.'; diff --git a/supabase/migrations/20260328134140_webhook_dlq_archive.sql b/supabase/migrations/20260328134140_webhook_dlq_archive.sql new file mode 100644 index 0000000..34cb394 --- /dev/null +++ b/supabase/migrations/20260328134140_webhook_dlq_archive.sql @@ -0,0 +1,30 @@ +-- Migration: webhook DLQ archival and retention support. +-- +-- DLQ entries currently accumulate indefinitely in BullMQ (Redis). +-- When a job ages out of the retention window, the application archives +-- a snapshot to this table before removing the BullMQ job. This gives +-- operators a permanent, queryable history without unbounded Redis growth. +-- +-- Schema is intentionally write-once (no updates, no deletes) so the +-- table acts as an immutable audit trail. 
+ +CREATE TABLE IF NOT EXISTS public.webhook_dlq_archive ( + id UUID PRIMARY KEY DEFAULT gen_random_uuid(), + delivery_id TEXT NOT NULL, + webhook_id TEXT NOT NULL, + event_id TEXT NOT NULL, + url TEXT NOT NULL, + attempt_number INT NOT NULL, + failed_at TIMESTAMPTZ NOT NULL, + archived_at TIMESTAMPTZ NOT NULL DEFAULT now(), + reason TEXT NOT NULL DEFAULT 'retention_policy' +); + +CREATE INDEX IF NOT EXISTS idx_dlq_archive_webhook_id + ON public.webhook_dlq_archive (webhook_id, archived_at DESC); + +CREATE INDEX IF NOT EXISTS idx_dlq_archive_archived_at + ON public.webhook_dlq_archive (archived_at DESC); + +COMMENT ON TABLE public.webhook_dlq_archive IS + 'Immutable archive of DLQ jobs removed by the retention policy or manually purged.'; diff --git a/supabase/migrations/20260328135403_phase29_hardening_rls_and_search_path.sql b/supabase/migrations/20260328135403_phase29_hardening_rls_and_search_path.sql new file mode 100644 index 0000000..f234194 --- /dev/null +++ b/supabase/migrations/20260328135403_phase29_hardening_rls_and_search_path.sql @@ -0,0 +1,65 @@ +-- Phase 29: migration reconciliation hardening +-- 1) Enable RLS on newly introduced public tables +-- 2) Add explicit policies for service_role and authenticated admin reads +-- 3) Lock trigger function search_path + +alter table if exists public.admin_audit_log enable row level security; +alter table if exists public.webhook_dlq_archive enable row level security; + +drop policy if exists service_role_only_admin_audit_log on public.admin_audit_log; +create policy service_role_only_admin_audit_log + on public.admin_audit_log + for all + to service_role + using (true) + with check (true); + +drop policy if exists admin_read_admin_audit_log on public.admin_audit_log; +create policy admin_read_admin_audit_log + on public.admin_audit_log + for select + to authenticated + using ( + organization_id = ( + select u.organization_id + from public.users u + where u.id = (select auth.uid()) + ) + and ( + select u.role + 
from public.users u + where u.id = (select auth.uid()) + ) = 'ADMIN' + ); + +drop policy if exists service_role_only_webhook_dlq_archive on public.webhook_dlq_archive; +create policy service_role_only_webhook_dlq_archive + on public.webhook_dlq_archive + for all + to service_role + using (true) + with check (true); + +drop policy if exists admin_read_webhook_dlq_archive on public.webhook_dlq_archive; +create policy admin_read_webhook_dlq_archive + on public.webhook_dlq_archive + for select + to authenticated + using ( + webhook_id in ( + select w.id::text + from public.webhooks w + where w.organization_id = ( + select u.organization_id + from public.users u + where u.id = (select auth.uid()) + ) + ) + and ( + select u.role + from public.users u + where u.id = (select auth.uid()) + ) = 'ADMIN' + ); + +alter function public.set_updated_at() set search_path = public, pg_temp; From c6e6b47a2b4dead7a2b0d8b898bc314e1fa0b27b Mon Sep 17 00:00:00 2001 From: rajashish147 Date: Sat, 28 Mar 2026 21:17:33 +0530 Subject: [PATCH 2/3] feat(webhook): implement Dead-Letter Queue (DLQ) management routes and services --- apps/api/src/app.ts | 2 - .../src/modules/admin/webhook-dlq.routes.ts | 144 +++++++----------- .../modules/webhooks/webhooks.repository.ts | 70 +++++++++ .../src/modules/webhooks/webhooks.schema.ts | 27 ++++ .../src/modules/webhooks/webhooks.service.ts | 9 ++ apps/api/src/routes/index.ts | 2 + .../admin/webhooks.integration.test.ts | 137 +++++++++++++++++ 7 files changed, 297 insertions(+), 94 deletions(-) diff --git a/apps/api/src/app.ts b/apps/api/src/app.ts index 60f0177..91ff495 100644 --- a/apps/api/src/app.ts +++ b/apps/api/src/app.ts @@ -195,11 +195,9 @@ export async function buildApp(): Promise { const { adminQueuesRoutes } = await import("./modules/admin/queues.routes.js"); const { adminRetryIntentsRoutes } = await import("./modules/admin/retry-intents.routes.js"); const { systemHealthRoutes } = await import("./modules/admin/system-health.routes.js"); - const 
{ webhookDlqRoutes } = await import("./modules/admin/webhook-dlq.routes.js"); await app.register(adminQueuesRoutes); await app.register(adminRetryIntentsRoutes); await app.register(systemHealthRoutes); - await app.register(webhookDlqRoutes); } // Admin audit log — not worker-gated (pure DB, no Redis required). diff --git a/apps/api/src/modules/admin/webhook-dlq.routes.ts b/apps/api/src/modules/admin/webhook-dlq.routes.ts index bae65c3..3172057 100644 --- a/apps/api/src/modules/admin/webhook-dlq.routes.ts +++ b/apps/api/src/modules/admin/webhook-dlq.routes.ts @@ -1,35 +1,44 @@ /** - * webhook-dlq.routes.ts — Admin API for Dead-Letter Queue (DLQ) management. + * webhook-dlq.routes.ts — Admin API for failed webhook deliveries. * - * GET /admin/webhook-dlq — list DLQ jobs pending review - * POST /admin/webhook-dlq/:id/replay — replay a single DLQ job (reset attempt_count) + * GET /admin/webhook-dlq — list failed webhook deliveries for this org + * POST /admin/webhook-dlq/:id/retry — retry a failed delivery * - * All routes require ADMIN role (JWT + RBAC). - * Only available when WORKERS_ENABLED=true (registered from app.ts). - * - * Replay semantics: - * - Removes the job from the DLQ - * - Re-enqueues into the main webhook-delivery queue with attempt_number=1 - * - Resets attempt_count in DB to allow full retry schedule - * - Logs a structured audit entry on every replay + * This route is DB-backed off `public.webhook_deliveries`, not the in-memory + * BullMQ DLQ queue. It is always registered so unauthenticated callers receive + * 401 instead of 404 even when workers are disabled. 
*/ import type { FastifyInstance } from "fastify"; import { z } from "zod"; import { authenticate } from "../../middleware/auth.js"; import { requireRole } from "../../middleware/role-guard.js"; -import { - replayWebhookDlqJob, - listWebhookDlqJobs, - getWebhookDlqDepth, -} from "../../workers/webhook.queue.js"; -import { supabaseServiceClient as supabase } from "../../config/supabase.js"; -import { NotFoundError } from "../../utils/errors.js"; +import { webhooksService } from "../webhooks/webhooks.service.js"; import { handleError } from "../../utils/response.js"; -import { insertAuditRecord } from "../../utils/audit.js"; +import { + dlqListQuerySchema, + webhookDlqDeliverySchema, +} from "../webhooks/webhooks.schema.js"; -const DLQ_REPLAY_COOLDOWN_MS = 5_000; -let lastDlqReplayAt = 0; +const dlqListResponseSchema = z.object({ + success: z.literal(true), + data: z.array(webhookDlqDeliverySchema), + meta: z.object({ + limit: z.number().int(), + offset: z.number().int(), + count: z.number().int(), + }), +}); + +const dlqRetryResponseSchema = z.object({ + success: z.literal(true), + data: z.object({ + id: z.string().uuid(), + status: z.string(), + attempt_count: z.number(), + next_retry_at: z.string().nullable(), + }), +}); export async function webhookDlqRoutes(app: FastifyInstance): Promise<void> { // ── GET /admin/webhook-dlq ───────────────────────────────────────────────── @@ -38,108 +47,59 @@ export async function webhookDlqRoutes(app: FastifyInstance): Promise<void> { { schema: { tags: ["admin", "webhooks"], - description: "List jobs in the webhook Dead-Letter Queue (ADMIN only).", - querystring: z.object({ - limit: z.coerce.number().int().min(1).max(100).default(50), - }), + description: "List failed webhook deliveries for this organization (ADMIN only).", + querystring: dlqListQuerySchema, + response: { 200: dlqListResponseSchema }, }, preValidation: [authenticate, requireRole("ADMIN")], }, async (request, reply) => { try { - const { limit } = request.query as { limit: 
number }; - const [jobs, depth] = await Promise.all([ - listWebhookDlqJobs(limit), - getWebhookDlqDepth(), - ]); + const query = dlqListQuerySchema.parse(request.query); + const { data, total } = await webhooksService.listDlqDeliveries(request, query); reply.status(200).send({ success: true, - dlq_depth: depth, - jobs, + data, + meta: { + limit: query.limit, + offset: query.offset, + count: total, + }, }); } catch (error) { - handleError(error, request, reply, "Failed to list DLQ jobs"); + handleError(error, request, reply, "Failed to list DLQ deliveries"); } }, ); - // ── POST /admin/webhook-dlq/:id/replay ──────────────────────────────────── + // ── POST /admin/webhook-dlq/:id/retry ───────────────────────────────────── app.post<{ Params: { id: string } }>( - "/admin/webhook-dlq/:id/replay", + "/admin/webhook-dlq/:id/retry", { schema: { tags: ["admin", "webhooks"], - description: "Replay a DLQ job: re-enqueue with attempt_count reset (ADMIN only).", + description: "Retry a failed webhook delivery (ADMIN only).", params: z.object({ id: z.string().uuid() }), + response: { 200: dlqRetryResponseSchema }, }, preValidation: [authenticate, requireRole("ADMIN")], }, async (request, reply) => { try { const { id: deliveryId } = request.params; - const adminId = (request as { user?: { sub?: string } }).user?.sub; - const orgId = (request as { organizationId?: string }).organizationId; - - // Per-admin replay cooldown — prevents accidental mass re-delivery - const now = Date.now(); - const elapsed = now - lastDlqReplayAt; - if (elapsed < DLQ_REPLAY_COOLDOWN_MS) { - reply.status(429).send({ - success: false, - error: `DLQ replay rate-limited. 
Retry in ${DLQ_REPLAY_COOLDOWN_MS - elapsed}ms.`, - }); - return; - } - lastDlqReplayAt = now; - - const replayed = await replayWebhookDlqJob(deliveryId); - if (!replayed) { - throw new NotFoundError(`DLQ job for delivery ${deliveryId} not found`); - } - - // Reset attempt_count in DB so the full retry schedule applies - await supabase - .from("webhook_deliveries") - .update({ - status: "pending", - attempt_count: 0, - next_retry_at: new Date().toISOString(), - }) - .eq("id", deliveryId); - - // Structured audit log — queryable in Grafana/Loki - request.log.info( - { - audit: true, - event: "WEBHOOK_DLQ_REPLAY", - deliveryId, - adminId, - organizationId: orgId, - timestamp: new Date().toISOString(), - }, - "webhook-dlq: DLQ job replayed by admin", - ); - - // Persist to DB audit trail for GET /admin/audit-log - await insertAuditRecord({ - event: "WEBHOOK_DLQ_REPLAY", - actor_id: adminId, - organization_id: orgId, - resource_type: "webhook_delivery", - resource_id: deliveryId, - payload: { replayed_at: new Date().toISOString() }, - }); + const delivery = await webhooksService.retryDelivery(request, deliveryId); reply.status(200).send({ success: true, data: { - delivery_id: deliveryId, - replayed_at: new Date().toISOString(), - message: "Job re-queued with attempt_count reset", + id: delivery.id, + status: delivery.status, + attempt_count: delivery.attempt_count, + next_retry_at: delivery.next_retry_at, }, }); } catch (error) { - handleError(error, request, reply, "Failed to replay DLQ job"); + handleError(error, request, reply, "Failed to retry DLQ delivery"); } }, ); diff --git a/apps/api/src/modules/webhooks/webhooks.repository.ts b/apps/api/src/modules/webhooks/webhooks.repository.ts index 792f512..2ea8589 100644 --- a/apps/api/src/modules/webhooks/webhooks.repository.ts +++ b/apps/api/src/modules/webhooks/webhooks.repository.ts @@ -16,10 +16,14 @@ import type { WebhookPublic, WebhookDelivery, DeliveryListQuery, + DlqListQuery, + WebhookDlqDelivery, } from 
"./webhooks.schema.js"; const WEBHOOK_DELIVERY_COLUMNS = "id, webhook_id, event_id, organization_id, status, attempt_count, response_status, response_body, last_attempt_at, next_retry_at, created_at"; +const WEBHOOK_DLQ_COLUMNS = + "id, webhook_id, organization_id, event_id, event_type, payload, status, attempt_count, response_status, response_body, last_error, next_retry_at, last_attempt_at, created_at"; // ─── Webhook CRUD ───────────────────────────────────────────────────────────── @@ -133,6 +137,72 @@ export const webhooksRepository = { return { data: (data ?? []) as WebhookDelivery[], total: count ?? 0 }; }, + /** + * Paginated list of failed delivery rows for the admin DLQ view. + * + * Uses `last_attempt_at` consistently in both DB query and API response. + */ + async listDlqDeliveries( + request: FastifyRequest, + query: DlqListQuery, + ): Promise<{ data: WebhookDlqDelivery[]; total: number }> { + const from = query.offset; + const to = query.offset + query.limit - 1; + + let q = orgTable(request, "webhook_deliveries") + .select(WEBHOOK_DLQ_COLUMNS, { count: "exact" }) + .eq("status", "failed") + .order("last_attempt_at", { ascending: false, nullsFirst: false }) + .range(from, to); + + if (query.webhook_id) { + q = (q as ReturnType).eq("webhook_id", query.webhook_id); + } + if (query.event_type) { + q = (q as ReturnType).eq("event_type", query.event_type); + } + + const { data, error, count } = await q; + if (error) throw new Error(`Failed to list webhook DLQ deliveries: ${error.message}`); + + const rows = (data ?? 
[]) as Array<{ + id: string; + webhook_id: string; + organization_id: string; + event_id: string; + event_type: string | null; + payload: unknown | null; + status: "failed"; + attempt_count: number; + response_status: number | null; + response_body: string | null; + last_error: string | null; + next_retry_at: string | null; + last_attempt_at: string | null; + created_at: string; + }>; + + return { + data: rows.map((row) => ({ + id: row.id, + webhook_id: row.webhook_id, + organization_id: row.organization_id, + event_id: row.event_id, + event_type: row.event_type, + payload: row.payload, + status: row.status, + attempts: row.attempt_count, + response_status: row.response_status, + response_body: row.response_body, + last_error: row.last_error, + next_retry_at: row.next_retry_at, + last_attempt_at: row.last_attempt_at, + created_at: row.created_at, + })), + total: count ?? 0, + }; + }, + /** Fetch a single delivery row by id. */ async findDeliveryById( request: FastifyRequest, diff --git a/apps/api/src/modules/webhooks/webhooks.schema.ts b/apps/api/src/modules/webhooks/webhooks.schema.ts index 2b13571..18dc3c1 100644 --- a/apps/api/src/modules/webhooks/webhooks.schema.ts +++ b/apps/api/src/modules/webhooks/webhooks.schema.ts @@ -90,6 +90,25 @@ export const webhookDeliverySchema = z.object({ }); export type WebhookDelivery = z.infer<typeof webhookDeliverySchema>; +export const webhookDlqDeliverySchema = z.object({ + id: z.string().uuid(), + webhook_id: z.string().uuid(), + organization_id: z.string().uuid(), + event_id: z.string().uuid(), + event_type: z.string().nullable(), + payload: z.unknown().nullable(), + status: z.literal("failed"), + attempts: z.number(), + response_status: z.number().nullable(), + response_body: z.string().nullable(), + last_error: z.string().nullable(), + next_retry_at: z.string().nullable(), + // DB and API use the same timestamp name to avoid semantic drift. 
+ last_attempt_at: z.string().nullable(), + created_at: z.string(), +}); +export type WebhookDlqDelivery = z.infer<typeof webhookDlqDeliverySchema>; + // ─── Query params ────────────────────────────────────────────────────────────── export const deliveryListQuerySchema = z.object({ @@ -99,3 +118,11 @@ export const deliveryListQuerySchema = z.object({ status: z.enum(["pending", "success", "failed"]).optional(), }); export type DeliveryListQuery = z.infer<typeof deliveryListQuerySchema>; + +export const dlqListQuerySchema = z.object({ + limit: z.coerce.number().int().min(1).max(100).default(50), + offset: z.coerce.number().int().min(0).default(0), + event_type: z.string().min(1).optional(), + webhook_id: z.string().uuid().optional(), +}); +export type DlqListQuery = z.infer<typeof dlqListQuerySchema>; diff --git a/apps/api/src/modules/webhooks/webhooks.service.ts b/apps/api/src/modules/webhooks/webhooks.service.ts index 2187fb4..86cc3e6 100644 --- a/apps/api/src/modules/webhooks/webhooks.service.ts +++ b/apps/api/src/modules/webhooks/webhooks.service.ts @@ -22,6 +22,8 @@ import type { WebhookPublic, WebhookDelivery, DeliveryListQuery, + DlqListQuery, + WebhookDlqDelivery, } from "./webhooks.schema.js"; export const webhooksService = { @@ -87,6 +89,13 @@ export const webhooksService = { return webhooksRepository.listDeliveries(request, query); }, + async listDlqDeliveries( + request: FastifyRequest, + query: DlqListQuery, + ): Promise<{ data: WebhookDlqDelivery[]; total: number }> { + return webhooksRepository.listDlqDeliveries(request, query); + }, + /** * Manually retry a delivery. 
* diff --git a/apps/api/src/routes/index.ts b/apps/api/src/routes/index.ts index bdc50e5..5c6f7a6 100644 --- a/apps/api/src/routes/index.ts +++ b/apps/api/src/routes/index.ts @@ -13,6 +13,7 @@ import { dashboardRoutes } from "../modules/dashboard/dashboard.routes.js"; import { profileRoutes } from "../modules/profile/profile.routes.js"; import { adminDashboardRoutes } from "../modules/admin/dashboard.routes.js"; import { adminMapRoutes } from "../modules/admin/map.routes.js"; +import { webhookDlqRoutes } from "../modules/admin/webhook-dlq.routes.js"; import { eventsRoutes } from "./events.routes.js"; import { webhooksRoutes } from "../modules/webhooks/webhooks.routes.js"; @@ -31,6 +32,7 @@ export async function registerRoutes(app: FastifyInstance): Promise { await app.register(profileRoutes); await app.register(adminDashboardRoutes); await app.register(adminMapRoutes); + await app.register(webhookDlqRoutes); await app.register(eventsRoutes); await app.register(webhooksRoutes); } diff --git a/apps/api/tests/integration/admin/webhooks.integration.test.ts b/apps/api/tests/integration/admin/webhooks.integration.test.ts index 80de00c..72ca940 100644 --- a/apps/api/tests/integration/admin/webhooks.integration.test.ts +++ b/apps/api/tests/integration/admin/webhooks.integration.test.ts @@ -55,6 +55,7 @@ vi.mock("../../../src/modules/webhooks/webhooks.repository.js", () => ({ update: vi.fn(), delete: vi.fn(), listDeliveries: vi.fn(), + listDlqDeliveries: vi.fn(), findDeliveryById: vi.fn(), findWebhookSecretById: vi.fn(), resetDeliveryForRetry: vi.fn(), @@ -86,7 +87,9 @@ import { buildTestApp, signAdminToken, signEmployeeToken, + TEST_ADMIN_ID, TEST_ORG_ID, + TEST_ORG_ID_B, } from "../../setup/test-server.js"; import { webhooksRepository } from "../../../src/modules/webhooks/webhooks.repository.js"; @@ -121,16 +124,35 @@ const deliveryRow = { created_at: now, }; +const dlqDeliveryRow = { + id: DELIVERY_ID, + webhook_id: WEBHOOK_ID, + organization_id: TEST_ORG_ID, + event_id: 
EVENT_ID, + event_type: "expense.created", + payload: { type: "expense.created", amount: 123.45 }, + status: "failed" as const, + attempts: 3, + response_status: 500, + response_body: "Internal Server Error", + last_error: "Receiver returned 500", + next_retry_at: null, + last_attempt_at: now, + created_at: now, +}; + // ─── Test suite ─────────────────────────────────────────────────────────────── describe("Webhooks Admin API", () => { let app: FastifyInstance; let adminToken: string; + let adminTokenOrgB: string; let employeeToken: string; beforeAll(async () => { app = await buildTestApp(); adminToken = signAdminToken(app); + adminTokenOrgB = signAdminToken(app, TEST_ADMIN_ID, TEST_ORG_ID_B); employeeToken = signEmployeeToken(app); }); @@ -406,6 +428,121 @@ describe("Webhooks Admin API", () => { }); }); + // ─── GET /admin/webhook-dlq ──────────────────────────────────────────────── + + describe("GET /admin/webhook-dlq", () => { + it("returns failed deliveries for ADMIN", async () => { + // eslint-disable-next-line @typescript-eslint/no-explicit-any + vi.mocked((webhooksRepository as any).listDlqDeliveries).mockResolvedValueOnce({ + data: [dlqDeliveryRow], + total: 1, + }); + + const res = await app.inject({ + method: "GET", + url: "/admin/webhook-dlq?limit=50&offset=0", + headers: { authorization: `Bearer ${adminToken}` }, + }); + + expect(res.statusCode).toBe(200); + const body = res.json<{ + success: boolean; + data: typeof dlqDeliveryRow[]; + meta: { limit: number; offset: number; count: number }; + }>(); + expect(body.success).toBe(true); + expect(body.data).toHaveLength(1); + expect(body.data[0].status).toBe("failed"); + expect(body.meta).toEqual({ limit: 50, offset: 0, count: 1 }); + }); + + it("accepts event_type and webhook_id filters", async () => { + // eslint-disable-next-line @typescript-eslint/no-explicit-any + vi.mocked((webhooksRepository as any).listDlqDeliveries).mockResolvedValueOnce({ + data: [], + total: 0, + }); + + const res = await 
app.inject({ + method: "GET", + url: `/admin/webhook-dlq?event_type=expense.created&webhook_id=${WEBHOOK_ID}`, + headers: { authorization: `Bearer ${adminToken}` }, + }); + + expect(res.statusCode).toBe(200); + }); + + it("returns 403 for EMPLOYEE role", async () => { + const res = await app.inject({ + method: "GET", + url: "/admin/webhook-dlq", + headers: { authorization: `Bearer ${employeeToken}` }, + }); + expect(res.statusCode).toBe(403); + }); + + it("returns 401 with no token", async () => { + const res = await app.inject({ + method: "GET", + url: "/admin/webhook-dlq", + }); + expect(res.statusCode).toBe(401); + }); + }); + + // ─── POST /admin/webhook-dlq/:id/retry ───────────────────────────────────── + + describe("POST /admin/webhook-dlq/:id/retry", () => { + it("retries a failed DLQ delivery", async () => { + const { enqueueWebhookDelivery } = await import( + "../../../src/workers/webhook.queue.js" + ); + + const webhookWithSecret = { + id: WEBHOOK_ID, + url: "https://example.com/hook", + secret: "s3cr3t_value_long_enough", + }; + const updatedDelivery = { ...deliveryRow, status: "pending" as const }; + + vi.mocked(webhooksRepository.findDeliveryById).mockResolvedValueOnce(deliveryRow); + // eslint-disable-next-line @typescript-eslint/no-explicit-any + vi.mocked((webhooksRepository as any).findWebhookSecretById).mockResolvedValueOnce(webhookWithSecret); + // eslint-disable-next-line @typescript-eslint/no-explicit-any + vi.mocked((webhooksRepository as any).resetDeliveryForRetry).mockResolvedValueOnce(updatedDelivery); + + const res = await app.inject({ + method: "POST", + url: `/admin/webhook-dlq/${DELIVERY_ID}/retry`, + headers: { authorization: `Bearer ${adminToken}` }, + }); + + expect(res.statusCode).toBe(200); + expect(vi.mocked(enqueueWebhookDelivery)).toHaveBeenCalledOnce(); + }); + + it("returns 404 for admin from another organization", async () => { + // Org-scoped lookup should return null for cross-org delivery ids. 
+ vi.mocked(webhooksRepository.findDeliveryById).mockResolvedValueOnce(null); + + const res = await app.inject({ + method: "POST", + url: `/admin/webhook-dlq/${DELIVERY_ID}/retry`, + headers: { authorization: `Bearer ${adminTokenOrgB}` }, + }); + + expect(res.statusCode).toBe(404); + }); + + it("returns 401 with no token", async () => { + const res = await app.inject({ + method: "POST", + url: `/admin/webhook-dlq/${DELIVERY_ID}/retry`, + }); + expect(res.statusCode).toBe(401); + }); + }); + // ─── POST /admin/webhook-deliveries/:id/retry ─────────────────────────────── describe("POST /admin/webhook-deliveries/:id/retry", () => { From 6eec3d2f69ea001bc5d494df445afff59aecf62c Mon Sep 17 00:00:00 2001 From: rajashish147 Date: Sat, 28 Mar 2026 21:37:26 +0530 Subject: [PATCH 3/3] fix(nginx): remove http2 directive from server configuration --- infra/nginx/fieldtrack.conf | 1 - 1 file changed, 1 deletion(-) diff --git a/infra/nginx/fieldtrack.conf b/infra/nginx/fieldtrack.conf index 44a66a9..2b5b27a 100644 --- a/infra/nginx/fieldtrack.conf +++ b/infra/nginx/fieldtrack.conf @@ -86,7 +86,6 @@ server { listen 443 ssl; listen [::]:443 ssl; - http2 on; server_name __API_HOSTNAME__;