diff --git a/apps/README.md b/apps/README.md
index 0ed5f85..ce77588 100644
--- a/apps/README.md
+++ b/apps/README.md
@@ -56,27 +56,41 @@ Add `env` to inject config:
 }
 ```
 
-Add `expose` to ask DD to route a public hostname to a workload's port:
+Add `expose` to ask DD to route a public hostname to a workload's port.
+Two shapes:
+
+**Per-agent (auto)** — URL is derived from the agent's UUID; good for
+anything that's naturally per-VM:
 
 ```json
-{
-  "app_name": "web-nvidia-smi",
-  "expose": { "hostname_label": "gpu", "port": 8081 },
-  "cmd": [...]
-}
+{ "expose": { "hostname_label": "my-label", "port": 8081 } }
+```
+
+Becomes `<agent-uuid>-my-label.devopsdefender.com` (one level
+deep, so Universal SSL covers it).
+
+**Vanity claim** — a stable short URL directly under the zone apex.
+First agent to register the claim wins; DNS uniqueness arbitrates. If
+another agent tries to deploy the same spec, the CP returns 409.
+
+```json
+{ "expose": { "claim_hostname": "nvidia-smi", "port": 8081 } }
 ```
 
-At agent boot, `apps/_infra/local-agents.sh` collects every `expose` entry
-into `DD_EXTRA_INGRESS`. dd-agent forwards them on `/register` and the CP
-prepends them to the agent's cloudflared tunnel ingress. A workload declaring
-`{"hostname_label": "gpu", "port": 8081}` becomes reachable at
-`gpu.<agent-hostname>` — in addition to the default dashboard at
-`<agent-hostname>`. easyenclave itself ignores the field; it's a DD-level
-hint about tunnel routing.
-
-Per-workload ingress is **boot-time only** today. Workloads POSTed later via
-`/deploy` don't get auto-exposed — declare your exposure on boot workloads in
-this tree.
+Becomes `nvidia-smi.devopsdefender.com`. When the owning agent dies,
+the CP's collector releases the claim so the next deploy can take it.
+
+At agent boot, `apps/_infra/local-agents.sh` collects every `expose`
+entry into `DD_EXTRA_INGRESS`. Claims are marked with a `@` prefix
+(`@nvidia-smi:8081`) to distinguish them from auto-labels. dd-agent
+parses the env var, splits it into the two variants, and forwards both
+on `/register`. The CP prepends them to the agent's cloudflared
+tunnel ingress and provisions the CNAMEs + CF Access apps. easyenclave
+itself ignores the field; it's a DD-level hint about tunnel routing.
+
+Per-workload ingress is boot-time **and** runtime — any workload
+POSTed via `/deploy` with an `expose` block also gets added to the
+agent's tunnel via `/ingress/replace`.
 
 ## Templates
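Since `/deploy` now honors `expose` at runtime (see the `src/agent.rs` hunks below), a vanity claim can be requested after boot as well. A hypothetical client-side sketch: the endpoint path and the `expose` block shape come from this diff, while the agent URL and the env var holding the GitHub Actions OIDC JWT (the auth on `/deploy`, per the `St::gh` comment in `src/agent.rs`) are illustrative assumptions.

```rust
use serde_json::json;

// Hypothetical runtime deploy with a vanity claim. A 200 means the
// workload runs; ingress is soft-fail, so a failed follow-up
// /ingress/replace does not fail the deploy.
#[tokio::main]
async fn main() -> Result<(), reqwest::Error> {
    let spec = json!({
        "app_name": "web-nvidia-smi",
        "expose": { "claim_hostname": "nvidia-smi", "port": 8081 },
        "cmd": ["/bin/busybox", "sh", "-c", "exec /srv/serve 8081"] // placeholder cmd
    });
    let status = reqwest::Client::new()
        .post("https://pr-144.devopsdefender.com/deploy") // illustrative agent URL
        .bearer_auth(std::env::var("GH_OIDC_JWT").expect("OIDC JWT from CI"))
        .json(&spec)
        .send()
        .await?
        .status();
    println!("deploy → {status}");
    Ok(())
}
```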
diff --git a/apps/_infra/local-agents.sh b/apps/_infra/local-agents.sh
index 873ca4b..9756f9c 100755
--- a/apps/_infra/local-agents.sh
+++ b/apps/_infra/local-agents.sh
@@ -78,14 +78,28 @@ bake() {
 }
 
 # Extract `expose` entries from a stream of baked workloads and emit
-# them as a comma-separated `label:port` string — the shape dd-agent
-# expects in $DD_EXTRA_INGRESS. Using plain text (not JSON) avoids
-# quote-escaping when the value gets substituted into the dd-agent
-# workload template's `"DD_EXTRA_INGRESS=${DD_EXTRA_INGRESS}"` env
-# entry: embedded `"` would close the outer JSON string early and
-# produce invalid JSON (jq: "Invalid numeric literal").
+# them as a comma-separated string — the shape dd-agent expects in
+# $DD_EXTRA_INGRESS. Two variants per entry:
+#
+#   label:port   — auto per-agent (e.g. `web:9000` routes
+#                  `<agent>-web.<zone>` to localhost:9000)
+#   @claim:port  — vanity zone-apex claim (e.g. `@nvidia-smi:8081`
+#                  routes `nvidia-smi.<zone>` to localhost:8081
+#                  on the first agent to register it)
+#
+# Using plain text (not JSON) avoids quote-escaping when the value
+# gets substituted into the dd-agent workload template's
+# `"DD_EXTRA_INGRESS=${DD_EXTRA_INGRESS}"` env entry: embedded `"`
+# would close the outer JSON string early and produce invalid JSON
+# (jq: "Invalid numeric literal").
 extract_extra_ingress() {
-  jq -rs 'map(select(.expose) | "\(.expose.hostname_label):\(.expose.port)") | join(",")'
+  jq -rs 'map(
+    select(.expose)
+    | if .expose.claim_hostname
+      then "@\(.expose.claim_hostname):\(.expose.port)"
+      else "\(.expose.hostname_label):\(.expose.port)"
+      end
+  ) | join(",")'
 }
 
 [ -r "$BASE" ] || { echo "missing $BASE" >&2; exit 1; }
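The quote-escaping failure mode the comment above describes is easy to pin down outside jq; a minimal check in Rust (same JSON grammar), with both example strings invented for the demonstration:

```rust
fn main() {
    // Plain `label:port` / `@claim:port` text substitutes safely into
    // the template's `"DD_EXTRA_INGRESS=${DD_EXTRA_INGRESS}"` entry:
    let ok = r#"{"env": ["DD_EXTRA_INGRESS=web:9000,@nvidia-smi:8081"]}"#;
    assert!(serde_json::from_str::<serde_json::Value>(ok).is_ok());

    // A JSON-shaped value would carry `"` characters that close the
    // outer string early and leave trailing garbage, so the parse
    // fails (jq reports this as "Invalid numeric literal").
    let broken = r#"{"env": ["DD_EXTRA_INGRESS=[{"label":"web"}]"]}"#;
    assert!(serde_json::from_str::<serde_json::Value>(broken).is_err());
}
```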
diff --git a/apps/web-nvidia-smi/workload.json b/apps/web-nvidia-smi/workload.json
index 96cb2d8..b1ca6b6 100644
--- a/apps/web-nvidia-smi/workload.json
+++ b/apps/web-nvidia-smi/workload.json
@@ -1,6 +1,6 @@
 {
   "app_name": "web-nvidia-smi",
-  "expose": { "hostname_label": "gpu", "port": 8081 },
+  "expose": { "claim_hostname": "nvidia-smi", "port": 8081 },
   "cmd": [
     "/bin/busybox", "sh", "-c",
     "until [ -x /var/lib/easyenclave/bin/podman ]; do sleep 2; done\nexec /var/lib/easyenclave/bin/podman run --rm --name web-nvidia-smi --network=host --device=/dev/nvidia0 --device=/dev/nvidiactl --device=/dev/nvidia-uvm docker.io/nvidia/cuda:12.6.1-base-ubuntu22.04 sh -c 'set -e; apt-get update -qq && apt-get install -y -qq --no-install-recommends netcat-openbsd >/dev/null; while true; do (printf \"HTTP/1.0 200 OK\\r\\nContent-Type: text/plain\\r\\n\\r\\n\"; nvidia-smi) | nc -l -p 8081 -q 1; done'"
diff --git a/src/agent.rs b/src/agent.rs
index d37988b..1bddd4d 100644
--- a/src/agent.rs
+++ b/src/agent.rs
@@ -60,6 +60,12 @@ struct St {
     /// agent forwards the full list on every /ingress/replace call
     /// so the CP's PUT is a straight replacement.
     extras: Arc<RwLock<Vec<(String, u16)>>>,
+    /// Live set of vanity zone-apex claims (`@name:port`). Same
+    /// lifecycle as `extras` — seeded from boot, appended by runtime
+    /// deploys. The CP rejects /ingress/replace with 409 if a claim
+    /// is already owned by another agent, so the local list can hold
+    /// unconfirmed claims momentarily until the next replace reconciles.
+    claims: Arc<RwLock<Vec<(String, u16)>>>,
     /// Verifier for GitHub Actions OIDC JWTs — the auth on /deploy
     /// and /exec. CI workflows in the DD_OWNER org can call them
     /// without any shared secret; anyone else is denied at claim
@@ -120,6 +126,7 @@ pub async fn run() -> Result<()> {
         started: Instant::now(),
         ita_token,
         extras: Arc::new(RwLock::new(cfg.extra_ingress.clone())),
+        claims: Arc::new(RwLock::new(cfg.claims.clone())),
         gh,
     };
 
@@ -152,10 +159,19 @@ struct Bootstrap {
 async fn register(cfg: &Cfg, ita_token: &str) -> Result<Bootstrap> {
     let http = reqwest::Client::new();
     let url = format!("{}/register", cfg.cp_url.trim_end_matches('/'));
+    // Each entry carries EITHER `hostname_label` (auto per-agent) OR
+    // `claim_hostname` (vanity zone-apex claim). The CP rejects the
+    // whole register with 409 if any claim collides with another
+    // live agent's claim — DNS uniqueness is the lock.
     let extra_ingress: Vec<serde_json::Value> = cfg
         .extra_ingress
         .iter()
         .map(|(label, port)| serde_json::json!({"hostname_label": label, "port": port}))
+        .chain(
+            cfg.claims
+                .iter()
+                .map(|(name, port)| serde_json::json!({"claim_hostname": name, "port": port})),
+        )
         .collect();
     let body = serde_json::json!({
         "vm_name": cfg.common.vm_name,
@@ -233,12 +249,22 @@ async fn health(State(s): State<St>) -> Json<serde_json::Value> {
         .unwrap_or_default();
     let m = metrics::collect().await;
     let ita_token = s.ita_token.read().await.clone();
+    // /health reports both auto-labeled extras and vanity claims
+    // under `extra_ingress` so the CP's collector can rebuild the
+    // per-agent state after a CP restart without a fresh /register.
     let extra_ingress: Vec<serde_json::Value> = s
         .extras
         .read()
         .await
         .iter()
         .map(|(label, port)| serde_json::json!({"hostname_label": label, "port": port}))
+        .chain(
+            s.claims
+                .read()
+                .await
+                .iter()
+                .map(|(name, port)| serde_json::json!({"claim_hostname": name, "port": port})),
+        )
         .collect();
 
     Json(serde_json::json!({
@@ -450,53 +476,93 @@ async fn deploy(
     let response = s.ee.deploy(spec).await?;
 
-    if let Some((label, port)) = expose {
-        if let Err(e) = push_extra_ingress(&s, label.clone(), port).await {
+    if let Some(entry) = expose {
+        if let Err(e) = push_extra_ingress(&s, entry).await {
             // Soft-fail: the workload is deployed, the owner just can't
             // reach it from the public internet yet. Better than failing
             // the whole /deploy and leaving the caller unsure whether
             // the process is running.
-            eprintln!(
-                "agent: /ingress/replace add {label}:{port} failed (workload still running): {e}"
-            );
+            eprintln!("agent: /ingress/replace failed (workload still running): {e}");
         }
     }
 
     Ok(Json(response))
 }
 
-/// Extract `expose.hostname_label` + `expose.port` from a DeployRequest
-/// JSON body. Returns None if the field is missing or malformed; the
-/// caller treats that as "no runtime ingress requested" and moves on.
-fn parse_expose(spec: &serde_json::Value) -> Option<(String, u16)> {
+/// Parsed form of a workload's `expose:` block. Each workload may
+/// declare at most one of these.
+enum ExposeEntry {
+    Auto { label: String, port: u16 },
+    Claim { name: String, port: u16 },
+}
+
+/// Extract `expose.hostname_label`/`expose.claim_hostname` + `expose.port`
+/// from a DeployRequest JSON body. Returns None if `expose` is missing
+/// or malformed; the caller treats that as "no runtime ingress
+/// requested" and moves on.
+fn parse_expose(spec: &serde_json::Value) -> Option<ExposeEntry> {
     let expose = spec.get("expose")?;
-    let label = expose.get("hostname_label")?.as_str()?.to_string();
     let port = expose.get("port")?.as_u64()?;
-    if label.is_empty() || port == 0 || port > u16::MAX as u64 {
+    if port == 0 || port > u16::MAX as u64 {
         return None;
     }
-    Some((label, port as u16))
+    let port = port as u16;
+    if let Some(name) = expose.get("claim_hostname").and_then(|v| v.as_str()) {
+        if name.is_empty() {
+            return None;
+        }
+        return Some(ExposeEntry::Claim {
+            name: name.to_string(),
+            port,
+        });
+    }
+    if let Some(label) = expose.get("hostname_label").and_then(|v| v.as_str()) {
+        if label.is_empty() {
+            return None;
+        }
+        return Some(ExposeEntry::Auto {
+            label: label.to_string(),
+            port,
+        });
+    }
+    None
 }
 
-/// Append `(label, port)` to the live extras list (dedup by label —
-/// redeploying the same app_name with the same hostname_label is a
-/// no-op, not a duplicate rule) and POST the full list to the CP's
-/// /ingress/replace endpoint. The CP re-PUTs the tunnel config and
-/// upserts CNAMEs.
-async fn push_extra_ingress(s: &St, label: String, port: u16) -> Result<()> {
-    let extras = {
-        let mut guard = s.extras.write().await;
-        if let Some(existing) = guard.iter_mut().find(|(l, _)| *l == label) {
-            existing.1 = port;
-        } else {
-            guard.push((label, port));
+/// Upsert a workload expose entry (auto-labeled or vanity) into the
+/// live state and POST the full reconciled ingress to the CP's
+/// `/ingress/replace` endpoint. The CP re-PUTs the tunnel config,
+/// upserts CNAMEs, and provisions CF Access apps. Returns 409-like
+/// errors when a claim collides with another agent.
+async fn push_extra_ingress(s: &St, entry: ExposeEntry) -> Result<()> {
+    match entry {
+        ExposeEntry::Auto { label, port } => {
+            let mut guard = s.extras.write().await;
+            if let Some(existing) = guard.iter_mut().find(|(l, _)| *l == label) {
+                existing.1 = port;
+            } else {
+                guard.push((label, port));
+            }
         }
-        guard.clone()
-    };
+        ExposeEntry::Claim { name, port } => {
+            let mut guard = s.claims.write().await;
+            if let Some(existing) = guard.iter_mut().find(|(n, _)| *n == name) {
+                existing.1 = port;
+            } else {
+                guard.push((name, port));
+            }
+        }
+    }
 
-    let body_extras: Vec<serde_json::Value> = extras
+    let extras_snapshot = s.extras.read().await.clone();
+    let claims_snapshot = s.claims.read().await.clone();
+    let body_extras: Vec<serde_json::Value> = extras_snapshot
         .iter()
         .map(|(l, p)| serde_json::json!({"hostname_label": l, "port": p}))
+        .chain(
+            claims_snapshot
+                .iter()
+                .map(|(n, p)| serde_json::json!({"claim_hostname": n, "port": p})),
+        )
         .collect();
     let ita_token = s.ita_token.read().await.clone();
     let body = serde_json::json!({
@@ -519,7 +585,11 @@ async fn push_extra_ingress(s: &St, label: String, port: u16) -> Result<()> {
             "ingress/replace {url} → {status}: {text}"
         )));
     }
-    eprintln!("agent: ingress/replace ok ({} extras total)", extras.len());
+    eprintln!(
+        "agent: ingress/replace ok ({} auto + {} claims)",
+        extras_snapshot.len(),
+        claims_snapshot.len()
+    );
     Ok(())
 }
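One behavior of `parse_expose` worth pinning down: when a spec carries both keys, `claim_hostname` wins because it is checked first, and an empty `claim_hostname` rejects the entry outright rather than falling back to the label. A test-style sketch (test names are invented; since `ExposeEntry` and `parse_expose` are private, this would sit in a `#[cfg(test)]` module inside `agent.rs`):

```rust
#[cfg(test)]
mod expose_tests {
    use super::{parse_expose, ExposeEntry};

    #[test]
    fn claim_takes_precedence_over_label() {
        // Both keys present: the claim branch runs first.
        let spec = serde_json::json!({
            "expose": {
                "hostname_label": "gpu",
                "claim_hostname": "nvidia-smi",
                "port": 8081
            }
        });
        assert!(matches!(
            parse_expose(&spec),
            Some(ExposeEntry::Claim { name, port: 8081 }) if name == "nvidia-smi"
        ));
    }

    #[test]
    fn empty_claim_does_not_fall_back_to_label() {
        // An empty claim_hostname returns None instead of trying the
        // hostname_label that is also present.
        let spec = serde_json::json!({
            "expose": { "hostname_label": "gpu", "claim_hostname": "", "port": 8081 }
        });
        assert!(parse_expose(&spec).is_none());
    }
}
```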
diff --git a/src/cf.rs b/src/cf.rs
index 1b373a8..721981f 100644
--- a/src/cf.rs
+++ b/src/cf.rs
@@ -78,17 +78,21 @@ async fn call(
     Ok(parsed)
 }
 
-/// Create (or recreate) a CF tunnel with ingress pointing at the local
-/// service on port 8080, a proxied CNAME for `hostname`, and one
-/// additional `{label}.{hostname}` → `localhost:{port}` ingress +
-/// CNAME per entry in `extras`. Extras are prepended to the ingress
-/// rules so they match before the primary wildcard catch-all.
+/// Create (or recreate) a CF tunnel with ingress pointing at the
+/// local service on port 8080, a proxied CNAME for `hostname`, one
+/// additional `{base}-{label}.{tld}` → `localhost:{port}` ingress +
+/// CNAME per entry in `extras`, and a zone-apex CNAME per entry in
+/// `claims` (POSTed without upsert — fails on conflict). Extras +
+/// claims are prepended to the ingress rules so they match before
+/// the primary wildcard catch-all.
+#[allow(clippy::too_many_arguments)]
 pub async fn create(
     http: &Client,
     cf: &CfCreds,
     name: &str,
     hostname: &str,
     extras: &[(String, u16)],
+    claims: &[(String, u16)],
 ) -> Result<Tunnel> {
     delete_by_name(http, cf, name).await;
 
@@ -111,7 +115,7 @@ pub async fn create(
         .ok_or_else(|| Error::Upstream("tunnel create: missing token".into()))?
         .to_string();
 
-    let extra_hostnames = apply_ingress(http, cf, &id, hostname, extras).await?;
+    let extra_hostnames = apply_ingress(http, cf, &id, hostname, extras, claims).await?;
 
     Ok(Tunnel {
         id,
@@ -133,8 +137,9 @@ pub async fn update_ingress(
     tunnel_id: &str,
     hostname: &str,
     extras: &[(String, u16)],
+    claims: &[(String, u16)],
 ) -> Result<Vec<String>> {
-    apply_ingress(http, cf, tunnel_id, hostname, extras).await
+    apply_ingress(http, cf, tunnel_id, hostname, extras, claims).await
 }
 
 /// Turn `(hostname="pr-144.devopsdefender.com", label="term")` into
@@ -151,16 +156,36 @@ pub fn label_hostname(hostname: &str, label: &str) -> String {
     }
 }
 
-/// Build the ingress array (extras first, then the primary
+/// Derive the zone apex from a tunnel hostname like
+/// `pr-144.devopsdefender.com`. Returns `"devopsdefender.com"`.
+/// Vanity claims live directly under this apex so the cert fits
+/// under Universal SSL.
+fn zone_apex(hostname: &str) -> &str {
+    hostname
+        .split_once('.')
+        .map(|(_, rest)| rest)
+        .unwrap_or(hostname)
+}
+
+/// Full vanity claim hostname, one level under the zone apex.
+/// e.g. `"nvidia-smi"` + `"pr-144.devopsdefender.com"` →
+/// `"nvidia-smi.devopsdefender.com"`.
+pub fn claim_hostname(tunnel_hostname: &str, claim: &str) -> String {
+    format!("{claim}.{}", zone_apex(tunnel_hostname))
+}
+
+/// Build the ingress array (extras + claims first, then the primary
 /// `hostname → localhost:8080` rule, then the 404 catch-all), PUT
-/// it to the tunnel, and upsert a CNAME for each hostname pointing
-/// at `{tunnel_id}.cfargotunnel.com`.
+/// it to the tunnel, upsert CNAMEs for the tunnel + auto-label
+/// hostnames, and POST-only CNAMEs for vanity claims (first caller
+/// wins; later callers get a conflict error bubbled up).
 async fn apply_ingress(
     http: &Client,
     cf: &CfCreds,
     tunnel_id: &str,
     hostname: &str,
     extras: &[(String, u16)],
+    claims: &[(String, u16)],
 ) -> Result<Vec<String>> {
     let mut ingress: Vec<serde_json::Value> = extras
         .iter()
         .map(|(label, port)| {
             serde_json::json!({
                 "hostname": label_hostname(hostname, label),
                 "service": format!("http://localhost:{port}"),
             })
         })
+        .chain(claims.iter().map(|(name, port)| {
+            serde_json::json!({
+                "hostname": claim_hostname(hostname, name),
+                "service": format!("http://localhost:{port}"),
+            })
+        }))
         .collect();
     ingress.push(serde_json::json!({
         "hostname": hostname,
@@ -190,15 +221,119 @@
     .await?;
 
     upsert_cname(http, cf, tunnel_id, hostname).await?;
-    let mut extra_hostnames = Vec::with_capacity(extras.len());
+    let mut extra_hostnames = Vec::with_capacity(extras.len() + claims.len());
     for (label, _) in extras {
         let extra = label_hostname(hostname, label);
         upsert_cname(http, cf, tunnel_id, &extra).await?;
         extra_hostnames.push(extra);
     }
+    for (name, _) in claims {
+        let claim = claim_hostname(hostname, name);
+        // For existing claim CNAMEs that already point at THIS
+        // tunnel, a re-apply is idempotent (update in place). For a
+        // first-time claim we must not upsert — a stray PUT would
+        // silently hijack another agent's claim. Check ownership
+        // first; POST-only if absent; skip if foreign.
+        match find_record_id(http, cf, &claim).await? {
+            Some(rec_id) => {
+                let got = call(
+                    http,
+                    cf,
+                    Method::GET,
+                    &format!("/zones/{}/dns_records/{rec_id}", cf.zone_id),
+                    None,
+                )
+                .await?;
+                let content = got["result"]["content"].as_str().unwrap_or_default();
+                if content == format!("{tunnel_id}.cfargotunnel.com") {
+                    // Ours already; no-op.
+                    extra_hostnames.push(claim);
+                    continue;
+                }
+                return Err(Error::Upstream(format!(
+                    "claim {claim}: already owned by another tunnel"
+                )));
+            }
+            None => {
+                try_claim_cname(http, cf, tunnel_id, &claim).await?;
+                extra_hostnames.push(claim);
+            }
+        }
+    }
 
     Ok(extra_hostnames)
 }
 
+/// POST a fresh CNAME without upsert. Fails if a record for `hostname`
+/// already exists — CF returns success=false with code 81053 "record
+/// already exists" (or 81058 "conflicts"). Our `call()` promotes that
+/// to `Err(Upstream(...))` which the caller matches on. DNS
+/// uniqueness is the first-come-first-served lock for vanity claims
+/// — no separate registry, no race window.
+pub async fn try_claim_cname(
+    http: &Client,
+    cf: &CfCreds,
+    tunnel_id: &str,
+    hostname: &str,
+) -> Result<()> {
+    if find_record_id(http, cf, hostname).await?.is_some() {
+        // Already taken. Don't even attempt the POST; return a
+        // deterministic error so the CP can reject with 409.
+        return Err(Error::Upstream(format!(
+            "claim {hostname}: already owned by another tunnel"
+        )));
+    }
+    let content = format!("{tunnel_id}.cfargotunnel.com");
+    call(
+        http,
+        cf,
+        Method::POST,
+        &format!("/zones/{}/dns_records", cf.zone_id),
+        Some(serde_json::json!({
+            "type": "CNAME", "name": hostname, "content": content, "proxied": true,
+        })),
+    )
+    .await?;
+    Ok(())
+}
+
+/// Release a claim we previously owned. Delete the CNAME if it still
+/// points at our tunnel; otherwise leave it alone (another agent
+/// won the race and took it over).
+pub async fn release_claim(
+    http: &Client,
+    cf: &CfCreds,
+    tunnel_id: &str,
+    hostname: &str,
+) -> Result<()> {
+    let Some(id) = find_record_id(http, cf, hostname).await? else {
+        return Ok(());
+    };
+    // Only delete if the record still points at our tunnel. Don't
+    // stomp a claim that another agent legitimately took over.
+    let resp = call(
+        http,
+        cf,
+        Method::GET,
+        &format!("/zones/{}/dns_records/{id}", cf.zone_id),
+        None,
+    )
+    .await?;
+    let content = resp["result"]["content"].as_str().unwrap_or_default();
+    let expected = format!("{tunnel_id}.cfargotunnel.com");
+    if content != expected {
+        return Ok(());
+    }
+    let _ = call(
+        http,
+        cf,
+        Method::DELETE,
+        &format!("/zones/{}/dns_records/{id}", cf.zone_id),
+        None,
+    )
+    .await;
+    Ok(())
+}
+
 async fn upsert_cname(http: &Client, cf: &CfCreds, tunnel_id: &str, hostname: &str) -> Result<()> {
     let content = format!("{tunnel_id}.cfargotunnel.com");
     let body = serde_json::json!({
@@ -552,6 +687,7 @@ pub async fn provision_cp_access(
 /// URLs are public by default (this is the nvidia-smi exemption).
 /// - Any existing `*.{agent}.{domain}` app whose label is no longer
 ///   in `workload_labels` is deleted.
+#[allow(clippy::too_many_arguments)]
 pub async fn provision_agent_access(
     http: &Client,
     cf: &CfCreds,
@@ -560,10 +696,20 @@
     owner: &str,
     admin_email: &str,
     workload_labels: &[String],
+    workload_claims: &[String],
 ) -> Result<()> {
     let idp = github_idp_uuid(http, cf).await?;
     let human = human_policy(owner, admin_email, &idp);
 
+    // Vanity claims live directly under the zone apex; public-bypass
+    // app per claim, scoped to this env so reaps don't collide with
+    // other deployments. DNS uniqueness was already enforced at
+    // apply_ingress time — if we got here, this agent owns the claim.
+    for claim in workload_claims {
+        let domain = claim_hostname(agent_hostname, claim);
+        ensure_bypass_app(http, cf, &format!("dd-{env}-claim-{domain}"), &domain).await?;
+    }
+
     ensure_app(
         http,
         cf,
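The README promises a 409 on claim collisions, but the CP handler that maps the CF conflict onto an HTTP status is not part of this diff. A hypothetical sketch of that translation, reusing the error string `try_claim_cname`/`apply_ingress` produce above (the handler shape and the string matching are assumptions; the real CP may use a structured error variant instead):

```rust
use axum::http::StatusCode;

// Minimal stand-in for the crate's error type; only the Upstream
// variant appears in this diff.
enum Error {
    Upstream(String),
}

// Hypothetical: decide the status for /register and /ingress/replace
// when pushing ingress to Cloudflare fails.
fn claim_error_status(e: &Error) -> StatusCode {
    match e {
        Error::Upstream(msg) if msg.contains("already owned by another tunnel") => {
            StatusCode::CONFLICT // 409: DNS uniqueness arbitrated, another agent won
        }
        Error::Upstream(_) => StatusCode::BAD_GATEWAY, // any other CF API failure
    }
}

fn main() {
    let e = Error::Upstream(
        "claim nvidia-smi.devopsdefender.com: already owned by another tunnel".into(),
    );
    assert_eq!(claim_error_status(&e), StatusCode::CONFLICT);
}
```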
diff --git a/src/collector.rs b/src/collector.rs
index e36e7fb..9e3a887 100644
--- a/src/collector.rs
+++ b/src/collector.rs
@@ -60,6 +60,12 @@ pub struct Agent {
     /// recovers this list from the agent's `/health` response.
     #[serde(default)]
     pub extras: Vec<(String, u16)>,
+    /// Vanity claims currently owned by this agent. Same lifecycle
+    /// as `extras`. The collector's orphan-GC path releases each of
+    /// these (deletes the CNAME + CF Access app) when this agent
+    /// goes dead so the next caller can claim the same hostname.
+    #[serde(default)]
+    pub claims: Vec<(String, u16)>,
 }
 
 pub type Store = Arc<Mutex<HashMap<String, Agent>>>;
@@ -166,7 +172,7 @@ async fn tick(
             eprintln!("cp: collector: {name} /health lacks ita_token — skipping");
             continue;
         };
-        let claims = match verifier.verify(token).await {
+        let ita_claims = match verifier.verify(token).await {
             Ok(c) => c,
             Err(e) => {
                 eprintln!("cp: collector: {name} ITA verify failed: {e}");
@@ -176,8 +182,11 @@
         // Store key is the tunnel name (authoritative on the CP side),
         // NOT the agent's self-reported agent_id.
         let mut s = store.lock().await;
-        let extras = parse_extra_ingress(h)
-            .unwrap_or_else(|| s.get(name).map(|a| a.extras.clone()).unwrap_or_default());
+        let (extras, vanity_claims) = parse_extra_ingress(h).unwrap_or_else(|| {
+            s.get(name)
+                .map(|a| (a.extras.clone(), a.claims.clone()))
+                .unwrap_or_default()
+        });
         s.insert(
             name.clone(),
             Agent {
@@ -204,37 +213,70 @@
                 memory_total_mb: h["memory_total_mb"].as_u64().unwrap_or(0),
                 nets: serde_json::from_value(h["nets"].clone()).unwrap_or_default(),
                 disks: serde_json::from_value(h["disks"].clone()).unwrap_or_default(),
-                ita: claims,
+                ita: ita_claims,
                 tunnel_id: tunnel_id.clone(),
                 extras,
+                claims: vanity_claims,
             },
         );
         drop(s);
         verified += 1;
     }
 
-    // Collect + delete dead entries from the store.
-    let dead: Vec<String> = {
+    // Snapshot dead entries + their claim state BEFORE removing them
+    // from the store, so the release path below has the tunnel_id and
+    // claim list to pass to CF.
+    let dead: Vec<(String, String, Vec<String>)> = {
         let s = store.lock().await;
         s.iter()
             .filter(|(_, a)| a.status == "dead")
-            .map(|(k, _)| k.clone())
+            .map(|(k, a)| {
+                (
+                    k.clone(),
+                    a.tunnel_id.clone(),
+                    a.claims.iter().map(|(n, _)| n.clone()).collect(),
+                )
+            })
            .collect()
    };
-    for k in &dead {
+    for (k, _, _) in &dead {
         store.lock().await.remove(k);
     }
 
     if !orphans.is_empty() {
         for (name, host) in &orphans {
             eprintln!("cp: GC dead tunnel {name}");
+            // Release any vanity claims this agent was holding so the
+            // next caller can register the same hostname. `release_claim`
+            // checks ownership before deleting, so it won't stomp on
+            // a takeover that already happened.
+            let claim_names: Vec<String> = dead
+                .iter()
+                .find(|(k, _, _)| k == name)
+                .map(|(_, _, claims)| claims.clone())
+                .unwrap_or_default();
+            let tunnel_id = dead
+                .iter()
+                .find(|(k, _, _)| k == name)
+                .map(|(_, tid, _)| tid.clone())
+                .unwrap_or_default();
+            for claim in &claim_names {
+                let domain = cf::claim_hostname(host, claim);
+                if let Err(e) = cf::release_claim(http, cf, &tunnel_id, &domain).await {
+                    eprintln!("cp: release_claim {domain} failed: {e}");
+                }
+            }
+
             cf::delete_by_name(http, cf, name).await;
             let _ = cf::delete_cname(http, cf, host).await;
             // Sweep the agent's CF Access apps — its human dashboard
-            // app and every workload-URL bypass under this hostname.
-            // Without this the account accumulates dead apps every
-            // STONITH cycle.
+            // app, every workload-URL bypass under this hostname, and
+            // the vanity-claim bypass apps at the zone apex.
             cf::delete_access_apps_for(http, cf, host).await;
+            for claim in &claim_names {
+                let domain = cf::claim_hostname(host, claim);
+                cf::delete_access_apps_for(http, cf, &domain).await;
+            }
         }
     }
 
@@ -281,20 +323,37 @@
     );
 }
 
-fn parse_extra_ingress(h: &serde_json::Value) -> Option<Vec<(String, u16)>> {
-    h.get("extra_ingress")?.as_array().map(|items| {
-        items
-            .iter()
-            .filter_map(|item| {
-                let label = item.get("hostname_label")?.as_str()?;
-                let port = item.get("port")?.as_u64()?;
-                if label.is_empty() || port == 0 || port > u16::MAX as u64 {
-                    return None;
-                }
-                Some((label.to_string(), port as u16))
-            })
-            .collect()
-    })
+/// Split `/health.extra_ingress` entries into (auto-labeled,
+/// vanity-claims) — each entry has exactly one of `hostname_label`
+/// or `claim_hostname`. Returns `None` when the field is missing so
+/// callers can fall back to the cached values (avoids blanking the
+/// store during a scrape hiccup).
+#[allow(clippy::type_complexity)]
+fn parse_extra_ingress(h: &serde_json::Value) -> Option<(Vec<(String, u16)>, Vec<(String, u16)>)> {
+    let items = h.get("extra_ingress")?.as_array()?;
+    let mut extras = Vec::new();
+    let mut claims = Vec::new();
+    for item in items {
+        let Some(port) = item.get("port").and_then(|v| v.as_u64()) else {
+            continue;
+        };
+        if port == 0 || port > u16::MAX as u64 {
+            continue;
+        }
+        let port = port as u16;
+        if let Some(claim) = item.get("claim_hostname").and_then(|v| v.as_str()) {
+            if !claim.is_empty() {
+                claims.push((claim.to_string(), port));
+                continue;
+            }
+        }
+        if let Some(label) = item.get("hostname_label").and_then(|v| v.as_str()) {
+            if !label.is_empty() {
+                extras.push((label.to_string(), port));
+            }
+        }
+    }
+    Some((extras, claims))
 }
 
 async fn mark_stale_or_orphan(
@@ -342,10 +401,24 @@ mod tests {
             ]
         });
 
-        assert_eq!(
-            parse_extra_ingress(&h),
-            Some(vec![("gpu".into(), 8081), ("web".into(), 9000)])
-        );
+        let (extras, claims) = parse_extra_ingress(&h).unwrap();
+        assert_eq!(extras, vec![("gpu".into(), 8081), ("web".into(), 9000)]);
+        assert!(claims.is_empty());
     }
 
+    #[test]
+    fn parses_mixed_auto_and_claim_entries() {
+        let h = serde_json::json!({
+            "extra_ingress": [
+                {"hostname_label": "gpu", "port": 8081},
+                {"claim_hostname": "nvidia-smi", "port": 8081},
+                {"hostname_label": "web", "port": 9000}
+            ]
+        });
+
+        let (extras, claims) = parse_extra_ingress(&h).unwrap();
+        assert_eq!(extras, vec![("gpu".into(), 8081), ("web".into(), 9000)]);
+        assert_eq!(claims, vec![("nvidia-smi".into(), 8081)]);
+    }
+
     #[test]
@@ -356,10 +429,13 @@ mod tests {
             {"hostname_label": "", "port": 8082},
             {"hostname_label": "bad-zero", "port": 0},
             {"hostname_label": "bad-wide", "port": 70000},
-            {"hostname_label": "bad-string", "port": "8083"}
+            {"hostname_label": "bad-string", "port": "8083"},
+            {"claim_hostname": "", "port": 8084}
         ]
     });
 
-        assert_eq!(parse_extra_ingress(&h), Some(vec![("gpu".into(), 8081)]));
+        let (extras, claims) = parse_extra_ingress(&h).unwrap();
+        assert_eq!(extras, vec![("gpu".into(), 8081)]);
+        assert!(claims.is_empty());
     }
 }
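A small but load-bearing detail in the `collector.rs` hunk: `claims` is `#[serde(default)]`, so `Agent` values snapshotted before this change still deserialize, just with an empty claim list. A minimal sketch of that property with a two-field stand-in struct (the real `Agent` carries the full `/health` snapshot and is assumed to derive `Deserialize`, which the serde attributes imply):

```rust
use serde::Deserialize;

// Stand-in for collector::Agent with just the two ingress fields.
#[derive(Deserialize)]
struct AgentIngress {
    #[serde(default)]
    extras: Vec<(String, u16)>,
    #[serde(default)]
    claims: Vec<(String, u16)>,
}

fn main() {
    // A pre-claims snapshot: the `claims` key is absent, and serde
    // fills in the empty default instead of rejecting the document.
    let old = r#"{ "extras": [["gpu", 8081]] }"#;
    let a: AgentIngress = serde_json::from_str(old).unwrap();
    assert_eq!(a.extras, vec![("gpu".to_string(), 8081)]);
    assert!(a.claims.is_empty());
}
```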
diff --git a/src/config.rs b/src/config.rs
index 0aa6773..382e083 100644
--- a/src/config.rs
+++ b/src/config.rs
@@ -159,13 +159,17 @@ pub struct Agent {
     pub cp_url: String,
     pub ee_socket: String,
     pub ita: Ita,
-    /// Extra cloudflared ingress rules requested at register time,
-    /// parsed from `DD_EXTRA_INGRESS` (a comma-separated list of
-    /// `label:port` pairs, e.g. `gpu:8081,web:9000`). The boot-workload
-    /// builder (`apps/_infra/local-agents.sh`) collects these from
-    /// `expose` hints on individual workload specs. Empty is fine —
-    /// the agent just gets the default dashboard rule.
+    /// Auto per-agent ingress rules from `DD_EXTRA_INGRESS` —
+    /// entries like `label:port`. Each becomes the URL
+    /// `-
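The `config.rs` hunk is cut off above, so the agent-side parse of `DD_EXTRA_INGRESS` is not visible here. A hypothetical sketch of the split it has to perform to populate `cfg.extra_ingress` and `cfg.claims`, consistent with the `@` convention `local-agents.sh` emits (function name and validation details are assumptions):

```rust
// Hypothetical parser for DD_EXTRA_INGRESS="web:9000,@nvidia-smi:8081";
// the real one lives in the truncated src/config.rs hunk above.
fn split_extra_ingress(raw: &str) -> (Vec<(String, u16)>, Vec<(String, u16)>) {
    let mut extras = Vec::new(); // label:port  → auto per-agent rules
    let mut claims = Vec::new(); // @name:port → vanity zone-apex claims
    for entry in raw.split(',').filter(|e| !e.is_empty()) {
        let Some((name, port)) = entry.split_once(':') else { continue };
        let Ok(port) = port.parse::<u16>() else { continue };
        if port == 0 {
            continue;
        }
        match name.strip_prefix('@') {
            Some(claim) if !claim.is_empty() => claims.push((claim.to_string(), port)),
            None if !name.is_empty() => extras.push((name.to_string(), port)),
            _ => {} // empty label/claim: drop the entry, mirroring the CP-side checks
        }
    }
    (extras, claims)
}

fn main() {
    let (extras, claims) = split_extra_ingress("web:9000,@nvidia-smi:8081");
    assert_eq!(extras, vec![("web".to_string(), 9000)]);
    assert_eq!(claims, vec![("nvidia-smi".to_string(), 8081)]);
}
```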