From 098fd553bdda990d06fd0e66e05261b59a95f19d Mon Sep 17 00:00:00 2001 From: Luther Monson Date: Thu, 14 May 2026 21:24:15 -0700 Subject: [PATCH 1/2] ci: split release pipeline into per-OS native builds MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The previous release.yml ran goreleaser inside a single Linux container and cross-compiled for linux/amd64, linux/arm64, windows/amd64, and darwin/arm64. That approach left the cross-compiled binaries with empty go:embed sections — the Hyper-V kernel/initrd/rootfs, the macOS Vz aarch64 VM assets, and the arm64 GHA runner tarball all live in platform-specific paths fed by mage targets gated to the matching host. Cross-compiling from Linux saw only the x64 Linux assets and filled the rest with EnsurePlaceholders() zero-byte files — the resulting Windows and Darwin binaries would compile fine but be unable to boot a Linux VM. Replace with four parallel build jobs, one per platform, each running on its native self-hosted runner via the appropriate mage target: - linux/amd64: mage build:build on [self-hosted, linux, x64] - linux/arm64: mage build:build on [self-hosted, linux, arm64] - windows/amd64: mage build:windows on [self-hosted, windows, x64] — the full two-stage build (Linuxembed, Rootfs, Kernelx86, Initrdx86) - darwin/arm64: mage build:macos on [self-hosted, macos, arm64] — the Darwin build with aarch64 Linux VM assets + codesign Each job packages its binary (tar.gz on unix, zip on Windows) and uploads it as a workflow artifact. A final release job downloads all four artifacts, computes sha256 checksums.txt, and creates a draft GitHub release via `gh release create --draft --prerelease`. The draft gate is intentional — release notes auto-generated by --generate-notes, publishing is manual. Drop .goreleaser.yml; the workflow uses `gh release create` directly and mage handles cross-compile via its existing per-OS build:* targets. 
--- .github/workflows/release.yml | 184 ++++++++++++++++++++++++++++++---- .goreleaser.yml | 62 ------------ 2 files changed, 164 insertions(+), 82 deletions(-) delete mode 100644 .goreleaser.yml diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 7da25e4..fcc156d 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -1,5 +1,16 @@ name: Release +# Per-OS build pipeline. Each platform builds on its native self-hosted +# runner via `mage build:*`, packages the binary, and uploads it as +# a workflow artifact. A final `release` job downloads all artifacts, +# computes checksums, and creates a draft GitHub release. +# +# This split exists because cross-compiling Windows/Darwin binaries from +# a Linux host leaves their go:embed sections empty — the embedded VM +# kernel, initrd, and rootfs are platform-specific assets downloaded by +# mage targets that only run on the matching host. Each native build +# pulls its own assets and produces a fully working binary. + on: push: tags: @@ -12,35 +23,168 @@ env: GO_VERSION: "1.26.1" jobs: - release: - name: Build & Release + build-linux-amd64: + name: Build (Linux amd64) runs-on: [self-hosted, linux, x64] + steps: + - uses: actions/checkout@v6 + with: + fetch-depth: 0 # so `git describe --tags` returns the tag + + # mage comes from the runner-ci-linux image, same as ci.yml. 
+ - name: Restore cached build deps + run: /opt/ephemerd-ci/entrypoint-cache.sh + continue-on-error: true + + - name: Build Linux amd64 binary + run: mage build:build + + - name: Verify binary + run: ./ephemerd --version + + - name: Package + run: tar -czf "ephemerd_${GITHUB_REF_NAME}_linux_amd64.tar.gz" ephemerd + + - uses: actions/upload-artifact@v4 + with: + name: ephemerd-linux-amd64 + path: ephemerd_*_linux_amd64.tar.gz + if-no-files-found: error + + build-linux-arm64: + name: Build (Linux arm64) + runs-on: [self-hosted, linux, arm64] + steps: + - uses: actions/checkout@v6 + with: + fetch-depth: 0 + + - uses: actions/setup-go@v6 + with: + go-version: ${{ env.GO_VERSION }} + + - name: Install mage + run: go install github.com/magefile/mage@latest + + - name: Build Linux arm64 binary + run: mage build:build + + - name: Verify binary + run: ./ephemerd --version + + - name: Package + run: tar -czf "ephemerd_${GITHUB_REF_NAME}_linux_arm64.tar.gz" ephemerd + + - uses: actions/upload-artifact@v4 + with: + name: ephemerd-linux-arm64 + path: ephemerd_*_linux_arm64.tar.gz + if-no-files-found: error + + build-windows-amd64: + name: Build (Windows amd64) + runs-on: [self-hosted, windows, x64] + steps: + - uses: actions/checkout@v6 + with: + fetch-depth: 0 + + - name: Restore cached build deps + run: C:\ephemerd-ci\entrypoint-cache.ps1 + continue-on-error: true + + - name: Ensure mage is available + run: | + if (Get-Command mage -ErrorAction SilentlyContinue) { return } + Write-Host "mage not found - installing via go install..." + $env:GOBIN = 'C:\go\bin' + go install github.com/magefile/mage@latest + + # Full two-stage build: cross-compiles ephemerd-linux for the Hyper-V + # VM, downloads kernel/initrd/rootfs, then builds ephemerd.exe with + # all assets embedded. ~700 MB output. 
+ - name: Build Windows binary + run: mage build:windows + + - name: Verify binary + run: .\ephemerd.exe --version + + - name: Package + run: Compress-Archive -Path .\ephemerd.exe -DestinationPath "ephemerd_${env:GITHUB_REF_NAME}_windows_amd64.zip" -Force + + - uses: actions/upload-artifact@v4 + with: + name: ephemerd-windows-amd64 + path: ephemerd_*_windows_amd64.zip + if-no-files-found: error + + build-darwin-arm64: + name: Build (macOS arm64) + runs-on: [self-hosted, macos, arm64] steps: - uses: actions/checkout@v6 with: fetch-depth: 0 - # Go and mage come from the runner-ci-linux image (see images/runner-ci-linux/Dockerfile), - # so setup-go and `go install mage` are unnecessary here. - # - uses: actions/setup-go@v6 - # with: - # go-version: ${{ env.GO_VERSION }} - # - name: Install mage - # run: go install github.com/magefile/mage@latest + - uses: actions/setup-go@v6 + with: + go-version: ${{ env.GO_VERSION }} + + - name: Install mage + run: go install github.com/magefile/mage@latest + + # Full Darwin build: downloads aarch64 Linux kernel/initrd/rootfs, + # cross-compiles ephemerd-linux for arm64, builds ephemerd, then + # ad-hoc codesigns with the virtualization entitlement Vz requires. 
+ - name: Build macOS binary + run: mage build:macos - - name: Download embedded dependencies + - name: Verify binary run: | - mage download:all - mage download:runnerwindows - mage download:rootfs + ./ephemerd --version + codesign -v ./ephemerd && echo "codesigned" || echo "NOT codesigned" - - name: Cross-compile Linux binary for Windows embedding - run: mage build:linuxembed + - name: Package + run: tar -czf "ephemerd_${GITHUB_REF_NAME}_darwin_arm64.tar.gz" ephemerd - - name: Run GoReleaser - uses: goreleaser/goreleaser-action@v6 + - uses: actions/upload-artifact@v4 with: - version: latest - args: release --clean + name: ephemerd-darwin-arm64 + path: ephemerd_*_darwin_arm64.tar.gz + if-no-files-found: error + + release: + name: Publish GitHub Release + needs: + - build-linux-amd64 + - build-linux-arm64 + - build-windows-amd64 + - build-darwin-arm64 + runs-on: [self-hosted, linux, x64] + steps: + - uses: actions/checkout@v6 + with: + fetch-depth: 0 + + - name: Download all artifacts + uses: actions/download-artifact@v4 + with: + path: dist + merge-multiple: true + + - name: Compute checksums + run: | + cd dist + sha256sum * > checksums.txt + cat checksums.txt + + - name: Create draft release env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + run: | + gh release create "${GITHUB_REF_NAME}" \ + --draft \ + --prerelease \ + --generate-notes \ + --title "${GITHUB_REF_NAME}" \ + dist/* diff --git a/.goreleaser.yml b/.goreleaser.yml deleted file mode 100644 index 759d1f6..0000000 --- a/.goreleaser.yml +++ /dev/null @@ -1,62 +0,0 @@ -version: 2 - -before: - hooks: - # Download assets for ALL platforms (Linux runner/CNI/shim come from download:all). - - cmd: mage download:all - # Windows cross-compile needs the real Linux binary, Alpine rootfs, and Windows runner. 
- - cmd: mage download:runnerwindows - - cmd: mage download:rootfs - - cmd: mage build:linuxembed - -builds: - - id: ephemerd - main: ./cmd/ephemerd/ - binary: ephemerd - env: - - CGO_ENABLED=0 - ldflags: - - -s -w - - -X main.version={{.Version}} - - -X github.com/ephpm/ephemerd/pkg/runner.Version=2.333.1 - - -X github.com/ephpm/ephemerd/pkg/cni.Version=1.6.2 - goos: - - linux - - windows - - darwin - goarch: - - amd64 - - arm64 - ignore: - - goos: windows - goarch: arm64 - - goos: darwin - goarch: amd64 - -archives: - - id: default - formats: - - tar.gz - format_overrides: - - goos: windows - formats: - - zip - name_template: "{{ .ProjectName }}_{{ .Version }}_{{ .Os }}_{{ .Arch }}" - -checksum: - name_template: "checksums.txt" - -release: - github: - owner: ephpm - name: ephemerd - draft: true - prerelease: auto - -changelog: - sort: asc - filters: - exclude: - - "^docs:" - - "^test:" - - "^chore:" From d11c6b808c9f9c915dc0eb428c307bc387c67405 Mon Sep 17 00:00:00 2001 From: Luther Monson Date: Thu, 14 May 2026 22:32:26 -0700 Subject: [PATCH 2/2] docs: add AGENTS.md + fix the real TestPushHandlerEndToEnd race MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two related things, both surfaced by the PR #69 CI run failing on the same TestPushHandlerEndToEnd flake from PR #68 that I "fixed" with a post-stage Info() diagnostic. 1. AGENTS.md — short hard-rules file for any agent (Claude, Cursor, etc.) working in this repo. Centerpiece is "run mage lint AND mage test before every push, no exceptions". Local cgo failures on Windows are not a free pass — that's exactly the path that produced two recent red CI runs (errcheck on debugexec_linux.go in PR #68, flake-mask regression in PR #69). Also documents the no-flake-masking rule: never paper over a flaky test with a diagnostic call, sleep, or label that "might help". 2. 
The real TestPushHandlerEndToEnd fix — hold a 5-minute lease across the entire staging→push lifecycle via leases.Create + WithLease. Without an active lease, content.WriteBlob's addContentLease is a no-op (leases.FromContext returns false), and the staged blobs are namespace-bucket-registered but un-leased and un-labeled. That combination flakes in CI in ways that look like the layer digest "doesn't exist" mid-push. Replaces the post-stage Info() diagnostic from PR #68, which was flake-masking: it made one CI run pass but the underlying race was never fixed. Verified: 5 sequential `go test -run=TestPushHandlerEndToEnd` runs pass on Windows (CGO_ENABLED=0) in 0.65–1.0s each, vs. the previous half-second-flake behavior. Note on this commit's lint coverage: golangci-lint on this Windows box fails on the miekg/pkcs11 cgo cross-import (a known local-env issue documented in AGENTS.md). golangci-lint reports "0 issues" before exiting on that typecheck. `go build ./pkg/dind/...` and `go test ./pkg/dind/...` both pass. --- AGENTS.md | 51 +++++++++++++++++++++++++++++++++++ pkg/dind/registry_e2e_test.go | 34 +++++++++++++---------- 2 files changed, 71 insertions(+), 14 deletions(-) create mode 100644 AGENTS.md diff --git a/AGENTS.md b/AGENTS.md new file mode 100644 index 0000000..62157f0 --- /dev/null +++ b/AGENTS.md @@ -0,0 +1,51 @@ +# AGENTS.md + +Hard rules for any AI agent (Claude Code, Cursor, Aider, etc.) working in this repo. CLAUDE.md has the longer-form expectations; this file is the short list of things that get caught in CI and shouldn't. + +## Before every `git push` + +Run the CI pipeline locally and fix every error before pushing. **No exceptions.** Not "looks fine to me", not "the change is small", not "my local cgo is broken so I'll skip it" — those are the exact reasons CI gets surprised. 
If the local environment can't run `mage ci`, fix that environment before pushing, or run the targets individually and document what was skipped *and why* in the commit message. + +``` +mage ci +``` + +That target runs download, lint, test, and build — the same sequence CI runs. If `mage ci` is too slow for an iteration loop, run the relevant subset: + +``` +mage lint # golangci-lint — errcheck and errorlint enabled +mage test # go test ./... +mage build # cross-compile for current OS +mage e2egithub # GitHub provider e2e against fake server +mage e2emodproxy # Go module proxy e2e +``` + +If any of those fail, fix the failure before pushing. A red CI run on a PR you opened is a process failure, not a discovery. + +## Specifically: the `golangci-lint` failure modes that have bitten us + +- **errcheck** — every fallible call must be checked. Closures over `http.ResponseWriter` writes, `Close()` in defers, `io.Copy` return values, `fmt.Fprintf` return values. The repo policy is to wrap them in `if err := foo(); err != nil { log.Warn(...) }`, never `_ = foo()`. +- **staticcheck SA9003** — empty `if` branches. If the branch is empty because the comment is "either is acceptable", invert the condition and `t.Errorf`/`return err` for the *un*expected case. +- **typecheck failures on Windows** — `miekg/pkcs11` cgo preprocessing fails on Windows. This is a *local* problem, not a CI problem. Running lint on Linux works. If you can't run lint locally, you push at your own risk and the user's annoyance. + +## Specifically: the test failure modes that have bitten us + +- **`pkg/dind/TestPushHandlerEndToEnd`** has been flaky in CI in ways that aren't obviously reproducible locally. Don't paper over a flake with a `cs.Info()` "warm-up" call or a `time.Sleep` — that's flake-masking, and the real bug will resurface in the next iteration. 
 If a test is genuinely flaky, find the race or the missing lease/label and fix it; if you can't, mark it `t.Skip` with an issue number, not a silent diagnostic. + +## Pushing workflow yaml changes + +Workflow YAML doesn't go through `mage lint`, but the consequences of a broken `.github/workflows/*.yml` are worse than Go lint errors — a bad workflow doesn't tell you anything until it fails on the *next* trigger. For non-trivial workflow changes: + +- Use [`actionlint`](https://github.com/rhysd/actionlint) if installed, or paste into [rhysd.github.io/actionlint](https://rhysd.github.io/actionlint/) for a quick sanity check. +- Sanity-check `runs-on` labels exist on registered self-hosted runners (ephemerd JIT-registers based on host `goruntime.GOARCH`; cross-arch runs require the matching host to be online). +- Confirm any `secrets.*` references exist in the repo/org secrets before pushing. + +## Pushing release-pipeline / tag-triggered changes + +`.github/workflows/release.yml` only fires on `push: tags: v*`, so you cannot test it on a branch. Sequence: + +1. Push the workflow change in a regular PR. Get it merged. +2. Push a release-candidate tag like `v0.0.1-rc1` to validate the full pipeline end-to-end before pushing the real tag. +3. Only push `v0.0.1` once the rc has produced a clean draft release. + +Do **not** push the real tag as the first test of a changed release workflow. 
diff --git a/pkg/dind/registry_e2e_test.go b/pkg/dind/registry_e2e_test.go index 819944a..e245f80 100644 --- a/pkg/dind/registry_e2e_test.go +++ b/pkg/dind/registry_e2e_test.go @@ -22,6 +22,7 @@ import ( "github.com/containerd/containerd/v2/core/content" "github.com/containerd/containerd/v2/core/images" + "github.com/containerd/containerd/v2/core/leases" "github.com/containerd/containerd/v2/pkg/namespaces" containerdpkg "github.com/ephpm/ephemerd/pkg/containerd" "github.com/opencontainers/go-digest" @@ -154,6 +155,25 @@ func TestPushHandlerEndToEnd(t *testing.T) { ctx, cancel := context.WithTimeout(namespaces.WithNamespace(context.Background(), bkNamespace), 60*time.Second) defer cancel() + // Hold a lease across the entire staging→push lifecycle. Without this, + // content.WriteBlob registers the blob in the namespace bucket but + // attaches no lease (addContentLease is a no-op without leases.FromContext) + // and no GC-ref labels are written for plain child blobs. The buildkit + // namespace can then have orphan content that is racy with respect to + // containerd's internal flushing/visibility paths — this manifested as + // CI flakes where TestPushHandlerEndToEnd would fail mid-push with + // "content digest sha256:...layer...: not found". + lease, err := ctrdClient.LeasesService().Create(ctx, leases.WithExpiration(5*time.Minute)) + if err != nil { + t.Fatalf("create lease: %v", err) + } + t.Cleanup(func() { + if err := ctrdClient.LeasesService().Delete(context.Background(), lease); err != nil { + t.Logf("delete lease: %v", err) + } + }) + ctx = leases.WithLease(ctx, lease.ID) + // Stage a synthetic OCI image: empty layer + tiny config + manifest // pointing at both. Image record `mockRef` so /push GetImage finds it. 
imgDesc, err := stageSyntheticImage(ctx, ctrdClient, mockRef) @@ -162,20 +182,6 @@ func TestPushHandlerEndToEnd(t *testing.T) { } t.Logf("staged image %s -> %s (%d bytes)", mockRef, imgDesc.Digest, imgDesc.Size) - // Diagnostic: confirm the three staged blobs are visible via the same - // content store the push handler will use, in the same namespace, right - // now. If any of these Info calls reports NotFound, the write didn't - // register the digest in the buildkit-namespace bucket — distinct from - // the symptom where push later fails to ReaderAt the layer. - cs := ctrdClient.ContentStore() - layerBytes := []byte("synthetic-layer-for-push-e2e") - layerDgst := digest.FromBytes(layerBytes) - for _, d := range []digest.Digest{layerDgst, imgDesc.Digest} { - info, infoErr := cs.Info(ctx, d) - t.Logf("post-stage Info(%s): err=%v size=%d labels=%v", - d, infoErr, info.Size, info.Labels) - } - // Bring up the dind server. s, err := New(Config{ JobID: "push-e2e",