Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
184 changes: 164 additions & 20 deletions .github/workflows/release.yml
Original file line number Diff line number Diff line change
@@ -1,5 +1,16 @@
name: Release

# Per-OS build pipeline. Each platform builds on its native self-hosted
# runner via `mage build:<target>`, packages the binary, and uploads it as
# a workflow artifact. A final `release` job downloads all artifacts,
# computes SHA-256 checksums, and creates a draft GitHub pre-release.
#
# This split exists because cross-compiling Windows/Darwin binaries from
# a Linux host leaves their go:embed sections empty — the embedded VM
# kernel, initrd, and rootfs are platform-specific assets downloaded by
# mage targets that only run on the matching host. Each native build
# pulls its own assets and produces a fully working binary.

on:
push:
tags:
Expand All @@ -12,35 +23,168 @@ env:
GO_VERSION: "1.26.1"

jobs:
# Builds, smoke-tests, packages, and uploads the Linux amd64 binary.
# (The old single-job `release:` / `Build & Release` header that used to sit
# here is gone: it duplicated the `release` job key defined at the end of the
# workflow and would have swallowed this job as one of its children.)
build-linux-amd64:
  name: Build (Linux amd64)
  runs-on: [self-hosted, linux, x64]
  steps:
    - uses: actions/checkout@v6
      with:
        fetch-depth: 0  # so `git describe --tags` returns the tag

    # mage comes from the runner-ci-linux image, same as ci.yml.
    # Cache restore is best-effort: a failure here must not fail the build.
    - name: Restore cached build deps
      run: /opt/ephemerd-ci/entrypoint-cache.sh
      continue-on-error: true

    - name: Build Linux amd64 binary
      run: mage build:build

    # Smoke test: the freshly built binary must at least start and report
    # its version before it is packaged.
    - name: Verify binary
      run: ./ephemerd --version

    # Archive name embeds the pushed tag (GITHUB_REF_NAME).
    - name: Package
      run: tar -czf "ephemerd_${GITHUB_REF_NAME}_linux_amd64.tar.gz" ephemerd

    # Fail loudly if the archive is missing rather than publishing a
    # release without this platform.
    - uses: actions/upload-artifact@v4
      with:
        name: ephemerd-linux-amd64
        path: ephemerd_*_linux_amd64.tar.gz
        if-no-files-found: error

# Builds, smoke-tests, packages, and uploads the Linux arm64 binary on a
# native arm64 self-hosted runner.
build-linux-arm64:
  name: Build (Linux arm64)
  runs-on:
    - self-hosted
    - linux
    - arm64
  steps:
    # Full history is fetched so tag information is available to the build.
    - uses: actions/checkout@v6
      with:
        fetch-depth: 0

    # Toolchain setup: Go at the version pinned in env.GO_VERSION, then mage.
    - uses: actions/setup-go@v6
      with:
        go-version: ${{ env.GO_VERSION }}

    - name: Install mage
      run: go install github.com/magefile/mage@latest

    - name: Build Linux arm64 binary
      run: mage build:build

    # Smoke test before packaging: the binary must run and print its version.
    - name: Verify binary
      run: ./ephemerd --version

    # Archive name embeds the pushed tag (GITHUB_REF_NAME).
    - name: Package
      run: tar -czf "ephemerd_${GITHUB_REF_NAME}_linux_arm64.tar.gz" ephemerd

    # A missing archive is an error, not a silently skipped upload.
    - uses: actions/upload-artifact@v4
      with:
        name: ephemerd-linux-arm64
        path: ephemerd_*_linux_arm64.tar.gz
        if-no-files-found: error

# Builds, smoke-tests, packages, and uploads the Windows amd64 binary on a
# native Windows self-hosted runner.
build-windows-amd64:
  name: Build (Windows amd64)
  runs-on:
    - self-hosted
    - windows
    - x64
  steps:
    # Full history is fetched so tag information is available to the build.
    - uses: actions/checkout@v6
      with:
        fetch-depth: 0

    # Cache restore is best-effort: a failure here must not fail the build.
    - name: Restore cached build deps
      run: C:\ephemerd-ci\entrypoint-cache.ps1
      continue-on-error: true

    # Use mage if it is already on PATH; otherwise install it with
    # `go install`, with GOBIN pointed at C:\go\bin.
    - name: Ensure mage is available
      run: |
        if (Get-Command mage -ErrorAction SilentlyContinue) { return }
        Write-Host "mage not found - installing via go install..."
        $env:GOBIN = 'C:\go\bin'
        go install github.com/magefile/mage@latest

    # Two-stage build: ephemerd-linux is cross-compiled for the Hyper-V VM
    # and the kernel/initrd/rootfs are downloaded, then ephemerd.exe is
    # built with all of those assets embedded. Output is roughly 700 MB.
    - name: Build Windows binary
      run: mage build:windows

    # Smoke test before packaging: the binary must run and print its version.
    - name: Verify binary
      run: .\ephemerd.exe --version

    # Zip name embeds the pushed tag; -Force overwrites an existing archive.
    - name: Package
      run: Compress-Archive -Path .\ephemerd.exe -DestinationPath "ephemerd_${env:GITHUB_REF_NAME}_windows_amd64.zip" -Force

    # A missing archive is an error, not a silently skipped upload.
    - uses: actions/upload-artifact@v4
      with:
        name: ephemerd-windows-amd64
        path: ephemerd_*_windows_amd64.zip
        if-no-files-found: error

# Builds, verifies, packages, and uploads the macOS arm64 binary on a native
# macOS self-hosted runner.
build-darwin-arm64:
  name: Build (macOS arm64)
  runs-on: [self-hosted, macos, arm64]
  steps:
    - uses: actions/checkout@v6
      with:
        fetch-depth: 0

    # Toolchain setup: Go at the version pinned in env.GO_VERSION, then mage.
    - uses: actions/setup-go@v6
      with:
        go-version: ${{ env.GO_VERSION }}

    - name: Install mage
      run: go install github.com/magefile/mage@latest

    # Full Darwin build: downloads aarch64 Linux kernel/initrd/rootfs,
    # cross-compiles ephemerd-linux for arm64, builds ephemerd, then
    # ad-hoc codesigns with the virtualization entitlement Vz requires.
    - name: Build macOS binary
      run: mage build:macos

    # The binary must both run and carry a valid signature. `codesign -v`
    # exits non-zero on a missing or invalid signature, which fails this
    # step; the previous `&& echo ... || echo ...` form always exited 0 and
    # let an unsigned binary through to the release.
    - name: Verify binary
      run: |
        ./ephemerd --version
        codesign -v ./ephemerd
        echo "codesigned"

    # Archive name embeds the pushed tag (GITHUB_REF_NAME).
    - name: Package
      run: tar -czf "ephemerd_${GITHUB_REF_NAME}_darwin_arm64.tar.gz" ephemerd

    # Fail loudly if the archive is missing rather than publishing a
    # release without this platform.
    - uses: actions/upload-artifact@v4
      with:
        name: ephemerd-darwin-arm64
        path: ephemerd_*_darwin_arm64.tar.gz
        if-no-files-found: error

# Collects the per-platform archives, writes a SHA-256 manifest, and
# publishes everything as a draft pre-release named after the pushed tag.
# Runs only after all four build jobs have succeeded.
release:
  name: Publish GitHub Release
  needs:
    - build-linux-amd64
    - build-linux-arm64
    - build-windows-amd64
    - build-darwin-arm64
  runs-on: [self-hosted, linux, x64]
  steps:
    - uses: actions/checkout@v6
      with:
        fetch-depth: 0

    # No artifact name is given, so every artifact of this run is fetched;
    # merge-multiple puts their files together under dist/.
    - name: Download all artifacts
      uses: actions/download-artifact@v4
      with:
        path: dist
        merge-multiple: true

    # checksums.txt is written inside dist/, so the `dist/*` glob in the
    # next step uploads it alongside the archives.
    - name: Compute checksums
      run: |
        cd dist
        sha256sum * > checksums.txt
        cat checksums.txt

    # gh authenticates via GH_TOKEN. A separate GITHUB_TOKEN entry holding
    # the same secret was redundant and has been dropped.
    - name: Create draft release
      env:
        GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
      run: |
        gh release create "${GITHUB_REF_NAME}" \
          --draft \
          --prerelease \
          --generate-notes \
          --title "${GITHUB_REF_NAME}" \
          dist/*
62 changes: 0 additions & 62 deletions .goreleaser.yml

This file was deleted.

51 changes: 51 additions & 0 deletions AGENTS.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
# AGENTS.md

Hard rules for any AI agent (Claude Code, Cursor, Aider, etc.) working in this repo. CLAUDE.md has the longer-form expectations; this file is the short list of mistakes that keep getting caught in CI and should have been caught locally first.

## Before every `git push`

Run the CI pipeline locally and fix every error before pushing. **No exceptions.** Not "looks fine to me", not "the change is small", not "my local cgo is broken so I'll skip it" — those are the exact reasons CI gets surprised. If the local environment can't run `mage ci`, fix that environment before pushing, or run the targets individually and document what was skipped *and why* in the commit message.

```
mage ci
```

That target runs download, lint, test, and build — the same sequence CI runs. If `mage ci` is too slow for an iteration loop, run the relevant subset:

```
mage lint # golangci-lint — errcheck and errorlint enabled
mage test # go test ./...
mage build # cross-compile for current OS
mage e2egithub # GitHub provider e2e against fake server
mage e2emodproxy # Go module proxy e2e
```

If any of those fail, fix the failure before pushing. A red CI run on a PR you opened is a process failure, not a discovery.

## Specifically: the `golangci-lint` failure modes that have bitten us

- **errcheck** — every fallible call must be checked. Closures over `http.ResponseWriter` writes, `Close()` in defers, `io.Copy` return values, `fmt.Fprintf` return values. The repo policy is to wrap them in `if err := foo(); err != nil { log.Warn(...) }`, never `_ = foo()`.
- **staticcheck SA9003** — empty `if` branches. If the branch is empty because the comment is "either is acceptable", invert the condition and `t.Errorf`/`return err` for the *un*expected case.
- **typecheck failures on Windows** — `miekg/pkcs11` cgo preprocessing fails on Windows. This is a *local* problem, not a CI problem. Running lint on Linux works. If you can't run lint locally, you push at your own risk — and to the user's annoyance.

## Specifically: the test failure modes that have bitten us

- **`pkg/dind/TestPushHandlerEndToEnd`** has been flaky in CI in ways that aren't obviously reproducible locally. Don't paper over a flake with a `cs.Info()` "warm-up" call or a `time.Sleep` — that's flake-masking, and the real bug will resurface in the next iteration. If a test is genuinely flaky, find the race or the missing lease/label and fix it; if you can't, mark it `t.Skip` with an issue number, not a silent diagnostic.

## Pushing workflow YAML changes

Workflow YAML doesn't go through `mage lint`, but the consequences of a broken `.github/workflows/*.yml` are worse than Go lint errors — a bad workflow tells you nothing until it fails on the *next* trigger. For non-trivial workflow changes:

- Use [`actionlint`](https://github.com/rhysd/actionlint) if installed, or paste into [rhysd.github.io/actionlint](https://rhysd.github.io/actionlint/) for a quick sanity check.
- Sanity-check `runs-on` labels exist on registered self-hosted runners (ephemerd JIT-registers based on host `goruntime.GOARCH`; cross-arch runs require the matching host to be online).
- Confirm any `secrets.*` references exist in the repo/org secrets before pushing.

## Pushing release-pipeline / tag-triggered changes

`.github/workflows/release.yml` only fires on `push: tags: v*`, so you cannot test it on a branch. Sequence:

1. Push the workflow change in a regular PR. Get it merged.
2. Push a release-candidate tag like `v0.0.1-rc1` to validate the full pipeline end-to-end before pushing the real tag.
3. Only push `v0.0.1` once the rc has produced a clean draft release.

Do **not** push the real tag as the first test of a changed release workflow.
34 changes: 20 additions & 14 deletions pkg/dind/registry_e2e_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ import (

"github.com/containerd/containerd/v2/core/content"
"github.com/containerd/containerd/v2/core/images"
"github.com/containerd/containerd/v2/core/leases"
"github.com/containerd/containerd/v2/pkg/namespaces"
containerdpkg "github.com/ephpm/ephemerd/pkg/containerd"
"github.com/opencontainers/go-digest"
Expand Down Expand Up @@ -154,6 +155,25 @@ func TestPushHandlerEndToEnd(t *testing.T) {
ctx, cancel := context.WithTimeout(namespaces.WithNamespace(context.Background(), bkNamespace), 60*time.Second)
defer cancel()

// Hold a lease across the entire staging→push lifecycle. Without this,
// content.WriteBlob registers the blob in the namespace bucket but
// attaches no lease (addContentLease is a no-op without leases.FromContext)
// and no GC-ref labels are written for plain child blobs. The buildkit
// namespace can then have orphan content that is racy with respect to
// containerd's internal flushing/visibility paths — this manifested as
// CI flakes where TestPushHandlerEndToEnd would fail mid-push with
// "content digest sha256:...layer...: not found".
lease, err := ctrdClient.LeasesService().Create(ctx, leases.WithExpiration(5*time.Minute))
if err != nil {
t.Fatalf("create lease: %v", err)
}
t.Cleanup(func() {
if err := ctrdClient.LeasesService().Delete(context.Background(), lease); err != nil {
t.Logf("delete lease: %v", err)
}
})
ctx = leases.WithLease(ctx, lease.ID)

// Stage a synthetic OCI image: empty layer + tiny config + manifest
// pointing at both. Image record `mockRef` so /push GetImage finds it.
imgDesc, err := stageSyntheticImage(ctx, ctrdClient, mockRef)
Expand All @@ -162,20 +182,6 @@ func TestPushHandlerEndToEnd(t *testing.T) {
}
t.Logf("staged image %s -> %s (%d bytes)", mockRef, imgDesc.Digest, imgDesc.Size)

// Diagnostic: confirm the three staged blobs are visible via the same
// content store the push handler will use, in the same namespace, right
// now. If any of these Info calls reports NotFound, the write didn't
// register the digest in the buildkit-namespace bucket — distinct from
// the symptom where push later fails to ReaderAt the layer.
cs := ctrdClient.ContentStore()
layerBytes := []byte("synthetic-layer-for-push-e2e")
layerDgst := digest.FromBytes(layerBytes)
for _, d := range []digest.Digest{layerDgst, imgDesc.Digest} {
info, infoErr := cs.Info(ctx, d)
t.Logf("post-stage Info(%s): err=%v size=%d labels=%v",
d, infoErr, info.Size, info.Labels)
}

// Bring up the dind server.
s, err := New(Config{
JobID: "push-e2e",
Expand Down