diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index fe169e2..918b4aa 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -7,6 +7,9 @@ on: permissions: contents: read +env: + FORCE_JAVASCRIPT_ACTIONS_TO_NODE24: true + jobs: # --------------------------------------------------------------------------- # Linux: compile KVM hypervisor backend (cfg(target_os = "linux")) @@ -15,8 +18,8 @@ jobs: runs-on: ubuntu-24.04-arm env: # Hosted ARM runners can expose /dev/kvm but hang in nested/restricted - # KVM ioctls. PR CI compiles the Linux KVM backend and test binaries; the - # release pipeline owns real-KVM exercise. + # KVM ioctls. PR CI compiles the Linux KVM backend and test binaries. + # The release pipeline owns real-KVM coverage. CAPSEM_SKIP_KVM_TESTS: "1" steps: - uses: actions/checkout@v5 @@ -27,16 +30,23 @@ jobs: - uses: Swatinem/rust-cache@v2 - # Try to enable KVM for diagnostics only. GitHub-hosted runners don't - # always expose nested virt -- and when they do, restricted ioctls can - # hang. PR CI compiles the KVM backend with CAPSEM_SKIP_KVM_TESTS=1; the - # release pipeline owns real-KVM coverage. - - name: Enable KVM (best-effort) - continue-on-error: true + # Collect KVM diagnostics only. GitHub-hosted runners don't always expose + # nested virt -- and when they do, restricted ioctls can hang. PR CI + # compiles the KVM backend with CAPSEM_SKIP_KVM_TESTS=1; the release + # pipeline owns real-KVM coverage. 
+ - name: Collect KVM diagnostics run: | - echo 'KERNEL=="kvm", GROUP="kvm", MODE="0666", OPTIONS+="static_node=kvm"' | sudo tee /etc/udev/rules.d/99-kvm4all.rules - sudo udevadm control --reload-rules - sudo udevadm trigger --name-match=kvm + if echo 'KERNEL=="kvm", GROUP="kvm", MODE="0666", OPTIONS+="static_node=kvm"' | sudo tee /etc/udev/rules.d/99-kvm4all.rules >/dev/null; then + sudo udevadm control --reload-rules || echo "::notice::udev reload failed; keeping KVM diagnostics non-blocking" + sudo udevadm trigger --name-match=kvm || echo "::notice::udev trigger failed; keeping KVM diagnostics non-blocking" + else + echo "::notice::could not write KVM udev rule; keeping KVM diagnostics non-blocking" + fi + if [ -e /dev/kvm ]; then + ls -l /dev/kvm + else + echo "::notice::/dev/kvm is not present on this runner" + fi # Compile Linux library + service crate tests without executing them. The # macOS job owns runtime unit coverage for portable code; this job proves @@ -144,13 +154,14 @@ jobs: # capsem-mcp-builtin are thin binaries that pull capsem-core logic. 
- name: Unit tests with coverage run: | + set -o pipefail cargo llvm-cov nextest --no-cfg-coverage --profile ci --codecov --output-path codecov-unit.json --fail-under-lines 65 -p capsem-core -p capsem-agent -p capsem-logger -p capsem-proto -p capsem-guard -p capsem-gateway -p capsem-service -p capsem -p capsem-mcp -p capsem-mcp-aggregator -p capsem-mcp-builtin -p capsem-tray -p capsem-app -p capsem-process - cargo llvm-cov report --no-cfg-coverage --summary-only -p capsem-core -p capsem-agent -p capsem-logger -p capsem-proto -p capsem-guard -p capsem-gateway -p capsem-service -p capsem -p capsem-mcp -p capsem-mcp-aggregator -p capsem-mcp-builtin -p capsem-tray -p capsem-app -p capsem-process 2>&1 | tee coverage-summary.txt + cargo llvm-cov report --summary-only -p capsem-core -p capsem-agent -p capsem-logger -p capsem-proto -p capsem-guard -p capsem-gateway -p capsem-service -p capsem -p capsem-mcp -p capsem-mcp-aggregator -p capsem-mcp-builtin -p capsem-tray -p capsem-app -p capsem-process 2>&1 | tee coverage-summary.txt # Integration tests (tests/ directory, cross-crate) - name: Integration tests with coverage run: | - cargo llvm-cov nextest --no-cfg-coverage --profile ci --codecov --output-path codecov-integration.json -p capsem-core --test '*' || true + cargo llvm-cov nextest --no-cfg-coverage --profile ci --codecov --output-path codecov-integration.json -p capsem-core --test '*' # Frontend tests with coverage + JUnit output - name: Frontend type-check, test, and build @@ -223,10 +234,11 @@ jobs: # Upload test results for test analytics - name: Upload test results to Codecov if: ${{ !cancelled() }} - uses: codecov/test-results-action@v1 + uses: codecov/codecov-action@v5 with: files: target/nextest/ci/junit.xml,frontend-junit.xml,python-junit.xml token: ${{ secrets.CODECOV_TOKEN }} + report_type: test_results # T5: preserve every test artifact (service.log / process.log / # session.db etc.) 
on failure so PR reviewers can debug without diff --git a/.github/workflows/docs.yaml b/.github/workflows/docs.yaml index ec27ce2..f60e616 100644 --- a/.github/workflows/docs.yaml +++ b/.github/workflows/docs.yaml @@ -14,6 +14,7 @@ jobs: deployments: write env: + FORCE_JAVASCRIPT_ACTIONS_TO_NODE24: true CLOUDFLARE_ACCOUNT_ID: ${{ secrets.CLOUDFLARE_ACCOUNT_ID }} CLOUDFLARE_API_TOKEN: ${{ secrets.CLOUDFLARE_API_TOKEN }} steps: diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml index 45c785a..7ec29d4 100644 --- a/.github/workflows/release.yaml +++ b/.github/workflows/release.yaml @@ -8,6 +8,9 @@ permissions: attestations: write id-token: write +env: + FORCE_JAVASCRIPT_ACTIONS_TO_NODE24: true + jobs: preflight: runs-on: macos-14 diff --git a/.github/workflows/site.yaml b/.github/workflows/site.yaml index add244c..256b04c 100644 --- a/.github/workflows/site.yaml +++ b/.github/workflows/site.yaml @@ -14,6 +14,7 @@ jobs: deployments: write env: + FORCE_JAVASCRIPT_ACTIONS_TO_NODE24: true CLOUDFLARE_ACCOUNT_ID: ${{ secrets.CLOUDFLARE_ACCOUNT_ID }} CLOUDFLARE_API_TOKEN: ${{ secrets.CLOUDFLARE_API_TOKEN }} steps: diff --git a/CHANGELOG.md b/CHANGELOG.md index e23e8e8..a72f0b6 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -53,6 +53,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Fixed Linux PR CI so hosted ARM runners compile the KVM backend and test binaries without hanging in live KVM probes or unbounded hosted-runner test execution; release CI remains the real-KVM exercise gate. +- Fixed ordinary CI hardening gaps: Linux KVM diagnostics no longer emit red + success annotations, Rust integration coverage is release-blocking, coverage + summary errors are not hidden by `tee`, and Codecov test analytics use the + supported uploader. 
## [1.1.1778542197] - 2026-05-11 diff --git a/skills/release-process/SKILL.md b/skills/release-process/SKILL.md index 91ed56c..b00d9e9 100644 --- a/skills/release-process/SKILL.md +++ b/skills/release-process/SKILL.md @@ -161,6 +161,12 @@ Test runs in parallel with builds. A test failure blocks `create-release` but do `cargo test --no-run --all-targets` for the portable host crates: it compiles the KVM backend and Linux test binaries without executing hosted-runner KVM probes, while release CI owns real-KVM exercise. +- **Ordinary CI must not hide red signals.** Diagnostic-only steps should not + use `continue-on-error`; make the diagnostic command itself non-fatal so a + green job does not carry a red annotation. Test steps must not end in + `|| true`, coverage summary pipes must use `set -o pipefail`, and Codecov + test analytics should use `codecov/codecov-action@v5` with + `report_type: test_results`. - **No AppImage on any platform.** linuxdeploy cannot run on GitHub CI runners -- Ubuntu 24.04 lacks FUSE2, and neither `libfuse2` nor `APPIMAGE_EXTRACT_AND_RUN=1` fixes it reliably. All Linux platforms ship `.deb` only. CI matrix passes `bundles: deb` for both arm64 and x86_64. `just cross-compile` matches this. This cost 14 consecutive failed releases (v0.12.1 through v0.14.14) to discover. - **Tauri signing keys on all platforms.** `TAURI_SIGNING_PRIVATE_KEY` and `TAURI_SIGNING_PRIVATE_KEY_PASSWORD` must be passed to every `cargo tauri build` step (macOS and Linux). Missing keys cause "public key found but no private key" failure. The macOS job had them from the start; the Linux job was missing them until v0.14.11. - **Collect all updater artifacts.** Linux artifact collection must include `.tar.gz`, `.tar.gz.sig`, `.AppImage.tar.gz`, `.AppImage.tar.gz.sig` -- not just `.deb` and `.AppImage`. Tauri's updater needs the `.sig` files. 
diff --git a/sprints/ci-hardening-followup/plan.md b/sprints/ci-hardening-followup/plan.md new file mode 100644 index 0000000..f569c54 --- /dev/null +++ b/sprints/ci-hardening-followup/plan.md @@ -0,0 +1,59 @@ +# CI Hardening Follow-Up + +## Goal + +Beef up ordinary PR/main CI after the release verification work exposed warnings +that could still look green in GitHub Actions. This sprint turns the observed +weak spots into workflow behavior plus policy tests so they do not drift back. + +## Scope + +- `.github/workflows/ci.yaml` + - Keep Linux PR CI compile-only for hosted KVM, but make the diagnostics step + non-red without relying on `continue-on-error`. + - Make Rust integration coverage release-blocking by removing the `|| true` + mask. + - Make the coverage summary pipe fail when `cargo llvm-cov report` fails. + - Move Codecov test analytics from the deprecated test-results action to the + supported `codecov/codecov-action@v5` path. + - Opt the workflow into Node 24 action runtime to avoid late Node 20 action + deprecation surprises. +- `.github/workflows/release.yaml`, `.github/workflows/docs.yaml`, + `.github/workflows/site.yaml` + - Opt remaining workflows into Node 24 action runtime. +- `tests/test_ci_codesign_runner.py` + - Add policy tests that fail if these ordinary CI invariants regress. +- `skills/release-process/SKILL.md` + - Capture the hard-won invariant in the release skill. +- `CHANGELOG.md` + - Record the user-facing CI reliability fix under Unreleased. + +## Decisions + +- Keep KVM live execution out of PR Linux CI. Hosted ARM runners are still not + the right place for real KVM exercise; release CI owns that. +- Do not use `continue-on-error` for diagnostic-only steps. Make the diagnostic + command explicitly non-fatal so a green job does not carry a red annotation. +- Do not mask test commands with `|| true`. If a test lane is intentionally + informational, it should be named and tested as such, not silently ignored. 
+- Use `set -o pipefail` on coverage summary pipes so `tee` cannot hide a failed + coverage command. + +## Done + +- Focused CI policy tests pass locally. +- Release workflow policy tests still pass. +- `git diff --check` is clean. +- Branch is pushed and PR CI proves the workflow changes on GitHub. + +## Coverage Matrix + +- Unit/contract: `tests/test_ci_codesign_runner.py` workflow policy tests. +- Functional: GitHub Actions PR/main CI run exercises the edited workflow. +- Adversarial: Policy tests assert masks and deprecated action wiring are absent. +- E2E/VM: Not applicable; this change only edits CI orchestration. +- Telemetry: Not applicable; no session data changes. +- Performance: Not applicable; no runtime product path changes. +- Missing/deferred: Full `just test` is not required for YAML-only CI policy + hardening; the focused Python policy tests and GitHub Actions run are the + relevant gates. diff --git a/sprints/ci-hardening-followup/tracker.md b/sprints/ci-hardening-followup/tracker.md new file mode 100644 index 0000000..f68a6ad --- /dev/null +++ b/sprints/ci-hardening-followup/tracker.md @@ -0,0 +1,51 @@ +# Sprint: CI Hardening Follow-Up + +## Tasks + +- [x] Reproduce the CI policy gaps with failing workflow tests. +- [x] Replace the Linux KVM red-success diagnostic step. +- [x] Make Rust integration coverage blocking. +- [x] Protect the coverage summary pipe with `set -o pipefail`. +- [x] Move Codecov test analytics to `codecov/codecov-action@v5`. +- [x] Opt ordinary workflows into the Node 24 action runtime. +- [x] Document the invariant in `release-process`. +- [x] Add changelog entry. +- [x] Run focused local gates. +- [x] Commit. +- [ ] Push branch and open PR. +- [ ] Watch GitHub CI. + +## Notes + +- Discovery: the prior main CI was green but carried a red annotation from the + KVM setup step because `continue-on-error` only softened the conclusion. 
+- Discovery: `cargo llvm-cov report --no-cfg-coverage` emitted an unsupported + flag error, and the pipe to `tee` hid the failing command status. +- Discovery: the Rust integration coverage lane was masked by `|| true` + even though the latest GitHub run showed those tests passing. +- Discovery: Codecov has deprecated `codecov/test-results-action@v1`; test + analytics now go through `codecov/codecov-action@v5` with + `report_type: test_results`. + +## Coverage Ledger + +- Unit/contract: + - `uv run --offline pytest tests/test_ci_codesign_runner.py -q` passed + with 12 tests. + - `uv run --offline pytest tests/test_release_workflow_policy.py -q` passed + with 17 tests. +- Functional: + - GitHub Actions PR run after push. +- Adversarial: + - Workflow tests fail if `continue-on-error`, `|| true`, hidden coverage pipe + behavior, or the deprecated Codecov test-results action returns, or if the + Node 24 runtime opt-in goes missing. +- E2E/VM: + - Not applicable; no VM product path changed. +- Telemetry: + - Not applicable; no telemetry path changed. +- Performance: + - Not applicable; no runtime path changed. +- Missing/deferred: + - Full `just test` is outside this YAML policy hardening slice. The CI run is + the functional proof for this change.
diff --git a/tests/test_ci_codesign_runner.py b/tests/test_ci_codesign_runner.py
index 5d01ff2..e8d6a18 100644
--- a/tests/test_ci_codesign_runner.py
+++ b/tests/test_ci_codesign_runner.py
@@ -125,3 +125,58 @@ def test_pr_linux_ci_compiles_kvm_without_exercising_hosted_kvm():
     assert "codecov-linux.json" not in linux_job
     assert "-p capsem-core" in linux_job
     assert 'std::env::var_os("CAPSEM_SKIP_KVM_TESTS")' in kvm_sys
+
+
+def test_pr_linux_kvm_diagnostics_do_not_emit_red_success_annotations():
+    """Diagnostic-only KVM setup must not rely on continue-on-error."""
+    workflow = (REPO_ROOT / ".github" / "workflows" / "ci.yaml").read_text()
+    linux_job = workflow.split(" test-linux:\n", 1)[1]
+    linux_job = linux_job.split("\n # ---------------------------------------------------------------------------", 1)[0]
+
+    assert "continue-on-error: true" not in linux_job
+    assert "Enable KVM (best-effort)" not in linux_job
+    assert "Collect KVM diagnostics" in linux_job
+
+
+def test_ci_rust_integration_coverage_is_release_blocking():
+    """Rust integration coverage must fail CI when the tests fail."""
+    workflow = (REPO_ROOT / ".github" / "workflows" / "ci.yaml").read_text()
+    section = workflow.split(" - name: Integration tests with coverage\n", 1)[1]
+    section = section.split("\n # Frontend tests with coverage", 1)[0]
+
+    assert "cargo llvm-cov nextest" in section
+    assert "|| true" not in section
+
+
+def test_ci_coverage_summary_report_errors_are_not_hidden_by_tee():
+    """The coverage summary command must be compatible and pipefail-protected."""
+    workflow = (REPO_ROOT / ".github" / "workflows" / "ci.yaml").read_text()
+    section = workflow.split(" - name: Unit tests with coverage\n", 1)[1]
+    section = section.split("\n # Integration tests", 1)[0]
+
+    assert "set -o pipefail" in section
+    assert "cargo llvm-cov report --summary-only" in section
+    assert "cargo llvm-cov report --no-cfg-coverage" not in section
+
+
+def test_ci_uses_supported_codecov_test_results_upload():
+    """Codecov test analytics should use codecov-action, not deprecated action."""
+    workflow = (REPO_ROOT / ".github" / "workflows" / "ci.yaml").read_text()
+    section = workflow.split(" - name: Upload test results to Codecov\n", 1)[1]
+    section = section.split("\n # T5: preserve every test artifact", 1)[0]
+
+    assert "codecov/test-results-action" not in workflow
+    assert "uses: codecov/codecov-action@v5" in section
+    assert "report_type: test_results" in section
+
+
+def test_workflows_opt_into_node24_action_runtime():
+    """Avoid late Node 20 action-runtime surprises across all workflows."""
+    workflows_dir = REPO_ROOT / ".github" / "workflows"
+    # GitHub Actions loads both .yaml and .yml workflow files, and an empty
+    # match would let this policy check pass without inspecting anything.
+    workflows = sorted([*workflows_dir.glob("*.yaml"), *workflows_dir.glob("*.yml")])
+    assert workflows, f"no workflow files found in {workflows_dir}"
+    for workflow in workflows:
+        text = workflow.read_text()
+        assert "FORCE_JAVASCRIPT_ACTIONS_TO_NODE24: true" in text, workflow