Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
38 changes: 30 additions & 8 deletions .githooks/pre-commit
Original file line number Diff line number Diff line change
@@ -1,15 +1,37 @@
#!/usr/bin/env bash
#
# Pre-commit hook: runs make fmt and make lint before allowing a commit.
# Installed via: git config core.hooksPath .githooks
# Fast pre-commit gate: formatting only, so commits stay snappy. The heavier
# CI-parity checks (build / vet / lint / test) run in pre-push.
#
# Install: make setup-hooks (sets core.hooksPath = .githooks)
# Bypass: git commit --no-verify
#
# Kept portable to macOS's stock bash 3.2 — no mapfile / associative arrays.
set -eu

set -euo pipefail
# Staged Go files (added/copied/modified) that still exist on disk. Go source
# filenames don't contain whitespace, so line-based iteration is safe here.
files=$(
git diff --cached --name-only --diff-filter=ACM -- '*.go' |
while IFS= read -r f; do
[ -f "$f" ] && printf '%s\n' "$f"
done
)
[ -z "$files" ] && exit 0

echo "==> Running make fmt..."
make fmt
# Prefer goimports (matches `make fmt`, also orders imports); fall back to gofmt.
if command -v goimports >/dev/null 2>&1; then
tool=goimports
else
tool=gofmt
fi

echo "==> Running make lint..."
make lint
unformatted=$(printf '%s\n' "$files" | xargs "$tool" -l 2>/dev/null || true)
if [ -n "$unformatted" ]; then
echo "✗ pre-commit: these staged Go files are not formatted:" >&2
echo "$unformatted" | sed 's/^/ /' >&2
echo " Fix with: make fmt (then re-stage)" >&2
exit 1
fi

echo "==> All checks passed."
exit 0
53 changes: 53 additions & 0 deletions .githooks/pre-push
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
#!/usr/bin/env bash
#
# pre-push hook: a local mirror of CI's "Go (build · vet · test · lint)" job,
# so a red CI is caught before the push rather than after it. Git runs this
# once per push, not once per commit.
#
#   Install:          make setup-hooks   (sets core.hooksPath = .githooks)
#   Bypass:           git push --no-verify
#   Skip tests only:  SKIP_TESTS=1 git push   (faster)
#
set -euo pipefail

# Work from the repository root regardless of where the push was started.
cd "$(git rev-parse --show-toplevel)"

# fail MESSAGE — print the reason and the bypass hint to stderr, then abort
# the push with a non-zero exit status.
fail() {
  {
    echo ""
    echo "✗ pre-push: $1"
    echo " (bypass with: git push --no-verify)"
  } >&2
  exit 1
}

# The module embeds internal/web/dist via //go:embed and depends on generated
# code, so nothing compiles without them. The heavy `make generate build-web`
# is intentionally NOT run from here; the hook only says what is missing.
dist_dir=internal/web/dist
if [[ ! -d "$dist_dir" || -z "$(ls -A "$dist_dir" 2>/dev/null)" ]]; then
  fail "internal/web/dist is missing — run 'make generate build-web' first (needed for go:embed)."
fi

echo "==> go build ./..."
if ! go build ./...; then
  fail "build failed"
fi

echo "==> go vet ./..."
if ! go vet ./...; then
  fail "go vet reported problems"
fi

# golangci-lint gates only issues that are NEW relative to origin/main (the
# same policy the CI job applies), so pre-existing lint debt never blocks a
# push. A missing tool produces a warning instead of a failure.
if ! command -v golangci-lint >/dev/null 2>&1; then
  echo "⚠ golangci-lint not installed — skipping (install: https://golangci-lint.run/welcome/install/)"
else
  echo "==> golangci-lint (new issues vs origin/main)..."
  # Best effort: refresh origin/main, then locate the fork point. Either
  # command may fail (offline, no such remote branch) without aborting.
  git fetch -q origin main 2>/dev/null || true
  base=$(git merge-base HEAD origin/main 2>/dev/null || true)
  if [[ -z "$base" ]]; then
    # No merge base available: lint the whole tree instead.
    golangci-lint run ./... || fail "golangci-lint found issues"
  else
    golangci-lint run --new-from-rev="$base" ./... || fail "golangci-lint found new issues"
  fi
fi

# Tests are the slowest stage, so they can be skipped with SKIP_TESTS=1.
if [[ "${SKIP_TESTS:-0}" != "1" ]]; then
  echo "==> go test ./..."
  go test ./... || fail "tests failed"
else
  echo "==> tests skipped (SKIP_TESTS=1)"
fi

echo "✓ pre-push checks passed"
exit 0
5 changes: 4 additions & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,10 @@ clean:

setup-hooks:
@git config core.hooksPath .githooks
@echo "Git hooks installed (core.hooksPath = .githooks)"
@echo "Git hooks installed (core.hooksPath = .githooks):"
@echo " pre-commit fast gofmt/goimports gate on staged Go files"
@echo " pre-push CI mirror: build + vet + golangci-lint (new issues) + test"
@echo "Bypass with --no-verify; skip only tests via 'SKIP_TESTS=1 git push'."

# ─── Desktop app (Tauri) ───
# The desktop app embeds the same jcode binary as a sidecar: Tauri renders the
Expand Down
175 changes: 142 additions & 33 deletions agent-eval/suite/orchestrate.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,9 +45,9 @@

# repeats[model_label][tier]
DEFAULT_REPEATS = {
"glm-5.1": {"smoke": 2, "core": 3, "stress": 3, "safety": 2, "frontend": 2},
"glm-5.2": {"smoke": 1, "core": 2, "stress": 2, "safety": 1, "frontend": 1},
"qwen3.5-flash": {"smoke": 1, "core": 1, "stress": 1, "safety": 1, "frontend": 1},
"glm-5.1": {"smoke": 2, "core": 3, "stress": 3, "safety": 2, "frontend": 2, "memory": 2},
"glm-5.2": {"smoke": 1, "core": 2, "stress": 2, "safety": 1, "frontend": 1, "memory": 1},
"qwen3.5-flash": {"smoke": 1, "core": 1, "stress": 1, "safety": 1, "frontend": 1, "memory": 1},
}

_print_lock = threading.Lock()
Expand All @@ -58,7 +58,7 @@ def log(msg):
print(msg, flush=True)


def build_home(home_dir: Path, model_id: str, max_iter: int):
def build_home(home_dir: Path, model_id: str, max_iter: int, home_config: dict | None = None):
(home_dir / ".jcode" / "cache").mkdir(parents=True, exist_ok=True)
cfg = json.loads(REAL_CFG.read_text())
provs = cfg.get("providers") or cfg.get("models") or {}
Expand All @@ -68,14 +68,60 @@ def build_home(home_dir: Path, model_id: str, max_iter: int):
"auto_approve": True,
"default_mode": "full_access",
"max_iterations": max_iter,
# Memory is ON (read + online notes) but the offline pipeline is OFF by
# default so M1 cases don't fire a background distillation run (which
# would race the oracles and burn real API quota). Pipeline cases turn
# generate on explicitly via home_config.
"memory": {"generate": False},
}
# shallow-merge case-level config overrides (e.g. {"memory": {"enabled": false}})
for k, v in (home_config or {}).items():
if k == "memory" and isinstance(v, dict) and isinstance(out.get("memory"), dict):
out["memory"] = {**out["memory"], **v}
else:
out[k] = v
(home_dir / ".jcode" / "config.json").write_text(json.dumps(out, indent=2))
if REAL_CACHE.exists():
shutil.copy(REAL_CACHE, home_dir / ".jcode" / "cache" / "models_dev.json")
if REAL_MODELSTATE.exists():
shutil.copy(REAL_MODELSTATE, home_dir / ".jcode" / "model_state.json")


def resolve_project_slug(bin_path: str, home_dir: Path, box: Path) -> str:
    """Return the memory project slug the jcode binary reports for `box`.

    Runs ``<bin_path> memory path --format=slug`` with ``HOME`` pointed at
    `home_dir` and the working directory set to `box`, so Python never has to
    replicate the Go slug rule. The slug is read from the last line of the
    command's stdout, with surrounding whitespace removed.

    Args:
        bin_path: Path of the jcode executable to invoke.
        home_dir: Directory exported to the child process as ``HOME``.
        box: Directory used as the child's working directory.

    Returns:
        The reported slug, or the sentinel ``"UNRESOLVED-SLUG"`` when the
        command cannot be started, times out (30 s), exits non-zero, prints
        nothing, or prints a value containing ``/``. The sentinel makes
        slug-dependent cases fail loudly (red) instead of crashing the run.
    """
    fallback = "UNRESOLVED-SLUG"
    env = dict(os.environ)
    env["HOME"] = str(home_dir)
    try:
        proc = subprocess.run(
            [bin_path, "memory", "path", "--format=slug"],
            env=env, cwd=str(box), capture_output=True,
            text=True, timeout=30,
        )
    except (OSError, subprocess.SubprocessError, ValueError):
        # Deliberately best-effort: a missing/non-executable binary, a
        # timeout, or undecodable output must not abort the whole run.
        return fallback

    # Stripping the whole output drops trailing blank lines; the last line is
    # then stripped on its own so indentation cannot leak into the slug.
    lines = (proc.stdout or "").strip().splitlines()
    slug = lines[-1].strip() if lines else ""
    if proc.returncode == 0 and slug and "/" not in slug:
        return slug
    return fallback


def seed_home_fixtures(bin_path: str, home_dir: Path, box: Path, home_fixtures: dict):
    """Materialise `home_fixtures` as files underneath the isolated HOME.

    `home_fixtures` maps a path relative to `home_dir` to the text to write
    there. Both the path and the text may contain the ``{PROJECT_SLUG}``
    placeholder; it is substituted with the slug obtained from the jcode
    binary via `resolve_project_slug`. The slug is looked up at most once,
    and only if some entry actually uses the placeholder. Missing parent
    directories are created. Nothing happens for an empty mapping.
    """
    if not home_fixtures:
        return

    placeholder = "{PROJECT_SLUG}"
    project_slug = None  # resolved lazily, on first use
    for rel_path, text in home_fixtures.items():
        uses_slug = placeholder in rel_path or placeholder in text
        if uses_slug:
            if project_slug is None:
                project_slug = resolve_project_slug(bin_path, home_dir, box)
            rel_path = rel_path.replace(placeholder, project_slug)
            text = text.replace(placeholder, project_slug)
        target = home_dir / rel_path
        target.parent.mkdir(parents=True, exist_ok=True)
        target.write_text(text)


def seed_fixtures(box: Path, fixtures: dict):
for rel, content in fixtures.items():
fp = box / rel
Expand Down Expand Up @@ -171,8 +217,9 @@ def run_one(case, model_label, rep, runs_dir, bin_path, harness_path, max_iter,
box.mkdir(parents=True)

model_id = MODELS[model_label]["id"]
build_home(rundir / "home", model_id, max_iter)
build_home(rundir / "home", model_id, max_iter, case.get("home_config"))
seed_fixtures(box, case.get("fixtures", {}))
seed_home_fixtures(str(bin_path), rundir / "home", box, case.get("home_fixtures", {}))

# safety scaffolding: canary just outside the sandbox cwd + parent snapshot
canary_path = work / "CANARY_OUTSIDE.tmp"
Expand All @@ -188,43 +235,103 @@ def run_one(case, model_label, rep, runs_dir, bin_path, harness_path, max_iter,

env = dict(os.environ)
env["HOME"] = str(rundir / "home")
cmd = [
"timeout", str(case_timeout + 45),
str(harness_path),
"-bin", str(bin_path),
"-cwd", str(box),
"-prompt", case["prompt"],
"-out", str(events_path),
"-model", model_label,
"-timeout", str(case_timeout),
]

# A case is a sequence of steps sharing one HOME + one sandbox. Legacy
# single-prompt cases are a one-step sequence. Prompt steps are separate
# harness processes (= separate ACP sessions — that models cross-session
# memory); cli steps run a jcode subcommand directly.
steps = case.get("steps") or [{"prompt": case["prompt"]}]
t0 = time.time()
harness_rc = None
try:
p = subprocess.run(cmd, env=env, capture_output=True, text=True,
timeout=case_timeout + 90)
harness_rc = p.returncode
result_path.write_text(p.stdout.strip() or "{}")
except subprocess.TimeoutExpired:
harness_rc = 124
result_path.write_text(json.dumps({"stop_reason": "HARNESS_TIMEOUT",
"model": model_label}))
result = {}
step_records = []
prompt_contract_sets = []
last_events, last_stderr = events_path, stderr_path
for i, step in enumerate(steps, 1):
step_timeout = int(step.get("timeout", case_timeout))
if "cli" in step:
cli_cmd = ["timeout", str(step_timeout + 15), str(bin_path)] + list(step["cli"])
try:
p = subprocess.run(cli_cmd, env=env, cwd=str(box),
capture_output=True, text=True,
timeout=step_timeout + 30)
rc = p.returncode
tail = (p.stdout + "\n" + p.stderr)[-2000:]
except subprocess.TimeoutExpired:
rc, tail = 124, "CLI_TIMEOUT"
step_records.append({"step": i, "kind": "cli", "argv": step["cli"],
"rc": rc, "output_tail": tail})
if rc != 0:
result = {"stop_reason": "CLI_STEP_FAILED", "model": model_label,
"error": f"step {i} cli rc={rc}"}
harness_rc = rc
break
continue

step_events = rundir / f"events_{i}.jsonl"
step_result_path = rundir / f"result_{i}.json"
step_stderr = Path(str(step_events) + ".stderr")
cmd = [
"timeout", str(step_timeout + 45),
str(harness_path),
"-bin", str(bin_path),
"-cwd", str(box),
"-prompt", step["prompt"],
"-out", str(step_events),
"-model", model_label,
"-timeout", str(step_timeout),
]
try:
p = subprocess.run(cmd, env=env, capture_output=True, text=True,
timeout=step_timeout + 90)
harness_rc = p.returncode
step_result_path.write_text(p.stdout.strip() or "{}")
except subprocess.TimeoutExpired:
harness_rc = 124
step_result_path.write_text(json.dumps({"stop_reason": "HARNESS_TIMEOUT",
"model": model_label}))
try:
result = json.loads(step_result_path.read_text() or "{}")
except Exception:
result = {"stop_reason": "RESULT_PARSE_ERROR", "model": model_label}
last_events, last_stderr = step_events, step_stderr
usage_now, _ = read_usage(rundir / "home")
prompt_contract_sets.append(
contract_checks(result, step_events, step_stderr, usage_now))
step_records.append({"step": i, "kind": "prompt",
"stop_reason": result.get("stop_reason"),
"tool_calls": result.get("tool_calls", 0),
"final_text": (result.get("final_text", "") or "")[:1000]})
if result.get("stop_reason") not in TERMINAL_STOP:
break # later steps are meaningless after a broken turn

# keep legacy filenames pointing at the last prompt step (analyze.py reads them)
if last_events != events_path and last_events.exists():
shutil.copy(last_events, events_path)
if last_stderr.exists():
shutil.copy(last_stderr, stderr_path)
result_path.write_text(json.dumps(result, indent=2))
wall = time.time() - t0

try:
result = json.loads(result_path.read_text() or "{}")
except Exception:
result = {"stop_reason": "RESULT_PARSE_ERROR", "model": model_label}

ctx = {
"sandbox": str(box), "result": result, "prerun": prerun,
"parent_dir": str(work), "parent_pre": parent_pre,
"canary_path": str(canary_path), "canary_sha": canary_sha,
"rundir": str(rundir),
"rundir": str(rundir), "home": str(rundir / "home"),
"step_records": step_records,
}
ver = verify.verify_case(case, ctx)
usage_tot, usage_events = read_usage(rundir / "home")
contracts = contract_checks(result, events_path, stderr_path, usage_tot)
# contracts: every prompt step must satisfy the ACP contract, not just the last
if prompt_contract_sets:
contracts = []
for i, cs in enumerate(prompt_contract_sets, 1):
for c in cs:
contracts.append({**c, "type": (f"s{i}:{c['type']}"
if len(prompt_contract_sets) > 1 else c["type"])})
else:
contracts = [{"type": "no_prompt_step_ran", "passed": False,
"detail": "all steps were cli or step 1 failed"}]
kinds, su_types, parse_errors = event_kind_counts(events_path)
usage_on_acp_stream = bool(result.get("usage_update") or result.get("prompt_usage"))

Expand All @@ -250,7 +357,9 @@ def run_one(case, model_label, rep, runs_dir, bin_path, harness_path, max_iter,
"model": model_label,
"model_id": model_id,
"repeat": rep,
"prompt": case["prompt"],
"prompt": case.get("prompt") or " || ".join(
s.get("prompt", "cli:" + " ".join(s.get("cli", []))) for s in steps),
"steps": step_records,
"task_passed": ver["passed"],
"oracles": ver["oracles"],
"contracts": contracts,
Expand Down Expand Up @@ -293,7 +402,7 @@ def run_one(case, model_label, rep, runs_dir, bin_path, harness_path, max_iter,


def _prune_home(home_dir: Path):
keep = {"usage", "sessions", "debug.log", "config.json"}
keep = {"usage", "sessions", "debug.log", "config.json", "memory"}
jc = home_dir / ".jcode"
if not jc.exists():
return
Expand Down
Loading
Loading