Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 4 additions & 10 deletions .github/workflows/evals.yml
Original file line number Diff line number Diff line change
Expand Up @@ -11,13 +11,10 @@ on:
push:
branches: [main]
paths:
- "evals/**"
- "scripts/scaffold-eval.ts"
- "src/evals/**"
- "packages/evals/**"
- ".github/workflows/evals.yml"
- "package.json"
- "pnpm-lock.yaml"
- "vitest.evals.config.ts"
pull_request:
types: [opened, synchronize, reopened, labeled]

Expand All @@ -33,13 +30,10 @@ jobs:
with:
script: |
const evalPaths = [
'evals/',
'scripts/scaffold-eval.ts',
'src/evals/',
'packages/evals/',
'.github/workflows/evals.yml',
'package.json',
'pnpm-lock.yaml',
'vitest.evals.config.ts',
];

function setRun(run, reason) {
Expand Down Expand Up @@ -113,8 +107,8 @@ jobs:

- name: Run evals
env:
VITEST_EVALS_JSON: eval-results.json
VITEST_EVALS_JUNIT: eval-results.junit.xml
VITEST_EVALS_JSON: ../../eval-results.json
VITEST_EVALS_JUNIT: ../../eval-results.junit.xml
run: |
set +e
pnpm evals
Expand Down
3 changes: 2 additions & 1 deletion .npmignore
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ action.yml
*.test.d.ts
**/*.test.*
vitest*.config.ts
coverage/

# Environment and secrets (.npmignore overrides .gitignore)
.env
Expand All @@ -24,6 +25,7 @@ vitest*.config.ts
.github/
.agents/
.codex/
.cursor/
.warden/
.claude/
.dex/
Expand All @@ -34,7 +36,6 @@ pnpm-lock.yaml
pnpm-workspace.yaml

# Evals and dev scripts
evals/
/scripts/
superwarden-bench/

Expand Down
7 changes: 4 additions & 3 deletions .oxlintrc.json
Original file line number Diff line number Diff line change
Expand Up @@ -12,10 +12,11 @@
"ignorePatterns": [
"dist/**",
"node_modules/**",
"coverage/**",
"*.js",
"examples/**",
"scripts/**",
"vitest*.config.ts"
"packages/docs/**/*.astro",
"packages/evals/fixtures/**"
],
"rules": {
"constructor-super": "error",
Expand Down Expand Up @@ -166,4 +167,4 @@
}
}
]
}
}
12 changes: 2 additions & 10 deletions AGENTS.md
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ TELEMETRY.md # Sentry telemetry investigation map; points to Junior's

packages/
├── docs/ # Astro docs site (dex-docs), deployed via Vercel
└── evals/ # Private eval package: runner, scenarios, fixtures, test skills

src/ # @sentry/warden core (root package)
├── index.ts # Library entry point
Expand All @@ -37,17 +38,8 @@ src/ # @sentry/warden core (root package)
├── cli/ # CLI entry and commands
│ └── output/ # CLI output formatting
├── action/ # GitHub Action entry
├── evals/ # Eval runner, judge, and types
├── utils/ # Shared utilities
└── examples/ # Example configurations

evals/ # Eval specs, fixtures, and test skills (see evals/README.md)
├── eval-*.yaml # Harness smoke suites using eval-* test skills
├── code-review/ # Code-review benchmark scenarios
├── security-review/ # Security-review benchmark scenarios
├── verification/ # Verifier-only eval scenarios
├── skills/ # Test skills used as eval vehicles
└── fixtures/ # Source code with known issues
```

## Key Conventions
Expand Down Expand Up @@ -132,7 +124,7 @@ Skills define **what to look for**, not how to respond to findings:

## Evals

End-to-end evals for the full pipeline plus verifier-only evals. The Vitest entrypoints are split as `src/evals/e2e.eval.ts`, `src/evals/code-review.eval.ts`, `src/evals/security-review.eval.ts`, and `src/evals/verify.eval.ts`. See [`evals/INTERNAL.md`](evals/INTERNAL.md) for maintainer workflow and [`evals/README.md`](evals/README.md) for schemas. Run with `pnpm evals`; scaffold PR fixtures with `pnpm evals:scaffold <github-pr-url>`.
End-to-end evals for the full pipeline plus verifier-only evals live in `packages/evals/`. See [`packages/evals/INTERNAL.md`](packages/evals/INTERNAL.md) for maintainer workflow and [`packages/evals/README.md`](packages/evals/README.md) for schemas. Run with `pnpm evals`; scaffold PR fixtures with `pnpm evals:scaffold <github-pr-url>`.

## Voice

Expand Down
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ pnpm test:coverage # unit tests with LCOV coverage
pnpm evals # end-to-end evals (requires API key)
```

See [`evals/README.md`](evals/README.md) for the eval framework.
See [`packages/evals/README.md`](packages/evals/README.md) for the eval framework.

## License

Expand Down
15 changes: 0 additions & 15 deletions evals/code-review/robots-prefix-blocks-public-metadata.json

This file was deleted.

This file was deleted.

This file was deleted.

This file was deleted.

15 changes: 0 additions & 15 deletions evals/security-review/sentry-preprod-snapshot-project-access.json

This file was deleted.

This file was deleted.

This file was deleted.

15 changes: 0 additions & 15 deletions evals/security-review/sentry-replay-delete-read-scope.json

This file was deleted.

15 changes: 0 additions & 15 deletions evals/security-review/sentry-workflow-connect-workflows-authz.json

This file was deleted.

This file was deleted.

22 changes: 15 additions & 7 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -15,15 +15,15 @@
"build": "pnpm run clean:dist && tsc -p tsconfig.build.json",
"build:action": "rm -rf dist/action && ncc build src/action/main.ts -o dist/action --no-source-map-register --license licenses.txt",
"dev": "tsc --watch",
"lint": "oxlint src",
"lint:fix": "oxlint --fix src",
"lint": "oxlint .",
"lint:fix": "oxlint --fix .",
"test": "vitest run",
"test:coverage": "vitest run --coverage",
"test:watch": "vitest",
"test:examples": "vitest run --config vitest.integration.config.ts",
"evals": "vitest run --config vitest.evals.config.ts",
"evals:scaffold": "tsx scripts/scaffold-eval.ts",
"typecheck": "tsc --noEmit",
"evals": "pnpm -C packages/evals evals",
"evals:scaffold": "pnpm -C packages/evals scaffold",
"typecheck": "tsc --noEmit && pnpm -C packages/evals typecheck",
"update-pricing": "tsx scripts/update-pricing.ts",
"generate:jsonl-schema": "tsx scripts/generate-jsonl-schema.ts",
"docs": "pnpm --filter dex-docs dev",
Expand All @@ -36,6 +36,15 @@
"src/**/*.ts": [
"oxlint --fix"
],
"packages/evals/src/**/*.ts": [
"oxlint --fix"
],
"packages/evals/scripts/**/*.ts": [
"oxlint --fix"
],
"packages/evals/*.ts": [
"oxlint --fix"
],
"packages/docs/**/*.astro": [
"pnpm -C packages/docs build"
]
Expand Down Expand Up @@ -89,8 +98,7 @@
"tinyrainbow": "^3.0.3",
"tsx": "^4.19.0",
"typescript": "^5.9.3",
"vitest": "^4.1.6",
"vitest-evals": "0.9.0-beta.3"
"vitest": "^4.1.6"
},
"engines": {
"node": ">=20.0.0"
Expand Down
Loading
Loading