criptogus · criptogus · May 16, 2026 · May 16, 2026 · May 16, 2026 · May 16, 2026
diff --git a/.github/workflows/validate-content.yml b/.github/workflows/validate-content.yml
@@ -5,12 +5,16 @@ on:
     paths:
       - "content/**"
       - "scripts/validate-content.mjs"
+      - "scripts/audit-skills.mjs"
+      - "src/lib/security/prompt-injection-guard.ts"
       - ".github/workflows/validate-content.yml"
   push:
     branches: [main]
     paths:
       - "content/**"
       - "scripts/validate-content.mjs"
+      - "scripts/audit-skills.mjs"
+      - "src/lib/security/prompt-injection-guard.ts"
 
 jobs:
   validate:
@@ -24,3 +28,5 @@ jobs:
         run: bun install --frozen-lockfile
       - name: Validate registry
         run: bun run validate:content
+      - name: Security audit (prompt injection / malicious functions)
+        run: bun run audit:skills
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
@@ -74,9 +74,20 @@ A package is accepted when it:
 
 ```bash
 bun run validate:content
+bun run audit:skills
 ```
 
-The validator checks schema, slug uniqueness, file naming and example count. CI runs the same script on every PR.
+The validator checks schema, slug uniqueness, file naming and example count.
+
+`audit:skills` is the **marketplace security gate**: every package is scanned
+for prompt-injection / jailbreak signals (shared with the runtime guard) and
+for malicious "functions" embedded in instructions — remote code execution
+(`curl … | sh`), credential/dotenv exfiltration, reverse shells, beacons to
+non-allowlisted hosts, hardcoded keys, and obfuscated payloads. A package is
+**rejected** when its worst finding is `high` or `critical`. A skill whose job
+*is* security testing must declare a `security` / `red-team` / `adversarial`
+tag so its quoted example payloads are treated as data, not as attacks. Both
+scripts run in CI on every PR that touches `content/`.
 
 ### Step 5 — Open a pull request
 

diff --git a/package.json b/package.json
@@ -11,6 +11,7 @@
     "lint": "eslint .",
     "format": "prettier --write .",
     "validate:content": "node --experimental-strip-types scripts/validate-content.mjs",
+    "audit:skills": "node --experimental-strip-types scripts/audit-skills.mjs",
     "sync:content": "node scripts/sync-content-to-registry.mjs",
     "sync:adversarial": "node scripts/sync-adversarial-cases.mjs",
     "eval:adversarial": "node scripts/eval-adversarial.mjs",
@@ -23,7 +24,7 @@
     "trust:verify": "node scripts/verify-trust-attestation.mjs",
     "test": "npm run test:plain && npm run test:ts",
     "test:plain": "node --test tests/adversarial-harness.test.mjs tests/trust.test.mjs tests/release-signing.test.mjs tests/cli-install.test.mjs tests/trust-attestation.test.mjs",
-    "test:ts": "node --experimental-strip-types --test tests/prompt-injection-guard.test.mjs tests/runtime.test.mjs tests/integrations.test.mjs tests/growth-revenue-split.test.mjs tests/trust-badge.test.mjs tests/bounties.test.mjs"
+    "test:ts": "node --experimental-strip-types --test tests/prompt-injection-guard.test.mjs tests/audit-skills.test.mjs tests/runtime.test.mjs tests/integrations.test.mjs tests/growth-revenue-split.test.mjs tests/trust-badge.test.mjs tests/bounties.test.mjs"
   },
   "dependencies": {
     "@ai-sdk/openai-compatible": "^2.0.47",

diff --git a/scripts/audit-skills.mjs b/scripts/audit-skills.mjs
@@ -0,0 +1,242 @@
+#!/usr/bin/env node
+// Marketplace security audit gate.
+//
+// Every package that is published to the marketplace (skills, playbooks,
+// souls, integrations) is scanned for malicious content BEFORE it can be
+// synced/released:
+//   1. Prompt-injection / jailbreak signals — reuses the production guard in
+//      src/lib/security/prompt-injection-guard.ts so the gate and the runtime
+//      stay in sync.
+//   2. Malicious "functions" — shell/network primitives embedded in a skill's
+//      instructions that would make an agent exfiltrate data, run remote code,
+//      wipe disks, or phone home to a non-allowlisted host.
+//
+// A package is BLOCKED (non-zero exit) when its worst finding is at or above
+// the configured threshold (default: "high"). Lower-severity signals are
+// reported as warnings but do not fail the build.
+//
+// Usage:  npm run audit:skills
+//         node --experimental-strip-types scripts/audit-skills.mjs
+//         node --experimental-strip-types scripts/audit-skills.mjs --json
+//         AUDIT_REJECT_AT=critical npm run audit:skills
+
+import { readdirSync, readFileSync, statSync } from "node:fs";
+import { join, basename } from "node:path";
+import { fileURLToPath, pathToFileURL } from "node:url";
+import { parse as parseYaml } from "yaml";
+import { inspectContent } from "../src/lib/security/prompt-injection-guard.ts";
+
+const ROOT = fileURLToPath(new URL("..", import.meta.url)); // real FS path: decodes %20 etc., keeps the trailing separator
+const FOLDERS = ["skills", "playbooks", "souls", "integrations"];
+const JSON_OUT = process.argv.includes("--json");
+const SEVERITY_RANK = { none: 0, low: 1, medium: 2, high: 3, critical: 4 };
+const requestedRejectAt = (process.env.AUDIT_REJECT_AT ?? "").trim().toLowerCase();
+const REJECT_AT = Object.keys(SEVERITY_RANK).includes(requestedRejectAt) ? requestedRejectAt : "high"; // unknown value must not disable the gate
+const worse = (a, b) => (SEVERITY_RANK[a] >= SEVERITY_RANK[b] ? a : b);
+
+// Hosts (and their subdomains) a skill may legitimately reference. An outbound request
+// to one is demoted to low; any other host with an outbound verb counts as exfiltration.
+const ALLOWED_HOSTS = [
+  "superagentskill.com",
+  "github.com",
+  "raw.githubusercontent.com",
+  "api.github.com",
+  "developer.mozilla.org",
+  "owasp.org",
+];
+
+// Malicious-function heuristics layered on top of the prompt-injection guard, most dangerous
+// first. `\b` sits only beside word characters: next to `:`, `$`, `/` or `\` it cannot match after whitespace.
+const CODE_RULES = [
+  { label: "remote code execution (curl|wget piped to shell)", category: "malicious_function", severity: "critical",
+    re: /\b(curl|wget|fetch)\b[^\n]{0,120}\|\s*(sudo\s+)?(ba|z|d|fi)?sh\b/gi },
+  { label: "destructive filesystem wipe", category: "malicious_function", severity: "critical",
+    re: /\brm\s+-[a-z]*r[a-z]*f?\b[^\n]{0,40}(\/|~|\$HOME|\*)/gi },
+  { label: "disk overwrite (dd / mkfs)", category: "malicious_function", severity: "critical",
+    re: /(\bdd\s+if=|\bmkfs(\.\w+)?\s|:\(\)\s*\{\s*:\|:&\s*\};:)/gi },
+  { label: "dynamic code eval of decoded payload", category: "malicious_function", severity: "critical",
+    re: /\b(eval|exec|Function|child_process|os\.system|subprocess|popen)\b[^\n]{0,60}(\b(?:base64|atob|fromCharCode|decode)|\$\()/gi },
+  { label: "credential / dotenv exfiltration", category: "data_exfiltration", severity: "critical",
+    re: /\b(cat|read|upload|post|send|exfiltrat\w*)\b[^\n]{0,60}(\.env|\.aws|\.ssh|id_rsa|credentials|secrets?\.(json|ya?ml)|process\.env)\b/gi },
+  { label: "reverse shell", category: "malicious_function", severity: "critical",
+    re: /(\b(?:bash\s+-i|nc\s+-e|ncat|socat)\b|\/dev\/tcp\/)[^\n]{0,60}(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}|\d{2,5})/gi },
+  { label: "outbound request to embedded URL", category: "data_exfiltration", severity: "high",
+    re: /\b(curl|wget|fetch|axios|requests?\.(get|post)|http\.request|navigator\.sendBeacon|XMLHttpRequest)\b[^\n]{0,80}https?:\/\/[^\s'"`)]+/gi },
+  { label: "webhook / pastebin beacon", category: "data_exfiltration", severity: "high",
+    re: /https?:\/\/(?:[^\s'"`)/]+\.)?(webhook\.site|requestbin\.\w+|pipedream\.net|ngrok\.[a-z]+|pastebin\.com|hookb\.in|burpcollaborator\.net|interactsh\.\w+|oast\.\w+)/gi },
+  { label: "hardcoded private key / token", category: "data_exfiltration", severity: "high",
+    re: /(-----BEGIN [A-Z ]*PRIVATE KEY-----|\b(sk|rk|pk)_(live|test)_[A-Za-z0-9]{16,}|\bghp_[A-Za-z0-9]{30,}|\bAKIA[0-9A-Z]{16}\b|\bxox[baprs]-[A-Za-z0-9-]{10,})/g },
+  { label: "package install of arbitrary source", category: "malicious_function", severity: "medium",
+    re: /\b(npm\s+i(nstall)?|pip\s+install|gem\s+install|go\s+install)\b[^\n]{0,80}(https?:\/\/|git\+|github:[^\s]+\/)/gi },
+  { label: "obfuscated payload (long base64 / hex blob)", category: "encoding_evasion", severity: "medium",
+    re: /(?:\b[A-Za-z0-9+/]{120,}={0,2}\b|(?:\\x[0-9a-fA-F]{2}){40,})/g },
+];
+
+// Marketplace-generated boilerplate that is structurally safe but trips the
+// generic "force tool invocation" pattern. Stripped before scanning so the
+// gate stays high-precision.
+const BENIGN_BOILERPLATE = [
+  /ask the agent to invoke this skill by name[^.]*?trigger phrases?[^.]*?\./gis, // case-insensitive; runs to the next full stop, across line breaks
+];
+
+// Tags that mark a package as a deliberate security / red-team fixture. For
+// these, quoted attack strings in `examples`, `description` and `long_description`
+// are the product, not an attack on the running agent, so findings there are demoted.
+const SECURITY_FIXTURE_TAGS = new Set([
+  "security", "red-team", "redteam", "adversarial", "ai-safety", "owasp-llm",
+]);
+const DEMOTE_IN_EXAMPLES = new Set([ // finding categories eligible for demotion
+  "instruction_override", "role_hijack", "system_prompt_leak",
+  "policy_bypass", "encoding_evasion",
+]);
+const DEMOTE = { critical: "medium", high: "low", medium: "low", low: "none", none: "none" }; // demoted severity; findings that reach "none" are dropped
+
+function urlHost(u) { // lower-cased host (including any port) of `u`, or "" when `u` does not parse as a URL
+  try { return new URL(u).host.toLowerCase(); } catch { return ""; }
+}
+
+// Demote an "outbound request" finding to low only when EVERY URL in the match
+// targets an allowlisted host; a single unknown host keeps the rule's severity.
+function effectiveSeverity(rule, matchText) {
+  if (rule.label.startsWith("outbound request")) {
+    // The rule's greedy `[^\n]{0,80}` prefix can swallow several URLs, so check
+    // them all: `curl https://github.com/x https://evil.tld` must stay high.
+    const hosts = (matchText.match(/https?:\/\/[^\s'"`)]+/g) ?? []).map(urlHost);
+    const allowed = (host) => ALLOWED_HOSTS.some((h) => host === h || host.endsWith("." + h));
+    if (hosts.length > 0 && hosts.every(allowed)) return "low";
+  }
+  return rule.severity;
+}
+
+function scanCode(text) {
+  // One finding per regex hit, each with about 20 characters of context on either side.
+  const hits = [];
+  CODE_RULES.forEach((rule) => {
+    const { re } = rule;
+    re.lastIndex = 0; // the rule regexes are shared, so clear any leftover scan position
+    for (let hit = re.exec(text); hit !== null; hit = re.global ? re.exec(text) : null) {
+      const [matched] = hit;
+      const from = Math.max(0, hit.index - 20);
+      const upTo = hit.index + matched.length + 20;
+      hits.push({
+        pattern: rule.label,
+        category: rule.category,
+        severity: effectiveSeverity(rule, matched),
+        excerpt: text.slice(from, upTo).slice(0, 160).replace(/\s+/g, " "),
+        offset: hit.index,
+      });
+    }
+  });
+  const worst = hits.reduce((acc, h) => worse(acc, h.severity), "none");
+  return { findings: hits, severity: worst };
+}
+
+// Gather the text of each scanned package field, keyed by field name, so the
+// gate can apply field-aware rules (e.g. demote attack strings inside the
+// `examples` of a declared security fixture). Null/absent fields are omitted.
+function harvestFields(pkg) {
+  const strings = (node) => {
+    if (typeof node === "string") return [node];
+    if (Array.isArray(node)) return node.flatMap(strings);
+    return node && typeof node === "object" ? Object.values(node).flatMap(strings) : [];
+  };
+  const scanned = [
+    "description", "long_description", "system_prompt", "agent_footer",
+    "rules", "examples", "persona", "instructions", "steps", "body",
+  ];
+  const out = {};
+  for (const key of scanned.filter((k) => pkg[k] != null)) {
+    out[key] = BENIGN_BOILERPLATE.reduce(
+      (text, re) => text.replace(re, " "),
+      strings(pkg[key]).join("\n"),
+    );
+  }
+  return out;
+}
+
+// Audit one parsed package: returns its worst severity, whether that reaches
+// REJECT_AT (`blocked`), and every finding tagged with its field. Exported for unit tests.
+export function auditPackage(pkg) {
+  const perField = Object.entries(harvestFields(pkg));
+  const tagSet = new Set((Array.isArray(pkg.tags) ? pkg.tags : []).map((t) => String(t).toLowerCase()));
+  const isFixture = [...SECURITY_FIXTURE_TAGS].some((t) => tagSet.has(t));
+  const demotableFields = new Set(isFixture ? ["examples", "description", "long_description"] : []);
+  const findings = [];
+  let worst = "none";
+  for (const [field, text] of perField) {
+    const { findings: injection } = inspectContent(text, { fence: false, rejectAtOrAbove: "critical" });
+    const { findings: code } = scanCode(text);
+    for (const hit of [...injection, ...code]) {
+      const demote = demotableFields.has(field) && DEMOTE_IN_EXAMPLES.has(hit.category);
+      const sev = demote ? DEMOTE[hit.severity] : hit.severity;
+      if (sev !== "none") {
+        findings.push({ ...hit, severity: sev, field });
+        worst = worse(worst, sev);
+      }
+    }
+  }
+  const blocked = SEVERITY_RANK[worst] >= SEVERITY_RANK[REJECT_AT];
+  return { severity: worst, blocked, findings };
+}
+// CLI entry point: audit every YAML package under content/{FOLDERS}, print a report, exit 1 if any is blocked.
+function main() {
+const results = [];
+let blocked = 0;
+let totalFindings = 0;
+
+for (const folder of FOLDERS) {
+  const dir = join(ROOT, "content", folder);
+  let entries;
+  try { entries = readdirSync(dir); } catch { continue; } // folder absent or unreadable: nothing to scan
+  for (const entry of entries) {
+    const full = join(dir, entry);
+    if (!statSync(full).isFile() || !/\.ya?ml$/i.test(entry)) continue;
+    if (basename(entry).startsWith("_")) continue;
+    // Fail closed: a file that cannot be parsed cannot be audited, so it is
+    // reported as a blocking finding instead of being silently skipped.
+    let pkg, audit;
+    try { pkg = parseYaml(readFileSync(full, "utf8")); }
+    catch (err) {
+      const excerpt = String(err instanceof Error ? err.message : err).replace(/\s+/g, " ").slice(0, 160);
+      const finding = { pattern: "unparseable YAML", category: "parse_error", severity: "critical", excerpt, offset: 0, field: "file" };
+      audit = { severity: "critical", blocked: true, findings: [finding] };
+    }
+    if (!pkg && !audit) continue; // empty document: nothing to audit
+    const { severity, blocked: reject, findings } = audit ?? auditPackage(pkg);
+    if (reject) blocked++;
+    totalFindings += findings.length;
+    if (findings.length) {
+      results.push({
+        file: full.replace(ROOT, ""),
+        slug: pkg?.slug ?? basename(entry),
+        severity,
+        blocked: reject,
+        findings: findings.sort((a, b) => SEVERITY_RANK[b.severity] - SEVERITY_RANK[a.severity]),
+      });
+    }
+  }
+}
+
+if (JSON_OUT) {
+  console.log(JSON.stringify({ rejectAt: REJECT_AT, blocked, results }, null, 2));
+} else {
+  const C = { red: "\x1b[31m", yellow: "\x1b[33m", green: "\x1b[32m", dim: "\x1b[2m", reset: "\x1b[0m" }; // ESC-prefixed ANSI colour codes
+  if (!results.length) {
+    console.log(`${C.green}✓${C.reset} No malicious or injection signals in any marketplace package.`);
+  }
+  for (const r of results.sort((a, b) => SEVERITY_RANK[b.severity] - SEVERITY_RANK[a.severity])) {
+    const tag = r.blocked ? `${C.red}BLOCKED${C.reset}` : `${C.yellow}WARN${C.reset}`;
+    console.log(`\n${tag} ${r.file} ${C.dim}(worst: ${r.severity})${C.reset}`);
+    for (const f of r.findings) {
+      const col = SEVERITY_RANK[f.severity] >= 3 ? C.red : C.yellow;
+      console.log(`  ${col}[${f.severity}]${C.reset} ${f.category} — ${f.pattern} ${C.dim}(${f.field})${C.reset}`);
+      console.log(`    ${C.dim}@${f.offset}: …${f.excerpt}…${C.reset}`);
+    }
+  }
+  console.log(`\nScanned content/{${FOLDERS.join(",")}} — ${totalFindings} finding(s), ${blocked} package(s) at/above "${REJECT_AT}".`);
+}
+
+process.exit(blocked > 0 ? 1 : 0);
+}
+// Run the CLI only when this file is the script Node was started with, not when it is imported.
+if (import.meta.url === pathToFileURL(process.argv[1] ?? "").href) main();
diff --git a/src/lib/admin/author.server.ts b/src/lib/admin/author.server.ts
@@ -37,7 +37,7 @@ export async function insertDraftPackage(
   supabase: any,
   userId: string,
   draft: any,
-  meta: { source_kind: "github" | "markdown" | "request" | "wizard"; source_ref: string; publish?: boolean }
+  meta: { source_kind: "github" | "markdown" | "request" | "wizard"; source_ref: string }
 ) {
   const baseSlug = draft.slug;
   let slug = baseSlug;
@@ -63,9 +63,14 @@ export async function insertDraftPackage(
       description: draft.description,
       long_description: draft.long_description,
       author_id: userId,
-      author_handle: "@admin",
-      author_verified: true,
-      is_published: !!meta.publish,
+      // Trust fields are NOT self-asserted. A new draft is unverified and
+      // unreviewed; `author_verified` and `review_status='approved'` are only
+      // ever granted by an admin via the review workflow. The DB also enforces
+      // this with a BEFORE UPDATE trigger so a compromised/abused client
+      // cannot escalate via direct RLS writes.
+      author_verified: false,
+      is_published: false,
+      review_status: "draft",
       latest_version: "0.1.0",
       scopes: draft.scopes,
       source_kind: meta.source_kind,
@@ -78,7 +83,7 @@ export async function insertDraftPackage(
   const { error: verErr } = await supabase.from("package_versions").insert({
     package_id: pkg.id,
     version: "0.1.0",
-    status: meta.publish ? "stable" : "beta",
+    status: "beta",
     notes: `Source: ${meta.source_kind} (${meta.source_ref})`,
     system_prompt: draft.system_prompt,
     rules: draft.rules,

diff --git a/src/lib/admin/imports.functions.ts b/src/lib/admin/imports.functions.ts
@@ -25,8 +25,16 @@ export const wizardCreatePackage = createServerFn({ method: "POST" })
     const pkg = await insertDraftPackage(supabase, userId, draft, {
       source_kind: "wizard",
       source_ref: vertical || "wizard",
-      publish: data.publish,
     });
+    // Publishing always goes through the single gated path
+    // (setReviewStatus → mandatory adversarial gate). `publish` only submits
+    // the draft into the review queue; it never auto-approves.
+    if (data.publish) {
+      await supabase
+        .from("packages")
+        .update({ review_status: "pending", submitted_at: new Date().toISOString() })
+        .eq("id", pkg.id);
+    }
     return { package: pkg, draft };
   });
 

diff --git a/src/lib/admin/meta-ads-pack.functions.ts b/src/lib/admin/meta-ads-pack.functions.ts
@@ -105,8 +105,17 @@ export const generateMetaAdsBlueprint = createServerFn({ method: "POST" })
     const pkg = await insertDraftPackage(supabase, userId, draft, {
       source_kind: "wizard",
       source_ref: `meta-ads-mcp:${bp.id}`,
-      publish: data.publish,
     });
+    // insertDraftPackage always creates a private, unverified draft. Even for
+    // this admin flow, publishing goes through the single gated path
+    // (setReviewStatus → mandatory adversarial gate). `publish` here only
+    // submits the draft into the review queue; it never auto-approves.
+    if (data.publish) {
+      await supabase
+        .from("packages")
+        .update({ review_status: "pending", submitted_at: new Date().toISOString() })
+        .eq("id", pkg.id);
+    }
 
     // 4) Patch the version row with mcp_servers / permissions / live_resources columns
     //    (insertDraftPackage's INSERT only writes the core version fields).