Skip to content

Commit 4e96f92

Browse files
jddunn and claude committed
feat(ml-classifiers): add Toxicity, Injection, and Jailbreak classifiers
Implements the three default IContentClassifier implementations for the ML classifier guardrail extension pack: ToxicityClassifier (unitary/toxic-bert, 6 multi-label categories), InjectionClassifier (protectai/deberta-v3-small- prompt-injection-v2, binary INJECTION/SAFE), and JailbreakClassifier (meta-llama/PromptGuard-86M, 3-class jailbreak/injection/benign). All three use lazy pipeline loading via ISharedServiceRegistry with graceful degradation on model load failure. 62 unit tests added, all passing. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
1 parent 68e0aee commit 4e96f92

File tree

6 files changed

+1609
-0
lines changed

6 files changed

+1609
-0
lines changed
Lines changed: 261 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,261 @@
1+
/**
2+
* @fileoverview Prompt-injection content classifier using the
3+
* `protectai/deberta-v3-small-prompt-injection-v2` model.
4+
*
5+
* Prompt injection is the attack pattern where adversarial instructions are
6+
* embedded inside user-supplied text to override or hijack the agent's system
7+
* prompt. This classifier provides a dedicated binary signal (INJECTION /
8+
* SAFE) that the guardrail orchestrator can act on independently of the
9+
* toxicity or jailbreak classifiers.
10+
*
11+
* Model details
12+
* -------------
13+
* `protectai/deberta-v3-small-prompt-injection-v2` is a fine-tuned DeBERTa
14+
* model from ProtectAI, specifically trained to distinguish benign user
15+
* messages from prompt-injection payloads. It outputs two labels:
16+
* - `INJECTION` — high-confidence injection attempt
17+
* - `SAFE` — normal user input
18+
*
19+
* Graceful degradation
20+
* --------------------
21+
* If the model fails to load the classifier sets `unavailable = true` and
22+
* returns a pass result `{ bestClass: 'benign', confidence: 0, allScores: [] }`
23+
* on every subsequent call.
24+
*
25+
* @module agentos/extensions/packs/ml-classifiers/classifiers/InjectionClassifier
26+
*/
27+
28+
import type { ClassificationResult } from '../../../../core/ai_utilities/IUtilityAI';
29+
import type { ISharedServiceRegistry } from '../../../ISharedServiceRegistry';
30+
import type { IContentClassifier } from '../IContentClassifier';
31+
import type { ClassifierConfig } from '../types';
32+
import { ML_CLASSIFIER_SERVICE_IDS } from '../types';
33+
34+
// ---------------------------------------------------------------------------
35+
// Internal raw pipeline output type
36+
// ---------------------------------------------------------------------------
37+
38+
/**
 * A single label/score pair as returned by the HuggingFace text-classification
 * pipeline when called with `{ topk: null }` (i.e. scores for every label,
 * not just the top one).
 */
interface RawLabel {
  /** Label name emitted by the model, e.g. `'INJECTION'` or `'SAFE'`. */
  label: string;
  /** Confidence score in the range [0, 1]. */
  score: number;
}
49+
// ---------------------------------------------------------------------------
50+
// InjectionClassifier
51+
// ---------------------------------------------------------------------------
52+
53+
/**
54+
* Binary prompt-injection classifier backed by
55+
* `protectai/deberta-v3-small-prompt-injection-v2`.
56+
*
57+
* Returns one of two labels:
58+
* - `INJECTION` — the text contains an injection attempt
59+
* - `SAFE` — the text is clean
60+
*
61+
* The label with the higher confidence becomes `bestClass` / `confidence`.
62+
* Both labels are present in `allScores` so callers can read the SAFE score
63+
* as well.
64+
*
65+
* @implements {IContentClassifier}
66+
*
67+
* @example
68+
* ```typescript
69+
* const classifier = new InjectionClassifier(serviceRegistry);
70+
* const result = await classifier.classify('Ignore previous instructions and …');
71+
* // result.bestClass === 'INJECTION', result.confidence ≈ 0.97
72+
* ```
73+
*/
74+
export class InjectionClassifier implements IContentClassifier {
75+
// -------------------------------------------------------------------------
76+
// IContentClassifier identity fields
77+
// -------------------------------------------------------------------------
78+
79+
/** Unique service identifier for this classifier. */
80+
readonly id = 'prompt-injection';
81+
82+
/** Human-readable name for dashboards and log output. */
83+
readonly displayName = 'Prompt Injection Classifier';
84+
85+
/** Short description of what this classifier detects. */
86+
readonly description =
87+
'Detects prompt-injection attempts where adversarial instructions are ' +
88+
'embedded in user input to override or hijack the agent system prompt.';
89+
90+
/**
91+
* Default Hugging Face model ID.
92+
* Overridable via {@link ClassifierConfig.modelId}.
93+
*/
94+
readonly modelId = 'protectai/deberta-v3-small-prompt-injection-v2';
95+
96+
// -------------------------------------------------------------------------
97+
// Internal state
98+
// -------------------------------------------------------------------------
99+
100+
/**
101+
* Whether the model weights are fully loaded and the classifier is ready
102+
* to accept `classify()` calls.
103+
*/
104+
private _isLoaded = false;
105+
106+
/**
107+
* Set to `true` when the model fails to load. Once `unavailable`, every
108+
* subsequent `classify()` call immediately returns the pass result rather
109+
* than retrying the expensive model load.
110+
*/
111+
private unavailable = false;
112+
113+
// -------------------------------------------------------------------------
114+
// Constructor
115+
// -------------------------------------------------------------------------
116+
117+
/**
118+
* @param services - Shared service registry used to lazily create and cache
119+
* the underlying HuggingFace pipeline instance.
120+
* @param config - Optional per-classifier configuration. When
121+
* `config.modelId` is provided it overrides the default `modelId` when
122+
* loading the model.
123+
*/
124+
constructor(
125+
private readonly services: ISharedServiceRegistry,
126+
private readonly config?: ClassifierConfig,
127+
) {}
128+
129+
// -------------------------------------------------------------------------
130+
// IContentClassifier.isLoaded (getter)
131+
// -------------------------------------------------------------------------
132+
133+
/**
134+
* Whether the underlying model pipeline has been successfully initialised.
135+
* The flag is set to `true` after the first successful `classify()` call.
136+
*/
137+
get isLoaded(): boolean {
138+
return this._isLoaded;
139+
}
140+
141+
// -------------------------------------------------------------------------
142+
// classify
143+
// -------------------------------------------------------------------------
144+
145+
/**
146+
* Run prompt-injection inference on `text`.
147+
*
148+
* Lazily loads the pipeline on the first call via the shared service
149+
* registry, then calls it with `{ topk: null }` to retrieve scores for both
150+
* labels.
151+
*
152+
* @param text - The text to evaluate.
153+
* @returns A promise that resolves with the classification result. If the
154+
* model is unavailable the pass result is returned instead of throwing.
155+
*/
156+
async classify(text: string): Promise<ClassificationResult> {
157+
// Return the pass result immediately if the model previously failed to load.
158+
if (this.unavailable) {
159+
return this.passResult();
160+
}
161+
162+
// Lazily obtain (or create) the HuggingFace pipeline instance from the
163+
// shared service registry so the model is only downloaded once.
164+
let pipeline: (text: string, opts: { topk: null }) => Promise<RawLabel[]>;
165+
try {
166+
pipeline = await this.services.getOrCreate(
167+
ML_CLASSIFIER_SERVICE_IDS.INJECTION_PIPELINE,
168+
async () => {
169+
// Dynamic import so environments without @huggingface/transformers
170+
// can still load the rest of AgentOS.
171+
const { pipeline: createPipeline } = await import(
172+
'@huggingface/transformers'
173+
);
174+
return createPipeline(
175+
'text-classification',
176+
// Honour a caller-supplied model override; fall back to the default.
177+
this.config?.modelId ?? this.modelId,
178+
{ quantized: true },
179+
);
180+
},
181+
{
182+
/** Release ONNX/WASM resources when the registry entry is evicted. */
183+
dispose: async (p: any) => p?.dispose?.(),
184+
/** Tags used for diagnostics and capability discovery. */
185+
tags: ['ml', 'classifier', 'prompt-injection', 'onnx'],
186+
},
187+
);
188+
189+
// Mark the classifier as ready now that the pipeline is available.
190+
this._isLoaded = true;
191+
} catch {
192+
// Model failed to load — mark as unavailable and return the pass result.
193+
this.unavailable = true;
194+
return this.passResult();
195+
}
196+
197+
// Run inference and request both label scores.
198+
const raw = await pipeline(text, { topk: null });
199+
return this.mapResult(raw);
200+
}
201+
202+
// -------------------------------------------------------------------------
203+
// dispose (optional IContentClassifier lifecycle hook)
204+
// -------------------------------------------------------------------------
205+
206+
/**
207+
* Release the pipeline instance from the shared service registry.
208+
*
209+
* Idempotent — safe to call multiple times.
210+
*/
211+
async dispose(): Promise<void> {
212+
await this.services.release(ML_CLASSIFIER_SERVICE_IDS.INJECTION_PIPELINE);
213+
this._isLoaded = false;
214+
}
215+
216+
// -------------------------------------------------------------------------
217+
// Private helpers
218+
// -------------------------------------------------------------------------
219+
220+
/**
221+
* Returns a "pass" result used when the model is unavailable.
222+
*
223+
* A pass result reports `bestClass: 'benign'` with zero confidence so the
224+
* guardrail orchestrator will always choose {@link GuardrailAction.ALLOW}.
225+
*/
226+
private passResult(): ClassificationResult {
227+
return { bestClass: 'benign', confidence: 0, allScores: [] };
228+
}
229+
230+
/**
231+
* Map the raw pipeline output to a {@link ClassificationResult}.
232+
*
233+
* For binary classification the label with the higher confidence score
234+
* becomes `bestClass` / `confidence`. Both labels are included in
235+
* `allScores`.
236+
*
237+
* @param raw - Array returned by the pipeline when called with `topk: null`.
238+
*/
239+
private mapResult(raw: RawLabel[]): ClassificationResult {
240+
if (!raw || raw.length === 0) {
241+
return this.passResult();
242+
}
243+
244+
// Find the label with the highest score (should be one of INJECTION / SAFE).
245+
let best = raw[0];
246+
for (const item of raw) {
247+
if (item.score > best.score) {
248+
best = item;
249+
}
250+
}
251+
252+
return {
253+
bestClass: best.label,
254+
confidence: best.score,
255+
allScores: raw.map((item) => ({
256+
classLabel: item.label,
257+
score: item.score,
258+
})),
259+
};
260+
}
261+
}

0 commit comments

Comments
 (0)