|
| 1 | +/** |
| 2 | + * @fileoverview Prompt-injection content classifier using the |
| 3 | + * `protectai/deberta-v3-small-prompt-injection-v2` model. |
| 4 | + * |
| 5 | + * Prompt injection is the attack pattern where adversarial instructions are |
| 6 | + * embedded inside user-supplied text to override or hijack the agent's system |
| 7 | + * prompt. This classifier provides a dedicated binary signal (INJECTION / |
| 8 | + * SAFE) that the guardrail orchestrator can act on independently of the |
| 9 | + * toxicity or jailbreak classifiers. |
| 10 | + * |
| 11 | + * Model details |
| 12 | + * ------------- |
| 13 | + * `protectai/deberta-v3-small-prompt-injection-v2` is a fine-tuned DeBERTa |
| 14 | + * model from ProtectAI, specifically trained to distinguish benign user |
| 15 | + * messages from prompt-injection payloads. It outputs two labels: |
| 16 | + * - `INJECTION` — high-confidence injection attempt |
| 17 | + * - `SAFE` — normal user input |
| 18 | + * |
| 19 | + * Graceful degradation |
| 20 | + * -------------------- |
| 21 | + * If the model fails to load the classifier sets `unavailable = true` and |
| 22 | + * returns a pass result `{ bestClass: 'benign', confidence: 0, allScores: [] }` |
| 23 | + * on every subsequent call. |
| 24 | + * |
| 25 | + * @module agentos/extensions/packs/ml-classifiers/classifiers/InjectionClassifier |
| 26 | + */ |
| 27 | + |
| 28 | +import type { ClassificationResult } from '../../../../core/ai_utilities/IUtilityAI'; |
| 29 | +import type { ISharedServiceRegistry } from '../../../ISharedServiceRegistry'; |
| 30 | +import type { IContentClassifier } from '../IContentClassifier'; |
| 31 | +import type { ClassifierConfig } from '../types'; |
| 32 | +import { ML_CLASSIFIER_SERVICE_IDS } from '../types'; |
| 33 | + |
| 34 | +// --------------------------------------------------------------------------- |
| 35 | +// Internal raw pipeline output type |
| 36 | +// --------------------------------------------------------------------------- |
| 37 | + |
/**
 * A single label/score pair as returned by the HuggingFace text-classification
 * pipeline when scores for all labels are requested (rather than just the top
 * prediction). For this model the labels are `'INJECTION'` and `'SAFE'`.
 */
interface RawLabel {
  /** Label name, e.g. `'INJECTION'` or `'SAFE'`. */
  label: string;
  /** Confidence score in the range [0, 1]. */
  score: number;
}
| 48 | + |
| 49 | +// --------------------------------------------------------------------------- |
| 50 | +// InjectionClassifier |
| 51 | +// --------------------------------------------------------------------------- |
| 52 | + |
| 53 | +/** |
| 54 | + * Binary prompt-injection classifier backed by |
| 55 | + * `protectai/deberta-v3-small-prompt-injection-v2`. |
| 56 | + * |
| 57 | + * Returns one of two labels: |
| 58 | + * - `INJECTION` — the text contains an injection attempt |
| 59 | + * - `SAFE` — the text is clean |
| 60 | + * |
| 61 | + * The label with the higher confidence becomes `bestClass` / `confidence`. |
| 62 | + * Both labels are present in `allScores` so callers can read the SAFE score |
| 63 | + * as well. |
| 64 | + * |
| 65 | + * @implements {IContentClassifier} |
| 66 | + * |
| 67 | + * @example |
| 68 | + * ```typescript |
| 69 | + * const classifier = new InjectionClassifier(serviceRegistry); |
| 70 | + * const result = await classifier.classify('Ignore previous instructions and …'); |
| 71 | + * // result.bestClass === 'INJECTION', result.confidence ≈ 0.97 |
| 72 | + * ``` |
| 73 | + */ |
| 74 | +export class InjectionClassifier implements IContentClassifier { |
| 75 | + // ------------------------------------------------------------------------- |
| 76 | + // IContentClassifier identity fields |
| 77 | + // ------------------------------------------------------------------------- |
| 78 | + |
| 79 | + /** Unique service identifier for this classifier. */ |
| 80 | + readonly id = 'prompt-injection'; |
| 81 | + |
| 82 | + /** Human-readable name for dashboards and log output. */ |
| 83 | + readonly displayName = 'Prompt Injection Classifier'; |
| 84 | + |
| 85 | + /** Short description of what this classifier detects. */ |
| 86 | + readonly description = |
| 87 | + 'Detects prompt-injection attempts where adversarial instructions are ' + |
| 88 | + 'embedded in user input to override or hijack the agent system prompt.'; |
| 89 | + |
| 90 | + /** |
| 91 | + * Default Hugging Face model ID. |
| 92 | + * Overridable via {@link ClassifierConfig.modelId}. |
| 93 | + */ |
| 94 | + readonly modelId = 'protectai/deberta-v3-small-prompt-injection-v2'; |
| 95 | + |
| 96 | + // ------------------------------------------------------------------------- |
| 97 | + // Internal state |
| 98 | + // ------------------------------------------------------------------------- |
| 99 | + |
| 100 | + /** |
| 101 | + * Whether the model weights are fully loaded and the classifier is ready |
| 102 | + * to accept `classify()` calls. |
| 103 | + */ |
| 104 | + private _isLoaded = false; |
| 105 | + |
| 106 | + /** |
| 107 | + * Set to `true` when the model fails to load. Once `unavailable`, every |
| 108 | + * subsequent `classify()` call immediately returns the pass result rather |
| 109 | + * than retrying the expensive model load. |
| 110 | + */ |
| 111 | + private unavailable = false; |
| 112 | + |
| 113 | + // ------------------------------------------------------------------------- |
| 114 | + // Constructor |
| 115 | + // ------------------------------------------------------------------------- |
| 116 | + |
| 117 | + /** |
| 118 | + * @param services - Shared service registry used to lazily create and cache |
| 119 | + * the underlying HuggingFace pipeline instance. |
| 120 | + * @param config - Optional per-classifier configuration. When |
| 121 | + * `config.modelId` is provided it overrides the default `modelId` when |
| 122 | + * loading the model. |
| 123 | + */ |
| 124 | + constructor( |
| 125 | + private readonly services: ISharedServiceRegistry, |
| 126 | + private readonly config?: ClassifierConfig, |
| 127 | + ) {} |
| 128 | + |
| 129 | + // ------------------------------------------------------------------------- |
| 130 | + // IContentClassifier.isLoaded (getter) |
| 131 | + // ------------------------------------------------------------------------- |
| 132 | + |
| 133 | + /** |
| 134 | + * Whether the underlying model pipeline has been successfully initialised. |
| 135 | + * The flag is set to `true` after the first successful `classify()` call. |
| 136 | + */ |
| 137 | + get isLoaded(): boolean { |
| 138 | + return this._isLoaded; |
| 139 | + } |
| 140 | + |
| 141 | + // ------------------------------------------------------------------------- |
| 142 | + // classify |
| 143 | + // ------------------------------------------------------------------------- |
| 144 | + |
| 145 | + /** |
| 146 | + * Run prompt-injection inference on `text`. |
| 147 | + * |
| 148 | + * Lazily loads the pipeline on the first call via the shared service |
| 149 | + * registry, then calls it with `{ topk: null }` to retrieve scores for both |
| 150 | + * labels. |
| 151 | + * |
| 152 | + * @param text - The text to evaluate. |
| 153 | + * @returns A promise that resolves with the classification result. If the |
| 154 | + * model is unavailable the pass result is returned instead of throwing. |
| 155 | + */ |
| 156 | + async classify(text: string): Promise<ClassificationResult> { |
| 157 | + // Return the pass result immediately if the model previously failed to load. |
| 158 | + if (this.unavailable) { |
| 159 | + return this.passResult(); |
| 160 | + } |
| 161 | + |
| 162 | + // Lazily obtain (or create) the HuggingFace pipeline instance from the |
| 163 | + // shared service registry so the model is only downloaded once. |
| 164 | + let pipeline: (text: string, opts: { topk: null }) => Promise<RawLabel[]>; |
| 165 | + try { |
| 166 | + pipeline = await this.services.getOrCreate( |
| 167 | + ML_CLASSIFIER_SERVICE_IDS.INJECTION_PIPELINE, |
| 168 | + async () => { |
| 169 | + // Dynamic import so environments without @huggingface/transformers |
| 170 | + // can still load the rest of AgentOS. |
| 171 | + const { pipeline: createPipeline } = await import( |
| 172 | + '@huggingface/transformers' |
| 173 | + ); |
| 174 | + return createPipeline( |
| 175 | + 'text-classification', |
| 176 | + // Honour a caller-supplied model override; fall back to the default. |
| 177 | + this.config?.modelId ?? this.modelId, |
| 178 | + { quantized: true }, |
| 179 | + ); |
| 180 | + }, |
| 181 | + { |
| 182 | + /** Release ONNX/WASM resources when the registry entry is evicted. */ |
| 183 | + dispose: async (p: any) => p?.dispose?.(), |
| 184 | + /** Tags used for diagnostics and capability discovery. */ |
| 185 | + tags: ['ml', 'classifier', 'prompt-injection', 'onnx'], |
| 186 | + }, |
| 187 | + ); |
| 188 | + |
| 189 | + // Mark the classifier as ready now that the pipeline is available. |
| 190 | + this._isLoaded = true; |
| 191 | + } catch { |
| 192 | + // Model failed to load — mark as unavailable and return the pass result. |
| 193 | + this.unavailable = true; |
| 194 | + return this.passResult(); |
| 195 | + } |
| 196 | + |
| 197 | + // Run inference and request both label scores. |
| 198 | + const raw = await pipeline(text, { topk: null }); |
| 199 | + return this.mapResult(raw); |
| 200 | + } |
| 201 | + |
| 202 | + // ------------------------------------------------------------------------- |
| 203 | + // dispose (optional IContentClassifier lifecycle hook) |
| 204 | + // ------------------------------------------------------------------------- |
| 205 | + |
| 206 | + /** |
| 207 | + * Release the pipeline instance from the shared service registry. |
| 208 | + * |
| 209 | + * Idempotent — safe to call multiple times. |
| 210 | + */ |
| 211 | + async dispose(): Promise<void> { |
| 212 | + await this.services.release(ML_CLASSIFIER_SERVICE_IDS.INJECTION_PIPELINE); |
| 213 | + this._isLoaded = false; |
| 214 | + } |
| 215 | + |
| 216 | + // ------------------------------------------------------------------------- |
| 217 | + // Private helpers |
| 218 | + // ------------------------------------------------------------------------- |
| 219 | + |
| 220 | + /** |
| 221 | + * Returns a "pass" result used when the model is unavailable. |
| 222 | + * |
| 223 | + * A pass result reports `bestClass: 'benign'` with zero confidence so the |
| 224 | + * guardrail orchestrator will always choose {@link GuardrailAction.ALLOW}. |
| 225 | + */ |
| 226 | + private passResult(): ClassificationResult { |
| 227 | + return { bestClass: 'benign', confidence: 0, allScores: [] }; |
| 228 | + } |
| 229 | + |
| 230 | + /** |
| 231 | + * Map the raw pipeline output to a {@link ClassificationResult}. |
| 232 | + * |
| 233 | + * For binary classification the label with the higher confidence score |
| 234 | + * becomes `bestClass` / `confidence`. Both labels are included in |
| 235 | + * `allScores`. |
| 236 | + * |
| 237 | + * @param raw - Array returned by the pipeline when called with `topk: null`. |
| 238 | + */ |
| 239 | + private mapResult(raw: RawLabel[]): ClassificationResult { |
| 240 | + if (!raw || raw.length === 0) { |
| 241 | + return this.passResult(); |
| 242 | + } |
| 243 | + |
| 244 | + // Find the label with the highest score (should be one of INJECTION / SAFE). |
| 245 | + let best = raw[0]; |
| 246 | + for (const item of raw) { |
| 247 | + if (item.score > best.score) { |
| 248 | + best = item; |
| 249 | + } |
| 250 | + } |
| 251 | + |
| 252 | + return { |
| 253 | + bestClass: best.label, |
| 254 | + confidence: best.score, |
| 255 | + allScores: raw.map((item) => ({ |
| 256 | + classLabel: item.label, |
| 257 | + score: item.score, |
| 258 | + })), |
| 259 | + }; |
| 260 | + } |
| 261 | +} |
0 commit comments