Skip to content

Commit e8ade47

Browse files
jddunn and claude committed
feat(pii): add RegexRecognizer (Tier 1) with openredaction
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent ea6116e commit e8ade47

File tree

3 files changed

+576
-1
lines changed

3 files changed

+576
-1
lines changed
Lines changed: 348 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,348 @@
1+
/**
2+
* @file RegexRecognizer.ts
3+
* @description Tier 1 PII recogniser that delegates pattern matching to the
4+
* `openredaction` library.
5+
*
6+
* OpenRedaction ships with 500+ curated regex patterns covering emails, SSNs,
7+
* credit cards, phone numbers, IP addresses, IBANs, passports, API keys, and
8+
* many more. This recogniser wraps its `OpenRedaction.detect()` method,
9+
* normalises the results into the pipeline's {@link PiiEntity} shape, and
10+
* applies entity-type filtering so only the categories requested by the caller
11+
* are evaluated.
12+
*
13+
* @module pii-redaction/recognizers
14+
*/
15+
16+
import type { PiiEntity, PiiEntityType } from '../types';
17+
import type { IEntityRecognizer, RecognizeOptions } from './IEntityRecognizer';
18+
19+
// ---------------------------------------------------------------------------
20+
// Mapping from openredaction pattern type strings to PiiEntityType
21+
// ---------------------------------------------------------------------------
22+
23+
/**
 * Lookup table from openredaction's built-in pattern `type` strings to our
 * canonical {@link PiiEntityType} values.
 *
 * Only types that have a direct or near-direct counterpart are listed.
 * Unmapped openredaction types are silently dropped so that we don't pollute
 * downstream consumers with categories they can't act on.
 *
 * The table is built on a prototype-less object and then frozen:
 * - indexing it with an arbitrary detection type (e.g. `'constructor'` or
 *   `'toString'`) yields `undefined` instead of an inherited
 *   `Object.prototype` member;
 * - it cannot be mutated after module load, so anything derived from it
 *   (inverse index, default pattern list) cannot go stale.
 *
 * Key order is significant: `Object.keys()` / `Object.entries()` of this
 * table determine the order of derived pattern-name lists.
 */
const OPENREDACTION_TYPE_MAP: Readonly<Record<string, PiiEntityType>> = Object.freeze(
  // Explicit type arguments keep the literal below checked against
  // PiiEntityType even though `Object.create(null)` is typed as `any`.
  Object.assign<Record<string, PiiEntityType>, Record<string, PiiEntityType>>(
    Object.create(null),
    {
      // Contact details and core identifiers
      EMAIL: 'EMAIL',
      SSN: 'SSN',
      CREDIT_CARD: 'CREDIT_CARD',
      PHONE_US: 'PHONE',
      PHONE_UK: 'PHONE',
      PHONE_UK_MOBILE: 'PHONE',
      PHONE_INTERNATIONAL: 'PHONE',

      // Network and banking
      IPV4: 'IP_ADDRESS',
      IPV6: 'IP_ADDRESS',
      IBAN: 'IBAN',

      // Identity documents
      PASSPORT_US: 'PASSPORT',
      PASSPORT_UK: 'PASSPORT',
      PASSPORT_MRZ_TD3: 'PASSPORT',
      PASSPORT_MRZ_TD1: 'PASSPORT',
      DRIVING_LICENSE_US: 'DRIVERS_LICENSE',
      DRIVING_LICENSE_UK: 'DRIVERS_LICENSE',
      DATE_OF_BIRTH: 'DATE_OF_BIRTH',

      // Secrets and credentials
      GENERIC_API_KEY: 'API_KEY',
      OPENAI_API_KEY: 'API_KEY',
      GOOGLE_API_KEY: 'API_KEY',
      STRIPE_API_KEY: 'API_KEY',
      GITHUB_TOKEN: 'API_KEY',
      BEARER_TOKEN: 'API_KEY',
      AWS_ACCESS_KEY: 'AWS_KEY',
      AWS_SECRET_KEY: 'AWS_KEY',

      // Cryptocurrency wallet addresses
      BITCOIN_ADDRESS: 'CRYPTO_ADDRESS',
      ETHEREUM_ADDRESS: 'CRYPTO_ADDRESS',
      LITECOIN_ADDRESS: 'CRYPTO_ADDRESS',
      MONERO_ADDRESS: 'CRYPTO_ADDRESS',
      RIPPLE_ADDRESS: 'CRYPTO_ADDRESS',
      CARDANO_ADDRESS: 'CRYPTO_ADDRESS',
      SOLANA_ADDRESS: 'CRYPTO_ADDRESS',

      // People and government-issued identifiers
      NAME: 'PERSON',
      TAX_ID: 'GOV_ID',
      NATIONAL_INSURANCE_UK: 'GOV_ID',
      NHS_NUMBER: 'GOV_ID',
      ITIN: 'GOV_ID',
      SIN_CA: 'GOV_ID',
    },
  ),
);
71+
72+
/**
 * Module-level cache for the inverse index: each {@link PiiEntityType} mapped
 * to every openredaction pattern name that resolves to it. Stays `null`
 * until {@link getPiiTypeToPatternNames} is called for the first time.
 */
let piiTypeToPatternNames: Map<PiiEntityType, string[]> | null = null;

/**
 * Returns the inverse of {@link OPENREDACTION_TYPE_MAP}, computing it on the
 * first call and handing back the same cached `Map` on every later call.
 *
 * @returns Map from entity type to the openredaction pattern names that
 *   produce it, in the key order of {@link OPENREDACTION_TYPE_MAP}.
 */
function getPiiTypeToPatternNames(): Map<PiiEntityType, string[]> {
  if (piiTypeToPatternNames === null) {
    const inverse = new Map<PiiEntityType, string[]>();

    Object.entries(OPENREDACTION_TYPE_MAP).forEach(([patternName, piiType]) => {
      const bucket = inverse.get(piiType);
      if (bucket === undefined) {
        // First pattern seen for this entity type: start a new bucket.
        inverse.set(piiType, [patternName]);
      } else {
        bucket.push(patternName);
      }
    });

    piiTypeToPatternNames = inverse;
  }

  return piiTypeToPatternNames;
}
93+
94+
// ---------------------------------------------------------------------------
95+
// Default patterns to load when no entity-type filter is applied
96+
// ---------------------------------------------------------------------------
97+
98+
/**
99+
* The full list of openredaction pattern names we want available by default.
100+
* This is the full set of keys (pattern names) in {@link OPENREDACTION_TYPE_MAP}.
101+
*/
102+
// Evaluated once at module load; element order follows the key order of
// OPENREDACTION_TYPE_MAP.
const DEFAULT_PATTERN_NAMES: string[] = Object.keys(OPENREDACTION_TYPE_MAP);
103+
104+
// ---------------------------------------------------------------------------
105+
// RegexRecognizer
106+
// ---------------------------------------------------------------------------
107+
108+
/**
 * Constructor signature of the `OpenRedaction` class as used by
 * {@link RegexRecognizer}: it accepts an options bag and yields an object
 * exposing `detect()`.
 */
type OpenRedactionConstructor = new (opts: Record<string, unknown>) => OpenRedactionInstance;

/**
 * Tier 1 entity recogniser backed by the `openredaction` library.
 *
 * ### How it works
 * 1. Nothing is loaded at construction time. The `openredaction` module is
 *    imported lazily on the first {@link recognize} call and its constructor
 *    is cached on the recogniser.
 * 2. When {@link recognize} is called, pattern names are optionally filtered
 *    to the requested {@link PiiEntityType} subset. A missing **or empty**
 *    `entityTypes` filter means "all mapped patterns".
 * 3. A fresh `OpenRedaction` instance is created with those pattern names
 *    and its `detect()` method is run against the input.
 * 4. Results are mapped to {@link PiiEntity} objects whose score is never
 *    lower than 0.85, because regex matches are deterministic.
 *
 * ### Concurrency
 * Each call to `recognize` creates its own `OpenRedaction` instance, so
 * detection state is not shared between concurrent invocations. The only
 * state kept on the recogniser is the cached constructor reference.
 *
 * @example
 * ```ts
 * const recognizer = new RegexRecognizer();
 * const entities = await recognizer.recognize(
 *   'Email me at alice@example.com',
 *   { entityTypes: ['EMAIL'] },
 * );
 * // entities[0].entityType === 'EMAIL'
 * // entities[0].score >= 0.85
 * ```
 */
export class RegexRecognizer implements IEntityRecognizer {
  /** @inheritdoc */
  public readonly name = 'RegexRecognizer';

  /** @inheritdoc */
  public readonly supportedEntities: PiiEntityType[] = [
    'SSN',
    'CREDIT_CARD',
    'EMAIL',
    'PHONE',
    'IP_ADDRESS',
    'IBAN',
    'PASSPORT',
    'DRIVERS_LICENSE',
    'DATE_OF_BIRTH',
    'API_KEY',
    'AWS_KEY',
    'CRYPTO_ADDRESS',
    'PERSON',
    'GOV_ID',
  ];

  /**
   * Minimum confidence score assigned to regex-based detections.
   * Regex matches are deterministic so they receive a high baseline; the
   * openredaction library may report its own `confidence`, which is raised
   * to at least this floor value.
   */
  private static readonly MIN_SCORE = 0.85;

  /**
   * Lazily-resolved reference to the `OpenRedaction` constructor from the
   * `openredaction` package. Stored after the first successful dynamic
   * import to avoid repeated module resolution on subsequent calls, and
   * cleared again by {@link dispose}.
   */
  private OpenRedactionCtor: OpenRedactionConstructor | null = null;

  /**
   * Scan the input text for PII entities using openredaction regex patterns.
   *
   * @param input - Raw text to analyse.
   * @param options - Optional filtering and context hints. An absent or
   *   empty `entityTypes` list means "detect every mapped entity type".
   * @returns Array of detected {@link PiiEntity} objects, possibly empty.
   * @throws If the `openredaction` package cannot be imported or does not
   *   expose an `OpenRedaction` constructor.
   */
  public async recognize(input: string, options?: RecognizeOptions): Promise<PiiEntity[]> {
    const entityTypes = options?.entityTypes;

    // Determine which openredaction patterns to activate based on the
    // caller's entity-type filter.
    const patternNames = this.resolvePatternNames(entityTypes);

    // Nothing to do if no patterns map to the requested entity types.
    if (patternNames.length === 0) return [];

    // Lazily import openredaction (avoids top-level side effects and keeps
    // the module optional for environments that don't need regex detection).
    const Ctor = await this.ensureOpenRedaction();

    // Create a scoped instance with only the relevant patterns loaded.
    const instance = new Ctor({ patterns: patternNames });

    const result = await instance.detect(input);

    // Map openredaction detections to our PiiEntity shape.
    return this.mapDetections(result.detections, input, entityTypes);
  }

  /** @inheritdoc */
  public async dispose(): Promise<void> {
    // No long-lived resources to release; each `recognize` call creates its
    // own scoped OpenRedaction instance. Only the cached constructor is
    // dropped, so the next `recognize` call re-imports the module.
    this.OpenRedactionCtor = null;
  }

  // -----------------------------------------------------------------------
  // Private helpers
  // -----------------------------------------------------------------------

  /**
   * Lazily imports the `openredaction` module and caches its constructor.
   *
   * @returns The `OpenRedaction` class constructor.
   * @throws If the `openredaction` package is not installed, or if the
   *   imported module does not expose an `OpenRedaction` constructor.
   */
  private async ensureOpenRedaction(): Promise<OpenRedactionConstructor> {
    if (this.OpenRedactionCtor) return this.OpenRedactionCtor;

    // Dynamic import so the dependency is optional at the module level.
    const mod = (await import('openredaction')) as Record<string, unknown>;

    // Prefer the named export. If it is absent, also look on `default`,
    // which is where a CommonJS module's exports can end up when loaded via
    // `import()` — NOTE(review): confirm against the packaged build.
    const defaultExport = mod.default;
    const viaDefault =
      typeof defaultExport === 'object' && defaultExport !== null
        ? (defaultExport as Record<string, unknown>).OpenRedaction
        : undefined;
    const candidate = mod.OpenRedaction ?? viaDefault;

    // Fail with an actionable message instead of caching a non-constructor
    // and crashing later with "Ctor is not a constructor".
    if (typeof candidate !== 'function') {
      throw new Error(
        "RegexRecognizer: the 'openredaction' module does not export an OpenRedaction constructor",
      );
    }

    this.OpenRedactionCtor = candidate as OpenRedactionConstructor;
    return this.OpenRedactionCtor;
  }

  /**
   * Resolves the list of openredaction pattern names to activate for the
   * given entity-type filter.
   *
   * When no filter (or an empty one) is provided, all mapped patterns are
   * returned. The result is always a fresh array without duplicates, so the
   * caller — or the library it is handed to — may mutate it freely.
   *
   * @param entityTypes - Optional subset of {@link PiiEntityType} to detect.
   * @returns Array of openredaction pattern name strings.
   */
  private resolvePatternNames(entityTypes?: PiiEntityType[]): string[] {
    // No filter → use everything we have mapped (copied, never the shared
    // module-level array).
    if (!entityTypes || entityTypes.length === 0) return [...DEFAULT_PATTERN_NAMES];

    const inverse = getPiiTypeToPatternNames();

    // A Set keeps first-seen order while collapsing repeated entity types.
    const names = new Set<string>();
    for (const piiType of entityTypes) {
      for (const patternName of inverse.get(piiType) ?? []) {
        names.add(patternName);
      }
    }

    return [...names];
  }

  /**
   * Converts raw openredaction detection objects into {@link PiiEntity}
   * instances, filtering out any that don't map to a requested entity type.
   *
   * @param detections - Raw detections from `OpenRedaction.detect()`.
   * @param input - The original input text (used to extract `text` from
   *   positional offsets when a detection carries no `value`).
   * @param entityTypes - Optional entity-type filter for post-filtering. An
   *   empty list is treated like an absent one (no filtering), matching
   *   {@link resolvePatternNames}.
   * @returns Mapped and filtered array of {@link PiiEntity}.
   */
  private mapDetections(
    detections: OpenRedactionDetection[],
    input: string,
    entityTypes?: PiiEntityType[],
  ): PiiEntity[] {
    const entities: PiiEntity[] = [];

    // An empty filter must mean "everything": resolvePatternNames already
    // activated every pattern for it, so building an empty allow-set here
    // would discard every detection after paying for the full scan.
    const allowedTypes =
      entityTypes && entityTypes.length > 0 ? new Set(entityTypes) : null;

    for (const det of detections) {
      // Own-property check so a detection type such as 'constructor' cannot
      // resolve to an inherited Object.prototype member.
      const mappedType = Object.prototype.hasOwnProperty.call(OPENREDACTION_TYPE_MAP, det.type)
        ? OPENREDACTION_TYPE_MAP[det.type]
        : undefined;

      // Skip detections whose openredaction type has no mapping.
      if (!mappedType) continue;

      // Skip if the mapped type isn't in the caller's requested set.
      if (allowedTypes && !allowedTypes.has(mappedType)) continue;

      const [start, end] = det.position;

      // Prefer the library-reported value; fall back to slicing the input.
      const text = det.value ?? input.slice(start, end);

      // Confidence: the higher of openredaction's score and our floor. The
      // comparison form also maps a missing or NaN confidence to the floor.
      const reported = det.confidence;
      const score =
        typeof reported === 'number' && reported > RegexRecognizer.MIN_SCORE
          ? reported
          : RegexRecognizer.MIN_SCORE;

      entities.push({
        entityType: mappedType,
        text,
        start,
        end,
        score,
        source: 'regex',
        metadata: {
          openredactionType: det.type,
          placeholder: det.placeholder,
          severity: det.severity,
        },
      });
    }

    return entities;
  }
}
311+
312+
// ---------------------------------------------------------------------------
313+
// Minimal type declarations for the openredaction API surface we use
314+
// ---------------------------------------------------------------------------
315+
316+
/**
 * One match as reported in the `detections` array of
 * `OpenRedaction.detect()`.
 *
 * Kept as a local declaration so this file does not rely on type exports
 * from the `openredaction` package (which may not exist for all versions).
 */
interface OpenRedactionDetection {
  /** Name of the openredaction pattern that fired, e.g. `'EMAIL'` or `'SSN'`. */
  type: string;

  /** `[start, end]` character offsets of the match within the original input. */
  position: [number, number];

  /** The text that was matched. */
  value: string;

  /** Token substituted for the match in the redacted output. */
  placeholder: string;

  /** openredaction's severity classification for the match. */
  severity: string;

  /** Confidence reported by openredaction, in the range 0–1, when provided. */
  confidence?: number;
}
335+
336+
/**
 * Payload that `OpenRedaction.detect()` resolves with, as far as this
 * recogniser needs to describe it.
 */
interface OpenRedactionDetectResult {
  /** The input text as passed to `detect()`. */
  original: string;
  /** The input text with detections replaced. */
  redacted: string;
  /** Individual matches found in the input. */
  detections: OpenRedactionDetection[];
  /** String-to-string map produced alongside the redacted output. */
  redactionMap: Record<string, string>;
  /** Run statistics reported by the library. */
  stats: { processingTime: number; piiCount: number };
}

/**
 * The slice of an instantiated `OpenRedaction` object that this recogniser
 * relies on — just the asynchronous `detect` method.
 */
interface OpenRedactionInstance {
  detect(text: string): Promise<OpenRedactionDetectResult>;
}

src/extensions/packs/pii-redaction/types.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -152,7 +152,7 @@ export interface PiiEntity {
152152
* - `'llm'` — Tier 2 LLM judge
153153
* - `'denylist'` — explicit denylist rule
154154
*/
155-
source: 'regex' | 'ner' | 'llm' | 'denylist';
155+
source: 'regex' | 'nlp-prefilter' | 'ner-model' | 'ner' | 'llm' | 'denylist';
156156

157157
/**
158158
* Arbitrary key-value metadata attached by the recogniser that produced

0 commit comments

Comments
 (0)