|
| 1 | +/** |
| 2 | + * @file RegexRecognizer.ts |
| 3 | + * @description Tier 1 PII recogniser that delegates pattern matching to the |
| 4 | + * `openredaction` library. |
| 5 | + * |
| 6 | + * OpenRedaction ships with 500+ curated regex patterns covering emails, SSNs, |
| 7 | + * credit cards, phone numbers, IP addresses, IBANs, passports, API keys, and |
| 8 | + * many more. This recogniser wraps its `OpenRedaction.detect()` method, |
| 9 | + * normalises the results into the pipeline's {@link PiiEntity} shape, and |
| 10 | + * applies entity-type filtering so only the categories requested by the caller |
| 11 | + * are evaluated. |
| 12 | + * |
| 13 | + * @module pii-redaction/recognizers |
| 14 | + */ |
| 15 | + |
| 16 | +import type { PiiEntity, PiiEntityType } from '../types'; |
| 17 | +import type { IEntityRecognizer, RecognizeOptions } from './IEntityRecognizer'; |
| 18 | + |
| 19 | +// --------------------------------------------------------------------------- |
| 20 | +// Mapping from openredaction pattern type strings to PiiEntityType |
| 21 | +// --------------------------------------------------------------------------- |
| 22 | + |
/**
 * openredaction pattern names grouped by the canonical {@link PiiEntityType}
 * they are reported as.
 *
 * Only openredaction types with a direct or near-direct counterpart appear
 * here. The order of groups, and of names within a group, determines the key
 * order of {@link OPENREDACTION_TYPE_MAP}.
 */
const PATTERN_NAMES_BY_PII_TYPE: ReadonlyArray<readonly [PiiEntityType, readonly string[]]> = [
  ['EMAIL', ['EMAIL']],
  ['SSN', ['SSN']],
  ['CREDIT_CARD', ['CREDIT_CARD']],
  ['PHONE', ['PHONE_US', 'PHONE_UK', 'PHONE_UK_MOBILE', 'PHONE_INTERNATIONAL']],
  ['IP_ADDRESS', ['IPV4', 'IPV6']],
  ['IBAN', ['IBAN']],
  ['PASSPORT', ['PASSPORT_US', 'PASSPORT_UK', 'PASSPORT_MRZ_TD3', 'PASSPORT_MRZ_TD1']],
  ['DRIVERS_LICENSE', ['DRIVING_LICENSE_US', 'DRIVING_LICENSE_UK']],
  ['DATE_OF_BIRTH', ['DATE_OF_BIRTH']],
  [
    'API_KEY',
    [
      'GENERIC_API_KEY',
      'OPENAI_API_KEY',
      'GOOGLE_API_KEY',
      'STRIPE_API_KEY',
      'GITHUB_TOKEN',
      'BEARER_TOKEN',
    ],
  ],
  ['AWS_KEY', ['AWS_ACCESS_KEY', 'AWS_SECRET_KEY']],
  [
    'CRYPTO_ADDRESS',
    [
      'BITCOIN_ADDRESS',
      'ETHEREUM_ADDRESS',
      'LITECOIN_ADDRESS',
      'MONERO_ADDRESS',
      'RIPPLE_ADDRESS',
      'CARDANO_ADDRESS',
      'SOLANA_ADDRESS',
    ],
  ],
  ['PERSON', ['NAME']],
  ['GOV_ID', ['TAX_ID', 'NATIONAL_INSURANCE_UK', 'NHS_NUMBER', 'ITIN', 'SIN_CA']],
];

/**
 * Lookup from an openredaction pattern `type` string to the canonical
 * {@link PiiEntityType} it is reported as.
 *
 * A pattern type that is absent from this map has no counterpart in the
 * pipeline; detections of such types are dropped by the recogniser rather
 * than forwarded to downstream consumers that cannot act on them.
 */
const OPENREDACTION_TYPE_MAP: Record<string, PiiEntityType> = Object.fromEntries(
  PATTERN_NAMES_BY_PII_TYPE.flatMap(([piiType, patternNames]) =>
    patternNames.map((patternName): [string, PiiEntityType] => [patternName, piiType]),
  ),
);
| 71 | + |
/**
 * Cache for the inverse lookup ({@link PiiEntityType} → openredaction pattern
 * names). Stays `null` until {@link getPiiTypeToPatternNames} is first called.
 */
let piiTypeToPatternNames: Map<PiiEntityType, string[]> | null = null;

/**
 * Returns the inverse of {@link OPENREDACTION_TYPE_MAP}: for each
 * {@link PiiEntityType}, the openredaction pattern names that map to it.
 *
 * The map is computed on the first call and the same instance is returned on
 * every later call. Pattern names within a bucket keep the order in which
 * they appear in {@link OPENREDACTION_TYPE_MAP}.
 */
function getPiiTypeToPatternNames(): Map<PiiEntityType, string[]> {
  if (piiTypeToPatternNames === null) {
    const inverse = new Map<PiiEntityType, string[]>();

    Object.entries(OPENREDACTION_TYPE_MAP).forEach(([patternName, piiType]) => {
      const bucket = inverse.get(piiType);
      if (bucket === undefined) {
        // First pattern seen for this entity type: start a new bucket.
        inverse.set(piiType, [patternName]);
      } else {
        bucket.push(patternName);
      }
    });

    piiTypeToPatternNames = inverse;
  }

  return piiTypeToPatternNames;
}
| 93 | + |
| 94 | +// --------------------------------------------------------------------------- |
| 95 | +// Default patterns to load when no entity-type filter is applied |
| 96 | +// --------------------------------------------------------------------------- |
| 97 | + |
/**
 * Every openredaction pattern name this module knows how to map, i.e. the
 * *keys* of {@link OPENREDACTION_TYPE_MAP} (not its values, which are
 * {@link PiiEntityType} names), in the map's key order.
 *
 * This is the pattern set used when the caller supplies no entity-type
 * filter.
 */
const DEFAULT_PATTERN_NAMES: string[] = Object.keys(OPENREDACTION_TYPE_MAP);
| 103 | + |
| 104 | +// --------------------------------------------------------------------------- |
| 105 | +// RegexRecognizer |
| 106 | +// --------------------------------------------------------------------------- |
| 107 | + |
/** Constructor signature of the `OpenRedaction` class as used by this recogniser. */
type OpenRedactionConstructor = new (opts: Record<string, unknown>) => OpenRedactionInstance;

/**
 * Tier 1 entity recogniser backed by the `openredaction` library.
 *
 * ### How it works
 * 1. When {@link recognize} is called, the requested {@link PiiEntityType}
 *    subset (if any) is translated into openredaction pattern names; with no
 *    filter, every mapped pattern is used.
 * 2. The `openredaction` module is imported lazily on first use and its
 *    `OpenRedaction` constructor is cached on this recogniser.
 * 3. A new `OpenRedaction` instance is created with only those patterns and
 *    its `detect()` method is awaited.
 * 4. Detections are mapped to {@link PiiEntity} objects whose score is never
 *    below {@link RegexRecognizer.MIN_SCORE} (0.85).
 *
 * ### Concurrency
 * Each call to `recognize` creates its own `OpenRedaction` instance, so the
 * only state shared between calls is the cached constructor reference.
 *
 * @example
 * ```ts
 * const recognizer = new RegexRecognizer();
 * const entities = await recognizer.recognize(
 *   'Email me at alice@example.com',
 *   { entityTypes: ['EMAIL'] },
 * );
 * // entities[0].entityType === 'EMAIL'
 * // entities[0].score >= 0.85
 * ```
 */
export class RegexRecognizer implements IEntityRecognizer {
  /** @inheritdoc */
  public readonly name = 'RegexRecognizer';

  /** @inheritdoc */
  public readonly supportedEntities: PiiEntityType[] = [
    'SSN',
    'CREDIT_CARD',
    'EMAIL',
    'PHONE',
    'IP_ADDRESS',
    'IBAN',
    'PASSPORT',
    'DRIVERS_LICENSE',
    'DATE_OF_BIRTH',
    'API_KEY',
    'AWS_KEY',
    'CRYPTO_ADDRESS',
    'PERSON',
    'GOV_ID',
  ];

  /**
   * Floor applied to the confidence of every detection. If openredaction
   * reports a higher `confidence`, that value is kept; a lower, missing, or
   * non-finite value is replaced by this floor.
   */
  private static readonly MIN_SCORE = 0.85;

  /**
   * Cached `OpenRedaction` constructor. `null` until the first successful
   * dynamic import, and reset to `null` by {@link dispose}.
   */
  private OpenRedactionCtor: OpenRedactionConstructor | null = null;

  /**
   * Scan the input text for PII entities using openredaction regex patterns.
   *
   * An absent or empty `options.entityTypes` means "no filter": all mapped
   * patterns are run and all mapped detections are returned.
   *
   * @param input - Raw text to analyse.
   * @param options - Optional filtering and context hints.
   * @returns Array of detected {@link PiiEntity} objects, possibly empty.
   * @throws If the `openredaction` package cannot be imported or does not
   *   expose an `OpenRedaction` constructor.
   */
  public async recognize(input: string, options?: RecognizeOptions): Promise<PiiEntity[]> {
    // An empty string cannot contain a match; skip loading the library.
    if (input === '') return [];

    const requestedTypes = options?.entityTypes;

    // Translate the caller's entity-type filter into openredaction patterns.
    const patternNames = this.resolvePatternNames(requestedTypes);

    // Nothing to do if no patterns map to the requested entity types.
    if (patternNames.length === 0) return [];

    const Ctor = await this.ensureOpenRedaction();

    // Scoped instance with only the relevant patterns loaded.
    const instance = new Ctor({ patterns: patternNames });
    const result = await instance.detect(input);

    return this.mapDetections(result.detections, input, requestedTypes);
  }

  /** @inheritdoc */
  public async dispose(): Promise<void> {
    // No long-lived resources to release; each `recognize` call creates its
    // own scoped OpenRedaction instance. Only the cached constructor is dropped.
    this.OpenRedactionCtor = null;
  }

  // -----------------------------------------------------------------------
  // Private helpers
  // -----------------------------------------------------------------------

  /**
   * Lazily imports the `openredaction` module and caches its constructor.
   *
   * @returns The `OpenRedaction` class constructor.
   * @throws If the `openredaction` package cannot be imported.
   * @throws {TypeError} If the imported module exposes no `OpenRedaction`
   *   constructor.
   */
  private async ensureOpenRedaction(): Promise<OpenRedactionConstructor> {
    if (this.OpenRedactionCtor) return this.OpenRedactionCtor;

    // Dynamic import so the dependency is optional at the module level.
    const mod = (await import('openredaction')) as {
      OpenRedaction?: unknown;
      default?: { OpenRedaction?: unknown } | null;
    };

    // Prefer the named export. Fall back to the `default` namespace, which is
    // where a CommonJS build's exports can land when loaded through `import()`.
    const candidate = mod.OpenRedaction ?? mod.default?.OpenRedaction;

    // Fail here with a clear message instead of caching a non-constructor and
    // failing later at `new Ctor(...)`.
    if (typeof candidate !== 'function') {
      throw new TypeError("The 'openredaction' module does not export an 'OpenRedaction' constructor.");
    }

    this.OpenRedactionCtor = candidate as OpenRedactionConstructor;
    return this.OpenRedactionCtor;
  }

  /**
   * Resolves the list of openredaction pattern names to activate for the
   * given entity-type filter.
   *
   * When no filter is provided (absent or empty), all mapped patterns are
   * returned. The result is always a fresh array without duplicates, so the
   * caller may hand it to third-party code without exposing module state.
   *
   * @param entityTypes - Optional subset of {@link PiiEntityType} to detect.
   * @returns Array of openredaction pattern name strings; empty when none of
   *   the requested types has a mapped pattern.
   */
  private resolvePatternNames(entityTypes?: PiiEntityType[]): string[] {
    // No filter → use everything we have mapped (copied, not shared).
    if (!entityTypes || entityTypes.length === 0) return [...DEFAULT_PATTERN_NAMES];

    const inverse = getPiiTypeToPatternNames();

    // A Set keeps first-seen order and drops repeats caused by duplicate
    // entries in `entityTypes`.
    const names = new Set<string>();
    for (const piiType of entityTypes) {
      for (const patternName of inverse.get(piiType) ?? []) {
        names.add(patternName);
      }
    }

    return [...names];
  }

  /**
   * Converts raw openredaction detection objects into {@link PiiEntity}
   * instances, filtering out any that don't map to a requested entity type.
   *
   * @param detections - Raw detections from `OpenRedaction.detect()`.
   * @param input - The original input text; used to recover the matched text
   *   from the position offsets when a detection carries no `value`.
   * @param entityTypes - Optional entity-type filter. Absent or empty means
   *   "keep every mapped type", matching {@link resolvePatternNames}.
   * @returns Mapped and filtered array of {@link PiiEntity}.
   */
  private mapDetections(
    detections: OpenRedactionDetection[],
    input: string,
    entityTypes?: PiiEntityType[],
  ): PiiEntity[] {
    const entities: PiiEntity[] = [];

    // An empty filter must not become an empty allow-list, otherwise every
    // detection produced by the unfiltered pattern set would be discarded.
    const allowedTypes = entityTypes && entityTypes.length > 0 ? new Set(entityTypes) : null;

    for (const det of detections) {
      // Own-property lookup only, so a type string such as 'constructor'
      // cannot resolve to an inherited Object.prototype member.
      const mappedType = Object.prototype.hasOwnProperty.call(OPENREDACTION_TYPE_MAP, det.type)
        ? OPENREDACTION_TYPE_MAP[det.type]
        : undefined;

      // Skip detections whose openredaction type has no mapping.
      if (!mappedType) continue;

      // Skip if the mapped type isn't in the caller's requested set.
      if (allowedTypes && !allowedTypes.has(mappedType)) continue;

      const [start, end] = det.position;

      // Prefer the library's matched value; otherwise slice it from the input.
      const text = det.value ?? input.slice(start, end);

      // Confidence: the higher of openredaction's score and our floor. A
      // missing or non-finite score falls back to the floor so NaN never leaks.
      const reported =
        typeof det.confidence === 'number' && Number.isFinite(det.confidence)
          ? det.confidence
          : RegexRecognizer.MIN_SCORE;
      const score = Math.max(reported, RegexRecognizer.MIN_SCORE);

      entities.push({
        entityType: mappedType,
        text,
        start,
        end,
        score,
        source: 'regex',
        metadata: {
          openredactionType: det.type,
          placeholder: det.placeholder,
          severity: det.severity,
        },
      });
    }

    return entities;
  }
}
| 311 | + |
| 312 | +// --------------------------------------------------------------------------- |
| 313 | +// Minimal type declarations for the openredaction API surface we use |
| 314 | +// --------------------------------------------------------------------------- |
| 315 | + |
/**
 * Locally declared shape of one element of the `detections` array returned
 * by `OpenRedaction.detect()`.
 *
 * Kept in this file so the recogniser does not rely on type exports from the
 * `openredaction` package (which may not exist for all versions).
 */
interface OpenRedactionDetection {
  /** `[start, end]` character offsets of the match within the original input. */
  position: [number, number];
  /** Name of the openredaction pattern that matched (e.g. `'EMAIL'`, `'SSN'`). */
  type: string;
  /** The text that was matched. */
  value: string;
  /** Replacement token used for this match in the redacted output. */
  placeholder: string;
  /** openredaction's severity classification for the match. */
  severity: string;
  /** openredaction's confidence for the match (0–1), when it reports one. */
  confidence?: number;
}
| 335 | + |
/**
 * Locally declared shape of the object that `OpenRedaction.detect()` resolves
 * to. This recogniser reads only `detections`; the remaining fields are
 * declared for completeness.
 */
interface OpenRedactionDetectResult {
  original: string;
  redacted: string;
  detections: OpenRedactionDetection[];
  redactionMap: Record<string, string>;
  stats: { processingTime: number; piiCount: number };
}

/**
 * The part of an instantiated `OpenRedaction` object that this recogniser
 * relies on: the asynchronous `detect` method.
 */
interface OpenRedactionInstance {
  detect(text: string): Promise<OpenRedactionDetectResult>;
}
0 commit comments