|
| 1 | +/** |
| 2 | + * @module voice-pipeline/HeuristicEndpointDetector |
| 3 | + * |
| 4 | + * A lightweight, rule-based endpoint detector that combines terminal punctuation |
| 5 | + * analysis with a configurable silence timeout to determine when the user has |
| 6 | + * finished speaking. Suitable for low-latency deployments where an LLM-based |
| 7 | + * semantic detector would add unacceptable round-trip overhead. |
| 8 | + * |
| 9 | + * Detection strategy: |
| 10 | + * 1. On `speech_end`, if the accumulated final transcript ends with `.`, `?`, or `!`, |
| 11 | + * fire `turn_complete` immediately with reason `'punctuation'`. |
| 12 | + * 2. Otherwise, start a silence timer (default 1 500 ms). If speech does not |
| 13 | + * resume before the timer fires, emit `turn_complete` with reason `'silence_timeout'`. |
| 14 | + * 3. Backchannel phrases (e.g. "uh huh", "yeah") are recognised, suppressed from |
| 15 | + * accumulation, and re-emitted as `'backchannel_detected'` events so the |
| 16 | + * pipeline can decide whether to suppress an agent response. |
| 17 | + */ |
| 18 | + |
| 19 | +import { EventEmitter } from 'node:events'; |
| 20 | +import type { |
| 21 | + IEndpointDetector, |
| 22 | + TranscriptEvent, |
| 23 | + VadEvent, |
| 24 | + TurnCompleteEvent, |
| 25 | +} from './types.js'; |
| 26 | + |
| 27 | +// --------------------------------------------------------------------------- |
| 28 | +// Constants |
| 29 | +// --------------------------------------------------------------------------- |
| 30 | + |
/**
 * Fallback silence window in milliseconds: the time the detector waits after
 * speech stops before firing `turn_complete` when the caller supplies no
 * `silenceTimeoutMs` option.
 */
const DEFAULT_SILENCE_TIMEOUT_MS = 1500;
| 35 | + |
/**
 * Matches text whose last non-whitespace character is sentence-terminal
 * punctuation (`.`, `?` or `!`).
 *
 * Trailing whitespace is tolerated so that a final transcript such as
 * `'All done. '` is still treated as a finished sentence rather than falling
 * through to the slower silence-timeout path.
 *
 * The pattern has no `g`/`y` flag, so `.test()` keeps no state between calls
 * and the single shared instance is safe to reuse.
 */
const TERMINAL_PUNCTUATION = /[.?!]\s*$/;
| 40 | + |
/**
 * Short acknowledgement phrases that signal "I'm listening" rather than a
 * full conversational turn.
 *
 * Every entry is stored lower-case with no surrounding whitespace, so callers
 * must normalise candidate text (at least `.trim().toLowerCase()`) before
 * calling `.has()`.
 */
const BACKCHANNEL_PHRASES = new Set<string>([
  // Lexical affirmations.
  'yeah',
  'yep',
  'yup',
  'ok',
  'okay',
  'right',
  'sure',
  'gotcha',
  // Non-lexical hums, in the spellings a transcriber may produce.
  'uh huh',
  'mhm',
  'mmhmm',
  'mm hmm',
  'mm-hmm',
]);
| 60 | + |
| 61 | +// --------------------------------------------------------------------------- |
| 62 | +// Options |
| 63 | +// --------------------------------------------------------------------------- |
| 64 | + |
/**
 * Configuration accepted by the {@link HeuristicEndpointDetector} constructor.
 * Every field is optional; omitted fields fall back to module defaults.
 */
export interface HeuristicEndpointDetectorOptions {
  /**
   * Silence window in milliseconds. After a `speech_end` whose transcript
   * lacks terminal punctuation, the detector waits this long for speech to
   * resume before emitting `turn_complete`.
   *
   * @defaultValue 1500
   */
  silenceTimeoutMs?: number;
}
| 76 | + |
| 77 | +// --------------------------------------------------------------------------- |
| 78 | +// Implementation |
| 79 | +// --------------------------------------------------------------------------- |
| 80 | + |
/**
 * Heuristic endpoint detector that uses terminal punctuation and a silence
 * timeout to decide when the user's turn is complete.
 *
 * Emits:
 * - `'turn_complete'` ({@link TurnCompleteEvent}) — user turn has ended.
 * - `'backchannel_detected'` (`{ text: string }`) — a backchannel phrase was
 *   recognised; accumulation is suppressed for this utterance.
 *
 * Event ordering: the final transcript may arrive either before or after the
 * matching `speech_end`. When `speech_end` arrives first, the detector
 * remembers its timestamp and evaluates the endpoint as soon as the
 * transcript shows up, so the turn cannot stall.
 *
 * @example
 * ```typescript
 * const detector = new HeuristicEndpointDetector({ silenceTimeoutMs: 1000 });
 * detector.on('turn_complete', (event) => console.log('Turn done:', event));
 * detector.pushTranscript({ text: 'Hello there.', isFinal: true, confidence: 0.95, words: [] });
 * detector.pushVadEvent({ type: 'speech_end', timestamp: Date.now(), source: 'vad' });
 * // → 'turn_complete' fires immediately with reason 'punctuation'
 * ```
 */
export class HeuristicEndpointDetector
  extends EventEmitter
  implements IEndpointDetector
{
  /**
   * Largest delay `setTimeout` honours. Node.js coerces anything larger (as
   * well as NaN and values below 1) to 1 ms, which would make the silence
   * timer fire almost immediately.
   */
  private static readonly MAX_TIMER_DELAY_MS = 2_147_483_647;

  /**
   * Detection strategy label required by {@link IEndpointDetector.mode}.
   * This detector reports `'hybrid'`; consumers that need to single out
   * heuristic detectors can use `instanceof HeuristicEndpointDetector`.
   */
  readonly mode: IEndpointDetector['mode'] = 'hybrid';

  /** Resolved silence timeout in milliseconds (validated in the constructor). */
  private readonly silenceTimeoutMs: number;

  /**
   * Text of the most recent non-backchannel final transcript for the current
   * turn. NOTE(review): each final transcript replaces the previous one; if the
   * upstream STT emits per-segment (non-cumulative) finals, earlier segments of
   * a multi-segment turn are dropped — confirm against the STT contract.
   */
  private accumulatedText = '';

  /** Whether the VAD currently reports active speech. */
  private speechActive = false;

  /** Handle to a pending silence timeout, or `null` if none is running. */
  private silenceTimer: ReturnType<typeof setTimeout> | null = null;

  /** Timestamp (ms) of the first `speech_start` of the current turn. */
  private turnStartMs: number | null = null;

  /** Confidence of the most recent accumulated final transcript. */
  private lastConfidence = 1;

  /**
   * Timestamp of a `speech_end` that arrived while no transcript text was
   * available yet, or `null` when there is no such outstanding event. Consumed
   * by {@link pushTranscript} when the late transcript arrives.
   */
  private pendingSpeechEndMs: number | null = null;

  // ---------------------------------------------------------------------------
  // Constructor
  // ---------------------------------------------------------------------------

  /**
   * Create a new {@link HeuristicEndpointDetector}.
   *
   * A `silenceTimeoutMs` that is missing, NaN or negative falls back to the
   * module default; values above the maximum timer delay are clamped to it.
   *
   * @param options — Optional configuration overrides.
   */
  constructor(options: HeuristicEndpointDetectorOptions = {}) {
    super();
    const requested = options.silenceTimeoutMs;
    // `NaN >= 0` is false, so this single comparison also rejects NaN.
    this.silenceTimeoutMs =
      typeof requested === 'number' && requested >= 0
        ? Math.min(requested, HeuristicEndpointDetector.MAX_TIMER_DELAY_MS)
        : DEFAULT_SILENCE_TIMEOUT_MS;
  }

  // ---------------------------------------------------------------------------
  // IEndpointDetector — pushTranscript
  // ---------------------------------------------------------------------------

  /**
   * Ingest a transcript event from the upstream STT session.
   *
   * - Interim events (`isFinal` falsy) are ignored.
   * - Final events whose text is empty or whitespace-only are ignored so they
   *   cannot wipe text that was already accumulated.
   * - Final events that are a recognised backchannel phrase — compared
   *   case-insensitively, ignoring trailing punctuation and treating hyphens
   *   as spaces — emit `'backchannel_detected'` and are not accumulated.
   * - Any other final event replaces the accumulated text and confidence. If
   *   a `speech_end` was already received while no text was available, the
   *   endpoint is evaluated immediately using that event's timestamp.
   *
   * @param transcript — Transcript event from the STT session.
   */
  pushTranscript(transcript: TranscriptEvent): void {
    if (!transcript.isFinal) {
      // Ignore partial/interim hypotheses — they will be superseded.
      return;
    }

    const text = transcript.text;
    if (text.trim() === '') {
      // Nothing was said; keep whatever has been accumulated so far.
      return;
    }

    // Detect backchannel acknowledgements before accumulating.
    const key = this._toBackchannelKey(text);
    if (
      BACKCHANNEL_PHRASES.has(key) ||
      BACKCHANNEL_PHRASES.has(key.replace(/-/g, ' '))
    ) {
      // The outstanding `speech_end` (if any) belonged to this acknowledgement.
      this.pendingSpeechEndMs = null;
      this.emit('backchannel_detected', { text });
      return;
    }

    // Store the final transcript and its confidence score.
    this.accumulatedText = text;
    this.lastConfidence = transcript.confidence;

    // Late transcript: `speech_end` already happened and found nothing to
    // flush, so the endpoint decision has to be taken now.
    const pendingEnd = this.pendingSpeechEndMs;
    if (pendingEnd !== null && !this.speechActive) {
      this.pendingSpeechEndMs = null;
      this._evaluateEndpoint(pendingEnd);
    }
  }

  // ---------------------------------------------------------------------------
  // IEndpointDetector — pushVadEvent
  // ---------------------------------------------------------------------------

  /**
   * Ingest a VAD (voice activity detection) event.
   *
   * - `speech_start`: marks speech as active, cancels any pending silence
   *   timer, forgets any outstanding `speech_end`, and records the turn start
   *   time if this is the first `speech_start` of the turn.
   * - `speech_end`: if accumulated text is available, either fires
   *   `turn_complete` immediately (punctuation) or starts the silence timer.
   *   If no text is available yet, the timestamp is remembered so a late
   *   transcript can complete the turn.
   * - `silence`: heartbeat events are ignored; only explicit `speech_end`
   *   drives the timeout logic.
   *
   * @param event — VAD transition event.
   */
  pushVadEvent(event: VadEvent): void {
    switch (event.type) {
      case 'speech_start': {
        this.speechActive = true;
        this.pendingSpeechEndMs = null;
        this._clearSilenceTimer();
        if (this.turnStartMs === null) {
          this.turnStartMs = event.timestamp;
        }
        break;
      }

      case 'speech_end': {
        this.speechActive = false;

        if (!this.accumulatedText) {
          // No transcript yet — defer the decision until one arrives.
          this.pendingSpeechEndMs = event.timestamp;
          break;
        }

        this._evaluateEndpoint(event.timestamp);
        break;
      }

      case 'silence': {
        // Periodic heartbeat — no action required; the silence timer already
        // handles the delayed fire if one is pending.
        break;
      }
    }
  }

  // ---------------------------------------------------------------------------
  // IEndpointDetector — reset
  // ---------------------------------------------------------------------------

  /**
   * Reset all internal state, cancel pending timers, and prepare the detector
   * for the next user turn. Called internally before every `turn_complete`
   * emission; may also be called by the pipeline at any time.
   */
  reset(): void {
    this._clearSilenceTimer();
    this.accumulatedText = '';
    this.speechActive = false;
    this.turnStartMs = null;
    this.lastConfidence = 1;
    this.pendingSpeechEndMs = null;
  }

  // ---------------------------------------------------------------------------
  // Private helpers
  // ---------------------------------------------------------------------------

  /**
   * Reduce transcript text to the form used for backchannel lookup: trimmed,
   * lower-cased, with any run of trailing punctuation removed.
   *
   * @param text — Raw transcript text.
   * @returns The normalised lookup key (may be empty).
   */
  private _toBackchannelKey(text: string): string {
    return text
      .trim()
      .toLowerCase()
      .replace(/[.,!?;:…]+$/u, '')
      .trimEnd();
  }

  /**
   * Apply the endpoint rules to the accumulated text: terminal punctuation
   * completes the turn immediately, anything else starts the silence timer.
   * Callers must ensure accumulated text is non-empty.
   *
   * @param speechEndTimestamp — Timestamp of the `speech_end` event that
   *   triggered the evaluation.
   */
  private _evaluateEndpoint(speechEndTimestamp: number): void {
    if (TERMINAL_PUNCTUATION.test(this.accumulatedText)) {
      // Sentence-terminal punctuation → fire immediately.
      this._emitTurnComplete('punctuation', speechEndTimestamp);
    } else {
      // No punctuation → wait for silence timeout.
      this._startSilenceTimer(speechEndTimestamp);
    }
  }

  /**
   * Emit `turn_complete` with the currently accumulated transcript and reset
   * internal state so the detector is ready for the next turn.
   *
   * @param reason — The semantic reason driving this completion.
   * @param speechEndTimestamp — Timestamp of the `speech_end` event, used to
   *   compute `durationMs` relative to the recorded turn start (0 when no
   *   `speech_start` was observed).
   */
  private _emitTurnComplete(
    reason: TurnCompleteEvent['reason'],
    speechEndTimestamp: number,
  ): void {
    const durationMs =
      this.turnStartMs !== null ? speechEndTimestamp - this.turnStartMs : 0;

    const event: TurnCompleteEvent = {
      transcript: this.accumulatedText,
      confidence: this.lastConfidence,
      durationMs,
      reason,
    };

    // Reset before emitting so that any re-entrant listeners see clean state.
    this.reset();

    this.emit('turn_complete', event);
  }

  /**
   * Start the silence-timeout timer, replacing any timer already running. If
   * the timer is not cancelled within {@link silenceTimeoutMs} ms the detector
   * fires `turn_complete` with reason `'silence_timeout'`.
   *
   * @param speechEndTimestamp — Timestamp passed through to `_emitTurnComplete`.
   */
  private _startSilenceTimer(speechEndTimestamp: number): void {
    this._clearSilenceTimer();
    this.silenceTimer = setTimeout(() => {
      this.silenceTimer = null;
      this._emitTurnComplete('silence_timeout', speechEndTimestamp);
    }, this.silenceTimeoutMs);
  }

  /**
   * Cancel a pending silence timer, if any. No other state is touched.
   */
  private _clearSilenceTimer(): void {
    if (this.silenceTimer !== null) {
      clearTimeout(this.silenceTimer);
      this.silenceTimer = null;
    }
  }
}
0 commit comments