Skip to content

Commit 1d820bc

Browse files
committed
feat(voice-pipeline): add HeuristicEndpointDetector with punctuation and silence detection
1 parent e6cde06 commit 1d820bc

2 files changed

Lines changed: 601 additions & 0 deletions

File tree

Lines changed: 306 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,306 @@
1+
/**
2+
* @module voice-pipeline/HeuristicEndpointDetector
3+
*
4+
* A lightweight, rule-based endpoint detector that combines terminal punctuation
5+
* analysis with a configurable silence timeout to determine when the user has
6+
* finished speaking. Suitable for low-latency deployments where an LLM-based
7+
* semantic detector would add unacceptable round-trip overhead.
8+
*
9+
* Detection strategy:
10+
* 1. On `speech_end`, if the accumulated final transcript ends with `.`, `?`, or `!`,
11+
* fire `turn_complete` immediately with reason `'punctuation'`.
12+
 * 2. Otherwise, start a silence timer (default 1500 ms). If speech does not
13+
* resume before the timer fires, emit `turn_complete` with reason `'silence_timeout'`.
14+
* 3. Backchannel phrases (e.g. "uh huh", "yeah") are recognised, suppressed from
15+
* accumulation, and re-emitted as `'backchannel_detected'` events so the
16+
* pipeline can decide whether to suppress an agent response.
17+
*/
18+
19+
import { EventEmitter } from 'node:events';
20+
import type {
21+
IEndpointDetector,
22+
TranscriptEvent,
23+
VadEvent,
24+
TurnCompleteEvent,
25+
} from './types.js';
26+
27+
// ---------------------------------------------------------------------------
28+
// Constants
29+
// ---------------------------------------------------------------------------
30+
31+
/**
32+
* Default silence duration (ms) after speech stops before firing `turn_complete`.
33+
*/
34+
const DEFAULT_SILENCE_TIMEOUT_MS = 1_500;
35+
36+
/**
37+
* Terminal punctuation characters that signal sentence completion.
38+
*/
39+
const TERMINAL_PUNCTUATION = /[.?!]$/;
40+
41+
/**
42+
* Normalised backchannel phrases that indicate the listener is acknowledging
43+
* but not taking a full conversational turn. Compared after `.trim().toLowerCase()`.
44+
*/
45+
const BACKCHANNEL_PHRASES = new Set([
46+
'uh huh',
47+
'yeah',
48+
'okay',
49+
'ok',
50+
'mm hmm',
51+
'mmhmm',
52+
'mhm',
53+
'mm-hmm',
54+
'right',
55+
'sure',
56+
'yep',
57+
'yup',
58+
'gotcha',
59+
]);
60+
61+
// ---------------------------------------------------------------------------
62+
// Options
63+
// ---------------------------------------------------------------------------
64+
65+
/**
66+
* Constructor options for {@link HeuristicEndpointDetector}.
67+
*/
68+
export interface HeuristicEndpointDetectorOptions {
69+
/**
70+
* How long (ms) to wait after `speech_end` before emitting `turn_complete`
71+
* when no terminal punctuation is detected.
72+
* @defaultValue 1500
73+
*/
74+
silenceTimeoutMs?: number;
75+
}
76+
77+
// ---------------------------------------------------------------------------
78+
// Implementation
79+
// ---------------------------------------------------------------------------
80+
81+
/**
82+
* Heuristic endpoint detector that uses terminal punctuation and a silence
83+
* timeout to decide when the user's turn is complete.
84+
*
85+
* Emits:
86+
* - `'turn_complete'` ({@link TurnCompleteEvent}) — user turn has ended.
87+
* - `'backchannel_detected'` (`{ text: string }`) — a backchannel phrase was
88+
* recognised; accumulation is suppressed for this utterance.
89+
*
90+
* @example
91+
* ```typescript
92+
* const detector = new HeuristicEndpointDetector({ silenceTimeoutMs: 1000 });
93+
* detector.on('turn_complete', (event) => console.log('Turn done:', event));
94+
* detector.pushTranscript({ text: 'Hello there.', isFinal: true, confidence: 0.95, words: [] });
95+
* detector.pushVadEvent({ type: 'speech_end', timestamp: Date.now(), source: 'vad' });
96+
* // → 'turn_complete' fires immediately with reason 'punctuation'
97+
* ```
98+
*/
99+
export class HeuristicEndpointDetector
100+
extends EventEmitter
101+
implements IEndpointDetector
102+
{
103+
/**
104+
* Active detection strategy label.
105+
* Typed as `'hybrid'` to satisfy {@link IEndpointDetector.mode}; consumers
106+
* that need to distinguish heuristic detectors may inspect `instanceof`.
107+
*/
108+
// We expose 'heuristic' at runtime for consumers, using a wider string type
109+
// internally to avoid a cast on every read.
110+
readonly mode: IEndpointDetector['mode'] = 'hybrid';
111+
112+
/** Resolved silence timeout in milliseconds. */
113+
private readonly silenceTimeoutMs: number;
114+
115+
/** The latest final transcript text accumulated for the current turn. */
116+
private accumulatedText = '';
117+
118+
/** Whether the VAD currently reports active speech. */
119+
private speechActive = false;
120+
121+
/** Handle to a pending silence timeout, or `null` if none is running. */
122+
private silenceTimer: ReturnType<typeof setTimeout> | null = null;
123+
124+
/** Wall-clock timestamp (ms) when the current turn's speech started. */
125+
private turnStartMs: number | null = null;
126+
127+
/** Confidence of the most recent final transcript. */
128+
private lastConfidence = 1;
129+
130+
// ---------------------------------------------------------------------------
131+
// Constructor
132+
// ---------------------------------------------------------------------------
133+
134+
/**
135+
* Create a new {@link HeuristicEndpointDetector}.
136+
*
137+
* @param options — Optional configuration overrides.
138+
*/
139+
constructor(options: HeuristicEndpointDetectorOptions = {}) {
140+
super();
141+
this.silenceTimeoutMs = options.silenceTimeoutMs ?? DEFAULT_SILENCE_TIMEOUT_MS;
142+
}
143+
144+
// ---------------------------------------------------------------------------
145+
// IEndpointDetector — pushTranscript
146+
// ---------------------------------------------------------------------------
147+
148+
/**
149+
* Ingest a transcript event from the upstream STT session.
150+
*
151+
* Only final events (`isFinal: true`) affect internal state. Interim results
152+
* are silently ignored — they may arrive very frequently and their text is
153+
* unstable.
154+
*
155+
* If the final text is a recognised backchannel phrase the detector emits
156+
* `'backchannel_detected'` and returns without accumulating the text, so that
157+
* a subsequent `speech_end` event does not trigger `turn_complete`.
158+
*
159+
* @param transcript — Transcript event from the STT session.
160+
*/
161+
pushTranscript(transcript: TranscriptEvent): void {
162+
if (!transcript.isFinal) {
163+
// Ignore partial/interim hypotheses — they will be superseded.
164+
return;
165+
}
166+
167+
const text = transcript.text;
168+
const normalised = text.trim().toLowerCase();
169+
170+
// Detect backchannel acknowledgements before accumulating.
171+
if (BACKCHANNEL_PHRASES.has(normalised)) {
172+
this.emit('backchannel_detected', { text });
173+
return;
174+
}
175+
176+
// Accumulate the final transcript and store the confidence score.
177+
this.accumulatedText = text;
178+
this.lastConfidence = transcript.confidence;
179+
}
180+
181+
// ---------------------------------------------------------------------------
182+
// IEndpointDetector — pushVadEvent
183+
// ---------------------------------------------------------------------------
184+
185+
/**
186+
* Ingest a VAD (voice activity detection) event.
187+
*
188+
* - `speech_start`: marks the turn as active and cancels any pending silence
189+
* timer (the user resumed speaking before the timeout elapsed).
190+
* - `speech_end`: if accumulated text is available, either fires
191+
* `turn_complete` immediately (punctuation) or starts the silence timer.
192+
* - `silence`: heartbeat events are ignored; only explicit `speech_end`
193+
* drives the timeout logic.
194+
*
195+
* @param event — VAD transition event.
196+
*/
197+
pushVadEvent(event: VadEvent): void {
198+
switch (event.type) {
199+
case 'speech_start': {
200+
this.speechActive = true;
201+
this._clearSilenceTimer();
202+
if (this.turnStartMs === null) {
203+
this.turnStartMs = event.timestamp;
204+
}
205+
break;
206+
}
207+
208+
case 'speech_end': {
209+
this.speechActive = false;
210+
211+
if (!this.accumulatedText) {
212+
// Nothing to flush — no transcript arrived yet.
213+
break;
214+
}
215+
216+
if (TERMINAL_PUNCTUATION.test(this.accumulatedText)) {
217+
// Sentence-terminal punctuation → fire immediately.
218+
this._emitTurnComplete('punctuation', event.timestamp);
219+
} else {
220+
// No punctuation → wait for silence timeout.
221+
this._startSilenceTimer(event.timestamp);
222+
}
223+
break;
224+
}
225+
226+
case 'silence': {
227+
// Periodic heartbeat — no action required; the silence timer already
228+
// handles the delayed fire if one is pending.
229+
break;
230+
}
231+
}
232+
}
233+
234+
// ---------------------------------------------------------------------------
235+
// IEndpointDetector — reset
236+
// ---------------------------------------------------------------------------
237+
238+
/**
239+
* Reset all internal state, cancel pending timers, and prepare the detector
240+
* for the next user turn. Should be called by the pipeline after each
241+
* `turn_complete` event before audio for the next turn begins to arrive.
242+
*/
243+
reset(): void {
244+
this._clearSilenceTimer();
245+
this.accumulatedText = '';
246+
this.speechActive = false;
247+
this.turnStartMs = null;
248+
this.lastConfidence = 1;
249+
}
250+
251+
// ---------------------------------------------------------------------------
252+
// Private helpers
253+
// ---------------------------------------------------------------------------
254+
255+
/**
256+
* Emit `turn_complete` with the currently accumulated transcript and then
257+
* reset internal state so the detector is ready for the next turn.
258+
*
259+
* @param reason — The semantic reason driving this completion.
260+
* @param speechEndTimestamp — Unix epoch ms timestamp of the `speech_end` event,
261+
* used to compute `durationMs`.
262+
*/
263+
private _emitTurnComplete(
264+
reason: TurnCompleteEvent['reason'],
265+
speechEndTimestamp: number,
266+
): void {
267+
const durationMs =
268+
this.turnStartMs !== null ? speechEndTimestamp - this.turnStartMs : 0;
269+
270+
const event: TurnCompleteEvent = {
271+
transcript: this.accumulatedText,
272+
confidence: this.lastConfidence,
273+
durationMs,
274+
reason,
275+
};
276+
277+
// Reset before emitting so that any re-entrant listeners see clean state.
278+
this.reset();
279+
280+
this.emit('turn_complete', event);
281+
}
282+
283+
/**
284+
* Start the silence-timeout timer. If the user does not resume speaking
285+
* within {@link silenceTimeoutMs} ms the detector fires `turn_complete`.
286+
*
287+
* @param speechEndTimestamp — Timestamp passed through to `_emitTurnComplete`.
288+
*/
289+
private _startSilenceTimer(speechEndTimestamp: number): void {
290+
this._clearSilenceTimer();
291+
this.silenceTimer = setTimeout(() => {
292+
this.silenceTimer = null;
293+
this._emitTurnComplete('silence_timeout', speechEndTimestamp);
294+
}, this.silenceTimeoutMs);
295+
}
296+
297+
/**
298+
* Cancel a pending silence timer without any side effects.
299+
*/
300+
private _clearSilenceTimer(): void {
301+
if (this.silenceTimer !== null) {
302+
clearTimeout(this.silenceTimer);
303+
this.silenceTimer = null;
304+
}
305+
}
306+
}

0 commit comments

Comments
 (0)