Skip to content

Commit 271defd

Browse files
committed
feat(voice): TranscriptDedupe for cross-provider overlap suppression
After a mid-utterance failover the backup provider replays the ring buffer and re-transcribes audio the primary already emitted — without dedupe the session sees 'hello world' twice. This class filters those duplicates using audio-clock overlap as the primary signal and normalized string comparison as the tie-breaker (exact > superset > fuzzy token-set similarity >= 0.85). Same-provider observations are never considered duplicates so streaming interim transcripts still pass through. Task 5/17.
1 parent ab2e41e commit 271defd

2 files changed

Lines changed: 256 additions & 0 deletions

File tree

Lines changed: 121 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,121 @@
1+
/**
2+
* @module voice-pipeline/TranscriptDedupe
3+
*
4+
* Duplicate-transcript detector used by StreamingSTTChain after a mid-
5+
* utterance failover. The backup provider replays the ring buffer and
6+
* re-transcribes audio the primary already saw; without dedupe the session
7+
* sees "hello world" twice.
8+
*
9+
* Dedupe signal: audio-clock overlap (primary fact) + fuzzy string match
10+
* (tie-breaker). Two transcripts overlap if their [audioStartMs, audioEndMs]
11+
* ranges intersect (merely touching end-to-start does not count); when they do, we compare normalized text.
12+
*
13+
* Same-provider observations are never considered duplicates — interim
14+
* transcripts from a single streaming provider are part of its protocol.
15+
*/
16+
17+
/**
 * One transcript emitted by a speech-to-text provider, positioned on the
 * audio clock so that observations from different providers can be compared.
 */
export interface TranscriptObservation {
  /** Name of the provider that produced this transcript. */
  provider: string;

  /** Transcript text exactly as the provider emitted it (not normalized). */
  text: string;

  /** Audio-clock position, in milliseconds, where the transcribed span begins. */
  audioStartMs: number;

  /** Audio-clock position, in milliseconds, where the transcribed span ends. */
  audioEndMs: number;

  /**
   * Whether the provider marked this transcript as final.
   * NOTE(review): presumably `false` for interim streaming results — confirm
   * against the provider adapters.
   */
  isFinal: boolean;
}
24+
25+
/** Verdict returned for a single evaluated transcript observation. */
export interface DedupeResult {
  /** `true` when the observation repeats an earlier one and should be suppressed. */
  isDuplicate: boolean;

  /**
   * Which comparison matched; absent when the observation is not a duplicate.
   * - `'exact'`: normalized texts are identical.
   * - `'superset'`: the earlier transcript's normalized text contains the new one.
   * - `'fuzzy'`: token-set similarity reached the configured threshold.
   */
  reason?:
    | 'exact'
    | 'fuzzy'
    | 'superset';

  /** The earlier observation that was matched; absent when not a duplicate. */
  against?: {
    provider: string;
    text: string;
  };
}
30+
31+
/** Punctuation, quote and bracket characters removed before text comparison. */
const PUNCT_RE = /[.,!?;:"'()\[\]{}]/g;

/** Any run of whitespace; each run is collapsed to a single space. */
const WS_RE = /\s+/g;

/**
 * Canonicalizes transcript text for comparison: lower-cases it, strips the
 * characters matched by `PUNCT_RE`, collapses whitespace runs to one space
 * and trims both ends.
 *
 * @param text Raw transcript text.
 * @returns The canonical form; an empty string when nothing but punctuation
 *   and whitespace was present.
 */
function normalize(text: string): string {
  const lowered = text.toLowerCase();
  const withoutPunctuation = lowered.replace(PUNCT_RE, '');
  const singleSpaced = withoutPunctuation.replace(WS_RE, ' ');
  return singleSpaced.trim();
}
37+
38+
/**
 * Splits normalized text on single spaces and returns the distinct,
 * non-empty tokens.
 *
 * @param text Raw transcript text; it is normalized before splitting.
 * @returns The set of unique tokens (empty when the text normalizes to '').
 */
function tokenSet(text: string): Set<string> {
  const unique = new Set<string>();
  for (const word of normalize(text).split(' ')) {
    // Splitting an empty string yields [''], which must not count as a token.
    if (word) unique.add(word);
  }
  return unique;
}
41+
42+
/**
 * Scores how alike two texts are by their distinct tokens: the number of
 * shared tokens divided by the size of the larger token set.
 *
 * @param a First text.
 * @param b Second text.
 * @returns A value in [0, 1]: 1 when both texts have no tokens, 0 when
 *   exactly one of them has none.
 */
function tokenSetSimilarity(a: string, b: string): number {
  const left = tokenSet(a);
  const right = tokenSet(b);

  const larger = Math.max(left.size, right.size);
  if (larger === 0) return 1; // both empty: treated as identical
  if (Math.min(left.size, right.size) === 0) return 0; // only one side empty

  const shared = [...left].filter((token) => right.has(token)).length;
  return shared / larger;
}
51+
52+
/**
 * Reports whether the audio spans of two observations intersect.
 * Comparisons are strict, so spans that only touch (one ends exactly where
 * the other starts) do not overlap.
 *
 * @param a First observation.
 * @param b Second observation.
 * @returns `true` when each span starts before the other one ends.
 */
function rangesOverlap(a: TranscriptObservation, b: TranscriptObservation): boolean {
  if (!(a.audioStartMs < b.audioEndMs)) {
    return false;
  }
  return b.audioStartMs < a.audioEndMs;
}
55+
56+
/**
 * Stateful filter that flags transcripts a different provider re-emits for
 * audio that has already been transcribed.
 *
 * Every observation passed to `evaluate` is remembered (duplicates included)
 * until its `audioEndMs` falls more than `retainMs` behind the `audioEndMs`
 * of a later observation.
 */
export class TranscriptDedupe {
  /** Observations seen so far, in arrival order, minus pruned entries. */
  private recent: TranscriptObservation[] = [];
  /** Minimum token-set similarity for a 'fuzzy' duplicate. */
  private readonly fuzzyThreshold: number;
  /** How far behind the newest observation an entry may end before it is dropped. */
  private readonly retainMs: number;

  /**
   * @param opts.fuzzyThreshold Similarity at or above which two overlapping
   *   transcripts count as fuzzy duplicates. Defaults to 0.85.
   * @param opts.retainMs Retention window on the audio clock, in
   *   milliseconds. Defaults to 10 000.
   */
  constructor(opts: { fuzzyThreshold?: number; retainMs?: number } = {}) {
    this.fuzzyThreshold = opts.fuzzyThreshold ?? 0.85;
    this.retainMs = opts.retainMs ?? 10_000;
  }

  /**
   * Decides whether `obs` repeats an earlier observation from another
   * provider, then records `obs` regardless of the verdict.
   *
   * @param obs The transcript to check.
   * @returns The verdict; when it is a duplicate, `reason` and `against`
   *   describe the first remembered observation that matched.
   */
  evaluate(obs: TranscriptObservation): DedupeResult {
    this.prune(obs.audioEndMs);
    const verdict = this.findDuplicate(obs);
    // Duplicates are remembered too, so they can be matched by later
    // observations from yet another provider.
    this.recent.push(obs);
    return verdict;
  }

  /** Forgets every remembered observation. */
  reset(): void {
    this.recent = [];
  }

  /**
   * Scans remembered observations in arrival order and returns the verdict
   * for the first one that matches `obs`. For each candidate the checks run
   * in the order exact, superset, fuzzy.
   */
  private findDuplicate(obs: TranscriptObservation): DedupeResult {
    // Loop-invariant: normalize the incoming text once.
    const current = normalize(obs.text);

    for (const prev of this.recent) {
      // Same-provider repeats (e.g. interim results) are never duplicates.
      if (prev.provider === obs.provider) continue;
      if (!rangesOverlap(prev, obs)) continue;

      const earlier = normalize(prev.text);
      const against = { provider: prev.provider, text: prev.text };

      if (earlier === current) {
        return { isDuplicate: true, reason: 'exact', against };
      }

      // The earlier transcript already covers everything in the new one.
      // Only this direction is suppressed: a new transcript that is longer
      // than the earlier one carries extra words and falls through to the
      // fuzzy check.
      if (TranscriptDedupe.containsPhrase(earlier, current)) {
        return { isDuplicate: true, reason: 'superset', against };
      }

      if (tokenSetSimilarity(prev.text, obs.text) >= this.fuzzyThreshold) {
        return { isDuplicate: true, reason: 'fuzzy', against };
      }
    }

    return { isDuplicate: false };
  }

  /**
   * Whole-token containment test on normalized text: `needle` must appear in
   * `haystack` as a run of complete, space-separated tokens. A plain
   * substring test would also match fragments inside a word ("or" inside
   * "world") and suppress unrelated short transcripts.
   *
   * An empty `needle` is treated as contained in any haystack.
   */
  private static containsPhrase(haystack: string, needle: string): boolean {
    if (needle === '') return true;
    // Normalized text is trimmed and single-spaced, so padding both sides
    // with one space turns token boundaries into literal delimiters.
    return ` ${haystack} `.includes(` ${needle} `);
  }

  /**
   * Drops observations that end more than `retainMs` before `upToMs`.
   *
   * @param upToMs Audio-clock reference point, in milliseconds.
   */
  private prune(upToMs: number): void {
    const cutoff = upToMs - this.retainMs;
    this.recent = this.recent.filter((o) => o.audioEndMs >= cutoff);
  }
}
Lines changed: 135 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,135 @@
1+
import { describe, it, expect } from 'vitest';
2+
import { TranscriptDedupe } from '../TranscriptDedupe.js';
3+
4+
describe('TranscriptDedupe', () => {
  /** Builds an observation literal; transcripts are final unless stated otherwise. */
  const observation = (
    provider: string,
    text: string,
    audioStartMs: number,
    audioEndMs: number,
    isFinal = true,
  ) => ({ provider, text, audioStartMs, audioEndMs, isFinal });

  it('returns isDuplicate=false on first observation', () => {
    const dedupe = new TranscriptDedupe();
    const r = dedupe.evaluate(observation('deepgram', 'hello world', 0, 600, false));
    expect(r.isDuplicate).toBe(false);
  });

  it('suppresses exact duplicates within overlap window', () => {
    const dedupe = new TranscriptDedupe();
    dedupe.evaluate(observation('deepgram', 'hello world', 0, 600));
    const r = dedupe.evaluate(observation('elevenlabs', 'hello world', 100, 700));
    expect(r.isDuplicate).toBe(true);
    expect(r.reason).toBe('exact');
  });

  it('treats punctuation-only differences as exact duplicates', () => {
    // Normalization strips the trailing period, so this goes through the
    // exact-match path rather than the fuzzy one.
    const dedupe = new TranscriptDedupe();
    dedupe.evaluate(observation('deepgram', 'i said hello world', 0, 900));
    const r = dedupe.evaluate(observation('elevenlabs', 'i said hello world.', 100, 1000));
    expect(r.isDuplicate).toBe(true);
    expect(r.reason).toBe('exact');
  });

  it('suppresses fuzzy matches above threshold', () => {
    // 6 of 7 tokens shared => similarity ~0.857, above the 0.85 default,
    // and neither text contains the other.
    const dedupe = new TranscriptDedupe();
    dedupe.evaluate(observation('deepgram', 'i said hello world how are you', 0, 2000));
    const r = dedupe.evaluate(
      observation('elevenlabs', 'i said hello world how are ya', 100, 2100),
    );
    expect(r.isDuplicate).toBe(true);
    expect(r.reason).toBe('fuzzy');
  });

  it('does NOT suppress legitimate continuations', () => {
    const dedupe = new TranscriptDedupe();
    dedupe.evaluate(observation('deepgram', 'hello world', 0, 600));
    const r = dedupe.evaluate(observation('elevenlabs', 'how are you today', 800, 1800));
    expect(r.isDuplicate).toBe(false);
  });

  it('does NOT suppress identical text when audio ranges do not overlap', () => {
    // Audio-clock overlap is the primary signal: the same words spoken
    // later are a new utterance, not a replay.
    const dedupe = new TranscriptDedupe();
    dedupe.evaluate(observation('deepgram', 'hello world', 0, 600));
    const r = dedupe.evaluate(observation('elevenlabs', 'hello world', 800, 1400));
    expect(r.isDuplicate).toBe(false);
  });

  it('does NOT suppress same-provider repeated transcripts', () => {
    // Interim transcripts from the same provider shouldn't be deduped;
    // they're a legitimate part of the streaming protocol.
    const dedupe = new TranscriptDedupe();
    dedupe.evaluate(observation('deepgram', 'hello', 0, 300, false));
    const r = dedupe.evaluate(observation('deepgram', 'hello world', 0, 600, false));
    expect(r.isDuplicate).toBe(false);
  });

  it('suppresses supersets: shorter transcript contained in longer one', () => {
    const dedupe = new TranscriptDedupe();
    dedupe.evaluate(observation('deepgram', 'the quick brown fox jumps', 0, 1500));
    const r = dedupe.evaluate(observation('elevenlabs', 'quick brown fox', 200, 1000));
    expect(r.isDuplicate).toBe(true);
    expect(r.reason).toBe('superset');
  });

  it('reset() clears memory', () => {
    const dedupe = new TranscriptDedupe();
    dedupe.evaluate(observation('a', 'hello world', 0, 600));
    dedupe.reset();
    const r = dedupe.evaluate(observation('b', 'hello world', 0, 600));
    expect(r.isDuplicate).toBe(false);
  });
});

0 commit comments

Comments
 (0)