Skip to content

Commit 9593ce2

Browse files
committed
feat(voice): add TwilioMediaStreamParser
1 parent 2c1c6f0 commit 9593ce2

2 files changed

Lines changed: 389 additions & 0 deletions

File tree

Lines changed: 232 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,232 @@
1+
import { describe, it, expect, beforeEach } from 'vitest';
2+
import { TwilioMediaStreamParser } from '../parsers/TwilioMediaStreamParser.js';
3+
4+
/**
 * Test suite for {@link TwilioMediaStreamParser}.
 *
 * Covers every event type handled by `parseIncoming` (`start`, `media`,
 * `dtmf`, `stop`, `mark`), the cases that must yield `null` (outbound track,
 * unknown event, malformed JSON, missing `streamSid`), and the two outgoing
 * formatters (`formatOutgoing`, `formatConnected`).
 */
describe('TwilioMediaStreamParser', () => {
  let parser: TwilioMediaStreamParser;

  /** Serialises a message body to the JSON text form the parser consumes. */
  const toFrame = (body: Record<string, unknown>): string => JSON.stringify(body);

  beforeEach(() => {
    // A fresh parser per test keeps the cases independent of one another.
    parser = new TwilioMediaStreamParser();
  });

  // ===== parseIncoming: start ================================================

  describe('parseIncoming — start event', () => {
    it('returns a start event with streamSid and callSid', () => {
      const frame = toFrame({
        event: 'start',
        streamSid: 'MZ001',
        start: { callSid: 'CA001', accountSid: 'AC001' },
      });

      const event = parser.parseIncoming(frame);

      expect(event).not.toBeNull();
      expect(event!.type).toBe('start');
      // Guard exists only to narrow the union; the assertion above already
      // fails the test when the type is wrong.
      if (event?.type !== 'start') return;
      expect(event.streamSid).toBe('MZ001');
      expect(event.callSid).toBe('CA001');
    });

    it('accepts a Buffer input', () => {
      const text = toFrame({ event: 'start', streamSid: 'MZ002', start: { callSid: 'CA002' } });
      const frame = Buffer.from(text);

      const event = parser.parseIncoming(frame);

      expect(event?.type).toBe('start');
    });
  });

  // ===== parseIncoming: media (inbound) ======================================

  describe('parseIncoming — media event (inbound)', () => {
    it('decodes base64 audio payload into a Buffer', () => {
      const rawBytes = Buffer.from([0x7f, 0x80, 0x7e]);
      const encoded = rawBytes.toString('base64');
      const frame = toFrame({
        event: 'media',
        streamSid: 'MZ001',
        media: { track: 'inbound', payload: encoded },
      });

      const event = parser.parseIncoming(frame);

      expect(event).not.toBeNull();
      expect(event!.type).toBe('audio');
      if (event?.type !== 'audio') return;
      expect(event.streamSid).toBe('MZ001');
      expect(event.payload).toEqual(rawBytes);
    });

    it('parses inbound media without an explicit track field', () => {
      const encoded = Buffer.from([0x00]).toString('base64');
      const frame = toFrame({
        event: 'media',
        streamSid: 'MZ003',
        media: { payload: encoded },
      });

      const event = parser.parseIncoming(frame);

      expect(event?.type).toBe('audio');
    });
  });

  // ===== parseIncoming: media (outbound, must be ignored) ====================

  describe('parseIncoming — outbound media track', () => {
    it('returns null for outbound track messages', () => {
      const frame = toFrame({
        event: 'media',
        streamSid: 'MZ001',
        media: { track: 'outbound', payload: 'AAAA' },
      });

      const event = parser.parseIncoming(frame);

      expect(event).toBeNull();
    });
  });

  // ===== parseIncoming: dtmf =================================================

  describe('parseIncoming — dtmf event', () => {
    it('returns dtmf event with digit and durationMs', () => {
      const frame = toFrame({
        event: 'dtmf',
        streamSid: 'MZ001',
        dtmf: { digit: '5', duration: 500 },
      });

      const event = parser.parseIncoming(frame);

      expect(event).not.toBeNull();
      expect(event!.type).toBe('dtmf');
      if (event?.type !== 'dtmf') return;
      expect(event.digit).toBe('5');
      expect(event.streamSid).toBe('MZ001');
      expect(event.durationMs).toBe(500);
    });

    it('returns dtmf event without durationMs when duration is absent', () => {
      const frame = toFrame({
        event: 'dtmf',
        streamSid: 'MZ001',
        dtmf: { digit: '#' },
      });

      const event = parser.parseIncoming(frame);

      expect(event?.type).toBe('dtmf');
      if (event?.type !== 'dtmf') return;
      expect(event.durationMs).toBeUndefined();
    });
  });

  // ===== parseIncoming: stop =================================================

  describe('parseIncoming — stop event', () => {
    it('returns a stop event with streamSid', () => {
      const frame = toFrame({ event: 'stop', streamSid: 'MZ001' });

      const event = parser.parseIncoming(frame);

      expect(event).not.toBeNull();
      expect(event!.type).toBe('stop');
      if (event?.type !== 'stop') return;
      expect(event.streamSid).toBe('MZ001');
    });
  });

  // ===== parseIncoming: mark =================================================

  describe('parseIncoming — mark event', () => {
    it('returns a mark event with name and streamSid', () => {
      const frame = toFrame({
        event: 'mark',
        streamSid: 'MZ001',
        mark: { name: 'done' },
      });

      const event = parser.parseIncoming(frame);

      expect(event).not.toBeNull();
      expect(event!.type).toBe('mark');
      if (event?.type !== 'mark') return;
      expect(event.name).toBe('done');
      expect(event.streamSid).toBe('MZ001');
    });
  });

  // ===== parseIncoming: unknown / malformed ==================================

  describe('parseIncoming — unknown events and malformed input', () => {
    it('returns null for unknown event types', () => {
      const frame = toFrame({ event: 'heartbeat', streamSid: 'MZ001' });

      expect(parser.parseIncoming(frame)).toBeNull();
    });

    it('returns null for malformed JSON', () => {
      const event = parser.parseIncoming('not-json');

      expect(event).toBeNull();
    });

    it('returns null when streamSid is missing', () => {
      const frame = toFrame({ event: 'stop' });

      expect(parser.parseIncoming(frame)).toBeNull();
    });
  });

  // ===== formatOutgoing ======================================================

  describe('formatOutgoing', () => {
    it('wraps audio in the Twilio media envelope', () => {
      const audio = Buffer.from([0x7f, 0x80]);
      const expectedPayload = audio.toString('base64');

      const envelope = JSON.parse(parser.formatOutgoing(audio, 'MZ001') as string);

      expect(envelope.event).toBe('media');
      expect(envelope.streamSid).toBe('MZ001');
      expect(envelope.media.payload).toBe(expectedPayload);
    });
  });

  // ===== formatConnected =====================================================

  describe('formatConnected', () => {
    it('returns a JSON string with event=connected', () => {
      const serialized = parser.formatConnected!('MZ001');

      expect(typeof serialized).toBe('string');

      const envelope = JSON.parse(serialized as string);
      expect(envelope.event).toBe('connected');
      expect(envelope.protocol).toBe('Call');
      expect(envelope.version).toBe('1.0.0');
    });
  });
});
Lines changed: 157 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,157 @@
1+
import type { MediaStreamParser, MediaStreamIncoming } from '../MediaStreamParser.js';
2+
3+
/**
 * Parses the Twilio `<Connect><Stream>` WebSocket media stream protocol.
 *
 * Incoming frames are JSON-encoded strings. Outgoing audio is wrapped in a
 * JSON `media` envelope carrying the `streamSid` so it can be associated with
 * the correct stream. {@link TwilioMediaStreamParser.formatConnected} builds a
 * `connected` envelope for the caller to send after the WebSocket handshake.
 *
 * All incoming data is treated as untrusted: every field is type-checked at
 * runtime and any frame that does not match the expected shape yields `null`
 * rather than throwing.
 *
 * @see {@link https://www.twilio.com/docs/voice/twiml/stream}
 */
export class TwilioMediaStreamParser implements MediaStreamParser {
  /** Matches an unsigned decimal string such as `"42"`. */
  private static readonly DECIMAL_DIGITS = /^\d+$/;

  /**
   * Parse a raw WebSocket frame from Twilio's media stream.
   *
   * Supported Twilio event types:
   * - `start` — stream established, includes callSid
   * - `media` — audio chunk (inbound track only; outbound chunks are ignored)
   * - `dtmf`  — DTMF keypress detected
   * - `stop`  — stream ended
   * - `mark`  — named synchronisation marker
   *
   * @param data - Raw WebSocket frame payload; a `Buffer` is decoded as UTF-8.
   * @returns Normalised {@link MediaStreamIncoming} event, or `null` for
   *   outbound audio tracks, unknown event types, or malformed messages
   *   (invalid JSON, JSON that is not an object, missing or wrongly typed
   *   fields). This method never throws on bad input.
   */
  parseIncoming(data: Buffer | string): MediaStreamIncoming | null {
    const raw = typeof data === 'string' ? data : data.toString('utf8');

    let parsed: unknown;
    try {
      parsed = JSON.parse(raw);
    } catch {
      return null;
    }

    // Valid JSON is not necessarily an object: `null`, `42`, `"x"` and `[]`
    // all parse successfully and must be rejected before any field access.
    const msg = TwilioMediaStreamParser.asRecord(parsed);
    if (msg === undefined) return null;

    const event = TwilioMediaStreamParser.asNonEmptyString(msg['event']);
    const streamSid = TwilioMediaStreamParser.asNonEmptyString(msg['streamSid']);
    if (event === undefined || streamSid === undefined) return null;

    switch (event) {
      case 'start': {
        const startPayload = TwilioMediaStreamParser.asRecord(msg['start']);
        // A missing callSid is tolerated and reported as an empty string.
        const callSid =
          TwilioMediaStreamParser.asNonEmptyString(startPayload?.['callSid']) ?? '';
        return { type: 'start', streamSid, callSid, metadata: startPayload };
      }

      case 'media': {
        const media = TwilioMediaStreamParser.asRecord(msg['media']);
        if (media === undefined) return null;

        // Only process inbound audio — outbound echoes must be discarded.
        // A frame without a `track` field is treated as inbound.
        if (media['track'] === 'outbound') return null;

        const payloadB64 = TwilioMediaStreamParser.asNonEmptyString(media['payload']);
        if (payloadB64 === undefined) return null;

        const sequenceNumber = TwilioMediaStreamParser.asSequenceNumber(
          msg['sequenceNumber'],
        );

        return {
          type: 'audio',
          payload: Buffer.from(payloadB64, 'base64'),
          streamSid,
          // Omit the key entirely (rather than setting it to undefined) when absent.
          ...(sequenceNumber !== undefined ? { sequenceNumber } : {}),
        };
      }

      case 'dtmf': {
        const dtmf = TwilioMediaStreamParser.asRecord(msg['dtmf']);
        if (dtmf === undefined) return null;

        const digit = TwilioMediaStreamParser.asNonEmptyString(dtmf['digit']);
        if (digit === undefined) return null;

        const duration = dtmf['duration'];

        return {
          type: 'dtmf',
          digit,
          streamSid,
          ...(typeof duration === 'number' ? { durationMs: duration } : {}),
        };
      }

      case 'stop':
        return { type: 'stop', streamSid };

      case 'mark': {
        const mark = TwilioMediaStreamParser.asRecord(msg['mark']);
        if (mark === undefined) return null;

        const name = TwilioMediaStreamParser.asNonEmptyString(mark['name']);
        if (name === undefined) return null;

        return { type: 'mark', name, streamSid };
      }

      default:
        return null;
    }
  }

  /**
   * Encode audio for transmission back to the Twilio stream.
   *
   * The bytes are base64-encoded and wrapped in a JSON `media` envelope that
   * carries the stream identifier.
   *
   * @param audio - Raw audio bytes to send to the caller
   *   (NOTE(review): presumably 8 kHz mu-law as Twilio expects — this method
   *   does not check or convert the encoding).
   * @param streamSid - The stream identifier to include in the envelope.
   * @returns JSON string of the form
   *   `{"event":"media","streamSid":…,"media":{"payload":…}}`.
   */
  formatOutgoing(audio: Buffer, streamSid: string): string {
    return JSON.stringify({
      event: 'media',
      streamSid,
      media: { payload: audio.toString('base64') },
    });
  }

  /**
   * Build the `connected` envelope to send right after the WebSocket
   * connection is established.
   *
   * NOTE(review): Twilio's public docs list `connected` as a message Twilio
   * sends *to* the server — confirm that sending it in the other direction is
   * required (or at least harmless) for the intended deployment.
   *
   * @param _streamSid - Unused; accepted only for interface parity.
   * @returns JSON string
   *   `{"event":"connected","protocol":"Call","version":"1.0.0"}`.
   */
  formatConnected(_streamSid: string): string {
    return JSON.stringify({
      event: 'connected',
      protocol: 'Call',
      version: '1.0.0',
    });
  }

  /** Returns `value` as a plain JSON object, or `undefined` for anything else. */
  private static asRecord(value: unknown): Record<string, unknown> | undefined {
    return typeof value === 'object' && value !== null && !Array.isArray(value)
      ? (value as Record<string, unknown>)
      : undefined;
  }

  /** Returns `value` when it is a non-empty string, otherwise `undefined`. */
  private static asNonEmptyString(value: unknown): string | undefined {
    return typeof value === 'string' && value !== '' ? value : undefined;
  }

  /**
   * Normalises a `sequenceNumber` field to a number.
   *
   * Twilio's documented frames carry it as a decimal string (e.g. `"4"`);
   * plain JSON numbers are accepted as well for backward compatibility.
   */
  private static asSequenceNumber(value: unknown): number | undefined {
    if (typeof value === 'number') return value;
    if (typeof value === 'string' && TwilioMediaStreamParser.DECIMAL_DIGITS.test(value)) {
      return Number(value);
    }
    return undefined;
  }
}

0 commit comments

Comments
 (0)