
Commit 648efe7

Add voicebench evaluation suite https://github.com/MatthewCYM/VoiceBench (#312)
This merge brings the VoiceBench eval pipeline into the main repository. To run the eval, use:

poetry run python -m ultravox.evaluation.eval --config_path ultravox/evaluation/configs/eval_config_voicebench.yaml
1 parent 0c61959 commit 648efe7
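
For orientation, a minimal sketch of what that YAML config might contain. The real eval_config_voicebench.yaml is not shown in the excerpted diff, so every key below is a hypothetical stand-in for whatever schema ultravox.evaluation.eval actually parses; only the dataset names come from this commit:

# Hypothetical sketch only, not the contents of eval_config_voicebench.yaml.
model: fixie-ai/ultravox-v0_4   # placeholder model identifier
datasets:                       # assumed key listing the dataset configs added below
  - voicebench-bbh
  - voicebench-commoneval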

19 files changed: +5466 −208 lines

poetry.lock

Lines changed: 195 additions & 197 deletions
Some generated files are not rendered by default.

pyproject.toml

Lines changed: 4 additions & 2 deletions
@@ -16,7 +16,7 @@ peft = "~0.11.1"
 simple-parsing = "~0.1.5"
 librosa = "~0.10.2.post1"
 requests = ">=2.31.0"
-datasets = "3.5.0"
+datasets = ">=3.5.0, <4.0"
 mosaicml-streaming = "~0.7.6"
 nltk = "~3.8.1"
 truecase = "~0.0.14"
@@ -40,12 +40,14 @@ praatio = "^6.2.0"
 hf-transfer = "^0.1.8"
 annoy = "^1.17.3"
 coverage = "^7.6.10"
-whisper_normalizer = "^0.0.10"
+whisper_normalizer = ">=0.1.0"
 evaluate = "^0.4.3"
 deepgram-sdk = "^3.10.1"
 vocos = "^0.1.0"
 torchdiffeq = "*"
 x-transformers = "*"
+rich = "<13.5.0"
+qa-metrics = ">=0.2.42"

 [tool.poetry.group.dev.dependencies]
 black = "~24.4.2"
Lines changed: 338 additions & 0 deletions
@@ -0,0 +1,338 @@
from ultravox.data import types

SYSTEM_PROMPT = """
You are a friendly and helpful character. You love to answer questions for people.
"""
DUMMY_ASSISTANT_TEMPLATE = "I'm sorry, I don't know the answer to that question."

# Datasets with reference answers (use yes/no evaluation)
VB_BBH_CONFIG = types.DatasetConfig(
    name="voicebench-bbh",
    path="hlt-lab/voicebench",
    subset="bbh",
    splits=[
        types.DatasetSplitConfig(
            name="test", num_samples=1000, split=types.DatasetSplit.TEST
        ),
    ],
    eval_config=types.EvalConfig(
        metric="voicebench_bbh",
        extra_kwargs_map={"id": "id"},
    ),
    user_template=types.AUDIO_PLACEHOLDER,
    transcript_template="{{prompt}}",
    assistant_template="{{reference}}",
    system_prompt_template=SYSTEM_PROMPT,
)

VB_MMSU_CONFIG = types.DatasetConfig(
    name="voicebench-mmsu",
    path="hlt-lab/voicebench",
    subset="mmsu",
    splits=[
        types.DatasetSplitConfig(
            name="law", num_samples=51, split=types.DatasetSplit.TEST
        ),
        types.DatasetSplitConfig(
            name="engineering", num_samples=107, split=types.DatasetSplit.TEST
        ),
        types.DatasetSplitConfig(
            name="other", num_samples=546, split=types.DatasetSplit.TEST
        ),
        types.DatasetSplitConfig(
            name="biology", num_samples=172, split=types.DatasetSplit.TEST
        ),
        types.DatasetSplitConfig(
            name="business", num_samples=236, split=types.DatasetSplit.TEST
        ),
        types.DatasetSplitConfig(
            name="economics", num_samples=280, split=types.DatasetSplit.TEST
        ),
        types.DatasetSplitConfig(
            name="health", num_samples=406, split=types.DatasetSplit.TEST
        ),
        types.DatasetSplitConfig(
            name="philosophy", num_samples=305, split=types.DatasetSplit.TEST
        ),
        types.DatasetSplitConfig(
            name="psychology", num_samples=317, split=types.DatasetSplit.TEST
        ),
        types.DatasetSplitConfig(
            name="history", num_samples=104, split=types.DatasetSplit.TEST
        ),
        types.DatasetSplitConfig(
            name="chemistry", num_samples=167, split=types.DatasetSplit.TEST
        ),
        types.DatasetSplitConfig(
            name="physics", num_samples=383, split=types.DatasetSplit.TEST
        ),
    ],
    eval_config=types.EvalConfig(metric="voicebench_mcq"),
    user_template=types.AUDIO_PLACEHOLDER,
    transcript_template="{{prompt}}",
    assistant_template="{{reference}}",
    system_prompt_template=SYSTEM_PROMPT,
)

VB_OPENBOOKQA_CONFIG = types.DatasetConfig(
    name="voicebench-openbookqa",
    path="hlt-lab/voicebench",
    subset="openbookqa",
    splits=[
        types.DatasetSplitConfig(
            name="test", num_samples=455, split=types.DatasetSplit.TEST
        ),
    ],
    eval_config=types.EvalConfig(metric="voicebench_mcq"),
    user_template=types.AUDIO_PLACEHOLDER,
    transcript_template="{{prompt}}",
    assistant_template="{{reference}}",
    system_prompt_template=SYSTEM_PROMPT,
)

VB_SD_QA_CONFIG = types.DatasetConfig(
    name="voicebench-sd-qa",
    path="hlt-lab/voicebench",
    subset="sd-qa",
    splits=[
        types.DatasetSplitConfig(
            name="usa", num_samples=553, split=types.DatasetSplit.TEST
        ),
    ],
    eval_config=types.EvalConfig(
        metric="voicebench_yes_no", args={"evaluator": "gpt-4o-mini"}
    ),
    user_template=types.AUDIO_PLACEHOLDER,
    transcript_template="{{prompt}}",
    assistant_template="{{reference}}",
    system_prompt_template=SYSTEM_PROMPT,
)
# The following splits could be used to create separate test sets in the future
# types.DatasetSplitConfig(name="aus", num_samples=553, split=types.DatasetSplit.TEST)
# types.DatasetSplitConfig(name="gbr", num_samples=553, split=types.DatasetSplit.TEST)
# types.DatasetSplitConfig(name="ind_n", num_samples=553, split=types.DatasetSplit.TEST)
# types.DatasetSplitConfig(name="ind_s", num_samples=553, split=types.DatasetSplit.TEST)
# types.DatasetSplitConfig(name="irl", num_samples=553, split=types.DatasetSplit.TEST)
# types.DatasetSplitConfig(name="kenya", num_samples=553, split=types.DatasetSplit.TEST)
# types.DatasetSplitConfig(name="nga", num_samples=553, split=types.DatasetSplit.TEST)
# types.DatasetSplitConfig(name="nzl", num_samples=553, split=types.DatasetSplit.TEST)
# types.DatasetSplitConfig(name="phl", num_samples=553, split=types.DatasetSplit.TEST)
# types.DatasetSplitConfig(name="zaf", num_samples=553, split=types.DatasetSplit.TEST)

# Datasets without reference answers (use scalar evaluation)
VB_ADVBENCH_CONFIG = types.DatasetConfig(
    name="voicebench-advbench",
    path="hlt-lab/voicebench",
    subset="advbench",
    splits=[
        types.DatasetSplitConfig(
            name="test", num_samples=520, split=types.DatasetSplit.TEST
        ),
    ],
    eval_config=types.EvalConfig(metric="voicebench_harm"),
    user_template=types.AUDIO_PLACEHOLDER,
    transcript_template="{{prompt}}",
    assistant_template=DUMMY_ASSISTANT_TEMPLATE,
    system_prompt_template=SYSTEM_PROMPT,
)

VB_ALPACAEVAL_CONFIG = types.DatasetConfig(
    name="voicebench-alpacaeval",
    path="hlt-lab/voicebench",
    subset="alpacaeval",
    splits=[
        types.DatasetSplitConfig(
            name="test", num_samples=199, split=types.DatasetSplit.TEST
        ),
    ],
    eval_config=types.EvalConfig(
        metric="voicebench_scalar", args={"evaluator": "gpt-4o-mini"}
    ),
    user_template=types.AUDIO_PLACEHOLDER,
    transcript_template="{{prompt}}",
    assistant_template=DUMMY_ASSISTANT_TEMPLATE,
    system_prompt_template=SYSTEM_PROMPT,
)

VB_ALPACAEVAL_FULL_CONFIG = types.DatasetConfig(
    name="voicebench-alpacaeval-full",
    path="hlt-lab/voicebench",
    subset="alpacaeval_full",
    splits=[
        types.DatasetSplitConfig(
            name="test", num_samples=636, split=types.DatasetSplit.TEST
        ),
    ],
    eval_config=types.EvalConfig(
        metric="voicebench_scalar", args={"evaluator": "gpt-4o-mini"}
    ),
    user_template=types.AUDIO_PLACEHOLDER,
    transcript_template="{{prompt}}",
    assistant_template=DUMMY_ASSISTANT_TEMPLATE,
    system_prompt_template=SYSTEM_PROMPT,
)

VB_ALPACAEVAL_SPEAKER_CONFIG = types.DatasetConfig(
    name="voicebench-alpacaeval-speaker",
    path="hlt-lab/voicebench",
    subset="alpacaeval_speaker",
    splits=[
        types.DatasetSplitConfig(
            name="en_AU_Wavenet_A_1.0_0.0_0.0",
            num_samples=636,
            split=types.DatasetSplit.TEST,
        ),
        types.DatasetSplitConfig(
            name="en_AU_Wavenet_B_1.0_0.0_0.0",
            num_samples=636,
            split=types.DatasetSplit.TEST,
        ),
        types.DatasetSplitConfig(
            name="en_IN_Wavenet_A_1.0_0.0_0.0",
            num_samples=636,
            split=types.DatasetSplit.TEST,
        ),
        types.DatasetSplitConfig(
            name="en_IN_Wavenet_B_1.0_0.0_0.0",
            num_samples=636,
            split=types.DatasetSplit.TEST,
        ),
        types.DatasetSplitConfig(
            name="en_GB_Wavenet_A_1.0_0.0_0.0",
            num_samples=636,
            split=types.DatasetSplit.TEST,
        ),
        types.DatasetSplitConfig(
            name="en_GB_Wavenet_B_1.0_0.0_0.0",
            num_samples=636,
            split=types.DatasetSplit.TEST,
        ),
        types.DatasetSplitConfig(
            name="en_US_Wavenet_A_1.0_0.0_0.0",
            num_samples=636,
            split=types.DatasetSplit.TEST,
        ),
        types.DatasetSplitConfig(
            name="en_US_Wavenet_C_1.0_0.0_0.0",
            num_samples=636,
            split=types.DatasetSplit.TEST,
        ),
        types.DatasetSplitConfig(
            name="en_US_Wavenet_A_1.5_0.0_0.0",
            num_samples=636,
            split=types.DatasetSplit.TEST,
        ),
        types.DatasetSplitConfig(
            name="en_US_Wavenet_A_2.0_0.0_0.0",
            num_samples=636,
            split=types.DatasetSplit.TEST,
        ),
        types.DatasetSplitConfig(
            name="en_US_Wavenet_A_0.5_0.0_0.0",
            num_samples=636,
            split=types.DatasetSplit.TEST,
        ),
    ],
    eval_config=types.EvalConfig(
        metric="voicebench_scalar", args={"evaluator": "gpt-4o-mini"}
    ),
    user_template=types.AUDIO_PLACEHOLDER,
    transcript_template="{{prompt}}",
    assistant_template="",
    system_prompt_template=SYSTEM_PROMPT,
)

VB_COMMONEVAL_CONFIG = types.DatasetConfig(
    name="voicebench-commoneval",
    path="hlt-lab/voicebench",
    subset="commoneval",
    splits=[
        types.DatasetSplitConfig(
            name="test", num_samples=200, split=types.DatasetSplit.TEST
        ),
    ],
    eval_config=types.EvalConfig(
        metric="voicebench_scalar", args={"evaluator": "gpt-4o-mini"}
    ),
    user_template=types.AUDIO_PLACEHOLDER,
    transcript_template="{{prompt}}",
    assistant_template=DUMMY_ASSISTANT_TEMPLATE,
    system_prompt_template=SYSTEM_PROMPT,
)

VB_IFEVAL_CONFIG = types.DatasetConfig(
    name="voicebench-ifeval",
    path="hlt-lab/voicebench",
    subset="ifeval",
    splits=[
        types.DatasetSplitConfig(
            name="test", num_samples=345, split=types.DatasetSplit.TEST
        ),
    ],
    eval_config=types.EvalConfig(
        metric="voicebench_ifeval",
        extra_kwargs_map={
            "id": "id",
            "key": "key",
            "instruction_id_list": "instruction_id_list",
            "kwargs": "kwargs",
        },
    ),
    user_template=types.AUDIO_PLACEHOLDER,
    transcript_template="{{prompt}}",
    assistant_template=DUMMY_ASSISTANT_TEMPLATE,
    system_prompt_template=SYSTEM_PROMPT,
)

VB_MTBENCH_CONFIG = types.DatasetConfig(
    name="voicebench-mtbench",
    path="hlt-lab/voicebench",
    subset="mtbench",
    splits=[
        types.DatasetSplitConfig(
            name="test", num_samples=46, split=types.DatasetSplit.TEST
        ),
    ],
    eval_config=types.EvalConfig(
        metric="voicebench_scalar", args={"evaluator": "gpt-4o-mini"}
    ),
    user_template=types.AUDIO_PLACEHOLDER,
    transcript_template="{{prompt}}",
    assistant_template=DUMMY_ASSISTANT_TEMPLATE,
    system_prompt_template=SYSTEM_PROMPT,
)

VB_WILDVOICE_CONFIG = types.DatasetConfig(
    name="voicebench-wildvoice",
    path="hlt-lab/voicebench",
    subset="wildvoice",
    splits=[
        types.DatasetSplitConfig(
            name="test", num_samples=1000, split=types.DatasetSplit.TEST
        ),
    ],
    eval_config=types.EvalConfig(
        metric="voicebench_scalar", args={"evaluator": "gpt-4o-mini"}
    ),
    user_template=types.AUDIO_PLACEHOLDER,
    transcript_template="{{prompt}}",
    assistant_template=DUMMY_ASSISTANT_TEMPLATE,
    system_prompt_template=SYSTEM_PROMPT,
)

configs = [
    # Reference-based datasets (Yes/No evaluation)
    VB_BBH_CONFIG,
    VB_MMSU_CONFIG,
    VB_OPENBOOKQA_CONFIG,
    VB_SD_QA_CONFIG,
    # Open-ended datasets (Scalar evaluation)
    VB_ADVBENCH_CONFIG,
    VB_ALPACAEVAL_CONFIG,
    VB_ALPACAEVAL_FULL_CONFIG,
    VB_ALPACAEVAL_SPEAKER_CONFIG,
    VB_COMMONEVAL_CONFIG,
    VB_IFEVAL_CONFIG,
    VB_MTBENCH_CONFIG,
    VB_WILDVOICE_CONFIG,
]
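
The transcript_template and assistant_template fields above use Jinja-style placeholders ({{prompt}}, {{reference}}) that are filled from the columns of each hlt-lab/voicebench row. A rough, self-contained illustration of that substitution, using plain jinja2 rather than the repository's actual loader code:

import jinja2

# One raw row shaped like the reference-based VoiceBench subsets.
row = {"prompt": "What is the capital of France?", "reference": "Paris"}

env = jinja2.Environment(undefined=jinja2.StrictUndefined)
transcript = env.from_string("{{prompt}}").render(**row)  # known transcript of the audio
expected = env.from_string("{{reference}}").render(**row)  # expected answer for grading
print(f"{transcript!r} -> {expected!r}")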

ultravox/data/data_sample.py

Lines changed: 2 additions & 0 deletions
@@ -112,3 +112,5 @@ def add_past_messages(self, past_messages: List[Dict[str, str]]):
     """For evaluations, the known transcript of the audio."""
     label: Optional[str] = None
     """For evaluations, the label or expected answer of the sample."""
+    extra_kwargs: Optional[Dict[str, Any]] = None
+    """For evaluations, extra columns from the sample."""
