Fix sampling defaults: align with Python's non-greedy config

gianni-cor · gianni-cor · commit bb0eb992e3e2 · 2026-04-18T01:44:17.000+02:00
Symptom: paragraph-length inputs produced a WAV file in which only the
first second had audio and the rest was pure silence. For example, the
input "hello how are you? i am good..." generated 257 speech tokens, of
which 240 were the silence token 4218. The cause: the C++ T3 was
running with top_k=1 (greedy), which on Chatterbox falls into a
silence-token repetition trap as soon as any natural pause is
synthesized.

Align the defaults with ChatterboxTurboTTS.generate() in tts_turbo.py:

                 before (C++)   after (C++, matches Python)
  top_k           1  (greedy)    1000
  top_p           1.0            0.95
  temperature     1.0            0.8
  repeat_penalty  1.0            1.2
  n_predict       256            1000

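Any of these can still be overridden on the CLI. --top-k 1 restores
greedy decoding for debugging; to reproduce the old defaults exactly,
also pass --repeat-penalty 1.0 and --n-predict 256 (top_p and
temperature cannot change the pick once only one candidate is kept).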

Verified on the same input: previously it yielded one 0.5-s window of
speech followed by 19 windows of pure zero RMS; now it has non-trivial
RMS across all 21 windows. Total wav RMS goes from 8.3e-03 to 4.8e-02
and max amplitude from 0.18 to 0.50 on the same prompt. Listening with
afplay confirms normal continuous speech.
diff --git a/src/main.cpp b/src/main.cpp
@@ -150,13 +150,17 @@ struct cli_params {
     bool    dump_tokens_only = false;
     int32_t seed           = 0;
     int32_t n_threads      = std::min(4, (int32_t) std::thread::hardware_concurrency());
-    int32_t n_predict      = 256;
+    int32_t n_predict      = 1000;   // matches Python's default-ish output budget for paragraph-length text
     int32_t n_ctx          = 0;
     int32_t n_gpu_layers   = 0;
-    int32_t top_k          = 1;
-    float   top_p          = 1.0f;
-    float   temp           = 1.0f;
-    float   repeat_penalty = 1.0f;
+    // Sampling defaults matched to ChatterboxTurboTTS.generate() in tts_turbo.py:
+    //   temperature=0.8, top_k=1000, top_p=0.95, repetition_penalty=1.2
+    // The previous greedy defaults (top_k=1) collapse into silence-token
+    // repetition loops on any non-trivial text.
+    int32_t top_k          = 1000;
+    float   top_p          = 0.95f;
+    float   temp           = 0.8f;
+    float   repeat_penalty = 1.2f;
 };
 
 static void print_usage(const char * argv0) {
@@ -179,13 +183,13 @@ static void print_usage(const char * argv0) {
     fprintf(stderr, "                          bit-exact numerical validation (requires --ref-dir).\n");
     fprintf(stderr, "  --seed N                RNG seed (default: 0)\n");
     fprintf(stderr, "  --threads N             CPU threads (default: %d)\n", std::min(4, (int32_t) std::thread::hardware_concurrency()));
-    fprintf(stderr, "  --n-predict N           Max speech tokens (default: 256)\n");
+    fprintf(stderr, "  --n-predict N           Max speech tokens (default: 1000)\n");
     fprintf(stderr, "  --context N             Override KV context length\n");
     fprintf(stderr, "  --n-gpu-layers N        GPU backend when N > 0\n");
-    fprintf(stderr, "  --top-k N               (default: 1)\n");
-    fprintf(stderr, "  --top-p P               (default: 1.0)\n");
-    fprintf(stderr, "  --temp T                (default: 1.0)\n");
-    fprintf(stderr, "  --repeat-penalty R      (default: 1.0)\n");
+    fprintf(stderr, "  --top-k N               (default: 1000, matches Python; use 1 for greedy)\n");
+    fprintf(stderr, "  --top-p P               (default: 0.95)\n");
+    fprintf(stderr, "  --temp T                (default: 0.8)\n");
+    fprintf(stderr, "  --repeat-penalty R      (default: 1.2)\n");
     fprintf(stderr, "  -h, --help\n");
 }