In [None]:
import sys
import tempfile
import os
import torch
from zipvoice.bin.infer_zipvoice import main as base_infer
from dataclasses import dataclass
from typing import Generator

@dataclass
class StreamArgs:
    model_name: str = "zipvoice"
    model_dir: str = "exp/zipvoice"
    checkpoint_name: str = "iter-60000-avg-2.pt"
    device: str = "cuda" if torch.cuda.is_available() else "cpu"
    num_steps: int = 8
    speed: float = 1.0

def stream_generate(
    prompt_wav: str,
    prompt_text: str,
    text_segments: list,
    args: StreamArgs
) -> Generator[bytes, None, None]:
    with tempfile.TemporaryDirectory() as tmp_dir:
        for i, segment in enumerate(text_segments):
            # 1. 生成单段 TSV 文件
            tsv_path = os.path.join(tmp_dir, f"segment_{i}.tsv")
            with open(tsv_path, "w", encoding="utf-8") as f:
                f.write(f"seg_{i}\t{prompt_text}\t{prompt_wav}\t{segment}\n")

            # 2. 构造命令行参数列表（模拟命令行输入）
            original_argv = sys.argv.copy()  # 保存原始命令行参数
            sys.argv = [
                "infer_zipvoice.py",  # 第一个参数固定为脚本名（无实际意义）
                f"--model-name={args.model_name}",
                f"--model-dir={args.model_dir}",
                f"--checkpoint-name={args.checkpoint_name}",
                f"--device={args.device}",
                f"--num-steps={args.num_steps}",
                f"--speed={args.speed}",
                f"--test-list={tsv_path}",
                f"--res-dir={tmp_dir}"
            ]

            # 3. 调用 main 函数（此时 argparse 会解析我们构造的 sys.argv）
            base_infer()

            # 4. 恢复原始 sys.argv，避免影响其他调用
            sys.argv = original_argv

            # 5. 读取并返回当前段音频
            wav_path = os.path.join(tmp_dir, f"seg_{i}.wav")
            with open(wav_path, "rb") as f:
                yield f.read()

SyntaxError: invalid syntax (1646824964.py, line 1)

In [3]:
import sys
sys.path.append("D:/work/tts/ZipVoice")
python3 -m zipvoice.bin.infer_zipvoice --help


SyntaxError: invalid syntax (2781513357.py, line 3)

In [6]:
from kittentts import KittenTTS
m = KittenTTS("KittenML/kitten-tts-nano-0.2")

audio = m.generate('是的，前面提到的 Step-Audio 2 mini、Xiaomi-MiMo-Audio 等模型相对来说参数规模较', voice='expr-voice-2-f' )

# available_voices : [  'expr-voice-2-m', 'expr-voice-2-f', 'expr-voice-3-m', 'expr-voice-3-f',  'expr-voice-4-m', 'expr-voice-4-f', 'expr-voice-5-m', 'expr-voice-5-f' ]

# Save the audio
import soundfile as sf
sf.write('output.wav', audio, 24000)
