In [1]:
from torchaudio.datasets import LJSPEECH
from IPython.display import Audio

dataset = LJSPEECH(root="data", download=True)
waveform, sample_rate, transcript, normalized = dataset[4]

print(transcript)
wav_ref = waveform.squeeze().numpy()
Audio(wav_ref, rate=sample_rate)

the invention of movable metal letters in the middle of the fifteenth century may justly be considered as the invention of the art of printing.


In [2]:
import torch
from transformers import CsmForConditionalGeneration, AutoProcessor

model_id = "sesame/csm-1b"
device = "cuda" if torch.cuda.is_available() else "cpu"

processor = AutoProcessor.from_pretrained(model_id)
model = CsmForConditionalGeneration.from_pretrained(model_id, device_map=device)

  from .autonotebook import tqdm as notebook_tqdm
Loading checkpoint shards: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:01<00:00,  1.47it/s]


In [3]:
import os, random

save_dir = "tts_out"
os.makedirs(save_dir, exist_ok=True)

random.seed(42)

N = len(dataset)
pick = random.sample(range(N), k=min(20, N))

In [4]:
for i in pick:
    trnascript = dataset[i]
    text = transcript
    
    conversation = [
        {
            "role": "0", 
            "content": [{"type": "text", "text": text}],
        },
    ]

    inputs = processor.apply_chat_template(
        conversation,
        tokenize=True,
        return_dict=True,
    ).to(device)
    
    audio = model.generate(**inputs, output_audio=True)
    out_path = os.path.join(save_dir, f"sample_{i:05d}.wav")
    processor.save_audio(audio, out_path)
    print(f"success to make sample_{i:05d}.wav")

success to make sample_10476.wav
success to make sample_01824.wav
success to make sample_00409.wav
success to make sample_12149.wav
success to make sample_04506.wav
success to make sample_04012.wav
success to make sample_03657.wav
success to make sample_02286.wav
success to make sample_12066.wav
success to make sample_01679.wav
success to make sample_11087.wav
success to make sample_12135.wav
success to make sample_08935.wav
success to make sample_01424.wav
success to make sample_09674.wav
success to make sample_06912.wav
success to make sample_00520.wav
success to make sample_00488.wav
success to make sample_01535.wav
success to make sample_03582.wav


In [5]:
sample_rate = 25000
#wav_np = (audio[0] if isinstance(audio, (list, tuple)) else audio).detach().cpu().numpy()
files = [f for f in os.listdir(save_dir) if f.endswith(".wav")]
wav_path = os.path.join(save_dir, files[6])
Audio(wav_path)

In [6]:
import numpy as np, librosa, pandas as pd

def f0_rmse_refarray(ref_np, ref_sr, gen_wav_path, fmin=50.0, fmax=600.0, hop_length=256):
    y_gen, sr_gen = librosa.load(gen_wav_path, sr=ref_sr, mono=True)
    f0_ref, _, _ = librosa.pyin(ref_np, fmin=fmin, fmax=fmax, sr=ref_sr, hop_length=hop_length)
    f0_gen, _, _ = librosa.pyin(y_gen,  fmin=fmin, fmax=fmax, sr=ref_sr, hop_length=hop_length)
    f0_ref = pd.Series(f0_ref).interpolate(limit_direction="both").to_numpy()
    f0_gen = pd.Series(f0_gen).interpolate(limit_direction="both").to_numpy()
    L = min(len(f0_ref), len(f0_gen))
    return float(np.sqrt(np.mean((f0_ref[:L] - f0_gen[:L])**2))) if L else float("nan")


In [8]:
for i in range(len(pick)):
    j = pick[i]          
    gen_path = os.path.join(save_dir, f"sample_{j:05d}.wav")

    waveform, sr_ref, _, _ = dataset[j]
    ref_np = waveform.squeeze().numpy()

    rmse = f0_rmse_refarray(ref_np, sr_ref, gen_path)
    print(f"[{j}] F0 RMSE: {rmse:.2f} Hz")

[10476] F0 RMSE: 94.91 Hz
[1824] F0 RMSE: 64.35 Hz
[409] F0 RMSE: 76.72 Hz
[12149] F0 RMSE: 68.77 Hz
[4506] F0 RMSE: 97.27 Hz
[4012] F0 RMSE: 117.18 Hz
[3657] F0 RMSE: 58.96 Hz
[2286] F0 RMSE: 96.88 Hz
[12066] F0 RMSE: 83.36 Hz
[1679] F0 RMSE: 140.55 Hz
[11087] F0 RMSE: 111.44 Hz
[12135] F0 RMSE: 145.05 Hz
[8935] F0 RMSE: 90.57 Hz
[1424] F0 RMSE: 68.63 Hz
[9674] F0 RMSE: 58.73 Hz
[6912] F0 RMSE: 149.16 Hz
[520] F0 RMSE: 103.73 Hz
[488] F0 RMSE: 155.63 Hz
[1535] F0 RMSE: 114.60 Hz
[3582] F0 RMSE: 77.20 Hz
