In [1]:
import torch
import torchaudio
from liquid_audio import LFM2AudioModel, LFM2AudioProcessor, ChatState, LFMModality

# Load models
hf_repo = "LiquidAI/LFM2-Audio-1.5B"

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import os
import random
from pathlib import Path
from typing import List, Tuple

from jiwer import wer, cer, Compose, ToLowerCase, RemovePunctuation, RemoveMultipleSpaces, Strip
import whisper

from liquid_audio import ChatState, LFMModality

In [3]:
DATA_ROOT = Path("./data_librispeech")
DATA_ROOT.mkdir(parents=True, exist_ok=True) 
DATA_ROOT = DATA_ROOT.resolve() 
OUTPUT_DIR = Path("./lfm2_bench_outputs")  
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

In [4]:
NUM_SAMPLES = 20
SEED = 42
WHISPER_MODEL_NAME = "base"
GEN_MAX_NEW_TOKENS = 512
AUDIO_TEMPERATURE = 1.0
AUDIO_TOP_K = 4
TARGET_SR = 24_000

In [17]:
def text_normalizer():
    return Compose([
        ToLowerCase(),
        RemovePunctuation(),
        RemoveMultipleSpaces(),
        Strip(),
    ])

def save_wav(path: Path, wav: torch.Tensor, sr: int):
    """
    torchaudio.save는 [T] 또는 [C, T]만 허용.
    - [B, C, T] → 배치 차원 제거
    - [T, C] → [C, T]로 전치
    - [T]  → [1, T]로 승격
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    wav = wav.detach().cpu()

    if wav.ndim == 3:           # [B, C, T] or [B, T, C] 방지
        wav = wav.squeeze(0)    # -> 보통 [C, T] 또는 [T, C]
    if wav.ndim == 2:
        # time 축이 더 길다고 가정해 [C, T]로 맞춤
        if wav.shape[0] > wav.shape[1]:  # [T, C]로 추정
            wav = wav.transpose(0, 1)    # -> [C, T]
    elif wav.ndim == 1:
        wav = wav.unsqueeze(0)  # -> [1, T] (mono)
    else:
        raise ValueError(f"Expected 1D or 2D (or squeezable 3D) tensor, got {wav.ndim}D tensor")

    torchaudio.save(str(path), wav, sr)

def set_seed(seed: int):
    random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

In [18]:
def load_librispeech_subset(root: Path, num_samples: int, seed: int = 42):
    """
    LibriSpeech test-clean에서 num_samples개만 샘플링하여 (waveform, sample_rate, transcript, utt_id) 리스트 반환
    """
    from torchaudio.datasets import LIBRISPEECH
    ds = LIBRISPEECH(root=str(root), url="test-clean", download=True)

    # 무작위 샘플 뽑기
    set_seed(seed)
    indices = list(range(len(ds)))
    random.shuffle(indices)
    pick = indices[:num_samples]

    items = []
    for i in pick:
        # item: (waveform, sample_rate, transcript, speaker_id, chapter_id, utterance_id)
        waveform, sr, transcript, spk, chap, utt = ds[i]
        utt_id = f"{spk}-{chap}-{utt}"
        items.append((waveform, sr, transcript, utt_id))
    return items

In [19]:
def load_models(hf_repo: str, device: str = None):
    device = device or ("cuda" if torch.cuda.is_available() else "cpu")
    processor = LFM2AudioProcessor.from_pretrained(hf_repo).eval()
    model = LFM2AudioModel.from_pretrained(hf_repo).to(device).eval()
    return processor, model, device

In [20]:
def load_whisper(name: str):
    # openai-whisper
    return whisper.load_model(name)

In [21]:
@torch.no_grad()
def generate_echo_audio(processor, model, device, waveform: torch.Tensor, sr: int):
    """
    입력 waveform(sr)을 넣고, 시스템 프롬프트로 '사용자 음성을 그대로, 오디오로만 반복'을 지시.
    LFM2가 낸 오디오 토큰을 Mimi 디코더로 복원하여 (wav_24k, 24000) 반환.
    """
    # 안전용: 입력 텐서를 모델 디바이스로 보낼 필요는 없음(채팅 상태는 CPU 텐서로도 동작)
    chat = ChatState(processor)
    chat.new_turn("system")
    chat.add_text("Respond in AUDIO ONLY. Repeat the user's speech verbatim without adding or removing words.")
    chat.end_turn()

    chat.new_turn("user")
    chat.add_audio(waveform, sr)  # torchaudio.load 그대로 사용
    chat.end_turn()

    chat.new_turn("assistant")

    audio_tokens = []
    for t in model.generate_interleaved(
        **chat,
        max_new_tokens=GEN_MAX_NEW_TOKENS,
        audio_temperature=AUDIO_TEMPERATURE,
        audio_top_k=AUDIO_TOP_K,
    ):
        if t.numel() == 1:
            # 혹시 나오는 텍스트 토큰은 무시
            continue
        audio_tokens.append(t)

    # 오디오 토큰이 없거나(e.g., 실패) EOS 1개만 나온 경우 방어
    if len(audio_tokens) < 2:
        return None, None

    # 마지막 EOS 토큰 제거
    # NOTE: mimi.decode는 CPU 텐서여도 동작하므로 굳이 model.device로 옮길 필요 없음
    mimi_codes = torch.stack(audio_tokens[:-1], dim=1).unsqueeze(0)  # [1, T, code_dim]
    wav_24k = processor.mimi.decode(mimi_codes)[0]  # [T]
    return wav_24k, TARGET_SR


In [22]:
def whisper_transcribe(whisper_model, wav: torch.Tensor, sr: int) -> str:
    """
    Whisper로 (wav, sr)을 받아 적는다. 임시 wav 파일로 저장 후 transcribe 사용.
    """
    tmp_path = OUTPUT_DIR / "_tmp.wav"
    save_wav(tmp_path, wav, sr)
    # Whisper는 파일 경로 입력을 주로 사용
    result = whisper_model.transcribe(str(tmp_path), language="en")
    hyp = result.get("text", "").strip()
    try:
        tmp_path.unlink()
    except Exception:
        pass
    return hyp

In [23]:
def main():
    set_seed(SEED)

    print("Loading dataset (LibriSpeech test-clean, 20 samples)...")
    items = load_librispeech_subset(DATA_ROOT, NUM_SAMPLES, SEED)

    print("Loading LFM2 and Whisper...")
    processor, model, device = load_models(hf_repo)
    asr = load_whisper(WHISPER_MODEL_NAME)

    tn = text_normalizer()

    wer_list = []
    cer_list = []

    for idx, (waveform, sr, ref_text, utt_id) in enumerate(items, 1):
        print(f"[{idx}/{len(items)}] {utt_id}")

        # 1) LFM2로 오디오→오디오 (echo) 생성
        gen_wav, gen_sr = generate_echo_audio(processor, model, device, waveform, sr)
        if gen_wav is None:
            print("  - No audio generated. Skipping.")
            continue

        # 2) 생성 오디오를 저장 (옵션)
        out_path = OUTPUT_DIR / f"{utt_id}_gen.wav"
        save_wav(out_path, gen_wav, gen_sr) 

        # 3) Whisper로 생성 오디오 전사
        hyp_text = whisper_transcribe(asr, gen_wav, gen_sr)

        # 4) 정규화 후 WER/CER 계산
        ref_norm = tn(ref_text)
        hyp_norm = tn(hyp_text)

        s_wer = wer(ref_norm, hyp_norm)
        s_cer = cer(ref_norm, hyp_norm)

        wer_list.append(s_wer)
        cer_list.append(s_cer)

        print(f"  - REF: {ref_text[:120]}{'...' if len(ref_text)>120 else ''}")
        print(f"  - HYP: {hyp_text[:120]}{'...' if len(hyp_text)>120 else ''}")
        print(f"  - WER: {s_wer:.3f}, CER: {s_cer:.3f}")

    if wer_list:
        avg_wer = sum(wer_list) / len(wer_list)
        avg_cer = sum(cer_list) / len(cer_list)
        print("\n==== Summary (Echo Audio-to-Audio, 20 samples) ====")
        print(f"Avg WER: {avg_wer:.3f}")
        print(f"Avg CER: {avg_cer:.3f}")
        print(f"Saved generated audio to: {OUTPUT_DIR.resolve()}")
    else:
        print("No results to summarize (no audio generated or an error occurred).")

if __name__ == "__main__":
    main()

Loading dataset (LibriSpeech test-clean, 20 samples)...
Loading LFM2 and Whisper...
[1/20] 4970-29093-15
  - REF: YOU CAN BEGIN BY CARRYING A ROD AND PUTTING DOWN THE FIGURES
  - HYP: You can tell why and telling tall, tall pies, then tall pies.
  - WER: 0.833, CER: 0.617
[2/20] 5639-40744-30
  - REF: JUST THEN LEOCADIA CAME TO HERSELF AND EMBRACING THE CROSS SEEMED CHANGED INTO A SEA OF TEARS AND THE GENTLEMAN REMAINED...
  - HYP: is that
  - WER: 1.000, CER: 0.979
[3/20] 8555-292519-11
  - REF: HE HAD GOT INTO HER COURTYARD
  - HYP: 
  - WER: 1.000, CER: 1.000
[4/20] 121-127105-34
  - REF: IT SOUNDED DULL IT SOUNDED STRANGE AND ALL THE MORE SO BECAUSE OF HIS MAIN CONDITION WHICH WAS
  - HYP: One, two, three, four, one, one, one.
  - WER: 1.000, CER: 0.787
[5/20] 8555-284447-10


/pytorch/aten/src/ATen/native/cuda/IndexKernelUtils.cu:16: vectorized_gather_kernel: block: [24,0,0], thread: [32,0,0] Assertion `ind >=0 && ind < ind_dim_size && "vectorized gather kernel index out of bounds"` failed.
/pytorch/aten/src/ATen/native/cuda/IndexKernelUtils.cu:16: vectorized_gather_kernel: block: [24,0,0], thread: [33,0,0] Assertion `ind >=0 && ind < ind_dim_size && "vectorized gather kernel index out of bounds"` failed.
/pytorch/aten/src/ATen/native/cuda/IndexKernelUtils.cu:16: vectorized_gather_kernel: block: [24,0,0], thread: [34,0,0] Assertion `ind >=0 && ind < ind_dim_size && "vectorized gather kernel index out of bounds"` failed.
/pytorch/aten/src/ATen/native/cuda/IndexKernelUtils.cu:16: vectorized_gather_kernel: block: [24,0,0], thread: [35,0,0] Assertion `ind >=0 && ind < ind_dim_size && "vectorized gather kernel index out of bounds"` failed.
/pytorch/aten/src/ATen/native/cuda/IndexKernelUtils.cu:16: vectorized_gather_kernel: block: [24,0,0], thread: [36,0,0] Asser

AcceleratorError: CUDA error: device-side assert triggered
Search for `cudaErrorAssert' in https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__TYPES.html for more information.
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [28]:
!git add untitled2.ipynb

fatal: not a git repository (or any parent up to mount point /root)
Stopping at filesystem boundary (GIT_DISCOVERY_ACROSS_FILESYSTEM not set).


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
