
# Anime-Llasa-3B — HF Space準拠 /tmp完結ノートブック（**v6：EOS即終了対策版**）
- HF Space相当（ASR→参照プレフィックス→生成→XCodec2復号）
- **/tmpのみ**使用、**Agg**固定、**torch 2.4.1+cu121**
- **hf_transfer**：カーネル＆venv両対応＋**フォールバック**（未導入時は通常DL）
- **Audio(type="numpy")**（UIに確実表示）＋ **Debug Log**（詳細ログ）＋ テストトーン**フォールバック**
- **Whisper** の **task/language 明示**（挙動固定）
- **CUDA 12.x 自動整合**（nvJitLink/cusparse のシンボル不整合に対応）
- **share=True**（Ports不要）
- **LOCAL_ONLY_MODE=1** で **ローカルモデル強制**が既定（必要なら 0 に）
- **EOS即終了対策**：`min_new_tokens` 導入＋**Ignore EOS** トグル＋**PADトークン自動追加**

**実行順序：0 → 1 → 2 → 3 → 4 → 4.5 → 4.9 → 5 → 5.1 → 6 → 7**


## 0️⃣ Matplotlib backend を Agg 固定（バックエンドエラー回避）

In [None]:

import os

# Export the backend choice *before* matplotlib is imported: matplotlib reads
# MPLBACKEND while it is being imported, so an unusable value inherited from
# the environment could otherwise make the import itself fail.
os.environ["MPLBACKEND"] = "Agg"

import matplotlib

# Also select the backend explicitly, in case matplotlib was already imported
# by an earlier cell with a different backend.
matplotlib.use("Agg")
print("matplotlib backend:", matplotlib.get_backend())


## 1️⃣ /tmp 構成＆環境変数（/dev/shmは使いません）

In [None]:

import os
from pathlib import Path

# Everything the notebook writes lives below this single /tmp directory.
base = Path("/tmp/llasa_space")
for sub in ("hf", ".cache", "tmp", "outputs", "app"):
    base.joinpath(sub).mkdir(parents=True, exist_ok=True)

# Environment exported to later cells and to the processes they spawn.
env_settings = {
    "HF_HOME": str(base / "hf"),
    "HUGGINGFACE_HUB_CACHE": str(base / "hf"),
    "TRANSFORMERS_CACHE": str(base / "hf"),
    "XDG_CACHE_HOME": str(base / ".cache"),
    "TMPDIR": str(base / "tmp"),
    "LLASA_OUT": str(base / "outputs"),
    # Fast downloads (steps 4 and 4.9 install hf_transfer for venv and kernel).
    "HF_HUB_ENABLE_HF_TRANSFER": "1",
    # Force local model files (change to "0" if needed).
    "LOCAL_ONLY_MODE": "1",
}
os.environ.update(env_settings)

print({key: os.environ[key] for key in env_settings})


## 2️⃣ GPU / /tmp 空き容量チェック

In [None]:

# Best-effort GPU report: torch may not be installed in the kernel yet, so any
# failure here is only printed.
try:
    import torch
    has_cuda = torch.cuda.is_available()
    print("CUDA available:", has_cuda)
    if has_cuda:
        print("GPU:", torch.cuda.get_device_name(0))
except Exception as e:
    print("torch not installed yet:", e)

import shutil
def human(n):
    """Format a byte count *n* with one decimal and a 1024-based unit (B..PB)."""
    units = ('B', 'KB', 'MB', 'GB', 'TB')
    idx = 0
    while idx < len(units):
        if n < 1024:
            return f"{n:.1f}{units[idx]}"
        n /= 1024
        idx += 1
    # Anything that survived five divisions is reported in petabytes.
    return f"{n:.1f}PB"
# Report free and total space of the filesystem that holds /tmp.
disk = shutil.disk_usage("/tmp")
print("/tmp free:", human(disk.free), "total:", human(disk.total))


## 3️⃣ OS依存（ffmpeg, git）

In [None]:

# Refresh the apt package index, then install the OS-level tools this notebook
# relies on (ffmpeg and git). Both lines are IPython shell escapes.
!sudo apt-get update -y
!sudo apt-get install -y ffmpeg git


## 4️⃣ venv作成＆依存導入（torch 2.4.1 + cu121）＋ **venv側に hf_transfer**

In [None]:

# Create the virtualenv that will run the Gradio app; the two variables below
# are expanded by IPython in the `!$VENV_PIP ...` shell escapes.
!python3 -m venv /tmp/llasa_space/app/venv
VENV_PIP="/tmp/llasa_space/app/venv/bin/pip"
VENV_PY="/tmp/llasa_space/app/venv/bin/python"

# Drop any torch stack already in the venv; `|| true` keeps the cell going
# when nothing is installed yet.
!$VENV_PIP uninstall -y torch torchvision torchaudio || true
!$VENV_PIP install -U pip wheel setuptools

# PyTorch 2.4.1 + cu121 (known-good configuration)
!$VENV_PIP install --index-url https://download.pytorch.org/whl/cu121 \
  torch==2.4.1 torchvision==0.19.1 torchaudio==2.4.1

# Runtime dependencies
!$VENV_PIP install "transformers>=4.43.0" accelerate gradio==4.44.0 soundfile numpy scipy huggingface_hub==0.24.6
# Codec (the Space is assumed to use 0.1.3)
!$VENV_PIP install xcodec2==0.1.3
# Install hf_transfer in the venv too (needed for downloads at app runtime)
!$VENV_PIP install -U hf_transfer

print("Base installs (incl. hf_transfer in venv) completed.")


## 4.5️⃣ CUDA 12.x マイナー不整合の自動整合（nvJitLink/cusparse 等）

In [None]:

import os, re, subprocess

# Executables of the app virtualenv, both derived from its bin directory.
VENV_BIN = "/tmp/llasa_space/app/venv/bin"
VENV_PIP = f"{VENV_BIN}/pip"
VENV_PY = f"{VENV_BIN}/python"

def set_ld_library_path(venv_root="/tmp/llasa_space/app/venv"):
    """Prepend the venv's bundled NVIDIA library directories to LD_LIBRARY_PATH.

    The value is stored in ``os.environ`` so that processes started afterwards
    (such as the import probe run through the venv interpreter) inherit it.

    Args:
        venv_root: Root directory of the virtualenv to scan. Every
            ``lib/python3*/site-packages/nvidia`` directory below it is
            considered, so the result does not depend on the exact Python
            minor version the venv was created with.

    Returns:
        The new LD_LIBRARY_PATH value (``""`` when nothing was found and no
        value was inherited).
    """
    import glob  # local import: the cell's import line does not provide glob

    cuda_dirs = []
    pattern = os.path.join(venv_root, "lib", "python3*", "site-packages", "nvidia")
    for site in sorted(glob.glob(pattern)):
        for sub in ["cuda_runtime", "cublas", "cusparse", "cudnn", "nvjitlink", "cuda_nvrtc"]:
            p = os.path.join(site, sub, "lib")
            if os.path.isdir(p):
                cuda_dirs.append(p)

    # Keep whatever was already configured, but drop empty segments (an empty
    # entry makes the loader search the current directory) and duplicates, so
    # calling this function repeatedly does not grow the variable.
    inherited = [p for p in os.environ.get("LD_LIBRARY_PATH", "").split(":") if p]
    ld = ":".join(dict.fromkeys(cuda_dirs + inherited))
    os.environ["LD_LIBRARY_PATH"] = ld
    return ld

def install_cuda_minor(minor:str):
    """Install/upgrade the CUDA 12.<minor> wheel set into the venv with pip.

    The pip exit status is not checked (``check=False``); the caller decides
    whether the install helped by re-running the import probe.
    """
    # NOTE(review): every component is pinned to "12.<minor>.*"; confirm that
    # each wheel actually versions itself after the CUDA minor release.
    pinned = ("cuda-nvrtc", "cuda-runtime", "cublas", "cusparse", "nvjitlink")
    requirements = [f"nvidia-{component}-cu12==12.{minor}.*" for component in pinned]
    requirements.append("nvidia-cudnn-cu12>=9.0.0")
    print(f"Installing CUDA libs for 12.{minor}.* ...")
    subprocess.run([VENV_PIP, "install", "-U", *requirements], check=False)

def try_import():
    """Import torch and torchaudio in the venv interpreter.

    Returns:
        ``(returncode, output)`` where *output* is the probe's stdout followed
        by its stderr.
    """
    probe = (
        "import os; print('LD=',os.environ.get('LD_LIBRARY_PATH','')[:200]); "
        "import torch, torchaudio; "
        "print('OK', torch.__version__, torch.version.cuda, torch.cuda.is_available(), torchaudio.__version__)"
    )
    proc = subprocess.run([VENV_PY, "-c", probe], capture_output=True, text=True)
    return proc.returncode, proc.stdout + proc.stderr

def parse_needed_minor(msg:str):
    """Extract the CUDA 12 minor version named by a versioned nvJitLink symbol.

    Matches any symbol of the form ``__nvJitLink<Name>_12_<minor>`` (for
    example ``__nvJitLinkAddData_12_1``), not only the ``AddData`` and
    ``Complete`` entry points.

    Args:
        msg: Text to search, typically the output of a failed import.

    Returns:
        The minor version as a string (e.g. ``"4"``), or ``None`` when *msg*
        contains no such symbol.
    """
    m = re.search(r"__nvJitLink[A-Za-z]+_12_(\d+)", msg)
    return m.group(1) if m else None

# Probe once with the libraries already in the venv; only when the import
# fails are other CUDA 12 minor releases installed and re-probed.
set_ld_library_path()
rc, out = try_import()
print(out)
if rc == 0:
    print("✔ CUDA libs already consistent.")
else:
    need = parse_needed_minor(out)
    # The minor named by the error (if any) goes first, followed by the fixed
    # fallback order; dict.fromkeys removes the duplicate while keeping order.
    candidates = list(dict.fromkeys(([need] if need else []) + ["5", "4", "3", "2", "1"]))
    for minor in candidates:
        install_cuda_minor(minor)
        set_ld_library_path()
        rc, out = try_import()
        print(out)
        if rc == 0:
            print(f"✔ Fixed by installing cu12.{minor} libs.")
            break
    else:
        # Reached only when no candidate produced a working import.
        print("❌ Import still failing. Please copy the above logs and share.")


## 4.9️⃣ カーネル側に `hf_transfer` を自動導入（高速DLの有効化）

In [None]:

import os, sys, subprocess
# hf_transfer is only needed in the kernel when fast downloads are enabled.
if os.environ.get("HF_HUB_ENABLE_HF_TRANSFER", "0") != "1":
    print("HF_HUB_ENABLE_HF_TRANSFER is not 1; skipping.")
else:
    try:
        import hf_transfer  # noqa: F401
        print("hf_transfer already available (kernel).")
    except Exception:
        print("Installing hf_transfer into the **kernel** Python...")
        pip_cmd = [sys.executable, "-m", "pip", "install", "-q", "hf_transfer"]
        subprocess.check_call(pip_cmd)
        # Import again so a broken install surfaces in this cell.
        import hf_transfer  # noqa: F401
        print("hf_transfer installed (kernel).")


## 5️⃣ モデルDL（Anime-Llasa-3B / Anime-XCodec2 / Whisper Turbo を /tmp に）

In [None]:

from huggingface_hub import snapshot_download
from pathlib import Path
import os

BASE = "/tmp/llasa_space/hf/models"
Path(BASE).mkdir(parents=True, exist_ok=True)

# Local folder name -> Hugging Face repo id.
targets = {
    "Anime-Llasa-3B": "NandemoGHS/Anime-Llasa-3B",
    "Anime-XCodec2": "NandemoGHS/Anime-XCodec2",
    "Whisper-Large-V3-Turbo": "openai/whisper-large-v3-turbo",
}
for name, repo in targets.items():
    out = f"{BASE}/{name}"
    # NOTE(review): a folder that merely exists is treated as complete, so a
    # previously interrupted download is not resumed here — step 5.1 checks
    # only a few marker files.
    if os.path.isdir(out):
        print("Exists:", out)
        continue
    print("Downloading", repo, "->", out)
    snapshot_download(
        repo_id=repo,
        local_dir=out,
        local_dir_use_symlinks=False,
        resume_download=True,
        max_workers=8,
    )
print("Models ready.")


## 5.1️⃣ モデル配置の検証（必要ファイルの有無チェック）

In [None]:

from pathlib import Path

BASE = "/tmp/llasa_space/hf/models"
# Short label -> local model directory checked below.
paths = {
    label: f"{BASE}/{folder}"
    for label, folder in (
        ("LLASA", "Anime-Llasa-3B"),
        ("XCODEC2", "Anime-XCodec2"),
        ("WHISPER", "Whisper-Large-V3-Turbo"),
    )
}

def must(p, names):
    """Return the entries of *names* that do not exist under directory *p*."""
    root = Path(p)
    missing = []
    for name in names:
        if not (root / name).exists():
            missing.append(name)
    return missing

# Marker files expected in each model directory.
must_llasa = ["config.json", "generation_config.json", "tokenizer_config.json"]
must_xcodec = ["config.json"]
must_whisper = ["config.json", "preprocessor_config.json"]

for label, required in (("LLASA", must_llasa), ("XCODEC2", must_xcodec), ("WHISPER", must_whisper)):
    print(f"{label} missing:", must(paths[label], required))

# One summary line per model: location, existence, number of top-level entries.
for label, location in paths.items():
    folder = Path(location)
    entry_count = sum(1 for _ in folder.glob('*'))
    print(f"{label}: {folder} exists={folder.exists()} files={entry_count}")


## 6️⃣ Gradio アプリ配置（v6：min_new_tokens＋Ignore EOS＋PAD追加）

In [None]:

app_code = r'''
import os, re, time
# ---- Guard: turn fast downloads off when hf_transfer is missing in the runtime (venv) ----
try:
    import hf_transfer  # noqa: F401
    _HF_TRANSFER_MISSING = False
except Exception:
    _HF_TRANSFER_MISSING = True
if _HF_TRANSFER_MISSING:
    os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "0"
# ------------------------------------------------------------------

import numpy as np
import torch, torchaudio, soundfile as sf
import gradio as gr
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from xcodec2.modeling_xcodec2 import XCodec2Model

os.environ.setdefault("MPLBACKEND","Agg")

# Local snapshot directories written by the download step of the notebook.
HF_BASE="/tmp/llasa_space/hf/models"
LLASA_DIR=f"{HF_BASE}/Anime-Llasa-3B"
XCODEC_DIR=f"{HF_BASE}/Anime-XCodec2"
WHISPER_DIR=f"{HF_BASE}/Whisper-Large-V3-Turbo"
# Generated wav files are written here; LOCAL_ONLY is later passed to the
# loaders as local_files_only.
OUT_DIR=os.environ.get("LLASA_OUT","/tmp/llasa_space/outputs")
LOCAL_ONLY = os.environ.get("LOCAL_ONLY_MODE","1") == "1"
os.makedirs(OUT_DIR, exist_ok=True)

DEVICE="cuda" if torch.cuda.is_available() else "cpu"
_BF16_OK = DEVICE=="cuda" and torch.cuda.is_bf16_supported()
print(f"[Device] {DEVICE}  bf16={_BF16_OK}")
# bf16 where the GPU supports it, fp16 on other GPUs, and full precision on
# CPU — the same CPU choice the Whisper pipeline in this file makes.
if _BF16_OK:
    DTYPE = torch.bfloat16
elif DEVICE=="cuda":
    DTYPE = torch.float16
else:
    DTYPE = torch.float32

def _test_tone(seconds=1.0, sr=16000, freq=1000.0):
    """Return ``(sr, samples)`` for a float32 sine of *freq* Hz lasting *seconds*."""
    n_samples = int(sr * seconds)
    timeline = np.linspace(0, seconds, n_samples, endpoint=False, dtype=np.float32)
    omega = 2 * np.pi * freq
    samples = np.sin(omega * timeline).astype("float32")
    return (sr, samples)

def _npinfo(x):
    """Describe *x* for logs: shape/dtype (plus min/max when computable) for
    ndarrays, the type name for anything else."""
    if not isinstance(x, np.ndarray):
        return str(type(x))
    summary = f"ndarray shape={x.shape} dtype={x.dtype}"
    try:
        return f"{summary} min={x.min():.4f} max={x.max():.4f}"
    except Exception:
        # min/max are unavailable (e.g. for an empty array): report shape/dtype only.
        return summary

def _path_or_id(local_dir, repo_id):
    """Return *local_dir* when it is an existing directory, otherwise *repo_id*.

    The LOCAL_ONLY setting does not influence the result; it takes effect only
    through the ``local_files_only`` argument the loaders receive.
    """
    if os.path.isdir(local_dir):
        return local_dir
    return repo_id

# Loaders (mirroring the HF Space). Each source resolves to the local snapshot
# directory when it exists and to the hub repo id otherwise; whether the hub
# may be contacted is then decided by local_files_only=LOCAL_ONLY.
llasa_src = _path_or_id(LLASA_DIR, "NandemoGHS/Anime-Llasa-3B")
xcodec_src = _path_or_id(XCODEC_DIR, "NandemoGHS/Anime-XCodec2")
whisp_src  = _path_or_id(WHISPER_DIR, "openai/whisper-large-v3-turbo")

tokenizer = AutoTokenizer.from_pretrained(llasa_src, local_files_only=LOCAL_ONLY)
print("[LLASA Tokenizer]", llasa_src)
model = AutoModelForCausalLM.from_pretrained(
    llasa_src, trust_remote_code=True, torch_dtype=DTYPE, local_files_only=LOCAL_ONLY
).to(DEVICE).eval()
print("[LLASA Model]", llasa_src)

# If the tokenizer has no PAD token, add one and resize the model embeddings
# to the new vocabulary size.
if tokenizer.pad_token_id is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    model.resize_token_embeddings(len(tokenizer))
# PAD id remembered at load time; infer() passes it on as pad_id_hint.
pad_id_force = tokenizer.pad_token_id

codec_model = XCodec2Model.from_pretrained(xcodec_src, local_files_only=LOCAL_ONLY).to(DEVICE).eval()
print("[XCodec2 Model]", xcodec_src)

# Whisper pipeline (generate_kwargs are supplied per call, not here).
# NOTE(review): unlike the loaders above, this call does not pass
# local_files_only — confirm that is intended when LOCAL_ONLY is set.
whisper_pipe = pipeline(
    "automatic-speech-recognition",
    model=whisp_src,
    torch_dtype=torch.float16 if DEVICE=="cuda" else torch.float32,
    device=0 if DEVICE=="cuda" else -1
)
print("[Whisper Model]", whisp_src)

# Everything outside this whitelist (kana, CJK ideographs, ASCII letters and
# digits, and a few punctuation marks/symbols) is removed by normalize().
INVALID_PATTERN = re.compile(r"[^\u3040-\u309F\u30A0-\u30FF\u4E00-\u9FFF\u3400-\u4DBF\u3005\u0041-\u005A\u0061-\u007A\u0030-\u0039。、!?…♪♡○]")
def normalize(text: str) -> str:
    """Clean *text* for synthesis.

    Steps, in order: drop tabs, newlines and ASCII spaces; drop a fixed set of
    symbols and dash-like characters; map full-width/variant punctuation to
    the canonical form; shorten runs of three or more ellipsis characters to
    two; finally remove everything INVALID_PATTERN does not whitelist.
    """
    for pattern in (r"\t", r"[\n]"):
        text = re.sub(pattern, "", text)
    text = text.replace(" ", "")
    for pattern in (
        r"[;▼♀♂《》≪≫①②③④⑤⑥]",
        r"[\u02d7\u2010-\u2015\u2043\u2212\u23af\u23e4\u2500\u2501\u2e3a\u2e3b]",
    ):
        text = re.sub(pattern, "", text)
    for variant, canonical in (("？", "?"), ("！", "!"), ("♥", "♡"), ("●", "○"), ("◯", "○"), ("〇", "○")):
        text = text.replace(variant, canonical)
    text = re.sub(r"…{3,}", "……", text)
    return INVALID_PATTERN.sub("", text)

def to_16k_mono(wav: torch.Tensor, sr: int) -> torch.Tensor:
    """Return *wav* as a single-channel tensor at 16 kHz.

    A 1-D input gains a leading channel axis, multiple channels are averaged,
    and the signal is resampled only when *sr* is not already 16000.
    """
    mono = wav.unsqueeze(0) if wav.dim() == 1 else wav
    if mono.size(0) > 1:
        mono = mono.mean(0, keepdim=True)
    if sr == 16000:
        return mono  # [1, T]
    return torchaudio.functional.resample(mono, sr, 16000)  # [1, T]

def ids_to_speech_tokens(ids):
    """Render each code in *ids* as its ``<|s_N|>`` speech-token string."""
    tokens = []
    for code in ids:
        tokens.append("<|s_%d|>" % int(code))
    return tokens

def extract_speech_ids(token_strings):
    """Collect the integer ids of well-formed ``<|s_N|>`` tokens.

    Only payloads made of plain ASCII digits are accepted, so strings that
    ``int()`` would happily parse but that cannot be codec indices (a sign,
    underscores, surrounding blanks) are skipped along with every other
    non-speech token. No exception is swallowed in the process.

    Args:
        token_strings: Iterable of decoded token strings.

    Returns:
        List of ints, in input order.
    """
    prefix, suffix = "<|s_", "|>"
    speech_ids = []
    for s in token_strings:
        if not (s.startswith(prefix) and s.endswith(suffix)):
            continue
        digits = s[len(prefix):-len(suffix)]
        if digits.isascii() and digits.isdigit():
            speech_ids.append(int(digits))
    return speech_ids

def ensure_pad_and_mask(input_ids, tok, pad_id_hint=None):
    """Resolve the PAD/EOS ids and build an attention mask for *input_ids*.

    The PAD id is the tokenizer's own, else *pad_id_hint*, else the id of
    ``<|SPEECH_GENERATION_END|>``. The mask is 1 wherever *input_ids* differs
    from that PAD id.

    Returns:
        ``(pad_id, eos_id, attention_mask)``.
    """
    # NOTE(review): any prompt position whose id equals pad_id is masked out,
    # padding or not — verify the PAD id never occurs inside a real prompt.
    eos_id = tok.convert_tokens_to_ids("<|SPEECH_GENERATION_END|>")
    pad_id = tok.pad_token_id
    if pad_id is None:
        pad_id = eos_id if pad_id_hint is None else pad_id_hint
    attn = input_ids.ne(pad_id).long()
    return pad_id, eos_id, attn

# Whisper generate_kwargs for each "ASRモード" choice; any other mode falls back
# to plain transcription without a language hint.
# NOTE(review): "translate_en" passes language="en"; Whisper's `language`
# normally names the language spoken in the audio — confirm this is intended.
_ASR_GENERATE_KWARGS = {
    "auto_transcribe_ja": {"task": "transcribe", "language": "ja"},
    "translate_en": {"task": "translate", "language": "en"},
}


def infer(sample_audio_path, target_text, temperature, top_p, repetition_penalty, asr_mode, min_new_tokens, ignore_eos, progress=gr.Progress()):
    """Synthesize speech for *target_text*, optionally continuing a reference clip.

    Args:
        sample_audio_path: Path of a reference recording, or a falsy value for
            none. Only the first 15 seconds are used.
        target_text: Text to speak; truncated to 300 characters and normalized.
        temperature: Sampling temperature. Values <= 0 switch to greedy
            decoding, because sampling requires a strictly positive value.
        top_p: Nucleus-sampling threshold (used only when sampling).
        repetition_penalty: Repetition penalty handed to ``generate``.
        asr_mode: Key selecting the Whisper task/language for the reference.
        min_new_tokens: Minimum number of tokens to generate.
        ignore_eos: When true, no EOS id is passed to ``generate``.
        progress: Gradio progress reporter.

    Returns:
        ``((sample_rate, float32 waveform), log_text)``. A one-second test
        tone is returned instead when no usable audio could be produced.
    """
    started = time.time()
    log_lines = []

    def _log(s):
        log_lines.append(str(s))

    def _result(audio):
        # The notebook stores this file in a raw string, so the separator must
        # be written with a single backslash to become a real newline here.
        return audio, "\n".join(log_lines)

    def _fallback_tone(reason):
        _log(reason)
        return _result(_test_tone(seconds=1.0, sr=16000, freq=880.0))

    if not target_text or not target_text.strip():
        gr.Warning("テキストを入力してください。")
        return (16000, np.zeros(1600, dtype=np.float32)), "no text"

    if len(target_text) > 300:
        gr.Warning("テキストが長すぎます（300字以内推奨）。先頭300字で生成します。")
        target_text = target_text[:300]

    target_text = normalize(target_text)
    _log(f"text(len={len(target_text)})='{target_text[:120]}'")
    if not target_text:
        # normalize() strips everything outside its whitelist, so the input
        # can end up empty even though it was not blank.
        gr.Warning("正規化後のテキストが空になりました。日本語または英数字を含むテキストを入力してください。")
        _log("normalized text is empty; nothing to synthesize")
        return _result((16000, np.zeros(1600, dtype=np.float32)))

    prompt_codes = []  # codec ids of the reference clip (stays empty without one)
    prompt_len = 0     # reference length in samples at 16 kHz

    with torch.no_grad():
        if sample_audio_path:
            progress(0, "参照音声をロード中...")
            wav, sr = torchaudio.load(sample_audio_path)
            if wav.shape[1] / sr > 15:
                wav = wav[:, : sr * 15]
            prompt_wav = to_16k_mono(wav, sr)  # [1, T] @16k
            prompt_len = prompt_wav.shape[1]

            progress(0.4, "Whisperで文字起こし中...")
            asr_kwargs = dict(_ASR_GENERATE_KWARGS.get(asr_mode, {"task": "transcribe"}))
            asr = whisper_pipe(prompt_wav[0].numpy(), return_timestamps=False,
                               generate_kwargs=asr_kwargs)
            prompt_text = asr["text"].strip()
            _log(f"ASR: len={len(prompt_text)} text='{prompt_text[:120]}'")

            progress(0.6, "参照音声をコード化中...")
            vq = codec_model.encode_code(input_waveform=prompt_wav.to(DEVICE))[0,0,:]
            prompt_codes = [int(i) for i in vq.tolist()]
            _log(f"VQ prompt codes: {len(prompt_codes)}")
            speech_prefix = "".join(ids_to_speech_tokens(prompt_codes))
            input_text = (prompt_text + " " + target_text).strip()
            assistant_content = "<|SPEECH_GENERATION_START|>" + speech_prefix
        else:
            input_text = target_text
            assistant_content = "<|SPEECH_GENERATION_START|>"

        progress(0.8, "音声トークンを生成中...")
        formatted = f"<|TEXT_UNDERSTANDING_START|>{input_text}<|TEXT_UNDERSTANDING_END|>"
        chat = [
            {"role":"user","content":"Convert the text to speech:" + formatted},
            {"role":"assistant","content": assistant_content},
        ]
        input_ids = tokenizer.apply_chat_template(chat, tokenize=True, return_tensors="pt",
                                                  continue_final_message=True).to(DEVICE)

        pad_id, eos_id, attn = ensure_pad_and_mask(input_ids, tokenizer, pad_id_hint=pad_id_force)
        _log(f"ids_in={int(input_ids.numel())} eos={eos_id} pad={pad_id} min_new_tokens={int(min_new_tokens)} ignore_eos={bool(ignore_eos)}")

        temperature = float(temperature)
        sampling = temperature > 0.0
        gen_kwargs = dict(
            max_length=2048,
            min_new_tokens=int(min_new_tokens),
            eos_token_id=None if ignore_eos else eos_id,
            pad_token_id=pad_id,
            attention_mask=attn,
            do_sample=sampling,
            repetition_penalty=float(repetition_penalty),
        )
        if sampling:
            gen_kwargs.update(top_p=float(top_p), temperature=temperature)
        else:
            _log("temperature <= 0 -> greedy decoding")

        _log("generate() starting...")
        outputs = model.generate(input_ids, **gen_kwargs)

        gen_ids = outputs[0][input_ids.shape[1]:]
        speech_tokens_str = tokenizer.batch_decode(gen_ids, skip_special_tokens=True)
        _log(f"gen_ids={int(gen_ids.numel())} decoded_strings={len(speech_tokens_str)}")
        speech_ids = extract_speech_ids(speech_tokens_str)
        _log(f"speech_ids extracted={len(speech_ids)}")
        if not speech_ids:
            return _fallback_tone("[fallback] no speech_ids -> test tone")

        # Decode the reference codes together with the generated ones and cut
        # the reference part off afterwards. Trimming prompt_len samples from
        # audio decoded from the generated codes alone would discard the
        # beginning of the requested speech.
        codes = torch.tensor(prompt_codes + speech_ids, device=DEVICE).view(1,1,-1)
        gen_wav = codec_model.decode_code(codes)  # [1,1,T]

        if prompt_codes and prompt_len>0:
            if gen_wav.shape[-1] > prompt_len:
                gen_wav = gen_wav[:, :, prompt_len:]
                _log(f"trimmed leading {prompt_len} samples (prompt)")
            else:
                _log(f"skip trim: gen_len={gen_wav.shape[-1]} <= prompt_len={prompt_len}")

        arr = gen_wav[0,0,:].float().cpu().numpy()

        # Validate before logging statistics or writing the file: min()/max()
        # fail on an empty array, and unusable audio should not be saved.
        if arr.size < 320 or not np.isfinite(arr).all():
            return _fallback_tone("[fallback] returning test tone (invalid or too short audio)")

        ts = time.strftime("%Y%m%d_%H%M%S")
        out = os.path.join(OUT_DIR, f"llasa_space_{ts}.wav")
        sf.write(out, arr, 16000)
        _log(f"wav: shape={arr.shape} min={arr.min():.4f} max={arr.max():.4f} file={out}")
        _log(f"total_time={time.time()-started:.2f}s")

        return _result((16000, arr))

# UI definition: inputs, output widgets, and the click handler wired to infer().
with gr.Blocks(title="Anime-Llasa-3B Space-Style", theme=gr.themes.Base()) as app:
    gr.Markdown("### Anime-Llasa-3B — HF Space 構成（/tmp 完結）")
    ref = gr.Audio(label="Reference Audio (optional, <=15s)", type="filepath")
    text= gr.Textbox(label="Text", lines=8, placeholder="テキストを入力")
    # Slider arguments are passed by keyword.
    # NOTE(review): Gradio 4.x appears to declare `step` keyword-only, which
    # would reject the positional form; the keyword form works either way.
    with gr.Row():
        temperature        = gr.Slider(minimum=0.0, maximum=1.2, value=1.0, step=0.05, label="Temperature")
        top_p              = gr.Slider(minimum=0.5, maximum=1.0, value=0.95, step=0.01, label="Top-p")
        repetition_penalty = gr.Slider(minimum=1.0, maximum=1.5, value=1.05, step=0.05, label="Repetition Penalty")
    with gr.Row():
        asr_mode = gr.Dropdown(
            ["auto_transcribe_ja", "auto_transcribe", "translate_en"],
            value="auto_transcribe_ja",
            label="ASRモード"
        )
        min_new_tokens = gr.Slider(minimum=10, maximum=200, value=50, step=1, label="min_new_tokens")
        ignore_eos     = gr.Checkbox(value=False, label="Ignore EOS (debug)")
    go   = gr.Button("Synthesize", variant="primary")
    aout = gr.Audio(label="Output", type="numpy", autoplay=True, show_download_button=True)
    log  = gr.Textbox(label="Debug Log", lines=14, value="", interactive=False)

    # Input order must match the positional parameters of infer().
    go.click(infer, [ref,text,temperature,top_p,repetition_penalty,asr_mode,min_new_tokens,ignore_eos], [aout, log])

if __name__=="__main__":
    import sys

    # Locate the NVIDIA wheels of the interpreter that is actually running
    # this script instead of assuming a fixed Python minor version.
    site = os.path.join(
        sys.prefix, "lib",
        f"python{sys.version_info.major}.{sys.version_info.minor}",
        "site-packages", "nvidia",
    )
    cuda_dirs = [
        d for d in (
            os.path.join(site, s, "lib")
            for s in ["cuda_runtime","cublas","cusparse","cudnn","nvjitlink","cuda_nvrtc"]
        )
        if os.path.isdir(d)
    ]
    # Prepend to the inherited value rather than replacing it, so entries set
    # by the launcher survive; empty segments and duplicates are dropped.
    # NOTE(review): torch is already imported at this point, so this presumably
    # only matters for child processes started by the app — confirm.
    inherited = [p for p in os.environ.get("LD_LIBRARY_PATH", "").split(":") if p]
    os.environ["LD_LIBRARY_PATH"] = ":".join(dict.fromkeys(cuda_dirs + inherited))
    app.queue().launch(server_name="0.0.0.0", server_port=7860, share=True, show_error=True)
'''
# The app source contains non-ASCII text, so write it explicitly as UTF-8
# (Python's default source encoding) instead of the locale's encoding, and
# close the file deterministically.
with open("/tmp/llasa_space/app/app.py", "w", encoding="utf-8") as fh:
    fh.write(app_code)
print("Wrote /tmp/llasa_space/app/app.py")


## 7️⃣ 起動（ログの `https://xxxx.gradio.live` を開く）

In [None]:

# Run the generated app with the venv interpreter; MPLBACKEND=Agg is set for
# that process only. The app launches Gradio with share=True on port 7860.
!MPLBACKEND=Agg /tmp/llasa_space/app/venv/bin/python /tmp/llasa_space/app/app.py
