<a href="https://colab.research.google.com/github/dungdt-infopstats/TV-command-synthesis/blob/main/notebooks/Simple_Whisper_Infer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# IMPORTANT: SOME KAGGLE DATA SOURCES ARE PRIVATE
# RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES.
import kagglehub
# Log in to Kaggle so that private data sources can be downloaded later on.
kagglehub.login()


In [None]:
# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

# Download the 'tridungdo/ddss-aug-ratio-test' dataset through kagglehub and
# keep the value it returns (presumably the local dataset directory).
tridungdo_ddss_aug_ratio_test_path = kagglehub.dataset_download('tridungdo/ddss-aug-ratio-test')

print('Data source import complete.')


In [None]:
# Install faster-whisper, which provides the WhisperModel class imported below.
!pip install faster-whisper

In [None]:
import os
import json
from multiprocessing import Pool, cpu_count
from faster_whisper import WhisperModel

# Model shared within each process: set by _init_worker, read by
# _transcribe_file. Stays None until a worker has been initialised.
_MODEL = None

def _init_worker(model_size, device, compute_type):
    """Pool initializer: build the Whisper model once per worker process.

    The instance is stored in the module-level ``_MODEL`` so that every task
    executed by this process reuses it instead of reloading the weights.

    Args:
        model_size: Model identifier forwarded to ``WhisperModel``.
        device: Value forwarded as the ``device`` keyword.
        compute_type: Value forwarded as the ``compute_type`` keyword.
    """
    global _MODEL
    model_options = {"device": device, "compute_type": compute_type}
    _MODEL = WhisperModel(model_size, **model_options)

def _write_json_atomic(out_path, data):
    """Serialise *data* as JSON to *out_path* without exposing a partial file."""
    # The PID suffix keeps concurrent worker processes off each other's temp file.
    tmp_path = f"{out_path}.{os.getpid()}.tmp"
    try:
        with open(tmp_path, "w", encoding="utf-8") as f:
            json.dump(data, f, ensure_ascii=False, indent=2)
        # Rename only after a complete write, so an interrupted run can never
        # leave a truncated JSON at out_path that skip_existing would then
        # treat as finished.
        os.replace(tmp_path, out_path)
    except BaseException:
        try:
            os.remove(tmp_path)
        except OSError:
            pass
        raise


def _transcribe_file(args):
    """Transcribe one audio file and write the result as JSON.

    Runs inside a pool worker and uses the per-process ``_MODEL``.

    Args:
        args: Tuple ``(wav_path, input_dir, output_dir, skip_existing)``.
            The output path mirrors the location of ``wav_path`` relative to
            ``input_dir`` underneath ``output_dir``, with a ``.json`` extension.

    Returns:
        A ``(status, wav_path, detail)`` tuple where ``status`` is one of:
        ``"OK"`` (detail = output path), ``"SKIP"`` (detail = existing output
        path), ``"ERR_MKDIR"`` (detail = error text) or ``"ERR"`` (detail =
        error text). Failures are reported through the tuple, not raised.
    """
    wav_path, input_dir, output_dir, skip_existing = args

    # Report a missing model like any other per-file failure. An ``assert``
    # would be stripped under ``python -O`` and, when active, would raise out
    # of the worker and abort the whole pool iteration in the parent.
    if _MODEL is None:
        return ("ERR", wav_path, "Model chưa được init trong worker")

    # Build the matching output path, preserving the directory structure.
    rel_path = os.path.relpath(wav_path, start=input_dir)
    out_path = os.path.join(output_dir, os.path.splitext(rel_path)[0] + ".json")
    out_dir = os.path.dirname(out_path)

    try:
        os.makedirs(out_dir, exist_ok=True)
    except Exception as e:
        return ("ERR_MKDIR", wav_path, str(e))

    if skip_existing and os.path.exists(out_path):
        return ("SKIP", wav_path, out_path)

    try:
        segments, info = _MODEL.transcribe(wav_path)
        # NOTE: per the faster-whisper docs, `segments` is a lazy generator, so
        # the decoding work happens while the list below is being built.
        data = {
            "file": wav_path,
            "language": info.language,
            "language_probability": info.language_probability,
            "duration": info.duration,
            "segments": [
                {"id": s.id, "start": s.start, "end": s.end, "text": s.text}
                for s in segments
            ],
        }
        _write_json_atomic(out_path, data)
        return ("OK", wav_path, out_path)
    except Exception as e:
        return ("ERR", wav_path, str(e))

def transcribe_wav_dir_stream(
    input_dir,
    output_dir,
    model_size="small",
    device="cpu",
    compute_type="int8",
    num_workers=None,
    skip_existing=True
):
    """Transcribe every ``.wav`` file under ``input_dir`` using a process pool.

    ``input_dir`` is walked recursively and each ``.wav`` file (extension
    matched case-insensitively) is handed to the pool as soon as it is found.
    One JSON file per input is written under ``output_dir``; nothing inside
    ``input_dir`` is modified by this function.

    Args:
        input_dir: Root directory to scan for ``.wav`` files.
        output_dir: Root directory for the JSON results (created if missing).
        model_size: Model identifier passed to each worker's initializer.
        device: Device string passed to each worker's initializer.
        compute_type: Compute type passed to each worker's initializer.
        num_workers: Number of worker processes; defaults to ``cpu_count()``.
        skip_existing: If true, inputs whose JSON output already exists are
            reported as skipped instead of being transcribed again.

    Returns:
        None. Progress and a final summary are printed. The function returns
        early, after printing a message, when ``input_dir`` is not a directory
        or ``output_dir`` is not writable.
    """
    if num_workers is None:
        num_workers = cpu_count()

    # os.walk ignores errors on the root by default, so a missing or mistyped
    # input_dir would otherwise load the models in every worker and then
    # report "0 files" as if the run had succeeded.
    if not os.path.isdir(input_dir):
        print(f"❌ input_dir không tồn tại hoặc không phải thư mục: {input_dir}")
        return

    # Check early that output_dir is writable, to fail fast.
    try:
        os.makedirs(output_dir, exist_ok=True)
        test_path = os.path.join(output_dir, ".write_test.tmp")
        with open(test_path, "w", encoding="utf-8") as t:
            t.write("ok")
        os.remove(test_path)
    except Exception as e:
        print(f"❌ output_dir không ghi được: {output_dir} — {e}")
        return

    def jobs():
        # Lazily yield one task tuple per .wav file so work can start before
        # the directory walk has finished.
        for root, _, files in os.walk(input_dir):
            for name in files:
                if name.lower().endswith(".wav"):
                    yield (os.path.join(root, name), input_dir, output_dir, skip_existing)

    done = skipped = errors = 0
    print(f"🚀 Bắt đầu xử lý (x{num_workers} process). Output: {output_dir}")

    with Pool(
        processes=num_workers,
        initializer=_init_worker,
        initargs=(model_size, device, compute_type)
    ) as pool:
        # Results arrive in completion order, not in walk order.
        for status, wav_path, info in pool.imap_unordered(_transcribe_file, jobs(), chunksize=1):
            if status == "OK":
                done += 1
                print(f"✅ [{done}] {wav_path} → {info}")
            elif status == "SKIP":
                skipped += 1
                print(f"⏭️  Bỏ qua (đã có): {info}")
            elif status == "ERR_MKDIR":
                errors += 1
                print(f"❌ Lỗi tạo thư mục cho {wav_path}: {info}")
            else:
                errors += 1
                print(f"❌ Lỗi với {wav_path}: {info}")

    print(f"🎯 Hoàn tất. OK: {done}, Skipped: {skipped}, Errors: {errors}")


In [None]:
# Read from the location kagglehub reported for the dataset rather than a
# hard-coded /kaggle/input path, which may not exist on every runtime.
# Requires the dataset download cell to have been run in this session.
transcribe_wav_dir_stream(
    os.path.join(tridungdo_ddss_aug_ratio_test_path, "test"),
    output_dir="data",
    model_size="medium",
    device="cuda",
    num_workers=4,
)


In [None]:
# Zip the `data` output directory (recursively, via -r) into data.zip.
!zip data.zip data -r