In [None]:
import re

from urllib.parse import urlparse, parse_qs

In [None]:
# Sample YouTube watch URLs used by the cells below (links[0] and links[1]).
links = [
    'https://www.youtube.com/watch?v=VuQUF1VVX40',  # lee
    'https://www.youtube.com/watch?v=g4LYbQYDy14',  # hung
]

In [None]:
def _extract_video_id(url: str) -> str:
    p = urlparse(url)
    if p.netloc.endswith("youtu.be"):
        return p.path.lstrip("/").split("/")[0]
    q = parse_qs(p.query)
    if "v" in q:
        return q["v"][0]
    m = re.search(r"/(shorts|embed)/([A-Za-z0-9_-]{6,})", p.path)
    if m:
        return m.group(2)
    raise ValueError("無法從網址解析出 YouTube 影片 ID。")

In [None]:
# Sanity check: display the ID parsed from the first sample link.
_extract_video_id(links[0])

In [None]:
# Use the first sample link for the step-by-step exploration below.
url = links[0]

In [None]:
from youtube_transcript_api import YouTubeTranscriptApi

In [None]:
# Client object used by the following cells to query transcripts.
api = YouTubeTranscriptApi()

In [None]:
# Parse the video ID out of the selected URL.
video_id = _extract_video_id(url)

In [None]:
# Show what the API returns when listing transcripts for this video (cell output).
api.list(video_id)

In [None]:
def has_youtube_captions(url: str, preferred_langs=None) -> bool:
    """Check whether a YouTube video has captions that can be fetched.

    Args:
        url: YouTube video link.
        preferred_langs: Language codes in priority order (a single code may
            also be given as a plain string). If None or empty, no language
            filter is applied and a transcript in any language counts.

    Returns:
        True if a matching transcript is available, False otherwise. Failures
        while listing transcripts also yield False; unexpected ones (anything
        other than "transcripts disabled" / "no transcript found") additionally
        emit a RuntimeWarning so they are not mistaken for a video that simply
        has no captions.

    Raises:
        ValueError: If no video ID can be parsed from ``url``.
    """
    import warnings

    # Use the package's public exports rather than the private ``_errors`` module.
    from youtube_transcript_api import (
        NoTranscriptFound,
        TranscriptsDisabled,
        YouTubeTranscriptApi,
    )

    # A bare string such as "en" would otherwise be treated as a sequence of
    # one-character language codes.
    if isinstance(preferred_langs, str):
        preferred_langs = [preferred_langs]

    video_id = _extract_video_id(url)

    try:
        transcripts = YouTubeTranscriptApi().list(video_id)
    except (TranscriptsDisabled, NoTranscriptFound):
        return False
    except Exception as exc:
        # Keep the best-effort contract (never raise from here), but make the
        # failure visible instead of silently reporting "no captions".
        warnings.warn(
            f"has_youtube_captions: could not list transcripts for {video_id!r} "
            f"({exc!r}); treating as no captions",
            RuntimeWarning,
            stacklevel=2,
        )
        return False

    if preferred_langs:
        try:
            transcripts.find_transcript(preferred_langs)
        except NoTranscriptFound:
            return False
        return True

    # No language filter: any transcript at all is enough.
    return any(True for _ in transcripts)


In [None]:
# Check the first sample link for captions.
has_youtube_captions(links[0])

In [None]:
# Check the second sample link for captions.
has_youtube_captions(links[1])

## Case 1: the video has YouTube captions

In [None]:
def fetch_youtube_captions(
    url: str,
    preferred_langs=None,
    translate_to: str | None = None,
    join_with: str = "\n",
    preserve_formatting: bool = False,
) -> str:
    """Fetch a YouTube video's captions as plain text.

    Args:
        url: YouTube link.
        preferred_langs: Language codes in priority order (a single code may
            also be given as a plain string). Defaults to
            ['zh-Hant', 'zh-TW', 'zh-Hans', 'zh', 'en'].
        translate_to: Target language code to translate the captions into
            (e.g. 'zh-Hant', 'en'); None means no translation.
        join_with: Separator placed between caption lines.
        preserve_formatting: Passed through to the library's fetch call
            (whether to keep HTML formatting tags).

    Returns:
        The non-empty caption lines joined with ``join_with``, with the
        placeholders "[Music]", "[Applause]" and "[Laughter]" removed.

    Raises:
        ValueError: If no video ID can be parsed from ``url``.
        NoTranscriptFound: If ``translate_to`` is set and the video has no
            transcript at all. Other errors raised by youtube_transcript_api
            propagate unchanged.
    """
    from youtube_transcript_api import NoTranscriptFound, YouTubeTranscriptApi

    if preferred_langs is None:
        preferred_langs = ["zh-Hant", "zh-TW", "zh-Hans", "zh", "en"]
    elif isinstance(preferred_langs, str):
        # A bare string would otherwise be treated as a sequence of
        # one-character language codes.
        preferred_langs = [preferred_langs]

    video_id = _extract_video_id(url)
    api = YouTubeTranscriptApi()

    if translate_to:
        # Pick a base transcript first, then translate it.
        tlist = api.list(video_id)
        try:
            base_t = tlist.find_transcript(preferred_langs)
        except NoTranscriptFound:
            # None of the preferred languages exist: fall back to the first
            # transcript of any language.
            base_t = next(iter(tlist), None)
            if base_t is None:
                # Nothing to translate; re-raise the original NoTranscriptFound
                # rather than failing later with AttributeError on None.
                raise
        fetched = base_t.translate(translate_to).fetch(preserve_formatting=preserve_formatting)
    else:
        fetched = api.fetch(video_id, languages=preferred_langs, preserve_formatting=preserve_formatting)

    junk = {"[Music]", "[Applause]", "[Laughter]"}
    kept = []
    for snippet in fetched.to_raw_data():
        line = snippet.get("text", "").strip()
        # Drop empty lines and non-speech placeholders.
        if line and line not in junk:
            kept.append(line)
    return join_with.join(kept)


In [None]:
# Display the video ID of the first sample link again for reference.
_extract_video_id(links[0])

In [None]:
# Fetch captions for the first sample link with the default language priority.
text = fetch_youtube_captions(links[0])

In [None]:
# Preview the first 30 characters of the fetched captions.
text[:30]

# Case 2: no captions available (NoTranscriptFound)

In [None]:
import yt_dlp

In [None]:
def download_audio(url, output):
    """Download the audio of ``url`` with yt-dlp and extract it as MP3.

    yt-dlp is asked for the "bestaudio/best" format, writes using ``output``
    as its output template, and runs the FFmpegExtractAudio post-processor
    configured for the mp3 codec at quality "192".

    Returns ``output`` exactly as passed in; the function does not check what
    file ended up on disk.
    """
    extract_mp3 = {
        "key": "FFmpegExtractAudio",
        "preferredcodec": "mp3",
        "preferredquality": "192",
    }
    options = {
        "format": "bestaudio/best",
        "outtmpl": output,
        "postprocessors": [extract_mp3],
    }
    with yt_dlp.YoutubeDL(options) as downloader:
        downloader.download([url])
    return output

In [None]:
# Download the audio of the second sample link to a local MP3 path.
output_path = 'audio.mp3'
download_audio(links[1], output_path)

## faster_whisper

In [None]:
# Audio file to transcribe. Same value as in the download cell; presumably
# re-assigned so this section can run without repeating the download.
output_path = 'audio.mp3'

In [None]:
import os
os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"  # allow duplicate OpenMP runtimes to load instead of aborting outright
os.environ["OMP_NUM_THREADS"] = "4"          # cap the thread count for a bit more stability (optional)


In [None]:
import numpy as np
import ctranslate2, sys
from faster_whisper import WhisperModel
# Print the interpreter and CTranslate2 versions for troubleshooting.
print("py", sys.version)
print("ct2", ctranslate2.__version__)


In [None]:
# Smoke test: load the "tiny" model on CPU with int8 compute and a small
# thread/worker budget, then run one transcription without VAD filtering.
m = WhisperModel(
    "tiny", device="cpu", compute_type="int8",
    cpu_threads=2, num_workers=1
)
segs, info = m.transcribe(output_path, vad_filter=False, beam_size=1)
# list(segs) iterates over all segments to count them; presumably `segs` is a
# lazy iterator that is exhausted afterwards (confirm before reusing it).
print(info.language, info.language_probability, len(list(segs)))


In [None]:
import sys, shutil, subprocess
import ctranslate2 as ct2

# Environment diagnostics: interpreter / CTranslate2 versions and the compute
# types supported on the available device.
print("Python:", sys.version)
print("ctranslate2:", ct2.__version__)
ngpu = ct2.get_cuda_device_count()
print("CUDA devices:", ngpu)

if ngpu > 0:
    print("Supported compute types (cuda):", ct2.get_supported_compute_types("cuda"))
    # Also report the driver/runtime CUDA version via nvidia-smi, when present.
    if shutil.which("nvidia-smi"):
        try:
            out = subprocess.check_output(["nvidia-smi"], text=True)
        except (subprocess.CalledProcessError, OSError) as exc:
            # A broken driver install should not abort the diagnostics cell.
            print("nvidia-smi failed:", exc)
        else:
            # The first line of nvidia-smi output is a timestamp; the
            # "CUDA Version: 12.x" text sits in the banner row, so search for it.
            smi_lines = out.splitlines()
            cuda_line = next((ln for ln in smi_lines if "CUDA Version" in ln), None)
            if cuda_line is not None:
                print(cuda_line.strip())
            elif smi_lines:
                # Unexpected layout: fall back to the first line.
                print(smi_lines[0])
else:
    print("Supported compute types (cpu):", ct2.get_supported_compute_types("cpu"))


In [None]:
from faster_whisper import WhisperModel
from pathlib import Path

def transcribe_faster_whisper(
    audio_file: str | Path,
    model_name: str = "large-v3",
    device: str = "auto",          # "cuda" | "cpu" | "auto"
    compute_type: str = "auto",     # "float16" | "int8_float16" | "int8" | "auto"
    language: str | None = None,    # e.g. "zh", "en"; None = auto-detect
):
    """Transcribe an audio file with faster-whisper.

    A WhisperModel is created for every call from ``model_name``, ``device``
    and ``compute_type``; transcription runs with VAD filtering enabled and a
    beam size of 5.

    Returns a dict with:
    - "text": all segment texts joined by a single space, stripped
    - "segments": list of {"start", "end", "text"} dicts, one per segment
    - "language" / "language_probability": taken from the transcription info
    """
    whisper = WhisperModel(model_name, device=device, compute_type=compute_type)
    segment_iter, info = whisper.transcribe(
        str(audio_file),
        language=language,
        vad_filter=True,        # built-in voice activity detection
        beam_size=5,
    )
    # Walk the segments once, keeping only the fields we report.
    segs = [
        {"start": piece.start, "end": piece.end, "text": piece.text}
        for piece in segment_iter
    ]
    joined = " ".join(entry["text"] for entry in segs)
    return {
        "text": joined.strip(),
        "segments": segs,
        "language": info.language,
        "language_probability": info.language_probability,
    }

In [None]:
import time

In [None]:
# Time a full transcription run with the "small" model.
# perf_counter() is a monotonic, high-resolution clock, so the measured
# duration is not affected by system clock adjustments (unlike time.time()).
start = time.perf_counter()

result = transcribe_faster_whisper(output_path, model_name="small")
print(result["language"], result["language_probability"])
print(result["text"][:100])

end = time.perf_counter()
print(f'dur: {end - start}')
