In [None]:
# STEP 0: Install Required Packages and Environment Setup
from IPython.display import clear_output; clear_output(wait=True)
print("STEP 0: Installing required packages...")

!pip install azure-cognitiveservices-speech pysubs2 pillow snownlp python-dotenv tqdm pyyaml --quiet
!pip install ipywidgets --upgrade
print("STEP 0 Complete! All packages installed.")

In [None]:
# STEP 1: Project Initialization and Config Load
# Loads API keys from .env, reads config.yaml (if present) and prints a short project summary.
from IPython.display import clear_output; clear_output(wait=True)
print("STEP 1: Initializing project and loading config...")

import os
import yaml
from datetime import datetime
from dotenv import load_dotenv

# Set main working directory
main_dir = os.getcwd()
subdir = main_dir  # All operations will happen in the main workspace

# Load environment variables
load_dotenv()
AZURE_SPEECH_KEY = os.environ.get("AZURE_SPEECH_KEY")
AZURE_SPEECH_REGION = os.environ.get("AZURE_SPEECH_REGION")
MINIMAX_SPEECH_KEY = os.environ.get("MINIMAX_SPEECH_KEY")
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")

# All four values are required up front; fail early instead of midway through the pipeline.
if not AZURE_SPEECH_KEY or not AZURE_SPEECH_REGION or not MINIMAX_SPEECH_KEY or not OPENAI_API_KEY:
    raise ValueError("Missing one or more required keys in .env file: AZURE_SPEECH_KEY, AZURE_SPEECH_REGION, MINIMAX_SPEECH_KEY, or OPENAI_API_KEY!")

# Load config.yaml if it exists, otherwise use empty config
if os.path.exists('config.yaml'):
    with open('config.yaml', encoding='utf-8') as f:
        # safe_load returns None for an empty file; fall back to {} so .get() keeps working.
        config = yaml.safe_load(f) or {}
else:
    config = {}

print("="*40)
print(f"專案名稱: {config.get('project_title', config.get('title', '未設定'))}")
print(f"Current subdir: {subdir}")
print(f"目前工作目錄：{os.getcwd()}")
print("="*40)
print(f"STEP 1 Complete! {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

In [None]:
# Step 2: 完整 TTS 參數與 UI，正確保存主文字到 text.txt（Azure SSML/Minimax全參數）
# 必填：.env 需有 AZURE_SPEECH_KEY, AZURE_SPEECH_REGION, MINIMAX_SPEECH_KEY, MINIMAX_GROUP_ID

import os
import ipywidgets as widgets
from IPython.display import display, clear_output, Audio
import yaml
import tempfile
import requests
from dotenv import load_dotenv

# Clear any output left over from a previous run of this cell before the UI is rebuilt.
clear_output(wait=True)

# ===== Step 2.1: Project Info =====
# Shared styling so that every row uses the same label width and overall width.
label_style = {'description_width': '120px'}
wide_layout = widgets.Layout(width='1100px')

# Project metadata and asset file names. The save handler later in this cell writes
# each of these values into config.yaml.
project_title_widget = widgets.Text(value='', description='專案名稱:', style=label_style, layout=wide_layout)
project_author_widget = widgets.Text(value='', description='作者:', style=label_style, layout=wide_layout)
bgm_volume_widget = widgets.FloatSlider(value=0.3, min=0, max=1, step=0.01, description='BGM音量:', style=label_style, layout=wide_layout)
attribution_widget = widgets.Text(value='', description='版權說明:', style=label_style, layout=wide_layout)
subtitle_font_widget = widgets.Text(value='NotoSansCJKtc-Regular.otf', description='字幕字型:', style=label_style, layout=wide_layout)
subtitle_fontsize_widget = widgets.IntSlider(value=30, min=10, max=80, step=1, description='字幕字體大小:', style=label_style, layout=wide_layout)
video_resolution_widget = widgets.Text(value='1920,1080', description='影片解析度:', style=label_style, layout=wide_layout)
background_file_widget = widgets.Text(value='background.jpg', description='背景檔:', style=label_style, layout=wide_layout)
bgm_file_widget = widgets.Text(value='bgm.mp3', description='BGM檔:', style=label_style, layout=wide_layout)
thumbnail_file_widget = widgets.Text(value='thumbnail.jpg', description='縮圖檔:', style=label_style, layout=wide_layout)
text_file_widget = widgets.Text(value='text.txt', description='文字檔:', style=label_style, layout=wide_layout)

# Stack the project fields vertically in the order listed.
project_info_box = widgets.VBox([
    project_title_widget, project_author_widget, bgm_volume_widget, attribution_widget,
    subtitle_font_widget, subtitle_fontsize_widget, video_resolution_widget,
    background_file_widget, bgm_file_widget, thumbnail_file_widget, text_file_widget
])

# ===== Step 2.2: Main Text & Preview =====
# input_text_widget holds the full script; preview_text_widget holds the short sample
# that the preview button sends to the selected TTS service.
input_text_widget = widgets.Textarea(value='', description='主文字:', style=label_style, layout=widgets.Layout(width='1100px', height='150px'))
preview_text_widget = widgets.Textarea(value='', description='試聽文字:', style=label_style, layout=widgets.Layout(width='1100px', height='80px'))
preview_button = widgets.Button(description='語音試聽', button_style='info', layout=widgets.Layout(width='350px', height='50px'))
# Output area where the preview audio player and any preview error messages are shown.
audio_output = widgets.Output(layout=wide_layout)

# ===== Step 2.3: TTS server selection dropdown =====
# The selected value ('azure' or 'minimax') decides which TTS function the preview calls
# and is saved to config.yaml under 'tts_server'.
tts_server_widget = widgets.Dropdown(
    options=['azure', 'minimax'],
    value='azure',
    description='TTS Server(供Step4/5使用):',
    style=label_style, layout=wide_layout
)

# ===== Step 2.4: TTS Tabs UI =====
# --- Azure ---
# Voice name and output container format for the Azure preview request.
azure_voice_widget = widgets.Dropdown(
    options=['zh-TW-YunJheNeural', 'zh-TW-HsiaoChenNeural', 'zh-CN-YunxiNeural', 'en-US-AriaNeural', 'en-US-GuyNeural', 'ja-JP-NanamiNeural', 'ja-JP-KeitaNeural'],
    value='zh-TW-YunJheNeural', description='語音:', style=label_style, layout=wide_layout)
azure_format_widget = widgets.Dropdown(
    options=['mp3', 'wav', 'ogg', 'pcm'], value='mp3', description='音檔格式:', style=label_style, layout=wide_layout)

azure_box = widgets.VBox([
    widgets.HTML('<b>Azure TTS 設定（語音可選，SSML自動生成）</b>'),
    azure_voice_widget, azure_format_widget
])

# --- MINIMAX ---
# One widget per MiniMax request field: model, voice settings (voice/emotion/speed/pitch/volume),
# audio settings (format/sample rate/bitrate/channels), language boost and subtitle flag.
minimax_model_widget = widgets.Dropdown(
    options=['speech-02-hd', 'speech-2.5-hd-preview'], value='speech-02-hd', description='模型:', style=label_style, layout=wide_layout)
minimax_accent_widget = widgets.Dropdown(
    options=['Chinese (Mandarin)', 'English', 'Japanese'], value='Chinese (Mandarin)', description='腔調:', style=label_style, layout=wide_layout)
minimax_voice_widget = widgets.Dropdown(
    options=['Chinese (Mandarin)_Warm_Bestie', 'Chinese (Mandarin)_Bright_Light', 'English_Graceful_Lady', 'Japanese_Whisper_Belle'],
    value='Chinese (Mandarin)_Warm_Bestie', description='語音:', style=label_style, layout=wide_layout)
minimax_emotion_widget = widgets.Dropdown(
    options=['calm', 'happy', 'sad', 'angry', 'fearful', 'disgusted', 'surprised'], value='calm', description='情感:', style=label_style, layout=wide_layout)
minimax_speed_widget = widgets.FloatSlider(
    value=1.0, min=0.5, max=2.0, step=0.01, description='語速:', style=label_style, layout=wide_layout)
minimax_pitch_widget = widgets.IntSlider(
    value=0, min=-12, max=12, step=1, description='語調(半音):', style=label_style, layout=wide_layout)
minimax_vol_widget = widgets.FloatSlider(
    value=1.0, min=0.1, max=10.0, step=0.1, description='音量:', style=label_style, layout=wide_layout)
minimax_format_widget = widgets.Dropdown(
    options=['mp3', 'wav'], value='mp3', description='音檔格式:', style=label_style, layout=wide_layout)
minimax_sample_widget = widgets.Dropdown(
    options=[32000, 44100], value=32000, description='取樣率:', style=label_style, layout=wide_layout)
minimax_bitrate_widget = widgets.Dropdown(
    options=[128000, 256000], value=128000, description='比特率:', style=label_style, layout=wide_layout)
minimax_channel_widget = widgets.Dropdown(
    options=[1, 2], value=1, description='聲道:', style=label_style, layout=wide_layout)
minimax_lang_boost_widget = widgets.Dropdown(
    options=['auto', 'none', 'zh', 'en', 'ja'], value='auto', description='語言加強:', style=label_style, layout=wide_layout)
minimax_subtitle_enable_widget = widgets.Checkbox(
    value=False, description='字幕啟用:', style=label_style)

minimax_box = widgets.VBox([
    widgets.HTML('<b>MINIMAX TTS 設定</b>'),
    minimax_model_widget,
    minimax_accent_widget, minimax_voice_widget, minimax_emotion_widget,
    minimax_speed_widget, minimax_pitch_widget, minimax_vol_widget,
    minimax_format_widget, minimax_sample_widget, minimax_bitrate_widget, minimax_channel_widget,
    minimax_lang_boost_widget, minimax_subtitle_enable_widget
])

# Two tabs, one per provider; tab index 0 is Azure and index 1 is MiniMax.
tts_tab = widgets.Tab(children=[azure_box, minimax_box])
tts_tab.set_title(0, 'Azure TTS')
tts_tab.set_title(1, 'MINIMAX TTS')

# ===== Step 2.4: TTS API Functions =====
def azure_tts(text, voice, output_format_key):
    """Synthesize ``text`` through the Azure Speech REST endpoint and store it in a temp file.

    Args:
        text: Plain text to speak. It is XML-escaped before being embedded in the SSML,
            so characters such as ``&`` and ``<`` no longer break the request.
        voice: Azure voice name placed in the SSML ``<voice name=...>`` attribute.
        output_format_key: 'mp3', 'wav', 'ogg' or 'pcm'. Unknown keys fall back to the
            16 kHz MP3 output format; the key is also used as the temp-file suffix.

    Returns:
        Path of the temporary audio file (created with ``delete=False``, so the caller
        owns cleanup), or None when the request fails.
    """
    from xml.sax.saxutils import escape, quoteattr

    load_dotenv()
    speech_key = os.getenv('AZURE_SPEECH_KEY')
    speech_region = os.getenv('AZURE_SPEECH_REGION')
    if not speech_key or not speech_region:
        print('Azure TTS error:', 'AZURE_SPEECH_KEY / AZURE_SPEECH_REGION is not set')
        return None

    AZURE_FORMAT_MAP = {
        "mp3": "audio-16khz-32kbitrate-mono-mp3",
        "wav": "riff-24khz-16bit-mono-pcm",
        "ogg": "ogg-48khz-16bit-mono-opus",
        "pcm": "raw-16khz-16bit-mono-pcm"
    }
    output_format = AZURE_FORMAT_MAP.get(output_format_key, "audio-16khz-32kbitrate-mono-mp3")
    url = f"https://{speech_region}.tts.speech.microsoft.com/cognitiveservices/v1"
    headers = {
        "Ocp-Apim-Subscription-Key": speech_key,
        "Content-Type": "application/ssml+xml",
        "X-Microsoft-OutputFormat": output_format,
        "User-Agent": "TTSClient"
    }
    # quoteattr() adds the surrounding quotes itself and escapes the attribute value.
    ssml = f"""<speak version='1.0' xml:lang='zh-TW'>
        <voice name={quoteattr(voice)}>{escape(text)}</voice>
    </speak>"""
    try:
        # A timeout keeps the notebook from hanging forever on a stalled connection.
        r = requests.post(url, headers=headers, data=ssml.encode('utf-8'), timeout=30)
        if r.status_code == 200:
            with tempfile.NamedTemporaryFile(delete=False, suffix=f'.{output_format_key}') as tf:
                tf.write(r.content)
                return tf.name
        else:
            # Never echo the subscription key into notebook output.
            redacted_headers = dict(headers)
            redacted_headers["Ocp-Apim-Subscription-Key"] = "***"
            print("Azure TTS API失敗，狀態碼：", r.status_code)
            print("回應：", r.text[:300])
            print("Endpoint:", url)
            print("Headers:", redacted_headers)
            print("Payload used:\n", ssml)
    except Exception as e:
        print('Azure TTS error:', e)
    return None

def minimax_tts(text, model, accent, voice, emotion, speed, pitch, vol, fmt, sample_rate, bitrate, channel, lang_boost, subtitle_enable):
    """Synthesize ``text`` with the MiniMax t2a_v2 API and store the audio in a temp file.

    The API is asked for a download URL (``output_format: "url"``); the audio is then
    fetched from that URL and written to a temporary file.

    Args:
        text: Text to speak.
        model: MiniMax model id.
        accent: Accepted for signature compatibility; it is not sent to the API.
        voice, emotion, speed, pitch, vol: Values for the request's ``voice_setting``.
        fmt, sample_rate, bitrate, channel: Values for the request's ``audio_setting``;
            ``fmt`` is also used as the temp-file suffix.
        lang_boost: Value for ``language_boost``.
        subtitle_enable: Value for ``subtitle_enable``.

    Returns:
        Path of the temporary audio file (created with ``delete=False``, so the caller
        owns cleanup), or None on any failure. Failures are printed, not raised.
    """
    try:
        load_dotenv()
        api_key = os.getenv('MINIMAX_SPEECH_KEY')
        # NOTE(review): hard-coded fallback group id kept for backward compatibility;
        # prefer setting MINIMAX_GROUP_ID in .env.
        group_id = os.getenv('MINIMAX_GROUP_ID', '1982992498867311582')
        if not api_key:
            print("MINIMAX TTS試聽錯誤：", "MINIMAX_SPEECH_KEY is not set")
            return None

        url = f"https://api.minimax.io/v1/t2a_v2?GroupId={group_id}"
        headers = {
            "Authorization": f"Bearer {api_key}",
            "Content-Type": "application/json"
        }
        payload = {
            "model": model,
            "text": text,
            "voice_setting": {
                "voice_id": voice,
                "speed": speed,
                "vol": vol,
                "pitch": pitch,
                "emotion": emotion
            },
            "audio_setting": {
                "sample_rate": sample_rate,
                "bitrate": bitrate,
                "format": fmt,
                "channel": channel
            },
            "output_format": "url",
            "language_boost": lang_boost,
            "subtitle_enable": subtitle_enable
        }

        # Timeouts keep the notebook from hanging forever on a stalled connection.
        r = requests.post(url, headers=headers, json=payload, timeout=60)
        if r.status_code != 200:
            print("MINIMAX TTS API失敗，狀態碼：", r.status_code)
            print("回應：", r.text[:300])
            print("Payload used:\n", payload)
            return None

        data = r.json()
        base_resp = data.get("base_resp", {})
        if base_resp.get("status_code") != 0:
            print("MINIMAX API錯誤：", base_resp.get("status_msg"))
            return None

        audio_url = data.get("data", {}).get("audio")
        if not audio_url:
            print("MINIMAX未取得音檔URL，請檢查API回應！")
            return None

        r2 = requests.get(audio_url, timeout=60)
        if r2.status_code != 200:
            print("MINIMAX音檔下載失敗：", r2.status_code)
            return None

        with tempfile.NamedTemporaryFile(delete=False, suffix=f'.{fmt}') as tf:
            tf.write(r2.content)
            return tf.name
    except Exception as e:
        print("MINIMAX TTS試聽錯誤：", e)
    return None

# ===== Step 2.5: Preview Button Logic =====
def on_preview_clicked(b):
    """Button callback: synthesize the preview text with the selected TTS server and play it.

    All output (audio player or error text) is rendered inside ``audio_output``.
    ``b`` is the clicked button and is not used.
    """
    with audio_output:
        audio_output.clear_output()
        sample_text = preview_text_widget.value.strip()
        if sample_text:
            # Dispatch on the value of the TTS server dropdown.
            if tts_server_widget.value == 'azure':
                clip_path = azure_tts(sample_text, azure_voice_widget.value, azure_format_widget.value)
            else:  # MINIMAX
                minimax_args = (
                    minimax_model_widget.value,
                    minimax_accent_widget.value,
                    minimax_voice_widget.value,
                    minimax_emotion_widget.value,
                    minimax_speed_widget.value,
                    minimax_pitch_widget.value,
                    minimax_vol_widget.value,
                    minimax_format_widget.value,
                    minimax_sample_widget.value,
                    minimax_bitrate_widget.value,
                    minimax_channel_widget.value,
                    minimax_lang_boost_widget.value,
                    minimax_subtitle_enable_widget.value,
                )
                clip_path = minimax_tts(sample_text, *minimax_args)

            if clip_path:
                display(Audio(clip_path, autoplay=True))
            else:
                print("語音產生失敗，請檢查API金鑰或設定。")
        else:
            print("請輸入試聽文字")

preview_button.on_click(on_preview_clicked)

# ===== Step 2.6: Save Logic (tts_server is written to config.yaml as a plain string) =====
output = widgets.Output(layout=wide_layout)
def on_save_clicked(b):
    """Button callback: write every widget value to config.yaml and the main text to the text file.

    Messages are rendered inside ``output``. If config.yaml cannot be written, the error is
    printed and nothing else is attempted. ``b`` is the clicked button and is not used.
    """
    config = {
        'project_title': project_title_widget.value,
        'project_author': project_author_widget.value,
        'bgm_volume': bgm_volume_widget.value,
        'attribution': attribution_widget.value,
        'subtitle_font': subtitle_font_widget.value,
        'subtitle_fontsize': subtitle_fontsize_widget.value,
        'video_resolution': video_resolution_widget.value,
        'background': background_file_widget.value,
        'bgm': bgm_file_widget.value,
        'thumbnail': thumbnail_file_widget.value,
        'text': text_file_widget.value,
        'input_text': input_text_widget.value,
        'preview_text': preview_text_widget.value,
        'tts_server': tts_server_widget.value,  # stored as the plain string 'azure' or 'minimax'

        # Azure TTS
        'azure_voice': azure_voice_widget.value,
        'azure_format': azure_format_widget.value,

        # MINIMAX TTS
        'minimax_model': minimax_model_widget.value,
        'minimax_accent': minimax_accent_widget.value,
        'minimax_voice': minimax_voice_widget.value,
        'minimax_emotion': minimax_emotion_widget.value,
        'minimax_speed': minimax_speed_widget.value,
        'minimax_pitch': minimax_pitch_widget.value,
        'minimax_vol': minimax_vol_widget.value,
        'minimax_format': minimax_format_widget.value,
        'minimax_sample_rate': minimax_sample_widget.value,
        'minimax_bitrate': minimax_bitrate_widget.value,
        'minimax_channel': minimax_channel_widget.value,
        'minimax_language_boost': minimax_lang_boost_widget.value,
        'minimax_subtitle_enable': minimax_subtitle_enable_widget.value
    }
    with output:
        output.clear_output()
        # Use a context manager so the file is flushed and closed before later steps read it.
        try:
            with open('config.yaml', 'w', encoding='utf-8') as cfg_file:
                yaml.safe_dump(config, cfg_file, allow_unicode=True)
        except Exception as e:
            print('寫入 config.yaml 時發生錯誤:', e)
            return
        # Also write the main text to the configured text file.
        try:
            with open(text_file_widget.value, "w", encoding="utf-8") as tf:
                tf.write(input_text_widget.value)
            print(f'主文字已正確寫入 {text_file_widget.value}！')
        except Exception as e:
            print(f'寫入 {text_file_widget.value} 時發生錯誤:', e)
        print('所有設定已完整儲存到 config.yaml！')
        print(config)

save_button = widgets.Button(description='儲存全部設定', button_style='success', layout=widgets.Layout(width='350px', height='50px'))
save_button.on_click(on_save_clicked)

# ===== Step 2.7: Display Unified UI =====
# The widgets are rendered top to bottom in exactly this order; the TTS server
# dropdown is shown on its own, above the provider tabs.
step2_ui_parts = (
    widgets.HTML("<h3>Step 2：完整 YouTuber 專案設定 + TTS 調音（Azure / MINIMAX）</h3>"),
    project_info_box,
    input_text_widget,
    preview_text_widget,
    preview_button,
    audio_output,
    tts_server_widget,
    tts_tab,
    save_button,
    output,
)
display(*step2_ui_parts)

In [None]:
# STEP 3: Load, Clean, and Split Sentences from Text File
# Produces the module-level list `sentences` (empty when the text file is missing or unreadable).
from IPython.display import clear_output; clear_output(wait=True)
print("STEP 3: Loading and cleaning sentences from text file...")

import yaml
import os
import re
from snownlp import SnowNLP


def is_pronounceable(s):
    """Return True when `s` contains at least one CJK ideograph, ASCII letter or digit."""
    return bool(re.search(r'[\u4e00-\u9fffA-Za-z0-9]', s))


def clean_markdown(s):
    """Strip Markdown markup and corner brackets from `s` and collapse whitespace."""
    s = re.sub(r"^[#\-\*\s>]+", "", s)        # leading heading / list / quote markers
    s = re.sub(r"[*`_>#\[\]()\-~=]", "", s)   # inline markup characters anywhere in the text
    s = re.sub(r"[「」]", "", s)
    s = re.sub(r"\s+", " ", s)
    return s.strip()


# Reload latest config (in case changed/reset)
if os.path.exists('config.yaml'):
    with open('config.yaml', encoding='utf-8') as f:
        # safe_load returns None for an empty file; fall back to {} so .get() keeps working.
        config = yaml.safe_load(f) or {}
else:
    config = {}

text_file = config.get('text', 'text.txt')
sentences = []
if not os.path.exists(text_file):
    print(f"[ERROR] 找不到文字檔: {text_file}")
else:
    # Load text and clean/split sentences
    try:
        with open(text_file, 'r', encoding='utf-8') as f:
            raw_text = f.read().strip()
    except Exception as e:
        print(f"[ERROR] Failed to read {text_file}: {e}")
        raw_text = ""

    if raw_text:  # nothing to segment when the file is empty or unreadable
        try:
            sentences = [s.strip() for s in SnowNLP(raw_text).sentences if s.strip()]
        except Exception as e:
            print(f"[ERROR] SnowNLP sentence segmentation failed: {e}")
            sentences = [raw_text]

    # Filter before cleaning (skip pure punctuation) and again after, because cleaning
    # can leave a sentence with nothing pronounceable.
    sentences = [clean_markdown(s) for s in sentences if is_pronounceable(s)]
    sentences = [s for s in sentences if is_pronounceable(s)]

    print(f"Total sentences after cleaning: {len(sentences)}")
    for idx, s in enumerate(sentences):
        print(f"{idx}: '{s}'")

print("STEP 3 Complete! Sentences loaded and cleaned.")

In [None]:
# STEP 4: TTS voice synthesis for each sentence (Azure/MINIMAX)
# Writes one audio file per line of config['input_text'] as voice_{i}.<ext> in the working
# directory, which is the layout Steps 5 and 6 read. Failures are collected in
# `failed_sentences` instead of aborting the batch.
from IPython.display import clear_output
import os
import yaml
from dotenv import load_dotenv
import requests
import tempfile

clear_output(wait=True)
print("STEP 4: TTS voice synthesis for each sentence (Azure/MINIMAX)")


def _confirm_overwrite(path):
    """Return True when `path` may be written: it does not exist, or the user answers 'y'."""
    if not os.path.exists(path):
        return True
    overwrite = input(f"{path} 已存在，要覆蓋嗎？(y/n): ")
    if overwrite.lower() != 'y':
        print(f"跳過 {path}")
        return False
    return True


def _azure_synthesize_to_file(sentence, out_path, voice, speech_key, speech_region):
    """Synthesize `sentence` to `out_path` (MP3) with the Azure Speech SDK; return True on success."""
    import azure.cognitiveservices.speech as speechsdk
    try:
        speech_config = speechsdk.SpeechConfig(subscription=speech_key, region=speech_region)
        speech_config.speech_synthesis_voice_name = voice
        speech_config.set_speech_synthesis_output_format(
            speechsdk.SpeechSynthesisOutputFormat.Audio16Khz32KBitRateMonoMp3
        )
        audio_config = speechsdk.audio.AudioOutputConfig(filename=out_path)
        synthesizer = speechsdk.SpeechSynthesizer(speech_config=speech_config, audio_config=audio_config)
        result = synthesizer.speak_text_async(sentence).get()
    except Exception as e:
        print("Azure TTS error:", e)
        return False
    return result.reason == speechsdk.ResultReason.SynthesizingAudioCompleted


def _minimax_synthesize_to_file(sentence, out_path, url, headers, payload_template):
    """POST `sentence` to MiniMax, download the returned audio URL into `out_path`; return True on success."""
    # Shallow copy is enough: only the top-level 'text' key changes per sentence.
    payload = dict(payload_template, text=sentence)
    try:
        r = requests.post(url, headers=headers, json=payload, timeout=60)
        if r.status_code != 200:
            print("MINIMAX TTS API失敗，狀態碼：", r.status_code)
            print("回應：", r.text[:300])
            return False
        data = r.json()
        base_resp = data.get("base_resp", {})
        if base_resp.get("status_code") != 0:
            print("MINIMAX API錯誤：", base_resp.get("status_msg"))
            return False
        audio_url = data.get("data", {}).get("audio")
        if not audio_url:
            print("MINIMAX未取得音檔URL，請檢查API回應！")
            return False
        r2 = requests.get(audio_url, timeout=60)
        if r2.status_code != 200:
            print("MINIMAX音檔下載失敗：", r2.status_code)
            return False
        with open(out_path, "wb") as audio_file:
            audio_file.write(r2.content)
        return True
    except Exception as e:
        print("MINIMAX TTS錯誤：", e)
    return False


# Read config.yaml
if os.path.exists('config.yaml'):
    with open('config.yaml', encoding='utf-8') as f:
        # safe_load returns None for an empty file; fall back to {} so .get() keeps working.
        config = yaml.safe_load(f) or {}
else:
    config = {}

# One sentence per line of the saved main text. Steps 5 and 6 split the text the same way,
# so index i here must keep matching voice_{i} there.
sentences = config.get('input_text', '').split('\n') if config.get('input_text') else []
if not sentences:
    print("[ERROR] No sentences found in config['input_text']")
print(f"Total sentences to synthesize: {len(sentences)}")

tts_server = config.get('tts_server', 'azure')
subdir = os.getcwd()
failed_sentences = []
success_count = 0

load_dotenv()
if tts_server == 'azure':
    AZURE_SPEECH_KEY = os.environ.get("AZURE_SPEECH_KEY")
    AZURE_SPEECH_REGION = os.environ.get("AZURE_SPEECH_REGION")
    VOICE = config.get('azure_voice', 'zh-TW-YunJheNeural')
    print(f"[CHECK] Azure voice_id for synthesis: {VOICE}")
    audio_ext = "mp3"
elif tts_server == 'minimax':
    MINIMAX_SPEECH_KEY = os.environ.get("MINIMAX_SPEECH_KEY")
    # Same GroupId handling as the Step 2 preview: .env value first, legacy id as fallback.
    MINIMAX_GROUP_ID = os.environ.get("MINIMAX_GROUP_ID", "1982992498867311582")
    MINIMAX_URL = f"https://api.minimax.io/v1/t2a_v2?GroupId={MINIMAX_GROUP_ID}"
    MODEL = config.get("minimax_model", "speech-02-hd")
    VOICE = config.get("minimax_voice", "Chinese (Mandarin)_Warm_Bestie")
    EMOTION = config.get("minimax_emotion", "calm")
    VOL = config.get("minimax_vol", 1.0)
    SPEED = config.get("minimax_speed", 1.0)
    PITCH = config.get("minimax_pitch", 0)
    # NOTE(review): Step 2 saves the UI choice under 'minimax_format', while this step reads
    # 'minimax_audio_format' and therefore normally stays on mp3. Steps 5 and 6 only look for
    # voice_{i}.mp3, so confirm those steps before wiring the UI value through.
    AUDIO_FORMAT = config.get("minimax_audio_format", "mp3")
    SAMPLE_RATE = config.get("minimax_sample_rate", 32000)
    BITRATE = config.get("minimax_bitrate", 128000)
    CHANNEL = config.get("minimax_channel", 1)
    headers = {
        "Authorization": f"Bearer {MINIMAX_SPEECH_KEY}",
        "Content-Type": "application/json"
    }
    minimax_payload_template = {
        "model": MODEL,
        "voice_setting": {
            "voice_id": VOICE,
            "speed": SPEED,
            "vol": VOL,
            "pitch": PITCH,
            "emotion": EMOTION
        },
        "audio_setting": {
            "sample_rate": SAMPLE_RATE,
            "bitrate": BITRATE,
            "format": AUDIO_FORMAT,
            "channel": CHANNEL
        },
        "output_format": "url",
        "language_boost": config.get("minimax_language_boost", "auto"),
        "subtitle_enable": False
    }
    print(f"[CHECK] MINIMAX voice_id for synthesis: {VOICE}")
    audio_ext = AUDIO_FORMAT
else:
    audio_ext = None
    print(f"[ERROR] 未支援的 TTS server 設定：{tts_server}")
    print("請在 config.yaml 設定 tts_server 為 'azure' 或 'minimax'")

if audio_ext is not None:
    for i, sentence in enumerate(sentences):
        out_path = os.path.join(subdir, f"voice_{i}.{audio_ext}")
        if not sentence.strip():
            # Blank lines have nothing to speak; keep the index so later steps stay aligned.
            print(f"[SKIP] sentence {i} is blank, nothing to synthesize")
            continue
        if not _confirm_overwrite(out_path):
            continue
        if tts_server == 'azure':
            ok = _azure_synthesize_to_file(sentence, out_path, VOICE, AZURE_SPEECH_KEY, AZURE_SPEECH_REGION)
        else:
            ok = _minimax_synthesize_to_file(sentence, out_path, MINIMAX_URL, headers, minimax_payload_template)
        if ok:
            print(f"[OK] 合成完成: {out_path}")
            success_count += 1
        else:
            print(f"[ERROR] 合成失敗: {sentence}")
            failed_sentences.append((i, sentence))

    print(f"TTS synthesis finished. 成功產生 {success_count} 檔案。")
    if failed_sentences:
        print(f"[ERROR] {len(failed_sentences)} sentence(s) failed: {[idx for idx, _ in failed_sentences]}")

print("STEP 4 Complete! TTS synthesis finished.")

In [None]:
# STEP 5: Validate Audio Files and Get Durations
# Produces three parallel lists: valid_sentences, valid_audio_files, valid_durations.
from IPython.display import clear_output; clear_output(wait=True)
print("STEP 5: Validating TTS audio files and getting durations...")

import os
import subprocess
import yaml

# A clip is kept only when its duration (seconds) lies strictly between these bounds.
MIN_DURATION_SEC = 0.01
MAX_DURATION_SEC = 30

# Reload config
if os.path.exists('config.yaml'):
    with open('config.yaml', encoding='utf-8') as f:
        # safe_load returns None for an empty file; fall back to {} so .get() keeps working.
        config = yaml.safe_load(f) or {}
else:
    config = {}

subdir = os.getcwd()
sentences = config.get('input_text', '').split('\n') if config.get('input_text') else []

valid_sentences = []
valid_audio_files = []
valid_durations = []

for i, sentence in enumerate(sentences):
    mp3_fname = os.path.join(subdir, f"voice_{i}.mp3")
    if not (os.path.exists(mp3_fname) and os.path.getsize(mp3_fname) > 0):
        print(f"[WARNING] Missing or empty audio file: {mp3_fname}")
        continue

    # Run ffprobe inside the try as well: a missing ffprobe binary raises FileNotFoundError,
    # which should be reported per file rather than abort the whole cell.
    try:
        r = subprocess.run([
            "ffprobe", "-v", "error", "-show_entries",
            "format=duration", "-of", "default=noprint_wrappers=1:nokey=1", mp3_fname
        ], capture_output=True)
        duration_val = float(r.stdout.decode().strip())
    except Exception as e:
        print(f"[ERROR] ffprobe failed for {mp3_fname}: {e}")
        continue

    if MIN_DURATION_SEC < duration_val < MAX_DURATION_SEC:
        valid_durations.append(duration_val)
        valid_audio_files.append(mp3_fname)
        valid_sentences.append(sentence)
        print(f"OK: {i}: '{sentence}' ({duration_val:.2f}s)")
    else:
        print(f"Skipped {i}: '{sentence}' - duration {duration_val:.2f}s")

# Step 8 looks the durations up under the name `audio_durations`; expose the same list under it.
audio_durations = valid_durations

print(f"\nKept {len(valid_audio_files)} valid audio files and sentences.")
if valid_sentences:
    print(f"✅ Last sentence included: {valid_sentences[-1]}")
print("STEP 5 Complete! Audio validation done.")

In [None]:
# STEP 6: Concatenate Audio Files into voice.mp3
# Joins every existing voice_{i}.mp3 (in sentence order) with ffmpeg's concat demuxer.
from IPython.display import clear_output; clear_output(wait=True)
print("STEP 6: Concatenating TTS audio files into voice.mp3...")

import os
import subprocess
import yaml


def _concat_audio(audio_files, workdir):
    """Write tts_list.txt for the files that exist and run ffmpeg to build voice.mp3.

    Returns True when ffmpeg exits with code 0, False otherwise (no input files,
    ffmpeg missing, or ffmpeg failure).
    """
    existing = [af for af in audio_files if os.path.exists(af)]
    if not existing:
        print("[ERROR] No voice_*.mp3 files found to concatenate.")
        return False

    with open(os.path.join(workdir, "tts_list.txt"), "w", encoding="utf-8") as f:
        for af in existing:
            # Inside a single-quoted concat entry a literal quote must be written as '\''.
            escaped = af.replace("'", "'\\''")
            f.write(f"file '{escaped}'\n")

    cmd = [
        "ffmpeg", "-y",
        "-f", "concat", "-safe", "0", "-i", "tts_list.txt",
        "-c", "copy", "voice.mp3"
    ]
    print("執行語音合併指令：", " ".join(cmd))
    try:
        result = subprocess.run(cmd, cwd=workdir)
    except FileNotFoundError:
        print("[ERROR] ffmpeg executable not found. Please install ffmpeg.")
        return False
    if result.returncode != 0:
        print(f"[ERROR] ffmpeg exited with code {result.returncode}")
        return False
    return True


# Reload config and get sentences
if os.path.exists('config.yaml'):
    with open('config.yaml', encoding='utf-8') as f:
        # safe_load returns None for an empty file; fall back to {} so .get() keeps working.
        config = yaml.safe_load(f) or {}
else:
    config = {}

subdir = os.getcwd()
sentences = config.get('input_text', '').split('\n') if config.get('input_text') else []

voice_concat_path = os.path.join(subdir, "voice.mp3")
audio_files = [os.path.join(subdir, f"voice_{i}.mp3") for i in range(len(sentences))]

# Ask before replacing an existing voice.mp3; otherwise merge straight away.
should_merge = True
if os.path.exists(voice_concat_path):
    overwrite = input(f"{voice_concat_path} 已存在，要覆蓋嗎？(y/n): ")
    if overwrite.lower() != 'y':
        print("跳過合併語音檔，保留舊檔案。")
        should_merge = False
    else:
        print("重新合併語音檔。")
else:
    print("開始合併語音檔。")

if not should_merge or _concat_audio(audio_files, subdir):
    print("STEP 6 Complete!")
else:
    print("[ERROR] STEP 6 failed: voice.mp3 was not created.")

In [None]:
# STEP 7: Backup Project Data to New Project Directory
# Copies the project assets and generated files into "<YYYYMMDD>_<project title>/".
from IPython.display import clear_output; clear_output(wait=True)
print("STEP 7: Backing up project files to a new project directory...")

import os
import re
import shutil
import yaml
import datetime

# Reload config to get latest project title
if os.path.exists('config.yaml'):
    with open('config.yaml', encoding='utf-8') as f:
        # safe_load returns None for an empty file; fall back to {} so .get() keeps working.
        config = yaml.safe_load(f) or {}
else:
    config = {}

# Step 2 saves an empty string when the title is left blank, so fall back on any falsy value.
project_title = str(config.get('project_title') or '新專案')
# Replace characters that are path separators or invalid in file names on common platforms.
safe_title = re.sub(r'[\\/:*?"<>|]+', '_', project_title).strip() or '新專案'
timestamp = datetime.datetime.now().strftime('%Y%m%d')
project_dir = os.path.join(os.getcwd(), f"{timestamp}_{safe_title}")

os.makedirs(project_dir, exist_ok=True)

files_to_backup = [
    config.get('background', 'background.jpg'),
    config.get('bgm', 'bgm.mp3'),
    config.get('thumbnail', 'thumbnail.jpg'),
    config.get('text', 'text.txt'),
    'config.yaml',
    'voice.mp3',
    'voice_bgm.mp3',
    'subtitle.srt',
    'final.mp4'
]
for fname in files_to_backup:
    if not fname:
        # An empty configured name would resolve to the working directory itself.
        continue
    src = os.path.join(os.getcwd(), fname)
    # Copy by base name so nested or absolute configured paths still land inside project_dir.
    dst = os.path.join(project_dir, os.path.basename(fname))
    if os.path.isfile(src):
        shutil.copy2(src, dst)
        print(f"已備份 {fname} 到 {project_dir}")
    else:
        print(f"找不到 {fname}，略過。")
print("STEP 7 Complete! All project files backed up.")

In [None]:
# STEP 8: Generating subtitles (SRT) for the video and YAML for scene planning

from IPython.display import clear_output
import os
import yaml

clear_output(wait=True)
print("STEP 8: Generating subtitles (SRT) for the video...")


def format_srt_timestamp(seconds):
    """Format a time offset in seconds as an SRT timestamp, ``HH:MM:SS,mmm``."""
    total_ms = max(0, int(round(seconds * 1000)))
    hours, remainder = divmod(total_ms, 3_600_000)
    minutes, remainder = divmod(remainder, 60_000)
    secs, millis = divmod(remainder, 1000)
    return f"{hours:02d}:{minutes:02d}:{secs:02d},{millis:03d}"


# ========== Pick up the results of Step 5 ==========
# Step 5 leaves `valid_sentences` and `valid_durations` in the kernel; `audio_durations` is
# accepted as an alternative name. If they are missing or their lengths differ, sentences and
# durations are rebuilt from the text file and the audio files on disk.
_kernel_vars = globals()
_step5_sentences = _kernel_vars.get('valid_sentences')
_step5_durations = _kernel_vars.get('valid_durations', _kernel_vars.get('audio_durations'))

if (
    _step5_sentences is not None
    and _step5_durations is not None
    and len(_step5_sentences) == len(_step5_durations)
):
    valid_sentences = list(_step5_sentences)
    audio_durations = list(_step5_durations)
else:
    print("audio_durations not found or does not match sentence count, trying to load durations from audio files...")
    # Read the main text and split it into sentences
    if os.path.exists('config.yaml'):
        with open('config.yaml', encoding='utf-8') as f:
            # safe_load returns None for an empty file; fall back to {} so .get() keeps working.
            config = yaml.safe_load(f) or {}
    else:
        config = {}

    text_file = config.get('text', 'text.txt')
    if os.path.exists(text_file):
        with open(text_file, 'r', encoding='utf-8') as f:
            raw_text = f.read().strip()
    else:
        raw_text = ""
    from snownlp import SnowNLP
    sentences = [s.strip() for s in SnowNLP(raw_text).sentences if s.strip()] if raw_text else []

    # NOTE(review): this fallback reads <voice_dir>/voice_0001.mp3-style files and SnowNLP
    # sentences, whereas Steps 5 and 6 use voice_{i}.mp3 in the working directory and
    # newline-split text. Confirm which layout is intended before relying on this path.
    voice_dir = config.get('voice_dir', 'voice')
    try:
        from pydub import AudioSegment
    except ImportError:
        raise ImportError("需要安裝pydub，請執行 pip install pydub")
    valid_sentences = []
    audio_durations = []
    for idx, sentence in enumerate(sentences):
        audio_file = os.path.join(voice_dir, f'voice_{idx+1:04d}.mp3')
        if os.path.exists(audio_file):
            audio = AudioSegment.from_file(audio_file)
            duration_sec = len(audio) / 1000.0  # len() of an AudioSegment is in milliseconds
            valid_sentences.append(sentence)
            audio_durations.append(duration_sec)

# ========== SRT generation ==========
total_subtitles = len(valid_sentences)
print(f"Total subtitles: {total_subtitles}")

srt_lines = []
yaml_data = []

# Each subtitle starts where the previous one ended, matching clips played back to back.
start_time = 0.0
for idx, (sentence, duration) in enumerate(zip(valid_sentences, audio_durations)):
    end_time = start_time + duration
    start_stamp = format_srt_timestamp(start_time)
    end_stamp = format_srt_timestamp(end_time)
    # SRT entry: index, "HH:MM:SS,mmm --> HH:MM:SS,mmm", text
    srt_lines.append(f"{idx+1}\n{start_stamp} --> {end_stamp}\n{sentence}\n")
    print(f"{idx+1}: {start_stamp} --> {end_stamp} | {sentence}")
    # YAML entry keeps plain seconds (two decimals) for scene planning
    yaml_data.append({
        'index': idx+1,
        'start': float(f"{start_time:.2f}"),
        'end': float(f"{end_time:.2f}"),
        'text': sentence
    })
    start_time = end_time

with open('subtitle.srt', 'w', encoding='utf-8') as f:
    f.write('\n'.join(srt_lines))
print("SRT file saved as subtitle.srt.")

with open('subtitle.yaml', 'w', encoding='utf-8') as f:
    yaml.safe_dump({'subtitles': yaml_data}, f, allow_unicode=True)
print("YAML file saved as subtitle.yaml for scene planning.")

print("STEP 8 Complete! Subtitles generated for both video editing and scene planning.")

In [None]:
# STEP 8.5: Scene Visual Plan Editor (modular code, notebook cell version with image preview fix)
# - Semantic scene split (GPT-3.5)
# - DALL·E 3 AI background generation
# - Interactive storyboard editor for sentences
# - Manual/AI background selection
# - Video/audio attachment
# - Save full visual plan to YAML

import os
import openai
from PIL import Image
from io import BytesIO
import requests
import ipywidgets as widgets
import yaml
from dotenv import load_dotenv
import subprocess
import re
import functools
from IPython.display import display, Image as IPyImage

DEFAULT_MAX_SCENES = 4

def step8_5_scene_editor(
    valid_sentences,
    valid_audio_files,
    subdir,
    main_dir,
    config,
    ammunition_dir=None,
    max_scene=DEFAULT_MAX_SCENES
):
    """Display the interactive storyboard ("visual plan") editor for STEP 8.5.

    The editor asks gpt-3.5-turbo to split the subtitles into scenes, lets the
    user generate a DALL·E 3 background or pick an existing image per scene,
    lets every sentence override its background / video clip, and writes the
    result to ``visual_plan.yaml`` inside ``subdir`` when the save button is
    clicked.

    Args:
        valid_sentences: List of subtitle sentences, one storyboard row each.
        valid_audio_files: Audio file paths indexed like ``valid_sentences``;
            may be shorter, in which case the missing rows have no audio.
        subdir: Directory where AI images and ``visual_plan.yaml`` are written.
        main_dir: Project root; only used to derive the default
            ``ammunition_dir``.
        config: Project configuration dict (read for the project title).
        ammunition_dir: Directory holding candidate images, videos and music.
            Defaults to ``<main_dir>/ammunition``.
        max_scene: Maximum number of scenes requested from the model and kept
            from its answer.

    Returns:
        None. Results are delivered through widgets, printed logs and the
        saved YAML file. Any exception is caught, logged with a traceback and
        swallowed so the notebook cell itself does not fail.
    """
    print("▇▇▇▇ 步驟 8.5 分鏡表編輯器呼叫 ▇▇▇▇")
    print("[Debug] valid_sentences[:2]:", valid_sentences[:2] if isinstance(valid_sentences, list) else valid_sentences)
    print("[Debug] valid_audio_files[:2]:", valid_audio_files[:2] if isinstance(valid_audio_files, list) else valid_audio_files)
    print("[Debug] subdir:", subdir)
    print("[Debug] main_dir:", main_dir)
    print("[Debug] config:", config)
    print("[Debug] ammunition_dir:", ammunition_dir)
    print("[Debug] max_scene:", max_scene)

    try:
        load_dotenv()
        openai_api_key = os.getenv("OPENAI_API_KEY")
        # Never echo the key itself: notebook outputs get saved and shared.
        print("[Debug] OPENAI_API_KEY set:", bool(openai_api_key))
        openai.api_key = openai_api_key
        if not openai_api_key or not openai_api_key.startswith("sk-"):
            print("[ERROR] OPENAI_API_KEY is not set or invalid!")
            raise RuntimeError("OPENAI_API_KEY is not set or invalid. Please check .env or environment variables!")
        if not ammunition_dir:
            ammunition_dir = os.path.join(main_dir, "ammunition")

        print("STEP 8.5: Scene Visual Plan Editor")
        print("專案名稱:", config.get("project_title", config.get("title", "未設定")))
        print("Current subdir:", subdir)
        print("目前工作目錄：", os.getcwd())

        def get_duration(path, verbose=False):
            """Return a media file's duration in seconds via ffprobe, or None if unavailable."""
            if verbose:
                print(f"[Debug] get_duration called for path: {path}")
            if not path or not os.path.exists(path):
                if verbose:
                    print(f"[Debug] 路徑不存在: {path}")
                return None
            try:
                result = subprocess.run([
                    "ffprobe", "-v", "error", "-show_entries",
                    "format=duration", "-of", "default=noprint_wrappers=1:nokey=1", path
                ], capture_output=True)
                dur = float(result.stdout.decode().strip())
            except (OSError, ValueError) as e:
                # OSError: ffprobe missing or not executable.
                # ValueError: ffprobe printed nothing numeric (unreadable file).
                print(f"[ERROR] get_duration fail: {e}")
                return None
            if verbose:
                print(f"[Debug] 讀取到 duration: {dur}")
            return dur

        # --- 1. Semantic scene split ---
        def semantic_scene_split(subtitles, max_scene=4):
            """Ask the chat model for scene boundaries; return [(start, end)] half-open, 0-based."""
            print("[Debug] semantic_scene_split subtitles[:2]:", subtitles[:2])
            prompt = (
                f"請將以下字幕依語意分割為不超過{max_scene}個場景，每個場景列出起迄句子編號（如：1-5，6-8），只需輸出每個場景的起迄範圍，不需解釋：\n"
                + "\n".join([f"{i+1}. {s}" for i, s in enumerate(subtitles)])
            )
            print("[Debug] GPT Prompt:", prompt[:80], "...")
            try:
                resp = openai.chat.completions.create(
                    model="gpt-3.5-turbo",
                    messages=[{"role": "user", "content": prompt}]
                )
                content = resp.choices[0].message.content or ""
                print("[Debug] GPT分鏡分割 response:", content)
            except Exception as e:
                print("[ERROR] GPT 分鏡分割失敗:", type(e), e)
                content = ""
            ranges = []
            # Scan the whole reply instead of only line starts: the prompt's own
            # example ("1-5，6-8") puts several ranges on a single line.
            for m in re.finditer(r"(\d+)\s*[-–—~～]\s*(\d+)", content):
                # Clamp to the real sentence count so later indexing cannot overflow.
                start = max(int(m.group(1)) - 1, 0)
                end = min(int(m.group(2)), len(subtitles))
                if start < end:
                    ranges.append((start, end))
            if not ranges:
                print("[WARNING] GPT未能分割分鏡，使用預設範圍")
                ranges = [(0, len(subtitles))]
            return ranges[:max_scene]

        scene_ranges = semantic_scene_split(valid_sentences, max_scene)
        print("[Debug] scene_ranges:", scene_ranges)
        scene_images = [None] * len(scene_ranges)
        scene_img_names = [None] * len(scene_ranges)

        # --- 2. UI and file lists ---
        try:
            all_files = os.listdir(ammunition_dir)
            print("[Debug] all_files:", all_files)
        except Exception as e:
            print(f"[ERROR] Failed to list ammunition_dir: {e}")
            all_files = []

        image_files = [f for f in all_files if f.lower().endswith(('.jpg', '.png'))]
        video_files = [f for f in all_files if f.lower().endswith(('.mp4', '.mov', '.avi'))]
        music_files = [f for f in all_files if f.lower().endswith(('.mp3', '.wav'))]
        print("[Debug] image_files:", image_files)
        print("[Debug] video_files:", video_files)
        print("[Debug] music_files:", music_files)

        video_info = [(vf, get_duration(os.path.join(ammunition_dir, vf))) for vf in video_files]
        # Dropdown labels carry the duration; video_map translates a label back to the file name.
        video_options = ["(無)"] + [f"{fn} ({d:.2f}s)" if d else fn for fn, d in video_info]
        video_map = {f"{fn} ({d:.2f}s)": fn for fn, d in video_info if d}
        print("[Debug] video_options:", video_options)
        print("[Debug] video_map:", video_map)

        audio_durations = []
        for i in range(len(valid_sentences)):
            af = valid_audio_files[i] if i < len(valid_audio_files) else None
            audio_durations.append(get_duration(af) if af else None)
        accum_durations = []
        running_total = 0.0
        for d in audio_durations:
            running_total += d if d else 0
            accum_durations.append(running_total)
        print("[Debug] audio_durations:", audio_durations)
        print("[Debug] accum_durations:", accum_durations)

        # --- 3. UI for sentences ---
        sentence_labels = []
        bg_selectors = []
        vid_selectors = []
        bg_preview_btns = []

        for i, sentence in enumerate(valid_sentences):
            aud_dur = audio_durations[i] if i < len(audio_durations) and audio_durations[i] else 0
            accum_dur = accum_durations[i] if i < len(accum_durations) else 0
            label_txt = f"{i+1}. {sentence[:40]} (audio: {aud_dur:.2f}s, total: {accum_dur:.2f}s)"
            sentence_label = widgets.Label(label_txt)
            bg_default = "(無)"
            bg_dd_options = ["(無)"] + image_files
            bg_dd = widgets.Dropdown(options=bg_dd_options, value=bg_default, description="")
            vid_dd = widgets.Dropdown(options=video_options, value="(無)", description="")
            sentence_labels.append(sentence_label)
            bg_selectors.append(bg_dd)
            vid_selectors.append(vid_dd)
            # Preview button
            img_preview_widget = widgets.Output()
            def preview_fun(img_dd, img_preview_widget):
                """Bind this row's dropdown and output area into a click handler."""
                def show(_):
                    img_name = img_dd.value
                    if img_name != "(無)":
                        # AI images live in subdir, user assets in ammunition_dir.
                        img_path = os.path.join(subdir, img_name)
                        if not os.path.exists(img_path):
                            img_path = os.path.join(ammunition_dir, img_name)
                        if os.path.exists(img_path):
                            img_preview_widget.clear_output()
                            with img_preview_widget:
                                display(IPyImage(filename=img_path))
                return show
            preview_btn = widgets.Button(description="預覽背景")
            preview_btn.on_click(preview_fun(bg_dd, img_preview_widget))
            bg_preview_btns.append(widgets.VBox([preview_btn, img_preview_widget]))

        def assign_background(start, end, img_name):
            """Select img_name on sentence rows start..end-1, adding it as an option when missing.

            A Dropdown rejects values that are not among its options, and freshly
            generated AI images are not part of the initial ammunition listing.
            """
            for row_dd in bg_selectors[max(start, 0):end]:
                if img_name not in row_dd.options:
                    row_dd.options = tuple(row_dd.options) + (img_name,)
                row_dd.value = img_name

        col1_title = widgets.HTML("<b>句子 / 時間</b>")
        col2_title = widgets.HTML("<b>背景圖片</b>")
        col3_title = widgets.HTML("<b>背景預覽</b>")
        col4_title = widgets.HTML("<b>影片剪輯</b>")

        sentence_labels_col = widgets.VBox([col1_title] + sentence_labels)
        bg_selectors_col = widgets.VBox([col2_title] + bg_selectors)
        bg_preview_col  = widgets.VBox([col3_title] + bg_preview_btns)
        vid_selectors_col = widgets.VBox([col4_title] + vid_selectors)
        table_ui = widgets.HBox([sentence_labels_col, bg_selectors_col, bg_preview_col, vid_selectors_col])

        instructions = widgets.HTML("""
        <b>分鏡表編輯器（語意分場景AI背景/手動選檔 + 分句細緻分鏡表 + 預覽功能）</b><br>
        上方：場景分段字幕、AI背景生成/手動選檔（產生/選擇後自動分配背景，分鏡表即時更新）<br>
        下方：分句分鏡表可手動調整背景、影片等（預設已依場景分配背景）<br>
        儲存時背景空白自動沿用上一句設定。
        """)
        print("[Debug] UI 準備顯示")
        display(instructions, table_ui)

        # --- 4. Scene UI, AI image generation and manual selection ---
        def sanitize_title(title):
            """Reduce a sentence to at most 10 CJK/alphanumeric characters for use in a file name."""
            return re.sub(r"[^\u4e00-\u9fa5A-Za-z0-9]", "", title)[:10] or "scene"

        def dalle3_generate(prompt, idx, seg_sentences, size="1792x1024"):
            """Generate a scene image with DALL·E 3, save it as a 1920x1080 JPEG in subdir, return its path."""
            scene_title = sanitize_title(seg_sentences[0]) if seg_sentences else f"scene{idx+1}"
            fname = os.path.join(subdir, f"ai_scene_{idx+1}_{scene_title}.jpg")
            print(f"[Debug] 呼叫DALL·E 生成場景{idx+1}: {scene_title}")
            try:
                response = openai.images.generate(
                    model="dall-e-3",
                    prompt=prompt,
                    size=size,
                    n=1,
                    quality="standard"
                )
                url = response.data[0].url
                print("[Debug] DALL·E response url:", url)
                # Bound the download and fail loudly on HTTP errors rather than
                # handing an error page to the image decoder.
                download = requests.get(url, timeout=60)
                download.raise_for_status()
                # JPEG cannot store alpha/palette modes, so normalise to RGB first.
                img = Image.open(BytesIO(download.content)).convert("RGB")
                img_resized = img.resize((1920, 1080), resample=Image.LANCZOS)
                img_resized.save(fname, format="JPEG")
                print(f"[Debug] AI場景圖片已儲存：{fname}")
                return fname
            except Exception as e:
                print("[ERROR] DALL·E圖片生成失敗:", e)
                raise

        scene_widgets = []
        for idx, (start, end) in enumerate(scene_ranges):
            seg_sentences = valid_sentences[start:end]
            seg_text = "\n".join([f"{i+1}. {s}" for i, s in enumerate(seg_sentences)])
            prompt = f"Minimalist, modern, unique illustration for a YouTube thumbnail. Scene {idx+1}: {seg_text}"

            label = widgets.HTML(f"<b>場景{idx+1} 字幕:</b><br><pre>{seg_text}</pre>")
            img_preview = widgets.Output()
            img_path_label = widgets.Label("尚未選取圖片")
            select_img_dd = widgets.Dropdown(options=["(請選擇)"] + image_files, value="(請選擇)", description="")

            # Every per-scene object is bound through a default argument; a plain
            # closure would see only the last iteration's widgets (this is what
            # previously made all scenes write to the final scene's status label).
            def on_gen_clicked(b, idx=idx, prompt=prompt, seg_sentences=seg_sentences, start=start, end=end,
                               img_preview=img_preview, img_path_label=img_path_label):
                """Generate an AI background for this scene and apply it to the scene's sentences."""
                print(f"[Debug] 產生 AI 背景 (idx={idx})")
                try:
                    fname = dalle3_generate(prompt, idx, seg_sentences)
                    scene_images[idx] = fname
                    scene_img_names[idx] = os.path.basename(fname)
                    img_preview.clear_output()
                    img_path_label.value = f"AI生成：{os.path.basename(fname)}"
                    with img_preview:
                        display(IPyImage(filename=fname))
                    assign_background(start, end, os.path.basename(fname))
                except Exception as e:
                    img_preview.clear_output()
                    img_path_label.value = f"生成失敗: {e}"
                    print(f"[ERROR] 產生AI圖片失敗 idx={idx}: {e}")

            def on_select_img_change(change, idx=idx, start=start, end=end,
                                     img_preview=img_preview, img_path_label=img_path_label):
                """Apply a manually chosen ammunition image to this scene's sentences."""
                print(f"[Debug] 選擇手動背景 idx={idx}")
                if change["name"] == "value" and change["new"] != "(請選擇)":
                    chosen = change["new"]
                    img_path = os.path.join(ammunition_dir, chosen)
                    scene_images[idx] = img_path
                    scene_img_names[idx] = chosen
                    img_preview.clear_output()
                    img_path_label.value = f"使用檔案：{chosen}"
                    if os.path.exists(img_path):
                        with img_preview:
                            display(IPyImage(filename=img_path))
                    assign_background(start, end, chosen)

            select_img_dd.observe(on_select_img_change, names="value")
            btn_gen = widgets.Button(description="重新產生", button_style="danger")
            btn_gen.on_click(on_gen_clicked)

            seg_box = widgets.VBox([
                label,
                widgets.HBox([widgets.Label("沿用圖片："), select_img_dd]),
                btn_gen,
                img_path_label,
                img_preview
            ])
            scene_widgets.append(seg_box)

        scene_ui = widgets.VBox(scene_widgets)
        print("[Debug] 分鏡場景UI顯示")
        display(scene_ui)

        # --- 5. Save visual plan ---
        save_btn = widgets.Button(description="儲存分鏡表", button_style="success")
        output = widgets.Output()

        def save_plan(b):
            """Collect the current widget state and write it to visual_plan.yaml in subdir."""
            print("[Debug] 儲存分鏡表開始")
            with output:
                output.clear_output()
                visual_plan = []
                warnings = []
                bg_list = []
                for i, bg_dd in enumerate(bg_selectors):
                    val = bg_dd.value
                    if val and val != "(無)":
                        # Resolve to a full path: subdir first (AI images), then ammunition_dir.
                        f1 = os.path.join(subdir, val)
                        f2 = os.path.join(ammunition_dir, val)
                        bg_path = f1 if os.path.exists(f1) else f2 if os.path.exists(f2) else val
                        bg_list.append(bg_path)
                    else:
                        bg_list.append(None)
                # Forward-fill: a row without a background inherits the previous row's.
                last_bg = None
                for i in range(len(bg_list)):
                    if bg_list[i] is None and last_bg is not None:
                        bg_list[i] = last_bg
                    elif bg_list[i] is not None:
                        last_bg = bg_list[i]
                for i, sentence in enumerate(valid_sentences):
                    bg = bg_list[i]
                    vid_label = vid_selectors[i].value
                    vid = None
                    vid_dur = None
                    if vid_label != "(無)":
                        # Labels without a probed duration are the bare file name.
                        vid = video_map.get(vid_label, vid_label.split(" (")[0])
                        vid_dur = get_duration(os.path.join(ammunition_dir, vid), verbose=True)
                    # valid_audio_files may be shorter than valid_sentences.
                    audio_fname = valid_audio_files[i] if i < len(valid_audio_files) else None
                    audio_dur = audio_durations[i] if i < len(audio_durations) and audio_durations[i] else 0
                    accum_dur = accum_durations[i] if i < len(accum_durations) else 0
                    if vid and vid_dur and audio_dur and abs(vid_dur - audio_dur) > 1.0:
                        warnings.append(f"⚠️ 句{i+1}影片({vid_dur:.2f}s)與語音({audio_dur:.2f}s)時長不符。")
                    visual_plan.append({
                        "sentence_idx": i,
                        "sentence": sentence,
                        "audio_file": audio_fname,
                        "audio_duration": audio_dur,
                        "accum_duration": accum_dur,
                        "background": bg,
                        "video_clip": vid,
                        "video_duration": vid_dur
                    })
                visual_plan_path = os.path.join(subdir, "visual_plan.yaml")
                with open(visual_plan_path, "w", encoding="utf-8") as f:
                    yaml.safe_dump(visual_plan, f, allow_unicode=True)
                print(f"分鏡表已儲存至 {visual_plan_path}")
                if warnings:
                    print("\n".join(warnings))
                print("前3項：")
                for vp in visual_plan[:3]:
                    print(vp)

        save_btn.on_click(save_plan)
        print("[Debug] 儲存分鏡表按鈕顯示")
        display(save_btn, output)
        print("="*40)
        print(f"專案名稱: {config.get('project_title', config.get('title', '未設定'))}")
        print(f"Current subdir: {subdir}")
        print(f"目前工作目錄：{os.getcwd()}")
        print("="*40)

    except Exception as e:
        print("▇▇▇▇ [ERROR/FATAL] step8_5_scene_editor Exception:", type(e), e)
        import traceback; traceback.print_exc()

In [None]:
# STEP 9: Mix voice.mp3 and BGM into voice_bgm.mp3
from IPython.display import clear_output; clear_output(wait=True)
print("STEP 9: Mixing voice.mp3 and BGM into voice_bgm.mp3...")

import os
import yaml
import subprocess

# Reload config for BGM info. An empty config.yaml parses to None, so fall back to {}.
config = {}
if os.path.exists('config.yaml'):
    with open('config.yaml', encoding='utf-8') as f:
        config = yaml.safe_load(f) or {}

subdir = os.getcwd()
voice_path = os.path.join(subdir, "voice.mp3")
bgm_path = os.path.join(subdir, config.get('bgm', 'bgm.mp3'))
output_path = os.path.join(subdir, "voice_bgm.mp3")
bgm_volume = config.get('bgm_volume', 0.3)


def _mix_voice_and_bgm():
    """Mix the voice track with the volume-scaled BGM via ffmpeg; return True on success."""
    cmd = [
        "ffmpeg", "-y",
        "-i", voice_path,
        "-i", bgm_path,
        # Scale the BGM, then mix; duration=first trims the result to the voice track.
        "-filter_complex", f"[1:a]volume={bgm_volume}[bgm];[0:a][bgm]amix=inputs=2:duration=first:dropout_transition=2",
        "-c:a", "mp3", output_path
    ]
    print("執行混音指令：", " ".join(cmd))
    try:
        result = subprocess.run(cmd, cwd=subdir)
    except OSError as e:
        # Raised when the ffmpeg executable cannot be found or started.
        print(f"[ERROR] 無法執行 ffmpeg: {e}")
        return False
    if result.returncode != 0:
        print(f"[ERROR] ffmpeg 混音失敗 (exit code {result.returncode})")
        return False
    return True


if not os.path.exists(voice_path):
    print(f"[ERROR] 找不到語音檔: {voice_path}")
elif not os.path.exists(bgm_path):
    print(f"[ERROR] 找不到背景音樂檔: {bgm_path}")
else:
    should_mix = True
    if os.path.exists(output_path):
        overwrite = input(f"{output_path} 已存在，要覆蓋嗎？(y/n): ")
        should_mix = overwrite.strip().lower() == 'y'
        if should_mix:
            print("重新混音 voice.mp3 和 BGM。")
        else:
            print("跳過混音，保留舊檔案。")
    else:
        print("開始混音 voice.mp3 和 BGM。")
    # Report completion only when the step was skipped on purpose or ffmpeg succeeded.
    if not should_mix or _mix_voice_and_bgm():
        print("STEP 9 Complete!")

In [None]:
# STEP 10: Generate final video (final.mp4) with background image, mixed audio, and subtitles
from IPython.display import clear_output; clear_output(wait=True)
print("STEP 10: Generating final video (final.mp4) using background, voice_bgm.mp3, and subtitle.srt...")

import os
import yaml
import subprocess

# Reload config. An empty config.yaml parses to None, so fall back to {}.
config = {}
if os.path.exists('config.yaml'):
    with open('config.yaml', encoding='utf-8') as f:
        config = yaml.safe_load(f) or {}

subdir = os.getcwd()
bg_image = os.path.join(subdir, config.get('background', 'background.jpg'))
audio_file = os.path.join(subdir, "voice_bgm.mp3")
subtitle_file = os.path.join(subdir, "subtitle.srt")
video_resolution = config.get('video_resolution', '1920,1080')
output_file = os.path.join(subdir, "final.mp4")

# ffmpeg separates filters with commas, so "scale=1920,1080" is parsed as the
# two filters "scale=1920" and "1080" and fails. Normalise "W,H" / "WxH" (or a
# YAML list) to the "W:H" form the scale filter expects.
if isinstance(video_resolution, (list, tuple)):
    scale_size = ":".join(str(v) for v in video_resolution)
else:
    scale_size = str(video_resolution).replace(' ', '').replace(',', ':').replace('x', ':')


def _render_final_video():
    """Encode the looped background image + mixed audio + burned-in subtitles; return True on success."""
    cmd = [
        "ffmpeg", "-y",
        "-loop", "1",
        "-i", bg_image,
        "-i", audio_file,
        # ffmpeg runs with cwd=subdir, so the bare file name resolves to
        # subtitle_file while avoiding filter-argument escaping of the ":" and
        # "\" found in absolute Windows paths.
        "-vf", f"subtitles={os.path.basename(subtitle_file)},scale={scale_size}",
        "-c:v", "libx264", "-tune", "stillimage",
        "-c:a", "aac",
        "-b:a", "192k",
        "-pix_fmt", "yuv420p",
        # The image loops forever; stop when the audio ends.
        "-shortest",
        output_file
    ]
    print("執行影片產生指令：", " ".join(cmd))
    try:
        result = subprocess.run(cmd, cwd=subdir)
    except OSError as e:
        # Raised when the ffmpeg executable cannot be found or started.
        print(f"[ERROR] 無法執行 ffmpeg: {e}")
        return False
    if result.returncode != 0:
        print(f"[ERROR] ffmpeg 影片產生失敗 (exit code {result.returncode})")
        return False
    print(f"影片檔已產生: {output_file}")
    return True


if not os.path.exists(bg_image):
    print(f"[ERROR] 找不到背景圖片: {bg_image}")
elif not os.path.exists(audio_file):
    print(f"[ERROR] 找不到混音語音檔: {audio_file}")
elif not os.path.exists(subtitle_file):
    print(f"[ERROR] 找不到字幕檔: {subtitle_file}")
else:
    should_render = True
    if os.path.exists(output_file):
        overwrite = input(f"{output_file} 已存在，要覆蓋嗎？(y/n): ")
        should_render = overwrite.strip().lower() == 'y'
        if should_render:
            print("重新產生影片。")
        else:
            print("跳過影片產生，保留舊檔案。")
    else:
        print("開始產生影片。")
    # Report completion only when the step was skipped on purpose or ffmpeg succeeded.
    if not should_render or _render_final_video():
        print("STEP 10 Complete!")

In [None]:
# STEP 11: Generate YouTube Thumbnail (thumbnail.jpg)
from IPython.display import clear_output; clear_output(wait=True)
print("STEP 11: Creating YouTube thumbnail image (thumbnail.jpg)...")

import os
import yaml
from PIL import Image, ImageDraw, ImageFont

# Reload config. An empty config.yaml parses to None, so fall back to {}.
config = {}
if os.path.exists('config.yaml'):
    with open('config.yaml', encoding='utf-8') as f:
        config = yaml.safe_load(f) or {}

subdir = os.getcwd()
thumbnail_path = os.path.join(subdir, config.get('thumbnail', 'thumbnail.jpg'))
background_path = os.path.join(subdir, config.get('background', 'background.jpg'))
project_title = config.get('project_title', '新專案')
subtitle_font = os.path.join(subdir, config.get('subtitle_font', 'NotoSansCJKtc-Regular.otf'))
subtitle_fontsize = config.get('subtitle_fontsize', 60)


def _render_thumbnail():
    """Draw the project title, horizontally centred, onto the background and save it.

    Returns True on success; on any failure the error is printed and False is returned.
    """
    try:
        # Close the source file promptly; convert() returns an independent image.
        with Image.open(background_path) as src:
            base_img = src.convert("RGB")
        draw = ImageDraw.Draw(base_img)
        font = ImageFont.truetype(subtitle_font, subtitle_fontsize)
        w, h = base_img.size
        text = str(project_title)
        # ImageDraw.textsize was removed in Pillow 10; textbbox is its replacement.
        # Keep the old call as a fallback for very old Pillow versions.
        if hasattr(draw, "textbbox"):
            left, _, right, _ = draw.textbbox((0, 0), text, font=font)
            text_w = right - left
        else:
            text_w, _ = draw.textsize(text, font=font)
        draw.text(((w - text_w) / 2, h * 0.85), text, font=font, fill="white")
        base_img.save(thumbnail_path)
        print(f"縮圖已產生: {thumbnail_path}")
        return True
    except Exception as e:
        print(f"[ERROR] Failed to create thumbnail: {e}")
        return False


should_render = True
if os.path.exists(thumbnail_path):
    overwrite = input(f"{thumbnail_path} 已存在，要覆蓋嗎？(y/n): ")
    should_render = overwrite.strip().lower() == 'y'
    if should_render:
        print("重新產生縮圖。")
    else:
        print("跳過縮圖產生，保留舊檔案。")
else:
    print("開始產生縮圖。")
# Report completion only when the step was skipped on purpose or rendering succeeded.
if not should_render or _render_thumbnail():
    print("STEP 11 Complete!")

In [None]:
# STEP 12: Prepare YouTube Export Info (title, description, files)
from IPython.display import clear_output; clear_output(wait=True)
print("STEP 12: Preparing YouTube export info (title, description, files)...")

import os
import yaml

# Reload config. An empty config.yaml parses to None, so fall back to {}.
config = {}
if os.path.exists('config.yaml'):
    with open('config.yaml', encoding='utf-8') as f:
        config = yaml.safe_load(f) or {}

subdir = os.getcwd()
project_title = config.get('project_title', '新專案')
# A key present without a value (e.g. "attribution:") loads as None; normalise
# to "" so the description does not show "None" and str.join does not fail.
project_author = config.get('project_author') or ''
attribution = str(config.get('attribution') or '')
thumbnail_path = os.path.join(subdir, config.get('thumbnail', 'thumbnail.jpg'))
video_path = os.path.join(subdir, "final.mp4")
description_path = os.path.join(subdir, "youtube_description.txt")

# Build YouTube description text
description_lines = [
    f"標題: {project_title}",
    f"作者: {project_author}",
    "",
    "說明：",
    attribution,
    "",
    "（自動生成，請在上傳前自行補充/修改）"
]
description_text = '\n'.join(description_lines)

with open(description_path, "w", encoding="utf-8") as f:
    f.write(description_text)
print(f"已產生 YouTube 說明檔: {description_path}")

# Tell the user which artefacts to upload.
print("請用以下檔案進行 YouTube 上傳：")
print(f"影片檔: {video_path}")
print(f"縮圖檔: {thumbnail_path}")
print(f"說明檔: {description_path}")

print("STEP 12 Complete! YouTube export info is ready.")

# TTS Video Project Pipeline
This notebook covers every step of the pipeline: text input, TTS voice synthesis, subtitle generation, audio/video concatenation, image processing, and final export for YouTube.

In [None]:
!pip install azure-cognitiveservices-speech pysubs2 pillow snownlp python-dotenv tqdm pyyaml

In [None]:
# Cell 1: STEP 0 project data / initialization
print("STEP 0: 初始化與專案資料")
import os
import yaml

main_dir = os.getcwd()
config_path = os.path.join(main_dir, "config.yaml")
# Tolerate a missing config.yaml the same way the later steps do, and treat an
# empty file (which parses to None) as an empty configuration.
config = {}
if os.path.exists(config_path):
    with open(config_path, "r", encoding="utf-8") as f:
        config = yaml.safe_load(f) or {}
else:
    print(f"[WARNING] 找不到 {config_path}，使用預設設定。")


def _check_workspace_asset(label, config_key, default_name):
    """Report whether a configured asset exists in the main workspace; return its path."""
    asset_name = config.get(config_key, default_name)
    asset_path = os.path.join(main_dir, asset_name)
    if not os.path.exists(asset_path):
        print(f"請將{label} {asset_name} 放到主工作區！")
    else:
        print(f"{label}已在主工作區：{asset_path}")
    return asset_path


# Check the background image
bg_img = _check_workspace_asset("背景圖", "background", "background.jpg")

# Check the background music
bgm_file = _check_workspace_asset("背景音樂", "bgm", "bgm.mp3")

# Other initialization
print("主工作區：", main_dir)

In [None]:
# Cell 2: STEP 1 TTS voice synthesis
print("STEP 1: TTS語音合成")
# Assumes `sentences` has been prepared
import glob

sentences = [...]  # Adjust to match your own data source
tts_files = []
for clip_no in range(len(sentences)):
    clip_name = f"voice_{clip_no}.mp3"
    clip_path = os.path.join(main_dir, clip_name)
    if os.path.exists(clip_path):
        print(f"語音檔已存在：{clip_name}")
    else:
        # Plug in your own TTS synthesis routine here
        print(f"合成語音：{clip_name}")
        # tts_synthesize(sentences[clip_no], clip_path)
    # The expected path is recorded whether or not the clip exists yet.
    tts_files.append(clip_path)
print("TTS語音合成完畢，共產生", len(tts_files), "個語音檔")

In [None]:
import os
import yaml
from dotenv import load_dotenv

# Set main working directory.
# The original author's folder stays the default, but it can be overridden with
# the TTS_PROJECT_DIR environment variable; when the folder does not exist
# (e.g. on another machine) fall back to the current directory instead of
# failing in os.chdir.
main_dir = os.environ.get("TTS_PROJECT_DIR", r"C:\Users\flyre\mynotebooks")
if os.path.isdir(main_dir):
    os.chdir(main_dir)
else:
    print(f"[WARNING] {main_dir} 不存在，改用目前工作目錄。")
    main_dir = os.getcwd()
subdir = main_dir

# Load environment variables
load_dotenv()
AZURE_SPEECH_KEY = os.environ.get("AZURE_SPEECH_KEY")
AZURE_SPEECH_REGION = os.environ.get("AZURE_SPEECH_REGION")
MINIMAX_SPEECH_KEY = os.environ.get("MINIMAX_SPEECH_KEY")
if not AZURE_SPEECH_KEY or not AZURE_SPEECH_REGION or not MINIMAX_SPEECH_KEY:
    raise ValueError("Missing AZURE_SPEECH_KEY or AZURE_SPEECH_REGION or MINIMAX_SPEECH_KEY in .env file!")

# Load config.yaml if it exists, otherwise use empty config.
# An empty file parses to None, so fall back to {} in that case too.
config = {}
if os.path.exists('config.yaml'):
    with open('config.yaml', encoding='utf-8') as f:
        config = yaml.safe_load(f) or {}

print("="*40)
print(f"專案名稱: {config.get('project_title', config.get('title', '未設定'))}")
print(f"Current subdir: {subdir}")
print(f"目前工作目錄：{os.getcwd()}")
print("="*40)

In [None]:
# cell 3
print("專案名稱:", config.get("project_title", config.get("title", "未設定")))
print("Current subdir:", subdir)
print("目前工作目錄：", os.getcwd())

import ipywidgets as widgets
from IPython.display import display, clear_output, Audio
import yaml
import os
from dotenv import load_dotenv
import requests
import tempfile

load_dotenv()

label_style = {'description_width': '120px'}
wide_layout = widgets.Layout(width='1100px')
wide_textarea_layout = widgets.Layout(width='1100px', height='250px')
wide_preview_textarea_layout = widgets.Layout(width='1100px', height='125px')

# Azure voices
azure_voice_options = [
    'zh-TW-YunJheNeural', 'zh-TW-HsiaoChenNeural', 'zh-CN-YunxiNeural',
    'en-US-JennyNeural', 'en-US-AriaNeural', 'en-US-GuyNeural',
    'ja-JP-NanamiNeural', 'ja-JP-KeitaNeural'
]

# MINIMAX accents and voices
minimax_accent_options = [
    ('中文-普通話', 'Chinese (Mandarin)'),
    ('英文', 'English'),
    ('日文', 'Japanese')
]
minimax_voice_dict = {
    'Chinese (Mandarin)': [
        'Chinese (Mandarin)_Warm_Bestie',
        'Chinese (Mandarin)_Lyrical_Voice',
        'Chinese (Mandarin)_Bright_Light',
        'Chinese (Mandarin)_Sweet_Girl',
        'Chinese (Mandarin)_Young_Man',
        'Chinese (Mandarin)_Deep_Dad',
        'Chinese (Mandarin)_Calm_Mom',
        'Chinese (Mandarin)_Narrator_Professional',
        'Chinese (Mandarin)_Narrator_Soft',
        'Chinese (Mandarin)_Child_Boy',
        'Chinese (Mandarin)_Child_Girl',
        'Chinese (Mandarin)_Elder_Grandpa',
        'Chinese (Mandarin)_Elder_Grandma',
        'Chinese (Mandarin)_News_Anchor',
        'Chinese (Mandarin)_Cartoon_Bear',
        'Chinese (Mandarin)_Cartoon_Duck',
        'Chinese (Mandarin)_Cartoon_Fox',
        'Chinese (Mandarin)_Cartoon_Robot'
    ],
    'English': [
        'English_Graceful_Lady',
        'English_Persuasive_Man'
    ],
    'Japanese': [
        'Japanese_Whisper_Belle'
    ]
}

# Input widgets
input_text_widget = widgets.Textarea(
    value='', description='文字內容:', style=label_style, layout=wide_textarea_layout
)
preview_text_widget = widgets.Textarea(
    value='', description='試聽文字:', style=label_style, layout=wide_preview_textarea_layout
)
preview_button = widgets.Button(description='語音試聽', button_style='info', layout=widgets.Layout(width='350px', height='60px'))
audio_output = widgets.Output(layout=wide_layout)
save_button = widgets.Button(description='確定儲存', button_style='success', layout=widgets.Layout(width='350px', height='60px'))
output = widgets.Output(layout=wide_layout)

tts_server_widget = widgets.Dropdown(
    options=[('Azure', 'azure'), ('MINIMAX', 'minimax')],
    value='azure', description='TTS伺服器:', style=label_style, layout=wide_layout
)

# Azure widgets
azure_voice_widget = widgets.Dropdown(
    options=azure_voice_options, value=azure_voice_options[0],
    description='Azure語音:', style=label_style, layout=wide_layout
)
azure_speed_widget = widgets.FloatSlider(value=1.0, min=0.5, max=2.0, step=0.01, description='語速:', style=label_style, layout=wide_layout)
azure_pitch_widget = widgets.IntSlider(value=0, min=-12, max=12, step=1, description='語調:', style=label_style, layout=wide_layout)

# MINIMAX widgets
minimax_model_widget = widgets.Dropdown(
    options=['speech-02-hd', 'speech-2.5-hd-preview'], value='speech-02-hd', description='MINIMAX模型:', style=label_style, layout=wide_layout)
minimax_accent_widget = widgets.Dropdown(
    options=minimax_accent_options, value='Chinese (Mandarin)', description='MINIMAX腔調:', style=label_style, layout=wide_layout
)
minimax_voice_widget = widgets.Dropdown(
    options=minimax_voice_dict['Chinese (Mandarin)'], value=minimax_voice_dict['Chinese (Mandarin)'][0],
    description='MINIMAX語音:', style=label_style, layout=wide_layout
)
minimax_emotion_widget = widgets.Dropdown(
    options=['calm', 'happy', 'sad', 'angry', 'fearful', 'disgusted', 'surprised'],
    value='calm', description='MINIMAX情感:', style=label_style, layout=wide_layout)
minimax_vol_widget = widgets.FloatSlider(value=1.0, min=0.1, max=10.0, step=0.1, description='MINIMAX音量:', style=label_style, layout=wide_layout)
minimax_speed_widget = widgets.FloatSlider(value=1.0, min=0.5, max=2.0, step=0.01, description='語速:', style=label_style, layout=wide_layout)
minimax_pitch_widget = widgets.IntSlider(value=0, min=-12, max=12, step=1, description='語調:', style=label_style, layout=wide_layout)
minimax_audio_format_widget = widgets.Dropdown(
    options=['mp3', 'wav'], value='mp3', description='MINIMAX音檔格式:', style=label_style, layout=wide_layout)
minimax_sample_rate_widget = widgets.Dropdown(
    options=[32000, 44100], value=32000, description='MINIMAX取樣率:', style=label_style, layout=wide_layout)
minimax_bitrate_widget = widgets.Dropdown(
    options=[128000, 256000], value=128000, description='MINIMAX比特率:', style=label_style, layout=wide_layout)
minimax_channel_widget = widgets.Dropdown(
    options=[1, 2], value=1, description='MINIMAX聲道:', style=label_style, layout=wide_layout)

# Accent change updates voice list
def update_minimax_voices(*args):
    """Repopulate the MINIMAX voice dropdown for the currently selected accent.

    Registered as an observer of ``minimax_accent_widget``; the change payload
    passed by the widget framework is accepted but not used.
    """
    voices_for_accent = minimax_voice_dict[minimax_accent_widget.value]
    minimax_voice_widget.options = voices_for_accent
    # Select the first voice listed for the accent.
    minimax_voice_widget.value = voices_for_accent[0]
minimax_accent_widget.observe(update_minimax_voices, names='value')

# --- Project info widgets ---
project_title_widget = widgets.Text(
    value='test',
    description='專案名稱:', style=label_style, layout=wide_layout,
)
project_author_widget = widgets.Text(
    value='',
    description='作者:', style=label_style, layout=wide_layout,
)
bgm_volume_widget = widgets.FloatSlider(
    value=0.3, min=0, max=1, step=0.01,
    description='BGM音量:', style=label_style, layout=wide_layout,
)
attribution_widget = widgets.Text(
    value='',
    description='版權說明:', style=label_style, layout=wide_layout,
)
subtitle_font_widget = widgets.Text(
    value='NotoSansCJKtc-Regular.otf',
    description='字幕字型:', style=label_style, layout=wide_layout,
)
subtitle_fontsize_widget = widgets.IntSlider(
    value=30, min=10, max=80, step=1,
    description='字幕字體大小:', style=label_style, layout=wide_layout,
)
# Entered as free text in the form "width,height".
video_resolution_widget = widgets.Text(
    value='1920,1080',
    description='影片解析度:', style=label_style, layout=wide_layout,
)

# --- Dynamically list images and audio from ammunition directory ---
def get_ammunition_files(extension_list, directory="ammunition"):
    """List the files in ``directory`` whose name ends with one of the suffixes.

    Args:
        extension_list: Iterable of suffixes including the dot, e.g.
            ``['.jpg', '.png']``. Matching is case-insensitive.
        directory: Folder to scan. Defaults to ``"ammunition"`` relative to
            the current working directory.

    Returns:
        Alphabetically sorted list of matching file names (names only, no
        directory part). When the folder is missing or nothing matches, the
        single placeholder entry ``"(無符合檔案)"`` is returned so that a
        dropdown built from the result always has at least one option.
    """
    suffixes = tuple(ext.lower() for ext in extension_list)
    matches = []
    if suffixes and os.path.isdir(directory):
        # Sort so the default dropdown selection does not depend on the
        # arbitrary order of os.listdir(); skip sub-directories.
        matches = sorted(
            name for name in os.listdir(directory)
            if name.lower().endswith(suffixes)
            and os.path.isfile(os.path.join(directory, name))
        )
    return matches if matches else ["(無符合檔案)"]

# Candidate files for the two dropdowns below, filtered by extension.
background_image_options = get_ammunition_files(['.jpg', '.jpeg', '.png', '.bmp', '.webp'])
bgm_audio_options = get_ammunition_files(['.mp3', '.wav', '.aac', '.m4a', '.flac', '.ogg'])

# Both dropdowns start on the first candidate.
background_file_widget = widgets.Dropdown(
    description='背景檔:',
    options=background_image_options,
    value=background_image_options[0],
    style=label_style,
    layout=wide_layout,
)
bgm_file_widget = widgets.Dropdown(
    description='BGM檔:',
    options=bgm_audio_options,
    value=bgm_audio_options[0],
    style=label_style,
    layout=wide_layout,
)

thumbnail_file_widget = widgets.Text(
    value='thumbnail.jpg',
    description='縮圖檔:', style=label_style, layout=wide_layout,
)
text_file_widget = widgets.Text(
    value='text.txt',
    description='文字檔:', style=label_style, layout=wide_layout,
)

# Vertical stack of every project-level setting, in display order.
project_info_box = widgets.VBox(children=[
    project_title_widget,
    project_author_widget,
    bgm_volume_widget,
    attribution_widget,
    subtitle_font_widget,
    subtitle_fontsize_widget,
    video_resolution_widget,
    background_file_widget,
    bgm_file_widget,
    thumbnail_file_widget,
    text_file_widget,
])

# --- Dynamic param box ---
param_box = widgets.VBox()
def update_param_box(*args):
    """Show only the parameter widgets of the TTS server currently selected.

    Used both for the initial fill and as an observer of
    ``tts_server_widget``; the change payload is accepted but not used.
    """
    azure_controls = [azure_voice_widget, azure_speed_widget, azure_pitch_widget]
    minimax_controls = [
        minimax_model_widget,
        minimax_accent_widget,
        minimax_voice_widget,
        minimax_emotion_widget,
        minimax_vol_widget,
        minimax_speed_widget,
        minimax_pitch_widget,
        minimax_audio_format_widget,
        minimax_sample_rate_widget,
        minimax_bitrate_widget,
        minimax_channel_widget,
    ]
    # Any value other than 'azure' falls through to the MINIMAX controls.
    param_box.children = azure_controls if tts_server_widget.value == 'azure' else minimax_controls
update_param_box()
tts_server_widget.observe(update_param_box, names='value')

# ---- TTS Functions ----
def azure_tts_api(text, voice, speed, pitch):
    """Synthesize a preview clip with Azure Speech and return its temp-file path.

    The text is sent as SSML so that ``speed`` and ``pitch`` are actually
    applied; plain-text synthesis would ignore both arguments.

    Args:
        text: Text to speak.
        voice: Azure voice name, e.g. ``zh-TW-YunJheNeural``.
        speed: Speaking-rate multiplier (1.0 leaves the rate unchanged).
        pitch: Pitch shift in semitones (0 leaves the pitch unchanged).

    Returns:
        Path of a temporary ``.wav`` file holding the audio, or ``None`` when
        the credentials are missing or synthesis fails (the reason is printed).
        The caller owns the returned file.
    """
    AZURE_SPEECH_KEY = os.environ.get("AZURE_SPEECH_KEY")
    AZURE_SPEECH_REGION = os.environ.get("AZURE_SPEECH_REGION")
    if not AZURE_SPEECH_KEY or not AZURE_SPEECH_REGION:
        print("Azure API key 或 region 未設定。")
        return None
    try:
        import tempfile
        from xml.sax.saxutils import escape, quoteattr
        import azure.cognitiveservices.speech as speechsdk

        speech_config = speechsdk.SpeechConfig(subscription=AZURE_SPEECH_KEY, region=AZURE_SPEECH_REGION)
        speech_config.speech_synthesis_voice_name = voice

        # Voice names start with their locale ("zh-TW-YunJheNeural" -> "zh-TW").
        name_parts = str(voice).split("-")
        locale = "-".join(name_parts[:2]) if len(name_parts) >= 3 else "en-US"
        ssml = (
            '<speak version="1.0" xmlns="http://www.w3.org/2001/10/synthesis" '
            f'xml:lang={quoteattr(locale)}>'
            f'<voice name={quoteattr(str(voice))}>'
            f'<prosody rate="{float(speed):.2f}" pitch="{int(pitch):+d}st">'
            f'{escape(str(text))}'
            '</prosody></voice></speak>'
        )

        # audio_config=None keeps the audio in memory instead of also playing
        # it on the default speaker of the machine running the kernel.
        synthesizer = speechsdk.SpeechSynthesizer(speech_config=speech_config, audio_config=None)
        result = synthesizer.speak_ssml_async(ssml).get()
        if result.reason == speechsdk.ResultReason.SynthesizingAudioCompleted:
            with tempfile.NamedTemporaryFile(delete=False, suffix='.wav') as tf:
                tf.write(result.audio_data)
            return tf.name
        if result.reason == speechsdk.ResultReason.Canceled:
            # Synthesis results expose their cancellation info directly.
            cancellation_details = result.cancellation_details
            print(f"Azure TTS失敗: {cancellation_details.reason}\n{cancellation_details.error_details}")
            return None
        print(f"Azure TTS失敗: {getattr(result, 'error_details', str(result.reason))}")
    except Exception as e:
        print("Azure TTS執行錯誤:", str(e))
    return None

def minimax_tts_api(text, model, voice, emotion, vol, speed, pitch, audio_format, sample_rate, bitrate, channel):
    """Synthesize a preview clip with the MINIMAX T2A v2 API.

    The API is asked for a download URL (``output_format="url"``); the audio
    is then fetched and stored in a temporary file.

    Args:
        text: Text to speak.
        model: MINIMAX model name.
        voice: MINIMAX ``voice_id``.
        emotion: Emotion label sent in ``voice_setting``.
        vol, speed, pitch: Numeric voice settings sent in ``voice_setting``.
        audio_format: Output container; also used as the temp-file suffix.
        sample_rate, bitrate, channel: Values sent in ``audio_setting``.

    Returns:
        Path of the temporary audio file, or ``None`` on any failure (the
        reason is printed). The caller owns the returned file.
    """
    import tempfile

    MINIMAX_SPEECH_KEY = os.getenv('MINIMAX_SPEECH_KEY')
    if not MINIMAX_SPEECH_KEY:
        print("MINIMAX API key 未設定。")
        return None
    # Prefer the group id from the environment; the id that used to be
    # hard-coded here remains as the fallback.
    group_id = os.getenv('MINIMAX_GROUP_ID') or "1982992498867311582"
    url = f"https://api.minimax.io/v1/t2a_v2?GroupId={group_id}"
    # (connect, read) seconds, so a stalled request cannot hang the UI callback.
    request_timeout = (10, 120)
    headers = {
        "Authorization": f"Bearer {MINIMAX_SPEECH_KEY}",
        "Content-Type": "application/json"
    }
    payload = {
        "model": model,
        "text": text,
        "voice_setting": {
            "voice_id": voice,
            "speed": speed,
            "vol": vol,
            "pitch": pitch,
            "emotion": emotion
        },
        "audio_setting": {
            "sample_rate": sample_rate,
            "bitrate": bitrate,
            "format": audio_format,
            "channel": channel
        },
        "output_format": "url",
        "language_boost": "auto",
        "subtitle_enable": False
    }
    try:
        r = requests.post(url, headers=headers, json=payload, timeout=request_timeout)
        if r.status_code != 200:
            print("MINIMAX TTS API失敗，狀態碼：", r.status_code)
            print("回應：", r.text[:300])
            print("Payload used:\n", payload)
            return None
        data = r.json()
        base_resp = data.get("base_resp", {})
        if base_resp.get("status_code") != 0:
            print("MINIMAX API錯誤：", base_resp.get("status_msg"))
            return None
        # "data" may be present but null, so guard before reading "audio".
        audio_url = (data.get("data") or {}).get("audio")
        if not audio_url:
            print("MINIMAX未取得音檔URL，請檢查API回應！")
            return None
        r2 = requests.get(audio_url, timeout=request_timeout)
        if r2.status_code != 200:
            print("MINIMAX音檔下載失敗：", r2.status_code)
            return None
        with tempfile.NamedTemporaryFile(delete=False, suffix='.'+audio_format) as tf:
            tf.write(r2.content)
        return tf.name
    except Exception as e:
        print("MINIMAX TTS試聽錯誤：", e)
    return None

def on_preview_clicked(b):
    """Synthesize the preview text with the selected TTS server and play it.

    All output (messages and the audio player) goes into ``audio_output``.
    The temporary audio file produced by the TTS helper is removed once the
    player has been created, so repeated previews do not pile up temp files.

    Args:
        b: The clicked button (unused).
    """
    with audio_output:
        audio_output.clear_output()
        preview_text = preview_text_widget.value.strip()
        if not preview_text:
            print("請輸入欲試聽的語音文字！")
            return
        if tts_server_widget.value == 'azure':
            temp_audio_path = azure_tts_api(
                preview_text, azure_voice_widget.value, azure_speed_widget.value, azure_pitch_widget.value)
        else:
            temp_audio_path = minimax_tts_api(
                preview_text, minimax_model_widget.value, minimax_voice_widget.value,
                minimax_emotion_widget.value, minimax_vol_widget.value,
                minimax_speed_widget.value, minimax_pitch_widget.value,
                minimax_audio_format_widget.value, minimax_sample_rate_widget.value,
                minimax_bitrate_widget.value, minimax_channel_widget.value)
        if temp_audio_path:
            try:
                # Audio() loads and embeds a local file's bytes when it is
                # constructed, so the file is not needed afterwards.
                display(Audio(temp_audio_path, autoplay=True))
                print(f"{tts_server_widget.value}語音試聽成功！")
            finally:
                try:
                    os.remove(temp_audio_path)
                except OSError:
                    pass  # best-effort cleanup

preview_button.on_click(on_preview_clicked)

def on_save_clicked(b, _output=output):
    """Persist every widget value to ``config.yaml`` and write the main text file.

    Args:
        b: The clicked button (unused).
        _output: Output widget that receives the messages. It is bound when
            the function is defined because a later cell rebinds the global
            name ``output`` to a different widget.
    """
    # Enforce project_title is not empty
    project_title_value = project_title_widget.value.strip() or "test"
    project_title_widget.value = project_title_value  # update widget if needed

    config = {
        'input_text': input_text_widget.value,
        'preview_text': preview_text_widget.value,
        'tts_server': tts_server_widget.value,
        'azure_voice': azure_voice_widget.value,
        'azure_speed': azure_speed_widget.value,
        'azure_pitch': azure_pitch_widget.value,
        'minimax_model': minimax_model_widget.value,
        'minimax_accent': minimax_accent_widget.value,
        'minimax_voice': minimax_voice_widget.value,
        'minimax_emotion': minimax_emotion_widget.value,
        'minimax_vol': minimax_vol_widget.value,
        'minimax_speed': minimax_speed_widget.value,
        'minimax_pitch': minimax_pitch_widget.value,
        'minimax_audio_format': minimax_audio_format_widget.value,
        'minimax_sample_rate': minimax_sample_rate_widget.value,
        'minimax_bitrate': minimax_bitrate_widget.value,
        'minimax_channel': minimax_channel_widget.value,
        'project_title': project_title_value,
        'project_author': project_author_widget.value,
        'bgm_volume': bgm_volume_widget.value,
        'attribution': attribution_widget.value,
        'subtitle_font': subtitle_font_widget.value,
        'subtitle_fontsize': subtitle_fontsize_widget.value,
        'video_resolution': video_resolution_widget.value,
        'background': background_file_widget.value,
        'bgm': bgm_file_widget.value,
        'thumbnail': thumbnail_file_widget.value,
        'text': text_file_widget.value,
    }
    text_path = text_file_widget.value
    with _output:
        _output.clear_output()
        # Save config.yaml; the context manager closes (and flushes) the file.
        with open('config.yaml', 'w', encoding='utf-8') as cfg_file:
            yaml.safe_dump(config, cfg_file, allow_unicode=True)
        # Mirror the main text into the configured text file and report the
        # file that was actually written.
        try:
            with open(text_path, 'w', encoding='utf-8') as f:
                f.write(input_text_widget.value)
            print(f'設定已儲存到 config.yaml，且文字內容已同步寫入 {text_path}！')
        except Exception as e:
            print(f"[ERROR] 寫入 {text_path} 失敗：{e}")
        print(config)
        if project_title_value == "test":
            print("⚠️ 專案名稱為 test，請在專案開始前確認填寫正確名稱。")

save_button.on_click(on_save_clicked)

# ---- Display all widgets ----
display(
    input_text_widget,
    preview_text_widget,
    preview_button,
    audio_output,
    tts_server_widget,
    param_box,
    project_info_box,
    save_button,
    output
)
print("="*40)
# `project_title` is first assigned by a later cell, so on a fresh kernel it
# does not exist yet; read the title from the loaded config like Step 1 does.
print(f"專案名稱: {config.get('project_title', config.get('title', '未設定'))}")
print(f"Current subdir: {subdir}")
print(f"目前工作目錄：{os.getcwd()}")
print("="*40)

In [None]:
#cell 4-8

import os
import yaml
import shutil
import datetime
import re
from snownlp import SnowNLP
import subprocess
from tqdm.notebook import tqdm
from dotenv import load_dotenv

print("=== STEP 0: 專案資料 ===")
main_dir = os.getcwd()
ammunition_dir = os.path.join(main_dir, "ammunition")
# The text file is written into this folder below, so it has to exist.
os.makedirs(ammunition_dir, exist_ok=True)
print("Current subdir:", main_dir)
print("目前工作目錄：", os.getcwd())
print("資源目錄(ammunition):", ammunition_dir)

print("\n=== STEP 1: 讀取 config 及同步更新 text.txt ===")
# ---- Load config ----
try:
    with open('config.yaml', encoding='utf-8') as f:
        # An empty YAML file parses to None; fall back to {} so the .get()
        # calls below keep working.
        config = yaml.safe_load(f) or {}
except Exception as e:
    print(f"[ERROR] Failed to load config.yaml: {e}")
    raise

project_title = config.get('project_title', config.get('title', 'My YouTube Video'))
project_author = config.get('project_author', 'Anonymous')
bgm_volume = config.get('bgm_volume', 0.3)
attribution = config.get('attribution', 'Assets used under free license. See description.')

# Every asset is resolved relative to the ammunition folder.
background_file = os.path.join(ammunition_dir, config.get('background', 'background.jpg'))
bgm_file = os.path.join(ammunition_dir, config.get('bgm', 'bgm.mp3'))
subtitle_font = os.path.join(ammunition_dir, config.get('subtitle_font', 'NotoSansCJKtc-Regular.otf'))
thumbnail_file = os.path.join(ammunition_dir, config.get('thumbnail', 'thumbnail.jpg'))
text_file = os.path.join(ammunition_dir, config.get('text', 'text.txt'))

# --- Key step: overwrite the text file with config['input_text'] ---
# Only when the key is present, so a config without it does not wipe an
# existing text file.
input_text = config.get('input_text', '')
if 'input_text' in config:
    with open(text_file, 'w', encoding='utf-8') as f:
        f.write(input_text)

# Check for missing assets after the text file has been written, so it is
# not reported as missing on the first run.
required_files = [background_file, bgm_file, text_file, subtitle_font, thumbnail_file]
for fname in required_files:
    if not os.path.exists(fname):
        print(f"[WARNING] Missing file: {fname}")

print("="*30)

# --- Read the text file back as one sentence per non-empty line ---
try:
    with open(text_file, 'r', encoding='utf-8') as f:
        sentences = [line.strip() for line in f if line.strip()]
except Exception as e:
    print(f"[ERROR] Failed to read text file: {e}")
    sentences = []

print(f"已載入 {len(sentences)} 句子，來源: {text_file}")
print(f"TTS server: {config.get('tts_server', 'azure')}")
print(f"字幕字型: {subtitle_font}, 字體大小: {config.get('subtitle_fontsize', 30)}")
print(f"影片解析度: {config.get('video_resolution', [1920, 1080])}")
print(f"背景檔: {background_file}")
print(f"BGM檔: {bgm_file}")
print(f"縮圖檔: {thumbnail_file}")
print("="*30)

print("\n=== STEP 2: 建立 subdir 並複製資源 ===")
# Working folder name: <YYYYMMDD>_<title>, with characters that are not
# allowed in file names removed and the title capped at 20 characters.
today = datetime.datetime.now().strftime("%Y%m%d")
safe_title = re.sub(r'[\\/:*?"<>|\n\r\t]', '', project_title)
safe_title = safe_title[:20]
subdir = os.path.join(main_dir, today + "_" + safe_title)

try:
    if not os.path.exists(subdir):
        os.makedirs(subdir)
except Exception as e:
    print(f"[ERROR] Failed to create subdir: {e}")
    raise

# Copy each required asset into the working folder. The copy is skipped only
# when the target already exists and is the very same file as the source.
for fname in required_files:
    target = os.path.join(subdir, os.path.basename(fname))
    try:
        if not (os.path.exists(target) and os.path.samefile(fname, target)):
            shutil.copy2(fname, target)
    except Exception as e:
        print(f"[ERROR] Failed to copy {fname} to {target}: {e}")

print("All required files are available in subdir.")

print("\n=== STEP 3: 句子分割與清理 ===")
# Read the copy of the text file that lives in the working folder.
subdir_text_path = os.path.join(subdir, os.path.basename(text_file))
try:
    with open(subdir_text_path, encoding="utf-8") as f:
        text = f.read().strip()
except Exception as e:
    print(f"[ERROR] Failed to read {subdir_text_path}: {e}")
    text = ""

def is_pronounceable(s):
    """Return True when ``s`` holds at least one speakable character.

    Speakable means a CJK ideograph in U+4E00-U+9FFF, an ASCII letter or an
    ASCII digit; strings made only of punctuation or whitespace yield False.
    """
    return re.search(r'[\u4e00-\u9fffA-Za-z0-9]', s) is not None

def clean_markdown(s):
    """Strip Markdown markup and corner brackets from ``s`` and tidy whitespace.

    The substitutions run in order: leading heading/list/quote markers,
    inline markup characters anywhere in the string, the 「」 brackets, and
    finally whitespace runs collapsed to one space. The result is stripped.
    """
    substitutions = (
        (r"^[#\-\*\s>]+", ""),
        (r"(\*|`|_|>|#|\[|\]|\(|\)|\-|~|=|>)", ""),
        (r"[「」]", ""),
        (r"\s+", " "),
    )
    cleaned = s
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip()

# Split the text into sentences with SnowNLP; if that fails, treat the whole
# text as a single sentence (or none when the text is empty).
try:
    sentences = list(filter(None, (raw.strip() for raw in SnowNLP(text).sentences)))
except Exception as e:
    print(f"[ERROR] SnowNLP sentence segmentation failed: {e}")
    sentences = [text] if text else []

# Drop unspeakable sentences, clean the rest, then drop anything the cleaning
# left without a speakable character.
sentences = [
    cleaned
    for cleaned in map(clean_markdown, filter(is_pronounceable, sentences))
    if is_pronounceable(cleaned)
]

print(f"Total sentences after filtering: {len(sentences)}")
for idx, s in enumerate(sentences):
    print(f"{idx}: '{s}'")

print("\n=== STEP 4: TTS 語音合成 ===")
from xml.sax.saxutils import escape, quoteattr

load_dotenv()
tts_server = config.get('tts_server', 'azure')
audio_files, durations, error_log = [], [], []
failed_sentences = []
max_retries = 3


def _build_azure_ssml(text, voice, speed, pitch):
    """Wrap ``text`` in SSML selecting ``voice`` with a rate multiplier and a semitone pitch shift."""
    # Voice names start with their locale ("zh-TW-YunJheNeural" -> "zh-TW").
    name_parts = str(voice).split("-")
    locale = "-".join(name_parts[:2]) if len(name_parts) >= 3 else "en-US"
    return (
        '<speak version="1.0" xmlns="http://www.w3.org/2001/10/synthesis" '
        f'xml:lang={quoteattr(locale)}>'
        f'<voice name={quoteattr(str(voice))}>'
        f'<prosody rate="{float(speed):.2f}" pitch="{int(pitch):+d}st">'
        f'{escape(str(text))}'
        '</prosody></voice></speak>'
    )


if tts_server == 'azure':
    import azure.cognitiveservices.speech as speechsdk
    AZURE_SPEECH_KEY = os.environ.get("AZURE_SPEECH_KEY")
    AZURE_SPEECH_REGION = os.environ.get("AZURE_SPEECH_REGION")
    VOICE = config.get('azure_voice', 'zh-TW-YunJheNeural')
    # Speed / pitch are saved by the settings UI; apply them through SSML.
    AZURE_SPEED = config.get('azure_speed', 1.0)
    AZURE_PITCH = config.get('azure_pitch', 0)
    print(f"[CHECK] Azure voice_id (voice_name) used for synthesis: {VOICE}")
    # The speech configuration is the same for every sentence; build it once.
    speech_config = speechsdk.SpeechConfig(subscription=AZURE_SPEECH_KEY, region=AZURE_SPEECH_REGION)
    speech_config.speech_synthesis_voice_name = VOICE
    speech_config.set_speech_synthesis_output_format(
        speechsdk.SpeechSynthesisOutputFormat.Audio16Khz32KBitRateMonoMp3
    )
    for i, sentence in enumerate(sentences):
        mp3_fname = os.path.join(subdir, f"voice_{i}.mp3")
        audio_config = speechsdk.audio.AudioOutputConfig(filename=mp3_fname)
        synthesizer = speechsdk.SpeechSynthesizer(speech_config=speech_config, audio_config=audio_config)
        ssml = _build_azure_ssml(sentence, VOICE, AZURE_SPEED, AZURE_PITCH)
        result = synthesizer.speak_ssml_async(ssml).get()
        if result.reason == speechsdk.ResultReason.SynthesizingAudioCompleted:
            print(f"[OK] 合成完成: {mp3_fname}")
            audio_files.append(mp3_fname)
        elif result.reason == speechsdk.ResultReason.Canceled:
            # Synthesis results expose their cancellation info directly.
            cancellation_details = result.cancellation_details
            print(f"[ERROR] 合成失敗: {sentence}")
            print(f"Reason: {cancellation_details.reason}")
            print(f"Details: {cancellation_details.error_details}")
            failed_sentences.append((i, sentence))
        else:
            print(f"[ERROR] TTS失敗(未知): {sentence}, Reason: {result.reason}")
            failed_sentences.append((i, sentence))
elif tts_server == 'minimax':
    import requests
    MINIMAX_SPEECH_KEY = os.environ.get("MINIMAX_SPEECH_KEY")
    # Prefer the group id from the environment; the id that used to be
    # hard-coded here remains as the fallback.
    MINIMAX_GROUP_ID = os.environ.get("MINIMAX_GROUP_ID") or "1982992498867311582"
    MINIMAX_URL = f"https://api.minimax.io/v1/t2a_v2?GroupId={MINIMAX_GROUP_ID}"
    # (connect, read) seconds, so one stalled request cannot hang the cell.
    MINIMAX_TIMEOUT = (10, 120)
    MODEL = config.get("minimax_model", "speech-02-hd")
    VOICE = config.get("minimax_voice", "Chinese (Mandarin)_Warm_Bestie")
    EMOTION = config.get("minimax_emotion", "calm")
    VOL = config.get("minimax_vol", 1.0)
    SPEED = config.get("minimax_speed", 1.0)
    PITCH = config.get("minimax_pitch", 0)
    AUDIO_FORMAT = config.get("minimax_audio_format", "mp3")
    SAMPLE_RATE = config.get("minimax_sample_rate", 32000)
    BITRATE = config.get("minimax_bitrate", 128000)
    CHANNEL = config.get("minimax_channel", 1)
    headers = {
        "Authorization": f"Bearer {MINIMAX_SPEECH_KEY}",
        "Content-Type": "application/json"
    }
    print(f"[CHECK] MINIMAX voice_id (voice_name) used for synthesis: {VOICE}")
    for i, sentence in enumerate(sentences):
        payload = {
            "model": MODEL,
            "text": sentence,
            "voice_setting": {
                "voice_id": VOICE,
                "speed": SPEED,
                "vol": VOL,
                "pitch": PITCH,
                "emotion": EMOTION
            },
            "audio_setting": {
                "sample_rate": SAMPLE_RATE,
                "bitrate": BITRATE,
                "format": AUDIO_FORMAT,
                "channel": CHANNEL
            },
            "output_format": "url",
            "language_boost": "auto",
            "subtitle_enable": False
        }
        try:
            r = requests.post(MINIMAX_URL, headers=headers, json=payload, timeout=MINIMAX_TIMEOUT)
            if r.status_code != 200:
                print(f"[ERROR] MINIMAX TTS API HTTP錯誤: {r.status_code}")
                failed_sentences.append((i, sentence))
                continue
            data = r.json()
            base_resp = data.get("base_resp", {})
            if base_resp.get("status_code") != 0:
                print(f"[ERROR] MINIMAX API錯誤: {base_resp.get('status_msg', '')}")
                failed_sentences.append((i, sentence))
                continue
            # "data" may be present but null, so guard before reading "audio".
            audio_url = (data.get("data") or {}).get("audio")
            if not audio_url:
                print(f"[ERROR] MINIMAX未取得音檔URL: {data}")
                failed_sentences.append((i, sentence))
                continue
            r2 = requests.get(audio_url, timeout=MINIMAX_TIMEOUT)
            if r2.status_code != 200:
                print(f"[ERROR] MINIMAX音檔下載失敗: {r2.status_code}")
                failed_sentences.append((i, sentence))
                continue
            fname = os.path.join(subdir, f"voice_{i}.{AUDIO_FORMAT}")
            with open(fname, "wb") as f:
                f.write(r2.content)
            print(f"[OK] MINIMAX合成完成: {fname}")
            audio_files.append(fname)
        except (requests.RequestException, ValueError, OSError) as e:
            # A network error, an unparsable response or a failed write only
            # fails this sentence; the remaining sentences are still tried.
            print(f"[ERROR] MINIMAX 請求失敗: {e}")
            failed_sentences.append((i, sentence))
else:
    print("[ERROR] 未支援的 TTS server 設定：", tts_server)

print("TTS synthesis complete. Valid files:", len(audio_files))
if failed_sentences:
    print("Failed sentences:")
    for idx, s in failed_sentences:
        print(f"Sentence {idx}: '{s}'")

# === STEP 5: validate the audio files and collect the storyboard data ===
valid_sentences = []
valid_audio_files = []
valid_durations = []

# The Azure branch always writes voice_{i}.mp3, while the MINIMAX branch
# writes voice_{i}.<minimax_audio_format>; look for the matching extension.
if config.get('tts_server', 'azure') == 'minimax':
    audio_ext = config.get("minimax_audio_format", "mp3")
else:
    audio_ext = "mp3"

for i, sentence in enumerate(sentences):
    mp3_fname = os.path.join(subdir, f"voice_{i}.{audio_ext}")
    if os.path.exists(mp3_fname) and os.path.getsize(mp3_fname) > 0:
        try:
            # ffprobe prints only the duration in seconds with these options.
            r = subprocess.run([
                "ffprobe", "-v", "error", "-show_entries",
                "format=duration", "-of", "default=noprint_wrappers=1:nokey=1", mp3_fname
            ], capture_output=True)
            duration_val = float(r.stdout.decode().strip())
        except Exception as e:
            # Also covers a missing ffprobe binary or unparsable output.
            print(f"Error reading duration for {mp3_fname}: {e}")
            continue
        # Keep only clips with a plausible length (0.01 s to 30 s).
        if duration_val > 0.01 and duration_val < 30:
            valid_durations.append(duration_val)
            valid_audio_files.append(mp3_fname)
            valid_sentences.append(sentence)
            print(f"OK: {i}: '{sentence}' ({duration_val:.2f}s)")
        else:
            print(f"Skipped {i}: '{sentence}' - duration {duration_val:.2f}s")
    else:
        print(f"Missing or empty audio: {i}: '{sentence}'")

print(f"\nKept {len(valid_audio_files)} valid audio files and sentences")
if valid_sentences and valid_sentences[-1] == sentences[-1]:
    print("✅ Last sentence included:", valid_sentences[-1])
else:
    print("❌ Last sentence missing! Check synthesis and validation steps.")

# Key step: publish the storyboard data as globals for the later widget cell.
globals()["valid_sentences"] = valid_sentences
globals()["valid_audio_files"] = valid_audio_files

# Merge every valid clip into voice.mp3 (used by later steps).
files_txt = os.path.join(subdir, "files.txt")
with open(files_txt, "w", encoding="utf-8") as f:
    for af in valid_audio_files:
        # In an ffmpeg concat list a literal single quote inside a quoted
        # path has to be written as '\'' (close quote, escaped quote, reopen).
        quoted_af = af.replace("'", "'\\''")
        f.write(f"file '{quoted_af}'\n")

voice_mp3 = os.path.join(subdir, "voice.mp3")
if not valid_audio_files:
    # Nothing to merge; running ffmpeg on an empty list would only fail.
    concat_result = None
    print("[WARNING] No valid audio files; voice.mp3 was not created.")
else:
    # Stream copy only works when every input already is MP3; other inputs
    # (e.g. WAV from MINIMAX) are re-encoded so the .mp3 output is valid.
    all_mp3 = all(af.lower().endswith(".mp3") for af in valid_audio_files)
    codec_args = ["-c", "copy"] if all_mp3 else ["-c:a", "libmp3lame"]
    concat_result = subprocess.run(
        ["ffmpeg", "-y", "-f", "concat", "-safe", "0", "-i", files_txt, *codec_args, voice_mp3],
        capture_output=True
    )
    if concat_result.returncode != 0:
        print("[FFmpeg Error] Failed to concatenate audio files.")
        print(concat_result.stderr.decode())
    else:
        print("Concatenated voice.mp3 created successfully.")

print("="*40)
print(f"專案名稱: {project_title}")
print(f"Current subdir: {subdir}")
print(f"目前工作目錄：{os.getcwd()}")
print("="*40)

In [None]:
# cell 8.5
import os
import openai
from PIL import Image
from io import BytesIO
from IPython.display import display, Image as IPyImage, FileLink
import requests
import ipywidgets as widgets
import yaml
from dotenv import load_dotenv
import subprocess
import re
import functools

# --- Basic environment ---
load_dotenv()
# Step 1 requires the key under OPENAI_API_KEY; the older OPENAI_API name is
# still honoured as a fallback.
openai.api_key = os.getenv("OPENAI_API_KEY") or os.getenv("OPENAI_API")

print("專案名稱:", config.get("project_title", config.get("title", "未設定")))
print("Current subdir:", subdir)
print("目前工作目錄：", os.getcwd())

ammunition_dir = os.path.join(main_dir, "ammunition")
# Use the text file name stored in the config (default text.txt), which is
# the file the synthesis cell writes into the ammunition folder.
text_path = os.path.join(ammunition_dir, config.get("text", "text.txt"))
with open(text_path, encoding="utf-8") as f:
    script_text = f.read().strip()

# --- 1️⃣ 語意分割字幕為場景（GPT-3.5，最多4個）---
def semantic_scene_split(subtitles, max_scene=4):
    """Ask the chat model to group the subtitles into at most ``max_scene`` scenes.

    Args:
        subtitles: Ordered list of subtitle sentences.
        max_scene: Maximum number of scenes to request and to return.

    Returns:
        List of ``(start, end)`` tuples: zero-based, end-exclusive indices
        into ``subtitles``, clamped to the valid range. When the reply holds
        no usable range (or ``subtitles`` is empty, in which case the API is
        not called) a single range covering everything is returned.
    """
    total = len(subtitles)
    if total == 0:
        return [(0, 0)]
    prompt = (
        f"請將以下字幕依語意分割為不超過{max_scene}個場景，每個場景列出起迄句子編號（如：1-5，6-8），只需輸出每個場景的起迄範圍，不需解釋：\n"
        + "\n".join([f"{i+1}. {s}" for i, s in enumerate(subtitles)])
    )
    resp = openai.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[{"role": "user", "content": prompt}]
    )
    content = resp.choices[0].message.content or ""
    ranges = []
    # Pick up every "a-b" pair wherever it appears: several per line, after a
    # label such as "場景1：", or written with an en dash / tilde.
    for first, last in re.findall(r"(\d+)\s*[-–—~～]\s*(\d+)", content):
        start = max(int(first) - 1, 0)
        end = min(int(last), total)
        # Drop empty or inverted ranges so callers can index safely.
        if start < end:
            ranges.append((start, end))
    if not ranges:
        ranges = [(0, total)]
    return ranges[:max_scene]

# One slot per scene; the slots are filled once an image is generated or picked.
scene_ranges = semantic_scene_split(valid_sentences)
scene_images = [None for _ in scene_ranges]
scene_img_names = [None for _ in scene_ranges]

# --- 2️⃣ 分場景UI + AI圖生成/手動選檔案 ---
def sanitize_title(title):
    """Reduce ``title`` to a short, file-name-friendly token.

    Only CJK ideographs (U+4E00-U+9FA5), ASCII letters and digits are kept
    and the result is cut to 10 characters; ``"scene"`` is returned when
    nothing is left.
    """
    kept = re.sub(r"[^\u4e00-\u9fa5A-Za-z0-9]", "", title)
    shortened = kept[:10]
    if shortened:
        return shortened
    return "scene"

def dalle3_generate(prompt, idx, seg_sentences, size="1792x1024"):
    """Generate a scene image with DALL·E 3 and save it as a 1920x1080 JPEG.

    Args:
        prompt: Text prompt sent to the image model.
        idx: Zero-based scene index; used in the file name and log messages.
        seg_sentences: Sentences of the scene; the first one (sanitized)
            becomes part of the file name.
        size: Image size requested from the API.

    Returns:
        Path of the saved JPEG inside ``subdir``.

    Raises:
        Exception: Errors from the API call, the download (including a
        non-success HTTP status) or image decoding/saving are not caught here.
    """
    scene_title = sanitize_title(seg_sentences[0]) if seg_sentences else f"scene{idx+1}"
    fname = os.path.join(subdir, f"ai_scene_{idx+1}_{scene_title}.jpg")
    print(f"呼叫DALL·E生成場景{idx+1}: {scene_title}")
    response = openai.images.generate(
        model="dall-e-3",
        prompt=prompt,
        size=size,
        n=1,
        quality="standard"
    )
    url = response.data[0].url
    # Bound the download and fail loudly on an HTTP error instead of handing
    # an error page to the image decoder.
    download = requests.get(url, timeout=60)
    download.raise_for_status()
    img = Image.open(BytesIO(download.content))
    # JPEG cannot store alpha or palette modes, so normalise to RGB first.
    img_resized = img.convert("RGB").resize((1920, 1080), resample=Image.LANCZOS)
    img_resized.save(fname, format="JPEG")
    print(f"AI場景圖片已儲存：{fname}")
    return fname

# Everything in the ammunition folder; an unreadable folder yields no assets.
try:
    all_files = os.listdir(ammunition_dir)
except Exception as e:
    print(f"[ERROR] Failed to list ammunition_dir: {e}")
    all_files = []

# Bucket the assets by (case-insensitive) extension.
image_files = [name for name in all_files if name.lower().endswith(('.jpg', '.png'))]

# --- 3️⃣ Storyboard (build bg_selectors first so values can be set directly later) ---
video_files = [name for name in all_files if name.lower().endswith(('.mp4', '.mov', '.avi'))]
music_files = [name for name in all_files if name.lower().endswith(('.mp3', '.wav'))]

# Probe each video's duration with ffprobe; None when it cannot be parsed.
video_info = []
for vf in video_files:
    path = os.path.join(ammunition_dir, vf)
    probe_cmd = [
        "ffprobe", "-v", "error", "-show_entries",
        "format=duration", "-of", "default=noprint_wrappers=1:nokey=1", path
    ]
    result = subprocess.run(probe_cmd, capture_output=True)
    try:
        dur = float(result.stdout.decode().strip())
    except Exception:
        dur = None
    video_info.append((vf, dur))

# Dropdown labels carry the duration when it is known; video_map translates
# such a label back to the bare file name.
video_options = ["(無)"]
video_map = {}
for fn, d in video_info:
    if d:
        labelled = f"{fn} ({d:.2f}s)"
        video_options.append(labelled)
        video_map[labelled] = fn
    else:
        video_options.append(fn)
image_options = ["(無)"] + image_files

# Duration of each sentence's clip; stays None when the file is missing or
# ffprobe's output cannot be parsed.
audio_durations = [None for _ in valid_sentences]
for i in range(len(audio_durations)):
    af = None
    if i < len(valid_audio_files):
        af = valid_audio_files[i]
    if af and os.path.exists(af):
        result = subprocess.run(
            [
                "ffprobe", "-v", "error", "-show_entries",
                "format=duration", "-of", "default=noprint_wrappers=1:nokey=1", af
            ],
            capture_output=True,
        )
        try:
            audio_durations[i] = float(result.stdout.decode().strip())
        except Exception:
            audio_durations[i] = None

# Running total of the durations; unknown durations count as 0.
accum_durations = []
running_total = 0.0
for d in audio_durations:
    running_total = running_total + (d or 0)
    accum_durations.append(running_total)

# Per-sentence widgets of the storyboard table, one entry per sentence.
sentence_labels, bg_selectors = [], []
vid_selectors, bg_preview_btns = [], []

def get_scene_img_for_idx(i):
    """Return the image assigned to the scene that contains sentence ``i``.

    Scenes are the end-exclusive ``(start, end)`` ranges in ``scene_ranges``.
    ``"(無)"`` is returned when the scene has no image yet or when no scene
    covers ``i``.
    """
    for scene_no, (first, stop) in enumerate(scene_ranges):
        if i < first or i >= stop:
            continue
        return scene_images[scene_no] or "(無)"
    return "(無)"

def preview_fun(dropdown):
    """Build a click handler that previews the image currently chosen in ``dropdown``."""
    def show(_):
        # Read the selection when the button is clicked; binding the value
        # at creation time would always preview the initial "(無)" entry.
        img_name = dropdown.value
        if img_name != "(無)":
            # Generated images live in subdir, stock images in ammunition_dir.
            img_path = os.path.join(subdir, img_name)
            if not os.path.exists(img_path):
                img_path = os.path.join(ammunition_dir, img_name)
            if os.path.exists(img_path):
                display(IPyImage(filename=img_path))
    return show

# One table row per sentence: label, background dropdown, video dropdown and
# a preview button for the chosen background.
for i, sentence in enumerate(valid_sentences):
    aud_dur = audio_durations[i] if i < len(audio_durations) and audio_durations[i] else 0
    accum_dur = accum_durations[i] if i < len(accum_durations) else 0
    label_txt = f"{i+1}. {sentence[:40]} (audio: {aud_dur:.2f}s, total: {accum_dur:.2f}s)"
    sentence_label = widgets.Label(label_txt)
    # Default background (no AI image exists yet at initialisation time).
    bg_default = "(無)"
    bg_dd_options = ["(無)"] + image_files
    bg_dd = widgets.Dropdown(options=bg_dd_options, value=bg_default, description="")
    vid_dd = widgets.Dropdown(options=video_options, value="(無)", description="")
    sentence_labels.append(sentence_label)
    bg_selectors.append(bg_dd)
    vid_selectors.append(vid_dd)
    # Preview button, tied to this row's own dropdown.
    preview_btn = widgets.Button(description="預覽背景")
    preview_btn.on_click(preview_fun(bg_dd))
    bg_preview_btns.append(preview_btn)

# Column headers of the storyboard table.
col1_title = widgets.HTML("<b>句子 / 時間</b>")
col2_title = widgets.HTML("<b>背景圖片</b>")
col3_title = widgets.HTML("<b>背景預覽</b>")
col4_title = widgets.HTML("<b>影片剪輯</b>")

# Each column is its header followed by one widget per sentence.
sentence_labels_col = widgets.VBox([col1_title, *sentence_labels])
bg_selectors_col = widgets.VBox([col2_title, *bg_selectors])
bg_preview_col = widgets.VBox([col3_title, *bg_preview_btns])
vid_selectors_col = widgets.VBox([col4_title, *vid_selectors])
table_ui = widgets.HBox([
    sentence_labels_col,
    bg_selectors_col,
    bg_preview_col,
    vid_selectors_col,
])

instructions = widgets.HTML("""
<b>分鏡表編輯器（語意分場景AI背景/手動選檔 + 分句細緻分鏡表 + 預覽功能）</b><br>
上方：場景分段字幕、AI背景生成/手動選檔（產生/選擇後自動分配背景，分鏡表即時更新）<br>
下方：分句分鏡表可手動調整背景、影片等（預設已依場景分配背景）<br>
儲存時背景空白自動沿用上一句設定。
""")

display(instructions, table_ui)

# --- 4️⃣ 場景UI，沿用/重新產生都能即時填入分鏡表 ---
def _assign_scene_background(start, end, option_name):
    """Select ``option_name`` in the storyboard dropdown of every sentence in ``[start, end)``."""
    # Slicing clamps ranges that reach past the last sentence.
    for row_dd in bg_selectors[max(start, 0):end]:
        # A generated file is not among the initial options; add it first,
        # otherwise assigning the value is rejected by the dropdown.
        if option_name not in row_dd.options:
            row_dd.options = tuple(row_dd.options) + (option_name,)
        row_dd.value = option_name


def _build_scene_box(idx, start, end):
    """Create the widget group of one scene: subtitles, image picker and generate button."""
    import html

    seg_sentences = valid_sentences[start:end]
    seg_text = "\n".join([f"{i+1}. {s}" for i, s in enumerate(seg_sentences)])
    prompt = f"Minimalist, modern, unique illustration for a YouTube thumbnail. Scene {idx+1}: {seg_text}"

    # Escape the subtitles so characters such as "<" cannot break the markup.
    label = widgets.HTML(f"<b>場景{idx+1} 字幕:</b><br><pre>{html.escape(seg_text)}</pre>")
    # These widgets are local to this call, so each scene's handlers update
    # their own preview and label rather than those of the last scene built.
    img_preview = widgets.Output()
    img_path_label = widgets.Label("尚未選取圖片")
    select_img_dd = widgets.Dropdown(options=["(請選擇)"] + image_files, value="(請選擇)", description="")

    def show_image(path):
        # Render inside this scene's Output widget.
        img_preview.clear_output()
        with img_preview:
            display(IPyImage(filename=path))

    def on_gen_clicked(b):
        try:
            fname = dalle3_generate(prompt, idx, seg_sentences)
            scene_images[idx] = fname
            scene_img_names[idx] = os.path.basename(fname)
            img_path_label.value = f"AI生成：{os.path.basename(fname)}"
            show_image(fname)
            # Apply the image to every sentence of the scene in the storyboard.
            _assign_scene_background(start, end, os.path.basename(fname))
        except Exception as e:
            img_preview.clear_output()
            img_path_label.value = f"生成失敗: {e}"

    def on_select_img_change(change):
        if change["name"] == "value" and change["new"] != "(請選擇)":
            chosen = change["new"]
            scene_images[idx] = os.path.join(ammunition_dir, chosen)
            scene_img_names[idx] = chosen
            img_path_label.value = f"使用檔案：{chosen}"
            show_image(os.path.join(ammunition_dir, chosen))
            # Apply the image to every sentence of the scene in the storyboard.
            _assign_scene_background(start, end, chosen)

    select_img_dd.observe(on_select_img_change, names="value")
    btn_gen = widgets.Button(description="重新產生", button_style="danger")
    btn_gen.on_click(on_gen_clicked)

    return widgets.VBox([
        label,
        widgets.HBox([widgets.Label("沿用圖片："), select_img_dd]),
        btn_gen,
        img_path_label,
        img_preview
    ])


scene_widgets = []
for idx, (start, end) in enumerate(scene_ranges):
    scene_widgets.append(_build_scene_box(idx, start, end))

scene_ui = widgets.VBox(scene_widgets)
display(scene_ui)

# --- 5️⃣ 儲存分鏡表（自動補齊背景，分鏡欄位全保留）---
# Button that triggers saving, and the Output widget its messages go into.
save_btn = widgets.Button(
    description="儲存分鏡表",
    button_style="success",
)
output = widgets.Output()

def get_duration(path):
    """Return the media duration of ``path`` in seconds, as reported by ffprobe.

    Returns ``None`` when the file does not exist or when ffprobe's output
    cannot be parsed as a number.
    """
    if os.path.exists(path):
        probe = subprocess.run(
            [
                "ffprobe", "-v", "error", "-show_entries",
                "format=duration", "-of", "default=noprint_wrappers=1:nokey=1", path
            ],
            capture_output=True,
        )
        try:
            return float(probe.stdout.decode().strip())
        except Exception:
            return None
    return None

def save_plan(b):
    """Collect the storyboard selections and write them to ``visual_plan.yaml``.

    Button callback; ``b`` (the clicked widget) is not used. For every sentence
    the plan records its audio file, durations, background and optional video
    clip. A sentence without an explicit background inherits the closest earlier
    one. Everything printed here lands in the ``output`` widget.
    """
    with output:
        output.clear_output()
        plan_rows = []
        duration_notes = []

        # Resolve each background drop-down to a path and forward-fill the gaps
        # in a single pass (entries before the first explicit choice stay None).
        backgrounds = []
        carried_bg = None
        for selector in bg_selectors:
            picked = selector.value
            if picked and picked != "(無)":
                in_subdir = os.path.join(subdir, picked)
                in_ammo = os.path.join(ammunition_dir, picked)
                if os.path.exists(in_subdir):
                    carried_bg = in_subdir
                elif os.path.exists(in_ammo):
                    carried_bg = in_ammo
                else:
                    carried_bg = picked
            backgrounds.append(carried_bg)

        for i, sentence in enumerate(valid_sentences):
            scene_bg = backgrounds[i]

            # Optional video clip: either a known label or "<file> (<extra>)".
            clip_label = vid_selectors[i].value
            clip = None
            clip_seconds = None
            if clip_label != "(無)":
                if clip_label in video_map:
                    clip = video_map[clip_label]
                else:
                    clip = clip_label.split(" (")[0]
                clip_seconds = get_duration(os.path.join(ammunition_dir, clip))

            audio_name = valid_audio_files[i]
            if i < len(audio_durations) and audio_durations[i]:
                speech_seconds = audio_durations[i]
            else:
                speech_seconds = 0
            elapsed = accum_durations[i] if i < len(accum_durations) else 0

            # Flag clips whose length differs from the narration by more than 1 s.
            if clip and clip_seconds and speech_seconds and abs(clip_seconds - speech_seconds) > 1.0:
                duration_notes.append(f"⚠️ 句{i+1}影片({clip_seconds:.2f}s)與語音({speech_seconds:.2f}s)時長不符。")

            plan_rows.append({
                "sentence_idx": i,
                "sentence": sentence,
                "audio_file": audio_name,
                "audio_duration": speech_seconds,
                "accum_duration": elapsed,
                "background": scene_bg,
                "video_clip": clip,
                "video_duration": clip_seconds
            })

        visual_plan_path = os.path.join(subdir, "visual_plan.yaml")
        with open(visual_plan_path, "w", encoding="utf-8") as f:
            yaml.safe_dump(plan_rows, f, allow_unicode=True)
        print(f"分鏡表已儲存至 {visual_plan_path}")
        if duration_notes:
            print("\n".join(duration_notes))
        print("前3項：")
        for row in plan_rows[:3]:
            print(row)

# Hook the save button to save_plan and show it together with its output area.
save_btn.on_click(save_plan)
display(save_btn, output)
# Context banner: project title, working sub-directory and current directory.
print("="*40)
print(f"專案名稱: {config.get('project_title', config.get('title', '未設定'))}")
print(f"Current subdir: {subdir}")
print(f"目前工作目錄：{os.getcwd()}")
print("="*40)

In [None]:
import os
import yaml
import datetime
import re
from snownlp import SnowNLP
import ipywidgets as widgets
from IPython.display import display, clear_output
from dotenv import load_dotenv
import subprocess

print("=== STEP 0: 專案資料 ===")
main_dir = os.getcwd()
# Same fallback as the STEP 1 cell: a missing or empty config.yaml yields an
# empty config instead of a crash (yaml.safe_load returns None for an empty file).
if os.path.exists('config.yaml'):
    with open('config.yaml', encoding='utf-8') as f:
        config = yaml.safe_load(f) or {}
else:
    config = {}
project_title = str(config.get('project_title', config.get('title', 'My YouTube Video')))
today = datetime.datetime.now().strftime("%Y%m%d")
# Strip characters that are illegal in file names and cap the length at 20.
safe_title = re.sub(r'[\\/:*?"<>|\n\r\t]', '', project_title)[:20]
subdir = os.path.join(main_dir, f"{today}_{safe_title}")
ammunition_dir = os.path.join(main_dir, "ammunition")
text_file = os.path.join(ammunition_dir, config.get('text', 'text.txt'))
input_text = config.get('input_text') or ''

# Every later step writes into these folders, so make sure they exist up front.
os.makedirs(subdir, exist_ok=True)
os.makedirs(ammunition_dir, exist_ok=True)
os.makedirs(os.path.dirname(text_file), exist_ok=True)

print("Current subdir (引用):", subdir)
print("目前工作目錄：", os.getcwd())
print("資源目錄(ammunition):", ammunition_dir)

# Update text.txt from config. When the config carries no input_text, an
# existing script file is kept instead of being overwritten with an empty string.
if input_text or not os.path.exists(text_file):
    with open(text_file, 'w', encoding='utf-8') as f:
        f.write(input_text)
else:
    print(f"[INFO] config has no input_text; keeping existing {text_file}")

# 句子分割
def is_pronounceable(s):
    """Tell whether ``s`` contains at least one character in U+4E00–U+9FFF, an ASCII letter or a digit."""
    return re.search(r'[\u4e00-\u9fffA-Za-z0-9]', s) is not None
def clean_markdown(s):
    """Strip Markdown decoration from a sentence.

    Removes a leading run of ``#``, ``-``, ``*``, ``>`` and whitespace, deletes
    the inline markers ``* ` _ > # [ ] ( ) - ~ =`` and the corner brackets
    ``「」`` wherever they occur, then collapses whitespace runs to one space and
    trims both ends.
    """
    without_prefix = re.sub(r"^[#\-*\s>]+", "", s)
    # Single-character deletions are done with one translation table.
    without_marks = without_prefix.translate(str.maketrans("", "", "*`_>#[]()-~=「」"))
    return re.sub(r"\s+", " ", without_marks).strip()
# Pre-bind `text` so the fallback below works even when reading the file fails
# (previously that path raised NameError inside the except block).
text = ""
try:
    with open(text_file, encoding="utf-8") as f:
        text = f.read().strip()
    sentences = [s.strip() for s in SnowNLP(text).sentences if s.strip()]
except Exception as e:
    print(f"[ERROR] SnowNLP sentence segmentation failed: {e}")
    # Fall back to treating the whole script as one sentence.
    sentences = [text] if text else []
# Drop unpronounceable fragments, strip Markdown, then filter again because
# cleaning can leave a fragment with nothing pronounceable in it.
sentences = [clean_markdown(s) for s in sentences if is_pronounceable(s)]
sentences = [s for s in sentences if is_pronounceable(s)]
print(f"Total sentences after filtering: {len(sentences)}")

print("\n=== STEP 1: TTS語音合成 ===")
load_dotenv()
tts_server = config.get('tts_server', 'azure')
audio_files = []
failed_sentences = []


def _voice_ready(path):
    """Return True when ``path`` exists and is not empty."""
    return os.path.exists(path) and os.path.getsize(path) > 0


def _synthesize_azure(only_missing):
    """Synthesize each sentence to ``voice_{i}.mp3`` with Azure Speech.

    Appends successful paths to ``audio_files`` and ``(index, sentence)`` pairs
    to ``failed_sentences``. With ``only_missing`` set, files that already exist
    are kept and counted as successful.
    """
    import azure.cognitiveservices.speech as speechsdk
    speech_key = os.environ.get("AZURE_SPEECH_KEY")
    speech_region = os.environ.get("AZURE_SPEECH_REGION")
    VOICE = config.get('azure_voice', 'zh-TW-YunJheNeural')
    print(f"[CHECK] Azure voice_id (voice_name) used for synthesis: {VOICE}")
    # The speech configuration is identical for every sentence; build it once.
    speech_config = speechsdk.SpeechConfig(subscription=speech_key, region=speech_region)
    speech_config.speech_synthesis_voice_name = VOICE
    speech_config.set_speech_synthesis_output_format(
        speechsdk.SpeechSynthesisOutputFormat.Audio16Khz32KBitRateMonoMp3
    )
    for i, sentence in enumerate(sentences):
        mp3_fname = os.path.join(subdir, f"voice_{i}.mp3")
        if only_missing and _voice_ready(mp3_fname):
            audio_files.append(mp3_fname)
            continue
        audio_config = speechsdk.audio.AudioOutputConfig(filename=mp3_fname)
        synthesizer = speechsdk.SpeechSynthesizer(speech_config=speech_config, audio_config=audio_config)
        result = synthesizer.speak_text_async(sentence).get()
        if result.reason == speechsdk.ResultReason.SynthesizingAudioCompleted:
            print(f"[OK] 合成完成: {mp3_fname}")
            audio_files.append(mp3_fname)
        elif result.reason == speechsdk.ResultReason.Canceled:
            # Synthesis results expose their details through this property;
            # speechsdk.CancellationDetails(...) is meant for recognition results.
            cancellation_details = result.cancellation_details
            print(f"[ERROR] 合成失敗: {sentence}")
            print(f"Reason: {cancellation_details.reason}")
            print(f"Details: {cancellation_details.error_details}")
            failed_sentences.append((i, sentence))
        else:
            print(f"[ERROR] TTS失敗(未知): {sentence}, Reason: {result.reason}")
            failed_sentences.append((i, sentence))


def _synthesize_minimax(only_missing):
    """Synthesize each sentence with the MiniMax T2A v2 HTTP API.

    The API is asked for a download URL (``output_format: url``); the audio is
    then fetched and stored as ``voice_{i}.<format>``. Bookkeeping matches
    ``_synthesize_azure``.
    """
    import requests
    api_key = os.environ.get("MINIMAX_SPEECH_KEY")
    # Prefer MINIMAX_GROUP_ID from .env; the previously hard-coded id is kept
    # only as a fallback. NOTE(review): consider dropping the literal entirely.
    group_id = os.environ.get("MINIMAX_GROUP_ID", "1982992498867311582")
    MINIMAX_URL = f"https://api.minimax.io/v1/t2a_v2?GroupId={group_id}"
    MODEL = config.get("minimax_model", "speech-02-hd")
    VOICE = config.get("minimax_voice", "Chinese (Mandarin)_Warm_Bestie")
    EMOTION = config.get("minimax_emotion", "calm")
    VOL = config.get("minimax_vol", 1.0)
    SPEED = config.get("minimax_speed", 1.0)
    PITCH = config.get("minimax_pitch", 0)
    AUDIO_FORMAT = config.get("minimax_audio_format", "mp3")
    SAMPLE_RATE = config.get("minimax_sample_rate", 32000)
    BITRATE = config.get("minimax_bitrate", 128000)
    CHANNEL = config.get("minimax_channel", 1)
    headers = {
        "Authorization": f"Bearer {api_key}",
        "Content-Type": "application/json"
    }
    print(f"[CHECK] MINIMAX voice_id (voice_name) used for synthesis: {VOICE}")
    for i, sentence in enumerate(sentences):
        fname = os.path.join(subdir, f"voice_{i}.{AUDIO_FORMAT}")
        if only_missing and _voice_ready(fname):
            audio_files.append(fname)
            continue
        payload = {
            "model": MODEL,
            "text": sentence,
            "voice_setting": {
                "voice_id": VOICE,
                "speed": SPEED,
                "vol": VOL,
                "pitch": PITCH,
                "emotion": EMOTION
            },
            "audio_setting": {
                "sample_rate": SAMPLE_RATE,
                "bitrate": BITRATE,
                "format": AUDIO_FORMAT,
                "channel": CHANNEL
            },
            "output_format": "url",
            "language_boost": "auto",
            "subtitle_enable": False
        }
        try:
            # Timeouts keep one stalled request from hanging the whole batch.
            r = requests.post(MINIMAX_URL, headers=headers, json=payload, timeout=60)
            if r.status_code != 200:
                print(f"[ERROR] MINIMAX TTS API HTTP錯誤: {r.status_code}")
                failed_sentences.append((i, sentence))
                continue
            data = r.json()
            base_resp = data.get("base_resp", {})
            if base_resp.get("status_code") != 0:
                print(f"[ERROR] MINIMAX API錯誤: {base_resp.get('status_msg', '')}")
                failed_sentences.append((i, sentence))
                continue
            audio_url = (data.get("data") or {}).get("audio")
            if not audio_url:
                print(f"[ERROR] MINIMAX未取得音檔URL: {data}")
                failed_sentences.append((i, sentence))
                continue
            r2 = requests.get(audio_url, timeout=60)
            if r2.status_code != 200:
                print(f"[ERROR] MINIMAX音檔下載失敗: {r2.status_code}")
                failed_sentences.append((i, sentence))
                continue
        except (requests.RequestException, ValueError) as e:
            # Network failure or a non-JSON body: record it and move on.
            print(f"[ERROR] MINIMAX request failed: {e}")
            failed_sentences.append((i, sentence))
            continue
        with open(fname, "wb") as f:
            f.write(r2.content)
        print(f"[OK] MINIMAX合成完成: {fname}")
        audio_files.append(fname)


def synthesize_all_tts(only_missing=False):
    """Run TTS for every sentence with the configured ``tts_server``.

    Args:
        only_missing: when True, sentences whose voice file already exists and
            is non-empty are skipped; the default regenerates everything.

    Resets and refills the module-level ``audio_files`` and ``failed_sentences``.
    """
    global audio_files, failed_sentences
    audio_files = []
    failed_sentences = []
    # The synthesizers write straight into subdir, so it must exist.
    os.makedirs(subdir, exist_ok=True)
    if tts_server == 'azure':
        _synthesize_azure(only_missing)
    elif tts_server == 'minimax':
        _synthesize_minimax(only_missing)
    else:
        print("[ERROR] 未支援的 TTS server 設定：", tts_server)
    print("TTS synthesis complete. Valid files:", len(audio_files))
    if failed_sentences:
        print("Failed sentences:")
        for idx, s in failed_sentences:
            print(f"Sentence {idx}: '{s}'")
    else:
        print("所有語音檔已重新生成完成。")


# True only when every sentence already has a non-empty voice_{i}.mp3.
tts_files_exist = all(
    _voice_ready(os.path.join(subdir, f"voice_{i}.mp3")) for i in range(len(sentences))
)

if tts_files_exist:
    choice = widgets.ToggleButtons(
        options=['全部跳過', '全部重新TTS'],
        description='所有語音檔已存在，是否要全部重新TTS？',
        style={'description_width': 'initial'}
    )
    output = widgets.Output()
    display(choice, output)

    def on_choice_change(change):
        """Handle the skip / regenerate toggle; all messages go to ``output``."""
        if change['name'] == 'value':
            with output:
                clear_output()
                if change['new'] == '全部跳過':
                    print("已選擇跳過語音合成，全部僅引用 subdir 下現有檔案。")
                else:
                    print("開始重新TTS生成所有語音...")
                    synthesize_all_tts()
    choice.observe(on_choice_change, names='value')
else:
    print("部分語音檔不存在，將直接生成缺漏語音檔...")
    # Previously this branch only printed the message, so a fresh project never
    # got any audio. Generate just the files that are missing.
    synthesize_all_tts(only_missing=True)

print("\n=== STEP 2: 合併語音檔 ===")
voice_list = [os.path.join(subdir, f"voice_{i}.mp3") for i in range(len(sentences))]
valid_voice_list = [f for f in voice_list if os.path.exists(f) and os.path.getsize(f) > 0]
concat_list_path = os.path.join(subdir, "voice_list.txt")
concat_voice_path = os.path.join(subdir, "voice.mp3")
if not valid_voice_list:
    # Nothing to concatenate; running ffmpeg on an empty list can only fail.
    print("[ERROR] No voice files found to merge; run the TTS step first.")
else:
    os.makedirs(subdir, exist_ok=True)
    with open(concat_list_path, "w", encoding="utf-8") as f:
        for vf in valid_voice_list:
            # Inside a single-quoted concat entry a literal quote is written as '\''.
            escaped = vf.replace("'", "'\\''")
            f.write(f"file '{escaped}'\n")
    merge_result = subprocess.run([
        "ffmpeg", "-y", "-f", "concat", "-safe", "0",
        "-i", concat_list_path, "-c", "copy", concat_voice_path
    ])
    # Report success only when ffmpeg actually succeeded.
    if merge_result.returncode == 0:
        print("合併完成:", concat_voice_path)
    else:
        print(f"[ERROR] ffmpeg concat failed (exit code {merge_result.returncode})")

print("\n=== STEP 3: 產生字幕檔 (.srt) ===")
# Each cue lasts as long as its voice file (measured with ffprobe); when the
# duration cannot be measured a 2-second estimate is used.
srt_path = os.path.join(subdir, "subtitle.srt")


def _srt_timestamp(seconds):
    """Format ``seconds`` as an SRT timestamp ``hh:mm:ss,mmm`` (rounded to the millisecond)."""
    total_ms = int(round(seconds * 1000))
    hours, rest = divmod(total_ms, 3600 * 1000)
    minutes, rest = divmod(rest, 60 * 1000)
    secs, millis = divmod(rest, 1000)
    return "%02d:%02d:%02d,%03d" % (hours, minutes, secs, millis)


# Map every voice file back to its own sentence. valid_voice_list skips missing
# files, so its positions do not line up with `sentences` once a file is absent.
sentence_by_voice = dict(zip(voice_list, sentences))

start_time = 0.0
with open(srt_path, "w", encoding="utf-8") as f:
    for cue_no, vf in enumerate(valid_voice_list, start=1):
        # Measure the length of this sentence's audio.
        try:
            probe = subprocess.run([
                "ffprobe", "-v", "error", "-show_entries",
                "format=duration", "-of", "default=noprint_wrappers=1:nokey=1", vf
            ], capture_output=True)
            dur = float(probe.stdout.decode().strip())
        except (OSError, ValueError):
            dur = 2.0
        end_time = start_time + dur
        f.write(
            f"{cue_no}\n{_srt_timestamp(start_time)} --> {_srt_timestamp(end_time)}\n"
            f"{sentence_by_voice[vf]}\n\n"
        )
        start_time = end_time
print("字幕檔完成:", srt_path)

print("\n=== STEP 4: 合成配樂 ===")
bgm_file = os.path.join(ammunition_dir, config.get('bgm', 'bgm.mp3'))
bgm_volume = float(config.get('bgm_volume', 0.3))
voice_with_bgm_path = os.path.join(subdir, "voice_bgm.mp3")
if not os.path.exists(bgm_file):
    print(f"[ERROR] BGM file not found: {bgm_file}")
else:
    # Scale the music to bgm_volume and mix it under the narration; the output
    # length follows the narration (duration=first).
    mix_result = subprocess.run([
        "ffmpeg", "-y",
        "-i", concat_voice_path,
        "-i", bgm_file,
        "-filter_complex", f"[1:a]volume={bgm_volume}[bgm];[0:a][bgm]amix=inputs=2:duration=first:dropout_transition=3",
        "-c:a", "mp3",
        voice_with_bgm_path
    ])
    # Report success only when ffmpeg actually succeeded.
    if mix_result.returncode == 0:
        print("語音+配樂合成完成:", voice_with_bgm_path)
    else:
        print(f"[ERROR] ffmpeg BGM mix failed (exit code {mix_result.returncode})")

from IPython.display import clear_output
# NOTE(review): this wipes everything the cell has shown so far, including the
# TTS toggle displayed in STEP 1 — confirm that is intended.
clear_output(wait=True)

import shutil
import os
import re
import subprocess

print("\n=== STEP 5: 產生影片（全部搬到主程式目錄） ===")

main_dir = os.getcwd()

# Source files
background_file_src = os.path.join(subdir, "background.jpg")
voice_with_bgm_path_src = os.path.join(subdir, "voice_bgm.mp3")
srt_path_src = os.path.join(subdir, "subtitle.srt")

# Report which inputs are present
for fname in ["background.jpg", "voice_bgm.mp3", "subtitle.srt"]:
    fpath = os.path.join(subdir, fname)
    print(f"{fname}: {os.path.exists(fpath)} ({fpath})")

# Fall back to the configured background image from the resource folder.
if not os.path.exists(background_file_src):
    source_bg = os.path.join(ammunition_dir, config.get('background', 'background.jpg'))
    if os.path.exists(source_bg):
        shutil.copy2(source_bg, background_file_src)
        print("補上 background.jpg 到 subdir")

# Copy the inputs next to the notebook (only those that exist) so ffmpeg can use
# short relative names — notably inside the subtitles filter.
for src, dst in [
    (background_file_src, os.path.join(main_dir, "background.jpg")),
    (voice_with_bgm_path_src, os.path.join(main_dir, "voice_bgm.mp3")),
    (srt_path_src, os.path.join(main_dir, "subtitle.srt"))
]:
    if os.path.exists(src):
        shutil.copy2(src, dst)
    else:
        print(f"缺少檔案：{src}，請先確認流程前面已正確產生。")

# Accept "1920,1080", "1920x1080" or a two-item YAML list.
video_resolution = config.get('video_resolution', '1920,1080')
if isinstance(video_resolution, str):
    video_resolution = re.split(r'[,xX]', video_resolution)
width, height = (int(str(part).strip()) for part in video_resolution)

cmd = [
    "ffmpeg", "-y",
    "-loop", "1", "-i", "background.jpg",
    "-i", "voice_bgm.mp3",
    "-vf", f"scale={width}:{height},subtitles=subtitle.srt",
    "-shortest",
    "-c:v", "libx264", "-c:a", "aac",
    "-b:a", "192k",
    "final.mp4"
]
print("執行指令：", " ".join(cmd))

result = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, cwd=main_dir)
print("ffmpeg stdout:", result.stdout.decode('utf-8', errors='ignore'))
print("ffmpeg stderr:", result.stderr.decode('utf-8', errors='ignore'))

final_mp4_file = os.path.join(main_dir, "final.mp4")
final_mp4_dst = os.path.join(subdir, "final.mp4")
# Require a clean exit as well as the file: a final.mp4 left over from an
# earlier run must not be reported as a fresh success.
if result.returncode == 0 and os.path.exists(final_mp4_file):
    shutil.copy2(final_mp4_file, final_mp4_dst)
    print("影片合成完成:", final_mp4_dst)
    print("影片複本已保留於:", final_mp4_file)
else:
    print("影片合成失敗，請檢查錯誤訊息。")

In [None]:
# cell 8.2

import os
import openai
from PIL import Image
from io import BytesIO
from IPython.display import display, Image as IPyImage, FileLink
import requests
from dotenv import load_dotenv

# Load .env and configure the OpenAI key. STEP 1 validates OPENAI_API_KEY, so
# read that name first; the older OPENAI_API name is still accepted as a fallback.
load_dotenv()
openai.api_key = os.getenv("OPENAI_API_KEY") or os.getenv("OPENAI_API")

# Read the script from text.txt
text_path = os.path.join(ammunition_dir, "text.txt")
with open(text_path, encoding="utf-8") as f:
    script_text = f.read().strip()

# 1️⃣ 劇本自動斷句分鏡（最多4場景）
def auto_split_script(text, max_scene=4):
    """Split a script into at most ``max_scene`` scenes of consecutive lines.

    Blank lines are ignored. Non-blank lines are divided into equally sized
    chunks, with the last scene absorbing the remainder; each scene is its lines
    joined by single spaces.

    Args:
        text: the full script.
        max_scene: upper bound on the number of scenes.

    Returns:
        A list of scene strings. Empty when the text has no non-blank line or
        ``max_scene`` is not positive. When exactly one scene results, the
        original ``text`` is returned unchanged as the only element.
    """
    lines = [l.strip() for l in text.split('\n') if l.strip()]
    n = min(max_scene, len(lines))
    if n <= 0:
        # Nothing to split (previously this divided by zero).
        return []
    if n == 1:
        return [text]
    chunk_len = len(lines) // n  # n <= len(lines), so this is at least 1
    scenes = []
    for i in range(n):
        start = i * chunk_len
        end = (i + 1) * chunk_len if i < n - 1 else len(lines)
        scene = ' '.join(lines[start:end])
        if scene:
            scenes.append(scene)
    return scenes

# Split the script into at most four scenes and report how many were produced.
scenes = auto_split_script(script_text, max_scene=4)
print(f"自動分割為 {len(scenes)} 個場景。")

# 2️⃣ 為每個場景生成獨特的簡約風繪圖 prompt
def make_prompt(scene, idx):
    """Build the image-generation prompt for one scene.

    Args:
        scene: scene text to illustrate.
        idx: zero-based scene index; the prompt shows it one-based.

    Returns:
        A prompt asking for a minimalist, modern, high-contrast thumbnail.
    """
    template = (
        "Minimalist, modern, unique illustration for a YouTube thumbnail, "
        "1920x1080, clean design, simple shapes, flat colors, high contrast. "
        "Scene {number}: {text}. The image should be visually striking and creative."
    )
    return template.format(number=idx + 1, text=scene)

# One image prompt per scene, in scene order.
prompts = [make_prompt(scene, i) for i, scene in enumerate(scenes)]

# 3️⃣ 呼叫 OPENAI DALL·E 3 API 產生影像，預覽與下載
def dalle3_generate(prompt, idx, size="1792x1024"):
    """Generate one scene image with DALL·E 3 and store it as a 1920x1080 JPEG.

    Args:
        prompt: text prompt sent to the image API.
        idx: zero-based scene index; the file is named ``scene_{idx+1}.jpg``.
        size: size requested from the API. DALL·E 3 accepts '1024x1024',
            '1024x1792' and '1792x1024'; the widest one is the default because
            '1920x1080' is not an accepted value.

    Returns:
        Path of the saved JPEG inside ``subdir``.

    Raises:
        requests.HTTPError: when the generated image cannot be downloaded.
    """
    print(f"呼叫DALL·E生成場景{idx+1}...")
    response = openai.images.generate(
        model="dall-e-3",
        prompt=prompt,
        size=size,
        n=1,
        quality="standard"
    )
    url = response.data[0].url
    download = requests.get(url, timeout=60)
    download.raise_for_status()
    # Decode whatever format was returned and re-encode as a real JPEG, rather
    # than writing the raw bytes under a .jpg name.
    img = Image.open(BytesIO(download.content)).convert("RGB")
    img = img.resize((1920, 1080), resample=Image.LANCZOS)
    fname = os.path.join(subdir, f"scene_{idx+1}.jpg")
    img.save(fname, format="JPEG")
    display(IPyImage(filename=fname))
    print(f"已儲存場景{idx+1}圖片：{fname}")
    return fname

# Generate one image per prompt; each call requests an image from the API and
# saves it into subdir, so this loop is slow and billable.
scene_images = []
for idx, prompt in enumerate(prompts):
    print(f"場景{idx+1} prompt：{prompt}")
    fname = dalle3_generate(prompt, idx)
    scene_images.append(fname)

# Offer a download link for every generated file.
print("\n下載圖片：")
for fname in scene_images:
    display(FileLink(fname))

In [None]:
# cell 15
# Imports first so `os` is bound in this cell before its first use below.
import os
from datetime import datetime, timedelta, timezone
from IPython.display import Video as ShowVideo, Image as ShowImage

print("專案名稱:", config.get("project_title", config.get("title", "未設定")))
print("Current subdir:", subdir)
print("目前工作目錄：", os.getcwd())

# --- Use consistent variables and paths ---
project_title = config.get('project_title', config.get('title', 'test'))
project_author = config.get('project_author', config.get('author', '未設定'))
attribution = config.get('attribution', '')
thumbnail_name = config.get('thumbnail', 'thumbnail.jpg')
thumbnail_path = os.path.join(subdir, thumbnail_name)

final_video_name = f"final_{project_title}.mp4"
outname_final = os.path.join(main_dir, final_video_name)

# --- Display thumbnail and final video ---
if os.path.exists(thumbnail_path):
    display(ShowImage(thumbnail_path))
else:
    print(f"Thumbnail not found: {thumbnail_path}")

if os.path.exists(outname_final):
    # A non-embedded <video> tag is fetched by the browser, which cannot resolve
    # an absolute filesystem path passed as a URL. Use a path relative to the
    # working directory instead.
    display(ShowVideo(filename=os.path.relpath(outname_final)))
else:
    print(f"Final video not found: {outname_final}")

# --- Export YouTube metadata ---
youtube_metadata_path = os.path.join(subdir, "youtube_metadata.txt")
subtitle_srt_path = os.path.join(subdir, "subtitle.srt")  # ASS to SRT conversion not shown, but path reserved
# Current time at UTC+8. The name is kept for compatibility although the value
# is local (UTC+8) time; timezone-aware now() replaces the deprecated utcnow().
dt_utc_now = datetime.now(timezone(timedelta(hours=8)))

with open(youtube_metadata_path, "w", encoding="utf-8") as meta:
    meta.write(f"Title: {project_title}\n")
    meta.write(f"Author: {project_author}\n")
    meta.write(f"Date: {dt_utc_now.strftime('%Y-%m-%d')}\n")
    meta.write(f"Description: {attribution}\n")
    meta.write(f"Subtitle SRT: {subtitle_srt_path}\n")
    meta.write(f"Thumbnail: {thumbnail_path}\n")

print(f"Exported metadata for YouTube upload: {youtube_metadata_path}")
print("="*40)
print(f"專案名稱: {project_title}")
print(f"Current subdir: {subdir}")
print(f"目前工作目錄：{os.getcwd()}")
print("="*40)

In [None]:
# cell 8.2

import os
import openai
from PIL import Image
from io import BytesIO
from IPython.display import display, Image as IPyImage, FileLink
import requests
from dotenv import load_dotenv

# Load .env and configure the OpenAI key. STEP 1 validates OPENAI_API_KEY, so
# read that name first; the older OPENAI_API name is still accepted as a fallback.
load_dotenv()
openai.api_key = os.getenv("OPENAI_API_KEY") or os.getenv("OPENAI_API")

# Read the script from text.txt
text_path = os.path.join(ammunition_dir, "text.txt")
with open(text_path, encoding="utf-8") as f:
    script_text = f.read().strip()

def auto_split_script(text, max_scene=4):
    """Split a script into at most ``max_scene`` scenes of consecutive lines.

    Blank lines are ignored. Non-blank lines are divided into equally sized
    chunks, with the last scene absorbing the remainder; each scene is its lines
    joined by single spaces.

    Args:
        text: the full script.
        max_scene: upper bound on the number of scenes.

    Returns:
        A list of scene strings. Empty when the text has no non-blank line or
        ``max_scene`` is not positive. When exactly one scene results, the
        original ``text`` is returned unchanged as the only element.
    """
    lines = [l.strip() for l in text.split('\n') if l.strip()]
    n = min(max_scene, len(lines))
    if n <= 0:
        # Nothing to split (previously this divided by zero).
        return []
    if n == 1:
        return [text]
    chunk_len = len(lines) // n  # n <= len(lines), so this is at least 1
    scenes = []
    for i in range(n):
        start = i * chunk_len
        end = (i + 1) * chunk_len if i < n - 1 else len(lines)
        scene = ' '.join(lines[start:end])
        if scene:
            scenes.append(scene)
    return scenes

# Split the script into at most four scenes and report how many were produced.
scenes = auto_split_script(script_text, max_scene=4)
print(f"自動分割為 {len(scenes)} 個場景。")

def make_prompt(scene, idx):
    """Build the image-generation prompt for one scene.

    Args:
        scene: scene text to illustrate.
        idx: zero-based scene index; the prompt shows it one-based.

    Returns:
        A prompt asking for a minimalist, flat-colour, high-contrast thumbnail.
    """
    pieces = [
        "Minimalist, modern, unique illustration for a YouTube thumbnail. ",
        "Scene {number}: {text}. Flat color, simple composition, high contrast. ".format(
            number=idx + 1, text=scene
        ),
        "1920x1080 ratio, striking and creative.",
    ]
    return "".join(pieces)

# One image prompt per scene, in scene order.
prompts = [make_prompt(scene, i) for i, scene in enumerate(scenes)]

def dalle3_generate(prompt, idx, size="1792x1024"):  # widest landscape size DALL·E 3 accepts
    """Generate one scene image with DALL·E 3 and store it as a 1920x1080 JPEG.

    Args:
        prompt: text prompt sent to the image API.
        idx: zero-based scene index; the file is named ``scene_{idx+1}.jpg``.
        size: size requested from the API; accepted values are '1024x1024',
            '1024x1792' and '1792x1024'.

    Returns:
        Path of the saved JPEG inside ``subdir``.

    Raises:
        requests.HTTPError: when the generated image cannot be downloaded.
    """
    print(f"呼叫DALL·E生成場景{idx+1}...")
    response = openai.images.generate(
        model="dall-e-3",
        prompt=prompt,
        size=size,
        n=1,
        quality="standard"
    )
    url = response.data[0].url
    download = requests.get(url, timeout=60)
    download.raise_for_status()
    # JPEG has no alpha channel, so normalise to RGB before saving.
    img = Image.open(BytesIO(download.content)).convert("RGB")
    # Resize to 1920x1080
    img_resized = img.resize((1920, 1080), resample=Image.LANCZOS)
    fname = os.path.join(subdir, f"scene_{idx+1}.jpg")
    img_resized.save(fname, format="JPEG")
    # Preview the encoded file. Image.tobytes() returns raw pixel data, not a
    # JPEG stream, so it cannot be handed to IPython's Image as 'jpeg'.
    display(IPyImage(filename=fname))
    print(f"已儲存場景{idx+1}圖片：{fname}")
    return fname

# Generate one image per prompt; each call requests an image from the API and
# saves it into subdir, so this loop is slow and billable.
scene_images = []
for idx, prompt in enumerate(prompts):
    print(f"場景{idx+1} prompt：{prompt}")
    fname = dalle3_generate(prompt, idx)
    scene_images.append(fname)

# Offer a download link for every generated file.
print("\n下載圖片：")
for fname in scene_images:
    display(FileLink(fname))

In [None]:
import os
import azure.cognitiveservices.speech as speechsdk
from dotenv import load_dotenv

# Load .env and read the Azure Speech credentials.
load_dotenv()
AZURE_SPEECH_KEY = os.environ.get("AZURE_SPEECH_KEY")
AZURE_SPEECH_REGION = os.environ.get("AZURE_SPEECH_REGION")
# Voice and prosody settings passed to the helper functions below.
VOICE = "zh-TW-YunJheNeural"
SPEED = 1.0  # inserted verbatim into the SSML prosody `rate` attribute
PITCH = 0    # inserted verbatim into the SSML prosody `pitch` attribute

def azure_tts_api(text, voice, speed, pitch):
    """Synthesize ``text`` with Azure Speech and save it to a temporary .wav file.

    Args:
        text: plain text to speak.
        voice: Azure voice name, e.g. ``zh-TW-YunJheNeural``.
        speed, pitch: accepted for signature parity with ``azure_tts_to_file``;
            this plain-text preview does not apply them.

    Returns:
        Path of the temporary file holding the synthesized audio, or ``None``
        when synthesis was cancelled or failed. The file is not deleted
        automatically.
    """
    import tempfile

    speech_config = speechsdk.SpeechConfig(subscription=AZURE_SPEECH_KEY, region=AZURE_SPEECH_REGION)
    speech_config.speech_synthesis_voice_name = voice
    synthesizer = speechsdk.SpeechSynthesizer(speech_config=speech_config)
    result = synthesizer.speak_text_async(text).get()
    if result.reason == speechsdk.ResultReason.Canceled:
        # Synthesis results expose their details through this property;
        # speechsdk.CancellationDetails(...) is meant for recognition results.
        cancellation_details = result.cancellation_details
        print(f"Azure TTS失敗: {cancellation_details.reason}\n{cancellation_details.error_details}")
        return None
    if result.reason == speechsdk.ResultReason.SynthesizingAudioCompleted:
        # The context manager closes the handle even if the write fails.
        with tempfile.NamedTemporaryFile(delete=False, suffix='.wav') as tf:
            tf.write(result.audio_data)
        print(f"合成完成 (preview): {tf.name}")
        return tf.name
    print(f"Azure TTS失敗: {getattr(result, 'error_details', str(result.reason))}")
    return None

def azure_tts_to_file(text, voice, speed, pitch, filename):
    """Synthesize ``text`` to an MP3 file through Azure Speech using SSML.

    Args:
        text: plain text to speak; it is XML-escaped before being embedded.
        voice: Azure voice name.
        speed: value for the SSML prosody ``rate`` attribute.
        pitch: value for the SSML prosody ``pitch`` attribute.
            NOTE(review): inserted as given — confirm that a bare number such
            as ``0`` is a pitch value the service accepts.
        filename: destination path of the MP3 (16 kHz, 32 kbit/s, mono).

    Returns:
        ``True`` when synthesis completed, ``False`` otherwise.
    """
    from xml.sax.saxutils import escape

    speech_config = speechsdk.SpeechConfig(subscription=AZURE_SPEECH_KEY, region=AZURE_SPEECH_REGION)
    speech_config.speech_synthesis_voice_name = voice
    speech_config.set_speech_synthesis_output_format(
        speechsdk.SpeechSynthesisOutputFormat.Audio16Khz32KBitRateMonoMp3
    )
    audio_config = speechsdk.audio.AudioOutputConfig(filename=filename)
    synthesizer = speechsdk.SpeechSynthesizer(speech_config=speech_config, audio_config=audio_config)
    # Escape &, < and > so arbitrary sentences cannot break the SSML document,
    # and declare the SSML namespace on the root element.
    ssml = f"""<speak version='1.0' xmlns='http://www.w3.org/2001/10/synthesis' xml:lang='zh-TW'>
      <voice name='{voice}'>
        <prosody rate='{speed}' pitch='{pitch}'>{escape(str(text))}</prosody>
      </voice>
    </speak>"""
    result = synthesizer.speak_ssml_async(ssml).get()
    if result.reason == speechsdk.ResultReason.SynthesizingAudioCompleted:
        print(f"[OK] 批次合成完成: {filename}")
        return True
    elif result.reason == speechsdk.ResultReason.Canceled:
        # Synthesis results expose their details through this property;
        # speechsdk.CancellationDetails(...) is meant for recognition results.
        cancellation_details = result.cancellation_details
        print(f"[ERROR] 批次合成失敗: {filename}")
        print(f"Reason: {cancellation_details.reason}")
        print(f"Details: {cancellation_details.error_details}")
        print(f"SSML: {ssml}")
        return False
    else:
        print(f"[ERROR] TTS失敗(未知): {filename}, Reason: {result.reason}")
        return False

# ----------- Preview test (cell3) -----------
print("=== PREVIEW 測試 ===")
azure_tts_api("專案名稱為 test", VOICE, SPEED, PITCH)

# ----------- Batch synthesis (cell4-8) -----------
print("=== 批次合成測試 ===")
# Note: this rebinds the notebook-global `sentences` to two fixed test lines.
sentences = [
    "專案名稱為 test",
    "請在專案開始前確認填寫正確名稱"
]
output_dir = "tts_batch_test"
os.makedirs(output_dir, exist_ok=True)

# One MP3 per test sentence: tts_batch_test/voice_<idx>.mp3
for idx, sentence in enumerate(sentences):
    fname = os.path.join(output_dir, f"voice_{idx}.mp3")
    azure_tts_to_file(sentence, VOICE, SPEED, PITCH, fname)

In [None]:
import os
import azure.cognitiveservices.speech as speechsdk
from dotenv import load_dotenv

# Standalone Azure batch test: plain-text synthesis of two fixed sentences
# into tts_batch_test/voice_<idx>.mp3.
load_dotenv()
AZURE_SPEECH_KEY = os.environ.get("AZURE_SPEECH_KEY")
AZURE_SPEECH_REGION = os.environ.get("AZURE_SPEECH_REGION")
VOICE = "zh-TW-YunJheNeural"

output_dir = "tts_batch_test"
os.makedirs(output_dir, exist_ok=True)

# The speech configuration is the same for every sentence; build it once.
speech_config = speechsdk.SpeechConfig(subscription=AZURE_SPEECH_KEY, region=AZURE_SPEECH_REGION)
speech_config.speech_synthesis_voice_name = VOICE
speech_config.set_speech_synthesis_output_format(
    speechsdk.SpeechSynthesisOutputFormat.Audio16Khz32KBitRateMonoMp3
)

for idx, sentence in enumerate([
    "專案名稱為 test",
    "請在專案開始前確認填寫正確名稱"
]):
    fname = os.path.join(output_dir, f"voice_{idx}.mp3")
    audio_config = speechsdk.audio.AudioOutputConfig(filename=fname)
    synthesizer = speechsdk.SpeechSynthesizer(speech_config=speech_config, audio_config=audio_config)
    result = synthesizer.speak_text_async(sentence).get()
    if result.reason == speechsdk.ResultReason.SynthesizingAudioCompleted:
        print(f"[OK] 批次合成完成: {fname}")
    elif result.reason == speechsdk.ResultReason.Canceled:
        # Synthesis results expose their details through this property;
        # speechsdk.CancellationDetails(...) is meant for recognition results.
        cancellation_details = result.cancellation_details
        print(f"[ERROR] 批次合成失敗: {sentence}")
        print(f"Reason: {cancellation_details.reason}")
        print(f"Details: {cancellation_details.error_details}")
    else:
        # Any other outcome used to pass silently.
        print(f"[ERROR] TTS失敗(未知): {sentence}, Reason: {result.reason}")

In [None]:
import os
import requests
from dotenv import load_dotenv
import tempfile

# Load .env
load_dotenv()
MINIMAX_SPEECH_KEY = os.environ.get("MINIMAX_SPEECH_KEY")
# Prefer MINIMAX_GROUP_ID from .env (the Step 2 cell lists it as required); the
# previously hard-coded id is kept only as a fallback.
# NOTE(review): consider dropping the literal entirely.
MINIMAX_GROUP_ID = os.environ.get("MINIMAX_GROUP_ID", "1982992498867311582")

# Basic parameters
MINIMAX_URL = f"https://api.minimax.io/v1/t2a_v2?GroupId={MINIMAX_GROUP_ID}"
MODEL = "speech-02-hd"
VOICE = "Chinese (Mandarin)_Warm_Bestie"
EMOTION = "calm"
VOL = 1.0
SPEED = 1.0
PITCH = 0
AUDIO_FORMAT = "mp3"
SAMPLE_RATE = 32000
BITRATE = 128000
CHANNEL = 1

sentence = "專案名稱為 test"

headers = {
    "Authorization": f"Bearer {MINIMAX_SPEECH_KEY}",
    "Content-Type": "application/json"
}
# Request body: the API is asked to return a download URL for the audio.
payload = {
    "model": MODEL,
    "text": sentence,
    "voice_setting": {
        "voice_id": VOICE,
        "speed": SPEED,
        "vol": VOL,
        "pitch": PITCH,
        "emotion": EMOTION
    },
    "audio_setting": {
        "sample_rate": SAMPLE_RATE,
        "bitrate": BITRATE,
        "format": AUDIO_FORMAT,
        "channel": CHANNEL
    },
    "output_format": "url",
    "language_boost": "auto",
    "subtitle_enable": False
}

print("MINIMAX TTS測試 payload:")
print(payload)

try:
    # Timeouts keep a stalled connection from hanging the cell indefinitely.
    r = requests.post(MINIMAX_URL, headers=headers, json=payload, timeout=60)
    if r.status_code == 200:
        data = r.json()
        base_resp = data.get("base_resp", {})
        if base_resp.get("status_code") == 0:
            audio_url = (data.get("data") or {}).get("audio")
            print("音檔URL:", audio_url)
            if audio_url:
                r2 = requests.get(audio_url, timeout=60)
                if r2.status_code == 200:
                    # Keep the file (delete=False) but let the context manager
                    # close the handle even if the write fails.
                    with tempfile.NamedTemporaryFile(delete=False, suffix="."+AUDIO_FORMAT) as tf:
                        tf.write(r2.content)
                    print(f"MINIMAX TTS合成成功！檔案: {tf.name}")
                else:
                    print("MINIMAX音檔下載失敗，狀態碼：", r2.status_code)
            else:
                print("MINIMAX未取得音檔URL，API回應：", data)
        else:
            print("MINIMAX API錯誤：", base_resp.get("status_msg", "未知"), "\nAPI回應：", data)
    else:
        print("MINIMAX TTS API失敗，狀態碼：", r.status_code)
        print("回應：", r.text[:500])
except (requests.RequestException, ValueError) as e:
    # Connection problems, timeouts, or a response body that is not JSON.
    print("MINIMAX TTS request failed:", e)