In [None]:
# Load the Whisper "large-v3" model through stable_whisper.
# The resulting `model` object is used by the transcription cell further down.
import stable_whisper
model = stable_whisper.load_model("large-v3")

In [2]:
import numpy as np
from moviepy.editor import VideoFileClip, ImageClip, CompositeVideoClip
from PIL import ImageFont, Image, ImageDraw

def make_text_image(txt, size=(1280, 720), fontsize=36, font="msyh.ttc"):
    """
    Render one subtitle string onto a transparent canvas and wrap it in an ImageClip.

    Args:
        txt: Subtitle text to draw.
        size: (width, height) of the transparent canvas in pixels.
        fontsize: Font size handed to ``ImageFont.truetype``.
        font: TrueType font file name or path handed to ``ImageFont.truetype``.

    Returns:
        An ``ImageClip`` built from the RGBA canvas, with a default duration of
        3 seconds and position ("center", "bottom"). ``add_subtitles_to_video``
        overrides the timing afterwards with ``set_start`` / ``set_end``.

    Raises:
        Propagates whatever ``ImageFont.truetype`` raises when the font cannot
        be loaded.
    """
    canvas_width, canvas_height = size
    bottom_margin = 20  # pixels between the lowest text pixel and the canvas edge
    text_color = (255, 255, 255)  # white

    # Fully transparent RGBA canvas so only the glyphs cover the video.
    img = Image.new('RGBA', size, color=(0, 0, 0, 0))
    draw = ImageDraw.Draw(img)

    # Keep the loaded font under its own name instead of overwriting the
    # `font` parameter (which is the font file name/path).
    loaded_font = ImageFont.truetype(font, fontsize)

    # textbbox reports where the ink lands when the text is drawn at (0, 0).
    # Its left/top values are offsets, not necessarily zero, so they have to be
    # subtracted; using only width/height shifts the text right/down and eats
    # into the bottom margin.
    left, top, right, bottom = draw.textbbox((0, 0), txt, font=loaded_font)
    text_width = right - left

    x = (canvas_width - text_width) / 2 - left      # ink is horizontally centred
    y = canvas_height - bottom_margin - bottom      # ink bottom sits on the margin

    draw.text((x, y), txt, font=loaded_font, fill=text_color)

    # MoviePy consumes NumPy arrays, so convert the PIL image before wrapping it.
    img_array = np.array(img)
    return ImageClip(img_array).set_duration(3).set_position(("center", "bottom"))

def parse_tsv(tsv_path):
    """
    Parse a TSV subtitle file into ``((start, end), text)`` entries.

    A usable line has at least three tab-separated fields: start time in
    milliseconds, end time in milliseconds, and the subtitle text (third field).

    Args:
        tsv_path: Path to a UTF-8 encoded TSV file (a leading BOM is tolerated).

    Returns:
        A list of ``((start_seconds, end_seconds), text)`` tuples in file order.
        Lines with fewer than three fields, and lines whose time fields are not
        integers (for example a ``start/end/text`` header row), are skipped.
    """
    subtitles = []
    # 'utf-8-sig' reads plain UTF-8 as well, but strips a BOM if present so the
    # first timestamp still parses as an integer.
    with open(tsv_path, 'r', encoding='utf-8-sig') as f:
        for line in f.read().splitlines():
            parts = line.split('\t')
            if len(parts) < 3:
                continue  # blank or incomplete row
            try:
                start_time = int(parts[0]) / 1000.0  # milliseconds -> seconds
                end_time = int(parts[1]) / 1000.0    # milliseconds -> seconds
            except ValueError:
                # Header row or otherwise non-numeric timing: not a subtitle.
                continue
            subtitles.append(((start_time, end_time), parts[2]))
    return subtitles

def create_subtitles_from_chunks(chunks):
    """
    Convert transcription chunks into ``((start, end), text)`` subtitle entries.

    Each chunk is indexed with ``"timestamp"`` (unpacked into exactly two
    values) and ``"text"``; the output keeps the order of ``chunks``.
    """
    def to_entry(piece):
        begin, finish = piece["timestamp"]
        return (begin, finish), piece["text"]

    return [to_entry(piece) for piece in chunks]

def add_subtitles_to_video(video_path, tsv_path, output_path, fps=None):
    """
    Burn the subtitles from a TSV file into a video and write the result.

    Args:
        video_path: Path of the source video, opened with ``VideoFileClip``.
        tsv_path: Path of the TSV subtitle file, read with ``parse_tsv``.
        output_path: Path of the video file to write (encoded with libx264).
        fps: Output frame rate. ``None`` (default) reuses the source clip's
            frame rate and falls back to 24 only if the clip reports none.

    Returns:
        None. The subtitled video is written to ``output_path``.
    """
    video = VideoFileClip(video_path)
    final_video = None
    try:
        # Size the subtitle canvas to the actual frame so the text is centred
        # and bottom-aligned for any resolution, not just 1280x720.
        frame_size = tuple(video.size)

        subtitle_clips = []
        for (start, end), text in parse_tsv(tsv_path):
            # A non-positive duration cannot be displayed, and blank text
            # would only add an empty full-frame layer to composite.
            if end <= start or not text.strip():
                continue
            subtitle_clip = make_text_image(text, size=frame_size)
            subtitle_clips.append(subtitle_clip.set_start(start).set_end(end))

        # Layer the subtitle clips over the original video.
        final_video = CompositeVideoClip([video] + subtitle_clips)

        output_fps = fps or video.fps or 24
        final_video.write_videofile(output_path, codec="libx264", fps=output_fps)
    finally:
        # Release the reader resources held by the clips even when encoding
        # fails; otherwise the source file stays open for the whole session.
        if final_video is not None:
            final_video.close()
        video.close()


In [3]:
import time
from googletrans import Translator

# Create a Translator object; trans_tsv below reads this module-level name.
translator = Translator()

def trans_tsv(filename: str, src: str, dest: str):
    """
    Translate the text column of a TSV subtitle file and write a new TSV.

    Every non-blank line is expected to hold ``start``, ``end`` and ``text``
    separated by whitespace (tabs in a real TSV). The first two fields are
    copied through and the text is translated with the module-level
    ``translator``. Blank lines are preserved as blank lines.

    Args:
        filename: Path of the UTF-8 encoded source TSV.
        src: Source language code passed to ``translator.translate``.
        dest: Destination language code passed to ``translator.translate``.

    Returns:
        A 2-tuple ``(output_path, "w")`` where ``output_path`` is
        ``<filename without extension>_<src>_2_<dest>.tsv``. The second element
        carries no information; it is kept because the existing caller in this
        notebook indexes the result with ``[0]``.
    """
    import os  # local import so this cell does not depend on another cell's imports

    output_path = os.path.splitext(filename)[0] + "_" + src + "_2_" + dest + ".tsv"

    # Explicit UTF-8 on both sides: parse_tsv reads the result as UTF-8, and
    # the locale default encoding is not guaranteed to be UTF-8.
    with open(filename, "r", encoding="utf-8") as f:
        lines = f.readlines()

    with open(output_path, "w", encoding="utf-8") as f:
        for line in lines:
            stripped = line.strip()
            if not stripped:
                f.write("\n")
                continue

            # Split off only the two timing fields; the remainder is the text
            # with its internal spacing intact, so words are not glued together
            # before translation.
            fields = stripped.split(None, 2)
            if len(fields) < 3:
                # No text to translate: copy the row through unchanged.
                f.write(stripped + "\n")
                continue

            start, end, text = fields
            translated = translator.translate(text, src=src, dest=dest).text
            new_line = start + "\t" + end + "\t" + translated + "\n"
            f.write(new_line)
            print(new_line)
            # NOTE(review): a per-line `time.sleep(3)` was sketched here in the
            # original, presumably to throttle requests; confirm whether it is needed.

    return output_path, "w"


In [None]:
# Ask for the video path, transcribe it with the loaded model, save the
# transcript as TSV, then translate that TSV.
video_path = input(r"輸入影片路徑:")
result = model.transcribe(video_path)   # the video to generate subtitles for
# Alternative kept for reference (unused):
# result = model.transcribe(video_path, task="translate", language="zh")  
result.to_tsv('audio.tsv')               # file name the transcript is saved under
# Translate Japanese -> Traditional Chinese. trans_tsv returns a (path, "w")
# tuple, so `tsv_path` is that tuple rather than a plain string.
tsv_path = trans_tsv("audio.tsv", "ja" , "zh-TW")

In [None]:
output_path = "output_with_subtitles.mp4"  # path of the output video

# tsv_path is the (path, "w") tuple returned by trans_tsv; element 0 is the TSV path.
add_subtitles_to_video(video_path, tsv_path[0], output_path)