In [1]:
import os
import subprocess
from pydub import AudioSegment
import math


def extract_audio_from_video(video_path, audio_path):
    """Extract the audio track of a video into a standalone audio file.

    Runs ``ffmpeg`` as a subprocess; the output format is whatever ffmpeg
    infers from the extension of ``audio_path``.

    Args:
        video_path: Path of the input video file.
        audio_path: Path of the audio file to write. An existing file is
            overwritten.

    Raises:
        subprocess.CalledProcessError: If ffmpeg exits with a non-zero status.
        FileNotFoundError: If the ``ffmpeg`` executable cannot be found.
    """
    command = [
        "ffmpeg",
        "-y",  # overwrite the output file without prompting
        "-i",
        video_path,
        "-vn",  # no video: keep only the audio stream(s)
        audio_path,
    ]
    # check=True surfaces ffmpeg failures here instead of letting a later
    # step trip over a missing or truncated audio file.
    subprocess.run(command, check=True)


def cut_audio_in_chunks(audio_path, chunk_size, chunks_folder):
    """Split an MP3 file into consecutive fixed-length MP3 chunks.

    Chunks are written as ``chunk_0.mp3``, ``chunk_1.mp3``, ... inside
    ``chunks_folder``. The last chunk holds whatever audio remains and may be
    shorter than the others.

    Args:
        audio_path: Path of the MP3 file to split.
        chunk_size: Length of each chunk in minutes. Must be positive.
        chunks_folder: Output directory; created if it does not exist.

    Raises:
        ValueError: If ``chunk_size`` is zero or negative.
    """
    # Validate before touching the filesystem: zero would divide by zero
    # below, and a negative size would silently export nothing.
    if chunk_size <= 0:
        raise ValueError(
            f"chunk_size must be a positive number of minutes, got {chunk_size!r}"
        )

    os.makedirs(chunks_folder, exist_ok=True)

    track = AudioSegment.from_mp3(audio_path)
    # len(track) and slice bounds are in milliseconds.
    chunk_leng = chunk_size * 60 * 1000
    chunks = math.ceil(len(track) / chunk_leng)

    for i in range(chunks):
        start_time = i * chunk_leng
        end_time = (i + 1) * chunk_leng
        chunk = track[start_time:end_time]
        chunk.export(f"{chunks_folder}/chunk_{i}.mp3", format="mp3")

In [None]:
# Step 1: pull the audio track out of the source video, then split that
# audio into chunks of 1 minute each.
extract_audio_from_video(
    video_path="../files/mcp.mp4",
    audio_path="../files/mcp.mp3",
)
cut_audio_in_chunks(
    audio_path="../files/mcp.mp3",
    chunk_size=1,
    chunks_folder="../files/chunks",
)

In [3]:
import openai
import glob

# Set the openai package's module-level `api_type` setting to "openai".
# NOTE(review): presumably this selects the standard OpenAI endpoint rather
# than an Azure deployment — confirm against the installed openai version.
openai.api_type = "openai"


def _chunk_sort_key(path):
    """Sort key that orders ``chunk_<n>.mp3`` paths by their numeric index ``n``."""
    stem = os.path.splitext(os.path.basename(path))[0]
    suffix = stem.rsplit("_", 1)[-1]
    if suffix.isdecimal():
        return (0, int(suffix), path)
    # Files without a numeric suffix go last, in path order.
    return (1, 0, path)


def transcribe_chunks(chunks_folder, destination):
    """Transcribe every MP3 chunk in a folder into a single text file.

    Each ``*.mp3`` file in ``chunks_folder`` is sent to the ``whisper-1``
    transcription model with ``language="ko"``. The resulting texts are
    written to ``destination`` in chunk order, separated by a single space.

    Args:
        chunks_folder: Directory containing the ``*.mp3`` chunks.
        destination: Path of the UTF-8 text file to write. It is overwritten,
            so re-running the function does not duplicate the transcript.
            If no chunks are found the file is left untouched.
    """
    # glob returns files in arbitrary order, and a plain lexical sort would
    # put chunk_10 before chunk_2, so sort on the numeric index instead.
    files = sorted(glob.glob(f"{chunks_folder}/*.mp3"), key=_chunk_sort_key)
    if not files:
        return

    # Explicit UTF-8: the transcript is Korean text and is read back as UTF-8.
    with open(destination, "w", encoding="utf-8") as text_file:
        for position, file in enumerate(files):
            with open(file, "rb") as audio_file:
                transcript = openai.audio.transcriptions.create(
                    model="whisper-1",
                    file=audio_file,
                    language="ko",
                )
            # Keep chunk boundaries separated so the last word of one chunk
            # is not fused with the first word of the next.
            if position:
                text_file.write(" ")
            text_file.write(transcript.text)


# Step 2: transcribe the audio chunks into one transcript file.
transcribe_chunks(
    chunks_folder="../files/chunks",
    destination="../files/transcript.txt",
)

In [None]:
from deep_translator import GoogleTranslator


def translate_to_english(korean_text_path, english_text_path):
    """Translate a Korean text file to English, sentence by sentence.

    The input is split on ``". "``, each non-blank piece is translated from
    ``ko`` to ``en``, and the results are joined with ``". "`` and written to
    ``english_text_path``. A piece that cannot be translated is reported on
    stdout and replaced by the placeholder ``[translation failed]`` so one
    bad sentence does not abort the whole run.

    Args:
        korean_text_path: Path of the UTF-8 Korean text file to read.
        english_text_path: Path of the UTF-8 English text file to write.

    Returns:
        ``english_text_path``, unchanged.
    """
    with open(korean_text_path, "r", encoding="utf-8") as f:
        korean_text = f.read()

    # The translator does not depend on the sentence, so build it once
    # instead of once per loop iteration.
    translator = GoogleTranslator(source="ko", target="en")

    english_sentences = []
    for s in korean_text.split(". "):
        if not s.strip():
            continue
        try:
            translated = translator.translate(s)
            # A non-string result (e.g. None) would break the join below;
            # route it through the same failure path as an exception.
            if not isinstance(translated, str):
                raise ValueError(f"translator returned {translated!r}")
            english_sentences.append(translated)
        except Exception as e:
            print(f"⚠️ translation failed: {s[:30]}... -> {e}")
            english_sentences.append("[translation failed]")

    final_text = ". ".join(english_sentences)

    with open(english_text_path, "w", encoding="utf-8") as f:
        f.write(final_text)

    print(f"✅ translated text saved: {english_text_path}")
    return english_text_path


# Step 3: translate the Korean transcript into English text.
translate_to_english(
    korean_text_path="../files/transcript.txt",
    english_text_path="../files/translated_english.txt",
)

In [None]:
import re
import nltk
from TTS.api import TTS
from nltk.tokenize import sent_tokenize


def generate_english_speech_from_file(
    english_text_path, english_audio_output_path, temp_dir="../files/temp"
):
    """Synthesize English speech for a text file, one sentence at a time.

    The text is reduced to ASCII, split into sentences, and each sentence of
    at least 5 characters is rendered to a temporary WAV file with the
    ``tts_models/en/ljspeech/tacotron2-DDC`` model. The per-sentence clips are
    concatenated in order and exported as a single WAV file.

    Args:
        english_text_path: Path of the UTF-8 English text file to read.
        english_audio_output_path: Path of the combined WAV file to write.
        temp_dir: Directory for the per-sentence WAV files; created if it
            does not exist. The temporary files are left in place.

    Returns:
        ``english_audio_output_path``, unchanged.
    """
    with open(english_text_path, "r", encoding="utf-8") as f:
        english_text = f.read()

    # Map typographic quotes to their ASCII forms first; the ASCII-only
    # filter below would otherwise delete them ("I’m happy" -> "Im happy").
    ascii_quotes = str.maketrans(
        {"\u2018": "'", "\u2019": "'", "\u201c": '"', "\u201d": '"'}
    )
    english_text = english_text.translate(ascii_quotes)
    # Drop every remaining non-ASCII character (e.g. emoji).
    cleaned_english_text = re.sub(r"[^\x00-\x7F]+", "", english_text)

    # Alternative noted by the original author:
    # "tts_models/en/ljspeech/fast_pitch" is faster but sounds less natural.
    tts = TTS(model_name="tts_models/en/ljspeech/tacotron2-DDC", progress_bar=False)

    # sent_tokenize needs the Punkt data; recent NLTK releases load it from
    # "punkt_tab" instead of "punkt", so fetch both.
    for resource in ("punkt", "punkt_tab"):
        nltk.download(resource, quiet=True)

    sentences = sent_tokenize(cleaned_english_text)
    total = len(sentences)
    combined = AudioSegment.silent(duration=0)

    # The TTS engine writes into this directory, so it has to exist.
    os.makedirs(temp_dir, exist_ok=True)

    for i, sentence in enumerate(sentences):
        if len(sentence.strip()) < 5:
            print(f"⚠️ Skipping ({i+1}/{total}): too short.")
            continue
        temp_file = os.path.join(temp_dir, f"temp_{i+1}.wav")
        print(f"🎙 Generating ({i+1}/{total}) → {sentence[:50]}...")
        tts.tts_to_file(text=sentence.strip(), file_path=temp_file)
        chunk = AudioSegment.from_wav(temp_file)
        combined += chunk

    combined.export(english_audio_output_path, format="wav")

    print(f"✅ audio file generated: {english_audio_output_path}")
    return english_audio_output_path


# Step 4: synthesize English speech from the translated text.
generate_english_speech_from_file(
    english_text_path="../files/translated_english.txt",
    english_audio_output_path="../files/english_audio.wav",
)

In [12]:
def merge_audio_with_video(original_video, new_audio, output_video):
    """Replace a video's audio track with a different audio file.

    Runs ``ffmpeg`` as a subprocess, taking the video stream from
    ``original_video`` and the audio stream from ``new_audio``.

    Args:
        original_video: Path of the video whose picture is kept.
        new_audio: Path of the audio file to use as the new soundtrack.
        output_video: Path of the video file to write. An existing file is
            overwritten.

    Returns:
        ``output_video``, unchanged.

    Raises:
        subprocess.CalledProcessError: If ffmpeg exits with a non-zero status.
        FileNotFoundError: If the ``ffmpeg`` executable cannot be found.
    """
    command = [
        "ffmpeg",
        "-y",  # overwrite the output file without prompting
        "-i",
        original_video,
        "-i",
        new_audio,
        "-c:v",
        "copy",  # copy the video stream as-is, no re-encoding
        "-map",
        "0:v:0",  # first video stream of the first input
        "-map",
        "1:a:0",  # first audio stream of the second input
        "-shortest",  # stop when the shorter of the two streams ends
        output_video,
    ]
    # check=True so the output path is only returned when ffmpeg succeeded.
    subprocess.run(command, check=True)
    return output_video

In [None]:
# Step 5: combine the original video with the generated English audio.
merge_audio_with_video(
    original_video="../files/mcp.mp4",
    new_audio="../files/english_audio.wav",
    output_video="../files/mcp_dubbed.mp4",
)