In [None]:
import os
import shutil
from google.cloud import speech, translate_v2 as translate, texttospeech
from pydub import AudioSegment
from pydub.silence import split_on_silence
from moviepy.editor import VideoFileClip, AudioFileClip

# --- Configuration ---
# Speech-to-Text language codes: https://cloud.google.com/speech-to-text/docs/languages
# Text-to-Speech voice names:    https://cloud.google.com/text-to-speech/docs/voices
SOURCE_LANGUAGE_CODE = "en-US"             # English (United States)
TARGET_LANGUAGE_CODE = "es"                # Spanish
TARGET_TTS_VOICE_NAME = "es-ES-Wavenet-B"  # Spanish (Spain) voice

# --- Main Dubbing Function ---

def dub_movie(video_path, output_path, *, keep_temp_files=True):
    """
    Main function to dub a video from a source language to a target language.

    The pipeline extracts the audio track, splits it on silence, and for each
    chunk transcribes it (SOURCE_LANGUAGE_CODE), translates it and synthesizes
    it (TARGET_LANGUAGE_CODE / TARGET_TTS_VOICE_NAME). The dubbed chunks are
    then concatenated and attached to the original video.

    Args:
        video_path: Path of the input video file.
        output_path: Path the dubbed video is written to.
        keep_temp_files: When True (the default) the intermediate files in
            ``temp_dubbing_files`` are left on disk for inspection; when False
            the directory is removed before returning.

    Returns:
        None. Errors are reported on stdout instead of being raised.

    Note:
        Dubbed chunks are joined back to back, so silence dropped by
        ``split_on_silence`` and length differences between the original and
        the synthesized speech are not compensated.
    """
    print("--- Starting Movie Dubbing Process ---")

    # Create a temporary directory for intermediate files
    temp_dir = "temp_dubbing_files"
    if os.path.exists(temp_dir):
        shutil.rmtree(temp_dir)
    os.makedirs(temp_dir)

    # Tracked outside the try block so the finally clause can always release
    # the file handles / ffmpeg readers they hold.
    video_clip = None
    final_audio_clip = None

    try:
        # 1. Extract Audio from Video
        print("\n[Step 1/7] Extracting audio from video...")
        original_audio_path = os.path.join(temp_dir, "original_audio.wav")
        video_clip = VideoFileClip(video_path)
        if video_clip.audio is None:
            print("The input video has no audio track. Nothing to dub.")
            return
        video_clip.audio.write_audiofile(original_audio_path, codec='pcm_s16le')  # Use WAV for pydub
        print(f"Successfully extracted audio to {original_audio_path}")

        # 2. Split Audio on Silence
        print("\n[Step 2/7] Splitting audio into chunks based on silence...")
        audio = AudioSegment.from_wav(original_audio_path)
        audio_chunks = split_on_silence(
            audio,
            min_silence_len=700,      # Minimum silence length in ms
            silence_thresh=-40,       # Silence threshold in dBFS
            keep_silence=400          # Keep some silence at the end of chunks
        )
        if not audio_chunks:
            print("Could not split audio. The audio might be too quiet or have no silence.")
            return

        chunk_paths = []
        for i, chunk in enumerate(audio_chunks):
            chunk_path = os.path.join(temp_dir, f"chunk_{i}.wav")
            chunk.export(chunk_path, format="wav")
            chunk_paths.append(chunk_path)
        print(f"Split audio into {len(chunk_paths)} chunks.")

        # 3, 4, 5: Process each chunk (Transcribe -> Translate -> Synthesize)
        dubbed_chunk_paths = []
        for i, (chunk, chunk_path) in enumerate(zip(audio_chunks, chunk_paths)):
            print(f"\n--- Processing Chunk {i+1}/{len(chunk_paths)} ---")
            dubbed_chunk_path = os.path.join(temp_dir, f"dubbed_chunk_{i}.wav")

            # 3. Transcribe
            print(f"[Step 3/7] Transcribing chunk {i+1}...")
            original_text = transcribe_audio_chunk(chunk_path, SOURCE_LANGUAGE_CODE)
            if not original_text:
                print("Transcription failed or returned no text. Skipping chunk.")
                # To maintain timing, substitute silence of the same duration.
                # The in-memory chunk already knows its length, so the WAV
                # file does not need to be read back.
                silent_chunk = AudioSegment.silent(duration=len(chunk))
                silent_chunk.export(dubbed_chunk_path, format="wav")
                dubbed_chunk_paths.append(dubbed_chunk_path)
                continue

            print(f"  > Original Text: {original_text}")

            # 4. Translate
            print(f"[Step 4/7] Translating chunk {i+1}...")
            translated_text = translate_text(original_text, TARGET_LANGUAGE_CODE, SOURCE_LANGUAGE_CODE.split('-')[0])
            print(f"  > Translated Text: {translated_text}")

            # 5. Synthesize Speech
            print(f"[Step 5/7] Synthesizing speech for chunk {i+1}...")
            synthesize_speech(translated_text, TARGET_LANGUAGE_CODE, TARGET_TTS_VOICE_NAME, dubbed_chunk_path)
            dubbed_chunk_paths.append(dubbed_chunk_path)
            print(f"  > Synthesized audio saved to {dubbed_chunk_path}")

        # 6. Combine Dubbed Audio Chunks
        print("\n[Step 6/7] Combining all dubbed audio chunks...")
        final_dubbed_audio = AudioSegment.empty()
        for path in dubbed_chunk_paths:
            final_dubbed_audio += AudioSegment.from_wav(path)

        final_audio_path = os.path.join(temp_dir, "final_dubbed_audio.mp3")
        final_dubbed_audio.export(final_audio_path, format="mp3")
        print(f"Final dubbed audio track created at {final_audio_path}")

        # 7. Merge new audio with original video
        print("\n[Step 7/7] Merging dubbed audio with the original video...")
        final_audio_clip = AudioFileClip(final_audio_path)
        # Set the video's audio to the new dubbed audio
        final_video = video_clip.set_audio(final_audio_clip)
        final_video.write_videofile(output_path, codec="libx264", audio_codec="aac")
        print(f"--- SUCCESS! Dubbed movie saved to {output_path} ---")

    except Exception as e:
        # Top-level boundary of the script: report the failure with its
        # traceback instead of crashing, so the cleanup below still runs.
        print(f"\nAn error occurred: {e}")
        import traceback
        traceback.print_exc()
    finally:
        # Release the clips first; on some platforms the temp files cannot be
        # deleted while they are still open.
        for clip in (final_audio_clip, video_clip):
            if clip is not None:
                clip.close()

        if os.path.exists(temp_dir):
            if keep_temp_files:
                print(f"\nTemporary files kept in '{temp_dir}' for inspection.")
            else:
                print("\nCleaning up temporary files...")
                shutil.rmtree(temp_dir)

# --- Helper Functions for Google Cloud APIs ---

def transcribe_audio_chunk(chunk_path, language_code):
    """Transcribes a single audio chunk using Google Speech-to-Text.

    Args:
        chunk_path: Path to a WAV file holding the chunk.
        language_code: Language code passed to the recognizer (e.g. "en-US").

    Returns:
        The transcript of the whole chunk, or "" when nothing was recognized.
    """
    client = speech.SpeechClient()
    with open(chunk_path, "rb") as audio_file:
        content = audio_file.read()

    # Read the WAV properties once so the request matches the actual file.
    wav_info = AudioSegment.from_wav(chunk_path)

    audio = speech.RecognitionAudio(content=content)
    config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=wav_info.frame_rate,
        # Audio extracted from video is commonly stereo; the channel count
        # must be declared or the request is rejected for non-mono WAV data.
        audio_channel_count=wav_info.channels,
        language_code=language_code,
    )

    response = client.recognize(config=config, audio=audio)

    # Each result covers a consecutive portion of the audio, so the full
    # transcript is the concatenation of every result's top alternative,
    # not just the first result.
    pieces = [
        result.alternatives[0].transcript.strip()
        for result in response.results
        if result.alternatives
    ]
    return " ".join(piece for piece in pieces if piece)

def translate_text(text, target_language, source_language):
    """Translates text using Google Translation API.

    Args:
        text: Text to translate.
        target_language: Language code to translate into (e.g. "es").
        source_language: Language code of ``text`` (e.g. "en").

    Returns:
        The translated text as plain text.
    """
    client = translate.Client()
    result = client.translate(
        text,
        target_language=target_language,
        source_language=source_language,
        # The v2 API answers in HTML by default, which escapes characters
        # such as apostrophes (&#39;). Plain text keeps those entities out of
        # the string later fed to text-to-speech.
        format_="text",
    )
    return result["translatedText"]

def synthesize_speech(text, language_code, voice_name, output_filename):
    """Render `text` with Google Text-to-Speech and write the audio to `output_filename`."""
    tts_client = texttospeech.TextToSpeechClient()

    tts_response = tts_client.synthesize_speech(
        input=texttospeech.SynthesisInput(text=text),
        voice=texttospeech.VoiceSelectionParams(
            language_code=language_code,
            name=voice_name,
        ),
        # LINEAR16 output is requested so the chunks can be handled as WAV
        # and concatenated easily.
        audio_config=texttospeech.AudioConfig(
            audio_encoding=texttospeech.AudioEncoding.LINEAR16
        ),
    )

    with open(output_filename, "wb") as sink:
        sink.write(tts_response.audio_content)

# --- Main execution block ---

if __name__ == "__main__":
    # Test with a short clip (e.g., 15-30 seconds) first:
    # long movies will take a lot of time and may be costly.
    input_video_file = "input.mp4"
    output_video_file = "dubbed_output.mp4"

    if os.path.exists(input_video_file):
        dub_movie(input_video_file, output_video_file)
    else:
        print(f"Error: Input video file not found at '{input_video_file}'")
        print("Please place a video file named 'input.mp4' in the same directory as the script.")


In [None]:
import os
import shutil
import math
from google.cloud import speech, translate_v2 as translate, texttospeech
from pydub import AudioSegment
from moviepy.editor import VideoFileClip, AudioFileClip, CompositeAudioClip
from spleeter.separator import Separator

# --- Configuration ---
SOURCE_LANGUAGE_CODE = "en-US"    # Language of the original video
TARGET_LANGUAGE_CODE = "es"       # Language to dub into
TARGET_LANGUAGE_TRANSLATE = "es"  # Google Translate uses 'es', not 'es-ES'

# Speaker tag (from diarization) -> TTS voice used for that speaker.
# Voice names are listed at https://cloud.google.com/text-to-speech/docs/voices
# Extend the mapping with more speakers and voices as needed; the "default"
# entry is the fallback for any speaker tag that has no entry of its own.
SPEAKER_VOICES = {
    1: {"name": "es-ES-Wavenet-B", "gender": "MALE"},    # Speaker 1 -> Spanish Male
    2: {"name": "es-ES-Wavenet-C", "gender": "FEMALE"},  # Speaker 2 -> Spanish Female
    "default": {"name": "es-ES-Wavenet-D", "gender": "MALE"},
}

# Spleeter model used to split vocals from accompaniment.
SPLEETER_MODEL = 'spleeter:2stems'

# --- Main Dubbing Function ---

def dub_movie_advanced(video_path, output_path):
    """
    Main function to dub a video using advanced techniques:
    - Vocal separation with Spleeter
    - Speaker diarization
    - Dynamic timing adjustment

    Args:
        video_path: Path of the input video file.
        output_path: Path the dubbed video is written to.

    Returns:
        None. Errors are printed (with traceback) instead of being raised.
        Intermediate files are left in ``temp_advanced_dubbing``.
    """
    print("--- Starting Advanced Movie Dubbing Process ---")

    # Create a temporary directory
    temp_dir = "temp_advanced_dubbing"
    if os.path.exists(temp_dir):
        shutil.rmtree(temp_dir)
    os.makedirs(temp_dir)

    # Tracked outside the try block so the finally clause can always release
    # the readers they hold.
    video_clip = None
    final_audio_clip = None

    try:
        # 1. Extract Full Audio from Video
        print("\n[Step 1/8] Extracting full audio track...")
        original_audio_path = os.path.join(temp_dir, "original_audio.wav")
        video_clip = VideoFileClip(video_path)
        if video_clip.audio is None:
            print("  > The input video has no audio track. Aborting.")
            return
        video_clip.audio.write_audiofile(original_audio_path, codec='pcm_s16le')
        print(f"  > Full audio saved to {original_audio_path}")

        # 2. Separate Vocals from Background Music using Spleeter
        print("\n[Step 2/8] Separating vocals from background music...")
        vocals_path, background_path = separate_audio_components(original_audio_path, temp_dir)
        print(f"  > Vocals saved to: {vocals_path}")
        print(f"  > Background music saved to: {background_path}")

        # 3. Transcribe with Speaker Diarization
        print("\n[Step 3/8] Transcribing vocals with speaker identification...")
        diarized_segments = transcribe_with_diarization(vocals_path, SOURCE_LANGUAGE_CODE)
        if not diarized_segments:
            print("  > No speech detected or diarization failed. Aborting.")
            return
        print(f"  > Transcription complete. Found {len(diarized_segments)} speech segments.")

        # 4. Translate and Synthesize each segment
        print("\n[Step 4-6/8] Translating, Synthesizing, and Adjusting Timing for each segment...")
        # Start from silence as long as the original vocals; each dubbed
        # segment is overlaid at its original start time.
        original_vocal_audio = AudioSegment.from_wav(vocals_path)
        dubbed_vocal_track = AudioSegment.silent(duration=len(original_vocal_audio))

        for i, segment in enumerate(diarized_segments):
            text, start_time, end_time, speaker_tag = segment['text'], segment['start'], segment['end'], segment['speaker']
            print(f"\n--- Processing Segment {i+1}/{len(diarized_segments)} (Speaker {speaker_tag}) ---")
            print(f"  > Original Text: {text}")

            # 4. Translate
            translated_text = translate_text(text, TARGET_LANGUAGE_TRANSLATE, SOURCE_LANGUAGE_CODE.split('-')[0])
            print(f"  > Translated Text: {translated_text}")

            # 5. Synthesize Speech with the correct voice
            voice_config = SPEAKER_VOICES.get(speaker_tag, SPEAKER_VOICES["default"])
            dubbed_segment_path = os.path.join(temp_dir, f"dubbed_segment_{i}.wav")

            # Here you could add logic to use SSML for emotional tone
            # For simplicity, we use standard synthesis. See synthesize_speech_ssml for an example.
            synthesize_speech(translated_text, TARGET_LANGUAGE_CODE, voice_config["name"], dubbed_segment_path)

            # 6. Adjust Timing
            dubbed_segment_audio = AudioSegment.from_wav(dubbed_segment_path)
            original_duration = (end_time - start_time) * 1000  # in ms

            adjusted_audio = adjust_audio_speed(dubbed_segment_audio, original_duration)

            # Overlay the adjusted audio at the segment's start time. The
            # position is given in whole milliseconds.
            dubbed_vocal_track = dubbed_vocal_track.overlay(adjusted_audio, position=int(start_time * 1000))
            print(f"  > Segment timing adjusted and placed on track at {start_time:.2f}s.")

        dubbed_vocals_path = os.path.join(temp_dir, "final_dubbed_vocals.wav")
        dubbed_vocal_track.export(dubbed_vocals_path, format="wav")
        print("\n  > Full dubbed vocal track created.")

        # 7. Merge Dubbed Vocals with Original Background Music
        print("\n[Step 7/8] Merging dubbed vocals with original background track...")
        background_audio = AudioSegment.from_wav(background_path)
        final_audio = background_audio.overlay(dubbed_vocal_track)

        final_audio_path = os.path.join(temp_dir, "final_dubbed_audio.mp3")
        final_audio.export(final_audio_path, format="mp3")
        print(f"  > Final audio mix created at {final_audio_path}")

        # 8. Merge Final Audio with Original Video
        print("\n[Step 8/8] Merging final audio with the video...")
        final_audio_clip = AudioFileClip(final_audio_path)
        final_video = video_clip.set_audio(final_audio_clip)
        final_video.write_videofile(output_path, codec="libx264", audio_codec="aac")

        print(f"\n--- SUCCESS! Advanced dubbed movie saved to {output_path} ---")

    except Exception as e:
        # Top-level boundary of the script: report and fall through to cleanup.
        print(f"\nAn error occurred: {e}")
        import traceback
        traceback.print_exc()
    finally:
        # Release the clips so their file handles / ffmpeg readers do not leak.
        for clip in (final_audio_clip, video_clip):
            if clip is not None:
                clip.close()

        if os.path.exists(temp_dir):
            print("\nTemporary files kept in 'temp_advanced_dubbing' for inspection.")


# --- Helper Functions ---

def separate_audio_components(audio_path, output_dir):
    """Split `audio_path` into vocals and accompaniment with Spleeter.

    Returns a `(vocals_path, background_path)` tuple and raises
    FileNotFoundError when either expected WAV file is missing afterwards.
    """
    Separator(SPLEETER_MODEL).separate_to_file(audio_path, output_dir)

    # The stems are looked up in a sub-folder of output_dir named after the
    # input file (without its extension).
    stem_dir = os.path.join(
        output_dir,
        os.path.splitext(os.path.basename(audio_path))[0],
    )
    vocals_path = os.path.join(stem_dir, "vocals.wav")
    background_path = os.path.join(stem_dir, "accompaniment.wav")

    if not (os.path.exists(vocals_path) and os.path.exists(background_path)):
        raise FileNotFoundError("Spleeter did not produce the expected output files.")

    return vocals_path, background_path

def transcribe_with_diarization(audio_path, language_code):
    """Transcribes audio and identifies speakers.

    Args:
        audio_path: Path to a WAV file. Its content is sent inline with the
            request.
        language_code: Language code passed to the recognizer (e.g. "en-US").

    Returns:
        A list of dicts, one per uninterrupted run of words by one speaker,
        each with the keys 'speaker' (diarization tag), 'start' and 'end'
        (seconds, from the first word's start to the last word's end) and
        'text'. The list is empty when nothing was recognized.
    """
    client = speech.SpeechClient()
    with open(audio_path, "rb") as audio_file:
        content = audio_file.read()

    audio = speech.RecognitionAudio(content=content)
    # Get audio properties for accurate config
    audio_segment = AudioSegment.from_wav(audio_path)

    # One expected speaker per configured voice; the "default" entry is a
    # fallback voice, not a speaker.
    speaker_count = max(1, sum(1 for tag in SPEAKER_VOICES if tag != "default"))
    diarization_config = speech.SpeakerDiarizationConfig(
        enable_speaker_diarization=True,
        min_speaker_count=speaker_count,
        max_speaker_count=speaker_count,
    )

    config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=audio_segment.frame_rate,
        # Declare the real channel count; separated stems are typically
        # stereo and multi-channel LINEAR16 must be announced.
        audio_channel_count=audio_segment.channels,
        language_code=language_code,
        diarization_config=diarization_config,
    )

    print("  > Sending audio to Google for transcription (this may take a while)...")
    operation = client.long_running_recognize(config=config, audio=audio)
    response = operation.result(timeout=900)  # Timeout in seconds

    # No speech recognized: give callers the empty list they check for
    # instead of failing on an index lookup.
    if not response.results:
        return []

    # With diarization the last result carries the words of the full
    # transcript, each annotated with a speaker tag.
    alternatives = response.results[-1].alternatives
    if not alternatives or not alternatives[0].words:
        return []
    words_info = alternatives[0].words

    segments = []
    current_segment = None

    for word_info in words_info:
        speaker_tag = word_info.speaker_tag
        word_start = word_info.start_time.total_seconds()
        word_end = word_info.end_time.total_seconds()

        if current_segment is not None and current_segment['speaker'] == speaker_tag:
            # Same speaker keeps talking: extend the text and push the end
            # of the segment to the end of this word.
            current_segment['text'] += " " + word_info.word
            current_segment['end'] = word_end
            continue

        # Speaker changed (or first word): close the previous segment. Its
        # 'end' is already the end of its own last word, so silence before
        # the next speaker is not counted as speech.
        if current_segment is not None:
            segments.append(current_segment)
        current_segment = {
            'speaker': speaker_tag,
            'start': word_start,
            'end': word_end,
            'text': word_info.word,
        }

    # Add the last segment
    if current_segment is not None:
        segments.append(current_segment)

    return segments

def adjust_audio_speed(audio_segment, target_duration_ms):
    """Speeds up an audio segment so it fits a target duration.

    Args:
        audio_segment: pydub AudioSegment holding the dubbed speech.
        target_duration_ms: Duration of the original speech slot in ms.

    Returns:
        A sped-up copy when the segment is more than 1% longer than the
        target. The segment is returned unchanged when it already fits
        (pydub's ``speedup`` cannot slow audio down), when either duration
        is not positive, or when pydub cannot process the clip.
    """
    original_duration_ms = len(audio_segment)
    if original_duration_ms <= 0 or target_duration_ms <= 0:
        return audio_segment

    speed_ratio = original_duration_ms / target_duration_ms

    # Shorter than (or within 1% of) the slot: leave it alone. speedup()
    # works by discarding audio, so a ratio below 1 does not slow the
    # segment down; a shorter dub simply leaves silence at the end of the
    # slot.
    if speed_ratio <= 1.0 or abs(1.0 - speed_ratio) < 0.01:
        return audio_segment

    print(f"    - Adjusting speed. Original: {original_duration_ms}ms, Target: {target_duration_ms}ms, Ratio: {speed_ratio:.2f}")
    try:
        return audio_segment.speedup(playback_speed=speed_ratio)
    except Exception as exc:
        # pydub raises a plain Exception when the clip is too short to be
        # chunked; one short segment should not abort the whole dub.
        print(f"    - Could not adjust speed ({exc}); using the segment unchanged.")
        return audio_segment


# The standard helper functions from before, with an added SSML example
def translate_text(text, target, source):
    """Translate `text` from language `source` to language `target` with Google Translate.

    Returns the translation as plain text.
    """
    client = translate.Client()
    result = client.translate(
        text,
        target_language=target,
        source_language=source,
        # The v2 API answers in HTML by default, which escapes characters
        # such as apostrophes (&#39;). Plain text keeps those entities out of
        # the string later fed to text-to-speech.
        format_="text",
    )
    return result["translatedText"]

def synthesize_speech(text, lang_code, voice_name, output_file):
    """Synthesize plain `text` with Google Text-to-Speech and save it to `output_file`."""
    tts_client = texttospeech.TextToSpeechClient()
    tts_response = tts_client.synthesize_speech(
        input=texttospeech.SynthesisInput(text=text),
        voice=texttospeech.VoiceSelectionParams(language_code=lang_code, name=voice_name),
        audio_config=texttospeech.AudioConfig(
            audio_encoding=texttospeech.AudioEncoding.LINEAR16
        ),
    )
    with open(output_file, "wb") as sink:
        sink.write(tts_response.audio_content)

def synthesize_speech_ssml(ssml_text, lang_code, voice_name, output_file):
    """Synthesize SSML markup (for more expressive control) and save it to `output_file`."""
    tts_client = texttospeech.TextToSpeechClient()
    tts_response = tts_client.synthesize_speech(
        # The input is passed as SSML rather than as plain text.
        input=texttospeech.SynthesisInput(ssml=ssml_text),
        voice=texttospeech.VoiceSelectionParams(language_code=lang_code, name=voice_name),
        audio_config=texttospeech.AudioConfig(
            audio_encoding=texttospeech.AudioEncoding.LINEAR16
        ),
    )
    with open(output_file, "wb") as sink:
        sink.write(tts_response.audio_content)

# --- Execution Block ---

if __name__ == "__main__":
    input_video = "input.mp4"
    output_video = "dubbed_advanced_output.mp4"

    # Run the pipeline only when the input file is actually present.
    if os.path.exists(input_video):
        dub_movie_advanced(input_video, output_video)
    else:
        print(f"Error: Input video '{input_video}' not found.")

In [None]:
import os
import shutil
import subprocess
from elevenlabs.client import ElevenLabs
from elevenlabs import Voice, VoiceSettings
from google.cloud import translate_v2 as translate
from pydub import AudioSegment
from moviepy.editor import VideoFileClip, AudioFileClip
from spleeter.separator import Separator

# --- Configuration ---
# Google Cloud
SOURCE_LANGUAGE_CODE = "en-US"
TARGET_LANGUAGE_CODE = "es-ES"
TARGET_LANGUAGE_TRANSLATE = "es"

# ElevenLabs
# The key is read from the ELEVENLABS_API_KEY environment variable so it does
# not have to be committed with the source. Replacing the placeholder default
# below still works when the variable is not set.
ELEVENLABS_API_KEY = os.environ.get("ELEVENLABS_API_KEY", "YOUR_ELEVENLABS_API_KEY")
if not ELEVENLABS_API_KEY or ELEVENLABS_API_KEY == "YOUR_ELEVENLABS_API_KEY":
    raise ValueError(
        "Please set the ELEVENLABS_API_KEY environment variable or replace "
        "'YOUR_ELEVENLABS_API_KEY' with your actual ElevenLabs API key."
    )
client = ElevenLabs(api_key=ELEVENLABS_API_KEY)

# File Paths
SPLEETER_MODEL = 'spleeter:2stems'
WAV2LIP_PATH = "Wav2Lip"  # Path to the cloned Wav2Lip directory

# --- Main Professional Dubbing Function ---

def dub_movie_professional(video_path, output_path):
    """Dub a video with cloned voices (ElevenLabs) and lip-sync it with Wav2Lip.

    Steps: extract and separate the audio, transcribe with speaker
    diarization, clone each speaker's voice from their first segment,
    translate and synthesize every segment, mix the dub over the original
    background track, then run Wav2Lip on the result.

    Args:
        video_path: Path of the input video file.
        output_path: Path the lip-synced video is written to.

    Returns:
        None. Errors are printed (with traceback) instead of being raised.
        Voices cloned during the run are deleted from the ElevenLabs account
        before returning.
    """
    print("--- Starting Professional-Grade Movie Dubbing ---")

    temp_dir = "temp_professional_dubbing"
    if os.path.exists(temp_dir):
        shutil.rmtree(temp_dir)
    os.makedirs(temp_dir)

    # Created before the try block so the cleanup in `finally` can rely on
    # them whatever step failed.
    cloned_voices = {}
    video_clip = None
    final_audio_clip = None

    try:
        # 1. Extract and Separate Audio
        print("\n[Step 1/5] Extracting and Separating Audio...")
        original_audio_path = os.path.join(temp_dir, "original_audio.wav")
        video_clip = VideoFileClip(video_path)
        if video_clip.audio is None:
            print("  > The input video has no audio track. Aborting.")
            return
        video_clip.audio.write_audiofile(original_audio_path, codec='pcm_s16le')
        vocals_path, background_path = separate_audio_components(original_audio_path, temp_dir)
        print("  > Audio separation complete.")

        # 2. Transcribe and Diarize (using our previous function)
        print("\n[Step 2/5] Transcribing and Identifying Speakers...")
        # Note: We're reusing the Google-based diarization function. It's good enough for this.
        from advanced_dubber import transcribe_with_diarization  # Assuming you have the previous script
        diarized_segments = transcribe_with_diarization(vocals_path, SOURCE_LANGUAGE_CODE)
        if not diarized_segments:
            print("  > No speech detected. Aborting.")
            return
        print(f"  > Transcription complete. Found {len(diarized_segments)} speech segments.")

        # 3. Clone Voices, Translate, and Synthesize with Emotion
        print("\n[Step 3/5] Cloning Voices, Translating, and Synthesizing...")
        original_vocal_audio = AudioSegment.from_wav(vocals_path)
        dubbed_vocal_track = AudioSegment.silent(duration=len(original_vocal_audio))

        for i, segment in enumerate(diarized_segments):
            speaker_tag = segment['speaker']
            print(f"\n--- Processing Segment {i+1}/{len(diarized_segments)} (Speaker {speaker_tag}) ---")

            # A. Clone voice if we haven't seen this speaker before
            if speaker_tag not in cloned_voices:
                print(f"  > New speaker detected (Tag {speaker_tag}). Cloning voice...")
                start_ms = int(segment['start'] * 1000)
                end_ms = int(segment['end'] * 1000)
                # Take a sample of the speaker's voice for cloning
                voice_sample_audio = original_vocal_audio[start_ms:end_ms]
                sample_path = os.path.join(temp_dir, f"speaker_{speaker_tag}_sample.wav")
                voice_sample_audio.export(sample_path, format="wav")

                # The upload expects file content, not a path string, so the
                # sample is passed as an open binary file.
                with open(sample_path, "rb") as sample_file:
                    cloned_voices[speaker_tag] = client.voices.add(
                        name=f"ClonedSpeaker_{speaker_tag}",
                        description=f"Auto-cloned voice for speaker {speaker_tag}",
                        files=[sample_file],
                    )
                print(f"  > Voice cloned successfully. Voice ID: {cloned_voices[speaker_tag].voice_id}")

            # B. Translate text
            translated_text = translate_text(segment['text'], TARGET_LANGUAGE_TRANSLATE, SOURCE_LANGUAGE_CODE.split('-')[0])
            print(f"  > Translated: {translated_text}")

            # C. Synthesize with the cloned voice
            voice_to_use = cloned_voices[speaker_tag]
            audio_response = client.generate(
                text=translated_text,
                voice=Voice(
                    voice_id=voice_to_use.voice_id,
                    settings=VoiceSettings(stability=0.5, similarity_boost=0.75, style=0.1, use_speaker_boost=True)
                ),
                model="eleven_multilingual_v2"
            )

            dubbed_segment_path = os.path.join(temp_dir, f"dubbed_segment_{i}.mp3")
            with open(dubbed_segment_path, "wb") as f:
                # generate() may hand back the audio as a stream of byte
                # chunks rather than one bytes object; accept both.
                if isinstance(audio_response, (bytes, bytearray)):
                    f.write(audio_response)
                else:
                    for audio_chunk in audio_response:
                        if audio_chunk:
                            f.write(audio_chunk)

            # D. Adjust timing and overlay
            dubbed_segment_audio = AudioSegment.from_mp3(dubbed_segment_path)
            original_duration = (segment['end'] - segment['start']) * 1000
            if original_duration > 0:
                adjusted_audio = adjust_audio_speed(dubbed_segment_audio, original_duration)
            else:
                # A zero-length slot gives no target to stretch towards.
                adjusted_audio = dubbed_segment_audio
            dubbed_vocal_track = dubbed_vocal_track.overlay(adjusted_audio, position=int(segment['start'] * 1000))

        # 4. Create Final Pre-Lip-Sync Video
        print("\n[Step 4/5] Merging final audio to create pre-sync video...")
        dubbed_vocals_path = os.path.join(temp_dir, "final_dubbed_vocals.wav")
        dubbed_vocal_track.export(dubbed_vocals_path, format="wav")

        background_audio = AudioSegment.from_wav(background_path)
        final_audio = background_audio.overlay(dubbed_vocal_track)

        final_audio_path = os.path.join(temp_dir, "final_dubbed_audio.wav")  # Use WAV for Wav2Lip
        final_audio.export(final_audio_path, format="wav")

        pre_sync_video_path = os.path.join(temp_dir, "video_presync.mp4")
        final_audio_clip = AudioFileClip(final_audio_path)
        video_clip.set_audio(final_audio_clip).write_videofile(pre_sync_video_path, codec="libx264", audio_codec="aac")
        print(f"  > Pre-sync video created at: {pre_sync_video_path}")

        # 5. Execute Wav2Lip for Final Lip-Sync
        print("\n[Step 5/5] Executing Wav2Lip for final lip-sync (this will take a long time)...")
        run_wav2lip(pre_sync_video_path, final_audio_path, output_path)

        print(f"\n--- SUCCESS! Professionally dubbed and lip-synced movie saved to {output_path} ---")

    except Exception as e:
        # Top-level boundary of the script: report and fall through to cleanup.
        print(f"\nAn error occurred: {e}")
        import traceback
        traceback.print_exc()
    finally:
        # Release the clips so their file handles / ffmpeg readers do not leak.
        for clip in (final_audio_clip, video_clip):
            if clip is not None:
                clip.close()

        # Clean up cloned voices from ElevenLabs account to avoid clutter.
        # Each deletion is attempted independently so one failure neither
        # skips the remaining voices nor escapes from this cleanup.
        for speaker_tag, voice in cloned_voices.items():
            print(f"Cleaning up cloned voice for speaker {speaker_tag}...")
            try:
                client.voices.delete(voice.voice_id)
            except Exception as cleanup_error:
                print(f"  > Could not delete cloned voice {voice.voice_id}: {cleanup_error}")


# --- Helper Functions (Re-used and New) ---

def separate_audio_components(audio_path, output_dir):
    """Separates audio into vocals and accompaniment using Spleeter.

    Args:
        audio_path: Path of the audio file to separate.
        output_dir: Directory handed to Spleeter for its output.

    Returns:
        A ``(vocals_path, background_path)`` tuple of WAV file paths.

    Raises:
        FileNotFoundError: If either expected output file is missing after
            the separation ran.
    """
    separator = Separator(SPLEETER_MODEL)
    separator.separate_to_file(audio_path, output_dir, codec='wav')

    # The stems are looked up in a sub-folder named after the input file.
    input_filename = os.path.splitext(os.path.basename(audio_path))[0]
    spleeter_output_dir = os.path.join(output_dir, input_filename)
    vocals_path = os.path.join(spleeter_output_dir, "vocals.wav")
    background_path = os.path.join(spleeter_output_dir, "accompaniment.wav")

    # Fail here with a clear message rather than later, when the missing
    # file is first opened.
    if not os.path.exists(vocals_path) or not os.path.exists(background_path):
        raise FileNotFoundError("Spleeter did not produce the expected output files.")

    return vocals_path, background_path

def translate_text(text, target, source):
    """Translate `text` from language `source` to language `target` with Google Translate.

    Returns the translation as plain text.
    """
    client = translate.Client()
    result = client.translate(
        text,
        target_language=target,
        source_language=source,
        # The v2 API answers in HTML by default, which escapes characters
        # such as apostrophes (&#39;). Plain text keeps those entities out of
        # the string later sent to speech synthesis.
        format_="text",
    )
    return result["translatedText"]

def adjust_audio_speed(audio_segment, target_duration_ms):
    """Speed up `audio_segment` so it fits into `target_duration_ms`.

    Returns a sped-up copy when the segment is more than 1% longer than the
    target. The segment is returned unchanged when it already fits (pydub's
    ``speedup`` cannot slow audio down), when either duration is not
    positive, or when pydub cannot process the clip.
    """
    segment_ms = len(audio_segment)
    # Guard the division below and reject meaningless targets.
    if segment_ms <= 0 or target_duration_ms <= 0:
        return audio_segment

    ratio = segment_ms / target_duration_ms

    # speedup() works by discarding audio, so a ratio below 1 does not slow
    # the segment down; a shorter dub simply leaves silence in its slot.
    if ratio <= 1.0 or abs(1.0 - ratio) < 0.01:
        return audio_segment

    try:
        return audio_segment.speedup(playback_speed=ratio)
    except Exception:
        # pydub raises a plain Exception when the clip is too short to be
        # chunked; keep the original audio instead of aborting the dub.
        return audio_segment

def run_wav2lip(video_path, audio_path, output_path):
    """Executes the Wav2Lip inference script using a subprocess.

    Args:
        video_path: Video whose faces are to be lip-synced.
        audio_path: Audio track the lips are synced to.
        output_path: Where the lip-synced video is written.

    Raises:
        subprocess.CalledProcessError: If the inference script exits with a
            non-zero status.
    """
    # Every path is made absolute because the subprocess runs with
    # cwd=WAV2LIP_PATH. A relative checkpoint path would be resolved from
    # inside that directory and point at a non-existent nested folder.
    checkpoint_abs = os.path.abspath(
        os.path.join(WAV2LIP_PATH, 'checkpoints', 'wav2lip_gan.pth')
    )
    video_path_abs = os.path.abspath(video_path)
    audio_path_abs = os.path.abspath(audio_path)
    output_path_abs = os.path.abspath(output_path)

    # Command to run. Adjust padding if faces are cut off.
    command = [
        'python', 'inference.py',
        '--checkpoint_path', checkpoint_abs,
        '--face', video_path_abs,
        '--audio', audio_path_abs,
        '--outfile', output_path_abs,
        # '--pads', '0', '20', '0', '0' # Example padding: top, bottom, left, right
    ]

    print(f"  > Running command: {' '.join(command)}")
    # We run this from within the Wav2Lip directory
    subprocess.run(command, cwd=WAV2LIP_PATH, check=True)

# --- Execution Block ---

if __name__ == "__main__":
    # Import the diarization function from our previous script
    # This assumes 'advanced_dubber.py' is in the same directory.
    try:
        from advanced_dubber import transcribe_with_diarization
    except ImportError:
        print("\nERROR: Could not import 'transcribe_with_diarization'.")
        print("Please ensure the 'advanced_dubber.py' script from the previous step is in the same directory.\n")
        # Exit with a failure status. The bare exit() helper is provided by
        # the site module and reports success (status 0) by default.
        raise SystemExit(1)

    input_video = "input.mp4"
    output_video = "dubbed_professional_output.mp4"

    if not os.path.exists(input_video):
        print(f"Error: Input video '{input_video}' not found.")
    elif not os.path.exists(WAV2LIP_PATH):
        print(f"Error: Wav2Lip directory not found at '{WAV2LIP_PATH}'.")
    else:
        dub_movie_professional(input_video, output_video)

In [None]:
import requests

import os  # needed to read the API key from the environment

# The key is taken from the ELEVENLABS_API_KEY environment variable; paste a
# key as the default below only for quick local experiments.
API_KEY = os.environ.get("ELEVENLABS_API_KEY", "")
AUDIO_PATH = "/home/csc/Documents/Multilingual-Transcriber/shared_data/movieslist/rishtey/audio_files/rishtey_part5__audio.mp3"  # support WAV, MP3, etc.

url = "https://api.elevenlabs.io/v1/speech-to-text"
headers = {
    "xi-api-key": API_KEY
}

# Upload the audio file as multipart form data and request diarization.
with open(AUDIO_PATH, "rb") as f:
    files = {
        "file": f
    }
    data = {
        "model_id": "scribe_v1",
        "diarize": True
    }
    response = requests.post(url, headers=headers, data=data, files=files)

# Stop on HTTP errors (e.g. a missing or invalid key) instead of parsing an
# error payload as if it were a transcript.
response.raise_for_status()
result = response.json()

# Print full transcript
print("\nFull Transcript:\n", result.get("text", ""))

# Print diarized segments (one line per non-empty word entry)
if "words" in result:
    for w in result["words"]:
        start = w.get("start", 0)
        end = w.get("end", 0)
        # Fall back to "speaker_id" when the entry has no "speaker" key.
        speaker = w.get("speaker", w.get("speaker_id", "Unknown"))
        text = w.get("text", "")
        if text:
            print(f"[{start:.2f}s - {end:.2f}s] Speaker {speaker}: {text}")


Full Transcript:
 (परिचय गीत बजता है) तुम ना होते तो एक भी कदम मैं तो चलता नहीं। तुमसे ही मिली है मुझे आज एक नई जिंदगी। सच मैं कहता हूं इस जहां में है तुम सा कोई नहीं। एक पल के लिए तुम्हारे बिन मुझको जीना नहीं। हो मेरी जान तुम पापा। मेरे भगवान तुम पापा। (क्लोज़ की ध्वनि) (पृष्ठभूमि में संगीत बजता है) (पत्थर के घड़से से जूझ रहे हैं) (ध्वनि प्रभाव) अरे प्यारे अंकल आप मुझे कहां लेकर जा रहे हो? ओए तू चलता सही। एँठे खड़ा जा, एँठे खड़ा जा। उहहह हाँ उहदेख! (पृष्ठभूमि में संगीत बजता है) (हंसते हुए) थैंक यू पापा, थैंक यू सो मच। यू लाइक आईटी? या आई रियली लाइक आईटी। चलाएगा? हाँ पापा। समाज सिट। आह, रेडी? येस। ओके? ओके। कम ऑन, लेट्स गो। यस! हाँ कर्न्ट, कर्न्ट। शाबाश, शाबाश करण, शाबाश। हाँ, हाँ डरना मत, डर मा, डर मा। कम ऑन टर्न, टर्न, टर्न, टर्न। अब मैं चला लूंगा। प-प-पेता तू गिरेगा। पापा छोड़िए मैं खुद चला लूंगा। नहीं बेटा तू गिर जाएगा, संभाल के। पापा प्लीज छोड़िये। छोड़ूं मैं? हाँ प्लीज। ओके। संभाल के करण, संभाल के। अरे किसी को ठोक, ठोक मत करना। अरे वो साधा होता है संभाल के चला देगा, तू चिंता काह

In [9]:
# Print diarized segments, leaving out entries whose text is one character or less
if "words" in result:
    for w in result["words"]:
        start, end = w.get("start", 0), w.get("end", 0)
        # Fall back to "speaker_id" when the entry has no "speaker" key.
        speaker = w.get("speaker", w.get("speaker_id", "Unknown"))
        text = w.get("text", "")
        if len(text) <= 1:
            continue
        print(f"[{start:.2f}s - {end:.2f}s] Speaker {speaker}: {text}")

[0.00s - 1.18s] Speaker speaker_0: (परिचय गीत बजता है)
[3.84s - 4.26s] Speaker speaker_0: तुम
[4.40s - 4.58s] Speaker speaker_0: ना
[4.76s - 5.64s] Speaker speaker_0: होते
[5.76s - 6.08s] Speaker speaker_0: तो
[6.10s - 6.66s] Speaker speaker_0: एक
[6.86s - 7.28s] Speaker speaker_0: भी
[7.38s - 7.94s] Speaker speaker_0: कदम
[8.32s - 8.70s] Speaker speaker_0: मैं
[8.78s - 9.00s] Speaker speaker_0: तो
[9.10s - 10.02s] Speaker speaker_0: चलता
[10.14s - 12.24s] Speaker speaker_0: नहीं।
[12.64s - 13.42s] Speaker speaker_0: तुमसे
[13.52s - 13.94s] Speaker speaker_0: ही
[14.04s - 14.72s] Speaker speaker_0: मिली
[14.82s - 15.26s] Speaker speaker_0: है
[15.38s - 15.92s] Speaker speaker_0: मुझे
[16.26s - 16.90s] Speaker speaker_0: आज
[17.04s - 17.38s] Speaker speaker_0: एक
[17.56s - 18.26s] Speaker speaker_0: नई
[18.38s - 20.06s] Speaker speaker_0: जिंदगी।
[21.34s - 21.78s] Speaker speaker_0: सच
[21.92s - 22.12s] Speaker speaker_0: मैं
[22.22s - 23.18s] Speaker speaker_0: कहता
[23.32s - 23.56s] S

In [None]:
import requests

import os

# The key is read from the ELEVENLABS_API_KEY environment variable so it never
# has to be pasted into (and committed with) the notebook. Falls back to "".
API_KEY = os.environ.get("ELEVENLABS_API_KEY", "")
AUDIO_PATH = "/home/csc/Documents/Multilingual-Transcriber/shared_data/movieslist/rishtey/audio_files/rishtey_part6__audio.mp3"  # support WAV, MP3, etc.

url = "https://api.elevenlabs.io/v1/speech-to-text"

headers = {
    "xi-api-key": API_KEY
}

# Upload the audio as multipart form data; the file handle is closed as soon
# as the request has been sent.
with open(AUDIO_PATH, "rb") as f:
    files = {"file": f}
    data = {
        "model_id": "scribe_v1",
        "diarize": True
    }
    response = requests.post(url, headers=headers, data=data, files=files)

# Surface HTTP failures (bad key, quota, unsupported file) as an exception
# instead of parsing the error payload as if it were a transcript.
response.raise_for_status()
result = response.json()

print("\nFull Transcript:\n", result.get("text", ""))

# Group consecutive words spoken by the same speaker into one segment and
# print the segments. Expects `result` (the parsed response) from above.
if "words" in result:
    grouped_transcript = []
    current_speaker = None
    current_text = []
    start_time = None
    end_time = None

    for w in result["words"]:
        speaker = w.get("speaker", w.get("speaker_id", "Unknown"))
        word_text = (w.get("text") or "").strip()
        word_start = w.get("start", 0)
        word_end = w.get("end", 0)

        # Whitespace-only tokens would turn into runs of spaces once the
        # words are joined with " ", so they are left out of the segments.
        if not word_text:
            continue

        # Speaker changed: close the segment collected so far.
        if current_text and speaker != current_speaker:
            grouped_transcript.append({
                "speaker": current_speaker,
                "start": start_time,
                "end": end_time,
                "text": " ".join(current_text)
            })
            current_text = []

        # An empty buffer (not `current_speaker is None`) marks the start of
        # a segment, so a speaker value of None is grouped like any other.
        if not current_text:
            current_speaker = speaker
            start_time = word_start

        current_text.append(word_text)
        end_time = word_end

    # Save the last group
    if current_text:
        grouped_transcript.append({
            "speaker": current_speaker,
            "start": start_time,
            "end": end_time,
            "text": " ".join(current_text)
        })

    # Print nicely
    print("\nDiarized Transcript (Grouped):\n")
    for segment in grouped_transcript:
        print(f"Speaker {segment['speaker']} ({segment['start']:.2f}s - {segment['end']:.2f}s): {segment['text']}")


Full Transcript:
 , टरी को। (एलईडी ध्वनि) अअअ! (चिल्लाते हुए) क्या हुआ? मुंबई इलेक्ट्रिसिटी बोर्ड में काम करता है क्या? मुंबई इलेक्ट्रिसिटी बोर्ड? टच किया तो आपका बॉडी में झनझना कर एक करंट लगा। अच्छा? हो। पापा। तेरे पापा की तो धाबा... (एलईडी ध्वनि) ये... तेरे को पापा बोला? हां, मुझे पापा बोला। तेरा पोरगा ये? पोरगा? अरे, तेरा बेटा है ये? हां, मेरा बेटा है। देवा... देवा... देवा... देवा... (रोते हुए) मैं बर्बाद हो गई देवा... देवा... देवा... अरे! क्या बस! काहे को इतना रोती है? अरे, काहे सांगू मामा? ऊँ! ये आंसू मेरे दिल की जुबान है। मेरा हंसना कोई घराना बहकाने... अरे ले आराम... अरे दिगंबर! हाँँह! कौन है ये लड़की? काहे रोए जा रही है? अरे, मेरी भांजी है वैजयंती। कलीज़ अलीबाग से आई है। दिल नहीं साथ में कलिंगड़ लाई है। ना जाने किस से टकराई है। आज ते ही सारा मोहल्ला सर पर उठाई है। देवा... देवा... देवा... अच्छा, अच्छा, अच्छा। तू ठहर मैं पूछती हूं। हाँ, पूछो। हमको बता, क्या हुआ? हाँँ? अरे, काहे सांगू काकी? मेरी लव स्टोरी शुरू होने से पहले ही थी एंड हो गई। दोपहर की घड़ी थी। हम दोनों की आंख लड़ी थी।

In [None]:
import requests

import requests

import os

# The key is read from the ELEVENLABS_API_KEY environment variable so it never
# has to be pasted into (and committed with) the notebook. Falls back to "".
API_KEY = os.environ.get("ELEVENLABS_API_KEY", "")
AUDIO_PATH = "/home/csc/Documents/Multilingual-Transcriber/shared_data/movieslist/rishtey/audio_files/rishtey_part5__audio.mp3"  # support WAV, MP3, etc.

url = "https://api.elevenlabs.io/v1/speech-to-text"

headers = {
    "xi-api-key": API_KEY
}

# Upload the audio as multipart form data; the file handle is closed as soon
# as the request has been sent.
with open(AUDIO_PATH, "rb") as f:
    files = {"file": f}
    data = {
        "model_id": "scribe_v1",
        "diarize": True
    }
    response = requests.post(url, headers=headers, data=data, files=files)

# Surface HTTP failures (bad key, quota, unsupported file) as an exception
# instead of parsing the error payload as if it were a transcript.
response.raise_for_status()
result = response.json()

print("\nFull Transcript:\n", result.get("text", ""))


def group_speaker_segments(words, max_words_per_line=12):
    """Group word entries into per-speaker lines of bounded length.

    Consecutive words with the same speaker label are merged into one
    segment. A segment is closed when the speaker changes or as soon as it
    holds ``max_words_per_line`` words; the next word then opens a new
    segment whose start time is that word's own start.

    Args:
        words: Iterable of dicts. Each is read for ``text``, ``start``,
            ``end`` and a speaker label under ``speaker`` or ``speaker_id``
            (``"Unknown"`` when neither key is present). Entries with
            empty/whitespace-only text or a missing start/end are skipped.
        max_words_per_line: Maximum number of words per segment. Values
            below 1 behave like 1 (one word per segment).

    Returns:
        A list of dicts with keys ``speaker``, ``start``, ``end`` and
        ``text`` (the words joined by single spaces), in input order.
    """
    grouped_transcript = []
    current_speaker = None
    current_line = []
    start_time = None
    end_time = None

    def flush():
        """Emit the buffered line, if any, and reset the buffer."""
        nonlocal current_line
        if current_line:
            grouped_transcript.append({
                "speaker": current_speaker,
                "start": start_time,
                "end": end_time,
                "text": " ".join(current_line)
            })
            current_line = []

    for w in words:
        speaker = w.get("speaker", w.get("speaker_id", "Unknown"))
        word_text = (w.get("text") or "").strip()
        word_start = w.get("start")
        word_end = w.get("end")

        if not word_text or word_start is None or word_end is None:
            continue

        # Speaker changed — close whatever the previous speaker had buffered.
        if current_line and speaker != current_speaker:
            flush()

        # An empty buffer marks the start of a segment (first word overall,
        # first word of a new speaker, or first word after a wrap). Using the
        # buffer rather than `current_speaker is None` as the marker keeps
        # words whose speaker label is None from overwriting each other.
        if not current_line:
            current_speaker = speaker
            start_time = word_start

        current_line.append(word_text)
        end_time = word_end

        # Wrap once the line is full; the next word sets the new start time.
        if len(current_line) >= max_words_per_line:
            flush()

    # Flush last line
    flush()

    return grouped_transcript


# Diarized output needs the word-level entries; report and stop when the
# response does not carry them.
if "words" not in result:
    print("No word-level data found in response.")
else:
    grouped_transcript = group_speaker_segments(result["words"], max_words_per_line=12)

    print("\nDiarized Transcript (Max 12 words per line):\n")
    for segment in grouped_transcript:
        # Missing timestamps are rendered as "?" rather than formatted.
        start = "?" if segment['start'] is None else f"{segment['start']:.2f}"
        end = "?" if segment['end'] is None else f"{segment['end']:.2f}"
        print(f"Speaker {segment['speaker']} ({start}s - {end}s): {segment['text']}")




Full Transcript:
 (परिचय गीत बजता है) तुम ना होते तो एक भी कदम मैं तो चलता नहीं। तुमसे ही मिली है मुझे आज एक नई जिंदगी। सच मैं कहता हूं इस जहां में है तुम सा कोई नहीं। एक पल के लिए तुम्हारे बिन मुझको जीना नहीं। हो मेरी जान तुम पापा। मेरे भगवान तुम पापा। (लल्लन सिंह और शुभांशु दत्त की संगीत बजता है) अरे प्यारे अंकल आप मुझे कहां लेकर जा रहे हो? ओए तू चलता सही। एँटे खड़ा जा, एँटे खड़ा जा। एं.. एं... उध्द देख। (बैकग्राउंड में संगीत बजता है) (हंसते हुए) थैंक यू पापा, थैंक यू सो मच। यू लाइक इट? या आई रियली लाइक इट। चलाएगा? हां पापा। कमान सिट। अअअ आह! रेडी? येस पापा। ओके। कमन लेट्स गो। यस। हां टर्न, टर्न शाबाश, शाबाश करण, शाबाश। हां, हां डरना मत। डर मत, डर मत। कमॉन। टर्न, टर्न, टर्न, टर्न। अबे मैं चला लूंगा। बेटा तू गिरेगा। पापा छोड़िए मैं खुद चला लूंगा। नहीं बेटा तू गिर जाएगा, सभ्य रखें। पापा प्लीज छोड़िए। छोड़ दूं में? हा प्लीज। ओके। सम्भाल के करण, सम्भाल के। अरे किसी को ठोक, भोंप मत करना। अरे साधु अच्छा सम्भाल के चला देगा, तू चिंता काहे को करता है। करण! चल, चल, चल कर लस्सी पीते हैं। (बाइक