<a href="https://colab.research.google.com/github/discoveriesdeepsea/jobtracker-1/blob/main/VideoCreation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Step 1: Clear existing packages to avoid conflicts
!pip uninstall -y numpy chatterbox-tts torchaudio transformers librosa resampy numba scikit-learn

# Step 2: Clear pip cache
!pip cache purge

# Step 3: Install compatible versions
!pip install numpy==1.26.0
!pip install llvmlite==0.44.0  # Explicitly install llvmlite to ensure numba compatibility
!pip install numba==0.60.0    # Use a specific numba version compatible with numpy==1.26.0
!pip install torchaudio==2.6.0 --no-build-isolation
!pip install chatterbox-tts

# Step 4: Restart runtime to ensure new package versions are loaded
import os
os.kill(os.getpid(), 9)  # Force restart the runtime

# Note: After running this cell, the runtime will restart. Run the next cell manually.

Found existing installation: numpy 2.0.2
Uninstalling numpy-2.0.2:
  Successfully uninstalled numpy-2.0.2
[0mFound existing installation: torchaudio 2.6.0+cu124
Uninstalling torchaudio-2.6.0+cu124:
  Successfully uninstalled torchaudio-2.6.0+cu124
Found existing installation: transformers 4.52.3
Uninstalling transformers-4.52.3:
  Successfully uninstalled transformers-4.52.3
Found existing installation: librosa 0.11.0
Uninstalling librosa-0.11.0:
  Successfully uninstalled librosa-0.11.0
[0mFound existing installation: numba 0.60.0
Uninstalling numba-0.60.0:
  Successfully uninstalled numba-0.60.0
Found existing installation: scikit-learn 1.6.1
Uninstalling scikit-learn-1.6.1:
  Successfully uninstalled scikit-learn-1.6.1
[0mFiles removed: 0
Collecting numpy==1.26.0
  Downloading numpy-1.26.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (58 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m58.5/58.5 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[

In [1]:
import os
import torch
import torchaudio as ta
from chatterbox.tts import ChatterboxTTS
from IPython.display import Audio, display

# ✅ Set up directories
output_dir = "/content/voiceover_segments"
os.makedirs(output_dir, exist_ok=True)
AUDIO_PROMPT_PATH = "/content/chunk1.wav"

# ✅ Initialize ChatterboxTTS model on GPU
try:
    model = ChatterboxTTS.from_pretrained(device="cuda")
except Exception as e:
    print(f"❌ Model initialization failed: {e}")
    raise

# ✅ Voiceover lines (can include one or many)
voiceover_lines = [
    "Every morning the sun rises… not to repeat yesterday, but to give you another chance… to rise with it.",
    "You were born for more than just survival. You were born to rise. To lead. To shine.",
    "This is your reminder: You’re not done yet. You’re just getting started.",
    "Keep going. The world is waiting."
]

# ✅ Generate audio chunks
all_wavs = []
for idx, line in enumerate(voiceover_lines):
    print(f"🎙️ Generating segment {idx+1}/{len(voiceover_lines)}...")

    # Use voice prompt if available
    try:
        if os.path.exists(AUDIO_PROMPT_PATH):
            wav = model.generate(line, audio_prompt_path=AUDIO_PROMPT_PATH)
        else:
            wav = model.generate(line)

        # Save individual file
        segment_path = os.path.join(output_dir, f"segment_{idx+1}.wav")
        ta.save(segment_path, wav, model.sr)
        all_wavs.append(wav)
        print(f"✅ Saved: {segment_path}")
        display(Audio(segment_path))

    except Exception as e:
        print(f"❌ Failed to generate segment {idx+1}: {e}")

# ✅ Final output stitching
if all_wavs:
    print("\n🔗 Stitching all segments into final voiceover...")
    final_audio = torch.cat(all_wavs, dim=1)
    final_path = "/content/final_voiceover.wav"
    ta.save(final_path, final_audio, model.sr)
    print(f"\n✅ Final voiceover saved as: {final_path}")
    display(Audio(final_path))
else:
    print("⚠️ No audio segments were generated.")


ve.pt:   0%|          | 0.00/5.70M [00:00<?, ?B/s]

t3_cfg.pt:   0%|          | 0.00/1.06G [00:00<?, ?B/s]

s3gen.pt:   0%|          | 0.00/1.06G [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/25.5k [00:00<?, ?B/s]

conds.pt:   0%|          | 0.00/107k [00:00<?, ?B/s]

  deprecate("LoRACompatibleLinear", "1.0.0", deprecation_message)


loaded PerthNet (Implicit) at step 250,000
🎙️ Generating segment 1/4...


  self.gen = func(*args, **kwds)
Sampling:   0%|          | 0/1000 [00:00<?, ?it/s]We detected that you are passing `past_key_values` as a tuple of tuples. This is deprecated and will be removed in v4.47. Please convert your cache or use an appropriate `Cache` class (https://huggingface.co/docs/transformers/kv_cache#legacy-cache-format)
Sampling:  18%|█▊        | 175/1000 [00:06<00:31, 26.56it/s]


✅ Saved: /content/voiceover_segments/segment_1.wav


🎙️ Generating segment 2/4...


Sampling:  18%|█▊        | 179/1000 [00:06<00:28, 28.46it/s]


✅ Saved: /content/voiceover_segments/segment_2.wav


🎙️ Generating segment 3/4...


Sampling:   9%|▉         | 91/1000 [00:03<00:34, 26.63it/s]


✅ Saved: /content/voiceover_segments/segment_3.wav


🎙️ Generating segment 4/4...


Sampling:   8%|▊         | 75/1000 [00:02<00:30, 29.91it/s]


✅ Saved: /content/voiceover_segments/segment_4.wav



🔗 Stitching all segments into final voiceover...

✅ Final voiceover saved as: /content/final_voiceover.wav


In [9]:
!pip list

Package                               Version
------------------------------------- -------------------
absl-py                               1.4.0
accelerate                            1.7.0
aiofiles                              24.1.0
aiohappyeyeballs                      2.6.1
aiohttp                               3.11.15
aiosignal                             1.3.2
alabaster                             1.0.0
albucore                              0.0.24
albumentations                        2.0.8
ale-py                                0.11.1
altair                                5.5.0
annotated-types                       0.7.0
antlr4-python3-runtime                4.9.3
anyio                                 4.9.0
argon2-cffi                           23.1.0
argon2-cffi-bindings                  21.2.0
array_record                          0.7.2
arviz                                 0.21.0
astropy                               7.1.0
astropy-iers-data                     0.2025.5.26.0.

In [4]:
!pip install invisible_watermark

Collecting invisible_watermark
  Downloading invisible_watermark-0.2.0-py3-none-any.whl.metadata (8.2 kB)
Downloading invisible_watermark-0.2.0-py3-none-any.whl (1.6 MB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.6 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.5/1.6 MB[0m [31m15.1 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m27.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: invisible_watermark
Successfully installed invisible_watermark-0.2.0


In [5]:
# Step 1: Install required packages
# We're adding diffusers, transformers, accelerate, and other dependencies for Hugging Face Stable Diffusion
!pip install --quiet openai requests pillow google-colab diffusers transformers accelerate safetensors torch invisible_watermark

import os
import requests
from PIL import Image
from io import BytesIO
from openai import OpenAI
from google.colab import userdata
import time
import torch # PyTorch is needed for Hugging Face Diffusers
from diffusers import StableDiffusionXLPipeline # Using SDXL Pipeline

# Step 2: Retrieve API keys from Colab Secrets
# REPLICATE_API_KEY is no longer used for image generation in this version.
# GROQ_API_KEY is still needed for generating image prompts.
try:
    GROQ_API_KEY = userdata.get("GROQ_API_KEY")
    # REPLICATE_API_KEY = userdata.get("REPLICATE_API_TOKEN") # No longer primary for image generation
    if not GROQ_API_KEY:
        raise ValueError("GROQ_API_KEY not found in Colab Secrets")
    # if not REPLICATE_API_KEY: # No longer strictly needed if not using Replicate
    #     print("⚠️ REPLICATE_API_TOKEN not found, but we will use Hugging Face Diffusers.")
    print("✅ Groq API key retrieved successfully")
except Exception as e:
    print(f"❌ Error retrieving API keys from Colab Secrets: {e}")
    raise

# Initialize Groq client (for text generation)
groq_client = OpenAI(api_key=GROQ_API_KEY, base_url="https://api.groq.com/openai/v1")
GROQ_MODEL = "llama-3.3-70b-versatile" # Or your preferred Groq model

# Initialize Hugging Face Diffusers Pipeline for Image Generation (this will be done once)
# We'll load the model inside the function to manage memory, or load it once globally if preferred
# For now, let's define it globally to avoid reloading for each image, but be mindful of Colab's RAM.
# If you run into memory issues, move the model loading into the generate_image_with_diffusers function.

print("🛠️ Setting up Stable Diffusion XL pipeline globally...")
print("This may take a few minutes to download the model weights the first time.")

try:
    if torch.cuda.is_available():
        pipe = StableDiffusionXLPipeline.from_pretrained(
            "stabilityai/stable-diffusion-xl-base-1.0",
            torch_dtype=torch.float16,  # Use float16 to save memory
            variant="fp16",             # Use fp16 variant
            use_safetensors=True
        )
        pipe = pipe.to("cuda") # Move pipeline to GPU
        # Optional: Enable memory-saving features if you encounter OOM errors
        # pipe.enable_attention_slicing()
        # pipe.enable_model_cpu_offload() # More aggressive memory saving
        print("✅ Stable Diffusion XL pipeline loaded successfully on GPU.")
    else:
        pipe = None
        print("⚠️ CUDA (GPU) not available. Image generation will not work effectively.")
        print("Please ensure you have a GPU runtime selected in Colab: Runtime > Change runtime type > GPU.")
except Exception as e:
    pipe = None
    print(f"❌ Error loading Stable Diffusion XL pipeline: {e}")
    print("Image generation might not work. Ensure you have a GPU runtime and enough resources.")

# Step 3: Define your voiceover lines
voiceover_lines = [
    "Every morning the sun rises… not to repeat yesterday, but to give you another chance… to rise with it.",
    "No one else can run your race. No one else carries your fire. You have dreams. Ideas. A purpose… planted deep inside of you for a reason.",
    "Keep going. The world is waiting."
]

# Step 4: Get image prompt ideas using Groq
def get_visual_prompt(line):
    prompt_instruction = f"Create a vivid, cinematic image description for an AI image generation model that captures the emotional and motivational essence of this message: \"{line}\". The description should be detailed (around 100-150 words), inspiring, suitable for a high-quality visual, and evoke a specific mood or scene. Avoid any copyrighted elements. Focus on visual details, lighting, composition, and emotion."
    try:
        response = groq_client.chat.completions.create(
            model=GROQ_MODEL,
            messages=[{"role": "user", "content": prompt_instruction}],
            temperature=0.7,
        )
        return response.choices[0].message.content.strip()
    except Exception as e:
        print(f"⚠️ Groq API error: {e}")
        return None

# Step 5: Generate images using Hugging Face Diffusers (Stable Diffusion XL)
def generate_image_with_diffusers(prompt_text, save_path, index, pipeline):
    if not prompt_text:
        print(f"❌ No prompt generated for frame {index}. Skipping image generation.")
        return
    if pipeline is None:
        print(f"❌ Image generation pipeline not available for frame {index}. Skipping.")
        return

    print(f"🖼️ Generating image for frame {index} with prompt: \"{prompt_text[:100]}...\"")
    try:
        # Define parameters
        negative_prompt = "blurry, low quality, jpeg artifacts, kötü, çirkin, distorted, extra limbs, text, logos, watermarks, ugly, tiling, poorly drawn hands, poorly drawn feet, poorly drawn face, out of frame, mutation, mutated, extra fingers, fewer digits, worst quality, signature, username, artist name, deformed"
        num_inference_steps = 30  # SDXL can produce good results with fewer steps
        guidance_scale = 7.0      # How much the prompt should be adhered to
        width = 1024
        height = 1024

        # Ensure pipeline is on the correct device (it should be if loaded globally to GPU)
        # if pipeline.device.type != "cuda":
        #    pipeline.to("cuda")

        with torch.inference_mode(): # Conserves memory during inference
            image = pipeline(
                prompt=prompt_text,
                negative_prompt=negative_prompt,
                num_inference_steps=num_inference_steps,
                guidance_scale=guidance_scale,
                width=width,
                height=height,
                num_images_per_prompt=1
            ).images[0]

        image_save_path = os.path.join(save_path, f"frame_{index:02}.jpg")
        image.save(image_save_path)
        print(f"✅ Saved frame_{index:02}.jpg to {image_save_path}")

        # Optional: Clear some GPU memory if you process many images and face issues,
        # though with global pipeline, this is less of an issue unless images are large.
        # del image
        # torch.cuda.empty_cache()

    except Exception as e:
        print(f"⚠️ Hugging Face Diffusers error for frame {index}: {e}")
        if "out of memory" in str(e).lower():
            print("CUDA out of memory. The SDXL model is large.")
            print("Consider these options:")
            print("1. Restart Colab Runtime: Runtime > Disconnect and delete runtime, then reconnect and run again.")
            print("2. Ensure GPU is T4 or better: Check Runtime > Change runtime type.")
            print("3. Enable model CPU offloading when loading the pipeline: `pipe.enable_model_cpu_offload()` (slower but saves VRAM).")
            print("4. Reduce image dimensions (width, height) if possible, though SDXL is best at 1024x1024.")
        # Attempt to clear cache if an error occurs
        torch.cuda.empty_cache()


# Step 6: Create folder to store images
image_output_path = "/content/voice_frames_hf" # Changed path slightly to differentiate
os.makedirs(image_output_path, exist_ok=True)
print(f"📁 Image output directory: {image_output_path}")

# Step 7: Generate image prompts and download images
if pipe is not None: # Only proceed if the pipeline loaded successfully
    for idx, line in enumerate(voiceover_lines, start=1):
        print(f"\n🎯 Processing line {idx}: {line}")
        visual_prompt = get_visual_prompt(line)
        print(f"🔍 Image Prompt: {visual_prompt}")
        generate_image_with_diffusers(visual_prompt, image_output_path, idx, pipe)
        print("-" * 50) # Separator
        time.sleep(1) # Small delay
else:
    print("❌ Image generation pipeline failed to load. Cannot generate images.")
    print("👉 Please ensure you have a GPU runtime selected in Colab (Runtime > Change runtime type > Select GPU).")
    print("👉 If you still have issues, the Colab instance might not have enough VRAM for SDXL. Try restarting or using a Colab Pro subscription for more powerful GPUs.")

print("\n✨ Script finished.")

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.6 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━[0m [32m1.3/1.6 MB[0m [31m38.8 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m31.8 MB/s[0m eta [36m0:00:00[0m
[?25h

  deprecate("Transformer2DModelOutput", "1.0.0", deprecation_message)


✅ Groq API key retrieved successfully
🛠️ Setting up Stable Diffusion XL pipeline globally...
This may take a few minutes to download the model weights the first time.


model_index.json:   0%|          | 0.00/609 [00:00<?, ?B/s]

Fetching 19 files:   0%|          | 0/19 [00:00<?, ?it/s]

scheduler_config.json:   0%|          | 0.00/479 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/575 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/565 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/472 [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/525k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/737 [00:00<?, ?B/s]

text_encoder/model.fp16.safetensors:   0%|          | 0.00/246M [00:00<?, ?B/s]

text_encoder_2/model.fp16.safetensors:   0%|          | 0.00/1.39G [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.06M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/460 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/725 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.68k [00:00<?, ?B/s]

unet/diffusion_pytorch_model.fp16.safete(…):   0%|          | 0.00/5.14G [00:00<?, ?B/s]

config.json:   0%|          | 0.00/642 [00:00<?, ?B/s]

vae/diffusion_pytorch_model.fp16.safeten(…):   0%|          | 0.00/167M [00:00<?, ?B/s]

vae_1_0/diffusion_pytorch_model.fp16.saf(…):   0%|          | 0.00/167M [00:00<?, ?B/s]

Loading pipeline components...:   0%|          | 0/7 [00:00<?, ?it/s]

✅ Stable Diffusion XL pipeline loaded successfully on GPU.
📁 Image output directory: /content/voice_frames_hf

🎯 Processing line 1: Every morning the sun rises… not to repeat yesterday, but to give you another chance… to rise with it.


Token indices sequence length is longer than the specified maximum sequence length for this model (155 > 77). Running this sequence through the model will result in indexing errors
The following part of your input was truncated because CLIP can only handle sequences up to 77 tokens: ['is serene, with a few wispy clouds scattered across the sky, symbolizing the limitless possibilities of a new day. the composition is balanced, with the sun\'s rays illuminating the figure, emphasizing the connection between the individual and the rising sun. the mood is one of optimism and empowerment, inviting the viewer to seize the day and rise with the sun, filled with promise and potential."']


🔍 Image Prompt: "A warm, golden light creeps over the horizon as the sun rises, casting a gentle glow on the landscape. The sky transitions from deep blues to soft pinks and oranges, evoking a sense of hope and renewal. In the foreground, a solitary figure stands at the edge of a cliff, arms outstretched and face uplifted, basking in the radiance. The atmosphere is serene, with a few wispy clouds scattered across the sky, symbolizing the limitless possibilities of a new day. The composition is balanced, with the sun's rays illuminating the figure, emphasizing the connection between the individual and the rising sun. The mood is one of optimism and empowerment, inviting the viewer to seize the day and rise with the sun, filled with promise and potential."
🖼️ Generating image for frame 1 with prompt: ""A warm, golden light creeps over the horizon as the sun rises, casting a gentle glow on the landsca..."


Token indices sequence length is longer than the specified maximum sequence length for this model (155 > 77). Running this sequence through the model will result in indexing errors
The following part of your input was truncated because CLIP can only handle sequences up to 77 tokens: ['is serene, with a few wispy clouds scattered across the sky, symbolizing the limitless possibilities of a new day. the composition is balanced, with the sun\'s rays illuminating the figure, emphasizing the connection between the individual and the rising sun. the mood is one of optimism and empowerment, inviting the viewer to seize the day and rise with the sun, filled with promise and potential."']


  0%|          | 0/30 [00:00<?, ?it/s]

✅ Saved frame_01.jpg to /content/voice_frames_hf/frame_01.jpg
--------------------------------------------------

🎯 Processing line 2: No one else can run your race. No one else carries your fire. You have dreams. Ideas. A purpose… planted deep inside of you for a reason.


The following part of your input was truncated because CLIP can only handle sequences up to 77 tokens: [', as if envisioning a dream yet to be realized. the surrounding landscape is vast and open, representing the limitless possibilities that lie ahead. the overall mood is one of quiet confidence and unwavering resolve, as if the figure is poised on the threshold of a great journey, ready to unleash their unique passion and purpose upon the world."']
The following part of your input was truncated because CLIP can only handle sequences up to 77 tokens: [', as if envisioning a dream yet to be realized. the surrounding landscape is vast and open, representing the limitless possibilities that lie ahead. the overall mood is one of quiet confidence and unwavering resolve, as if the figure is poised on the threshold of a great journey, ready to unleash their unique passion and purpose upon the world."']


🔍 Image Prompt: "A solo figure stands at the edge of a misty, golden horizon, with a warm, soft light illuminating their determined face. The atmosphere is serene, with a few stray clouds drifting lazily across the sky. A subtle, fiery glow emanates from within the figure, symbolizing the inner spark that drives them. Their eyes are cast forward, fixed on a distant point, as if envisioning a dream yet to be realized. The surrounding landscape is vast and open, representing the limitless possibilities that lie ahead. The overall mood is one of quiet confidence and unwavering resolve, as if the figure is poised on the threshold of a great journey, ready to unleash their unique passion and purpose upon the world."
🖼️ Generating image for frame 2 with prompt: ""A solo figure stands at the edge of a misty, golden horizon, with a warm, soft light illuminating t..."


  0%|          | 0/30 [00:00<?, ?it/s]

✅ Saved frame_02.jpg to /content/voice_frames_hf/frame_02.jpg
--------------------------------------------------

🎯 Processing line 3: Keep going. The world is waiting.


The following part of your input was truncated because CLIP can only handle sequences up to 77 tokens: ["the distance, the silhouettes of towering mountains rise, shrouded in mist, symbolizing the challenges and mysteries that lie ahead. the atmosphere is filled with an otherworldly glow, as if the universe itself is urging the figure forward. the overall mood is one of hope, courage, and unwavering perseverance, inviting the viewer to embark on their own journey, with the promise that the world is waiting to be discovered. soft, warm light illuminates the scene, casting long shadows and emphasizing the figure's unyielding spirit."]
The following part of your input was truncated because CLIP can only handle sequences up to 77 tokens: ["the distance, the silhouettes of towering mountains rise, shrouded in mist, symbolizing the challenges and mysteries that lie ahead. the atmosphere is filled with an otherworldly glow, as if the universe itself is urging the figure forward. the overall m

🔍 Image Prompt: Imagine a lone figure standing at the edge of a vast, misty landscape, with a worn dirt path unfolding before them. The sky above is a swirling vortex of warm golden light, as if dawn is breaking on a new horizon. The figure's back is to the viewer, with their arms outstretched and fists clenched, conveying determination and resilience. In the distance, the silhouettes of towering mountains rise, shrouded in mist, symbolizing the challenges and mysteries that lie ahead. The atmosphere is filled with an otherworldly glow, as if the universe itself is urging the figure forward. The overall mood is one of hope, courage, and unwavering perseverance, inviting the viewer to embark on their own journey, with the promise that the world is waiting to be discovered. Soft, warm light illuminates the scene, casting long shadows and emphasizing the figure's unyielding spirit.
🖼️ Generating image for frame 3 with prompt: "Imagine a lone figure standing at the edge of a vast, misty la

  0%|          | 0/30 [00:00<?, ?it/s]

✅ Saved frame_03.jpg to /content/voice_frames_hf/frame_03.jpg
--------------------------------------------------

✨ Script finished.


In [8]:
import os
from moviepy.editor import ImageSequenceClip, AudioFileClip, concatenate_videoclips
import time

# Step 1: Define paths and parameters
image_output_path = "/content/voice_frames_hf"  # Path where images are saved from the first script
audio_output_path = "/content/voiceover_segments"  # Updated path where voiceover segments are stored
output_video_path = "/content/final_video.mp4"  # Final video output path

# Ensure audio directory exists (create if not present)
os.makedirs(audio_output_path, exist_ok=True)
print(f"📁 Audio directory ensured: {audio_output_path}")
print(f"📁 Using images from: {image_output_path}")

# Step 2: Define voiceover lines and assumed audio durations
# These match the voiceover_lines from the first script
voiceover_lines = [
    "Every morning the sun rises… not to repeat yesterday, but to give you another chance… to rise with it.",
    "No one else can run your race. No one else carries your fire. You have dreams. Ideas. A purpose… planted deep inside of you for a reason.",
    "Keep going. The world is waiting."
]

# Assumed durations for each audio file (in seconds); update with actual durations if known
# You may need to measure the duration of each .wav file generated (e.g., segment_1.wav, etc.)
audio_durations = [5.0, 7.0, 3.0]  # Placeholder durations; replace with actual lengths

# Step 3: Check for image and audio files
image_files = [os.path.join(image_output_path, f"frame_{i:02}.jpg") for i in range(1, len(voiceover_lines) + 1)]
audio_files = [os.path.join(audio_output_path, f"segment_{i}.wav") for i in range(1, len(voiceover_lines) + 1)]

# Verify files exist
for idx, (img, aud) in enumerate(zip(image_files, audio_files), start=1):
    if not os.path.exists(img):
        print(f"❌ Image file missing: {img}")
        raise FileNotFoundError(f"Image for frame {idx} not found")
    if not os.path.exists(aud):
        print(f"⚠️ Audio file missing: {aud}")
        print(f"👉 Please ensure audio files (e.g., segment_1.wav) are generated and placed in {audio_output_path}")
        print("👉 Your voiceover script generated segment_1.wav, segment_2.wav, etc.")
        print("👉 Check if all segments were generated correctly and are in /content/voiceover_segments")
        raise FileNotFoundError(f"Audio for frame {idx} not found")

# Step 4: Create video clips for each image-audio pair
clips = []
for idx, (image_path, audio_path, duration) in enumerate(zip(image_files, audio_files, audio_durations), start=1):
    print(f"🛠️ Processing clip {idx}: {image_path} with {audio_path}")
    try:
        # Load image and set duration to match audio
        clip = ImageSequenceClip([image_path], durations=[duration])
        # Load audio
        audio = AudioFileClip(audio_path)
        # Set audio to the clip
        clip = clip.set_audio(audio)
        clips.append(clip)
        print(f"✅ Clip {idx} created with duration {duration} seconds")
    except Exception as e:
        print(f"⚠️ Error processing clip {idx}: {e}")
        raise

# Step 5: Concatenate clips into a final video
try:
    print("🎥 Concatenating clips into final video...")
    final_clip = concatenate_videoclips(clips, method="compose")
    # Write the final video file
    final_clip.write_videofile(
        output_video_path,
        fps=24,  # Standard frame rate for video
        codec="libx264",  # Common video codec
        audio_codec="aac",  # Common audio codec
        verbose=False,
        logger=None
    )
    print(f"✅ Video saved to: {output_video_path}")
except Exception as e:
    print(f"❌ Error creating final video: {e}")
    raise

# Step 6: Clean up (close clips to free memory)
for clip in clips:
    clip.close()
if 'final_clip' in locals():
    final_clip.close()

print("\n✨ Video creation finished.")

📁 Audio directory ensured: /content/voiceover_segments
📁 Using images from: /content/voice_frames_hf
🛠️ Processing clip 1: /content/voice_frames_hf/frame_01.jpg with /content/voiceover_segments/segment_1.wav
✅ Clip 1 created with duration 5.0 seconds
🛠️ Processing clip 2: /content/voice_frames_hf/frame_02.jpg with /content/voiceover_segments/segment_2.wav
✅ Clip 2 created with duration 7.0 seconds
🛠️ Processing clip 3: /content/voice_frames_hf/frame_03.jpg with /content/voiceover_segments/segment_3.wav
✅ Clip 3 created with duration 3.0 seconds
🎥 Concatenating clips into final video...
✅ Video saved to: /content/final_video.mp4

✨ Video creation finished.
