# Music Video Synthesis
* Extract lyrics from song with timestamps
* Compose scenes, include timestamps
* Construct video text prompt for each scene
* Build videos for each scene
* Stitch together

# We will use OpenAI Whisper for stability

In [1]:
#!sudo apt install ffmpeg
#!pip install --quiet --upgrade pip
#!pip3 install torch torchvision torchaudio optimum-quanto torchao xformers
#!pip install --quiet --upgrade openai-whisper openai
# Ubuntu or Debian
#!sudo apt update && sudo apt install ffmpeg
#!pip install setuptools-rust
#!pip install -U diffusers imageio imageio_ffmpeg opencv-python moviepy transformers huggingface-hub optimum pillow safetensors
#!pip install git+https://github.com/xhinker/sd_embed.git@main
#!pip install accelerate flash_attention numba -U
#!pip install flash_attn --no-build-isolation
#!pip install -r requirements.txt -U
#!pip install numpy==1.26.4
#!pip install git+https://github.com/zRzRzRzRzRzRzR/diffusers.git@cogvideox1.1-5b -U


In [2]:
import cv2
import diffusers
import gc
import imageio
import imageio_ffmpeg
import json
import math
import moviepy as mp
import numpy as np
import os
import random
import tempfile
import time
import transformers
import torch
import torch.multiprocessing as mp
import whisper

from contextlib import contextmanager
from datetime import datetime, timedelta
from diffusers import AutoencoderKL, AutoencoderKLCogVideoX, AutoPipelineForText2Image, CogVideoXTransformer3DModel, CogVideoXPipeline, CogVideoXDPMScheduler
from diffusers import CogVideoXTransformer3DModel, CogVideoXImageToVideoPipeline, FlowMatchEulerDiscreteScheduler, CogVideoXDPMScheduler
from diffusers.image_processor import VaeImageProcessor
from diffusers.pipelines.flux.pipeline_flux import FluxPipeline
from diffusers.utils import export_to_video, load_video, load_image
from huggingface_hub import hf_hub_download, snapshot_download
from numba import cuda
from openai import OpenAI
from optimum.quanto import freeze, qfloat8, quantize, requantize
from PIL import Image
from safetensors.torch import load_file as load_safetensors, save_file as save_safetensors
from sd_embed.embedding_funcs import get_weighted_text_embeddings_flux1
from torchao.quantization import quantize_, int8_weight_only, int8_dynamic_activation_int8_weight
from diffusers.image_processor import VaeImageProcessor
from transformers import CLIPTextModel, CLIPTokenizer, T5TokenizerFast, T5EncoderModel as t_T5EncoderModel
from diffusers.models.transformers.transformer_flux import FluxTransformer2DModel

# Environment flag for the Hugging Face tokenizers library
# (presumably set to silence its fork/parallelism warning -- confirm).
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Shared module-level settings read by the functions defined below.
dtype = torch.bfloat16  # passed as torch_dtype when loading model components
MAX_SEED = np.iinfo(np.int32).max  # largest int32; upper bound for randomly drawn image seeds
device = "cuda" if torch.cuda.is_available() else "cpu"
retry_limit = 3  # copied into CONFIG["retry_limit"]; bounds the OpenAI retry loop
quantization = int8_weight_only  # torchao config factory; invoked as quantization() inside quantize_()

# Frame size in pixels used for generated images and passed to the video pipeline.
WIDTH=1360
HEIGHT=768

2025-02-08 22:49:06.618298: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-02-08 22:49:06.638223: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1739076546.648629   37988 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1739076546.651647   37988 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-02-08 22:49:06.664146: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instr

In [3]:
# Configuration
# Secrets are read from the environment so they never have to be typed into
# (and accidentally saved or shared with) the notebook. Export OPENAI_API_KEY
# and HF_TOKEN before starting the kernel; both fall back to "" when unset.
CONFIG = {
    "openai_api_key": os.environ.get("OPENAI_API_KEY", ""),
    "openai_model": "gpt-4o-mini",
    "openai_model_large": "gpt-4o",
    "hf_token": os.environ.get("HF_TOKEN", ""),
    # Root folders; a per-run sub-folder is created under each of them.
    "base_working_dir": "./images",
    "base_video_dir": "./output",
    # One video is produced per entry, so listing a file several times
    # yields several independent renders of the same song.
    "audio_files": [
        "/mnt/d/Share/Audio/Lightlines.mp3",
        "/mnt/d/Share/Audio/Lightlines.mp3",
        "/mnt/d/Share/Audio/Lightlines.mp3",
    ],
    "device": device,
    "dtype": dtype,
    "retry_limit": retry_limit,
    "MAX_SEED": MAX_SEED,
}

# Ensure base directories exist
os.makedirs(CONFIG["base_working_dir"], exist_ok=True)
os.makedirs(CONFIG["base_video_dir"], exist_ok=True)

In [4]:
def reset_memory(device):
    """
    Release cached memory and reset CUDA memory statistics for `device`.

    Always runs the Python garbage collector. The CUDA calls are skipped when
    CUDA is unavailable or `device` is not a CUDA device, because
    torch.cuda.reset_peak_memory_stats() raises for non-CUDA devices and the
    module-level `device` falls back to "cpu" on machines without a GPU.
    """
    gc.collect()
    if not torch.cuda.is_available():
        return

    torch.cuda.empty_cache()
    if torch.device(device).type != "cuda":
        return

    torch.cuda.reset_peak_memory_stats(device)
    torch.cuda.reset_accumulated_memory_stats(device)
    
def get_openai_prompt_response(
    prompt: str,
    config: dict,
    max_tokens: int = 6000,
    temperature: float = 0.33,
    openai_model: str = "",
):
    """
    Sends a prompt to OpenAI's API and retrieves the response with retry logic.

    Args:
        prompt: User message sent to the chat completion endpoint.
        config: Must provide "openai_api_key", "openai_model" and "retry_limit".
        max_tokens: Completion token limit forwarded to the API.
        temperature: Sampling temperature forwarded to the API.
        openai_model: Model name; falls back to config["openai_model"] when empty.

    Returns:
        The message content of the first choice, or "" when every attempt failed.
    """
    client = OpenAI(api_key=config["openai_api_key"])
    # Always make at least one attempt, even if retry_limit is configured as 0.
    attempts = max(1, config["retry_limit"])

    for attempt in range(1, attempts + 1):
        try:
            # The request itself is inside the try block so that transient API
            # or network errors are actually retried.
            response = client.chat.completions.create(
                max_tokens=max_tokens,
                messages=[
                    {
                        "role": "system",
                        "content": """Act as a helpful assistant, you are an expert editor.""",
                    },
                    {"role": "user", "content": prompt},
                ],
                model=openai_model or config["openai_model"],
                temperature=temperature,
            )
            message_content = response.choices[0].message.content
            # Content can be None; callers expect a string.
            return message_content if message_content is not None else ""
        except Exception as e:
            print(f"Error occurred: {e}")
            if attempt == attempts:
                print("Retry limit reached. Moving to the next iteration.")
                return ""
            print(f"Retrying... (Attempt {attempt}/{attempts})")
            time.sleep(1)  # brief pause before the next attempt

    return ""


def load_flux_pipe():
    """
    Builds the FLUX.1-dev text-to-image pipeline with the Turbo-Alpha LoRA.

    Each component is loaded separately in the module-level `dtype`; the
    transformer and the T5 text encoder are quantized with the module-level
    `quantization` config before the pipeline is assembled. The pipeline is
    moved to the module-level `device` (the same target load_video_pipeline
    uses) instead of a hard-coded 'cuda'.

    Returns:
        A FluxPipeline on `device` with the LoRA adapter weights loaded.
    """
    bfl_repo = "black-forest-labs/FLUX.1-dev"
    revision = "refs/pr/3"
    adapter_id = "alimama-creative/FLUX.1-Turbo-Alpha"

    scheduler = FlowMatchEulerDiscreteScheduler.from_pretrained(bfl_repo, subfolder="scheduler", revision=revision)
    text_encoder = CLIPTextModel.from_pretrained("openai/clip-vit-large-patch14", torch_dtype=dtype)
    tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14", torch_dtype=dtype)
    text_encoder_2 = t_T5EncoderModel.from_pretrained(bfl_repo, subfolder="text_encoder_2", torch_dtype=dtype, revision=revision)
    tokenizer_2 = T5TokenizerFast.from_pretrained(bfl_repo, subfolder="tokenizer_2", torch_dtype=dtype, revision=revision)
    vae = AutoencoderKL.from_pretrained(bfl_repo, subfolder="vae", torch_dtype=dtype, revision=revision)
    transformer = FluxTransformer2DModel.from_pretrained(bfl_repo, subfolder="transformer", torch_dtype=dtype, revision=revision)

    # Quantize only the two largest components.
    quantize_(transformer, quantization())
    quantize_(text_encoder_2, quantization())
    pipe = FluxPipeline(
        scheduler=scheduler,
        text_encoder=text_encoder,
        tokenizer=tokenizer,
        text_encoder_2=text_encoder_2,
        tokenizer_2=tokenizer_2,
        vae=vae,
        transformer=transformer,
    )

    pipe = pipe.to(device)
    pipe.load_lora_weights(adapter_id)

    return pipe


def gen_flux_image(pipe, prompt, config: dict, height=1024, width=1024, guidance_scale=3.5, num_inference_steps=8, max_sequence_length=512, seed=-1):
    """
    Render a single image for `prompt` with the given Flux pipeline.

    A seed of -1 requests a random seed in [0, MAX_SEED]. The prompt is turned
    into weighted embeddings via sd_embed before being handed to the pipeline.
    `config` is accepted for signature compatibility and is not read here.

    Returns:
        The first image of the pipeline result (PIL output).
    """
    seed = random.randint(0, MAX_SEED) if seed == -1 else seed

    with torch.no_grad():
        embeddings = get_weighted_text_embeddings_flux1(pipe=pipe, prompt=prompt)
        prompt_embeds, pooled_prompt_embeds = embeddings

        call_kwargs = dict(
            prompt_embeds=prompt_embeds,
            pooled_prompt_embeds=pooled_prompt_embeds,
            height=height,
            width=width,
            guidance_scale=guidance_scale,
            output_type="pil",
            num_inference_steps=num_inference_steps,
            max_sequence_length=max_sequence_length,
            generator=torch.Generator("cpu").manual_seed(seed),
        )
        generated = pipe(**call_kwargs)
        return generated.images[0]


def load_video_pipeline():
    """
    Build the CogVideoX1.5 image-to-video pipeline with quantized components.

    The text encoder, transformer and VAE are loaded one after another in the
    module-level `dtype`, each quantized right after loading, and then injected
    into the pipeline, which is moved to the module-level `device`.

    Returns:
        A CogVideoXImageToVideoPipeline using a trailing-spaced DPM scheduler
        with VAE tiling enabled.
    """
    repo_id = "THUDM/CogVideoX1.5-5B-I2V"
    int8_config = quantization

    text_encoder = t_T5EncoderModel.from_pretrained(repo_id, subfolder="text_encoder", torch_dtype=dtype)
    quantize_(text_encoder, int8_config())

    transformer = CogVideoXTransformer3DModel.from_pretrained(repo_id, subfolder="transformer", torch_dtype=dtype)
    quantize_(transformer, int8_config())

    vae = AutoencoderKLCogVideoX.from_pretrained(repo_id, subfolder="vae", torch_dtype=dtype)
    quantize_(vae, int8_config())

    # Assemble the pipeline around the pre-quantized components.
    assembled = CogVideoXImageToVideoPipeline.from_pretrained(
        repo_id,
        text_encoder=text_encoder,
        transformer=transformer,
        vae=vae,
        torch_dtype=torch.bfloat16,
    )
    pipe = assembled.to(device)

    # LoRA weights, CPU offload and VAE slicing are intentionally not enabled here.
    scheduler_config = pipe.scheduler.config
    pipe.scheduler = CogVideoXDPMScheduler.from_config(scheduler_config, timestep_spacing="trailing")
    pipe.vae.enable_tiling()

    return pipe


def infer(pipe_image, prompt: str, image_input: str, config: dict, num_inference_steps: int = 50, guidance_scale: float = 7.0, seed: int = -1, num_frames: int = 49):
    """
    Generates video frames from an image and prompt using the video pipeline.

    Args:
        pipe_image: Image-to-video pipeline (see load_video_pipeline).
        prompt: Text prompt describing the motion.
        image_input: Path of the conditioning image.
        config: Accepted for signature compatibility; not read here.
        num_inference_steps: Denoising steps forwarded to the pipeline.
        guidance_scale: Guidance scale forwarded to the pipeline.
        seed: Generator seed; -1 draws a random seed in [0, 255].
        num_frames: Number of frames to generate.

    Returns:
        (frames, seed): the pipeline's `.frames` output requested as "pt",
        and the seed that was actually used.
    """
    if seed == -1:
        seed = random.randint(0, 255)

    # Resize to the resolution the pipeline is asked to generate. The previous
    # 720x480 target has a different aspect ratio than WIDTH x HEIGHT, so the
    # image was squashed and lost detail before being scaled up again.
    # The context manager closes the source file once the resized copy exists.
    with Image.open(image_input) as source_image:
        resized_image = source_image.resize(size=(WIDTH, HEIGHT))
    image = load_image(resized_image)

    video_pt = pipe_image(
        image=image,
        prompt=prompt,
        num_inference_steps=num_inference_steps,
        num_videos_per_prompt=1,
        use_dynamic_cfg=True,
        output_type="pt",
        guidance_scale=guidance_scale,
        height=HEIGHT,
        width=WIDTH,
        generator=torch.Generator(device="cpu").manual_seed(seed),
        num_frames=num_frames,
    ).frames

    return video_pt, seed


def generate_video(pipe_image, prompt, image_input, config: dict, seed_value: int = -1, video_filename: str = "", num_frames: int = 65):
    """
    Run the video pipeline for one image/prompt pair and write the clip to disk.

    The prompt is prefixed with a slow-motion hint, frames are generated with
    60 inference steps at guidance scale 6.0, converted to PIL images, and the
    first video of the batch is saved at 16 fps to `video_filename`.

    Returns:
        The path returned by save_video.
    """
    slowed_prompt = "Slow movements, slow camera. " + prompt
    frames_pt, _seed_used = infer(
        pipe_image,
        slowed_prompt,
        image_input,
        config,
        num_inference_steps=60,
        guidance_scale=6.0,
        seed=seed_value,
        num_frames=num_frames,
    )

    # Convert every video in the batch from tensors to a list of PIL frames.
    videos_as_pil = []
    for video_index in range(frames_pt.shape[0]):
        clip_tensor = frames_pt[video_index]
        stacked = torch.stack([clip_tensor[k] for k in range(clip_tensor.shape[0])])
        as_numpy = VaeImageProcessor.pt_to_numpy(stacked)
        videos_as_pil.append(VaeImageProcessor.numpy_to_pil(as_numpy))

    saved_path = save_video(videos_as_pil[0], fps=16, filename=video_filename)

    # Drop the large intermediates before resetting memory statistics.
    del frames_pt
    del videos_as_pil
    reset_memory(device)
    return saved_path


def save_video(frames, fps: int, filename: str):
    """
    Saves a list of frames as a video file.

    Frames are first encoded into a temporary .mp4 and then moved to
    `filename`, so a failed encode never leaves a partial file at the target.

    Args:
        frames: Iterable of frames; each is converted with np.array().
        fps: Frame rate of the written video.
        filename: Destination path.

    Returns:
        `filename`.
    """
    import shutil  # local import: the notebook's import cell does not provide shutil

    # Only the unique name is needed; close the handle before the writer opens the path.
    with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as temp_file:
        temp_video_path = temp_file.name

    try:
        writer = imageio.get_writer(temp_video_path, fps=fps)
        try:
            for frame in frames:
                writer.append_data(np.array(frame))
        finally:
            # Always finalize the container, even if a frame fails to encode.
            writer.close()

        # shutil.move falls back to copy+delete when the temp directory and the
        # destination are on different filesystems, where os.rename fails.
        shutil.move(temp_video_path, filename)
    except Exception:
        if os.path.exists(temp_video_path):
            os.remove(temp_video_path)
        raise

    return filename


def convert_to_gif(video_path: str) -> str:
    """
    Converts a video file to a GIF.

    The clip is resampled to 8 fps, scaled to a height of 240 pixels and
    written next to the input with a ".gif" extension.

    Returns:
        Path of the written GIF.
    """
    # The module alias `mp` is rebound to torch.multiprocessing by a later
    # import, so `mp.VideoFileClip` no longer resolves to MoviePy. Import the
    # class directly; the fallback covers MoviePy 1.x, which only exposes it
    # under moviepy.editor.
    try:
        from moviepy import VideoFileClip
    except ImportError:
        from moviepy.editor import VideoFileClip

    source_clip = VideoFileClip(video_path)
    try:
        # NOTE(review): set_fps/resize are the MoviePy 1.x method names --
        # confirm against the installed MoviePy version.
        clip = source_clip.set_fps(8)
        clip = clip.resize(height=240)
        # splitext always changes the extension, so the GIF can never be
        # written over the input (str.replace returned the input path unchanged
        # when it did not contain ".mp4").
        gif_path = os.path.splitext(video_path)[0] + ".gif"
        clip.write_gif(gif_path, fps=8)
    finally:
        # Release the underlying ffmpeg reader.
        source_clip.close()
    return gif_path


def resize_if_unfit(input_video: str) -> str:
    """
    Return a path to a 720x480 version of `input_video`.

    When the video already measures 720x480 the input path is returned as is;
    otherwise the video is passed through center_crop_resize and the path of
    the converted copy is returned.
    """
    width, height = get_video_dimensions(input_video)
    already_fits = width == 720 and height == 480
    return input_video if already_fits else center_crop_resize(input_video)


def get_video_dimensions(input_video_path: str) -> tuple:
    """
    Retrieves the dimensions of the video.

    Returns:
        The "size" entry of the metadata that imageio_ffmpeg.read_frames
        yields as its first item.
    """
    reader = imageio_ffmpeg.read_frames(input_video_path)
    try:
        metadata = next(reader)
        return metadata["size"]
    finally:
        # Closing the generator lets it clean up; otherwise it stays suspended
        # after the metadata item with its ffmpeg process still attached.
        reader.close()


def center_crop_resize(input_video_path: str, target_width: int = 720, target_height: int = 480) -> str:
    """
    Resizes and center-crops the video to the target dimensions.

    The video is scaled so that it covers the target size, cropped around the
    center, subsampled towards 8 fps (skipping at most 5 frames between kept
    frames) and limited to 49 frames. The result is written to a temporary
    .mp4 file.

    Args:
        input_video_path: Path of the source video.
        target_width: Output width in pixels.
        target_height: Output height in pixels.

    Returns:
        Path of the temporary output video.

    Raises:
        ValueError: If the video cannot be opened or reports no dimensions.
    """
    max_frames = 49
    target_fps = 8

    cap = cv2.VideoCapture(input_video_path)
    try:
        orig_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        orig_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
        # An unreadable file reports zero dimensions, which would otherwise
        # surface as a ZeroDivisionError below.
        if not cap.isOpened() or orig_width <= 0 or orig_height <= 0:
            raise ValueError(f"Could not read video dimensions from '{input_video_path}'.")

        orig_fps = cap.get(cv2.CAP_PROP_FPS)
        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))

        # Scale by the larger factor so both target dimensions are covered.
        resize_factor = max(target_width / orig_width, target_height / orig_height)
        # Guard against float truncation producing a frame one pixel smaller
        # than the crop window.
        inter_width = max(target_width, int(round(orig_width * resize_factor)))
        inter_height = max(target_height, int(round(orig_height * resize_factor)))

        # The crop offsets are the same for every frame, so compute them once.
        start_x = (inter_width - target_width) // 2
        start_y = (inter_height - target_height) // 2

        ideal_skip = max(0, math.ceil(orig_fps / target_fps) - 1)
        skip = min(5, ideal_skip)  # Cap at 5

        # Skip fewer frames when the source is too short to yield max_frames.
        while (total_frames / (skip + 1)) < max_frames and skip > 0:
            skip -= 1

        processed_frames = []
        total_read = 0
        while len(processed_frames) < max_frames and total_read < total_frames:
            ret, frame = cap.read()
            if not ret:
                break

            if total_read % (skip + 1) == 0:
                resized = cv2.resize(frame, (inter_width, inter_height), interpolation=cv2.INTER_AREA)
                cropped = resized[start_y:start_y + target_height, start_x:start_x + target_width]
                processed_frames.append(cropped)

            total_read += 1
    finally:
        cap.release()

    # Only the unique name is needed; close the handle before OpenCV opens the path.
    with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as temp_file:
        temp_video_path = temp_file.name

    fourcc = cv2.VideoWriter_fourcc(*"mp4v")
    out = cv2.VideoWriter(temp_video_path, fourcc, target_fps, (target_width, target_height))
    try:
        for frame in processed_frames:
            out.write(frame)
    finally:
        out.release()

    return temp_video_path


def extract_last_frame(video_filename: str, output_image_filename: str):
    """
    Extracts the last frame from a video file and saves it as an image.

    The whole video is decoded sequentially and the final frame is written to
    `output_image_filename`. Errors are reported with print() and not raised,
    so on failure the output image is simply not (re)written.

    Args:
        video_filename: Path of the video to read.
        output_image_filename: Path of the image to write.
    """
    try:
        reader = imageio.get_reader(video_filename, 'ffmpeg')
        last_frame = None
        try:
            for frame in reader:
                last_frame = frame
        finally:
            # Close the reader even when decoding raises part-way through.
            reader.close()

        if last_frame is not None:
            imageio.imwrite(output_image_filename, last_frame)
            print(f"Last frame saved successfully as '{output_image_filename}'.")
        else:
            print("The video contains no frames.")

    except FileNotFoundError:
        print(f"Error: The file '{video_filename}' was not found.")
    except ValueError as ve:
        print(f"ValueError: {ve}")
    except RuntimeError as re:
        print(f"RuntimeError: {re}")
    except Exception as e:
        print(f"An unexpected error occurred: {e}")


def create_scenes(text: str, video_summary: str, config: dict):
    """
    Creates scenes based on the extracted lyrics using OpenAI's API.

    Args:
        text: Lyrics, one "Start: <seconds>, Text: <lyric>" entry per line.
        video_summary: Short description of the video theme. When non-empty it
            is added to the prompt, which asks for scenes "consistent with the
            video theme".
        config: Passed to get_openai_prompt_response; must provide "openai_model".

    Returns:
        The parsed JSON list of scenes (fields requested from the model:
        start, text, scene_description, action_sequence).

    Raises:
        ValueError: If the response contains no JSON list or it cannot be
            parsed (json.JSONDecodeError is a ValueError subclass).
    """
    theme_block = f"Video theme: {video_summary}\n" if video_summary else ""

    # Generate scenes JSON
    prompt = f'''Create a json list of diverse, unique scenes (groupings of text), scene_description (200 words or less), and action_sequence (30 words or less) from the following text.  Scenes should be groups of lyrics with new scenes when the lyric context changes.  Text: {text}   
{theme_block}The json list should have the start value for the first item in the scene and the text that is combined for all items in the same scene.  
The scene_description should include sensory rich details such as attire, setting, mood, lighting, and changing compositions, painting a clear visual scene consistent with the video theme and different from other scenes.  Use theme descriptions, such as graphic novel, water color, render, oil painting, etc.  Scenes should avoid depictions of literal people, unless they are close up of a single person.  Favor symbolism and artistic illustrations of concepts and feeling, emotion.  Avoid depections of literal people. Evoke strong emotions with atmospheric lighting and tone. Use descriptive phrases to capture the mood (e.g., flickering neon, golden twilight, ethereal glow).
Avoid scenes with many people moving.
The action_sequence should describe the action in the scene.  Scenes should be unique, creative, imaginative, and awe-inspiring to create an amazing video.  Create beautiful and mesmerizing scene descriptions that are creative, unique, artistic, and imaginative. Each scene must be unique, imaginative, and visually captivating, blending creativity with artistic flair. Use powerful, descriptive language to craft scenes that are awe-inspiring and leave the audience in wonder. These scenes should evoke a sense of beauty, grandeur, mystery, or anything emotional, drawing from both realistic and fantastical elements. Ensure the descriptions are immersive, emotionally resonant, and filled with unexpected twists that engage the senses and imagination, suitable for creating a stunning, cinematic video experience.  Use descriptions of special effects in the scenes.  
Action should avoid sudden or fast movement or zooms, avoid any fast camera movement.  Avoid human movements like walking, dancing, shopping, etc.
Return only the json list, less jargon. The json list fields should be: start, text, scene_description, action_sequence'''

    result = get_openai_prompt_response(prompt, config, openai_model=config["openai_model"], temperature=0.66)

    # Take the outermost [...] span so that Markdown fences or stray prose
    # around the list are ignored without touching the payload itself.
    list_start = result.find("[")
    list_end = result.rfind("]")
    if list_start == -1 or list_end < list_start:
        raise ValueError("OpenAI response did not contain a JSON list.")

    # strict=False tolerates raw newlines inside string values.
    scenes = json.loads(result[list_start:list_end + 1], strict=False)
    return scenes

def revise_scenes(scenes, config: dict):
    """
    Revise scenes based on the extracted scenes.

    Asks the model to rewrite scene_description and action_sequence of every
    scene while keeping all items and their start times.

    Args:
        scenes: List of scene dicts; must be JSON-serializable.
        config: Passed to get_openai_prompt_response; must provide "openai_model".

    Returns:
        The revised list of scenes, with the same number of items as `scenes`.

    Raises:
        ValueError: If the response contains no parseable JSON list, or the
            list does not have as many items as `scenes` (the prompt forbids
            deleting items because later stages rely on the start times).
    """
    # Send real JSON rather than the Python repr (single quotes, None/True)
    # so the model receives what the prompt says it is receiving.
    scenes_json = json.dumps(scenes)

    # Generate scenes JSON
    prompt = f'''Revise the JSON scenes to update the scene_description and action_sequence to engage the senses and imagination, suitable for creating a stunning, cinematic video experience.  We want unique scenes, even ones in the same sequence. Use descriptions of special effects in the scenes.  JSON scenes: {scenes_json}   
The scene_description (200 words or less) should include details such as attire, setting, mood, lighting, and any significant movements or expressions, painting a clear visual scene consistent with the video theme and different from other scenes. Use theme descriptions, such as graphic novel, water color, render, oil painting, etc.  Scenes should avoid depictions of literal people, unless they are close up of a single person.  Favor symbolism and artistic illustrations of concepts and feeling, emotion.  Avoid depections of literal people. Evoke strong emotions with atmospheric lighting and tone. Use descriptive phrases to capture the mood (e.g., flickering neon, golden twilight, ethereal glow).
The action_sequence (30 words or less) should describe the action in the scene.  The goal is to create input to create a stunning, cinematic video experience.   
Action should avoid sudden or fast movement or zooms, avoid any fast camera movement. Avoid human movements like walking, dancing, shopping, etc.
Only update the scene_description and action_sequence. We do not want to have similar scene_descriptions and action_sequences for consecutive scenes, we want unique scenes that tell a brilliant, cohesive story.  Please update the scene_description and action_sequence to be differemt, creative, and consistent.  
Do not delete any items as having scenes with the given start times are important. 
Return only the json list, less jargon. The json list fields should be: start, text, scene_description, action_sequence'''

    result = get_openai_prompt_response(prompt, config, openai_model=config["openai_model"], temperature=0.33)

    # Take the outermost [...] span so that Markdown fences or stray prose
    # around the list are ignored without touching the payload itself.
    list_start = result.find("[")
    list_end = result.rfind("]")
    if list_start == -1 or list_end < list_start:
        raise ValueError("OpenAI response did not contain a JSON list.")

    # strict=False tolerates raw newlines inside string values.
    revised = json.loads(result[list_start:list_end + 1], strict=False)

    if not isinstance(revised, list) or len(revised) != len(scenes):
        raise ValueError(
            f"Revised scenes do not match the input: expected {len(scenes)} items."
        )
    return revised


def _split_long_scenes(scenes, last_end_value, max_scene_seconds, chunk_seconds):
    """Split every scene longer than max_scene_seconds into chunk_seconds-long copies."""
    split_scenes = []
    for index, scene in enumerate(scenes):
        # The first scene always starts at the beginning of the audio.
        start_time = 0 if index == 0 else scene['start']
        # A scene ends where the next one starts; the last ends with the audio.
        if index < len(scenes) - 1:
            end_time = scenes[index + 1]['start']
        else:
            end_time = last_end_value

        duration = end_time - start_time
        while duration > max_scene_seconds:
            piece = scene.copy()
            piece['start'] = start_time
            split_scenes.append(piece)
            start_time += chunk_seconds
            duration = end_time - start_time

        # Append the remaining part of the scene
        if duration > 0:
            piece = scene.copy()
            piece['start'] = start_time
            split_scenes.append(piece)
    return split_scenes


def _try_scene_call(label, attempts, func, *args):
    """Call func(*args) up to `attempts` times; return (succeeded, result)."""
    for attempt in range(1, attempts + 1):
        try:
            return True, func(*args)
        except Exception as error:  # not a bare except: Ctrl-C must still interrupt
            print(f"{label} failed (attempt {attempt}/{attempts}): {error}")
    return False, None


def process_audio_scenes(audio_file: str, config: dict):
    """
    Processes a single audio file through the scene-planning part of the workflow.

    Steps: create per-run image/video directories, transcribe the audio with
    Whisper, ask OpenAI for a summary and a scene list, split over-long scenes,
    ask OpenAI to revise the scenes, and save them to scenes.json.

    Args:
        audio_file: Path of the audio file.
        config: Must provide "base_working_dir", "base_video_dir",
            "openai_model" and the keys get_openai_prompt_response needs.

    Returns:
        A 5-tuple (scenes, audio_images_dir, audio_videos_dir, last_end_value,
        timestamp) on every path. `scenes` is "" when no scenes could be
        created; `last_end_value` is the end time of the last transcribed
        segment (0 when nothing was transcribed).
    """
    # set maximum duration for an image basis, should be in intervals of video generation length
    video_gen_length = 4
    max_duration_seconds = video_gen_length * 2
    # Scenes up to this many seconds are kept whole; longer ones are split.
    max_scene_seconds = 18

    # Create unique identifier based on audio file name
    audio_basename = os.path.splitext(os.path.basename(audio_file))[0]
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    unique_id = f"{audio_basename}_{timestamp}"

    # Create unique directories for images and videos
    print(f"Create unique directories for images and videos")
    audio_images_dir = os.path.join(config["base_working_dir"], unique_id)
    audio_videos_dir = os.path.join(config["base_video_dir"], unique_id)
    os.makedirs(audio_images_dir, exist_ok=True)
    os.makedirs(audio_videos_dir, exist_ok=True)

    # Step 1: Transcribe audio using Whisper
    print(f"Transcribe audio using Whisper")
    model = whisper.load_model("turbo")
    result = model.transcribe(audio_file)

    # Cleanup Whisper model memory
    del model
    reset_memory(device)

    segments = result['segments']
    if not segments:
        # Nothing was transcribed (e.g. an instrumental track): there are no
        # lyrics to build scenes from and no end time to read.
        print("No lyrics were transcribed; no scenes created.")
        return "", audio_images_dir, audio_videos_dir, 0, timestamp

    # Combine start times and texts into the lyric listing used in the prompts
    text = ""
    for segment in segments:
        text += f"Start: {segment['start']}, Text: {segment['text'].strip()}\n"

    last_end_value = segments[-1]['end']

    # Path to scenes.json file
    scenes_file_path = os.path.join(audio_images_dir, "scenes.json")

    # Check if scenes.json exists
    # NOTE(review): the directory name contains a per-second timestamp, so this
    # branch can only trigger for a run started within the same second.
    if os.path.exists(scenes_file_path):
        print(f"Scenes file already exists at {scenes_file_path}. Skipping scene generation.")
        with open(scenes_file_path, "r") as scenes_file:
            scenes = json.load(scenes_file)
        return scenes, audio_images_dir, audio_videos_dir, last_end_value, timestamp

    # Step 2: Generate video summary using OpenAI
    print(f"Generate video summary using OpenAI")
    video_summary_prompt = f'Create a short summary that describes a music video based on these lyrics: {text}'
    video_summary = get_openai_prompt_response(video_summary_prompt, config, openai_model=config["openai_model"])

    # Step 3: Create scenes based on lyrics
    print(f"Create scenes based on lyrics")
    create_attempts = max(1, config.get("retry_limit", 3))
    created, scenes = _try_scene_call("Scene creation", create_attempts, create_scenes, text, video_summary, config)
    if not created:
        # "" (rather than a list) is kept as the failure value for callers.
        return "", audio_images_dir, audio_videos_dir, last_end_value, timestamp

    # we don't want scenes longer than 18 seconds
    scenes = _split_long_scenes(scenes, last_end_value, max_scene_seconds, max_duration_seconds)

    # improve the scenes with a revision; keep the unrevised scenes if it fails
    revised, scenes_revised = _try_scene_call("Scene revision", 2, revise_scenes, scenes, config)
    if revised:
        scenes = scenes_revised
        print(f'revised scenes')
    else:
        print('cannot revise scenes')

    # Save the scenes to scenes.json
    with open(scenes_file_path, "w") as scenes_file:
        json.dump(scenes, scenes_file)

    return scenes, audio_images_dir, audio_videos_dir, last_end_value, timestamp

def process_audio_images(config: dict, scenes, audio_images_dir):
    """
    Generates one starting image per scene with the Flux pipeline.

    Images are written to `audio_images_dir` as image_01.jpg, image_02.jpg, ...
    in scene order, rendered from each scene's "scene_description" at
    WIDTH x HEIGHT.

    Args:
        config: Forwarded to gen_flux_image.
        scenes: Sequence of scene dicts with a "scene_description" entry.
        audio_images_dir: Directory that receives the images.
    """
    if not scenes:
        # Nothing to render; avoid loading the pipeline for no work.
        print("No scenes to render; skipping image generation.")
        return

    # Step 4: Load Flux pipeline and generate images
    print(f"Load Flux pipeline and generate images")
    flux_pipe = load_flux_pipe()
    height = HEIGHT
    width = WIDTH
    guidance_scale = 3.9
    num_inference_steps = 8
    max_sequence_length = 512
    seed = -1  # -1 lets gen_flux_image draw a random seed per image

    try:
        # Generate images for each scene
        for image_num, scene in enumerate(scenes, start=1):
            image_prompt = scene['scene_description']
            image = gen_flux_image(flux_pipe, image_prompt, config, height, width, guidance_scale, num_inference_steps, max_sequence_length, seed)
            filename = f"image_{str(image_num).zfill(2)}.jpg"
            image_path = os.path.join(audio_images_dir, filename)
            image.save(image_path, dpi=(300, 300))
    finally:
        # Release the pipeline even when generation fails; in a notebook the
        # kernel keeps running and would otherwise hold on to the model.
        flux_pipe.to('cpu')
        del flux_pipe
        reset_memory(device)
    return

def process_audio_video(config: dict, scenes, audio_images_dir, audio_videos_dir, last_end_value, timestamp, segment_seconds: float = 6):
    """
    Generates the video segments for every scene.

    For each scene the number of segments is its duration rounded to the
    nearest multiple of `segment_seconds`. The first segment starts from the
    scene's image (image_NN.jpg); each following segment starts from the last
    frame of the previous one.

    Args:
        config: Forwarded to generate_video.
        scenes: Sequence of scene dicts with "start" and "action_sequence".
        audio_images_dir: Directory holding the scene images; also receives
            the temporary last-frame image.
        audio_videos_dir: Directory that receives the generated videos.
        last_end_value: End time of the audio, used for the last scene.
        timestamp: Run identifier appended to each video file name.
        segment_seconds: Seconds of audio covered by one generated segment.
            NOTE(review): the default of 6 is the original value;
            generate_video defaults to 65 frames saved at 16 fps, so confirm
            this matches the real clip length.
    """
    if not scenes:
        # Nothing to render; avoid loading the pipeline for no work.
        print("No scenes to render; skipping video generation.")
        return

    # Step 6: Load Video Pipeline
    print(f"Load Video Pipeline")
    video_pipe = load_video_pipeline()

    try:
        # Temporary image path
        temp_image = os.path.join(audio_images_dir, "temp_image.jpg")
        video_num = 1

        # Step 7: Generate video sequences
        for i, scene in enumerate(scenes):
            prompt = scene["action_sequence"]

            # Use the initial image for each scene
            image_input = os.path.join(audio_images_dir, f"image_{str(i+1).zfill(2)}.jpg")

            # A scene ends where the next one starts; the last ends with the audio.
            if i + 1 < len(scenes):
                next_start_time = scenes[i + 1]["start"]
            else:
                next_start_time = last_end_value

            # The first scene is treated as starting at 0.
            if i == 0:
                duration = next_start_time
            else:
                duration = next_start_time - scene["start"]
            # Round to the nearest whole number of segments.
            num_video_segments = int((duration + segment_seconds / 2) // segment_seconds)

            print(f"Scene {i+1} has {num_video_segments} segments")
            for j in range(num_video_segments):
                video_name = f"video_{str(video_num).zfill(2)}_{str(i+1)}_{str(j+1).zfill(2)}_{timestamp}.mp4"
                video_output_path = os.path.join(audio_videos_dir, video_name)
                generate_video(video_pipe, prompt, image_input, config, seed_value=-1, video_filename=video_output_path)
                time.sleep(1)  # Pause for 1 second

                # Use the last frame as input for the next segment in the same scene
                extract_last_frame(video_output_path, temp_image)
                image_input = temp_image

                video_num += 1  # Increment video number for the next segment
    finally:
        # Release the pipeline even when generation fails; in a notebook the
        # kernel keeps running and would otherwise hold on to the model.
        video_pipe.to('cpu')
        del video_pipe
        reset_memory(device)

    return


def process_all_audios(audio_file, config: dict):
    """
    Plan the scenes for one audio file and render their starting images.

    Runs process_audio_scenes, prints the resulting scenes, then runs
    process_audio_images.

    Returns:
        (config, scenes, audio_images_dir, audio_videos_dir, last_end_value,
        timestamp), ready to be passed on to process_audio_video.
    """
    print(f"Processing audio file: {audio_file}")

    scene_outputs = process_audio_scenes(audio_file, config)
    scenes, audio_images_dir, audio_videos_dir, last_end_value, timestamp = scene_outputs

    scenes_dump = json.dumps(scenes, indent=4)
    print(f'{len(scenes)} scenes:\n{scenes_dump}')
    print(f'last_end_value: {last_end_value} timestamp: {timestamp}')

    # Create starting images for scenes
    process_audio_images(config, scenes, audio_images_dir)

    return (config, *scene_outputs)

def create_video(audio_path=None, config=None):
    """
    Run the whole workflow for one audio file: scenes, starting images, videos.

    Args:
        audio_path: Audio file to process. When omitted, the module-level
            ``audio_file`` variable is used, so existing ``create_video()``
            calls keep working unchanged.
        config: Workflow configuration dict. When omitted, the module-level
            ``CONFIG`` is used.

    Returns:
        None.

    Raises:
        NameError: if an argument is omitted and the corresponding
            module-level name (``audio_file`` / ``CONFIG``) is not defined.
    """
    # Prefer explicit arguments; only fall back to notebook globals when the
    # caller passes nothing. Passing both arguments makes the call independent
    # of whatever loop variable happens to be alive in the kernel.
    if audio_path is None:
        audio_path = audio_file
    if config is None:
        config = CONFIG

    config, scenes, audio_images_dir, audio_videos_dir, last_end_value, timestamp = process_all_audios(audio_path, config)
    process_audio_video(config, scenes, audio_images_dir, audio_videos_dir, last_end_value, timestamp)
    return
    


### Run new: full pipeline for each audio file in `CONFIG["audio_files"]`

In [5]:
# run new systems
# Runs the full workflow once per entry in CONFIG["audio_files"].
# NOTE: create_video() is called without arguments here and reads the
# module-level `audio_file` (set by this loop) and `CONFIG`, so the loop
# variable must keep the name `audio_file`.
for audio_file in CONFIG["audio_files"]:
    create_video()


Processing audio file: /mnt/d/Share/Audio/Lightlines.mp3
Create unique directories for images and videos
Transcribe audio using Whisper


  checkpoint = torch.load(fp, map_location=device)


Generate video summary using OpenAI
Create scenes based on lyrics
revised scenes
26 scenes:
[
    {
        "start": 0,
        "text": "Angels on shapes, they're the cracks in the rust\nWe drew swords on shadows in that warehouse of lies\nSaw baby wings trembling, just demons in disguise\nYou think heaven's a battlefield, nah, it's a sight\nA thread through the needle of a blood-red sky",
        "scene_description": "In a hauntingly atmospheric warehouse, shadows stretch like dark fingers across rusted beams. Ethereal angels flicker, their gossamer wings glinting in the dim light. Clad in tattered garments, hooded warriors stand poised, swords glimmering like silver in the eerie glow of a blood-red sky filtering through cracked windows. The air is thick with tension, a palpable silence hanging over the scene, as if time itself has paused. The mood is both mystical and foreboding, capturing a moment suspended between light and darkness.",
        "action_sequence": "Angels shimmer in 

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

You set `add_prefix_space`. The tokenizer needs to be converted from the slow tokenizers


Fetching 3 files:   0%|          | 0/3 [00:00<?, ?it/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (112 > 77). Running this sequence through the model will result in indexing errors


  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

Load Video Pipeline


Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Fetching 3 files:   0%|          | 0/3 [00:00<?, ?it/s]

Loading pipeline components...:   0%|          | 0/5 [00:00<?, ?it/s]

Scene 1 has 1 segments


  0%|          | 0/60 [00:00<?, ?it/s]

Last frame saved successfully as './images/Lightlines_20250208_224909/temp_image.jpg'.
Scene 2 has 1 segments


  0%|          | 0/60 [00:00<?, ?it/s]

Last frame saved successfully as './images/Lightlines_20250208_224909/temp_image.jpg'.
Scene 3 has 2 segments


  0%|          | 0/60 [00:00<?, ?it/s]

Last frame saved successfully as './images/Lightlines_20250208_224909/temp_image.jpg'.


  0%|          | 0/60 [00:00<?, ?it/s]

Last frame saved successfully as './images/Lightlines_20250208_224909/temp_image.jpg'.
Scene 4 has 1 segments


  0%|          | 0/60 [00:00<?, ?it/s]

Last frame saved successfully as './images/Lightlines_20250208_224909/temp_image.jpg'.
Scene 5 has 3 segments


  0%|          | 0/60 [00:00<?, ?it/s]

Last frame saved successfully as './images/Lightlines_20250208_224909/temp_image.jpg'.


  0%|          | 0/60 [00:00<?, ?it/s]

Last frame saved successfully as './images/Lightlines_20250208_224909/temp_image.jpg'.


  0%|          | 0/60 [00:00<?, ?it/s]

Last frame saved successfully as './images/Lightlines_20250208_224909/temp_image.jpg'.
Scene 6 has 1 segments


  0%|          | 0/60 [00:00<?, ?it/s]

Last frame saved successfully as './images/Lightlines_20250208_224909/temp_image.jpg'.
Scene 7 has 2 segments


  0%|          | 0/60 [00:00<?, ?it/s]

Last frame saved successfully as './images/Lightlines_20250208_224909/temp_image.jpg'.


  0%|          | 0/60 [00:00<?, ?it/s]

Last frame saved successfully as './images/Lightlines_20250208_224909/temp_image.jpg'.
Scene 8 has 1 segments


  0%|          | 0/60 [00:00<?, ?it/s]

Last frame saved successfully as './images/Lightlines_20250208_224909/temp_image.jpg'.
Scene 9 has 1 segments


  0%|          | 0/60 [00:00<?, ?it/s]

Last frame saved successfully as './images/Lightlines_20250208_224909/temp_image.jpg'.
Scene 10 has 1 segments


  0%|          | 0/60 [00:00<?, ?it/s]

Last frame saved successfully as './images/Lightlines_20250208_224909/temp_image.jpg'.
Scene 11 has 2 segments


  0%|          | 0/60 [00:00<?, ?it/s]

Last frame saved successfully as './images/Lightlines_20250208_224909/temp_image.jpg'.


  0%|          | 0/60 [00:00<?, ?it/s]

Last frame saved successfully as './images/Lightlines_20250208_224909/temp_image.jpg'.
Scene 12 has 1 segments


  0%|          | 0/60 [00:00<?, ?it/s]

Last frame saved successfully as './images/Lightlines_20250208_224909/temp_image.jpg'.
Scene 13 has 1 segments


  0%|          | 0/60 [00:00<?, ?it/s]

Last frame saved successfully as './images/Lightlines_20250208_224909/temp_image.jpg'.
Scene 14 has 1 segments


  0%|          | 0/60 [00:00<?, ?it/s]

Last frame saved successfully as './images/Lightlines_20250208_224909/temp_image.jpg'.
Scene 15 has 1 segments


  0%|          | 0/60 [00:00<?, ?it/s]

Last frame saved successfully as './images/Lightlines_20250208_224909/temp_image.jpg'.
Scene 16 has 2 segments


  0%|          | 0/60 [00:00<?, ?it/s]

Last frame saved successfully as './images/Lightlines_20250208_224909/temp_image.jpg'.


  0%|          | 0/60 [00:00<?, ?it/s]

Last frame saved successfully as './images/Lightlines_20250208_224909/temp_image.jpg'.
Scene 17 has 1 segments


  0%|          | 0/60 [00:00<?, ?it/s]

Last frame saved successfully as './images/Lightlines_20250208_224909/temp_image.jpg'.
Scene 18 has 1 segments


  0%|          | 0/60 [00:00<?, ?it/s]

Last frame saved successfully as './images/Lightlines_20250208_224909/temp_image.jpg'.
Scene 19 has 1 segments


  0%|          | 0/60 [00:00<?, ?it/s]

Last frame saved successfully as './images/Lightlines_20250208_224909/temp_image.jpg'.
Scene 20 has 1 segments


  0%|          | 0/60 [00:00<?, ?it/s]

Last frame saved successfully as './images/Lightlines_20250208_224909/temp_image.jpg'.
Scene 21 has 1 segments


  0%|          | 0/60 [00:00<?, ?it/s]

Last frame saved successfully as './images/Lightlines_20250208_224909/temp_image.jpg'.
Scene 22 has 1 segments


  0%|          | 0/60 [00:00<?, ?it/s]

Last frame saved successfully as './images/Lightlines_20250208_224909/temp_image.jpg'.
Scene 23 has 1 segments


  0%|          | 0/60 [00:00<?, ?it/s]

Last frame saved successfully as './images/Lightlines_20250208_224909/temp_image.jpg'.
Scene 24 has 1 segments


  0%|          | 0/60 [00:00<?, ?it/s]

Last frame saved successfully as './images/Lightlines_20250208_224909/temp_image.jpg'.
Scene 25 has 1 segments


  0%|          | 0/60 [00:00<?, ?it/s]

Last frame saved successfully as './images/Lightlines_20250208_224909/temp_image.jpg'.
Scene 26 has 3 segments


  0%|          | 0/60 [00:00<?, ?it/s]

Last frame saved successfully as './images/Lightlines_20250208_224909/temp_image.jpg'.


  0%|          | 0/60 [00:00<?, ?it/s]

Last frame saved successfully as './images/Lightlines_20250208_224909/temp_image.jpg'.


  0%|          | 0/60 [00:00<?, ?it/s]

Last frame saved successfully as './images/Lightlines_20250208_224909/temp_image.jpg'.
Processing audio file: /mnt/d/Share/Audio/Lightlines.mp3
Create unique directories for images and videos
Transcribe audio using Whisper


  checkpoint = torch.load(fp, map_location=device)


Generate video summary using OpenAI
Create scenes based on lyrics
revised scenes
25 scenes:
[
    {
        "start": 0,
        "text": "Angels on shapes, they're the cracks in the rust\nWe drew swords on shadows in that warehouse of lies\nSaw baby wings trembling, just demons in disguise\nYou think heaven's a battlefield, nah, it's a sight\nA thread through the needle of a blood-red sky",
        "scene_description": "In a shadowy, derelict warehouse, rusted beams stretch overhead like skeletal fingers. The air is thick with an electric tension, and flickering shadows play tricks on the walls. Cloaked figures, their attire a mix of ancient armor and tattered fabric, wield swords that shimmer with an otherworldly light. Ethereal angelic forms pulse in and out of view, their delicate wings casting ghostly reflections against the backdrop of a blood-red sky seeping through shattered windows. The atmosphere is charged with anticipation, a haunting blend of beauty and dread.",
        "act

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Fetching 3 files:   0%|          | 0/3 [00:00<?, ?it/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (113 > 77). Running this sequence through the model will result in indexing errors


  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

Load Video Pipeline


Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Fetching 3 files:   0%|          | 0/3 [00:00<?, ?it/s]

Loading pipeline components...:   0%|          | 0/5 [00:00<?, ?it/s]

Scene 1 has 1 segments


  0%|          | 0/60 [00:00<?, ?it/s]

Last frame saved successfully as './images/Lightlines_20250209_084711/temp_image.jpg'.
Scene 2 has 1 segments


  0%|          | 0/60 [00:00<?, ?it/s]

Last frame saved successfully as './images/Lightlines_20250209_084711/temp_image.jpg'.
Scene 3 has 2 segments


  0%|          | 0/60 [00:00<?, ?it/s]

Last frame saved successfully as './images/Lightlines_20250209_084711/temp_image.jpg'.


  0%|          | 0/60 [00:00<?, ?it/s]

Last frame saved successfully as './images/Lightlines_20250209_084711/temp_image.jpg'.
Scene 4 has 1 segments


  0%|          | 0/60 [00:00<?, ?it/s]

Last frame saved successfully as './images/Lightlines_20250209_084711/temp_image.jpg'.
Scene 5 has 3 segments


  0%|          | 0/60 [00:00<?, ?it/s]

Last frame saved successfully as './images/Lightlines_20250209_084711/temp_image.jpg'.


  0%|          | 0/60 [00:00<?, ?it/s]

Last frame saved successfully as './images/Lightlines_20250209_084711/temp_image.jpg'.


  0%|          | 0/60 [00:00<?, ?it/s]

Last frame saved successfully as './images/Lightlines_20250209_084711/temp_image.jpg'.
Scene 6 has 1 segments


  0%|          | 0/60 [00:00<?, ?it/s]

Last frame saved successfully as './images/Lightlines_20250209_084711/temp_image.jpg'.
Scene 7 has 1 segments


  0%|          | 0/60 [00:00<?, ?it/s]

KeyboardInterrupt: 

### Run previous: generate videos from a saved run's `scenes.json`

In [None]:
# run saved config
# Calls process_audio_video directly with scenes loaded from a saved
# scenes.json, instead of going through create_video() / process_all_audios().
# The paths and timestamp below all carry the same run id
# (Lightlines_20250208_215256); keep them in sync when switching to another run.
scenes_file_path = './images/Lightlines_20250208_215256/scenes.json'
audio_images_dir = './images/Lightlines_20250208_215256'
audio_videos_dir = './output/Lightlines_20250208_215256'
timestamp = '20250208_215256'
# NOTE(review): hard-coded; presumably the end time (in seconds) of the last
# scene of that saved run — confirm against the original run's output.
last_end_value = 220.0

# Load the scene list that the earlier run wrote to disk.
with open(scenes_file_path, "r") as scenes_file:
    scenes = json.load(scenes_file)

process_audio_video(CONFIG, scenes, audio_images_dir, audio_videos_dir, last_end_value, timestamp)