In [None]:
import cv2
import einops
import gc
import imageio
import imageio_ffmpeg
import json
import math
import moviepy as mp
import numpy as np
import os
import random
import safetensors.torch as sf
import time
import torch
import traceback
import transformers
import whisper

from datetime import datetime
from diffusers import AutoencoderKLHunyuanVideo
from diffusers_helper.bucket_tools import find_nearest_bucket
from diffusers_helper.clip_vision import hf_clip_vision_encode
from diffusers_helper.hunyuan import encode_prompt_conds, vae_decode, vae_encode, vae_decode_fake
from diffusers_helper.memory import cpu, gpu, get_cuda_free_memory_gb, move_model_to_device_with_memory_preservation, offload_model_from_device_for_memory_preservation
from diffusers_helper.memory import fake_diffusers_current_device, DynamicSwapInstaller, unload_complete_models, load_model_as_complete
from diffusers_helper.models.hunyuan_video_packed import HunyuanVideoTransformer3DModelPacked
from diffusers_helper.pipelines.k_diffusion_hunyuan import sample_hunyuan
from diffusers_helper.thread_utils import AsyncStream, async_run
from diffusers_helper.utils import save_bcthw_as_mp4, crop_or_pad_yield_mask, soft_append_bcthw, resize_and_center_crop, state_dict_weighted_merge, state_dict_offset_merge, generate_timestamp
from hdi1 import HiDreamImagePipeline
from hdi1 import HiDreamImageTransformer2DModel
from hdi1.schedulers.fm_solvers_unipc import FlowUniPCMultistepScheduler
from hdi1.schedulers.flash_flow_match import FlashFlowMatchEulerDiscreteScheduler
from moviepy.audio.io.AudioFileClip import AudioFileClip
from openai import OpenAI
from PIL import Image
from transformers import LlamaForCausalLM, PreTrainedTokenizerFast
from transformers import LlamaModel, CLIPTextModel, LlamaTokenizerFast, CLIPTokenizer
from transformers import SiglipImageProcessor, SiglipVisionModel

# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
# Default tensor dtype; also exposed through CONFIG["dtype"].
dtype = torch.bfloat16
# Largest signed 32-bit integer; generate_image uses it as the exclusive
# upper bound when drawing a random seed.
MAX_SEED = np.iinfo(np.int32).max
# Retry budget; also exposed through CONFIG["retry_limit"].
retry_limit = 3

In [None]:
# Free-form description of the overall mood/style of the video.
# NOTE(review): not referenced in the visible part of the file — presumably
# fed into prompt generation further down; confirm.
THEME = "music video animation, spiritual, intimate, beautiful, grooving with God"
# Run-wide settings bundled in a plain dict so they can be handed to helpers
# (e.g. get_openai_prompt_response reads "openai_api_key", "openai_model" and
# "retry_limit" from it).
CONFIG = {
    # Credentials are blank in the source; presumably filled in locally.
    # Avoid committing real keys.
    "openai_api_key": "",
    # Default chat model used when a caller does not name one explicitly.
    "openai_model": "gpt-4o-mini",
    "openai_model_small_reasoning": "o1-mini",
    "openai_model_large": "gpt-4o",
    "hf_token": "",
    # Both directories are created at import time further down.
    "base_working_dir": "./images",
    "base_video_dir": "./output",
    # Input audio tracks (absolute, machine-specific paths).
    "audio_files": [
        "/mnt/d/Share/Audio/Vibe Groovin with God.flac",    
    ],
    # Mirrors of the module-level constants defined above.
    "device": device,
    "dtype": dtype,
    "retry_limit": retry_limit,
    "MAX_SEED": MAX_SEED,
}

# Target frame size in pixels.
# NOTE(review): not referenced in the visible part of the file — presumably
# passed to generate_image as its (height, width) resolution; confirm.
HEIGHT = 540
WIDTH = 960

# Repository namespace that prefixes every HiDream checkpoint path below.
MODEL_PREFIX = "azaneko"
# Checkpoint loaded by load_models as the pipeline's tokenizer_4 / text_encoder_4.
LLAMA_MODEL_NAME = "hugging-quants/Meta-Llama-3.1-8B-Instruct-GPTQ-INT4"

# Model configurations
# One entry per HiDream variant, keyed by the `model_type` string accepted by
# load_models and generate_image. Each entry provides:
#   path                - checkpoint location given to from_pretrained
#   guidance_scale      - forwarded to the pipeline call in generate_image
#   num_inference_steps - forwarded to the pipeline call in generate_image
#   shift               - forwarded to the scheduler constructor in load_models
#   scheduler           - scheduler class associated with the variant
# NOTE(review): load_models always constructs FlowUniPCMultistepScheduler and
# never reads the "scheduler" key — confirm whether that is intentional.
MODEL_CONFIGS = {
    "dev": {
        "path": f"{MODEL_PREFIX}/HiDream-I1-Dev-nf4",
        "guidance_scale": 0.0,
        "num_inference_steps": 28,
        "shift": 6.0,
        "scheduler": FlashFlowMatchEulerDiscreteScheduler
    },
    "full": {
        "path": f"{MODEL_PREFIX}/HiDream-I1-Full-nf4",
        "guidance_scale": 5.0,
        "num_inference_steps": 50,
        "shift": 3.0,
        "scheduler": FlowUniPCMultistepScheduler
    },
    "fast": {
        "path": f"{MODEL_PREFIX}/HiDream-I1-Fast-nf4",
        "guidance_scale": 0.0,
        "num_inference_steps": 16,
        "shift": 3.0,
        "scheduler": FlashFlowMatchEulerDiscreteScheduler
    }
}
# Variant selected for this run; must be one of the MODEL_CONFIGS keys.
model_type = "dev"

# Scratch directory where generate_video writes its per-section MP4 files and
# the preprocessed start image.
outputs_folder = './temp_outputs/'
os.makedirs(outputs_folder, exist_ok=True)

# Ensure base directories exist
# (exist_ok=True makes re-running this cell harmless.)
os.makedirs(CONFIG["base_working_dir"], exist_ok=True)
os.makedirs(CONFIG["base_video_dir"], exist_ok=True)

SCENE_DESCRIPTIONS = '''scene_description: boris vallejo style, frank frazetta style, 8k high quality digital painting, masterpiece, very detailed, ultra realistic, (best quality) very detailed epic masterpiece, detailed face, full body, wrinkly wizard toad reading an ancient scroll in a swamp, best quality, epic scene, Dungeons and dragons atmosphere, heroic fantasy, realistic, realism, full body
scene_description: masterpiece, best quality, amazing quality ,solo, holding, closed_mouth, sitting, outdoors, sky, day, cloud, water, blurry, blue_sky, tree, orange_eyes, no_humans, blurry_background, fish, reflection, mountain, animal_focus, lake, fishing_rod, reflective_water, fishing, holding_fishing_rod, fishing_line,A digital illustration shoot from the side about a cute cartoon fish character sitting on a wooden pier by a calm lake, holding a fishing rod. the image also shows a serene mountain landscape with tall trees and a clear blue sky. on the middle of the image, a no human, furry, blue and orange fish with large, expressive eyes and a happy expression is sitting on the wooden pier. the fish appears to be a chubby, cartoonish creature with a slim body and a closed mouth. it is facing the viewer with its eyes looking to the side. the creature is holding the fishing rod in its right hand and its left hand is resting on the edge of the water. the background features a mountain range with a few clouds in the sky, and the water is calm and still. the lighting is soft and natural, creating a peaceful and serene atmosphere. solo, looking at viewer, closed mouth, sitting, brown eyes, outdoors, sky, day, cloud, holding, tree, blue sky, water, tree branch, holding stick, mountain, fish, pond, holding fishing rod
scene_description: masterpiece, best quality, good quality, very awa, newest, highres, absurdres, 1girl, solo, dress, standing, flower, outdoors, water, white flower, pink flower, scenery, reflection, rain, dark, ripples, yellow flower, puddle, colorful, abstract, standing on liquid, very Wide Shot, limited palette,
scene_description: masterpiece, best quality, amazing quality, klskx, nsfw, explicit, 1girl, redhead, open fridge, dim blue fridge light, nude, oversized t-shirt slipping off shoulder, panties, barefoot, messy hair, licking fingers, one hand on hip, standing by fridge, looking back, midnight snack indoors, kitchen, wooden floor, open fridge door, scattered snacks, napkin, night, dark shadows, high contrast, volumetric lighting, intricate details, blurry background, depth of field
scene_description: time travel, holding coffee,, hdr, 8k, absurdres, shiny, outdoors, reflection, blurry, blurry background, tokyo lights,tokyo street, neon lights, cyberpunk, high-contrast lighting, intricate details, vibrant colors, reflective surfaces, futuristic urban environment, glowing neon signs, cybernetic enhancements, punk aesthetic, dynamic pose, dynamic composition, depth of field, dark_theme, detailed backgroud, foreshortening, blurry edges, vignetting
scene_description: masterpiece, best quality, amazing quality ,solo, yellow_eyes, flower, outdoors, sky, cloud, tree, no_humans, night, animal, facial_mark, moon, cat, star_\(sky\), night_sky, full_moon, starry_sky, animal_focus, architecture, black_cat, east_asian_architecture, whiskers, black_fur, huge_moon,A digital illustration shoot from a front camera angle about a cute black cat sitting on a moss-covered branch under a large full moon, surrounded by a fantasy setting with tall pagodas and flowers. on the middle of the image, a 1girl, who appears to be a cat, is sitting, looking at the viewer with large, expressive brown eyes. the cat has black fur with yellow stripes, and its ears are perked up, giving it a curious expression. it is positioned on the branch, with its body facing the viewer, giving a clear view of its full body. in the background, a full moon is visible, with stars twinkling in the night sky, and fluffy white clouds are visible, adding to the magical atmosphere of the scene. the overall style is whimsical and fantastical, with a focus on the cat's curious expression and the intricate details of its markings on its body. solo, smile, brown eyes, flower, outdoors, sky, day, cloud, no humans, animal, night, animal focus, star \(sky\), moon, cat, full moon
scene_description: hyper realistic, a majestic A gemstone stag slowly blooming into life, moss and flowers sprouting from cracks in its crystalline body as it awakens, its eyes, initially dull stones, begin to glow with an inner emerald light, dawn light filters through a forest, illuminating the stags nascent awakening, wide shot capturing the stags full form and the blooming flora with golden antlers standing in a sunlit clearing, surrounded by ethereal forest spirits, glowing flora, magical atmosphere, extremely high-resolution details, photographic, realism pushed to extreme, fine texture, incredibly lifelike, Cinematic, beautiful, vibrant, masterpiece, 32k, ultra HD, ultra-detailed, amazing quality, amazing artist, sharp edges, detailed textures, full view, atmospheric lighting, amazing visuals
scene_description: impressive and grotesque scenery on a distant world, famous artwork inspired by jordan grimmer, dramatic scene, fractal art, 1990s fantasy style, dynamic angle, this image shows the enormous transcendent hydra-like beast known as the powerful jadesnap as it wriggleflomps from the izzled depths of a churning, otherworldly glowing sea under a dramatic tempestuous sky, it is surrounded by the typical jagged fractal rock formations on the crinkled shore of the water-rich planet "zoffeldirly quartus," superstitious life forms call it "the bringer of lost keys", creature focus, very aesthetic, extremely detailed, ultra high resolution, 8k, 4k, harmonising colors, light beige and chartreuse and bordeaux red and indigo blue and byzantium purple and ebony black, ovg, in the style of ck-ovf, amanoer, arsmjstyle, dnddarkestfantasy, aidmafluxpro1.1
scene_description: A realistic toilet, completely engulfed in flames, inferno, blazing, concept art, masterpiece, perfect lighting, purple and pink flames, realistic flames, 8k, absurdres, massive fire - rendered in the highest quality, realistic bathroom background, A3ther
scene_description: intricate linework with expressive contrasts, soft lighting with dynamic highlights, young woman wearing flight goggles, aviator leather jacket, long loose platinum hair, standing next to a 1930s biplane on an airstrip surrounded by tropical jungle, sunset, in orange hues
scene_description: 1 girl , ghost girl , grave stone , hugging , kneeling , tear , raining , masterpiece, best quality, good quality, very awa, newest, highres, absurdres
scene_description: Indi_and_Digo,1girl,solo,furry,tail,source_furry,,red hair,purple fur, purple eyes, child, kid, masterpiece, best quality, female hand holding a small umbrella, miniature wet tiny mouse standing on the path, rain, drops, funny, intricate details, hyper-realistic, hyper-detailed, professional photoshoot, colorful, ultra-sharp, vivid color, chiaroscuro lighting, macro
scene_description: masterpiece, best quality, good quality, very aesthetic, absurdres, newest, 8K, depth of field, in the style of cknc, artist:moriimee, in the style of cksc, 1girl, short golden hair, bob cut, blue eyes, large breasts, bouncy, baggy red tank top, sagging, oversized leggings, sweat-soaked, soft moan, looking up, from below, hiking on ridge, pressing breasts through tank, nipples outlined BREAK mountain ridge, rocky path, whistling wind, pine scent BREAK warm light, high contrast, earthy tones, depth of field, golden hour lighting, rich details, rugged allure
scene_description: masterpiece, best quality, good quality, very aesthetic, absurdres, newest, 8K, depth of field, in the style of cknc, artist:moriimee, in the style of cksc, 1girl, short green hair, tangled, hazel eyes, medium breasts, firm, hooded cloak, glossy skin, picking lock, smirking, looking at viewer, from side BREAK twilight forest, mossy ruins, owls hooting BREAK dynamic light, green tones, high contrast, twilight glow, rich details, sneaky atmosphere
scene_description: A digital art splash in the style of bo-cyborgsplash, a mysterious raven character positioned in the center of the frame, directly facing the viewer, the raven's upper body is close to the camera, showcasing its dark, ornate attire adorned with intricate details and glowing purple gemstones, its long beak is visible, and its piercing pink eyes seem to lock onto the viewer with a sense of intensity, the background is a dark, neon-lit space with glowing elements, creating a mystical atmosphere, the character is adorned with numerous gemstone necklaces, adding a touch of opulence to the overall design, the overall effect is one of intrigue and mystery, a fantastic abstract colorful art splash, high quality, ultra detailed
scene_description: masterpiece, best quality, good quality, very aesthetic, absurdres, newest, 8K, depth of field, in the style of cknc, artist:moriimee, in the style of cksc, 1girl, long grey hair, wild, stormy eyes, large breasts, round, thunderbolt crown, glossy skin, summoning lightning, looking at viewer, from below BREAK stormy cliff, dark clouds, waves crashing BREAK dynamic light, blue tones, high contrast, electric glow, rich details, dramatic atmosphere
scene_description: masterpiece, hyper detailed, high quality v3, ultra-HD details, 16k, midjourneyv6.1, (Pencil_Sketch:1.2, messy lines, greyscale, traditional media, sketch), anime, manga, sketch, unfinished, hatching texture, fullbody portrait, long legs, mustard XXDFace head, creepy smile, x-eyes, robot joints, wearing ((creepy beige victorian tattered, broken, cracked, dirt, ripped dress)) at night, in 22@SIT_BCN wide street, creepy red moonlight, perfect anatomy
scene_description: masterpiece, best quality, good quality, very awa, newest, highres, absurdres, 1girl, solo, short hair, blonde hair, red eyes, holding, standing, belt, hood, water, scarf, arm up, torn clothes, plant, hood up, wading, partially submerged, mittens, pillar, torch, holding torch, limited palette
scene_description: anthropomorphic corgi knight, corgi head, on one knee, planted sword, holding sword, plate armor, scowl, v-shaped eyebrows ,cloudy, godrays, sunshine, riverbank background, holy halo ,wide shot, depth of field, realism, no humans, animal focus, corpses,covered in blood, battlefield
scene_description: 1girl,solo,furry,pink fur,tail,ears,source_furry, child, kid, masterpiece, best quality, long hair, twin braids, farmer outfit,(steampunk), goggles, googles on head, bag, wheat field, outdoors, wind, accordion, holding instrument, playing instrument, gloves, wheat, sunset, farm, house, scenery, landscape, blurry, blurry background, looking at viewer, smile
scene_description: masterpiece, best quality, amazing quality, solo, sitting, no humans, glowing, wariza, robot, science fiction, on floor, electricity, cable, joints, robot joints, damaged, mechanical parts, wire, humanoid robot, screw, bolt, Countless lightning, electric shock, open mouth, Open five fingers,hands up, Bent back, Low Angle
scene_description: cinema scene, photograph, 4k photorealistic, beautiful sexy girl, NSFW, see through sheer, breasts exposed, brown eyes, pink hair, light makeup, red lips, large breasts, glowing necklace, pink ballgown, lactation, lactating, double peace sign, plunging neckline, smile, snow, outdoors, east asian architecture, night
scene_description: A close-up of an avian alien with feathers that change color depending on its emotions, intricate plumage patterns with metallic sheen, and sharp, beak-like appendages that subtly shift shape.
scene_description: A warm, lively Irish bar with rustic wooden beams, vintage pub signs, and soft amber lighting. Behind the counter stands an anthropomorphic cat dressed in a casual bartender outfit — rolled-up sleeves, vest, and flat cap — pouring a beer from a bottle into a pint glass with careful precision. The bar top is polished wood, lined with empty glasses and old whiskey bottles. The atmosphere is cozy and inviting, with a touch of old-world charm. In the background, shelves of liquor and a mirror reflect the soft golden glow of the room. The scene is full of character and detail, capturing the charm of a traditional Irish pub
scene_description: a naked woman with long hair, transparent wet shirt, can see her beautiful breasts, no panties seen, her legs covering her crotch, sits in a chair, lewd pose. diffused natural lighting from a nearby window, sensual, revealing, tender.
scene_description: The high resolution image depicts an underwater castle with intricate architecture, reminiscent of a fairy tale, lying on the sand below. The castle is adorned with domes, spires, and ornate carvings, all in shades of blue and white. It is surrounded by vibrant coral reefs and various marine life, including colorful seashells and bubbles floating around. The scene is bathed in soft, natural light filtering down from above, creating a magical and serene atmosphere. The castle appears to be a blend of Gothic and fantasy styles, with a grand entrance and delicate details that suggest a world of enchantment beneath the waves.
scene_description: perfect quality, bokeh effect, photography, an ancient, cloaked female figure with piercing eyes, face covered in dust and dirt, their face wrapped in fabric adorned with extremely intricate hieroglyphs, set against an abstract, polygonal blue gradient background. The cloth appears worn and frayed
scene_description: A NSFW Snapchat explicit iPhone selfie photo It's of a very pretty and attractive girl, whose a blonde, and has a good, hourglass figure, her boobs are out and she is complety naked, smiling at the camera with her head tilted, her upper body only is visible in the image, her boobs are out and she is complety naked, as it is a selfie in a carseat, candid, vertical 9:16 aspect ratio.
scene_description: The image depicts a beautifully ornate, oval-shaped clear glass sphere with an intricately designed frame. The frame is made of a golden material with elaborate, swirling patterns. Inside the sphere there is a detailed, red and white dragon emerging from what appears to be water or waves made of crystal. The dragon has a majestic and mythical appearance, with flowing, spiky hair and a fierce expression. The background of the sphere shows a dynamic scene with splashing water and waves, adding to the sense of movement and energy. The sphere is mounted on a tree trunk, and there are pink flowers and green foliage surrounding it, enhancing the natural and mystical atmosphere of the scene.
scene_description: A cute monster with a rainbow-colored fur coat, a long tail, and big, round eyes. The monster is sitting on a cluttered shelf in a mysterious, dark environment. The shelf is surrounded by other colorful monsters, some of which are sleeping, while others are playing with toys. The environment is ultra-detailed, with intricate carvings on the walls and floors, and a perfect cinematic lighting that highlights every detail. The product photography is perfect, capturing every angle and texture of the monster and its surroundings. The environment is perfect for a horror movie, with a sense of foreboding and danger lurking around every corner. The scene is rendered in 8K, with every pixel perfectly defined and crisp.
scene_description: A (glistening crystal eye:1.3) is embedded in the bark of a soaked oak tree, its (surface refracting raindrops:1.0) like glass marbles on velvet. The forest around it is (bathed in muted blue lightning:0.8), and nearby crows sit motionless, their feathers (glowing faintly at the edges:0.5), as if time paused mid-rain
scene_description: A stunningly intricate mechanical steampunk timeart gold crow perches gracefully on a gnarled tree branch, its piercing gaze fixed directly upon the viewer, body with spinning gears. The creature's eyes are exquisitely detailed, reflecting the ethereal glow of a dark nebula that shimmers in the background. Soft bokeh adds an air of mystery to this captivating scene.
scene_description: The image depicts a fantastical scene set in a lush, enchanted forest. The focal point is a delicate, ethereal creature that appears to be a blend of plant and humanoid form. This creature has a translucent, almost glass-like body with intricate, vein-like patterns running throughout. Its limbs and torso are slender and elongated, giving it an otherworldly appearance. The creature's head is rounded with large, luminous yellow eyes that seem to glow softly, adding to its mystical aura. It has small, pointed ears or antennae protruding from its head, enhancing its otherworldly look. The creature's skin is a gradient of translucent hues, with subtle red accents at various points, particularly around its joints and the inside base of its body. Surrounding the creature are two large, glowing flowers that resemble lotus flowers. These flowers have a soft, ethereal light emanating from their centers, with petals that are a gradient of white to light pink. The flowers are connected to the creature by slender, elongated stems that are also translucent and veined. The background is a dense, verdant forest with a soft, diffused light filtering through the canopy, creating a mystical and serene atmosphere. The ground is covered in moss, small plants, and fallen leaves, adding to the natural, enchanted setting.
scene_description: A shadowy, humanoid figure floats above scorched earth. No armor, only black smoke and swirling darkness form its body. Occasional red or violet glows inside. Stormy sky above.
scene_description: Photograph of a big green orc in a heavy metal tshirt sitting in a rocking chair on a porch and playing a classical guitar,Photographed with a cinematic 50mm lens
scene_description: The image shows a tabby cat standing behind a wooden fence in a field. The cat is holding a bouquet of colorful flowers, including purple and white daisies, and some other small flowers. The background is a lush, green field with various wildflowers and plants.
'''

# Example motion prompts, one per line, each prefixed with "action_sequence: ".
# This is a plain (non-f) string on purpose: the text has no placeholders, and
# an f-string would make any literal "{" or "}" added to a prompt later either
# a syntax error or an accidental interpolation.
ACTION_SEQUENCES = '''action_sequence: The man dances energetically, leaping mid-air with fluid arm swings and quick footwork,
action_sequence: The girl dances gracefully, with clear movements, full of charm.
action_sequence: The girl suddenly took out a sign that said “cute” using right hand
action_sequence: The girl skateboarding, repeating the endless spinning and dancing and jumping on a skateboard, with clear movements, full of charm.
action_sequence: The man dances flamboyantly, swinging his hips and striking bold poses with dramatic flair.
action_sequence: The woman dances elegantly among the blossoms, spinning slowly with flowing sleeves and graceful hand movements.
action_sequence: The young man writes intensely, flipping papers and adjusting his glasses with swift, focused movements.
'''

In [None]:

def reset_memory(device):
    """Release cached memory and reset CUDA memory statistics.

    Python garbage collection always runs. The CUDA-specific steps are skipped
    when no CUDA device is available, and the per-device statistics are only
    reset when ``device`` actually refers to a CUDA device. The module falls
    back to ``device = "cpu"`` on machines without a GPU, and the
    ``torch.cuda.reset_*`` helpers raise when given a non-CUDA device.

    Args:
        device: Device whose statistics should be reset (string, index or
            ``torch.device``). ``None`` means the current CUDA device.
    """
    gc.collect()

    if not torch.cuda.is_available():
        # Nothing CUDA-related to clean up on a CPU-only machine.
        return

    torch.cuda.empty_cache()

    # The reset helpers reject non-CUDA devices, so only call them for CUDA.
    if device is None or torch.device(device).type == "cuda":
        torch.cuda.reset_peak_memory_stats(device)
        torch.cuda.reset_accumulated_memory_stats(device)

    torch.cuda.ipc_collect()

def load_models(model_type: str):
    """Assemble the HiDream image pipeline for the requested model variant.

    The Llama tokenizer/encoder named by ``LLAMA_MODEL_NAME`` are loaded first
    and plugged into the pipeline as ``tokenizer_4`` / ``text_encoder_4``. The
    transformer is loaded separately from the checkpoint's ``transformer``
    subfolder and then assigned onto the pipeline, after which sequential CPU
    offload is enabled.

    NOTE(review): the scheduler is always ``FlowUniPCMultistepScheduler``; the
    ``"scheduler"`` entry of the selected config is not consulted here.

    Args:
        model_type: Key into ``MODEL_CONFIGS``.

    Returns:
        ``(pipe, config)``: the ready pipeline and the config dict it was
        built from.

    Raises:
        KeyError: If ``model_type`` is not a key of ``MODEL_CONFIGS``.
    """
    settings = MODEL_CONFIGS[model_type]
    half_precision = torch.bfloat16

    llama_tokenizer = PreTrainedTokenizerFast.from_pretrained(LLAMA_MODEL_NAME)

    llama_encoder = LlamaForCausalLM.from_pretrained(
        LLAMA_MODEL_NAME,
        output_hidden_states=True,
        output_attentions=True,
        return_dict_in_generate=True,
        torch_dtype=half_precision,
        device_map="auto",
    )

    image_transformer = HiDreamImageTransformer2DModel.from_pretrained(
        settings["path"],
        subfolder="transformer",
        torch_dtype=half_precision,
    )

    checkpoint_path = settings["path"]
    flow_scheduler = FlowUniPCMultistepScheduler(
        num_train_timesteps=1000,
        shift=settings["shift"],
        use_dynamic_shifting=False,
    )
    pipe = HiDreamImagePipeline.from_pretrained(
        checkpoint_path,
        scheduler=flow_scheduler,
        tokenizer_4=llama_tokenizer,
        text_encoder_4=llama_encoder,
        torch_dtype=half_precision,
    )

    # Swap in the separately loaded transformer before enabling offload.
    pipe.transformer = image_transformer
    pipe.enable_sequential_cpu_offload()

    # Drop the local references; the pipeline keeps its own.
    del llama_tokenizer, llama_encoder, image_transformer

    return pipe, settings

@torch.inference_mode()
def generate_image(pipe: HiDreamImagePipeline, model_type: str, prompt: str, negative_prompt : str, resolution: tuple[int, int], seed: int):
    """Render one image with the HiDream pipeline.

    Args:
        pipe: Pipeline returned by ``load_models``.
        model_type: Key into ``MODEL_CONFIGS``; selects the guidance scale and
            the number of inference steps.
        prompt: Positive prompt. ``"masterpiece, "`` is prepended to it.
        negative_prompt: Negative prompt, forwarded by keyword.
        resolution: ``(height, width)`` in pixels.
        seed: RNG seed; ``-1`` draws a random seed in ``[0, MAX_SEED)``.

    Returns:
        ``(image, seed)``: the first generated image and the seed actually
        used, so a randomly drawn seed can be reproduced.
    """
    # Get configuration for current model
    prompt = "masterpiece, " + prompt
    config = MODEL_CONFIGS[model_type]
    guidance_scale = config["guidance_scale"]
    num_inference_steps = config["num_inference_steps"]

    # Parse resolution
    height, width = resolution

    # Handle seed
    if seed == -1:
        seed = torch.randint(0, MAX_SEED, (1,)).item()

    # Match the module's CPU fallback instead of hard-coding "cuda", which
    # fails outright on machines without a GPU.
    generator_device = "cuda" if torch.cuda.is_available() else "cpu"
    generator = torch.Generator(generator_device).manual_seed(seed)

    # negative_prompt must be passed by keyword: the second positional
    # parameter of the pipeline call is `prompt_2` (a secondary *positive*
    # prompt), so passing it positionally would feed the negative prompt in
    # as positive conditioning.
    images = pipe(
        prompt,
        negative_prompt=negative_prompt,
        height=height,
        width=width,
        guidance_scale=guidance_scale,
        num_inference_steps=num_inference_steps,
        num_images_per_prompt=1,
        generator=generator
    ).images

    return images[0], seed

@torch.no_grad()
def generate_video(
    text_encoder, text_encoder_2, image_encoder, vae, transformer, tokenizer, tokenizer_2, feature_extractor, high_vram,
    output_video: str,
    input_image: np.ndarray,
    prompt: str,
    n_prompt: str,
    seed: int,
    total_second_length: float,
    latent_window_size: int,
    steps: int,
    cfg: float,
    gs: float,
    rs: float,
    gpu_memory_preservation: int,
    use_teacache: bool,
    mp4_crf: int
) -> list[str]:
    """
    Synchronous video generation from a single start image.

    The clip is sampled in several sections. After each section the frames
    decoded so far are written to an MP4 inside ``outputs_folder`` and that
    path is appended to the returned list. After the last section the same
    frames are additionally written to ``output_video``; that path itself is
    NOT appended to the returned list.

    Any exception is caught here: the traceback is printed and the files
    written so far are returned, so the result can be empty.

    Args:
        text_encoder, text_encoder_2, tokenizer, tokenizer_2: Passed to
            ``encode_prompt_conds`` to embed ``prompt`` and ``n_prompt``.
        image_encoder, feature_extractor: Passed to ``hf_clip_vision_encode``.
        vae: Used with ``vae_encode`` / ``vae_decode``.
        transformer: Model handed to ``sample_hunyuan``.
        high_vram: When false, models are loaded to and unloaded from the GPU
            around each stage.
        output_video: Path the final MP4 is written to.
        input_image: Image array; unpacked as ``(H, W, C)``.
        prompt: Positive prompt.
        n_prompt: Negative prompt; only encoded when ``cfg != 1``.
        seed: Seed for the CPU ``torch.Generator`` used by the sampler.
        total_second_length: Requested clip length; together with
            ``latent_window_size`` it determines the number of sections.
        latent_window_size: Number of latent indices sampled per section.
        steps: Inference steps per section.
        cfg: Forwarded as ``real_guidance_scale``.
        gs: Forwarded as ``distilled_guidance_scale``.
        rs: Forwarded as ``guidance_rescale``.
        gpu_memory_preservation: ``preserved_memory_gb`` used when moving the
            transformer to the GPU in low-VRAM mode.
        use_teacache: Forwarded to ``transformer.initialize_teacache``.
        mp4_crf: ``crf`` value forwarded to ``save_bcthw_as_mp4``.

    Returns:
        Paths of the per-section MP4 files in the order they were written.
    """
    # compute sections
    # Number of sampling passes, never fewer than one. The factor 30 matches
    # the fps used when saving below.
    total_latent_sections = int(max(round((total_second_length * 30) / (latent_window_size * 4)), 1))
    # Timestamp-based id used to name every file written by this call.
    job_id = generate_timestamp()
    out_files: list[str] = []

    try:
        # unload if low VRAM
        if not high_vram:
            unload_complete_models(text_encoder, text_encoder_2, image_encoder, vae, transformer)

        # --- TEXT ENCODING ---
        if not high_vram:
            fake_diffusers_current_device(text_encoder, gpu)
            load_model_as_complete(text_encoder_2, target_device=gpu)

        llama_vec, clip_l_pooler = encode_prompt_conds(
            prompt, text_encoder, text_encoder_2, tokenizer, tokenizer_2
        )

        # With cfg == 1 the negative prompt is not encoded; zero tensors of
        # matching shape stand in for its embeddings.
        if cfg == 1:
            llama_vec_n = torch.zeros_like(llama_vec)
            clip_l_pooler_n = torch.zeros_like(clip_l_pooler)
        else:
            llama_vec_n, clip_l_pooler_n = encode_prompt_conds(
                n_prompt, text_encoder, text_encoder_2, tokenizer, tokenizer_2
            )

        # Bring both embeddings to a fixed length of 512 and obtain the masks
        # that are later passed to the sampler alongside them.
        llama_vec, llama_attention_mask     = crop_or_pad_yield_mask(llama_vec, length=512)
        llama_vec_n, llama_attention_mask_n = crop_or_pad_yield_mask(llama_vec_n, length=512)

        # --- IMAGE PREPROCESS & VAE ENCODE ---
        # Requires a 3-dimensional array; a 2-D (e.g. grayscale) input fails
        # on this unpacking.
        H, W, C = input_image.shape
        height, width = find_nearest_bucket(H, W, resolution=640)
        input_image_np = resize_and_center_crop(input_image, target_width=width, target_height=height)
        # Keep a copy of the preprocessed start image next to the videos.
        Image.fromarray(input_image_np).save(os.path.join(outputs_folder, f"{job_id}.png"))

        # Rescale values with x / 127.5 - 1 (0..255 -> -1..1), then reorder
        # H, W, C -> C, H, W and add a leading batch dim and a singleton
        # frames dim, giving shape [1, C, 1, H, W].
        input_image_pt = torch.from_numpy(input_image_np).float() / 127.5 - 1
        input_image_pt = input_image_pt.permute(2, 0, 1)[None, :, None, :, :]

        if not high_vram:
            load_model_as_complete(vae, target_device=gpu)
        start_latent = vae_encode(input_image_pt, vae)

        # --- CLIP VISION ENCODE ---
        if not high_vram:
            load_model_as_complete(image_encoder, target_device=gpu)
        clip_out = hf_clip_vision_encode(input_image_np, feature_extractor, image_encoder)
        image_encoder_last_hidden_state = clip_out.last_hidden_state

        # cast to transformer dtype
        llama_vec                         = llama_vec.to(transformer.dtype)
        llama_vec_n                       = llama_vec_n.to(transformer.dtype)
        clip_l_pooler                     = clip_l_pooler.to(transformer.dtype)
        clip_l_pooler_n                   = clip_l_pooler_n.to(transformer.dtype)
        image_encoder_last_hidden_state   = image_encoder_last_hidden_state.to(transformer.dtype)

        # --- PREPARE SAMPLING ---
        # A single CPU generator is shared by all sections.
        rnd = torch.Generator("cpu").manual_seed(seed)
        # Frame count requested from the sampler for each section.
        num_frames = latent_window_size * 4 - 3

        # Zero-initialised latent history kept on the CPU: 1 + 2 + 16 slots on
        # the time axis, spatial size height // 8 by width // 8. Newly sampled
        # latents are prepended to it after every section.
        history_latents = torch.zeros((1, 16, 1 + 2 + 16, height // 8, width // 8), dtype=torch.float32).cpu()
        history_pixels = None
        total_generated = 0

        # One padding value per section, counting down to 0; the section with
        # padding 0 is treated as the last one. For more than four sections
        # the schedule is replaced by [3, 2, ..., 2, 1, 0].
        latent_paddings = list(reversed(range(total_latent_sections)))
        if total_latent_sections > 4:
            latent_paddings = [3] + [2] * (total_latent_sections - 3) + [1, 0]

        # --- SAMPLING LOOP ---
        for pad in latent_paddings:
            is_last = (pad == 0)
            pad_size = pad * latent_window_size
            print(f"Section pad={pad}, is_last={is_last}")

            # Build one contiguous index range and cut it into consecutive
            # groups of sizes 1, pad_size, latent_window_size, 1, 2 and 16.
            # The blank (padding) group is only a spacer and is not passed on.
            indices = torch.arange(0, sum([1, pad_size, latent_window_size, 1, 2, 16])).unsqueeze(0)
            (pre_idx, blank_idx, latent_idx,
             post_idx, idx2x, idx4x
            ) = indices.split([1, pad_size, latent_window_size, 1, 2, 16], dim=1)
            clean_idx = torch.cat([pre_idx, post_idx], dim=1)

            # Conditioning latents: the encoded start image plus the first
            # 1 / 2 / 16 time slots of the current history.
            clean_pre_latents = start_latent.to(history_latents.device)
            post, mid2, mid4 = history_latents[:, :, :1 + 2 + 16, :, :].split([1, 2, 16], dim=2)
            clean_latents = torch.cat([clean_pre_latents, post], dim=2)

            if not high_vram:
                unload_complete_models()
                move_model_to_device_with_memory_preservation(
                    transformer, target_device=gpu, preserved_memory_gb=gpu_memory_preservation
                )

            transformer.initialize_teacache(enable_teacache=use_teacache, num_steps=steps)

            gen_latents = sample_hunyuan(
                transformer=transformer,
                sampler='unipc',
                width=width, height=height,
                frames=num_frames,
                real_guidance_scale=cfg,
                distilled_guidance_scale=gs,
                guidance_rescale=rs,
                num_inference_steps=steps,
                generator=rnd,
                prompt_embeds=llama_vec,
                prompt_embeds_mask=llama_attention_mask,
                prompt_poolers=clip_l_pooler,
                negative_prompt_embeds=llama_vec_n,
                negative_prompt_embeds_mask=llama_attention_mask_n,
                negative_prompt_poolers=clip_l_pooler_n,
                device=gpu, dtype=torch.bfloat16,
                image_embeddings=image_encoder_last_hidden_state,
                latent_indices=latent_idx,
                clean_latents=clean_latents,
                clean_latent_indices=clean_idx,
                clean_latents_2x=mid2,
                clean_latent_2x_indices=idx2x,
                clean_latents_4x=mid4,
                clean_latent_4x_indices=idx4x,
            )

            # In the final section the start-image latent is put in front of
            # the sampled latents on the time axis.
            if is_last:
                gen_latents = torch.cat([start_latent.to(gen_latents), gen_latents], dim=2)

            # New latents go in FRONT of the existing history (dim=2).
            total_generated += gen_latents.shape[2]
            history_latents = torch.cat([gen_latents.to(history_latents), history_latents], dim=2)

            if not high_vram:
                # NOTE(review): a fixed 8 GB is used here rather than
                # gpu_memory_preservation — confirm this is intended.
                offload_model_from_device_for_memory_preservation(
                    transformer, target_device=gpu, preserved_memory_gb=8
                )
                load_model_as_complete(vae, target_device=gpu)

            # Only the slots produced so far; the rest is the zero-initialised
            # tail of the history.
            real_latents = history_latents[:, :, :total_generated, :, :]

            if history_pixels is None:
                # First section: decode everything generated so far.
                history_pixels = vae_decode(real_latents, vae).cpu()
            else:
                # Later sections: decode only the leading slice of the latents
                # and merge it with the frames decoded earlier.
                section_len = (latent_window_size * 2 + 1) if is_last else (latent_window_size * 2)
                overlap = latent_window_size * 4 - 3
                curr_pixels = vae_decode(real_latents[:, :, :section_len], vae).cpu()
                history_pixels = soft_append_bcthw(curr_pixels, history_pixels, overlap)

            if not high_vram:
                unload_complete_models()

            # Write the video as it stands after this section.
            out_name = os.path.join(outputs_folder, f"{job_id}_{total_generated}.mp4")
            save_bcthw_as_mp4(history_pixels, out_name, fps=30, crf=mp4_crf)
            out_files.append(out_name)

            print(f"Saved: {out_name}")
            if is_last:
                # Same frames written again to the caller's destination; this
                # path is not appended to out_files.
                out_name = output_video
                save_bcthw_as_mp4(history_pixels, out_name, fps=30, crf=mp4_crf)
                print(f"Saved last: {out_name}")
                break

    except Exception:
        # Best effort: report the failure, release the models in low-VRAM
        # mode, and fall through to return whatever was written so far.
        traceback.print_exc()
        if not high_vram:
            unload_complete_models(text_encoder, text_encoder_2, image_encoder, vae, transformer)

    return out_files

def synthesize_videos(text_encoder, text_encoder_2, image_encoder, vae, transformer, tokenizer, tokenizer_2, feature_extractor, high_vram,
                      output_video: str, input_image: str, prompt: str, total_second_length: float):
    """
    Generates one video clip from a still image by delegating to generate_video.

    The image at ``input_image`` is opened with PIL and converted to a NumPy
    array, then passed to ``generate_video`` together with the already-loaded
    models, a fresh random seed and a fixed set of sampling options.

    Returns the last entry of the list returned by ``generate_video``, or
    ``None`` when that list is empty.

    NOTE(review): the visible tail of generate_video appends only the
    per-section preview files to the returned list, not ``output_video``, so
    the value returned here looks like the last preview path - confirm.
    """
    frame = np.array(Image.open(input_image))

    # Fixed sampling configuration; only the seed changes between calls.
    sampling_options = dict(
        n_prompt="",
        seed=random.randrange(0, 2**31),
        latent_window_size=9,
        steps=25,
        cfg=1.0,
        gs=10.0,
        rs=0.0,
        gpu_memory_preservation=6,
        use_teacache=False,
        mp4_crf=1,
    )

    rendered = generate_video(
        text_encoder, text_encoder_2, image_encoder, vae, transformer, tokenizer, tokenizer_2, feature_extractor, high_vram,
        output_video=output_video,
        input_image=frame,
        prompt=prompt,
        total_second_length=total_second_length,
        **sampling_options,
    )

    if not rendered:
        return None
    return rendered[-1]

def get_openai_prompt_response(
    prompt: str,
    config: dict,
    max_tokens: int = 6000,
    temperature: float = 0.33,
    openai_model: str = "",
):
    """
    Sends a prompt to OpenAI's chat completions API and returns the reply text.

    The request itself is retried: every attempt issues a new API call, so
    transient request failures are covered by the retry limit.

    Args:
        prompt: User message sent to the model.
        config: Must contain "openai_api_key" and "retry_limit"; "openai_model"
            is read when ``openai_model`` is empty.
        max_tokens: Completion token limit forwarded to the API.
        temperature: Sampling temperature forwarded to the API.
        openai_model: Model name; falls back to config["openai_model"].

    Returns:
        The message content of the first choice, or "" when every attempt
        failed or the model returned no content.
    """
    client = OpenAI(api_key=config["openai_api_key"])
    model_name = openai_model or config["openai_model"]
    messages = [
        {
            "role": "system",
            "content": """Act as a helpful assistant, you are an expert editor.""",
        },
        {"role": "user", "content": prompt},
    ]

    # Always make at least one attempt, even if the configured limit is 0.
    max_attempts = max(1, config["retry_limit"])

    for attempt in range(1, max_attempts + 1):
        try:
            # The API call lives inside the try so that request errors are
            # retried too, not just failures while reading the response.
            response = client.chat.completions.create(
                max_tokens=max_tokens,
                messages=messages,
                model=model_name,
                temperature=temperature,
            )
            message_content = response.choices[0].message.content
            # Callers treat the result as a string; never hand back None.
            return message_content if message_content is not None else ""
        except Exception as e:
            print(f"Error occurred: {e}")
            if attempt == max_attempts:
                print("Retry limit reached. Moving to the next iteration.")
                return ""
            print(f"Retrying... (Attempt {attempt}/{max_attempts})")
            time.sleep(1)  # brief pause before the next attempt

    return ""

def create_scenes(text: str, video_summary: str, config: dict):
    """
    Asks the OpenAI model to group the transcribed text into scenes.

    Args:
        text: Transcript lines to be grouped; interpolated into the prompt.
        video_summary: Accepted for interface compatibility.
            NOTE(review): it is not interpolated into the prompt below -
            confirm whether the summary was meant to be part of the request.
        config: Passed to get_openai_prompt_response; "openai_model" is read here.

    Returns:
        The parsed JSON reply (the prompt asks for a list of objects with the
        fields start, text, scene_description and action_sequence).

    Raises:
        ValueError: if the reply is empty or is not valid JSON
            (json.JSONDecodeError is a ValueError subclass).
    """
    # The prompt also reads the module-level SCENE_DESCRIPTIONS,
    # ACTION_SEQUENCES and THEME values.
    prompt = f'''Create a json list of diverse, unique scenes (groupings of text), scene_description, and action_sequence (100 words or less) from the following text.  Scenes should be groups of lyrics with new scenes when the lyric context changes.  Text: {text}   
The json list should have the start value for the first item in the scene and the text that is combined for all items in the same scene.  
The scene_description should include A vivid, sensory-rich, and unique visual description (100 words or less), including:
  - Attire, setting, mood, lighting, and composition
  - Artistic style (e.g., graphic novel, watercolor, oil painting, surrealist CGI)
  - Powerful symbolism and metaphor; religious/spiritual analogies when relevant
  - If people are present, focus on a closeup of a single anthropomorphic or highly stylized character, never multiple people in a closeup
  - Favor semi-nude (e.g., sheer, translucent), natural forms, or artistically beautiful horror, as contextually appropriate
  - Emphasize strong emotion, artistic genius, spiritual energy, and grandeur
  - Special effects and CGI (glowing eyes, magical energy, electric auras, shifting reality, surreal environments, etc.)
  - Unexpected, awe-inspiring, or haunting elements; scenes should be visually striking and imaginative
  - No cliché or generic visual tropes; each scene must stand out and avoid repetition with others
  
The action_sequence should Describe slow, cinematic, and poetic actions (one or two sentences):
  - Avoid sudden or fast movement, abrupt zooms, walking, dancing, shopping, or human group behaviors
  - Action should be minimal, atmospheric, and enhance the emotional tone; favor symbolic gestures, stillness, or gradual transformations
  - No scene transitions; each scene is self-contained
  
These are example scene_descriptions: {SCENE_DESCRIPTIONS}
These are example action_sequences: {ACTION_SEQUENCES}  
The desired general theme or style is: {THEME}
Return only the json list, less jargon. The json list fields should be: start, text, scene_description, action_sequence'''

    result = get_openai_prompt_response(prompt, config, openai_model=config["openai_model"], temperature=0.85)

    # Strip markdown code fences, a leading "json" language tag and all
    # newlines. Removing "```" first already covers "```json" fences, so no
    # separate replacement for that form is needed.
    cleaned = result.replace("```", "").replace("json\n", "").replace("\n", "")
    if not cleaned.strip():
        # get_openai_prompt_response returns "" once its retries are exhausted.
        raise ValueError("OpenAI returned an empty response; no scenes to parse")

    scenes = json.loads(cleaned)
    return scenes

def revise_scenes(scenes, config: dict):
    """
    Asks the OpenAI model to rewrite scene_description and action_sequence.

    Args:
        scenes: JSON-serialisable scene list (as produced by create_scenes).
        config: Passed to get_openai_prompt_response; "openai_model" is read here.

    Returns:
        The parsed JSON reply (the prompt asks for the same list with the
        fields start, text, scene_description and action_sequence).

    Raises:
        ValueError: if the reply is empty or is not valid JSON
            (json.JSONDecodeError is a ValueError subclass).
        TypeError: if ``scenes`` cannot be serialised to JSON.
    """
    # Send real JSON: interpolating the list directly would embed its Python
    # repr (single quotes, True/None) under the "JSON scenes" label.
    scenes_json = json.dumps(scenes, ensure_ascii=False)

    # The prompt also reads the module-level THEME value.
    prompt = f'''Revise the JSON scenes to update the scene_description and action_sequence to engage the senses and imagination, suitable for creating a stunning, cinematic video experience.  We want unique scenes, even ones in the same sequence. Use descriptions of special effects in the scenes.  JSON scenes: {scenes_json}   
The scene_description should include A vivid, sensory-rich, and unique visual description (100 words or less), including:
  - Attire, setting, mood, lighting, and composition
  - Artistic style (e.g., graphic novel, watercolor, oil painting, surrealist CGI)
  - Powerful symbolism and metaphor; religious/spiritual analogies when relevant
  - If people are present, focus on a closeup of a single anthropomorphic or highly stylized character, never multiple people in a closeup
  - Favor semi-nude (e.g., sheer, translucent), natural forms, or artistically beautiful horror, as contextually appropriate
  - Emphasize strong emotion, artistic genius, spiritual energy, and grandeur
  - Special effects and CGI (glowing eyes, magical energy, electric auras, shifting reality, surreal environments, etc.)
  - Unexpected, awe-inspiring, or haunting elements; scenes should be visually striking and imaginative
  - No cliché or generic visual tropes; each scene must stand out and avoid repetition with others
  
The action_sequence should Describe slow, cinematic, and poetic actions (one or two sentences):
  - Avoid sudden or fast movement, abrupt zooms, walking, dancing, shopping, or human group behaviors
  - Action should be minimal, atmospheric, and enhance the emotional tone; favor symbolic gestures, stillness, or gradual transformations
  
Only update the scene_description and action_sequence. We do not want to have similar scene_descriptions and action_sequences for consecutive scenes, we want unique scenes that tell a brilliant, cohesive story.  Please update the scene_description and action_sequence to be different, creative, and consistent.  
Do not delete any items as having scenes with the given start times are important. 
The desired general theme or style is: {THEME}
Return only the json list, less jargon. The json list fields should be: start, text, scene_description, action_sequence'''

    result = get_openai_prompt_response(prompt, config, openai_model=config["openai_model"], temperature=0.33)

    # Strip markdown code fences, a leading "json" language tag and all
    # newlines. Removing "```" first already covers "```json" fences, so no
    # separate replacement for that form is needed.
    cleaned = result.replace("```", "").replace("json\n", "").replace("\n", "")
    if not cleaned.strip():
        # get_openai_prompt_response returns "" once its retries are exhausted.
        raise ValueError("OpenAI returned an empty response; no revised scenes to parse")

    revised = json.loads(cleaned)
    return revised

def get_audio_duration(audio_file):
    """Return the ``duration`` attribute (seconds) of an AudioFileClip opened on ``audio_file``."""
    # The clip is used as a context manager so it is closed before returning.
    with AudioFileClip(audio_file) as clip:
        seconds = clip.duration
    return seconds

def _split_long_scenes(scenes, last_end_value, max_duration_seconds):
    """Return copies of ``scenes`` split so that none spans more than ``max_duration_seconds``.

    A scene ends where the next one starts (the last one ends at
    ``last_end_value``). The first scene is always treated as starting at 0.
    Scenes whose computed duration is not positive are dropped.
    """
    split_scenes = []
    for i, scene in enumerate(scenes):
        if i == 0:
            start_time = 0
        else:
            start_time = scene['start']
            print(f'start_time: {start_time}')

        end_time = scenes[i + 1]['start'] if i < len(scenes) - 1 else last_end_value

        # Emit full-length pieces while more than one maximum-length span remains.
        while end_time - start_time > max_duration_seconds:
            piece = scene.copy()
            piece['start'] = start_time
            split_scenes.append(piece)
            start_time += max_duration_seconds

        # Emit whatever is left of the scene.
        if end_time - start_time > 0:
            piece = scene.copy()
            piece['start'] = start_time
            split_scenes.append(piece)
    return split_scenes


def process_audio_scenes(audio_file: str, config: dict):
    """
    Processes a single audio file into a list of scenes.

    Steps: create per-run output directories, transcribe the audio with
    Whisper, ask OpenAI for a summary and a scene list, split scenes that are
    too long, ask OpenAI to revise the scenes, and save them to scenes.json.

    Args:
        audio_file: Path of the audio file.
        config: Must contain "base_working_dir", "base_video_dir" and the keys
            needed by the OpenAI helpers.

    Returns:
        Tuple ``(scenes, audio_images_dir, audio_videos_dir, last_end_value,
        timestamp)``. ``scenes`` is "" when scene creation failed on every
        attempt; ``last_end_value`` is the audio duration as a float.
    """
    # Maximum duration (seconds) a single starting image may cover; should be
    # a multiple of the video generation length.
    max_duration_seconds = 24
    create_attempts = 3
    revise_attempts = 2

    # Create unique identifier based on audio file name
    audio_basename = os.path.splitext(os.path.basename(audio_file))[0]
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    unique_id = f"{audio_basename}_{timestamp}"

    # Create unique directories for images and videos
    print("Create unique directories for images and videos")
    audio_images_dir = os.path.join(config["base_working_dir"], unique_id)
    audio_videos_dir = os.path.join(config["base_video_dir"], unique_id)
    os.makedirs(audio_images_dir, exist_ok=True)
    os.makedirs(audio_videos_dir, exist_ok=True)

    last_end_value = float(get_audio_duration(audio_file))

    # Reuse an existing scenes.json. This is checked before transcription
    # because the cached result does not need the Whisper output. The
    # directory name embeds a to-the-second timestamp, so the file normally
    # does not exist yet.
    scenes_file_path = os.path.join(audio_images_dir, "scenes.json")
    if os.path.exists(scenes_file_path):
        print(f"Scenes file already exists at {scenes_file_path}. Skipping scene generation.")
        with open(scenes_file_path, "r", encoding="utf-8") as scenes_file:
            scenes = json.load(scenes_file)
        return scenes, audio_images_dir, audio_videos_dir, last_end_value, timestamp

    # Step 1: Transcribe audio using Whisper
    print("Transcribe audio using Whisper")
    model = whisper.load_model("turbo")
    result = model.transcribe(audio_file)

    # Drop the Whisper model before the next models are loaded.
    del model
    reset_memory(device)

    # One "Start: <time>, Text: <text>" line per transcribed segment.
    text = "".join(
        f"Start: {segment['start']}, Text: {segment['text'].strip()}\n"
        for segment in result['segments']
    )

    # Step 2: Generate video summary using OpenAI
    # NOTE(review): create_scenes receives this summary but, as written in
    # this file, does not use it in its prompt - confirm it is still needed.
    print("Generate video summary using OpenAI")
    video_summary_prompt = f'Create a short summary that describes a music video based on these lyrics: {text}'
    video_summary = get_openai_prompt_response(video_summary_prompt, config, openai_model=config["openai_model"])

    # Step 3: Create scenes based on lyrics (the reply may not parse, so retry)
    print("Create scenes based on lyrics")
    for attempt in range(1, create_attempts + 1):
        try:
            scenes = create_scenes(text, video_summary, config)
        except Exception as exc:
            print(f"create_scenes failed (attempt {attempt}/{create_attempts}): {exc}")
        else:
            break
    else:
        # Every attempt failed: keep the historical "" marker for "no scenes".
        return "", audio_images_dir, audio_videos_dir, last_end_value, timestamp

    # We don't want scenes longer than max_duration_seconds.
    scenes = _split_long_scenes(scenes, last_end_value, max_duration_seconds)

    # Improve the scenes with a revision; keep the unrevised list on failure.
    for attempt in range(1, revise_attempts + 1):
        try:
            scenes = revise_scenes(scenes, config)
        except Exception as exc:
            print(f"revise_scenes failed (attempt {attempt}/{revise_attempts}): {exc}")
        else:
            print('revised scenes')
            break
    else:
        print('cannot revise scenes')

    # Save the scenes to scenes.json
    with open(scenes_file_path, "w", encoding="utf-8") as scenes_file:
        json.dump(scenes, scenes_file)

    return scenes, audio_images_dir, audio_videos_dir, last_end_value, timestamp


def process_audio_images(config: dict, scenes, audio_images_dir):
    """
    Generates one starting image per scene with the HiDream pipeline.

    Images are written to ``audio_images_dir`` as image_01.jpg, image_02.jpg,
    ... in scene order. The prompt for each image is the module-level THEME
    followed by the scene's "scene_description".

    Args:
        config: Unused here; kept for interface compatibility.
        scenes: Iterable of scene dicts with a "scene_description" key. An
            empty value (including "") means there is nothing to render.
        audio_images_dir: Existing directory that receives the images.
    """
    if not scenes:
        # Avoid loading the pipeline when there is nothing to generate.
        print("No scenes available; skipping image generation")
        return

    # Step 4: Load HiDream pipeline and generate images
    print("Load HiDream pipeline and generate images")
    pipe, _ = load_models(model_type)
    resolution = (HEIGHT, WIDTH)
    # NOTE(review): seed starts at -1 and is then replaced by the value
    # generate_image returns, so later scenes are called with that returned
    # seed rather than -1 - confirm this reuse is intended.
    seed = -1
    negative_prompt = "worst quality, low quality, worst aesthetic, old, blurry, lowres, signature, artist name, watermark, username, sketch, logo, furry, text, speech bubble"

    try:
        # Generate images for each scene
        for image_num, scene in enumerate(scenes, start=1):
            image_prompt = THEME+". "+scene['scene_description']
            image, seed = generate_image(pipe, model_type, image_prompt, negative_prompt, resolution, seed)
            filename = f"image_{image_num:02d}.jpg"
            image_path = os.path.join(audio_images_dir, filename)
            image.save(image_path, dpi=(300, 300))
    finally:
        # Drop the pipeline reference and reset memory even if a scene failed.
        del pipe
        reset_memory(device)
    return

def process_audio_video(config: dict, scenes, audio_images_dir, audio_videos_dir, last_end_value, timestamp):
    """
    Loads the video models and renders one clip per scene.

    For scene number k (1-based) the starting image is
    ``<audio_images_dir>/image_<kk>.jpg`` and the clip is written to
    ``<audio_videos_dir>/v_fpk_<kk>_<k>_<timestamp>.mp4``. A clip lasts until
    the next scene's "start" (the last one until ``last_end_value``); the
    first scene is treated as starting at 0.

    Args:
        config: Unused here; kept for interface compatibility.
        scenes: Sequence of scene dicts with "action_sequence" and "start".
            An empty value (including "") means there is nothing to render.
        audio_images_dir: Directory holding the per-scene starting images.
        audio_videos_dir: Directory that receives the clips.
        last_end_value: End time of the last scene.
        timestamp: String embedded in the clip file names.
    """
    if not scenes:
        # Avoid loading every model when there is nothing to render.
        print("No scenes available; skipping video generation")
        return

    # Step 6: Load Video Pipeline
    print("Load Video Pipeline")
    # check GPU memory & decide offload strategy
    free_mem_gb = get_cuda_free_memory_gb(gpu)
    high_vram = free_mem_gb > 60

    # load all models & tokenizers (initially on the CPU)
    hunyuan_repo = "hunyuanvideo-community/HunyuanVideo"
    redux_repo = "lllyasviel/flux_redux_bfl"
    text_encoder = LlamaModel.from_pretrained(
        hunyuan_repo, subfolder="text_encoder", torch_dtype=torch.float16).cpu()
    text_encoder_2 = CLIPTextModel.from_pretrained(
        hunyuan_repo, subfolder="text_encoder_2", torch_dtype=torch.float16).cpu()
    tokenizer = LlamaTokenizerFast.from_pretrained(hunyuan_repo, subfolder="tokenizer")
    tokenizer_2 = CLIPTokenizer.from_pretrained(hunyuan_repo, subfolder="tokenizer_2")
    vae = AutoencoderKLHunyuanVideo.from_pretrained(
        hunyuan_repo, subfolder="vae", torch_dtype=torch.float16).cpu()
    feature_extractor = SiglipImageProcessor.from_pretrained(redux_repo, subfolder="feature_extractor")
    image_encoder = SiglipVisionModel.from_pretrained(
        redux_repo, subfolder="image_encoder", torch_dtype=torch.float16).cpu()
    transformer = HunyuanVideoTransformer3DModelPacked.from_pretrained(
        "lllyasviel/FramePackI2V_HY", torch_dtype=torch.bfloat16).cpu()

    # set eval & dtypes
    for model in (text_encoder, text_encoder_2, image_encoder, vae, transformer):
        model.eval()
        model.requires_grad_(False)
    # A for-loop target stays bound after the loop; drop it so it does not
    # keep the transformer alive past the explicit `del` during clean-up.
    del model

    transformer.high_quality_fp32_output_for_inference = True
    transformer.to(dtype=torch.bfloat16)
    vae.to(dtype=torch.float16)
    image_encoder.to(dtype=torch.float16)
    text_encoder.to(dtype=torch.float16)
    text_encoder_2.to(dtype=torch.float16)

    if high_vram:
        # Enough free memory: keep every model on the GPU.
        text_encoder.to(gpu)
        text_encoder_2.to(gpu)
        image_encoder.to(gpu)
        vae.to(gpu)
        transformer.to(gpu)
    else:
        vae.enable_slicing()
        vae.enable_tiling()
        DynamicSwapInstaller.install_model(transformer, device=gpu)
        DynamicSwapInstaller.install_model(text_encoder, device=gpu)

    try:
        # Step 7: Generate video sequences
        scene_count = len(scenes)
        for video_num, scene in enumerate(scenes, start=1):
            prompt = scene["action_sequence"]

            # Use the initial image generated for this scene
            image_input = os.path.join(audio_images_dir, f"image_{video_num:02d}.jpg")

            # A scene runs until the next scene starts; the last one runs to
            # the end of the audio.
            if video_num < scene_count:
                next_start_time = scenes[video_num]["start"]
            else:
                next_start_time = last_end_value

            # The first scene is treated as starting at 0.
            if video_num == 1:
                duration = next_start_time
            else:
                duration = next_start_time - scene["start"]

            video_name = f"v_fpk_{video_num:02d}_{video_num}_{timestamp}.mp4"
            video_output_path = os.path.join(audio_videos_dir, video_name)
            synthesize_videos(text_encoder, text_encoder_2, image_encoder, vae, transformer, tokenizer, tokenizer_2, feature_extractor, high_vram,
                video_output_path, image_input, prompt, duration)
            time.sleep(1)  # Pause for 1 second
    finally:
        # Clean up even when a scene failed.
        # 1) Offload / unload all models from GPU
        unload_complete_models(
            text_encoder,
            text_encoder_2,
            image_encoder,
            vae,
            transformer
        )

        # 2) Drop the local references
        del text_encoder, text_encoder_2, tokenizer, tokenizer_2
        del vae, feature_extractor, image_encoder, transformer

        # 3) Force GC + CUDA clean-up
        reset_memory(gpu)

    return

def process_all_audios(audio_file, config: dict):
    """
    Runs scene extraction and starting-image generation for one audio file.

    Calls process_audio_scenes, prints the resulting scenes, then calls
    process_audio_images on them.

    Returns the tuple ``(config, scenes, audio_images_dir, audio_videos_dir,
    last_end_value, timestamp)``.
    """
    print(f"Processing audio file: {audio_file}")

    scene_info = process_audio_scenes(audio_file, config)
    scenes, audio_images_dir, audio_videos_dir, last_end_value, timestamp = scene_info

    scene_count = len(scenes)
    scene_dump = json.dumps(scenes, indent=4)
    print(f'{scene_count} scenes:\n{scene_dump}')
    print(f'last_end_value: {last_end_value} timestamp: {timestamp}')

    # Create starting images for scenes
    process_audio_images(config, scenes, audio_images_dir)

    return config, scenes, audio_images_dir, audio_videos_dir, last_end_value, timestamp

def create_video(config, audio_file=None):
    """
    Runs the full pipeline (scenes, images, video clips) for one audio file.

    Args:
        config: Pipeline configuration passed to the processing steps.
        audio_file: Path of the audio file to process. When omitted, the
            module-level ``audio_file`` variable is used, which keeps existing
            ``create_video(config)`` calls inside a ``for audio_file in ...``
            loop working.

    Raises:
        NameError: if ``audio_file`` is omitted and no module-level
            ``audio_file`` exists.
    """
    if audio_file is None:
        # Backward compatibility: earlier versions read the global directly.
        try:
            audio_file = globals()["audio_file"]
        except KeyError:
            raise NameError(
                "name 'audio_file' is not defined; pass audio_file to create_video()"
            ) from None

    config, scenes, audio_images_dir, audio_videos_dir, last_end_value, timestamp = process_all_audios(audio_file, config)
    process_audio_video(config, scenes, audio_images_dir, audio_videos_dir, last_end_value, timestamp)
    return

In [None]:
# Notebook entry point: run the full pipeline once per configured audio file.
# NOTE: the loop variable must stay named `audio_file`; create_video(CONFIG) is
# called without a path and picks up the module-level `audio_file` bound here.
for audio_file in CONFIG["audio_files"]:
    create_video(CONFIG)
    # reset_memory is defined elsewhere in the file; presumably it frees cached
    # memory between files - confirm against its definition.
    reset_memory(device)
