In [1]:
import os
os.environ["XFORMERS_IGNORE_FLASH_VERSION_CHECK"] = "1"

import cv2
import einops
import gc
import imageio
import imageio_ffmpeg
import json
import math
import moviepy as mp
import numpy as np
import random
import safetensors.torch as sf
import secrets
import time
import torch
import traceback
import transformers
import whisper


from datetime import datetime
from diffusers import AutoencoderKLHunyuanVideo
from diffusers import ZImagePipeline
from diffusers_helper.bucket_tools import find_nearest_bucket
from diffusers_helper.clip_vision import hf_clip_vision_encode
from diffusers_helper.hunyuan import encode_prompt_conds, vae_decode, vae_encode, vae_decode_fake
from diffusers_helper.memory import cpu, gpu, get_cuda_free_memory_gb, move_model_to_device_with_memory_preservation, offload_model_from_device_for_memory_preservation
from diffusers_helper.memory import fake_diffusers_current_device, DynamicSwapInstaller, unload_complete_models, load_model_as_complete
from diffusers_helper.models.hunyuan_video_packed import HunyuanVideoTransformer3DModelPacked
from diffusers_helper.pipelines.k_diffusion_hunyuan import sample_hunyuan
from diffusers_helper.thread_utils import AsyncStream, async_run
from diffusers_helper.utils import save_bcthw_as_mp4, crop_or_pad_yield_mask, soft_append_bcthw, resize_and_center_crop, state_dict_weighted_merge, state_dict_offset_merge, generate_timestamp
from moviepy.audio.io.AudioFileClip import AudioFileClip
from openai import OpenAI
from PIL import Image
from transformers import LlamaForCausalLM, PreTrainedTokenizerFast
from transformers import LlamaModel, CLIPTextModel, LlamaTokenizerFast, CLIPTokenizer
from transformers import SiglipImageProcessor, SiglipVisionModel
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline

# Number of attempts allowed for retried operations; exposed to later cells
# through CONFIG["retry_limit"].
retry_limit = 3

# Largest value a signed 32-bit integer can hold; exposed through
# CONFIG["MAX_SEED"] (presumably the upper bound for random seeds — confirm
# against the cells that draw seeds).
MAX_SEED = np.iinfo(np.int32).max

# Working tensor dtype, exposed through CONFIG["dtype"].
dtype = torch.bfloat16

# Use the first CUDA device when PyTorch reports one, otherwise fall back to CPU.
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"

Multiple distributions found for package optimum. Picked distribution: optimum
Skipping import of cpp extensions due to incompatible torch version 2.7.1+cu128 for torchao version 0.14.1+cu128             Please see https://github.com/pytorch/ao/issues/2919 for more info


Currently enabled native sdp backends: ['flash', 'math', 'mem_efficient', 'cudnn']
Xformers is installed!
Flash Attn is installed!
Sage Attn is not installed!


In [2]:
# Theme string for this run (how it is consumed is outside this cell).
THEME = "3D render animation, 3D CGI special effects, storm silenced hearts"

# Run-wide settings shared by the cells below.
#
# Credentials are read from the environment (OPENAI_API_KEY / HF_TOKEN) so
# they never need to be pasted into the notebook and risk being committed.
# When a variable is unset the value is the empty string, matching the
# previous placeholder behaviour.
CONFIG = {
    "openai_api_key": os.environ.get("OPENAI_API_KEY", ""),
    "openai_model": "gpt-4.1-nano",
    "openai_model_small_reasoning": "gpt-5-nano",
    "openai_model_large": "gpt-5-nano",
    "hf_token": os.environ.get("HF_TOKEN", ""),
    # Directory names suggest working images and finished videos; both are
    # created below if missing.
    "base_working_dir": "./images",
    "base_video_dir": "./output",
    # NOTE(review): machine-specific absolute path; adjust per environment.
    "audio_files": [
        "/mnt/d/Share/Audio/StormSilencedHearts.mp3",
    ],
    # Runtime values computed in the first cell, re-exported for convenience.
    "device": device,
    "dtype": dtype,
    "retry_limit": retry_limit,
    "MAX_SEED": MAX_SEED,
}

# Presumably the output frame size in pixels — TODO confirm against the
# generation cells.
HEIGHT = 544
WIDTH = 960

MODEL_PREFIX = "azaneko"
# Looks like a Hugging Face model id for a local Llama model — confirm usage.
LLAMA_MODEL_NAME = "hugging-quants/Meta-Llama-3.1-8B-Instruct-GPTQ-INT4"

# Scratch folder for temporary outputs; created eagerly so later writes succeed.
outputs_folder = './temp_outputs/'
os.makedirs(outputs_folder, exist_ok=True)

# Ensure base directories exist
for _base_dir_key in ("base_working_dir", "base_video_dir"):
    os.makedirs(CONFIG[_base_dir_key], exist_ok=True)

SCENE_DESCRIPTIONS = '''scene_description: boris vallejo style, frank frazetta style, 8k high quality digital painting, masterpiece, very detailed, ultra realistic, (best quality) very detailed epic masterpiece, detailed face, full body, wrinkly wizard toad reading an ancient scroll in a swamp, best quality, epic scene, Dungeons and dragons atmosphere, heroic fantasy, realistic, realism, full body
scene_description: time travel, holding coffee,, hdr, 8k, absurdres, shiny, outdoors, reflection, blurry, blurry background, tokyo lights,tokyo street, neon lights, cyberpunk, high-contrast lighting, intricate details, vibrant colors, reflective surfaces, futuristic urban environment, glowing neon signs, cybernetic enhancements, punk aesthetic, dynamic pose, dynamic composition, depth of field, dark_theme, detailed backgroud, foreshortening, blurry edges, vignetting
scene_description: hyper realistic, a majestic A gemstone stag slowly blooming into life, moss and flowers sprouting from cracks in its crystalline body as it awakens, its eyes, initially dull stones, begin to glow with an inner emerald light, dawn light filters through a forest, illuminating the stags nascent awakening, wide shot capturing the stags full form and the blooming flora with golden antlers standing in a sunlit clearing, surrounded by ethereal forest spirits, glowing flora, magical atmosphere, extremely high-resolution details, photographic, realism pushed to extreme, fine texture, incredibly lifelike, Cinematic, beautiful, vibrant, masterpiece, 32k, ultra HD, ultra-detailed, amazing quality, amazing artist, sharp edges, detailed textures, full view, atmospheric lighting, amazing visuals
scene_description: impressive and grotesque scenery on a distant world, famous artwork inspired by jordan grimmer, dramatic scene, fractal art, 1990s fantasy style, dynamic angle, this image shows the enormous transcendent hydra-like beast known as the powerful jadesnap as it wriggleflomps from the izzled depths of a churning, otherworldly glowing sea under a dramatic tempestuous sky, it is surrounded by the typical jagged fractal rock formations on the crinkled shore of the water-rich planet "zoffeldirly quartus," superstitious life forms call it "the bringer of lost keys", creature focus, very aesthetic, extremely detailed, ultra high resolution, 8k, 4k, harmonising colors, light beige and chartreuse and bordeaux red and indigo blue and byzantium purple and ebony black, ovg, in the style of ck-ovf, amanoer, arsmjstyle, dnddarkestfantasy, aidmafluxpro1.1
scene_description: A realistic toilet, completely engulfed in flames, inferno, blazing, concept art, masterpiece, perfect lighting, purple and pink flames, realistic flames, 8k, absurdres, massive fire - rendered in the highest quality, realistic bathroom background, A3ther
scene_description: intricate linework with expressive contrasts, soft lighting with dynamic highlights, young woman wearing flight goggles, aviator leather jacket, long loose platinum hair, standing next to a 1930s biplane on an airstrip surrounded by tropical jungle, sunset, in orange hues
scene_description: 1 girl , ghost girl , grave stone , hugging , kneeling , tear , raining , masterpiece, best quality, good quality, very awa, newest, highres, absurdres
scene_description: A digital art splash in the style of bo-cyborgsplash, a mysterious raven character positioned in the center of the frame, directly facing the viewer, the raven's upper body is close to the camera, showcasing its dark, ornate attire adorned with intricate details and glowing purple gemstones, its long beak is visible, and its piercing pink eyes seem to lock onto the viewer with a sense of intensity, the background is a dark, neon-lit space with glowing elements, creating a mystical atmosphere, the character is adorned with numerous gemstone necklaces, adding a touch of opulence to the overall design, the overall effect is one of intrigue and mystery, a fantastic abstract colorful art splash, high quality, ultra detailed
scene_description: anthropomorphic corgi knight, corgi head, on one knee, planted sword, holding sword, plate armor, scowl, v-shaped eyebrows ,cloudy, godrays, sunshine, riverbank background, holy halo ,wide shot, depth of field, realism, no humans, animal focus, corpses,covered in blood, battlefield
scene_description: 1girl,solo,furry,pink fur,tail,ears,source_furry, child, kid, masterpiece, best quality, long hair, twin braids, farmer outfit,(steampunk), goggles, googles on head, bag, wheat field, outdoors, wind, accordion, holding instrument, playing instrument, gloves, wheat, sunset, farm, house, scenery, landscape, blurry, blurry background, looking at viewer, smile
scene_description: masterpiece, best quality, amazing quality, solo, sitting, no humans, glowing, wariza, robot, science fiction, on floor, electricity, cable, joints, robot joints, damaged, mechanical parts, wire, humanoid robot, screw, bolt, Countless lightning, electric shock, open mouth, Open five fingers,hands up, Bent back, Low Angle
scene_description: cinema scene, photograph, 4k photorealistic, beautiful sexy girl, NSFW, see through sheer, breasts exposed, brown eyes, pink hair, light makeup, red lips, large breasts, glowing necklace, pink ballgown, lactation, lactating, double peace sign, plunging neckline, smile, snow, outdoors, east asian architecture, night
scene_description: A close-up of an avian alien with feathers that change color depending on its emotions, intricate plumage patterns with metallic sheen, and sharp, beak-like appendages that subtly shift shape.
scene_description: A warm, lively Irish bar with rustic wooden beams, vintage pub signs, and soft amber lighting. Behind the counter stands an anthropomorphic cat dressed in a casual bartender outfit — rolled-up sleeves, vest, and flat cap — pouring a beer from a bottle into a pint glass with careful precision. The bar top is polished wood, lined with empty glasses and old whiskey bottles. The atmosphere is cozy and inviting, with a touch of old-world charm. In the background, shelves of liquor and a mirror reflect the soft golden glow of the room. The scene is full of character and detail, capturing the charm of a traditional Irish pub
scene_description: a naked woman with long hair, transparent wet shirt, can see her beautiful breasts, no panties seen, her legs covering her crotch, sits in a chair, lewd pose. diffused natural lighting from a nearby window, sensual, revealing, tender.
scene_description: The high resolution image depicts an underwater castle with intricate architecture, reminiscent of a fairy tale, lying on the sand below. The castle is adorned with domes, spires, and ornate carvings, all in shades of blue and white. It is surrounded by vibrant coral reefs and various marine life, including colorful seashells and bubbles floating around. The scene is bathed in soft, natural light filtering down from above, creating a magical and serene atmosphere. The castle appears to be a blend of Gothic and fantasy styles, with a grand entrance and delicate details that suggest a world of enchantment beneath the waves.
scene_description: perfect quality, bokeh effect, photography, an ancient, cloaked female figure with piercing eyes, face covered in dust and dirt, their face wrapped in fabric adorned with extremely intricate hieroglyphs, set against an abstract, polygonal blue gradient background. The cloth appears worn and frayed
scene_description: A NSFW Snapchat explicit iPhone selfie photo It's of a very pretty and attractive girl, whose a blonde, and has a good, hourglass figure, her boobs are out and she is complety naked, smiling at the camera with her head tilted, her upper body only is visible in the image, her boobs are out and she is complety naked, as it is a selfie in a carseat, candid, vertical 9:16 aspect ratio.
scene_description: The image depicts a beautifully ornate, oval-shaped clear glass sphere with an intricately designed frame. The frame is made of a golden material with elaborate, swirling patterns. Inside the sphere there is a detailed, red and white dragon emerging from what appears to be water or waves made of crystal. The dragon has a majestic and mythical appearance, with flowing, spiky hair and a fierce expression. The background of the sphere shows a dynamic scene with splashing water and waves, adding to the sense of movement and energy. The sphere is mounted on a tree trunk, and there are pink flowers and green foliage surrounding it, enhancing the natural and mystical atmosphere of the scene.
scene_description: A cute monster with a rainbow-colored fur coat, a long tail, and big, round eyes. The monster is sitting on a cluttered shelf in a mysterious, dark environment. The shelf is surrounded by other colorful monsters, some of which are sleeping, while others are playing with toys. The environment is ultra-detailed, with intricate carvings on the walls and floors, and a perfect cinematic lighting that highlights every detail. The product photography is perfect, capturing every angle and texture of the monster and its surroundings. The environment is perfect for a horror movie, with a sense of foreboding and danger lurking around every corner. The scene is rendered in 8K, with every pixel perfectly defined and crisp.
scene_description: A (glistening crystal eye:1.3) is embedded in the bark of a soaked oak tree, its (surface refracting raindrops:1.0) like glass marbles on velvet. The forest around it is (bathed in muted blue lightning:0.8), and nearby crows sit motionless, their feathers (glowing faintly at the edges:0.5), as if time paused mid-rain
scene_description: A stunningly intricate mechanical steampunk timeart gold crow perches gracefully on a gnarled tree branch, its piercing gaze fixed directly upon the viewer, body with spinning gears. The creature's eyes are exquisitely detailed, reflecting the ethereal glow of a dark nebula that shimmers in the background. Soft bokeh adds an air of mystery to this captivating scene.
scene_description: The image depicts a fantastical scene set in a lush, enchanted forest. The focal point is a delicate, ethereal creature that appears to be a blend of plant and humanoid form. This creature has a translucent, almost glass-like body with intricate, vein-like patterns running throughout. Its limbs and torso are slender and elongated, giving it an otherworldly appearance. The creature's head is rounded with large, luminous yellow eyes that seem to glow softly, adding to its mystical aura. It has small, pointed ears or antennae protruding from its head, enhancing its otherworldly look. The creature's skin is a gradient of translucent hues, with subtle red accents at various points, particularly around its joints and the inside base of its body. Surrounding the creature are two large, glowing flowers that resemble lotus flowers. These flowers have a soft, ethereal light emanating from their centers, with petals that are a gradient of white to light pink. The flowers are connected to the creature by slender, elongated stems that are also translucent and veined. The background is a dense, verdant forest with a soft, diffused light filtering through the canopy, creating a mystical and serene atmosphere. The ground is covered in moss, small plants, and fallen leaves, adding to the natural, enchanted setting.
scene_description: A shadowy, humanoid figure floats above scorched earth. No armor, only black smoke and swirling darkness form its body. Occasional red or violet glows inside. Stormy sky above.
scene_description: Photograph of a big green orc in a heavy metal tshirt sitting in a rocking chair on a porch and playing a classical guitar,Photographed with a cinematic 50mm lens
scene_description: The image shows a tabby cat standing behind a wooden fence in a field. The cat is holding a bouquet of colorful flowers, including purple and white daisies, and some other small flowers. The background is a lush, green field with various wildflowers and plants.
scene_description: Intimate portrait of a young woman from a nomadic tribe in ancient China, wearing fur-trimmed clothing and intricate silver jewelry. Wind-swept hair and a resilient gaze. Background of a vast, open grassland under a dramatic sky.
scene_description: a digital artwork depicting a Naiir, the embodiment of radiant darkness —  a humanoid alien figure standing in a starry, cosmic background. The alien has a slender, elongated body with a smooth, black, textured skin that features intricate, swirling patterns that resemble lace or etched designs. The figure's head is large and oval-shaped with large, almond-shaped, shiny black eyes that dominate its facial features. formless cosmic intelligence made of living shadow and flowing obsidian light, semi-translucent and luminous from within,   Its body is smooth, elongated and unearthly, dissolving at the edges into nebular mist.   The scene is surrounded by collapsing stars and slow-moving cosmic dust, evoking silence, gravity and the sacred beauty of dissolution. Ethereal, Mohrbacher-inspired, elegant and otherworldly.
scene_description: A cinematic portrait of Krampus from the 2015 movie, his body colored in eerie shades of blue and purple like a frozen corpse. He has glowing cyan eyes without pupils, white hair, a mustache, and a beard. Two long, twisted goat horns curve from his head. He is leaning forward, his face partially framed by a tattered red hood with white trim, wearing a creepy Santa costume. His torso is visible as he extends his creepy hands with long black nails towards a glowing snow globe. Inside the snow globe, the word "XMAS" is visible, with tiny servers (some on fire) and miniature figures (including a dog) desperately trying to fix the chaos. The scene is set on a dark, snowy night with a dominant palette of deep blues and shadows, dramatic lighting, hyperrealistic, 8K.
scene_description: sitting girl, brown eyes, round glasses, pigtails, braids, big hat, witch's hat, serafuku, pantyhose, Knees, library, candles, magic book, glowing effects, foreground, depth of field, Blurred periphery. masterpiece, best quality, amazing quality, very aesthetic, newest, incredibly absurdres, ultra detailed, 8k, HDR, High quality digital art, official art, advertising style, detailed background, painting \(medium\), cinematic lighting, ray Tracing, ambient occlusion, dynamic composition, foreshortening
scene_description: anime digital painting. dutch angle tilt. a striking alien woman with smooth, light purple skin that shimmers under the twin suns of an exotic beach planet, her sculpted body adorned only by a gleaming metallic chain bikini that clinks faintly with each movement. Her long, emerald-green hair is intricately braided with golden threads, the intricate plaits swaying as she stands ankle-deep in the foaming surf, (front view). roaring ocean where colossal turquoise waves crash against jagged coral formations. The sky above is a dramatic swirl of violet and peach clouds, their wispy tendrils stretching toward the horizon as if reaching for the distant crescent moons. Her slitted, pupil-less silver eyes gaze intently at the storm-tossed waters, where the silhouette of some immense, serpentine creature briefly breaks the surface—suggesting this isn't just a leisurely swim, but a summoning. The sand beneath her feet glitters with crushed bioluminescent shells, casting an eerie glow that highlights the arcane sigils tattooed along her thighs, pulsing faintly in time with the crashing waves. (dynamic composition). masterpiece, very aesthetic, absurdres, best quality, extremely detailed, ultra-detailed, detailed face, detailed eyes, expressive eyes, smooth skin, dynamic lighting, high contrast
scene_description: Photograph of an anthropomorphic gazelle woman wearing a blue sleeveless top with "Gazelles love Z-image!" and a lightning bolt in yellow text. The gazelle woman has a real gazelle head with long horns. She sits at a wooden table in an outdoor cafe with a mountainous backdrop. Her right hand rests on the table, and in front of her is a glass dessert dish filled with ice cream and whipped cream. Her hands are brown, matching her fur tone and her fingernails are painted black. She has five fingers on each hand. Her big brown eyes are expressive and her eyelashes are long. Her gazelle fur is realistic and has a natural texture. 
scene_description: A mysterious, celestial woman floating weightlessly at the heart of a glowing cosmic nebula. Her form appears almost sculpted from stardust—softly outlined, radiant, and partially translucent, as if her body is woven from the very light of the stars. She wears delicate, flowing, transparent cosmic veils that drift around her like wisps of nebula gas, revealing her naked body in a subtle, elegant contours of her silhouette in a sensual yet ethereal way. Her skin shimmers with tiny points of light, like constellations scattered across her body. Her long, flowing hair dissolves into swirling clouds of nebula dust, blending seamlessly with the luminous environment around her. Her expression is enigmatic and alluring, her gaze deep and ancient, as though she carries the secrets of the universe. Surrounding her are vivid nebula clouds in rich blues, purples, and magentas, with star clusters sparkling through the mist. Soft cosmic light illuminates her from all sides, giving her an otherworldly glow. The entire scene feels dreamlike, seductive, and profoundly mysterious—an astral goddess emerging from the stars themselves.
scene_description: A vibrant fantasy close-up of an ancient dagger suspended perfectly in midair above a smooth, rounded moss-covered stone deep within a lush enchanted forest. The blade glows with a faint ethereal light, intricate runes etched along the metal pulsing softly. Sunbeams pierce through dense emerald foliage, scattering shimmering particles of dust and magic around the hovering weapon. Vines, ferns, and luminescent flowers frame the scene, their colors rich and saturated. The atmosphere feels charged, as if the forest itself is holding its breath around this floating, mystical dagger.
scene_description: (epic mid-action shot :1.3) of a callipygian Yule huntress leaping between snow-buried roots; (antlered hood of black fur :1.25), glowing runes on thighs, (frosted bowstring pulled taut :1.2), breath steaming in the frigid air. A skeletal stag-spirit charges beside her. (Ghost-style red-and-cyan demonic lighting :1.3), heavy painterly texture, Brom-inspired grim beauty, drifting snow haze, sharp kinetic motion.
scene_description: A snow-covered ridge at night with an arctic fox standing alert near the center of the frame. The foxâs fur is rendered in fine, hyper-detailed texture, each strand catching subtle reflections from the sky behind it. Its thick winter coat displays shifting highlights of pale green, violet, and faint turquoise as the northern lights cascade overhead. The aurora forms wide vertical curtains and soft undulating waves that stretch across the entire sky, with smooth gradients transitioning between pastel greens, cool violets, and faint hints of blue-white near the horizon. Slow, delicate snowfall drifts downward in small, glitter-like flecks, each flake illuminated by ambient aurora light, creating a faint sparkling effect in the still air. The ridge beneath the fox is covered in soft, powdery snow with visible surface textureâtiny wind-carved ripples, slight depressions from the foxâs paws, and crystalline highlights from frost catching the light. The background shows the dark silhouettes of distant snow-covered hills fading into atmospheric haze, providing depth without distracting from the fox. The overall lighting is low and naturally diffused, entirely dependent on aurora glow and ambient night reflection, giving the scene a calm, silent, untouched mood.
scene_description: In the decaying empire of Würstreich, a rogue Witch Smeller hunts through a blighted forest, lantern in hand, revealing twisted fae creatures lurking. The scene captures a blend of eerie magic and the grotesque, with ancient runes glowing ominously in the background. in the style of the decaying empire of Würstreich, dark magic and grotesque creatures abound, inspired by Sean Aaberg's punk aesthetic, dark fantasy world combines grim landscapes with vibrant, exaggerated neon colorful art, heroes skirmish monstrous foes in a setting where rebellion, decay, and mysticism intertwine, crafting a unique, immersive vibe
scene_description: A photorealistic, cinematic portrait of a mermaid emerging from deep ocean water — her upper body and head fully in frame with dramatic low-angle side lighting casting volumetric shadows across her face and torso; skin is pale greenish-blue with realistic wetness, fine pores visible under high-resolution detail, covered in bioluminescent barnacles and organic marine growths that appear to be fused into her flesh — long, flowing emerald-green hair cascades dynamically around her shoulders like underwater kelp, strands translucent and catching refracted light from above; luminous golden eyes with detailed iris texture and realistic pupil dilation, glowing softly as if illuminated by internal bioluminescence or distant sunbeams filtering through water layers; mouth slightly parted in a natural expression of awe or surprise — she wears an ornate gold necklace composed of layered shell motifs and central circular pendant, and matching belt encircling her waist, both rendered with metallic PBR materials showing realistic reflections and subtle oxidation from saltwater exposure; lower body transitions into a dark teal mermaid tail with segmented fin structure, scales subtly visible beneath the water’s surface tension — tail curled gently behind her as if mid-motion or preparing to dive again; background depicts an eerie underwater cavern — silhouettes of twisted coral branches and ghostly kelp forests stretch vertically into a gradient sky of deep crimson-orange fading into murky teal abyss below, suggesting volcanic vents or ancient ruins above; lighting originates from above-right with volumetric rays piercing through water layers, casting soft rim light on her hair and shoulders while plunging the lower body and background into chiaroscuro shadows — shallow depth of field f/2.8 isolates subject sharply against blurred coral silhouettes; color palette dominated by toxic green, deep teal, burnt orange, and gold accents with high contrast between 
illuminated zones and shadowed recesses; atmosphere is mythic, ominous, and otherworldly — evokes the feeling of a forgotten sea deity awakening from slumber or confronting an unseen threat; textures include hyper-detailed wet skin glistening with water droplets, metallic sheen on jewelry reflecting ambient light, translucent hair strands catching refracted rays, rough coral surfaces with visible erosion patterns, and subtle caustic lighting effects on submerged objects; artistic style blends dark fantasy concept art with cinematic underwater photography — reminiscent of real-world deep-sea documentaries like Blue Planet or Deep Blue, rendered using Unreal Engine 5 PBR materials, subsurface scattering on skin, advanced global illumination, and realistic water refraction; technical parameters: 8K resolution, ultra-high detail, professional color grading with teal/orange split toning, film grain texture overlay for cinematic realism, depth-of-field blur applied to background elements, motion blur on hair strands suggesting slow movement through water; quality boosters: masterpiece, hyperrealistic, photorealistic fantasy portrait, highly detailed, epic scale, dramatic lighting, cinematic composition, professional-grade underwater photography.
scene_description: A colossal, mythological parrot-dragon hybrid captured mid-storm descent in a hyperrealistic cinematic shot — its body sculpted with biomechanical precision and covered in iridescent feathers that shimmer like liquid enamel: vibrant crimson red gradients bleeding into deep emerald green, metallic gold highlights along wing edges, and pearlescent sheen on the neck crest; wings are broad, feathered membranes edged with crystalline spines that refract starlight — talons are razor-sharp black claws tipped with glowing amber energy, beak open wide revealing rows of serrated teeth and a tongue pulsing with bioluminescent veins — eyes blazing with intense orange-gold iridescence like molten rubies; the creature is captured in dynamic motion: body angled downward at 45 degrees, wings swept back for aerodynamic thrust, tail feathers fanned out like a fan of stained glass catching starlight, creating a trail of glowing feather particles that disperse into the atmosphere — this is not just flight, but a controlled, gravity-defying dive through cosmic space. Below it: an otherworldly frozen lunar wasteland — jagged obsidian monoliths covered in thick frost and crystalline ice formations resembling ancient alien architecture; cracked plains of translucent deep indigo-blue ice reflecting starlight like shattered mirrors; wind-sculpted dunes of powdered snow drift across the terrain, swirling into vortexes that catch and refract light — no mouse present, replaced by a dynamic storm system: gusts of wind lift ice shards and frozen mist in slow-motion spirals, creating a sense of motion and scale; snowflakes are rendered with extreme detail — each flake has 6-8 arms with fractal symmetry, catching starlight to glow faintly blue or gold depending on angle. 
Above: a vast, star-strewn sky painted in deep indigo and dark teal tones — constellations of glowing celestial bodies (nebulae, distant stars) pierce the atmosphere like diamond dust; subtle aurora-like ribbons of violet and teal energy drift across the upper horizon; soft ambient glow from the stars illuminates the scene with a cool, ethereal light that contrasts sharply with the creature’s warm iridescence. Camera & Composition: Low-angle wide shot (worm's eye view) capturing the parrot-dragon descending toward the viewer — composition follows the rule of thirds with the creature positioned in the upper-left third and its descent path leading diagonally down to the lower-right corner; dynamic motion blur on trailing feathers and wind-blown snow particles enhances speed and momentum; depth of field f/2.8, 35mm lens (cinematic wide-angle), shallow focus blurring distant ice formations while keeping foreground frost crystals and mid-air feather particles razor-sharp. Lighting: Dramatic chiaroscuro with dual light sources — cool ambient starlight from above casting soft shadows on the icy terrain, and warm internal bioluminescence within the creature’s feathers and eyes creating glowing highlights that contrast against the cold environment; volumetric lighting scatters through ice crystals and snowflakes to create realistic lens flare and atmospheric haze. Color Palette: Dominated by deep indigo-blue for the entire landscape — from sky to ground, with subtle dark teal undertones in shadows — contrasted sharply with the parrot-dragon’s vibrant crimson red and emerald green iridescence, creating maximum visual contrast. Saturation is high on the creature but muted on the ground to preserve realism; color temperature is cool overall (5000K-6500K) with localized warm spots from glowing eyes and feather edges. 
Textures: Hyper-detailed PBR materials — feathers rendered with micro-fur texture, iridescence mapped via specular highlights that shift based on viewing angle; ice surfaces have crystalline structure with refractive index variations; frost has layered translucent layers with subtle subsurface scattering; snowflakes are individually textured with fractal geometry and light refraction. Artistic References: Studio Ghibli’s “Nausicaä of the Valley of the Wind” meets “The Legend of Zelda: Breath of the Wild” frozen landscapes — combined with modern digital concept art from artists like Greg Rutkowski, Artgerm, and Katsuya Terada for creature design; cinematic lighting inspired by Denis Villeneuve’s “Dune” and Guillermo del Toro’s “Pacific Rim.” Technical Parameters: Rendered using Unreal Engine 5 with Nanite geometry and Lumen global illumination — camera settings mimic professional wildlife photography: 35mm full-frame lens, f/2.8 aperture for shallow depth of field, ISO 100 for clean low-noise capture, shutter speed 1/400s to freeze motion; post-processing includes depth of field blur, motion blur, volumetric fog, lens flare, and chromatic aberration for cinematic realism — rendered with photorealistic lighting simulation including global illumination, ray-traced reflections on ice surfaces, and accurate color grading. Quality Boosters: Highly detailed, photorealistic textures, epic fantasy scene, masterpiece quality, cinematic composition, dynamic motion, atmospheric depth, studio-quality rendering, Unreal Engine 5 PBR materials, volumetric starlight, frost physics simulation, feather particle system with glow trails, ultra-realistic ice and snowflake detail — no mouse present, replaced by wind-blown ice vortexes and glowing lichen patches for environmental storytelling.
scene_description: This digital artwork presents a striking, hyper-realistic depiction of an alien figure, characterized by its large, expressive eyes and elongated, almost skeletal form. The composition is centered on the alien's head and upper torso, with its extended arm reaching towards the viewer, creating a sense of immediacy and engagement. The use of color is minimal yet effective, with a dominant palette of cool blues and greens that contrast sharply against the dark, almost black background. This contrast enhances the otherworldly, eerie quality of the figure. The lighting is dramatic, with a bright, almost neon-like glow emanating from the alien's head and eyes, which serves to highlight its facial features and create a sense of depth and dimensionality. The texture of the alien's skin is smooth yet slightly reflective, adding to the realism of the digital medium. The style of the artwork can be associated with the sci-fi and horror genres, and it shares similarities with the works of H.R. Giger, known for his biomechanical and alien designs. The digital effects, such as the subtle glitch-like distortions and the timestamp in the bottom left corner, add a modern, cybernetic feel to the piece, suggesting themes of technology and otherness. Symbolically, the alien figure can be interpreted as a representation of the unknown, the outsider, or the "other," evoking feelings of fear, curiosity, and fascination. The extended arm and direct gaze may also suggest a sense of intrusion or invasion, adding to the overall tension and unease of the image. 
scene_description: A dynamic, full-body fantasy illustration in the style of high-fantasy arcane battle art â glowing spell effects, dramatic contrast lighting, and a richly textured, painterly aesthetic. The scene captures a courageous female gnomish mage holding a shimmering magical shield against a barrage of sickly green fel-energy blasts. On the right side, the gnomish mage stands her ground with fierce determination. Small in stature but mighty in presence, she braces herself with feet planted firmly on scorched stone, leaning slightly into the force of the attack. Her expression is focused, brows furrowed, teeth clenched with effort. Her eyes glow with arcane blue light, reflecting the magic of the shield she conjures. Curly copper hair escapes from a loose bun, whipping wildly in the magical turbulence. She wears layered mage robes in deep blues and violets, embroidered with silver runes that shimmer as they channel protective power. Arcane trinkets and crystalline charms dangle from her belt, rattling from the impact of the spells. Both of her hands project a large, curved energy barrier, semi-transparent and radiant, swirling with prismatic arcane sigils. The shield glows in cool tones â icy blues, radiant whites, and violet edges â forming a dome-like shape that ripples each time the necromancerâs magic strikes it. The fel bolts slam into the gnomeâs shield with explosive force, splashing across its surface like acid against glass. Sparks of green fire scatter into the air, dissipating into swirling smoke. The shield dents inward but holds strong, the arcane runes brightening each time they absorb the impact. The environment around them is a ruined crypt chamber â cracked stone pillars, glowing green braziers, skull reliefs carved into the walls, and drifting necrotic mist pooling around the floor. The ground is littered with broken bones and shattered arcane crystals. Magical wind pulls loose papers and fragments of cloth into the air. 
The lighting is intense and cinematic: sickly green fel-light casting sharp shadows, while the cool radiant glow of the gnomeâs shield lights her face and robes with heroic determination. The clash of green and blue light dominates the color palette, creating a vivid magical duel. Textures are rich and tactile: rough stone, tattered cloth, shimmering spell energy, glowing sigils, floating dust, and fel-fire smoke rendered in high fidelity. The composition is charged with movement and tension â a small but powerful gnomish mage standing against overwhelming dark magic, her shield the only thing between her and annihilation.
scene_description: A 3d rendering shoot from a close-up camera angle about a futuristic alien character with intricate tattoos and a small reptile perched on his shoulder, set in a lush, green forest background. the image also shows a humanoid alien with a bald head and blue skin, wearing a futuristic, metallic suit with intricate designs on his face and shoulders. on the middle of the image, a 1boy, who appears to be in his late teens or early twenties, is facing the viewer, looking directly at the camera with a serious expression. he has short, spiky hair and large, pointed ears. his eyes are brown and his skin is a vibrant turquoise color. he is wearing a metallic, futuristic suit with orange accents and has a small, insect-like creature perched atop his shoulder. on his upper body, there is a reptile named alien. the background is blurred with greenery and soft sunlight filtering through the trees, giving a sense of depth and movement to the image.
scene_description: A CGI-rendered image of a fantastical creature with a humanoid, reptilian appearance. It has long, white hair, red eyes, and a red, tattered cloak. The creature is squatting, holding a glowing pink crystal in its clawed hand. The background features a dark, rocky cavern with mist and green moss. The creature's skin is textured and rough, with a blend of grey and green hues. The overall atmosphere is mystical and otherworldly.
scene_description: A willowy young woman with light-brown hair stands confidently on a tall, craggy rock in a sheer silk concert dress with an ornate pattern, playing the violin. She has a slender body with a thin waist and small breasts. Her left hand gracefully glides over the instrument's neck, while her right hand traces the bow across the vibrating strings. Heavy rain falls thickly around her, drenching her body and hair, and clouded moonlight illuminates her toned arms and highlights the contrast between her smooth skin and the warm sound of the violin, as in the blurry background a storm of lightning and thunder brews over the sea. This combination of intimacy and classical grace creates a tranquil yet dynamic atmosphere, where the simplicity of the setting enhances the elegance of the performance.
scene_description: A hauntingly beautiful close-up captures an ethereal young woman gazing through a rain-drenched glass pane, her face partially obscured by droplets that cling to every surface like tears of sorrow. Her piercing green eyes—so intense yet vulnerable—are fixed on the viewer with an expression of quiet contemplation or melancholy, their gaze both intimate and distant. She wears a thick, vibrant teal knitted sweater that seems to be made for warmth against the chill outside; it drapes loosely around her shoulders while she holds a single golden maple leaf gently between her fingers—a symbol of autumn's fleeting beauty. The background is shrouded in deep darkness, creating stark contrast with the illuminated figure and enhancing the sense of isolation. Raindrops fall steadily across the lens, each one catching light differently depending on its position, adding texture and depth to what appears at first glance to be just water droplets but ultimately contributing to a dreamlike atmosphere where reality blurs into memory. The overall composition evokes a powerful emotional narrative: perhaps longing, solitude, or introspection during a stormy evening. This image has all the hallmarks of photorealism—the meticulous detail from individual hairs to subtle skin textures—and uses soft shadows and natural highlights effectively to create realism without sacrificing emotion. It feels almost painted, not merely photographed—it’s rendered with such delicate precision and atmospheric tension that viewers are drawn deeper into this moment of stillness and reflection. As though time itself pauses beneath the falling drops, this portrait stands out as a masterful blend of realistic technique and poetic storytelling—an unforgettable glimpse into inner world reflected through nature’s transient elements.
scene_description: A dark magic female daemon. She has long red bullhorns on her head, red glowing eyes, a big silver nose ring and sharp teeth. She has long black wavy hair. She wears a massive black metal armor with sharp edges. She holds a massive large greatsword made of black metal and red glowing edges. In a fast and forcefull movement, she is ramming her sword into the ground making it break open. 4k, cinematic movement
scene_description: hyper-realistic, illustration, concept art style,dark, stylish, and futuristic figure, likely 30-40 years old with African descent, walks confidently. Short, cropped haircut; dark beard; wearing round sunglasses. He's dressed in a distressed, layered outfit of grey, leather-like, and dark-gray fabric garments. A long, loose overcoat with fringe detailing, a vest-like piece underneath, and fitted trousers form a unique cyberpunk ensemble. Multiple necklaces with varied textures and shapes of metal and stone adorn his attire. The clothing style is avant-garde and combines elements of modern streetwear with a hint of military and tribal aesthetics. He carries a dark-colored backpack. His expression is intense and purposeful, with a focused stare, conveying a sense of quiet power and determination. The pose is casual yet commanding, with a slight forward lean as if walking purposefully. He's standing on a gray pavement. The background subtly depicts a cityscape. The lighting is dramatic, with strong shadows and muted tones, creating a moody atmosphere. The overall style is gritty, with a strong emphasis on texture and form. The colors are primarily muted greys, blacks, and browns. The image has a gritty, low-key, and dramatic feel. The perspective is a mid-shot, angled slightly from the front, focusing on the man's attire, and body language. Street style, photorealistic, cinematic, photorealism, highly detailed,
scene_description: Dark fantasy illustration of a powerful sorceress with an hourglass figure, digital art with an anime aesthetic, wearing a massive, pointed witch hat made of a dark, weathered, stone-like material covered in sharp thorns and a crown of multiple glowing red demonic eyes, her face is partially shadowed by the brim, revealing glowing red pupils, pale porcelain skin with fine black crackle veins on her hips and thighs, and short, messy dark auburn hair, she is dressed in a tattered, skin-tight, black symbiotic bodysuit that appears organic and web-like, with shredded sections revealing her pale skin, thorny protrusions on her shoulders and arms, and long, claw-like black gloves, dramatic chiaroscuro lighting, intense, fiery orange-red backlighting emanating from under the hat's brim, casting a powerful rim light on her form and hair, contrasted with a soft, cool ambient fill light from the front, a dark and moody color palette of charcoal black, cool grays, and pale off-white, accented by vibrant, saturated red and orange glows, low-angle, three-quarter body shot, dynamic S-curve pose with arms raised to her hat, set against a dark, atmospheric, out-of-focus background of a misty forest with gnarled, thorny branches, ultra-detailed, sharp focus on the subject, shallow depth of field, high resolution, intricate textures.
scene_description: A dramatic scene in the style of Black Myth, a serene young monk in simple kasaya robes meditating calmly, completely unmoved as the ethereal White Bone Spirit (Bai Gu Jing) with pale, semi-transparent skin and a seductive yet melancholic expression leans over his shoulder, her tattered white robes flowing. Set inside a misty, ruined ancient Chinese temple with broken pillars and misty light (chiaroscuro). Dark fantasy aesthetic, traditional Chinese ink wash painting style, cinematic rendering, highly detailed, 8K resolution, intricate textures.
'''

ACTION_SEQUENCES = f'''action_sequence: The man dances energetically, leaping mid-air with fluid arm swings and quick footwork
action_sequence: The girl skateboarding, repeating the endless spinning and dancing and jumping on a skateboard, with clear movements, full of charm, breats jiggling
action_sequence: The man dances flamboyantly, swinging his hips and striking bold poses with dramatic flair
action_sequence: The young man writes intensely, flipping papers and adjusting his glasses with swift, focused movements
action_sequence: The girl dances gracefully, with clear movements, full of charm, breasts jiggling
action_sequence: A jellyfish dances in the sea
action_sequence: The man dances energetically, leaping mid-air with fluid arm swings and quick footwork
action_sequence: A pretty clown girl with blue skin giggles as blood drips from the knife in her hand.
'''

In [4]:

def reset_memory(device):
    """
    Run garbage collection and release cached CUDA memory / statistics.

    A Python GC pass always runs. All CUDA-specific cleanup is skipped when
    CUDA is unavailable, and the per-device statistic resets are skipped when
    ``device`` is not a CUDA device. This matters because the notebook-level
    ``device`` falls back to ``"cpu"`` on machines without CUDA, and the
    ``torch.cuda.reset_*`` calls reject non-CUDA devices.

    Args:
        device: ``None`` (current CUDA device), an int device index, a device
            string such as ``"cuda:0"`` / ``"cpu"``, or a ``torch.device``.

    Returns:
        None
    """
    gc.collect()
    if not torch.cuda.is_available():
        return

    torch.cuda.empty_cache()

    # Only reset per-device counters when the target really is a CUDA device.
    targets_cuda = (
        device is None
        or isinstance(device, int)
        or torch.device(device).type == "cuda"
    )
    if targets_cuda:
        torch.cuda.reset_peak_memory_stats(device)
        torch.cuda.reset_accumulated_memory_stats(device)

    torch.cuda.ipc_collect()

def get_audio_pipe():
    """
    Build an automatic-speech-recognition pipeline for the
    ``openai/whisper-large-v3-turbo`` checkpoint.

    The model is loaded with the notebook-level ``dtype`` and moved to
    ``cuda:0`` when CUDA is available, otherwise to the CPU.

    Returns:
        The object produced by ``transformers.pipeline``.
    """
    target_device = "cuda:0" if torch.cuda.is_available() else "cpu"
    checkpoint = "openai/whisper-large-v3-turbo"

    speech_model = AutoModelForSpeechSeq2Seq.from_pretrained(
        checkpoint, torch_dtype=dtype, low_cpu_mem_usage=True
    )
    speech_model.to(target_device)

    speech_processor = AutoProcessor.from_pretrained(checkpoint)

    asr_pipe = pipeline(
        "automatic-speech-recognition",
        model=speech_model,
        tokenizer=speech_processor.tokenizer,
        feature_extractor=speech_processor.feature_extractor,
        chunk_length_s=24,
        batch_size=16,  # inference batch size; tune for the available device
        torch_dtype=dtype,
        device=target_device,
    )
    # Drop the local reference; the pipeline keeps its own handle to the model.
    del speech_model
    return asr_pipe

def load_models():
    """
    Load the ``Tongyi-MAI/Z-Image-Turbo`` image pipeline in bfloat16 and move
    it to the ``"cuda"`` device.

    Returns:
        The loaded ``ZImagePipeline`` instance.
    """
    z_image_pipe = ZImagePipeline.from_pretrained(
        "Tongyi-MAI/Z-Image-Turbo",
        low_cpu_mem_usage=False,
        torch_dtype=torch.bfloat16,
    )
    z_image_pipe.to("cuda")
    return z_image_pipe

@torch.inference_mode()
def generate_image(pipe: ZImagePipeline, prompt: str, negative_prompt : str, resolution: tuple[int, int], seed: int):
    """
    Render a single image with ``pipe`` and return ``(image, seed)``.

    The module-level ``THEME`` is prepended to ``prompt``. ``resolution`` is
    unpacked as ``(height, width)``. A ``seed`` of ``-1`` is replaced by a
    random 32-bit value, and the seed actually used is returned alongside the
    image. ``negative_prompt`` is accepted but not forwarded to the pipeline.
    """
    themed_prompt = THEME + ", " + prompt
    height, width = resolution

    # -1 is the "pick one for me" sentinel: draw from 0 .. 2**32-1.
    if seed == -1:
        seed = secrets.randbits(32)

    seeded_generator = torch.Generator("cuda").manual_seed(seed)
    pipe_output = pipe(
        prompt=themed_prompt,
        height=height,
        width=width,
        num_inference_steps=9,  # This actually results in 8 DiT forwards
        guidance_scale=0.0,     # Guidance should be 0 for the Turbo models
        generator=seeded_generator,
    )
    first_image = pipe_output.images[0]
    return first_image, seed

@torch.no_grad()
def generate_video(
    text_encoder, text_encoder_2, image_encoder, vae, transformer, tokenizer, tokenizer_2, feature_extractor, high_vram,
    output_video: str,
    input_image: np.ndarray,
    prompt: str,
    n_prompt: str,
    seed: int,
    total_second_length: float,
    latent_window_size: int,
    steps: int,
    cfg: float,
    gs: float,
    rs: float,
    gpu_memory_preservation: int,
    use_teacache: bool,
    mp4_crf: int
) -> list[str]:
    """
    Synchronous image-to-video generation, sampled section by section.

    After every sampled section the accumulated frames are decoded and written
    to ``{outputs_folder}/{job_id}_{total_generated}.mp4``; that path is
    appended to the returned list. After the last section the same frames are
    additionally written to ``output_video``.

    Relies on the module-level ``outputs_folder`` (defined outside this block).

    Args:
        text_encoder, text_encoder_2, tokenizer, tokenizer_2: passed to
            ``encode_prompt_conds`` to embed ``prompt`` / ``n_prompt``.
        image_encoder, feature_extractor: passed to ``hf_clip_vision_encode``.
        vae: used by ``vae_encode`` / ``vae_decode``.
        transformer: model handed to ``sample_hunyuan``.
        high_vram: when falsy, models are loaded to / unloaded from the GPU
            around each stage instead of being assumed resident.
        output_video: path that receives the final video.
        input_image: array unpacked as ``(H, W, C)``.
        prompt: positive prompt.
        n_prompt: negative prompt; only encoded when ``cfg != 1``.
        seed: seeds the CPU ``torch.Generator`` handed to the sampler.
        total_second_length: requested length; converted to a section count
            using 30 fps and ``latent_window_size * 4`` frames per section.
        latent_window_size: latent frames per section.
        steps: forwarded as ``num_inference_steps``.
        cfg: forwarded as ``real_guidance_scale``.
        gs: forwarded as ``distilled_guidance_scale``.
        rs: forwarded as ``guidance_rescale``.
        gpu_memory_preservation: forwarded as ``preserved_memory_gb`` when the
            transformer is moved to the GPU (low-VRAM path only).
        use_teacache: forwarded to ``transformer.initialize_teacache``.
        mp4_crf: forwarded as ``crf`` to ``save_bcthw_as_mp4``.

    Returns:
        The per-section MP4 paths in the order they were written (the last
        entry holds the complete video). ``output_video`` itself is NOT
        appended to this list. If an exception occurs, the traceback is
        printed and the paths written so far (possibly none) are returned.
    """
    # compute sections: at least one, rounded from (seconds * 30 fps) / frames-per-section
    total_latent_sections = int(max(round((total_second_length * 30) / (latent_window_size * 4)), 1))
    job_id = generate_timestamp()
    out_files: list[str] = []

    try:
        # unload if low VRAM
        if not high_vram:
            unload_complete_models(text_encoder, text_encoder_2, image_encoder, vae, transformer)

        # --- TEXT ENCODING ---
        if not high_vram:
            fake_diffusers_current_device(text_encoder, gpu)
            load_model_as_complete(text_encoder_2, target_device=gpu)

        llama_vec, clip_l_pooler = encode_prompt_conds(
            prompt, text_encoder, text_encoder_2, tokenizer, tokenizer_2
        )

        # With cfg == 1 the negative prompt is not encoded; zero tensors of the
        # same shape stand in for its embeddings.
        if cfg == 1:
            llama_vec_n = torch.zeros_like(llama_vec)
            clip_l_pooler_n = torch.zeros_like(clip_l_pooler)
        else:
            llama_vec_n, clip_l_pooler_n = encode_prompt_conds(
                n_prompt, text_encoder, text_encoder_2, tokenizer, tokenizer_2
            )

        # Bring both embeddings to length 512 and obtain the matching masks.
        llama_vec, llama_attention_mask     = crop_or_pad_yield_mask(llama_vec, length=512)
        llama_vec_n, llama_attention_mask_n = crop_or_pad_yield_mask(llama_vec_n, length=512)

        # --- IMAGE PREPROCESS & VAE ENCODE ---
        H, W, C = input_image.shape
        height, width = find_nearest_bucket(H, W, resolution=640)
        input_image_np = resize_and_center_crop(input_image, target_width=width, target_height=height)
        # Keep a copy of the preprocessed input next to the generated videos.
        Image.fromarray(input_image_np).save(os.path.join(outputs_folder, f"{job_id}.png"))

        # Scale pixel values to [-1, 1] (assuming 0..255 input — TODO confirm) and
        # add batch and singleton 'frames' dims so the shape is [1, C, 1, H, W].
        input_image_pt = torch.from_numpy(input_image_np).float() / 127.5 - 1
        input_image_pt = input_image_pt.permute(2, 0, 1)[None, :, None, :, :]

        if not high_vram:
            load_model_as_complete(vae, target_device=gpu)
        start_latent = vae_encode(input_image_pt, vae)

        # --- CLIP VISION ENCODE ---
        if not high_vram:
            load_model_as_complete(image_encoder, target_device=gpu)
        clip_out = hf_clip_vision_encode(input_image_np, feature_extractor, image_encoder)
        image_encoder_last_hidden_state = clip_out.last_hidden_state

        # cast to transformer dtype
        llama_vec                         = llama_vec.to(transformer.dtype)
        llama_vec_n                       = llama_vec_n.to(transformer.dtype)
        clip_l_pooler                     = clip_l_pooler.to(transformer.dtype)
        clip_l_pooler_n                   = clip_l_pooler_n.to(transformer.dtype)
        image_encoder_last_hidden_state   = image_encoder_last_hidden_state.to(transformer.dtype)

        # --- PREPARE SAMPLING ---
        rnd = torch.Generator("cpu").manual_seed(seed)
        num_frames = latent_window_size * 4 - 3

        # History buffer starts as zeros with 1 + 2 + 16 latent frames; newly
        # generated latents are prepended to it on every section.
        history_latents = torch.zeros((1, 16, 1 + 2 + 16, height // 8, width // 8), dtype=torch.float32).cpu()
        history_pixels = None
        total_generated = 0

        # Padding schedule counts down to 0; the section with pad == 0 is the last.
        # For more than 4 sections a fixed [3, 2, ..., 2, 1, 0] schedule is used instead.
        latent_paddings = list(reversed(range(total_latent_sections)))
        if total_latent_sections > 4:
            latent_paddings = [3] + [2] * (total_latent_sections - 3) + [1, 0]

        # --- SAMPLING LOOP ---
        for pad in latent_paddings:
            is_last = (pad == 0)
            pad_size = pad * latent_window_size
            print(f"Section pad={pad}, is_last={is_last}")

            # Lay out one contiguous index range and split it into the six
            # groups consumed by the sampler (the padding group is not used below).
            indices = torch.arange(0, sum([1, pad_size, latent_window_size, 1, 2, 16])).unsqueeze(0)
            (pre_idx, blank_idx, latent_idx,
             post_idx, idx2x, idx4x
            ) = indices.split([1, pad_size, latent_window_size, 1, 2, 16], dim=1)
            clean_idx = torch.cat([pre_idx, post_idx], dim=1)

            # Clean conditioning latents: the start-image latent plus the first
            # 1 / 2 / 16 frames currently at the head of the history buffer.
            clean_pre_latents = start_latent.to(history_latents.device)
            post, mid2, mid4 = history_latents[:, :, :1 + 2 + 16, :, :].split([1, 2, 16], dim=2)
            clean_latents = torch.cat([clean_pre_latents, post], dim=2)

            if not high_vram:
                unload_complete_models()
                move_model_to_device_with_memory_preservation(
                    transformer, target_device=gpu, preserved_memory_gb=gpu_memory_preservation
                )

            transformer.initialize_teacache(enable_teacache=use_teacache, num_steps=steps)

            gen_latents = sample_hunyuan(
                transformer=transformer,
                sampler='unipc',
                width=width, height=height,
                frames=num_frames,
                real_guidance_scale=cfg,
                distilled_guidance_scale=gs,
                guidance_rescale=rs,
                num_inference_steps=steps,
                generator=rnd,
                prompt_embeds=llama_vec,
                prompt_embeds_mask=llama_attention_mask,
                prompt_poolers=clip_l_pooler,
                negative_prompt_embeds=llama_vec_n,
                negative_prompt_embeds_mask=llama_attention_mask_n,
                negative_prompt_poolers=clip_l_pooler_n,
                device=gpu, dtype=torch.bfloat16,
                image_embeddings=image_encoder_last_hidden_state,
                latent_indices=latent_idx,
                clean_latents=clean_latents,
                clean_latent_indices=clean_idx,
                clean_latents_2x=mid2,
                clean_latent_2x_indices=idx2x,
                clean_latents_4x=mid4,
                clean_latent_4x_indices=idx4x,
            )

            # On the last section the start-image latent is prepended to the result.
            if is_last:
                gen_latents = torch.cat([start_latent.to(gen_latents), gen_latents], dim=2)

            # Prepend the new latents to the history and track how many are real
            # (the trailing zero-initialised frames are excluded via total_generated).
            total_generated += gen_latents.shape[2]
            history_latents = torch.cat([gen_latents.to(history_latents), history_latents], dim=2)

            if not high_vram:
                offload_model_from_device_for_memory_preservation(
                    transformer, target_device=gpu, preserved_memory_gb=8
                )
                load_model_as_complete(vae, target_device=gpu)

            real_latents = history_latents[:, :, :total_generated, :, :]

            # First section: decode everything. Later sections: decode only the
            # head of the history and merge it with the existing pixels via
            # soft_append_bcthw using the overlap length below.
            if history_pixels is None:
                history_pixels = vae_decode(real_latents, vae).cpu()
            else:
                section_len = (latent_window_size * 2 + 1) if is_last else (latent_window_size * 2)
                overlap = latent_window_size * 4 - 3
                curr_pixels = vae_decode(real_latents[:, :, :section_len], vae).cpu()
                history_pixels = soft_append_bcthw(curr_pixels, history_pixels, overlap)

            if not high_vram:
                unload_complete_models()

            # Write the video accumulated so far; one file per section.
            out_name = os.path.join(outputs_folder, f"{job_id}_{total_generated}.mp4")
            save_bcthw_as_mp4(history_pixels, out_name, fps=30, crf=mp4_crf)
            out_files.append(out_name)

            print(f"Saved: {out_name}")
            if is_last:
                # Also write the finished video to the caller-supplied path.
                # Note: this path is not appended to out_files.
                out_name = output_video
                save_bcthw_as_mp4(history_pixels, out_name, fps=30, crf=mp4_crf)
                print(f"Saved last: {out_name}")
                break

    except Exception:
        # Best effort: report the failure, free the models on the low-VRAM
        # path, and fall through to return whatever was written so far.
        traceback.print_exc()
        if not high_vram:
            unload_complete_models(text_encoder, text_encoder_2, image_encoder, vae, transformer)

    return out_files

def synthesize_videos(text_encoder, text_encoder_2, image_encoder, vae, transformer, tokenizer, tokenizer_2, feature_extractor, high_vram,
                      output_video: str, input_image: str, prompt: str, total_second_length: float):
    """
    Load an image from disk and run ``generate_video`` on it with fixed settings.

    Args:
        text_encoder ... high_vram: forwarded unchanged to ``generate_video``.
        output_video: path forwarded to ``generate_video`` for the final video.
        input_image: path of the image file to animate.
        prompt: positive prompt forwarded to ``generate_video``.
        total_second_length: requested video length, forwarded unchanged.

    Returns:
        The last path in the list returned by ``generate_video``, or ``None``
        when that list is empty.
    """
    # Normalise to 3-channel RGB before converting to an array: generate_video
    # unpacks the array shape as (H, W, C), which a grayscale or palette image
    # (2-D array) cannot satisfy. The context manager closes the file handle.
    with Image.open(input_image) as pil_img:
        input_np = np.array(pil_img.convert("RGB"))

    files = generate_video(
        text_encoder, text_encoder_2, image_encoder, vae, transformer, tokenizer, tokenizer_2, feature_extractor, high_vram,
        output_video=output_video,
        input_image=input_np,
        prompt=prompt,
        n_prompt="",
        seed=random.randrange(0, 2**31),
        total_second_length=total_second_length,
        latent_window_size=9,
        steps=32,
        cfg=1.0,
        gs=10.0,
        rs=0.0,
        gpu_memory_preservation=6,
        use_teacache=False,
        mp4_crf=1
    )
    final_video = files[-1] if files else None

    return final_video

def get_openai_prompt_response(
    prompt: str,
    config: dict,
    max_tokens: int = 12000,
    temperature: float = 0.33,
    openai_model: str = "",
):
    """
    Send ``prompt`` to the OpenAI chat-completions API and return the reply text.

    The API request itself is retried (with a one-second pause) when it raises
    or when the reply carries no message content. At least one attempt is
    always made, even if ``config["retry_limit"]`` is 0.

    Args:
        prompt: user message content.
        config: must provide ``"openai_api_key"``, ``"retry_limit"`` and,
            unless ``openai_model`` is given, ``"openai_model"``.
        max_tokens: forwarded as ``max_completion_tokens``.
        temperature: accepted for interface compatibility.
            NOTE(review): it is not forwarded to the API (same as before);
            confirm the target models accept a custom temperature before
            wiring it through.
        openai_model: overrides ``config["openai_model"]`` when non-empty.

    Returns:
        The message content string, or ``""`` once every attempt has failed.

    Raises:
        KeyError: if a required ``config`` key is missing.
    """
    client = OpenAI(api_key=config["openai_api_key"])
    # Resolve configuration up front so a missing key fails loudly instead of
    # being swallowed (and pointlessly retried) by the loop below.
    model_name = openai_model or config["openai_model"]
    attempts_allowed = max(1, int(config["retry_limit"]))

    for attempt in range(1, attempts_allowed + 1):
        try:
            response = client.chat.completions.create(
                max_completion_tokens=max_tokens,
                messages=[
                    {
                        "role": "system",
                        "content": """Act as a helpful assistant, you are an expert editor.""",
                    },
                    {"role": "user", "content": prompt},
                ],
                model=model_name,
            )
            message_content = response.choices[0].message.content
            if message_content is None:
                raise ValueError("response contained no message content")
            return message_content
        except Exception as e:
            print(f"Error occurred: {e}")
            if attempt == attempts_allowed:
                print("Retry limit reached. Moving to the next iteration.")
                return ""
            print(f"Retrying... (Attempt {attempt}/{config['retry_limit']})")
            time.sleep(1)  # brief pause before the next attempt

    return ""

def create_scenes(text: str, video_summary: str, config: dict):
    """
    Ask the OpenAI model to turn timestamped lyrics into a list of scenes.

    Args:
        text: the transcribed lyrics that are embedded in the prompt.
        video_summary: currently unused; kept for interface compatibility.
        config: forwarded to ``get_openai_prompt_response``; must contain
            ``"openai_model"``.

    Returns:
        The parsed JSON value from the model reply (the prompt requests a list
        of objects with ``start``, ``text``, ``scene_description`` and
        ``action_sequence``).

    Raises:
        json.JSONDecodeError: if the reply is empty or is not valid JSON.
    """
    # Generate scenes JSON
    prompt = f'''Create a json list of diverse, unique scenes (groupings of text), scene_description, and action_sequence (100 words or less) 
    from the following song text to create an animated music short film with many unique scenes.  Scenes should be groups of lyrics with 
    new scenes when the lyric context changes.  Text: {text}   
The json list should have the start value for the first item in the scene and the text that is combined for all items in the same scene.  
The scene_description should include A vivid, sensory-rich, and unique visual description, including:
  - Attire, setting, mood, lighting, and composition
  - Artistic style (e.g., graphic novel, watercolor, oil painting, surrealist CGI)
  - Emphasize strong emotion, artistic genius, spiritual energy, and grandeur
  - Special effects and CGI (glowing eyes, magical energy, electric auras, shifting reality, surreal environments, etc.)
  - Unexpected, awe-inspiring, or haunting elements; scenes should be visually striking and imaginative
  - No cliché or generic visual tropes; each scene must stand out and avoid repetition with others
  - Each scene should be fantastic, creative, beautiful, and intrically woven to bring the lyrics to like with visual storytelling
  - Pay attention to details and avoid long scenes in favor of diverse scenes for music video attention and artistic awe
  - Scenes must contain all elements of action. If an action involves an object, the object must be described in the scene.
  - Scenes should tell an imaginative and creative story together
  
The action_sequence should Describe slow, cinematic, and poetic actions (one or two sentences):
  - Avoid sudden or fast movement, abrupt zooms, walking, dancing, shopping, or human group behaviors
  - Action should be minimal, atmospheric, and enhance the emotional tone; favor symbolic gestures, stillness, or gradual transformations
  - No scene transitions; each scene is self-contained
  
These are example scene_descriptions: {SCENE_DESCRIPTIONS}
These are example action_sequences: {ACTION_SEQUENCES}  
The desired general theme or style is: {THEME}
Have an opening scene at time 0.
Return only the json list, less jargon. The json list fields must contain: start, text, scene_description, action_sequence.
Create any missing fields in the list.'''

    raw_reply = get_openai_prompt_response(prompt, config, openai_model=config["openai_model"], temperature=0.85) or ""

    # Keep only the outermost JSON array. This tolerates markdown fences in any
    # form ("```json", CRLF line endings) and any prose around the list.
    list_start, list_end = raw_reply.find("["), raw_reply.rfind("]")
    if 0 <= list_start < list_end:
        payload = raw_reply[list_start:list_end + 1]
    else:
        # No array found: fall back to the plain fence/newline stripping.
        payload = raw_reply.replace("```", "").replace("json\n", "").replace("\n", "")

    # strict=False accepts raw line breaks inside string values.
    scenes = json.loads(payload, strict=False)
    return scenes

def revise_scenes(scenes, config: dict):
    """
    Ask the OpenAI model to rewrite the scene descriptions and action sequences.

    Args:
        scenes: the scenes to revise. Non-string values are serialised with
            ``json.dumps`` so the prompt carries real JSON (double-quoted
            keys) and not a Python ``repr``; strings are embedded as given.
        config: forwarded to ``get_openai_prompt_response``; must contain
            ``"openai_model"``.

    Returns:
        The parsed JSON value from the model reply.

    Raises:
        json.JSONDecodeError: if the reply is empty or is not valid JSON.
    """
    # The prompt says "JSON scenes", so hand the model JSON, not a Python repr.
    if isinstance(scenes, str):
        scenes_json = scenes
    else:
        scenes_json = json.dumps(scenes, ensure_ascii=False, default=str)

    # Generate scenes JSON
    prompt = f'''Revise the JSON scenes to update the scene_description and action_sequence to engage the senses and imagination, suitable for creating a stunning, cinematic video experience.  We want unique scenes, even ones in the same sequence. Use descriptions of special effects in the scenes.  JSON scenes: {scenes_json}   
The scene_description should include A vivid, sensory-rich, and unique visual description, including:
  - Attire, setting, mood, lighting, and composition
  - Artistic style (e.g., graphic novel, watercolor, oil painting, surrealist CGI)
  - Emphasize strong emotion, artistic genius, spiritual energy, and grandeur
  - Special effects and CGI (glowing eyes, magical energy, electric auras, shifting reality, surreal environments, etc.)
  - Unexpected, awe-inspiring, or haunting elements; scenes should be visually striking and imaginative
  - No cliché or generic visual tropes; each scene must stand out and avoid repetition with others
  - Each scene should be fantastic, creative, beautiful, and intrically woven to bring the lyrics to like with visual storytelling
  - Pay attention to details  
  - Scenes must contain all elements of action. If an action involves an object, the object must be described in the scene.
  - Scenes should tell an imaginative and creative story together
  
The action_sequence should Describe slow, cinematic, and poetic actions (one or two sentences):
  - Avoid sudden or fast movement, abrupt zooms, walking, dancing, shopping, or human group behaviors
  - Action should be minimal, atmospheric, and enhance the emotional tone; favor symbolic gestures, stillness, or gradual transformations
  
Only update the scene_description and action_sequence. We do not want to have similar scene_descriptions and action_sequences for 
consecutive scenes, we want unique scenes that tell a brilliant, cohesive story.  Please update the scene_description 
and action_sequence to be different, creative, and consistent.  
Use diverse and creative scene_descriptions, don't repeat the same ones, create new scenes descriptions.
Do not delete any items as having scenes with the given start times are important. 
The desired general theme or style is: {THEME}
Return only the json list, less jargon. The json list fields must contain: start, text, scene_description, action_sequence.
Create any missing fields in the list.'''

    raw_reply = get_openai_prompt_response(prompt, config, openai_model=config["openai_model"], temperature=0.33) or ""

    # Keep only the outermost JSON array. This tolerates markdown fences in any
    # form ("```json", CRLF line endings) and any prose around the list.
    list_start, list_end = raw_reply.find("["), raw_reply.rfind("]")
    if 0 <= list_start < list_end:
        payload = raw_reply[list_start:list_end + 1]
    else:
        # No array found: fall back to the plain fence/newline stripping.
        payload = raw_reply.replace("```", "").replace("json\n", "").replace("\n", "")

    # strict=False accepts raw line breaks inside string values.
    scenes = json.loads(payload, strict=False)
    return scenes

def get_audio_duration(audio_file):
    """
    Return the ``duration`` (in seconds) that ``AudioFileClip`` reports for
    ``audio_file``. The clip is opened as a context manager.
    """
    with AudioFileClip(audio_file) as audio_clip:
        seconds = audio_clip.duration
    return seconds

def _call_with_retries(step, attempts, announce=None):
    """Call ``step()`` up to ``attempts`` times and return the first successful result.

    Args:
        step: zero-argument callable to execute.
        attempts: maximum number of tries (must be >= 1).
        announce: optional callable receiving the 1-based attempt number
            right before each try (used for progress printing).

    Raises:
        ValueError: if ``attempts`` is smaller than 1.
        Exception: the error of the last attempt when every attempt failed.
    """
    if attempts < 1:
        raise ValueError("attempts must be at least 1")
    last_error = None
    for attempt in range(1, attempts + 1):
        if announce is not None:
            announce(attempt)
        try:
            return step()
        except Exception as error:
            # Only ordinary errors are retried; KeyboardInterrupt / SystemExit
            # propagate so the user can still stop the cell.
            last_error = error
            print(f"attempt {attempt}/{attempts} failed: {error!r}")
    raise last_error


def _split_long_scenes(scenes, last_end_value, max_duration_seconds):
    """Return a new scene list in which no scene spans more than ``max_duration_seconds``.

    A scene lasts until the next scene's ``start`` (or ``last_end_value`` for
    the final scene); the first scene is always treated as starting at 0.
    Longer scenes are repeated as shallow copies whose ``start`` advances in
    ``max_duration_seconds`` steps. Scenes with a non-positive duration are dropped.
    """
    if max_duration_seconds <= 0:
        # A non-positive step would never shrink the remaining duration.
        raise ValueError("max_duration_seconds must be positive")

    split_scenes = []
    last_index = len(scenes) - 1
    for index, scene in enumerate(scenes):
        start_time = 0 if index == 0 else scene['start']
        end_time = scenes[index + 1]['start'] if index < last_index else last_end_value

        # Emit full-length chunks while more than one chunk's worth remains.
        while end_time - start_time > max_duration_seconds:
            split_scenes.append({**scene, 'start': start_time})
            start_time += max_duration_seconds

        # Remaining part of the scene (or the whole scene if it was not split).
        if end_time - start_time > 0:
            split_scenes.append({**scene, 'start': start_time})
    return split_scenes


def process_audio_scenes(audio_file: str, config: dict, max_duration_seconds: float = 12):
    """
    Processes a single audio file through the scene-planning part of the workflow.

    Steps: create per-run image/video directories, transcribe the audio with
    the pipeline returned by ``get_audio_pipe()``, request a video summary via
    ``get_openai_prompt_response``, build scenes with ``create_scenes``, try to
    refine them with ``revise_scenes``, split over-long scenes and write the
    result to ``scenes.json`` inside the image directory.

    Args:
        audio_file: path of the audio file to process.
        config: workflow configuration; this function reads
            ``base_working_dir``, ``base_video_dir`` and ``openai_model``.
        max_duration_seconds: maximum duration for one image basis; per the
            original note it should be in intervals of the video generation length.

    Returns:
        ``(scenes, audio_images_dir, audio_videos_dir, last_end_value, timestamp)``.
        ``scenes`` is the empty string ``""`` when scene creation failed on
        every attempt; ``last_end_value`` is the audio duration as a float.
    """
    # Create unique identifier based on audio file name
    audio_basename = os.path.splitext(os.path.basename(audio_file))[0]
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    unique_id = f"{audio_basename}_{timestamp}"

    # Create unique directories for images and videos
    print("Create unique directories for images and videos")
    audio_images_dir = os.path.join(config["base_working_dir"], unique_id)
    audio_videos_dir = os.path.join(config["base_video_dir"], unique_id)
    os.makedirs(audio_images_dir, exist_ok=True)
    os.makedirs(audio_videos_dir, exist_ok=True)

    # Step 1: Transcribe audio using Whisper
    print("Transcribe audio using Whisper")
    audio_pipe = get_audio_pipe()
    try:
        result = audio_pipe([audio_file], return_timestamps=True)
    finally:
        # Cleanup Whisper model memory, also when transcription fails
        del audio_pipe
        reset_memory(device)

    segments = []
    for item in result:
        for chunk in item.get("chunks", []):
            start = chunk.get("timestamp", (None, None))[0]
            segment_text = chunk.get("text", "").strip()
            segments.append((start, segment_text))

    # Sort by start time; chunks without a start timestamp go last
    segments.sort(key=lambda segment: (segment[0] is None, segment[0]))

    text = "".join(
        f"Start: {start}, Text: {segment_text}\n" for start, segment_text in segments
    )

    # The end of the last scene is the end of the audio itself
    last_end_value = float(get_audio_duration(audio_file))

    # Path to scenes.json file
    scenes_file_path = os.path.join(audio_images_dir, "scenes.json")

    # Check if scenes.json exists
    # NOTE(review): the directory name embeds the to-the-second timestamp
    # created above, so this branch is only reachable when that exact
    # directory already exists — confirm whether a stable cache key was intended.
    if os.path.exists(scenes_file_path):
        print(f"Scenes file already exists at {scenes_file_path}. Skipping scene generation.")
        with open(scenes_file_path, "r") as scenes_file:
            scenes = json.load(scenes_file)
        return scenes, audio_images_dir, audio_videos_dir, last_end_value, timestamp

    # Step 2: Generate video summary using OpenAI
    print("Generate video summary using OpenAI")
    video_summary_prompt = f'Create a short, concise summary to create a music video based on these song lyrics: {text}'
    video_summary = get_openai_prompt_response(video_summary_prompt, config, openai_model=config["openai_model"])

    # Step 3: Create scenes based on lyrics (up to three attempts)
    try:
        scenes = _call_with_retries(
            lambda: create_scenes(text, video_summary, config),
            attempts=3,
            announce=lambda attempt: print(f"Create scenes based on lyrics: {attempt}"),
        )
    except Exception:
        # Every attempt failed: keep the historical "" marker for callers.
        return "", audio_images_dir, audio_videos_dir, last_end_value, timestamp

    # Run the revision FIRST, while the list is still concise (up to two
    # attempts); on failure keep the unrevised scenes.
    draft_scenes = scenes
    try:
        scenes = _call_with_retries(lambda: revise_scenes(draft_scenes, config), attempts=2)
        print('revised scenes')
    except Exception:
        print('cannot revise scenes')

    # Run the splitting LAST, so the revision step cannot undo the split.
    scenes = _split_long_scenes(scenes, last_end_value, max_duration_seconds)

    # Save the scenes to scenes.json
    with open(scenes_file_path, "w") as scenes_file:
        json.dump(scenes, scenes_file)

    return scenes, audio_images_dir, audio_videos_dir, last_end_value, timestamp


def process_audio_images(config: dict, scenes, audio_images_dir):
    """Generate one start image per scene and save it as ``image_NN.jpg``.

    Args:
        config: workflow configuration (not read by this function; kept for
            a uniform signature with the other ``process_audio_*`` steps).
        scenes: iterable of scene dicts; each must provide ``scene_description``.
        audio_images_dir: directory the images are written to.

    The image prompt is the module-level ``THEME`` followed by the scene
    description, and the resolution comes from the module-level ``HEIGHT`` /
    ``WIDTH``. Files are numbered from 1 in scene order (zero-padded to two digits).
    """
    # Step 4: Load ZImage pipeline and generate images
    print("Load ZImage pipeline and generate images")
    pipe = load_models()
    resolution = (HEIGHT, WIDTH)
    negative_prompt = "worst quality, low quality, worst aesthetic, old, blurry, lowres, signature, artist name, watermark, username, sketch, logo, furry, text, speech bubble, symbols, words, letteres"

    try:
        # Generate images for each scene
        for image_num, scene in enumerate(scenes, start=1):
            image_prompt = THEME+". "+scene['scene_description']
            # -1 is passed as the seed; presumably generate_image picks a
            # random one in that case — TODO confirm. The returned seed is unused.
            image, _ = generate_image(pipe, image_prompt, negative_prompt, resolution, -1)
            filename = f"image_{str(image_num).zfill(2)}.jpg"
            image_path = os.path.join(audio_images_dir, filename)
            image.save(image_path, dpi=(300, 300))
    finally:
        # Drop the pipeline and reclaim memory even if a scene fails, so a
        # failed run does not keep the image model resident.
        del pipe
        reset_memory(device)
    return

def process_audio_video(config: dict, scenes, audio_images_dir, audio_videos_dir, last_end_value, timestamp, start_image: int = 0):
    """Load the video models and render one clip per scene from its start image.

    Args:
        config: workflow configuration (not read by this function).
        scenes: list of scene dicts; ``action_sequence`` is used as the video
            prompt and ``start`` to derive each scene's duration.
        audio_images_dir: directory holding ``image_NN.jpg`` (1-based, scene order).
        audio_videos_dir: directory the ``v_fpk_NN_<timestamp>.mp4`` files go to.
        last_end_value: end time used for the final scene.
        timestamp: run identifier embedded in the output file names.
        start_image: 1-based number of the first scene to render; scenes before
            it are skipped but keep their number, so file names stay aligned.
            Values of 0 or 1 render everything.

    The models are unloaded and memory is reset even when rendering fails.
    """
    # Step 6: Load Video Pipeline
    print("Load Video Pipeline")
    # check GPU memory & decide offload strategy
    free_mem_gb = get_cuda_free_memory_gb(gpu)
    high_vram = free_mem_gb > 60

    hunyuan_repo = "hunyuanvideo-community/HunyuanVideo"
    redux_repo = "lllyasviel/flux_redux_bfl"

    # load all models & tokenizers (on CPU first)
    text_encoder = LlamaModel.from_pretrained(
        hunyuan_repo, subfolder="text_encoder", torch_dtype=torch.float16).cpu()
    text_encoder_2 = CLIPTextModel.from_pretrained(
        hunyuan_repo, subfolder="text_encoder_2", torch_dtype=torch.float16).cpu()
    tokenizer = LlamaTokenizerFast.from_pretrained(hunyuan_repo, subfolder="tokenizer")
    tokenizer_2 = CLIPTokenizer.from_pretrained(hunyuan_repo, subfolder="tokenizer_2")
    vae = AutoencoderKLHunyuanVideo.from_pretrained(
        hunyuan_repo, subfolder="vae", torch_dtype=torch.float16).cpu()
    feature_extractor = SiglipImageProcessor.from_pretrained(
        redux_repo, subfolder="feature_extractor")
    image_encoder = SiglipVisionModel.from_pretrained(
        redux_repo, subfolder="image_encoder", torch_dtype=torch.float16).cpu()
    transformer = HunyuanVideoTransformer3DModelPacked.from_pretrained(
        "lllyasviel/FramePackI2V_HY", torch_dtype=torch.bfloat16).cpu()

    try:
        # set eval & dtypes
        for m in (text_encoder, text_encoder_2, image_encoder, vae, transformer):
            m.eval()
            m.requires_grad_(False)
        transformer.high_quality_fp32_output_for_inference = True
        transformer.to(dtype=torch.bfloat16)
        vae.to(dtype=torch.float16)
        image_encoder.to(dtype=torch.float16)
        text_encoder.to(dtype=torch.float16)
        text_encoder_2.to(dtype=torch.float16)
        if not high_vram:
            # Limited memory: slice/tile the VAE and install dynamic swapping
            # for the two largest models instead of moving them to the GPU.
            vae.enable_slicing()
            vae.enable_tiling()
            DynamicSwapInstaller.install_model(transformer, device=gpu)
            DynamicSwapInstaller.install_model(text_encoder, device=gpu)
        else:
            text_encoder.to(gpu)
            text_encoder_2.to(gpu)
            image_encoder.to(gpu)
            vae.to(gpu)
            transformer.to(gpu)

        # Step 7: Generate video sequences
        for i, scene in enumerate(scenes):
            prompt = scene["action_sequence"]

            # Use the initial image for each scene
            image_input = os.path.join(audio_images_dir, f"image_{str(i+1).zfill(2)}.jpg")

            # Scene length = gap until the next scene starts (or until
            # last_end_value for the final scene).
            # NOTE(review): no rounding to fixed-length sections happens here;
            # presumably synthesize_videos handles that — confirm.
            if i + 1 < len(scenes):
                next_start_time = scenes[i + 1]["start"]
            else:
                next_start_time = last_end_value  # Use the final ending time for the last scene

            if i == 0:
                # The first scene is treated as starting at 0.
                duration = next_start_time
            else:
                duration = next_start_time - scene["start"]

            if i >= start_image-1:
                # The video number is always the 1-based scene index, so a
                # resumed run writes the same file names as a full run.
                video_name = f"v_fpk_{str(i + 1).zfill(2)}_{timestamp}.mp4"
                video_output_path = os.path.join(audio_videos_dir, video_name)
                synthesize_videos(text_encoder, text_encoder_2, image_encoder, vae, transformer, tokenizer, tokenizer_2, feature_extractor, high_vram,
                    video_output_path, image_input, prompt, duration)
                time.sleep(1)  # Pause for 1 second
    finally:
        # 1) Offload / unload all models from GPU (also after a failure)
        unload_complete_models(
            text_encoder,
            text_encoder_2,
            image_encoder,
            vae,
            transformer
        )

        # 2) Drop the local references
        del text_encoder
        del text_encoder_2
        del tokenizer
        del tokenizer_2
        del vae
        del feature_extractor
        del image_encoder
        del transformer

        # 3) Force GC + CUDA clean-up
        reset_memory(gpu)

    return

def process_all_audios(audio_file, config: dict):
    """
    Runs scene planning and start-image generation for one audio file.

    Returns ``(config, scenes, audio_images_dir, audio_videos_dir,
    last_end_value, timestamp)``: the given ``config`` followed by the five
    values produced by ``process_audio_scenes``.
    """
    print(f"Processing audio file: {audio_file}")

    scene_outputs = process_audio_scenes(audio_file, config)
    scenes, images_dir, videos_dir, audio_end, run_stamp = scene_outputs

    scene_count = len(scenes)
    scenes_json = json.dumps(scenes, indent=4)
    print(f'{scene_count} scenes:\n{scenes_json}')
    print(f'last_end_value: {audio_end} timestamp: {run_stamp}')

    # Create starting images for scenes
    process_audio_images(config, scenes, images_dir)

    return config, scenes, images_dir, videos_dir, audio_end, run_stamp

def create_video(config, audio_file=None):
    """Run the full workflow (scenes, start images, videos) for one audio file.

    Args:
        config: workflow configuration passed on to every step.
        audio_file: path of the audio file to process. When omitted, the
            module-level ``audio_file`` is used, which keeps existing
            ``create_video(CONFIG)`` calls inside a
            ``for audio_file in ...`` loop working.

    Raises:
        NameError: if ``audio_file`` is neither passed nor defined at module level.
    """
    if audio_file is None:
        # Backward compatibility: earlier versions read the loop variable
        # `audio_file` from the notebook's global namespace.
        try:
            audio_file = globals()["audio_file"]
        except KeyError:
            raise NameError(
                "name 'audio_file' is not defined; call create_video(config, audio_file)"
            ) from None

    config, scenes, audio_images_dir, audio_videos_dir, last_end_value, timestamp = process_all_audios(audio_file, config)
    # start_image=0 renders every scene
    process_audio_video(config, scenes, audio_images_dir, audio_videos_dir, last_end_value, timestamp, 0)
    return

In [None]:
# run new systems
# Process every configured audio file end to end, resetting memory after each.
# NOTE: create_video(CONFIG) is called without the path, so it picks up the
# module-level `audio_file` bound by this loop — keep the loop variable's name.
for audio_file in CONFIG["audio_files"]:
    create_video(CONFIG)
    reset_memory(device)


`torch_dtype` is deprecated! Use `dtype` instead!


Processing audio file: /mnt/d/Share/Audio/StormSilencedHearts.mp3
Create unique directories for images and videos
Transcribe audio using Whisper


`torch_dtype` is deprecated! Use `dtype` instead!
Device set to use cuda:0
Using custom `forced_decoder_ids` from the (generation) config. This is deprecated in favor of the `task` and `language` flags/config options.
Transcription using a multilingual Whisper will default to language detection followed by transcription instead of translation to English. This might be a breaking change for your use case. If you want to instead always translate your audio to English, make sure to pass `language='en'`. See https://github.com/huggingface/transformers/pull/28687 for more details.


Generate video summary using OpenAI
Create scenes based on lyrics: 1
revised scenes
52 scenes:
[
    {
        "start": 0,
        "text": "Did you see how souls are torn, by despair their essence mourn?",
        "scene_description": "A hyper-detailed surreal CGI landscape of luminous, fracturing spirits dissolving into an infinite abyss, where iridescent tendrils of energy ripple through a void filled with shimmering, shattered mirrors and ghostly light. Ethereal figures draped in shimmering, torn garments evoke anguish amid swirling cosmic fractals and glowing fissures, illuminated by electric currents that pulse with raw emotion and spiritual chaos.",
        "action_sequence": "Silent, slow unfolding as shards of glowing light and fractured reflections drift and dissolve into darkness, emphasizing the fragile breakage of spirits."
    },
    {
        "start": 12.0,
        "text": "To the truth their paths forlorn, heartstrings by their own hands shorn",
        "scene_descriptio

Loading pipeline components...:   0%|          | 0/5 [00:00<?, ?it/s]

`torch_dtype` is deprecated! Use `dtype` instead!


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/9 [00:00<?, ?it/s]

  0%|          | 0/9 [00:00<?, ?it/s]

  0%|          | 0/9 [00:00<?, ?it/s]

  0%|          | 0/9 [00:00<?, ?it/s]

  0%|          | 0/9 [00:00<?, ?it/s]

  0%|          | 0/9 [00:00<?, ?it/s]

  0%|          | 0/9 [00:00<?, ?it/s]

  0%|          | 0/9 [00:00<?, ?it/s]

  0%|          | 0/9 [00:00<?, ?it/s]

  0%|          | 0/9 [00:00<?, ?it/s]

  0%|          | 0/9 [00:00<?, ?it/s]

  0%|          | 0/9 [00:00<?, ?it/s]

  0%|          | 0/9 [00:00<?, ?it/s]

  0%|          | 0/9 [00:00<?, ?it/s]

  0%|          | 0/9 [00:00<?, ?it/s]

  0%|          | 0/9 [00:00<?, ?it/s]

  0%|          | 0/9 [00:00<?, ?it/s]

  0%|          | 0/9 [00:00<?, ?it/s]

  0%|          | 0/9 [00:00<?, ?it/s]

  0%|          | 0/9 [00:00<?, ?it/s]

  0%|          | 0/9 [00:00<?, ?it/s]

  0%|          | 0/9 [00:00<?, ?it/s]

  0%|          | 0/9 [00:00<?, ?it/s]

  0%|          | 0/9 [00:00<?, ?it/s]

  0%|          | 0/9 [00:00<?, ?it/s]

  0%|          | 0/9 [00:00<?, ?it/s]

  0%|          | 0/9 [00:00<?, ?it/s]

  0%|          | 0/9 [00:00<?, ?it/s]

  0%|          | 0/9 [00:00<?, ?it/s]

  0%|          | 0/9 [00:00<?, ?it/s]

  0%|          | 0/9 [00:00<?, ?it/s]

  0%|          | 0/9 [00:00<?, ?it/s]

  0%|          | 0/9 [00:00<?, ?it/s]

  0%|          | 0/9 [00:00<?, ?it/s]

  0%|          | 0/9 [00:00<?, ?it/s]

  0%|          | 0/9 [00:00<?, ?it/s]

  0%|          | 0/9 [00:00<?, ?it/s]

  0%|          | 0/9 [00:00<?, ?it/s]

  0%|          | 0/9 [00:00<?, ?it/s]

  0%|          | 0/9 [00:00<?, ?it/s]

  0%|          | 0/9 [00:00<?, ?it/s]

  0%|          | 0/9 [00:00<?, ?it/s]

  0%|          | 0/9 [00:00<?, ?it/s]

  0%|          | 0/9 [00:00<?, ?it/s]

  0%|          | 0/9 [00:00<?, ?it/s]

  0%|          | 0/9 [00:00<?, ?it/s]

  0%|          | 0/9 [00:00<?, ?it/s]

  0%|          | 0/9 [00:00<?, ?it/s]

  0%|          | 0/9 [00:00<?, ?it/s]

  0%|          | 0/9 [00:00<?, ?it/s]

  0%|          | 0/9 [00:00<?, ?it/s]

  0%|          | 0/9 [00:00<?, ?it/s]

Load Video Pipeline


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Fetching 3 files:   0%|          | 0/3 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Unloaded DynamicSwap_LlamaModel as complete.
Unloaded CLIPTextModel as complete.
Unloaded SiglipVisionModel as complete.
Unloaded AutoencoderKLHunyuanVideo as complete.
Unloaded DynamicSwap_HunyuanVideoTransformer3DModelPacked as complete.
Loaded CLIPTextModel to cuda:0 as complete.
Unloaded CLIPTextModel as complete.
Loaded AutoencoderKLHunyuanVideo to cuda:0 as complete.
Unloaded AutoencoderKLHunyuanVideo as complete.
Loaded SiglipVisionModel to cuda:0 as complete.
Section pad=3, is_last=False
Unloaded SiglipVisionModel as complete.
Moving DynamicSwap_HunyuanVideoTransformer3DModelPacked to cuda:0 with preserved memory: 6 GB


  0%|          | 0/32 [00:00<?, ?it/s]

## Run Previous Generation

In [None]:
# run saved config: resume video generation for an earlier run from its scenes.json
title = 'TruthAintForSale'
timestamp = '20250920_151618'
start_image = 18          # 1-based number of the first scene to render
last_end_value = 425.18   # end time used for the final scene

audio_images_dir = f'./images/{title}_{timestamp}'
audio_videos_dir = f'./output/{title}_{timestamp}'
scenes_file_path = f'{audio_images_dir}/scenes.json'

with open(scenes_file_path) as scenes_file:
    scenes = json.load(scenes_file)

process_audio_video(
    CONFIG,
    scenes,
    audio_images_dir,
    audio_videos_dir,
    last_end_value,
    timestamp,
    start_image,
)