In [6]:
import torch
from PIL import Image
from pyramid_dit import PyramidDiTForVideoGeneration
from diffusers.utils import load_image, export_to_video
import time
import os
import json
import random
import numpy as np

In [7]:

def setSeeds(seed):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random`` module, NumPy's legacy global RNG and PyTorch
    (CPU and CUDA), and switches cuDNN to its deterministic mode.

    Args:
        seed: Integer seed applied to all generators.
    """
    random.seed(seed)
    # Only child processes pick this up: the running interpreter read
    # PYTHONHASHSEED at start-up, so its own hashing is not affected.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)  # covers multi-GPU setups
    # Give up cuDNN auto-tuning in exchange for run-to-run reproducibility.
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

In [2]:
# Directory holding the downloaded model checkpoint.
PATH = './local_model'

# Root folder under which each run's description file and videos are stored.
OUTPUT_BASE_PATH = os.path.join('..', '..', 'tests')

device = "cuda:2"
# Let torch parse the device string itself: taking only the last character
# (int(device[-1])) silently selects the wrong GPU for indices >= 10.
torch.cuda.set_device(torch.device(device))

In [3]:

# Precision used for the checkpoint: bfloat16 (fp16 is not supported yet).
model_dtype = 'bf16'
torch_dtype = torch.bfloat16

model = PyramidDiTForVideoGeneration(
    PATH,                                        # the downloaded checkpoint dir
    model_dtype,
    model_variant='diffusion_transformer_384p',  # SD3 supports 'diffusion_transformer_768p'
    model_name="pyramid_flux",
)

model.vae.enable_tiling()

# Place every sub-module on the selected GPU.  To use sequential CPU
# offloading instead, remove the three .to(device) calls below and enable:
# model.enable_sequential_cpu_offload()
model.vae.to(device)
model.dit.to(device)
model.text_encoder.to(device)

Using temporal causal attention


You set `add_prefix_space`. The tokenizer needs to be converted from the slow tokenizers
Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.78it/s]
An error occurred while trying to fetch ./local_model/causal_video_vae: Error no file named diffusion_pytorch_model.safetensors found in directory ./local_model/causal_video_vae.
Defaulting to unsafe serialization. Pass `allow_pickle=False` to raise an error instead.
  return self.fget.__get__(instance, owner)()


The latent dimmension channes is 16
The start sigmas and end sigmas of each stage is Start: {0: 1.0, 1: 0.8002399489209289, 2: 0.5007496155411024}, End: {0: 0.6669999957084656, 1: 0.33399999141693115, 2: 0.0}, Ori_start: {0: 1.0, 1: 0.6669999957084656, 2: 0.33399999141693115}


FluxTextEncoderWithMask(
  (text_encoder): CLIPTextModel(
    (text_model): CLIPTextTransformer(
      (embeddings): CLIPTextEmbeddings(
        (token_embedding): Embedding(49408, 768)
        (position_embedding): Embedding(77, 768)
      )
      (encoder): CLIPEncoder(
        (layers): ModuleList(
          (0-11): 12 x CLIPEncoderLayer(
            (self_attn): CLIPAttention(
              (k_proj): Linear(in_features=768, out_features=768, bias=True)
              (v_proj): Linear(in_features=768, out_features=768, bias=True)
              (q_proj): Linear(in_features=768, out_features=768, bias=True)
              (out_proj): Linear(in_features=768, out_features=768, bias=True)
            )
            (layer_norm1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
            (mlp): CLIPMLP(
              (activation_fn): QuickGELUActivation()
              (fc1): Linear(in_features=768, out_features=3072, bias=True)
              (fc2): Linear(in_features=3072, out_featu

## older code

In [6]:
# First attempt: wrap the raw artwork description in a short instruction and
# use the result directly as the text-to-video prompt.
# NOTE(review): "Dalar¨o" looks like a mis-encoded "Dalarö"; it is left
# untouched because this text is passed to the model verbatim.
art_prompt = "This is a sombre image of violent forces of nature where the expressive feel is intensified by the fierceness of Strindberg’s spatula work. He found the scene depicted at Dalar¨o in the Stockholm skerries in the summer of 1892. Strindberg worked very intuitively, translating his mental state into images with brief, fierce bursts of activity. That is also why he chose small-scale formats; in fact this painting, which Strindberg called The Flying Dutchman as a reference to Richard Wagner’s lonely, ever roaming captain, is one of his largest"

# Prompt consumed by the generation cells.
prompt = f"a video derived from the following artwork description '{art_prompt}'"

# Prefix of the exported video file names.
video_out_base_name = "output_art"

In [4]:
# Art prompt rewritten with ChatGPT; the request that was sent is quoted below.

# prompt: i would like to use a text-to-video ai to generate some artistic videos from the descriptions of some paintings. However, using directly the painting description or simply pretending it to be a brief introduction to create a video doesn't seem to be that suitable for this task leading to poor results. So I now give you the original description and I would like you to create one that preserves the meaning but adapts it to the video generation task, creating an artistic video as a result that is rather high pace and dynamic. here is the original description "This is a sombre image of violent forces of nature where the expressive feel is intensified by the fierceness of Strindberg’s spatula work. He found the scene depicted at Dalar¨o in the Stockholm skerries in the summer of 1892. Strindberg worked very intuitively, translating his mental state into images with brief, fierce bursts of activity. That is also why he chose small-scale formats; in fact this painting, which Strindberg called The Flying Dutchman as a reference to Richard Wagner’s lonely, ever roaming captain, is one of his largest"

# Video-oriented rewrite of the Strindberg description, used as the prompt
# by the generation cells.
prompt = "A fierce, storm-lashed seascape unfolds, with towering waves crashing and dark clouds swirling in violent motion. The sky churns in chaotic strokes, capturing the raw power of nature as it collides with an unseen force. Brushstrokes and spatula marks give the scene a visceral intensity, as though painted with a storm's own fury. Quick bursts of flashing light illuminate a lone, spectral ship—a ghostly figure, ever-drifting, embodying the spirit of Wagner’s haunted captain, cursed to roam through darkened waters. In this relentless motion, the scene becomes a tempestuous expression of inner turmoil, echoing the artist’s own struggles. Dark, fleeting glimpses of sky and sea reflect a soul locked in eternal, furious pursuit. A storm that never ends."

# Prefix of the exported video file names.
video_out_base_name = "output_art_cgpt"

In [7]:
# Strindberg description rewritten for video generation with Llama 3
# (first variant).

prompt = "Create a sombre and intense video depicting a turbulent scene of forces of nature, reminiscent of a violent summer storm at Dalar¨o in the Stockholm skerries. Incorporate expressive brushstrokes and vivid colors to convey the emotional turmoil, characteristic of Strindberg's spatula work. Include brief, dynamic bursts of energy, evoking the artist's intuitive and frenzied creative process. Set against a dramatic, isolated backdrop, the scene should evoke the sense of isolation and loneliness, inspired by Richard Wagner's 'Flying Dutchman' legend. Use a mix of fast-paced camera movements and close-ups to capture the raw energy and emotion of the painting."

# Prefix of the exported video file names.
video_out_base_name = "output_art_llama3"

In [9]:
# Strindberg description rewritten for video generation with Llama 3
# (second variant).

prompt = "Create a somber and intense video depicting violent forces of nature, reminiscent of Strindberg's expressive and spontaneous style. Incorporate fierce, bold brushstrokes and vivid colors to convey the energy and turmoil of the scene. Use a mix of fast-paced cuts and slow-motion sequences to capture the dynamic movement of the natural elements. Include a small-scale format, with a focus on bold, expressive lines and vivid textures. Reference the 1892 painting 'The Flying Dutchman' by Strindberg, with a nod to Richard Wagner's iconic character. The tone should evoke a sense of eeriness and foreboding, with a focus on capturing the raw emotion and intensity of the scene."

# Prefix of the exported video file names.
video_out_base_name =  "output_art_llama3_2"

In [13]:
# Strindberg description rewritten for video generation with Llama 3
# (third variant).

prompt = "Create a somber, immersive video showcasing turbulent natural forces, with intense, expressive brushstrokes reminiscent of Strindberg's spatula technique. Depict a dramatic scene set in the Stockholm skerries during summer 1892, inspired by Strindberg's own mental state and translated into dynamic, brief movements. Incorporate a sense of foreboding, using muted colors and bold contrasts to evoke a feeling of unease. Incorporate subtle, suggestive elements to evoke the mythological figure of the Flying Dutchman, as seen in Richard Wagner's work, and explore the contrast between the natural world and the human experience."

# Prefix of the exported video file names.
video_out_base_name =  "output_art_llama3_3"

In [4]:
# Llama 3 rewrite for a different artwork: a painting of St. Ladislas.

prompt = "Create a dynamic video inspired by a 16th-century painting of St. Ladislas, depicting him seated on a throne with an embroidered cloak, wearing knightly armor and holding a battle-axe. The throne is set against a Renaissance palace background with two landscapes on either side: a monochrome topographic reference to the foundation of the Nagyvárad Cathedral on the left, and a legendary defeat scene on the right. Incorporate elements of the medieval legends surrounding St. Ladislas, such as the national emblem on a shield, while maintaining a focus on his historical persona without a halo. Use a mix of medieval and Renaissance styles to evoke the spirit of the original painting, and include subtle animations and transitions to enhance the visual narrative. The video should convey the unity of the Christian king and the Christian knight, as well as St. Ladislas' connection to the world of power and his role as a historical figure."

# Prefix of the exported video file names.
video_out_base_name =  "output_art_llama3_4"

In [11]:

# Prefix shared by every video exported from this cell.
video_out_base_name = "output_art_llama3_11"

# One video-oriented description per painting; each is rendered in turn.
prompts = [
    "A dynamic scene unfolds as Ludovico, resplendent in ornate cardinal attire, turns his head to directly engage the viewer. His eyes sparkle with intensity as he gazes straight ahead, his eyelids slightly raised in a subtle, knowing smile. The ornate ring on his left hand glints in the light, drawing attention to his newly acquired status. Ludovico's right hand, now clenched into a firm fist, grasps the chair armrests, while his left hand now holds a parchment letter, its edges fluttering as if about to take flight. The atmosphere is charged with anticipation, as if Ludovico is about to reveal a long-held secret. The soft, golden light illuminates his face, casting a warm glow on the intricate details of his attire, and the subtle shadows accentuate the contours of his features. The background, a muted, richly textured wood panel, provides a sophisticated backdrop for Ludovico's dynamic pose, evoking a sense of opulence and refinement.",
    "A serene paradise landscape unfolds in the distance, with lush greenery and towering trees, as the figure of Adam, dressed in 16th-century attire, reaches out to receive the hand of Eve, who stands beside a blooming tree. In the centre of the scene, the dramatic moment of the Fall of Man unfolds, with Adam and Eve standing at the edge of a cliff, as they succumb to the forbidden fruit. The air is filled with swirling, ethereal lights, and the sky above is ablaze with vibrant hues of orange and pink. In the background, the distant figures of God and the serpent are seen, with God's hand extended in a gesture of creation, while the serpent coils around the tree of knowledge, its eyes fixed intently on the couple. The entire scene is bathed in a warm, golden light, as if the very essence of the painting has been brought to life.",
    "Père Tanguy, dressed in traditional Breton peasant attire, stands at the center of a vibrant, autumnal landscape, set against a backdrop of intricately designed Japanese seasonal scenes and figures. He faces the viewer with an air of quiet dignity, his eyes cast downward in contemplation. To his left, a young woman in a traditional Breton dress and a man in a rustic Japanese kimono stroll through the scene, their movements fluid and natural. The woman holds a basket of fruit, while the man carries a large, ornate fan. Tanguy's massive frame and imposing presence are balanced by the whimsical, fantastical elements of the background, creating a sense of dynamic tension and harmony. The overall effect is one of enchanting, dreamlike beauty, as if the boundaries between reality and fantasy have been blurred. As the camera pans out, the viewer is drawn into the heart of the scene, surrounded by the vibrant colors and intricate details of the Japanese-inspired setting.",
    "A dark, ornate throne room is set against a deep crimson backdrop. Herod, a portly figure in a lavish purple robe, sits regally on the throne, his eyes fixed intently on Salome, a statuesque young woman in a flowing white gown, as she dances seductively around him. The room is filled with the opulent splendor of the ancient world, with golden candelabras, ornate tapestries, and a majestic stone fountain in the background. As Salome's dance reaches its climax, she raises her arms to Herod, and in a shocking twist, she holds aloft the severed head of Saint John the Baptist on a gleaming silver platter, which is balanced on a charger. The head, rendered in exquisite detail, is positioned so that it appears to be gazing directly at the viewer, adding to the sense of horror and unease. As the scene unfolds, Salome's mother, a shadowy figure in the background, watches with an air of calculated calculation, while Herod's expression shifts from fascination to disgust, his face contorted in a mixture of horror and desire.",
    "The Virgin sits in a serene, golden light, gently cradling the Child on her lap. Her hands are clasped around the Child, with delicate fingers intertwined. The Child's face is tilted upwards, looking towards the viewer with a gentle smile. Behind the Virgin, the four figures are arranged in a semi-circle, their faces turned towards her in contemplative poses. The background is a deep, bold red, with intricate black curving tendrils printed across it. The Virgin's haloes glow softly, casting a halo of light around her and the Child. The four figures are dressed in flowing, earth-toned robes, their faces and hands subtly animated as they reach out to the Virgin and Child. The overall mood is one of peaceful reverie, with the figures and the Virgin and Child frozen in time, as if in a moment of quiet devotion.",
    "A darkened, misty valley with a lone figure of David, dressed in simple, earth-toned clothing, standing confidently in front of a giant stone pedestal with Goliath's massive, grotesque head perched atop it. Goliath's head is rendered in vivid, eerie detail, with a menacing gaze and outstretched tongue. David raises his hand, holding a gleaming, ornate sling, as he takes a few swift steps forward. With a swift motion, he releases the sling, propelling a small, golden-tipped stone towards Goliath's head. The stone hurtles through the air, leaving a glowing trail behind it, and strikes Goliath's head with a burst of radiant light. The giant's head shatters into a thousand pieces, sending shards of stone and debris flying in all directions. David stands tall, his chest heaving with exertion, as the valley erupts in a kaleidoscope of colors and patterns, symbolizing the dawn of his triumph.",
    "A serene landscape with a stone bridge stretches across the screen, but instead of being static, it begins to crumble and collapse, with rocks tumbling down and the bridge's wooden planks splintering apart. In the distance, a windmill spins rapidly, its sails creaking and groaning as it generates a powerful gust of wind that sends debris flying through the air. Meanwhile, a group of cottages, reminiscent of the etched versions by Rembrandt, stand in the foreground, their thatched roofs and chimney pots swaying violently as they're buffeted by the wind. The scene is a dynamic, chaotic mess, with trees uprooted and branches snapping in the wind, all set against a hazy, dreamlike sky.",
    "A fiery, golden dragon bursts through the stone walls of a medieval prison, its scales glistening in the dim light. St Margaret of Antioch, dressed in 16th-century attire, stands defiantly with a large cross in hand, her eyes ablaze with determination. As she charges forward, her cross slices through the dragon's scales, revealing a cavernous interior. The dragon's jaws part, and St Margaret seizes the opportunity, plunging her cross deep into the beast's belly. The dragon's body begins to convulse, and its scales shatter, releasing a wave of flames that St Margaret narrowly avoids. As the flames die down, St Margaret stands victorious, her cross still lodged in the dragon's chest. The background dissolves into a kaleidoscope of colors, with the prison walls crumbling, and St Margaret emerging, unscathed, into a bright, sunlit landscape. In the distance, the silhouette of a beheading block looms, foreshadowing her ultimate fate.",
    "Margaret of Austria, dressed in a flowing white hood, stands in a grand, dimly lit room. Her hood, adorned with intricate lace and subtle gold embroidery, falls across her face, partially concealing her features. The Netherlandish style is evident in the ornate, baroque furnishings and the delicate, curved lines of the wooden paneling. As she raises her hands, the white hood billows outward, revealing a glimpse of her elegant, Renaissance-style gown beneath. Her eyes, cast downward, sparkle with a hint of introspection. In a sudden movement, Margaret's hands burst forth from her sleeves, releasing a flurry of white rose petals that dance in the air, as if carried by an invisible breeze. The petals swirl and twirl, eventually settling gently on the floor, where they form a delicate, lace-like pattern.",
    "A woman, posed in a flowing, abstracted nude, stands with her back to the viewer, her left arm extended and her right hand holding a delicate, antique black stone vessel. Her posture conveys a sense of relaxed elegance, as if lost in thought. A flowing, draped cape billows from her shoulder, its folds rippling and swirling around her. Her gestures exude sensuality, as she gently tilts her head, her long, dark hair cascading down her back. Her body is bathed in soft, golden light, casting a warm glow across the surrounding environment. In the background, subtle, muted colors evoke a sense of cool, classical grandeur, reminiscent of Ingres's style.",
]

# Frame rate of the exported files; loop-invariant, so set once up front.
fps = 24

for prompt in prompts:
    # torch.autocast("cuda", ...) is the non-deprecated spelling of
    # torch.cuda.amp.autocast(...); behaviour is unchanged.
    with torch.no_grad(), torch.autocast("cuda", enabled=True, dtype=torch_dtype):
        frames = model.generate(
            prompt=prompt,
            num_inference_steps=[20, 20, 20],
            video_num_inference_steps=[10, 10, 10],
            height=384,
            width=640,
            temp=16,                    # temp=16: 5s, temp=31: 10s
            guidance_scale=7.0,         # The guidance for the first frame, set it to 7 for 384p variant
            video_guidance_scale=5.0,   # The guidance for the other video latent
            output_type="pil",
            save_memory=True,           # If you have enough GPU memory, set it to `False` to improve vae decoding speed
        )

    # The timestamp keeps the files of successive prompts from overwriting each other.
    export_to_video(frames, f"{video_out_base_name}_{fps}_{int(time.time())}.mp4", fps=fps)

100%|██████████| 16/16 [01:03<00:00,  3.94s/it]
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
100%|██████████| 16/16 [01:03<00:00,  3.94s/it]
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
100%|██████████| 16/16 [01:03<00:00,  3.95s/it]
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
 50%|██

KeyboardInterrupt: 

## new code

In [None]:
# Sub-folder of OUTPUT_BASE_PATH that holds this run's description file.
DIR_NAME = 'SMZG7P'
setSeeds(42)

output_path = os.path.join(OUTPUT_BASE_PATH, DIR_NAME)

# Not used by this cell; kept because the image-conditioned cells read it
# when naming their output files.
video_out_base_name = "output_art_llama3_11"

video_out_path = os.path.join(output_path, 'videos')
os.makedirs(video_out_path, exist_ok=True)

# Explicit UTF-8 so the read does not depend on the platform's locale.
with open(os.path.join(output_path, 'video_desc.json'), encoding='utf-8') as f:
    data = json.load(f)

# Frame rate of the exported files; loop-invariant, so set once up front.
fps = 24

for el in data['data']:
    prompt = el['generated_video_desc']
    # Only plain-text descriptions can serve as prompts.  Entries may hold
    # None or a structured dict instead of a string; skip those rather than
    # handing a non-string to the model.
    if not isinstance(prompt, str):
        print(f"Skipping {el.get('painting_file')!r}: description is {type(prompt).__name__}, not text")
        continue

    # torch.autocast("cuda", ...) is the non-deprecated spelling of
    # torch.cuda.amp.autocast(...); behaviour is unchanged.
    with torch.no_grad(), torch.autocast("cuda", enabled=True, dtype=torch_dtype):
        frames = model.generate(
            prompt=prompt,
            num_inference_steps=[20, 20, 20],
            video_num_inference_steps=[10, 10, 10],
            height=384,
            width=640,
            temp=16,                    # temp=16: 5s, temp=31: 10s
            guidance_scale=7.0,         # The guidance for the first frame, set it to 7 for 384p variant
            video_guidance_scale=5.0,   # The guidance for the other video latent
            output_type="pil",
            save_memory=False,          # `False` improves vae decoding speed when GPU memory allows;
                                        # author's note: it doesn't seem to make much difference
        )

    # splitext drops only the final extension and, unlike splitting on '.',
    # does not produce an empty stem for names without an extension.
    video_stem = os.path.splitext(el['painting_file'])[0]
    export_to_video(frames, os.path.join(video_out_path, f"{video_stem}_{fps}_{int(time.time())}.mp4"), fps=fps)

100%|██████████| 16/16 [00:45<00:00,  2.87s/it]
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
100%|██████████| 16/16 [00:46<00:00,  2.89s/it]
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
100%|██████████| 16/16 [00:46<00:00,  2.89s/it]
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
100%|██

In [9]:
# Debug inspection of the description held by `el`, the loop variable left
# over from the generation loop.
# NOTE(review): the recorded output is a nested dict rather than prompt text —
# presumably the entry the loop stopped on; confirm before relying on it.
el['generated_video_desc']

{'setting': {'architecture': {'type': 'temple', 'style': 'closed'},
  'environment': {'atmosphere': 'serene'}},
 'characters': [{'name': 'Joachim',
   'age': 'elderly',
   'action': ['approaching the temple',
    'looking at the priest',
    'taking a step back',
    'looking down',
    'turning away'],
   'movement': ['dual movement', 'slow and deliberate']},
  {'name': 'priest',
   'action': ['standing at the entrance',
    'gesturing to Joachim',
    'turning to Joachim',
    'looking at Joachim with a serious expression'],
   'movement': ['static', 'firm and authoritative']},
  {'name': 'sheep',
   'action': ['standing next to Joachim',
    'looking up at Joachim',
    'taking a step forward',
    "nuzzling Joachim's hand"],
   'movement': ['slow and gentle', 'tender']}],
 'camera_movement': [{'type': 'static',
   'location': 'high angle',
   'duration': '2 seconds'},
  {'type': 'dolly', 'location': 'medium shot', 'duration': '3 seconds'},
  {'type': 'pan', 'location': 'wide shot',

In [7]:
# Load the conditioning image.  The context manager closes the file handle as
# soon as the RGB copy exists instead of leaving it open until garbage
# collection.  PIL's resize takes (width, height).
with Image.open('skerries.jpg') as source_image:
    image = source_image.convert("RGB").resize((640, 384))

# NOTE(review): `prompt` and `video_out_base_name` are not set in this cell;
# they come from whichever prompt cell was executed last.
# torch.autocast("cuda", ...) is the non-deprecated spelling of
# torch.cuda.amp.autocast(...); behaviour is unchanged.
with torch.no_grad(), torch.autocast("cuda", enabled=True, dtype=torch_dtype):
    frames = model.generate_i2v(
        prompt=prompt,
        input_image=image,
        num_inference_steps=[10, 10, 10],
        temp=16,
        video_guidance_scale=4.0,
        output_type="pil",
        save_memory=True,           # If you have enough GPU memory, set it to `False` to improve vae decoding speed
    )

fps = 24

# Kept as the last expression so the cell displays the written file path.
export_to_video(frames, f"{video_out_base_name}_{fps}_{int(time.time())}_image_conditioned.mp4", fps=fps)

100%|██████████| 15/15 [01:00<00:00,  4.04s/it]
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


'output_art_24_1730988950_image_conditioned.mp4'

In [11]:
# Raw description for a second artwork (Agasse), wrapped in a short
# instruction and used directly as the prompt.
art_prompt = "Agasse was born in Switzerland, but he lived in London for the last 50 years of his life. His life in London was marked by a great success as an animal painter, and his patrons included many grand landed families. He made something of a speciality in painting exotic animals, and was a frequent visitor to the menageries which were such a feature of London Life at this date"

# Prompt consumed by the generation cells.
prompt = f"a video derived from the following artwork description '{art_prompt}'"

# Prefix of the exported video file names.
video_out_base_name = "output_art_2"

In [14]:

# Load the conditioning image.  The context manager closes the file handle as
# soon as the RGB copy exists instead of leaving it open until garbage
# collection.  PIL's resize takes (width, height).
with Image.open('00136-terriers.jpg') as source_image:
    image = source_image.convert("RGB").resize((640, 384))

# NOTE(review): `prompt` and `video_out_base_name` are not set in this cell;
# they come from whichever prompt cell was executed last.
# torch.autocast("cuda", ...) is the non-deprecated spelling of
# torch.cuda.amp.autocast(...); behaviour is unchanged.
with torch.no_grad(), torch.autocast("cuda", enabled=True, dtype=torch_dtype):
    frames = model.generate_i2v(
        prompt=prompt,
        input_image=image,
        num_inference_steps=[10, 10, 10],
        temp=16,
        video_guidance_scale=4.0,
        output_type="pil",
        save_memory=True,           # If you have enough GPU memory, set it to `False` to improve vae decoding speed
    )

fps = 24

# Kept as the last expression so the cell displays the written file path.
export_to_video(frames, f"{video_out_base_name}_{fps}_{int(time.time())}_image_conditioned.mp4", fps=fps)

100%|██████████| 15/15 [01:04<00:00,  4.31s/it]
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


'output_art_2_24_1730991653_image_conditioned.mp4'