# Exploring the latent space of stable diffusion

What's interesting is that you can generate an image from any point in latent space.

In [None]:
#@markdown # First, let's install all the required modules.

!pip install -q diffusers transformers accelerate
!pip install -q numpy scipy ftfy Pillow moviepy

In [None]:
#@markdown # Import modules

import torch
import numpy as np
import os

import time

from moviepy.editor import ImageSequenceClip
from PIL import Image
from IPython import display as IPdisplay
from tqdm.auto import tqdm

from diffusers import StableDiffusionPipeline
from diffusers import (
    DDIMScheduler,
    PNDMScheduler,
    LMSDiscreteScheduler,
    DPMSolverMultistepScheduler,
    EulerAncestralDiscreteScheduler,
    EulerDiscreteScheduler,
    AutoencoderKL,
)
from transformers import logging

from google.colab import files
from google.colab import output
import io

from torchvision import transforms

from ipywidgets import Button
import shutil

logging.set_verbosity_error()

# Report whether PyTorch can see a CUDA-capable GPU.

print(torch.cuda.is_available())

# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Performance switches for CUDA GPUs: allow TF32 matrix multiplications and enable cuDNN's benchmark mode.
# They matter most for mixed precision training or inference, where they can help with speed and memory usage.
# Source: https://huggingface.co/docs/diffusers/optimization/fp16#memory-efficient-attention

torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.benchmark = True

In [None]:
#@markdown # Model

#@markdown The [`stabilityai/stable-diffusion-2-1-base`](https://huggingface.co/stabilityai/stable-diffusion-2-1-base) model and the `EulerDiscreteScheduler` scheduler were chosen to generate images. Despite being an older technology, Stable Diffusion continues to enjoy popularity due to its fast performance, minimal memory requirements, and the availability of numerous community fine-tuned models built on top of it. However, you are free to experiment with other models and schedulers to compare the results.


# Hugging Face Hub id of the checkpoint; the scheduler, pipeline and VAE below are all loaded from it.
model_name_or_path = "stabilityai/stable-diffusion-2-1-base"

# Build the sampler from the configuration stored in the checkpoint's "scheduler" subfolder.
scheduler = EulerDiscreteScheduler.from_pretrained(
    model_name_or_path,
    subfolder="scheduler",
)

# Load the text-to-image pipeline in float32 with that scheduler, then move it to the selected device.
pipe = StableDiffusionPipeline.from_pretrained(
    model_name_or_path, scheduler=scheduler, torch_dtype=torch.float32
)
pipe = pipe.to(device)

# The generation loop shows its own progress bar, so silence the pipeline's per-image one.
pipe.set_progress_bar_config(disable=True)

# Separate float16 copy of the checkpoint's VAE, used by the latent <-> image helper functions.
vae = AutoencoderKL.from_pretrained(
    model_name_or_path, subfolder="vae", torch_dtype=torch.float16
)
vae = vae.to(device)

In [None]:
#@markdown # Reduce the memory consumed by the GPU.

#@markdown More detailed information can be found here: https://huggingface.co/docs/diffusers/en/optimization/opt_overview
#@markdown In particular, information about the following methods can be found here: https://huggingface.co/docs/diffusers/optimization/memory


# Offload the weights to the CPU and load them onto the GPU only when needed; per the memory guide linked above, this can reduce memory consumption to less than 3GB.
pipe.enable_model_cpu_offload()

# Keep the U-Net in channels-last memory format (a tighter ordering of tensor memory).
pipe.unet.to(memory_format=torch.channels_last)

# Decode batches of latents one image at a time; useful with limited VRAM or with batches of 32 images or more.
pipe.enable_vae_slicing()

# Split the image into overlapping tiles, decode the tiles, and then blend the outputs together to compose the final image.
pipe.enable_vae_tiling()

# Using Flash Attention; If you have PyTorch >= 2.0 installed, you should not expect a speed-up for inference when enabling xformers.
# pipe.enable_xformers_memory_efficient_attention()


In [None]:
#@markdown # Define functions

# The path where the generated GIFs will be saved
save_path = "/content/output"

# exist_ok=True creates the directory only when it is missing, without a separate
# existence check, so the cell can be re-run safely.
os.makedirs(save_path, exist_ok=True)

# Saves the images and a GIF
def save_images(images, output_path):
    """Save generated frames as PNG files, an MP4 video and an animated GIF.

    A sub-directory named after the current local time (``HH-MM-SS``) is created
    inside ``output_path``. It receives one ``NNN.png`` per frame, ``video.mp4``
    (10 fps) and ``preview.gif`` (100 ms per frame, looping forever).

    Args:
        images: Sequence of frames. Each entry must be indexable, and its first
            element must be convertible to a ``uint8`` array by ``np.array``
            (for example a PIL image).
        output_path: Directory in which the time-stamped sub-directory is created.

    Returns:
        The path of the sub-directory that holds the saved files.

    Raises:
        ValueError: If ``images`` is empty.
    """
    images = list(images)
    if not images:
        # Fail early with a clear message instead of creating an empty directory
        # and crashing later while building the video or the GIF.
        raise ValueError("save_images() requires at least one image.")

    # Hyphens are used instead of colons so the directory name is valid on file
    # systems that do not allow colons.
    dirname = time.strftime("%H-%M-%S", time.localtime())
    run_dir = os.path.join(output_path, dirname)
    os.makedirs(run_dir, exist_ok=True)

    # Write every frame to disk and keep the PIL objects for the GIF.
    frames = []
    image_files = []
    for i, image in enumerate(images):
        image_file = os.path.join(run_dir, f"{i:03d}.png")
        frame = Image.fromarray(np.array(image[0], dtype=np.uint8))
        frame.save(image_file)
        image_files.append(image_file)
        frames.append(frame)

    # The video is assembled from the PNG files that were just written.
    clip = ImageSequenceClip(image_files, fps=10)
    clip.write_videofile(os.path.join(run_dir, "video.mp4"))

    # The first frame is saved as the GIF and the remaining frames are appended to it.
    frames[0].save(
        os.path.join(run_dir, "preview.gif"),
        save_all=True,
        append_images=frames[1:],
        duration=100,
        loop=0,
    )

    return run_dir

# Displays the GIF saved in a path
def display_gif(path):
    """Show the GIF preview stored in ``path`` together with a download button.

    Args:
        path: Directory containing ``preview.gif``.

    The GIF and a "Download Images" button are rendered in the cell output.
    Clicking the button zips the directory to ``<path>.zip`` and passes that
    archive to ``files.download``. The function itself returns nothing.
    """
    preview = IPdisplay.Image(f"{path}/preview.gif")

    def zip_and_download(_clicked):
        # Archive the whole output directory next to itself, then offer it for download.
        shutil.make_archive(path, 'zip', path)
        archive = path + '.zip'
        files.download(archive)

    download_button = Button(description="Download Images")
    download_button.on_click(zip_and_download)

    display(preview, download_button)


# based on https://towardsdatascience.com/stable-diffusion-using-hugging-face-501d8dbdd8
def pil_to_latents(image):
    '''
    Encode an image into scaled VAE latents.

    Args:
        image: Image accepted by ``transforms.ToTensor`` (e.g. a PIL image).

    Returns:
        A sample from the latent distribution produced by the module-level
        ``vae``, multiplied by 0.18215.
    '''
    # Convert to a tensor, add a leading batch dimension and rescale with
    # x * 2 - 1 (which maps the range [0, 1] to [-1, 1]).
    init_image = transforms.ToTensor()(image).unsqueeze(0) * 2.0 - 1.0
    # The module-level `vae` is loaded in float16, so the input is cast to match.
    init_image = init_image.to(device=device, dtype=torch.float16)
    # Inference only: skip autograd bookkeeping, as latents_to_images does when decoding.
    with torch.no_grad():
        # NOTE(review): 0.18215 is presumably the VAE latent scaling factor
        # (the inverse is applied in latents_to_images) - confirm against the VAE config.
        init_latents = vae.encode(init_image).latent_dist.sample() * 0.18215
    return init_latents

def latents_to_images(latents):
    '''
    Decode latents with the module-level ``vae`` into a uint8 numpy array.

    The latents are multiplied by 1 / 0.18215 before decoding. The decoded
    tensor is mapped with x / 2 + 0.5, clamped to [0, 1], permuted from
    (0, 1, 2, 3) to (0, 2, 3, 1), scaled by 255, rounded and cast to uint8.
    '''
    scaled_latents = (1 / 0.18215) * latents

    # Decoding is inference only, so no gradients are tracked.
    with torch.no_grad():
        decoded = vae.decode(scaled_latents).sample

    unit_range = (decoded / 2 + 0.5).clamp(0, 1)
    as_numpy = unit_range.detach().cpu().permute(0, 2, 3, 1).numpy()
    return (as_numpy * 255).round().astype("uint8")

def latents_to_pil(latents):
    '''
    Decode latents into a list of PIL images, one per entry of the batch.

    Args:
        latents: Latents accepted by ``latents_to_images``.

    Returns:
        A list of ``PIL.Image`` objects.
    '''
    images = latents_to_images(latents)
    # Iterating over the decoded array yields one complete image per step, so
    # each item is converted as a whole. Indexing it with [0] would keep only
    # its first row of pixels.
    pil_images = [Image.fromarray(image) for image in images]
    return pil_images

# The function presented below stands for Spherical Linear Interpolation. It is a method
# of interpolation on the surface of a sphere. This function is commonly used in computer
# graphics to animate rotations in a smooth manner and can also be used to interpolate
# between high-dimensional data points in machine learning, such as latent vectors used
# in generative models.

# The source is from Andrej Karpathy's gist: https://gist.github.com/karpathy/00103b0037c5aaea32fe1da1af553355.
# A more detailed explanation of this method can be found at: https://en.wikipedia.org/wiki/Slerp.

def slerp(v0, v1, num, t0=0, t1=1, dot_threshold=0.9995):
    """Spherically interpolate between two tensors.

    Args:
        v0: Start tensor (interpolation parameter 0).
        v1: End tensor (interpolation parameter 1), same shape as ``v0``.
        num: Number of interpolation steps to produce.
        t0: First value of the interpolation parameter.
        t1: Last value of the interpolation parameter (inclusive).
        dot_threshold: When the absolute cosine between ``v0`` and ``v1``
            exceeds this value the inputs are treated as (anti-)parallel and
            plain linear interpolation is used instead.

    Returns:
        A CPU tensor with ``num`` stacked interpolations; entry ``i`` uses the
        ``i``-th value of ``np.linspace(t0, t1, num)``.
    """
    v0 = v0.detach().cpu().numpy()
    v1 = v1.detach().cpu().numpy()

    # The angle between the inputs does not depend on t, so it is computed once
    # instead of once per interpolation step.
    norm_product = np.linalg.norm(v0) * np.linalg.norm(v1)
    if norm_product == 0:
        # A zero vector has no direction: the cosine would be 0 / 0 (NaN), so
        # fall back to linear interpolation.
        use_lerp = True
    else:
        dot = np.sum(v0 * v1 / norm_product)
        use_lerp = np.abs(dot) > dot_threshold

    if not use_lerp:
        theta_0 = np.arccos(dot)
        sin_theta_0 = np.sin(theta_0)

    def interpolation(t):
        """Interpolate between v0 and v1 at parameter t."""
        if use_lerp:
            return (1 - t) * v0 + t * v1
        theta_t = theta_0 * t
        s0 = np.sin(theta_0 - theta_t) / sin_theta_0
        s1 = np.sin(theta_t) / sin_theta_0
        return s0 * v0 + s1 * v1

    t = np.linspace(t0, t1, num)

    v3 = torch.tensor(np.array([interpolation(step) for step in t]))

    return v3

### Generation parameters


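* `seed`: This variable is used to set a specific random seed for reproducibility. A negative value (the default, `-1`) picks a random seed, which is printed so that the run can be repeated.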
* `guidance_scale`: This parameter controls the extent to which the model should follow the prompt in text-to-image generation tasks, with higher values leading to stronger adherence to the prompt.       
* `num_inference_steps`: This specifies the number of steps the model takes to generate an image. More steps can lead to a higher quality image but take longer to generate.        
* `num_interpolation_steps`: This determines the number of steps used when interpolating between two points in the latent space, affecting the smoothness of transitions in generated animations.
* `height`: The height of the generated images in pixels.       
* `width`: The width of the generated images in pixels.

### Interpolation between multiple prompts

In this example, we will be interpolating between any number of prompts. To do so, we will take consecutive pairs of prompts and create smooth transitions between them. Then, we will combine the interpolations of these consecutive pairs, and instruct the model to generate images based on them. For interpolation we will use the `slerp` function defined above.

![Example 3](https://huggingface.co/datasets/huggingface/cookbook-images/resolve/main/sd_interpolation_3.gif)

In [None]:
# @markdown Specify a list of prompts (a prompt per line)

from ipywidgets import Textarea, HBox


def _previous_text(widget_name):
    """Return the text of an already existing widget called `widget_name`, or None.

    Re-running this cell rebuilds both text areas; looking up the previous
    widget lets the text that was already typed survive the re-run. On the
    first run the name does not exist yet and None is returned, without
    resorting to a bare `except` that would hide unrelated errors.
    """
    return getattr(globals().get(widget_name), "value", None)


list1 = Textarea(
    value=_previous_text("list1"),
    layout={'width': '45%', 'height':'95%'},
    description='Prompts')

list2 = Textarea(
    value=_previous_text("list2"),
    layout={'width': '45%', 'height':'95%'},
    description='Negative<br/>prompts')

# Show both text areas side by side.
display(HBox([list1,list2], layout={'height':'250px'}))

In [None]:
seed = -1 # @param {type:"integer"}
guidance_scale = 8 # @param {type:"slider", min:0, max:20, step:0.1}
num_inference_steps = 10 # @param {type:"slider", min:0, max:50, step:1}
num_interpolation_steps = 10 # @param {type:"slider", min:1, max:100, step:1}
height = 512 # @param {type:"slider", min:256, max:1024, step:1}
width = 512 # @param {type:"slider", min:256, max:1024, step:1}

# A non-negative seed is applied through torch.manual_seed and reused as the generator.
# Otherwise no generator is passed on, and the value returned by torch.seed() is printed instead.
generator = torch.manual_seed(seed) if seed >= 0 else None
print("seed:", seed if seed >= 0 else torch.seed())


# Once again, let's tokenize and obtain embeddings but this time for multiple positive and negative text prompts.

# One prompt per line; the negative prompt on the same line number belongs to it.
prompt_lines = list1.value.split('\n')
negative_lines = list2.value.split('\n')

# Prompts without a matching negative prompt line get an empty negative prompt.
if len(negative_lines) < len(prompt_lines):
    negative_lines += [''] * (len(prompt_lines) - len(negative_lines))

# Blank prompt lines (for example a trailing newline in the text area) are
# skipped together with the negative prompt on the same line, so they do not
# turn into empty prompts and the line-by-line pairing is preserved. Surplus
# negative prompt lines are ignored by zip().
prompts = []
negative_prompts = []
for prompt_line, negative_line in zip(prompt_lines, negative_lines):
    if prompt_line.strip():
        prompts.append(prompt_line)
        negative_prompts.append(negative_line)

batch_size = len(prompts)

# Interpolation happens between consecutive prompts, so at least two are needed.
# Failing here gives a clearer message than the later concatenation of an empty list.
if batch_size < 2:
    raise ValueError(
        f"At least two non-empty prompts are required to interpolate, got {batch_size}."
    )

# Tokenizing and encoding prompts into embeddings.
def _tokenize(texts):
    """Tokenize `texts`, padded/truncated to the tokenizer's maximum length, as PyTorch tensors."""
    return pipe.tokenizer(
        texts,
        padding="max_length",
        max_length=pipe.tokenizer.model_max_length,
        truncation=True,
        return_tensors="pt",
    )


def _embed(tokens):
    """Return the first output of the pipeline's text encoder for the tokenized texts."""
    # Inference only: no gradients are needed for the embeddings, so autograd
    # tracking is disabled to avoid keeping the encoder's graph in memory.
    with torch.no_grad():
        return pipe.text_encoder(tokens.input_ids.to(device))[0]


# Tokenizing and encoding prompts into embeddings.
prompts_tokens = _tokenize(prompts)
prompts_embeds = _embed(prompts_tokens)

# The negative prompts go through exactly the same steps.
negative_prompts_tokens = _tokenize(negative_prompts)
negative_prompts_embeds = _embed(negative_prompts_tokens)


# We will take consecutive pairs of prompts and create smooth transitions between them with `slerp` function.

# Generating initial U-Net latent vectors from a random normal distribution.
# One latent sample: the U-Net's input channel count, at 1/8 of the requested height and width.
latent_shape = (1, pipe.unet.config.in_channels, height // 8, width // 8)
latents = torch.randn(latent_shape, generator=generator)

# Interpolating between embeddings pairs for the given number of interpolation steps.
# Index of the first prompt of every consecutive pair (i, i + 1).
pair_starts = range(batch_size - 1)

# Slerp between each pair of prompt embeddings, then join all segments along dim 0 and move them to the device.
interpolated_prompt_embeds = torch.cat(
    [
        slerp(prompts_embeds[i], prompts_embeds[i + 1], num_interpolation_steps)
        for i in pair_starts
    ],
    dim=0,
).to(device)

# Same procedure for the negative prompt embeddings.
interpolated_negative_prompts_embeds = torch.cat(
    [
        slerp(
            negative_prompts_embeds[i],
            negative_prompts_embeds[i + 1],
            num_interpolation_steps,
        )
        for i in pair_starts
    ],
    dim=0,
).to(device)


# Finally, we need to generate images based on the embeddings.

# Generating images using the interpolated embeddings.
# Pair every interpolated prompt embedding with its negative counterpart.
embedding_pairs = zip(interpolated_prompt_embeds, interpolated_negative_prompts_embeds)

images = []
for prompt_embeds, negative_prompt_embeds in tqdm(
    embedding_pairs, total=len(interpolated_prompt_embeds)
):
    # The same starting latents are used for every frame; each embedding gets a leading batch dimension.
    result = pipe(
        prompt_embeds=prompt_embeds.unsqueeze(0),
        negative_prompt_embeds=negative_prompt_embeds.unsqueeze(0),
        latents=latents,
        generator=generator,
        guidance_scale=guidance_scale,
        num_inference_steps=num_inference_steps,
        num_images_per_prompt=1,
        height=height,
        width=width,
    )
    images.append(result.images)

# Save the frames (PNG files, video and GIF), then show the GIF with its download button.
path = save_images(images, save_path)
display_gif(path)


# Finalizing

When you finish working, remember to **stop the runtime**: sessions have a time limit, and stopping the runtime avoids wasting resources. To stop it, click *Manage sessions* in the *Runtime* menu and, once the dialog opens, click *Terminate* on the current runtime.

> But when you stop the runtime everything you have not saved is ⚠ **lost** ⚠, so be sure to **download** everything you want to keep before stopping it.


# Credits

Modified from https://huggingface.co/learn/cookbook/en/stable_diffusion_interpolation by Taller Estampa https://tallerestampa.com / https://github.com/estampa