<a href="https://colab.research.google.com/github/estampa/BAU-MD-DD/blob/main/BAU_DwD_4_Stable_Diffusion_Latent_Space.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Exploring the latent space of stable diffusion

What's interesting is that you can generate an image from any point in latent space.

In [None]:
#@markdown # First, let's install all the required modules.

!pip install -q diffusers transformers accelerate
!pip install -q numpy scipy ftfy Pillow

In [None]:
#@markdown # Import modules

import torch
import numpy as np
import os

import time

from PIL import Image
from IPython import display as IPdisplay
from tqdm.auto import tqdm

from diffusers import StableDiffusionPipeline
from diffusers import (
    DDIMScheduler,
    PNDMScheduler,
    LMSDiscreteScheduler,
    DPMSolverMultistepScheduler,
    EulerAncestralDiscreteScheduler,
    EulerDiscreteScheduler,
    AutoencoderKL,
)
from transformers import logging

from google.colab import files
from google.colab import output
import io

from torchvision import transforms

from ipywidgets import Button
import shutil

# Only let the transformers library report errors.
logging.set_verbosity_error()

# Print whether CUDA is available, then select the matching device.

print(torch.cuda.is_available())

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Performance switches for PyTorch models running on CUDA-enabled GPUs; they can help
# speed and memory usage, especially with mixed precision training or inference.
# Source: https://huggingface.co/docs/diffusers/optimization/fp16#memory-efficient-attention

torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.benchmark = True

In [None]:
#@markdown # Model

#@markdown The [`stabilityai/stable-diffusion-2-1-base`](https://huggingface.co/stabilityai/stable-diffusion-2-1-base) model and the `EulerDiscreteScheduler` scheduler were chosen to generate images. Despite being an older technology, Stable Diffusion 2.1 remains a practical choice thanks to its fast performance and minimal memory requirements. (The numerous community fine-tuned models mentioned in the original tutorial are built on top of SD1.5, which is a different model.) However, you are free to experiment with other models and schedulers to compare the results.


# Hugging Face Hub id of the model; the scheduler, pipeline and VAE below are all loaded from it.
model_name_or_path = "stabilityai/stable-diffusion-2-1-base"

# Scheduler built from the configuration stored in the model's "scheduler" subfolder.
scheduler = EulerDiscreteScheduler.from_pretrained(model_name_or_path, subfolder="scheduler")

# Text-to-image pipeline, loaded in float32 and moved to the selected device.
pipe = StableDiffusionPipeline.from_pretrained(
    model_name_or_path,
    scheduler=scheduler,
    torch_dtype=torch.float32,
).to(device)

# Disable image generation progress bar, we'll display our own
pipe.set_progress_bar_config(disable=True)

# Stand-alone float16 copy of the model's VAE; it is the `vae` used by the
# `pil_to_latents` / `latents_to_images` helper functions defined further down.
vae = AutoencoderKL.from_pretrained(model_name_or_path, subfolder="vae", torch_dtype=torch.float16).to(device)

In [None]:
#@markdown # Reduce the memory consumed by the GPU.

#@markdown More detailed information can be found here: https://huggingface.co/docs/diffusers/en/optimization/opt_overview
#@markdown In particular, information about the following methods can be found here: https://huggingface.co/docs/diffusers/optimization/memory


# Offloading the weights to the CPU and only loading them on the GPU can reduce memory consumption to less than 3GB.
pipe.enable_model_cpu_offload()

# Tighter ordering of memory tensors.
pipe.unet.to(memory_format=torch.channels_last)

# Decoding large batches of images with limited VRAM or batches with 32 images or more by decoding the batches of latents one image at a time.
pipe.enable_vae_slicing()

# Splitting the image into overlapping tiles, decoding the tiles, and then blending the outputs together to compose the final image.
pipe.enable_vae_tiling()

# Using Flash Attention; If you have PyTorch >= 2.0 installed, you should not expect a speed-up for inference when enabling xformers.
# xformers is an optional package that the install cell of this notebook does not install,
# so treat it as a best-effort optimisation instead of letting the whole cell fail without it.
try:
    pipe.enable_xformers_memory_efficient_attention()
except Exception as error:
    print(f"xformers memory-efficient attention was not enabled: {error}")


In [None]:
#@markdown # Define functions

# The path where the generated GIFs will be saved
save_path = "/content/output"

# `exist_ok=True` replaces the separate existence check and keeps the cell safe to re-run.
os.makedirs(save_path, exist_ok=True)

# Saves the images and a GIF
def save_images(images, output_path):
    """Save every frame as a numbered PNG and all frames together as `preview.gif`.

    Args:
        images: Non-empty iterable in which every item is itself a sequence whose
            first element is an image (for example the `.images` list returned
            by a pipeline call). Only that first element is saved.
        output_path: Directory under which a new sub-directory, named after the
            current local time, is created for this run.

    Returns:
        Path of the sub-directory that holds the PNG frames and `preview.gif`.

    Raises:
        ValueError: If `images` is empty (a GIF needs at least one frame).
    """
    images = list(images)
    if not images:
        # Fail early with a clear message, before any directory is created.
        raise ValueError("save_images() needs at least one image to build the GIF.")

    # Name the directory after the current time, using hyphens instead of colons
    # so that the name is valid on file systems that don't allow colons.
    dirname = time.strftime("%H-%M-%S", time.localtime())
    save_path = os.path.join(output_path, dirname)
    os.makedirs(save_path, exist_ok=True)

    # Convert the first image of each item to an 8-bit PIL image and save it as a PNG.
    converted = []
    for i, image in enumerate(images):
        pil_image = Image.fromarray(np.array(image[0], dtype=np.uint8))
        pil_image.save(os.path.join(save_path, f"{i:03d}.png"))
        converted.append(pil_image)

    # The first frame is saved as the GIF and the remaining ones are appended to it.
    # Each frame is shown for 100 milliseconds and the GIF loops indefinitely.
    converted[0].save(
        os.path.join(save_path, "preview.gif"),
        save_all=True,
        append_images=converted[1:],
        duration=100,
        loop=0,
    )

    return save_path

# Displays the GIF saved in a path
def display_gif(path):
    """Show `<path>/preview.gif` in the notebook together with a download button.

    Clicking the button packs the whole `path` directory into `<path>.zip`
    and passes that archive to the Colab `files.download` helper.
    """
    preview = IPdisplay.Image(f"{path}/preview.gif")

    def _zip_and_download(_clicked_button):
        # Archive the directory next to itself, then trigger the browser download.
        shutil.make_archive(path, 'zip', path)
        files.download(path + '.zip')

    download_button = Button(description="Download Images")
    download_button.on_click(_zip_and_download)

    display(preview, download_button)


# based on https://towardsdatascience.com/stable-diffusion-using-hugging-face-501d8dbdd8
def pil_to_latents(image):
    '''
    Function to convert image to latents

    The image is turned into a tensor with a leading batch dimension, mapped
    with `x * 2 - 1` (assumes `ToTensor` yields values in [0, 1] - TODO confirm
    for non-uint8 inputs), moved to `device` as float16 and encoded with the
    module-level `vae`. A sample of the resulting latent distribution is
    returned, multiplied by the 0.18215 scaling constant that
    `latents_to_images` divides out again.
    '''
    init_image = transforms.ToTensor()(image).unsqueeze(0) * 2.0 - 1.0
    init_image = init_image.to(device=device, dtype=torch.float16)
    # Inference only: skip autograd bookkeeping, just like the decoding helper does.
    with torch.no_grad():
        init_latent_dist = vae.encode(init_image).latent_dist.sample() * 0.18215
    return init_latent_dist

def latents_to_images(latents):
    '''
    Function to convert latents to images

    The latents are moved to the device and dtype of the module-level `vae`,
    divided by the 0.18215 scaling constant and decoded. The decoded values
    are mapped with `x / 2 + 0.5`, clamped to [0, 1] and converted to 8-bit.

    Returns a uint8 numpy array with the channel axis moved last
    (presumably (batch, height, width, channels) - depends on the decoder output layout).
    '''
    # Latents created elsewhere (e.g. float32 noise on the CPU) must match the VAE weights.
    latents = latents.to(device=vae.device, dtype=vae.dtype)
    latents = (1 / 0.18215) * latents
    with torch.no_grad():
        image = vae.decode(latents).sample

    image = (image / 2 + 0.5).clamp(0, 1)
    # Widen to float32 before leaving torch so the 8-bit rounding is not done in half precision.
    image = image.detach().cpu().permute(0, 2, 3, 1).float().numpy()
    images = (image * 255).round().astype("uint8")
    return images

def latents_to_pil(latents):
    '''
    Function to convert latents to a list of PIL images

    Decodes the latents with `latents_to_images` and wraps every decoded image
    of the batch in a PIL `Image`.
    '''
    images = latents_to_images(latents)
    # Iterating over the batch array already yields one full image per step;
    # indexing it with [0] would keep only its first row of pixels.
    pil_images = [Image.fromarray(image) for image in images]
    return pil_images

# The function presented below stands for Spherical Linear Interpolation. It is a method
# of interpolation on the surface of a sphere. This function is commonly used in computer
# graphics to animate rotations in a smooth manner and can also be used to interpolate
# between high-dimensional data points in machine learning, such as latent vectors used
# in generative models.

# The source is from Andrej Karpathy's gist: https://gist.github.com/karpathy/00103b0037c5aaea32fe1da1af553355.
# A more detailed explanation of this method can be found at: https://en.wikipedia.org/wiki/Slerp.

def slerp(v0, v1, num, t0=0, t1=1):
    """Spherical linear interpolation between two tensors.

    Args:
        v0: Start tensor.
        v1: End tensor, with the same shape as `v0`.
        num: Number of interpolation points to generate.
        t0: Interpolation parameter of the first point (0 corresponds to `v0`).
        t1: Interpolation parameter of the last point (1 corresponds to `v1`).

    Returns:
        A CPU tensor of shape `(num, *v0.shape)`. A floating point `v0` keeps
        its dtype in the result; any other input dtype yields float64.
    """
    DOT_THRESHOLD = 0.9995

    # Callers feed the result straight back into the model, so keep the input dtype.
    out_dtype = v0.dtype if v0.is_floating_point() else torch.float64

    # Do the maths in float64: half-precision norms overflow easily, and the dtype of
    # mixed float32/float64 NumPy arithmetic depends on the installed NumPy version.
    start = v0.detach().cpu().to(torch.float64).numpy()
    end = v1.detach().cpu().to(torch.float64).numpy()

    # One interpolation parameter per output frame, shaped to broadcast over the inputs.
    t = np.linspace(t0, t1, num).reshape((num,) + (1,) * start.ndim)

    # The angle between the inputs does not depend on t, so compute it once.
    norm_product = np.linalg.norm(start) * np.linalg.norm(end)
    # A zero vector has no direction; treat it like the (anti)parallel case below.
    dot = np.sum(start * end) / norm_product if norm_product > 0 else 1.0

    if np.abs(dot) > DOT_THRESHOLD:
        # Nearly (anti)parallel inputs: slerp is numerically unstable, use plain lerp.
        frames = (1 - t) * start + t * end
    else:
        theta_0 = np.arccos(dot)
        sin_theta_0 = np.sin(theta_0)
        theta_t = theta_0 * t
        s0 = np.sin(theta_0 - theta_t) / sin_theta_0
        s1 = np.sin(theta_t) / sin_theta_0
        frames = s0 * start + s1 * end

    return torch.from_numpy(frames).to(out_dtype)

### Generation parameters


* `seed`: This variable is used to set a specific random seed for reproducibility. A negative value (the default, `-1`) picks a random seed, which is printed so that the run can be repeated later.
* `guidance_scale`: This parameter controls the extent to which the model should follow the prompt in text-to-image generation tasks, with higher values leading to stronger adherence to the prompt.
* `num_inference_steps`: This specifies the number of steps the model takes to generate an image. More steps can lead to a higher quality image but take longer to generate.
* `num_interpolation_steps`: This determines the number of steps used when interpolating between two points in the latent space, affecting the smoothness of transitions in generated animations.
* `height`: The height of the generated images in pixels.
* `width`: The width of the generated images in pixels.

### Example 1: Prompt interpolation

In this example, we take a short walk through the embedding space around a positive and a negative prompt, which lets us explore the neighbourhood of the two conceptual points defined by the prompts. In this case, the walk consists of adding a small, steadily growing offset (`step_size * i`) to the original embeddings of both prompts, creating a series of new embeddings that will be used later to generate images with smooth transitions between different states based on the original prompt.


![Example 1](https://huggingface.co/datasets/huggingface/cookbook-images/resolve/main/sd_interpolation_1.gif)

In [None]:
# The text prompt that describes the desired output image.
prompt = "Epic shot of Sweden, ultra detailed lake with an ren dear, nostalgic vintage, ultra cozy and inviting, wonderful light atmosphere, fairy, little photorealistic, digital painting, sharp focus, ultra cozy and inviting, wish to be there. very detailed, arty, should rank high on youtube for a dream trip." # @param {type:"string"}
# A negative prompt that steers the generation away from the listed features.
negative_prompt = "poorly drawn,cartoon, 2d, disfigured, bad art, deformed, poorly drawn, extra limbs, close up, b&w, weird colors, blurry" # @param {type:"string"}

seed = -1 # @param {type:"integer"}
guidance_scale = 8 # @param {type:"slider", min:0, max:20, step:0.1}
num_inference_steps = 10 # @param {type:"slider", min:0, max:50, step:1}
num_interpolation_steps = 10 # @param {type:"slider", min:1, max:100, step:1}
height = 512 # @param {type:"slider", min:256, max:1024, step:1}
width = 512 # @param {type:"slider", min:256, max:1024, step:1}

# A non-negative seed makes the run reproducible. Otherwise `torch.seed()` re-seeds the
# global random number generator with a fresh random value, which is printed so that the
# run can be repeated later by entering it as `seed`.
if seed >= 0:
    generator = torch.manual_seed(seed)
    print("seed:", seed)
else:
    generator = None
    print("seed:", torch.seed())


# The step size for the walk in the embedding space.
step_size = 0.001

# First of all, we need to tokenize and obtain embeddings for both positive and negative text prompts. The positive prompt guides the image generation towards the desired characteristics, while the negative prompt steers it away from unwanted features.

# Tokenizing the prompt.
prompt_tokens = pipe.tokenizer(
    prompt,
    padding="max_length",
    max_length=pipe.tokenizer.model_max_length,
    truncation=True,
    return_tensors="pt",
)

# Tokenizing the negative prompt.
if negative_prompt is None:
    negative_prompt = [""]

negative_prompt_tokens = pipe.tokenizer(
    negative_prompt,
    padding="max_length",
    max_length=pipe.tokenizer.model_max_length,
    truncation=True,
    return_tensors="pt",
)

# Encoding both prompts into embeddings. The text encoder is only used for inference
# here, so autograd is switched off to avoid keeping its computation graph in memory.
with torch.no_grad():
    prompt_embeds = pipe.text_encoder(prompt_tokens.input_ids.to(device))[0]
    negative_prompt_embeds = pipe.text_encoder(negative_prompt_tokens.input_ids.to(device))[0]

# Next we draw one random initial latent from a normal distribution, shaped to match the
# input expected by the diffusion model (UNet). The same latent is reused for every frame,
# so only the embeddings change from image to image. The embeddings of both prompts are
# then shifted by a small offset that grows with each step, and the resulting pairs are
# stored in a list named "walked_embeddings".

# Generating initial latent vectors from a random normal distribution, with the option to use a generator for reproducibility.
latents = torch.randn(
    (1, pipe.unet.config.in_channels, height // 8, width // 8),
    generator=generator,
)

walked_embeddings = []

# Walking both embeddings for the given number of interpolation steps.
for i in range(num_interpolation_steps):
    walked_embeddings.append(
        [prompt_embeds + step_size * i, negative_prompt_embeds + step_size * i]
    )


# Finally, let's generate one image per pair of walked embeddings, with the height, width
# and other generation parameters chosen above, and collect the results in a list. Once
# generation is complete, `save_images` writes the frames and a GIF to the save path and
# `display_gif` shows that GIF.

# Generating images using the walked embeddings.
images = []
for embeddings in tqdm(walked_embeddings):
    images.append(
        pipe(
            height=height,
            width=width,
            num_images_per_prompt=1,
            prompt_embeds=embeddings[0],
            negative_prompt_embeds=embeddings[1],
            num_inference_steps=num_inference_steps,
            guidance_scale=guidance_scale,
            generator=generator,
            latents=latents,
        ).images
    )

# Display of saved generated images.
path = save_images(images, save_path)
display_gif(path)

### Example 2: Diffusion latents interpolation for a single prompt
Unlike the first example, in this one we interpolate between two latent noise tensors of the diffusion model itself, not between prompt embeddings. Please note that in this case, we use the slerp function for interpolation. However, there is nothing stopping us from adding a constant value to one latent tensor instead, as we did with the embeddings in the first example.


![Example 2](https://huggingface.co/datasets/huggingface/cookbook-images/resolve/main/sd_interpolation_2.gif)

In [None]:
# The text prompt that describes the desired output image.
prompt = "" # @param {type:"string"}
# A negative prompt that can be used to steer the generation away from certain features; here, it is empty.
negative_prompt = "" # @param {type:"string"}

seed = -1 # @param {type:"integer"}
guidance_scale = 8 # @param {type:"slider", min:0, max:20, step:0.1}
num_inference_steps = 10 # @param {type:"slider", min:0, max:50, step:1}
num_interpolation_steps = 10 # @param {type:"slider", min:1, max:100, step:1}
height = 512 # @param {type:"slider", min:256, max:1024, step:1}
width = 512 # @param {type:"slider", min:256, max:1024, step:1}

# A non-negative seed makes the run reproducible. Otherwise `torch.seed()` re-seeds the
# global random number generator with a fresh random value, which is printed so that the
# run can be repeated later by entering it as `seed`.
if seed >= 0:
    generator = torch.manual_seed(seed)
    print("seed:", seed)
else:
    generator = None
    print("seed:", torch.seed())

# Generating initial latent vectors from a random normal distribution. In this example two latent vectors are generated, which will serve as start and end points for the interpolation.
# These vectors are shaped to fit the input requirements of the diffusion model's U-Net architecture.
latents = torch.randn(
    (2, pipe.unet.config.in_channels, height // 8, width // 8),
    generator=generator,
)

# Interpolating between the two latents. `slerp` computes with NumPy, whose type promotion
# may hand back float64 values, so the result is cast back to the dtype of the noise
# before it is given to the pipeline.
interpolated_latents = slerp(
    latents[0], latents[1], num_interpolation_steps
).to(dtype=latents.dtype)

# Generating images using the interpolated latents.
images = []
for latent_vector in tqdm(interpolated_latents):
    images.append(
        pipe(
            prompt,
            height=height,
            width=width,
            negative_prompt=negative_prompt,
            num_images_per_prompt=1,
            num_inference_steps=num_inference_steps,
            guidance_scale=guidance_scale,
            generator=generator,
            # Each interpolated latent needs a leading batch dimension of 1.
            latents=latent_vector[None, ...],
        ).images
    )

# Display of saved generated images.
path = save_images(images, save_path)
display_gif(path)

### Example 3: Interpolation between multiple prompts

In contrast to the first example, where we moved away from a single prompt, in this example, we will be interpolating between any number of prompts. To do so, we will take consecutive pairs of prompts and create smooth transitions between them. Then, we will combine the interpolations of these consecutive pairs, and instruct the model to generate images based on them. For interpolation we will use the slerp function, as in the second example.

![Example 3](https://huggingface.co/datasets/huggingface/cookbook-images/resolve/main/sd_interpolation_3.gif)

In [None]:
# @markdown Specify a list of prompts (a prompt per line)

from ipywidgets import Textarea, HBox

# Re-running this cell recreates both text areas, so carry over the text that was
# already typed in. On the first run `list1` / `list2` do not exist yet (NameError)
# and the text areas simply start out empty.
try:
    value = list1.value
except (NameError, AttributeError):
    value = None

list1 = Textarea(
    value=value,
    layout={'width': '45%', 'height':'95%'},
    description='Prompts')

try:
    value = list2.value
except (NameError, AttributeError):
    value = None

list2 = Textarea(
    value=value,
    layout={'width': '45%', 'height':'95%'},
    description='Negative<br/>prompts')

# Show the two text areas side by side.
display(HBox([list1,list2], layout={'height':'250px'}))

In [None]:
seed = -1 # @param {type:"integer"}
guidance_scale = 8 # @param {type:"slider", min:0, max:20, step:0.1}
num_inference_steps = 10 # @param {type:"slider", min:0, max:50, step:1}
num_interpolation_steps = 10 # @param {type:"slider", min:1, max:100, step:1}
height = 512 # @param {type:"slider", min:256, max:1024, step:1}
width = 512 # @param {type:"slider", min:256, max:1024, step:1}

# A non-negative seed makes the run reproducible. Otherwise `torch.seed()` re-seeds the
# global random number generator with a fresh random value, which is printed so that the
# run can be repeated later by entering it as `seed`.
if seed >= 0:
    generator = torch.manual_seed(seed)
    print("seed:", seed)
else:
    generator = None
    print("seed:", torch.seed())


# Once again, let's tokenize and obtain embeddings but this time for multiple positive and negative text prompts.

# One prompt per line, read from the two text areas shown by the previous cell.
# `splitlines()` does not turn a trailing newline into an extra, empty prompt.
prompts = list1.value.splitlines()
negative_prompts = list2.value.splitlines()

batch_size = len(prompts)

if batch_size < 2:
    raise ValueError(
        "Please enter at least two prompts (one per line) to interpolate between."
    )

# Give every prompt a negative prompt: pad the list with empty strings or cut it to size.
if len(negative_prompts) < batch_size:
    negative_prompts += [''] * (batch_size - len(negative_prompts))
else:
    negative_prompts = negative_prompts[:batch_size]

# Tokenizing prompts.
prompts_tokens = pipe.tokenizer(
    prompts,
    padding="max_length",
    max_length=pipe.tokenizer.model_max_length,
    truncation=True,
    return_tensors="pt",
)

negative_prompts_tokens = pipe.tokenizer(
    negative_prompts,
    padding="max_length",
    max_length=pipe.tokenizer.model_max_length,
    truncation=True,
    return_tensors="pt",
)

# Encoding prompts into embeddings. The text encoder is only used for inference here,
# so autograd is switched off to avoid keeping its computation graph in memory.
with torch.no_grad():
    prompts_embeds = pipe.text_encoder(
        prompts_tokens.input_ids.to(device)
    )[0]
    negative_prompts_embeds = pipe.text_encoder(
        negative_prompts_tokens.input_ids.to(device)
    )[0]


# We will take consecutive pairs of prompts and create smooth transitions between them with `slerp` function.

# Generating initial U-Net latent vectors from a random normal distribution.
# The same latent is reused for every frame, so only the embeddings change.
latents = torch.randn(
    (1, pipe.unet.config.in_channels, height // 8, width // 8),
    generator=generator,
)

# Interpolating between embeddings pairs for the given number of interpolation steps.
interpolated_prompt_embeds = []
interpolated_negative_prompts_embeds = []
for i in range(batch_size - 1):
    interpolated_prompt_embeds.append(
        slerp(
            prompts_embeds[i],
            prompts_embeds[i + 1],
            num_interpolation_steps
        )
    )
    interpolated_negative_prompts_embeds.append(
        slerp(
            negative_prompts_embeds[i],
            negative_prompts_embeds[i + 1],
            num_interpolation_steps,
        )
    )

# Joining the transitions of all pairs into one sequence of frames. `slerp` computes
# with NumPy and may hand back float64 values, so cast back to the encoder's dtype.
interpolated_prompt_embeds = torch.cat(
    interpolated_prompt_embeds, dim=0
).to(device=device, dtype=prompts_embeds.dtype)

interpolated_negative_prompts_embeds = torch.cat(
    interpolated_negative_prompts_embeds, dim=0
).to(device=device, dtype=negative_prompts_embeds.dtype)


# Finally, we need to generate images based on the embeddings.

# Generating images using the interpolated embeddings.
images = []
for prompt_embeds, negative_prompt_embeds in tqdm(
    zip(interpolated_prompt_embeds, interpolated_negative_prompts_embeds),
    total=len(interpolated_prompt_embeds),
):
    images.append(
        pipe(
            height=height,
            width=width,
            num_images_per_prompt=1,
            # Each interpolated embedding needs a leading batch dimension of 1.
            prompt_embeds=prompt_embeds[None, ...],
            negative_prompt_embeds=negative_prompt_embeds[None, ...],
            num_inference_steps=num_inference_steps,
            guidance_scale=guidance_scale,
            generator=generator,
            latents=latents,
        ).images
    )

# Display of saved generated images.
path = save_images(images, save_path)
display_gif(path)


### Example 4: Circular walk through the diffusion latent space for a single prompt

This example was taken from: https://keras.io/examples/generative/random_walks_with_stable_diffusion/       

Let's imagine that we have two noise components, which we'll call x and y. We move an angle from 0 to 2π and, at each step, add x scaled by the cosine of that angle to y scaled by the sine of that angle. Using this approach, at the end of our movement we end up with the same noise values that we started with. This means that the walk closes on itself, which ends our movement exactly where it began.



![Example 4](https://huggingface.co/datasets/huggingface/cookbook-images/resolve/main/sd_interpolation_4.gif)

In [None]:
# The text prompt that describes the desired output image.
prompt = "" # @param {type:"string"}
# A negative prompt that can be used to steer the generation away from certain features; here, it is empty.
negative_prompt = "" # @param {type:"string"}

seed = -1 # @param {type:"integer"}
guidance_scale = 8 # @param {type:"slider", min:0, max:20, step:0.1}
num_inference_steps = 10 # @param {type:"slider", min:0, max:50, step:1}
num_interpolation_steps = 10 # @param {type:"slider", min:1, max:100, step:1}
height = 512 # @param {type:"slider", min:256, max:1024, step:1}
width = 512 # @param {type:"slider", min:256, max:1024, step:1}

# A non-negative seed makes the run reproducible. Otherwise `torch.seed()` re-seeds the
# global random number generator with a fresh random value, which is printed so that the
# run can be repeated later by entering it as `seed`.
if seed >= 0:
    generator = torch.manual_seed(seed)
    print( "seed:", seed)
else:
    generator = None
    print( "seed:", torch.seed() )

# Generating two initial latent vectors from a random normal distribution to create a loop interpolation between them.
# The extra dimension of size 1 is the batch dimension each of them needs when passed to the pipeline.
latents = torch.randn(
    (2, 1, pipe.unet.config.in_channels, height // 8, width // 8),
    generator=generator,
)


# Calculation of looped latents: the two noise components x and y
walk_noise_x = latents[0].to(device)
walk_noise_y = latents[1].to(device)

# Walking on a trigonometric circle: the angle goes from 0 to 2*pi (both ends included),
# with one cosine / sine weight per interpolation step.
walk_scale_x = torch.cos(torch.linspace(0, 2, num_interpolation_steps) * np.pi).to(
    device
)
walk_scale_y = torch.sin(torch.linspace(0, 2, num_interpolation_steps) * np.pi).to(
    device
)

# Applying interpolation to noise: with dims=0, tensordot is an outer product, so each
# result stacks one scaled copy of the noise per step along a new leading dimension.
noise_x = torch.tensordot(walk_scale_x, walk_noise_x, dims=0)
noise_y = torch.tensordot(walk_scale_y, walk_noise_y, dims=0)

# cos(angle) * x + sin(angle) * y for every step of the walk
circular_latents = noise_x + noise_y

# Generating images using the circular latents (one latent per step of the walk).
images = []
for latent_vector in tqdm(circular_latents):
    images.append(
        pipe(
            prompt,
            height=height,
            width=width,
            negative_prompt=negative_prompt,
            num_images_per_prompt=1,
            num_inference_steps=num_inference_steps,
            guidance_scale=guidance_scale,
            generator=generator,
            latents=latent_vector,
        ).images
    )

# Display of saved generated images.
path = save_images(images, save_path)
display_gif(path)

# Finalizing

When you finish working you have to remember to **stop the runtime**, because there is a time limit and to avoid wasting resources. To stop the runtime click Manage Sessions on the Runtime menu. Once the dialog opens click terminate on the current runtime.

> But when you stop the runtime everything you have not saved is ⚠ **lost** ⚠, so be sure to **download** everything you want to keep before stopping it.


# Credits

Modified from https://huggingface.co/learn/cookbook/en/stable_diffusion_interpolation by Taller Estampa https://tallerestampa.com / https://github.com/estampa