In [12]:
%%writefile requirements.txt
torch
matplotlib
scikit-learn
diffusers
transformers
scipy
ftfy
accelerate

Overwriting requirements.txt


In [13]:
!%pip install -r requirements.txt

import matplotlib.pyplot as plt
import torch
from transformers import CLIPTextModel, CLIPTokenizer
from diffusers import AutoencoderKL, UNet2DConditionModel, PNDMScheduler, DDIMScheduler
from tqdm.auto import tqdm
from diffusers import StableDiffusionPipeline

%matplotlib inline
%config InlineBackend.figure_format='retina' # make plots prettier

zsh:fg:1: no job control in this shell.


In [None]:
# Checkpoint repositories and the device every model below is moved to.
repo_id = "CompVis/stable-diffusion-v1-4"
clip_repo_id = "openai/clip-vit-large-patch14"
device = "cpu"

# Complete pipeline object, loaded from the same checkpoint.
# NOTE(review): `pipe` is not referenced again in the visible cells - confirm it
# is still needed, since the components are also loaded individually below.
pipe = StableDiffusionPipeline.from_pretrained(repo_id).to(device)

# Individual components; each model is placed on `device` right after loading.
vae = AutoencoderKL.from_pretrained(repo_id, subfolder="vae").to(device)
tokenizer = CLIPTokenizer.from_pretrained(clip_repo_id)
text_encoder = CLIPTextModel.from_pretrained(clip_repo_id).to(device)
unet = UNet2DConditionModel.from_pretrained(repo_id, subfolder="unet").to(device)

# Scheduler configured for a fixed number of inference steps.
scheduler = DDIMScheduler.from_pretrained(repo_id, subfolder="scheduler")
num_inference_steps = 100
scheduler.set_timesteps(num_inference_steps)

def get_text_embeddings(prompt):
    """Encode ``prompt`` with the notebook-level tokenizer and text encoder.

    The prompt is padded/truncated to ``tokenizer.model_max_length`` tokens,
    the resulting ids are moved to the notebook-level ``device``, and the
    encoder is run with gradient tracking disabled.

    Args:
        prompt: Text handed to ``tokenizer``.

    Returns:
        The first element of the ``text_encoder`` output.
    """
    tokenized = tokenizer(
        prompt,
        padding="max_length",
        max_length=tokenizer.model_max_length,
        truncation=True,
        return_tensors="pt",
    )
    # The encoder is moved to `device` when it is loaded, so its input has to
    # follow it there. This is a no-op while `device` is "cpu".
    text_ids = tokenized.input_ids.to(device)

    with torch.no_grad():
        text_embeddings = text_encoder(text_ids)[0]

    return text_embeddings

###
# Prompt used for conditioning, and its embedding from the text encoder.
prompt = "House of a Swiss Family in the Swiss Alps, scenic view, beautiful lighting, ultra detailed, 8k"
text_embeddings = get_text_embeddings(prompt)
# A bare expression in the middle of a cell is evaluated but never displayed,
# so print the shape explicitly.
print(text_embeddings.shape)

# Sizes read by get_latents(): the latent grid is height // 8 by width // 8,
# with `in_channels` channels taken from the UNet configuration.
height = 512
width = 512
batch_size = 1
in_channels = unet.config.in_channels

def get_latents(seed=None):
    """Draw a standard-normal latent tensor on the notebook-level ``device``.

    The shape is ``(batch_size, in_channels, height // 8, width // 8)``; all
    four values are read from notebook-level variables at call time.

    Args:
        seed: Optional integer. When given, sampling uses a dedicated
            ``torch.Generator`` seeded with it, so the same seed yields the
            same latents. When ``None`` (the default) the global random state
            is used, exactly as before.

    Returns:
        The sampled tensor.
    """
    generator = None
    if seed is not None:
        # The generator must live on the same device as the tensor it fills.
        generator = torch.Generator(device=device).manual_seed(seed)

    shape = (batch_size, in_channels, height // 8, width // 8)
    return torch.randn(shape, generator=generator, device=device)

# Start from random noise, scaled to the magnitude the scheduler expects.
# For DDIMScheduler this factor is 1.0, so the values are unchanged; the
# multiplication keeps the loop correct if another scheduler is swapped in.
latents = get_latents() * scheduler.init_noise_sigma

###
# Denoising loop: one UNet noise prediction and one scheduler step per timestep.
# NOTE(review): only the prompt embedding is passed to the UNet - there is no
# unconditional branch or guidance scale here; confirm that is intended.
for t in tqdm(scheduler.timesteps):
    # Let the scheduler rescale the UNet input for this timestep.
    latent_model_input = scheduler.scale_model_input(latents, t)

    with torch.no_grad():
        noise_pred = unet(latent_model_input, t, encoder_hidden_states=text_embeddings).sample

    # Replace the latents with the scheduler's estimate for the previous timestep.
    latents = scheduler.step(noise_pred, t, latents).prev_sample

###
# Divide by the VAE's configured scaling factor before decoding.
latents = latents / vae.config.scaling_factor

with torch.no_grad():
    image = vae.decode(latents).sample

# Bare expressions in the middle of a cell are never displayed, so print the
# decoded value range (before clamping) explicitly.
print(f"decoded value range: [{image.min().item():.3f}, {image.max().item():.3f}]")

# Shift and scale, then clamp into [0, 1] for plotting.
# NOTE(review): presumably the decoder output is roughly in [-1, 1] - confirm
# against the range printed above.
image = (image / 2 + 0.5).clamp(0, 1)
print(image.shape)

# Take the first item of the batch and move axis 0 to the end before plotting.
# Assumes a channel-first layout (channels, height, width) - TODO confirm.
plt.imshow(image[0].permute(1, 2, 0).detach().cpu())


Loading pipeline components...: 100%|██████████| 7/7 [00:00<00:00, 24.07it/s]
 37%|███▋      | 37/100 [01:40<01:46,  1.69s/it]