In [3]:
import torch
import requests
from PIL import Image
from diffusers import StableDiffusionDepth2ImgPipeline

In [2]:
from diffusers import AutoencoderKL, DDPMScheduler, StableDiffusionDepth2ImgPipeline, UNet2DConditionModel
from transformers import CLIPTextModel, CLIPTokenizer

In [3]:
from accelerate import Accelerator

# Build the Accelerator whose `device` and `mixed_precision` attributes are
# read by the cells that follow.
# NOTE(review): newer accelerate releases reportedly replace `logging_dir`
# with `project_dir` — confirm against the installed version.
accelerator = Accelerator(
    log_with='wandb',
    logging_dir='wandb',
    mixed_precision='fp16',
    gradient_accumulation_steps=1,
)

In [8]:
# DDPM noise scheduler loaded from the "scheduler" subfolder of the
# stable-diffusion-2-depth checkpoint.
noise_scheduler = DDPMScheduler.from_pretrained(
    'stabilityai/stable-diffusion-2-depth',
    subfolder="scheduler",
)

In [9]:
# CLIP tokenizer loaded from the checkpoint's "tokenizer" subfolder;
# no explicit revision is requested.
tokenizer = CLIPTokenizer.from_pretrained(
    'stabilityai/stable-diffusion-2-depth',
    revision=None,
    subfolder="tokenizer",
)

In [10]:
# Shell escape: print the GPU status table (driver version, memory usage,
# utilisation) at this point in the notebook.
!nvidia-smi

Wed Feb  1 02:00:22 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.60.13    Driver Version: 525.60.13    CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            On   | 00000000:00:04.0 Off |                    0 |
| N/A   40C    P0    25W /  70W |   2394MiB / 15360MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [11]:
# CLIP text encoder loaded from the checkpoint's "text_encoder" subfolder;
# no explicit revision is requested.
text_encoder = CLIPTextModel.from_pretrained(
    'stabilityai/stable-diffusion-2-depth',
    revision=None,
    subfolder="text_encoder",
)

In [12]:
# Autoencoder (VAE) loaded from the checkpoint's "vae" subfolder;
# no explicit revision is requested.
vae = AutoencoderKL.from_pretrained(
    'stabilityai/stable-diffusion-2-depth',
    revision=None,
    subfolder="vae",
)


In [13]:
# Scale factor derived from the VAE config: 2 raised to
# (number of entries in `block_out_channels` minus one).
# Presumably the pixel-to-latent downsampling factor — TODO confirm.
vae_block_count = len(vae.config.block_out_channels)
vae_scale_factor = 2 ** (vae_block_count - 1)


In [4]:
# Pick the dtype matching the accelerator's mixed-precision mode:
# "fp16" -> float16, "bf16" -> bfloat16, anything else -> float32.
weight_dtype = {
    "fp16": torch.float16,
    "bf16": torch.bfloat16,
}.get(accelerator.mixed_precision, torch.float32)


In [14]:
# Move the VAE and the text encoder to the accelerator's device and cast
# them to `weight_dtype`.
vae.to(accelerator.device, dtype=weight_dtype)
# Last expression of the cell, so the notebook echoes the text encoder's repr.
text_encoder.to(accelerator.device, dtype=weight_dtype)

CLIPTextModel(
  (text_model): CLIPTextTransformer(
    (embeddings): CLIPTextEmbeddings(
      (token_embedding): Embedding(49408, 1024)
      (position_embedding): Embedding(77, 1024)
    )
    (encoder): CLIPEncoder(
      (layers): ModuleList(
        (0): CLIPEncoderLayer(
          (self_attn): CLIPAttention(
            (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
          )
          (layer_norm1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (mlp): CLIPMLP(
            (activation_fn): GELUActivation()
            (fc1): Linear(in_features=1024, out_features=4096, bias=True)
            (fc2): Linear(in_features=4096, out_features=1024, bias=True)
          )
          (layer_norm2): LayerNorm((1024,), eps=1

In [15]:
# Shell escape: print the GPU status table again to see memory usage after
# the VAE and text encoder were moved to the device.
!nvidia-smi

Wed Feb  1 02:00:47 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.60.13    Driver Version: 525.60.13    CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            On   | 00000000:00:04.0 Off |                    0 |
| N/A   41C    P0    25W /  70W |   3198MiB / 15360MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [5]:
# Conditional UNet loaded from the checkpoint's "unet" subfolder;
# no explicit revision is requested.
unet = UNet2DConditionModel.from_pretrained(
    'stabilityai/stable-diffusion-2-depth',
    revision=None,
    subfolder="unet",
)

In [6]:
# Move the UNet to the accelerator's device and cast it to `weight_dtype`;
# as the cell's last expression, its repr is echoed in the output.
# NOTE(review): if this UNet is later trained under fp16 mixed precision,
# check whether its weights should stay in float32 — TODO confirm intent.
unet.to(accelerator.device, dtype=weight_dtype)


UNet2DConditionModel(
  (conv_in): Conv2d(5, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (time_proj): Timesteps()
  (time_embedding): TimestepEmbedding(
    (linear_1): Linear(in_features=320, out_features=1280, bias=True)
    (act): SiLU()
    (linear_2): Linear(in_features=1280, out_features=1280, bias=True)
  )
  (down_blocks): ModuleList(
    (0): CrossAttnDownBlock2D(
      (attentions): ModuleList(
        (0): Transformer2DModel(
          (norm): GroupNorm(32, 320, eps=1e-06, affine=True)
          (proj_in): Linear(in_features=320, out_features=320, bias=True)
          (transformer_blocks): ModuleList(
            (0): BasicTransformerBlock(
              (attn1): CrossAttention(
                (to_q): Linear(in_features=320, out_features=320, bias=False)
                (to_k): Linear(in_features=320, out_features=320, bias=False)
                (to_v): Linear(in_features=320, out_features=320, bias=False)
                (to_out): ModuleList(
               

In [7]:
# Shell escape: print the GPU status table (driver version, memory usage,
# utilisation) at this point in the notebook.
!nvidia-smi

Wed Feb  1 01:59:26 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.60.13    Driver Version: 525.60.13    CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            On   | 00000000:00:04.0 Off |                    0 |
| N/A   35C    P0    25W /  70W |   2394MiB / 15360MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [4]:
# Spatial-relation prompts: each places one object relative to another.
prompts = [
    # "on top of"
    'A train on top of a surfboard.',
    'A wine glass on top of a dog.',
    'A bicycle on top of a boat.',
    'An umbrella on top of a spoon.',
    'A laptop on top of a teddy bear.',
    # "underneath"
    'A giraffe underneath a microwave.',
    'A donut underneath a toilet.',
    'A hair drier underneath a sheep.',
    'A tennis racket underneath a traffic light.',
    'A zebra underneath a broccoli.',
    # "on the left of"
    'A banana on the left of an apple.',
    'A couch on the left of a chair.',
    'A car on the left of a bus.',
    'A cat on the left of a dog.',
    'A carrot on the left of a broccoli.',
    # "on the right of" / "to the right of"
    'A pizza on the right of a suitcase.',
    'A cat on the right of a tennis racket.',
    'A stop sign on the right of a refrigerator.',
    'A sheep to the right of a wine glass.',
    'A zebra to the right of a fire hydrant.',
]

In [5]:
from diffusers import StableDiffusionPipeline, DPMSolverMultistepScheduler

# Text-to-image pipeline: load the checkpoint named by `model_id` in float16.
model_id = "stabilityai/stable-diffusion-2-1"
pipe = StableDiffusionPipeline.from_pretrained(
    model_id,
    torch_dtype=torch.float16,
)

# Swap the pipeline's scheduler for DPMSolverMultistepScheduler (DPM-Solver++),
# built from the config of the scheduler the pipeline was loaded with.
pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config)

# Run on the GPU and turn attention slicing on.
pipe = pipe.to("cuda")
pipe.enable_attention_slicing()

Fetching 13 files: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 13/13 [00:00<00:00, 127695.44it/s]


In [13]:
# Generate one image per entry of `prompts` and save it as l0000.png,
# l0001.png, ... (position in the list, zero-padded to four digits).
for i, prompt in enumerate(prompts):
    filename = f"l{str(i).zfill(4)}.png"
    result = pipe(prompt)
    image = result.images[0]
    image.save(filename)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 50/50 [00:41<00:00,  1.20it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 50/50 [00:42<00:00,  1.17it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 50/50 [00:43<00:00,  1.15it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 50/50 [00:43<00:00,  1.16it/s]
100%|███████████████████████████████████████████████████████████████████████████████████████████████████

In [10]:
# Colour prompts: single objects with a stated colour, then pairs of
# differently coloured objects.
prompts = [
    # One object, "<colour> colored <object>"
    'A red colored car.',
    'A black colored car.',
    'A pink colored car.',
    'A black colored dog.',
    'A red colored dog.',
    'A blue colored dog.',
    'A green colored banana.',
    'A red colored banana.',
    'A black colored banana.',
    'A white colored sandwich.',
    'A black colored sandwich.',
    'An orange colored sandwich.',
    'A pink colored giraffe.',
    'A yellow colored giraffe.',
    'A brown colored giraffe.',
    # Two objects, each with its own colour
    'A red car and a white sheep.',
    'A blue bird and a brown bear.',
    'A green apple and a black backpack.',
    'A green cup and a blue cell phone.',
    'A yellow book and a red vase.',
    'A white car and a red sheep.',
    'A brown bird and a blue bear.',
    'A black apple and a green backpack.',
    'A blue cup and a green cell phone.',
    'A red book and a yellow vase.',
]

In [12]:
# Miscellaneous prompts: unusual colour, reversed roles, counting,
# a multi-sentence layout description, and nested media.
prompts = [
    "A blue coloured pizza.",
    "A fish eating a pelican.",
    "One cat and one dog sitting on the grass.",
    "A stack of 3 books. A green book is on the top, sitting on a red book. The red book is in the middle, sitting on a blue book. The blue book is on the bottom.",
    "A photocopy of a photograph of a painting of a sculpture of a giraffe.",
]