[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://githubtocolab.com/devdastl/EVA-8_Capstone_Assignment/blob/main/Part2-Inpainting_implementation/Attempt2_cuda_error.ipynb)

Install python dependecy required for the inference

In [None]:
!pip install diffusers
!pip install transformers


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


Import python depedency

In [1]:
import inspect
from typing import List, Optional, Union

import numpy as np
import torch

import PIL
from PIL import Image
from diffusers import AutoencoderKL, DDIMScheduler, DiffusionPipeline, PNDMScheduler, UNet2DConditionModel, DPMSolverMultistepScheduler
from diffusers.pipelines.stable_diffusion import StableDiffusionSafetyChecker
from tqdm.auto import tqdm
from transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer


  from .autonotebook import tqdm as notebook_tqdm


Download VAE trained models from `runwayml/stable-diffusion-inpainting`

In [2]:
vae = AutoencoderKL.from_pretrained("runwayml/stable-diffusion-inpainting", subfolder="vae")
vae.config

Cannot initialize model with low cpu memory usage because `accelerate` was not found in the environment. Defaulting to `low_cpu_mem_usage=False`. It is strongly recommended to install `accelerate` for faster and less memory-intense model loading. You can do so with: 
```
pip install accelerate
```
.


FrozenDict([('in_channels', 3),
            ('out_channels', 3),
            ('down_block_types',
             ['DownEncoderBlock2D',
              'DownEncoderBlock2D',
              'DownEncoderBlock2D',
              'DownEncoderBlock2D']),
            ('up_block_types',
             ['UpDecoderBlock2D',
              'UpDecoderBlock2D',
              'UpDecoderBlock2D',
              'UpDecoderBlock2D']),
            ('block_out_channels', [128, 256, 512, 512]),
            ('layers_per_block', 2),
            ('act_fn', 'silu'),
            ('latent_channels', 4),
            ('norm_num_groups', 32),
            ('sample_size', 256),
            ('scaling_factor', 0.18215),
            ('_class_name', 'AutoencoderKL'),
            ('_diffusers_version', '0.6.0.dev0'),
            ('_name_or_path', 'runwayml/stable-diffusion-inpainting')])

Download and instantiate other trained models for:
- stable duffusion tokenizer
- CLIP text_encoder
- Special UNET trained with 9 input channels and 4 output channel  
- scheduler for adding noise
- feature extractor

In [3]:

tokenizer = CLIPTokenizer.from_pretrained("runwayml/stable-diffusion-inpainting", subfolder="tokenizer")
text_encoder = CLIPTextModel.from_pretrained("runwayml/stable-diffusion-inpainting", subfolder="text_encoder")
unet = UNet2DConditionModel.from_pretrained("runwayml/stable-diffusion-inpainting", subfolder="unet")
scheduler = DDIMScheduler.from_pretrained("runwayml/stable-diffusion-inpainting", subfolder="scheduler")
#scheduler = PNDMScheduler.from_pretrained("runwayml/stable-diffusion-inpainting", subfolder="scheduler")
feature_extractor = CLIPFeatureExtractor.from_pretrained("runwayml/stable-diffusion-inpainting", subfolder="feature_extractor")
safety_checker = StableDiffusionSafetyChecker.from_pretrained("runwayml/stable-diffusion-inpainting", subfolder="safety_checker")

#scheduler = Union[scheduler1, scheduler2]

Cannot initialize model with low cpu memory usage because `accelerate` was not found in the environment. Defaulting to `low_cpu_mem_usage=False`. It is strongly recommended to install `accelerate` for faster and less memory-intense model loading. You can do so with: 
```
pip install accelerate
```
.
`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["id2label"]` will be overriden.


Now load the model in cuda device

In [4]:
device = "cuda"
vae.to(device)
text_encoder.to(device)
unet.to(device)
#feature_extractor.to(device)
safety_checker.to(device)

StableDiffusionSafetyChecker(
  (vision_model): CLIPVisionModel(
    (vision_model): CLIPVisionTransformer(
      (embeddings): CLIPVisionEmbeddings(
        (patch_embedding): Conv2d(3, 1024, kernel_size=(14, 14), stride=(14, 14), bias=False)
        (position_embedding): Embedding(257, 1024)
      )
      (pre_layrnorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
      (encoder): CLIPEncoder(
        (layers): ModuleList(
          (0-23): 24 x CLIPEncoderLayer(
            (self_attn): CLIPAttention(
              (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
              (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
              (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
              (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
            )
            (layer_norm1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
            (mlp): CLIPMLP(
              (activation_fn): QuickG

## Utility functions

code for preprocessing images and mask

In [5]:
def make_batch(image, mask, device):
    image = np.array(Image.open(image).convert("RGB"))
    image = image.astype(np.float32)/255.0
    image = image[None].transpose(0,3,1,2)
    image = torch.from_numpy(image)

    mask = np.array(Image.open(mask).convert("L"))
    mask = mask.astype(np.float32)/255.0
    mask = mask[None,None]
    mask[mask < 0.5] = 0
    mask[mask >= 0.5] = 1
    mask = torch.from_numpy(mask)

    masked_image = (1-mask)*image

    batch = {"image": image, "mask": mask, "masked_image": masked_image}
    for k in batch:
        batch[k] = batch[k].to(device=device)
        batch[k] = batch[k]*2.0-1.0
    return batch

## Variables for inpainting 

setup variables for inpainting

In [6]:
batch_size = 1
num_inference_steps=50
generator=None
strength=0.8
prompt='dummy prompt'
guidance_scale=7.5
eta=0.0

load images for inpainting

In [7]:
!git clone https://github.com/devdastl/EVA-8_Capstone_Assignment.git

init_image = 'EVA-8_Capstone_Assignment/Part2-Inpainting_implementation/test_data/test_image.png'
mask_image = 'EVA-8_Capstone_Assignment/Part2-Inpainting_implementation/test_data/mask_image.png'

In [8]:
# set timesteps
accepts_offset = "offset" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
extra_set_kwargs = {}
offset = 0
if accepts_offset:
    offset = 1
    extra_set_kwargs["offset"] = 1

scheduler.set_timesteps(num_inference_steps, **extra_set_kwargs)

In [9]:
batch = make_batch(init_image, mask_image, device)

# preprocess image
image = batch['image']
mask = batch['mask']
masked_image = batch['masked_image']
print(image.shape, mask.shape, masked_image.shape)

torch.Size([1, 3, 512, 512]) torch.Size([1, 1, 512, 512]) torch.Size([1, 3, 512, 512])


In [10]:
with torch.no_grad():
    init_latent_dist = vae.encode(image).latent_dist
    init_latents = init_latent_dist.sample(generator=generator)
    init_latents = 0.18215 * init_latents
    print(init_latents.shape)


    latent_masked = vae.encode(masked_image).latent_dist.sample()
    latent_masked = 0.18215 * latent_masked
    print(latent_masked.shape)

torch.Size([1, 4, 64, 64])
torch.Size([1, 4, 64, 64])


In [11]:
bchw = (1, 4, 64, 64)
mask = torch.nn.functional.interpolate(mask, size=bchw[-2:])
mask.shape

torch.Size([1, 1, 64, 64])

In [12]:
init_latents_orig = init_latents
maked_latents_unchange = torch.cat([latent_masked, mask], dim=1)
#init_latents = torch.cat([init_latents, maked_latents_unchange], dim=1)
print(init_latents.shape)

torch.Size([1, 4, 64, 64])


In [14]:
# get the original timestep using init_timestep
init_timestep = int(num_inference_steps * strength) + offset
init_timestep = min(init_timestep, num_inference_steps)
timesteps = scheduler.timesteps[-init_timestep]
timesteps = torch.tensor([timesteps] * batch_size, dtype=torch.long, device=device)

# add noise to latents using the timesteps
noise = torch.randn(init_latents.shape, generator=generator, device=device)
init_latents = scheduler.add_noise(init_latents, noise, timesteps)

# get prompt text embeddings
text_input = tokenizer(
    prompt,
    padding="max_length",
    max_length=tokenizer.model_max_length,
    truncation=True,
    return_tensors="pt",
)
text_embeddings = text_encoder(text_input.input_ids.to(device))[0]

In [15]:
# here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
# of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
# corresponds to doing no classifier free guidance.
do_classifier_free_guidance = guidance_scale > 1.0
# get unconditional embeddings for classifier free guidance
if do_classifier_free_guidance:
    max_length = text_input.input_ids.shape[-1]
    uncond_input = tokenizer(
        [""] * batch_size, padding="max_length", max_length=max_length, return_tensors="pt"
    )
    uncond_embeddings = text_encoder(uncond_input.input_ids.to(device))[0]

    # For classifier free guidance, we need to do two forward passes.
    # Here we concatenate the unconditional and text embeddings into a single batch
    # to avoid doing two forward passes
    #text_embeddings = torch.cat([uncond_embeddings, text_embeddings])

In [16]:
#unet = torch.nn.DataParallel(unet, device_ids=[0,1])

In [17]:
# prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
# eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
# eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
# and should be between [0, 1]
accepts_eta = "eta" in set(inspect.signature(scheduler.step).parameters.keys())
extra_step_kwargs = {}
if accepts_eta:
    extra_step_kwargs["eta"] = eta

init_latents = torch.cat([init_latents, maked_latents_unchange], dim=1).to('cuda:1')
unet = unet.to('cuda:1')
text_embeddings = text_embeddings.to('cuda:1')
maked_latents_unchange = maked_latents_unchange.to('cuda:1')
latents = init_latents 

t_start = max(num_inference_steps - init_timestep + offset, 0)
print(f't_start : {t_start}')
for i, t in tqdm(enumerate(scheduler.timesteps[t_start:])):
    # expand the latents if we are doing classifier free guidance
    latent_model_input = torch.cat([latents] * 1) if do_classifier_free_guidance else latents

    # predict the noise residual
    #print(latent_model_input.shape)
    noise_pred = unet(latent_model_input, t, encoder_hidden_states=text_embeddings)["sample"]
    #print(noise_pred.shape)

    # perform guidance
    #if do_classifier_free_guidance:
    #    noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
    #    noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)

    # compute the previous noisy sample x_t -> x_t-1
    latents = scheduler.step(noise_pred, t, latents[:,0:4:,:,:], **extra_step_kwargs)["prev_sample"]

    # masking
    #intermerdiate = scheduler.add_noise(maked_latents_unchange, noise, t)
    latents = torch.cat([latents, maked_latents_unchange], dim=1)
    del latent_model_input
    del noise_pred
    torch.cuda.empty_cache()
    # latents = (init_latents_proper * mask) + (latents * (1 - mask))

0it [00:00, ?it/s]

torch.Size([1, 9, 64, 64])


1it [00:01,  1.65s/it]

torch.Size([1, 4, 64, 64])
torch.Size([1, 9, 64, 64])


2it [00:01,  1.28it/s]

torch.Size([1, 4, 64, 64])
torch.Size([1, 9, 64, 64])
torch.Size([1, 4, 64, 64])


4it [00:02,  2.64it/s]

torch.Size([1, 9, 64, 64])
torch.Size([1, 4, 64, 64])
torch.Size([1, 9, 64, 64])


6it [00:02,  4.15it/s]

torch.Size([1, 4, 64, 64])
torch.Size([1, 9, 64, 64])
torch.Size([1, 4, 64, 64])
torch.Size([1, 9, 64, 64])


6it [00:02,  2.28it/s]


OutOfMemoryError: CUDA out of memory. Tried to allocate 10.00 MiB (GPU 1; 15.90 GiB total capacity; 15.06 GiB already allocated; 5.75 MiB free; 15.08 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF