<a href="https://colab.research.google.com/github/cjfghk5697/temporary_for_SDM/blob/main/ControlNetDiffusersDevelopment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Install the development version of Diffusers and its dependencies
Then clone BLIP and import the libraries.

In [None]:
! pip install "git+https://github.com/takuma104/diffusers.git@controlnet" # Diffusers in development version 
! pip install transformers accelerate safetensors xformers opencv-python timm==0.4.12 fairscale==0.4.4

In [None]:
# clone BLIP 
import sys
if 'google.colab' in sys.modules:
    print('Running in Colab.')
    !pip install pytorch_pretrained_bert --upgrade
    !git clone https://github.com/salesforce/BLIP
    %cd BLIP

In [None]:
from diffusers import StableDiffusionControlNetPipeline, EulerAncestralDiscreteScheduler
from diffusers.utils import load_image
import torch
import cv2
import numpy as np

from models.blip import blip_decoder
from torchvision import transforms
from PIL import Image

## Common setup

In [None]:
# Euler-ancestral scheduler assigned to the pipelines below.
# Loaded with `from_pretrained`: passing a repo id to `from_config` is the
# deprecated path in diffusers (`from_config` is meant to take a config dict).
euler_scheduler = EulerAncestralDiscreteScheduler.from_pretrained(
    "takuma104/control_sd15_canny",
    subfolder="scheduler",
)

## Canny Edge model

In [None]:
# Load the Canny ControlNet pipeline in half precision and move it to the GPU.
pipe_canny = StableDiffusionControlNetPipeline.from_pretrained(
    "takuma104/control_sd15_canny",
    torch_dtype=torch.float16,
)
pipe_canny = pipe_canny.to("cuda")

# Swap in the Euler-ancestral scheduler and enable xformers attention.
pipe_canny.scheduler = euler_scheduler
pipe_canny.enable_xformers_memory_efficient_attention()


#### Control by preprocessed image

In [None]:
# Download an edge image that has already been preprocessed, and display it.
canny_edged_image = load_image(
    "https://huggingface.co/takuma104/controlnet_dev/resolve/main/vermeer_canny_edged.png"
)
canny_edged_image

In [None]:
# Seed a CPU generator so this sample is repeatable.
generator = torch.Generator(device="cpu")
generator.manual_seed(3)

# Generate one image conditioned on the preprocessed edge image.
image = pipe_canny(
    prompt="best quality, extremely detailed",
    negative_prompt="monochrome, lowres, bad anatomy, worst quality, low quality",
    controlnet_hint=canny_edged_image,
    num_inference_steps=30,
    generator=generator,
).images[0]
image

#### Control by an edge image generated with `cv2.Canny`

In [None]:
# Fetch the source picture and derive the edge map from it with OpenCV.
original_image = load_image(
    "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main/stable_diffusion_imgvar/input_image_vermeer.png"
)

control = cv2.Canny(np.array(original_image), 100, 200)  # threshold1, threshold2
# NOTE(review): the edge map is left single-channel; presumably the pipeline
# accepts a 2-D hint -- re-enable the conversion below if it does not.
#control = cv2.cvtColor(control, cv2.COLOR_GRAY2RGB)
Image.fromarray(control)

In [None]:
# Inspect the dimensions of the edge map before passing it as the hint.
np.shape(control)

In [None]:
# Seed a CPU generator so this sample is repeatable.
generator = torch.Generator(device="cpu")
generator.manual_seed(3)

# Generate one image conditioned on the edge map computed with cv2.Canny.
image = pipe_canny(
    prompt="best quality, extremely detailed",
    negative_prompt="monochrome, lowres, bad anatomy, worst quality, low quality",
    controlnet_hint=control,
    num_inference_steps=30,
    generator=generator,
).images[0]
image

#### Automatic Prompt

**Generating Automatic Prompt**

In [None]:
# Caption the source picture with BLIP; the caption is printed and kept in
# `caption` for later cells.
image_size = 384
original_image = load_image("https://huggingface.co/datasets/diffusers/test-arrays/resolve/main/stable_diffusion_imgvar/input_image_vermeer.png")

model_url = 'https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_capfilt_large.pth'

model = blip_decoder(pretrained=model_url, image_size=image_size, vit='base')
model.eval()
model = model.to('cuda')

transform = transforms.Compose([
    transforms.Resize((image_size, image_size), interpolation=transforms.functional.InterpolationMode.BICUBIC),
    transforms.ToTensor(),
    # Per-channel mean and std used to normalize the input.
    transforms.Normalize((0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711)),
])

# `original_image` stays a PIL image; the batched model input gets its own
# name. The transform already resizes, so the extra PIL `.resize` that made
# the BICUBIC setting above a no-op has been dropped.
blip_input = transform(original_image).unsqueeze(0).to('cuda')

with torch.no_grad():
    # beam search
    caption = model.generate(blip_input, sample=False, num_beams=3, max_length=20, min_length=5)
    # nucleus sampling
    # caption = model.generate(blip_input, sample=True, top_p=0.9, max_length=20, min_length=5)
print('caption: ' + caption[0])

**Test Automatic Prompt**

In [None]:
# Seed a CPU generator so this sample is repeatable.
generator = torch.Generator(device="cpu")
generator.manual_seed(3)

# Same edge-map hint as before, with the BLIP caption as the prompt.
image = pipe_canny(
    prompt=caption[0],
    negative_prompt="monochrome, lowres, bad anatomy, worst quality, low quality",
    controlnet_hint=control,
    num_inference_steps=30,
    generator=generator,
).images[0]
image

## OpenPose model

In [None]:
# Download the pose image used as the OpenPose hint, and display it.
pose_image = load_image(
    'https://huggingface.co/takuma104/controlnet_dev/resolve/main/pose.png'
)
pose_image

In [None]:
# Load the OpenPose ControlNet pipeline in half precision and move it to the GPU.
pipe_op = StableDiffusionControlNetPipeline.from_pretrained(
    "takuma104/control_sd15_openpose",
    torch_dtype=torch.float16,
)
pipe_op = pipe_op.to("cuda")

# Swap in the Euler-ancestral scheduler and enable xformers attention.
pipe_op.scheduler = euler_scheduler
pipe_op.enable_xformers_memory_efficient_attention()


In [None]:
# Seed a CPU generator so this sample is repeatable.
generator = torch.Generator(device="cpu")
generator.manual_seed(0)

# Generate one image conditioned on the pose image.
image = pipe_op(
    prompt="best quality, extremely detailed, football, a boy",
    negative_prompt="lowres, bad anatomy, worst quality, low quality",
    controlnet_hint=pose_image,
    generator=generator,
    num_inference_steps=30,
).images[0]
image