### Necessary imports

In [2]:

from diffusers import StableDiffusionXLControlNetPipeline, ControlNetModel, AutoencoderKL
from diffusers.utils import load_image
import numpy as np
import torch

import cv2
from PIL import Image


In [3]:
from transformers import DPTFeatureExtractor, DPTForDepthEstimation

### Seeded generators for deterministic generation
Reference: https://huggingface.co/docs/diffusers/using-diffusers/reusing_seeds

In [4]:
# Build one CUDA generator per seed (0..3). When a list like this is handed to
# a pipeline as `generator=`, it presumably needs one entry per image in the
# batch -- confirm against the diffusers reproducibility guide.
generator = []
for seed in range(4):
    generator.append(torch.Generator(device="cuda").manual_seed(seed))

In [5]:
# Depth model and its matching pre-processor, both taken from the same
# "Intel/dpt-hybrid-midas" checkpoint. Only the model is moved to the GPU.
depth_estimator = DPTForDepthEstimation.from_pretrained("Intel/dpt-hybrid-midas")
depth_estimator = depth_estimator.to("cuda")
feature_extractor = DPTFeatureExtractor.from_pretrained("Intel/dpt-hybrid-midas")

# Depth-conditioned ControlNet for SDXL, loaded from the fp16 safetensors
# variant and then placed on the GPU.
controlnet = ControlNetModel.from_pretrained(
    "diffusers/controlnet-depth-sdxl-1.0",
    torch_dtype=torch.float16,
    use_safetensors=True,
    variant="fp16",
)
controlnet = controlnet.to("cuda")



In [6]:
# Replacement VAE checkpoint ("sdxl-vae-fp16-fix") loaded in half precision.
# It is deliberately NOT moved to the GPU here: the offload hooks installed
# below decide when each pipeline component lives on the GPU.
vae = AutoencoderKL.from_pretrained("madebyollin/sdxl-vae-fp16-fix", torch_dtype=torch.float16)

# SDXL base pipeline wired to the depth ControlNet and the VAE above.
pipe = StableDiffusionXLControlNetPipeline.from_pretrained(
    "stabilityai/stable-diffusion-xl-base-1.0",
    controlnet=controlnet,
    vae=vae,
    variant="fp16",
    use_safetensors=True,
    torch_dtype=torch.float16,
)

# Model CPU offload keeps sub-models on the CPU and moves each one to the GPU
# only while it runs. Calling pipe.to("cuda") beforehand (as this cell used to)
# first pushes every weight onto the GPU, which is wasted work and undermines
# the memory saving, so the explicit move has been dropped.
pipe.enable_model_cpu_offload()


diffusion_pytorch_model.safetensors: 100%|██████████| 335M/335M [00:05<00:00, 62.6MB/s] 
Loading pipeline components...: 100%|██████████| 7/7 [00:00<00:00,  7.87it/s]


In [13]:
def get_depth_map(image, size=(512, 1024)):
    """Estimate depth for ``image`` and return it as a 3-channel PIL image.

    Uses the notebook-level ``feature_extractor`` and ``depth_estimator``
    objects and runs the model on CUDA under autocast.

    Args:
        image: Input accepted by ``feature_extractor(images=...)``.
        size: ``(height, width)`` the predicted depth is resampled to with
            bicubic interpolation. Defaults to ``(512, 1024)``, the value
            that used to be hard-coded.

    Returns:
        PIL.Image.Image: the depth prediction min-max normalised to 0..255
        and replicated over three channels. Only the first item of the batch
        is returned.
    """
    pixel_values = feature_extractor(images=image, return_tensors="pt").pixel_values.to("cuda")
    with torch.no_grad(), torch.autocast("cuda"):
        depth_map = depth_estimator(pixel_values).predicted_depth

    # Add a channel axis so interpolate sees (batch, 1, H, W).
    depth_map = torch.nn.functional.interpolate(
        depth_map.unsqueeze(1),
        size=size,
        mode="bicubic",
        align_corners=False,
    )

    # Per-sample min-max normalisation to [0, 1].
    depth_min = torch.amin(depth_map, dim=[1, 2, 3], keepdim=True)
    depth_max = torch.amax(depth_map, dim=[1, 2, 3], keepdim=True)
    # A completely flat prediction has max == min; clamp the range so the
    # division yields zeros instead of NaN (which would corrupt the uint8 cast).
    depth_range = (depth_max - depth_min).clamp(min=torch.finfo(depth_map.dtype).eps)
    depth_map = (depth_map - depth_min) / depth_range

    # Replicate the single depth channel to RGB, then go channels-last on CPU.
    depth_rgb = torch.cat([depth_map] * 3, dim=1)
    depth_rgb = depth_rgb.permute(0, 2, 3, 1).cpu().numpy()[0]
    return Image.fromarray((depth_rgb * 255.0).clip(0, 255).astype(np.uint8))

In [7]:
# Strength of the ControlNet conditioning used by the pipeline calls below.
controlnet_conditioning_scale = 0.5  # recommended for good generalization

# Conditioning image, read from disk and passed to the pipeline unchanged.
# NOTE(review): the file name suggests it is already a depth map, which would
# explain why the get_depth_map call below is disabled -- confirm.
image = load_image("/DATA/ai22resch11001/temp_work/assets/pano_depth.png")
# depth_image = get_depth_map(image)


In [8]:
# Sanity check: show which type load_image returned for the conditioning image.
type(image)

PIL.Image.Image

In [12]:
# Sanity check: array dimensions of the conditioning image after conversion to NumPy.
np.array(image).shape

(512, 1024, 3)

In [10]:
# Prompt set 1: the shared panorama prefix followed by each scene description.
prompts_set1 = [
    f"360-degree panoramic image, of {subject}"
    for subject in (
        "a simple room",
        "a luxury room",
        "a spaceship",
        "a forest",
        "an anime style natural landscape",
    )
]

# Prompt set 2: the same prompts with extra projection keywords appended.
prompts_set2 = [f"{prompt}, stereoscopic, equirectangular" for prompt in prompts_set1]


In [11]:
# Generate one image per prompt in prompts_set1, conditioned on `image`.
# A seeded generator is supplied for every prompt (seeds 0..N-1) so that
# re-running this cell reproduces the same batch; without `generator=` the
# initial noise is drawn from the global RNG and changes on every run.
images = pipe(
    prompts_set1,
    image=image,
    num_inference_steps=30,
    controlnet_conditioning_scale=controlnet_conditioning_scale,
    generator=[
        torch.Generator(device="cuda").manual_seed(seed)
        for seed in range(len(prompts_set1))
    ],
).images

100%|██████████| 30/30 [00:28<00:00,  1.07it/s]


In [16]:
from PIL import Image
import os

def save_images_with_prompt_prefix(dest_pth, model_name, images, prompts):
    """Save each image as a PNG named after the model and its prompt.

    Files are written as
    ``<dest_pth>/<model_name>_<prompt with spaces as underscores>_image_<n>.png``
    where ``n`` starts at 1, and a confirmation line is printed per file.

    Args:
        dest_pth: Output directory. It is created, together with any missing
            parent directories, if it does not exist. A trailing path
            separator is optional.
        model_name: Prefix placed at the start of every file name.
        images: Sequence of objects exposing ``save(path)`` (e.g. PIL images).
        prompts: Prompts aligned by index with ``images``.

    Raises:
        IndexError: If ``prompts`` has fewer entries than ``images``.
    """
    # makedirs handles nested, not-yet-existing paths and is a no-op when the
    # directory is already there (os.mkdir fails if a parent is missing).
    os.makedirs(dest_pth, exist_ok=True)

    for i, image in enumerate(images):
        prompt_name = prompts[i].replace(" ", "_")  # Replace spaces with underscores
        # os.path.join keeps the file inside dest_pth whether or not the
        # caller included a trailing separator.
        filename = os.path.join(dest_pth, f"{model_name}_{prompt_name}_image_{i + 1}.png")
        image.save(filename)
        print(f"Image {i + 1} saved as {filename}")

In [17]:
# Write the images generated for prompt set 1 to disk, one PNG per prompt.
save_images_with_prompt_prefix(
    dest_pth='/DATA/ai22resch11001/temp_work/assets/controlnet_pset_1/',
    model_name='controlnet',
    images=images,
    prompts=prompts_set1,
)

Image 1 saved as /DATA/ai22resch11001/temp_work/assets/controlnet_pset_1/controlnet_360-degree_panoramic_image,_of_a_simple_room_image_1.png
Image 2 saved as /DATA/ai22resch11001/temp_work/assets/controlnet_pset_1/controlnet_360-degree_panoramic_image,_of_a_luxury_room_image_2.png
Image 3 saved as /DATA/ai22resch11001/temp_work/assets/controlnet_pset_1/controlnet_360-degree_panoramic_image,_of_a_spaceship_image_3.png
Image 4 saved as /DATA/ai22resch11001/temp_work/assets/controlnet_pset_1/controlnet_360-degree_panoramic_image,_of_a_forest_image_4.png
Image 5 saved as /DATA/ai22resch11001/temp_work/assets/controlnet_pset_1/controlnet_360-degree_panoramic_image,_of_an_anime_style_natural_landscape_image_5.png


In [18]:
# Generate one image per prompt in prompts_set2, conditioned on `image`.
# A seeded generator is supplied for every prompt (seeds 0..N-1) so that
# re-running this cell reproduces the same batch; without `generator=` the
# initial noise is drawn from the global RNG and changes on every run.
images2 = pipe(
    prompts_set2,
    image=image,
    num_inference_steps=30,
    controlnet_conditioning_scale=controlnet_conditioning_scale,
    generator=[
        torch.Generator(device="cuda").manual_seed(seed)
        for seed in range(len(prompts_set2))
    ],
).images

100%|██████████| 30/30 [00:27<00:00,  1.08it/s]


In [19]:
# Write the images generated for prompt set 2 to disk, one PNG per prompt.
# NOTE(review): the destination is the "controlnet_pset_1" directory even
# though these are the set-2 results -- confirm whether a separate folder
# was intended.
save_images_with_prompt_prefix(
    dest_pth='/DATA/ai22resch11001/temp_work/assets/controlnet_pset_1/',
    model_name='controlnet',
    images=images2,
    prompts=prompts_set2,
)

Image 1 saved as /DATA/ai22resch11001/temp_work/assets/controlnet_pset_1/controlnet_360-degree_panoramic_image,_of_a_simple_room,_stereoscopic,_equirectangular_image_1.png
Image 2 saved as /DATA/ai22resch11001/temp_work/assets/controlnet_pset_1/controlnet_360-degree_panoramic_image,_of_a_luxury_room,_stereoscopic,_equirectangular_image_2.png
Image 3 saved as /DATA/ai22resch11001/temp_work/assets/controlnet_pset_1/controlnet_360-degree_panoramic_image,_of_a_spaceship,_stereoscopic,_equirectangular_image_3.png
Image 4 saved as /DATA/ai22resch11001/temp_work/assets/controlnet_pset_1/controlnet_360-degree_panoramic_image,_of_a_forest,_stereoscopic,_equirectangular_image_4.png
Image 5 saved as /DATA/ai22resch11001/temp_work/assets/controlnet_pset_1/controlnet_360-degree_panoramic_image,_of_an_anime_style_natural_landscape,_stereoscopic,_equirectangular_image_5.png


In [None]:
# https://colab.research.google.com/drive/1QZHh9-3pjVtqlg2Oeqq_P11BZnH7cTpH?usp=sharing
