In [3]:
# Report the GPU name plus total and free memory as header-less CSV.
!nvidia-smi --query-gpu=name,memory.total,memory.free --format=csv,noheader

NVIDIA GeForce RTX 3090, 24576 MiB, 18394 MiB


In [None]:
# Install the package that provides `compare`, which is imported later in this
# notebook and used to show images side by side.
%pip install jupyter_compare_view

In [9]:
# Fill in these environment variables.
# MODEL_NAME, INSTANCE_DIR, CLASS_DIR and OUTPUT_DIR are referenced as $VARS by the
# `accelerate launch` cell; OUTPUT_DIR is also read back through os.environ when the
# trained pipeline is loaded.
# Replace the `<your instance>` placeholder with a real folder name before running.
# Prefer a name without spaces, since these values are handed to a shell command.
# Keep comments on their own lines: text after a `%env` assignment becomes part of
# the value.
%env MODEL_NAME=stabilityai/stable-diffusion-2-depth
%env INSTANCE_DIR=/workspace/content/data/<your instance>
%env CLASS_DIR=/workspace/content/data/person
%env OUTPUT_DIR=/workspace/content/data/output-2
# Load the autoreload extension; it is switched on with `%autoreload 2` in the next cell.
%load_ext autoreload

In [10]:
%autoreload 2

# !python -m debugpy --listen 0.0.0.0:5678 --wait-for-client \
!accelerate launch \
  train_dreambooth.py \
  --mixed_precision="fp16" \
  --pretrained_model_name_or_path=$MODEL_NAME  \
  --train_text_encoder \
  --instance_data_dir=$INSTANCE_DIR \
  --class_data_dir=$CLASS_DIR \
  --output_dir=$OUTPUT_DIR \
  --with_prior_preservation --prior_loss_weight=1.0 \
  --instance_prompt="a photo of <your instance> person" \
  --class_prompt="a photo of person" \
  --resolution=512 \
  --train_batch_size=1 \
  --gradient_accumulation_steps=1 \
  --learning_rate=1e-6 \
  --lr_scheduler="constant" \
  --lr_warmup_steps=0 \
  --num_class_images=200 \
  --max_train_steps=300 \
  --use_8bit_adam

# Test the model you just made

In [8]:
import PIL
import torch
from torchvision import transforms
import diffusers
import transformers
from diffusers import StableDiffusionDepth2ImgPipeline
import os
from jupyter_compare_view import compare

In [None]:
# Load the fine-tuned depth-to-image pipeline from the training output directory
# (the OUTPUT_DIR environment variable) and place it on the GPU.
trained_model_dir = os.environ.get("OUTPUT_DIR")
print(f'Getting model from {trained_model_dir}')
pipeline = StableDiffusionDepth2ImgPipeline.from_pretrained(trained_model_dir).to("cuda")

In [None]:
# Use an image as an input depth map
image_path = "/workspace/content/data/samples/village.jpg" # replace with whatever you want

# Force three-channel RGB so grayscale / RGBA / palette files still produce a
# (3, H, W) tensor for the depth estimator.
source_image = PIL.Image.open(image_path).convert("RGB")

# `image` stays a PIL image at the working resolution: it is shown next to the depth
# map below and is reused by the generation cell that follows.
image = transforms.Resize((384, 384))(source_image)

# Batched float tensor on the GPU, shape (1, 3, 384, 384).
image_tensor = transforms.ToTensor()(image)[None, :, :, :].to("cuda")

# Inference only, so skip building an autograd graph.
with torch.no_grad():
    # The indexing below treats predicted_depth as (batch, H, W).
    depth_map = pipeline.depth_estimator(image_tensor).predicted_depth

# Rescale to [0, 1] for display. ToPILImage multiplies float tensors by 255 and casts
# to uint8, so a [-1, 1] range would wrap negative values and garble the preview.
depth_min = torch.amin(depth_map, dim=[0, 1, 2], keepdim=True)
depth_max = torch.amax(depth_map, dim=[0, 1, 2], keepdim=True)
# Guard against a perfectly flat depth map (max == min), which would divide by zero.
depth_range = (depth_max - depth_min).clamp_min(1e-8)
depth_map = (depth_map - depth_min) / depth_range

# Drop the batch dimension and convert to a single-channel PIL image.
depth_map = transforms.ToPILImage()(depth_map[0, :, :].float().cpu())
compare(depth_map, image, cmap="gray", start_mode="horizontal", start_slider_pos=0.73)

In [None]:
# Generate an image from the fine-tuned pipeline, using `image` (the resized PIL image
# from the previous cell) as the depth/structure source.
#
# The prompt reuses the exact subject wording of the training --instance_prompt
# ("<your instance> person"), so the placeholder matches the one used elsewhere in
# this notebook. Replace `<your instance>` with your own identifier.
prompt = (
    "a photo of <your instance> person, standing, Kodachrome, Canon 5D, "
    "f2 aperture, extremely detailed, sharp focus"
)
result = pipeline(prompt, image)

# `.images` is the list of generated PIL images; show the first one next to the input.
generated_image = result.images[0]
compare(generated_image, image, cmap="gray", start_mode="horizontal", start_slider_pos=0.73)