# Stable Diffusion on diffusers



**Run this cell after each session restart**

In [None]:
#@title General setup { display-mode: "form", run: "auto" }

!pip install torch torchvision
!pip install accelerate
!pip install gputil

# !pip install torchmetrics==0.10.3
# !pip install pytorch-lightning==1.8.3.post0

from IPython.display import HTML, Image, display, clear_output
from moviepy.editor import ImageSequenceClip, ipython_display
import ipywidgets as widgets
import os
import numpy as np

!apt-get -qq install ffmpeg
!pip install ninja
from google.colab import drive
drive.mount('/G', force_remount=True)
gdir = '/G/MyDrive/'
%cd $gdir

#@markdown Copying StyleGAN2 to the directory below on your Google drive (creating it, if it doesn't exist):
work_dir = 'sdfu' #@param {type:"string"}
#@markdown NB: Avoid connecting Google drive manually via the icon in Files section on the left. Doing so may break further operations.

work_dir = gdir + work_dir + '/'
if not os.path.isdir(work_dir):
  !git clone https://github.com/eps696/SDfu $work_dir
%cd $work_dir
!pip install -r requirements.txt
!pip install xformers
# !pip install git+https://github.com/openai/CLIP.git@main#egg=clip

from src.core.text import txt_clean
from src.core.utils import basename
from download import get_model
maindir = '/content/models'
command = ''
args = ''

clear_output()

# Hardware check
!ln -sf /opt/bin/nvidia-smi /usr/bin/nvidia-smi
import GPUtil as GPU
gpu = GPU.getGPUs()[0]
!nvidia-smi -L
print("GPU RAM {0:.0f}MB | Free {1:.0f}MB)".format(gpu.memoryTotal, gpu.memoryFree))
print('\nDone!')

In [None]:
#@title Load models

model = "1.5-dreamlike" #@param ['1.5','1.5-dreamlike','2.1','2.1-v','2-inpainting','1.5-instruct-pix2pix','Kandinsky 2.2']
controlnet = False #@param {type:"boolean"}

# Each entry: (is it needed for the current selection, archive URL, target directory).
# Entries are processed top to bottom, so base archives are fetched before their add-ons.
model_archives = [
  ('1.5' in model, "https://www.dropbox.com/s/9wuhum8w0iqs4o7/sdfu-v15-full-fp16.zip?dl=1", '/content/models/v1'),
  ('1.5' in model, "https://www.dropbox.com/s/z9uycihl6tybx9y/sdfu-v1-vaes-fp16.zip?dl=1", '/content/models/v1'),
  (model == '1.5-dreamlike', "https://www.dropbox.com/s/7hwh27xfzcy7a2g/sdfu-v15drm-unetx-fp16.zip?dl=1", '/content/models/v1'),
  (model == '1.5-instruct-pix2pix', "https://www.dropbox.com/s/n1z21ds5eauzk4m/sdfu-ip2p-unet-fp16.zip?dl=1", '/content/models/v1'),
  (model in ['2.1', '2-inpainting'], "https://www.dropbox.com/s/38876tjuklvwq82/sdfu-v21-full-fp16.zip?dl=1", '/content/models/v2'),
  (model == '2-inpainting', "https://www.dropbox.com/s/r5fa1mdxpw9e8k2/sdfu-v2i-unet-fp16.zip?dl=1", '/content/models/v2'),
  (model == '2.1-v', "https://www.dropbox.com/s/10gbecrugca1ydv/sdfu-v21v-full-fp16.zip?dl=1", '/content/models/v2v'),
  (controlnet, "https://www.dropbox.com/s/qhe1zpbubjr3t75/sdfu-controlnet.zip?dl=1", '/content/models'),
]
for is_needed, archive_url, target_dir in model_archives:
  if is_needed:
    get_model(archive_url, target_dir)

# clipseg for text masking (a single checkpoint file, fetched regardless of the selection)
get_model("https://www.dropbox.com/s/c0tduhr4g0al1cq/rd64-uni.pth?dl=1", '/content/models/clipseg', unzip=False)


## Generation


### Other settings [optional]

In [None]:
#@markdown Run this cell to override settings, if needed
out_dir = '_out' #@param {type:"string"}
sizeX = 768 #@param {type:"integer"}
sizeY = 768 #@param {type:"integer"}

cfg_scale = 7.5 #@param {type:"number"}
strength = 0.7 #@param {type:"number"}
steps = 50 #@param {type:"integer"}

sampler = 'ddim' #@param ['ddim', 'pndm', 'euler', 'klms', 'euler_a']
VAE = 'ema' #@param ['original', 'ema', 'mse']
batch = 1 #@param {type:"integer"}
seed = 696 #@param {type:"integer"}
ddim_eta = 0. #@param {type:"number"}

unprompt = 'low quality, poorly drawn, out of focus, blurry, tiled, segmented, oversaturated' #@param {type:"string"}
verbose = True #@param {type:"boolean"}

# Running this cell also clears the input-specific part of the command line.
command = ''

# Assemble the option string in one go; each fragment keeps its own padding spaces.
args = ''.join([
  ' -o %s ' % out_dir,
  ' -sz %d-%d ' % (sizeX, sizeY),
  ' -C %g ' % cfg_scale,
  ' -f %g ' % strength,
  ' -s %d ' % steps,
  ' -sm %s ' % sampler,
  ' --vae %s ' % VAE,
  ' -b %d ' % batch,
  ' -S %d ' % seed,
  ' --ddim_eta %g ' % ddim_eta,
  ' -un "%s" ' % unprompt,
  ' -v ' if verbose else '',
])



### Inputs

In [None]:
#@markdown All paths below are relative to the work directory on G drive (set during General setup above).

#@markdown Specify a text string or path to a text file to use **txt2img**:
prompt = '' #@param {type:"string"}

#@markdown Specify path to an image or directory to use **img2img**:
images = '' #@param {type:"string"}
#@markdown Specify mask as an image or directory or text string to use **inpainting**:
mask = '' #@param {type:"string"}
# mask_text = 'human, person' #@param {type:"string"}

#@markdown Load **finetuned** files if needed:
method = "custom" #@param ['text','lora','custom']
load_file = '' #@param {type:"string"}

maindir = '/content/models'
%cd $work_dir
command = ' -md %s ' % maindir
workname = ''

if len(prompt) > 0:
  if os.path.exists(prompt):
    print('found', prompt)
    command += ' -t %s ' % prompt
  else:
    command += ' -t "%s" ' % prompt
  workname = txt_clean(basename(prompt))[:44]

if len(images) > 0 and os.path.exists(images):
  print('found', images)
  command += ' -im %s ' % images
  if len(workname) == 0:
    workname = txt_clean(basename(images))
else:
  command += ' -f 1 '
  print('Strength is set to 1 for txt2img method')

if len(mask) > 0:
  if not os.path.exists(mask):
      mask = '"%s"' % mask
  command += ' -M %s ' % mask

if len(load_file) > 0 and os.path.exists(load_file):
  print('found', load_file)
  cmd = 'rt' if method=='text' else 'rl' if method=='lora' else 'rd'
  command += ' -%s %s ' % (cmd, load_file)

# !echo $command $args


In [None]:
#@markdown Add reference images to control output by **Controlnet** if needed:
cnet_images = '' #@param {type:"string"}
cnet_mode = "depth" #@param ['depth','canny','pose']
cnet_strength = 0.7 #@param {type:"number"}

#@markdown Or do it with **Instruct pix2pix** (if scale > 0):
ip2p_scale = 0. #@param {type:"number"}

if len(cnet_images) > 0 and os.path.exists(cnet_images):
  print('using Controlnet with', cnet_images)
  %run src/preproc.py -i $cnet_images --type $cnet_mode -o /content/$cnet_mode
  command += ' -cmod %s -cimg %s -cts %f ' % (cnet_mode, '/content/'+cnet_mode, cnet_strength)

elif ip2p_scale > 0:
  if 'instruct-pix2pix' in model:
    print('using Instruct-pix2pix with scale', ip2p_scale)
    command += ' --img_scale %f ' % ip2p_scale
  else:
    print('Select Instruct-pix2pix model above!')


### Generate images

In [None]:
#@markdown Separate frames

if "Kandinsky" in model:
  %run src/kand.py $args $command
else:
  %run src/gen.py $args $command

In [None]:
#@markdown Interpolations

latent_blending = 0. #@param {type:"number"}
frame_step = 10 #@param {type:"integer"}
num_repeat = 3 #@param {type:"integer"}

command += ' -fs %d -n %d ' % (frame_step, num_repeat)

if latent_blending > 0:
  command += ' --latblend %f' % latent_blending

if "Kandinsky" in model:
  %run src/kand.py $args $command
else:
  %run src/latwalk.py $args $command


Set `latent_blending` above 0 if you need smooth transitions.  
`frame_step` = length of the transition between prompts or images (in frames).  
`num_repeat` = multiplies inputs to make animation e.g. from a single prompt or image.

### Generate video

In [None]:
#@markdown Load Zeroscope (low-res) and Potat1 (high-res) text2video models

# Both archives are unpacked into the same v2 model directory, in this order.
video_archives = (
  "https://www.dropbox.com/s/38876tjuklvwq82/sdfu-v21-full-fp16.zip?dl=1",
  "https://www.dropbox.com/s/uyaidznqjaot7hw/sdfu-video-unet-fp16.zip?dl=1",
)
for archive_url in video_archives:
  get_model(archive_url, '/content/models/v2')


In [None]:
#@markdown Make short (~30 frames) videos.
#@markdown If provided input video is longer, it will be cut in pieces and processed one by one.

#@markdown Specify a text string:
prompt = '' #@param {type:"string"}

#@markdown Specify path to the input video, if needed:
video = '' #@param {type:"string"}

#@markdown Generate low-res or high-res version (or both):
lo = True #@param {type:"boolean"}
hi = False #@param {type:"boolean"}

#@markdown Other parameters:
sampler = 'euler_a' #@param ['ddim', 'pndm', 'euler', 'klms', 'euler_a']
frames = 36 #@param {type:"integer"}

command = '-sm %s  -vf %s' % (sampler, frames)
command += ' -md /content/models'
# %cd $work_dir

if len(prompt) > 0:
  command += ' -t "%s" ' % prompt
if len(video) > 0 and os.path.exists(video):
  command += ' -iv %s ' % video
if lo:
  command += ' -m vzs '
if hi:
  command += ' -up vpot '

if len(prompt) > 0 or (len(video) > 0 and os.path.exists(video)):
  %run src/vid.py $command


## Fine-tune

In [None]:
#@title Data setup
#@markdown Put your target images as zip-archive onto Google drive and type its path below (relative to G-drive root).
#@markdown If you use Custom diffusion, prepare also a bunch of generic reference images of similar class, to start from.
#@markdown `new_token` will be used in the prompts to summon your target imagery, e.g. as `<mycat1>`.
new_token = 'mycat1' #@param {type:"string"}
ref_class = 'cat' #@param {type:"string"}
target_data = 'tgt.zip' #@param {type:"string"}
ref_data = 'ref.zip' #@param {type:"string"}

data_dir = os.path.join('/content/data/', new_token)
!rm -rf $data_dir
os.makedirs(data_dir)
%cd $data_dir

tgt_path = os.path.join(gdir, target_data)
!unzip -j -o -q $tgt_path -d tgt
tgt_dir = os.path.join(data_dir, 'tgt')

ref_path = os.path.join(gdir, ref_data)
if len(ref_data) > 0 and os.path.isfile(ref_path):
  !unzip -j -o -q $ref_path -d ref
  ref_dir = os.path.join(data_dir, 'ref')
  with_prior = True
else:
  with_prior = False

%cd $work_dir


Use one of the following methods:
* [Textual inversion](https://textual-inversion.github.io) = adds new token to the text encoder. Generic but stable. Trained embeddings can be combined together on load.
* [LoRA](https://github.com/cloneofsimo/lora) = adds new token + partially finetunes Unet attention layers. Faster, precise, but may interfere with wider spectrum of topics.
* [Custom diffusion](https://github.com/adobe-research/custom-diffusion) = similar to LoRA (in a way). Can achieve impressive reproduction quality (including faces) with simple prompts, but may lose the point with too complex ones. To train it, you'll need to specify above both **target** reference images and **generic** ones (more random, of similar subjects). Apparently, you can generate the latter with SD itself.  

Mark `style` if you're training for a style, rather than an object.  
Mark `low_mem` if you get OOM.  

In [None]:
#@title Run training
method = "custom" #@param ['text','lora','custom']
style = False #@param  {type:"boolean"}
low_mem = False #@param  {type:"boolean"}
train_steps = 2000 #@param  {type:"integer"}
save_steps = 500 #@param  {type:"integer"}
batch = 1 #@param  {type:"integer"}

command = ' -t $s --term %s --data %s ' % (new_token, ref_class, tgt_dir)
command += ' -ts %d --save_step %d -b %d ' %(train_steps, save_steps, batch)
command += ' -val -md %s ' % maindir

if low_mem:
  command += ' --low_mem '
if style
  command += ' --style '

if method == 'text':
  %run src/train.py $command -lr 0.001 --type text
elif method == 'lora':
  %run src/train.py $command -lr 0.0001 --type lora
elif method == 'custom':
  %run src/train.py $command --term_data $ref_dir --type custom
