# Stable Diffusion on diffusers



**Run this cell after each session restart**

## General Setup

In [None]:
# General setup: installs dependencies, mounts Google Drive, fetches the SDfu
# code into the chosen work directory and imports the helpers used by later cells.
!pip install gputil

# Hide FutureWarning messages and everything below CRITICAL from the diffusers model loader.
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
import logging
logging.getLogger('diffusers.models.modeling_utils').setLevel(logging.CRITICAL)

from IPython.display import HTML, Image, display, clear_output
from moviepy.editor import ImageSequenceClip, ipython_display
import ipywidgets as widgets
import os
import numpy as np

# !apt-get -qq install ffmpeg
!pip install ninja
# Mount Google Drive at /G and make its root the current directory.
from google.colab import drive
drive.mount('/G', force_remount=True)
gdir = '/G/MyDrive/'
%cd $gdir

#@markdown Setting up Stable Diffusion code in the directory below on your Google drive (creating it, if it doesn't exist):
work_dir = 'sdfu' #@param {type:"string"}
#@markdown NB: Avoid connecting Google drive manually via the icon in Files section on the left. Doing so may break further operations.

# Turn the form value into an absolute path on the drive; the repository is cloned only if that directory is missing.
work_dir = gdir + work_dir + '/'
if not os.path.isdir(work_dir):
  !git clone https://github.com/eps696/SDfu $work_dir
%cd $work_dir
# the order is important! basicsr (for controlnet) needs pt latest
# All packages below are installed with --no-deps, i.e. pip does not pull in their dependencies.
!pip install --no-deps xformers==0.0.29 # for torch 2.5.1+cu124
!pip install --no-deps optimum-quanto # 0.2.4 for torch 2.4

!pip install --no-deps basicsr
# workaround for making outdated basicsr work with modern torchvision
import sys, types
from torchvision.transforms.functional import rgb_to_grayscale
# Create a module for `torchvision.transforms.functional_tensor`
functional_tensor = types.ModuleType("torchvision.transforms.functional_tensor")
functional_tensor.rgb_to_grayscale = rgb_to_grayscale
# Add this module to sys.modules so other imports can access it
sys.modules["torchvision.transforms.functional_tensor"] = functional_tensor

!pip install --no-deps git+https://github.com/openai/CLIP
!pip install --no-deps torchmetrics omegaconf trampoline torchdiffeq torchsde jsonmerge clean-fid resize_right av
# !pip install -r requirements.txt

# Helpers from the cloned repository (presumably resolved relative to the work directory entered above).
from src.core.text import txt_clean
from src.core.utils import basename
from download import get_model
# Directory for downloaded models; later cells pass it to the scripts via `-md`.
maindir = '/content/models'
# Option strings for the generation scripts; they start empty and are filled by the cells below.
command = ''
args = ''

# Drop the installation logs from the cell output.
clear_output()

# Hardware check
!ln -sf /opt/bin/nvidia-smi /usr/bin/nvidia-smi
import GPUtil as GPU
gpu = GPU.getGPUs()[0]
!nvidia-smi -L
print("GPU RAM {0:.0f}MB | Free {1:.0f}MB)".format(gpu.memoryTotal, gpu.memoryFree))
print('\nDone!')

## Images


In [None]:
#@title Load base model

model = "1.5-dreamlike" #@param ['1.5','1.5-dreamlike','LCM','2.1','2-inpainting','Kandinsky 2.2','SDXL','SDXL Lightning 4 steps','SDXL Lightning 8 steps']

#@markdown NB: Options and methods below work mostly with SD 1.x models.
#@markdown Some can work with SDXL. Models 2.x and Kandinsky don't support them, and are given only for reference.

# Download table, processed top to bottom. Each row is:
# (needed for the selected model?, path whose presence means "already downloaded", archive urls, target directory).
# The presence check is done right before each download, so earlier rows can satisfy later ones.
base_downloads = [
  ('1.5' in model, '/content/models/v1',
   ["https://www.dropbox.com/s/9wuhum8w0iqs4o7/sdfu-v15-full-fp16.zip?dl=1",
    "https://www.dropbox.com/s/z9uycihl6tybx9y/sdfu-v1-vaes-fp16.zip?dl=1"],
   '/content/models/v1'),
  (model == '1.5-dreamlike', '/content/models/v1/unet15drm',
   ["https://www.dropbox.com/s/7hwh27xfzcy7a2g/sdfu-v15drm-unetx-fp16.zip?dl=1"],
   '/content/models/v1'),
  (model == 'LCM', '/content/models/lcm',
   ["https://www.dropbox.com/s/3sql6gsjmtvw2zo/sdfu-lcm-fp16.zip?dl=1"],
   '/content/models'),
  (model in ['2.1', '2-inpainting'], '/content/models/v2',
   ["https://www.dropbox.com/s/38876tjuklvwq82/sdfu-v21-full-fp16.zip?dl=1"],
   '/content/models/v2'),
  (model == '2-inpainting', '/content/models/v2/unet2i',
   ["https://www.dropbox.com/s/r5fa1mdxpw9e8k2/sdfu-v2i-unet-fp16.zip?dl=1"],
   '/content/models/v2'),
]
for dl_needed, dl_marker, dl_urls, dl_target in base_downloads:
  if dl_needed and not os.path.exists(dl_marker):
    for dl_url in dl_urls:
      get_model(dl_url, dl_target)

# clipseg for text masking (a single .pth file, fetched without unzipping, for any model)
clipseg_dir = '/content/models/xtra/clipseg'
if not os.path.exists(clipseg_dir):
  get_model("https://www.dropbox.com/scl/fi/blwzoyu9abp4q6wfq4hoj/rd64-uni.pth?rlkey=mhfp4jlles5oio1sn1eneyx4l&dl=1", clipseg_dir, unzip=False)

# Form label -> short model name passed to the scripts via `--model`.
modict = {
  '1.5': '15',
  '1.5-dreamlike': '15drm',
  'LCM': 'lcm',
  '2-inpainting': '2i',
  '2.1': '21',
  'Kandinsky 2.2': 'kand',
  'SDXL': 'xl',
  'SDXL Lightning 4 steps': 'xl4',
  'SDXL Lightning 8 steps': 'xl8',
}
modname = modict[model]


### Inputs

In [None]:
#@markdown Run this for every new generation!

#@markdown All paths below are relative to the work directory on G drive (which is set during General setup above).

#@markdown Specify a text string or path to a text file to use **txt2img**:
prompt = '' #@param {type:"string"}
#@markdown Specify path to a reference image or directory to use it as a **visual prompt** with IP adapter:
imgref = '' #@param {type:"string"}

#@markdown Specify path to an image or directory to use **img2img**:
images = '' #@param {type:"string"}

#@markdown Specify mask as a text description or a path to an image or directory, to use **inpainting**:
mask = '' #@param {type:"string"}

#@markdown Load **finetuned** files if needed (ensure that you selected correct base model!):
method = "custom" #@param ['text','lora','custom']
load_file = '' #@param {type:"string"}

maindir = '/content/models'
%cd $work_dir
command = ' -md %s ' % maindir
workname = ''

if len(prompt) > 0:
  if os.path.exists(os.path.join(gdir, prompt)):
    prompt = os.path.join(gdir, prompt)
  if os.path.exists(prompt):
    print('found', prompt)
    command += ' -t %s ' % prompt
  else:
    command += ' -t "%s" ' % prompt
  workname = txt_clean(basename(prompt))[:44]

if len(images) > 0:
  if not os.path.exists(images):
    images = os.path.join(gdir, images)
  if os.path.exists(images):
    print('found', images)
    command += ' -im %s ' % images
    if len(workname) == 0:
      workname = txt_clean(basename(images))
else:
  command += ' -f 1 '
  print('Strength is set to 1 for txt2img method')

if len(imgref) > 0:
  if not os.path.exists(imgref):
    imgref = os.path.join(gdir, imgref)
  if os.path.exists(imgref):
    if not os.path.exists('/content/models/image'):
      print('downloading IP adapter')
      get_model("https://www.dropbox.com/scl/fi/zcvg092n1n4aqpmoufd70/sdfu-imageref-fp16.zip?rlkey=ompjbfu2um90hyid2rlglv4d8&dl=1", maindir)
    print('using IP adapter with', imgref)
    command += ' -imr %s ' % imgref
    workname += '-' + txt_clean(basename(imgref))

if len(mask) > 0:
  if not os.path.exists(mask):
    mask = '"%s"' % mask
  command += ' -M %s ' % mask

if len(load_file) > 0:
  if not os.path.exists(load_file):
    load_file = os.path.join(gdir, load_file)
  if os.path.exists(load_file):
    print('found', load_file)
    cmd = 'rt' if method=='text' else 'rl' if method=='lora' else 'rd'
    command += ' -%s %s ' % (cmd, load_file)

command += ' --model %s ' % modname

if 'Lightning' in model:
  command += ' --lightning '

# !echo $command $args


In [None]:
#@markdown Add reference images to control output by **Controlnet** if needed:
cnet_images = '' #@param {type:"string"}
cnet_mode = "depth" #@param ['depth any 2','depth','canny','pose']
cnet_strength = 0.7 #@param {type:"number"}
if cnet_mode == 'depth any 2': cnet_mode = 'deptha'

if len(cnet_images) > 0 and cnet_strength > 0:
    if not os.path.exists(cnet_images):
      cnet_images = os.path.join(gdir, cnet_images)
    if os.path.exists(cnet_images):
      if not os.path.exists('/content/models/control'):
        print('downloading Controlnet model')
        get_model("https://www.dropbox.com/s/qhe1zpbubjr3t75/sdfu-controlnet.zip?dl=1", '/content/models')
      print('using Controlnet with', cnet_images)
      predir = '/content/%s/%s' % (basename(cnet_images), cnet_mode)
      %run src/preproc.py -i $cnet_images --type $cnet_mode -o $predir -md /content/models/control
      command += ' -cmod %s -cnimg %s -cts %f ' % (cnet_mode, predir, cnet_strength)


### Other settings [optional]

Set low `steps` (2-8) if using TCD sampler or LCM model.

In [None]:
#@markdown Run this cell to override default settings, if needed

out_dir = '_out' #@param {type:"string"}
sizeX = 768 #@param {type:"integer"}
sizeY = 768 #@param {type:"integer"}

steps = 33 #@param {type:"integer"}
cfg_scale = 7.5 #@param {type:"number"}
strength = 1. #@param {type:"number"}
imgref_weight = 0.5 #@param {type:"number"}
ip_adapter = 'orig' #@param ['orig','plus','face-full']

sampler = 'ddim' #@param ['ddim','ddpm','pndm','lms','euler', 'euler_a','uni','dpm','TCD']
VAE = 'ema' #@param ['original', 'ema', 'mse']
batch = 1 #@param {type:"integer"}
seed = 696 #@param {type:"integer"}

eta = 0. #@ param {type:"number"}

unprompt = 'low quality, poorly drawn, out of focus, blurry, tiled, segmented, oversaturated' #@param {type:"string"}
verbose = True #@param {type:"boolean"}

# The SDXL Lightning 4/8-step models get their step count forced; `modname` comes from the "Load base model" cell.
steps = {'xl4': 4, 'xl8': 8}.get(modname, steps)

# The option string is assembled from pieces, each padded with spaces, and joined at the end.
arg_parts = [
  ' -o %s ' % out_dir,
  ' -sz %d-%d ' % (sizeX, sizeY),
  ' -s %d ' % steps,
  ' -C %g ' % cfg_scale,
  ' -f %g ' % strength,
  ' -imw %g ' % imgref_weight,
]
if ip_adapter != 'orig':
  arg_parts.append(' -ip %s ' % ip_adapter)

if sampler != 'TCD':
  arg_parts.append(' -sm %s ' % sampler)
elif modname == 'xl':
  # TCD comes with its own LoRA and a trailing `-C 1`, placed after the `-C` option above
  arg_parts.append(' -sm TCD --load_lora h1t/TCD-SDXL-LoRA -C 1 ')
elif modname[0] == '1':
  arg_parts.append(' -sm tcd --load_lora h1t/TCD-SD15-LoRA -C 1 ')
else:
  # unsupported combination: no sampler option is added at all
  print('TCD sampler works with SDXL or 1.x models!')

arg_parts += [
  ' --vae %s ' % VAE,
  ' -b %d ' % batch,
  ' -S %d ' % seed,
  ' --eta %g ' % eta,
  ' -un "%s" ' % unprompt,
]
if verbose:
  arg_parts.append(' -v ')

args = ''.join(arg_parts)



### Generate

In [None]:
#@markdown Separate frames

if "Kandinsky" in model:
  %run src/kand.py $args $command
elif "SDXL" in model:
  %run src/sdxl.py $args $command
else:
  %run src/gen.py $args $command

In [None]:
#@markdown Interpolations

latent_blending = 0. #@param {type:"number"}
frame_step = 10 #@param {type:"integer"}
num_repeat = 3 #@param {type:"integer"}

command += ' -fs %d -n %d ' % (frame_step, num_repeat)

if "Kandinsky" in model:
  %run src/kand.py $args $command
elif "SDXL" in model:
  %run src/sdxl.py $args $command
else:
  if latent_blending > 0:
    command += ' --latblend %f' % latent_blending
  %run src/latwalk.py $args $command


Set `latent_blending` if you need smooth transitions (not available for Kandinsky, SDXL and LCM models). The value range is 0–1; 0.7 is a good start.  
`frame_step` = length of the transition between prompts or images (in frames).  
`num_repeat` = repeating inputs to make animation e.g. from a single prompt or image *(ignored when interpolating between images)*.

In [None]:
#@markdown Edit image sequence with Tokenflow

!pip install av
clear_output()

images = '' #@param {type:"string"}

if len(prompt) > 0 and len(images) > 0 and os.path.exists(images):
  %run src/tokenflow.py -md $maindir -im $images -t "$prompt" --batch_size 4 --batch_pivot --cpu
else:
  print("Set text prompt and path to the images in the Inputs above!")


## Video

In [None]:
#@markdown Load AnimateDiff motion adapter (for SD 1.x base models), CogX or Zeroscope (lo-res) model

animodel = "AnimateDiff" #@param ['AnimateDiff','Zeroscope']
# 'CogX' > 20gb RAM

anima_dir = '/content/models/anima'
v2_dir = '/content/models/v2'
zeroscope_unet_dir = '/content/models/v2/unetvzs'

if 'AnimateDiff' in animodel and not os.path.exists(anima_dir):
  # the motion adapter is fetched only when a 1.x model is selected (`modname`) and already downloaded
  assert modname[0]=='1' and os.path.exists('/content/models/v1'), "AnimateDiff works only with 1.x base SD models"
  get_model("https://www.dropbox.com/scl/fi/4gorn5lf9owygizhgwuy6/sdfu-animatediff-fp16.zip?rlkey=mh54vl9ngre9n898pcgsr8fry&dl=1", '/content/models')
elif 'Zeroscope' in animodel and not os.path.exists(zeroscope_unet_dir):
  # the v2 base archive goes first, then the video unet; each is skipped if its directory already exists
  zeroscope_downloads = [
    (v2_dir, "https://www.dropbox.com/s/38876tjuklvwq82/sdfu-v21-full-fp16.zip?dl=1"),
    (zeroscope_unet_dir, "https://www.dropbox.com/s/uyaidznqjaot7hw/sdfu-video-unet-fp16.zip?dl=1"),
  ]
  for zs_marker, zs_url in zeroscope_downloads:
    if not os.path.exists(zs_marker):
      get_model(zs_url, v2_dir)


In [None]:
#@markdown **Set the inputs and run the process**

#@markdown Specify a text string:
prompt = '' #@param {type:"string"}
#@markdown Specify reference image(s) to use it as a **visual prompt**:
imgref = '' #@param {type:"string"}
imgref_weight = 0.3 #@param {type:"number"}
use_all_at_once = False #@param {type:"boolean"}

#@markdown Specify path to the input video (or a frame sequence), if needed:
video = '' #@param {type:"string"}

#@markdown Specify video length in frames (leave 0 to process complete input video):
frames = 0 #@param {type:"integer"}

#@markdown Other parameters:
sampler = 'euler' #@param ['ddim','ddpm','euler','uni']

cmds = {'AnimateDiff':'anima', 'CogX':'cogx', 'Zeroscope':'vid'}
command = 'src/%s.py ' % cmds[animodel]
modpath = ' -ad anima ' if animodel == 'AnimateDiff' else ''

if animodel == 'AnimateDiff':
  dirname = ''.join(e for e in ''.join(prompt.split(' ')[:3]) if (e.isalnum()))
  print('saving as subdir', dirname)
  command += '-o _out/%s ' % dirname

command += '-sm %s ' % sampler
if frames > 0:
  command += '-vf %d ' % frames
command += ' -md /content/models ' + modpath
# %cd $work_dir

if len(prompt) > 0:
  command += ' -t "%s" ' % prompt
if len(imgref) > 0 and imgref_weight != 0:
  if not os.path.exists(imgref):
    imgref = os.path.join(gdir, imgref)
  if os.path.exists(imgref):
    if animodel == 'AnimateDiff':
      if not os.path.exists('/content/models/image'):
        print('downloading IP adapter')
        get_model("https://www.dropbox.com/scl/fi/zcvg092n1n4aqpmoufd70/sdfu-imageref-fp16.zip?rlkey=ompjbfu2um90hyid2rlglv4d8&dl=1", maindir)
      print('using IP adapter with', imgref)
      command += ' -imr %s -imw %f ' % (imgref, imgref_weight)
      if use_all_at_once:
        command += ' --allref '
    elif animodel == 'CogX':
      command += ' -im %s ' % (imgref)
if len(video) > 0:
  if not os.path.exists(video):
    video = os.path.join(gdir, video)
  if os.path.exists(video):
    command += ' -iv %s ' % video

if len(prompt) > 0 or (len(video) > 0 and os.path.exists(video)):
  %run $command
else:
  print('Not enough inputs yet to run the generation')

print('done')

## Fine-tune

In [None]:
#@title Load base model

model = "1.5-dreamlike" #@param ['1.5','1.5-dreamlike','2.1']

# Download table, processed top to bottom. Each row is:
# (needed for the selected model?, path whose presence means "already downloaded", archive urls, target directory).
# The presence check is done right before each download.
finetune_downloads = [
  ('1.5' in model, '/content/models/v1',
   ["https://www.dropbox.com/s/9wuhum8w0iqs4o7/sdfu-v15-full-fp16.zip?dl=1",
    "https://www.dropbox.com/s/z9uycihl6tybx9y/sdfu-v1-vaes-fp16.zip?dl=1"],
   '/content/models/v1'),
  (model == '1.5-dreamlike', '/content/models/v1/unet15drm',
   ["https://www.dropbox.com/s/7hwh27xfzcy7a2g/sdfu-v15drm-unetx-fp16.zip?dl=1"],
   '/content/models/v1'),
  (model in ['2.1', '2-inpainting'], '/content/models/v2',
   ["https://www.dropbox.com/s/38876tjuklvwq82/sdfu-v21-full-fp16.zip?dl=1"],
   '/content/models/v2'),
  # NOTE(review): '2-inpainting' is not among the form options of this cell
  (model == '2-inpainting', '/content/models/v2/unet2i',
   ["https://www.dropbox.com/s/r5fa1mdxpw9e8k2/sdfu-v2i-unet-fp16.zip?dl=1"],
   '/content/models/v2'),
]
for ft_needed, ft_marker, ft_urls, ft_target in finetune_downloads:
  if ft_needed and not os.path.exists(ft_marker):
    for ft_url in ft_urls:
      get_model(ft_url, ft_target)

# Form label -> short model name passed to the training script via `--model`.
modict = {
  '1.5': '15',
  '1.5-dreamlike': '15drm',
  '2.1': '21',
}
modname = modict[model]


In [None]:
#@title Data setup
#@markdown Put your target images as zip-archive onto Google drive and copy its path below (relative to G-drive root).
target_data = 'tgt.zip' #@param {type:"string"}

#@markdown For Text Inversion or Custom Diffusion, provide a token to use in the prompts to summon your target imagery, e.g. as `<mycat1>`
new_token = 'mycat1' #@param {type:"string"}

#@markdown For Custom diffusion, prepare also a bunch of generic reference images of similar class, to start from.
ref_class = 'cat' #@param {type:"string"}
ref_data = 'ref.zip' #@param {type:"string"}

data_dir = os.path.join('/content/data/', new_token)
!rm -rf $data_dir
os.makedirs(data_dir)
%cd $data_dir

tgt_path = os.path.join(gdir, target_data)
!unzip -j -o -q $tgt_path -d tgt
tgt_dir = os.path.join(data_dir, 'tgt')

ref_path = os.path.join(gdir, ref_data)
if len(ref_data) > 0 and os.path.isfile(ref_path):
  !unzip -j -o -q $ref_path -d ref
  ref_dir = os.path.join(data_dir, 'ref')
  with_prior = True
else:
  with_prior = False

%cd $work_dir


Use one of the following methods:
* [Textual inversion](https://textual-inversion.github.io) = adds new token to the text encoder. Generic but stable. Trained embeddings can be combined together on load.
* [LoRA](https://github.com/cloneofsimo/lora) = partially finetunes low-rank add-ons, injected to Unet attention layers. Universal method, **industry standard**, precise, may interfere with wide spectrum of topics.
* [Custom diffusion](https://github.com/adobe-research/custom-diffusion) = similar to LoRA (in a way). Can achieve impressive reproduction quality (including faces) with simple prompts, but may lose the point with too complex ones. To train it, you'll need to specify above both **target** reference images and **generic reference** ones (more random, of similar subjects). Apparently, you can generate the latter with SD itself.  

Mark `style` if you're training for a style, rather than an object.  
Mark `low_mem` if you get OOM.  

In [None]:
#@title Run training
method = "text" #@param ['text','lora','custom']
style = False #@param  {type:"boolean"}
low_mem = False #@param  {type:"boolean"}
train_steps = 3000 #@param  {type:"integer"}
save_steps = 500 #@param  {type:"integer"}
batch = 1 #@param  {type:"integer"}

command = ' --data %s ' % tgt_dir
if method in ['text','custom']:
  command += ' -t %s --term %s ' % (new_token, ref_class)
if method in ['custom']:
  command += ' --term_data %s ' % ref_dir

command += ' -ts %d --save_step %d -b %d ' %(train_steps, save_steps, batch)
command += ' -val -md %s ' % maindir
command += ' --model %s ' % modname

if low_mem:
  command += ' --low_mem '
if style:
  command += ' --style '

if method == 'text':
  %run src/train.py $command -lr 0.001 --type text
elif method == 'lora':
  %run src/train.py $command -lr 0.0002 --type lora
elif method == 'custom':
  %run src/train.py $command --type custom
