<a href="https://colab.research.google.com/github/freedomtan/clip_score_on_android/blob/main/verify_unet_fp16.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install -q diffusers
!pip install -q accelerate
!pip install -q keras_cv

In [2]:
from diffusers import StableDiffusionPipeline
import torch

import keras_cv
import keras

import tensorflow as tf
import math
import numpy as np

# --- PyTorch side: diffusers Stable Diffusion v1.5 pipeline loaded in float16 on the GPU ---
model_id = "runwayml/stable-diffusion-v1-5"
pipe = StableDiffusionPipeline.from_pretrained(model_id, torch_dtype=torch.float16)
# Alternative load without torch_dtype (library-default precision), kept for reference:
# pipe = StableDiffusionPipeline.from_pretrained(model_id)
pipe = pipe.to("cuda")

# Handle to the PyTorch text encoder (not used by the cells visible in this chunk).
torch_text_encoder = pipe.text_encoder

# --- Keras side: keras_cv Stable Diffusion under the mixed_float16 policy ---
# The policy is set before the model is constructed — presumably so its layers
# are created under it; confirm against the Keras mixed-precision docs.
keras.mixed_precision.set_global_policy("mixed_float16")
pipeline = keras_cv.models.StableDiffusion(
    img_width=512, img_height=512, jit_compile=True
)
# use v1.5 weights instead of v1.4 ones
# NOTE(review): this path is on Google Drive; no drive mount call is visible in
# this part of the notebook, so Drive must already be mounted for this to load.
pipeline.diffusion_model.load_weights("/content/drive/MyDrive/ml/stable_diffusion_v15_weights/diffusion_model_v15.h5")

Using TensorFlow backend


Loading pipeline components...:   0%|          | 0/7 [00:00<?, ?it/s]

By using this model checkpoint, you acknowledge that its usage is subject to the terms of the CreativeML Open RAIL-M license at https://raw.githubusercontent.com/CompVis/stable-diffusion/main/LICENSE


In [3]:
# Text prompts fed to both UNet implementations in the comparison loop.
# NOTE(review): despite the name, this list currently holds nine prompts, not
# ten — confirm whether one entry is missing. The strings are passed verbatim
# to the text encoder, so do not "fix" their wording or punctuation.
first_10_prompts = [
    "A city at night with people walking around.",
    "The large clock was prominently displaying the time.",
    "A person in winter gear riding a snowboard.",
    "A small bird is perched on an empty bird feeder.",
    "Baseballs players sliding to base and jumping during the game.",
    "A woman playing tennis in a white outfit",
    "A woman looking at her phone next to a car being towed.",
    "A man in a wheelchair and another sitting on a bench that is overlooking the water.",
    "Rows of motor bikes and helmets in a city"
]

def get_timestep_embedding(timestep, batch_size, dim=320, max_period=10000):
  """Build a sinusoidal embedding for one diffusion timestep.

  Args:
    timestep: scalar timestep value to embed.
    batch_size: number of times the embedding row is repeated along axis 0.
    dim: nominal embedding width; `dim // 2` frequencies are generated.
    max_period: base of the geometric frequency schedule.

  Returns:
    A float32 tensor of shape `(batch_size, 2 * (dim // 2))` whose rows are
    identical: the cosine terms followed by the sine terms.
  """
  num_freqs = dim // 2
  positions = tf.range(0, num_freqs, dtype=tf.float32)
  # Frequencies decay geometrically from 1 towards 1 / max_period.
  exponents = -math.log(max_period) * positions / num_freqs
  frequencies = tf.math.exp(exponents)

  timestep_tensor = tf.convert_to_tensor([timestep], dtype=tf.float32)
  angles = timestep_tensor * frequencies

  # Cosine half first, then sine half, flattened into a single row.
  cos_part = tf.math.cos(angles)
  sin_part = tf.math.sin(angles)
  row = tf.reshape(tf.concat([cos_part, sin_part], 0), [1, -1])
  return tf.repeat(row, batch_size, axis=0)

In [4]:
# Run the Keras and PyTorch UNets on identical inputs for every prompt and
# print the element-wise difference of their outputs.
count = 0

for p in first_10_prompts:
  # The Keras text encoder output is shared by both UNets as conditioning.
  text_encoded = pipeline.encode_text(p)

  # Deterministic noise latent (seeded by the prompt index). The Keras model
  # receives it as (1, 64, 64, 4); the PyTorch model receives the same values
  # transposed to (1, 4, 64, 64) and cast to float16.
  r_1 = tf.random.stateless_normal((1, 64, 64, 4), seed=[count, count])
  r_2 = tf.transpose(r_1, perm=[0, 3, 1, 2])
  r_2_fp16 = torch.from_numpy(r_2.numpy()).to("cuda").type(torch.float16)

  inputs = {
    "latent": r_1,
    "timestep_embedding": get_timestep_embedding(0, 1),
    "context": text_encoded
  }
  r_tf = pipeline.diffusion_model(inputs)

  # The PyTorch UNet was loaded with torch_dtype=float16, so cast the context
  # explicitly as well (a no-op when the encoder output is already float16).
  context_fp16 = torch.from_numpy(text_encoded).to("cuda").type(torch.float16)

  # Inference only: disable autograd so no graph is built or kept on the GPU.
  with torch.no_grad():
    r_torch = pipe.unet(sample=r_2_fp16, timestep=0, encoder_hidden_states=context_fp16)
  r_3 = r_torch.sample.cpu().numpy()
  # Transpose back to the Keras layout so the two outputs line up.
  r_torch_nhwc = np.transpose(r_3, (0, 2, 3, 1))

  print(r_torch_nhwc - r_tf.numpy())
  count += 1

[[[[-4.883e-04 -4.883e-04  0.000e+00  7.324e-04]
   [-4.883e-04  3.662e-04  9.155e-05  2.747e-04]
   [-6.104e-05  6.104e-04  3.052e-05  0.000e+00]
   ...
   [-3.052e-04  0.000e+00  1.221e-04  0.000e+00]
   [ 3.357e-04 -7.324e-04  0.000e+00 -4.883e-04]
   [-1.373e-04  0.000e+00  0.000e+00  0.000e+00]]

  [[ 9.155e-05 -4.883e-04 -1.221e-04  9.155e-05]
   [ 3.662e-04  2.441e-04  1.251e-03 -6.714e-04]
   [-7.324e-04 -1.221e-04  1.465e-03 -3.662e-04]
   ...
   [ 9.155e-05  1.678e-04  1.465e-03  4.883e-04]
   [-4.883e-04  0.000e+00  6.104e-04  0.000e+00]
   [-4.883e-04 -7.324e-04  0.000e+00  2.441e-04]]

  [[ 0.000e+00  1.221e-04  7.324e-04  2.441e-04]
   [ 0.000e+00 -6.104e-04  1.297e-03  0.000e+00]
   [-2.441e-04  2.441e-04 -4.883e-04 -2.441e-04]
   ...
   [-6.104e-04 -2.441e-04  6.714e-04  4.883e-04]
   [-4.883e-04  1.099e-03  2.441e-04  3.357e-04]
   [ 4.883e-04 -1.221e-04  2.441e-04 -4.883e-04]]

  ...

  [[-7.324e-04 -1.221e-04 -1.221e-03  0.000e+00]
   [-2.441e-04  3.662e-04 -2.747e-0