Copyright 2023 Google LLC

Use of this source code is governed by an MIT-style
license that can be found in the LICENSE file or at
https://opensource.org/licenses/MIT.

1. Download CoCo-Stuff [annotations](http://calvin.inf.ed.ac.uk/wp-content/uploads/data/cocostuffdataset/stuffthingmaps_trainval2017.zip) and [val images](http://images.cocodataset.org/zips/val2017.zip).
*  Please download the [annotations](http://calvin.inf.ed.ac.uk/wp-content/uploads/data/cocostuffdataset/stuffthingmaps_trainval2017.zip) first, unzip them, and rename the extracted `val2017` folder to `annotation_val2017` so that it does not clash with the `val2017` folder from the val images archive.


2.  Download [Cityscapes](https://www.cityscapes-dataset.com/).
* Cityscapes download requires login.
* Please download `leftImg8bit_trainvaltest.zip` and `gtFine_trainvaltest.zip` to your data folder.

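3. Please run the cells in order, and run only one of 2a or 2b (both cells assign the same variables, e.g. `ROOT`, `fine_to_coarse_map` and `val_dataset`, so the one run last wins).
* 2a: load CoCo-Stuff data.
* 2b: load Cityscapes data.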

4. Metrics
* The inference code prints the running pixel accuracy (ACC) and mean IoU (mIoU) every 10 batches, and the final values once all batches are processed.

# Imports

In [1]:
import tensorflow as tf
import numpy as np
from tqdm import tqdm
from diffseg.segmentor import DiffSeg
from keras_cv.src.models.stable_diffusion.image_encoder import ImageEncoder
from third_party.keras_cv.stable_diffusion import StableDiffusion 
from data.cityscapes import cityscapes_data
from data.coco import coco_data
from diffseg.utils import hungarian_matching

# Sanity check: show the NVIDIA GPUs on this machine and their current memory usage.
!nvidia-smi

2023-12-26 17:29:16.217399: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2023-12-26 17:29:16.217429: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2023-12-26 17:29:16.217479: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2023-12-26 17:29:16.225945: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Using TensorFlow backend


  from .autonotebook import tqdm as notebook_tqdm
2023-12-26 17:29:21.635707: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1886] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 9613 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 2080 Ti, pci bus id: 0000:17:00.0, compute capability: 7.5
2023-12-26 17:29:21.636562: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1886] Created device /job:localhost/replica:0/task:0/device:GPU:1 with 9621 MB memory:  -> device: 1, name: NVIDIA GeForce RTX 2080 Ti, pci bus id: 0000:65:00.0, compute capability: 7.5
2023-12-26 17:29:21.637258: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1886] Created device /job:localhost/replica:0/task:0/device:GPU:2 with 9624 MB memory:  -> device: 2, name: NVIDIA GeForce RTX 2080 Ti, pci bus id: 0000:b3:00.0, compute capability: 7.5


Tue Dec 26 17:29:22 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.182.03   Driver Version: 470.182.03   CUDA Version: 11.4     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA GeForce ...  Off  | 00000000:17:00.0 Off |                  N/A |
| 31%   51C    P2    66W / 250W |   9970MiB / 11019MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  NVIDIA GeForce ...  Off  | 00000000:65:00.0 Off |                  N/A |
| 30%   44C    P2    65W / 250W |    345MiB / 11016MiB |      1%      Default |
|       

# 1. Initialize SD Model

In [2]:
# Create the VAE image encoder and the Stable Diffusion model inside a
# MirroredStrategy scope, i.e. on all GPUs picked up by the strategy.
strategy = tf.distribute.MirroredStrategy()
print('Number of devices: {}'.format(strategy.num_replicas_in_sync))

with strategy.scope():
  image_encoder = ImageEncoder()
  # Functional model going from the encoder's input to the output of its
  # last layer; used later to turn images into latents.
  vae = tf.keras.Model(
      inputs=image_encoder.input,
      outputs=image_encoder.layers[-1].output,
  )
  model = StableDiffusion(img_height=512, img_width=512)

INFO:tensorflow:Using MirroredStrategy with devices ('/job:localhost/replica:0/task:0/device:GPU:0', '/job:localhost/replica:0/task:0/device:GPU:1', '/job:localhost/replica:0/task:0/device:GPU:2')
Number of devices: 3
By using this model checkpoint, you acknowledge that its usage is subject to the terms of the CreativeML Open RAIL-M license at https://raw.githubusercontent.com/CompVis/stable-diffusion/main/LICENSE


# 2a. Load COCO-Stuff Data

In [3]:
# --- Configuration -----------------------------------------------------------
BATCH_SIZE = strategy.num_replicas_in_sync  # Batch size equals the number of replicas.
ROOT = "../coco_data/" # Change this directory to your coco data folder.
FINE_TO_COARSE_PATH = "./data/coco/fine_to_coarse_dict.pickle"

# --- Label mapping -----------------------------------------------------------
# Fine-to-coarse label mapping, loaded from the pickle file above.
fine_to_coarse_map = coco_data.get_fine_to_coarse(FINE_TO_COARSE_PATH)

# --- Validation dataset ------------------------------------------------------
# Read the list of evaluation files, resolve image/label paths under ROOT,
# and build the batched coco-stuff validation dataset.
file_list = coco_data.load_imdb("./data/coco/Coco164kFull_Stuff_Coarse_7.txt")
image_list, label_list = coco_data.create_path(ROOT, file_list)
val_dataset = coco_data.prepare_dataset(image_list, label_list, batch_size=BATCH_SIZE)

# 2b. Load Cityscapes Data

In [4]:
# --- Configuration -----------------------------------------------------------
BATCH_SIZE = strategy.num_replicas_in_sync  # Batch size equals the number of replicas.
ROOT = "../cityscapes_data/"  # Change this directory to your cityscapes data folder.

# --- Label mapping -----------------------------------------------------------
# Fine-to-coarse label mapping for Cityscapes.
fine_to_coarse_map = cityscapes_data.get_fine_to_coarse()

# --- Validation dataset ------------------------------------------------------
# Resolve image/label paths under ROOT and build the batched cityscapes
# validation dataset.
image_list, label_list = cityscapes_data.create_path(ROOT)
val_dataset = cityscapes_data.prepare_dataset(image_list, label_list, batch_size=BATCH_SIZE)

['../cityscapes_data/leftImg8bit/val/frankfurt/frankfurt_000001_013016_leftImg8bit.png', '../cityscapes_data/leftImg8bit/val/frankfurt/frankfurt_000001_023369_leftImg8bit.png', '../cityscapes_data/leftImg8bit/val/frankfurt/frankfurt_000001_004327_leftImg8bit.png', '../cityscapes_data/leftImg8bit/val/frankfurt/frankfurt_000001_035864_leftImg8bit.png', '../cityscapes_data/leftImg8bit/val/frankfurt/frankfurt_000001_082087_leftImg8bit.png', '../cityscapes_data/leftImg8bit/val/frankfurt/frankfurt_000001_067092_leftImg8bit.png', '../cityscapes_data/leftImg8bit/val/frankfurt/frankfurt_000001_015091_leftImg8bit.png', '../cityscapes_data/leftImg8bit/val/frankfurt/frankfurt_000001_025921_leftImg8bit.png', '../cityscapes_data/leftImg8bit/val/frankfurt/frankfurt_000000_022254_leftImg8bit.png', '../cityscapes_data/leftImg8bit/val/frankfurt/frankfurt_000001_055603_leftImg8bit.png'] ['../cityscapes_data/gtFine/val/frankfurt/frankfurt_000001_013016_gtFine_labelIds.png', '../cityscapes_data/gtFine/val/

# 3. Run Inference

In [5]:
N_CLASS = 27  # Number of classes passed to hungarian_matching.

# Running totals accumulated over all batches. TP/FP/FN hold one entry per
# class; ALL is a scalar (presumably the number of evaluated pixels -- confirm
# against diffseg.utils.hungarian_matching).
TP = np.zeros(N_CLASS)
FP = np.zeros(N_CLASS)
FN = np.zeros(N_CLASS)
ALL = 0

# Initialize DiffSeg
KL_THRESHOLD = [1.1]*3 # This controls the merge threshold for masks (1.1 for CoCo-Stuff and 0.9 for Cityscapes)
NUM_POINTS = 16
REFINEMENT = False # Whether use K-Means refinement. Increase inference time from 2s to 3s.


def compute_metrics(tp, fp, fn, total):
  """Compute pixel accuracy and mean IoU from accumulated counts.

  Args:
    tp: Per-class true-positive counts (1-D numpy array).
    fp: Per-class false-positive counts (same shape as `tp`).
    fn: Per-class false-negative counts (same shape as `tp`).
    total: Scalar denominator for the pixel accuracy.

  Returns:
    A tuple `(acc, miou, iou)`: `acc` is `tp.sum() / total`, `iou` is the
    per-class `tp / (tp + fp + fn)`, and `miou` is the mean of `iou` over
    the classes whose IoU is defined.
  """
  # A class with tp + fp + fn == 0 gives 0/0 = NaN. That is intended (such
  # classes are skipped by nanmean below), so silence the RuntimeWarning
  # numpy would otherwise emit for the division.
  with np.errstate(divide="ignore", invalid="ignore"):
    acc = tp.sum() / total
    iou = tp / (tp + fp + fn)
  return acc, np.nanmean(iou), iou


with strategy.scope():
  segmentor = DiffSeg(KL_THRESHOLD, REFINEMENT, NUM_POINTS)

  for i,batch in enumerate(tqdm(val_dataset)):
    images = batch["images"]
    # Map channel 0 of the label tensor from fine to coarse classes.
    labels = fine_to_coarse_map(batch["labels"][:,:,:,0])
    latent = vae(images, training=False)

    # Extract attention maps from a single iteration of diffusion.
    images, weight_64, weight_32, weight_16, weight_8, _, _, _, _ = model.text_to_image(
      None,
      batch_size=images.shape[0],
      latent=latent,
      timestep=300
    )

    # Segment using DiffSeg.
    pred = segmentor.segment(weight_64, weight_32, weight_16, weight_8) # b x 512 x 512
    
    # Run hungarian matching for evaluation.
    # The fourth result is named `total` (not `all`) to avoid shadowing the builtin.
    tp, fp, fn, total = hungarian_matching(pred, labels, N_CLASS)
    TP += tp
    FP += fp
    FN += fn
    ALL += total

    # Print accuracy and mean IoU occasionally.
    if (i+1) % 10 == 0:
      acc, miou, iou = compute_metrics(TP, FP, FN, ALL)
      print("pixel accuracy:{}, mIoU:{}".format(acc, miou))

# Print final accuracy and mean IoU.
acc, miou, iou = compute_metrics(TP, FP, FN, ALL)
print("final pixel accuracy:{}, mIoU:{}".format(acc, miou))

  0%|          | 0/167 [00:00<?, ?it/s]2023-12-26 17:29:28.217775: I tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:442] Loaded cuDNN version 8700
2023-12-26 17:29:28.901734: I tensorflow/tsl/platform/default/subprocess.cc:304] Start cannot spawn child process: No such file or directory
2023-12-26 17:29:29.032288: I tensorflow/tsl/platform/default/subprocess.cc:304] Start cannot spawn child process: No such file or directory
2023-12-26 17:30:53.792349: W tensorflow/tsl/framework/bfc_allocator.cc:296] Allocator (GPU_0_bfc) ran out of memory trying to allocate 6.48GiB with freed_by_count=0. The caller indicates that this is not a failure, but this may mean that there could be performance gains if more memory were available.
2023-12-26 17:30:53.792409: W tensorflow/tsl/framework/bfc_allocator.cc:296] Allocator (GPU_0_bfc) ran out of memory trying to allocate 6.48GiB with freed_by_count=0. The caller indicates that this is not a failure, but this may mean that there could be perf

pixel accuracy:0.7343958548286543, mIoU:0.20758489679870973


 12%|█▏        | 20/167 [05:38<23:11,  9.47s/it]

pixel accuracy:0.7302876985311674, mIoU:0.1952900326722755


 18%|█▊        | 30/167 [07:11<21:16,  9.32s/it]

pixel accuracy:0.7343824517346734, mIoU:0.2367399972185437


 24%|██▍       | 40/167 [08:45<19:46,  9.34s/it]

pixel accuracy:0.7293901559144972, mIoU:0.23010125699642567


 30%|██▉       | 50/167 [10:19<18:22,  9.42s/it]

pixel accuracy:0.7318654152087017, mIoU:0.22963153168292508


 36%|███▌      | 60/167 [11:54<17:11,  9.64s/it]

pixel accuracy:0.7339949418299054, mIoU:0.2207377520793885


 42%|████▏     | 70/167 [13:28<15:10,  9.38s/it]

pixel accuracy:0.7304356746390714, mIoU:0.224216894607605


 48%|████▊     | 80/167 [15:01<13:35,  9.37s/it]

pixel accuracy:0.72997781657941, mIoU:0.22237104200898095


 54%|█████▍    | 90/167 [16:35<11:59,  9.35s/it]

pixel accuracy:0.7330500614847218, mIoU:0.22352254866015028


 60%|█████▉    | 100/167 [18:08<10:27,  9.37s/it]

pixel accuracy:0.7380844732028365, mIoU:0.21754904926213314


 66%|██████▌   | 110/167 [19:43<09:05,  9.58s/it]

pixel accuracy:0.7434422752594196, mIoU:0.2183564561569375


 72%|███████▏  | 120/167 [21:14<06:22,  8.14s/it]

pixel accuracy:0.747896129197371, mIoU:0.2174967583571593


 78%|███████▊  | 130/167 [22:34<04:45,  7.71s/it]

pixel accuracy:0.7520767913075149, mIoU:0.21632577169990158


 84%|████████▍ | 140/167 [23:53<03:08,  6.97s/it]

pixel accuracy:0.755494759256177, mIoU:0.21561852963419265


 90%|████████▉ | 150/167 [25:17<02:38,  9.31s/it]

pixel accuracy:0.7566498225521037, mIoU:0.2165497587867709


 96%|█████████▌| 160/167 [26:58<01:10, 10.03s/it]

pixel accuracy:0.7574370979397163, mIoU:0.21694253008024764


100%|██████████| 167/167 [29:02<00:00, 10.44s/it]

final pixel accuracy:0.7599767668441958, mIoU:0.22153910138396324



  iou = TP / (TP + FP + FN)
