# 570 Final Project - Parth R. Doshi

In [25]:
# Mount Google Drive into the Colab runtime at /content/drive.
# In the visible notebook only the commented-out training cell refers to a
# path under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


INSTALLING DEPENDENCIES

In [None]:
!pip install diffusers accelerate safetensors transformers
import PIL
import requests
import torch
import numpy as np
import tensorflow_datasets as tfds
from PIL import Image
from IPython import display
from diffusers import StableDiffusionInstructPix2PixPipeline, EulerAncestralDiscreteScheduler

PRELIMINARY EXPERIMENTS

In [None]:
  # LOADING THE EXAMPLE IMAGE AND THE MODEL
  # Example photo hosted in the instruct-pix2pix GitHub repository; it is
  # downloaded below and used as the input image of the preliminary experiments.
  url = "https://raw.githubusercontent.com/timothybrooks/instruct-pix2pix/main/imgs/example.jpg"
  def download_image(url, timeout=30):
      """Download an image over HTTP and return it as an upright RGB PIL image.

      Args:
          url: Address of the image file to fetch.
          timeout: Seconds to wait for the server, so that a stalled connection
              cannot hang the notebook indefinitely.

      Returns:
          A PIL image in RGB mode with any EXIF orientation tag applied.

      Raises:
          requests.HTTPError: If the server answers with a 4xx/5xx status.
          requests.Timeout: If the server does not answer within ``timeout``.
      """
      # Imported locally on purpose: `import PIL` alone does not load the
      # ImageOps submodule, so `PIL.ImageOps` only resolved when some other
      # library happened to import it first.
      import io
      from PIL import Image, ImageOps

      response = requests.get(url, timeout=timeout)
      # Fail here on an error status instead of letting PIL choke on an HTML error page.
      response.raise_for_status()
      # response.content is the fully downloaded, content-decoded body (the raw
      # stream is not decoded when the server compresses the response).
      image = Image.open(io.BytesIO(response.content))
      image = ImageOps.exif_transpose(image)
      return image.convert("RGB")
  # Checkpoint identifier and a default edit prompt (the experiment cells
  # below assign their own `instruction` before calling the pipeline).
  model_id = "timbrooks/instruct-pix2pix"
  instruction = "make him into a cyborg"

  # Input photo shared by the preliminary experiments.
  image = download_image(url)

  # Half-precision weights, safety checker disabled.
  pipe = StableDiffusionInstructPix2PixPipeline.from_pretrained(
      model_id,
      torch_dtype=torch.float16,
      safety_checker=None,
  )
  pipe.to("cuda")
  # Swap in the Euler-ancestral scheduler, built from the current scheduler's config.
  pipe.scheduler = EulerAncestralDiscreteScheduler.from_config(pipe.scheduler.config)

In [None]:
 # EXPERIMENT 1
# Edit the example image with a new prompt, using 10 denoising steps.
instruction = "make him into a blue clown"
images = pipe(
    instruction,
    image=image,
    num_inference_steps=10,
    image_guidance_scale=1.0,
).images
print("\nInstructPix2Pix Output:\n")
# Last expression of the cell: the notebook renders the first generated image.
images[0]

VARYING NUMBER OF INFERENCE STEPS PARAMETER

In [None]:
  # EXPERIMENT 2 - PART 1
  # Baseline of the step-count comparison: 10 denoising steps, using whatever
  # `instruction` currently holds (this cell does not set it).
  images = pipe(
      instruction,
      image=image,
      num_inference_steps=10,
      image_guidance_scale=1.0,
  ).images
  print("\nInstructPix2Pix Output:\n")
  images[0]

In [None]:
  # EXPERIMENT 2 - PART 2
  # Same prompt and image as part 1, but with 100 denoising steps.
  images = pipe(instruction, image=image, num_inference_steps=100, image_guidance_scale=1.0).images
  # The label must match the value passed above (it previously said 10).
  print("\nIncreased number of inference steps (num_inference_steps = 100):\n")
  images[0]

In [None]:
  # EXPERIMENT 2 - PART 3
  # Same prompt and image again, now with 1000 denoising steps.
  images = pipe(
      instruction,
      image=image,
      num_inference_steps=1000,
      image_guidance_scale=1.0,
  ).images
  print("\nIncreased number of inference steps (num_inference_steps = 1000):\n")
  images[0]

VARYING IMAGE GUIDANCE SCALE PARAMETER

In [None]:
  # EXPERIMENT 3 - PART 1
  # Baseline of the image-guidance comparison: image_guidance_scale = 1.0.
  instruction = "make him into a doctor"
  images = pipe(
      instruction,
      image=image,
      num_inference_steps=10,
      image_guidance_scale=1.0,
  ).images
  print("\nInstructPix2Pix Output:\n")
  images[0]

In [None]:
  # EXPERIMENT 3 - PART 2
  # Same prompt and step count, with image_guidance_scale raised to 10.0.
  instruction = "make him into a doctor"
  images = pipe(
      instruction,
      image=image,
      num_inference_steps=10,
      image_guidance_scale=10.0,
  ).images
  print("\nInstructPix2Pix Output:\n")
  images[0]

In [None]:
  # EXPERIMENT 3 - PART 3
  # Same prompt and step count, with image_guidance_scale lowered to 0.1.
  # NOTE(review): the diffusers documentation says this pipeline requires
  # image_guidance_scale >= 1; with 0.1 classifier-free guidance is presumably
  # skipped altogether rather than weakened - confirm before interpreting the result.
  instruction = "make him into a doctor"
  images = pipe(
      instruction,
      image=image,
      num_inference_steps=10,
      image_guidance_scale=0.1,
  ).images
  print("\nInstructPix2Pix Output:\n")
  images[0]

EXTENDED EXPERIMENTS

In [21]:
# LOADING THE DATASETS
# Names of the robotics datasets known to this notebook. dataset2path() below
# turns such a name into its gs://gresearch/robotics/<name>/<version> location.
# NOTE(review): the dropdown in the dataset-selection cell additionally offers
# 'furniture_bench_dataset_converted_externally_to_rlds' and
# 'cmu_food_manipulation', which are missing here, and nothing in the visible
# notebook reads DATASETS - confirm whether the two lists are meant to match.
DATASETS = [
    "fractal20220817_data",
    "kuka",
    "bridge",
    "taco_play",
    "jaco_play",
    "berkeley_cable_routing",
    "roboturk",
    "nyu_door_opening_surprising_effectiveness",
    "viola",
    "berkeley_autolab_ur5",
    "toto",
    "language_table",
    "columbia_cairlab_pusht_real",
    "stanford_kuka_multimodal_dataset_converted_externally_to_rlds",
    "nyu_rot_dataset_converted_externally_to_rlds",
    "stanford_hydra_dataset_converted_externally_to_rlds",
    "austin_buds_dataset_converted_externally_to_rlds",
    "nyu_franka_play_dataset_converted_externally_to_rlds",
    "maniskill_dataset_converted_externally_to_rlds",
    "cmu_franka_exploration_dataset_converted_externally_to_rlds",
    "ucsd_kitchen_dataset_converted_externally_to_rlds",
    "ucsd_pick_and_place_dataset_converted_externally_to_rlds",
    "austin_sailor_dataset_converted_externally_to_rlds",
    "austin_sirius_dataset_converted_externally_to_rlds",
    "bc_z",
    "usc_cloth_sim_converted_externally_to_rlds",
    "utokyo_pr2_opening_fridge_converted_externally_to_rlds",
    "utokyo_pr2_tabletop_manipulation_converted_externally_to_rlds",
    "utokyo_saytap_converted_externally_to_rlds",
    "utokyo_xarm_pick_and_place_converted_externally_to_rlds",
    "utokyo_xarm_bimanual_converted_externally_to_rlds",
    "robo_net",
    "berkeley_mvp_converted_externally_to_rlds",
    "berkeley_rpt_converted_externally_to_rlds",
    "kaist_nonprehensile_converted_externally_to_rlds",
    "stanford_mask_vit_converted_externally_to_rlds",
    "tokyo_u_lsmo_converted_externally_to_rlds",
    "dlr_sara_pour_converted_externally_to_rlds",
    "dlr_sara_grid_clamp_converted_externally_to_rlds",
    "dlr_edan_shared_control_converted_externally_to_rlds",
    "asu_table_top_converted_externally_to_rlds",
    "stanford_robocook_converted_externally_to_rlds",
    "eth_agent_affordances",
    "imperialcollege_sawyer_wrist_cam",
    "iamlab_cmu_pickup_insert_converted_externally_to_rlds",
    "uiuc_d3field",
    "utaustin_mutex",
    "berkeley_fanuc_manipulation",
    "cmu_play_fusion",
    "cmu_stretch",
    "berkeley_gnm_recon",
    "berkeley_gnm_cory_hall",
    "berkeley_gnm_sac_son",
]


def dataset2path(name):
    """Return the Google Cloud Storage directory for the dataset called ``name``.

    "robo_net" is published as version 1.0.0 and "language_table" as 0.0.1;
    every other name falls back to version 0.1.0.
    """
    # (dataset name, version) pairs that deviate from the default version.
    pinned_versions = (("robo_net", "1.0.0"), ("language_table", "0.0.1"))
    version = next((ver for key, ver in pinned_versions if name == key), "0.1.0")
    return "gs://gresearch/robotics/{}/{}".format(name, version)


def as_gif(images, path="temp.gif"):
    """Write a sequence of PIL images to an animated GIF and return the file's bytes.

    Args:
        images: Non-empty sequence of images. The first image's ``save`` method
            writes the file; the remaining images are appended as further frames.
        path: Destination file; it is overwritten if it already exists.

    Returns:
        The raw bytes of the GIF file that was written.

    Raises:
        ValueError: If ``images`` is empty (previously an opaque IndexError).
    """
    if len(images) == 0:
        raise ValueError("as_gif needs at least one image")
    # Per Pillow's GIF writer: duration is the per-frame display time in
    # milliseconds and loop=0 repeats the animation forever.
    images[0].save(path, save_all=True, append_images=images[1:], duration=1000, loop=0)
    # The context manager closes the handle right away; the bare
    # open(...).read() left that to the garbage collector.
    with open(path, "rb") as gif_file:
        return gif_file.read()


def mse_loss(first_image, last_image, ip2p_image):
    """Mean squared error between the expected final frame and the generated image.

    Args:
        first_image: Initial frame; it only takes part in the shape check.
        last_image: Ground-truth final frame (array-like of pixel values).
        ip2p_image: Image produced by InstructPix2Pix (array-like of pixel values).

    Returns:
        The mean of the squared per-element differences between ``last_image``
        and ``ip2p_image``, as a NumPy float.

    Raises:
        AssertionError: If the three inputs do not all have the same shape.
    """
    first = np.asarray(first_image)
    # Promote to float64 before subtracting: on uint8 image arrays both the
    # difference and its square wrap around modulo 256, which silently yields
    # a wrong (and far too small) error.
    expected = np.asarray(last_image, dtype=np.float64)
    generated = np.asarray(ip2p_image, dtype=np.float64)

    # Ensure the images have the same shape
    assert first.shape == expected.shape == generated.shape, "Input images must have the same shape"

    # Calculate MSE loss
    return np.mean((expected - generated) ** 2)

In [22]:
# choose the dataset path in the dropdown on the right and rerun this cell
# to see multiple samples

# The @param option list lives inside a trailing comment, so it has to stay on
# the same line as the assignment: a wrapped continuation line is parsed as
# code and makes the cell a SyntaxError.
dataset = "fractal20220817_data"  # @param ['fractal20220817_data', 'kuka', 'bridge', 'taco_play', 'jaco_play', 'berkeley_cable_routing', 'roboturk', 'nyu_door_opening_surprising_effectiveness', 'viola', 'berkeley_autolab_ur5', 'toto', 'language_table', 'columbia_cairlab_pusht_real', 'stanford_kuka_multimodal_dataset_converted_externally_to_rlds', 'nyu_rot_dataset_converted_externally_to_rlds', 'stanford_hydra_dataset_converted_externally_to_rlds', 'austin_buds_dataset_converted_externally_to_rlds', 'nyu_franka_play_dataset_converted_externally_to_rlds', 'maniskill_dataset_converted_externally_to_rlds', 'furniture_bench_dataset_converted_externally_to_rlds', 'cmu_franka_exploration_dataset_converted_externally_to_rlds', 'ucsd_kitchen_dataset_converted_externally_to_rlds', 'ucsd_pick_and_place_dataset_converted_externally_to_rlds', 'austin_sailor_dataset_converted_externally_to_rlds', 'austin_sirius_dataset_converted_externally_to_rlds', 'bc_z', 'usc_cloth_sim_converted_externally_to_rlds', 'utokyo_pr2_opening_fridge_converted_externally_to_rlds', 'utokyo_pr2_tabletop_manipulation_converted_externally_to_rlds', 'utokyo_saytap_converted_externally_to_rlds', 'utokyo_xarm_pick_and_place_converted_externally_to_rlds', 'utokyo_xarm_bimanual_converted_externally_to_rlds', 'robo_net', 'berkeley_mvp_converted_externally_to_rlds', 'berkeley_rpt_converted_externally_to_rlds', 'kaist_nonprehensile_converted_externally_to_rlds', 'stanford_mask_vit_converted_externally_to_rlds', 'tokyo_u_lsmo_converted_externally_to_rlds', 'dlr_sara_pour_converted_externally_to_rlds', 'dlr_sara_grid_clamp_converted_externally_to_rlds', 'dlr_edan_shared_control_converted_externally_to_rlds', 'asu_table_top_converted_externally_to_rlds', 'stanford_robocook_converted_externally_to_rlds', 'eth_agent_affordances', 'imperialcollege_sawyer_wrist_cam', 'iamlab_cmu_pickup_insert_converted_externally_to_rlds', 'uiuc_d3field', 'utaustin_mutex', 'berkeley_fanuc_manipulation', 'cmu_food_manipulation', 'cmu_play_fusion', 'cmu_stretch', 'berkeley_gnm_recon', 'berkeley_gnm_cory_hall', 'berkeley_gnm_sac_son']
# Observation key whose images are shown and fed to the model.
display_key = "image"

# Build the dataset from its bucket directory and fail early, listing the
# observation spec, when the chosen key is not part of it.
b = tfds.builder_from_directory(builder_dir=dataset2path(dataset))
if display_key not in b.info.features["steps"]["observation"]:
    raise ValueError(
        f"The key {display_key} was not found in this dataset.\n"
        + "Please choose a different image key to display for this dataset.\n"
        + "Here is the observation spec:\n"
        + str(b.info.features["steps"]["observation"]))

In [None]:
# Evaluate the pretrained InstructPix2Pix model on robot episodes: feed the
# first frame plus the episode's language instruction to the model and compare
# the generated image with the episode's last frame via MSE.
ds = b.as_dataset(split="train[:10]").shuffle(10)                # take only first 10 episodes
episodes = list(ds)
eps = [list(ep["steps"].as_numpy_iterator()) for ep in episodes] # list of videos/episodes
my_dataset_dict = {'Initial image': [], 'Prompt': [], 'Expected Output': [], 'Output Image': []}
total_loss = 0

# Load the pipeline once, before the loop: it does not depend on the episode,
# and re-creating it per iteration reloads the full checkpoint every time.
model_id = "timbrooks/instruct-pix2pix"
pipe = StableDiffusionInstructPix2PixPipeline.from_pretrained(model_id, torch_dtype=torch.float16, safety_checker=None)
pipe.to("cuda")
pipe.scheduler = EulerAncestralDiscreteScheduler.from_config(pipe.scheduler.config)

for i, episode_steps in enumerate(eps):
  print(f'i: {i}')
  first_obs = episode_steps[0]["observation"]                               # observation of the first frame
  first_image = first_obs[display_key]                                      # key validated in the dataset-selection cell
  instruction = first_obs['natural_language_instruction'].decode("utf-8")   # gives the instruction
  last_image = episode_steps[-1]["observation"][display_key]                # last frame of the episode

  # display first image, instruction and last image
  first_pil = Image.fromarray(first_image)
  display.display(first_pil)
  print('\n')
  print(f'Instruction : {instruction}')
  print('\n')
  display.display(Image.fromarray(last_image))

  # Pass a PIL image rather than the raw array: PIL input is the documented
  # form, and a uint8 NumPy array is presumably not rescaled from 0-255 to the
  # [0, 1] range the model expects - TODO confirm against the installed diffusers.
  images = pipe(instruction, image=first_pil, num_inference_steps=100, image_guidance_scale=1.0).images

  ip2p_pil = images[0]
  ip2p_image = np.array(ip2p_pil)
  display.display(ip2p_pil)

  my_dataset_dict['Initial image'].append(first_image)
  my_dataset_dict['Prompt'].append(instruction)
  my_dataset_dict['Expected Output'].append(last_image)
  my_dataset_dict['Output Image'].append(ip2p_image)

  # Calculate MSE loss
  loss = mse_loss(first_image, last_image, ip2p_image)

  print(f"\nMSE Loss: {loss}\n")
  total_loss += loss

print(f"\nAverage MSE Loss: {total_loss/len(eps)}\n")
# print(my_dataset_dict)

TRAINING FUNCTION IMPLEMENTATION

In [None]:
# %cd /content/drive/MyDrive/Colab/instruct-pix2pix
# !yq eval-all -i 'select(fileIndex == 0) * select(fileIndex == 1)' configs/train.yaml script.yml
# !python main.py --name default --base configs/train.yaml --train

# data:
#   target: main.DataModuleFromConfig
#   params:
#     batch_size: 32
#     num_workers: 2
#     train:
#       target: edit_dataset.EditDataset
#       params:
#         path: data/my_dict_dataset-dataset
#         split: train
#         min_resize_res: 256
#         max_resize_res: 256
#         crop_res: 256
#         flip_prob: 0.5
#     validation:
#       target: edit_dataset.EditDataset
#       params:
#         path: data/my_dict_dataset-dataset
#         split: val
#         min_resize_res: 256
#         max_resize_res: 256
#         crop_res: 256

CUSTOM INPUT SELECTION

In [None]:
  # import cv2
  # # Read your images (replace these paths with image paths)
  # first_image = cv2.imread('path_to_first_image.jpg', cv2.IMREAD_COLOR)
  # last_image = cv2.imread('path_to_last_image.jpg', cv2.IMREAD_COLOR)
  # expected_image = cv2.imread('path_to_expected_image.jpg', cv2.IMREAD_COLOR)

  # # Convert images to NumPy arrays
  # first_image = np.asarray(first_image)
  # last_image = np.asarray(last_image)
  # expected_image = np.asarray(expected_image)

  # model_id = "timbrooks/instruct-pix2pix"
  # pipe = StableDiffusionInstructPix2PixPipeline.from_pretrained(model_id, torch_dtype=torch.float16, safety_checker=None)
  # pipe.to("cuda")
  # pipe.scheduler = EulerAncestralDiscreteScheduler.from_config(pipe.scheduler.config)

  # images = pipe(instruction, image=first_image, num_inference_steps=100, image_guidance_scale=1.0).images
  # images[0]
  # ip2p_image = np.array(images[0])
  # display.display(Image.fromarray(ip2p_image))

RERUN AFTER RETRAINING IP2P DIFFUSION MODEL

In [None]:
# ds = b.as_dataset(split="train[:10]").shuffle(10)                # take only first 10 episodes
# episodes = [ep for ep in list(ds)]
# eps = [list(ep["steps"].as_numpy_iterator()) for ep in episodes] # list of videos/episodes
# my_dataset_dict = {'Initial image': [], 'Prompt': [], 'Expected Output': [], 'Output Image': []}
# total_loss = 0

# for i in range(0, len(eps)):
#   print(f'i: {i}')
#   first_ep = eps[i]
#   first_st = first_ep[0]                                                    # contains the first video, first frame
#   first_obs = first_st["observation"]                                       # contains the observation
#   first_image = first_obs['image']
#   instruction = first_obs['natural_language_instruction'].decode("utf-8")   # gives the instruction
#   last_st = first_ep[-1]                                                    # gives the last frame of the video
#   last_image = last_st["observation"]["image"]                              # gives the last image

#   # prompt: display first and last image
#   display.display(Image.fromarray(first_image))
#   print('\n')
#   print(f'Instruction : {instruction}')
#   print('\n')
#   display.display(Image.fromarray(last_image))

#   model_id = "timbrooks/instruct-pix2pix"
#   pipe = StableDiffusionInstructPix2PixPipeline.from_pretrained(model_id, torch_dtype=torch.float16, safety_checker=None)
#   pipe.to("cuda")
#   pipe.scheduler = EulerAncestralDiscreteScheduler.from_config(pipe.scheduler.config)

#   images = pipe(instruction, image=first_image, num_inference_steps=100, image_guidance_scale=1.0).images
#   images[0]

#   ip2p_image = np.array(images[0])
#   display.display(Image.fromarray(ip2p_image))

#   my_dataset_dict['Initial image'].append(first_image)
#   my_dataset_dict['Prompt'].append(instruction)
#   my_dataset_dict['Expected Output'].append(last_image)
#   my_dataset_dict['Output Image'].append(ip2p_image)


#   # Calculate MSE loss
#   loss = mse_loss(first_image, last_image, ip2p_image)

#   print(f"\nMSE Loss: {loss}\n")
#   total_loss += loss

# print(f"\nAverage MSE Loss: {total_loss/len(eps)}\n")
# # print(my_dataset_dict)