In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import sys

import argparse
import numpy as np
from PIL import Image

from models.image_text_transformation import ImageTextTransformation
from utils.util import read_image_width_height, resize_long_edge

In [3]:

# Configuration for the Image2Paragraph models, expressed as an argparse
# parser so the notebook shares its option names and defaults with a
# command-line entry point. The resulting `args` namespace is handed to
# ImageTextTransformation in a later cell.
parser = argparse.ArgumentParser()
# parser.add_argument('--image_src', default='examples/1.jpg')
# parser.add_argument('--out_image_name', default='output/1_result.jpg')
parser.add_argument(
    '--gpt_version',
    choices=['gpt-3.5-turbo', 'gpt4'],
    default='gpt-3.5-turbo',
)

# Pipeline stage toggles. They default to True, so a lone `store_true` flag
# could never switch a stage off. Each stage therefore also gets a
# `--no_<stage>` switch that stores False into the same destination. The
# original flags, their defaults and their help text are unchanged.
_stage_help = {
    'image_caption': 'Set this flag to True if you want to use BLIP2 Image Caption',
    'dense_caption': 'Set this flag to True if you want to use Dense Caption',
    'semantic_segment': 'Set this flag to True if you want to use semantic segmentation',
}
for _stage, _help in _stage_help.items():
    parser.add_argument(
        f'--{_stage}',
        action='store_true',
        dest=_stage,
        default=True,
        help=_help,
    )
    parser.add_argument(
        f'--no_{_stage}',
        action='store_false',
        dest=_stage,
        default=True,
        help=f'Disable the {_stage} stage (enabled by default)',
    )

# Model selection.
parser.add_argument(
    '--sam_arch',
    choices=['vit_b', 'vit_l', 'vit_h'],
    dest='sam_arch',
    default='vit_b',
    help='vit_b is the default model (fast but not accurate), vit_l and vit_h are larger models',
)
parser.add_argument(
    '--captioner_base_model',
    choices=['blip', 'blip2'],
    dest='captioner_base_model',
    default='blip',
    help='blip2 requires 15G GPU memory, blip requires 6G GPU memory',
)
parser.add_argument(
    '--region_classify_model',
    choices=['ssa', 'edit_anything'],
    dest='region_classify_model',
    default='ssa',
    help='Select the region classification model: edit anything is ten times faster than ssa, but less accurate.',
)

# Device placement, one option per model.
backends = ['cuda', 'cuda:0', 'cuda:1', 'cpu']
parser.add_argument(
    '--image_caption_device',
    choices=backends,
    default='cuda:0',
    help=f'Select the device: {backends}, gpu memory larger than 14G is recommended',
)
parser.add_argument(
    '--dense_caption_device', choices=backends,
    default='cuda:0',
    help=f'Select the device: {backends}, < 6G GPU is not recommended>',
)
parser.add_argument(
    '--semantic_segment_device',
    choices=backends,
    default='cuda:1',
    help=f'Select the device: {backends}, gpu memory larger than 14G is recommended. '
         ' Make sure this model and image_caption model on same device. (Only relevant for EditAnything?)',
)
# The original option name is misspelled ("contolnet"). The correctly spelled
# `--controlnet_device` is accepted as an alias, but the destination keeps the
# original spelling so code reading `args.contolnet_device` keeps working.
parser.add_argument(
    '--contolnet_device',
    '--controlnet_device',
    dest='contolnet_device',
    choices=backends,
    default='cuda:0',
    help=f'Select the device: {backends}, <6G GPU is not recommended>',
)

# Parse an empty argument list: inside a notebook sys.argv belongs to the
# kernel launcher, so only the defaults declared above are used.
args = parser.parse_args(args=[])


In [4]:
# Build the full pipeline from the parsed options. Per the recorded output
# below, this initializes every sub-model up front and took roughly 55 s.
# NOTE(review): the output reports SD-ControlNet "initialized on cpu with
# cuda:0 requested", so the requested ControlNet device does not appear to be
# honored; confirm inside ImageTextTransformation.
processor = ImageTextTransformation(args)

{'gpt_version': 'gpt-3.5-turbo',
 'image_caption': True,
 'dense_caption': True,
 'semantic_segment': True,
 'sam_arch': 'vit_b',
 'captioner_base_model': 'blip',
 'region_classify_model': 'ssa',
 'image_caption_device': 'cuda:0',
 'dense_caption_device': 'cuda:0',
 'semantic_segment_device': 'cuda:1',
 'contolnet_device': 'cuda:0'}
[1;34m-------------------Welcome to the Image2Paragraph toolbox...--------------------[0m
[1;33m-----------------------------Initializing models...-----------------------------[0m
[1;31m---------------------This is time-consuming, please wait...---------------------[0m
[1;33m-------------------0.00 Initializing ImageCaptioning blip...--------------------[0m
blip initialized on cuda:0
[1;33m---------------------19.74 Initializing DenseCaptioning...----------------------[0m
DenseCaptioning initialize is empty and device: cuda:0 is used in a weird way
[1;33m----------------19.74 Initializing ImageToText gpt-3.5-turbo...-----------------[0m
[1;33m-

You have disabled the safety checker for <class 'diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_controlnet.StableDiffusionControlNetPipeline'> by passing `safety_checker=None`. Ensure that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered results in services or applications open to the public. Both the diffusers team and Hugging Face strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling it only for use-cases that involve analyzing network behavior or auditing its results. For more information, please have a look at https://github.com/huggingface/diffusers/pull/254 .


SD-ControlNet initialized on cpu with cuda:0 requested
SD-ControlNet pipeline initialized on cpu with cuda:0 requested
[1;33m--------------------25.86 Initializing RegionSemantic ssa...--------------------[0m




SemanticSegment initialized on cuda:1
[1;32m----------------------55.71 Model initialization finished!----------------------[0m


In [5]:
# Disabled experiment, kept for reference only: nothing in this cell runs.
# It sketches loading the OneFormer ADE20K (Swin-tiny) segmentation model
# directly in float16 on cuda:1.
# import torch
# from transformers import (CLIPProcessor, CLIPModel, AutoProcessor, CLIPSegForImageSegmentation,
#                           OneFormerProcessor, OneFormerForUniversalSegmentation,
#                           BlipProcessor, BlipForConditionalGeneration)
                          
# model_name = "shi-labs/oneformer_ade20k_swin_tiny"
# oneformer_ade20k_processor = OneFormerProcessor.from_pretrained(model_name, dtype=torch.float16)
# oneformer_ade20k_model = OneFormerForUniversalSegmentation.from_pretrained(model_name)
# oneformer_ade20k_model = oneformer_ade20k_model.to(device="cuda:1", dtype=torch.float16)

In [6]:
# Input for the walkthrough: a single PNG frame from a locally recorded
# camera log.
# NOTE(review): this is an absolute, user-specific path; point it at your own
# data before re-running.
root_path = "/home/dmcconachie/single_test_log_20230508-105542.camera.images"
filename = "00000000f1182120-000000.png"
image_path = "/".join((root_path, filename))

# Disabled preprocessing helpers, kept for reference (`img_src` is not
# defined in any visible cell):
# ref_image = resize_long_edge(Image.open(image_path), 384)
# width, height = read_image_width_height(img_src)

# Step 1: caption the whole image with the pipeline's captioning model.
image_caption = processor.image_caption_model.image_caption(image_path)



[1;35m****************************************************************************************************[0m

Step1, blip caption:
a table with a bunch of oranges on it
[1;35m****************************************************************************************************[0m


In [7]:
# Step 2: dense captioning of the image at `image_path`. The recorded output
# lists one phrase per region followed by four integers (presumably bounding
# box corners; TODO confirm the coordinate convention).
dense_caption = processor.dense_caption_model.image_dense_caption(image_path)

To keep the current behavior, use torch.div(a, b, rounding_mode='trunc'), or for actual floor division, use torch.div(a, b, rounding_mode='floor'). (Triggered internally at  /pytorch/aten/src/ATen/native/BinaryOps.cpp:467.)
  return torch.floor_divide(self, other)
  return torch.max_pool2d(input, kernel_size, stride, padding, dilation, ceil_mode)


[1;35m****************************************************************************************************[0m
Step2, Dense Caption:

wooden cutting board: [1, 3, 380, 211]; four oranges on the table: [172, 42, 240, 99]; a black and red cord: [281, 1, 382, 213]; a round white object: [258, 156, 319, 209]; a blue plastic container: [336, 48, 383, 110]; a wooden cutting board: [39, 13, 290, 162]; a blue and silver kitchen appliance: [46, 158, 133, 213]; a silver and black coffee pot: [349, 109, 382, 213]; metal parts under desk: [194, 152, 351, 213]; the phone is black: [48, 82, 87, 131]; 
[1;35m****************************************************************************************************[0m


In [8]:
# Step 3: region-level semantics for the image at `image_path`. Per the
# recorded output this first extracts region segmentation with the SAM model
# and then generates region supervision with the blip/blip2 model.
region_semantic = processor.region_semantic_model.region_semantic(image_path)

[1;35m****************************************************************************************************[0m

Step3, Semantic Prompt:
extract region segmentation with SAM model....

finished...

generate region supervision with blip/blip2 model....

