In [11]:
import sys
sys.path.append("..")
from transformers import pipeline
from PIL import Image
from src.pipeline.feature_extractor import FeatureExtractor
import torch

In [4]:
# Instantiate the project's FeatureExtractor with its default arguments.
# NOTE(review): the log output recorded for this cell shows a
# Mask2FormerImageProcessor and model weights being loaded, so construction
# appears to be expensive — avoid re-running this cell unnecessarily.
extractor = FeatureExtractor()

The image processor of type `Mask2FormerImageProcessor` is now loaded as a fast processor by default, even if the model checkpoint was saved with a slow processor. This is a breaking change and may produce slightly different outputs. To continue using the slow processor, instantiate this class with `use_fast=False`. 
Loading weights: 100%|██████████| 782/782 [00:00<00:00, 1813.99it/s, Materializing param=model.transformer_module.queries_features.weight]                                                       


In [None]:
# Load the sample image from the notebook's working directory.
img = Image.open("Tokyo_Skytree_2014.jpg")
# Preprocess the image with the extractor's mask processor, returning PyTorch tensors.
img_input = extractor.mask_processor(img, return_tensors="pt")
# Inference only: disable gradient tracking while running the extractor.
with torch.no_grad():
    # PIL's `size` is (width, height); `[::-1]` reverses it to (height, width).
    # NOTE(review): presumably the third argument is the per-image target size
    # used to resize the predicted masks — confirm against FeatureExtractor.
    out = extractor(img_input, [img], [img.size[::-1]])

In [35]:
# Turn every mask of the first (only) image in the batch into an RGB PIL image
# so it can be attached to the chat message alongside the original picture.
# NOTE(review): assumes each mask is an array that `Image.fromarray` accepts — confirm.
mask_img = list(
    map(
        lambda mask: Image.fromarray(mask).convert("RGB"),
        out["masks"][0],
    )
)

In [33]:
# Build a Hugging Face "image-text-to-text" pipeline backed by the
# SmolVLM-256M-Instruct checkpoint; used below to query the model about the
# original image and its segmentation masks.
smol_pipe = pipeline("image-text-to-text", model="HuggingFaceTB/SmolVLM-256M-Instruct")

Loading weights: 100%|██████████| 471/471 [00:00<00:00, 2015.24it/s, Materializing param=model.vision_model.post_layernorm.weight]                      


In [57]:
# System-style instruction asking the model to score each segment label and
# reply in JSON.
# NOTE(review): this string is not currently sent to the model — the system
# message that would carry it is commented out in the `messages` cell.
prompt = """You are an image annotation expert.
You will be given an original image and several images of its segmentation part along with labels.
By matching the original image and the segment maps, your task is to assign a probability score to each segment label of how much it contribute to the original image representation.
The total probability score of all segments must be 1.

example output:
```json
{
    "segment_scores": [
        {
            "building": 0.2,
            "tower": 0.7,
            "sky": 0.1
        }
    ]
}
```
"""

In [58]:
# Inspect the segment labels predicted for the first image in the batch.
out["labels"][0]

['building', 'sky', 'tower']

In [67]:
# Build one (mask image, label caption) pair per detected segment instead of
# hard-coding indices 0..2, so the request follows however many segments the
# extractor returned. Formatting a plain `label` variable also avoids nesting
# double quotes inside a double-quoted f-string, which is a SyntaxError before
# Python 3.12.
segment_parts = [
    part
    for mask, label in zip(mask_img, out["labels"][0])
    for part in (
        {"type": "image", "image": mask},
        {"type": "text", "text": f"mask label: {label}"},
    )
]

# Single user turn: the question, the original image with its caption, then
# the segment pairs.
# NOTE: the system-role `prompt` defined in an earlier cell is intentionally
# left out of this request, matching the previous behaviour of this cell.
messages = [
    {
        "role": "user",
        "content": [
            {"type": "text", "text": "which one of the images segmentation below the most representing the landmark?"},
            {"type": "image", "image": img},
            {"type": "text", "text": "original image"},
            *segment_parts,
        ],
    }
]

# Clear the checkpoint's default `max_length` (20, per the generation warning)
# so it does not conflict with the explicit new-token budget below.
smol_pipe.model.generation_config.max_length = None
# `max_length` would count the prompt tokens as well, and a prompt carrying
# several images can use up most of a 256-token budget on its own. Bound only
# the generated continuation with `max_new_tokens` instead.
smol_out = smol_pipe(text=messages, return_full_text=False, max_new_tokens=256)
smol_out

Both `max_new_tokens` (=256) and `max_length`(=20) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


[{'input_text': [{'role': 'user',
    'content': [{'type': 'text',
      'text': 'which one of the images segmentation below the most representing the landmark?'},
     {'type': 'image',
      'image': <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=960x1471>},
     {'type': 'text', 'text': 'original image'},
     {'type': 'image',
      'image': <PIL.Image.Image image mode=RGB size=960x1471>},
     {'type': 'text', 'text': 'mask label: building'},
     {'type': 'image',
      'image': <PIL.Image.Image image mode=RGB size=960x1471>},
     {'type': 'text', 'text': 'mask label: sky'},
     {'type': 'image',
      'image': <PIL.Image.Image image mode=RGB size=960x1471>},
     {'type': 'text', 'text': 'mask label: tower'}]}],
  'generated_text': ' The black background is a white silhouette of a gun.'}]

In [64]:
# Re-display the pipeline result.
# NOTE(review): this cell's execution count (64) is lower than that of the
# generation cell above (67), so the output shown here may be stale — re-run
# after regenerating `smol_out`, or remove this cell.
smol_out

[{'input_text': [{'role': 'user',
    'content': [{'type': 'text',
      'text': 'which one of the images segmentation below the most representing the landmark?'},
     {'type': 'image',
      'image': <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=960x1471>},
     {'type': 'text', 'text': 'original image'},
     {'type': 'image',
      'image': <PIL.Image.Image image mode=RGB size=960x1471>},
     {'type': 'text', 'text': 'mask label: building'},
     {'type': 'image',
      'image': <PIL.Image.Image image mode=RGB size=960x1471>},
     {'type': 'text', 'text': 'mask label: sky'},
     {'type': 'image',
      'image': <PIL.Image.Image image mode=RGB size=960x1471>},
     {'type': 'text', 'text': 'mask label: tower'}]}],
  'generated_text': ' The black background is a white silhouette of a gun.'}]