<a href="https://colab.research.google.com/github/donbcolab/composable_vlms/blob/main/notebooks/finetuned_object_detection_using_adapters_final.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Finetuned Object Detection - using vision model adapters

In [15]:
!pip install -q einops flash_attn timm peft

In [16]:
import gc
from io import BytesIO

import requests
import torch

from PIL import Image
from transformers import AutoProcessor, AutoModelForCausalLM

# Choose the execution target once: first CUDA device with half precision when a GPU
# is present, otherwise CPU with full precision.
if torch.cuda.is_available():
    device, torch_dtype = "cuda:0", torch.float16
else:
    device, torch_dtype = "cpu", torch.float32

## Start with a Small Vision Model - with Fine-Tuning capabilities

In [17]:
from transformers import AutoModelForCausalLM, AutoTokenizer

# Hub id of the base checkpoint; passed to both AutoModelForCausalLM.from_pretrained and
# AutoProcessor.from_pretrained in the "validate the Base Model" cells below.
base_vision_model_id = "microsoft/Florence-2-large-ft"

### Set the model for Object Detection mode

In [18]:
# Task prompt supplied as the `text` input to the processor in every inference cell of this notebook.
hardcoded_od_prompt = "<OD>"

## Patient Blood Sample Biopsy

In [19]:
src_image_url = "https://huggingface.co/spaces/dwb2023/omniscience/resolve/main/examples/BloodImage_00099_jpg.rf.0a65e56401cdd71253e7bc04917c3558.jpg"

# Download the whole file with a timeout and fail loudly on HTTP errors; otherwise a 404/500
# page would surface later as an obscure PIL "cannot identify image file" error, and a stalled
# connection would hang the notebook indefinitely.
response = requests.get(src_image_url, timeout=30)
response.raise_for_status()

# Decode from an in-memory buffer (PIL reads lazily, so a live network stream is fragile) and
# normalise to 3-channel RGB — a no-op for an RGB JPEG, but guards against grayscale/RGBA inputs.
src_image = Image.open(BytesIO(response.content)).convert("RGB")

### Set up and validate the Base Model

In [20]:
# Processor (tokenizer + image preprocessing) for the base checkpoint.
processor = AutoProcessor.from_pretrained(base_vision_model_id, trust_remote_code=True)

# Base model, loaded in the selected precision and then moved onto the selected device.
model = AutoModelForCausalLM.from_pretrained(
    base_vision_model_id,
    trust_remote_code=True,
    torch_dtype=torch_dtype,
)
model = model.to(device)

In [21]:
# Baseline run: object detection on the blood image with the base model (no adapter attached yet).
# Encode the task prompt and the image, then move the tensors to the model's device/dtype.
inputs = processor(text=hardcoded_od_prompt, images=src_image, return_tensors="pt").to(device, torch_dtype)

# Deterministic decoding: beam search with 3 beams, sampling disabled.
generated_ids = model.generate(
    input_ids=inputs["input_ids"],
    pixel_values=inputs["pixel_values"],
    max_new_tokens=1024,
    do_sample=False,
    num_beams=3,
)
# Special tokens are kept in the decoded string (skip_special_tokens=False).
generated_text = processor.batch_decode(generated_ids, skip_special_tokens=False)[0]

# Reuse the prompt variable as the task key so the prompt sent to the model and the task used
# for post-processing cannot drift apart if the prompt is ever changed.
parsed_answer = processor.post_process_generation(
    generated_text,
    task=hardcoded_od_prompt,
    image_size=(src_image.width, src_image.height),
)

print(parsed_answer)

{'<OD>': {'bboxes': [[0.3199999928474426, 0.23999999463558197, 639.0399780273438, 479.2799987792969]], 'labels': ['jellyfish']}}


### Add the fine-tuned blood biopsy adapter

In [22]:
blood_cell_adapter = "dwb2023/florence2-large-bccd-base-ft"

# Without an explicit adapter_name the adapter is registered as "default", and loading a second
# adapter under an existing name raises — so re-running this cell on the same model would fail.
# Only attach the adapter when it is not already present, keeping the cell safe to re-run.
# NOTE(review): relies on the Transformers PEFT integration exposing `model.peft_config`; confirm
# against the installed transformers version.
if "default" not in getattr(model, "peft_config", {}):
    model.load_adapter(blood_cell_adapter)

In [23]:
# Same detection pass on the blood image, now with the blood-cell adapter attached to the model.
# Encode the task prompt and the image, then move the tensors to the model's device/dtype.
inputs = processor(text=hardcoded_od_prompt, images=src_image, return_tensors="pt").to(device, torch_dtype)

# Deterministic decoding: beam search with 3 beams, sampling disabled.
generated_ids = model.generate(
    input_ids=inputs["input_ids"],
    pixel_values=inputs["pixel_values"],
    max_new_tokens=1024,
    do_sample=False,
    num_beams=3,
)
# Special tokens are kept in the decoded string (skip_special_tokens=False).
generated_text = processor.batch_decode(generated_ids, skip_special_tokens=False)[0]

# Reuse the prompt variable as the task key so the prompt sent to the model and the task used
# for post-processing cannot drift apart if the prompt is ever changed.
parsed_answer = processor.post_process_generation(
    generated_text,
    task=hardcoded_od_prompt,
    image_size=(src_image.width, src_image.height),
)

print(parsed_answer)

{'<OD>': {'bboxes': [[332.47998046875, 153.36000061035156, 369.6000061035156, 186.47999572753906], [213.44000244140625, 227.27999877929688, 380.47998046875, 349.1999816894531]], 'labels': ['Platelets', 'WBC']}}


## Patient Liver Sample Biopsy

In [24]:
src_image_url = "https://huggingface.co/spaces/dwb2023/omniscience/resolve/main/examples/15_242_212_25_25_jpg.rf.f6bbadf4260dd2c1f5b4ace1b09b0a1b.jpg"

# Download the whole file with a timeout and fail loudly on HTTP errors; otherwise a 404/500
# page would surface later as an obscure PIL "cannot identify image file" error, and a stalled
# connection would hang the notebook indefinitely.
response = requests.get(src_image_url, timeout=30)
response.raise_for_status()

# Decode from an in-memory buffer (PIL reads lazily, so a live network stream is fragile) and
# normalise to 3-channel RGB — a no-op for an RGB JPEG, but guards against grayscale/RGBA inputs.
src_image = Image.open(BytesIO(response.content)).convert("RGB")

### Set up and validate the Base Model

In [25]:
# Drop the reference to any previously loaded (adapter-patched) model BEFORE building the new one.
# Rebinding `model` directly would keep the old weights alive until the new model is fully
# constructed, so two copies of the model would sit in accelerator memory at peak.
model = None
gc.collect()
if torch.cuda.is_available():
    # Return the freed blocks held by PyTorch's caching allocator to the device.
    torch.cuda.empty_cache()

# Fresh copy of the base model (no adapter attached), in the selected precision and on the selected device.
model = AutoModelForCausalLM.from_pretrained(base_vision_model_id, torch_dtype=torch_dtype, trust_remote_code=True).to(device)

# Processor for the same checkpoint; re-created here so this section can be run on its own.
processor = AutoProcessor.from_pretrained(base_vision_model_id, trust_remote_code=True)

In [26]:
# Baseline run: object detection on the liver image with the freshly reloaded base model.
# Encode the task prompt and the image, then move the tensors to the model's device/dtype.
inputs = processor(text=hardcoded_od_prompt, images=src_image, return_tensors="pt").to(device, torch_dtype)

# Deterministic decoding: beam search with 3 beams, sampling disabled.
generated_ids = model.generate(
    input_ids=inputs["input_ids"],
    pixel_values=inputs["pixel_values"],
    max_new_tokens=1024,
    do_sample=False,
    num_beams=3,
)
# Special tokens are kept in the decoded string (skip_special_tokens=False).
generated_text = processor.batch_decode(generated_ids, skip_special_tokens=False)[0]

# Reuse the prompt variable as the task key so the prompt sent to the model and the task used
# for post-processing cannot drift apart if the prompt is ever changed.
parsed_answer = processor.post_process_generation(
    generated_text,
    task=hardcoded_od_prompt,
    image_size=(src_image.width, src_image.height),
)

print(parsed_answer)

{'<OD>': {'bboxes': [[0.14949999749660492, 0.14949999749660492, 298.5514831542969, 298.5514831542969]], 'labels': ['flower']}}


### Add the fine-tuned liver biopsy adapter

In [27]:
liver_disease_adapter = "dwb2023/florence2-large-liver-disease-ft"

# Without an explicit adapter_name the adapter is registered as "default", and loading a second
# adapter under an existing name raises — so re-running this cell on the same model would fail.
# Only attach the adapter when it is not already present, keeping the cell safe to re-run.
# NOTE(review): relies on the Transformers PEFT integration exposing `model.peft_config`; confirm
# against the installed transformers version.
if "default" not in getattr(model, "peft_config", {}):
    model.load_adapter(liver_disease_adapter)

In [28]:
# Same detection pass on the liver image, now with the liver-disease adapter attached to the model.
# Encode the task prompt and the image, then move the tensors to the model's device/dtype.
inputs = processor(text=hardcoded_od_prompt, images=src_image, return_tensors="pt").to(device, torch_dtype)

# Deterministic decoding: beam search with 3 beams, sampling disabled.
generated_ids = model.generate(
    input_ids=inputs["input_ids"],
    pixel_values=inputs["pixel_values"],
    max_new_tokens=1024,
    do_sample=False,
    num_beams=3,
)
# Special tokens are kept in the decoded string (skip_special_tokens=False).
generated_text = processor.batch_decode(generated_ids, skip_special_tokens=False)[0]

# Reuse the prompt variable as the task key so the prompt sent to the model and the task used
# for post-processing cannot drift apart if the prompt is ever changed.
parsed_answer = processor.post_process_generation(
    generated_text,
    task=hardcoded_od_prompt,
    image_size=(src_image.width, src_image.height),
)

print(parsed_answer)

{'<OD>': {'bboxes': [[0.14949999749660492, 69.21849822998047, 70.1155014038086, 172.07449340820312]], 'labels': ['fibrosis']}}
