In [None]:
import xretrieval

# Show the datasets xretrieval knows about; one of them ("coco-val-2017")
# is loaded by name in the next cell.
xretrieval.list_datasets()

In [None]:
# Load the "coco-val-2017" dataset through xretrieval.
# NOTE(review): presumably a pandas DataFrame, since it is later passed to
# `Dataset.from_pandas` — confirm against the xretrieval docs.
dataset = xretrieval.load_dataset("coco-val-2017")

In [None]:
# Display the object returned by `xretrieval.load_dataset` for inspection.
dataset

In [None]:
import xinfer

# Look up the xinfer models matching "blip2" to pick a captioning model id.
xinfer.list_models("blip2")

In [None]:
# Create the BLIP-2 (OPT-2.7B) model used by `generate_captions_batch`.
# The device is hard-coded to "cuda" with dtype "float16", so this cell needs
# a CUDA-capable GPU as written.
model = xinfer.create_model("Salesforce/blip2-opt-2.7b", device="cuda", dtype="float16")

In [6]:
from datasets import Dataset

# Wrap the loaded table in a Hugging Face `Dataset` so `.map` can be used on it.
# The conversion rebinds `dataset`, so it is guarded: re-running this cell after
# it has already run would otherwise hand a `Dataset` (not a pandas DataFrame)
# to `Dataset.from_pandas`.
if not isinstance(dataset, Dataset):
    dataset = Dataset.from_pandas(dataset)


In [None]:
# Display `dataset` again, now that it has been converted with
# `Dataset.from_pandas`.
dataset

In [8]:
def generate_captions_batch(examples, *, prompt="", max_new_tokens=20):
    """Caption one batch of images with the notebook-level ``model``.

    Intended as the callback for ``Dataset.map(..., batched=True)``, where
    ``examples`` maps each column name to the list of values in the batch.
    Extra keyword arguments can be supplied through ``fn_kwargs``.

    Args:
        examples: Batch mapping that contains an ``"image_path"`` column.
        prompt: Text prompt paired with every image in the batch. Defaults
            to the empty string.
        max_new_tokens: Forwarded to ``model.infer_batch``. Defaults to 20.

    Returns:
        The same ``examples`` mapping, with a ``"caption"`` column added that
        holds the ``.text`` attribute of each result from
        ``model.infer_batch``.
    """
    image_paths = list(examples["image_path"])
    # One prompt per image, as `infer_batch` is called with parallel lists.
    prompts = [prompt] * len(image_paths)

    results = model.infer_batch(image_paths, prompts, max_new_tokens=max_new_tokens)

    examples["caption"] = [result.text for result in results]
    return examples

In [None]:
# Apply the captioning function to the dataset in batches of 300 rows, then
# show the resulting dataset as the cell output.
blip2_captioned_dataset = dataset.map(
    generate_captions_batch,
    batched=True,
    batch_size=300,
    desc="Generating captions",
)
blip2_captioned_dataset


In [11]:
# Convert the captioned dataset back to a pandas DataFrame and persist it.
# A later cell reads "blip2_captioned_coco_val_2017.parquet"; writing the file
# here lets that cell run after a kernel restart without regenerating captions.
df = blip2_captioned_dataset.to_pandas()
df.to_parquet("blip2_captioned_coco_val_2017.parquet", index=False)

In [None]:
# Display the captioned DataFrame for inspection.
df

In [1]:
import xretrieval
import pandas as pd

# Load the BLIP-2 captioned COCO val2017 table from its Parquet file and show it.
# Execution counts restart at In [1] in this cell, so the benchmark cells below
# appear to run from this file rather than from the captioning cells above.
df = pd.read_parquet("blip2_captioned_coco_val_2017.parquet")
df

Unnamed: 0,image_id,file_name,image_path,caption,name
0,139,000000000139.jpg,data/coco/val2017/000000000139.jpg,a living room with a fireplace and a table,"book,chair,clock,dining table,microwave,person..."
1,285,000000000285.jpg,data/coco/val2017/000000000285.jpg,a brown bear sitting in the grass,bear
2,632,000000000632.jpg,data/coco/val2017/000000000632.jpg,"a bedroom with a bed, a dresser, a bookcase an...","bed,book,chair,potted plant"
3,724,000000000724.jpg,data/coco/val2017/000000000724.jpg,a stop sign on a street corner,"car,stop sign,truck"
4,776,000000000776.jpg,data/coco/val2017/000000000776.jpg,a group of three teddy bears,"bed,teddy bear"
...,...,...,...,...,...
4995,581317,000000581317.jpg,data/coco/val2017/000000581317.jpg,a woman standing on a hill,"cell phone,person"
4996,581357,000000581357.jpg,data/coco/val2017/000000581357.jpg,a man on a skateboard,"bench,person,skateboard"
4997,581482,000000581482.jpg,data/coco/val2017/000000581482.jpg,a large clock in a large building with a large...,clock
4998,581615,000000581615.jpg,data/coco/val2017/000000581615.jpg,a urinal in a bathroom,toilet


In [2]:
# Look up the xretrieval models matching "blip2" to pick a benchmark model id.
xretrieval.list_models("blip2")

In [3]:
# Benchmark the BLIP-2 ITM model in text-to-text mode on the captioned table.
# Per the log output of this cell, text is encoded for both the database and
# the queries; the result is a dict of retrieval metrics (MRR, NormalizedDCG,
# Precision, Recall, HitRate, MAP).
# NOTE(review): top_k=5 is presumably the ranking cutoff for those metrics —
# confirm against the xretrieval docs.
xretrieval.run_benchmark(
    dataset=df,
    model_id="transformers/Salesforce/blip2-itm-vit-g",
    mode="text-to-text",
    top_k=5
)

[32m2024-11-20 12:41:28.669[0m | [1mINFO    [0m | [36mxretrieval.core[0m:[36mrun_benchmark[0m:[36m87[0m - [1mEncoding database text for transformers/Salesforce/blip2-itm-vit-g[0m


Encoding captions:   0%|          | 0/157 [00:00<?, ?it/s]

Expanding inputs for image tokens in BLIP-2 should be done in processing. Please follow instruction here (https://gist.github.com/zucchini-nlp/e9f20b054fa322f84ac9311d9ab67042) to update your BLIP-2 model. Using processors without these attributes in the config is deprecated and will throw an error in v4.47.
[32m2024-11-20 12:41:29.979[0m | [1mINFO    [0m | [36mxretrieval.core[0m:[36mrun_benchmark[0m:[36m95[0m - [1mEncoding query text for transformers/Salesforce/blip2-itm-vit-g[0m


Encoding captions:   0%|          | 0/157 [00:00<?, ?it/s]

  0%|          | 0/5000 [00:00<?, ?it/s]

{'MRR': 0.2617,
 'NormalizedDCG': 0.2874,
 'Precision': 0.1589,
 'Recall': 0.3622,
 'HitRate': 0.3622,
 'MAP': 0.2556}

In [4]:
# Same dataset, model and top_k as the text-to-text run, but in text-to-image
# mode. Per the log output of this cell, images are encoded as the database
# and text as the queries; the result is the same set of retrieval metrics.
xretrieval.run_benchmark(
    dataset=df,
    model_id="transformers/Salesforce/blip2-itm-vit-g",
    mode="text-to-image",
    top_k=5
)


[32m2024-11-20 12:41:36.938[0m | [1mINFO    [0m | [36mxretrieval.core[0m:[36mrun_benchmark[0m:[36m84[0m - [1mEncoding database images for transformers/Salesforce/blip2-itm-vit-g[0m


Encoding images:   0%|          | 0/157 [00:00<?, ?it/s]

[32m2024-11-20 12:42:58.863[0m | [1mINFO    [0m | [36mxretrieval.core[0m:[36mrun_benchmark[0m:[36m95[0m - [1mEncoding query text for transformers/Salesforce/blip2-itm-vit-g[0m


Encoding captions:   0%|          | 0/157 [00:00<?, ?it/s]

{'MRR': 0.3999,
 'NormalizedDCG': 0.5039,
 'Precision': 0.276,
 'Recall': 0.7264,
 'HitRate': 0.7264,
 'MAP': 0.3882}