# Captioning with PromptCaptioning model

### Load drive

In [5]:
import sys

from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


### Install dependencies

In [29]:
!pip install promptcap
!pip install inflect



## Captioning

In [1]:
from promptcap import PromptCap
from tqdm import tqdm
import os, torch, json

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
model = PromptCap("tifa-benchmark/promptcap-coco-vqa")  # also support OFA checkpoints. e.g. "OFA-Sys/ofa-large"

if torch.cuda.is_available():
  model.cuda()

tifa-benchmark/promptcap-coco-vqa
<super: <class 'OFATokenizer'>, <OFATokenizer object>>


The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'GPTNeoXTokenizerFast'. 
The class this function is called from is 'OFATokenizer'.
  return _VF.meshgrid(tensors, **kwargs)  # type: ignore[attr-defined]


In [3]:
image_class_dict = {}
#image_class_txt = "/content/drive/MyDrive/CLASS_AGNOSTIC_COUNTER/data/FSC/FSC_147/ImageClasses_FSC147.txt"
image_class_txt = "../data/FSC/FSC_147/ImageClasses_FSC_147.txt"

with open(image_class_txt, 'r') as file:
    lines = file.readlines()

    for line in lines:
        image_name, label = line.strip().split('\t')
        image_class_dict[image_name] = label

In [4]:
import inflect

def retrieve_singular_word(plural_word):
    p = inflect.engine()
    singular_word = p.singular_noun(plural_word)
    if singular_word:
        return singular_word
    else:
        return None

In [11]:
#directory = "/content/drive/MyDrive/CLASS_AGNOSTIC_COUNTER/data/FSC/images_384_VarV2"
directory = "../data/FSC/images_384_VarV2"

# list all files in the directory
files = os.listdir(directory)

results = []

for file_name in files[:10]: # tqdm(files, desc="Processing images"):
    # construct the full path to the image file (OS agnostic)
    file_path = os.path.join(directory, file_name)

    class_name = image_class_dict[file_name]

    singular = retrieve_singular_word(class_name)
    if singular is None:
      singular = class_name

    #print(singular)
    prompt = f"Can you describe the {singular}?"

    generated_caption = model.caption(prompt, file_path)
    print(f'{singular} - {generated_caption} - {file_name}')

    # store the result in a dictionary
    result = {
        "img_name": file_name,
        "class_name" : class_name,
        "generated_caption": generated_caption
      }
    results.append(result)

cap - a pile of baseball caps - 6749.jpg
bead - a dog wearing a bead necklace - 3257.jpg
sunglass - a wall of sunglasses - 2035.jpg
lipstick - a pile of pink lipstick - 2497.jpg
bead - a bead can you bead - 3894.jpg
gemstone - a gemstone on a table - 3152.jpg
flamingo - a flock of flamingo flying in the sky - 6070.jpg
cupcake - a cupcake on a table - 3825.jpg
chicken wing - a chicken wing on a grill - 3559.jpg
green pea - a pile of green peas - 4286.jpg


: 

In [10]:
# save the results to a JSON file
#output_dir = "/content/drive/MyDrive/CLASS_AGNOSTIC_COUNTER/captioning-module"
output_dir = "./"

output_file = "promptcap_vqa_generated_captions_FSC_147.json"
with open(os.path.join(output_dir, output_file), "w") as json_file:
    json.dump(results, json_file, indent=4)

print("Generated captions saved to", output_file)

Generated captions saved to promptcap_vqa_generated_captions_FSC_147.json
