# Caption Indexing

## Mount Google Drive


In [1]:
import sys

from google.colab import drive
# Attach Google Drive at /content/drive; the dataset and output folders used
# later in this notebook are addressed through this mount point.
drive.mount(mountpoint='/content/drive', force_remount=True)

# Optional (disabled): add the project directory to the module search path
# so that code stored on Drive can be imported.
#if "/content/drive/MyDrive/CLASS_AGNOSTIC_COUNTER" not in sys.path:
#  sys.path.append("/content/drive/MyDrive/CLASS_AGNOSTIC_COUNTER")

Mounted at /content/drive


## Load captioning module


In [2]:
from transformers import AutoProcessor, AutoModelForCausalLM
from PIL import Image
from tqdm import tqdm
import os, torch, json

In [3]:
# Folder on the mounted Drive that holds the images to be captioned.
directory = (
    "/content/drive/MyDrive/CLASS_AGNOSTIC_COUNTER/data/FSC/images_384_VarV2"
)

In [4]:
# Identifier of the pretrained checkpoint to load.
checkpoint = "microsoft/git-base"

# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# The processor prepares images for the model and decodes generated token ids.
processor = AutoProcessor.from_pretrained(checkpoint)

# Load the model weights, then move the model onto the selected device.
model = AutoModelForCausalLM.from_pretrained(checkpoint)
model = model.to(device)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


## Load images and process them to generate captions

In [5]:
# Caption every image in `directory` and collect the results in `results`,
# a list of {"img_name", "generated_caption"} dictionaries.
print(directory)

# os.listdir() returns entries in arbitrary order and also yields non-image
# entries (sub-folders, hidden files). Keep only known image extensions so a
# stray entry cannot abort the long run, and sort so the result order is
# reproducible across runs.
image_extensions = (".jpg", ".jpeg", ".png", ".bmp", ".gif", ".tif", ".tiff", ".webp")
files = sorted(
    name for name in os.listdir(directory)
    if name.lower().endswith(image_extensions)
)

results = []

for file_name in tqdm(files, desc="Processing images"):
    # construct the full path to the image file (OS agnostic)
    file_path = os.path.join(directory, file_name)

    # Image.open() is lazy and keeps the file handle open; the context manager
    # closes it once convert() has read the pixels into memory. Converting to
    # RGB hands the processor a 3-channel image regardless of the file's mode.
    with Image.open(file_path) as image_file:
        image = image_file.convert("RGB")

    # preprocess the image; the whole batch is moved to the device once here,
    # so pixel_values needs no second transfer
    inputs = processor(images=image, return_tensors="pt").to(device)
    pixel_values = inputs.pixel_values

    # generate the caption token ids (capped at 50 tokens) and decode to text
    generated_ids = model.generate(pixel_values=pixel_values, max_length=50)
    generated_caption = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]

    # store the result in a dictionary
    result = {
        "img_name": file_name,
        "generated_caption": generated_caption,
    }
    results.append(result)

/content/drive/MyDrive/CLASS_AGNOSTIC_COUNTER/data/FSC/images_384_VarV2


Processing images: 100%|██████████| 6146/6146 [28:23<00:00,  3.61it/s]


## Save results to a JSON file

In [6]:
# Save the collected captions (`results`) to a JSON file on Drive.
output_dir = "/content/drive/MyDrive/CLASS_AGNOSTIC_COUNTER/captioning-disambiguation-module"
output_file = "generated_captions.json"

# Create the target folder if it is missing, so the write cannot fail on a
# non-existent directory after the long captioning run.
os.makedirs(output_dir, exist_ok=True)

# Explicit encoding keeps the file identical regardless of the platform default.
with open(os.path.join(output_dir, output_file), "w", encoding="utf-8") as json_file:
    json.dump(results, json_file, indent=4)

print("Generated captions saved to", output_file)

Generated captions saved to generated_captions.json
