In [1]:
import torch
from PIL import Image
from transformers import BlipProcessor, BlipForConditionalGeneration
from sentence_transformers import SentenceTransformer, util

# ------------------------------------------------------------
# 1. Tiny BLIP captioning (gives you a compact scene summary)
# ------------------------------------------------------------
# Loaded (processor, model) pairs keyed by checkpoint name, so repeated
# captioning calls do not re-instantiate BLIP every time.
_BLIP_CACHE = {}


def _load_blip(model_name: str):
    """Return the cached ``(processor, model)`` pair for ``model_name``, loading it on first use."""
    if model_name not in _BLIP_CACHE:
        _BLIP_CACHE[model_name] = (
            BlipProcessor.from_pretrained(model_name),
            BlipForConditionalGeneration.from_pretrained(model_name),
        )
    return _BLIP_CACHE[model_name]


def generate_caption(image_path: str, model_name: str = "Salesforce/blip-image-captioning-base") -> str:
    """Caption a single image with a BLIP captioning checkpoint.

    Args:
        image_path: Path of the image file to caption.
        model_name: Checkpoint identifier passed to ``from_pretrained``.
            Defaults to the checkpoint this notebook has always used.

    Returns:
        The decoded caption string (special tokens stripped), generated with
        at most 30 new tokens.

    Notes:
        The processor and model are loaded once per ``model_name`` and reused
        on later calls. Re-running this cell resets the cache.
    """
    processor, model = _load_blip(model_name)

    # Open via a context manager so the file handle is released deterministically;
    # ``convert`` returns a separate in-memory image that outlives the ``with``.
    with Image.open(image_path) as raw_image:
        image = raw_image.convert("RGB")

    inputs = processor(image, return_tensors="pt")
    out = model.generate(**inputs, max_new_tokens=30)
    return processor.decode(out[0], skip_special_tokens=True)

# ------------------------------------------------------------
# 2. Semantic retrieval of related knowledge sentences
# ------------------------------------------------------------
# Loaded sentence encoders keyed by model name, so repeated retrieval calls
# do not re-instantiate the SentenceTransformer every time.
_SENTENCE_ENCODERS = {}


def _get_sentence_encoder(model_name: str):
    """Return the cached SentenceTransformer for ``model_name``, loading it on first use."""
    if model_name not in _SENTENCE_ENCODERS:
        _SENTENCE_ENCODERS[model_name] = SentenceTransformer(model_name)
    return _SENTENCE_ENCODERS[model_name]


def retrieve_textual_knowledge(caption: str, question: str, corpus, top_k=3,
                               model_name: str = "all-MiniLM-L6-v2"):
    """Return the corpus sentences most similar to the caption + question.

    The query is ``"{caption}. {question}"``; sentences are ranked by cosine
    similarity between sentence embeddings.

    Args:
        caption: Scene caption (e.g. the output of ``generate_caption``).
        question: Natural-language question about the image.
        corpus: Iterable of candidate knowledge sentences. A bare string is
            treated as a one-sentence corpus; ``None`` is treated as empty.
        top_k: Maximum number of sentences to return.
        model_name: SentenceTransformer model identifier. Defaults to the
            model this notebook has always used.

    Returns:
        Up to ``top_k`` sentences, best match first. Returns ``[]`` when the
        corpus is empty/``None`` or ``top_k`` is not positive, so callers can
        detect "no facts" and fall back.
    """
    if corpus is None:
        sentences = []
    elif isinstance(corpus, str):
        sentences = [corpus]
    else:
        sentences = list(corpus)

    # Nothing to rank: answer before loading the model or building an empty
    # similarity matrix.
    if not sentences or top_k <= 0:
        return []

    encoder = _get_sentence_encoder(model_name)
    query = f"{caption}. {question}"
    q_emb = encoder.encode(query, convert_to_tensor=True)
    c_emb = encoder.encode(sentences, convert_to_tensor=True)

    # cos_sim yields a (1, len(sentences)) matrix for a single query; take its only row.
    scores = util.cos_sim(q_emb, c_emb)[0]
    top_idx = torch.topk(scores, k=min(top_k, len(sentences))).indices.tolist()
    return [sentences[i] for i in top_idx]

# ------------------------------------------------------------
# 3. Fallback logic (used when retrieval returns no facts)
# ------------------------------------------------------------
def fallback_rules(question: str):
    """Return a one-element list holding a canned fact for the question.

    Matching is a case-insensitive substring test, checked in order:
    ``"mouth"`` first, then ``"road"``. If neither keyword occurs, a generic
    statement is returned.
    """
    lowered = question.lower()

    # Ordered (keyword, fact) pairs; the first keyword found in the question wins.
    keyword_facts = (
        ("mouth", "A cigarette, food, or toothpick can be held in a mouth."),
        ("road", "Vehicles travel on roads; motorcyclists wear helmets."),
    )
    for keyword, fact in keyword_facts:
        if keyword in lowered:
            return [fact]

    return ["People interact with objects depending on context."]

# ------------------------------------------------------------
# 4. High-level function
# ------------------------------------------------------------
def get_small_knowledge(image_path, question, corpus):
    """Build a small knowledge context for a visual question.

    Captions the image, retrieves the corpus sentences most related to the
    caption and question, and falls back to keyword rules when retrieval
    yields nothing.

    Args:
        image_path: Path of the image to caption.
        question: Natural-language question about the image.
        corpus: Sized collection of candidate knowledge sentences; may be
            empty or ``None``, in which case only the fallback rules are used.

    Returns:
        A dict with keys ``"caption"`` (the generated caption) and ``"facts"``
        (a list of knowledge sentences).
    """
    caption = generate_caption(image_path)

    # With no corpus there is nothing to rank, so skip retrieval entirely;
    # this is what makes the rule-based fallback below reachable.
    has_corpus = corpus is not None and len(corpus) > 0
    facts = retrieve_textual_knowledge(caption, question, corpus) if has_corpus else []

    if not facts:
        facts = fallback_rules(question)
    return {
        "caption": caption,
        "facts": facts,
    }



  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Sample image captioned by the cell below.
# NOTE(review): absolute, machine-specific Windows path — point this at a local copy before re-running elsewhere.
image_path = "C:\\workspace\\misc\\5980\\coco\\val2017\\000000461751.jpg"

In [4]:
# Smoke-test the captioner on the sample image; the bare call shows the returned caption as the cell output.
generate_caption(image_path)

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Fetching 1 files: 100%|██████████| 1/1 [00:00<00:00,  2.76it/s]
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download.

'a man on a motorcycle'

In [None]:
# # Example usage
# if __name__ == "__main__":
#     corpus = [
#         "A cigarette is often held in the mouth.",
#         "Motorcyclists ride motorcycles and wear helmets.",
#         "Food can be eaten or chewed in the mouth.",
#         "A popsicle stick is held in the mouth when eating ice cream."
#     ]
#     context = get_small_knowledge("motorcyclist.jpg", "What is in the motorcyclist's mouth?", corpus)
#     print(context)
