In [3]:
# !pip install spacy

In [4]:
# !python -m spacy download en_core_web_sm

In [6]:
# !pip install sentence-transformers

In [40]:
# !pip install ultralytics

In [41]:
# !pip install nltk

In [1]:
import requests
import spacy
from sentence_transformers import SentenceTransformer, util
import torch

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Download resources once
nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)
nltk.download('omw-1.4', quiet=True)

STOPWORDS = set(stopwords.words('english'))
LEMMATIZER = WordNetLemmatizer()


In [3]:
nlp = spacy.load("en_core_web_sm")
embedder = SentenceTransformer("all-MiniLM-L6-v2")  # fast, small, good quality

In [4]:
import requests
from functools import lru_cache
import spacy

nlp = spacy.load("en_core_web_sm")

# ‚úÖ ALL 34 ConceptNet Relations
RELATION_TEMPLATES = {
    "RelatedTo": "{a} is related to {b}.",
    "FormOf": "{a} is an inflected form of {b}.",
    "IsA": "{a} is a type of {b}.",
    "PartOf": "{a} is part of {b}.",
    "HasA": "{a} has a {b}.",
    "UsedFor": "{a} is used for {b}.",
    "CapableOf": "{a} is capable of {b}.",
    "AtLocation": "{a} is typically found at {b}.",
    "Causes": "{a} can cause {b}.",
    "HasSubevent": "{a} includes the event {b}.",
    "HasFirstSubevent": "{a} begins with {b}.",
    "HasLastSubevent": "{a} ends with {b}.",
    "HasPrerequisite": "{a} requires {b} to happen first.",
    "HasProperty": "{a} has the property of being {b}.",
    "MotivatedByGoal": "{a} is done in order to {b}.",
    "ObstructedBy": "{a} can be obstructed by {b}.",
    "Desires": "{a} desires {b}.",
    "CreatedBy": "{a} is created by {b}.",
    "Synonym": "{a} and {b} have similar meanings.",
    "Antonym": "{a} and {b} are opposites.",
    "DistinctFrom": "{a} is distinct from {b}.",
    "DerivedFrom": "{a} is derived from {b}.",
    "SymbolOf": "{a} symbolizes {b}.",
    "DefinedAs": "{a} is defined as {b}.",
    "MannerOf": "{a} is a way of doing {b}.",
    "LocatedNear": "{a} is located near {b}.",
    "HasContext": "{a} is used in the context of {b}.",
    "SimilarTo": "{a} is similar to {b}.",
    "EtymologicallyRelatedTo": "{a} and {b} share a common etymology.",
    "EtymologicallyDerivedFrom": "{a} is etymologically derived from {b}.",
    "CausesDesire": "{a} makes someone want {b}.",
    "MadeOf": "{a} is made of {b}.",
    "ReceivesAction": "{a} can be {b}.",
    "ExternalURL": "For more information about {a}, see {b}."
}

In [5]:
len(RELATION_TEMPLATES)

34

In [6]:
@lru_cache(maxsize=5000)
def query_conceptnet_triples(term, max_results=5, lang="en", return_dict=False):
    """
    Query ConceptNet for all relations of a given term.
    Returns natural-language statements or structured triples.

    Args:
        term (str): The concept to query.
        max_results (int): Max number of results to return.
        lang (str): Language code (default: 'en').
        return_dict (bool): If True, returns structured triples instead of text.

    Returns:
        list: List of sentences or dicts representing ConceptNet triples.
    """
    term = term.strip().lower().replace(" ", "_")
    url = f"http://api.conceptnet.io/c/{lang}/{term}"

    try:
        response = requests.get(url, timeout=5)
        response.raise_for_status()
        data = response.json()
    except Exception as e:
        print(f"[Warning] Error querying ConceptNet for '{term}': {e}")
        return []

    results = []
    for edge in data.get("edges", []):
        rel = edge.get("rel", {}).get("label", "")
        if rel not in RELATION_TEMPLATES:
            continue

        start = edge["start"].get("label", "")
        end = edge["end"].get("label", "")
        if not start or not end or start.lower() == end.lower():
            continue

        # Handle direction (ConceptNet sometimes reverses)
        if f"/c/{lang}/{term}" == edge["end"].get("@id", ""):
            start, end = end, start  # flip direction

        template = RELATION_TEMPLATES[rel]
        sentence = template.format(a=start, b=end)

        triple = {
            "subject": start,
            "relation": rel,
            "object": end,
            "sentence": sentence
        }

        # results.append(triple if return_dict else sentence)
        results.append(triple)

        # if len(results) >= max_results:
        #     break

    return results

In [31]:
# ‚úÖ Example Usage
terms = ["cat", "sleep", "bridge"]
for t in terms:
    print(f"\nüîπ Facts about '{t}':")
    for fact in query_conceptnet_triples(t, max_results=7):
        print("  -", fact)



üîπ Facts about 'cat':
  - {'subject': 'a cat', 'relation': 'AtLocation', 'object': 'my lap', 'sentence': 'a cat is typically found at my lap.'}
  - {'subject': 'a cat', 'relation': 'AtLocation', 'object': 'a bed', 'sentence': 'a cat is typically found at a bed.'}
  - {'subject': 'a cat', 'relation': 'AtLocation', 'object': 'the windowsill', 'sentence': 'a cat is typically found at the windowsill.'}
  - {'subject': 'Cat', 'relation': 'CapableOf', 'object': 'hunt mice', 'sentence': 'Cat is capable of hunt mice.'}
  - {'subject': 'A cat', 'relation': 'HasA', 'object': 'four legs', 'sentence': 'A cat has a four legs.'}
  - {'subject': 'a cat', 'relation': 'CapableOf', 'object': 'drink water', 'sentence': 'a cat is capable of drink water.'}
  - {'subject': 'a cat', 'relation': 'CapableOf', 'object': 'catch a mouse', 'sentence': 'a cat is capable of catch a mouse.'}
  - {'subject': 'cat', 'relation': 'RelatedTo', 'object': 'feline', 'sentence': 'cat is related to feline.'}
  - {'subject':

In [36]:
from spacy.lang.en.stop_words import STOP_WORDS

def extract_keywords(question: str, max_terms: int = 3) -> list[str]:
    """
    Extracts key content words (mainly nouns) from the question.
    Cleans, lemmatizes, and removes stopwords/punctuation/numbers.
    Returns up to `max_terms` unique keywords.
    """
    if not isinstance(question, str):
        raise ValueError(f"Expected string input, got {type(question)}")

    doc = nlp(question)

    # Collect candidate tokens: prioritize noun chunks first
    candidates = []
    for chunk in doc.noun_chunks:
        root = chunk.root.lemma_.lower().strip()
        if root.isalpha() and root not in STOP_WORDS:
            candidates.append(root)

    # If too few candidates, add additional strong content words (nouns/adjectives)
    if len(candidates) < max_terms:
        for token in doc:
            if (
                token.pos_ in {"NOUN", "PROPN", "ADJ"} and
                token.is_alpha and
                token.lemma_.lower() not in STOP_WORDS
            ):
                candidates.append(token.lemma_.lower())

    # Deduplicate while preserving order
    cleaned = list(dict.fromkeys(candidates))

    return cleaned[:max_terms]


In [8]:
def rank_triples_by_relevance(question: str, triples: list[dict], top_k: int = 5) -> list[dict]:
    """
    Rank ConceptNet triples by cosine similarity between question and triple sentence.
    """
    if not triples:
        return []

    question_emb = embedder.encode(question, convert_to_tensor=True)
    triple_sents = [t["sentence"] for t in triples]
    triple_embs = embedder.encode(triple_sents, convert_to_tensor=True)

    scores = util.cos_sim(question_emb, triple_embs)[0]
    top_indices = torch.topk(scores, min(top_k, len(scores))).indices.tolist()

    ranked = [triples[i] for i in top_indices]
    for i, t in enumerate(ranked):
        t["similarity"] = float(scores[top_indices[i]])
    return ranked


In [25]:
def get_knowledge_context(question: str, top_k: int = 5) -> dict:
    """
    Retrieve and rank ConceptNet triples by relevance to the question.
    """
    print(f"Question: {question}")
    terms = extract_keywords(question)
    all_triples = []

    for term in terms:
        all_triples.extend(query_conceptnet_triples(term))

    # Deduplicate triples by (subject, relation, object)
    seen = set()
    unique_triples = []
    for t in all_triples:
        # print(f"Retrieved triple: {t}")
        key = (t["subject"].lower(), t["relation"], t["object"].lower())
        if key not in seen:
            seen.add(key)
            unique_triples.append(t)

    # Rank by semantic similarity
    ranked_triples = rank_triples_by_relevance(question, unique_triples, top_k=top_k)

    # Build context string
    sentences = " ".join([t["sentence"] for t in ranked_triples])

    return {
        "triples": ranked_triples,
        "sentences": sentences
    }


In [26]:
question = "Why are the wood platforms strapped to the elephants?"
context = get_knowledge_context(question, top_k=3)

print("üîπ Ranked Triples:")
for t in context["triples"]:
    print(f"({t['subject']} - {t['relation']} -> {t['object']})  [score={t['similarity']:.3f}]")

print("\nüîπ Knowledge Summary:")
print(context["sentences"])


Question: Why are the wood platforms strapped to the elephants?
üîπ Ranked Triples:
(elephants - HasA -> trunks)  [score=0.630]
(elephants - HasProperty -> very big)  [score=0.612]
(elephants - HasProperty -> large)  [score=0.592]

üîπ Knowledge Summary:
elephants has a trunks. elephants has the property of being very big. elephants has the property of being large.


In [21]:
# def build_augmented_prompt(image_path, question, choices):
#     knowledge = get_knowledge_context(question)
#     prompt = f"""
# You are a visual question answering model.

# Image: {image_path}
# Question: {question}

# Retrieved Knowledge:
# {knowledge}

# Answer choices:
# {', '.join(choices)}

# Based on the image and the knowledge, choose the best answer.
# """
#     return prompt.strip()


In [12]:
from ultralytics import YOLO

# Load pretrained detector
detector = YOLO('yolov8s.pt')  # small but good baseline

def detect_objects(image_path, conf_threshold=0.3, max_objects=10):
    """
    Detect objects in the image and return class labels.
    """
    results = detector(image_path)
    labels = []
    for r in results:
        for c in r.boxes.cls:
            labels.append(results[0].names[int(c)])
    return list(dict.fromkeys(labels))[:max_objects]


[KDownloading https://github.com/ultralytics/assets/releases/download/v8.3.0/yolov8s.pt to 'yolov8s.pt': 100% ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ 21.5MB 28.9MB/s 0.7s0.7s<0.0s


In [27]:
from collections import defaultdict

def expand_conceptnet_terms(term, max_neighbors=3):
    """
    Get semantically related terms for a given term from ConceptNet.
    Used as a fallback when no relevant terms found.
    """
    neighbors = []
    url = f"http://api.conceptnet.io/query?node=/c/en/{term}&rel=/r/RelatedTo&limit={max_neighbors}"
    try:
        import requests
        response = requests.get(url).json()
        for edge in response.get("edges", []):
            other = edge["end"]["label"] if edge["start"]["label"].lower() == term.lower() else edge["start"]["label"]
            neighbors.append(other.lower())
    except Exception:
        pass
    return list(set(neighbors))

def get_visual_question_terms(image_path, question, intersection_first=True, expand_if_empty=True):
    """
    Combines visual and question-derived terms, robustly handling all empty cases.
    Returns relevant_terms (for retrieval), plus raw question_terms and visual_terms.
    """
    print(f"Question: {question}")
    question_terms = extract_keywords(question)
    visual_terms = detect_objects(image_path)

    # Step 1: intersection (precise)
    if intersection_first:
        relevant_terms = list(set(question_terms) & set(visual_terms))
    else:
        relevant_terms = list(set(question_terms) | set(visual_terms))

    # Step 2: if intersection empty, fall back to union
    if not relevant_terms:
        relevant_terms = list(set(question_terms) | set(visual_terms))

    # Step 3: if still empty, use ConceptNet expansion of question terms
    if expand_if_empty and not relevant_terms and question_terms:
        expanded = []
        for qt in question_terms:
            expanded.extend(expand_conceptnet_terms(qt))
        relevant_terms = list(set(question_terms + expanded))

    # Step 4: final safety net
    if not relevant_terms:
        relevant_terms = question_terms if question_terms else [question]

    return relevant_terms, question_terms, visual_terms


In [28]:
def sanitize_terms(terms, max_words=3):
    """
    Clean and filter terms before querying ConceptNet.
    Includes stopword removal and lemmatization.
    """
    clean_terms = []
    
    for t in terms:
        if not isinstance(t, str): 
            continue
        
        # Lowercase + remove punctuation
        t = t.strip().lower()
        t = re.sub(r"[^\w\s]", "", t)
        
        # Tokenize
        tokens = t.split()
        
        # Remove stopwords and lemmatize
        tokens = [LEMMATIZER.lemmatize(tok) for tok in tokens if tok not in STOPWORDS]
        
        # Skip if empty or numeric
        if not tokens or all(tok.isnumeric() for tok in tokens):
            continue
        
        # Rejoin meaningful multi-word concepts
        processed = " ".join(tokens)
        
        # Skip phrases that are too long
        if len(tokens) > max_words:
            continue
        
        clean_terms.append(processed)
    
    # Deduplicate while preserving order
    return list(dict.fromkeys(clean_terms))


In [29]:
def get_grounded_knowledge_context(image_path, question, top_k=5):
    relevant_terms, q_terms, v_terms = get_visual_question_terms(image_path, question)
    
    # üîß Sanitize terms before retrieval
    # Use improved sanitization
    sanitized_terms = sanitize_terms(relevant_terms)

    if not sanitized_terms:
        sanitized_terms = sanitize_terms(q_terms)
    if not sanitized_terms:
        sanitized_terms = [question.split()[0]]  # last-resort fallback
    
    print(f"Question terms: {q_terms}")
    print(f"Visual terms: {v_terms}")
    print(f"Retrieving ConceptNet for: {sanitized_terms}")
    
    all_triples = []
    for term in sanitized_terms:
        triples = query_conceptnet_triples(term, max_results=10)
        if triples:
            all_triples.extend(triples)

    # Deduplicate
    seen = set()
    unique_triples = []
    for t in all_triples:
        key = (t['subject'].lower(), t['relation'], t['object'].lower())
        if key not in seen:
            seen.add(key)
            unique_triples.append(t)

    # If still no triples, gracefully degrade to linguistic similarity
    if not unique_triples:
        print("‚ö†Ô∏è No ConceptNet triples found, using question terms as pseudo-context.")
        sentences = f"The question is about {', '.join(sanitized_terms)}."
        return {
            "triples": [],
            "sentences": sentences,
            "visual_terms": v_terms,
            "question_terms": q_terms
        }

    # Rank triples normally
    ranked = rank_triples_by_relevance(question, unique_triples, top_k=top_k)
    sentences = " ".join([t["sentence"] for t in ranked])

    return {
        "triples": ranked,
        "sentences": sentences,
        "visual_terms": v_terms,
        "question_terms": q_terms
    }


In [38]:
# def build_grounded_prompt(image_path, question, choices):
#     context = get_grounded_knowledge_context(image_path, question, top_k=5)

#     prompt = f"""
#     You are a visual question answering model.

#     Image: {image_path}
#     Question: {question}

#     Visual objects detected: {', '.join(context['visual_terms'])}

#     Retrieved Knowledge (ConceptNet):
#     {'; '.join([f"({t['subject']} - {t['relation']} -> {t['object']})" for t in context['triples']])}

#     Knowledge Summary:
#     {context['sentences']}

#     Answer choices:
#     {', '.join(choices)}

#     Based on the image, the detected objects, and the knowledge, choose the best answer.
#     """
#     return prompt.strip()


In [17]:
import os
import shutil
aokvqa_dir = os.getenv('AOKVQA_DIR', r"C:\workspace\misc\5980\aokvqa")
coco_dir = os.getenv('COCO_DIR', r"C:\workspace\misc\5980\coco")
coco_filtered_dir = os.getenv('COCO_FILTERED_DIR', r"C:\workspace\misc\5980\coco_filtered")

In [18]:
from load_aokvqa import load_aokvqa, get_coco_path
val_aokvqa_dataset = load_aokvqa(aokvqa_dir, 'val')  
train_aokvqa_dataset = load_aokvqa(aokvqa_dir, 'train')  
test_aokvqa_dataset = load_aokvqa(aokvqa_dir, 'test')

In [19]:
print(f"Train dataset size: {len(train_aokvqa_dataset)}")
print(f"Validation dataset size: {len(val_aokvqa_dataset)}")
print(f"Test dataset size: {len(test_aokvqa_dataset)}")

Train dataset size: 17056
Validation dataset size: 1145
Test dataset size: 6702


In [39]:
def build_grounded_prompt(image_path, question, choices):
    context = get_grounded_knowledge_context(image_path, question, top_k=5)

    prompt = f"""
    You are an intelligent visual question answering system that uses both visual perception and external knowledge.

    **Task**:
    Answer the question about the image as accurately as possible.
    Use the detected visual objects for grounding and the provided knowledge for reasoning.

    ---

    **Image Path**: {image_path}
    **Question**: {question}

    **Detected Visual Objects**:
    {', '.join(context['visual_terms'])}

    **Relevant Knowledge (from ConceptNet and related sources)**:
    {context['sentences']}

    **Answer Choices**:
    {', '.join(choices)}

    ---

    **Instructions**:
    1. First, interpret the question carefully ‚Äî identify what kind of information it asks for (object, color, action, relation, etc.).
    2. Then, look at the detected visual objects and find which ones are relevant.
    3. Use the knowledge summary to fill in facts or relationships that are not directly visible in the image.
    4. If the knowledge seems unrelated or missing, rely on the image context alone.
    5. Finally, choose **one best answer** from the choices that fits both the visual evidence and the external knowledge.

    **Respond only with the final answer choice.**
    """
    return prompt.strip()


In [34]:
image_id = 461751
question = "What is in the motorcyclist's mouth?"
choices = [
    "toothpick",
    "food",
    "popsicle stick",
    "cigarette"
]
image_path = get_coco_path("val", image_id, coco_dir)
image_path   

'C:\\workspace\\misc\\5980\\coco\\val2017\\000000461751.jpg'

In [40]:
grounded_prompt = build_grounded_prompt(
    image_path=image_path,
    question=question,
    choices=choices
)
print(grounded_prompt)

Question: What is in the motorcyclist's mouth?



image 1/1 C:\workspace\misc\5980\coco\val2017\000000461751.jpg: 576x640 4 persons, 1 car, 1 motorcycle, 135.4ms
Speed: 4.5ms preprocess, 135.4ms inference, 1.0ms postprocess per image at shape (1, 3, 576, 640)
Question terms: ['mouth', 'motorcyclist']
Visual terms: ['person', 'car', 'motorcycle']
Retrieving ConceptNet for: ['mouth', 'motorcyclist', 'car', 'motorcycle', 'person']
You are an intelligent visual question answering system that uses both visual perception and external knowledge.

    **Task**:
    Answer the question about the image as accurately as possible.
    Use the detected visual objects for grounding and the provided knowledge for reasoning.

    ---

    **Image Path**: C:\workspace\misc\5980\coco\val2017\000000461751.jpg
    **Question**: What is in the motorcyclist's mouth?

    **Detected Visual Objects**:
    person, car, motorcycle

    **Relevant Knowledge (from ConceptNet and related sources)**:
    motorcyclist is derived from motorcycle. motorcyclist is rel