In [3]:
import random
import numpy as np
import torch

def set_seed(seed: int = 42):
    """
    Set random seed for reproducibility across Python, NumPy, and PyTorch.
    """
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

    # For deterministic behavior (slower but fully reproducible)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

set_seed(42)

In [4]:
import json
import re
from tqdm import tqdm
import networkx as nx
from collections import deque
from gradio_client import Client
import re
from collections import defaultdict
from sentence_transformers import SentenceTransformer
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity


In [None]:
from typing import Dict, List, Tuple, Set, Iterable, Optional
import networkx as nx
import numpy as np
from sentence_transformers import SentenceTransformer, util
from tqdm import tqdm
import math

In [None]:
import networkx as nx
from sentence_transformers import SentenceTransformer, util
import numpy as np

# embed_model = SentenceTransformer("all-mpnet-base-v2")
embed_model = SentenceTransformer("all-MiniLM-L6-v2")

In [8]:
import spacy
nlp = spacy.load("en_core_web_sm")

In [9]:
def extract_keywords_spacy(question):
    doc = nlp(question)

    keywords = []
    for token in doc:
        # Filter out stopwords, punctuation, and select meaningful POS
        if not token.is_stop and not token.is_punct:
            if token.pos_ in {"NOUN", "PROPN", "VERB", "ADJ"}:
                keywords.append(token.lemma_.lower())

    return list(dict.fromkeys(keywords))  # Remove duplicates, preserve order


# Example
question = "Why would someone bring an umbrella outside on a sunny day?"
print(extract_keywords_spacy(question))

['bring', 'umbrella', 'sunny', 'day']


In [5]:
with open(f"data/conceptnet_relations.json", "r") as fp:
    conceptnet_relations = json.load(fp)

len(conceptnet_relations)

5230

In [16]:
question = "A revolving door is convenient for two direction travel, but it also serves as a security measure at a what?"
question_concept = "revolving door"
choices = [
    "bank",
    "library",
    "department store",
    "mall",
    "new york"
]

In [6]:
"""
kg_prompt_builder.py

Builds a MultiDiGraph from ConceptNet-like dicts, finds paths between question keywords
and choices (using all relations, no weighting/filtering), and extracts atomic facts per choice.

Requirements:
    pip install networkx
"""

from typing import Dict, List, Tuple, Iterable, Set
import networkx as nx

# Type aliases
CNNode = str
Relation = str
Score = float
CNData = Dict[Relation, List[Tuple[CNNode, Score]]]  # the score is present but ignored per user's request
Triple = Tuple[str, str, str]  # (subject, relation, object)


def build_graph_from_conceptnet_dicts(conceptnet_dict_by_word: Dict[str, CNData]) -> nx.MultiDiGraph:
    """
    Build a MultiDiGraph where nodes are concept strings and edges have attribute 'rel'.
    We add edges for every relation entry found in the dict for each word.
    We do NOT use the numeric scores for weighting (user requested to use all relations and not weight them).
    """
    G = nx.MultiDiGraph()
    for subject_word, rel_dict in conceptnet_dict_by_word.items():
        # ensure the subject node exists
        G.add_node(subject_word)
        for rel, obj_list in rel_dict.items():
            for obj, _score in obj_list:
                # add node for object and directed edge subject -> object labeled with relation
                G.add_node(obj)
                G.add_edge(subject_word, obj, rel=rel)
                # Additionally, add an inverse relation edge for easier path discovery (optional but helpful).
                # We label the reverse as "rev:<REL>" so it remains explicit.
                G.add_edge(obj, subject_word, rel=f"rev:{rel}")
    return G


def find_paths_between_keywords_and_choices(
    G: nx.MultiDiGraph,
    keywords: Iterable[str],
    choices: Iterable[str],
    max_hops: int = 3,
    top_k_paths: int = 10
) -> Dict[str, Dict[str, List[List[str]]]]:
    """
    For each keyword and each choice, return up to top_k_paths simple node-paths of length <= max_hops.
    The returned structure:
      { keyword: { choice: [ [node0,node1,...], ... ] } }
    Paths are unweighted; selection is by path length (shortest first) and then arbitrary order.
    """
    result = {}
    for kw in keywords:
        result[kw] = {}
        for choice in choices:
            if kw not in G.nodes or choice not in G.nodes:
                result[kw][choice] = []
                continue
            # all_simple_paths yields simple paths up to cutoff length in edges
            try:
                raw_paths = list(nx.all_simple_paths(G, source=kw, target=choice, cutoff=max_hops))
            except nx.NetworkXNoPath:
                raw_paths = []
            # sort by path length (fewest nodes -> fewest hops)
            raw_paths.sort(key=lambda p: len(p))
            result[kw][choice] = raw_paths[:top_k_paths]
    return result


def triples_from_path(G: nx.MultiDiGraph, path: List[str]) -> List[Triple]:
    """
    Given a node path [n0, n1, n2, ...], return the list of triple facts for each adjacent pair.
    Because this graph is a MultiDiGraph, there can be multiple edges (relations) between two nodes.
    We will return a triple for each relation found between the pair in the direction used by the path.
    """
    triples: List[Triple] = []
    for i in range(len(path) - 1):
        u, v = path[i], path[i + 1]
        # enumerate all edges from u -> v and extract 'rel' attribute
        # MultiDiGraph stores keys for multiedges; use G.get_edge_data(u, v)
        data = G.get_edge_data(u, v, default={})
        # data is a dict mapping keys to edge attribute dicts
        for key, attrs in data.items():
            rel = attrs.get("rel", "RelatedTo")
            triples.append((u, rel, v))
    return triples


def collect_facts_by_choice(
    G: nx.MultiDiGraph,
    paths_by_kw_choice: Dict[str, Dict[str, List[List[str]]]]
) -> Dict[str, Set[Triple]]:
    """
    Aggregate triples found on all paths for each choice (across all keywords).
    Returns: { choice: set_of_triples }
    """
    facts_by_choice: Dict[str, Set[Triple]] = {}
    for kw, choice_paths in paths_by_kw_choice.items():
        for choice, paths in choice_paths.items():
            if choice not in facts_by_choice:
                facts_by_choice[choice] = set()
            for path in paths:
                triples = triples_from_path(G, path)
                for t in triples:
                    facts_by_choice[choice].add(t)
    return facts_by_choice


def facts_to_sorted_strings(facts: Iterable[Triple]) -> List[str]:
    """
    Convert set/list of triples into deterministic sorted list of strings "subject relation object".
    Sorted for consistent prompt output.
    """
    return sorted([f"{s} {rel} {o}" for (s, rel, o) in facts])


def build_prompt(question: str, choices: List[str], facts_by_choice: Dict[str, Set[Triple]]) -> str:
    """
    Create a simple text prompt grouping facts by choice.
    """
    lines = []
    lines.append("Use the knowledge below (derived from ConceptNet) to choose the best answer.\n")
    lines.append("Question:")
    lines.append(question + "\n")
    lines.append("Choices:")
    for i, c in enumerate(choices):
        lines.append(f"{chr(ord('A') + i)}. {c}")
    lines.append("\nKnowledge by choice:")
    for i, c in enumerate(choices):
        lines.append(f"\nChoice {chr(ord('A') + i)}: {c}")
        facts = facts_to_sorted_strings(facts_by_choice.get(c, []))
        if not facts:
            lines.append("  - (no connecting facts found)")
        else:
            for f in facts:
                lines.append(f"  - {f}")
    lines.append("\nTask: Based only on the knowledge listed above, pick the best choice and briefly explain using those facts.")
    return "\n".join(lines)


In [17]:
question = "A revolving door is convenient for two direction travel, but it also serves as a security measure at a what?"
question_concept = "revolving door"
choices = [
    "bank",
    "library",
    "department store",
    "mall",
    "new york"
]

conceptnet_map = {
    c: conceptnet_relations[c] for c in [question_concept] + choices
}
keywords = extract_keywords_spacy(question)          # extracted keywords

# Build the graph
G = build_graph_from_conceptnet_dicts(conceptnet_map)

# Find paths (up to 3 hops) from each keyword to each choice
paths = find_paths_between_keywords_and_choices(G, keywords, choices, max_hops=3, top_k_paths=20)

# Collect facts per choice
facts_by_choice = collect_facts_by_choice(G, paths)

In [18]:
conceptnet_map

{'revolving door': {'RelatedTo': [['drehtÃ¼r', 1.0],
   ['revolving doors', 1.0],
   ['tourniquet', 1.0],
   ['bussola', 1.0]],
  'UsedFor': [['entering building', 3.464],
   ['enter building', 1.0],
   ['exiting building', 1.0],
   ['getting into building', 1.0]],
  'AtLocation': [['lobby', 2.0],
   ['bank', 1.0],
   ['building', 1.0],
   ['department store', 1.0],
   ['entrance to building', 1.0],
   ['hotel lobby', 1.0],
   ['mall', 1.0]],
  'Synonym': [['drehtÃ¼r', 1.0]]},
 'bank': {'RelatedTo': [['money', 12.58],
   ['building', 6.957],
   ['account', 6.253],
   ['institution', 5.629],
   ['place', 4.756],
   ['financial', 4.107],
   ['vault', 3.958],
   ['store', 3.862],
   ['robbery', 3.045],
   ['interest', 2.294],
   ['branch', 2.174],
   ['vault', 2.038],
   ['safe', 1.746],
   ['Ø¨Ù†Ùƒ', 1.0]],
  'IsA': [['company', 1.0],
   ['where people keep money', 1.0],
   ['find downtown', 1.0]],
  'UsedFor': [['storing money', 6.633],
   ['store money', 3.464],
   ['save money', 2.0],

In [19]:
G.edges(data=True)

OutMultiEdgeDataView([('revolving door', 'drehtÃ¼r', {'rel': 'RelatedTo'}), ('revolving door', 'drehtÃ¼r', {'rel': 'Synonym'}), ('revolving door', 'revolving doors', {'rel': 'RelatedTo'}), ('revolving door', 'tourniquet', {'rel': 'RelatedTo'}), ('revolving door', 'bussola', {'rel': 'RelatedTo'}), ('revolving door', 'entering building', {'rel': 'UsedFor'}), ('revolving door', 'enter building', {'rel': 'UsedFor'}), ('revolving door', 'exiting building', {'rel': 'UsedFor'}), ('revolving door', 'getting into building', {'rel': 'UsedFor'}), ('revolving door', 'lobby', {'rel': 'AtLocation'}), ('revolving door', 'bank', {'rel': 'AtLocation'}), ('revolving door', 'building', {'rel': 'AtLocation'}), ('revolving door', 'department store', {'rel': 'AtLocation'}), ('revolving door', 'entrance to building', {'rel': 'AtLocation'}), ('revolving door', 'hotel lobby', {'rel': 'AtLocation'}), ('revolving door', 'mall', {'rel': 'AtLocation'}), ('drehtÃ¼r', 'revolving door', {'rel': 'rev:RelatedTo'}), ('d

In [14]:
# Build prompt
prompt_text = build_prompt(question, choices, facts_by_choice)

print("=== Prompt to send to smolVLM ===\n")
print(prompt_text)

=== Prompt to send to smolVLM ===

Use the knowledge below (derived from ConceptNet) to choose the best answer.

Question:
A revolving door is convenient for two direction travel, but it also serves as a security measure at a what?

Choices:
A. bank
B. library
C. department store
D. mall
E. new york

Knowledge by choice:

Choice A: bank
  - (no connecting facts found)

Choice B: library
  - (no connecting facts found)

Choice C: department store
  - (no connecting facts found)

Choice D: mall
  - (no connecting facts found)

Choice E: new york
  - (no connecting facts found)

Task: Based only on the knowledge listed above, pick the best choice and briefly explain using those facts.


latest

In [59]:
# ================================================================
# 1. Load embedding model
# ================================================================
# embed_model = SentenceTransformer("sentence-transformers/all-mpnet-base-v2")


# ================================================================
# 2. ADAPTER: Convert your data format into triple dicts
# ================================================================
def convert_raw_conceptnet_format(concept, raw_data):
    """
    raw_data format:
    {
       "UsedFor": [("cut", 5.657), ("cut hair", 1.0)],
       "RelatedTo": [("paper", 1.197)],
       ...
    }

    Converts to:
    [
        {"start": concept, "relation": "UsedFor", "end": "cut"},
        {"start": concept, "relation": "UsedFor", "end": "cut hair"},
        {"start": concept, "relation": "RelatedTo", "end": "paper"},
    ]
    """
    triples = []
    for relation, lst in raw_data.items():
        for end, weight in lst:
            triples.append({
                "start": concept,
                "relation": relation,
                "end": end
            })
    return triples


# ================================================================
# 3. Your conceptnet lookup
# ================================================================
def conceptnet_lookup(concept):
    """
    Replace this with your real conceptnet_data lookup.
    """
    concept = concept.lower()

    # # Example fake datasetâ€”replace with your real dataset
    # sample_data = {
    #     "scissors": {
    #         "UsedFor": [("cut paper", 5.0), ("cut hair", 1.0)],
    #         "RelatedTo": [("paper", 1.2)],
    #         "CapableOf": [("cut cardboard", 0.9)]
    #     },
    #     "people": {
    #         "UsedFor": [("work", 1.0)],
    #         "MotivatedByGoal": [("earn money", 1.0)],
    #         "CapableOf": [("talk to each other", 1.0)]
    #     },
    #     "work": {
    #         "UsedFor": [("accomplish tasks", 1.0)],
    #         "MotivatedByGoal": [("feel productive", 1.0)]
    #     }
    # }

    raw = conceptnet_relations.get(concept, {})
    return convert_raw_conceptnet_format(concept, raw)


# ================================================================
# 4. Extract N-hop KG
# ================================================================
def extract_local_subgraph(conceptnet_lookup, seeds, hops=2):
    G = nx.DiGraph()
    visited = set(seeds)
    frontier = set(seeds)

    for _ in range(hops):
        next_frontier = set()
        for node in frontier:
            facts = conceptnet_lookup(node)

            for f in facts:
                u = f["start"].lower()
                v = f["end"].lower()
                rel = f["relation"]

                G.add_edge(u, v, relation=rel)
                next_frontier.add(v)

        frontier = next_frontier - visited
        visited |= frontier

    return G


# ================================================================
# 5. Graph â†’ triples
# ================================================================
def graph_to_triples_extended(G):
    triples = []
    rel_types = []
    targets = []

    for u, v, data in G.edges(data=True):
        rel = data.get("relation", "")
        triples.append(f"{u} {rel} {v}")
        rel_types.append(rel)
        targets.append(v)

    return triples, rel_types, targets


# ================================================================
# 6. Ranking (relation-aware)
# ================================================================
def rank_triples_relation_aware(
    question,
    triples,
    relations,
    targets,
    k=15,
    w_triple=0.6,
    w_rel=0.25,
    w_target=0.15
):
    if not triples:
        return []

    q_emb = embed_model.encode(question, convert_to_tensor=True)

    tri_emb = embed_model.encode(triples, convert_to_tensor=True)
    rel_emb = embed_model.encode(relations, convert_to_tensor=True)
    tar_emb = embed_model.encode(targets, convert_to_tensor=True)

    s_triple = util.cos_sim(q_emb, tri_emb)[0]
    s_rel = util.cos_sim(q_emb, rel_emb)[0]
    s_target = util.cos_sim(q_emb, tar_emb)[0]

    total_score = (
        w_triple * s_triple
        + w_rel * s_rel
        + w_target * s_target
    )

    top_idx = np.argsort(-total_score.cpu().numpy())[:k]
    return [triples[i] for i in top_idx]


# ================================================================
# 7. Context formatter
# ================================================================
def build_context(question, choices, triples):
    out = f"Question:\n{question}\n\nChoices:\n"
    for i, c in enumerate(choices, 1):
        out += f"{i}. {c}\n"

    out += "\nRelevant Knowledge:\n"
    for t in triples:
        out += f"- {t}\n"

    return out


# ================================================================
# 8. Full Pipeline
# ================================================================
def knowledge_graph_pipeline(question, question_concept, keywords, choices, hops=2, k=5):
    if question_concept:
        question_concept = [question_concept.lower()]
    seeds = (
        question_concept +
        [kw.lower() for kw in keywords] +
        [c.lower() for c in choices]
    )
    seeds = list(set(seeds))  # deduplicate
    G = extract_local_subgraph(conceptnet_lookup, seeds, hops=hops)

    triples, rels, targets = graph_to_triples_extended(G)

    top_triples = rank_triples_relation_aware(
        question, triples, rels, targets, k=k
    )

    context = build_context(question, choices, top_triples)
    return context, top_triples, G


In [60]:
# ================================================================
# 9. Example usage
# ================================================================
question = "What do people aim to do at work?"
question_concept = "people"
choices = [
    "complete job",
    "learn from each other",
    "kill animals",
    "wear hats",
    "talk to each other"
]
keywords = extract_keywords_spacy(question)          # extracted keywords
print("Extracted keywords:", keywords)

ctx, triples, graph = knowledge_graph_pipeline(
    question, question_concept, keywords, choices
)

print("---- TOP TRIPLES ----")
for t in triples:
    print("â€¢", t)

print("\n---- CONTEXT BLOCK ----")
print(ctx)


Extracted keywords: ['people', 'aim', 'work']
---- TOP TRIPLES ----
â€¢ work MotivatedByGoal feel productive
â€¢ job RelatedTo workplace
â€¢ people UsedFor work
â€¢ office UsedFor doing job
â€¢ desk UsedFor make people think work

---- CONTEXT BLOCK ----
Question:
What do people aim to do at work?

Choices:
1. complete job
2. learn from each other
3. kill animals
4. wear hats
5. talk to each other

Relevant Knowledge:
- work MotivatedByGoal feel productive
- job RelatedTo workplace
- people UsedFor work
- office UsedFor doing job
- desk UsedFor make people think work



In [63]:
# ================================================================
# 9. Example usage
# ================================================================
question = "A revolving door is convenient for two direction travel, but it also serves as a security measure at a what?"
# question_concept = "revolving door"
question_concept = []
choices = [
    "bank",
    "library",
    "department store",
    "mall",
    "new york"
]
keywords = extract_keywords_spacy(question)

ctx, triples, graph = knowledge_graph_pipeline(
    question, question_concept, keywords, choices
)

print("---- TOP TRIPLES ----")
for t in triples:
    print("â€¢", t)

print("\n---- CONTEXT BLOCK ----")
print(ctx)


---- TOP TRIPLES ----
â€¢ door UsedFor entering or exiting area
â€¢ lock AtLocation door
â€¢ handle AtLocation door
â€¢ building AtLocation door
â€¢ safe AtLocation door with lock

---- CONTEXT BLOCK ----
Question:
A revolving door is convenient for two direction travel, but it also serves as a security measure at a what?

Choices:
1. bank
2. library
3. department store
4. mall
5. new york

Relevant Knowledge:
- door UsedFor entering or exiting area
- lock AtLocation door
- handle AtLocation door
- building AtLocation door
- safe AtLocation door with lock



In [3]:
client = Client("cstr/conceptnet_normalized")

relations = [
            'RelatedTo','IsA','PartOf','HasA','UsedFor','CapableOf','AtLocation',
            'Causes','HasSubevent','HasFirstSubevent','HasLastSubevent',
            'HasPrerequisite','HasProperty','MotivatedByGoal','ObstructedBy',
            'Desires','CreatedBy','Synonym','Antonym','DistinctFrom','DerivedFrom',
            'SymbolOf','DefinedAs','MannerOf','LocatedNear','HasContext','SimilarTo',
            'EtymologicallyRelatedTo','EtymologicallyDerivedFrom','CausesDesire',
            'MadeOf','ReceivesAction','ExternalURL','NotDesires','NotUsedFor',
            'NotCapableOf','NotHasProperty'
        ]

Loaded as API: https://cstr-conceptnet-normalized.hf.space âœ”


In [4]:
def get_conceptnet_profile(word, relations):
    result = client.predict(
        word=word,
        lang="en",
        selected_relations=relations,
        api_name="/get_semantic_profile"
    )
    return result

In [5]:
def parse_conceptnet_profile(profile_text):
    """
    Parse ConceptNet semantic profile text into a dictionary.
    
    Args:
        profile_text: String output from get_conceptnet_profile
        
    Returns:
        Dictionary with relation types as keys and list of (word, score) tuples as values.
        Only includes relations that have values, sorted by score in descending order.
    """
    result = {}
    
    # Extract the queried word from the header
    # Pattern: # ðŸ§  Semantic Profile: 'word' (EN)
    header_match = re.search(r"# ðŸ§  Semantic Profile: '([^']+)'", profile_text)
    queried_word = header_match.group(1) if header_match else None
    
    # Normalize the queried word for comparison (both space and underscore versions)
    if queried_word:
        queried_normalized_space = queried_word.replace('_', ' ')
        queried_normalized_underscore = queried_word.replace(' ', '_')
    else:
        queried_normalized_space = None
        queried_normalized_underscore = None
    
    # Split by relation headers (## RelationType)
    sections = re.split(r'## (\w+)', profile_text)
    
    # sections[0] is the header before first relation, then alternates between relation name and content
    for i in range(1, len(sections), 2):
        relation = sections[i]
        content = sections[i + 1] if i + 1 < len(sections) else ""
        
        # Temporary list for this relation
        relation_list = []
        
        # Find all relation entries
        # Pattern: - *word1* or **word1** RelationType â†’ *word2* or **word2** `[score]`
        pattern = r'-\s+(?:\*\*?([^*]+?)\*\*?)\s+\w+\s+â†’\s+(?:\*\*?([^*]+?)\*\*?)\s+`\[([0-9.]+)\]`'
        matches = re.findall(pattern, content)
        
        for match in matches:
            word1, word2, score = match
            # Remove any extra whitespace
            word1 = word1.strip()
            word2 = word2.strip()
            score = float(score)
            
            # Check if either word matches the queried word (in either format)
            def is_queried_word(word):
                if not queried_word:
                    return False
                word_space = word.replace('_', ' ')
                word_underscore = word.replace(' ', '_')
                return (word == queried_word or 
                        word_space == queried_normalized_space or 
                        word_underscore == queried_normalized_underscore)
            
            # Determine which word is NOT the queried word
            if is_queried_word(word1):
                target_word = word2
            elif is_queried_word(word2):
                target_word = word1
            else:
                # If neither matches exactly, prefer word1 (usually the related concept)
                target_word = word1
            
            relation_list.append((target_word, score))
        
        # Only add to result if there are values, and sort by score descending
        if relation_list:
            # Sort by score (second element of tuple) in descending order
            relation_list.sort(key=lambda x: x[1], reverse=True)
            result[relation] = relation_list
    
    return result

In [10]:
import networkx as nx

def add_conceptnet_node(concept, data, G):
    for relation, targets in data.items():
        for target, weight in targets:
            G.add_edge(concept, target, relation=relation, weight=weight)


def edge_score(source, target, G):
    if G.has_edge(source, target):
        return G[source][target]['weight']
    return 0


def get_conceptnet_data(concept):
    conceptnet_text = get_conceptnet_profile(concept, relations)
    conceptnet_data = parse_conceptnet_profile(conceptnet_text)
    return conceptnet_data

In [None]:
question = "A revolving door is convenient for two direction travel, but it also serves as a security measure at a what?"
question_concept = "revolving door"
choices = [
    "bank",
    "library",
    "department store",
    "mall",
    "new york"
]

G = nx.DiGraph()

rev_door_data = get_conceptnet_data(question_concept)
add_conceptnet_node(question_concept, rev_door_data, G)

for choice in choices:
    choice_data = get_conceptnet_data(choice)
    add_conceptnet_node(choice, choice_data, G)

scores = {c: edge_score(question_concept, c) for c in choices}
print(scores)

{'bank': 1.0, 'library': 0, 'department store': 1.0, 'mall': 1.0, 'new york': 0}


In [11]:
#!/usr/bin/env python3
"""
kg_csq_scoring.py

Single-file implementation:
- Build a knowledge graph from ConceptNet-style data
- Score multiple-choice candidates for a question_concept + question
- Uses semantic boosting based on embeddings (no hard-coded keywords)
- Supports direct edges and one-hop (concept -> neighbor -> choice) with decay

Dependencies:
    pip install networkx numpy scipy sentence-transformers

Usage:
    python kg_csq_scoring.py
"""

import math
import pprint
from typing import Dict, List, Tuple, Any

import networkx as nx
import numpy as np
from scipy.spatial.distance import cosine
from sentence_transformers import SentenceTransformer

# ---------------------------
# Example ConceptNet-style data (use your dataset instead)
# Replace or extend this dict with your full dataset
# Format: concept -> { relation_label: [(target, weight), ...], ... }
CONCEPTNET_DATA = {
    "revolving door": {
        "RelatedTo": [("drehtÃ¼r", 1.0), ("revolving doors", 1.0), ("tourniquet", 1.0), ("bussola", 1.0)],
        "UsedFor": [("entering building", 3.464), ("enter building", 1.0), ("exiting building", 1.0), ("getting into building", 1.0)],
        "AtLocation": [("lobby", 2.0), ("bank", 1.0), ("building", 1.0), ("department store", 1.0), ("entrance to building", 1.0), ("hotel lobby", 1.0), ("mall", 1.0)],
        "Synonym": [("drehtÃ¼r", 1.0)]
    },
    "bank": {
        "RelatedTo": [("money", 12.58), ("building", 6.957), ("account", 6.253)],
        "IsA": [("company", 1.0)],
        "UsedFor": [("storing money", 6.633), ("store money", 3.464), ("keeping money safe", 1.0)],
        "AtLocation": [("money", 7.746), ("secure place", 1.0)],
        "Synonym": [("bank", 0.5)]
    },
    "mall": {
        "RelatedTo": [("department", 1.619), ("shopping", 1.406)],
        "IsA": [("building", 1.0)],
        "UsedFor": [("shopping", 4.472)],
        "AtLocation": [("shopping arcade", 4.472), ("movie theater", 4.0)]
    },
    "library": {
        "RelatedTo": [("book", 5.0), ("building", 3.574), ("reading", 1.959)],
        "IsA": [("building", 3.464)],
        "UsedFor": [("do research", 6.633), ("borrowing books", 4.0)],
        "AtLocation": [("computers", 10.198), ("bookshelf", 7.211)]
    },
    "department store": {
        "RelatedTo": [("kaufhaus", 1.0)],
        "IsA": [("building", 1.0)],
        "UsedFor": [("shopping", 2.828)],
        "AtLocation": [("mall", 2.0)]
    },
    "new york": {
        "RelatedTo": [("city", 3.057), ("statue", 1.735)],
        "IsA": [("state", 5.292)],
        "AtLocation": [("subway station", 3.464)]
    }
}

# ---------------------------
# Utilities: embeddings and cosine similarity
class Embedder:
    def __init__(self, model_name: str = "all-MiniLM-L6-v2"):
        # small, fast SBERT model; replace if you prefer another
        self.model = SentenceTransformer(model_name)

    def embed(self, texts: List[str]) -> np.ndarray:
        return self.model.encode(texts, convert_to_numpy=True, show_progress_bar=False)

def cosine_sim(a: np.ndarray, b: np.ndarray) -> float:
    # cosine returns distance; similarity = 1 - distance
    # handle zero vectors defensively
    if a is None or b is None:
        return 0.0
    if np.all(a == 0) or np.all(b == 0):
        return 0.0
    dist = cosine(a, b)
    if math.isnan(dist):
        return 0.0
    return max(0.0, 1.0 - dist)  # clip negative tiny numerical errors

# ---------------------------
# Build graph from ConceptNet-style dict
def build_graph(conceptnet_data: Dict[str, Dict[str, List[Tuple[str, float]]]]) -> nx.MultiDiGraph:
    G = nx.MultiDiGraph()
    for source, rels in conceptnet_data.items():
        G.add_node(source)
        for rel_label, targets in rels.items():
            for target, w in targets:
                # add target node too
                G.add_node(target)
                # store relation label and weight on the edge
                G.add_edge(source, target, relation=rel_label, weight=float(w))
    return G

# ---------------------------
# Scoring components
def direct_edge_info(G: nx.MultiDiGraph, source: str, target: str) -> List[Dict[str, Any]]:
    """
    Returns list of edge data dicts for edges source->target; each dict has keys 'relation' and 'weight'
    """
    infos = []
    if G.has_edge(source, target):
        data_dict = G.get_edge_data(source, target)
        for key, d in data_dict.items():
            # in MultiDiGraph, d contains the attributes
            infos.append({"relation": d.get("relation", ""), "weight": float(d.get("weight", 0.0))})
    return infos

def neighbors_of(G: nx.MultiDiGraph, node: str) -> List[str]:
    # return unique neighbors
    return list(G.successors(node))

# ---------------------------
# Main scoring function (single-call for a question)
def score_choices_for_question(
    G: nx.MultiDiGraph,
    embedder: Embedder,
    question: str,
    question_concept: str,
    choices: List[str],
    multi_hop_decay: float = 0.6,
    hop_weight: float = 0.20,
    base_edge_weight: float = 0.40,
    semantic_edge_weight: float = 0.30,
    embedding_similarity_weight: float = 0.10
) -> Dict[str, float]:
    """
    Compute final score for each choice.

    Approach (no hard-coded keywords):
    - base_edge: sum of ConceptNet weights for direct edges question_concept -> choice
    - semantic_edge: for each direct triple, boost by similarity(question, relation_label) and similarity(question, target)
    - hop_score: for one-hop paths question_concept -> neighbor -> choice, use edge weights and similarity(question, neighbor)
    - embedding_similarity: similarity between question and choice (text-level)
    - final weighted combination
    """

    # collect all text pieces that we'll embed: question, choice strings, relation labels and neighbor labels
    texts_to_embed = [question]
    unique_texts = set(texts_to_embed)

    for c in choices:
        unique_texts.add(c)
    # relation labels and targets from edges out of question_concept and neighbors
    if question_concept in G:
        for _, target, d in G.out_edges(question_concept, data=True):
            unique_texts.add(d.get("relation", ""))
            unique_texts.add(target)

        # include neighbors' labels (for multi-hop)
        for neigh in neighbors_of(G, question_concept):
            unique_texts.add(neigh)
            for _, t2, d2 in G.out_edges(neigh, data=True):
                unique_texts.add(d2.get("relation", ""))
                unique_texts.add(t2)

    # create list and compute embeddings in batch
    embed_list = list(unique_texts)
    embeddings = embedder.embed(embed_list)
    emb_map = {txt: embeddings[i] for i, txt in enumerate(embed_list)}

    # helper to get embedding (fall back to zero vector if missing)
    def e(x: str) -> np.ndarray:
        return emb_map.get(x, np.zeros(embeddings.shape[1], dtype=float))

    question_emb = e(question)

    scores = {}
    for choice in choices:
        # 1) base_edge: sum weights of edges question_concept -> choice
        base_edge = 0.0
        semantic_edge = 0.0

        edge_infos = direct_edge_info(G, question_concept, choice)
        for info in edge_infos:
            w = float(info["weight"])
            rel_label = info["relation"]
            # semantic boosts computed from question <-> relation_label and question <-> target_text (choice)
            boost_rel = cosine_sim(question_emb, e(rel_label))
            boost_target = cosine_sim(question_emb, e(choice))
            semantic_boost = (boost_rel + boost_target) / 2.0
            base_edge += w
            semantic_edge += w * semantic_boost

        # 2) multi-hop: concept -> neighbor -> choice
        hop_score = 0.0
        for neighbor in neighbors_of(G, question_concept):
            # weights concept->neighbor, neighbor->choice (sum if multiple edges)
            w1 = sum(float(d.get("weight", 0.0)) for _, _, d in G.get_edge_data(question_concept, neighbor).items())
            if G.has_edge(neighbor, choice):
                w2 = sum(float(d.get("weight", 0.0)) for _, _, d in G.get_edge_data(neighbor, choice).items())
                # relevance of neighbor to question
                neigh_boost = cosine_sim(question_emb, e(neighbor))
                # hop contribution with decay and neighbor semantic relevance
                hop_score += (w1 + w2) * neigh_boost * multi_hop_decay

        # 3) embedding similarity between question and choice (text-level)
        emb_sim = cosine_sim(question_emb, e(choice))

        # final weighted sum
        final = (
            base_edge_weight * base_edge +
            semantic_edge_weight * semantic_edge +
            hop_weight * hop_score +
            embedding_similarity_weight * emb_sim
        )

        # small normalization: if all components zero, use small fallback based on emb_sim only
        if final == 0.0:
            final = emb_sim * 1e-3

        scores[choice] = float(final)

    return scores

# ---------------------------
# Convenience: pick best and pretty-print
def pick_best(scores: Dict[str, float]) -> Tuple[str, float]:
    if not scores:
        return "", 0.0
    best = max(scores.items(), key=lambda kv: kv[1])
    return best

# ---------------------------
# Example main demonstrating the pipeline on your provided question
def main():
    # Example question & choices
    question = "A revolving door is convenient for two direction travel, but it also serves as a security measure at a what?"
    question_concept = "revolving door"
    choices = ["bank", "library", "department store", "mall", "new york"]

    print("Building graph from ConceptNet-like data...")
    G = build_graph(CONCEPTNET_DATA)

    print("Loading embedding model (may download first time)...")
    embedder = Embedder(model_name="all-MiniLM-L6-v2")

    print("Scoring choices...")
    scores = score_choices_for_question(G, embedder, question, question_concept, choices)

    print("\nScores (higher is better):")
    pprint.pprint(scores)

    best_choice, best_score = pick_best(scores)
    print(f"\nPredicted best answer: {best_choice} (score={best_score:.6f})")

if __name__ == "__main__":
    main()


Building graph from ConceptNet-like data...
Loading embedding model (may download first time)...
Scoring choices...


ValueError: not enough values to unpack (expected 3, got 2)