In [60]:
import random
import numpy as np
import torch

def set_seed(seed: int = 42):
    """
    Set random seed for reproducibility across Python, NumPy, and PyTorch.
    """
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

    # For deterministic behavior (slower but fully reproducible)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

set_seed(42)

In [87]:
import json
import re
from tqdm import tqdm
import networkx as nx
from collections import deque
from gradio_client import Client
import re
from collections import defaultdict
from sentence_transformers import SentenceTransformer
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity


In [62]:
client = Client("cstr/conceptnet_normalized")

relations = [
            'RelatedTo','IsA','PartOf','HasA','UsedFor','CapableOf','AtLocation',
            'Causes','HasSubevent','HasFirstSubevent','HasLastSubevent',
            'HasPrerequisite','HasProperty','MotivatedByGoal','ObstructedBy',
            'Desires','CreatedBy','Synonym','Antonym','DistinctFrom','DerivedFrom',
            'SymbolOf','DefinedAs','MannerOf','LocatedNear','HasContext','SimilarTo',
            'EtymologicallyRelatedTo','EtymologicallyDerivedFrom','CausesDesire',
            'MadeOf','ReceivesAction','ExternalURL','NotDesires','NotUsedFor',
            'NotCapableOf','NotHasProperty'
        ]

Loaded as API: https://cstr-conceptnet-normalized.hf.space âœ”


In [63]:
def get_conceptnet_profile(word, relations):
    result = client.predict(
        word=word,
        lang="en",
        selected_relations=relations,
        api_name="/get_semantic_profile"
    )
    return result

In [64]:
def parse_conceptnet_profile(profile_text):
    """
    Parse ConceptNet semantic profile text into a dictionary.
    
    Args:
        profile_text: String output from get_conceptnet_profile
        
    Returns:
        Dictionary with relation types as keys and list of (word, score) tuples as values.
        Only includes relations that have values, sorted by score in descending order.
    """
    result = {}
    
    # Extract the queried word from the header
    # Pattern: # ðŸ§  Semantic Profile: 'word' (EN)
    header_match = re.search(r"# ðŸ§  Semantic Profile: '([^']+)'", profile_text)
    queried_word = header_match.group(1) if header_match else None
    
    # Normalize the queried word for comparison (both space and underscore versions)
    if queried_word:
        queried_normalized_space = queried_word.replace('_', ' ')
        queried_normalized_underscore = queried_word.replace(' ', '_')
    else:
        queried_normalized_space = None
        queried_normalized_underscore = None
    
    # Split by relation headers (## RelationType)
    sections = re.split(r'## (\w+)', profile_text)
    
    # sections[0] is the header before first relation, then alternates between relation name and content
    for i in range(1, len(sections), 2):
        relation = sections[i]
        content = sections[i + 1] if i + 1 < len(sections) else ""
        
        # Temporary list for this relation
        relation_list = []
        
        # Find all relation entries
        # Pattern: - *word1* or **word1** RelationType â†’ *word2* or **word2** `[score]`
        pattern = r'-\s+(?:\*\*?([^*]+?)\*\*?)\s+\w+\s+â†’\s+(?:\*\*?([^*]+?)\*\*?)\s+`\[([0-9.]+)\]`'
        matches = re.findall(pattern, content)
        
        for match in matches:
            word1, word2, score = match
            # Remove any extra whitespace
            word1 = word1.strip()
            word2 = word2.strip()
            score = float(score)
            
            # Check if either word matches the queried word (in either format)
            def is_queried_word(word):
                if not queried_word:
                    return False
                word_space = word.replace('_', ' ')
                word_underscore = word.replace(' ', '_')
                return (word == queried_word or 
                        word_space == queried_normalized_space or 
                        word_underscore == queried_normalized_underscore)
            
            # Determine which word is NOT the queried word
            if is_queried_word(word1):
                target_word = word2
            elif is_queried_word(word2):
                target_word = word1
            else:
                # If neither matches exactly, prefer word1 (usually the related concept)
                target_word = word1
            
            relation_list.append((target_word, score))
        
        # Only add to result if there are values, and sort by score descending
        if relation_list:
            # Sort by score (second element of tuple) in descending order
            relation_list.sort(key=lambda x: x[1], reverse=True)
            result[relation] = relation_list
    
    return result

In [65]:
conceptnet_text = get_conceptnet_profile("revolving door", relations)
conceptnet_data = parse_conceptnet_profile(conceptnet_text)
conceptnet_data

{'RelatedTo': [('drehtÃ¼r', 1.0),
  ('revolving doors', 1.0),
  ('tourniquet', 1.0),
  ('bussola', 1.0)],
 'UsedFor': [('entering building', 3.464),
  ('enter building', 1.0),
  ('exiting building', 1.0),
  ('getting into building', 1.0)],
 'AtLocation': [('lobby', 2.0),
  ('bank', 1.0),
  ('building', 1.0),
  ('department store', 1.0),
  ('entrance to building', 1.0),
  ('hotel lobby', 1.0),
  ('mall', 1.0)],
 'Synonym': [('drehtÃ¼r', 1.0)]}

In [193]:
conceptnet_text = get_conceptnet_profile("bank", relations)
conceptnet_data = parse_conceptnet_profile(conceptnet_text)
conceptnet_data

{'RelatedTo': [('money', 12.58),
  ('building', 6.957),
  ('account', 6.253),
  ('institution', 5.629),
  ('place', 4.756),
  ('financial', 4.107),
  ('vault', 3.958),
  ('store', 3.862),
  ('robbery', 3.045),
  ('interest', 2.294),
  ('branch', 2.174),
  ('vault', 2.038),
  ('safe', 1.746),
  ('Ø¨Ù†Ùƒ', 1.0)],
 'IsA': [('company', 1.0),
  ('where people keep money', 1.0),
  ('find downtown', 1.0)],
 'UsedFor': [('storing money', 6.633),
  ('store money', 3.464),
  ('save money', 2.0),
  ('charging interest', 1.0),
  ('deposit or withdraw money', 1.0),
  ('depositing money', 1.0),
  ('keeping money safe', 1.0)],
 'CapableOf': [('store money', 3.464),
  ('charge interest', 2.0),
  ('advance loan', 1.0),
  ('back loan', 1.0),
  ('demand customer repays money', 1.0),
  ('increase interest rates', 1.0),
  ('keep money', 1.0)],
 'AtLocation': [('money', 7.746),
  ('account book', 4.0),
  ('checkbook holder', 3.464),
  ('cheque', 2.828),
  ('coin', 2.828),
  ('cubicle', 2.828),
  ('dollar', 

In [212]:
conceptnet_text = get_conceptnet_profile("mall", relations)
conceptnet_data = parse_conceptnet_profile(conceptnet_text)
conceptnet_data

{'RelatedTo': [('department', 1.619),
  ('shopping', 1.406),
  ('intercept', 1.0),
  ('lease line', 1.0),
  ('mall walker', 1.0),
  ('mall walking', 1.0),
  ('malled', 1.0),
  ('mallgoer', 1.0),
  ('stores', 0.541),
  ('centre', 0.506),
  ('shopping centre', 0.506),
  ('shops', 0.478),
  ('center', 0.346),
  ('shopping center', 0.346)],
 'IsA': [('building', 1.0), ('place for shopping', 1.0)],
 'HasA': [('many shops', 2.0),
  ('many bright lights outside', 1.0),
  ('many different stores in', 1.0)],
 'UsedFor': [('shopping', 4.472),
  ('hanging out at', 2.0),
  ('meeting friends', 2.0),
  ('selling stuff', 2.0),
  ('buy clothes', 1.0),
  ('community building', 1.0),
  ('concentrated foot traffic', 1.0)],
 'AtLocation': [('shopping arcade', 4.472),
  ('escalator', 4.0),
  ('health food store', 4.0),
  ('movie theater', 4.0),
  ('bookshop', 3.464),
  ('ficus', 2.828),
  ('hairdressing salon', 2.828),
  ('city', 2.0),
  ('suburbs', 2.0),
  ('town', 2.0),
  ('american suburbia', 1.0),
  ('

In [213]:
conceptnet_text = get_conceptnet_profile("library", relations)
conceptnet_data = parse_conceptnet_profile(conceptnet_text)
conceptnet_data

{'RelatedTo': [('book', 5.0),
  ('books', 4.802),
  ('book', 3.586),
  ('building', 3.574),
  ('place', 2.826),
  ('athenaeum', 2.0),
  ('reading', 1.959),
  ('many books', 1.53),
  ('due', 1.438),
  ('book building', 1.336),
  ('house', 1.128),
  ('Ù…ÙƒØªØ¨Ø©', 1.0),
  ('bibliothek', 1.0),
  ('bÃ¼cherei', 1.0)],
 'IsA': [('building', 3.464),
  ('place', 2.0),
  ('place of wonder', 2.0),
  ('archive', 1.0),
  ('big storage of information', 1.0),
  ('collection of documents', 1.0),
  ('place opens and closes', 1.0)],
 'PartOf': [('book', 7.211)],
 'HasA': [('books', 4.0),
  ('lots of books', 2.0),
  ('books for loan', 1.0),
  ('librarian', 1.0),
  ('literature', 1.0),
  ('many books and magazines', 1.0),
  ('many shelves', 1.0)],
 'UsedFor': [('do research', 6.633),
  ('borrowing books', 4.0),
  ('reading', 2.828),
  ('storing books', 2.828),
  ('finding books to read', 2.0),
  ('research', 2.0),
  ('borrow books', 1.0)],
 'CapableOf': [('allow books to borrowed', 1.0),
  ('fine borrowe

In [214]:
conceptnet_text = get_conceptnet_profile("department store", relations)
conceptnet_data = parse_conceptnet_profile(conceptnet_text)
conceptnet_data

{'RelatedTo': [('Ù…ØªØ¬Ø±', 1.0),
  ('Ù…Ø®Ø²Ù†', 1.0),
  ('kaufhaus', 1.0),
  ('warenhaus', 1.0),
  ("bloomie's", 1.0),
  ('bridal registry', 1.0),
  ('catalog', 1.0),
  ('gum', 0.5)],
 'IsA': [('building', 1.0),
  ('large structure', 1.0),
  ('place', 1.0),
  ('point of purchase', 1.0)],
 'HasA': [('multiple sales departments', 1.0)],
 'UsedFor': [('shopping', 2.828),
  ('buying clothes', 2.0),
  ('anchor mall', 1.0),
  ('browsing', 1.0),
  ('buy all sorts of things', 1.0),
  ('buy clothes', 1.0),
  ('buy items', 1.0)],
 'AtLocation': [('escalator', 2.828),
  ('sale', 2.828),
  ('city', 2.0),
  ('mall', 2.0),
  ('changing room', 2.0),
  ('dressing room', 2.0),
  ('fitting room', 2.0),
  ('mouse', 2.0),
  ('shoulder bag', 2.0),
  ('big city', 1.0),
  ('idaho', 1.0),
  ('new york', 1.0),
  ('shopping mall', 1.0)],
 'HasProperty': [('big', 1.0)],
 'Synonym': [('kaufhaus', 1.0),
  ('warenhaus', 1.0),
  ('grand magasin', 1.0),
  ('department store', 0.5),
  ('kaufhaus', 0.5),
  ('departmen

In [215]:
conceptnet_text = get_conceptnet_profile("new york", relations)
conceptnet_data = parse_conceptnet_profile(conceptnet_text)
conceptnet_data

{'RelatedTo': [('city', 3.057),
  ('statue', 1.735),
  ('liberty', 1.344),
  ('bronx', 1.0),
  ('brooklyn', 1.0),
  ('kings county', 1.0),
  ('manhattan', 1.0),
  ('queens', 1.0),
  ('staten island', 1.0),
  ('wall street', 1.0),
  ('adrian', 1.0),
  ('afton', 1.0),
  ('akron', 1.0),
  ('albany', 1.0)],
 'IsA': [('state', 5.292),
  ('big city in america', 1.0),
  ('city', 1.0),
  ('city and state', 1.0),
  ('city in new york', 1.0),
  ('city in new york state', 1.0),
  ('in united states', 1.0),
  ('largest city in new york', 1.0)],
 'PartOf': [('united states', 1.0), ('long island', 1.0)],
 'HasA': [('many tall buildings', 2.0),
  ('beaches', 1.0),
  ('many museums', 1.0)],
 'AtLocation': [('kosher deli', 4.0),
  ('chinese restaurant', 3.464),
  ('subway station', 3.464),
  ('apple tree', 2.828),
  ('kosher restaurant', 2.828),
  ('mouse', 2.828),
  ('advertising panel', 1.0)],
 'Synonym': [('new york', 0.5)],
 'DerivedFrom': [('jew york', 1.0),
  ('new yorker', 1.0),
  ('new yorkese'

In [None]:
# concept net ranker:

# fetch relations from question_concept
# fetch relations from each choice
# rank all the relations from the question_concept to each relations from each choice
# display topk relations

In [None]:
class ConceptNetRanker:
    def __init__(self, model_name="all-MiniLM-L6-v2"):
        self.model = SentenceTransformer(model_name)

    def _cosine(self, a, b):
        return float(cosine_similarity(a.reshape(1, -1), b.reshape(1, -1))[0][0])

    def build_choice_to_cn_mapping(self, choices, concept, conceptnet_data):
        """
        Build a mapping from each choice to associated ConceptNet relation strings.
        """
        mapping = {c: [] for c in choices}

        # Build all CN relation strings
        for rel, items in conceptnet_data.items():
            for item in items:
                if concept.lower() in choices:
                    mapping[concept].append(f"{concept} {rel} {item[0]}")
                elif (item[0].lower() or item[0].lower().replace(" ", "_")) in choices:
                    mapping[item[0].lower()].append(f"{concept} is {rel} {item[0]}")
                # else:
                #     print(f"Unmapped CN relation: {concept} {rel} {item[0]}")
        # print(f"Choice to CN mapping: {mapping}")
        return mapping

    def build_conceptnet_strings(self, concept, conceptnet_data):
        """
        Convert ConceptNet relations into natural-language strings.
        Example: "revolving door AtLocation bank"
        """
        relation_texts = []
        for relation, targets in conceptnet_data.items():
            for t in targets:
                t = t[0] if isinstance(t, tuple) else t  # handle tuple (string,)
                relation_texts.append(f"{concept} {relation} {t}")
        return relation_texts

    def order_cn_relations_by_rank(self, ranked, choice_to_cn):
        """
        Reorder CN relations according to choice ranking.
        """
        ordered = {}

        for choice, score in ranked:
            ordered[choice] = choice_to_cn.get(choice, [])

        return ordered

    def score(
        self,
        question: str,
        choices: list,
        concept: str,
        conceptnet_data: dict,
        w_question_choice=0.4,
        w_choice_conceptnet=0.6,
    ):
        """
        Compute combined similarity score:
        score = w1 * sim(Question, Choice) + w2 * sim(Choice, ConceptNet neighborhood)
        """
        choices = [c.lower() for c in choices]
        question = question.lower()

        # --- Step 1: encode embeddings ---
        emb_question = self.model.encode(question)
        emb_choices = self.model.encode(choices)

        # Build ConceptNet relation sentences
        cn_strings = self.build_conceptnet_strings(concept, conceptnet_data)
        emb_cn = self.model.encode(cn_strings)

        # choice to cn mapping
        choice_to_cn = self.build_choice_to_cn_mapping(choices, concept, conceptnet_data)

        # --- Step 2: compute similarity ---

        # A. Question â†’ Choice similarity
        sim_QC = [self._cosine(emb_question, ec) for ec in emb_choices]

        # B. Choice â†’ ConceptNet (mean similarity to all CN relation strings)
        sim_choice_cn = []
        for ec in emb_choices:
            sims = cosine_similarity([ec], emb_cn)[0]
            sim_choice_cn.append(float(np.mean(sims)))
        
        # --- Step 3: combine weighted scores ---
        final_scores = []
        for s1, s2 in zip(sim_QC, sim_choice_cn):
            final_scores.append(float(w_question_choice * s1 + w_choice_conceptnet * s2))

        # Return choices with scores
        ranked = sorted(zip(choices, final_scores), key=lambda x: x[1], reverse=True)

        ordered_cn_relations = self.order_cn_relations_by_rank(ranked, choice_to_cn)

        for choice, score in ranked:
            

        return ranked, {
            "sim_question_choice": sim_QC,
            "sim_choice_conceptnet": sim_choice_cn,
            "scores": final_scores,
            "cn_relations_for_ranked_choices": ordered_cn_relations
        }


In [196]:
# conceptnet_data = {
#     'RelatedTo': [('drehtÃ¼r'), ('revolving doors'), ('tourniquet'), ('bussola')],
#     'UsedFor': [('entering building'), ('enter building'), 
#                 ('exiting building'), ('getting into building')],
#     'AtLocation': [('lobby'), ('bank'), ('building'), 
#                    ('department store'), ('entrance to building'), 
#                    ('hotel lobby'), ('mall')],
#     'Synonym': [('drehtÃ¼r')]
# }
conceptnet_data = {'RelatedTo': [('drehtÃ¼r', 1.0),
  ('revolving doors', 1.0),
  ('tourniquet', 1.0),
  ('bussola', 1.0)],
 'UsedFor': [('entering building', 3.464),
  ('enter building', 1.0),
  ('exiting building', 1.0),
  ('getting into building', 1.0)],
 'AtLocation': [('lobby', 2.0),
  ('bank', 1.0),
  ('building', 1.0),
  ('department store', 1.0),
  ('entrance to building', 1.0),
  ('hotel lobby', 1.0),
  ('mall', 1.0)],
 'Synonym': [('drehtÃ¼r', 1.0)]}

question = "A revolving door is convenient for two direction travel, but it also serves as a security measure at a what?"
concept = "revolving door"
choices = [
    "bank",
    "library",
    "department store",
    "mall",
    "new york"
]

ranker = ConceptNetRanker()
ranked, debug = ranker.score(question, choices, concept, conceptnet_data)

print("Ranking:")
for c, s in ranked:
    print(f"{c:20}  score={s:.4f}")


Ranking:
bank                  score=0.1475
mall                  score=0.1400
department store      score=0.1208
library               score=0.0611
new york              score=0.0515


In [207]:
ranked, debug

([('bank', 0.14749882221221924),
  ('mall', 0.1399820104241371),
  ('department store', 0.12084722220897674),
  ('library', 0.061140642315149304),
  ('new york', 0.0514893963932991)],
 {'sim_question_choice': [0.12540851533412933,
   0.019575992599129677,
   0.051216669380664825,
   0.039423618465662,
   0.035584140568971634],
  'sim_choice_conceptnet': [0.16222569346427917,
   0.08885040879249573,
   0.1672675907611847,
   0.20702093839645386,
   0.06209290027618408],
  'scores': [0.14749882221221924,
   0.061140642315149304,
   0.12084722220897674,
   0.1399820104241371,
   0.0514893963932991],
  'cn_relations_for_ranked_choices': {'bank': ['revolving door is AtLocation bank'],
   'mall': ['revolving door is AtLocation mall'],
   'department store': ['revolving door is AtLocation department store'],
   'library': [],
   'new york': []}})

In [206]:
def create_conceptnet_strings(concept, conceptnet_data):
    """
    Convert ConceptNet relations into natural-language strings.
    Example: "revolving door AtLocation bank"
    """
    relation_texts = []
    for relation, targets in conceptnet_data.items():
        for t in targets:
            t = t[0] if isinstance(t, tuple) else t  # handle tuple (string,)
            relation_texts.append(f"{concept} {relation} {t}")
    return relation_texts

In [208]:
ranked[:2]

[('bank', 0.14749882221221924), ('mall', 0.1399820104241371)]

In [211]:
get_conceptnet_profile("bank", relations)
conceptnet_data = parse_conceptnet_profile(conceptnet_text)
create_conceptnet_strings("bank", conceptnet_data)

['bank RelatedTo money',
 'bank RelatedTo building',
 'bank RelatedTo account',
 'bank RelatedTo institution',
 'bank RelatedTo place',
 'bank RelatedTo financial',
 'bank RelatedTo vault',
 'bank RelatedTo store',
 'bank RelatedTo robbery',
 'bank RelatedTo interest',
 'bank RelatedTo branch',
 'bank RelatedTo vault',
 'bank RelatedTo safe',
 'bank RelatedTo Ø¨Ù†Ùƒ',
 'bank IsA company',
 'bank IsA where people keep money',
 'bank IsA find downtown',
 'bank UsedFor storing money',
 'bank UsedFor store money',
 'bank UsedFor save money',
 'bank UsedFor charging interest',
 'bank UsedFor deposit or withdraw money',
 'bank UsedFor depositing money',
 'bank UsedFor keeping money safe',
 'bank CapableOf store money',
 'bank CapableOf charge interest',
 'bank CapableOf advance loan',
 'bank CapableOf back loan',
 'bank CapableOf demand customer repays money',
 'bank CapableOf increase interest rates',
 'bank CapableOf keep money',
 'bank AtLocation money',
 'bank AtLocation account book',
 

In [217]:
conceptnet_data = {'RelatedTo': [('crowd', 13.576), ('person', 5.699), ('population', 5.451), ('party', 4.829), ('children', 4.031), ('family', 3.988), ('city', 3.731), ('person', 3.479), ('humans', 2.585), ('human', 2.391)], 'IsA': [('bisexuals', 4.899), ('criminals', 4.472), ('persons', 4.0), ('astronauts', 4.0), ('atheists', 4.0), ('many persons', 3.464), ('actors', 3.464), ('scouts', 3.464), ('democrats', 2.828), ('janitors', 2.828), ('americans', 2.828), ('christians', 2.828)], 'PartOf': [('head', 2.0)], 'HasA': [('feelings', 6.0), ('dogs as pets', 5.292), ('emotions', 4.0), ('brown hair', 3.464), ('cat for pet', 3.464), ('drug problem', 3.464), ('five fingers on each hand', 3.464), ('nation', 1.0)], 'UsedFor': [('work', 2.828), ('political ideology', 2.0)], 'CapableOf': [('talk to each other', 6.633), ('believe in god', 4.899), ('catch colds', 4.472), ('forget things', 4.472), ('learn from each other', 4.472), ('pay bills', 4.472), ('taste food', 4.472)], 'AtLocation': [('apartment', 4.472), ('building', 4.472), ('opera', 4.472), ('supermarket', 3.464), ('town', 3.464), ('train station', 3.464), ('conference', 2.828), ('ignorance', 1.0), ('parking area', 1.0), ('reception area', 1.0), ('shopping center', 1.0), ('sidewalk', 1.0), ('skin', 1.0), ('stupidity', 1.0)], 'HasPrerequisite': [('celebrating', 1.0), ('having party', 1.0), ('playing sport', 1.0)], 'HasProperty': [('stupid', 7.211), ('ignorant', 5.292), ('weird', 5.292), ('kind', 4.899), ('cruel', 4.472), ('human', 4.472), ('mean', 4.472), ('jews', 1.0)], 'Desires': [('compete', 4.0), ('dance', 3.464), ('give orders', 3.464), ('read books', 3.464), ('read stories', 3.464), ('play games', 2.828)], 'CreatedBy': [('machine', 1.0), ('mistake', 1.0), ('television', 1.0)], 'DerivedFrom': [('antipeople', 1.0), ('committeepeople', 1.0), ('depeople', 1.0), ('dispeople', 1.0), ('empeople', 1.0), ('meeple', 1.0), ('merpeople', 1.0)], 'HasContext': [('Ø£Ù‡Ù…Ù„', 1.0), ('exot', 1.0), ('sequacious', 1.0), ('frisar', 1.0), ('aggiogare', 1.0), ('praticare', 1.0)], 'EtymologicallyDerivedFrom': [('people', 1.0)], 'MadeOf': [('club', 1.0), ('congregation', 1.0), ('crowd', 1.0), ('cult', 1.0), ('egalitarian society', 1.0), ('mob', 1.0), ('organization', 1.0)], 'ReceivesAction': [('killed', 5.292), ('born', 4.0), ('cremated', 2.828)], 'NotCapableOf': [('keep pet', 3.464)]}

question = "What do people aim to do at work?"
concept = "people"
choices = ['complete job',
    'learn from each other',
    'kill animals',
    'wear hats',
    'talk to each other']

ranker = ConceptNetRanker()
ranked, debug = ranker.score(question, choices, concept, conceptnet_data)

print("Ranking:")
for c, s in ranked:
    print(f"{c:20}  score={s:.4f}")


Ranking:
complete job          score=0.2021
wear hats             score=0.1795
kill animals          score=0.1668
learn from each other  score=0.1460
talk to each other    score=0.0808


In [218]:
debug

{'sim_question_choice': [0.32941436767578125,
  0.11427511274814606,
  0.2179032564163208,
  0.2148442417383194,
  0.07249058783054352],
 'sim_choice_conceptnet': [0.1172032430768013,
  0.16712239384651184,
  0.13265670835971832,
  0.15600620210170746,
  0.08630145341157913],
 'scores': [0.20208769291639328,
  0.14598348140716552,
  0.1667553275823593,
  0.17954141795635223,
  0.08077710717916489],
 'cn_relations_for_ranked_choices': {'complete job': [],
  'wear hats': [],
  'kill animals': [],
  'learn from each other': ['people is CapableOf learn from each other'],
  'talk to each other': ['people is CapableOf talk to each other']}}

In [None]:
prompt = """
Question: {question}
Choices: {choices}
These are the high ranked choices:
{additional_context}
Use commonsense and reasoning ability along with the context provided to pick the correct answer from the list of choices. Give only the single correct answer. Do not include any reasoning, explanation, or extra text.
Answer (just one choice):
""".strip()
prompt.format(
    question=question,
    choices=", ".join(choices),
    additional_context="\n".join(debug["cn_relations_for_ranked_choices"][ranked[0][0]]
)

In [216]:
dataset_path = "data/commonsenseqa_validation.json"

with open(dataset_path, "r") as fp:
    csqa_dataset = json.load(fp)

len(csqa_dataset)

1221

In [159]:
csqa_dataset[0]

{'id': '1afa02df02c908a558b4036e80242fac',
 'question': 'A revolving door is convenient for two direction travel, but it also serves as a security measure at a what?',
 'question_concept': 'revolving door',
 'choices': {'label': ['A', 'B', 'C', 'D', 'E'],
  'text': ['bank', 'library', 'department store', 'mall', 'new york']},
 'answerKey': 'A'}

In [160]:
def map_answer_key_to_index(answer_key):
    """
    Map answer key letters (A, B, C, D, E) to corresponding indices (0, 1, 2, 3, 4).
    """
    return ord(answer_key.upper()) - ord('A')

def get_correct_answer_text(data_point):
    """
    Get the correct answer text from a data point using the answer key.
    """
    answer_key = data_point['answerKey']
    answer_index = map_answer_key_to_index(answer_key)
    return data_point['choices']['text'][answer_index]

# Test with the first data point
test_data_point = csqa_dataset[0]
print(f"Answer key: {test_data_point['answerKey']}")
print(f"Answer index: {map_answer_key_to_index(test_data_point['answerKey'])}")
print(f"Correct answer: {get_correct_answer_text(test_data_point)}")


Answer key: A
Answer index: 0
Correct answer: bank


In [166]:
knowledge_augment_csqa_val = []
first_option_correct = 0
second_option_correct = 0

for data_point in tqdm(csqa_dataset):
    id = data_point['id']
    question = data_point['question']
    choices = data_point['choices']['text']
    concept = data_point['question_concept']
    correct_answer = get_correct_answer_text(data_point)

    conceptnet_text = get_conceptnet_profile(concept, relations)
    conceptnet_data = parse_conceptnet_profile(conceptnet_text)
    
    ranked, debug = ranker.score(question, choices, concept, conceptnet_data)
        
    for idx, (c, s) in enumerate(ranked):
        if c == correct_answer and idx == 0:
            first_option_correct += 1
        elif c == correct_answer and idx == 1:
            second_option_correct += 1
        
        if idx == 1:
            break
    
    knowledge_augment_csqa_val.append({
        "id": data_point['id'],
        "question": data_point['question'],
        "question_concept": data_point['question_concept'],
        "choices": data_point['choices'],
        "answerKey": data_point['answerKey'],
        "correct_answer": correct_answer,
        "ranked_choices": ranked,
        "debug": debug
    })
    # break 

100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 1221/1221 [49:36<00:00,  2.44s/it]


In [171]:
first_option_correct, second_option_correct

(407, 281)

In [181]:
len(knowledge_augment_csqa_val)

1221

In [None]:
# new_knowledge_augment_csqa_val = []
# for ka, orig_data_point in zip(knowledge_augment_csqa_val, csqa_dataset):
#     new_knowledge_augment_csqa_val.append({
#         "id": orig_data_point['id'],
#         "question": orig_data_point['question'],
#         "question_concept": orig_data_point['question_concept'],
#         "choices": orig_data_point['choices'],
#         "answerKey": orig_data_point['answerKey'],
#         "correct_answer": ka['correct_answer'],
#         "ranked_choices": ka['ranked_choices'],
#         "debug": ka['debug']
#     })

In [185]:
import pickle

with open("data/knowledge_augment_csqa_val.pkl", "wb") as fp:
    pickle.dump(knowledge_augment_csqa_val, fp)

In [184]:
knowledge_augment_csqa_val[1]

{'id': 'a7ab086045575bb497933726e4e6ad28',
 'question': 'What do people aim to do at work?',
 'question_concept': 'people',
 'choices': {'label': ['A', 'B', 'C', 'D', 'E'],
  'text': ['complete job',
   'learn from each other',
   'kill animals',
   'wear hats',
   'talk to each other']},
 'answerKey': 'A',
 'correct_answer': 'complete job',
 'ranked_choices': [('complete job', 0.20208769291639328),
  ('wear hats', 0.17954141795635223),
  ('kill animals', 0.1667553275823593),
  ('learn from each other', 0.14598348140716552),
  ('talk to each other', 0.08077710717916489)],
 'debug': {'sim_question_choice': [0.32941436767578125,
   0.11427511274814606,
   0.2179032564163208,
   0.2148442417383194,
   0.07249058783054352],
  'sim_choice_conceptnet': [0.1172032430768013,
   0.16712239384651184,
   0.13265670835971832,
   0.15600620210170746,
   0.08630145341157913],
  'scores': [0.20208769291639328,
   0.14598348140716552,
   0.1667553275823593,
   0.17954141795635223,
   0.080777107179164

In [143]:
for data_point in knowledge_augment_csqa_val:
    question = data_point['question']
    choices = data_point['choices']
    ranked_choices = data_point['ranked_choices']
    additional_context = ""
    for c, s in ranked_choices[: 2]:
        relation_text = data_point["debug"]["cn_relations_for_ranked_choices"][c]
        data_point["debug"]["cn_relations_for_ranked_choices"][c]
        if relation_text:
            additional_context += " ".join(relation_text) + "\n"
    break

In [144]:
question, choices, additional_context

('A revolving door is convenient for two direction travel, but it also serves as a security measure at a what?',
 ['bank', 'library', 'department store', 'mall', 'new york'],
 'revolving door AtLocation bank\nrevolving door AtLocation mall\n')

New mechanism

In [204]:
from sentence_transformers import SentenceTransformer, util
import numpy as np

class ConceptNetRanker:
    def __init__(self, conceptnet_store):
        self.store = conceptnet_store
        self.model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

    def get_relations(self, concept):
        return self.store.get(concept.lower(), {})

    def build_relation_text(self, concept, relations):
        texts = []
        for rel, items in relations.items():
            for it in items:
                texts.append(f"{concept} {rel} {it}")
        return texts

    def embed(self, texts):
        return self.model.encode(texts, convert_to_tensor=True)

    def rank_choices(self, question, question_concept, choices):
        # 1. get question concept relations
        q_rels = self.get_relations(question_concept)
        q_rel_texts = self.build_relation_text(question_concept, q_rels)

        # embeddings for question relations
        q_rel_emb = self.embed(q_rel_texts) if q_rel_texts else None

        # embeddings for question sentence
        q_emb = self.embed([question])[0]

        results = {}

        for ch in choices:
            ch_rels = self.get_relations(ch)
            ch_rel_texts = self.build_relation_text(ch, ch_rels)

            if not ch_rel_texts:
                results[ch] = 0
                continue

            ch_emb = self.embed(ch_rel_texts)

            scores = []

            # 2. similarity: question relations â†’ choice relations
            if q_rel_emb is not None:
                sim_matrix = util.cos_sim(q_rel_emb, ch_emb)
                scores.append(sim_matrix.mean().item())

            # 3. similarity: question sentence â†’ choice relations
            sim_question = util.cos_sim(q_emb, ch_emb).mean().item()
            scores.append(sim_question)

            # 4. average score per choice
            results[ch] = float(np.mean(scores))

        # 5. rank choices descending
        ranked = sorted(results.items(), key=lambda x: x[1], reverse=True)
        return ranked


In [203]:
conceptnet_data = {
    'revolving door': {
        'RelatedTo': ['drehtÃ¼r', 'revolving doors', 'tourniquet', 'bussola'],
        'UsedFor': ['entering building', 'enter building', 'exiting building', 'getting into building'],
        'AtLocation': ['lobby', 'bank', 'building', 'department store', 'entrance to building', 'hotel lobby', 'mall'],
        'Synonym': ['drehtÃ¼r']
    },
    'bank': {
        'AtLocation': ['building', 'lobby'],
        'UsedFor': ['security'],
    },
    'library': {},
    'department store': {
        'AtLocation': ['mall']
    },
    'mall': {
        'AtLocation': ['entrance to building']
    },
    'new york': {}
}

question = "A revolving door is convenient for two direction travel, but it also serves as a security measure at a what?"
question_concept = "revolving door"
choices = ["bank", "library", "department store", "mall", "new york"]

ranker = ConceptNetRanker(conceptnet_data)

global_top = ranker.rank_all_relations(
    question_concept="revolving door",
    choices=choices,
    top_k=10
)

for score, q_rel, value, choice, c_rel in global_top:
    print(f"Score={score} | Q:{q_rel} â†’ {value} | Choice={choice} ({c_rel})")


Score=3 | Q:AtLocation â†’ building | Choice=bank (AtLocation)
Score=3 | Q:AtLocation â†’ lobby | Choice=bank (AtLocation)
Score=3 | Q:AtLocation â†’ mall | Choice=department store (AtLocation)
Score=3 | Q:AtLocation â†’ entrance to building | Choice=mall (AtLocation)
Score=2 | Q:UsedFor â†’ RELATION_MATCH | Choice=bank (UsedFor)
Score=2 | Q:AtLocation â†’ RELATION_MATCH | Choice=bank (AtLocation)
Score=2 | Q:AtLocation â†’ RELATION_MATCH | Choice=department store (AtLocation)
Score=2 | Q:AtLocation â†’ RELATION_MATCH | Choice=mall (AtLocation)


In [220]:
import pickle

with open("data/knowledge_augment_csqa_val.pkl", "rb") as fp:
    try_data = pickle.load(fp)

In [222]:
try_data[1]


{'id': 'a7ab086045575bb497933726e4e6ad28',
 'question': 'What do people aim to do at work?',
 'question_concept': 'people',
 'choices': {'label': ['A', 'B', 'C', 'D', 'E'],
  'text': ['complete job',
   'learn from each other',
   'kill animals',
   'wear hats',
   'talk to each other']},
 'answerKey': 'A',
 'correct_answer': 'complete job',
 'ranked_choices': [('complete job', 0.20208769291639328),
  ('wear hats', 0.17954141795635223),
  ('kill animals', 0.1667553275823593),
  ('learn from each other', 0.14598348140716552),
  ('talk to each other', 0.08077710717916489)],
 'debug': {'sim_question_choice': [0.32941436767578125,
   0.11427511274814606,
   0.2179032564163208,
   0.2148442417383194,
   0.07249058783054352],
  'sim_choice_conceptnet': [0.1172032430768013,
   0.16712239384651184,
   0.13265670835971832,
   0.15600620210170746,
   0.08630145341157913],
  'scores': [0.20208769291639328,
   0.14598348140716552,
   0.1667553275823593,
   0.17954141795635223,
   0.080777107179164