In [None]:
!pip install sparqlwrapper langchain langchain-core langchain-ollama

In [None]:
# Basic libraries
import requests
import pandas as pd
import os
import json
import re
import csv
import io

# Advanced libraries
import chromadb
from chromadb.utils.embedding_functions import SentenceTransformerEmbeddingFunction
from langchain_ollama import OllamaLLM
from langchain_core.prompts import PromptTemplate
from SPARQLWrapper import SPARQLWrapper, JSON
from utils import entity_link, extract_entity_property

In [None]:
# Vector DB set up: open (or build on first run) a persistent Chroma index of
# Wikidata properties that search_similar_properties() queries later on.
CHROMA_DB_PATH = "./wikidata_properties_vector_db"
os.makedirs(CHROMA_DB_PATH, exist_ok=True)

# Must contain the columns: ID, Label, Description, Data Type
df = pd.read_csv("all-wikidata-properties_v2025.05.27.csv")

df["Data Type"] = df["Data Type"].str.strip()
df = df[df["Data Type"].isin(["Q", "WI"])] # Q = Quantity, WI = WikibaseItem

# Create embedding function using SentenceTransformers
embedding_fn = SentenceTransformerEmbeddingFunction(model_name="all-MiniLM-L6-v2")

# Initialize Chroma collection (a single client is enough for the whole notebook)
client = chromadb.PersistentClient(path=CHROMA_DB_PATH)
collection = client.get_or_create_collection(
    name="wikidata_properties",
    embedding_function=embedding_fn
)

if collection.count() == 0:
    print("Collection is empty. Starting indexing...")

    documents, metadatas, ids = [], [], []
    for idx, row in df.iterrows():
        label = row["Label"]
        pid = row["ID"]
        desc = row.get("Description", "")

        # Empty CSV cells are read as NaN; index them as empty text instead of
        # embedding / storing the literal string "nan".
        if pd.isna(label):
            label = ""
        if pd.isna(desc):
            desc = ""

        documents.append(f"{label}. {desc}".strip())
        metadatas.append({"id": pid, "label": label, "description": desc})
        ids.append(str(idx)) # must be unique; the DataFrame index is reused

    # Add in batches: one call per row is slow, and a single giant call can
    # exceed the backend's maximum batch size.
    batch_size = 500
    for start in range(0, len(ids), batch_size):
        stop = start + batch_size
        collection.add(
            documents=documents[start:stop],
            metadatas=metadatas[start:stop],
            ids=ids[start:stop]
        )

    print("Indexing complete.")
    print(f"Your vector database is now saved at: {os.path.abspath(CHROMA_DB_PATH)}")
else:
    print(f"Collection '{collection.name}' already contains {collection.count()} items.")
    print(f"Your vector database is located at: {os.path.abspath(CHROMA_DB_PATH)}")

In [None]:
# Optional: Can add also property description
def search_similar_properties(question, collection, top_k=5):
    """
    Look up the indexed properties closest to a natural language question.

    The question is sent as the only query text to the given Chroma
    collection; the metadata of the top_k hits is rendered one bullet per
    line in the format: - `label`: id

    Returns the bullets joined with newlines (an empty string if no hits).
    """
    hits = collection.query(query_texts=[question], n_results=top_k)

    # A single query text was sent, so only the first metadata list matters.
    bullets = (
        "- `{}`: {}".format(entry["label"], entry["id"])
        for entry in hits["metadatas"][0]
    )
    return "\n".join(bullets)

In [None]:
# Text-to-SPARQL question answering
def answer_question(question, llm, qid=None):
    """
    Answer a natural language question by generating and running a SPARQL query.

    Steps: extract the entity name from the question, map it to a Wikidata QID
    (unless one is passed in), retrieve the 3 most similar properties from the
    module-level Chroma `collection`, let the LLM write a SPARQL query from an
    ontology-grounded few-shot prompt, and run it on the public Wikidata
    endpoint. The intermediate steps and the final query are printed.

    Args:
        question: Natural language question.
        llm: LLM used for entity extraction and query generation; it is piped
            after a PromptTemplate and its output is treated as a string.
        qid: Optional QID of the entity of interest; when None it is looked up
            with entity_link().

    Returns:
        A flat list with the stripped value of every binding in every result
        row, or an empty list if the query fails (the error is printed).
    """

    entity_name = extract_entity_property(question, llm)["entity"]
    print(f"# Entity name: {entity_name}")

    if qid is None:
        qid = entity_link(entity_name)

    print(f"# Entity QID: {qid}")

    property_mappings = search_similar_properties(question, collection, top_k=3)

    # Literal braces of the SPARQL examples are doubled ({{ }}) so that the
    # template engine does not treat them as input variables.
    prompt = PromptTemplate.from_template("""
PROMPT TITLE: Ontology-Driven Text-to-SPARQL (Wikidata)

ROLE
You translate natural-language questions into syntactically valid SPARQL queries for Wikidata, grounded in an explicit ontology (properties/relations) and entity mappings.

INPUTS
# Natural language question: {question}

# QID mapping (Wikidata entity of interest)
- `{entity_name}`: {qid}

# PID mapping (Wikidata properties or relations)
- `has cause`: P828
- `disease transmission process`: P1060
- `cause of death`: P509
- `health specialty`: P1995
- `medical examination`: P923
- `symptoms and signs`: P780
- `commemorates`: P547
- `field of work`: P101
- `instance of`: P31
- `depicts`: P180
{property_mappings}

OUTPUT CONSTRAINTS
- Output ONLY a valid SPARQL query (no prose, no explanations).
- Always include the label service:
  SERVICE wikibase:label {{ bd:serviceParam wikibase:language "en" }}
- Prefer concise SELECT variables (?causeLabel, ?symptomLabel, etc.).
- The output SPARQL query must be inspired from the query EXAMPLES below, both for the structure, ontological terms, and form.

PROCEDURE (high level)
- The translation typically relies on entities (QID) and ontological relations or properties (PID) in the questions.
- Select an appropriate query pattern: the triple patterns of subject, predicate, and object; directions; and variables.
- Compose a minimal SPARQL query with label service; avoid extraneous clauses unless required.
- Ensure syntactic validity.

EXAMPLES (with ontological terms to reinforce grounding), assume that QIDX is the QID of the entity X.
# Natural language: What is the cause of X?
   Ontological terms involved: has cause (P828)
   SPARQL:
   SELECT ?causeLabel WHERE {{
     wd:QIDX wdt:P828 ?cause .
     SERVICE wikibase:label {{ bd:serviceParam wikibase:language "en" }}
   }}

# Natural language: How is X transmitted?
   Ontological terms involved: disease transmission process (P1060)
   SPARQL:
   SELECT ?transmissionLabel WHERE {{
     wd:QIDX wdt:P1060 ?transmission .
     SERVICE wikibase:label {{ bd:serviceParam wikibase:language "en" }}
   }}

# Natural language: What is the cause of death of X?
   Ontological terms involved: cause of death (P509)
   SPARQL:
   SELECT ?causeLabel WHERE {{
     wd:QIDX wdt:P509 ?cause .
     SERVICE wikibase:label {{ bd:serviceParam wikibase:language "en" }}
   }}

# Natural language: What medical specialties are involved in treating X?
   Ontological terms involved: health specialty (P1995)
   SPARQL:
   SELECT ?specialtyLabel WHERE {{
     wd:QIDX wdt:P1995 ?specialty .
     SERVICE wikibase:label {{ bd:serviceParam wikibase:language "en" }}
   }}

# Natural language: X is a medical exam for which disease?
   Ontological terms involved: medical examination (P923)
   SPARQL:
   SELECT ?diseaseLabel WHERE {{
     ?disease wdt:P923 wd:QIDX .
     SERVICE wikibase:label {{ bd:serviceParam wikibase:language "en" }}
   }}

# Natural language: What are the symptoms of X?
   Ontological terms involved: symptoms and signs (P780)
   SPARQL:
   SELECT ?symptomLabel WHERE {{
     wd:QIDX wdt:P780 ?symptom .
     SERVICE wikibase:label {{ bd:serviceParam wikibase:language "en" }}
   }}

# Natural language: What event commemorates X?
   Ontological terms involved: commemorates (P547)
   SPARQL:
   SELECT ?eventLabel WHERE {{
     ?event wdt:P547 wd:QIDX .
     SERVICE wikibase:label {{ bd:serviceParam wikibase:language "en" }}
   }}

# Natural language: Who are some researchers working in the field of X?
   Ontological terms involved: field of work (P101)
   SPARQL:
   SELECT ?researcherLabel WHERE {{
     ?researcher wdt:P101 wd:QIDX .
     SERVICE wikibase:label {{ bd:serviceParam wikibase:language "en" }}
   }}

# Natural language: X corresponds to what types?
   Ontological terms involved: instance of (P31)
   SPARQL:
   SELECT ?typeLabel WHERE {{
     wd:QIDX wdt:P31 ?type .
     SERVICE wikibase:label {{ bd:serviceParam wikibase:language "en" }}
   }}

# Natural language: The painting of X depicts what disease?
   Ontological terms involved: depicts (P180)
   SPARQL:
   SELECT ?depictionLabel WHERE {{
     wd:QIDX wdt:P180 ?depiction .
     SERVICE wikibase:label {{ bd:serviceParam wikibase:language "en" }}
   }}

FINAL INSTRUCTION
Now produce ONLY the SPARQL query for:
Q: {question}
A:
""")

    # The chain fills the template itself, so no separate prompt.format() call
    # is needed before invoking it.
    chain = prompt | llm

    raw_output = chain.invoke({
        "question": question,
        "entity_name": entity_name,
        "qid": qid,
        "property_mappings": property_mappings
    }).strip()

    # Models often wrap the query in a markdown code fence despite the
    # instructions; send only the fenced content to the endpoint in that case.
    fenced = re.search(r"```(?:sparql|sql)?\s*(.*?)```", raw_output, flags=re.DOTALL | re.IGNORECASE)
    sparql_query = fenced.group(1).strip() if fenced else raw_output

    print("# SPARQL query string:\n" + sparql_query)

    sparql = SPARQLWrapper("https://query.wikidata.org/sparql")
    sparql.setQuery(sparql_query)
    sparql.addCustomHttpHeader("User-Agent", "MyWikidataApp/1.0 (mywikidata@example.cm)")
    sparql.setReturnFormat(JSON)

    try:
        results = sparql.query().convert()
        # Flatten every bound variable of every row into one list of strings.
        return [
            binding["value"].strip()
            for result in results["results"]["bindings"]
            for binding in result.values()
        ]
    except Exception as e:
        # Best effort: a malformed generated query must not abort an
        # evaluation run, so the failure is reported and counted as no answer.
        print("SPARQL query failed:", e)
        return []

In [None]:
# Naive question answering
def answer_question_naive(question, llm, qid=None):
    """
    Baseline text-to-SPARQL: same pipeline as the grounded variant, but the
    prompt contains only the question and the entity QID (no property
    mappings and no example queries).

    The entity name, QID, grounded prompt and generated query are printed.

    Args:
        question: Natural language question.
        llm: LLM used for entity extraction and query generation; it is piped
            after a PromptTemplate and its output is treated as a string.
        qid: Optional QID of the entity of interest; when None it is looked up
            with entity_link().

    Returns:
        A flat list with the stripped value of every binding in every result
        row, or an empty list if the query fails (the error is printed).
    """

    entity_name = extract_entity_property(question, llm)["entity"]
    print(f"# Entity name: {entity_name}")

    if qid is None:
        qid = entity_link(entity_name)

    print(f"# Entity QID: {qid}")

    # No property retrieval here: the naive template has no slot for property
    # mappings, so a vector search would be wasted work.
    prompt = PromptTemplate.from_template("""
PROMPT TITLE: Text-to-SPARQL (Wikidata)

ROLE
You translate natural-language questions into syntactically valid SPARQL queries for Wikidata.

INPUTS
# Natural language question: {question}

# QID mapping (Wikidata entity of interest)
- `{entity_name}`: {qid}

OUTPUT CONSTRAINTS
- Output ONLY a valid SPARQL query (no prose, no explanations).
- Always include the label service:
  SERVICE wikibase:label {{ bd:serviceParam wikibase:language "en" }}
- Prefer concise SELECT variables (?causeLabel, ?symptomLabel, etc.).

PROCEDURE (high level)
- The translation typically relies on entities (QID) in the questions.
- Select an appropriate query pattern: the triple patterns of subject, predicate, and object; directions; and variables.
- Compose a minimal SPARQL query with label service; avoid extraneous clauses unless required.
- Ensure syntactic validity.

FINAL INSTRUCTION
Now produce ONLY the SPARQL query for:
Q: {question}
A:
""")

    prompt_inputs = {
        "question": question,
        "entity_name": entity_name,
        "qid": qid
    }

    # Ground the prompt with actual parameters (shown for inspection only)
    grounded_prompt = prompt.format(**prompt_inputs)
    print("# Grounded Prompt:\n" + grounded_prompt)

    chain = prompt | llm

    raw_output = chain.invoke(prompt_inputs).strip()

    # Models often wrap the query in a markdown code fence despite the
    # instructions; send only the fenced content to the endpoint in that case.
    fenced = re.search(r"```(?:sparql|sql)?\s*(.*?)```", raw_output, flags=re.DOTALL | re.IGNORECASE)
    sparql_query = fenced.group(1).strip() if fenced else raw_output

    print("# SPARQL query string:\n" + sparql_query)

    sparql = SPARQLWrapper("https://query.wikidata.org/sparql")
    sparql.setQuery(sparql_query)
    sparql.addCustomHttpHeader("User-Agent", "MyWikidataApp/1.0 (mywikidata@example.cm)")
    sparql.setReturnFormat(JSON)

    try:
        results = sparql.query().convert()
        # Flatten every bound variable of every row into one list of strings.
        return [
            binding["value"].strip()
            for result in results["results"]["bindings"]
            for binding in result.values()
        ]
    except Exception as e:
        # Best effort: a malformed generated query must not abort an
        # evaluation run, so the failure is reported and counted as no answer.
        print("SPARQL query failed:", e)
        return []

In [None]:
# Naive text-to-SPARQL testing

# Sample questions for quick manual checks; change the index to try another one.
sample_questions = [
    "Can a chest radiograph be used for the medical examination of tuberculosis?",
    "What is the cause of death of General Soedirman?",
    "Researchers working in the field of tuberculosis?",
    "What causes tuberculosis?",
]
question = sample_questions[-1]

# Models tried in this cell: gpt-oss:20b, mistral:7b, llama3.2.
# Only the one that is actually used gets instantiated.
llm = OllamaLLM(model="llama3.2", reasoning=False)

answer_question_naive(question, llm)

In [None]:
# Text-to-SPARQL testing

# Sample questions for quick manual checks; change the index to try another one.
sample_questions = [
    "Can a chest radiograph be used for the medical examination of tuberculosis?",
    "What is the cause of death of General Soedirman?",
    "Researchers working in the field of tuberculosis?",
    "What causes tuberculosis?",
]
question = sample_questions[-1]

# Models tried in this cell: gpt-oss:20b, llama3.2, mistral:7b.
# Only the one that is actually used gets instantiated.
llm = OllamaLLM(model="mistral:7b", reasoning=False)

answer_question(question, llm)

In [None]:
def ensure_list(x):
    """
    Accept list or comma-separated string with quotes -> list[str].

    - A list is returned unchanged (same object).
    - A missing value (None / NaN, as detected by pd.isna) gives [].
    - Anything else is converted to str and parsed as ONE CSV row, so quoted
      fields may contain commas. Fields are stripped and empty ones dropped.
      An empty string gives [].
    """
    if isinstance(x, list):
        return x
    if pd.isna(x):
        return []

    # Use csv.reader to handle quoted fields with commas
    reader = csv.reader(io.StringIO(str(x)), skipinitialspace=True)
    # An empty string produces no row at all; default to an empty row instead
    # of letting next() raise StopIteration.
    parts = next(reader, [])  # single row
    stripped = (p.strip() for p in parts)
    return [p for p in stripped if p]

def normalize_item(s: str) -> str:
    """
    Canonical form of an answer string for set comparison.

    None becomes "". Any other value is converted to str, stripped of
    surrounding whitespace, then of surrounding double quotes, then of
    surrounding single quotes, lowercased, and every run of whitespace is
    replaced by a single space.
    """
    if s is None:
        return ""

    text = str(s).strip()
    # Order matters: double quotes are removed before single quotes.
    for quote_char in ('"', "'"):
        text = text.strip(quote_char)

    return re.sub(r"\s+", " ", text.lower())

def to_norm_set(items):
    """Normalize every item with normalize_item() and collect the non-empty results in a set."""
    normalized = map(normalize_item, items)
    return {entry for entry in normalized if entry}

def jaccard(a: set, b: set) -> float:
    """Jaccard similarity |A ∩ B| / |A ∪ B| of two sets; 1.0 by convention when both are empty."""
    shared = a & b
    combined = a | b
    if not combined:
        return 1.0
    return len(shared) / len(combined)

# Load the evaluation dataset and turn the two comma-separated text columns
# (question variations and gold answers) into Python lists.
df = pd.read_csv("question-dataset.csv", quotechar='"', engine="python")
for list_column in ("NL Groundings for TB", "Results (2025.09.14)"):
    df[list_column] = df[list_column].apply(ensure_list)

In [None]:
def run_experiment(df, llm):
    """
    Evaluate answer_question() on every question variation in the dataset.

    For each row, every question in "NL Groundings for TB" is answered and the
    normalized answer set is compared with the normalized gold set from
    "Results (2025.09.14)" using Jaccard similarity. Per-question details and
    the overall average are printed.

    Args:
        df: DataFrame with the columns "Natural Language",
            "NL Groundings for TB" (list of questions) and
            "Results (2025.09.14)" (list of gold answers).
        llm: LLM passed through to answer_question().

    Returns:
        The average Jaccard score over all questions (0.0 if there were none),
        so that runs with different models can be compared programmatically.
    """
    all_scores = []
    for _, row in df.iterrows():
        nl_group = row["Natural Language"]
        nl_variations = row["NL Groundings for TB"]
        gt_results_set = to_norm_set(row["Results (2025.09.14)"])

        print(f"\n### NL Group: {nl_group}")

        for nl_q in nl_variations:
            print(f"Question: {nl_q}")
            query_eval_results = answer_question(nl_q, llm)
            print(query_eval_results)

            pred_set = to_norm_set(query_eval_results)
            score = jaccard(pred_set, gt_results_set)
            all_scores.append(score)

            print(f"- NL: {nl_q}")
            print(f"  Pred: {sorted(pred_set) if pred_set else '[]'}")
            print(f"  Gold: {sorted(gt_results_set) if gt_results_set else '[]'}")
            print(f"  Jaccard: {score:.4f}")

    overall = sum(all_scores) / len(all_scores) if all_scores else 0.0
    print(f"\n=== Overall average Jaccard: {overall:.4f} ===")
    return overall

In [None]:
def run_experiment_naive(df, llm):
    """
    Evaluate answer_question_naive() on every question variation in the dataset.

    For each row, every question in "NL Groundings for TB" is answered and the
    normalized answer set is compared with the normalized gold set from
    "Results (2025.09.14)" using Jaccard similarity. Per-question details and
    the overall average are printed.

    Args:
        df: DataFrame with the columns "Natural Language",
            "NL Groundings for TB" (list of questions) and
            "Results (2025.09.14)" (list of gold answers).
        llm: LLM passed through to answer_question_naive().

    Returns:
        The average Jaccard score over all questions (0.0 if there were none),
        so that runs with different models can be compared programmatically.
    """
    all_scores = []
    for _, row in df.iterrows():
        nl_group = row["Natural Language"]
        nl_variations = row["NL Groundings for TB"]
        gt_results_set = to_norm_set(row["Results (2025.09.14)"])

        print(f"\n### NL Group: {nl_group}")

        for nl_q in nl_variations:
            print(f"Question: {nl_q}")
            query_eval_results = answer_question_naive(nl_q, llm)
            print(query_eval_results)

            pred_set = to_norm_set(query_eval_results)
            score = jaccard(pred_set, gt_results_set)
            all_scores.append(score)

            print(f"- NL: {nl_q}")
            print(f"  Pred: {sorted(pred_set) if pred_set else '[]'}")
            print(f"  Gold: {sorted(gt_results_set) if gt_results_set else '[]'}")
            print(f"  Jaccard: {score:.4f}")

    overall = sum(all_scores) / len(all_scores) if all_scores else 0.0
    print(f"\n=== Overall average Jaccard: {overall:.4f} ===")
    return overall

In [None]:
# Ontology-grounded pipeline evaluated with gemma3:12b.
llm = OllamaLLM(reasoning=False, model="gemma3:12b")

run_experiment(llm=llm, df=df)

In [None]:
# Ontology-grounded pipeline evaluated with gpt-oss:20b.
llm = OllamaLLM(reasoning=False, model="gpt-oss:20b")

run_experiment(llm=llm, df=df)

In [None]:
# Ontology-grounded pipeline evaluated with llama3.1:8b.
llm = OllamaLLM(reasoning=False, model="llama3.1:8b")

run_experiment(llm=llm, df=df)

In [None]:
# Ontology-grounded pipeline evaluated with mistral:7b.
llm = OllamaLLM(reasoning=False, model="mistral:7b")

run_experiment(llm=llm, df=df)

In [None]:
# Ontology-grounded pipeline evaluated with llama3.2.
llm = OllamaLLM(reasoning=False, model="llama3.2")

run_experiment(llm=llm, df=df)

In [None]:
# Naive baseline (no ontology grounding) evaluated with llama3.2.
llm = OllamaLLM(reasoning=False, model="llama3.2")

run_experiment_naive(llm=llm, df=df)

# BACKUP

In [None]:
# --- Main loop -------------------------------------------------------------
# Backup run with llama3.2. The per-question evaluation loop is the one
# implemented by run_experiment() (same prints, same scoring), so this cell
# delegates to it instead of carrying its own copy.

llm = OllamaLLM(model="llama3.2", reasoning=False)

run_experiment(df, llm)

In [None]:
import pandas as pd
import re
import csv
import io

# --- Helpers ---------------------------------------------------------------
# ensure_list, normalize_item, to_norm_set and jaccard are defined once in an
# earlier cell of this notebook and are intentionally not redefined here, so
# that a later copy cannot silently shadow them.

# --- Preprocess columns ----------------------------------------------------

# Make sure these columns exist in your df
# df = ...  # your existing DataFrame

# Ensure "NL Groundings for TB" is a list per row
# (a no-op when the column already holds lists)
df["NL Groundings for TB"] = df["NL Groundings for TB"].apply(ensure_list)

# Ensure ground-truth results are a list
df["Results (2025.09.14)"] = df["Results (2025.09.14)"].apply(ensure_list)

# --- Main loop -------------------------------------------------------------
# Backup run with llama3.2 (earlier note on this model: "not good, all false").
# Other model tried in this cell: mistral:7b.
# The evaluation loop itself lives in run_experiment().

llm = OllamaLLM(model="llama3.2", reasoning=False)

run_experiment(df, llm)

In [None]:
import pandas as pd
import re
import csv
import io

# --- Helpers ---------------------------------------------------------------
# ensure_list, normalize_item, to_norm_set and jaccard are defined once in an
# earlier cell of this notebook and are intentionally not redefined here, so
# that a later copy cannot silently shadow them.

# --- Preprocess columns ----------------------------------------------------

# Make sure these columns exist in your df
# df = ...  # your existing DataFrame

# Ensure "NL Groundings for TB" is a list per row
# (a no-op when the column already holds lists)
df["NL Groundings for TB"] = df["NL Groundings for TB"].apply(ensure_list)

# Ensure ground-truth results are a list
df["Results (2025.09.14)"] = df["Results (2025.09.14)"].apply(ensure_list)

# --- Main loop -------------------------------------------------------------
# Backup run with mistral:7b.
# Earlier note on llama3.2, the other model tried here: "not good, all false".
# The evaluation loop itself lives in run_experiment().

llm = OllamaLLM(model="mistral:7b", reasoning=False)

run_experiment(df, llm)

In [None]:
import pandas as pd
import re
import csv
import io

# --- Helpers ---------------------------------------------------------------
# ensure_list, normalize_item, to_norm_set and jaccard are defined once in an
# earlier cell of this notebook and are intentionally not redefined here, so
# that a later copy cannot silently shadow them.

# --- Preprocess columns ----------------------------------------------------

# Make sure these columns exist in your df
# df = ...  # your existing DataFrame

# Ensure "NL Groundings for TB" is a list per row
# (a no-op when the column already holds lists)
df["NL Groundings for TB"] = df["NL Groundings for TB"].apply(ensure_list)

# Ensure ground-truth results are a list
df["Results (2025.09.14)"] = df["Results (2025.09.14)"].apply(ensure_list)

# --- Main loop -------------------------------------------------------------
# Backup run with gemma3:12b.
# Earlier note on llama3.2, the other model tried here: "not good, all false".
# The evaluation loop itself lives in run_experiment().

llm = OllamaLLM(model="gemma3:12b", reasoning=False)

run_experiment(df, llm)

In [None]:

# --- Preprocess columns ----------------------------------------------------

# Make sure these columns exist in your df
# df = ...  # your existing DataFrame

# Ensure "NL Groundings for TB" is a list per row
# (a no-op when the column already holds lists)
df["NL Groundings for TB"] = df["NL Groundings for TB"].apply(ensure_list)

# Ensure ground-truth results are a list
df["Results (2025.09.14)"] = df["Results (2025.09.14)"].apply(ensure_list)

# --- Main loop -------------------------------------------------------------
# Backup run with llama3.1:8b.
# Earlier note on llama3.2, the other model tried here: "not good, all false".
# The evaluation loop itself lives in run_experiment().

llm = OllamaLLM(model="llama3.1:8b", reasoning=False)

run_experiment(df, llm)

In [None]:

# --- Preprocess columns ----------------------------------------------------

# Make sure these columns exist in your df
# df = ...  # your existing DataFrame

# Ensure "NL Groundings for TB" is a list per row
# (a no-op when the column already holds lists)
df["NL Groundings for TB"] = df["NL Groundings for TB"].apply(ensure_list)

# Ensure ground-truth results are a list
df["Results (2025.09.14)"] = df["Results (2025.09.14)"].apply(ensure_list)

# --- Main loop -------------------------------------------------------------
# Backup run with qwen3:8b.
# Earlier note on llama3.2, the other model tried here: "not good, all false".
# The evaluation loop itself lives in run_experiment().

llm = OllamaLLM(model="qwen3:8b", reasoning=False)

run_experiment(df, llm)