In [1]:
import pandas as pd

TASKNAMES = ["subtask1", "subtask2"]
BATCHNAMES = ["batch1", "batch2", "batch3"]

# Directory holding the training CSVs. Kept without a trailing slash so the
# path built below does not contain a doubled separator ("dataset//...").
prefix = "dataset"

# Maps "<taskname>_<batchname>" to the DataFrame read from the matching CSV.
dataset_loader = dict()

for taskname in TASKNAMES:
    for batchname in BATCHNAMES:
        print(f"Processing {taskname} for {batchname}")
        # Load the data
        data = pd.read_csv(f"{prefix}/{taskname}_train_{batchname}.csv")
        dataset_loader[f"{taskname}_{batchname}"] = data
        

Processing subtask1 for batch1
Processing subtask1 for batch2
Processing subtask1 for batch3
Processing subtask2 for batch1
Processing subtask2 for batch2
Processing subtask2 for batch3


### Statistics

The tables below list the number of unique answers and unique claims in each batch dataset.

| Task | Batch | number of answers | number of claims |
| ---  | ----- | ----------------- | ---------------- |
| 1    | 1     | 235               | 500              |
| 1    | 2     | 468               | 2092             |
| 1    | 3     | 469               | 1500             |


(It seems that subtask 1 and subtask 2 share the same dataset.)

| Task | Batch | number of answers | number of claims |
| ---  | ----- | ----------------- | ---------------- |
| 2    | 1     | 235               | 500              |
| 2    | 2     | 468               | 2092             |
| 2    | 3     | 469               | 1500             |

In [16]:
import os

# Unique answers of subtask 2 / batch 2. Missing values are dropped first:
# a NaN entry is a float and cannot be concatenated into the text written below.
all_answer = dataset_loader["subtask2_batch2"]["answer"].dropna().unique()

os.makedirs("outputs/statistics&collections", exist_ok=True)

# Explicit UTF-8 so the dump does not depend on the platform's default encoding
# (the dataset text may contain non-ASCII characters).
with open("outputs/statistics&collections/all_answer.txt", "w", encoding="utf-8") as f:
    for answer in all_answer:
        # Every answer is followed by a 100-dash separator line.
        f.write(f"{answer}\n" + "-" * 100 + "\n")


# Testing named-entity extraction on the reference

Run the `process_text_to_knowledge_graph` function on the reference text with the *gemini-2.5-flash* model.

In [3]:
# Use the reference text of the first subtask-1 / batch-1 row as the sample document.
first_row = dataset_loader["subtask1_batch1"].iloc[0]
content = first_row["reference"]

content

'[3]: The widespread use of chatbots is a reality and their application in higher education is promising. Understanding higher education usersÃ¢â\x82¬â\x84¢ expectations for the use of chatbots in education is important for the design and development of new solutions. The present investigation documents how higher education users envision the pedagogical uses of chatbots in higher education, and how experts in the domain of education chatbots perceive the potential benefits and challenges related to the use of chatbots in education. A qualitative inquiry was undertaken based on 22 semi-structured interviews with higher-education students and instructors, and experts from the fields of Artificial Intelligence and educational chatbots. Based on our findings, the envisioned pedagogical uses of chatbots can be categorized in terms of chronological integration into the learning process: prospective, on-going, and retrospective. Under each one of those higher-order categories, specific learn

In [4]:
import asyncio
import os
import networkx as nx

from dotenv import load_dotenv
from operationCheatSheet import chunking_by_token_size, extract_entities
from utils import compute_mdhash_id
from geminillm import gemini_complete_if_cache
from openaillm import openai_embed, openai_complete
from faiss_impl import FaissVectorDBStorage
from networkx_impl import NetworkXStorage
from json_kv_iml import JsonKVStorage
from shared_storage import initialize_share_data
from cheatsheet import CHEATSHEETS


# Model identifier used for the LLM calls below; it is also passed as the
# tokenizer model name to the chunking and extraction configuration.
# Alternative that was tried before:
# LLM_MODEL_NAME = "gpt-3.5-turbo"
LLM_MODEL_NAME = "gemini-2.5-flash-preview-04-17"

# Load variables from a .env file (presumably the API keys needed by the
# LLM / embedding clients — verify against the project's .env).
load_dotenv()

async def process_text_to_knowledge_graph(text_content,
                                          cheatsheet_knowledge_graph_inst,
                                          file_path="unknown_source"):
    """Chunk ``text_content``, run entity extraction on the chunks and return the graph storage.

    Args:
        text_content: Raw text to process.
        cheatsheet_knowledge_graph_inst: Forwarded unchanged to ``extract_entities``.
        file_path: Source label stored on every chunk.

    Returns:
        The ``NetworkXStorage`` instance that was handed to ``extract_entities``.
    """

    def _storage_config(**extra):
        # Builds a fresh settings dict on every call; ``extra`` entries are
        # placed directly after "working_dir".
        return {
            "working_dir": "/tmp",
            **extra,
            "embedding_batch_num": 64,
            "vector_db_storage_cls_kwargs": {
                "cosine_better_than_threshold": 0.2,
            },
            "base_url": "https://api.openai.com/v1",
        }

    # 1) Split the text with chunking_by_token_size
    #    (max_token_size=1024, overlap_token_size=128, no character splitting).
    pieces = chunking_by_token_size(
        content=text_content,
        split_by_character=None,
        split_by_character_only=False,
        overlap_token_size=128,
        max_token_size=1024,
        tiktoken_model=LLM_MODEL_NAME,
    )

    # 2) Key every piece by an id computed from its content and tag it with
    #    the document id and the source label.
    doc_id = compute_mdhash_id(text_content, prefix="doc-")
    chunks = {}
    for piece in pieces:
        chunk_id = compute_mdhash_id(piece["content"], prefix="chunk-")
        entry = dict(piece)
        entry["full_doc_id"] = doc_id
        entry["file_path"] = file_path
        chunks[chunk_id] = entry

    # 3) Storage instances. The graph and both vector stores share one config
    #    object; the LLM cache gets its own, which also carries the model name.
    faiss_global_config = _storage_config()
    kv_global_config = _storage_config(llm_model_name=LLM_MODEL_NAME)

    knowledge_graph = NetworkXStorage(
        namespace="nx_kg",
        global_config=faiss_global_config,
        embedding_func=openai_embed,
    )
    entity_vector_db = FaissVectorDBStorage(
        namespace="faiss_entity",
        global_config=faiss_global_config,
        embedding_func=openai_embed,
    )
    relationship_vector_db = FaissVectorDBStorage(
        namespace="faiss_relationship",
        global_config=faiss_global_config,
        embedding_func=openai_embed,
    )
    llm_cache = JsonKVStorage(
        namespace="llm_cache",
        global_config=kv_global_config,
        embedding_func=openai_embed,
    )

    initialize_share_data()

    # Initialise in creation order: graph, entity store, relationship store, cache.
    for storage in (knowledge_graph, entity_vector_db, relationship_vector_db, llm_cache):
        await storage.initialize()

    # 4) Adapter around the Gemini completion call, shaped to the signature
    #    that extract_entities expects for its LLM function.
    async def llm_wrapper(prompt, history_messages=None, max_tokens=None, **kwargs):
        # Extra keyword arguments are accepted but not forwarded.
        return await gemini_complete_if_cache(
            model=LLM_MODEL_NAME,
            prompt=prompt,
            history_messages=[] if history_messages is None else history_messages,
            hashing_kv=llm_cache,
            temperature=0.2,
            max_tokens=max_tokens if max_tokens else 1024,
        )

    # 5) Extraction settings.
    entity_types = [
        "Crop Type",
        "Crop Yield",
        "Climate Drivers",
        "Experimental Design",
        "Location",
        "Time",
    ]
    global_config = {
        "llm_model_func": llm_wrapper,
        "llm_model_name": LLM_MODEL_NAME,
        "entity_extract_max_gleaning": 2,
        "force_llm_summary_on_merge": False,
        "llm_model_max_token_size": 1024,
        "summary_to_max_tokens": 256,
        "tiktoken_model_name": LLM_MODEL_NAME,
        "addon_params": {
            "language": "English",
            "entity_types": entity_types,
        },
    }

    # 6) Status-tracking objects passed to the extractor.
    pipeline_status = {"latest_message": "", "history_messages": []}
    pipeline_status_lock = asyncio.Lock()

    # 7) Run the extraction.
    await extract_entities(
        chunks=chunks,
        knowledge_graph_inst=knowledge_graph,
        entity_vdb=entity_vector_db,
        relationships_vdb=relationship_vector_db,
        global_config=global_config,
        pipeline_status=pipeline_status,
        pipeline_status_lock=pipeline_status_lock,
        llm_response_cache=llm_cache,
        cheatsheet_knowledge_graph_inst=cheatsheet_knowledge_graph_inst,
        write_result_to_txt=True,
        special_interest="",
    )

    return knowledge_graph

# Example usage in a Jupyter notebook
async def main():
    """Build a knowledge graph from the notebook-level ``content`` string.

    An empty ``networkx`` graph is passed as the cheatsheet graph and
    "example.txt" as the source label.

    Returns:
        Whatever ``process_text_to_knowledge_graph`` returns.
    """
    return await process_text_to_knowledge_graph(content, nx.Graph(), "example.txt")

# Top-level ``await`` works in a Jupyter notebook because the kernel already
# runs an event loop; a plain script would need ``asyncio.run(main())`` instead.
knowledge_graph = await main()

No existing Faiss index file found. Starting fresh.
No existing Faiss index file found. Starting fresh.
INFO: Process 72468 Shared-Data created for Single Process
INFO: Process 72468 initialized updated flags for namespace: [nx_kg]
INFO: Process 72468 initialized updated flags for namespace: [faiss_entity]
INFO: Process 72468 initialized updated flags for namespace: [faiss_relationship]
INFO: Process 72468 initialized updated flags for namespace: [llm_cache]
INFO: Process 72468 ready to initialize storage namespace: [llm_cache]
  hint_prompt = fill_nightly_prompt.format(


Final result:
("entity"<|>"Chatbots"<|>"technology"<|>"The widespread use of chatbots is a reality and their application in higher education is promising.")##
("entity"<|>"Higher Education"<|>"location"<|>"The widespread use of chatbots is a reality and their application in higher education is promising.")##
("entity"<|>"Higher Education Users"<|>"person"<|>"Understanding higher education usersÃ¢â‚¬â„¢ expectations for the use of chatbots in education is important for the design and development of new solutions.")##
("entity"<|>"Experts"<|>"person"<|>"A qualitative inquiry was undertaken based on 22 semi-structured interviews with higher-education students and instructors, and experts from the fields of Artificial Intelligence and educational chatbots.")##
("entity"<|>"Students"<|>"person"<|>"A qualitative inquiry was undertaken based on 22 semi-structured interviews with higher-education students and instructors, and experts from the fields of Artificial Intelligence and educational c

In [5]:
from utils import get_latest_result, read_knowledge_graph_from_pickle

# Fetch the most recent extraction result (judging by the helper's name;
# where it looks for results is not visible in this notebook — verify in utils).
nodes, edges = get_latest_result()
# To load one specific saved run instead:
# nodes, edges = read_knowledge_graph_from_pickle("outputs/result_20250520_163337.pkl")

In [6]:
# Display the extracted nodes: a mapping from entity name to a list of entity records.
nodes

defaultdict(list,
            {'Qualitative Inquiry': [{'entity_name': 'Qualitative Inquiry',
               'entity_type': 'Experimental Design',
               'description': 'A qualitative inquiry was undertaken based on 22 semi-structured interviews with higher-education students and instructors, and experts from the fields of Artificial Intelligence and educational chatbots.',
               'source_id': 'chunk-ae1ae5bc19aa1ce1b241b4fe34d93818',
               'file_path': 'example.txt'}],
             'Chatbots': [{'entity_name': 'Chatbots',
               'entity_type': 'technology',
               'description': 'The widespread use of chatbots is a reality and their application in higher education is promising.',
               'source_id': 'chunk-ae1ae5bc19aa1ce1b241b4fe34d93818',
               'file_path': 'example.txt'}],
             'Higher Education': [{'entity_name': 'Higher Education',
               'entity_type': 'location',
               'description': 'The widespr

In [7]:
from rich.console import Console
from rich.text import Text

def highlight_with_rich(text, substrings, style="bold yellow", prefix_len=40):
    """Print ``text`` to the console with every located substring styled.

    A substring is located by searching for its first ``prefix_len``
    characters only; the styled span still covers the substring's full
    length, so a match is highlighted even when the tail of the substring
    differs from the text.

    Args:
        text: The string to print.
        substrings: Iterable of strings to highlight. Empty and non-string
            entries are ignored.
        style: Rich style applied to every match.
        prefix_len: Number of leading characters used for the search
            (must be at least 1).

    Raises:
        ValueError: If ``prefix_len`` is smaller than 1.
    """
    if prefix_len < 1:
        raise ValueError("prefix_len must be at least 1")

    console = Console()
    rich_text = Text(text)

    for substring in substrings:
        # An empty needle matches at every position without moving the search
        # cursor forward, which would loop forever; non-strings cannot be
        # searched for at all. Skip both.
        if not isinstance(substring, str) or not substring:
            continue

        needle = substring[:prefix_len]
        span = len(substring)
        start = 0
        while True:
            index = text.find(needle, start)
            if index == -1:
                break
            # Apply style to the substring
            rich_text.stylize(style, index, index + span)
            # Resume the search after the highlighted span.
            start = index + span

    console.print(rich_text)

# Gather the description of every extracted node and highlight each one
# inside the reference text.
node_description = [
    node["description"]
    for node_list in nodes.values()
    for node in node_list
]

highlight_with_rich(content, node_description)

In [None]:
# Gather the name of every extracted node and highlight each occurrence
# inside the reference text.
node_name = [
    node["entity_name"]
    for node_list in nodes.values()
    for node in node_list
]

highlight_with_rich(content, node_name)

# Find the entities in claim
Given the entities extracted from the reference, the initial idea is to use them to locate the corresponding entities in the claim. The first attempt uses the original cheatsheet; after that, the corresponding entities serve as the ontology for finding additional (gleaned) nodes.
- build the ontology graph
- run `process_text_to_knowledge_graph` method