# Test with Gemini

In [1]:
import networkx as nx
from cheatsheet import CHEATSHEETS

def create_cheatsheet_knowledge_graph(cheatsheet_text=None) -> nx.Graph:
    """
    Creates a NetworkX graph from the cheatsheet relationships.

    Every non-blank line is split on "-" after surrounding whitespace and
    ";" characters are removed. A line that yields exactly two non-empty
    names becomes one undirected edge; any other line is skipped.

    Args:
        cheatsheet_text: Optional relationship text to parse (str). When
            omitted, ``CHEATSHEETS["cheat_sheet_kg"]`` is used.

    Returns:
        nx.Graph: Graph containing entity types as nodes and their
        relationships as edges with ``description`` and ``weight`` attributes.
    """
    if cheatsheet_text is None:
        cheatsheet_text = CHEATSHEETS["cheat_sheet_kg"]

    G = nx.Graph()

    # splitlines() also handles "\r\n" endings, which split('\n') would leave
    # behind as a trailing "\r" on every line.
    for relationship in cheatsheet_text.splitlines():
        # Trim whitespace BEFORE removing ";": otherwise a line such as
        # "A - B; " keeps its semicolon and produces a node named "B;".
        relationship = relationship.strip().strip(';')
        if not relationship:
            continue

        # Split on "-" and clean up whitespace
        parts = [part.strip() for part in relationship.split('-')]
        if len(parts) != 2 or not all(parts):
            # Not a "source - target" pair (or one side is empty): skip it.
            continue

        source, target = parts
        # add_edge creates missing endpoint nodes itself, so no explicit
        # add_node calls are needed.
        G.add_edge(
            source,
            target,
            description=f"Relationship between {source} and {target}",
            weight=1.0
        )

    return G
# Build the cheatsheet graph once; later cells pass it to the extraction and linking steps.
nx_graph = create_cheatsheet_knowledge_graph()

In [2]:
# Inspect one entity type of the cheatsheet graph: its attribute dict,
# its neighbours and the edges incident to it.
node_name = "Crop Type"

location_node = nx_graph.nodes[node_name]
neighbors = [*nx_graph.neighbors(node_name)]
neighbor_edges = [*nx_graph.edges(node_name, data=True)]
for src, dst, attrs in neighbor_edges:
    description = attrs.get('description', 'No description')
    print(f"Edge: {src} -> {dst}, Description: {description}")

Edge: Crop Type -> Crop Yield, Description: Relationship between Crop Type and Crop Yield
Edge: Crop Type -> Climate Drivers, Description: Relationship between Crop Type and Climate Drivers


In [3]:
# Display the attribute dict stored on the "Crop Yield" / "Climate Drivers" edge.
nx_graph.get_edge_data("Crop Yield", "Climate Drivers")

{'description': 'Relationship between Crop Yield and Climate Drivers',
 'weight': 1.0}

In [4]:
content = """
3.1.1 Impact of fixed rise in temperature, CO₂ and change in rain fall

Monsoon crop
Results of simulation analysis indicate that maize yields in monsoon season are adversely affected due to rise in atmospheric temperatures in all three regions (Fig. 2a). Grain yield decreased with each degree rise in atmospheric temperature. However, the rate of decrease varied with location. The mean baseline yield of rainfed maize crop is about 2 Mg ha⁻¹ in UIGP, where the projected yield loss is up to 7, 11, 15, 22, and 33% relative to baseline yields with 1, 2, 3, 4, 5°C degrees rise in atmospheric temperatures. However, a 20% increase in rainfall is projected to offset the yield loss due to 1°C rise in temperature. Similarly, a 30% increase in rainfall is predicted to offset the adverse impact of 2°C rise in temperature. In MIGP region, yield reduction of about 8–35% with 1–5°C rise in atmospheric temperature is projected. In this region, increase in rainfall is likely to offset the temperature rise up to 0.75°C and any increase beyond this temperature will adversely impact the yields, in spite of increase in rainfall. The SP region is also projected to experience adverse impact with -10, -15, -23, -27 and -35% reductions from the baseline yield levels at each 1°C rise in temperature. A 10% increase in rainfall will offset the reduction in yield due to 1°C rise in temperature in this region.

Even though maize is a C4 plant, increase in carbon dioxide is projected to benefit the crop yield ranging from 0.1 to 3.4% at 450 ppmV and 0.6 to 7.2% at 550 ppmV. The benefits are projected to be high in mild water stress conditions, but they are likely to reduce in severe water stress situations (Table 3). The yield gains due to increase in atmospheric CO₂ concentration are projected to be more in SP regions (low rainfall area) followed by UIGP and MIGP regions.

Winter crop
Maize crop during winter is provided with assured irrigation and thus yields about 1.5 times more than that of monsoon crop. Winter maize grain yield reduced with increase in temperatures in SP and MIGP, but in UIGP rise in temperatures up to 2.7°C is likely to improve the maize yields. However, further increase in temperature is projected to reduce grain yields and the reductions are likely to be more than those at MIGP and SP (Fig. 3a). In UIGP, this beneficial effect with rise in temperature is projected to be more up to 2°C rise (13% increase over current yields). In this region, yield will improve with 2°C in spite of reduction in rainfall. In the event of further increase in temperature to about 2.7°C, the reduction in yields can be offset only if rainfall is increased or more irrigation is provided. With temperature rise, the crop experiences conditions closer to optimal temperature during grain development, benefiting grain number. Relatively low temperature during grain filling period required more days to satisfy thermal time requirement. However, in both MIGP and SP, where the average maximum temperatures during winter crop season are relatively higher (Table 2), any increase in temperature can cause reduction in yield.

Table 3 Influence of atmospheric carbon dioxide concentration on maize yields in rainfall deficit conditions during monsoon season

In UIGP, rise in temperatures beyond 2.7°C caused reduction in yield mainly due to reduced number of grains. This limited the gains in spite of increase in GFD and individual grain weight. Further increase in temperature resulted in yield reduction from current yields. In UIGP, GFD was found to increase with rise in temperature because of current lower temperature during winter. While the rise in temperature prolonged GFD significantly at UIGP than at MIGP, it actually reduced at SP. In all locations, flowering hastened due to increase in temperature.
3.1.2 Impact of climate change scenarios on maize yield

The climate change scenario outputs of HadCM3 model on minimum and maximum temperatures and rainfall; CO₂ concentrations as per Bern CC model for 2020, 2050 and 2080 were coupled to InfoCrop-MAIZE model. This approach was followed because of reported spatio-temporal variations in climate change scenarios (IPCC 2007).

Monsoon crop
The analysis indicates that in UIGP region, climate change is projected to insignificantly affect the productivity of monsoon maize crop in 2020, 2050 and 2080 scenarios (Fig. 4a). This is mainly due to projected increase in rainfall during crop season, which will provide scope for improved dry matter production and increase in grain number. This implies that the maize crop may benefit from additional availability of water in spite of increase in temperature and related reduction in crop duration by 3–4 days. On the other hand, in MIGP, maize is likely to suffer yield loss in future scenarios. The loss from current yields is projected to be ~5%, ~13%, ~17% in 2020, 2050 and 2080, respectively. In SP, monsoon season crop is projected to lose grain yield by 21% from current yields due to climate change by 2020 and 35% by 2050 and later. Projected rise in daytime temperature during monsoon is higher in SP and MIGP as compared to UIGP region, even though minimum temperatures are projected to rise almost similarly in these locations. Apart from this, rainfall is projected to increase in UIGP while it is likely to change in MIGP. Thus, the spatio-temporal variation in existing climatic conditions and projected changes in temperature and rainfall would bring about differential impacts on monsoon maize crop in India.

Winter crop
As far as maize crop grown in winter is concerned, yield gains are projected to be ~5% over current yield in 2020 scenario at UIGP and this benefit is likely to remain till 2050 (Fig. 4b). However, in 2080 scenario, yields are projected to be reduced by 25% from current yields. Winter maize in MIGP, currently a high yielding zone, is projected to suffer in post-2020 scenario. The reduction in yield is likely to be to the tune of ~50% by 2050 and about 60% by 2080. In SP region, yields are projected to decline by about 13% in 2020, 17% in 2050 and 21% in 2080. In these areas, winter maize is well irrigated and thus variation in winter rainfall, which otherwise is low, is less influential. The projected rise in temperature during winter crop season is more in UIGP in 2020 and 2050 than in MIGP and SP, particularly during later part of crop growth.
"""

In [5]:
import asyncio
import os
from dotenv import load_dotenv
from operationCheatSheet import chunking_by_token_size, extract_entities
from utils import compute_mdhash_id
from geminillm import gemini_complete_if_cache
from openaillm import openai_embed, openai_complete
from faiss_impl import FaissVectorDBStorage
from networkx_impl import NetworkXStorage
from json_kv_iml import JsonKVStorage
from shared_storage import initialize_share_data
from cheatsheet import CHEATSHEETS
from grobidmonkey import reader

# Model identifier shared by the chunker, the LLM wrapper and the extraction
# config in process_text_to_knowledge_graph. The commented line is the
# previously used OpenAI model.
# LLM_MODEL_NAME = "gpt-3.5-turbo"
LLM_MODEL_NAME = "gemini-2.5-flash-preview-04-17"

# Presumably loads API credentials from a local .env file into the
# environment; verify which keys the LLM/embedding helpers expect.
load_dotenv()

async def process_text_to_knowledge_graph(text_content,
                                          cheatsheet_knowledge_graph_inst,
                                          file_path="unknown_source"):
    """
    Chunk a document, run LLM-based entity/relationship extraction over the
    chunks and return the graph storage that was handed to the extractor.

    Args:
        text_content: Raw document text to process.
        cheatsheet_knowledge_graph_inst: Cheatsheet graph, forwarded unchanged
            to ``extract_entities``.
        file_path: Source label recorded on every chunk.

    Returns:
        The ``NetworkXStorage`` instance passed to ``extract_entities``.
    """
    # 1. Split the text into overlapping token windows.
    chunk_list = chunking_by_token_size(
        content=text_content,
        split_by_character=None,
        split_by_character_only=False,
        overlap_token_size=128,
        max_token_size=1024,
        tiktoken_model=LLM_MODEL_NAME,
    )

    # 2. Key every chunk by a hash of its content and tag it with its origin.
    doc_id = compute_mdhash_id(text_content, prefix="doc-")
    chunks = {}
    for piece in chunk_list:
        chunk_key = compute_mdhash_id(piece["content"], prefix="chunk-")
        chunks[chunk_key] = {**piece, "full_doc_id": doc_id, "file_path": file_path}

    # 3. Storage back-ends. The graph and both vector stores share one config
    #    dict; the LLM cache gets its own, which also names the model.
    vector_config = {
        "working_dir": "/tmp",
        "embedding_batch_num": 64,
        "vector_db_storage_cls_kwargs": {
            "cosine_better_than_threshold": 0.2
        },
        "base_url": "https://api.openai.com/v1",
    }
    cache_config = {
        "working_dir": "/tmp",
        "llm_model_name": LLM_MODEL_NAME,
        "embedding_batch_num": 64,
        "vector_db_storage_cls_kwargs": {
            "cosine_better_than_threshold": 0.2
        },
        "base_url": "https://api.openai.com/v1",
    }
    graph_storage = NetworkXStorage(
        namespace="nx_kg", global_config=vector_config, embedding_func=openai_embed)
    entity_store = FaissVectorDBStorage(
        namespace="faiss_entity", global_config=vector_config, embedding_func=openai_embed)
    relationship_store = FaissVectorDBStorage(
        namespace="faiss_relationship", global_config=vector_config, embedding_func=openai_embed)
    response_cache = JsonKVStorage(
        namespace="llm_cache", global_config=cache_config, embedding_func=openai_embed)

    # Shared data is set up before any storage is initialised.
    initialize_share_data()

    await graph_storage.initialize()
    await entity_store.initialize()
    await relationship_store.initialize()
    await response_cache.initialize()

    # 4. LLM adapter handed to extract_entities. It calls the Gemini helper
    #    with the response cache attached; extra keyword arguments are
    #    accepted but not forwarded.
    async def llm_wrapper(prompt, history_messages=None, max_tokens=None, **kwargs):
        return await gemini_complete_if_cache(
            model=LLM_MODEL_NAME,
            prompt=prompt,
            history_messages=[] if history_messages is None else history_messages,
            hashing_kv=response_cache,
            temperature=0.2,
            max_tokens=max_tokens or 1024,
        )

    # 5. Extraction settings, including the entity types to look for.
    entity_types = [
        "Crop Type",
        "Crop Yield",
        "Climate Drivers",
        "Experimental Design",
        "Location",
        "Time",
    ]
    global_config = {
        "llm_model_func": llm_wrapper,
        "llm_model_name": LLM_MODEL_NAME,
        "entity_extract_max_gleaning": 2,
        "force_llm_summary_on_merge": False,
        "llm_model_max_token_size": 1024,
        "summary_to_max_tokens": 256,
        "tiktoken_model_name": LLM_MODEL_NAME,
        "addon_params": {
            "language": "English",
            "entity_types": entity_types,
        },
    }

    # 6. Status holder and lock passed to the extractor.
    pipeline_status = {"latest_message": "", "history_messages": []}
    pipeline_status_lock = asyncio.Lock()

    # 7. Run the extraction.
    await extract_entities(
        chunks=chunks,
        knowledge_graph_inst=graph_storage,
        entity_vdb=entity_store,
        relationships_vdb=relationship_store,
        global_config=global_config,
        pipeline_status=pipeline_status,
        pipeline_status_lock=pipeline_status_lock,
        llm_response_cache=response_cache,
        cheatsheet_knowledge_graph_inst=cheatsheet_knowledge_graph_inst,
        write_result_to_txt=True,
    )

    return graph_storage

# Example usage in a Jupyter notebook
async def main():
    """
    Run the extraction pipeline on the in-notebook ``content`` sample.

    Returns:
        The graph storage returned by ``process_text_to_knowledge_graph``.
    """
    # Disabled alternative input: read the section from a parsed TEI XML file.
    #   monkeyReader = reader.MonkeyReader('monkey') # or 'lxml' or 'x2d'
    #   essay = monkeyReader.readEssay('./xmldata/Byjesh.pdf.tei.xml')
    #   text = "\n".join(essay["Impact of fixed rise in temperature, CO 2 and change in rain fall"])
    return await process_text_to_knowledge_graph(content, nx_graph, "example.txt")

# Top-level `await` relies on the notebook's (IPython) autoawait support;
# in a plain Python script this would need asyncio.run(main()) instead.
knowledge_graph = await main()

No existing Faiss index file found. Starting fresh.
No existing Faiss index file found. Starting fresh.
INFO: Process 22792 Shared-Data created for Single Process
INFO: Process 22792 initialized updated flags for namespace: [nx_kg]
INFO: Process 22792 initialized updated flags for namespace: [faiss_entity]
INFO: Process 22792 initialized updated flags for namespace: [faiss_relationship]
INFO: Process 22792 initialized updated flags for namespace: [llm_cache]
INFO: Process 22792 ready to initialize storage namespace: [llm_cache]
  hint_prompt = fill_nightly_prompt.format(


Final result:
("entity"<|>"Maize Crop"<|>"Maize Crop"<|>"Crop Type"<|>"Results of simulation analysis indicate that maize yields in monsoon season are adversely affected due to rise in atmospheric temperatures in all three regions (Fig. 2a)."<|>"maize"<|><source_id><|><file_path>)##
("entity"<|>"Monsoon Maize Yield"<|>"Monsoon Maize Yield"<|>"Crop Yield"<|>"Results of simulation analysis indicate that maize yields in monsoon season are adversely affected due to rise in atmospheric temperatures in all three regions (Fig. 2a)."<|>"maize yields in monsoon season"<|><source_id><|><file_path>)##
("entity"<|>"Winter Maize Yield"<|>"Winter Maize Yield"<|>"Crop Yield"<|>"Maize crop during winter is provided with assured irrigation and thus yields about 1.5 times more than that of monsoon crop."<|>"Winter maize grain yield"<|><source_id><|><file_path>)##
("entity"<|>"Baseline Yield"<|>"Baseline Yield"<|>"Crop Yield"<|>"The mean baseline yield of rainfed maize crop is about 2 Mg ha⁻¹ in UIGP, wh

Entity extraction error: empty description for entity 'InfoCrop-MAIZE model' of type 'Experimental Design'
Entity extraction error: empty description for entity 'Monsoon Crop' of type 'Experimental Design'
Entity extraction error: empty description for entity 'Winter Crop' of type 'Experimental Design'


In [23]:
from utils import get_latest_result, read_knowledge_graph_from_pickle

# Load a specific saved extraction result rather than the most recent one.
# nodes, edges = get_latest_result()
# NOTE(review): the path is pinned to one particular run and, going by the
# helper's name, is a pickle file; only load files you trust.
nodes, edges = read_knowledge_graph_from_pickle("outputs/result_20250520_163337.pkl")

In [13]:
from operationCheatSheet import _link_relationships_across_entities
from openaillm import openai_embed

# Embedding function handed to the linking step.
embedding_func = openai_embed

# Link relationships across the loaded entities, passing the cheatsheet graph.
# `nodes` and `edges` come from the cell that loads the saved result, so that
# cell must run first.
linkage_nodes, linkage_edges = await _link_relationships_across_entities(
    nodes, edges, embedding_func, nx_graph)

In [7]:
import networkx as nx

# Rebuild the linked relationships as a NetworkX graph.
# Needs `linkage_edges` from the relationship-linking cell; running this cell
# before that one raises NameError.
G = nx.Graph()

for linked in linkage_edges:
    endpoints = (linked['src_id'], linked['tgt_id'])
    # Copy the descriptive fields onto the edge as attributes.
    attributes = {
        key: linked[key]
        for key in ('description', 'keywords', 'source_id', 'file_path')
    }
    G.add_edge(*endpoints, **attributes)

NameError: name 'linkage_edges' is not defined

In [15]:
# Descriptions of linked edges whose "type" is exactly ('Climate Drivers', 'Crop Type').
edge_description = [edge["description"] for edge in linkage_edges if edge["type"] == ('Climate Drivers', 'Crop Type')]


In [10]:
from rich.console import Console
from rich.text import Text

def highlight_with_rich(text, substrings, style="bold yellow", match_prefix_len=40):
    """
    Print ``text`` through Rich with every occurrence of the given substrings
    highlighted.

    Matching is loose: only the first ``match_prefix_len`` characters of each
    substring are searched for, but the highlight covers ``len(substring)``
    characters starting at the match position.

    Args:
        text: Full text to display.
        substrings: Iterable of strings to highlight.
        style: Rich style applied to each highlighted span.
        match_prefix_len: Number of leading characters of each substring used
            to locate a match (default 40).

    Returns:
        None. The styled text is printed via a Rich ``Console``.
    """
    console = Console()
    rich_text = Text(text)

    for substring in substrings:
        probe = substring[:match_prefix_len]
        if not probe:
            # str.find("") matches at `start` itself, so `start` would never
            # advance and the loop below would never terminate.
            continue

        start = 0
        while True:
            index = text.find(probe, start)
            if index == -1:
                break
            # Apply style to the substring
            rich_text.stylize(style, index, index + len(substring))
            # Resume after the highlighted span so matches do not overlap.
            start = index + len(substring)

    console.print(rich_text)

# edge_description = [edge["description"] for edge in linkage_edges if 'Crop Yield' in edge["type"]]
# node_description = [node["description"] for node in linkage_nodes if 'Crop Yield' in node["entity_type"]]

# Descriptions of every extracted node whose entity type mentions 'Crop Yield',
# in the iteration order of `nodes`.
node_description = [
    node["description"]
    for node_list in nodes.values()
    for node in node_list
    if 'Crop Yield' in node["entity_type"]
]


# Show the source passage with those descriptions highlighted.
highlight_with_rich(content, node_description)


In [26]:
# Collect (climate driver, crop yield, description) triples for every edge
# that joins a 'Climate Drivers' entity to a 'Crop Yield' entity, whichever
# direction the edge was stored in. Edges with an endpoint missing from
# `nodes` are ignored; the type of an entity is read from its first record.
edge_description = []
for edge_list in edges.values():
    for edge in edge_list:
        edge_src_id, edge_tgt_id = edge["src_id"], edge["tgt_id"]
        if edge_src_id not in nodes or edge_tgt_id not in nodes:
            continue

        src_type = nodes[edge_src_id][0]["entity_type"]
        tgt_type = nodes[edge_tgt_id][0]["entity_type"]
        if 'Climate Drivers' in src_type and 'Crop Yield' in tgt_type:
            driver_id, yield_id = edge_src_id, edge_tgt_id
        elif 'Climate Drivers' in tgt_type and 'Crop Yield' in src_type:
            driver_id, yield_id = edge_tgt_id, edge_src_id
        else:
            continue

        edge_description.append((driver_id, yield_id, edge["description"]))

edge_description

[('Atmospheric Temperatures',
  'Maize Yields',
  'Results of simulation analysis indicate that maize yields in monsoon season are adversely affected due to rise in atmospheric temperatures in all three regions (Fig. 2a).'),
 ('1, 2, 3, 4, 5°C Degrees Rise',
  'Projected Yield Loss in UIGP (7-33%)',
  'The mean baseline yield of rainfed maize crop is about 2 Mg ha⁻¹ in UIGP, where the projected yield loss is up to 7, 11, 15, 22, and 33% relative to baseline yields with 1, 2, 3, 4, 5°C degrees rise in atmospheric temperatures.'),
 ('1, 2, 3, 4, 5°C Degrees Rise',
  'Yield Reduction of 8–35%',
  'In MIGP region, yield reduction of about 8–35% with 1–5°C rise in atmospheric temperature is projected.'),
 ('1°C Rise in Temperature',
  'Projected Yield Reductions in SP (-10 to -35%)',
  'The SP region is also projected to experience adverse impact with -10, -15, -23, -27 and -35% reductions from the baseline yield levels at each 1°C rise in temperature.'),
 ('Increase in Carbon Dioxide',
  '

In [22]:
# Display the first stored record for the 'Maize' entity.
nodes['Maize'][0]

{'entity_name': 'Maize',
 'entity_type': 'Crop Type',
 'description': 'Results of simulation analysis indicate that maize yields in monsoon season are adversely affected due to rise in atmospheric temperatures in all three regions (Fig. 2a).',
 'source_id': 'chunk-110b2a3996a446fc39fb457d4214d315',
 'file_path': 'example.txt'}