In [4]:
#%%
import os
import json
from typing import List, Dict, Any, Tuple
import re
from llama_index.extractors.entity import EntityExtractor
from llama_index.core import SimpleDirectoryReader, Document, PropertyGraphIndex
from llama_index.core.graph_stores import SimplePropertyGraphStore
from llama_index.core.settings import Settings
from llama_index.core.storage.storage_context import StorageContext
from llama_index.core.schema import TextNode, NodeRelationship
from llama_index.llms.azure_openai import AzureOpenAI
from llama_index.core.llms import ChatMessage, MessageRole
import networkx as nx
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
from pyvis.network import Network
import nest_asyncio
from langchain_huggingface import HuggingFaceEmbeddings
from llama_index.llms.azure_openai import AzureOpenAI

from dotenv import load_dotenv
load_dotenv()


endpoint = "https://d-ais-eus-ais-chatbots.openai.azure.com/"
model_name = "o1-mini"
deployment = "o1-mini"
subscription_key = os.getenv("AZURE_OPENAI_API_KEY")
api_version = "2024-12-01-preview" # Use a valid API version

llm = AzureOpenAI(
    azure_endpoint=endpoint,
    api_key=subscription_key,
    api_version=api_version,
    deployment_name=deployment,
    model_name=model_name,
    temperature=1.0
)

embed_model=HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

llm2 = AzureOpenAI(
    azure_endpoint=endpoint,
    api_key=subscription_key,
    api_version="2024-05-01-preview",
    deployment_name="gpt-4o-mini-test",
    model_name="gpt-4o-mini-test",
    temperature=1.0
)

#%%
nest_asyncio.apply()

# Load documents
# def load_documents() -> List[Document]:
#     """Load markdown documents from a directory."""
#     reader = SimpleDirectoryReader("./few_examples/").load_data()
#     documents = reader
#     return documents

# documents = SimpleDirectoryReader("./kgdata/").load_data()
documents = SimpleDirectoryReader("./few_examples/").load_data()

# Initialize the graph store
graph_store = SimplePropertyGraphStore()
storage_context = StorageContext.from_defaults(graph_store=graph_store)

In [5]:
# Add filename and title to metadata for later use as ProtocolDocument properties
processed_documents = []
for doc in documents:
    filename = os.path.basename(doc.metadata.get('file_path', 'unknown.md'))
    # Attempt to extract title from the first line (assuming it's '# Title')
    first_line = doc.text.split('\n', 1)[0]
    title = first_line.lstrip('# ').strip() if first_line.startswith('#') else "Untitled Protocol"
    
    # Create a new Document or update metadata of existing one
    # LlamaIndex expects text in doc.text
    # We will create a ProtocolDocument node explicitly later
    doc.metadata['filename'] = filename
    doc.metadata['protocol_title'] = title
    processed_documents.append(doc)

In [15]:
processed_documents[0]

Document(id_='f06566ff-7f2b-485f-97c2-8bdd17ff3eb2', embedding=None, metadata={'file_path': 'c:\\Users\\newac\\OneDrive\\Desktop\\Master\\few_examples\\01.md', 'file_name': '01.md', 'file_size': 8702, 'creation_date': '2025-04-30', 'last_modified_date': '2025-04-21', 'filename': '01.md', 'protocol_title': '01 Unconscious adult – not breathing normally'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={}, metadata_template='{key}: {value}', metadata_separator='\n', text_resource=MediaResource(embeddings=None, data=None, text='# 01 Unconscious adult – not breathing normally\n\n## CRITERIA\n- Critical | Unconscious adult, not breathing normally\n\n## SITUATIONAL GUIDANCE & IMPORTANT TO ASCERTAIN\n- Help is on the way as I speak to you.\n- You must start CPR (revivin

In [123]:
# --- Define Node and Relation Types for the Extractor ---
# This helps guide the LLM. Be as specific as possible.
node_types = [
    "ProtocolDocument", "Criterion", "Advice", "Scenario", "Action",
    "Procedure", "MedicalCondition", "Symptom", "MedicalDevice",
    "Medication", "PatientQualifier", "InformationResource", "Question",
    "InstructionDetail"
]

# Define relationships as (HeadEntityType, RelationType, TailEntityType)
# This is crucial for LLMPropertyGraphExtractor
relation_schemas = [
    ("ProtocolDocument", "HAS_CRITERION", "Criterion"),
    ("ProtocolDocument", "HAS_ADVICE", "Advice"),
    ("Criterion", "TRIGGERS_ADVICE", "Advice"), # Or link via advice_ids
    ("ProtocolDocument", "DEFINES_SCENARIO", "Scenario"),
    ("Scenario", "LEADS_TO_ACTION", "Action"), # Renamed for clarity
    ("Action", "IS_A_PROCEDURE", "Procedure"),
    ("Action", "USES_DEVICE", "MedicalDevice"),
    ("Action", "ADMINISTERS_MEDICATION", "Medication"),
    ("Action", "APPLIES_TO_PATIENT_TYPE", "PatientQualifier"), # Renamed
    ("Procedure", "HAS_INSTRUCTION", "InstructionDetail"), # Renamed
    ("Criterion", "SIGNALS_CONDITION", "MedicalCondition"), # Renamed
    ("Criterion", "PRESENTS_SYMPTOM", "Symptom"), # Renamed
    ("MedicalCondition", "HAS_SYMPTOM", "Symptom"),
    ("MedicalCondition", "MAY_BE_TREATED_WITH", "Procedure"), # Renamed
    ("ProtocolDocument", "CONCERNS_PATIENT_TYPE", "PatientQualifier"), # Renamed
    ("ProtocolDocument", "MENTIONS_RESOURCE", "InformationResource"), # Renamed
    ("ProtocolDocument", "SUGGESTS_QUESTION", "Question"), # Renamed
]


In [124]:
# --- Advanced CoT Prompting Design ---
kg_extraction_prompt_template = """
You are an expert in medical emergency protocols and knowledge graph generation.
Your task is to extract structured information from the provided text chunk of a medical protocol.
The protocol is titled: "{protocol_title}" from file: "{filename}".

Consider the overall context of this protocol when extracting information.
The main 'ProtocolDocument' node for this text is "{protocol_title}" (filename: "{filename}").
All extracted entities should, where appropriate, be linked to this 'ProtocolDocument' node with a 'PART_OF_PROTOCOL' relationship if no other specific relationship to it is identified.

Allowed Node Types:
{node_types_str}

Allowed Relation Triplets (HeadEntityType, RelationType, TailEntityType):
{relation_schemas_str}

For the given text:
---
{text_chunk}
---

Extract entities and relationships according to the allowed types and schemas.
Entities should have a 'label' (their name or description) and a 'type'.
Extract relevant properties for nodes if they are clearly stated (e.g., 'level' for Criterion, 'advice_id' for Advice).
Relationships should connect two entities with one of the allowed relation types.

Think step by step:
1. Identify potential entities from the text that match the allowed node types. Assign a clear label.
2. For each entity, extract its properties if specified in the schema or clearly available.
3. Identify relationships between these entities based on the allowed relation schemas. Ensure the relationship makes sense in the context of the protocol.
4. Ensure that extracted entities that are components of the protocol are linked to the main 'ProtocolDocument' node for this file ({protocol_title}, {filename}) with a 'PART_OF_PROTOCOL' relationship if no other direct relationship to the ProtocolDocument is identified.

Output the extracted information as a JSON object with two keys: "entities" and "relationships".
"entities": A list of JSON objects, each with "label", "type", and an optional "properties" dictionary.
"relationships": A list of JSON objects, each with "source_label", "relation_label", and "target_label".

Example entity: {{"label": "Start CPR", "type": "Action", "properties": {{"target_patient": "Adult"}}}}
Example relationship: {{"source_label": "Unconscious adult, not breathing normally", "relation_label": "REQUIRES_ATTENTION", "target_label": "Start CPR"}}

"""

In [125]:
# Format node_types and relation_schemas for the prompt
node_types_str_prompt = "\n".join(f"- {nt}" for nt in node_types)
relation_schemas_str_prompt = "\n".join(f"- ({h}, {r}, {t})" for h, r, t in relation_schemas)

In [126]:
# Create a list to hold all nodes and relationships from all documents
all_graph_nodes = []
all_graph_relationships = []

In [127]:
protocol_document_nodes_map = {}
for i, doc in enumerate(processed_documents):
    filename = doc.metadata['filename']
    title = doc.metadata['protocol_title']
    # Use a unique ID for the protocol document node, e.g., filename
    # The label for the node will be its title.
    protocol_node_label = title # This will be the 'label' LLM uses to refer to it
    protocol_document_nodes_map[filename] = protocol_node_label
    
    node_data = {
        "label": protocol_node_label, # What the LLM will use to refer to it
        "type": "ProtocolDocument",
        "properties": {
            "id": filename, # Actual unique ID for the node in the graph
            "filename": filename,
            "title": title
        }
    }
    all_graph_nodes.append(node_data)

In [128]:
for doc in processed_documents:
    filename = doc.metadata['filename']
    protocol_title = doc.metadata['protocol_title'] # This is the label for the ProtocolDocument node

    # Prepare the specific prompt for this document
    current_prompt = kg_extraction_prompt_template.format(
        protocol_title=protocol_title,
        filename=filename,
        node_types_str=node_types_str_prompt,
        relation_schemas_str=relation_schemas_str_prompt,
        text_chunk=doc.text # Pass the full document text for now
    )

    # Initialize extractor for each document to pass the custom prompt
    # Note: This is a conceptual way. LLMPropertyGraphExtractor might not directly take a fully formatted prompt string
    # but rather a prompt template where it fills {text_chunk}. You might need to subclass or adapt.
    # A more direct way is to make the LLM call yourself and parse its JSON output.

    # Let's try a more direct LLM call for better prompt control
    response = llm.complete(current_prompt)
    llm_output_text = response.text  # Get the raw text from the LLM

    # --- START: Clean the LLM output ---
    cleaned_json_text = llm_output_text.strip()
    if cleaned_json_text.startswith("```json"):
        cleaned_json_text = cleaned_json_text[7:]  # Remove ```json and potential newline
    elif cleaned_json_text.startswith("```"):
        cleaned_json_text = cleaned_json_text[3:]   # Remove ``` and potential newline
    
    if cleaned_json_text.endswith("```"):
        cleaned_json_text = cleaned_json_text[:-3] # Remove trailing ```
    
    cleaned_json_text = cleaned_json_text.strip() # Ensure no leading/trailing whitespace remains
    # --- END: Clean the LLM output ---
    
    try:
        # Now parse the cleaned text
        extracted_data = json.loads(cleaned_json_text) 
        
        for entity in extracted_data.get("entities", []):
            # --- STRIP LABELS ---
            entity["label"] = entity["label"].strip() 
            if "name" in entity: # If you have a separate name field
                entity["name"] = entity["name"].strip()
            # --- END STRIP ---
            entity_id = entity.get("properties", {}).get("id", entity["label"]) 
            if "properties" not in entity: entity["properties"] = {}
            entity["properties"]["id"] = entity_id 
            all_graph_nodes.append(entity)

        for rel in extracted_data.get("relationships", []):
            # --- STRIP LABELS ---
            rel["source_label"] = rel["source_label"].strip()
            rel["target_label"] = rel["target_label"].strip()
            rel["relation_label"] = rel["relation_label"].strip()
            # --- END STRIP ---
            all_graph_relationships.append(rel)
        
        print(f"Cleaned Text Snippet for JSON Parsing: {cleaned_json_text}")
            
    except json.JSONDecodeError as e:
        print(f"Error decoding JSON from LLM for {filename}: {e}")
        print(f"Original LLM Response Snippet: {response.text[:500]}...") 
        print(f"Cleaned Text Snippet for JSON Parsing: {cleaned_json_text[:500]}...")
    except Exception as e:
        print(f"An unexpected error occurred processing {filename}: {e}")



Error decoding JSON from LLM for 01.md: Expecting property name enclosed in double quotes: line 378 column 6 (char 12209)
Original LLM Response Snippet: ```json
{
  "entities": [
    {
      "label": "01 Unconscious adult – not breathing normally",
      "type": "ProtocolDocument",
      "properties": {
        "filename": "01.md"
      }
    },
    {
      "label": "Unconscious adult, not breathing normally",
      "type": "Criterion",
      "properties": {
        "level": "Critical"
      }
    },
    {
      "label": "Help is on the way as I speak to you.",
      "type": "Advice"
    },
    {
      "label": "You must start CPR (reviving the ...
Cleaned Text Snippet for JSON Parsing: {
  "entities": [
    {
      "label": "01 Unconscious adult – not breathing normally",
      "type": "ProtocolDocument",
      "properties": {
        "filename": "01.md"
      }
    },
    {
      "label": "Unconscious adult, not breathing normally",
      "type": "Criterion",
      "properties": {
  

KeyboardInterrupt: 

In [119]:
all_graph_nodes

[{'label': '01 Unconscious adult – not breathing normally',
  'type': 'ProtocolDocument',
  'properties': {'id': '01.md',
   'filename': '01.md',
   'title': '01 Unconscious adult – not breathing normally'}},
 {'label': '05 Mental health issue',
  'type': 'ProtocolDocument',
  'properties': {'id': '05.md',
   'filename': '05.md',
   'title': '05 Mental health issue'}},
 {'label': '23 Poisoning – not related to alcohol or drugs',
  'type': 'ProtocolDocument',
  'properties': {'id': '23.md',
   'filename': '23.md',
   'title': '23 Poisoning – not related to alcohol or drugs'}},
 {'label': '01 Unconscious adult – not breathing normally',
  'type': 'ProtocolDocument',
  'properties': {'filename': '01.md',
   'id': '01 Unconscious adult – not breathing normally'}},
 {'label': 'Critical',
  'type': 'Criterion',
  'properties': {'level': 'Critical', 'id': 'Critical'}},
 {'label': 'Unconscious adult, not breathing normally',
  'type': 'Symptom',
  'properties': {'id': 'Unconscious adult, not b

In [120]:
all_graph_relationships

[{'source_label': '01 Unconscious adult – not breathing normally',
  'relation_label': 'HAS_CRITERION',
  'target_label': 'Critical'},
 {'source_label': 'Critical',
  'relation_label': 'PRESENTS_SYMPTOM',
  'target_label': 'Unconscious adult, not breathing normally'},
 {'source_label': 'Critical',
  'relation_label': 'SIGNALS_CONDITION',
  'target_label': 'Heart arrest'},
 {'source_label': '01 Unconscious adult – not breathing normally',
  'relation_label': 'HAS_ADVICE',
  'target_label': 'Help is on the way as I speak to you.'},
 {'source_label': '01 Unconscious adult – not breathing normally',
  'relation_label': 'HAS_ADVICE',
  'target_label': 'Start CPR'},
 {'source_label': 'Start CPR',
  'relation_label': 'APPLIES_TO_PATIENT_TYPE',
  'target_label': 'Adult'},
 {'source_label': 'Start CPR',
  'relation_label': 'PART_OF_PROTOCOL',
  'target_label': '01 Unconscious adult – not breathing normally'},
 {'source_label': 'Defibrillator',
  'relation_label': 'PART_OF_PROTOCOL',
  'target_l

In [111]:
from llama_index.core.graph_stores.types import EntityNode, ChunkNode, Relation

graph_data_nodes = []
for node_info in all_graph_nodes:
    node_properties = node_info.get("properties", {})
    # Ensure 'id' is present, fallback to label if necessary for uniqueness
    node_id_val = node_properties.get("id", node_info["label"]).strip()
    node_properties["id"] = node_id_val # Standardize where ID is stored

    graph_data_nodes.append(
        EntityNode(
            label=node_info["label"], # This will be the node_id in SimplePropertyGraphStore
            properties=node_properties,
            name=node_info["type"],
            
        )
    )

graph_data_relations = []
for rel_info in all_graph_relationships:
    graph_data_relations.append(
        Relation(
            source_id=rel_info["source_label"].strip(), # Must match a GraphNode.label
            target_id=rel_info["target_label"].strip(), # Must match a GraphNode.label
            label=rel_info["relation_label"].strip(), # This is the relation type
            # properties=rel_info.get("properties", {}) # If relations have properties
        )
    )


In [112]:
graph_store = SimplePropertyGraphStore()
graph_store.upsert_nodes(graph_data_nodes)
graph_store.upsert_relations(graph_data_relations)

print("Nodes and relationships added to SimplePropertyGraphStore.")

Nodes and relationships added to SimplePropertyGraphStore.


In [113]:
# --- Build the PropertyGraphIndex from the populated store ---
index = PropertyGraphIndex.from_existing(
    property_graph_store=graph_store,
    llm=llm, # For querying later
    # embed_model=None # For querying later
    embed_kg_nodes=False
)
print("PropertyGraphIndex built.")

PropertyGraphIndex built.


In [114]:
index.property_graph_store.save_networkx_graph(name="./kg_manualv3.html")

In [122]:
query_engine = index.as_query_engine(
        llm=llm,
        # include_text=False # Set to True if you want source text with nodes
    )
protocol_title_to_query = "01 Unconscious adult – not breathing normally" # Example
response = query_engine.query(f"What are the CRITICAL criteria for the protocol '{protocol_title_to_query}'?")
print("\nQuery Response for Critical Criteria:", response)

KeyError: '05 Mental health issue_MENTIONS_RESOURCE_UKOM report  Somatisk helse hos pasienter med alvorlig psykisk lidelse '

In [8]:
CoT_prompt_template = """
-Goal-
Given a text document containing medical protocols or guidelines, identify all entities of the specified medical types and all relationships among the identified entities to build a knowledge graph.

I will be using this data to generate a knowledge graph, detect communities and summarise the communities
SO PLEAE BE AWARE OF THE FUTURE PROSPECTS OF THIS PROJECT.
 
 -Steps-
1.  Identify all entities within the text. For each identified entity, extract the following information:
    *   `entity_name`: Name of the entity, capitalized (e.g., CPR, UNCONSCIOUSNESS, NEWBORN).
    *   `entity_type`: One of the relevant medical types provided below.
    *   `entity_description`: Comprehensive description of the entity's attributes, purpose, or actions as described in the text.
    Format each entity as ("entity"{{tuple_delimiter}}<entity_name>{{tuple_delimiter}}<entity_type>{{tuple_delimiter}}<entity_description>)

2.  From the entities identified in step 1, identify all pairs of (source_entity, target_entity) that are *clearly and directly related* within the context of the provided text.
    For each pair of related entities, extract the following information:
    *   `source_entity`: name of the source entity, as identified in step 1.
    *   `target_entity`: name of the target entity, as identified in step 1.
    *   `relationship_description`: explanation in English describing the nature of the relationship between the source and target entity based *only* on the text (e.g., "is a symptom of", "is used to treat", "is performed on", "is a type of", "is indicated for", "uses", "should be checked in case of").
    *   `relationship_strength`: an integer score between 1 (weakly related) to 10 (very strongly and explicitly related).
    Format each relationship as ("relationship"{{tuple_delimiter}}<source_entity>{{tuple_delimiter}}<target_entity>{{tuple_delimiter}}<relationship_description>{{tuple_delimiter}}<relationship_strength>)

3.  Return the output as a single list containing all identified entities and relationships. Use **{{record_delimiter}}** as the delimiter between each entity or relationship record. The primary language of the provided text is mixed English and Norwegian.

4.  Translate Norwegian descriptions into English for the `entity_description` and `relationship_description` fields *only*. Keep entity names and types consistent (preferably English where obvious equivalents exist, otherwise use the capitalized term from the text).

5.  When finished, output {{completion_delimiter}}.

-Relevant Medical Entity Types-
[Medical Condition, Symptom, Patient Group, Medical Procedure, Medical Device/Tool, Medication/Substance, Anatomical Location, Guideline Section, Organization/Role, Medical Concept]

-Examples-
######################

Example 1 (Based on 01.md/02.md):

entity_types: [Medical Condition, Symptom, Patient Group, Medical Procedure, Medical Device/Tool, Medication/Substance, Anatomical Location, Guideline Section, Organization/Role, Medical Concept]
text:
# 01 Unconscious adult – not breathing normally
## CRITERIA
- Critical | Unconscious adult, not breathing normally
## SITUATIONAL GUIDANCE & IMPORTANT TO ASCERTAIN
- Help is on the way as I speak to you.
- You must start CPR (reviving the person). I will tell you what to do.
- Don’t hang up, put the phone on speaker if you can.
- If there is defibrillator at hand, get someone else to fetch it. Check Hjertestarterregisteret (the caller must not fetch a defibrillator / AED if alone)
## EMERGENCY RESPONSE
### SCENARIO
- BCPR (CARDIO PULMONARY RESUSCITATION)
### IF YES
- Push down at this rate 30 times.
- Now give rescue breaths.
- Tilt the head back with one hand on the forehead.
- Lift the chin up with the other hand.
- Pinch the nose and give 2 gentle rescue breaths.
- Continue with 30 pushes and 2 rescue breaths until medics take over or the person wakes up.
- Lay the person on the floor, on his / her back.
- Kneel beside the person’s chest.
- Place your hands in the middle of his / her chest... Push down hard...
------------------------
output:
("entity"{{tuple_delimiter}}UNCONSCIOUS ADULT{{tuple_delimiter}}Patient Group{{tuple_delimiter}}An adult patient who is unconscious and not breathing normally, requiring critical intervention.){{record_delimiter}}
("entity"{{tuple_delimiter}}NOT BREATHING NORMALLY{{tuple_delimiter}}Symptom{{tuple_delimiter}}A critical symptom indicating lack of normal respiration, often associated with unconsciousness.){{record_delimiter}}
("entity"{{tuple_delimiter}}CPR{{tuple_delimiter}}Medical Procedure{{tuple_delimiter}}Cardiopulmonary Resuscitation, a life-saving procedure involving chest compressions and rescue breaths, instructed by the call handler.){{record_delimiter}}
("entity"{{tuple_delimiter}}CHEST COMPRESSIONS{{tuple_delimiter}}Medical Procedure{{tuple_delimiter}}A component of CPR involving pushing down hard on the center of the chest 30 times.){{record_delimiter}}
("entity"{{tuple_delimiter}}RESCUE BREATHS{{tuple_delimiter}}Medical Procedure{{tuple_delimiter}}A component of CPR involving tilting the head, lifting the chin, pinching the nose, and giving 2 gentle breaths into the mouth.){{record_delimiter}}
("entity"{{tuple_delimiter}}AED{{tuple_delimiter}}Medical Device/Tool{{tuple_delimiter}}Automated External Defibrillator (also referred to as defibrillator or Hjertestarter), a device to be fetched if available and if someone else is present.){{record_delimiter}}
("entity"{{tuple_delimiter}}CALLER{{tuple_delimiter}}Organization/Role{{tuple_delimiter}}The person calling for help who is instructed to perform CPR.){{record_delimiter}}
("entity"{{tuple_delimiter}}MEDICS{{tuple_delimiter}}Organization/Role{{tuple_delimiter}}Emergency medical personnel who are on their way and will take over CPR upon arrival.){{record_delimiter}}
("entity"{{tuple_delimiter}}CRITERIA{{tuple_delimiter}}Guideline Section{{tuple_delimiter}}Section defining the conditions under which this protocol applies, such as an unconscious adult not breathing normally.){{record_delimiter}}
("entity"{{tuple_delimiter}}CHEST{{tuple_delimiter}}Anatomical Location{{tuple_delimiter}}The location on the body (middle of the chest) where chest compressions are applied during CPR.){{record_delimiter}}
("entity"{{tuple_delimiter}}HEAD{{tuple_delimiter}}Anatomical Location{{tuple_delimiter}}Body part manipulated during rescue breaths (tilt the head back).){{record_delimiter}}
("entity"{{tuple_delimiter}}CHIN{{tuple_delimiter}}Anatomical Location{{tuple_delimiter}}Body part manipulated during rescue breaths (lift the chin up).){{record_delimiter}}
("relationship"{{tuple_delimiter}}UNCONSCIOUS ADULT{{tuple_delimiter}}NOT BREATHING NORMALLY{{tuple_delimiter}}Is a defining symptom for this patient group according to the criteria.{{tuple_delimiter}}9){{record_delimiter}}
("relationship"{{tuple_delimiter}}CPR{{tuple_delimiter}}UNCONSCIOUS ADULT{{tuple_delimiter}}Is the required procedure for an unconscious adult not breathing normally.{{tuple_delimiter}}10){{record_delimiter}}
("relationship"{{tuple_delimiter}}CPR{{tuple_delimiter}}CHEST COMPRESSIONS{{tuple_delimiter}}Incorporates chest compressions as a key component (30 pushes).{{tuple_delimiter}}9){{record_delimiter}}
("relationship"{{tuple_delimiter}}CPR{{tuple_delimiter}}RESCUE BREATHS{{tuple_delimiter}}Incorporates rescue breaths as a key component (2 breaths).{{tuple_delimiter}}9){{record_delimiter}}
("relationship"{{tuple_delimiter}}CALLER{{tuple_delimiter}}CPR{{tuple_delimiter}}Is instructed to perform CPR.{{tuple_delimiter}}8){{record_delimiter}}
("relationship"{{tuple_delimiter}}AED{{tuple_delimiter}}CPR{{tuple_delimiter}}Should be fetched and used during CPR if available and feasible.{{tuple_delimiter}}7){{record_delimiter}}
("relationship"{{tuple_delimiter}}CHEST COMPRESSIONS{{tuple_delimiter}}CHEST{{tuple_delimiter}}Are applied to the middle of the chest.{{tuple_delimiter}}9){{record_delimiter}}
("relationship"{{tuple_delimiter}}RESCUE BREATHS{{tuple_delimiter}}HEAD{{tuple_delimiter}}Involves tilting the head back.{{tuple_delimiter}}8){{record_delimiter}}
("relationship"{{tuple_delimiter}}RESCUE BREATHS{{tuple_delimiter}}CHIN{{tuple_delimiter}}Involves lifting the chin up.{{tuple_delimiter}}8){{record_delimiter}}
("relationship"{{tuple_delimiter}}MEDICS{{tuple_delimiter}}CPR{{tuple_delimiter}}Will take over CPR from the caller upon arrival.{{tuple_delimiter}}8)
{{completion_delimiter}}
#############################

Example 2 (Based on 20.md):

entity_types: [Medical Condition, Symptom, Patient Group, Medical Procedure, Medical Device/Tool, Medication/Substance, Anatomical Location, Guideline Section, Organization/Role, Medical Concept]
text:
# 20 Diabetes
## CRITERIA
- Critical | Drowsy (decreased level of consciousness) - May have a low blood sugar (hypo) | 1.2.3.5.6
## ADVICE
### Advice 5. IF THE PERSON HAS A HYPO
If necessary and the person has a glucagon injection or nasal spray:
– Give one dose (1 mg) Glucagon...
### Advice 7. DROWSY OR DAZED AND UNABLE TO DRINK
– Do not force the person to drink...
– Alternatively, you can put one or two spoonfuls of honey in the mouth. You can also spread honey or granulated sugar on the gums, between the lips and the teeth.
### Advice 8. THE PERSON IS AWAKE ENOUGH TO DRINK
– Give the person several glasses of sugary drink e.g. fizzy drink, cordial, juice or milk.
## INFORMATION
### HYPOGLYKEMI – LAVT BLODSUKKER
Når blodsukkeret synker under 4 mmol/l... Hvis blodsukkeret faller ytterligere under 3 mmol/l, opptrer føling (insulinføling)... Pas. kan hurtig bli sløv, bevisstløs eller få kramper. Behandlingen er rask tilførsel av sukker eller Glukagon®
------------------------
output:
("entity"{{tuple_delimiter}}DIABETES{{tuple_delimiter}}Medical Condition{{tuple_delimiter}}The underlying medical condition being addressed in this protocol.){{record_delimiter}}
("entity"{{tuple_delimiter}}HYPOGLYCEMIA{{tuple_delimiter}}Medical Condition{{tuple_delimiter}}Low blood sugar (føling/insulinføling), defined as blood sugar below 4 mmol/L or 3 mmol/L, potentially causing drowsiness, unconsciousness, or seizures. Referred to as 'hypo'.){{record_delimiter}}
("entity"{{tuple_delimiter}}DROWSINESS{{tuple_delimiter}}Symptom{{tuple_delimiter}}Decreased level of consciousness, listed as a critical criterion possibly indicating hypoglycemia.){{record_delimiter}}
("entity"{{tuple_delimiter}}GLUCAGON{{tuple_delimiter}}Medication/Substance{{tuple_delimiter}}A medication administered via injection (1mg or 0.5mg dose) or nasal spray (3mg) to treat severe hypoglycemia if available.){{record_delimiter}}
("entity"{{tuple_delimiter}}GLUCAGON KIT{{tuple_delimiter}}Medical Device/Tool{{tuple_delimiter}}Refers to the glucagon injection or nasal spray kit the person might have.){{record_delimiter}}
("entity"{{tuple_delimiter}}HONEY{{tuple_delimiter}}Medication/Substance{{tuple_delimiter}}A sugary substance that can be placed in the mouth or on the gums of a drowsy person unable to drink.){{record_delimiter}}
("entity"{{tuple_delimiter}}GRANULATED SUGAR{{tuple_delimiter}}Medication/Substance{{tuple_delimiter}}A sugary substance that can be spread on the gums of a drowsy person unable to drink.){{record_delimiter}}
("entity"{{tuple_delimiter}}SUGARY DRINK{{tuple_delimiter}}Medication/Substance{{tuple_delimiter}}Drinks like fizzy drinks, cordial, juice, or milk given to a person awake enough to drink to raise blood sugar.){{record_delimiter}}
("entity"{{tuple_delimiter}}GUMS{{tuple_delimiter}}Anatomical Location{{tuple_delimiter}}Location in the mouth where honey or granulated sugar can be applied for absorption in a drowsy patient.){{record_delimiter}}
("entity"{{tuple_delimiter}}ADVICE{{tuple_delimiter}}Guideline Section{{tuple_delimiter}}Section providing instructions on how to manage specific situations like hypoglycemia.){{record_delimiter}}
("entity"{{tuple_delimiter}}INFORMATION{{tuple_delimiter}}Guideline Section{{tuple_delimiter}}Section providing background information on conditions like hypoglycemia.){{record_delimiter}}
("relationship"{{tuple_delimiter}}HYPOGLYCEMIA{{tuple_delimiter}}DIABETES{{tuple_delimiter}}Is a potential complication or state related to Diabetes.{{tuple_delimiter}}9){{record_delimiter}}
("relationship"{{tuple_delimiter}}DROWSINESS{{tuple_delimiter}}HYPOGLYCEMIA{{tuple_delimiter}}Is listed as a potential symptom or consequence of Hypoglycemia.{{tuple_delimiter}}8){{record_delimiter}}
("relationship"{{tuple_delimiter}}GLUCAGON{{tuple_delimiter}}HYPOGLYCEMIA{{tuple_delimiter}}Is a treatment for severe Hypoglycemia.{{tuple_delimiter}}10){{record_delimiter}}
("relationship"{{tuple_delimiter}}HONEY{{tuple_delimiter}}HYPOGLYCEMIA{{tuple_delimiter}}Is an alternative treatment for Hypoglycemia in drowsy patients unable to drink.{{tuple_delimiter}}7){{record_delimiter}}
("relationship"{{tuple_delimiter}}GRANULATED SUGAR{{tuple_delimiter}}HYPOGLYCEMIA{{tuple_delimiter}}Is an alternative treatment for Hypoglycemia in drowsy patients unable to drink.{{tuple_delimiter}}7){{record_delimiter}}
("relationship"{{tuple_delimiter}}SUGARY DRINK{{tuple_delimiter}}HYPOGLYCEMIA{{tuple_delimiter}}Is a treatment for Hypoglycemia in patients awake enough to drink.{{tuple_delimiter}}8){{record_delimiter}}
("relationship"{{tuple_delimiter}}GLUCAGON KIT{{tuple_delimiter}}GLUCAGON{{tuple_delimiter}}Is the delivery method for Glucagon medication.{{tuple_delimiter}}9){{record_delimiter}}
("relationship"{{tuple_delimiter}}HONEY{{tuple_delimiter}}GUMS{{tuple_delimiter}}Can be applied to the gums for absorption.{{tuple_delimiter}}7){{record_delimiter}}
("relationship"{{tuple_delimiter}}GRANULATED SUGAR{{tuple_delimiter}}GUMS{{tuple_delimiter}}Can be applied to the gums for absorption.{{tuple_delimiter}}7)
{{completion_delimiter}}
#############################

-Real Data-
######################
entity_types: [Medical Condition, Symptom, Patient Group, Medical Procedure, Medical Device/Tool, Medication/Substance, Anatomical Location, Guideline Section, Organization/Role, Medical Concept]
text: {input_text}
######################
output:
"""

In [57]:
cot_prompt = """ 
---Goal---
Given an emergency medical document or call transcript, identify all key entities and relationships that are critical for emergency response coordination and decision-making.
Use English as output language.

---Steps---

1. Identify all entities related to emergency medical response. For each identified entity, extract the following information:

entity_name: Name of the entity, use same language as input text. If English, capitalize the name.

entity_type: One of the following types: [medical_condition, symptom, treatment, procedure, emergency_protocol, response_category, patient_characteristic, location, equipment, severity_level, medication, vital_sign, time_factor, healthcare_professional, communication_method]

entity_description: Comprehensive description of the entity including its characteristics, significance in emergency responses, and potential impact on patient care.

Format each entity as ("entity"|<entity_name>|<entity_type>|<entity_description>)

2. From the entities identified in step 1, identify all pairs of (source_entity, target_entity) that are clearly related to each other in an emergency response context.
For each pair of related entities, extract the following information:

source_entity: name of the source entity, as identified in step 1

target_entity: name of the target entity, as identified in step 1

relationship_description: explanation of how these entities are connected in emergency response contexts and why this relationship is important for effective care

relationship_strength: a numeric score (1-10) indicating how critical this relationship is for emergency responders to understand

relationship_keywords: key words that summarize the nature of the relationship, focusing on emergency medicine concepts or protocols

Format each relationship as ("relationship"|<source_entity>|<target_entity>|<relationship_description>|<relationship_keywords>|<relationship_strength>)

3. Identify high-level emergency response concepts, themes, or topics from the entire text. These should capture the overarching emergency protocols, critical decision points, or priority assessment frameworks present in the document.
Format the content-level key words as ("content_keywords"|<high_level_keywords>)

Return output in English as a single list of all the entities and relationships identified in steps 1 and 2. Use ; as the list delimiter.

When finished, output ###END###

---Example---

Entity_types: [medical_condition, symptom, treatment, procedure, emergency_protocol, response_category, patient_characteristic, location, equipment, severity_level, medication, vital_sign, time_factor, healthcare_professional, communication_method]

Text:
Unconscious adult – not breathing normally
CRITERIA
- Critical | Unconscious adult, not breathing normally
SITUATIONAL GUIDANCE & IMPORTANT TO ASCERTAIN
- Help is on the way as I speak to you.
- You must start CPR (reviving the person). I will tell you what to do.
- Don't hang up, put the phone on speaker if you can.
- If there is defibrillator at hand, get someone else to fetch it.
- If you suspect a blocked airway, open the mouth to see if you can remove any object.
SCENARIO - AM I (RESCUER) TRAINED IN CPR?
IF YES
- Push down at this rate 30 times.
- Now give rescue breaths.
- Tilt the head back with one hand on the forehead.
- Lift the chin up with the other hand.
- Pinch the nose and give 2 gentle rescue breaths.
- Continue with 30 pushes and 2 rescue breaths until medics take over or the person wakes up.

Output:
### start ###
("entity"|"Unconscious Adult"|"patient_characteristic"|"An adult patient who is unresponsive to stimuli, representing a critical emergency situation requiring immediate intervention.");
("entity"|"Not Breathing Normally"|"symptom"|"Abnormal or absent respiratory pattern in a patient, a life-threatening condition requiring immediate resuscitation efforts.");
("entity"|"CPR"|"procedure"|"Cardiopulmonary resuscitation, a lifesaving technique involving chest compressions and rescue breaths for someone who is unconscious and not breathing normally.");
("entity"|"Defibrillator"|"equipment"|"A device that delivers an electric shock to restore normal heart rhythm in someone experiencing cardiac arrest, significantly improving survival chances when used early.");
("entity"|"Blocked Airway"|"medical_condition"|"Obstruction of the respiratory tract that prevents normal breathing, requiring immediate intervention to prevent asphyxiation.");
("entity"|"30 Compressions"|"emergency_protocol"|"Standard CPR protocol requiring 30 chest compressions delivered at the proper rate and depth before rescue breaths are given.");
("entity"|"2 Rescue Breaths"|"emergency_protocol"|"Ventilation procedure in CPR where two breaths are given after 30 compressions, ensuring oxygen delivery to the patient's lungs.");
("entity"|"Head Tilt-Chin Lift"|"procedure"|"Airway management technique where the head is tilted backward and the chin lifted forward to open the airway before giving rescue breaths.");
("entity"|"Critical Response"|"response_category"|"Highest priority emergency response for life-threatening situations requiring immediate medical intervention.");
("relationship"|"Unconscious Adult"|"Not Breathing Normally"|"These conditions typically occur together in cardiac arrest situations and trigger the need for immediate CPR."|"cardiac arrest, resuscitation trigger, life-threatening"|10);
("relationship"|"CPR"|"30 Compressions"|"CPR protocol specifies delivering 30 chest compressions as the first stage of the resuscitation cycle."|"resuscitation protocol, compression cycle, cardiac support"|9);
("relationship"|"CPR"|"2 Rescue Breaths"|"After chest compressions, CPR requires delivering 2 rescue breaths to provide oxygen to the patient."|"ventilation protocol, oxygenation, airway management"|8);
("relationship"|"Head Tilt-Chin Lift"|"2 Rescue Breaths"|"The head tilt-chin lift maneuver must be performed before rescue breaths to ensure an open airway for effective ventilation."|"airway management, ventilation preparation, respiration support"|9);
("relationship"|"Unconscious Adult"|"Critical Response"|"An unconscious adult not breathing normally represents a critical emergency requiring the highest priority response."|"triage priority, emergency classification, response urgency"|10);
("relationship"|"Blocked Airway"|"CPR"|"A blocked airway must be addressed before or during CPR to ensure effectiveness of resuscitation efforts."|"airway clearance, resuscitation prerequisite, ventilation requirement"|9);
("relationship"|"Defibrillator"|"CPR"|"When available, a defibrillator should be used in conjunction with CPR to increase survival chances in cardiac arrest."|"advanced life support, cardiac intervention, survival improvement"|10);
("content_keywords"|"cardiac arrest response, CPR protocol, critical life support, airway management, emergency responder guidance, resuscitation techniques")
###END###

---Real Data---
Entity_types: [medical_condition, symptom, treatment, procedure, emergency_protocol, response_category, patient_characteristic, location, equipment, severity_level, medication, vital_sign, time_factor, healthcare_professional, communication_method]
Text:
{input_text}"""

In [58]:
#### testinf with just calling llm
response = llm2.chat([ChatMessage(role=MessageRole.USER, content=cot_prompt.format(input_text=documents[0].text))])
# print(response[0])

In [73]:
def parse_custom_llm_response_to_graph_data(response_text: str):
    """
    Parses a custom LLM response format into entities and relationships.

    Expected format per line:
    ("entity"|"name"|"type"|"description");
    ("relationship"|"source"|"target"|"description"|"keywords_str"|strength);
    ("content_keywords"|"keywords_str");

    Args:
        response_text: The string response from the LLM.

    Returns:
        A dictionary with 'entities', 'relationships', and 'content_keywords'.
        Example:
        {
            'entities': [{'name': '...', 'type': '...', 'description': '...'}, ...],
            'relationships': [{'source': '...', 'target': '...', 'description': '...', 'keywords': '...', 'strength': 0}, ...],
            'content_keywords': "keyword1, keyword2"
        }
    """
    entities = []
    relationships = []
    content_keywords_str = None

    # 1. Clean up the overall string: remove start/end markers and strip whitespace
    if "### start ###" in response_text:
        response_text = response_text.split("### start ###", 1)[1]
    if "###END###" in response_text: # Case sensitive as in your example
        response_text = response_text.split("###END###", 1)[0]
    
    response_text = response_text.strip()
    lines = response_text.splitlines()

    # Regex to extract all quoted values from a line
    # e.g., from ("entity"|"Unconscious Adult"|"patient_characteristic"|"description");
    # it will extract "entity", "Unconscious Adult", "patient_characteristic", "description"
    value_extractor_regex = r'"(.*?)"' # Non-greedy match inside quotes

    for line in lines:
        line = line.strip()
        if not line:
            continue

        # Remove the surrounding ("...") and trailing semicolon if present
        # This makes extracting values simpler
        if line.startswith('(') and line.endswith(');'):
            core_content = line[1:-2] # Remove (" and );
        else:
            print(f"Warning: Skipping malformed line (no parens/semicolon): {line}")
            continue
            
        # Extract all quoted values first
        all_quoted_values = re.findall(value_extractor_regex, core_content)
        
        if not all_quoted_values:
            print(f"Warning: Skipping line with no quoted values: {line}")
            continue

        record_type = all_quoted_values[0] # "entity", "relationship", or "content_keywords"

        try:
            if record_type == "entity":
                if len(all_quoted_values) >= 4:
                    entity = {
                        'name': all_quoted_values[1],
                        'type': all_quoted_values[2],
                        'description': all_quoted_values[3]
                    }
                    entities.append(entity)
                else:
                    print(f"Warning: Malformed entity line (not enough values): {line}")
            
            elif record_type == "relationship":
                # For relationships, the last part (strength) is not quoted.
                # So we need to split the core_content differently.
                # Example: "Unconscious Adult"|"Not Breathing Normally"|"description"|"keywords"|10
                
                # We'll use the quoted values for the text parts, and then find the strength.
                if len(all_quoted_values) >= 5: # "relationship", source, target, desc, keywords
                    # The strength is the last part, after the last quote and pipe
                    parts = core_content.split('|')
                    strength_str = parts[-1].strip() # Get the part after the last pipe
                    
                    relationship = {
                        'source': all_quoted_values[1],
                        'target': all_quoted_values[2],
                        'description': all_quoted_values[3], # This is the relationship's label
                        'keywords': all_quoted_values[4],    # Store as string, can be split later if needed
                        'strength': int(strength_str)
                    }
                    relationships.append(relationship)
                else:
                    print(f"Warning: Malformed relationship line (not enough quoted values): {line}")

            elif record_type == "content_keywords":
                if len(all_quoted_values) >= 2:
                    content_keywords_str = all_quoted_values[1]
                else:
                    print(f"Warning: Malformed content_keywords line: {line}")
            # else:
            #     print(f"Warning: Unknown record type '{record_type}' in line: {line}")

        except IndexError:
            print(f"Warning: Malformed line (IndexError processing values): {line}")
        except ValueError:
            print(f"Warning: Could not convert strength to int in relationship line: {line}")
        except Exception as e:
            print(f"Warning: Unexpected error parsing line '{line}': {e}")

    return {
        "entities": entities,
        "relationships": relationships,
        "content_keywords": content_keywords_str
    }

In [75]:
parsed_data = parse_custom_llm_response_to_graph_data(response.message.content)

# Print entities and relationships
print(f"Found {len(parsed_data['entities'])} entities and {len(parsed_data['relationships'])} relationships.")
print(f"Content Keywords: {parsed_data['content_keywords']}")

# Example: Access first entity
if parsed_data['entities']:
    print(f"\nFirst entity: {parsed_data['entities'][0]}")

# Example: Access first relationship
if parsed_data['relationships']:
    print(f"\nFirst relationship: {parsed_data['relationships'][0]}")

# Example: Access last entity
if parsed_data['entities']:
    print(f"\nLast entity: {parsed_data['entities'][-1]}")

# Example: Access last relationship
if parsed_data['relationships']:
    print(f"\nLast relationship: {parsed_data['relationships'][-1]}")

print("\n--- All Entities ---")
for i, entity in enumerate(parsed_data['entities']):
    print(f"{i+1}. {entity}")

print("\n--- All Relationships ---")
for i, rel in enumerate(parsed_data['relationships']):
    print(f"{i+1}. {rel}")


Found 15 entities and 9 relationships.
Content Keywords: None

First entity: {'name': 'Unconscious Adult', 'type': 'patient_characteristic', 'description': 'An adult patient who is unresponsive to stimuli, indicating a critical emergency requiring immediate intervention and rapid resuscitation efforts.'}

First relationship: {'source': 'Unconscious Adult', 'target': 'Not Breathing Normally', 'description': 'These two conditions frequently occur together in cardiac arrest situations, initiating the need for immediate CPR.', 'keywords': 'cardiac arrest, resuscitation trigger, critical condition', 'strength': 10}

Last entity: {'name': 'Emergency Medical Response', 'type': 'response_category', 'description': 'A systematic approach involving various protocols and actions in response to life-threatening emergency situations.'}

Last relationship: {'source': 'ROSC', 'target': 'Emergency Medical Response', 'description': 'Following successful resuscitation efforts leading to ROSC, rapid trans

In [40]:
def parse_cot_response(response_str: str):
    """
    Parse the Chain-of-Thought structured output into entities and relationships
    for knowledge graph construction.
    
    Args:
        response_str: The raw output string from the LLM
        
    Returns:
        dict: Contains 'entities' and 'relationships' lists
    """
    # Extract just the text content from the ChatResponse object
    if hasattr(response_str, 'message'):
        response_str = response_str.message.content
    
    # Replace placeholder delimiters with actual delimiters for parsing
    response_str = response_str.replace("{tuple_delimiter}", "|||")
    response_str = response_str.replace("{record_delimiter}", "~~~")
    response_str = response_str.replace("{completion_delimiter}", "")
    
    # Split the response into records
    records = response_str.split("~~~")
    
    entities = []
    relationships = []
    
    for record in records:
        record = record.strip()
        if not record:
            continue
            
        # Clean up the record format
        record = record.strip('()"')
        
        # Split the record into its components
        parts = record.split("|||")
        if len(parts) < 3:
            continue
            
        record_type = parts[0].strip('"')
        
        if record_type == "entity":
            if len(parts) >= 4:
                entity = {
                    "name": parts[1].strip(),
                    "type": parts[2].strip(),
                    "description": parts[3].strip()
                }
                entities.append(entity)
                
        elif record_type == "relationship":
            if len(parts) >= 5:
                # Clean and extract just the numerical part for strength
                strength_str = parts[4].strip()
                # Extract only digits from the strength string
                import re
                strength_match = re.search(r'\d+', strength_str)
                strength = int(strength_match.group()) if strength_match else 5  # Default to 5 if no number found
                
                relationship = {
                    "source": parts[1].strip(),
                    "target": parts[2].strip(),
                    "description": parts[3].strip(),
                    "strength": strength
                }
                relationships.append(relationship)
            elif len(parts) >= 4:  # Handle case without strength
                relationship = {
                    "source": parts[1].strip(),
                    "target": parts[2].strip(),
                    "description": parts[3].strip()
                }
                relationships.append(relationship)
    
    return {"entities": entities, "relationships": relationships}

In [43]:
# Example usage
response_data = parse_cot_response(response)

# Print entities and relationships
print(f"Found {len(response_data['entities'])} entities and {len(response_data['relationships'])} relationships")

# Example: Access first entity
if response_data['entities']:
    print(f"First entity: {response_data['entities'][0]}")

# Example: Access first relationship
if response_data['relationships']:
    print(f"First relationship: {response_data['relationships'][10]}")

Found 12 entities and 13 relationships
First entity: {'name': 'NOT BREATHING NORMALLY', 'type': 'Symptom', 'description': 'A critical symptom indicating lack of normal respiration, often associated with unconsciousness.'}
First relationship: {'source': 'DROWNING', 'target': 'NOT BREATHING NORMALLY', 'description': 'Is a potential cause leading to abnormal respiratory patterns and unconsciousness.', 'strength': 8}


In [44]:
# --- 2. Convert your data to PropertyGraphNode and PropertyGraphRelationship objects ---
from llama_index.core.graph_stores.types import EntityNode, ChunkNode, Relation

pg_nodes = []
for entity_dict in response_data['entities']:
    # Ensure all properties are strings or basic types Pydantic can handle
    # The 'name' becomes id_, 'type' becomes label, rest go into properties
    properties = {k: v for k, v in entity_dict.items() if k not in ['name', 'type']}

    node = EntityNode(
        name=entity_dict['name'],
        label=entity_dict['type'],
        properties=properties # Store original description and any other fields
    )
    pg_nodes.append(node)

pg_relationships = []
for rel_dict in response_data['relationships']:
    # Ensure all properties are strings or basic types
    properties = {
        # 'description': str(rel_dict.get('description', '')), # The description IS the label here
        'strength': rel_dict.get('strength', 0)
    }
    # The 'label' for a relationship is often its type or verb phrase
    # Here, the 'description' of the relationship itself acts as a good label
    relationship = Relation(
        source_id=rel_dict['source'],
        target_id=rel_dict['target'],
        label=rel_dict['description'], # Using the relationship's description as its type/label
        properties=properties
    )
    pg_relationships.append(relationship)

In [80]:

# --- 1. Your Parser Function (from previous answer) ---
def parse_custom_llm_response_to_graph_data(response_text: str):
    # ... (parser code from the previous response)
    entities = []
    relationships = []
    content_keywords_str = None
    if "### start ###" in response_text:
        response_text = response_text.split("### start ###", 1)[1]
    if "###END###" in response_text:
        response_text = response_text.split("###END###", 1)[0]
    response_text = response_text.strip()
    lines = response_text.splitlines()
    value_extractor_regex = r'"(.*?)"'
    for line in lines:
        line = line.strip()
        if not line: continue
        if line.startswith('(') and line.endswith(');'):
            core_content = line[1:-2]
        else:
            print(f"Warning: Skipping malformed line (no parens/semicolon): {line}")
            continue
        all_quoted_values = re.findall(value_extractor_regex, core_content)
        if not all_quoted_values:
            print(f"Warning: Skipping line with no quoted values: {line}")
            continue
        record_type = all_quoted_values[0]
        try:
            if record_type == "entity":
                if len(all_quoted_values) >= 4:
                    entities.append({
                        'name': all_quoted_values[1],
                        'type': all_quoted_values[2],
                        'description': all_quoted_values[3]
                    })
                else: print(f"Warning: Malformed entity line: {line}")
            elif record_type == "relationship":
                if len(all_quoted_values) >= 5:
                    parts = core_content.split('|')
                    strength_str = parts[-1].strip()
                    relationships.append({
                        'source': all_quoted_values[1],
                        'target': all_quoted_values[2],
                        'description': all_quoted_values[3],
                        'keywords': all_quoted_values[4],
                        'strength': int(strength_str)
                    })
                else: print(f"Warning: Malformed relationship line: {line}")
            elif record_type == "content_keywords":
                if len(all_quoted_values) >= 2:
                    content_keywords_str = all_quoted_values[1]
                else: print(f"Warning: Malformed content_keywords line: {line}")
        except (IndexError, ValueError) as e:
            print(f"Warning: Error parsing line '{line}': {e}")
    return {"entities": entities, "relationships": relationships, "content_keywords": content_keywords_str}


# --- 3. Parse the response ---
response_data = parse_custom_llm_response_to_graph_data(response.message.content)

print(f"Parsed: {len(response_data['entities'])} entities and {len(response_data['relationships'])} relationships.")
if response_data['entities']:
    print(f"First parsed entity: {response_data['entities'][0]}")
if response_data['relationships']:
    print(f"First parsed relationship: {response_data['relationships'][0]}")


# --- 4. Convert parsed data to PropertyGraphNode and PropertyGraphRelationship objects ---
pg_nodes = []
for entity_dict in response_data['entities']:
    # 'name' becomes id_, 'type' becomes label for the PropertyGraphNode
    # 'description' goes into properties
    node = EntityNode(
        name=entity_dict['name'],        # Unique ID for the node
        label=entity_dict['type'],      # Type of the node (e.g., "symptom", "procedure")
        properties={
            "description": entity_dict['description']
            # You can add other original fields here if needed, e.g.
            # "original_name": entity_dict['name'] # if id_ might be transformed
        }
    )
    pg_nodes.append(node)

pg_relationships = []
for rel_dict in response_data['relationships']:
    # 'description' of the relationship becomes its label (type)
    # 'strength' and 'keywords' go into properties
    # Note: The 'description' from your parsed relationship data is often a full sentence.
    # For PropertyGraphRelationship, the 'label' is typically a concise relationship type
    # (e.g., "CAUSES", "INDICATES", "PART_OF"). If you have a more concise relationship
    # type available or can derive one, that might be better for the 'label'.
    # For now, we'll use the provided 'description' as the label as per your previous structure.
    relationship = Relation(
        source_id=rel_dict['source'],
        target_id=rel_dict['target'],
        label=rel_dict['keywords'], # Using the descriptive text as the relationship label
        properties={
            "strength": rel_dict.get('strength', 0), # Default to 0 if not present
            "keywords": rel_dict.get('description', "")  # Default to empty string
        }
    )
    pg_relationships.append(relationship)

print(f"\nConverted to {len(pg_nodes)} PropertyGraphNodes and {len(pg_relationships)} PropertyGraphRelationships.")
if pg_nodes:
    print(f"First PropertyGraphNode: {pg_nodes[0]}")
if pg_relationships:
    print(f"First PropertyGraphRelationship: {pg_relationships[0]}")

# --- 5. Initialize a PropertyGraphStore ---
graph_store = SimplePropertyGraphStore()

# --- 6. Add nodes and relationships to the graph store ---
graph_store.upsert_nodes(pg_nodes)
graph_store.upsert_relations(pg_relationships)
print("\nNodes and relationships added to SimplePropertyGraphStore.")

# --- 7. Create the PropertyGraphIndex from the existing graph store ---
index = PropertyGraphIndex.from_existing(
    property_graph_store=graph_store,
    llm=llm,
    embed_kg_nodes=False
)
print("PropertyGraphIndex created successfully.")

# --- 8. Query the index ---
if Settings.llm: # Only query if LLM is configured
    query_engine = index.as_query_engine(include_text=False)
    print("\n--- Example Queries ---")
    queries = [
        "What is 'Not Breathing Normally'?",
        "What are the characteristics of an 'Unconscious Adult'?",
        "What procedure is related to 'Unconscious Adult' when they are 'Not Breathing Normally'?",
        "What is the relationship between CPR and a Defibrillator?"
    ]
    for q_text in queries:
        print(f"\nQuery: {q_text}")
        try:
            response = query_engine.query(q_text)
            print(f"Response: {response}")
        except Exception as e:
            print(f"Error during query: {e}")
else:
    print("\nSkipping queries as LLM is not configured.")

# You can also inspect the schema the index has inferred/stored
print("\n--- Graph Schema ---")
try:
    schema = index.get_schema(refresh=False) # Get cached schema
    print(schema)
except Exception as e:
    print(f"Could not retrieve schema: {e}")

Parsed: 15 entities and 9 relationships.
First parsed entity: {'name': 'Unconscious Adult', 'type': 'patient_characteristic', 'description': 'An adult patient who is unresponsive to stimuli, indicating a critical emergency requiring immediate intervention and rapid resuscitation efforts.'}
First parsed relationship: {'source': 'Unconscious Adult', 'target': 'Not Breathing Normally', 'description': 'These two conditions frequently occur together in cardiac arrest situations, initiating the need for immediate CPR.', 'keywords': 'cardiac arrest, resuscitation trigger, critical condition', 'strength': 10}

Converted to 15 PropertyGraphNodes and 9 PropertyGraphRelationships.
First PropertyGraphNode: Unconscious Adult ({'description': 'An adult patient who is unresponsive to stimuli, indicating a critical emergency requiring immediate intervention and rapid resuscitation efforts.'})
First PropertyGraphRelationship: cardiac arrest, resuscitation trigger, critical condition ({'strength': 10, '

In [81]:
index.property_graph_store.save_networkx_graph(name="./kg_manual2.html")

In [45]:
print(f"Converted to {len(pg_nodes)} PropertyGraphNodes and {len(pg_relationships)} PropertyGraphRelationships.")

Converted to 12 PropertyGraphNodes and 13 PropertyGraphRelationships.


In [47]:
graph_store = SimplePropertyGraphStore()
graph_store.upsert_nodes(pg_nodes)
graph_store.upsert_relations(pg_relationships)

print("Nodes and relationships added to SimplePropertyGraphStore.")

Nodes and relationships added to SimplePropertyGraphStore.


In [49]:
index = PropertyGraphIndex.from_existing(
    property_graph_store=graph_store,
    llm=llm,  # Use the same LLM for querying and embedding
    embed_kg_nodes=False  # Use the same embedding model for consistency
    # If you had original text Documents from which these entities/relationships were extracted,
    # you could pass them as `nodes` (List[BaseNode]) here to also build a vector index component.
    # For now, we're focusing on the graph structure you've already extracted.
)
print("PropertyGraphIndex created successfully.")

PropertyGraphIndex created successfully.


In [50]:
index.property_graph_store.save_networkx_graph(name="./kg_manual.html")

In [51]:
query_engine = index.as_query_engine(
    # llm=Settings.llm, # Not needed here if already in global Settings
    include_text=False, # Set to False if you ONLY want to query the graph structure
                        # and don't have associated text chunks in a vector store component.
                        # If you had passed `nodes` (text documents) during index creation,
                        # you might set this to True (or leave default) for hybrid search.
    # For more control, you can pass property_graph_tool_kwargs
    # property_graph_tool_kwargs={"graph_schema_output_parser": KGPropertyGraphQuery()} # if you need more control
)

In [9]:
Settings.llm = llm

In [16]:
from llama_index.core.indices.property_graph import SimpleLLMPathExtractor, DynamicLLMPathExtractor
from typing import List, Tuple, Dict, Any, Optional
import re

def parse_fn(response_str: str) -> List[Tuple[str, str, str]]:
    """Parse the structured output from the CoT prompt into triples.
    
    Args:
        response_str: The output string from the LLM
        
    Returns:
        List of triples in (subject, predicate, object) format
    """
    # Define the delimiters (same as in the prompt)
    tuple_delimiter = r"\{tuple_delimiter\}"
    record_delimiter = r"\{record_delimiter\}"
    completion_delimiter = r"\{completion_delimiter\}"
    
    # Clean up the response string
    response_str = response_str.replace(completion_delimiter, "").strip()
    
    # Extract records using record delimiter
    records = re.split(record_delimiter, response_str)
    
    triples = []
    
    # Process each record
    for record in records:
        record = record.strip()
        if not record:
            continue
            
        # Check if it's an entity or relationship
        parts = re.split(tuple_delimiter, record.strip('(")'))
        if len(parts) < 3:
            continue
            
        record_type = parts[0]
        
        if record_type == "entity":
            # Entities become self-referential triples with type as predicate
            if len(parts) >= 3:
                entity_name = parts[1].strip()
                entity_type = parts[2].strip()
                triples.append((entity_name, "type", entity_type))
                
                # If there's a description, add it too
                if len(parts) >= 4:
                    description = parts[3].strip()
                    triples.append((entity_name, "description", description))
                    
        elif record_type == "relationship":
            # Relationships become direct triples
            if len(parts) >= 4:
                source = parts[1].strip()
                target = parts[2].strip()
                relation = parts[3].strip()
                triples.append((source, relation, target))
                
                # Also add strength if available
                if len(parts) >= 5:
                    strength = parts[4].strip()
                    triples.append((source + "_to_" + target, "strength", strength))
    
    return triples

# # Create the extractor with our parsing function
kg_extractor = SimpleLLMPathExtractor(
    llm=llm,
    extract_prompt=CoT_prompt_template,
    parse_fn=parse_fn,
    max_paths_per_chunk=50
)


kg_extractor_2 = DynamicLLMPathExtractor(
    llm=llm,
    extract_prompt=CoT_prompt_template,
    num_workers=4,

)

In [17]:
index = PropertyGraphIndex.from_documents(
    documents,
    # llm=llm,  # Use the wrapped model
    # embed_model=HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2"),
    kg_extractor=kg_extractor,
    show_progress=True,
    embed_kg_nodes=False,
    storage_context=storage_context,  # Use the custom storage context
)


Parsing nodes:   0%|          | 0/3 [00:00<?, ?it/s]

Extracting paths from text: 100%|██████████| 13/13 [00:29<00:00,  2.24s/it]
Extracting implicit paths: 100%|██████████| 13/13 [00:00<00:00, 6405.77it/s]


In [18]:
index.property_graph_store.save_networkx_graph(name="./kg_test2+simpleparcer.html")