In [None]:
!pip install --quiet neo4j langchain-community langchain-core langchain-openai langchain-text-splitters tiktoken wikipedia langchain-groq

In [None]:
import os
import re
import json 
import neo4j
from typing import List, Type, Optional
from pydantic import Field, BaseModel, ValidationError
from langchain_groq import ChatGroq
from langchain_core.output_parsers import PydanticOutputParser
from langchain_core.prompts import PromptTemplate

In [None]:
# --- GROQ API key check ---
# ChatGroq is constructed later without an explicit key, so it presumably reads
# GROQ_API_KEY from the environment (confirm against langchain-groq). This cell only
# prints a reminder when the variable is missing; execution continues either way.
# Export the variable before starting the kernel instead of pasting the key into the notebook.
if os.environ.get("GROQ_API_KEY") is None:
    print("GROQ_API_KEY not found in environment variables. Please set it in the .env file.")

In [None]:
# Shared chat model instance; llm_client() sends every request through it.
llm = ChatGroq(
    model="deepseek-r1-distill-llama-70b",
    # NOTE(review): "parsed" is expected to keep the model's reasoning out of `.content`
    # so only the answer reaches the JSON extractor — confirm against langchain-groq docs.
    reasoning_format="parsed",
    temperature=0,
    max_retries=2,
    # No explicit limits are set for response length or request time.
    max_tokens=None,
    timeout=None,
)

In [None]:
# Pydantic models describing the structured output requested from the LLM.
# NOTE(review): documentation is deliberately kept in `#` comments rather than class
# docstrings, because Pydantic is understood to copy class docstrings into the generated
# JSON schema, which would change the format instructions sent to the model — confirm
# before converting these comments to docstrings.
class AtomicFact(BaseModel):
    # Key elements attached to this fact, as a list of strings.
    key_elements: List[str]
    # The fact itself, as a single string.
    atomic_fact: str


class ChunkEnrichment(BaseModel):
    # Top-level object the LLM is asked to return: a list of AtomicFact entries.
    atomic_facts: List[AtomicFact]


# Output parser bound to ChunkEnrichment. In this notebook it is used to produce the
# format instructions that are embedded in the prompt template.
parser = PydanticOutputParser(pydantic_object=ChunkEnrichment)

In [None]:
# Prompt text: extraction rules, then the parser's format instructions, then a
# {placeholder} slot for the text to analyse. Lines are joined with "\n".
enrichment_prompt = "\n".join(
    [
        "You are an intelligent assistant tasked with meticulously extracting structured information consisting:",
        "",
        "1. Key Elements: The essential nouns (e.g., characters, times, events, places, numbers) or verbs (actions), or adjectives (states, feelings) pivotal to the text's narrative.",
        "2. Atomic Facts: The smallest, indivisible facts, presented as concise sentences. Each Fact must:",
        "   - Be stand alone, with no ambiguity or missing context.",
        "   - Include all necessary details (e.g., full names, dates, numbers).",
        "   - Clarify any ambiguous terms.",
        "   - Not depend on any other fact for meaning.",
        "   - Avoid hallucination or guessing; only use information present in the text.",
        "Use the format: ",
        "{format_instructions}",
        " for response from the following text:",
        "{placeholder}",
    ]
)

# The format instructions are bound up front as a partial variable, leaving
# "placeholder" as the only input the template still expects.
enrichment_temp = PromptTemplate(
    input_variables=["placeholder"],
    partial_variables={"format_instructions": parser.get_format_instructions()},
    template=enrichment_prompt,
)


# Rendered once with a single space in the text slot; the resulting string is what
# extract_facts_and_entities passes as the system prompt, while the text to analyse
# is sent separately as the human message.
rendered_prompt = enrichment_temp.format(placeholder=" ")

In [None]:
def llm_client(input, prompt=None):
    """
    Send one user message to the shared chat model and return the reply text.

    Args:
        input: Text sent as the human message. (The name shadows the builtin but is
            kept so existing keyword callers continue to work.)
        prompt: Optional system prompt. When None, no system message is sent.

    Returns:
        The ``content`` attribute of the model's response.
    """
    messages = []
    # A ("system", None) entry is not a valid message, so the system turn is only
    # included when a prompt was actually supplied.
    if prompt is not None:
        messages.append(("system", prompt))
    messages.append(("human", input))
    return llm.invoke(messages).content

In [None]:
def extract_json_from_response(response: str):
    """
    Extract and parse the first JSON object found in an LLM response.

    The contents of any ```json fenced block are tried first, then the whole
    response. Within each candidate text, every "{" is tried in order as the start
    of an object and the first one that decodes is returned, so prose before or
    after the JSON (even prose containing braces) does not break parsing.

    Args:
        response: Raw text returned by the model.

    Returns:
        The parsed dict, or None when the input is not a non-empty string or no
        JSON object can be decoded from it.
    """
    if not isinstance(response, str) or not response:
        return None

    decoder = json.JSONDecoder()
    last_error = None

    # Fenced blocks take priority; the full text is the fallback candidate.
    candidates = re.findall(r"```json\s*(.*?)```", response, re.DOTALL | re.IGNORECASE)
    candidates.append(response)

    for text in candidates:
        start = text.find("{")
        while start != -1:
            try:
                # raw_decode parses exactly one value beginning at `start` and
                # ignores whatever follows it, unlike json.loads.
                parsed, _ = decoder.raw_decode(text, start)
                return parsed
            except json.JSONDecodeError as exc:
                last_error = exc
                start = text.find("{", start + 1)

    # Nothing decoded: report the last failure (if any brace was found) and give up.
    if last_error is not None:
        print("JSON decode error:", last_error)
    return None

In [None]:
def extract_facts_and_entities(content):
    """
    Ask the LLM to enrich ``content`` and return its list of atomic facts.

    The text is sent as the human message with ``rendered_prompt`` as the system
    prompt; the reply is parsed as JSON.

    Returns:
        The value stored under "atomic_facts" in the parsed reply, or None when the
        reply could not be parsed, is empty, or has no "atomic_facts" key.
    """
    raw_reply = llm_client(content, rendered_prompt)
    parsed_reply = extract_json_from_response(raw_reply)
    if not parsed_reply or 'atomic_facts' not in parsed_reply:
        return None
    return parsed_reply['atomic_facts']

In [None]:
# Neo4j connection.
# Settings are read from the environment (NEO4J_URI, NEO4J_USERNAME, NEO4J_PASSWORD,
# NEO4J_DATABASE) so credentials do not have to live in the notebook. The fallbacks
# reproduce the previous hardcoded local-development values.
# NOTE(security): the password fallback is kept only for backward compatibility —
# remove it (and rotate the password) before sharing or committing this notebook.
driver = neo4j.GraphDatabase.driver(
    os.environ.get("NEO4J_URI", "bolt://localhost:7687"),
    auth=(
        os.environ.get("NEO4J_USERNAME", "neo4j"),
        os.environ.get("NEO4J_PASSWORD", "neo4j1999"),
    ),
)
NEO4J_DATABASE = os.environ.get("NEO4J_DATABASE", "graphreader1")

In [None]:
def ingest_knowledge_graph(patient_data):
    """
    Write patient records to Neo4j as a Patient -> Domain -> AtomicFact -> KeyEntity graph.

    For every patient a Patient node is merged; for every domain of that patient a
    Domain node (keyed on name and content) is merged and linked with HAS_DOMAIN.
    The domain text is then sent to the LLM extractor, and each returned fact is
    merged as an AtomicFact linked to the domain (CONTAINS_FACT) and to one
    KeyEntity node per key element (HAS_ENTITY).

    Args:
        patient_data: Mapping of patient id -> mapping of domain name -> domain text.

    Notes:
        - Domains for which the extractor returns nothing are still stored, just
          without facts.
        - Fact entries that are not dicts or lack a non-empty "atomic_fact" are
          skipped, since the entries come from LLM output and may be malformed.
    """
    with driver.session(database=NEO4J_DATABASE) as session:
        for patient_id, domains in patient_data.items():
            # Create Patient node
            session.run("MERGE (p:Patient {id: $id})", id=patient_id)

            # Process each domain
            for domain_name, content in domains.items():
                session.run(
                    "MATCH (p:Patient {id: $patient_id}) "
                    "MERGE (d:Domain {name: $name, content: $content}) "
                    "MERGE (p)-[:HAS_DOMAIN]->(d)",
                    patient_id=patient_id, name=domain_name, content=content
                )

                # The extractor returns None when the LLM reply has no usable JSON;
                # treat that as "no facts" instead of crashing the whole ingestion.
                facts = extract_facts_and_entities(content) or []
                for fact_data in facts:
                    if not isinstance(fact_data, dict):
                        continue
                    atomic_fact = fact_data.get("atomic_fact")
                    if not atomic_fact:
                        continue

                    key_elements = fact_data.get("key_elements") or []
                    if not isinstance(key_elements, (list, tuple)):
                        # A lone value instead of a list: store it as one entity.
                        key_elements = [key_elements]
                    # MERGE cannot use a null property value, so drop None entries.
                    entities = [entity for entity in key_elements if entity is not None]

                    # One query per fact: the fact and all of its entities are merged
                    # together via UNWIND. This avoids a round trip per entity and the
                    # lookup by internal node id (id() is deprecated in Neo4j 5).
                    session.run(
                        "MATCH (d:Domain {name: $name, content: $content}) "
                        "MERGE (f:AtomicFact {fact: $fact}) "
                        "MERGE (d)-[:CONTAINS_FACT]->(f) "
                        "WITH DISTINCT f "
                        "UNWIND $entities AS entity_name "
                        "MERGE (e:KeyEntity {name: entity_name}) "
                        "MERGE (f)-[:HAS_ENTITY]->(e)",
                        name=domain_name, content=content, fact=atomic_fact,
                        entities=entities
                    )


In [None]:
# Load the patient records and ingest them into Neo4j.
# The driver is closed in `finally` so the connection is released even when reading
# the file or the ingestion raises. JSON is read explicitly as UTF-8 rather than with
# the platform's default encoding.
try:
    with open("diverse_patient_data.json", "r", encoding="utf-8") as f:
        patient_data = json.load(f)
    ingest_knowledge_graph(patient_data)
finally:
    driver.close()