<h4>Installing Required Python Packages</h4>

In [None]:
# Install the notebook's dependencies: the LangChain stack, Neo4j clients,
# PDF readers, the yFiles graph widget, the Groq integration and
# sentence-transformers.
# NOTE(review): `langchain` is requested twice (on its own line and again in
# the combined langchain/langchain_groq/pydantic line) and no versions are
# pinned — confirm which releases the import paths below were written for.
!pip install langchain_core
!pip install langchain
!pip install langchain_experimental
!pip install neo4j
!pip install yfiles_jupyter_graphs
!pip install PyPDF2
!pip install pypdf
!pip install requests
!pip install py2neo
!pip install python-dotenv
!pip install langchain langchain_groq pydantic
!pip install langchain_community
!pip install sentence-transformers

In [None]:
# Import Neo4jGraph from langchain_community, the package this notebook
# already uses for graph_document. The legacy `langchain.graphs` path is only a
# deprecated re-export and is not guaranteed to exist in newer releases.
from langchain_community.graphs import Neo4jGraph

# Connection settings for the target Neo4j instance. Replace the placeholders
# before running, and avoid committing real credentials with the notebook.
url = "ENTER YOUR NEO4J URL"
username = "ENTER YOUR NEO4J USERNAME"
password = "ENTER YOUR NEO4J PASSWORD"

# Shared graph handle; later cells use it to store graph documents and to run
# Cypher queries.
graph = Neo4jGraph(
    url=url,
    username=username,
    password=password
)

import os
# Publish the Groq key through the environment: the ChatGroq model created
# later in the notebook is constructed without an explicit key.
os.environ["GROQ_API_KEY"] = "API_KEY"
api_key = os.environ['GROQ_API_KEY']



<h4>Defining Custom Pydantic Models for Knowledge Graph</h4>

In [None]:
from langchain.schema import Document
from typing import List, Dict, Any, Optional
from pydantic import BaseModel, Field
from langchain_community.graphs.graph_document import (
    Node as BaseNode,
    Relationship as BaseRelationship,
    GraphDocument,
)
from langchain.schema import Document

# One key/value pair attached to a node or a relationship. Both parts are
# declared as required strings (`...` marks a field without a default).
# Lists of these are flattened into a plain dict by props_to_dict.
class Property(BaseModel):
    """A single property consisting of key and value"""
    key: str = Field(..., description="key")
    value: str = Field(..., description="value")

# Extraction-side node model. It extends the LangChain base node with a
# required display `name` and declares `properties` as an optional list of
# Property pairs; map_to_base_node converts it back into a plain BaseNode.
class Node(BaseNode):
    # Required string identifier of the entity.
    id: str = Field(..., description="Unique identifier for the node")
    # Required human-readable label; stored as the "name" property when mapped.
    name: str = Field(..., description="Name or title of the node")
    # Optional extra attributes; defaults to None when the model emits none.
    properties: Optional[List[Property]] = Field(
        None, description="List of node properties")

# Extraction-side relationship model. Source and target are full Node objects
# (not ids), and `properties` is an optional list of Property pairs.
class Relationship(BaseRelationship):
    # Both endpoints are required and must validate as complete Node objects.
    source: Node = Field(..., description="Source node of the relationship")
    target: Node = Field(..., description="Target node of the relationship")
    properties: Optional[List[Property]] = Field(
        None, description="List of relationship properties"
    )
    # No default is given, so this evidence sentence is a required field.
    # NOTE(review): map_to_base_relationship does not copy it into the stored
    # relationship — confirm whether it should be persisted.
    text: str = Field(description="Source text from which this relationship was extracted")


# Top-level container for one extraction result; it is the pydantic object
# handed to PydanticOutputParser in get_extraction_chain_groq. Both lists are
# required, but either may be empty.
class KnowledgeGraph(BaseModel):
    """Generate a knowledge graph with entities and relationships."""
    nodes: List[Node] = Field(
        ..., description="List of nodes in the knowledge graph")
    # Named `rels` (not `relationships`); extract_and_store_graph reads kg.rels.
    rels: List[Relationship] = Field(
        ..., description="List of relationships in the knowledge graph"
    )

<h4>Formatting & Mapping</h4>

In [None]:
def format_property_key(s: str) -> str:
    """Turn a whitespace-separated phrase into a lowerCamelCase key.

    The first word is lower-cased, every following word is capitalized
    (first letter upper, rest lower), and the pieces are concatenated.
    A string with no words (empty or whitespace only) is returned unchanged.
    """
    tokens = s.split()
    if tokens:
        head, *tail = tokens
        return head.lower() + "".join(map(str.capitalize, tail))
    return s

def props_to_dict(props) -> dict:
    """Flatten objects exposing `.key` / `.value` into a plain dictionary.

    Keys are normalized with format_property_key. When two entries normalize
    to the same key, the later one wins. A falsy `props` (None or empty)
    yields an empty dict.
    """
    if not props:
        return {}
    return {format_property_key(item.key): item.value for item in props}


def map_to_base_node(node: Node) -> BaseNode:
    """Convert an extracted Node into a plain LangChain BaseNode.

    The node's property list is flattened into a dict, and the stripped
    entity name is stored under the "name" key, replacing any extracted
    property that normalized to "name". The id and type are stripped of
    surrounding whitespace.
    """
    if node.properties:
        attrs = props_to_dict(node.properties)
    else:
        attrs = {}
    # Keep the human-readable entity name alongside the identifier.
    attrs["name"] = node.name.strip()
    node_id = node.id.strip()
    node_type = node.type.strip()
    return BaseNode(id=node_id, type=node_type, properties=attrs)


def map_to_base_relationship(rel: Relationship) -> BaseRelationship:
    """Map the KnowledgeGraph Relationship to the base Relationship.

    Both endpoints are converted with map_to_base_node and the property list
    is flattened into a dict. The relationship type is stripped of surrounding
    whitespace, matching how map_to_base_node treats node ids and types, so
    that " treats" and "treats" do not become two distinct relationship types.

    NOTE(review): `rel.text` (the evidence sentence) is not carried over to
    the stored relationship — confirm whether it should be persisted.
    """
    source = map_to_base_node(rel.source)
    target = map_to_base_node(rel.target)
    properties = props_to_dict(rel.properties) if rel.properties else {}
    return BaseRelationship(
        source=source,
        target=target,
        type=rel.type.strip(),
        properties=properties,
    )

<h4>LLM Chain for Knowledge Extraction</h4>

In [None]:
import os
from langchain_groq import ChatGroq
from langchain.prompts import PromptTemplate, SystemMessagePromptTemplate, HumanMessagePromptTemplate, ChatPromptTemplate
from langchain_core.output_parsers import PydanticOutputParser
from langchain.chains import LLMChain
from pydantic import BaseModel, Field
from typing import List, Optional


# Core biomedical schema for Type 2 Diabetes
# Node types the extraction prompt asks the model to prefer.
ENTITY_TYPES = [
    "Disease", "Symptom", "Drug", "Complication", "Biomarker",
    "RiskFactor", "TreatmentProcedure", "Gene", "MolecularTarget", "LifestyleFactor"
]

# Relationship types the extraction prompt asks the model to prefer.
RELATION_TYPES = [
    "treats", "causes", "associated_with", "has_symptom",
    "contraindicated_with", "expresses", "linked_to", "increases_risk_of",
    "reduces_risk_of", "side_effect_of"
]


# NOTE(review): this unconditionally replaces any GROQ_API_KEY already present
# in the environment (including one set in an earlier cell) with the
# placeholder below — put the real key here as well, or remove this line.
os.environ["GROQ_API_KEY"] = "API_KEY"

def get_extraction_chain_groq(
    allowed_nodes: Optional[List[str]] = None,
    allowed_rels: Optional[List[str]] = None
):
    """Build the Groq extraction chain that turns text into a KnowledgeGraph.

    The system prompt spells out the expected JSON output format with a worked
    example, and the model output is parsed into a KnowledgeGraph.

    Args:
        allowed_nodes: Entity types to advertise in the prompt. When omitted
            or empty, the module-level ENTITY_TYPES list is used.
        allowed_rels: Relation types to advertise in the prompt. When omitted
            or empty, the module-level RELATION_TYPES list is used.

    Returns:
        A runnable pipeline (chat prompt | ChatGroq model | pydantic parser).
        Invoke it with ``{"doc": <text>}`` to obtain a KnowledgeGraph.
    """
    # Honour the caller-supplied schema; fall back to the notebook defaults.
    node_types = allowed_nodes or ENTITY_TYPES
    rel_types = allowed_rels or RELATION_TYPES

    # Parser that validates the model output against the KnowledgeGraph schema.
    parser = PydanticOutputParser(pydantic_object=KnowledgeGraph)

    # System prompt with a worked example of the output format.
    # The template is first rendered as an f-string and then as a
    # PromptTemplate, so each literal JSON brace is written as four braces:
    # the f-string halves them to two, the PromptTemplate halves them to one.
    # The example only uses relation types from the default schema and every
    # relationship carries the "text" field that the Relationship model
    # requires.
    system_prompt = PromptTemplate(
        template=f"""
You are an expert biomedical knowledge extraction system for constructing a Type 2 Diabetes knowledge graph.
Your goal is to extract the **maximum possible medically relevant entities and relationships** from the provided text and output them in the specified JSON format.

## STRICT RULES:
- You should use ONLY these ENTITY TYPES when possible: {node_types}
- You should use ONLY RELATION TYPES when possible: {rel_types}
- Entities and relations must be relevant to the biomedical domain (Type 2 Diabetes, treatments, symptoms, comorbidities, drugs, genes, risk factors, lifestyle factors, etc.).
- Be consistent: use the exact same spelling for identical entities or relation names for both node 'id' and 'name'. The 'id' and 'name' should usually be the same string representation of the entity.
- All relationships must be **medically correct and evidence-based**.


## CRITICAL: EXACT OUTPUT FORMAT REQUIRED
You MUST follow this EXACT JSON structure. Every field is MANDATORY:

Exemple :

```json
{{{{
  "nodes": [
    {{{{
      "id": "Type_2_Diabetes_Mellitus",
      "name": "Type 2 Diabetes Mellitus",
      "type": "Disease",
      "properties": []
    }}}},
    {{{{
      "id": "Insulin_Resistance",
      "name": "Insulin resistance",
      "type": "Biomarker",
      "properties": []
    }}}},
    {{{{
      "id": "Obesity",
      "name": "Obesity",
      "type": "RiskFactor",
      "properties": []
    }}}},
    {{{{
      "id": "Metformin",
      "name": "Metformin",
      "type": "Drug",
      "properties": []
    }}}},
    {{{{
      "id": "Diabetic_Nephropathy",
      "name": "Diabetic nephropathy",
      "type": "Complication",
      "properties": []
    }}}}
  ],
  "rels": [
    {{{{
      "source": {{{{
        "id": "Type_2_Diabetes_Mellitus",
        "name": "Type 2 Diabetes Mellitus",
        "type": "Disease",
        "properties": []
      }}}},
      "target": {{{{
        "id": "Insulin_Resistance",
        "name": "Insulin resistance",
        "type": "Biomarker",
        "properties": []
      }}}},
      "type": "associated_with",
      "properties": [],
      "text": "Patients with Type 2 Diabetes Mellitus often develop insulin resistance"
    }}}},
    {{{{
      "source": {{{{
        "id": "Insulin_Resistance",
        "name": "Insulin resistance",
        "type": "Biomarker",
        "properties": []
      }}}},
      "target": {{{{
        "id": "Obesity",
        "name": "Obesity",
        "type": "RiskFactor",
        "properties": []
      }}}},
      "type": "associated_with",
      "properties": [],
      "text": "insulin resistance, which is strongly associated with obesity"
    }}}},
    {{{{
      "source": {{{{
        "id": "Hyperglycemia",
        "name": "Hyperglycemia",
        "type": "Biomarker",
        "properties": []
      }}}},
      "target": {{{{
        "id": "Diabetic_Nephropathy",
        "name": "Diabetic nephropathy",
        "type": "Complication",
        "properties": []
      }}}},
      "type": "causes",
      "properties": [],
      "text": "Prolonged hyperglycemia leads to complications such as diabetic nephropathy"
    }}}},
    {{{{
      "source": {{{{
        "id": "Metformin",
        "name": "Metformin",
        "type": "Drug",
        "properties": []
      }}}},
      "target": {{{{
        "id": "Insulin_Resistance",
        "name": "Insulin resistance",
        "type": "Biomarker",
        "properties": []
      }}}},
      "type": "treats",
      "properties": [],
      "text": "Metformin improves insulin sensitivity"
    }}}}
  ]
}}}}
```

## MANDATORY FORMAT RULES:
1. Every node MUST have: "id", "name", "type", "properties" (even if properties is empty array [])
2. Every relationship MUST have: "source", "target", "type", "properties", "text"
3. Every "source" and "target" in relationships MUST be complete node objects with "id", "name", "type", "properties"
4. "id" and "name" should be identical for each entity
5. "properties" should be an empty array [] if no properties exist
6. DO NOT omit any required fields

## ENTITY TYPING GUIDANCE:
- Disease = medical condition (Type 2 Diabetes Mellitus)
- Symptom = patient-experienced
- Complication = consequence of disease
- Biomarker = measurable indicator
- RiskFactor = predisposing factor
- Drug = pharmacologic agent
- Gene = genetic factor
- MolecularTarget = specific biological target
- LifestyleFactor = behavioral or environmental factor

##RELATION TYPING GUIDANCE:
- Treats = subject is used to cure or manage the object
- Causes = subject directly produces or leads to the object
- Associated_with = subject and object occur together or correlate, not necessarily causal
- Has_symptom = object is a symptom of the subject
- Contraindicated_with = subject should not be used with object
- Expresses = subject produces or displays the object (e.g., gene → protein)
- Linked_to = subject and object are connected in a broad or unspecified way
- Increases_risk_of = subject raises the likelihood of the object
- Reduces_risk_of = subject lowers the likelihood of the object
- Side_effect_of = subject is an adverse effect caused by the object



Output ONLY the JSON object following the exact format above. Do not include any additional text, explanations, or markdown formatting.
""",
        input_variables=[]
    )

    system_message_prompt = SystemMessagePromptTemplate(prompt=system_prompt)

    # Human prompt: carries the text chunk to analyse in the `doc` variable.
    human_prompt = PromptTemplate(
        template="""
Extract entities and relationships from the following biomedical text about Type 2 Diabetes Mellitus.

Follow the exact JSON format specified in the system prompt. Ensure every required field is present.

Text: {doc}

JSON Output:
""",
        input_variables=["doc"]
    )

    human_message_prompt = HumanMessagePromptTemplate(prompt=human_prompt)

    # Chat prompt: system instructions followed by the human message.
    chat_prompt = ChatPromptTemplate.from_messages([system_message_prompt, human_message_prompt])

    # Groq-hosted model; temperature 0 is requested for the extraction.
    model = ChatGroq(temperature=0, model_name="meta-llama/llama-4-scout-17b-16e-instruct")

    # Pipe prompt -> model -> parser so invoke() returns a KnowledgeGraph.
    chain = chat_prompt | model | parser

    return chain


<h4>Extract & Store Graph Data into Neo4j</h4>

In [None]:
def extract_and_store_graph(
    document: Document,
    nodes: Optional[List[str]] = None,
    rels: Optional[List[str]] = None
) -> GraphDocument:
    """Extract a knowledge graph from one document chunk and store it in Neo4j.

    Args:
        document: Chunk whose `page_content` is sent to the extraction chain.
        nodes: Optional entity types forwarded to get_extraction_chain_groq.
        rels: Optional relation types forwarded to get_extraction_chain_groq.

    Returns:
        The GraphDocument that was written to the module-level `graph`.

    Raises:
        ValueError: If the parsed result lacks a `nodes` or `rels` attribute.
    """
    # Extract graph data using the Groq chain
    extract_chain = get_extraction_chain_groq(nodes, rels)
    kg = extract_chain.invoke({"doc": document.page_content})

    # Ensure 'nodes' and 'rels' are present in the response
    if not hasattr(kg, 'nodes') or not hasattr(kg, 'rels'):
        raise ValueError("Missing 'nodes' or 'rels' in the extracted KnowledgeGraph object")

    # Convert the extraction-side models into plain LangChain graph objects.
    graph_document = GraphDocument(
        nodes=[map_to_base_node(node) for node in kg.nodes],
        relationships=[map_to_base_relationship(rel) for rel in kg.rels],
        source=document,
    )

    # Store information into a graph
    print(f"Adding graph document for: {document.metadata.get('source', 'unknown source')}")
    graph.add_graph_documents([graph_document])
    return graph_document

<h4>Building the Graph from documents</h4>

In [None]:
"""# 2000"""

from langchain.text_splitter import TokenTextSplitter
from langchain_community.document_loaders import PyPDFLoader
from datetime import datetime
import os
from google.colab import drive
from tqdm import tqdm

# Mount Google Drive and point at the folder holding this batch of PDFs.
drive.mount('/content/drive')
folder_path = '/content/drive/MyDrive/Diabetes_KG_Project/2023_1'

# Token-based splitter: 5000-token chunks with a 100-token overlap.
text_splitter = TokenTextSplitter(chunk_size=5000, chunk_overlap=100)

all_chunks = []

pdf_files = [f for f in os.listdir(folder_path) if f.endswith('.pdf')]
for file_name in tqdm(pdf_files, desc="Processing PDFs"):
    file_path = os.path.join(folder_path, file_name)

    try:
        loader = PyPDFLoader(file_path)
        pages = loader.load()
        # Join all pages so chunks are not forced to break at page boundaries.
        full_text = " ".join([p.page_content for p in pages])
        # Tag every chunk with its PDF path; extract_and_store_graph reads
        # metadata["source"] for its progress message.
        documents = text_splitter.create_documents(
            [full_text], metadatas=[{"source": file_path}]
        )
        all_chunks.extend(documents)

    except Exception as e:
        # Best effort: report the unreadable file and continue with the rest.
        print(f"Error processing {file_name}: {e}")

# Processing chunks with tqdm progress bar
distinct_nodes = set()
relations = []
extracted_knowledge_graphs = []

for i, d in tqdm(enumerate(all_chunks), total=len(all_chunks), desc="Processing Chunks"):
    # The extraction call must run: the statistics below read graph_document.
    # A failing chunk (e.g. unparsable model output) is reported and skipped so
    # it does not abort the whole batch, mirroring the PDF loop above.
    try:
        graph_document = extract_and_store_graph(d)
    except Exception as e:
        print(f"Error extracting chunk {i}: {e}")
        continue

    extracted_knowledge_graphs.append(graph_document)

    # Get distinct nodes
    for node in graph_document.nodes:
        distinct_nodes.add(node.id)

    # Get all relations
    for relation in graph_document.relationships:
        relations.append(relation.type)

In [None]:
"""# 2023"""

from langchain.text_splitter import TokenTextSplitter
from langchain_community.document_loaders import PyPDFLoader
from datetime import datetime
import os
from google.colab import drive
from tqdm import tqdm

# Mount Google Drive and point at the folder holding this batch of PDFs.
drive.mount('/content/drive')
folder_path = '/content/drive/MyDrive/Diabetes_KG_Project/2023_2'

# Token-based splitter: 5000-token chunks with a 100-token overlap.
text_splitter = TokenTextSplitter(chunk_size=5000, chunk_overlap=100)

all_chunks = []

pdf_files = [f for f in os.listdir(folder_path) if f.endswith('.pdf')]
for file_name in tqdm(pdf_files, desc="Processing PDFs"):
    file_path = os.path.join(folder_path, file_name)

    try:
        loader = PyPDFLoader(file_path)
        pages = loader.load()
        # Join all pages so chunks are not forced to break at page boundaries.
        full_text = " ".join([p.page_content for p in pages])
        # Tag every chunk with its PDF path; extract_and_store_graph reads
        # metadata["source"] for its progress message.
        documents = text_splitter.create_documents(
            [full_text], metadatas=[{"source": file_path}]
        )
        all_chunks.extend(documents)

    except Exception as e:
        # Best effort: report the unreadable file and continue with the rest.
        print(f"Error processing {file_name}: {e}")

# Processing chunks with tqdm progress bar
distinct_nodes = set()
relations = []
extracted_knowledge_graphs = []

for i, d in tqdm(enumerate(all_chunks), total=len(all_chunks), desc="Processing Chunks"):
    # The extraction call must run: the statistics below read graph_document.
    # A failing chunk (e.g. unparsable model output) is reported and skipped so
    # it does not abort the whole batch, mirroring the PDF loop above.
    try:
        graph_document = extract_and_store_graph(d)
    except Exception as e:
        print(f"Error extracting chunk {i}: {e}")
        continue

    extracted_knowledge_graphs.append(graph_document)

    # Get distinct nodes
    for node in graph_document.nodes:
        distinct_nodes.add(node.id)

    # Get all relations
    for relation in graph_document.relationships:
        relations.append(relation.type)

<h4>Visualizing the Knowledge Graph</h4>

In [None]:
# Directly show the graph resulting from the given Cypher query.
# The default query returns up to 500 (source, relationship, target) triples,
# excluding relationships of type MENTIONS.
default_cypher = "MATCH (s)-[r:!MENTIONS]->(t) RETURN s,r,t LIMIT 500"

from google.colab import output
# Enable Colab's custom widget manager (presumably needed for the yFiles
# widget to render in Colab — confirm).
output.enable_custom_widget_manager()

from neo4j import GraphDatabase
from yfiles_jupyter_graphs import GraphWidget
from google.colab import output

def showGraph(cypher:str = default_cypher):
    """Run a Cypher query and return a yFiles widget showing its result graph.

    Args:
        cypher: Query whose result is rendered; defaults to `default_cypher`.

    Returns:
        A GraphWidget whose node labels are mapped to the "id" property.
    """
    # Open the driver and session as context managers so both are closed once
    # the widget has been built, instead of leaking a connection per call.
    with GraphDatabase.driver(uri=url, auth=(username, password)) as driver, \
            driver.session() as session:
        widget = GraphWidget(graph=session.run(cypher).graph())
    widget.node_label_mapping = "id"
    return widget

showGraph()

In [None]:
# Report the size of the stored graph: total node and relationship counts.
# Each query result is indexed as a list of rows keyed by the RETURN
# expression text, hence the 'count(n)' / 'count(r)' keys.
nodes = graph.query("MATCH (n) RETURN count(n)")[0]['count(n)']
rels = graph.query("MATCH ()-[r]->() RETURN count(r)")[0]['count(r)']
# "Nœuds" is French for "nodes"; `:,` prints thousands separators.
print(f"Nœuds: {nodes:,} | Relations: {rels:,}")

<h4>LLM-Powered Node Filtering (Clean Irrelevant Nodes)</h4>

In [None]:
from neo4j import GraphDatabase
from groq import Groq
import json

# Placeholder Groq API key; replace it before running and avoid committing a
# real key with the notebook.
GROQ_API_KEY = "API_KEY"
# Groq SDK client used by filter_irrelevant_nodes.
client = Groq(api_key=GROQ_API_KEY)
# Module-level Neo4j driver used by get_nodes_from_label; it reuses the
# url/username/password defined earlier in the notebook.
driver = GraphDatabase.driver(url, auth=(username, password))

def get_nodes_from_label(label: str):
    """Extract node names from Neo4j for a given label.

    Args:
        label: Node label to match. It may come from interactive input, so it
            is quoted before being placed into the query.

    Returns:
        The distinct, non-empty `name` values of nodes carrying that label.
    """
    # Labels cannot be sent as Cypher parameters, so the label is embedded in
    # the query text. Quote it with backticks and double any embedded backtick
    # so that the input is always read as one identifier, never as Cypher.
    quoted_label = "`" + label.replace("`", "``") + "`"
    query = f"""
    MATCH (n:{quoted_label})
    RETURN DISTINCT n.name AS node
    """
    with driver.session() as session:
        results = session.run(query)
        # Skip nodes without a usable name (missing or empty).
        return [record["node"] for record in results if record["node"]]

def filter_irrelevant_nodes(label: str, nodes: list):
    """Ask LLM to identify irrelevant nodes for T2D.

    Args:
        label: Category (node label) the names belong to; shown to the model.
        nodes: Node names to review.

    Returns:
        The list of names the model flags as irrelevant. An empty list is
        returned when `nodes` is empty (no request is sent) or when the reply
        cannot be parsed into the expected structure.
    """
    # Nothing to review: skip the API call entirely.
    if not nodes:
        return []

    prompt = f"""
You are a medical expert specialized in Type 2 Diabetes (T2D).
Here is a list of nodes coming from a knowledge graph (category: {label}).

Your task is to identify the nodes that:
- are NOT medical concepts,
- are NOT relevant to Type 2 Diabetes,
- or make no medical sense.

Return STRICTLY in JSON format only:
{{
  "irrelevant_nodes": ["Node1", "Node2", ...]
}}

### Examples:

Input List: [Insulin, Glucose metabolism, Car Engine, Guitar, Metformin]
Output: {{"irrelevant_nodes": ["Car Engine", "Guitar"]}}

Input List: [Obesity, Hypertension, Banana, Blood Pressure, T2D, Chair]
Output: {{"irrelevant_nodes": ["Banana", "Chair"]}}

Input List: [Diabetes Distress, Depression, Anxiety, Happiness, Leg Pain, iPhone]
Output: {{"irrelevant_nodes": ["Happiness", "iPhone"]}}

---

Now process the following list:

List: {nodes}
"""

    response = client.chat.completions.create(
        model="llama3-70b-8192",
        messages=[{"role": "user", "content": prompt}],
        response_format={"type": "json_object"},
    )

    result = response.choices[0].message.content

    # Only parsing problems are tolerated here: invalid JSON, a missing key,
    # or a reply that is not a JSON object (or is None).
    try:
        irrelevant_nodes = json.loads(result)["irrelevant_nodes"]
    except (json.JSONDecodeError, KeyError, TypeError) as e:
        print("Error parsing LLM response:", e)
        return []

    # The model's output is untrusted; callers expect a list of names.
    if not isinstance(irrelevant_nodes, list):
        print("Error parsing LLM response:", "'irrelevant_nodes' is not a list")
        return []

    return irrelevant_nodes

# Interactive driver: ask for a label, pull its node names from Neo4j, and
# print the names the LLM flags as irrelevant. Nothing is deleted here; the
# result is only printed.
if __name__ == "__main__":
    # Enter the label you want to process
    label = input("Enter the label name (e.g. Symptom, Drug, Treatment): ").strip()

    # NOTE(review): this rebinds the module-level `nodes` that the earlier
    # count cell assigned — confirm nothing later relies on the count.
    nodes = get_nodes_from_label(label)
    print(f"✅ Extracted {len(nodes)} nodes for label '{label}'")

    irrelevant = filter_irrelevant_nodes(label, nodes)
    print(f"\n🚮 Irrelevant nodes suggested for deletion in '{label}':\n{irrelevant}")
