## Import libraries

In [9]:
from langchain_community.graphs.graph_document import (
    Node as BaseNode,
    Relationship as BaseRelationship,
    GraphDocument
)
from langchain.schema import Document
from typing import List, Dict, Any, Optional
from langchain.pydantic_v1 import Field, BaseModel
import os
from langchain.chains.structured_output.base import (
    create_openai_fn_runnable,
    create_structured_output_runnable,
)
from langchain_openai import ChatOpenAI
from langchain.prompts import ChatPromptTemplate
from langchain_community.document_loaders import TextLoader
import openai
from langsmith.wrappers import wrap_openai
from langsmith import traceable
from langchain_community.chat_models import ChatOllama
from langchain_neo4j import Neo4jGraph
from neo4j.debug import watch

## Connection to Neo4j Aura

In [11]:
# Stream the neo4j driver's debug log into the cell output. This is very
# verbose; drop the call once the connection is known to work.
watch('neo4j')

# Connection settings come from the environment so that no credential is
# stored in the notebook or its saved outputs.
# SECURITY: a database password was previously committed in this cell and
# should be rotated.
url = os.environ.get("NEO4J_URI", "neo4j+ssc://2aee9011.databases.neo4j.io")
username = os.environ.get("NEO4J_USERNAME", "neo4j")
password = os.environ.get("NEO4J_PASSWORD")
if not password:
    raise RuntimeError(
        "NEO4J_PASSWORD is not set; export it (or use %env) before running this cell."
    )

# Module-level graph handle used by the cells below to store extracted data.
graph = Neo4jGraph(
    url=url,
    username=username,
    password=password,
)

[DEBUG   ] [Thread 8488438912] [Task 4727199312     ] 2025-05-22 10:55:24,424  [#0000]  _: <POOL> created, routing address IPv4Address(('2aee9011.databases.neo4j.io', 7687))
[DEBUG   ] [Thread 8488438912] [Task 4727199312     ] 2025-05-22 10:55:24,424  [#0000]  _: <POOL> created, routing address IPv4Address(('2aee9011.databases.neo4j.io', 7687))
[DEBUG   ] [Thread 8488438912] [Task 4727199312     ] 2025-05-22 10:55:24,424  [#0000]  _: <POOL> created, routing address IPv4Address(('2aee9011.databases.neo4j.io', 7687))
[DEBUG   ] [Thread 8488438912] [Task 4727199312     ] 2025-05-22 10:55:24,424  [#0000]  _: <POOL> created, routing address IPv4Address(('2aee9011.databases.neo4j.io', 7687))
[DEBUG   ] [Thread 8488438912] [Task 4727199312     ] 2025-05-22 10:55:24,424  [#0000]  _: <POOL> created, routing address IPv4Address(('2aee9011.databases.neo4j.io', 7687))
[DEBUG   ] [Thread 8488438912] [Task 4727199312     ] 2025-05-22 10:55:24,428  [#0000]  _: <WORKSPACE> resolve home database
[DEBU

## Set up environment variables

In [None]:
# Environment variables for the OpenAI client and LangSmith tracing.
# NO_KEY / YOUR_KEY look like placeholders: substitute real values locally and
# do not commit them with the notebook.
# NOTE(review): OPENAI_API_KEY is presumably only needed when the commented-out
# ChatOpenAI model is enabled instead of ChatOllama; confirm.
%env OPENAI_API_KEY=NO_KEY
%env LANGCHAIN_TRACING_V2=true
%env LANGCHAIN_API_KEY=YOUR_KEY

## Class definitions for output structure in KG

In [4]:
# One key/value pair attached to a node or relationship in the structured
# output. Both key and value are required strings.
# NOTE(review): the docstring and Field descriptions are presumably exported
# into the schema presented to the LLM, so treat them as runtime text and
# confirm before rewording.
class Property(BaseModel):
    """A single property consisting of key and value"""
    key: str = Field(..., description="key")
    value: str = Field(..., description="value")

# Extraction-side node: subclasses the imported base Node (BaseNode) and
# declares `properties` as an optional list of Property items, defaulting to
# None. map_to_base_node converts instances back to BaseNode with a dict.
# NOTE(review): no docstring is added on purpose; a class docstring would
# presumably change the schema description seen by the LLM. Confirm.
class Node(BaseNode):
    properties: Optional[List[Property]] = Field(None, description="List of node properties")

# Extraction-side relationship: subclasses the imported base Relationship and
# declares `properties` as an optional list of Property items, defaulting to
# None. map_to_base_relationship converts instances back to BaseRelationship.
# NOTE(review): no docstring is added on purpose; a class docstring would
# presumably change the schema description seen by the LLM. Confirm.
class Relationship(BaseRelationship):
    properties: Optional[List[Property]] = Field(None, description="List of relationship properties")

# Top-level structured-output model passed to
# create_structured_output_runnable in get_extraction_chain. Both lists are
# required; extract_and_store_graph reads them as `data.nodes` and `data.rels`.
# NOTE(review): the docstring is presumably used as the schema description
# given to the LLM, so treat it as runtime text.
class KnowledgeGraph(BaseModel):
    """Generate a knowledge graph with entities and relationships."""
    nodes: List[Node] = Field(..., description="List of nodes in the knowledge graph")
    rels: List[Relationship] = Field(..., description="List of relationships in the knowledge graph")

## Utilities for KG representation

In [5]:
def format_property_key(s: str) -> str:
    """Collapse a whitespace-separated phrase into a lowerCamelCase key.

    The first word is lower-cased; every following word is capitalized
    (first character upper-cased, the rest lower-cased). The pieces are
    concatenated without a separator. A string containing no words (empty
    or whitespace only) is returned unchanged.
    """
    tokens = s.split()
    if len(tokens) == 0:
        return s
    head, *rest = tokens
    return head.lower() + "".join(map(str.capitalize, rest))

def props_to_dict(props) -> dict:
    """Build a plain dict from an iterable of key/value property objects.

    Each item's ``key`` is normalised with ``format_property_key`` and mapped
    to the item's ``value``. When two items normalise to the same key, the
    later one wins. A falsy ``props`` (``None`` or empty) yields ``{}``.
    """
    if props:
        return {format_property_key(item.key): item.value for item in props}
    return {}

def map_to_base_node(node: Node) -> BaseNode:
    """Convert an extracted ``Node`` into a ``BaseNode``.

    The property list is flattened into a dict via ``props_to_dict`` (which
    already returns ``{}`` for a missing or empty list). The id is
    title-cased and the type capitalized.

    NOTE(review): ``str.title()`` / ``str.capitalize()`` change the casing of
    identifiers and labels (an id such as "MSH6" becomes "Msh6"); confirm
    this is acceptable for the identifiers being extracted.
    """
    attrs = props_to_dict(node.properties)
    display_name = node.id.title()
    # A "name" property helps Cypher statement generation; it is set after the
    # conversion, so it replaces any extracted property with the same key.
    attrs["name"] = display_name
    return BaseNode(id=display_name, type=node.type.capitalize(), properties=attrs)


def map_to_base_relationship(rel: Relationship) -> BaseRelationship:
    """Convert an extracted ``Relationship`` into a ``BaseRelationship``.

    Both endpoints are converted with ``map_to_base_node``, the relationship
    type is passed through unchanged, and the property list is flattened into
    a dict (``{}`` when there are no properties).
    """
    return BaseRelationship(
        source=map_to_base_node(rel.source),
        target=map_to_base_node(rel.target),
        type=rel.type,
        properties=props_to_dict(rel.properties),
    )

## Create information extraction pipeline

In [None]:
# LLM configuration - set up the model used for data extraction.
# `llm` is a module-level global that get_extraction_chain reads when it
# builds the structured-output runnable.

# Alternative (disabled): OpenAI GPT API
#llm = ChatOpenAI(model="gpt-3.5-turbo-16k", temperature=0)
# Active: Ollama running "llama3" (the original note also lists TinyLlama as an option)
llm = ChatOllama(model="llama3")


@traceable
def get_extraction_chain(
    allowed_nodes: Optional[List[str]] = None,
    allowed_rels: Optional[List[str]] = None
    ):
    """Build the runnable that extracts a KnowledgeGraph from input text.

    A chat prompt is assembled from one system message (the extraction rules
    below) and two human messages, then combined with the module-level ``llm``
    and the ``KnowledgeGraph`` model via ``create_structured_output_runnable``.

    Args:
        allowed_nodes: Optional node labels. When truthy, they are joined with
            ", " and inserted into the system prompt after the
            "Allowed Node Labels" heading; otherwise that line is left empty.
        allowed_rels: Optional relationship types, handled the same way under
            the "Allowed Relationship Types" heading.

    Returns:
        The object returned by ``create_structured_output_runnable``.

    Notes:
        The system message is an f-string, so the allowed labels are
        interpolated when this function is called; ``{input}`` in the first
        human message is left for the prompt template to fill at invoke time.
        NOTE(review): label names containing curly braces would presumably be
        read as template variables by ChatPromptTemplate; confirm if such
        labels are possible.
        NOTE(review): the prompt heading mentions GPT-4 while the ``llm``
        configured in this notebook is a ChatOllama model.
    """
    prompt = ChatPromptTemplate.from_messages(
        [(
          "system",
          f"""# Knowledge Graph Instructions for GPT-4
## 1. Overview
You are a top-tier algorithm designed for extracting clinical and molecular information in structured formats to build a knowledge graph.
- **Nodes** represent entities and concepts.
- The aim is to achieve simplicity and clarity in the knowledge graph, respecting clinical terminology as its public will be healthcare workers.
## 2. Labeling Nodes
- **Consistency**: Ensure you use basic or elementary types for node labels.
- **Node IDs**: Never utilize integers as node IDs. Node IDs should be names or human-readable identifiers found in the text.
{'- **Allowed Node Labels:**' + ", ".join(allowed_nodes) if allowed_nodes else ""}
{'- **Allowed Relationship Types**:' + ", ".join(allowed_rels) if allowed_rels else ""}
## 3. Handling Numerical Data and Dates
- Numerical data, like age or other related information, should be incorporated as attributes or properties of the respective nodes.
- **No Separate Nodes for Dates/Numbers**: Do not create separate nodes for dates or numerical values. Always attach them as attributes or properties of nodes.
- **Property Format**: Properties must be in a key-value format.
- **Quotation Marks**: Never use escaped single or double quotes within property values.
- **Naming Convention**: Use snake_case for property keys, e.g., 'birth_date'.
## 4. Coreference Resolution
- **Maintain Entity Consistency**: When extracting entities, it's vital to ensure consistency.
If an entity, such as "John Doe", is mentioned multiple times in the text but is referred to by different names or pronouns (e.g., "Joe", "he", "Mr. Doe"),
always use the most complete identifier for that entity throughout the knowledge graph. In this example, use "John Doe" as the entity ID.
Remember, the knowledge graph should be coherent and easily understandable, so maintaining consistency in entity references is crucial.
## 5. Strict Compliance
Adhere to the rules strictly. Non-compliance will result in termination.
          """),
            ("human", "Use the given format to extract information from the following input: {input}"),
            ("human", "Tip: Make sure to answer in the correct format"),
        ])
    # Bind the prompt and the module-level llm to the KnowledgeGraph output model.
    return create_structured_output_runnable(KnowledgeGraph, llm, prompt)

In [27]:
def extract_and_store_graph(
    document: Document,
    nodes: Optional[List[str]] = None,
    rels: Optional[List[str]] = None,
) -> None:
    """Extract a knowledge graph from ``document`` and write it to ``graph``.

    Args:
        document: Source document; its ``page_content`` is sent to the
            extraction chain and the document itself is recorded as the
            graph document's source.
        nodes: Optional allowed node labels forwarded to
            ``get_extraction_chain``.
        rels: Optional allowed relationship types forwarded to
            ``get_extraction_chain``.

    Side effects:
        Prints the graph document as a dict and stores it through the
        module-level ``graph`` object.
    """
    chain = get_extraction_chain(nodes, rels)
    extracted = chain.invoke(document.page_content)

    # Convert the extraction-side models back to the base graph types.
    base_nodes = list(map(map_to_base_node, extracted.nodes))
    base_rels = list(map(map_to_base_relationship, extracted.rels))
    graph_doc = GraphDocument(
        nodes=base_nodes,
        relationships=base_rels,
        source=document,
    )

    # Echo what is about to be persisted, then store it in the graph.
    print(graph_doc.dict())
    graph.add_graph_documents([graph_doc])

In [28]:
# Load the sample input file; the path is relative to the notebook's working
# directory. Only the first element of the loaded result is used below.
loader = TextLoader('./input_docs/test_input.txt')
doc_input = loader.load()

# Run extraction and storage with no label restrictions. Any exception is
# caught and only its type and message are printed (no traceback), so the
# cell finishes even when extraction or the database write fails.
try:
    extract_and_store_graph(doc_input[0])
except Exception as e:
    print(type(e))
    print(e)

{'nodes': [{'id': 'Cheo', 'type': 'Hospital', 'properties': {'name': 'Cheo', 'address': '401 Smyth Rd, Ottawa, Ontario K1H 8L1', 'phone': '613-737-7600 ext. 3796', 'fax': '613-738-4814'}}, {'id': 'Genetics Diagnostic Laboratory', 'type': 'Laboratory', 'properties': {'name': 'Genetics Diagnostic Laboratory', 'address': '401 Smyth Rd, Ottawa, Ontario K1H 8L1', 'phone': '613-737-7600 ext. 3796', 'fax': '613-738-4814'}}, {'id': 'Patient', 'type': 'Patient', 'properties': {'name': 'Patient'}}, {'id': 'Hereditary Breast/Ovarian/Prostate Cancer Panel', 'type': 'Panel', 'properties': {'status': 'In process', 'name': 'Hereditary Breast/Ovarian/Prostate Cancer Panel'}}, {'id': 'Vincent, Krista, Md', 'type': 'Physician', 'properties': {'name': 'Vincent, Krista, Md'}}, {'id': 'Zelenietz Sari', 'type': 'Physician', 'properties': {'name': 'Zelenietz Sari'}}, {'id': 'Specimen A', 'type': 'Specimen', 'properties': {'type': 'Genetics Blood, edta', 'quantity': 'x2', 'name': 'Specimen A'}}, {'id': 'Msh6'