# RAG with Knowledge Graph - Load Data

In [None]:
This notebook and associate Python 

In [10]:
# Install some packages that are needed. 

!pip install neo4j langchain langchain-community ollama

Collecting ollama
  Downloading ollama-0.2.0-py3-none-any.whl.metadata (4.1 kB)
Collecting httpx<0.28.0,>=0.27.0 (from ollama)
  Downloading httpx-0.27.0-py3-none-any.whl.metadata (7.2 kB)
Collecting httpcore==1.* (from httpx<0.28.0,>=0.27.0->ollama)
  Downloading httpcore-1.0.5-py3-none-any.whl.metadata (20 kB)
Collecting h11<0.15,>=0.13 (from httpcore==1.*->httpx<0.28.0,>=0.27.0->ollama)
  Downloading h11-0.14.0-py3-none-any.whl.metadata (8.2 kB)
Downloading ollama-0.2.0-py3-none-any.whl (9.5 kB)
Downloading httpx-0.27.0-py3-none-any.whl (75 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m75.6/75.6 kB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hDownloading httpcore-1.0.5-py3-none-any.whl (77 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m77.9/77.9 kB[0m [31m8.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading h11-0.14.0-py3-none-any.whl (58 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m58.3/58.3 kB[0m 

In [1]:
# Imports needed

import glob
import json
import os
import re

from pprint import pprint

from langchain.llms import Ollama
from langchain.graphs import Neo4jGraph
from langchain.vectorstores.neo4j_vector import Neo4jVector
#from langchain.embeddings.huggingface import HuggingFaceBgeEmbeddings
from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOllama
from langchain import PromptTemplate

# Imports from other local python files
from neo4j_graph import Graph



## Establish Database Connection

The cell connects to the Neo4J instance. It relies on several environment variables. 

**PLEASE NOTE**: The variable have been changed to support multiple databases in the same instance. 

| Variable            | Description                          | Sample Value          |
|---------------------|--------------------------------------|-----------------------|
| FHIR_GRAPH_URL      | Where to find the instance of Neo4j. | bolt://localhost:7687 |
| FHIR_GRAPH_USER     | The username for the database.       | neo4j                 |
| FHIR_GRAPH_PASSWORD | The password for the database.       | password              |
| FHIR_GRAPH_DATABASE | The name of the database instance.   | neo4j                 |

In [2]:
NEO4J_URI = "bolt://neo4j-1:7687"
NEO4J_USER = "neo4j"
NEO4J_PASSWORD = "abc123abc123"
NEO4J_DATABASE = ""

g = Graph(NEO4J_URI, NEO4J_USER, NEO4J_PASSWORD, NEO4J_DATABASE)

## Helper Database Cells

The following three cells are here to be used to manage the database. They do not need to be run on a blank database. 

In [157]:
print(g.database_metrics())

[[432, 431]]
(432, 431)


In [3]:
g.wipe_database()

[[0, 0]]


'Deleted 0 nodes and 0 relationships in 0.031 seconds'

## Load Markdown into the Graph

This cell opens the bundle and creates the nodes and edges in the graph for each resource. 

Every resource will result in a node that has a label based on the resource type and as a `resource`. The values within the resource will be flattened 
into properties within the node. Also, a property called `text` will include a string representation of the resource. 

Additionally, nodes will be created for every unique date (ignoring time) found in the FHIR resources. 

Edges will be created for every reference in the resource to something that can be found within the bundles loaded. So the linking resource doesn't have 
to be in the same bundle, but it must be in a bundle that is loaded. 

Edges will also connect resources to the dates found inside them. 

**Warning:** This cell may take sometime to run. 

In [4]:
from langchain_community.document_loaders import DirectoryLoader
from langchain_community.document_loaders import TextLoader
from langchain.text_splitter import MarkdownTextSplitter
from langchain_text_splitters import MarkdownHeaderTextSplitter

loader = DirectoryLoader('/data-transfer/iihf', glob="**/*.md", loader_cls=TextLoader)
documents = loader.load()

print (documents[0].metadata["source"])
print (len(documents))

headers_to_split_on = [
    ("#", "header1"),
    ("##", "header2"),
    ("###", "header3"),
    ("####", "header4"),
]

markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on, strip_headers=True)
md_header_splits = markdown_splitter.split_text(documents[0].page_content)

/data-transfer/iihf/rulebook.md
1


In [27]:
md_header_splits[0].page_content


'No matter where ice hockey is played, the object of the game is the same – to put the puck into the opponent’s goal. Beyond that, ice hockey across the globe is subject to certain variations. This makes the rules of the game extremely important. These rules must be followed all times, in all countries, in all age categories, for the game to be enjoyed by everyone.  \nHockey’s speed is one of the qualities that makes it so exciting. But this skill and excitement must be balanced with fair play and respect.  \nIt is, therefore, important to make a clear separation between the purpose of all the elements of the game and to use these respectfully. These distinctions can be taught at an early age or whenever one begins to show interest in the game. And this is why hockey development begins with parents and coaches, those people most influential in guiding a person, old or young, into playing the game properly and within the rules.  \nThe IIHF Championship program encompasses 81 Member Nati

In [5]:
# Char-level splits
from langchain_text_splitters import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(
    separators=[
        "\n\n",
        "\n",
        " ",
        ".",
        ",",
        "\u200b",  # Zero-width space
        "\uff0c",  # Fullwidth comma
        "\u3001",  # Ideographic comma
        "\uff0e",  # Fullwidth full stop
        "\u3002",  # Ideographic full stop
        "",
    ],
    # Existing args
)

chunk_size = 300
chunk_overlap = 30
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=chunk_size, chunk_overlap=chunk_overlap
)

# Split
chunks = text_splitter.split_documents(md_header_splits)
for m in chunks[6].metadata:
    print (chunks[6].metadata[m])

IIHF Official Rulebook 2023/24
Welcome


In [6]:
len(chunks)

511

## Load data into the Graph

In [7]:
import os
from langchain.graphs import Neo4jGraph
from neo4j import GraphDatabase

NEO4J_URI = "bolt://neo4j-1:7687"
NEO4J_USER = "neo4j"
NEO4J_PASSWORD = "abc123abc123"
NEO4J_DATABASE = ""

os.environ["NEO4J_URI"] = NEO4J_URI
os.environ["NEO4J_USERNAME"] = NEO4J_USER
os.environ["NEO4J_PASSWORD"] = NEO4J_PASSWORD
os.environ["NEO4J_DATABASE"] = NEO4J_DATABASE

graph = Neo4jGraph()

driver = GraphDatabase.driver(NEO4J_URI, auth=(NEO4J_USER, NEO4J_PASSWORD), database=NEO4J_DATABASE)

In [8]:
from neo4j_graph_loader import initialise_neo4j

initialise_neo4j(graph)

In [9]:
from neo4j_graph_loader import insert_document

document = documents[0]

insert_document(graph, document)

In [10]:
from neo4j_graph_loader import insert_chunks

insert_chunks(graph, 1, 1, chunks)

In [11]:
from neo4j_graph_loader import derive_sections
derive_sections(graph, document)

## Create the Vector Embedding Index in the Graph

This cell creates a Vector Index in Neo4J. It looks at nodes labeled as `resource` and indexes the string representation in the `text` property. 

**Warning:** This cell may take sometime to run. 

In [13]:
import ollama
from ollama import Client
from langchain_community.embeddings import OllamaEmbeddings

OLLAMA_URL="http://192.168.1.102:11434"
EMBEDDING_MODEL="mxbai-embed-large"

ollama = Client(host=OLLAMA_URL)

embedding = OllamaEmbeddings(
            base_url=OLLAMA_URL, model=EMBEDDING_MODEL, temperature=0)

embedding_dimension = len(embedding.embed_query("foo"))
print (embedding_dimension)


1024


In [None]:
def get_embedding(client, text, model):
        response = client.embeddings(model=model, prompt=text)
        return response["embedding"]

print(get_embedding(ollama, "car", EMBEDDING_MODEL))

In [16]:
from neo4j_graph_loader import create_vector_index

create_vector_index(graph, 1024)

TypeError: create_vector_index() missing 1 required positional argument: 'dimension'

In [17]:
from neo4j_graph_loader import create_embedding

create_embedding(driver, 'Chunk', 'text', ollama, EMBEDDING_MODEL)

Processed 511 Chunk nodes for property @text.


511

# Using LangChain to create a Vector Embedding Index in an existing graph

This cell creates a Vector Index in Neo4J. It looks at nodes labeled as `Chunk` and indexes the string representation in the `text` property. 

**Warning:** This cell may take sometime to run. 

In [18]:
from langchain.vectorstores import Neo4jVector
Neo4jVector.from_existing_graph(
    embedding=embedding,
    url=NEO4J_URI,
    username=NEO4J_USER,
    password=NEO4J_PASSWORD,
    database=NEO4J_DATABASE,
    index_name='chunk_text',
    node_label="Chunk",
    text_node_properties=['text'],
    embedding_node_property='embedding',
)

<langchain_community.vectorstores.neo4j_vector.Neo4jVector at 0xffff504fbf50>

# Using LangChain to create a Neo4J Vector Database (do not use!)

In [None]:
from langchain.vectorstores import Neo4jVector
from langchain_community.embeddings import OllamaEmbeddings

# Neo4j Aura credentials
NEO4J_URL = "bolt://neo4j-1:7687"
NEO4J_USER = "neo4j"
NEO4J_PASSWORD = "abc123abc123"
NEO4J_DATABASE = ""

# Instantiate Neo4j vector from documents
neo4j_vector = Neo4jVector.from_documents(
    chunks,
    embedding = OllamaEmbeddings(
            base_url=OLLAMA_URL, model="mxbai-embed-large", temperature=0
        ),
    url=NEO4J_URL,
    username=NEO4J_USER,
    password=NEO4J_PASSWORD,
    node_label="Chunk"
)

### Create Vector Index 

This cell creates a new vector index, using the index created above. 

This is here because running the cell above can take time and only should be done one time when the DB is created. 

In [19]:
vector_index = Neo4jVector.from_existing_index(
    embedding = OllamaEmbeddings(
            base_url=OLLAMA_URL, model="mxbai-embed-large", temperature=0
        ),
    url=NEO4J_URI,
    username=NEO4J_USER,
    password=NEO4J_PASSWORD,
    database=NEO4J_DATABASE,
    index_name='chunk_text'
)

## Pick the Question

All following cells will work with the question as defined here. As you can see, I have been experimenting with a number of 
different questions.


In [20]:
#question = "What is special about the equipment of a goalkeeper in icehockey"
#question = "When can a stick meassurement be requested"
#question = "what is the size of an icehockey rink"
#question = "how many players play in a hockey game"
question = "What is considered dangerous equipment"

## Start working with the LLM

The rest of this notebook is working with the LLM to attempt to answer the question.

### Ask LLM

This first cell asks the LLM with no context and gets told the LLM can't answer without more information. 

In [24]:
ollama_model = 'llama3' # mistral, orca-mini, llama2

llm = Ollama(base_url=OLLAMA_URL, model=ollama_model)
no_rag_answer = llm(question)
print(no_rag_answer)

  warn_deprecated(


"Dangerous equipment" can vary depending on the context, industry, and regulations. However, here are some examples of equipment that are commonly considered hazardous or potentially dangerous:

1. Heavy machinery:
	* Cranes
	* Forklifts
	* Backhoes
	* Excavators
2. Power tools and equipment:
	* Circular saws
	* Reciprocating saws (e.g., sawzalls)
	* Drill presses
	* Soldering irons
3. Industrial machinery:
	* Presses
	* Lathes
	* Grinders
	* Milling machines
4. Chemical processing equipment:
	* Reactors
	* Distillation columns
	* Centrifuges
	* Mixing tanks
5. Electrical equipment:
	* High-voltage transformers
	* Electrical substations
	* Power distribution panels
6. Pneumatic and hydraulic systems:
	* Air compressors
	* Hydraulic pumps
	* Pneumatic cylinders
7. Food processing equipment:
	* Meat grinders
	* Slaughterhouse machinery
	* Dairy processing equipment
8. Medical equipment:
	* MRI machines
	* Radiation therapy equipment
	* Surgical lasers
9. Firefighting and rescue equipment

In [21]:
emb = embedding.embed_query(question)
graph.query("""
    CALL db.index.vector.queryNodes('chunk_text', 3, $embedding) yield node, score
    RETURN score, node.text AS text
""", { "embedding": emb })

[{'score': 0.8824647665023804,
  'text': 'A “broken stick” is one which, in the opinion of the Referee, is unfit for normal play.'},
 {'score': 0.8573859930038452,
  'text': 'In the interval between periods, the ice surface shall be flooded unless mutually agreed to the contrary.'},
 {'score': 0.8559696674346924, 'text': 'to the ice surface are not allowed.'}]

### Check Vector Index

This cell checks what the vector index will return and is here for debugging / informational purposes. 

In [22]:
response = vector_index.similarity_search(question, k=2) # k_nearest is not used here because we don't have a retrieval query yet.
print(response[0].page_content)
print(response[1].page_content)


A “broken stick” is one which, in the opinion of the Referee, is unfit for normal play.
In the interval between periods, the ice surface shall be flooded unless mutually agreed to the contrary.


### Ask the LMM with Context

This cell will ask the LLM with the string representation of the resource node that is found by the vector index. 

In [25]:
ollama_model = 'llama3' # mistral, orca-mini, llama2

vector_qa = RetrievalQA.from_chain_type(
                llm=ChatOllama(base_url=OLLAMA_URL, model=ollama_model, temperature=0)
                , chain_type="stuff"
                , retriever=vector_index.as_retriever(search_kwargs={'k': 1}) # k_nearest is not used here because we don't have a retrieval query yet.
                , verbose=True
                , chain_type_kwargs={"verbose": True}
)

pprint(vector_qa.run(question))

  warn_deprecated(




[1m> Entering new RetrievalQA chain...[0m


[1m> Entering new StuffDocumentsChain chain...[0m


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mSystem: Use the following pieces of context to answer the user's question. 
If you don't know the answer, just say that you don't know, don't try to make up an answer.
----------------
A “broken stick” is one which, in the opinion of the Referee, is unfit for normal play.
Human: What is considered dangerous equipment[0m

[1m> Finished chain.[0m

[1m> Finished chain.[0m

[1m> Finished chain.[0m
('In the context of a game or sport, "dangerous equipment" typically refers to '
 'any piece of gear that poses a significant risk of injury to players if not '
 'properly used or maintained.\n'
 '\n'
 'In the case of a broken stick in hockey, it would be considered dangerous '
 'equipment because it can cause harm to other players on the ice. A broken '
 'stick can fly off and hit someone, causing an injury, or

In [32]:
from langchain.vectorstores import Neo4jVector
from langchain.chains import RetrievalQAWithSourcesChain

neo4j_vector_store = Neo4jVector.from_existing_graph(
    embedding=embedding,
    url=NEO4J_URI,
    username=NEO4J_USER,
    password=NEO4J_PASSWORD,
    database=NEO4J_DATABASE,
    index_name='chunks_vector',
    node_label="Chunk",
    text_node_properties=['text'],
    embedding_node_property='embedding',
)
retriever = neo4j_vector_store.as_retriever(search_kwargs={'k': 1})

chain = RetrievalQA.from_chain_type(
    ChatOllama(model="llama3"), 
    chain_type="stuff", 
    retriever=retriever
)

def prettychain(question: str) -> str:
    """Pretty print the chain's response to a question"""
    response = chain({"question": question},
        return_only_outputs=True,)
    print(textwrap.fill(response['answer'], 60))

In [34]:
prettychain(question)

ValueError: Missing some input keys: {'query'}

# directly show the graph resulting from the given Cypher query


In [1]:
!pip install yfiles_jupyter_graphs 
!pip install ipywidgets

Collecting yfiles_jupyter_graphs
  Downloading yfiles_jupyter_graphs-1.6.2-py2.py3-none-any.whl.metadata (11 kB)
Downloading yfiles_jupyter_graphs-1.6.2-py2.py3-none-any.whl (15.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m15.3/15.3 MB[0m [31m41.8 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: yfiles_jupyter_graphs
Successfully installed yfiles_jupyter_graphs-1.6.2


In [4]:
from yfiles_jupyter_graphs import GraphWidget

# directly show the graph resulting from the given Cypher query
default_cypher = "MATCH (d)-[r:CONTAINS]->(s) RETURN d,r,s LIMIT 50"

def showGraph(cypher: str = default_cypher):
    # create a neo4j session to run queries
    driver = GraphDatabase.driver(
        uri = os.environ["NEO4J_URI"],
        auth = (os.environ["NEO4J_USERNAME"],
                os.environ["NEO4J_PASSWORD"]))
    session = driver.session()
    widget = GraphWidget(graph = session.run(cypher).graph())
    widget.node_label_mapping = 'id'
    #display(widget)
    return widget

w = showGraph()

GraphWidget(layout=Layout(height='800px', width='100%'))

In [64]:
graph.query("SHOW INDEXES")

[{'id': 8,
  'name': 'chunkKey',
  'state': 'ONLINE',
  'populationPercent': 100.0,
  'type': 'RANGE',
  'entityType': 'NODE',
  'labelsOrTypes': ['Chunk'],
  'properties': ['key'],
  'indexProvider': 'range-1.0',
  'owningConstraint': 'chunkKey',
  'lastRead': neo4j.time.DateTime(2024, 5, 31, 18, 13, 4, 126000000, tzinfo=<UTC>),
  'readCount': 2187},
 {'id': 12,
  'name': 'chunkVectorIndex',
  'state': 'ONLINE',
  'populationPercent': 100.0,
  'type': 'VECTOR',
  'entityType': 'NODE',
  'labelsOrTypes': ['Embedding'],
  'properties': ['value'],
  'indexProvider': 'vector-2.0',
  'owningConstraint': None,
  'lastRead': None,
  'readCount': 0},
 {'id': 4,
  'name': 'constraint_1dc138a',
  'state': 'ONLINE',
  'populationPercent': 100.0,
  'type': 'RANGE',
  'entityType': 'NODE',
  'labelsOrTypes': ['Chunk'],
  'properties': ['id'],
  'indexProvider': 'range-1.0',
  'owningConstraint': 'constraint_1dc138a',
  'lastRead': neo4j.time.DateTime(2024, 5, 29, 17, 27, 38, 963000000, tzinfo=<UTC