# RAG with Knowledge Graph - Load Data

In [None]:
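This notebook and its associated Python modules (`neo4j_graph.py` and `neo4j_graph_loader.py`) load a Markdown document into a Neo4j knowledge graph so it can be used for retrieval-augmented generation (RAG).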

In [10]:
# Install some packages that are needed. 

!pip install neo4j langchain langchain-community ollama

Collecting ollama
  Downloading ollama-0.2.0-py3-none-any.whl.metadata (4.1 kB)
Collecting httpx<0.28.0,>=0.27.0 (from ollama)
  Downloading httpx-0.27.0-py3-none-any.whl.metadata (7.2 kB)
Collecting httpcore==1.* (from httpx<0.28.0,>=0.27.0->ollama)
  Downloading httpcore-1.0.5-py3-none-any.whl.metadata (20 kB)
Collecting h11<0.15,>=0.13 (from httpcore==1.*->httpx<0.28.0,>=0.27.0->ollama)
  Downloading h11-0.14.0-py3-none-any.whl.metadata (8.2 kB)
Downloading ollama-0.2.0-py3-none-any.whl (9.5 kB)
Downloading httpx-0.27.0-py3-none-any.whl (75 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m75.6/75.6 kB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hDownloading httpcore-1.0.5-py3-none-any.whl (77 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m77.9/77.9 kB[0m [31m8.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading h11-0.14.0-py3-none-any.whl (58 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m58.3/58.3 kB[0m 

In [2]:
# Imports needed

import glob
import json
import os
import re

from pprint import pprint

from langchain.llms import Ollama
from langchain.graphs import Neo4jGraph
from langchain.vectorstores.neo4j_vector import Neo4jVector
#from langchain.embeddings.huggingface import HuggingFaceBgeEmbeddings
from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOllama
from langchain import PromptTemplate

# Imports from other local python files
from neo4j_graph import Graph



## Establish Database Connection

The cell connects to the Neo4J instance. It relies on several environment variables. 

**PLEASE NOTE**: The variables have been changed to support multiple databases in the same instance. 

| Variable            | Description                          | Sample Value          |
|---------------------|--------------------------------------|-----------------------|
| FHIR_GRAPH_URL      | Where to find the instance of Neo4j. | bolt://localhost:7687 |
| FHIR_GRAPH_USER     | The username for the database.       | neo4j                 |
| FHIR_GRAPH_PASSWORD | The password for the database.       | password              |
| FHIR_GRAPH_DATABASE | The name of the database instance.   | neo4j                 |

In [2]:
# Connection settings are read from the FHIR_GRAPH_* environment variables
# listed in the table above. The fallbacks are the values this cell used to
# hard-code, so behaviour is unchanged when the variables are not set.
NEO4J_URI = os.environ.get("FHIR_GRAPH_URL", "bolt://neo4j-1:7687")
NEO4J_USER = os.environ.get("FHIR_GRAPH_USER", "neo4j")
NEO4J_PASSWORD = os.environ.get("FHIR_GRAPH_PASSWORD", "abc123abc123")
NEO4J_DATABASE = os.environ.get("FHIR_GRAPH_DATABASE", "")

# `Graph` is the local wrapper imported from neo4j_graph.
g = Graph(NEO4J_URI, NEO4J_USER, NEO4J_PASSWORD, NEO4J_DATABASE)

## Helper Database Cells

The following cells are here to help manage the database (inspecting its size and wiping it). They do not need to be run on a blank database. 

In [52]:
# Print whatever `database_metrics()` reports for the database `g` is connected to.
print(g.database_metrics())

[[38, 0]]
(38, 0)


In [3]:
# Destructive: deletes the nodes and relationships in the connected database.
# Skip this cell unless a clean reload is intended.
g.wipe_database()

[[167, 165]]


'Deleted 167 nodes and 165 relationships in 0.052 seconds'

## Load Markdown into the Graph

This cell opens the bundle and creates the nodes and edges in the graph for each resource. 

Every resource will result in a node that has a label based on the resource type and as a `resource`. The values within the resource will be flattened 
into properties within the node. Also, a property called `text` will include a string representation of the resource. 

Additionally, nodes will be created for every unique date (ignoring time) found in the FHIR resources. 

Edges will be created for every reference in the resource to something that can be found within the bundles loaded. So the linking resource doesn't have 
to be in the same bundle, but it must be in a bundle that is loaded. 

Edges will also connect resources to the dates found inside them. 

**Warning:** This cell may take some time to run. 

In [4]:
from langchain_community.document_loaders import DirectoryLoader, TextLoader
from langchain.text_splitter import MarkdownTextSplitter
from langchain_text_splitters import MarkdownHeaderTextSplitter

# Read every Markdown file below the data directory as plain text.
loader = DirectoryLoader('/data-transfer/iihf', glob="**/*.md", loader_cls=TextLoader)
documents = loader.load()

# Sanity check: source path of the first document and how many were loaded.
print(documents[0].metadata["source"])
print(len(documents))

# Heading levels 1-4 ("#" .. "####") map to metadata keys header1 .. header4.
headers_to_split_on = [("#" * level, f"header{level}") for level in range(1, 5)]

# Only the first loaded document is split into header-delimited sections.
markdown_splitter = MarkdownHeaderTextSplitter(
    headers_to_split_on=headers_to_split_on,
    strip_headers=True,
)
md_header_splits = markdown_splitter.split_text(documents[0].page_content)

/data-transfer/iihf/rulebook.md
1


In [10]:
# Inspect the second header-delimited section: its text plus the header metadata.
md_header_splits[1]
# Alternative check, number of sections: len(md_header_splits)

Document(page_content='Games under jurisdiction of the IIHF shall be played on an ice surface known as the “Rink” and must adhere to the dimensions and specifications prescribed by the IIHF and these rules.  \nNo ice markings shall be permitted except those provided for under these rules unless express written permission has been obtained from the IIHF. On-ice logos must not interfere with any official ice markings provided for the proper playing of the game.  \nIn the interval between periods, the ice surface shall be flooded unless mutually agreed to the contrary.', metadata={'header1': 'IIHF Official Rulebook 2023/24', 'header2': 'SECTION 01 PLAYING AREA', 'header3': 'RULE 1 RINK', 'header4': '1.1 RINK'})

In [5]:
# Character-level splitting of the header sections
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Only the separator list is customised here; every other splitter setting
# keeps the library default.
text_splitter = RecursiveCharacterTextSplitter(
    separators=[
        "\n\n",
        "\n",
        " ",
        ".",
        ",",
        "\u200b",  # Zero-width space
        "\uff0c",  # Fullwidth comma
        "\u3001",  # Ideographic comma
        "\uff0e",  # Fullwidth full stop
        "\u3002",  # Ideographic full stop
        "",
    ],
)

# NOTE: these two values are defined but NOT passed to the splitter above.
# An earlier variant of this cell that did pass them was left disabled.
chunk_size = 200
chunk_overlap = 30

# Split the header sections; each chunk keeps its section's header metadata.
chunks = text_splitter.split_documents(md_header_splits)

# Spot check: print the header metadata values of one chunk.
for value in chunks[6].metadata.values():
    print(value)

IIHF Official Rulebook 2023/24
SECTION 01 PLAYING AREA
RULE 1 RINK
1.6 DIVISION OF ICE SURFACE


In [76]:
# Total number of chunks produced by the character-level split.
len(chunks)

73

## Load data into the Graph

In [12]:
import os
from langchain.graphs import Neo4jGraph
from neo4j import GraphDatabase

# Connection settings: taken from the documented FHIR_GRAPH_* environment
# variables, falling back to the values this cell previously hard-coded.
NEO4J_URI = os.environ.get("FHIR_GRAPH_URL", "bolt://neo4j-1:7687")
NEO4J_USER = os.environ.get("FHIR_GRAPH_USER", "neo4j")
NEO4J_PASSWORD = os.environ.get("FHIR_GRAPH_PASSWORD", "abc123abc123")
NEO4J_DATABASE = os.environ.get("FHIR_GRAPH_DATABASE", "")

# Export the settings under the NEO4J_* names, which the argument-less
# Neo4jGraph() call below is expected to read.
# Fix: this used to assign NEO4J_URL, a name that is not defined until a much
# later cell, so the cell raised NameError on a fresh kernel.
os.environ["NEO4J_URI"] = NEO4J_URI
os.environ["NEO4J_USERNAME"] = NEO4J_USER
os.environ["NEO4J_PASSWORD"] = NEO4J_PASSWORD
os.environ["NEO4J_DATABASE"] = NEO4J_DATABASE

# LangChain graph wrapper, passed to the neo4j_graph_loader helpers below.
graph = Neo4jGraph()

# Raw Neo4j driver, passed to create_embedding() further down.
driver = GraphDatabase.driver(NEO4J_URI, auth=(NEO4J_USER, NEO4J_PASSWORD), database=NEO4J_DATABASE)

In [None]:
from neo4j_graph_loader import initialise_neo4j

# One-time setup performed by the local neo4j_graph_loader helper.
# NOTE(review): what it creates is not visible from this notebook;
# presumably constraints/indexes. Confirm in neo4j_graph_loader.
initialise_neo4j(graph)

In [7]:
from neo4j_graph_loader import insert_document

# Only the first loaded document is inserted; `document` is reused by the
# derive_sections cell below.
document = documents[0]

insert_document(graph, document)

In [8]:
from neo4j_graph_loader import insert_chunks

# Insert the character-level chunks through the local loader helper.
# NOTE(review): the meaning of the two literal `1` arguments is defined in
# neo4j_graph_loader and not visible here; presumably identifiers. Confirm.
insert_chunks(graph, 1, 1, chunks)

In [9]:
from neo4j_graph_loader import derive_sections
# Uses `document` from the insert_document cell above.
# NOTE(review): presumably builds section structure from the header metadata;
# confirm in neo4j_graph_loader.
derive_sections(graph, document)

## Create the Vector Embedding Index in the Graph

The cells below create a Vector Index in Neo4J and store an embedding for each node labeled `Chunk`, computed from the string in its `text` property. 

**Warning:** This cell may take some time to run. 

In [2]:
import ollama
from ollama import Client
from langchain_community.embeddings import OllamaEmbeddings

# Ollama server address and the embedding model used throughout the notebook.
OLLAMA_URL = "http://192.168.1.102:11434"
EMBEDDING_MODEL = "mxbai-embed-large"

# NOTE: this rebinds `ollama` from the imported module to a Client instance.
# Later cells pass `ollama` around as the client, so the name is kept.
ollama = Client(host=OLLAMA_URL)

# LangChain embedding wrapper pointing at the same server and model.
embedding = OllamaEmbeddings(base_url=OLLAMA_URL, model=EMBEDDING_MODEL, temperature=0)

# Embed a throwaway string to find the length of the model's vectors.
embedding_dimension = len(embedding.embed_query("foo"))
print(embedding_dimension)


1024


In [None]:
def get_embedding(client, text, model):
    """Return the embedding vector for ``text``.

    Args:
        client: Object exposing ``embeddings(model=..., prompt=...)``.
        text: The string to embed; sent as the ``prompt`` argument.
        model: Name of the embedding model to request.

    Returns:
        The value stored under the ``"embedding"`` key of the client's response.
    """
    return client.embeddings(model=model, prompt=text)["embedding"]

# Smoke test: embed one word through the Ollama client and print the full vector.
print(get_embedding(ollama, "car", EMBEDDING_MODEL))

In [7]:
from neo4j_graph_loader import create_vector_index

# Size the index from the dimension measured in the embedding setup cell
# instead of a hard-coded 1024, so it stays correct if EMBEDDING_MODEL changes.
create_vector_index(graph, embedding_dimension)

In [6]:
from neo4j_graph_loader import create_embedding

# Arguments: raw driver, node label 'Chunk', property 'text', the Ollama
# client (`ollama` is the Client instance here, not the module), model name.
create_embedding(driver, 'Chunk', 'text', ollama, EMBEDDING_MODEL)

Processed 73 Chunk nodes for property @text.


73

# Using LangChain to create a Vector Embedding Index in an existing graph

In [15]:
from langchain.vectorstores import Neo4jVector
# Build a LangChain vector store over nodes already in the graph: `Chunk`
# nodes, using their `text` property as the text and `embedding` as the
# vector property, under the index name `chunk_text`.
# The returned store is not assigned; it is only shown as the cell output.
Neo4jVector.from_existing_graph(
    embedding=embedding,
    url=NEO4J_URI,
    username=NEO4J_USER,
    password=NEO4J_PASSWORD,
    database=NEO4J_DATABASE,
    index_name='chunk_text',
    node_label="Chunk",
    text_node_properties=['text'],
    embedding_node_property='embedding',
)

<langchain_community.vectorstores.neo4j_vector.Neo4jVector at 0xffff457d6ed0>

# Using LangChain to create a Neo4J Vector Database

In [None]:
from langchain.vectorstores import Neo4jVector
from langchain_community.embeddings import OllamaEmbeddings

# Neo4j connection settings (the same bolt endpoint used earlier in the notebook).
NEO4J_URL = "bolt://neo4j-1:7687"
NEO4J_USER = "neo4j"
NEO4J_PASSWORD = "abc123abc123"
NEO4J_DATABASE = ""

# Alternative to the manual loading above: let LangChain create the nodes and
# their embeddings directly from the chunk documents.
# Changes from the previous version of this cell:
#   * the model name comes from EMBEDDING_MODEL (defined next to OLLAMA_URL)
#     instead of being repeated as a literal;
#   * NEO4J_DATABASE is now passed through, matching the from_existing_graph
#     cell, instead of being defined and ignored.
neo4j_vector = Neo4jVector.from_documents(
    chunks,
    embedding=OllamaEmbeddings(
        base_url=OLLAMA_URL, model=EMBEDDING_MODEL, temperature=0
    ),
    url=NEO4J_URL,
    username=NEO4J_USER,
    password=NEO4J_PASSWORD,
    database=NEO4J_DATABASE,
    node_label="Chunk",
)

# directly show the graph resulting from the given Cypher query


In [None]:
# Directly show the graph resulting from the given Cypher query.
# The default query returns up to 50 relationships of any type except MENTIONS.
default_cypher = "MATCH (s)-[r:!MENTIONS]->(t) RETURN s,r,t LIMIT 50"

def showGraph(cypher: str = default_cypher):
    """Run ``cypher`` and return a graph widget showing the result.

    Connection details are read from the ``NEO4J_URI``, ``NEO4J_USERNAME`` and
    ``NEO4J_PASSWORD`` environment variables.

    Args:
        cypher: Cypher query whose result graph is displayed.

    Returns:
        A ``GraphWidget`` built from the query result, with node labels mapped
        to the ``id`` property.

    Raises:
        KeyError: If one of the required environment variables is not set.
    """
    # NOTE(review): `GraphWidget` is not imported anywhere in the visible
    # notebook; presumably it comes from `yfiles_jupyter_graphs`. Confirm.
    #
    # The driver and session are context-managed so both are closed once the
    # result graph has been handed to the widget; previously a new driver and
    # session were leaked on every call.
    with GraphDatabase.driver(
        uri=os.environ["NEO4J_URI"],
        auth=(os.environ["NEO4J_USERNAME"], os.environ["NEO4J_PASSWORD"]),
    ) as driver:
        with driver.session() as session:
            widget = GraphWidget(graph=session.run(cypher).graph())
    widget.node_label_mapping = 'id'
    return widget

# Render the default query; the returned widget is displayed as the cell output.
showGraph()