In [109]:
from dotenv import load_dotenv
import os

# Common data processing
import json
import textwrap
from langchain_community.document_loaders import (
    PyPDFLoader,
    TextLoader,
    UnstructuredHTMLLoader,
    JSONLoader,
    UnstructuredMarkdownLoader,
)

# Langchain
from langchain_community.graphs import Neo4jGraph

from langchain_community.vectorstores import Neo4jVector
from langchain_openai import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains import RetrievalQAWithSourcesChain
from langchain.prompts.prompt import PromptTemplate
from langchain.chains import GraphCypherQAChain
from langchain_openai import ChatOpenAI
import uuid
from langchain.embeddings.sentence_transformer import SentenceTransformerEmbeddings

# Warning control
import warnings

warnings.filterwarnings("ignore")

In [147]:
# Load from environment
load_dotenv(".env", override=True)
NEO4J_URI = os.getenv("NEO4J_URI")
NEO4J_USERNAME = os.getenv("NEO4J_USERNAME")
NEO4J_PASSWORD = os.getenv("NEO4J_PASSWORD")
NEO4J_DATABASE = os.getenv("NEO4J_DATABASE") or "neo4j"
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
# Note the code below is unique to this course environment, and not a
# standard part of Neo4j's integration with OpenAI. Remove if running
# in your own environment.
OPENAI_ENDPOINT = os.getenv("OPENAI_BASE_URL") + "/embeddings"

MISTRIAL_API_KEY = os.getenv("MISTRIAL_API_KEY")
MISTRIAL_ENDPOINT = os.getenv("MISTRIAL_BASE_URL") + "/embeddings"

# Global constants
VECTOR_INDEX_NAME = "form_chunks"
VECTOR_NODE_LABEL = "Chunk"
VECTOR_SOURCE_PROPERTY = "text"
VECTOR_EMBEDDING_PROPERTY = "textEmbedding"

In [111]:
def load_document_return_text(document: str):
    """
    _description_
    This function accepts a document, creates chunks, embed the chunks and write to a vector store

    Parameters
    ----------
    document : str
        _description_
        takes the directory of the document to be processed

    Returns
    -------
    _type_
        _description_
        Chunks of text from the document

    Raises
    ------
    ValueError
        _description_
    """

    if document.split(".")[-1] == "pdf":
        loader = PyPDFLoader(document)
    elif document.split(".")[-1] == "txt":
        loader = TextLoader(document)
    elif document.split(".")[-1] == "json":
        loader = JSONLoader(document)
    elif document.split(".")[-1] == "html":
        loader = UnstructuredHTMLLoader(document)
    elif document.split(".")[-1] == "md":
        loader = UnstructuredMarkdownLoader(document)
    else:
        raise ValueError("File type not supported")

    pages = loader.load()

    unify_content = ""
    for page in pages:
        unify_content += "\n" + page.page_content

    id = uuid.uuid4()

    form_id = id.hex[:16]

    patient_id = document.split("_")[-1].split(".")[0]

    print(f"Form ID: {form_id}")
    print(f"Patient ID: {patient_id}")

    return unify_content, form_id, patient_id


# file = ["./patient6_105099.txt",
#         "./patient3_104016.txt", "./patient9_103062.txt"]
# for i in file:
#     text, form_id, patient_id = load_document_return_text(i)

#     print(text)

In [112]:
def chunk_text_with_metadata(text: str, form_id: str, patient_id: str):
    """_summary_

    Parameters
    ----------
    text : str
        _description_
    """

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=800, chunk_overlap=20, length_function=len, is_separator_regex=False
    )

    text_chunks = text_splitter.split_text(text)

    chunks_with_metadata = [
        {
            "patientId": patient_id,
            "formId": form_id,
            "text": chunk,
            "chunkName": f"chunk_{form_id}_{patient_id}_{i}",
            "chunkIndex": i,
        }
        for i, chunk in enumerate(text_chunks)
    ]

    return chunks_with_metadata


# chunks_with_metadata = chunk_text_with_metadata(first_file_as_object)

# chunks_with_metadata

## Creating Nodes

In [113]:
def instantiate_node_creation():
    """_summary_"""

    kg = Neo4jGraph(
        url=NEO4J_URI,
        username=NEO4J_USERNAME,
        password=NEO4J_PASSWORD,
        database=NEO4J_DATABASE,
    )

    kg.query(
        """
    CREATE CONSTRAINT unique_chunk IF NOT EXISTS 
        FOR (c:Chunk) REQUIRE c.chunkName IS UNIQUE
    """
    )

    kg.query(
        """
    CREATE CONSTRAINT UniqueFile IF NOT EXISTS
        FOR (f:PatientFile) REQUIRE f.fileId IS UNIQUE
    """
    )
    kg.query(
        """
            CREATE CONSTRAINT UniquePatient IF NOT EXISTS
        FOR (p:Patient) REQUIRE p.patientId IS UNIQUE"""
    )
    return kg


# kg = instantiate_node_creation()
# kg.query("""match (n) detach delete n""")

#### Creating nodes for individual chunks of text

In [114]:
def create_chunk_nodes(chunks_with_metadata: list, kg):
    """_summary_"""

    merge_chunk_node_query = """
    MERGE(mergedChunk:Chunk {chunkName: $chunkParam.chunkName})
        ON CREATE SET 
            mergedChunk.formId = $chunkParam.formId, 
            mergedChunk.patientId = $chunkParam.patientId, 
            mergedChunk.text = $chunkParam.text,
            mergedChunk.chunkIndex = $chunkParam.chunkIndex
    RETURN mergedChunk
    """

    node_count = 0
    for chunk in chunks_with_metadata:
        print(f"Creating `:Chunk` node for chunk ID {chunk['chunkName']}")
        kg.query(merge_chunk_node_query, params={"chunkParam": chunk})
        node_count += 1
    print(f"Created {node_count} nodes")


# create_chunk_nodes(chunks_with_metadata)

In [115]:
def create_patientFile_node(patient_id: int, form_id: str, kg):
    """_summary_"""

    cypher = """
    MERGE (f:PatientFile {fileId: $formId})
    on create 
    set f.patientId = $patientId
    set f.lastUpdated = timestamp()
    """
    kg.query(cypher, params={"formId": form_id, "patientId": patient_id})


# create_patientFile_node(patient_id, form_id)

In [116]:
def create_patient_node(patient_id: int, name: str, kg):
    """_summary_"""

    cypher = """
    MERGE (f:Patient {patientId: $patientId})
    on create 
    set f.name = $name
    set f.lastUpdated = timestamp()
    """
    kg.query(cypher, params={"patientId": patient_id, "name": name})


# create_patient_node(patient_id, "Jean Dupont")

## Creating relationships

In [117]:
def create_chunk_relationships(form_id: str, kg):
    """_summary_"""

    cypher = """
    MATCH (from_same_section:Chunk)
    WHERE from_same_section.formId = $formIdParam
        
    WITH from_same_section
        ORDER BY from_same_section.chunkIndex ASC
    WITH collect(from_same_section) as section_chunk_list
        CALL apoc.nodes.link(
            section_chunk_list, 
            "NEXT", 
            {avoidDuplicates: true}
        )  // NEW!!!
    RETURN size(section_chunk_list)
    """

    kg.query(
        cypher,
        params={
            "formIdParam": form_id,
        },
    )


# create_chunk_relationships(form_id, kg)

In [118]:
def create_patientFile_chunk_and_patient_patientFile_relationship(kg):
    """_summary_"""

    cypher = """
    MATCH (c:Chunk)
    MATCH (f:PatientFile {fileId: c.formId})
    MATCH (p:Patient {patientId: f.patientId})
    MERGE (c)-[newRelationship:PART_OF]->(f)
    MERGE (f)-[r:BELONGS_TO]->(p)
    RETURN count(newRelationship) AS chunkFileRelationshipsCount, count(r) AS filePatientRelationshipsCount
    """

    kg.query(cypher)


# create_patientFile_chunk_and_patient_patientFile_relationship()

In [119]:
# kg.refresh_schema()
# print(kg.schema)

# Create Vector Index and Populate it

In [120]:
def create_chunk_vector_node_indexes(kg):
    """_summary_"""

    kg.query(
        """
         CREATE VECTOR INDEX `form_chunks` IF NOT EXISTS
          FOR (c:Chunk) ON (c.textEmbedding) 
          OPTIONS { indexConfig: {
            `vector.dimensions`: 1536,
            `vector.similarity_function`: 'cosine'    
         }}
    """
    )


# create_chunk_vector_node_indexes()

In [121]:
def populate_chunk_vectorIndex_with_embeddings(kg):
    """_summary_"""

    kg.query(
        """
        MATCH (chunk:Chunk) WHERE chunk.textEmbedding IS NULL
        WITH chunk, genai.vector.encode(
          chunk.text, 
          "OPENAI", 
          {
            token: $openAiApiKey, 
            endpoint: $openAiEndpoint
          }) AS vector
        CALL db.create.setNodeVectorProperty(chunk, "textEmbedding", vector)
        """,
        params={"openAiApiKey": OPENAI_API_KEY,
                "openAiEndpoint": OPENAI_ENDPOINT},
    )


# kg.query("SHOW INDEXES")
# populate_chunk_vectorIndex_with_embeddings()

In [122]:
def run_full_pipeline(files: list, names: list):
    """_summary_"""

    for document, name in zip(files, names):

        text, form_id, patient_id = load_document_return_text(document)

        chunks_with_metadata = chunk_text_with_metadata(
            text, form_id, patient_id)

        kg = instantiate_node_creation()

        create_chunk_nodes(chunks_with_metadata, kg)

        create_patientFile_node(patient_id, form_id, kg)

        create_patient_node(patient_id, name, kg)

        create_chunk_relationships(form_id, kg)

        create_patientFile_chunk_and_patient_patientFile_relationship(kg)

        kg.refresh_schema()
        print(kg.schema)

        create_chunk_vector_node_indexes(kg)

        populate_chunk_vectorIndex_with_embeddings(kg)

        print("done embedding")


# files = [
#     "./patient6_105099.txt",
#     "./patient13_104016.txt",
#     "./patient9_103062.txt",
#     "./patient1_105700.txt",
# ]

# names = ["Jean Lemoine", "Pauline Dubois", "Eric Laval", "Jean Dupont"]

# run_full_pipeline(files, names)

Form ID: 7cacf0bb8c824100
Patient ID: 105099
Creating `:Chunk` node for chunk ID chunk_7cacf0bb8c824100_105099_0
Creating `:Chunk` node for chunk ID chunk_7cacf0bb8c824100_105099_1
Creating `:Chunk` node for chunk ID chunk_7cacf0bb8c824100_105099_2
Creating `:Chunk` node for chunk ID chunk_7cacf0bb8c824100_105099_3
Created 4 nodes
Node properties are the following:
Patient {name: STRING, patientId: STRING, lastUpdated: INTEGER},Chunk {text: STRING, chunkName: STRING, formId: STRING, patientId: STRING, chunkIndex: INTEGER},PatientFile {patientId: STRING, fileId: STRING, lastUpdated: INTEGER}
Relationship properties are the following:

The relationships are the following:
(:Chunk)-[:NEXT]->(:Chunk),(:Chunk)-[:PART_OF]->(:PatientFile),(:PatientFile)-[:BELONGS_TO]->(:Patient)
done embedding
Form ID: 471255ca6d2d45c3
Patient ID: 104016
Creating `:Chunk` node for chunk ID chunk_471255ca6d2d45c3_104016_0
Creating `:Chunk` node for chunk ID chunk_471255ca6d2d45c3_104016_1
Creating `:Chunk` nod

## Perform Similarity Search to find similar chunks

In [143]:
def neo4j_vector_search(question):
    """Search for similar nodes using the Neo4j vector index"""
    vector_search_query = """
    WITH genai.vector.encode(
      $question, 
      "OpenAI", 
      {
        token: $openAiApiKey,
        endpoint: $openAiEndpoint
      }) AS question_embedding
    CALL db.index.vector.queryNodes($index_name, $top_k, question_embedding) yield node, score
    RETURN score, node.text AS text
  """
    kg = Neo4jGraph(
        url=NEO4J_URI,
        username=NEO4J_USERNAME,
        password=NEO4J_PASSWORD,
        database=NEO4J_DATABASE,
    )

    similar = kg.query(
        vector_search_query,
        params={
            "question": question,
            "openAiApiKey": OPENAI_API_KEY,
            "openAiEndpoint": OPENAI_ENDPOINT,
            "index_name": VECTOR_INDEX_NAME,
            "top_k": 2,
        },
    )
    return similar

In [144]:
search_results = neo4j_vector_search(
    "In a single sentence, tell me about Jean Dupont and his age.")

In [145]:
print(textwrap.fill(search_results[1].get("text"), 100))

Dear Dr. Lefevre, I hope you are well. I'm taking the liberty of writing to you about one of my
patients, Jean Lemoine, who has been presenting a rather complex situation lately, and I really
think that your expertise could be of great help to us.


### Set up a LangChain RAG workflow to chat with the form

- Set up a RetrievalQAWithSourcesChain to carry out question answering
- You can check out the LangChain documentation for this chain [here](https://api.python.langchain.com/en/latest/chains/langchain.chains.qa_with_sources.retrieval.RetrievalQAWithSourcesChain.html)

In [169]:
neo4j_vector_store = Neo4jVector.from_existing_graph(
    embedding=OpenAIEmbeddings(),
    url=NEO4J_URI,
    username=NEO4J_USERNAME,
    password=NEO4J_PASSWORD,
    index_name=VECTOR_INDEX_NAME,
    node_label=VECTOR_NODE_LABEL,
    text_node_properties=[VECTOR_SOURCE_PROPERTY],
    embedding_node_property=VECTOR_EMBEDDING_PROPERTY,
)

retriever = neo4j_vector_store.as_retriever()

chain = RetrievalQAWithSourcesChain.from_chain_type(
    ChatOpenAI(temperature=0), chain_type="stuff", retriever=retriever
)


def prettychain(question: str) -> str:
    """Pretty print the chain's response to a question"""
    response = chain(
        {"question": question},
        return_only_outputs=True,
    )
    print(textwrap.fill(response["answer"], 60))


prettychain("In a single sentence, tell me about Jean Dupont and his age.")

Jean Dupont is 54 years old.


In [170]:
question = "In three sentences tell me what is the diagnosis of Jean Dupont?"
prettychain(question)

The diagnosis of Jean Dupont is a thorough cardiac
assessment due to atypical chest pain and shortness of
breath on moderate exertion, with a medical history of
controlled hypertension, hypercholesterolemia, and a family
history of coronary heart disease.


In [174]:
prettychain("In three sentences tell me what is the diagnosis of Eric Laval?")

Eric Laval, 45, presents with frequent headaches, episodes
of vertigo, and intermittent numbness in his hands and feet.
He has a history of a major depressive episode five years
ago, successfully treated with an SSRI for a year. Further
tests like a brain MRI or electrophysiological tests are
being considered to get a clearer picture of his condition.


In [172]:
question = "tell me about all patients in the database?"
prettychain(question)

There are four patients mentioned in the database: Jean
Lemoine, Eric Laval, and Mr. Jean Dupont.


In [156]:
question = (
    """tell me about Jeans? If you are unsure about the answer, say you don't know"""
)
prettychain(question)

I don't know.


In [171]:
retrieval_query_window = """
MATCH window=
    (:Chunk)-[:NEXT*0..1]->(node)-[:NEXT*0..1]->(:Chunk)
WITH node, score, window as longestWindow 
  ORDER BY length(window) DESC LIMIT 1
WITH nodes(longestWindow) as chunkList, node, score
  UNWIND chunkList as chunkRows
WITH collect(chunkRows.text) as textList, node, score
RETURN apoc.text.join(textList, " \n ") as text,
    score,
    node {.source} AS metadata
"""

vector_store_window = Neo4jVector.from_existing_index(
    embedding=OpenAIEmbeddings(),
    url=NEO4J_URI,
    username=NEO4J_USERNAME,
    password=NEO4J_PASSWORD,
    database="neo4j",
    index_name=VECTOR_INDEX_NAME,
    text_node_property=VECTOR_SOURCE_PROPERTY,
    retrieval_query=retrieval_query_window,  # NEW!!!
)

# Create a retriever from the vector store
retriever_window = vector_store_window.as_retriever()

# Create a chatbot Question & Answer chain from the retriever
chain_window = RetrievalQAWithSourcesChain.from_chain_type(
    ChatOpenAI(temperature=0), chain_type="stuff", retriever=retriever_window
)


def prettychain_extra(question: str) -> str:
    """Pretty print the chain's response to a question"""
    response = chain_window(
        {"question": question},
        return_only_outputs=True,
    )
    print(textwrap.fill(response["answer"], 60))


prettychain_extra("In three sentences tell me what is the diagnosis of Jean Dupont?")

The diagnosis of Jean Dupont is unclear, but he is
presenting with persistent abdominal pain, unexplained
weight loss, mild anemia, and increased inflammatory
markers. Further imaging and consultation with a specialist
are needed to determine the cause of his symptoms.


In [173]:
prettychain_extra(
    "In three sentences tell me what is the diagnosis of Eric Laval?")

Eric Laval, 45, presents with frequent headaches, episodes
of vertigo, and intermittent numbness in his hands and feet.
He has a history of major depressive episode, appendectomy,
and shingles. Clinical examination revealed no abnormal
findings apart from the symptoms described.


In [175]:
question = ("""tell me about Jeans? If you are unsure about the answer, say you don't know""")
prettychain_extra(question)

I don't know.


In [166]:
question = "tell me the name of all patients in the database?"
prettychain_extra(question)

I don't know.


In [111]:
CYPHER_GENERATION_TEMPLATE = """Task:Generate Cypher statement to 
query a graph database.
Instructions:
Use only the provided relationship types and properties in the 
schema. Do not use any other relationship types or properties that 
are not provided.
Schema:
{schema}
Note: Do not include any explanations or apologies in your responses.
Do not respond to any questions that might ask anything else than 
for you to construct a Cypher statement.
Do not include any text except the generated Cypher statement.
Examples: Here are a few examples of generated Cypher 
statements for particular questions:
# How many patients do we have and what are their names?
MATCH (p:Patient)
RETURN p.name, count(p) AS patientCount
# How many patient files belong to each patient?
MATCH (f:PatientFile)-[:BELONGS_TO]->(p:Patient)
    WHERE f.fieldId = "8e140f0a28c947cf"
RETURN f.fieldId, p.name, count(f) AS patientFileCount
# What is the name of the patient with patientId 105700?
MATCH (p:Patient {patientId: 105700})
RETURN p.name
# Generate Cypher statement to extract diagnosis information of a patient with patientId 105700.
MATCH (p:Patient {patientId: '105700'})<-[:BELONGS_TO]-(f:PatientFile)<-[:PART_OF]-(c:Chunk)
WHERE c.diagnosis IS NOT NULL
RETURN p.name AS patientName, f.fileId AS fileName, c.text AS diagnosisText
The question is:
{question}"""

CYPHER_GENERATION_PROMPT = PromptTemplate(
    input_variables=["schema", "question"], template=CYPHER_GENERATION_TEMPLATE
)

cypherChain = GraphCypherQAChain.from_llm(
    ChatOpenAI(temperature=0),
    graph=kg,
    verbose=True,
    cypher_prompt=CYPHER_GENERATION_PROMPT,
)


def prettyCypherChain(question: str) -> str:
    response = cypherChain.run(question)
    print(textwrap.fill(response, 60))

In [81]:
prettyCypherChain("How many patients do we have and what are their names?")



[1m> Entering new GraphCypherQAChain chain...[0m
Generated Cypher:
[32;1m[1;3mMATCH (pf:PatientFile)-[:BELONGS_TO]->(p:Patient)
RETURN p.name, count(p) AS patientCount[0m
Full Context:
[32;1m[1;3m[{'p.name': 'Jean Dupont', 'patientCount': 1}][0m

[1m> Finished chain.[0m
We have 1 patient named Jean Dupont.


In [113]:
prettyCypherChain("How many patient files belong to each patient?")



[1m> Entering new GraphCypherQAChain chain...[0m


ValueError: Missing some input keys: {'patientId'}

In [112]:
prettyCypherChain(
    "what is the diagnosis of the patient with patientId 105700?")



[1m> Entering new GraphCypherQAChain chain...[0m


ValueError: Missing some input keys: {'patientId'}

In [104]:
prettyCypherChain(
    "What is the diagnosis of the patient with patientId 105700?", "105700"
)



[1m> Entering new GraphCypherQAChain chain...[0m


ValueError: Missing some input keys: {'patientId'}