# Knowledge graphs GenAI

<img src=visualisation.png>

In [None]:
from dotenv import load_dotenv
import os

# Common data processing
import json
import textwrap

# Langchain
from langchain_community.graphs import Neo4jGraph
from langchain_community.vectorstores import Neo4jVector
from langchain_community.document_loaders import PyPDFLoader
from langchain_openai import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains import RetrievalQAWithSourcesChain
from langchain_openai import ChatOpenAI


# Warning control
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Load the local .env file; its values override variables already present in the environment.
load_dotenv('.env', override=True)

# Neo4j connection settings.
NEO4J_URI, NEO4J_USERNAME, NEO4J_PASSWORD = (
    os.environ.get(name)
    for name in ('NEO4J_URI', 'NEO4J_USERNAME', 'NEO4J_PASSWORD')
)
# Fall back to the 'neo4j' database when the variable is unset or empty.
NEO4J_DATABASE = os.environ.get('NEO4J_DATABASE') or 'neo4j'

# Azure OpenAI credentials.
OPENAI_API_KEY = os.environ.get('AZURE_OPENAI_API_KEY')
OPENAI_ENDPOINT = os.environ.get('AZURE_OPENAI_ENDPOINT')

In [None]:
# Graph client used by every `kg.query(...)` call in this notebook.
kg = Neo4jGraph(
    url=NEO4J_URI,
    username=NEO4J_USERNAME,
    password=NEO4J_PASSWORD,
    database=NEO4J_DATABASE,
)

In [None]:
from langchain_openai import AzureChatOpenAI, AzureOpenAIEmbeddings

def get_llm():
    """Return an Azure OpenAI chat model bound to the "gpt-4o-mini" deployment.

    The .env file is reloaded (overriding existing environment variables)
    before the client is built.
    """
    load_dotenv('.env', override=True)
    chat_settings = {
        "azure_deployment": "gpt-4o-mini",
        "api_version": "2024-08-01-preview",
    }
    return AzureChatOpenAI(**chat_settings)

def get_embedding():
    """Return an Azure OpenAI embeddings client for "text-embedding-ada-002".

    The .env file is reloaded (overriding existing environment variables)
    before the client is built.
    """
    load_dotenv('.env', override=True)
    embedding_settings = {
        "azure_deployment": "text-embedding-ada-002",
        "api_version": "2023-05-15",
    }
    return AzureOpenAIEmbeddings(**embedding_settings)

# Shared model instances: embeddings first, then the chat model.
embedding_model, llm_model = get_embedding(), get_llm()

In [None]:
def pdf_loader(file_path: str):
    """Return the text of every page of the PDF at ``file_path`` as one string.

    Pages are concatenated in the order the loader returns them, with no
    separator inserted between pages.
    """
    pages = PyPDFLoader(file_path).load()
    return "".join(page.page_content for page in pages)

In [None]:
# Splitter used to cut each document into chunks of at most 2000 characters
# (measured with `len`), with 200 characters of overlap between neighbours.
splitter_settings = dict(
    chunk_size=2000,
    chunk_overlap=200,
    length_function=len,
    is_separator_regex=False,
)
text_splitter = RecursiveCharacterTextSplitter(**splitter_settings)

## Extract metadata for Chunks

In [None]:
import requests
import xmltodict
from time import sleep

# BOE open-data endpoint for one "sumario" (daily summary); the trailing
# 20240101 is presumably the publication date as YYYYMMDD -- TODO confirm.
url = "https://www.boe.es/datosabiertos/api/boe/sumario/20240101"


def remove_duplicated_from_list_dict(dict_list):
    """Return the entries of ``dict_list`` without repeats, in first-seen order.

    Entries are compared by equality rather than by hash, so unhashable
    values such as dicts are supported.
    """
    deduplicated = []
    for candidate in dict_list:
        if candidate in deduplicated:
            continue
        deduplicated.append(candidate)
    return deduplicated


def get_items(departament):
    """Yield every item of a parsed department with its heading names.

    A lone child element is parsed as a mapping while repeated children are
    parsed as a list, so both ``departament["epigrafe"]`` and each
    epigraph's ``"item"`` are normalised to lists before iterating.
    ``isinstance`` is used instead of an exact ``type(...) ==`` comparison so
    that dict/list subclasses (e.g. ``OrderedDict``) are recognised too.

    Args:
        departament: Mapping with an ``"@nombre"`` key and an ``"epigrafe"``
            key holding one epigraph mapping or a list of them.

    Yields:
        Tuples ``(item, epigraph_name, department_name)``.

    Raises:
        KeyError: If one of the expected keys is missing.
    """
    department_name = departament["@nombre"]

    epigraphs = departament["epigrafe"]
    if isinstance(epigraphs, dict):
        epigraphs = [epigraphs]

    for epigraph in epigraphs:
        items = epigraph["item"]
        if not isinstance(items, list):
            items = [items]
        for item in items:
            yield item, epigraph["@nombre"], department_name


def _as_list(value):
    """Wrap a lone parsed XML element in a list so it can always be iterated."""
    return value if isinstance(value, list) else [value]


def get_metadata(url: str, pdf_dir: str = "C:/Users/2373225/projects/genai-1/data/pdfs"):
    """Download a summary and build the metadata records for every graph level.

    The XML summary at ``url`` is parsed and walked section by section,
    department by department and item by item. For each item the PDF named
    ``<pdf_dir>/<item identificador>.pdf`` is read with ``pdf_loader`` and
    split with ``text_splitter``; one record is produced per resulting chunk.

    Args:
        url: Address of the summary; requested with ``Accept: application/xml``.
        pdf_dir: Directory holding the already-downloaded item PDFs. The
            default is the location the notebook was originally written for.

    Returns:
        A 6-tuple of lists of dicts, each with duplicates removed:
        ``(chunks, items, epigraphs, departments, sections, summaries)``.

    Raises:
        requests.HTTPError: If the response status is not 200. (Previously
            this case fell through to ``return`` and failed with an
            ``UnboundLocalError``.)
    """
    response = requests.get(url, headers={"Accept": "application/xml"})
    if response.status_code != 200:
        raise requests.HTTPError(
            f"Request to {url} failed with status {response.status_code}",
            response=response,
        )

    diario = xmltodict.parse(response.content)["response"]["data"]["sumario"]["diario"]
    sumario = diario["sumario_diario"]
    sumario_id = sumario["identificador"]

    chunks_with_metadata = []
    items_with_metadata = []
    epigraf_with_metadata = []
    department_with_metadata = []
    seccion_with_metadata = []
    sumario_with_metadata = [{"sumario_id": sumario_id, "source": sumario["url_pdf"]["#text"]}]

    # A lone <seccion> or <departamento> is parsed as a mapping, not a list;
    # iterating it directly would loop over its keys, so normalise first.
    for seccion in _as_list(diario["seccion"]):
        seccion_id = seccion["@nombre"]
        seccion_with_metadata.append({'seccion_id': seccion_id, "sumario_id": sumario_id})

        for departament in _as_list(seccion["departamento"]):
            department_with_metadata.append(
                {'department_id': departament["@nombre"], 'seccion_id': seccion_id, "sumario_id": sumario_id}
            )

            for item, epigraf, department in get_items(departament):
                item_id = item["identificador"]
                source = item["url_pdf"]["#text"]

                epigraf_with_metadata.append(
                    {
                        'epigraf_id': epigraf,
                        'department_id': department,
                        'seccion_id': seccion_id,
                        "sumario_id": sumario_id,
                    }
                )
                items_with_metadata.append(
                    {
                        'title': item["titulo"],
                        'item_id': item_id,
                        'epigraf_id': epigraf,
                        'department_id': department,
                        'seccion_id': seccion_id,
                        'sumario_id': sumario_id,
                        'control': item["control"],
                        'source': source,
                    }
                )

                item_text = pdf_loader(file_path=f"{pdf_dir}/{item_id}.pdf")
                for chunk_seq_id, chunk in enumerate(text_splitter.split_text(item_text)):
                    chunks_with_metadata.append(
                        {
                            'text': chunk,
                            'chunk_id': f"{item_id}-chunk{chunk_seq_id:04d}",
                            'item_id': item_id,
                            'epigraf_id': epigraf,
                            'department_id': department,
                            'seccion_id': seccion_id,
                            'sumario_id': sumario_id,
                            'chunkSeqId': chunk_seq_id,
                            'source': source,
                        }
                    )

    return (remove_duplicated_from_list_dict(chunks_with_metadata),
            remove_duplicated_from_list_dict(items_with_metadata),
            remove_duplicated_from_list_dict(epigraf_with_metadata),
            remove_duplicated_from_list_dict(department_with_metadata),
            remove_duplicated_from_list_dict(seccion_with_metadata),
            remove_duplicated_from_list_dict(sumario_with_metadata))


# Build every metadata list for the configured summary URL.
(
    chunks_metadata,
    items_metadata,
    epigraf_metadata,
    department_metadata,
    seccion_metadata,
    sumario_metadata,
) = get_metadata(url=url)

## Create all nodes

#### 1. Chunk nodes

In [None]:
# Cypher template that creates one `:Chunk` node, keyed by `chunk_id`.
# Properties are written only ON CREATE, so re-running never overwrites an
# existing node. Nodes created before `department_id` was corrected (it used
# to be filled from `epigraf_id`) keep the old value until they are updated
# or deleted and re-created.
merge_chunk_node_query = """
MERGE(mergedChunk:Chunk {chunk_id: $chunkParam.chunk_id})
    ON CREATE SET 
        mergedChunk.text = $chunkParam.text,
        mergedChunk.item_id = $chunkParam.item_id,
        mergedChunk.epigraf_id = $chunkParam.epigraf_id,
        mergedChunk.department_id = $chunkParam.department_id,
        mergedChunk.seccion_id = $chunkParam.seccion_id,
        mergedChunk.sumario_id = $chunkParam.sumario_id,
        mergedChunk.source = $chunkParam.source, 
        mergedChunk.chunkSeqId = $chunkParam.chunkSeqId
        
RETURN mergedChunk
"""

# Run the query once for the first chunk (presumably a quick check of the template).
kg.query(merge_chunk_node_query, params={'chunkParam': chunks_metadata[0]})

In [None]:
# Uniqueness constraint on `:Chunk(chunk_id)`; a no-op when it already exists.
unique_chunk_constraint_query = """
CREATE CONSTRAINT unique_chunk IF NOT EXISTS 
    FOR (c:Chunk) REQUIRE c.chunk_id IS UNIQUE
"""
kg.query(unique_chunk_constraint_query)

In [None]:
# Send one MERGE query per chunk record and count how many were processed.
node_count = 0
for chunk in chunks_metadata:
    chunk_id = chunk['chunk_id']
    print(f"Creating `:Chunk` node for chunk ID {chunk_id}")
    kg.query(merge_chunk_node_query, params=dict(chunkParam=chunk))
    node_count = node_count + 1
print(f"Created {node_count} nodes")

#### 2. Item nodes

In [None]:
# Uniqueness constraint on `:Item(item_id)`; a no-op when it already exists.
unique_item_constraint_query = """
CREATE CONSTRAINT unique_item IF NOT EXISTS 
    FOR (i:Item) REQUIRE i.item_id IS UNIQUE
"""
kg.query(unique_item_constraint_query)

In [None]:
# Cypher template that creates one `:Item` node, keyed by `item_id`;
# properties are written only when the node is first created.
merge_item_node_query = """
MERGE(mergedItem:Item {item_id:$ItemParam.item_id})
    ON CREATE SET 
        mergedItem.title = $ItemParam.title,
        mergedItem.item_id = $ItemParam.item_id,
        mergedItem.epigraf_id = $ItemParam.epigraf_id,
        mergedItem.department_id = $ItemParam.department_id,
        mergedItem.seccion_id = $ItemParam.seccion_id,
        mergedItem.sumario_id = $ItemParam.sumario_id,
        mergedItem.control = $ItemParam.control,
        mergedItem.source = $ItemParam.source
RETURN mergedItem
"""

# Send one MERGE query per item record and count how many were processed.
node_count = 0
for item in items_metadata:
    item_id = item['item_id']
    print(f"Creating `:Item` node for item_id {item_id}")
    kg.query(merge_item_node_query, params=dict(ItemParam=item))
    node_count = node_count + 1
print(f"Created {node_count} nodes")

## Create Vector Index

In [None]:
# Vector index `item_chunks` over `:Chunk(textEmbedding)`:
# 1536 dimensions, cosine similarity. A no-op when the index already exists.
chunk_vector_index_query = """
         CREATE VECTOR INDEX `item_chunks` IF NOT EXISTS
          FOR (c:Chunk) ON (c.textEmbedding) 
          OPTIONS { indexConfig: {
            `vector.dimensions`: 1536,
            `vector.similarity_function`: 'cosine'    
         }}
"""
kg.query(chunk_vector_index_query)

In [None]:
# Vector index `item_titles` over `:Item(titleEmbedding)`:
# 1536 dimensions, cosine similarity. A no-op when the index already exists.
item_vector_index_query = """
         CREATE VECTOR INDEX `item_titles` IF NOT EXISTS
          FOR (i:Item) ON (i.titleEmbedding) 
          OPTIONS { indexConfig: {
            `vector.dimensions`: 1536,
            `vector.similarity_function`: 'cosine'    
         }}
"""
kg.query(item_vector_index_query)

## Embed text of chunk nodes and title of item nodes

In [None]:
from time import sleep

# Total number of `:Chunk` nodes in the graph (kept for inspection; not read below).
total_nodes_to_embed = kg.query("""
    MATCH (chunk:Chunk)
    RETURN count(chunk) AS TotalChunksToEmbed
""")


def pending_nodes_to_emded():
    """Return how many `:Chunk` nodes still have no `textEmbedding` property."""
    not_embedded = kg.query(
      """
      MATCH (chunk:Chunk) WHERE chunk.textEmbedding IS NULL 
      RETURN count(chunk) AS NotEmbedChunks
      """
    )
    return not_embedded[0]["NotEmbedChunks"]


# Embed the chunk text in batches of 50 until no chunk is left without a vector.
pending_nodes = pending_nodes_to_emded()

while pending_nodes != 0:
    try:
        print("Embedding nodes...")
        kg.query("""
        MATCH (chunk:Chunk WHERE chunk.textEmbedding IS NULL)
        WITH chunk
        LIMIT $embedding_batch_size
        WITH chunk, genai.vector.encode(
          chunk.text, 
          "AzureOpenAI", 
          {
            token: $openAiApiKey,
            resource: "knowledge-graphs",
            deployment: "text-embedding-ada-002"
          }) AS vector
        CALL db.create.setNodeVectorProperty(chunk, "textEmbedding", vector)
        """,
            params={"openAiApiKey": OPENAI_API_KEY, "embedding_batch_size": 50}
        )

        pending_nodes = pending_nodes_to_emded()
        print(f"Number of pending nodes to embed: {pending_nodes}")
    except Exception as e:
        # Catch `Exception` rather than using a bare `except:` so that an
        # interrupt (KeyboardInterrupt) can still stop the loop, and report
        # the real error instead of assuming it was a rate limit.
        print(f"Embedding batch failed, retrying in 60 seconds: {e}")
        sleep(60)
    else:
        # Pause between batches (the original failure message points to a
        # per-minute rate limit); skip the wait once nothing is pending.
        if pending_nodes != 0:
            sleep(60)

print("All nodes have been embeded!")

In [None]:
# Total number of `:Item` nodes in the graph.
total_nodes_to_embed = kg.query("""
    MATCH (item:Item)
    RETURN count(item) AS TotalItemsToEmbed
""")


def pending_nodes_to_emded():
    """Return how many `:Item` nodes still have no `titleEmbedding` property."""
    rows = kg.query(
      """
      MATCH (item:Item) WHERE item.titleEmbedding IS NULL 
      RETURN count(item) AS NotEmbedItems                          
      """
    )
    first_row = rows[0]
    return first_row["NotEmbedItems"]


# Embed the item titles in batches of 50 until no item is left without a vector.
pending_nodes = pending_nodes_to_emded()

while pending_nodes:
    try:
        print("Embedding nodes...")
        kg.query("""
        MATCH (item:Item WHERE item.titleEmbedding IS NULL)
        WITH item
        LIMIT $embedding_batch_size
        WITH item, genai.vector.encode(
          item.title, 
          "AzureOpenAI", 
          {
            token: $openAiApiKey,
            resource: "knowledge-graphs",
            deployment: "text-embedding-ada-002"
          }) AS vector
        CALL db.create.setNodeVectorProperty(item, "titleEmbedding", vector)
        """,
            params=dict(openAiApiKey=OPENAI_API_KEY, embedding_batch_size=50),
        )

        pending_nodes = pending_nodes_to_emded()
        print(f"Number of pending nodes to embed: {pending_nodes}")
        # Wait before the next batch only when there is one.
        if pending_nodes:
            sleep(60)
    except Exception as e:
        # Report the failure, wait, and try the same batch again.
        print(e)
        sleep(60)

print("All nodes have been embeded!")

## Use similarity search to find relevant chunks

In [None]:
def neo4j_vector_search(question, index_name="item_titles", top_k=10):
    """Search for similar nodes using a Neo4j vector index.

    The question is embedded inside the database with
    ``genai.vector.encode`` and the resulting vector is used to query the
    given vector index.

    Args:
        question: Free-text question to embed and search for.
        index_name: Name of the vector index to query. The returned columns
            (``title`` and ``item_id``) are read from the matched nodes, so
            the default item-title index is the natural target.
        top_k: Number of nearest neighbours to request from the index.

    Returns:
        The rows returned by ``kg.query``, each with the keys ``score``,
        ``title`` and ``boe_id``.
    """
    vector_search_query = """
    WITH genai.vector.encode(
      $question, 
      "AzureOpenAI", 
      {
        token: $openAiApiKey,
        resource: "knowledge-graphs",
        deployment: "text-embedding-ada-002"
      }) AS question_embedding
    CALL db.index.vector.queryNodes($index_name, $top_k, question_embedding) yield node, score
    RETURN score, node.title AS title, node.item_id AS boe_id
  """
    return kg.query(
        vector_search_query,
        params={
            'question': question,
            'openAiApiKey': OPENAI_API_KEY,
            'index_name': index_name,
            'top_k': top_k,
        },
    )

In [None]:
# Example lookup: run the vector search defined above for a sample question.
search_results = neo4j_vector_search('Quiero información sobre la Resolución 1A0/38511/2023')

# RAG without connections. It's the same as a standard RAG pipeline

In [None]:
# Vector store backed by the existing `item_chunks` index: it searches
# `:Chunk` nodes by `textEmbedding` and returns their `text` property.
neo4j_vector_store = Neo4jVector.from_existing_index(
    index_name="item_chunks",
    node_label="Chunk",
    text_node_property="text",
    embedding_node_property="textEmbedding",
    embedding=embedding_model,
    url=NEO4J_URI,
    username=NEO4J_USERNAME,
    password=NEO4J_PASSWORD,
)

# Retriever returning the 5 most similar chunks.
retriever = neo4j_vector_store.as_retriever(search_kwargs=dict(k=5))

# Question-answering chain that "stuffs" the retrieved chunks into one prompt.
chain = RetrievalQAWithSourcesChain.from_chain_type(
    llm_model,
    retriever=retriever,
    chain_type="stuff",
)

# Create connections between items and chunks

### 1. NEXT connection

In [None]:
# For one item: collect its chunks ordered by `chunkSeqId` and chain them
# with NEXT relationships (`avoidDuplicates` keeps re-runs from adding copies).
cypher = """
    MATCH (from_same_item:Chunk)
      WHERE from_same_item.item_id = $ItemIdParam
    WITH from_same_item
      ORDER BY from_same_item.chunkSeqId ASC
    WITH collect(from_same_item) as item_chunk_list
      CALL apoc.nodes.link(
        item_chunk_list,
        "NEXT",
        {avoidDuplicates: true}
      )
    RETURN size(item_chunk_list)
  """

# Iterate over every item instead of a hard-coded count of 86, which raised
# IndexError with fewer items and silently skipped any beyond the 86th.
for item in items_metadata:
    kg.query(cypher, params={'ItemIdParam': item['item_id']})

    print(f"NEXT connections from chunks of item_id {item['item_id']} created.")

### 2. PART_OF connection

In [None]:
# Connect every chunk to the item it belongs to: a chunk and an item are
# paired when their `item_id` properties are equal. MERGE creates the
# PART_OF relationship only when it does not exist yet.
cypher = """
  MATCH (c:Chunk), (i:Item)
    WHERE c.item_id = i.item_id
  MERGE (c)-[newRelationship:PART_OF]->(i)
  RETURN count(newRelationship)
"""

kg.query(cypher)

### 3. SECTION connection

In [None]:
# Connect every item to its first chunk (the one with `chunkSeqId = 0`)
# through a SECTION relationship, giving each item an entry point into its
# chunk chain. MERGE creates the relationship only when it does not exist yet.
cypher = """
  MATCH (first:Chunk), (i:Item)
  WHERE first.item_id = i.item_id
    AND first.chunkSeqId = 0
  WITH first, i
    MERGE (i)-[r:SECTION]->(first)
  RETURN count(r)
"""

kg.query(cypher)

## RAG with connections and window retriever

In [None]:
# Retrieval query passed to the vector store as `retrieval_query`; `node` and
# `score` are not defined here and are presumably supplied by the preceding
# vector lookup -- TODO confirm against the Neo4jVector documentation.
# It extends the matched chunk by up to two NEXT hops on each side, keeps the
# longest such path, and joins the text of its chunks into one string.
# NOTE(review): `ORDER BY ... LIMIT 1` applies to all rows together, so with
# more than one matched node only a single window would be returned.
retrieval_query_window = """
MATCH window=
    (:Chunk)-[:NEXT*0..2]->(node)-[:NEXT*0..2]->(:Chunk)
WITH node, score, window as longestWindow 
  ORDER BY length(window) DESC LIMIT 1
WITH nodes(longestWindow) as chunkList, node, score
  UNWIND chunkList as chunkRows
WITH collect(chunkRows.text) as textList, node, score
RETURN apoc.text.join(textList, " \n ") as text,
    score,
    node {.source} AS metadata
"""

In [None]:
# Vector store over the `item_chunks` index that post-processes each hit
# with the window retrieval query instead of returning the bare chunk.
vector_store_window = Neo4jVector.from_existing_index(
    index_name="item_chunks",
    node_label="Chunk",
    text_node_property="text",
    retrieval_query=retrieval_query_window,
    embedding=embedding_model,
    url=NEO4J_URI,
    username=NEO4J_USERNAME,
    password=NEO4J_PASSWORD,
)

# Retriever returning only the single best match.
retriever_window = vector_store_window.as_retriever(search_kwargs=dict(k=1))

# Question-answering chain built on the window retriever.
chain_window = RetrievalQAWithSourcesChain.from_chain_type(
    llm_model,
    retriever=retriever_window,
    chain_type="stuff",
)

In [None]:
from rich.console import Console
from rich.markdown import Markdown

def rich_output(response):
    """Render ``response`` as Markdown on a newly created rich console."""
    Console().print(Markdown(response))

In [None]:
def print_comparison(question):
    """Ask both chains the same question and print the two answers in turn.

    Both chains are queried first; the standard RAG answer is then printed,
    followed by the graph RAG answer, each under its own heading.
    """
    standard_result = chain({"question": question}, return_only_outputs=True)
    graph_result = chain_window({"question": question}, return_only_outputs=True)

    separator = '-----------------------------------\n'
    labelled_results = (
        ('Response with standard RAG pipeline\n', standard_result),
        ('Response with GRAPH-RAG pipeline\n', graph_result),
    )
    for heading, result in labelled_results:
        rich_output(heading + separator + result["answer"])

## Example 1: Ask questions contained in the documents

In [None]:
# Question that names the resolution by its number only.
question = "Quiero información sobre la Resolución 1A0/38511/2023"

# Print the standard RAG answer and the graph RAG answer for comparison.
print_comparison(question)

In [574]:
# Same request, now also giving the date and the issuing body.
question = "Quiero información sobre la Resolución 1A0/38511/2023, de 15 de noviembre, del Centro Criptológico Nacional"

# Print the standard RAG answer and the graph RAG answer for comparison.
print_comparison(question)

In [577]:
# Question asking for the five points of the resolution to be listed and
# explained. This value of `question` is also the one used by the final cell.
question = "Indica y explica los 5 puntos sobre la Resolución 1A0/38511/2023, de 15 de noviembre, del Centro Criptológico Nacional"

# Print the standard RAG answer and the graph RAG answer for comparison.
print_comparison(question)

## Retrieve chunks creating a window from the item title and getting chunk context

In [581]:
# Retrieval query for searches over item titles; `node` and `score` are not
# defined here and are presumably supplied by the preceding vector lookup --
# TODO confirm against the Neo4jVector documentation.
# Starting from the matched node it optionally follows SECTION to a chunk and
# then optionally one NEXT hop, and joins the `text` of the nodes on the path.
# NOTE(review): `ORDER BY ... LIMIT 6` applies to all rows together, not per
# matched node -- confirm this is the intended cut-off.
retrieval_query_window_for_item_title = """
MATCH window=
    (node)-[:SECTION*0..1]->(:Chunk)-[:NEXT*0..1]->(:Chunk)
WITH node, score, window as longestWindow 
  ORDER BY length(window) DESC LIMIT 6
WITH nodes(longestWindow) as itemList, node, score
  UNWIND itemList as itemRows
WITH collect(itemRows.text) as textList, node, score
RETURN apoc.text.join(textList, " \n ") as text, score, node {.source} AS metadata
"""

In [586]:
# Vector store over the `item_titles` index that expands each matched item
# with the item-title window retrieval query.
vector_store_window_for_item_title = Neo4jVector.from_existing_index(
    embedding=embedding_model,
    url=NEO4J_URI,
    username=NEO4J_USERNAME,
    password=NEO4J_PASSWORD,
    # Use the configured database (defaults to 'neo4j') so this store reads
    # from the same database the graph client writes to.
    database=NEO4J_DATABASE,
    index_name="item_titles",
    text_node_property="title",
    retrieval_query=retrieval_query_window_for_item_title
)

# Create a retriever from the vector store
retriever_window_for_item_title = vector_store_window_for_item_title.as_retriever(search_kwargs={'k': 6})

# Create a chatbot Question & Answer chain from the retriever.
# The chain must use the item-title retriever built just above; it previously
# received `retriever_window`, which made it a duplicate of `chain_window`.
chain_window_for_item_title = RetrievalQAWithSourcesChain.from_chain_type(
    llm_model,
    chain_type="stuff",
    retriever=retriever_window_for_item_title
)

In [588]:
# Query the item-title chain built in this section (the cell previously
# called `chain_window`, leaving the new chain unused) with the last
# `question` defined above, and render its answer.
response_b = chain_window_for_item_title({"question": question}, return_only_outputs=True,)

rich_output(response_b["answer"])