In [None]:
# Cypher (run in the Neo4j Browser, not in Python): show the whole graph
MATCH (n)
OPTIONAL MATCH (n)-[r]-(m)
RETURN n, r, m

# Cypher (run in the Neo4j Browser, not in Python): delete the whole graph
MATCH (n)
DETACH DELETE n


In [None]:
%pip install --upgrade --quiet  langchain langchain-community langchain-ollama  langchain-experimental neo4j wikipedia tiktoken yfiles_jupyter_graphs python-dotenv json-repair langchain-openai langchain_core


In [1]:
from langchain_community.graphs import Neo4jGraph
from langchain_community.vectorstores import Neo4jVector
from neo4j import GraphDatabase
from langchain_community.vectorstores.neo4j_vector import remove_lucene_chars
from langchain_experimental.graph_transformers import LLMGraphTransformer
from langchain_ollama import OllamaEmbeddings,ChatOllama
from langchain_core.utils.function_calling import convert_to_openai_function
from langchain_core.prompts import ChatPromptTemplate, PromptTemplate
import os
import re
import numpy as np
import getpass
from dotenv import load_dotenv
from langchain_core.pydantic_v1 import BaseModel, Field
from typing import Tuple, List, Optional
from yfiles_jupyter_graphs import GraphWidget



For example, replace imports like: `from langchain_core.pydantic_v1 import BaseModel`
with: `from pydantic import BaseModel`
or the v1 compatibility namespace if you are working in a code base that has not been fully upgraded to pydantic 2 yet. 	from pydantic.v1 import BaseModel

  exec(code_obj, self.user_global_ns, self.user_ns)


In [2]:
# Load settings from a local .env file (if present) into the process environment.
load_dotenv()

# The Neo4j connection settings are read from the environment later on
# (showGraph() reads them via os.environ; Neo4jGraph() is constructed without
# arguments and is presumably configured from the same variables).
# Fail early with a readable message instead of the opaque
# "TypeError: str expected, not NoneType" that re-assigning a missing
# variable into os.environ would raise.
_required_env = ("NEO4J_URI", "NEO4J_USERNAME", "NEO4J_PASSWORD")
_missing_env = [name for name in _required_env if os.getenv(name) is None]
if _missing_env:
    raise EnvironmentError(
        "Missing required environment variable(s): "
        + ", ".join(_missing_env)
        + ". Define them in the .env file or in the shell before running this notebook."
    )

# Instantiate the graph database client.
graph = Neo4jGraph()

  graph = Neo4jGraph()


In [3]:
# Initialise the language models. Three roles are used in this notebook,
# each backed by an Ollama model name and a ChatOllama client.

# Extracts the entities from the user's question (used by entity_chain).
model_entity="qwen2:7b" 

# Translates graph relationship statements into natural language
# (used by convert_cypher_to_text).
model_translator="qwen2:7b" 

# Generates the final answer from the retrieved context (used by get_response).
model_generator="deepseek-r1:14b" 
# Alternative generator models that can be swapped in:
#model_generator="mistral-small:latest"
#model_generator="tulu3:8b"
#model_generator="qwen2.5:32b"


# The entity and translator clients request JSON-formatted output; the
# generator client returns free text. All three use temperature 0.
llm_entity = ChatOllama(model=model_entity, temperature=0, format="json")
llm_translator = ChatOllama(model=model_translator, temperature=0, format="json")
llm_generator = ChatOllama(model=model_generator, temperature=0)



In [4]:
### Plotting the Graph ###

# Default query used by showGraph(): every node, plus any relationship it has
# (in either direction) and the node on the other end. Note there is no LIMIT,
# so the whole graph is returned.
# Alternative: only relationships whose type is NOT "MENTIONS", capped at 50 rows.
#default_cypher = "MATCH (s)-[r:!MENTIONS]->(t) RETURN s,r,t LIMIT 50"
default_cypher = "MATCH (n) OPTIONAL MATCH (n)-[r]-(m) RETURN n, r, m"

# Alternative: the same NOT-"MENTIONS" filter written with a WHERE clause.
#default_cypher = "MATCH (s)-[r]->(t) WHERE NOT type(r) = 'MENTIONS' RETURN s,r,t LIMIT 50"

# GraphWidget: A visualisation tool for rendering and exploring graphs in Jupyter Notebooks.
# GraphDatabase: The official Neo4j driver for querying and updating graph data in a Neo4j database.
from yfiles_jupyter_graphs import GraphWidget
from neo4j import GraphDatabase


# Enable support for custom widgets when running in Google Colab.
# Outside Colab the import fails and the step is skipped. Only ImportError is
# caught here, so KeyboardInterrupt and unrelated failures are not silently hidden.
try:
    from google.colab import output
except ImportError:
    pass
else:
    try:
        output.enable_custom_widget_manager()
    except Exception as exc:
        # Best effort: the notebook still runs, but the widget may not render.
        print(f"Could not enable the Colab custom widget manager: {exc}")

# It runs a Cypher query on a Neo4j graph database to retrieve data (nodes and relationships).
# It visualises the graph data interactively using GraphWidget from the yfiles_jupyter_graphs library.
def showGraph(cypher: str = default_cypher):
    """
    Run a Cypher query against Neo4j and render the result as an interactive graph.

    The connection settings are read from the NEO4J_URI, NEO4J_USERNAME and
    NEO4J_PASSWORD environment variables. The driver and the session are
    opened as context managers so that the connection is released when the
    function returns, even if the query fails.

    Args:
        cypher: The Cypher query to execute. Defaults to `default_cypher`.

    Returns:
        The GraphWidget that was displayed. Because the widget is both
        displayed and returned, calling this function as the last expression
        of a cell renders it twice; end the call with `;` to avoid that.

    Raises:
        KeyError: If one of the NEO4J_* environment variables is not set.
    """
    with GraphDatabase.driver(
        uri=os.environ["NEO4J_URI"],
        auth=(os.environ["NEO4J_USERNAME"], os.environ["NEO4J_PASSWORD"]),
    ) as driver:
        with driver.session() as session:
            # Build the widget while the session is still open so the query
            # result is fully read before the connection is closed.
            widget = GraphWidget(graph=session.run(cypher).graph())

    # Label each node in the widget with its 'id' property.
    widget.node_label_mapping = 'id'
    display(widget)
    return widget


showGraph()


GraphWidget(layout=Layout(height='800px', width='100%'))

GraphWidget(layout=Layout(height='800px', width='100%'))

In [5]:
                                            ### Database Settings ###

# Embedding client backed by an Ollama-served embedding model.
embeddings = OllamaEmbeddings(
    # model="mxbai-embed-large",
    model="rjmalagon/gte-qwen2-1.5b-instruct-embed-f16:latest",
)

# Vector index over the graph that is already stored in Neo4j: it targets
# nodes labelled "Document", reads their "text" property, uses the
# "embedding" property for the vectors, and is queried in "hybrid" mode.
vector_index = Neo4jVector.from_existing_graph(
    embeddings,
    node_label="Document",
    text_node_properties=["text"],
    embedding_node_property="embedding",
    search_type="hybrid",
)

In [6]:
                                            ### Structured Data Retriever: Graph ###


### 1. Entity Extraction

# It creates (only if it does not exist yet) a full-text search index named "entity"
# over the `id` property of nodes labelled `__Entity__`.
# structured_retriever() queries this index by name to map extracted entity names to graph nodes.
graph.query("CREATE FULLTEXT INDEX entity IF NOT EXISTS FOR (e:__Entity__) ON EACH [e.id]")


# Pydantic model (BaseModel from langchain_core.pydantic_v1) describing the result of
# entity extraction: a single required list of entity names.
# NOTE(review): the class docstring and the Field description below are presumably
# forwarded to the LLM as part of the schema built by convert_to_openai_function(Entities),
# so treat them as prompt text rather than as plain documentation — confirm before editing.
class Entities(BaseModel):
    """Identifying information about entities."""

    # Required field (`...` means there is no default value).
    names: List[str] = Field(
        ...,
        description="All the person, organization, or business entities that "
        "appear in the text",
    )

# Chat prompt for entity extraction: a system message that sets the task
# (organisation and person entities) and a human message that carries the
# user's text through the {question} placeholder.
prompt = ChatPromptTemplate.from_messages([
    ("system", "You are extracting organization and person entities from the text."),
    (
        "human",
        "Use the given format to extract information from the following input: {question}",
    ),
])

# Entity-extraction pipeline:
#   1. the prompt is filled in with the question,
#   2. the entity LLM processes it,
#   3. the reply is returned in the structure described by the schema derived
#      from the Entities class (later cells read the result as a dict with a 'names' key).
dict_schema = convert_to_openai_function(Entities)
entity_chain = prompt | llm_entity.with_structured_output(dict_schema)


### 2. Graph Retrieval Helpers

# 2.1 It generates a full-text search query for a given input string
def generate_full_text_query(input: str) -> str:
    """
    Generate a fuzzy full-text search query for a given input string.

    The input is cleaned with `remove_lucene_chars`, split on whitespace, and
    every word gets the suffix `~2` (a similarity threshold of roughly two
    changed characters). The words are then combined with the AND operator.
    Useful for mapping entities from user questions to database values while
    tolerating some misspellings.

    Args:
        input: Free text, typically an entity name extracted from a question.

    Returns:
        The query string, e.g. "Deep~2 AND Seek~2". An empty string is
        returned when the input contains no searchable words (empty,
        whitespace only, or nothing left after cleaning), instead of
        raising IndexError.
    """
    words = remove_lucene_chars(input).split()
    return " AND ".join(f"{word}~2" for word in words)

# 2.2 It converts entity-relationship statements into human language.
# Example: "Deepseek - PART_OF -> Ai Space Race" becomes "Deepseek is part of Ai Space Race".
def convert_cypher_to_text(context: str) -> str:
    """
    Ask the translator LLM to rewrite relationship statements as sentences.

    Args:
        context: Statements in the form "Entity - RELATIONSHIP -> Target",
            one per line (this is how structured_retriever builds them).

    Returns:
        The object returned by `llm_translator.invoke`, not a plain string:
        the caller reads the text through its `.content` attribute.
        NOTE(review): the `-> str` annotation therefore does not match the
        actual return value; it is left unchanged here so existing callers
        keep working.
    """

    # Define the prompt template: a fixed system role plus one human message
    # that is filled in with the full instruction text built below.
    template = ChatPromptTemplate.from_messages([
        ("system", "You are an expert in converting cypher language statements into natural language sentences based on the given context."),
        ("human", "{user_input}")
    ])
    
    # Build the instruction text: guidelines, a worked example, and the
    # statements to translate. The text is runtime prompt content, so its
    # wording and whitespace are part of the behaviour.
    user_input = f""" Convert the given cypher language statements into human-readable language.
    Each statement follows the pattern:  "Entity - RELATIONSHIP -> Target"
    
    Follow these guidelines:
    
    1. Convert the entity and target into readable names without modification.
    2. Convert the relationship into a natural language phrase based on common meanings.
    3. Ensure the sentence is grammatically correct and clear.
    
    Example Input:
    Deepseek - PART_OF -> Ai Space Race
    Deepseek - HAS_NAME -> Shendu Qiusuo
    Deepseek - DEVELOPS -> Artificial Intelligence
    
    Example Output:
    Deepseek is part of Ai Space Race
    Deepseek is named Shendu Qiusuo
    Deepseek develops Artificial Intelligence.

    
    Now, translate the following cypher languange statements into human-readable language based on the following context:
    
    {context}
    
    Ouput:
    """
    
    # Generate messages and invoke the LLM. The instruction text is passed as
    # the value of the {user_input} placeholder.
    messages = template.format_messages(user_input=user_input)
    response = llm_translator.invoke(messages)
    return response

# 2.3 It pulls the quoted sentences that follow a ':' out of the translator's reply
def extract_converted_cypher(text: str) -> str:
    """
    Collect every double-quoted string that directly follows a ':' in `text`
    (optionally separated by whitespace), make sure each one ends with a
    period, and return them joined by line breaks.

    Returns an empty string when no such quoted string is found.
    """
    quoted_after_colon = re.compile(r':\s*"(.*?)"')
    lines = []
    for match in quoted_after_colon.finditer(text):
        sentence = match.group(1)
        if not sentence.endswith('.'):
            sentence += '.'
        lines.append(sentence)
    return "\n".join(lines)


# 2.4 It performs structured retrieval of information from a Neo4j graph database based on an input question.
# It combines entity extraction, full-text search, and graph traversal to return relationships involving extracted entities.
def structured_retriever(question: str) -> str:
    """
    Retrieve graph relationships for the entities mentioned in a question.

    Steps:
        1. Extract entity names from the question with `entity_chain`.
        2. For each entity, look up at most two matching nodes in the
           'entity' full-text index and collect their incoming and outgoing
           relationships (every type except MENTIONS) as
           "Source - TYPE -> Target" statements, capped at 100 per entity.
        3. Translate all collected statements into sentences with a single
           call to `convert_cypher_to_text`, then extract the sentences with
           `extract_converted_cypher`.

    Args:
        question: The user's question.

    Returns:
        The translated sentences, one per line. An empty string is returned
        when no entity is extracted or no relationship is found; in that
        case the translator LLM is not called at all.
    """
    entities = entity_chain.invoke({"question": question})
    entity_names = (entities or {}).get("names") or []
    if isinstance(entity_names, str):
        # A single name returned as a bare string would otherwise be
        # iterated character by character.
        entity_names = [entity_names]

    statements = []
    for entity in entity_names:
        full_text_query = generate_full_text_query(entity)
        if not full_text_query:
            # Nothing searchable in this entity name; skip the lookup.
            continue
        response = graph.query(
            """CALL db.index.fulltext.queryNodes('entity', $query, {limit:2})
            YIELD node,score
            CALL {
              WITH node
              MATCH (node)-[r:!MENTIONS]->(neighbor)
              RETURN node.id + ' - ' + type(r) + ' -> ' + neighbor.id AS output
              UNION ALL
              WITH node
              MATCH (node)<-[r:!MENTIONS]-(neighbor)
              RETURN neighbor.id + ' - ' + type(r) + ' -> ' +  node.id AS output
            }
            RETURN output LIMIT 100
            """,
            {"query": full_text_query},
        )
        # Rows whose output is null (e.g. a node without an id) carry no
        # usable statement and are dropped.
        statements.extend(el['output'] for el in response if el['output'] is not None)

    if not statements:
        return ""

    # Translate once, after all entities have been processed: only the
    # translation of the complete statement list is ever returned, so
    # translating inside the loop would just repeat LLM calls.
    context = "".join(f"{statement}\n" for statement in statements)
    translated = convert_cypher_to_text(context)
    return extract_converted_cypher(translated.content)


In [7]:
                                        ### Unstructured Data Retriever: Chunks-based ###

# It gets the chunks most similar to the user's input (the top 15 by default)
def unstructured_retriever(question: str, k: int = 15) -> List[str]:
    """
    Return the text of the chunks most similar to the question.

    Args:
        question: The user's question, used as the similarity-search query.
        k: How many results to request from the vector index (default 15).

    Returns:
        A list with the `page_content` of each result returned by
        `vector_index.similarity_search`, in the order returned.
    """
    return [doc.page_content for doc in vector_index.similarity_search(question, k=k)]

In [8]:
### Testing the Entity extraction
entities=entity_chain.invoke({"question": "The DeepSeek-R1 model provides responses comparable to other contemporary large language models, such as OpenAI's GPT-4o and o1."})
entity_names =entities['names']
entity_names 

['The DeepSeek-R1 model', "OpenAI's GPT-4"]

In [9]:
### Testing the structured retriever
print(structured_retriever("What is DeepSeek-R1?"))

  words = [el for el in remove_lucene_chars(input).split() if el]


Documenting includes What Works.
Documenting includes What Doesn’t Work.
Deepseek-R1 affects the Internet.
Deepseek-R1 impacts the Stock Market.
Deepseek-R1 performed as well or better than O1.
Deepseek-R1 is accompanied by a Tech Report.
Deepseek-R1 leaves an open question about Data Collection.
Deepseek-R1 leaves an open question about Model Training.
Deepseek-R1 contributes to Open-R1.
Deepseek-R1 is built on the foundation of Deepseek-V3.
Deepseek-R1 benefits the Community.
Deepseek-R1-Zero uses Group Relative Policy Optimization.
Deepseek-R1-Zero skips Supervised Fine-Tuning.
Deepseek-R1-Zero relies on Reinforcement Learning.
Reproduction of Open-R1 leads to Deepseek-R1.
The reconstruction of the Open-R1 Project results in Deepseek-R1.
Deepseek introduces Deepseek-R1.
The Reasoning Dataset originates from Deepseek-R1.
Deepseek releases Deepseek-R1.
Model weights are open for Deepseek-R1.
Datasets are not open for Deepseek-R1.
The code is not open for Deepseek-R1.
Deepseek introduc

In [10]:
### Testing the unstructured retriever
print(unstructured_retriever("What is DeepSeek-R1?"))



['\ntext: However, the DeepSeek-R1 release leaves open several questions about:\n•\tData collection: How were the reasoning-specific datasets curated?\n•\tModel training: No training code was released by DeepSeek, so it is unknown which hyperparameters work best and how they differ across different model families and scales.', '\ntext: Open-R1: a fully open reproduction of DeepSeek-R1\nWhat is DeepSeek-R1?\nIf you’ve ever struggled with a tough math problem, you know how useful it is to think a little longer and work through it carefully.', '\ntext: Besides performing as well or better than o1, the DeepSeek-R1 release was accompanied by a detailed tech report that outlined the key steps of their training recipe.', '\ntext: That’s where DeepSeek-R1 comes in.', '\ntext: In this blog post we take a look at key ingredients behind DeepSeek-R1, which parts we plan to replicate, and how to contribute to the Open-R1 project.', '\ntext: •\tScaling laws: What are the compute and data trade-offs 

In [11]:
                                                                ### Final Retriever ###

# It combines structured data retrieval (from a graph database) with unstructured data retrieval (from a vector-based search index).
# It creates a unified result containing one or both types of data for a given question.
def retriever(question: str, query_type:str):
    """
    Build the context text for a question from the selected retrieval mode.

    Args:
        question: The user's question.
        query_type: One of
            "graph"      - relationships from the graph (structured_retriever),
            "similarity" - similar text chunks (unstructured_retriever),
            "hybrid"     - both of the above.

    Returns:
        The context as a single string. The question and the context are
        also printed.

    Raises:
        ValueError: If `query_type` is not one of the supported values
            (previously this surfaced as an UnboundLocalError).
    """
    valid_query_types = ("graph", "similarity", "hybrid")
    if query_type not in valid_query_types:
        raise ValueError(
            f"Unknown query_type {query_type!r}; expected one of {valid_query_types}."
        )

    print(f"Search query: {question}")
    if query_type=="graph":
        structured_data = structured_retriever(question)
        final_data = f"""Structured data:
                    {structured_data}
                   """
    elif query_type=="similarity":
        unstructured_data = unstructured_retriever(question)
        final_data = f"""Unstructured data:
                        {"#Document ". join(unstructured_data)}
                    """
    else:
        # "hybrid": graph relationships followed by similar text chunks.
        structured_data = structured_retriever(question)
        unstructured_data = unstructured_retriever(question)
        final_data = f"""Structured data:
                    {structured_data}
                    Unstructured data:
                        {"#Document ". join(unstructured_data)}
                    """
    print("RETRIEVER-OUTPUT:\n",final_data)
    return final_data

In [14]:
### Testing the final retriever
retriever("What is DeepSeek?","hybrid")

Search query: What is DeepSeek?




RETRIEVER-OUTPUT:
 Structured data:
                    Deepseek was introduced as Deepseek-R1.
Deepseek was introduced with the version Deepseek-R1-Zero.
Deepseek-R1 was released by Deepseek.
Deepseek-R1 impacted the Internet.
Deepseek-R1 influenced the stock market.
Deepseek-R1 outperformed or matched O1 in performance.
Deepseek-R1 was accompanied by a tech report.
Deepseek-R1 raises an open question regarding data collection.
Deepseek-R1 poses an open question about model training.
Deepseek-R1 contributes to the development of Open-R1.
Deepseek-R1 was developed based on the foundation of Deepseek-V3.
Deepseek-R1 benefits the community.
The Pure Rl Pipeline is utilized by Deepseek.
Reproduction of Open-R1 led to the creation of Deepseek-R1.
Deepseek-R1 was reconstructed based on the Open-R1 Project.
Deepseek introduced Deepseek-R1.
Deepseek-R1 was officially released by Deepseek.
The model weights for Deepseek-R1 are accessible or open.
The datasets related to Deepseek-R1 are not pub

'Structured data:\n                    Deepseek was introduced as Deepseek-R1.\nDeepseek was introduced with the version Deepseek-R1-Zero.\nDeepseek-R1 was released by Deepseek.\nDeepseek-R1 impacted the Internet.\nDeepseek-R1 influenced the stock market.\nDeepseek-R1 outperformed or matched O1 in performance.\nDeepseek-R1 was accompanied by a tech report.\nDeepseek-R1 raises an open question regarding data collection.\nDeepseek-R1 poses an open question about model training.\nDeepseek-R1 contributes to the development of Open-R1.\nDeepseek-R1 was developed based on the foundation of Deepseek-V3.\nDeepseek-R1 benefits the community.\nThe Pure Rl Pipeline is utilized by Deepseek.\nReproduction of Open-R1 led to the creation of Deepseek-R1.\nDeepseek-R1 was reconstructed based on the Open-R1 Project.\nDeepseek introduced Deepseek-R1.\nDeepseek-R1 was officially released by Deepseek.\nThe model weights for Deepseek-R1 are accessible or open.\nThe datasets related to Deepseek-R1 are not pu

In [15]:
                                                                    ### Generator ###
# It includes the context(retrieval output) into the LLM based on the query type and response the user's question. 
def get_response(question: str, query_type:str):

    template = ChatPromptTemplate.from_messages([
        ("system", "You are an AI assistant using a Retrieval-Augmented Generation (RAG) system to provide accurate answers based on the given context."),
        ("human", "{user_input}")
    ])
    
    context = retriever(question,query_type) # The options are: graph, similarity, hybrid.
    
    user_question = f"""Answer the question following these guidelines:
    
    - Answer the question based *only* on the provided context.
    - Make inference based on the provided context noy based on your knowlege.
    - If the context does not contain enough information, state: "I do not have enough information to answer your question."
    - If you dont know the answer, state: "I do not know the answer to your question."
    - Use clear, natural, and professional language in your response.
    - Avoid making up information or speculating beyond the provided context.
    - If applicable, provide citations or references to the retrieved context.
    
    Context:
    {context}
    
    Question: {question}
    
    Answer:
    """
    
    
    messages = template.format_messages(user_input=user_question)
    response = llm_generator.invoke(messages)
    return response.content


In [16]:
# Testing the Generator
get_response("What is DeepSeek-R1?","graph")

Search query: What is DeepSeek-R1?




RETRIEVER-OUTPUT:
 Structured data:
                    Documenting includes What Works.
Documenting includes What Doesn’t Work.
Deepseek-R1 affects the Internet.
Deepseek-R1 impacts the Stock Market.
Deepseek-R1 performed as well or better than O1.
Deepseek-R1 is accompanied by a Tech Report.
Deepseek-R1 leaves an open question about Data Collection.
Deepseek-R1 leaves an open question about Model Training.
Deepseek-R1 contributes to Open-R1.
Deepseek-R1 is built on the foundation of Deepseek-V3.
Deepseek-R1 benefits the Community.
Deepseek-R1-Zero uses Group Relative Policy Optimization.
Deepseek-R1-Zero skips Supervised Fine-Tuning.
Deepseek-R1-Zero relies on Reinforcement Learning.
Reproduction of Open-R1 leads to Deepseek-R1.
The reconstruction of the Open-R1 Project results in Deepseek-R1.
Deepseek introduces Deepseek-R1.
The Reasoning Dataset originates from Deepseek-R1.
Deepseek releases Deepseek-R1.
Model weights are open for Deepseek-R1.
Datasets are not open for Deepseek-R1.

"<think>\nOkay, so I need to figure out what DeepSeek-R1 is based on the provided context. Let me go through each point one by one.\n\nFirst, it says that Deepseek-R1 affects the Internet and impacts the Stock Market. That suggests it's some kind of tool or system used in financial contexts, maybe for trading or analysis.\n\nNext, it performed as well or better than O1. I'm not sure what O1 refers to, but it seems like a competitor or another version. So Deepseek-R1 is at least as good if not better than this other thing.\n\nDeepseek-R1 comes with a Tech Report, which probably means there's documentation or technical details available about it. That makes sense for something that's being released formally.\n\nIt leaves open questions about Data Collection and Model Training. This implies that while the system has been developed, there are still areas where improvements can be made in how data is gathered and models are trained.\n\nDeepseek-R1 contributes to Open-R1. I'm not sure what O