
# Print the graph in Neo4j (Cypher; run in the Neo4j Browser, not in Python)
MATCH (n)
OPTIONAL MATCH (n)-[r]-(m)
RETURN n, r, m

# Delete the graph in Neo4j (Cypher; run in the Neo4j Browser, not in Python)
MATCH (n)
DETACH DELETE n



In [None]:
%pip install --upgrade --quiet  langchain langchain-community langchain-openai langchain-experimental neo4j wikipedia tiktoken yfiles_jupyter_graphs python-dotenv

In [None]:
from langchain_core.runnables import (RunnableBranch,RunnableLambda,RunnableParallel,RunnablePassthrough,)
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.prompts.prompt import PromptTemplate
from langchain_openai import ChatOpenAI
import getpass
import os
from dotenv import load_dotenv


from typing import Tuple, List, Optional
from langchain_core.messages import AIMessage, HumanMessage
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import ConfigurableField
from yfiles_jupyter_graphs import GraphWidget
from neo4j import GraphDatabase
from langchain_community.vectorstores import Neo4jVector
from langchain_community.graphs import Neo4jGraph
from langchain.document_loaders import WikipediaLoader
from langchain_openai import ChatOpenAI
from langchain_experimental.graph_transformers import LLMGraphTransformer
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain_community.vectorstores.neo4j_vector import remove_lucene_chars


In [None]:
# Load environment variables from the .env file into os.environ.
load_dotenv()

# The settings below are expected in the environment by the rest of the
# notebook (showGraph reads the NEO4J_* values directly; Neo4jGraph() and the
# OpenAI clients are constructed without explicit credentials).
# Fail fast with a readable message when one is missing: the previous
# `os.environ[name] = os.getenv(name)` re-assignment was a no-op for values
# that exist and raised an opaque "TypeError: str expected, not NoneType"
# for values that do not.
_REQUIRED_ENV_VARS = (
    "OPENAI_API_KEY",
    "NEO4J_URI",
    "NEO4J_USERNAME",
    "NEO4J_PASSWORD",
)
_missing_env_vars = [name for name in _REQUIRED_ENV_VARS if os.getenv(name) is None]
if _missing_env_vars:
    raise RuntimeError(
        "Missing required environment variables: " + ", ".join(_missing_env_vars)
    )

# Instantiate the graph DB client.
graph = Neo4jGraph()


In [None]:
# Chat model shared by the entity-extraction chain and the final answer chain.
llm = ChatOpenAI(model_name="gpt-4o", temperature=0)


In [None]:
### Plotting the Graph ###

# Default visualisation query: relationships whose type is NOT "MENTIONS",
# limited to 50 rows.
default_cypher = "MATCH (s)-[r:!MENTIONS]->(t) RETURN s,r,t LIMIT 50"
# Alternative spelling of the same filter using type(r) (presumably kept as a
# fallback for servers that do not support the `:!TYPE` syntax; confirm):
#default_cypher = "MATCH (s)-[r]->(t) WHERE NOT type(r) = 'MENTIONS' RETURN s,r,t LIMIT 50"

# GraphWidget: A visualisation tool for rendering and exploring graphs in Jupyter Notebooks.
# GraphDatabase: The official Neo4j driver for querying and updating graph data in a Neo4j database.
from yfiles_jupyter_graphs import GraphWidget
from neo4j import GraphDatabase


# Enable custom widget support when running in Google Colab.
# Only a failed import is treated as "not in Colab"; the previous bare
# `except:` also hid real errors (and KeyboardInterrupt/SystemExit).
try:
    from google.colab import output
except ImportError:
    pass
else:
    output.enable_custom_widget_manager()

def showGraph(cypher: str = default_cypher):
    """Run a Cypher query against Neo4j and display the result as a graph widget.

    Connection settings are read from the NEO4J_URI, NEO4J_USERNAME and
    NEO4J_PASSWORD environment variables.

    Args:
        cypher: Cypher query whose result contains the nodes and relationships
            to draw. Defaults to ``default_cypher``.

    Returns:
        The ``GraphWidget`` that was displayed.
    """
    # Context managers close the session and the driver even when the query
    # fails; previously both were left open on every call.
    with GraphDatabase.driver(
        uri=os.environ["NEO4J_URI"],
        auth=(os.environ["NEO4J_USERNAME"], os.environ["NEO4J_PASSWORD"]),
    ) as driver, driver.session() as session:
        # Build the widget while the session is still open so the query
        # result is fully consumed before the connection is released.
        widget = GraphWidget(graph=session.run(cypher).graph())

    # Label each node with its `id` property.
    widget.node_label_mapping = 'id'
    display(widget)
    return widget


showGraph()

In [None]:
                                                    ### Database Settings ###


# Build a Neo4j-backed vector index on top of the graph that already exists in
# the database. Nodes labelled "Document" are indexed on their "text" property
# and the vectors are stored in the "embedding" property.
# NOTE(review): search_type="hybrid" presumably means vector similarity plus
# full-text keyword search in Neo4jVector (not graph traversal) — confirm.
# NOTE(review): OpenAIEmbeddings() uses the library's default embedding model
# (text-embedding-ada-002 according to the original note) — confirm for the
# installed langchain-openai version.
from langchain_openai import OpenAIEmbeddings
vector_index = Neo4jVector.from_existing_graph(
    OpenAIEmbeddings(),
    search_type="hybrid",
    node_label="Document",
    text_node_properties=["text"],
    embedding_node_property="embedding"
)

In [None]:
                                            ### Structured Data Retriever: Graph ###



### 1. Entity Extraction

# Create (only if it does not exist yet) a full-text index named "entity" over
# the `id` property of nodes labelled `__Entity__`.
# structured_retriever looks this index up by name to map entity names
# extracted from a question onto graph nodes.

graph.query("CREATE FULLTEXT INDEX entity IF NOT EXISTS FOR (e:__Entity__) ON EACH [e.id]")


# Pydantic schema for the structured output of the entity-extraction chain.
# It is passed to llm.with_structured_output(...) below, and callers read the
# extracted names from the `names` field.
# NOTE(review): the class docstring and the Field description are presumably
# forwarded to the model as part of the schema, so treat them as prompt text.
class Entities(BaseModel):
    """Identifying information about entities."""

    # Required field (`...` means no default): list of extracted entity names.
    names: List[str] = Field(
        ...,
        description="All the person, organization, or business entities that "
        "appear in the text",
    )

# Chat prompt for entity extraction: a system message that sets the task and a
# human message that carries the user's text in the {question} placeholder.

prompt = ChatPromptTemplate.from_messages([
    ("system", "You are extracting organization and person entities from the text."),
    ("human", "Use the given format to extract information from the following input: {question}"),
])

# Entity-extraction pipeline:
#   1. `prompt` turns the incoming {"question": ...} dict into chat messages.
#   2. `llm.with_structured_output(Entities)` asks the model for output that
#      matches the Entities schema.
# Callers read the extracted names from the `.names` attribute of the result.

entity_chain = prompt | llm.with_structured_output(Entities)



### 2. Graph Retrieval

# 2.1 Build a fuzzy full-text (Lucene) query string from free text.
def generate_full_text_query(input: str) -> str:
    """Generate a fuzzy full-text search query for a given input string.

    Lucene special characters are stripped with ``remove_lucene_chars``, the
    remainder is split on whitespace, ``~2`` (allow up to two changed
    characters) is appended to every word, and the words are combined with
    ``AND``. This is used to map entities from user questions onto database
    values while tolerating some misspellings.

    Args:
        input: Free text, typically an entity name. (The name shadows the
            builtin but is kept so keyword callers keep working.)

    Returns:
        The query string, e.g. ``"Open~2 AND AI~2"``. An empty string is
        returned when *input* contains no searchable words; previously that
        case raised ``IndexError``.
    """
    # str.split() with no argument never yields empty strings, so no extra
    # filtering is required.
    words = remove_lucene_chars(input).split()
    return " AND ".join(f"{word}~2" for word in words)


# 2.2 Structured retrieval: entity extraction -> full-text lookup -> neighbourhood.
def structured_retriever(question: str) -> str:
    """Return graph relationships for the entities mentioned in *question*.

    Entities are extracted with ``entity_chain``. For every entity the
    ``entity`` full-text index is queried (at most 2 matching nodes), and each
    matched node's relationships, incoming and outgoing and excluding
    ``MENTIONS``, are rendered as ``source - TYPE -> target`` (at most 100 rows
    per entity).

    Args:
        question: The user question to extract entities from.

    Returns:
        One relationship per line, each line terminated by a newline; an empty
        string when nothing was found.
    """
    entities = entity_chain.invoke({"question": question})
    lines = []
    for entity in entities.names:
        # An entity consisting only of Lucene special characters leaves no
        # search terms; skip it instead of failing on an empty query.
        if not remove_lucene_chars(entity).split():
            continue
        response = graph.query(
            """CALL db.index.fulltext.queryNodes('entity', $query, {limit:2})
            YIELD node,score
            CALL {
              WITH node
              MATCH (node)-[r:!MENTIONS]->(neighbor)
              RETURN node.id + ' - ' + type(r) + ' -> ' + neighbor.id AS output
              UNION ALL
              WITH node
              MATCH (node)<-[r:!MENTIONS]-(neighbor)
              RETURN neighbor.id + ' - ' + type(r) + ' -> ' +  node.id AS output
            }
            RETURN output LIMIT 100
            """,
            {"query": generate_full_text_query(entity)},
        )
        lines.extend(el["output"] + "\n" for el in response)

    return "".join(lines)

In [None]:
                                            ### Unstructured Data Retriever: Chunks-based ###

# Fetch the text chunks most similar to the user's input.
def unstructured_retriever(question: str, k: int = 15) -> List[str]:
    """Return the text of the *k* chunks most similar to *question*.

    Args:
        question: Query text passed to ``vector_index.similarity_search``.
        k: Number of chunks to request. Defaults to 15, the value that was
            previously hard-coded.

    Returns:
        The ``page_content`` of each hit, in the order returned by the index.
        (The annotation used to say ``str`` although a list was returned.)
    """
    return [doc.page_content for doc in vector_index.similarity_search(question, k=k)]

In [None]:
# Smoke test for entity extraction: run the chain on a sample sentence and
# show the extracted entity names (the `names` field of the Entities result).
entity_chain.invoke({"question": "The DeepSeek-R1 model provides responses comparable to other contemporary large language models, such as OpenAI's GPT-4o and o1."}).names

In [None]:
# Smoke test for the structured (graph) retriever.
# NOTE(review): "DeepSeek?-R1" looks like a typo for "DeepSeek-R1?" — confirm.
structured_retriever("What is DeepSeek?-R1")

In [None]:
# Smoke test for the unstructured (vector similarity) retriever.
# NOTE(review): "DeepSeek?-R1" looks like a typo for "DeepSeek-R1?" — confirm.
unstructured_retriever("What is DeepSeek?-R1")

In [None]:
                                                        ### Final Retriever ###

# Combine structured retrieval (graph database) with unstructured retrieval
# (vector index) into one context string for a given question.

def retriever(question: str, query_type: str):
    """Retrieve context for *question* using the requested retrieval mode.

    Args:
        question: The search query.
        query_type: ``"graph"`` for structured data only, ``"similarity"`` for
            unstructured chunks only, or ``"hybrid"`` for both.

    Returns:
        A formatted string with a "Structured data:" and/or
        "Unstructured data:" section. The query and the result are also
        printed for debugging.

    Raises:
        ValueError: If *query_type* is not one of the three supported modes.
            Previously this surfaced as an ``UnboundLocalError`` on
            ``final_data``.
    """
    print(f"Search query: {question}")
    if query_type not in ("graph", "similarity", "hybrid"):
        raise ValueError(
            f"Unknown query_type {query_type!r}; "
            "expected 'graph', 'similarity' or 'hybrid'."
        )

    if query_type == "graph":
        structured_data = structured_retriever(question)
        final_data = f"""Structured data:
                    {structured_data}
                   """
    elif query_type == "similarity":
        unstructured_data = unstructured_retriever(question)

        final_data = f"""Unstructured data:
                        {"#Document ". join(unstructured_data)}
                    """
    else:  # "hybrid": both sources
        structured_data = structured_retriever(question)
        unstructured_data = unstructured_retriever(question)

        final_data = f"""Structured data:
                    {structured_data}
                    Unstructured data:
                        {"#Document ". join(unstructured_data)}
                    """
    print("RETRIEVER-OUTPUT:\n",final_data)
    return final_data

In [None]:
# Smoke test for the combined retriever in "graph" mode (structured data only).
retriever("What is DeepSeek?","graph")

In [45]:
                                                                    ### Generator ###


# Prompt text for turning a follow-up question plus the chat history into a
# standalone question. It has two placeholders: {chat_history} and {question}.
_template = """Given the following conversation and a follow up question, rephrase the follow up question to be a standalone question,
in its original language.
Chat History:
{chat_history}
Follow Up Input: {question}
Standalone question:"""


# Prompt object built from the text above; used by the chat-history branch of
# _search_query.
CONDENSE_QUESTION_PROMPT = PromptTemplate.from_template(_template)

# It takes a chat history in the form of a list of conversation turns (between a human and an AI) and converts it into a formatted list of message objects (HumanMessage and AIMessage).
def _format_chat_history(chat_history: List[Tuple[str, str]]) -> List:
    buffer = []
    for human, ai in chat_history:
        buffer.append(HumanMessage(content=human))
        buffer.append(AIMessage(content=ai))
    return buffer


# Condense a follow-up question into a standalone question using the chat
# history. Input: dict with "question" and "chat_history"; output: str.
# NOTE: this step uses ChatOpenAI(temperature=0) with the library's default
# model rather than the shared `llm`, as in the original code.
_condense_question = (
    RunnablePassthrough.assign(
        chat_history=lambda x: _format_chat_history(x["chat_history"])
    )
    | CONDENSE_QUESTION_PROMPT
    | ChatOpenAI(temperature=0)
    | StrOutputParser()
)

# Final step shared by both branches: keep only what the retriever needs.
_to_retriever_input = RunnableLambda(
    lambda x: {"question": x["question"], "query_type": x["query_type"]}
)

# Normalise the chain input into {"question", "query_type"}.
# - With a non-empty "chat_history": the question is first rewritten into a
#   standalone question. `RunnablePassthrough.assign(question=...)` replaces
#   only the "question" key, so "query_type" from the original input is
#   preserved. (The previous version referenced an undefined global
#   `query_type` here and raised NameError whenever chat history was supplied.)
# - Without chat history: the question is passed through unchanged.
_search_query = RunnableBranch(
    (
        RunnableLambda(lambda x: bool(x.get("chat_history"))).with_config(
            run_name="HasChatHistoryCheck"
        ),
        RunnablePassthrough.assign(question=_condense_question)
        | _to_retriever_input,
    ),
    _to_retriever_input,
)


# Answer-generation prompt text. It has two placeholders: {context} (the
# retriever output) and {question}. The guidelines restrict the model to the
# supplied context.
template = """Answer the question following these guidelines:

- Answer the question based *only* on the provided context.
- Make inference based on the provided context, not based on your knowledge.
- If the context does not contain enough information, state: "I do not have enough information to answer your question."
- If you don't know the answer, state: "I do not know the answer to your question."
- Use clear, natural, and professional language in your response.
- Avoid making up information or speculating beyond the provided context.
- If applicable, provide citations or references to the retrieved context.

Context:
{context}

Question: {question}

Answer:
"""


# Build the chat prompt from the text above.
# NOTE: this rebinds the module-level name `prompt`, which earlier held the
# entity-extraction prompt. `entity_chain` was already built from the earlier
# object, but re-running the entity_chain cell after this one would pick up
# this prompt instead.
prompt = ChatPromptTemplate.from_template(template)

# End-to-end question-answering chain.
# Input: dict with "question", "query_type" and optionally "chat_history".
#   1. In parallel, build the two prompt variables:
#      - "context": normalise the input with _search_query, then call
#        retriever(question, query_type).
#      - "question": the question string from the input. Previously the whole
#        input dict was passed through, so the prompt's {question} slot was
#        rendered as the dict's repr (including query_type/chat_history).
#   2. Fill the answer prompt, call the LLM and parse the reply to a string.

chain = (
    RunnableParallel(
        {
            "context": _search_query | RunnableLambda(lambda x: retriever(x.get("question", ""), x.get("query_type", ""))),
            "question": RunnableLambda(lambda x: x["question"]),
        }
    )
    | prompt
    | llm
    | StrOutputParser()
)

In [46]:
# Smoke test for the full chain (no chat history) using graph retrieval.
chain.invoke({"question": "What is DeepSeek-R1", "query_type": "graph"})

Search query: What is DeepSeek-R1




RETRIEVER-OUTPUT:
 Structured data:
                    Deepseek-R1 - AFFECTED -> Internet
Deepseek-R1 - AFFECTED -> Stock Market
Deepseek-R1 - PERFORMED_AS_WELL_OR_BETTER_THAN -> O1
Deepseek-R1 - ACCOMPANIED_BY -> Tech Report
Deepseek-R1 - LEAVES_OPEN_QUESTION -> Data Collection
Deepseek-R1 - LEAVES_OPEN_QUESTION -> Model Training
Deepseek-R1 - CONTRIBUTE_TO -> Open-R1
Deepseek-R1 - BUILT_ON_THE_FOUNDATION_OF -> Deepseek-V3
Deepseek-R1 - BOON -> Community
Deepseek-R1-Zero - USED -> Group Relative Policy Optimization
Deepseek-R1-Zero - SKIPPED -> Supervised Fine-Tuning
Deepseek-R1-Zero - RELIED_ON -> Reinforcement Learning
Open-R1 - REPRODUCTION -> Deepseek-R1
Open-R1 Project - RECONSTRUCT -> Deepseek-R1
Deepseek - INTRODUCED -> Deepseek-R1
Reasoning Dataset - FROM -> Deepseek-R1
Deepseek - RELEASED -> Deepseek-R1
Model Weights - OPEN -> Deepseek-R1
Datasets - NOT_OPEN -> Deepseek-R1
Code - NOT_OPEN -> Deepseek-R1
Deepseek - INTRODUCED -> Deepseek-R1-Zero

                   


'Deepseek-R1 is a model or system that has affected the Internet and the Stock Market. It has performed as well or better than another entity referred to as O1 and is accompanied by a Tech Report. It leaves open questions regarding Data Collection and Model Training. Deepseek-R1 contributes to Open-R1 and is built on the foundation of Deepseek-V3. It is considered a boon to the community. Additionally, Deepseek-R1 has been introduced and released by Deepseek, and its model weights are open, while its datasets and code are not open.'

In [44]:
### Generator with history ###

# chain.invoke(
#     {
#         #"question": "When was she born?", "query_type": "graph",
#         "question": "What is Open-R1?", "query_type": "graph",
#         "chat_history": [("What is DeepSeek-R1?", "It is a reasoning LLM.")],
#     }
# )