In [50]:
%pip install --upgrade --quiet  langchain langchain-community langchain-openai langchain-experimental neo4j wikipedia tiktoken yfiles_jupyter_graphs

# Environment Setup
You need to set up a Neo4j instance. The easiest way is to start a free instance on Neo4j Aura, which offers cloud instances of Neo4j database. Alternatively, you can also set up a local instance of the Neo4j database by downloading the Neo4j Desktop application and creating a local database instance.

Also you would need the OPENAI API Key


In [51]:
import os
import openai
from langchain_community.graphs import Neo4jGraph
from google.colab import userdata
userdata.get('open_key')
os.environ["OPENAI_API_KEY"] = userdata.get('open_key')
openai.api_key = os.getenv('OPENAI_API_KEY')
os.environ["NEO4J_URI"] = userdata.get('NEO4J_URI')
os.environ["NEO4J_USERNAME"] =  userdata.get('NEO4J_USERNAME')
os.environ["NEO4J_PASSWORD"] = userdata.get('NEO4J_PASSWORD')

# Step 1a : Reading & Chunking Wikipedia article

Process for reading and chunking a Wikipedia article is pretty simple, as LangChain has integrated the Wikipedia document loader as well as the text chunking modules. Refer to documentation for adjusting the default parameters for loader such as language, or to downloaded fields other than published, title, summary etc.

The tiktoken text chunking module, uses a tokenizer made by OpenAI, to split the article into chunks with 1000 tokens. (Refer Split by tokens for details)


In [52]:
from langchain_core.runnables import (
    RunnableBranch,
    RunnableLambda,
    RunnableParallel,
    RunnablePassthrough,
)
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.prompts.prompt import PromptTemplate
from langchain_core.pydantic_v1 import BaseModel, Field
from typing import Tuple, List, Optional
from langchain_core.messages import AIMessage, HumanMessage
from langchain_core.output_parsers import StrOutputParser
import os
from langchain_community.graphs import Neo4jGraph
from langchain.document_loaders import WikipediaLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.text_splitter import TokenTextSplitter
from langchain_openai import ChatOpenAI
from langchain_experimental.graph_transformers import LLMGraphTransformer
from neo4j import GraphDatabase
from yfiles_jupyter_graphs import GraphWidget
from langchain_community.vectorstores import Neo4jVector
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores.neo4j_vector import remove_lucene_chars
from langchain_core.runnables import ConfigurableField, RunnableParallel, RunnablePassthrough

try:
  import google.colab
  from google.colab import output
  output.enable_custom_widget_manager()
except:
  pass

##Data Ingestion

In [53]:
# Read the wikipedia article
raw_documents = WikipediaLoader(query="Chief Information Security Officer ").load()
# Define chunking strategy
text_splitter = CharacterTextSplitter.from_tiktoken_encoder( chunk_size=1000, chunk_overlap=20 )
# Chunk the document
documents = text_splitter.split_documents(raw_documents)



  lis = BeautifulSoup(html).find_all('li')


# Step 2 : Create & Store Vector Index of Text
LangChain makes it easy to import the documents into Neo4j and index them using the newly added vector index. Neo4j vector index is wrapped as a LangChain vector store and, therefore, follows the syntax used to interact with other vector databases

The from_documents method connects to a Neo4j database, imports and embeds the documents, and creates a vector index. The data will be represented as the Chunk nodes by default. We have used OpenAI embeddings

In [54]:
from langchain_community.vectorstores.neo4j_vector import Neo4jVector
from langchain_openai import OpenAIEmbeddings

neo4j_vector = Neo4jVector.from_documents(
    documents,
    OpenAIEmbeddings()

)


# Step 3 : Vector Similarity Search

The LangChain module used the specified embedding function (OpenAI in this example) to embed the question and then find the most similar documents by comparing the cosine similarity between the user question and indexed documents from the database.

Neo4j vector index also supports the Euclidean similarity metric along with the cosine similarity.


In [55]:
query = "What does a CISO do?"

results = neo4j_vector.similarity_search(query, k=1)
print(results[0].page_content)


A chief information security officer (CISO) is a senior-level executive within an organization responsible for establishing and maintaining the enterprise vision, strategy, and program to ensure information assets and technologies are adequately protected. The CISO directs staff in identifying, developing, implementing, and maintaining processes across the enterprise to reduce information and information technology (IT) risks. They respond to incidents, establish appropriate standards and controls, manage security technologies, and direct the establishment and implementation of policies and procedures. The CISO is also usually responsible for information-related compliance (e.g. supervises the implementation to achieve ISO/IEC 27001 certification for an entity or a part of it). The CISO is also responsible for protecting proprietary information and assets of the company, including the data of clients and consumers. CISO works with other executives to make sure the company is growing in

# Q&A with LangChain and ChatOpenAI

LangChain supports question-answering workflows using only a line or two of code. For example, to create a question-answering workflow that generates answers based on the provided context but also provides which documents it used as the context, use the RetrievalQAWithSourcesChain


In [56]:
from langchain_openai import ChatOpenAI
from langchain.chains import RetrievalQAWithSourcesChain

chain = RetrievalQAWithSourcesChain.from_chain_type(
    ChatOpenAI(temperature=0),
    chain_type="stuff",
    retriever=neo4j_vector.as_retriever()
)

query = "What is a CISO?"

chain.invoke(
    {"question": query},
    return_only_outputs=True,
)


{'answer': 'A chief information security officer (CISO) is a senior-level executive within an organization responsible for establishing and maintaining the enterprise vision, strategy, and program to ensure information assets and technologies are adequately protected. The CISO directs staff in identifying, developing, implementing, and maintaining processes across the enterprise to reduce information and information technology (IT) risks. They respond to incidents, establish appropriate standards and controls, manage security technologies, and direct the establishment and implementation of policies and procedures. The CISO is also usually responsible for information-related compliance. The office of the Federal Chief Information Security Officer of the United States (CISO) was created on September 8, 2016, to guide cybersecurity policy, planning, and implementation in the U.S. Federal Government.\n',
 'sources': 'https://en.wikipedia.org/wiki/Chief_information_security_officer, https:/

# Construct Graph using LLM Graph Transformer
LangChain makes it easy to construct and store knowledge graph based on the retrieved documents. It provides LLM Graph Transformer which hides away all the complexity.

The LLMGraphTransformer converts text documents into structured graph documents by leveraging a LLM to parse and categorize entities and their relationships.

The generated graph documents can be stored to a graph database using the add_graph_documents method. The baseEntityLabel parameter assigns an additional __Entity__ label to each node, enhancing indexing and query performance. The include_source parameter links nodes to their originating documents, facilitating data traceability and context understanding.


In [57]:
from langchain_experimental.graph_transformers import LLMGraphTransformer
#from langchain.graphs import graph
#from langchain_experimental import LLMGraphTransformer
from langchain_community.graphs import Neo4jGraph
from langchain_openai import ChatOpenAI

llm=ChatOpenAI(temperature=0, model_name="gpt-3.5-turbo-0125") # gpt-4-0125-preview occasionally has issues
llm_transformer = LLMGraphTransformer(llm=llm)

#graph_documents = llm_transformer.convert_to_graph_documents(documents)
#graph.add_graph_documents(
 #   graph_documents,
#    baseEntityLabel=True,
#    include_source=True
#)

#llm = ChatOpenAI(temperature=0, model_name="gpt-3.5-turbo-0125")
#llm_transformer = LLMGraphTransformer(llm)

#documents = ["This is a sample document.", "This is another sample document."]
graph_documents = llm_transformer.convert_to_graph_documents(documents)

graph = Neo4jGraph()
graph.add_graph_documents(
    graph_documents,
    baseEntityLabel=True,
    include_source=True
)

# Visualize the graph
You can inspect the generated graph with yfiles visualization. yFiles Graphs for Jupyter is a free diagram visualization extension for JupyterLab and Jupyter Notebook.

In [58]:
from google.colab import output
output.enable_custom_widget_manager()

In [59]:
from neo4j import GraphDatabase
from yfiles_jupyter_graphs import GraphWidget

# directly show the graph resulting from the given Cypher query
default_cypher = "MATCH (s)-[r:!MENTIONS]->(t) RETURN s,r,t LIMIT 600"

def showGraph(cypher: str = default_cypher):
    # create a neo4j session to run queries
    driver = GraphDatabase.driver(
        uri = os.environ["NEO4J_URI"],
        auth = (os.environ["NEO4J_USERNAME"],
                os.environ["NEO4J_PASSWORD"]))
    session = driver.session()
    widget = GraphWidget(graph = session.run(cypher).graph())
    widget.node_label_mapping = 'id'
    #display(widget)
    return widget

showGraph()


GraphWidget(layout=Layout(height='800px', width='100%'))

# Graph Similiary Search
The LangChain module also supports similarity search from graph.


In [60]:
from langchain_community.vectorstores.neo4j_vector import Neo4jVector
from langchain_openai import OpenAIEmbeddings

existing_graph = Neo4jVector.from_existing_graph(
    embedding=OpenAIEmbeddings(),
    index_name="New Index",
    node_label="Person",
    text_node_properties=["id"],
    embedding_node_property="embedding",
)
result = existing_graph.similarity_search("Who does the CISO report to?", k=1)

In [61]:
print(result[0].page_content)


id: Chief Information Security Officer (Ciso)


#New Code from here

In [62]:
vector_index = Neo4jVector.from_existing_graph(
    OpenAIEmbeddings(),
    search_type="hybrid",
    node_label="Document",
    text_node_properties=["text"],
    embedding_node_property="embedding"
)

In [63]:
# Retriever

graph.query(
    "CREATE FULLTEXT INDEX entity IF NOT EXISTS FOR (e:__Entity__) ON EACH [e.id]")

# Extract entities from text
class Entities(BaseModel):
    """Identifying information about entities."""

    names: List[str] = Field(
        ...,
        description="All the person, organization, or business entities that "
        "appear in the text",
    )

prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            "You are extracting organization and person entities from the text.",
        ),
        (
            "human",
            "Use the given format to extract information from the following "
            "input: {question}",
        ),
    ]
)

entity_chain = prompt | llm.with_structured_output(Entities)

In [64]:
entity_chain.invoke({"question": "Who has been the CISO?"}).names

['CISO']

In [65]:
def generate_full_text_query(input: str) -> str:
    """
    Generate a full-text search query for a given input string.

    This function constructs a query string suitable for a full-text search.
    It processes the input string by splitting it into words and appending a
    similarity threshold (~2 changed characters) to each word, then combines
    them using the AND operator. Useful for mapping entities from user questions
    to database values, and allows for some misspelings.
    """
    full_text_query = ""
    words = [el for el in remove_lucene_chars(input).split() if el]
    for word in words[:-1]:
        full_text_query += f" {word}~2 AND"
    full_text_query += f" {words[-1]}~2"
    return full_text_query.strip()
def structured_retriever(question: str) -> str:
    result = ""
    entities = entity_chain.invoke({"question": question})
    for entity in entities.names:
        response = graph.query(
            """CALL db.index.fulltext.queryNodes('entity', $query, {limit:2})
            YIELD node, score
            CALL {
              MATCH (node)-[r]->(neighbor)
              RETURN node.id + ' - ' + type(r) + ' -> ' + neighbor.id AS output
              UNION
              MATCH (node)<-[r]-(neighbor)
              RETURN neighbor.id + ' - ' + type(r) + ' -> ' + node.id AS output
            }
            RETURN output LIMIT 50
            """,
            {"query": generate_full_text_query(entity)},
        )
        # Filter out None values and then join the results
        filtered_outputs = [el['output'] for el in response if el['output'] is not None]
        result += "\n".join(filtered_outputs)
    return result

In [66]:
print(structured_retriever("Who has been a Chief Information Security Officer?"))

35dcc310af9f3ff74c85362890be65dc - MENTIONS -> Ciso
35dcc310af9f3ff74c85362890be65dc - MENTIONS -> Organization
35dcc310af9f3ff74c85362890be65dc - MENTIONS -> Enterprise
35dcc310af9f3ff74c85362890be65dc - MENTIONS -> Information Assets
35dcc310af9f3ff74c85362890be65dc - MENTIONS -> Technologies
35dcc310af9f3ff74c85362890be65dc - MENTIONS -> Staff
35dcc310af9f3ff74c85362890be65dc - MENTIONS -> Processes
35dcc310af9f3ff74c85362890be65dc - MENTIONS -> Risks
35dcc310af9f3ff74c85362890be65dc - MENTIONS -> Incidents
35dcc310af9f3ff74c85362890be65dc - MENTIONS -> Standards
35dcc310af9f3ff74c85362890be65dc - MENTIONS -> Controls
35dcc310af9f3ff74c85362890be65dc - MENTIONS -> Security Technologies
35dcc310af9f3ff74c85362890be65dc - MENTIONS -> Policies
35dcc310af9f3ff74c85362890be65dc - MENTIONS -> Procedures
35dcc310af9f3ff74c85362890be65dc - MENTIONS -> Proprietary Information
35dcc310af9f3ff74c85362890be65dc - MENTIONS -> Assets
35dcc310af9f3ff74c85362890be65dc - MENTIONS -> Company
35dcc310

In [67]:
def retriever(question: str):
    print(f"Search query: {question}")
    structured_data = structured_retriever(question)
    unstructured_data = [el.page_content for el in vector_index.similarity_search(question)]
    final_data = f"""Structured data:
{structured_data}
Unstructured data:
{"#Document ". join(unstructured_data)}
    """
    return final_data

In [68]:
# Condense a chat history and follow-up question into a standalone question
_template = """Given the following conversation and a follow up question, rephrase the follow up question to be a standalone question,
in its original language.
Chat History:
{chat_history}
Follow Up Input: {question}
Standalone question:"""  # noqa: E501
CONDENSE_QUESTION_PROMPT = PromptTemplate.from_template(_template)

def _format_chat_history(chat_history: List[Tuple[str, str]]) -> List:
    buffer = []
    for human, ai in chat_history:
        buffer.append(HumanMessage(content=human))
        buffer.append(AIMessage(content=ai))
    return buffer

_search_query = RunnableBranch(
    # If input includes chat_history, we condense it with the follow-up question
    (
        RunnableLambda(lambda x: bool(x.get("chat_history"))).with_config(
            run_name="HasChatHistoryCheck"
        ),  # Condense follow-up question and chat into a standalone_question
        RunnablePassthrough.assign(
            chat_history=lambda x: _format_chat_history(x["chat_history"])
        )
        | CONDENSE_QUESTION_PROMPT
        | ChatOpenAI(temperature=0)
        | StrOutputParser(),
    ),
    # Else, we have no chat history, so just pass through the question
    RunnableLambda(lambda x : x["question"]),
)

In [69]:
template = """Answer the question based only on the following context:
{context}

Question: {question}
Use natural language and be concise.
Answer:"""
prompt = ChatPromptTemplate.from_template(template)

chain = (
    RunnableParallel(
        {
            "context": _search_query | retriever,
            "question": RunnablePassthrough(),
        }
    )
    | prompt
    | llm
    | StrOutputParser()
)

In [70]:
chain.invoke({"question": "Who was the first CISO?"})

Search query: Who was the first CISO?


'The first CISO was Gregory Touhill.'

In [71]:
chain.invoke(
    {
        "question": "Where where they born?",
        "chat_history": [("Who was the first CISO?", "The first CISO was Gregory Touhill.")],
    }
)

Search query: Where was Gregory Touhill born?


'Gregory Touhill was born in the United States. Michael P. Burgess was born in England. Phil Venables was born in Britain.'