In [None]:
# https://python.langchain.com/docs/how_to/#extraction
# https://python.langchain.com/docs/tutorials/extraction/
%pip install --upgrade --quiet typing langchain langchain_core pydantic langchain-community langchain_experimental langchain_ollama neo4j langchain_neo4j os yfiles_jupyter_graphs langchain_ollama llama-cpp-python json_repair

In [2]:
#instantiate neo4j instance.

from langchain_neo4j import Neo4jGraph
graph = Neo4jGraph()


In [3]:
#Define the Schema
from typing import List, Optional
from pydantic import BaseModel, Field

class Person(BaseModel):
    """Information about a person."""

    # ^ Doc-string for the entity Person.
    # This doc-string is sent to the LLM as the description of the schema Person,
    # and it can help to improve extraction results.

    # Note that:
    # 1. Each field is an `optional` -- this allows the model to decline to extract it!
    # 2. Each field has a `description` -- this description is used by the LLM.
    # Having a good description can help improve extraction results.
    name: Optional[str] = Field(default=None, description="The name of the person")
    hair_color: Optional[str] = Field(
        default=None, description="The color of the person's hair if known"
    )
    height_in_meters: Optional[str] = Field(
        default=None, description="Height measured in meters"
    )


class Data(BaseModel):
    """Extracted data about people."""

    # Creates a model so that we can extract multiple entities.
    people: List[Person]


In [8]:
#Define the extractor
from typing import Optional

from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from pydantic import BaseModel, Field

# Define a custom prompt to provide instructions and any additional context.
# 1) You can add examples into the prompt template to improve extraction quality
# 2) Introduce additional parameters to take context into account (e.g., include metadata
#    about the document from which the text was extracted.)
prompt_template = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            "You are an expert extraction algorithm. "
            "Only extract relevant information from the text. "
            "If you do not know the value of an attribute asked to extract, "
            "return null for the attribute's value.",
        ),
        # Please see the how-to about improving performance with
        # reference examples.
        # MessagesPlaceholder('examples'),
        ("human", "{text}"),
    ]
)

In [36]:
#Load LLM
from langchain_experimental.llms.ollama_functions import OllamaFunctions
from langchain_ollama import OllamaEmbeddings, ChatOllama

llm_model="mistral:7b-instruct-v0.3-q8_0" #mistral:7b-instruct-v0.3-q8_0, llama3.1:8b-instruct-q8_0, llama3.2:3b-instruct-fp16

llm = OllamaFunctions(model=llm_model, temperature=0, format="json")
# llm = ChatOllama(model=llm_model, temperature=0)

In [37]:
#Loading via LlammaCPP
# # from langchain_community.llms import LlamaCpp
# from langchain_community.chat_models import ChatLlamaCpp
# from langchain_core.callbacks import CallbackManager, StreamingStdOutCallbackHandler

# # https://github.com/langchain-ai/langchain/issues/25318
# # https://github.com/langchain-ai/langchainjs/issues/6051
# llm = ChatLlamaCpp(
#     model_path="/home/ubuntu/repos/gguf/llama3_1_8b_f16.gguf", #"/home/ubuntu/repos/gguf/llama3_2_3b_q8_0.gguf", mistral_7b_f16.gguf, llama3_1_8b_f16.gguf
#     n_gpu_layers=-1,
#     temperature=0,
#     echo='True',
#     grammar='json',
#     n_batch=1,
#     n_ctx=2048,
#     f16_kv=False,
#     # callback_manager=CallbackManager([StreamingStdOutCallbackHandler()]),
#     # verbose=True,
# );

In [38]:
structured_llm = llm.with_structured_output(schema=Data)
text = "My name is Jeff, my hair is black and i am 6 feet tall. Anna has the same color hair as me."
prompt = prompt_template.invoke({"text": text})
structured_llm.invoke(prompt)

Data(people=[Person(name='Jeff', hair_color='black', height_in_meters=None), Person(name='Anna', hair_color='black', height_in_meters=None)])

In [39]:
#load text instead
from langchain_community.document_loaders import TextLoader
from langchain.text_splitter import TokenTextSplitter, RecursiveCharacterTextSplitter

loader = TextLoader(file_path="dummytext.txt")
docs = loader.load()
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100) #or TokenTextSplitter
documents = text_splitter.split_documents(documents=docs)

#see https://python.langchain.com/docs/how_to/extraction_long_text/ for more efficient parallel runs

In [None]:
#convert raw unstructured text into graph documents.
from langchain_experimental.graph_transformers import LLMGraphTransformer
llm_transformer = LLMGraphTransformer(llm=llm) #, ignore_tool_usage=True

#if filtering is required.
# llm_transformer_filtered = LLMGraphTransformer(
#     llm=llm,
#     allowed_nodes=["Person", "Country", "Organization"],
#     allowed_relationships=["NATIONALITY", "LOCATED_IN", "WORKED_AT", "SPOUSE"],
# )
# graph_documents_filtered = llm_transformer_filtered.convert_to_graph_documents(
#     documents
# )
# print(f"Nodes:{graph_documents_filtered[0].nodes}")
# print(f"Relationships:{graph_documents_filtered[0].relationships}")

graph_documents = llm_transformer.convert_to_graph_documents(documents); 

In [21]:
#view the contents
print(f"graph_documents:{graph_documents[0]}")
print(f"Nodes:{graph_documents[0].nodes}")
print(f"Relationships:{graph_documents[0].relationships}")

doc = graph_documents[0:2]
non_empty_nodes = [doc for doc in doc if doc.nodes]
non_empty_rel = [doc for doc in doc if doc.relationships]

# for graph_document in non_empty_nodes:
#     print(graph_document.nodes)

for doc in non_empty_rel:
    print(doc.relationships)

graph_documents:nodes=[Node(id='Maria', type='Person', properties={}), Node(id='Caruso family', type='Family', properties={}), Node(id='Giovanni Caruso', type='Person', properties={}), Node(id='culinary heritage', type='Culinary Tradition', properties={}), Node(id='Sicily', type='Location', properties={}), Node(id='Santa Caterina', type='Location', properties={})] relationships=[Relationship(source=Node(id='Giovanni Caruso', type='Person', properties={}), target=Node(id='Caruso family', type='Family', properties={}), type='BELONGS_TO', properties={}), Relationship(source=Node(id='Maria', type='Person', properties={}), target=Node(id='Caruso family', type='Family', properties={}), type='BELONGS_TO', properties={}), Relationship(source=Node(id='Santa Caterina', type='Location', properties={}), target=Node(id='Sicily', type='Location', properties={}), type='LOCATED_IN', properties={}), Relationship(source=Node(id='Caruso family', type='Family', properties={}), target=Node(id='culinary her

In [22]:
import os
from neo4j import GraphDatabase, Driver

# Add contents into neo4j database
driver = GraphDatabase.driver(
        uri = os.environ["NEO4J_URI"],
        auth = (os.environ["NEO4J_USERNAME"], os.environ["NEO4J_PASSWORD"])
    )

def clear_database(tx):
    tx.run("""
        MATCH (n)
        DETACH DELETE n
    """)    

with driver.session() as session:
    session.execute_write(clear_database)

graph.add_graph_documents(
    graph_documents,
    baseEntityLabel=True,
    include_source=True
)

In [None]:
from yfiles_jupyter_graphs import GraphWidget
from neo4j import GraphDatabase, Driver

default_cypher = "MATCH (s)-[r]->(t) WHERE type(r) <> 'MENTIONS'RETURN s,r,t"

def showGraph(cypher: str = default_cypher):
    # create a neo4j session to run queries
    driver = GraphDatabase.driver(
        uri = os.environ["NEO4J_URI"],
        auth = (os.environ["NEO4J_USERNAME"],
                os.environ["NEO4J_PASSWORD"]))
    session = driver.session()
    widget = GraphWidget(graph = session.run(cypher).graph())
    widget.node_label_mapping = 'id'
    #display(widget)
    return widget

showGraph()

GraphWidget(layout=Layout(height='800px', width='100%'))

In [34]:
#Display Neo4j Database Schema
# https://python.langchain.com/docs/tutorials/graph/
graph.refresh_schema()
# print(graph.schema)
enhanced_graph = Neo4jGraph(enhanced_schema=True)
#more details
print(enhanced_graph.schema)



Node properties:
- **Document**
  - `id`: STRING Example: "3d712ead111d00139576a93246cf5f25"
  - `source`: STRING Available options: ['dummytext.txt']
  - `text`: STRING Example: "1. The Story of Amico’s Family: A Legacy of Love a"
- **Person**
  - `id`: STRING Example: "Maria"
- **Location**
  - `id`: STRING Available options: ['Sicily', 'Santa Caterina', 'Amalfi Coast', 'New York City', 'picturesque village of Santa Caterina', 'bustling streets of New York City']
- **Family**
  - `id`: STRING Available options: ['Caruso family', 'Antonio, Giovanni and Maria', 'their children', 'family', 'Caruso Family', 'The Carusos']
- **Concept**
  - `id`: STRING Available options: ['Tradition', 'environmental sustainability', 'benefits of consuming seasonal and organic produce']
- **Food**
  - `id`: STRING Available options: ['organic salads', 'Cannoli', 'gluten-free pasta', 'Lasagna', 'small plates', 'nutritious food', 'traditional Sicilian dishes', 'other international cuisines']
- **Restaurant*

In [35]:
# https://python.langchain.com/docs/tutorials/graph/

from langchain_neo4j import GraphCypherQAChain

chain = GraphCypherQAChain.from_llm(
    graph=enhanced_graph, llm=llm, verbose=True, allow_dangerous_requests=True
)
response = chain.invoke({"query": "Who is Lucia"})
response



[1m> Entering new GraphCypherQAChain chain...[0m


ValueError: Failed to parse a response from mistral:7b-instruct-v0.3-q8_0 output: {
    "Lucia": {
      "type": "Family",
      "name": "Lucia"
    }
  }

In [None]:
# his method configures keyword and vector search indexes for a hybrid search approach, targeting nodes labeled Document. Additionally, it calculates text embedding values if they are missing.
#  The vector index can then be called with the similarity_search method.
#     
from langchain_community.vectorstores import Neo4jVector
from langchain_community.vectorstores.neo4j_vector import remove_lucene_chars

embeddings = OllamaEmbeddings(
    model="mxbai-embed-large",
)

# from langchain_community.llms import LlamaCppEmbeddings
# embeddings = LlamaCppEmbeddings(model_path="/home/ubuntu/repos/gguf/mxbai-embed-large-v1-f16.gguf",
#                               n_ctx=4096,
#                               n_gpu_layers=-1,
#                             #   n_threads=8,
#                             #   n_batch=1000
#                               )

vector_index = Neo4jVector.from_existing_graph(
    embeddings,
    search_type="hybrid",
    node_label="Document",
    text_node_properties=["text"],
    embedding_node_property="embedding"
)

vector_retriever = vector_index.as_retriever()

In [None]:
# The graph retriever starts by identifying relevant entities in the input. For simplicity, we instruct the LLM to identify people, organizations, and locations. To achieve this, we will use LCEL with the newly added with_structured_output method to achieve this.

# Extract entities from text
class Entities(BaseModel):
    """Identifying information about entities."""

    names: List[str] = Field(
        ...,
        description="All the person, organization, or business entities that "
        "appear in the text",
    )

prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            "You are extracting organization and person entities from the text.",
        ),
        (
            "human",
            "Use the given format to extract information from the following "
            "input: {question}",
        ),
    ]
)

entity_chain = prompt | llm.with_structured_output(Entities)

 Great, now that we can detect entities in the question, let's use a full-text index to map them to the knowledge graph. First, we need to define a full-text index and a function that will generate full-text queries that allow a bit of misspelling, which we won't go into much detail here.

In [None]:
def generate_full_text_query(input: str) -> str:
    """
    Generate a full-text search query for a given input string.

    This function constructs a query string suitable for a full-text search.
    It processes the input string by splitting it into words and appending a
    similarity threshold (~2 changed characters) to each word, then combines
    them using the AND operator. Useful for mapping entities from user questions
    to database values, and allows for some misspelings.
    """
    full_text_query = ""
    words = [el for el in remove_lucene_chars(input).split() if el]
    for word in words[:-1]:
        full_text_query += f" {word}~2 AND"
    full_text_query += f" {words[-1]}~2"
    return full_text_query.strip()

graph.query(
    "CREATE FULLTEXT INDEX `fulltext_entity_id` IF NOT EXISTS FOR (e:__Entity__) ON EACH [e.id]")

# Fulltext index query
def structured_retriever(question: str) -> str:
    """
    Collects the neighborhood of entities mentioned
    in the question
    """
    result = ""
    entities = entity_chain.invoke(question)
    for entity in entities.names:
        response = graph.query(
            """CALL db.index.fulltext.queryNodes('fulltext_entity_id', $query, {limit:2})
            YIELD node,score
            CALL {
              WITH node
              MATCH (node)-[r:!MENTIONS]->(neighbor)
              RETURN node.id + ' - ' + type(r) + ' -> ' + neighbor.id AS output
              UNION ALL
              WITH node
              MATCH (node)<-[r:!MENTIONS]-(neighbor)
              RETURN neighbor.id + ' - ' + type(r) + ' -> ' +  node.id AS output
            }
            RETURN output LIMIT 50
            """,
            {"query": generate_full_text_query(entity)},
        )
        result += "\n".join([el['output'] for el in response])
    return result

The structured_retriever function starts by detecting entities in the user question. Next, it iterates over the detected entities and uses a Cypher template to retrieve the neighborhood of relevant nodes. Let's test it out!

In [None]:
print(structured_retriever("Who is Giuseppe Genco Russo"))

In [None]:
# combine the unstructured and graph retriever to create the final context that will be passed to an LLM.

def retriever(question: str):
    print(f"Search query: {question}")
    structured_data = structured_retriever(question)
    unstructured_data = [el.page_content for el in vector_index.similarity_search(question)]
    final_data = f"""Structured data:
{structured_data}
Unstructured data:
{"#Document ". join(unstructured_data)}
    """
    return final_data

In [None]:
#Follow up Question prompts
# Condense a chat history and follow-up question into a standalone question
_template = """Given the following conversation and a follow up question, rephrase the follow up question to be a standalone question,
in its original language.
Chat History:
{chat_history}
Follow Up Input: {question}
Standalone question:"""  # noqa: E501
CONDENSE_QUESTION_PROMPT = ChatPromptTemplate.from_template(_template)

def _format_chat_history(chat_history: List[Tuple[str, str]]) -> List:
    buffer = []
    for human, ai in chat_history:
        buffer.append(HumanMessage(content=human))
        buffer.append(AIMessage(content=ai))
    return buffer

_search_query = RunnableBranch(
    # If input includes chat_history, we condense it with the follow-up question
    (
        RunnableLambda(lambda x: bool(x.get("chat_history"))).with_config(
            run_name="HasChatHistoryCheck"
        ),  # Condense follow-up question and chat into a standalone_question
        RunnablePassthrough.assign(
            chat_history=lambda x: _format_chat_history(x["chat_history"])
        )
        | CONDENSE_QUESTION_PROMPT
        | ChatOllama(model = llm_model, temperature=0)
        | StrOutputParser(),
    ),
    # Else, we have no chat history, so just pass through the question
    RunnableLambda(lambda x : x["question"]),
)

In [None]:
template = """Answer the question based only on the following context:
{context}

Question: {question}
Use natural language and be concise.
Answer:"""
prompt = ChatPromptTemplate.from_template(template)

chain = (
    RunnableParallel(
        {
            "context": _search_query | retriever,
            "question": RunnablePassthrough(),
        }
    )
    | prompt
    | llm
    | StrOutputParser()
)

In [None]:
 chain.invoke({"question": "Who is most powerful mafia family?"})