In [1]:
%pip install --upgrade --quiet  langchain langchain-community langchain-ollama langchain-experimental neo4j tiktoken yfiles_jupyter_graphs python-dotenv json-repair langchain-openai langchain_core pypdf

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.2 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [1]:
from langchain_core.runnables import  RunnablePassthrough
from langchain_core.prompts import ChatPromptTemplate
from pydantic import BaseModel, Field
from langchain_core.output_parsers import StrOutputParser
from langchain_community.graphs import Neo4jGraph
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.chat_models import ChatOllama
from langchain_experimental.graph_transformers import LLMGraphTransformer
from neo4j import GraphDatabase
from yfiles_jupyter_graphs import GraphWidget
from langchain_community.vectorstores import Neo4jVector
from langchain_community.document_loaders import TextLoader
from langchain_community.vectorstores.neo4j_vector import remove_lucene_chars
from langchain_ollama import OllamaEmbeddings
from langchain_ollama import ChatOllama
import os
from langchain_experimental.llms.ollama_functions import OllamaFunctions
from neo4j import  Driver

from dotenv import load_dotenv

load_dotenv()

True

In [4]:
graph = Neo4jGraph()

In [5]:
loader = TextLoader(file_path="dummytext.txt")
# loader = PyPDFLoader(file_path="Cosa_Nostra_-_John_Dickie.pdf")
docs = loader.load()

text_splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=200)
documents = text_splitter.split_documents(documents=docs)

In [None]:
# https://python.langchain.com/api_reference/_modules/langchain_experimental/graph_transformers/llm.html#create_unstructured_prompt

# llm = OllamaFunctions(model="llama3.1:8b-instruct-q5_K_M", temperature=0, format="json")
# llm = OllamaFunctions(model="mistral:7b-instruct-v0.3-q8_0", temperature=0, format="json")
# llm = ChatOllama(model="mistral:7b-instruct-v0.3-q6_K", temperature=0, format="json")
llm = ChatOllama(model="llama3.1:8b-instruct-q5_K_M", temperature=0, format="json")
# llm = OllamaFunctions(model="mistral:7b-instruct-v0.3-q6_K", temperature=0, format="json")

llm_transformer = LLMGraphTransformer(llm=llm)

graph_documents = llm_transformer.convert_to_graph_documents(documents)

## customising nodes and relationships https://python.langchain.com/v0.2/docs/how_to/graph_constructing/
# llm_transformer_filtered = LLMGraphTransformer(
#     llm=llm,
#     allowed_nodes=["Person", "Country", "Organization"],
#     allowed_relationships=["NATIONALITY", "LOCATED_IN", "WORKED_AT", "SPOUSE"],
#     node_properties=["born_year"],
# )
# graph_documents_filtered = llm_transformer_filtered.convert_to_graph_documents(documents)


In [24]:
graph_documents[0]

GraphDocument(nodes=[Node(id='Giovanni Caruso', type='Person', properties={}), Node(id='Maria', type='Person', properties={}), Node(id='Amico', type='Person', properties={}), Node(id='Santa Caterina', type='Location', properties={}), Node(id='Sicily', type='Location', properties={})], relationships=[], source=Document(metadata={'source': 'dummytext.txt', 'id': '74f7336c31d83fc11faeeb6ab4cbca9f'}, page_content="1. The Story of Amicoâ€™s Family: A Legacy of Love and Tradition\nIn the idyllic village of Santa Caterina, amidst the rolling hills and sun-kissed landscapes of Sicily, lies the genesis of the Caruso family, a lineage intertwined with the island's rich culinary tapestry. The Carusos were not mere inhabitants of the land; they were the keepers of a culinary heritage that spanned generations. Each family member contributed their unique flair, crafting a narrative of flavors that reflected their diverse experiences and deep-seated love for food.\n\nGiovanni Caruso and Maria: The Fo

In [25]:
# clear before adding more
driver = GraphDatabase.driver(
        uri = os.environ["NEO4J_URI"],
        auth = (os.environ["NEO4J_USERNAME"], os.environ["NEO4J_PASSWORD"])
    )

def clear_specific_source(tx):
    tx.run("""
        MATCH (n)
        WHERE n.source = 'dummytext.txt'
        DETACH DELETE n
    """)

with driver.session() as session:
    session.execute_write(clear_specific_source)

In [26]:
graph.add_graph_documents(
    graph_documents,
    baseEntityLabel=True,
    include_source=True
)

In [27]:
def showGraph():
    driver = GraphDatabase.driver(
        uri = os.environ["NEO4J_URI"],
        auth = (os.environ["NEO4J_USERNAME"], os.environ["NEO4J_PASSWORD"])
    )
    session = driver.session()
    widget = GraphWidget(graph = session.run("MATCH (s)-[r:!MENTIONS] ->(t) RETURN s,r,t").graph())
    widget.mode_label_mapping = 'id'
    return widget
showGraph()


GraphWidget(layout=Layout(height='760px', width='100%'))

In [28]:
embeddings = OllamaEmbeddings(
    model="mxbai-embed-large",
)

vector_index = Neo4jVector.from_existing_graph(
    embeddings,
    search_type="hybrid",
    node_label="Document",
    text_node_properties=["text"],
    embedding_node_property="embedding"
)
vector_retriever = vector_index.as_retriever()

In [29]:
#clear before adding more embedding

def clear_specific_source_embeddings(tx, source_file):
    # Clear nodes with embeddings from specific source
    query = '''
    MATCH (n:__Entity__)
    WHERE n.source = $source
    DETACH DELETE n
    '''
    tx.run(query, source=source_file)

def check_embeddings(tx, source_file=None):
    if source_file:
        # Check embeddings for specific source
        query = '''
        MATCH (n:__Entity__)
        WHERE n.source = $source
        RETURN count(n) as count
        '''
        result = tx.run(query, source=source_file)
    else:
        # Check all embeddings
        query = '''
        MATCH (n:__Entity__)
        RETURN n.source, count(n) as count
        '''
        result = tx.run(query)
    return list(result)

# Execute operations
with driver.session() as session:
    # Check before clearing
    source_file = "dummytext.txt"  # your specific source
    
    print("Before clearing:")
    counts = session.execute_read(check_embeddings, source_file)
    print(f"Embeddings for {source_file}:", counts)
    
    # Clear specific source
    session.execute_write(clear_specific_source_embeddings, source_file)
    print(f"\nCleared embeddings for {source_file}")
    
    # Verify after clearing
    print("\nAfter clearing:")
    counts = session.execute_read(check_embeddings, source_file)
    print(f"Embeddings for {source_file}:", counts)
    
    # Optionally check all remaining sources
    print("\nAll remaining sources:")
    all_counts = session.execute_read(check_embeddings)
    for record in all_counts:
        print(f"Source: {record['n.source']}, Count: {record['count']}")

driver.close()

Before clearing:
Embeddings for dummytext.txt: [<Record count=0>]

Cleared embeddings for dummytext.txt

After clearing:
Embeddings for dummytext.txt: [<Record count=0>]

All remaining sources:
Source: None, Count: 48


In [30]:
driver = GraphDatabase.driver(
        uri = os.environ["NEO4J_URI"],
        auth = (os.environ["NEO4J_USERNAME"],
                os.environ["NEO4J_PASSWORD"]))

def create_fulltext_index(tx):
    query = '''
    CREATE FULLTEXT INDEX `fulltext_entity_id` 
    FOR (n:__Entity__) 
    ON EACH [n.id];
    '''
    tx.run(query)

# Function to execute the query
def create_index():
    with driver.session() as session:
        session.execute_write(create_fulltext_index)
        print("Fulltext index created successfully.")

# Call the function to create the index
try:
    create_index()
except:
    pass

# Close the driver connection
driver.close()

In [31]:

class Entities(BaseModel):
    """Identifying information about entities."""

    names: list[str] = Field(
        ...,
        description="All the person, organization, or business entities that "
        "appear in the text",
    )

prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            "You are extracting organization and person entities from the text.",
        ),
        (
            "human",
            "Use the given format to extract information from the following "
            "input: {question}",
        ),
    ]
)


entity_chain = llm.with_structured_output(Entities)

In [35]:
entity_chain.invoke("Who are Nonna Lucia and Giovanni Caruso?")

ValidationError: 1 validation error for Entities
names
  Input should be a valid list [type=list_type, input_value='["Nonna Lucia", "Giovanni Caruso"]', input_type=str]
    For further information visit https://errors.pydantic.dev/2.9/v/list_type

In [14]:
def generate_full_text_query(input: str) -> str:
    words = [el for el in remove_lucene_chars(input).split() if el]
    if not words:
        return ""
    full_text_query = " AND ".join([f"{word}~2" for word in words])
    print(f"Generated Query: {full_text_query}")
    return full_text_query.strip()


# Fulltext index query
def graph_retriever(question: str) -> str:
    """
    Collects the neighborhood of entities mentioned
    in the question
    """
    result = ""
    entities = entity_chain.invoke(question)
    for entity in entities.names:
        response = graph.query(
            """CALL db.index.fulltext.queryNodes('fulltext_entity_id', $query, {limit:2})
            YIELD node,score
            CALL {
              WITH node
              MATCH (node)-[r:!MENTIONS]->(neighbor)
              RETURN node.id + ' - ' + type(r) + ' -> ' + neighbor.id AS output
              UNION ALL
              WITH node
              MATCH (node)<-[r:!MENTIONS]-(neighbor)
              RETURN neighbor.id + ' - ' + type(r) + ' -> ' +  node.id AS output
            }
            RETURN output LIMIT 50
            """,
            {"query": entity},
        )
        result += "\n".join([el['output'] for el in response])
    return result

In [15]:
print(graph_retriever("Who is Nonna Lucia?"))

AttributeError: 'NoneType' object has no attribute 'names'

In [4]:
def full_retriever(question: str):
    graph_data = graph_retriever(question)
    vector_data = [el.page_content for el in vector_retriever.invoke(question)]
    final_data = f"""Graph data:
{graph_data}
vector data:
{"#Document ". join(vector_data)}
    """
    return final_data

In [15]:
template = """Answer the question based only on the following context:
{context}

Question: {question}
Use natural language and be concise.
Answer:"""
prompt = ChatPromptTemplate.from_template(template)

chain = (
        {
            "context": full_retriever,
            "question": RunnablePassthrough(),
        }
    | prompt
    | llm
    | StrOutputParser()
)

In [16]:
chain.invoke(input="Who is Nonna Lucia? Did she teach anyone about restaurants or cooking?")

'Nonna Lucia was the matriarch of the Caruso family, a culinary sage who taught her grandchildren, including Amico, the art of Sicilian cooking. She was a pillar of strength and wisdom, guiding her family with love and a firm hand.'