In [1]:
# Use the %pip magic so the package lands in the running kernel's environment,
# and pin the driver version this notebook was last executed with.
%pip install neo4j==5.28.2

Collecting neo4j
  Downloading neo4j-5.28.2-py3-none-any.whl.metadata (5.9 kB)
Downloading neo4j-5.28.2-py3-none-any.whl (313 kB)
Installing collected packages: neo4j
Successfully installed neo4j-5.28.2


In [2]:
# --- Core Libraries ---
import pandas as pd
import io
import numpy as np

# --- Libraries for Manual RAG Implementation ---
# pip install pandas sentence-transformers faiss-cpu torch transformers neo4j
from sentence_transformers import SentenceTransformer
import faiss
from transformers import pipeline
from neo4j import GraphDatabase

# --- Neo4j Connection Details ---
# Connection settings are read from environment variables so credentials do not
# have to be edited into (and shared with) the notebook. The literal fallbacks
# preserve the original local-development values when a variable is unset.
import os

NEO4J_URI = os.environ.get("NEO4J_URI", "bolt://localhost:7687")
NEO4J_USER = os.environ.get("NEO4J_USER", "neo4j")
# NOTE(review): the fallback password is kept only for backward compatibility
# with the local demo instance; set NEO4J_PASSWORD for anything else.
NEO4J_PASSWORD = os.environ.get("NEO4J_PASSWORD", "classic123")

class Neo4jGraph:
    """A thin wrapper for running Cypher queries against a Neo4j database.

    Instances can be used as context managers so that the underlying driver is
    closed even if the body raises::

        with Neo4jGraph(uri, user, password) as graph:
            graph.run_query("RETURN 1")
    """

    def __init__(self, uri, user, password):
        """Create a driver for ``uri`` authenticated with ``(user, password)``."""
        self._driver = GraphDatabase.driver(uri, auth=(user, password))

    def __enter__(self):
        """Return the wrapper itself for use in a ``with`` statement."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Close the driver on exit; exceptions from the body are not suppressed."""
        self.close()
        return False

    def close(self):
        """Close the underlying driver."""
        self._driver.close()

    def run_query(self, query, **params):
        """Run ``query`` in a fresh session and return all records as a list.

        Args:
            query: Cypher text to execute.
            **params: Query parameters, referenced as ``$name`` in ``query``.

        Returns:
            A list with every record produced by the query. The result is
            consumed before the session is closed.
        """
        with self._driver.session() as session:
            # Hand the parameters over as a single dict instead of re-expanding
            # them as keyword arguments, so a Cypher parameter that happens to
            # be called ``parameters`` cannot be mistaken for the argument of
            # ``session.run`` with the same name.
            return list(session.run(query, params))

def _to_native(value):
    """Return ``value`` as a built-in Python scalar if it is a NumPy scalar."""
    return value.item() if isinstance(value, np.generic) else value


def build_graph_in_neo4j(graph, df):
    """Clears the existing graph and rebuilds it from the DataFrame.

    WARNING: every node and relationship in the target database is deleted
    before the new graph is written.

    The graph written has three levels:
      * one ``GlobalSummary`` node with the overall totals,
      * one ``Region`` node per distinct region, linked from the summary with
        ``CONTAINS_REGION``,
      * one ``ProductSummary`` node per (region, product) pair, linked from its
        region with ``SOLD_PRODUCT``.

    Args:
        graph: Object exposing ``run_query(query, **params)``.
        df: DataFrame with the columns ``'Region'``, ``'Product'``,
            ``'Sale Amount'`` and ``'Units Sold'``.

    Raises:
        KeyError: If one of the required columns is missing from ``df``.
    """
    # Clear existing data
    print("Clearing existing graph data...")
    graph.run_query("MATCH (n) DETACH DELETE n")

    print("Building new graph in Neo4j...")

    # --- Create Root Node: Overall Summary ---
    # pandas reductions return NumPy scalars; convert them to built-in numbers
    # so the query parameters do not rely on the driver's NumPy support.
    total_sales = _to_native(df['Sale Amount'].sum())
    total_units = _to_native(df['Units Sold'].sum())
    summary_text = (
        f"Overall Summary: Total sales amounted to ${total_sales:,.2f} from a total of {total_units} units sold."
    )
    graph.run_query(
        "CREATE (:GlobalSummary {text: $text, total_sales: $sales, total_units: $units})",
        text=summary_text, sales=total_sales, units=total_units
    )

    totals = {'Sale Amount': 'sum', 'Units Sold': 'sum'}

    # --- Create Region Nodes and Relationships ---
    region_summary_df = df.groupby('Region').agg(totals).reset_index()
    for _, row in region_summary_df.iterrows():
        region = _to_native(row['Region'])
        sales = _to_native(row['Sale Amount'])
        units = _to_native(row['Units Sold'])
        region_text = f"Region Summary for {region}: Total sales were ${sales:,.2f} from {units} units sold."
        graph.run_query(
            """
            MATCH (g:GlobalSummary)
            CREATE (r:Region {name: $name, text: $text, total_sales: $sales, total_units: $units})
            CREATE (g)-[:CONTAINS_REGION]->(r)
            """,
            name=region, text=region_text, sales=sales, units=units
        )

    # --- Create Product-in-Region Nodes and Relationships ---
    prod_region_df = df.groupby(['Region', 'Product']).agg(totals).reset_index()
    for _, row in prod_region_df.iterrows():
        region = _to_native(row['Region'])
        product = _to_native(row['Product'])
        sales = _to_native(row['Sale Amount'])
        units = _to_native(row['Units Sold'])
        product_text = f"Product Summary for {product} in {region}: Sales were ${sales:,.2f} from {units} units sold."
        graph.run_query(
            """
            MATCH (r:Region {name: $region_name})
            CREATE (p:ProductSummary {
                product_name: $product, 
                region: $region_name, 
                text: $text, 
                total_sales: $sales, 
                total_units: $units
            })
            CREATE (r)-[:SOLD_PRODUCT]->(p)
            """,
            region_name=region, product=product, text=product_text, sales=sales, units=units
        )
    print("Graph build complete.")

def create_rag_with_neo4j(graph):
    """
    Creates the RAG pipeline using Neo4j as the knowledge store.

    The ``text`` property of every node is embedded with a SentenceTransformer
    model and stored in an in-memory FAISS L2 index. Answers are generated by
    a local ``gpt2`` text-generation pipeline.

    Args:
        graph: Object exposing ``run_query(query, **params)`` that returns
            records indexable by column name.

    Returns:
        A function ``answer_question(question, top_k=2)`` returning the tuple
        ``(answer, context_texts)``, where ``context_texts`` is the list of
        node texts that were placed in the prompt.

    Raises:
        ValueError: If the graph contains no node with a ``text`` property.
    """
    # 1. Fetch all text nodes from Neo4j to embed them
    print("Fetching node data from Neo4j for embedding...")
    # Nodes without a text property cannot be embedded, so they are skipped.
    results = graph.run_query(
        "MATCH (n) WHERE n.text IS NOT NULL RETURN elementId(n) AS id, n.text AS text"
    )
    if not results:
        raise ValueError("No nodes with a 'text' property found in Neo4j; build the graph first.")
    node_ids = [record['id'] for record in results]
    all_docs_text = [record['text'] for record in results]

    # 2. Embed and Index using SentenceTransformer and FAISS
    print("Loading embedding model and creating FAISS index...")
    embedding_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
    doc_embeddings = embedding_model.encode(all_docs_text, convert_to_tensor=False)
    index = faiss.IndexFlatL2(doc_embeddings.shape[1])
    index.add(np.array(doc_embeddings, dtype=np.float32))
    print("FAISS index created.")

    # 3. LLM Setup
    print("Setting up local LLM...")
    generator = pipeline("text-generation", model="gpt2", max_new_tokens=256)

    answer_marker = "Helpful Answer:"

    # 4. RAG Query Function
    def answer_question(question, top_k=2):
        """Answer ``question`` from the ``top_k`` nearest nodes and their parents."""
        if top_k < 1:
            raise ValueError("top_k must be at least 1")
        # FAISS pads the result with -1 when asked for more neighbours than are
        # indexed; as a Python index that would silently select the last node.
        k = min(top_k, len(node_ids))
        query_embedding = embedding_model.encode([question])
        _, indices = index.search(np.array(query_embedding, dtype=np.float32), k)

        # Get the IDs of the best-matching nodes from the vector search
        matched_node_ids = [node_ids[i] for i in indices[0] if i >= 0]

        # **Hierarchical Context Retrieval using Cypher**
        # For each matched node, get the node itself and its parent.
        context_query = """
        UNWIND $node_ids AS nodeId
        MATCH (n) WHERE elementId(n) = nodeId
        OPTIONAL MATCH (p)-[]->(n) // Find parent
        RETURN n.text AS text, p.text AS parent_text
        """
        context_results = graph.run_query(context_query, node_ids=matched_node_ids)

        # Assemble context, avoiding duplicates. A dict keeps first-seen order,
        # so the same query results always produce the same prompt.
        ordered_texts = {}
        for record in context_results:
            ordered_texts.setdefault(record['text'], None)
            if record['parent_text']:
                ordered_texts.setdefault(record['parent_text'], None)
        context_texts = list(ordered_texts)

        context = "\n\n".join(context_texts)
        prompt_template = f"Context:\n{context}\n\nQuestion: {question}\n\n{answer_marker}"

        generated_text = generator(prompt_template)[0]['generated_text']
        if generated_text.startswith(prompt_template):
            # Strip the echoed prompt by length: splitting on the marker picks
            # the wrong segment when the marker occurs in the context/question.
            answer = generated_text[len(prompt_template):].strip()
        else:
            # Fallback: everything after the first marker, or the whole text
            # when the marker is absent (instead of raising IndexError).
            answer = generated_text.split(answer_marker, 1)[-1].strip()

        return answer, context_texts

    return answer_question

# --- Main Execution ---
if __name__ == "__main__":
    # Small inline sample data set so the demo is self-contained.
    csv_data = """Date,Region,Product,Units Sold,Sale Amount
1/5/2023,North,Laptop,10,12000
1/6/2023,North,Keyboard,50,5000
1/7/2023,South,Mouse,75,1875
2/10/2023,South,Laptop,5,6000
2/11/2023,West,Monitor,20,10000
3/15/2023,West,Mouse,30,750
3/16/2023,East,Keyboard,45,4500
"""
    df = pd.read_csv(io.StringIO(csv_data))

    # Connect to Neo4j and build the graph
    graph = Neo4jGraph(NEO4J_URI, NEO4J_USER, NEO4J_PASSWORD)
    try:
        build_graph_in_neo4j(graph, df)

        # Create the RAG query function
        rag_query_function = create_rag_with_neo4j(graph)

        # Query the RAG
        print("\n--- Querying the Neo4j-Powered Knowledge Tree RAG ---")
        questions = [
            "What were the total sales in the North region?",
            "Compare the sales performance of Laptops in the North vs. the South region.",
        ]
        for q in questions:
            print(f"\n> Question: {q}")
            answer, sources = rag_query_function(q)
            print(f"\n< Answer: {answer}")
            print("\n  -- Retrieved Context Sources from Neo4j --")
            for text in sources:
                print(f"  - {text}")
            print("-" * 30)
    finally:
        # Clean up the connection even if building or querying failed
        graph.close()



  from .autonotebook import tqdm as notebook_tqdm


Clearing existing graph data...
Building new graph in Neo4j...
Graph build complete.
Fetching node data from Neo4j for embedding...
Loading embedding model and creating FAISS index...
FAISS index created.
Setting up local LLM...


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Device set to use cpu



--- Querying the Neo4j-Powered Knowledge Tree RAG ---

> Question: What were the total sales in the North region?


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



< Answer: Total sales totaled $19,959.00 from 5,800 units sold.

Find a Home

How About a Boat? Can You Find a Home?

What Are the Differences in the Sales Levels by Province?

Can You Find Your Home in North Ontario? If so, what can we expect to see in sales and purchases?

More Information about The North End of Ontario, the West End of Ontario and Ontario of Ontario.

Awards

Famous for their Outlaws, Famous for its Beautiful and Glorious Coast, and Famous for their Famous Ship. More about The North End of Ontario, The West End of Ontario and Ontario of Ontario.

The Coast and the Ship have been a very good indicator of trade flow for several hundred years. As a North West Coast Ship and an East Coast Ship, the shipping will be very good as it's very small. For example, in a city known for its high quality and strong maritime resources, one might expect this to become a trend by the time the North West Coast Ship ships arrive. A few thousand vessels come to Toronto each year as the

In [None]:
# --- Core Libraries ---
import pandas as pd
import io
import numpy as np
import requests
import json

# --- Libraries for Manual RAG Implementation ---
# pip install pandas sentence-transformers faiss-cpu torch neo4j requests
from sentence_transformers import SentenceTransformer
import faiss
from neo4j import GraphDatabase

# --- Neo4j Connection Details ---
# Connection settings are read from environment variables so credentials do not
# have to be edited into (and shared with) the notebook. The literal fallbacks
# preserve the original local-development values when a variable is unset.
import os

NEO4J_URI = os.environ.get("NEO4J_URI", "bolt://localhost:7687")
NEO4J_USER = os.environ.get("NEO4J_USER", "neo4j")
# NOTE(review): the fallback password is kept only for backward compatibility
# with the local demo instance; set NEO4J_PASSWORD for anything else.
NEO4J_PASSWORD = os.environ.get("NEO4J_PASSWORD", "classic123")

# --- LM Studio Connection Details ---
# Make sure LM Studio is running and a model is loaded.
LM_STUDIO_URL = os.environ.get("LM_STUDIO_URL", "http://localhost:1234/v1/chat/completions")

class Neo4jGraph:
    """A thin wrapper for running Cypher queries against a Neo4j database.

    Instances can be used as context managers so that the underlying driver is
    closed even if the body raises::

        with Neo4jGraph(uri, user, password) as graph:
            graph.run_query("RETURN 1")
    """

    def __init__(self, uri, user, password):
        """Create a driver for ``uri`` authenticated with ``(user, password)``."""
        self._driver = GraphDatabase.driver(uri, auth=(user, password))

    def __enter__(self):
        """Return the wrapper itself for use in a ``with`` statement."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Close the driver on exit; exceptions from the body are not suppressed."""
        self.close()
        return False

    def close(self):
        """Close the underlying driver."""
        self._driver.close()

    def run_query(self, query, **params):
        """Run ``query`` in a fresh session and return all records as a list.

        Args:
            query: Cypher text to execute.
            **params: Query parameters, referenced as ``$name`` in ``query``.

        Returns:
            A list with every record produced by the query. The result is
            consumed before the session is closed.
        """
        with self._driver.session() as session:
            # Hand the parameters over as a single dict instead of re-expanding
            # them as keyword arguments, so a Cypher parameter that happens to
            # be called ``parameters`` cannot be mistaken for the argument of
            # ``session.run`` with the same name.
            return list(session.run(query, params))

def _to_native(value):
    """Return ``value`` as a built-in Python scalar if it is a NumPy scalar."""
    return value.item() if isinstance(value, np.generic) else value


def build_graph_in_neo4j(graph, df):
    """Clears the existing graph and rebuilds it from the DataFrame.

    WARNING: every node and relationship in the target database is deleted
    before the new graph is written.

    The graph written has three levels:
      * one ``GlobalSummary`` node with the overall totals,
      * one ``Region`` node per distinct region, linked from the summary with
        ``CONTAINS_REGION``,
      * one ``ProductSummary`` node per (region, product) pair, linked from its
        region with ``SOLD_PRODUCT``.

    Args:
        graph: Object exposing ``run_query(query, **params)``.
        df: DataFrame with the columns ``'Region'``, ``'Product'``,
            ``'Sale Amount'`` and ``'Units Sold'``.

    Raises:
        KeyError: If one of the required columns is missing from ``df``.
    """
    # Clear existing data
    print("Clearing existing graph data...")
    graph.run_query("MATCH (n) DETACH DELETE n")

    print("Building new graph in Neo4j...")

    # --- Create Root Node: Overall Summary ---
    # pandas reductions return NumPy scalars; convert them to built-in numbers
    # so the query parameters do not rely on the driver's NumPy support.
    total_sales = _to_native(df['Sale Amount'].sum())
    total_units = _to_native(df['Units Sold'].sum())
    summary_text = (
        f"Overall Summary: Total sales amounted to ${total_sales:,.2f} from a total of {total_units} units sold."
    )
    graph.run_query(
        "CREATE (:GlobalSummary {text: $text, total_sales: $sales, total_units: $units})",
        text=summary_text, sales=total_sales, units=total_units
    )

    totals = {'Sale Amount': 'sum', 'Units Sold': 'sum'}

    # --- Create Region Nodes and Relationships ---
    region_summary_df = df.groupby('Region').agg(totals).reset_index()
    for _, row in region_summary_df.iterrows():
        region = _to_native(row['Region'])
        sales = _to_native(row['Sale Amount'])
        units = _to_native(row['Units Sold'])
        region_text = f"Region Summary for {region}: Total sales were ${sales:,.2f} from {units} units sold."
        graph.run_query(
            """
            MATCH (g:GlobalSummary)
            CREATE (r:Region {name: $name, text: $text, total_sales: $sales, total_units: $units})
            CREATE (g)-[:CONTAINS_REGION]->(r)
            """,
            name=region, text=region_text, sales=sales, units=units
        )

    # --- Create Product-in-Region Nodes and Relationships ---
    prod_region_df = df.groupby(['Region', 'Product']).agg(totals).reset_index()
    for _, row in prod_region_df.iterrows():
        region = _to_native(row['Region'])
        product = _to_native(row['Product'])
        sales = _to_native(row['Sale Amount'])
        units = _to_native(row['Units Sold'])
        product_text = f"Product Summary for {product} in {region}: Sales were ${sales:,.2f} from {units} units sold."
        graph.run_query(
            """
            MATCH (r:Region {name: $region_name})
            CREATE (p:ProductSummary {
                product_name: $product, 
                region: $region_name, 
                text: $text, 
                total_sales: $sales, 
                total_units: $units
            })
            CREATE (r)-[:SOLD_PRODUCT]->(p)
            """,
            region_name=region, product=product, text=product_text, sales=sales, units=units
        )
    print("Graph build complete.")

def create_rag_with_neo4j(graph):
    """
    Creates the RAG pipeline using Neo4j as the knowledge store.

    The ``text`` property of every node is embedded with a SentenceTransformer
    model and stored in an in-memory FAISS L2 index. Answers are generated by
    POSTing a chat-completion request to ``LM_STUDIO_URL``.

    Args:
        graph: Object exposing ``run_query(query, **params)`` that returns
            records indexable by column name.

    Returns:
        A function ``answer_question(question, top_k=2, timeout=120)``
        returning the tuple ``(answer, context_texts)``. When the LLM request
        fails, ``answer`` is an error message instead of an exception.

    Raises:
        ValueError: If the graph contains no node with a ``text`` property.
    """
    # 1. Fetch all text nodes from Neo4j to embed them
    print("Fetching node data from Neo4j for embedding...")
    # Nodes without a text property cannot be embedded, so they are skipped.
    results = graph.run_query(
        "MATCH (n) WHERE n.text IS NOT NULL RETURN elementId(n) AS id, n.text AS text"
    )
    if not results:
        raise ValueError("No nodes with a 'text' property found in Neo4j; build the graph first.")
    node_ids = [record['id'] for record in results]
    all_docs_text = [record['text'] for record in results]

    # 2. Embed and Index using SentenceTransformer and FAISS
    print("Loading embedding model and creating FAISS index...")
    embedding_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
    doc_embeddings = embedding_model.encode(all_docs_text, convert_to_tensor=False)
    index = faiss.IndexFlatL2(doc_embeddings.shape[1])
    index.add(np.array(doc_embeddings, dtype=np.float32))
    print("FAISS index created.")

    # 3. RAG Query Function (LLM part is inside)
    def answer_question(question, top_k=2, timeout=120):
        """Answer ``question`` from the ``top_k`` nearest nodes and their parents.

        ``timeout`` is the number of seconds to wait for LM Studio before
        giving up; without it a stalled server would block the cell forever.
        """
        if top_k < 1:
            raise ValueError("top_k must be at least 1")
        # FAISS pads the result with -1 when asked for more neighbours than are
        # indexed; as a Python index that would silently select the last node.
        k = min(top_k, len(node_ids))
        query_embedding = embedding_model.encode([question])
        _, indices = index.search(np.array(query_embedding, dtype=np.float32), k)

        # Get the IDs of the best-matching nodes from the vector search
        matched_node_ids = [node_ids[i] for i in indices[0] if i >= 0]

        # **Hierarchical Context Retrieval using Cypher**
        # For each matched node, get the node itself and its parent.
        context_query = """
        UNWIND $node_ids AS nodeId
        MATCH (n) WHERE elementId(n) = nodeId
        OPTIONAL MATCH (p)-[]->(n) // Find parent
        RETURN n.text AS text, p.text AS parent_text
        """
        context_results = graph.run_query(context_query, node_ids=matched_node_ids)

        # De-duplicate while keeping first-seen order, so the same query
        # results always produce the same prompt.
        ordered_texts = {}
        for record in context_results:
            ordered_texts.setdefault(record['text'], None)
            if record['parent_text']:
                ordered_texts.setdefault(record['parent_text'], None)
        context_texts = list(ordered_texts)

        context = "\n\n".join(context_texts)

        # --- LLM Generation using LM Studio ---
        system_prompt = "You are a helpful assistant that answers questions based ONLY on the provided context. If the answer is not in the context, say that you don't know."
        user_prompt = f"Context:\n{context}\n\nQuestion: {question}\n\nHelpful Answer:"

        payload = {
            "model": "local-model", # This is a placeholder in LM Studio
            "messages": [
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt}
            ],
            "temperature": 0.7,
            "max_tokens": 256,
            "stream": False
        }

        try:
            response = requests.post(
                LM_STUDIO_URL,
                json=payload,
                headers={"Content-Type": "application/json"},
                timeout=timeout,
            )
            response.raise_for_status()
            result = response.json()
            answer = result['choices'][0]['message']['content'].strip()
        except requests.exceptions.RequestException as e:
            answer = f"Error communicating with LM Studio: {e}"
        except (KeyError, IndexError, TypeError, ValueError, AttributeError) as e:
            # The server replied, but not with the expected chat-completion
            # shape (missing keys, empty choices, null content, invalid JSON).
            answer = f"Unexpected response from LM Studio: {e!r}"

        return answer, context_texts

    return answer_question

# --- Main Execution ---
if __name__ == "__main__":
    # Small inline sample data set so the demo is self-contained.
    csv_data = """Date,Region,Product,Units Sold,Sale Amount
1/5/2023,North,Laptop,10,12000
1/6/2023,North,Keyboard,50,5000
1/7/2023,South,Mouse,75,1875
2/10/2023,South,Laptop,5,6000
2/11/2023,West,Monitor,20,10000
3/15/2023,West,Mouse,30,750
3/16/2023,East,Keyboard,45,4500
"""
    df = pd.read_csv(io.StringIO(csv_data))

    # Connect to Neo4j and build the graph
    graph = Neo4jGraph(NEO4J_URI, NEO4J_USER, NEO4J_PASSWORD)
    try:
        build_graph_in_neo4j(graph, df)

        # Create the RAG query function
        rag_query_function = create_rag_with_neo4j(graph)

        # Query the RAG
        print("\n--- Querying the Neo4j-Powered Knowledge Tree RAG ---")
        questions = [
            "What were the total sales in the North region?",
            "Compare the sales performance of Laptops in the North vs. the South region.",
        ]
        for q in questions:
            print(f"\n> Question: {q}")
            answer, sources = rag_query_function(q)
            print(f"\n< Answer: {answer}")
            print("\n  -- Retrieved Context Sources from Neo4j --")
            for text in sources:
                print(f"  - {text}")
            print("-" * 30)
    finally:
        # Clean up the connection even if building or querying failed
        graph.close()

