In [13]:
# !pip install neo4j google-generativeai langchain-google-genai langchain-community faiss-cpu python-dotenv sentence-transformers


import os
from neo4j import GraphDatabase
from google.generativeai import GenerativeModel
import google.generativeai as genai
from langchain_google_genai import GoogleGenerativeAI

from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.chains import RetrievalQA
import os
from dotenv import load_dotenv

In [None]:
# Load environment variables
load_dotenv()

# Neo4j connection setup from environment variables

NEO4J_URI = os.getenv("NEO4J_URI")
NEO4J_USER = os.getenv("NEO4J_USER")
NEO4J_PASSWORD = os.getenv("NEO4J_PASSWORD")
GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")

In [15]:
# Connect to Neo4j
driver = GraphDatabase.driver(NEO4J_URI, auth=(NEO4J_USER, NEO4J_PASSWORD))

In [16]:
def get_schema():
    with driver.session() as session:
        # Get node labels
        labels_query = """
        CALL db.labels() YIELD label
        RETURN collect(label) as labels
        """
        result = session.run(labels_query).single()
        labels = result['labels'] if result else []

        # Get relationship types
        rels_query = """
        CALL db.relationshipTypes() YIELD relationshipType
        RETURN collect(relationshipType) as relationships
        """
        result = session.run(rels_query).single()
        relationships = result['relationships'] if result else []

        # Get properties for each node label
        schema = {}
        for label in labels:
            props_query = f"""
            MATCH (n:{label})
            RETURN distinct keys(n) as properties
            LIMIT 1
            """
            result = session.run(props_query).single()
            properties = result['properties'] if result else []
            schema[label] = properties

        return {
            'node_labels': labels,
            'relationships': relationships,
            'node_properties': schema
        }

In [17]:
def extract_knowledge_base():
    with driver.session() as session:
        # Extract nodes and relationships as text
        query = """
        MATCH (n)
        OPTIONAL MATCH (n)-[r]->(m)
        RETURN n, r, m
        """
        results = session.run(query)

        # Convert to text chunks
        text_chunks = []
        for record in results:
            node1 = record['n']
            rel = record['r']
            node2 = record['m']

            if rel and node2:
                chunk = f"{dict(node1)} is {rel.type} {dict(node2)}"
            else:
                chunk = f"Node: {dict(node1)}"
            text_chunks.append(chunk)

        return text_chunks

In [18]:
# Initialize Gemini
# GOOGLE_API_KEY = os.getenv('GOOGLE_API_KEY')
# genai.configure(api_key=GOOGLE_API_KEY)
os.environ["GOOGLE_API_KEY"] =GOOGLE_API_KEY

In [19]:
# Create embeddings and vector store
def create_vector_store(text_chunks):
    # Using 'all-MiniLM-L6-v2' - a lightweight but effective model
    embeddings = HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2",
        model_kwargs={'device': 'cpu'}
    )
    # better model (but slightly slower)

    # embeddings = HuggingFaceEmbeddings(
    #     model_name="sentence-transformers/all-mpnet-base-v2",
    #     model_kwargs={'device': 'cpu'}
    # )

    # a smaller, faster model:

    # embeddings = HuggingFaceEmbeddings(
    #     model_name="sentence-transformers/paraphrase-MiniLM-L3-v2",
    #     model_kwargs={'device': 'cpu'}
    # )

    vector_store = FAISS.from_texts(text_chunks, embeddings)
    return vector_store


In [20]:
# Setup RAG with Gemini
def setup_rag(vector_store):
    # llm = GoogleGenerativeAI(model="gemini-pro", google_api_key=GOOGLE_API_KEY)
    llm = GoogleGenerativeAI(model="gemini-pro",)
    qa_chain = RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=vector_store.as_retriever()
    )
    return qa_chain

In [21]:
# main functions

schema = get_schema()
schema

{'node_labels': ['User', 'Project', 'Tag', 'Entity'],
 'relationships': ['OWNS', 'TAGGED_WITH', 'FRIEND', 'STARRED', 'HAS_SKILL'],
 'node_properties': {'User': ['password',
   'email',
   'username',
   'bio',
   'github_username',
   'name',
   'leetcode_username',
   'profile_image',
   'skillset',
   'suggestions'],
  'Project': ['repo_link', 'description', 'title'],
  'Tag': ['name'],
  'Entity': []}}

In [22]:
# Extract knowledge base
text_chunks = extract_knowledge_base()

In [24]:
# Create vector store
vector_store = create_vector_store(text_chunks)

In [25]:
# Setup RAG
qa_chain = setup_rag(vector_store)

In [26]:
# Create system prompt with schema context
schema_context = f"Database Schema: {schema}\n\n"

In [27]:
# Example query
query = "Tell me about the database"
# query = "give me some data from the dataabase"
response = qa_chain.run(schema_context + query)
print(response)

  response = qa_chain.run(schema_context + query)


The database schema is as follows:

* **Node labels:** User, Project, Tag, Entity
* **Relationships:** OWNS, TAGGED_WITH, FRIEND, STARRED, HAS_SKILL
* **Node properties:**
    * User: password, email, username, bio, github_username, name, leetcode_username, profile_image, skillset, suggestions
    * Project: repo_link, description, title
    * Tag: name
    * Entity: []
