In [2]:
import os
from neo4j import GraphDatabase
from google.generativeai import GenerativeModel
import google.generativeai as genai
from langchain_google_genai import GoogleGenerativeAI
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.chains import RetrievalQA
import os
from dotenv import load_dotenv

In [3]:
# Load environment variables
load_dotenv()

# Neo4j connection setup from environment variables
NEO4J_URI = os.getenv("NEO4J_URI")
NEO4J_USER = os.getenv("NEO4J_USER")
NEO4J_PASSWORD = os.getenv("NEO4J_PASSWORD")
GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")

In [4]:
# Connect to Neo4j
driver = GraphDatabase.driver(NEO4J_URI, auth=(NEO4J_USER, NEO4J_PASSWORD))

In [11]:
def get_schema():
    with driver.session() as session:
        # Get node labels
        labels_query = """
        CALL db.labels() YIELD label
        RETURN collect(label) as labels
        """
        result = session.run(labels_query).single()
        labels = result['labels'] if result else []
        
        # Get relationship types
        rels_query = """
        CALL db.relationshipTypes() YIELD relationshipType
        RETURN collect(relationshipType) as relationships
        """
        result = session.run(rels_query).single()
        relationships = result['relationships'] if result else []
        
        # Get properties for each node label
        schema = {}
        for label in labels:
            props_query = f"""
            MATCH (n:{label})
            RETURN distinct keys(n) as properties
            LIMIT 1
            """
            result = session.run(props_query).single()
            properties = result['properties'] if result else []
            schema[label] = properties
            
        return {
            'node_labels': labels,
            'relationships': relationships,
            'node_properties': schema
        }

In [6]:
def extract_knowledge_base():
    with driver.session() as session:
        # Extract nodes and relationships as text
        query = """
        MATCH (n)
        OPTIONAL MATCH (n)-[r]->(m)
        RETURN n, r, m
        """
        results = session.run(query)
        
        # Convert to text chunks
        text_chunks = []
        for record in results:
            node1 = record['n']
            rel = record['r']
            node2 = record['m']
            
            if rel and node2:
                chunk = f"{dict(node1)} is {rel.type} {dict(node2)}"
            else:
                chunk = f"Node: {dict(node1)}"
            text_chunks.append(chunk)
            
        return text_chunks

In [7]:
# Initialize Gemini
GOOGLE_API_KEY = os.getenv('GOOGLE_API_KEY')
genai.configure(api_key=GOOGLE_API_KEY)

In [8]:
# Create embeddings and vector store
def create_vector_store(text_chunks):
    # Using 'all-MiniLM-L6-v2' - a lightweight but effective model
    embeddings = HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2",
        model_kwargs={'device': 'cpu'}
    )
    # better model (but slightly slower)
    
    # embeddings = HuggingFaceEmbeddings(
    #     model_name="sentence-transformers/all-mpnet-base-v2",
    #     model_kwargs={'device': 'cpu'}
    # )
    
    # a smaller, faster model:
    
    # embeddings = HuggingFaceEmbeddings(
    #     model_name="sentence-transformers/paraphrase-MiniLM-L3-v2",
    #     model_kwargs={'device': 'cpu'}
    # )
    
    vector_store = FAISS.from_texts(text_chunks, embeddings)
    return vector_store


In [9]:
# Setup RAG with Gemini
def setup_rag(vector_store):
    llm = GoogleGenerativeAI(model="gemini-pro", google_api_key=GOOGLE_API_KEY)
    qa_chain = RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=vector_store.as_retriever()
    )
    return qa_chain

In [13]:
# main functions

schema = get_schema()
schema

{'node_labels': ['User', 'Project', 'Tag', 'Entity'],
 'relationships': ['OWNS', 'TAGGED_WITH', 'FRIEND', 'STARRED'],
 'node_properties': {'User': ['password',
   'email',
   'username',
   'bio',
   'github_username',
   'name',
   'leetcode_username'],
  'Project': ['repo_link', 'description', 'title'],
  'Tag': ['name'],
  'Entity': []}}

In [14]:
# Extract knowledge base
text_chunks = extract_knowledge_base()

In [15]:
# Create vector store
vector_store = create_vector_store(text_chunks)

  embeddings = VertexAIEmbeddings()
Model_name will become a required arg for VertexAIEmbeddings starting from Feb-01-2024. Currently the default is set to textembedding-gecko@001


GoogleAuthError: 
Unable to authenticate your request.
Depending on your runtime environment, you can complete authentication by:
- if in local JupyterLab instance: `!gcloud auth login` 
- if in Colab:
    -`from google.colab import auth`
    -`auth.authenticate_user()`
- if in service account or other: please follow guidance in https://cloud.google.com/docs/authentication

In [16]:
# Setup RAG
qa_chain = setup_rag(vector_store)

NameError: name 'vector_store' is not defined

In [17]:
# Create system prompt with schema context
schema_context = f"Database Schema: {schema}\n\n"

In [18]:
# Example query
query = "Tell me about the database"
response = qa_chain.run(schema_context + query)
print(response)

NameError: name 'qa_chain' is not defined