In [8]:
# Import required libraries
import os
from pathlib import Path
from typing import List
from llama_index.core import SimpleDirectoryReader, VectorStoreIndex, StorageContext, Settings
from llama_index.vector_stores.lancedb import LanceDBVectorStore
from llama_index.embeddings.huggingface import HuggingFaceEmbedding

print("✅ Libraries imported successfully!")

✅ Libraries imported successfully!


In [9]:
# Configure LlamaIndex Settings (Using OpenRouter - No OpenAI API Key needed)
def setup_llamaindex_settings():
    """
    Configure LlamaIndex with local embeddings and OpenRouter for LLM.
    This solution uses local embeddings to avoid API costs.
    """
    # Check for OpenRouter API key (for future use, not needed for this basic assignment)
    api_key = os.getenv("OPENROUTER_API_KEY")
    if not api_key:
        print("ℹ️  OPENROUTER_API_KEY not found - that's OK for this assignment!")
        print("   This assignment only uses local embeddings for vector operations.")
    else:
        print("✅ OPENROUTER_API_KEY found - ready for future LLM operations!")
    
    # Configure local embeddings (no API key required)
    Settings.embed_model = HuggingFaceEmbedding(
        model_name="BAAI/bge-small-en-v1.5",
        trust_remote_code=True
    )
    
    print("✅ LlamaIndex configured with local embeddings")
    print("   Using BAAI/bge-small-en-v1.5 for document embeddings")

# Setup the configuration
setup_llamaindex_settings()


2025-09-21 09:30:19,687 - INFO - Load pretrained SentenceTransformer: BAAI/bge-small-en-v1.5


✅ OPENROUTER_API_KEY found - ready for future LLM operations!


2025-09-21 09:30:25,726 - INFO - 1 prompt is loaded, with the key: query


✅ LlamaIndex configured with local embeddings
   Using BAAI/bge-small-en-v1.5 for document embeddings


In [10]:
def load_documents_from_folder(folder_path: str):
    """
    Load documents from a folder using SimpleDirectoryReader.
    
    Args:
        folder_path (str): Path to the folder containing documents
        
    Returns:
        List of documents loaded from the folder
    """
    # Create SimpleDirectoryReader instance with recursive search
    reader = SimpleDirectoryReader(
        input_dir=folder_path,
        recursive=True
    )
    
    # Load and return documents
    documents = reader.load_data()
    return documents

# Test the function
test_folder = "../data"
documents = load_documents_from_folder(test_folder)
print(f"✅ Loaded {len(documents)} documents from {test_folder}")
print(f"   Using local embeddings - no OpenAI API key required!")


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


✅ Loaded 42 documents from ../data
   Using local embeddings - no OpenAI API key required!


In [11]:
def create_vector_store(db_path: str = "./vectordb", table_name: str = "documents"):
    """
    Create a LanceDB vector store for storing document embeddings.
    
    Args:
        db_path (str): Path where the vector database will be stored
        table_name (str): Name of the table in the vector database
        
    Returns:
        LanceDBVectorStore: Configured vector store
    """
    # Create the directory if it doesn't exist
    Path(db_path).mkdir(parents=True, exist_ok=True)
    
    # Create vector store
    vector_store = LanceDBVectorStore(
        uri=db_path,
        table_name=table_name
    )
    return vector_store

# Test the function
vector_store = create_vector_store("./assignment_vectordb")
print(f"✅ Vector store created successfully")




✅ Vector store created successfully


In [12]:
def create_vector_index(documents: List, vector_store):
    """
    Create a vector index from documents using the provided vector store.
    
    Args:
        documents: List of documents to index
        vector_store: LanceDB vector store to use for storage
        
    Returns:
        VectorStoreIndex: The created vector index
    """
    # Create storage context with vector store
    storage_context = StorageContext.from_defaults(vector_store=vector_store)
    
    # Create index from documents
    index = VectorStoreIndex.from_documents(
        documents, 
        storage_context=storage_context,
        show_progress=True
    )
    return index

# Test the function
index = create_vector_index(documents, vector_store)
print(f"✅ Vector index created successfully with {len(documents)} documents")


Parsing nodes: 100%|██████████| 42/42 [00:00<00:00, 144.00it/s]
Generating embeddings: 100%|██████████| 55/55 [00:02<00:00, 26.68it/s]
2025-09-21 09:30:37,613 - INFO - Create new table documents adding data.


✅ Vector index created successfully with 42 documents


[90m[[0m2025-09-21T04:00:37Z [33mWARN [0m lance::dataset::write::insert[90m][0m No existing dataset at /Users/ishandutta/Documents/code/ai-accelerator/Day_6/session_2/solutions/assignment_vectordb/documents.lance, it will be created


In [13]:
def search_documents(index, query: str, top_k: int = 3):
    """
    Search for relevant documents using the vector index.
    
    Args:
        index: Vector index to search
        query (str): Search query
        top_k (int): Number of top results to return
        
    Returns:
        List of retrieved document nodes
    """
    # Create retriever from index
    retriever = index.as_retriever(similarity_top_k=top_k)
    
    # Retrieve documents for the query
    results = retriever.retrieve(query)
    return results

# Test the function
test_query = "What are AI agents?"
results = search_documents(index, test_query, top_k=2)
print(f"✅ Found {len(results)} results for query: '{test_query}'")
for i, result in enumerate(results, 1):
    print(f"   Result {i}: {result.text[:100]}...")
    print(f"   Score: {result.score:.4f}")


2025-09-21 09:30:37,731 - INFO - query_type :, vector


✅ Found 2 results for query: 'What are AI agents?'
   Result 1: THE LANDSCAPE OF EMERGING AI AGENT ARCHITECTURES
FOR REASONING , PLANNING , AND TOOL CALLING : A S U...
   Score: 0.6240
   Result 2: task. In single agent patterns there is no feedback mechanism from other AI agents; however, there m...
   Score: 0.6195


In [14]:
# Demonstration: Complete pipeline with multiple queries
print("🚀 Complete Vector Database Pipeline Demonstration")
print("=" * 60)

# Test multiple search queries
search_queries = [
    "What are AI agents?",
    "How to evaluate agent performance?", 
    "Italian recipes and cooking",
    "Financial analysis and investment"
]

for query in search_queries:
    print(f"\n🔎 Query: '{query}'")
    results = search_documents(index, query, top_k=2)
    
    for i, result in enumerate(results, 1):
        print(f"   Result {i}: {result.text[:100]}...")
        print(f"   Score: {result.score:.4f}")

print("\n" + "=" * 60)
print("🎉 Solution completed successfully!")
print("   ✅ Documents loaded from folder")
print("   ✅ Vector store created with LanceDB")
print("   ✅ Vector index built from documents") 
print("   ✅ Semantic search implemented and tested")


2025-09-21 09:30:37,812 - INFO - query_type :, vector
2025-09-21 09:30:37,847 - INFO - query_type :, vector


🚀 Complete Vector Database Pipeline Demonstration

🔎 Query: 'What are AI agents?'
   Result 1: THE LANDSCAPE OF EMERGING AI AGENT ARCHITECTURES
FOR REASONING , PLANNING , AND TOOL CALLING : A S U...
   Score: 0.6240
   Result 2: task. In single agent patterns there is no feedback mechanism from other AI agents; however, there m...
   Score: 0.6195

🔎 Query: 'How to evaluate agent performance?'
   Result 1: steps, but the answers are limited to Yes/No responses [7]. As the industry continues to pivot towar...
   Score: 0.6757
   Result 2: system prompt for each agent can minimize excess chatter by prompting the agents not to engage in un...
   Score: 0.6320

🔎 Query: 'Italian recipes and cooking'


2025-09-21 09:30:37,880 - INFO - query_type :, vector
2025-09-21 09:30:37,902 - INFO - query_type :, vector


   Result 1: # 🍝 Classic Spaghetti Carbonara Recipe

## Ingredients
- 400g spaghetti pasta
- 4 large egg yolks
- ...
   Score: 0.6167
   Result 2: Spaghetti Carbonara, Italian, 20, Easy, Pasta, 450
Margherita Pizza, Italian, 45, Medium, Tomato, 32...
   Score: 0.6134

🔎 Query: 'Financial analysis and investment'
   Result 1: Stock, AAPL, Apple Inc, 10000, 12500, 25.0, Medium
Stock, GOOGL, Alphabet Inc, 8000, 9200, 15.0, Med...
   Score: 0.5600
   Result 2: A Comprehensive Survey of AI Agent Frameworks
and Their Applications in Financial Services
Satyadhar...
   Score: 0.5580

🎉 Solution completed successfully!
   ✅ Documents loaded from folder
   ✅ Vector store created with LanceDB
   ✅ Vector index built from documents
   ✅ Semantic search implemented and tested
