# LangChain App - Vector Database Management
This notebook provides an interface for managing the Chroma vector database through LangChain. You can add, remove, list, and search documents in the vector database.

## Setup

In [1]:
from langchain_google_genai import ChatGoogleGenerativeAI, GoogleGenerativeAIEmbeddings
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain
from langchain.schema import Document
from langchain.prompts import PromptTemplate
from langchain.tools import Tool
from langchain.agents import initialize_agent, AgentType
from langchain_community.vectorstores import Chroma
from langchain_community.document_loaders import TextLoader, DirectoryLoader
import os
from datetime import datetime
import logging
import json
from pathlib import Path



# Read the Google API key from the environment and prompt for it only when it
# is missing. Never commit a literal key to the notebook: the key that used to
# be hard-coded here is exposed and must be revoked and rotated.
import getpass

if not os.environ.get("GOOGLE_API_KEY"):
    os.environ["GOOGLE_API_KEY"] = getpass.getpass("Enter your Google AI API key: ")

# Initialize the language model
llm = ChatGoogleGenerativeAI(
    model="gemini-2.0-flash-exp",
    temperature=1.0,
    top_p=0.95,
    top_k=40,
    max_output_tokens=8192,
)

# Initialize embeddings
embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")

# Initialize ChromaDB; PERSIST_DIRECTORY is handed to Chroma as its persist directory
PERSIST_DIRECTORY = "db"
vectorstore = Chroma(persist_directory=PERSIST_DIRECTORY, embedding_function=embeddings)

# JSON file that maps each added source file to the chunk IDs stored for it
DB_TRACKING_FILE = "db_files.json"

  from .autonotebook import tqdm as notebook_tqdm
  vectorstore = Chroma(persist_directory=PERSIST_DIRECTORY, embedding_function=embeddings)


## Database management

In [None]:
# Function to load the database tracking information
def load_db_tracking():
    """Return the tracking data stored in ``DB_TRACKING_FILE``.

    Returns:
        The object parsed from the JSON tracking file, or an empty dict when
        the file does not exist.
    """
    if not os.path.exists(DB_TRACKING_FILE):
        # Nothing has been tracked yet.
        return {}
    with open(DB_TRACKING_FILE, 'r') as tracking_file:
        return json.load(tracking_file)

# Function to save the database tracking information
def save_db_tracking(tracking_data):
    """Write ``tracking_data`` to ``DB_TRACKING_FILE`` as indented JSON.

    The data is serialized before the file is opened for writing, so a
    serialization failure leaves the existing tracking file untouched
    instead of truncated.

    Args:
        tracking_data: JSON-serializable tracking mapping.

    Raises:
        TypeError, ValueError: Propagated from ``json.dumps`` when
            ``tracking_data`` cannot be serialized.
    """
    serialized = json.dumps(tracking_data, indent=2)
    with open(DB_TRACKING_FILE, 'w') as f:
        f.write(serialized)

In [3]:

# Function to add a single file to the vector database and track its IDs
from langchain.text_splitter import RecursiveCharacterTextSplitter

def add_file(file_path):
    """Split one text file into chunks, store them, and track their IDs.

    The file is loaded with ``TextLoader`` and split into chunks of 1000
    characters with 200 characters of overlap. The IDs returned by the vector
    store are recorded in the tracking file under the file's resolved path.

    Re-adding a file that is already tracked replaces its chunks: once the
    new chunks are stored, the previously tracked IDs are deleted. Without
    this, the old chunks would stay in the store with no tracking entry.

    Args:
        file_path: Path of the text file to add.

    Returns:
        str: ``"Added N chunks from <file_path>"`` on success, or
        ``"Error adding file: <reason>"`` when any step raises.
    """
    try:
        tracking_data = load_db_tracking()
        resolved_path = str(Path(file_path).resolve())

        loader = TextLoader(file_path)
        documents = loader.load()
        text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
        texts = text_splitter.split_documents(documents)

        # Store the new chunks first so a failure here leaves the old ones intact.
        ids = vectorstore.add_documents(texts)

        previous_ids = tracking_data.get(resolved_path, {}).get('chunk_ids', [])
        if previous_ids:
            vectorstore.delete(previous_ids)
        vectorstore.persist()

        tracking_data[resolved_path] = {
            'chunk_ids': ids,
            'added_date': datetime.now().isoformat()
        }
        save_db_tracking(tracking_data)
        return f"Added {len(texts)} chunks from {file_path}"
    except Exception as e:
        return f"Error adding file: {str(e)}"

In [4]:
# Seed the vector database with the bundled data files.
# Every entry needs its own comma: adjacent string literals without one are
# concatenated into a single path, which does not exist and is then skipped.
data_files = [
    "./data/mess_menu.txt",
    "./data/inst_calender.txt",
    "./data/faculty_details.txt",
    "./data/caricululm.txt",
    "./data/milma_menu.txt",
    "./data/general_info.txt",
]
for file_path in data_files:
    if os.path.exists(file_path):
        result = add_file(file_path)
        logging.info(f"Added file {file_path}: {result}")
    else:
        # Report skipped files so a wrong or missing path is visible.
        logging.warning(f"Skipping missing data file: {file_path}")

In [None]:
# Function to remove a file's chunks from the vector database
def remove_file(file_path):
    """Delete a tracked file's chunks from the vector store and untrack it.

    Args:
        file_path: Path of the file to remove; it is resolved to an absolute
            path before being looked up in the tracking data.

    Returns:
        str: A confirmation with the number of removed chunks, a "not found"
        message when the file is not tracked, or
        ``"Error removing file: <reason>"`` when any step raises.
    """
    try:
        tracked = load_db_tracking()
        key = str(Path(file_path).resolve())
        if key in tracked:
            stale_ids = tracked[key]['chunk_ids']
            vectorstore.delete(stale_ids)
            vectorstore.persist()
            # Drop the entry only after the chunks are gone from the store.
            tracked.pop(key)
            save_db_tracking(tracked)
            return f"Removed file {file_path} and its {len(stale_ids)} chunks"
        return f"File {file_path} not found in tracking data"
    except Exception as exc:
        return f"Error removing file: {str(exc)}"

In [5]:
# Function to list all tracked files and their chunks in the database
def list_tracked_files():
    """Summarize every tracked file.

    Returns:
        dict: Maps each tracked path to a dict holding ``'chunk_count'`` (the
        number of chunk IDs recorded for it) and ``'added_date'`` (copied from
        the tracking entry).
    """
    summary = {}
    for tracked_path, entry in load_db_tracking().items():
        summary[tracked_path] = {
            'chunk_count': len(entry['chunk_ids']),
            'added_date': entry['added_date'],
        }
    return summary

In [None]:
# Function to add all text files from a directory to the vector database
def add_directory(dir_path):
    """Add every ``*.txt`` file under a directory and track the chunk IDs.

    Documents are split into chunks of 1000 characters with 200 characters
    of overlap. The stored chunk IDs are grouped by the ``source`` entry in
    each chunk's metadata and recorded in the tracking file, in the same
    shape ``add_file`` uses, so these files can be listed and removed later.
    Chunks previously tracked for the same files are deleted, so re-running
    replaces rather than duplicates them.

    Args:
        dir_path: Directory searched recursively for ``*.txt`` files.

    Returns:
        str: ``"Added N chunks from directory <dir_path>"``.

    Raises:
        Exception: Errors from loading, embedding, or the tracking file are
            not caught here and propagate to the caller.
    """
    # NOTE(review): no loader class is passed to DirectoryLoader, whereas
    # add_file loads with TextLoader. Confirm the default loader is intended.
    loader = DirectoryLoader(dir_path, glob="**/*.txt")
    documents = loader.load()
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    texts = text_splitter.split_documents(documents)
    if not texts:
        # Nothing to embed; skip the vector store and the tracking file.
        return f"Added {len(texts)} chunks from directory {dir_path}"

    ids = vectorstore.add_documents(texts)

    # Group the new IDs by originating file, keyed by resolved path as add_file does.
    # Assumes each chunk carries its file path under metadata["source"];
    # chunks without it are stored but cannot be tracked.
    ids_by_file = {}
    for chunk, chunk_id in zip(texts, ids):
        source = (getattr(chunk, "metadata", None) or {}).get("source")
        if source:
            ids_by_file.setdefault(str(Path(source).resolve()), []).append(chunk_id)

    tracking_data = load_db_tracking()
    stale_ids = [
        old_id
        for tracked_path in ids_by_file
        for old_id in tracking_data.get(tracked_path, {}).get('chunk_ids', [])
    ]
    if stale_ids:
        vectorstore.delete(stale_ids)
    vectorstore.persist()

    added_date = datetime.now().isoformat()
    for tracked_path, chunk_ids in ids_by_file.items():
        tracking_data[tracked_path] = {
            'chunk_ids': chunk_ids,
            'added_date': added_date
        }
    save_db_tracking(tracking_data)
    return f"Added {len(texts)} chunks from directory {dir_path}"

In [None]:
# Function to search the vector database for relevant context
def search_knowledge_base(query, k=3):
    """Return the text of the ``k`` chunks most similar to ``query``.

    Args:
        query: Text passed to the vector store's similarity search.
        k: Number of results to request (default 3).

    Returns:
        str: The ``page_content`` of each hit, joined with newlines.
    """
    hits = vectorstore.similarity_search(query, k=k)
    passages = [hit.page_content for hit in hits]
    return "\n".join(passages)

In [None]:
# Function to clear all documents from the vector database
def clear_database():
    """Remove every document from the vector store and reset the tracking file.

    Documents are deleted by ID rather than by dropping the collection, so
    the module-level ``vectorstore`` keeps pointing at a live (now empty)
    collection and later add/list/search calls keep working. The tracking
    file is reset to an empty mapping so it no longer lists chunks that were
    just removed.

    Returns:
        str: ``"Database cleared"``.
    """
    stored_ids = vectorstore.get()['ids']
    if stored_ids:
        # Skip the call on an empty store: there is nothing to delete.
        vectorstore.delete(stored_ids)
    vectorstore.persist()
    save_db_tracking({})
    return "Database cleared"

In [6]:
# Function to list all documents in the vector database with their metadata
def list_documents():
    """List every stored document with a short content preview.

    Returns:
        list[dict]: One dict per document with ``'id'``, ``'content_preview'``
        (the first 200 characters followed by ``'...'`` when the text is
        longer than 200 characters, otherwise the full text) and
        ``'metadata'``.
        str: ``"Database is empty"`` when the store holds no IDs, or
        ``"Error listing documents: <reason>"`` when any step raises.
    """
    try:
        snapshot = vectorstore.get()
        doc_ids = snapshot['ids']
        if not doc_ids:
            return "Database is empty"

        def _preview(text):
            # Truncate only texts that exceed the 200-character budget.
            if len(text) > 200:
                return text[:200] + '...'
            return text

        return [
            {
                'id': doc_id,
                'content_preview': _preview(snapshot['documents'][position]),
                'metadata': snapshot['metadatas'][position],
            }
            for position, doc_id in enumerate(doc_ids)
        ]
    except Exception as exc:
        return f"Error listing documents: {str(exc)}"

In [None]:
# Function to delete a specific document from the vector database
def delete_document(doc_id):
    """Delete a single document from the vector store by its ID.

    Args:
        doc_id: ID of the document to delete.

    Returns:
        str: ``"Deleted document <doc_id>"`` on success, or
        ``"Error deleting document: <reason>"`` when any step raises.
    """
    try:
        targets = [doc_id]  # the store's delete call is given a list of IDs
        vectorstore.delete(targets)
        vectorstore.persist()
        outcome = f"Deleted document {doc_id}"
        return outcome
    except Exception as exc:
        return f"Error deleting document: {str(exc)}"

In [None]:
# Function to retrieve a specific document's full content by ID
def get_document_by_id(doc_id):
    """Fetch one document's full content and metadata by its ID.

    Args:
        doc_id: ID of the document to look up.

    Returns:
        dict: ``'id'``, ``'content'`` and ``'metadata'`` of the first match.
        str: ``"Document not found"`` when the store returns no IDs, or
        ``"Error retrieving document: <reason>"`` when any step raises.
    """
    try:
        match = vectorstore.get([doc_id])
        if not match['ids']:
            return "Document not found"
        return {
            'id': doc_id,
            'content': match['documents'][0],
            'metadata': match['metadatas'][0],
        }
    except Exception as exc:
        return f"Error retrieving document: {str(exc)}"

In [None]:
# Initialize the chatbot and return the chat function
def initialize_bot():
    """Seed the vector database on first run and return the chat function.

    When ``PERSIST_DIRECTORY`` does not exist, each bundled data file that is
    present on disk is added through ``add_file``.

    No ``chat`` function is defined in this notebook, so returning the bare
    name raised ``NameError``. The name is now looked up in the module
    globals: it is returned when some cell has defined it, otherwise a
    warning is logged and ``None`` is returned.

    Returns:
        The module-level ``chat`` object if one is defined, else ``None``.
    """
    # NOTE(review): the setup cell builds Chroma with this persist directory
    # before this function runs. If that already creates the directory, the
    # seeding below never executes. Confirm.
    if not os.path.exists(PERSIST_DIRECTORY):
        data_files = [
            "./data/mess_menu.txt",
            "./data/inst_calender.txt",
            "./data/faculty_details.txt",
            "./data/caricululm.txt",
            "./data/milma_menu.txt",
            "./data/general_info.txt"
        ]
        for file_path in data_files:
            if os.path.exists(file_path):
                result = add_file(file_path)
                logging.info(f"Added file {file_path}: {result}")

    chat_fn = globals().get("chat")
    if chat_fn is None:
        logging.warning("initialize_bot: no `chat` function is defined; returning None")
    return chat_fn

# Initialize the bot (the value returned by initialize_bot() is not stored here)
initialize_bot()

In [None]:
# List tracked files in the database
# Print a three-line report (path, chunk count, date added) per tracked file
tracked_files = list_tracked_files()
for file_path, info in tracked_files.items():
    report_lines = (
        f"\nFile: {file_path}",
        f"Chunks: {info['chunk_count']}",
        f"Added: {info['added_date']}",
    )
    print(*report_lines, sep="\n")

In [None]:
# Search the knowledge base
# Example query; search_knowledge_base is called with its default k=3
query = "What is the mess menu?"
print(search_knowledge_base(query))

In [None]:
# Clear the database.
# This is destructive, so it is guarded: an unconditional call here made
# "Run All" wipe the database that the earlier cells had just built.
# Set CONFIRM_CLEAR = True and re-run this cell to actually clear it.
CONFIRM_CLEAR = False
if CONFIRM_CLEAR:
    print(clear_database())
else:
    print("Skipped clearing the database (CONFIRM_CLEAR is False)")

In [9]:
# List all documents in the vector database.
# list_documents() returns a list of dicts, or a plain message string when the
# store is empty or an error occurred; either is printed as-is.
print(list_documents())

Database is empty


In [None]:
# Delete a specific document by ID
# "example_doc_id" is a placeholder; substitute an ID shown by list_documents()
doc_id = "example_doc_id"
print(delete_document(doc_id))

In [None]:
# Retrieve a specific document's full content by ID
# "example_doc_id" is a placeholder; substitute an ID shown by list_documents()
doc_id = "example_doc_id"
print(get_document_by_id(doc_id))