<a href="https://colab.research.google.com/github/comethrusws/VectorSearch/blob/main/VectorSearch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install langchain faiss-cpu transformers datasets pandas numpy sentence-transformers torch ctransformers

In [None]:
import os
import pandas as pd
import numpy as np
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.llms import HuggingFacePipeline, CTransformers
from langchain.chains import RetrievalQA
from langchain_community.document_loaders import TextLoader, PyPDFLoader, DirectoryLoader
from langchain.schema import Document
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
import warnings
warnings.filterwarnings('ignore')

In [None]:
def setup_models(use_local_llm=False):
    """
    Set up the embedding model and LLM

    Only the selected LLM is loaded, so the multi-gigabyte Llama weights are
    not fetched unless they are actually requested.

    Args:
        use_local_llm: When True, load the quantized Llama-2 chat model through
            CTransformers (for more powerful systems). When False (the
            default), build a FLAN-T5 transformers pipeline (for systems with
            fewer resources).

    Returns:
        A tuple ``(embeddings, llm)``.
    """
    embedding_model_name = "sentence-transformers/all-MiniLM-L6-v2"
    embeddings = HuggingFaceEmbeddings(model_name=embedding_model_name)
    print(f"Initialized embedding model: {embedding_model_name}")

    if use_local_llm:
        # CTransformers takes generation settings through `config`; passing
        # them as top-level keyword arguments does not configure generation.
        llm = CTransformers(
            model="TheBloke/Llama-2-7B-Chat-GGUF",
            model_file="llama-2-7b-chat.Q4_K_M.gguf",
            model_type="llama",
            config={"max_new_tokens": 512, "temperature": 0.2},
        )
    else:
        # Imported here because the file-level import only brings in
        # AutoModelForCausalLM.
        from transformers import AutoModelForSeq2SeqLM

        # FLAN-T5 is an encoder-decoder model: it must be loaded with the
        # seq2seq auto-class and served by the "text2text-generation" task.
        flan_model_name = "google/flan-t5-base"
        tokenizer = AutoTokenizer.from_pretrained(flan_model_name)
        model = AutoModelForSeq2SeqLM.from_pretrained(flan_model_name)

        pipe = pipeline(
            "text2text-generation",
            model=model,
            tokenizer=tokenizer,
            max_new_tokens=512,
            # temperature only takes effect when sampling is enabled
            do_sample=True,
            temperature=0.2,
        )

        llm = HuggingFacePipeline(pipeline=pipe)

    return embeddings, llm

In [None]:
def load_documents(directory_path):
    """
    Collect every PDF and text file found under ``directory_path``.

    PDFs are loaded first, then ``.txt`` files; the returned list keeps that
    order. The total number of loaded documents is printed.
    """
    # (glob pattern, loader class) pairs, processed in this order
    loader_specs = (
        ("**/*.pdf", PyPDFLoader),
        ("**/*.txt", TextLoader),
    )

    all_documents = []
    for pattern, loader_cls in loader_specs:
        directory_loader = DirectoryLoader(directory_path, glob=pattern, loader_cls=loader_cls)
        all_documents.extend(directory_loader.load())

    print(f"Loaded {len(all_documents)} documents")
    return all_documents


In [None]:
def split_documents(documents, chunk_size=1000, chunk_overlap=200):
    """
    Break ``documents`` into overlapping chunks.

    Chunk length is measured with ``len`` (characters). The number of
    resulting chunks is printed and the chunks are returned.
    """
    splitter_options = {
        "chunk_size": chunk_size,
        "chunk_overlap": chunk_overlap,
        "length_function": len,
    }
    pieces = RecursiveCharacterTextSplitter(**splitter_options).split_documents(documents)

    print(f"Split into {len(pieces)} chunks")
    return pieces


In [None]:
def create_vector_store(chunks, embeddings):
    """
    Create embeddings for the document chunks and build a vector store

    Args:
        chunks: Non-empty collection of documents to index.
        embeddings: Embedding model handed to ``FAISS.from_documents``.

    Returns:
        The FAISS vector store built from ``chunks``.

    Raises:
        ValueError: If ``chunks`` is empty. An index cannot be built from
            nothing, and failing here gives a clearer message than the error
            raised from inside the vector store.
    """
    if not chunks:
        raise ValueError("Cannot create a vector store from an empty list of chunks")

    # Create a FAISS vector store
    vector_store = FAISS.from_documents(chunks, embeddings)
    print("Vector store created successfully")
    return vector_store


In [None]:
def semantic_search(vector_store, query, k=5):
    """
    Return the ``k`` results of the store's similarity search for ``query``.
    """
    return vector_store.similarity_search(query, k=k)


In [None]:
def advanced_search(vector_store, query, metadata_filter=None, k=5):
    """
    Similarity search that optionally restricts results by metadata.

    The ``filter`` argument is only forwarded to the store when
    ``metadata_filter`` is truthy; otherwise a plain similarity search runs.
    """
    search_kwargs = {"k": k}
    if metadata_filter:
        search_kwargs["filter"] = metadata_filter

    return vector_store.similarity_search(query, **search_kwargs)


In [None]:
def create_hybrid_retriever(vector_store, documents, k=10, weights=(0.5, 0.5)):
    """
    Create a hybrid retriever combining vector search with BM25

    Args:
        vector_store: Vector store exposing ``as_retriever``.
        documents: Documents indexed by the BM25 (keyword) retriever.
        k: Number of results each underlying retriever returns.
        weights: Two weights, in the order ``(bm25, vector)``, used by the
            ensemble when merging both result lists.

    Returns:
        An ``EnsembleRetriever`` over the BM25 and vector retrievers.

    Raises:
        ValueError: If ``weights`` does not contain exactly two values.
    """
    weights = list(weights)
    if len(weights) != 2:
        raise ValueError("weights must contain exactly two values: (bm25, vector)")

    # BM25Retriever is imported from langchain_community, matching the other
    # community integrations imported at the top of this file.
    from langchain_community.retrievers import BM25Retriever
    from langchain.retrievers import EnsembleRetriever

    # Create BM25 retriever
    bm25_retriever = BM25Retriever.from_documents(documents)
    bm25_retriever.k = k

    # Create vector store retriever
    vector_retriever = vector_store.as_retriever(search_kwargs={"k": k})

    # Create ensemble retriever
    ensemble_retriever = EnsembleRetriever(
        retrievers=[bm25_retriever, vector_retriever],
        weights=weights,
    )

    return ensemble_retriever

In [None]:
def create_rag_chain(retriever, llm):
    """
    Build a "stuff"-type RetrievalQA chain over ``retriever`` and ``llm``.

    The chain is configured to return its source documents alongside the
    answer.
    """
    chain_options = {
        "llm": llm,
        "chain_type": "stuff",
        "retriever": retriever,
        "return_source_documents": True,
    }
    return RetrievalQA.from_chain_type(**chain_options)

In [None]:
from datasets import load_dataset

# Load a sample dataset (ArXiv papers dataset)
# Only the first 100 rows of the train split are requested.
dataset = load_dataset("arxiv_dataset", split="train[:100]")
print(f"Loaded {len(dataset)} research paper abstracts")

# Wrap each row as a Document: the abstract is the searchable text, while
# title/authors/categories travel along as metadata for display and filtering.
documents = [
    Document(
        page_content=row["abstract"],
        metadata={"title": row["title"], "authors": row["authors"], "categories": row["categories"]}
    )
    for row in dataset
]


# Module-level `embeddings` and `llm` are reused by the cells below.
embeddings, llm = setup_models()

# Split documents into chunks
chunks = split_documents(documents, chunk_size=500, chunk_overlap=50)

# Create vector store
vector_store = create_vector_store(chunks, embeddings)

# Create a standard retriever
# Returns the 5 most similar chunks per query.
retriever = vector_store.as_retriever(search_kwargs={"k": 5})


def query_rag(rag_chain, query):
    """
    Query the RAG system

    Runs ``query`` through ``rag_chain``, prints the answer and a short
    preview of each source document, and returns the raw chain output.

    Args:
        rag_chain: Chain that accepts ``{"query": ...}`` and returns a mapping
            with ``"result"`` and ``"source_documents"`` keys.
        query: The question to ask.

    Returns:
        The mapping produced by the chain.
    """
    payload = {"query": query}

    # Prefer the Runnable `invoke` entry point; calling a chain object
    # directly is the deprecated path. Plain callables are still supported.
    invoke = getattr(rag_chain, "invoke", None)
    result = invoke(payload) if callable(invoke) else rag_chain(payload)

    print(f"Query: {query}")
    print(f"Answer: {result['result']}")
    print("\nSource Documents:")

    for i, doc in enumerate(result["source_documents"], start=1):
        print(f"\nDocument {i}:")
        # Only the first 200 characters are shown to keep the output readable.
        print(f"Content: {doc.page_content[:200]}...")
        print(f"Metadata: {doc.metadata}")

    return result

In [None]:
# Create RAG chain
# Uses the module-level `retriever` and `llm` built in the setup cell.
rag_chain = create_rag_chain(retriever, llm)

# Example query
# query_rag prints the answer and source previews and returns the chain output.
query = "What are recent advances in quantum computing?"
result = query_rag(rag_chain, query)

# Cell 14: Metadata Filtering
# Example of search with metadata filtering
query = "neural networks in computer vision"
# NOTE(review): a dict filter presumably matches metadata values exactly; if
# "categories" can hold several categories in one string, this would only
# match papers tagged with "cs.CV" alone -- confirm against the dataset.
metadata_filter = {"categories": "cs.CV"}  # Filter for computer vision papers

# k is left at advanced_search's default of 5.
filtered_docs = advanced_search(vector_store, query, metadata_filter)

print(f"Query: {query} (filtered by {metadata_filter})")
for i, doc in enumerate(filtered_docs):
    print(f"\nDocument {i+1}:")
    # Preview only the first 200 characters of each hit.
    print(f"Content: {doc.page_content[:200]}...")
    print(f"Metadata: {doc.metadata}")

# Cell 15: Create Hybrid Retriever
# NOTE(review): BM25 is built from the unsplit `documents`, while
# `vector_store` was built from `chunks` -- confirm the two retrievers are
# meant to index different granularities.
hybrid_retriever = create_hybrid_retriever(vector_store, documents)

# Create RAG chain with hybrid retriever
hybrid_rag_chain = create_rag_chain(hybrid_retriever, llm)

# Compare results
# The same query is run through both chains so the printed outputs can be
# compared side by side.
query = "machine learning applications in healthcare"
print("Standard RAG Results:")
standard_result = query_rag(rag_chain, query)

print("\n\nHybrid RAG Results:")
hybrid_result = query_rag(hybrid_rag_chain, query)

In [None]:
import pickle

# Save the vector store to disk
# "my_faiss_index" is a path relative to the current working directory; the
# load cell below reads from the same location.
vector_store.save_local("my_faiss_index")

print("Vector store saved successfully")

# Cell 17: Load Vector Store for Future Use
# Same import paths as the top of the notebook (langchain_community), repeated
# so this cell can be run on its own in a fresh session.
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings

# Load the vector store from disk
# The embedding model must be the one the index was built with.
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# SECURITY: the saved index includes a pickle file, and unpickling can execute
# arbitrary code. Current langchain_community releases refuse to load it
# unless this flag is set. It is acceptable here only because the index was
# written by this notebook; never enable it for files from an untrusted source.
loaded_vector_store = FAISS.load_local(
    "my_faiss_index",
    embeddings,
    allow_dangerous_deserialization=True,
)

print("Vector store loaded successfully")

# Cell 18: Advanced RAG Implementation with Source Context Management
def enhanced_rag_query(query, vector_store, llm, max_tokens=4000):
    """
    Enhanced RAG query that manages context window and properly formats sources

    Retrieves up to 8 documents, labels each one ``[n]`` in the context so the
    model can cite it, and stops adding documents once the estimated context
    size would exceed ``max_tokens``. The first document is always included.

    The prompt is built as a plain string and sent straight to the model, so
    braces in retrieved text (for example LaTeX in abstracts) are passed
    through literally instead of being parsed as template variables.

    Args:
        query: The user question.
        vector_store: Store exposing ``similarity_search(query, k=...)``.
        llm: Model exposing ``invoke(prompt_text)``.
        max_tokens: Approximate token budget for the retrieved context.

    Returns:
        A dict with ``"answer"`` (the model output) and ``"sources"`` (one
        ``"[n] title"`` string per document that was placed in the context).
    """
    # Retrieve relevant documents
    docs = vector_store.similarity_search(query, k=8)

    context_parts = []
    sources = []
    used_tokens = 0

    for i, doc in enumerate(docs, start=1):
        # Label the passage with the same "[n]" marker the prompt asks the
        # model to cite.
        entry = f"[{i}] {doc.page_content}"

        # Rough heuristic of ~4 characters per token; no tokenizer is
        # available here, so this is an estimate only.
        estimated_cost = max(1, len(entry) // 4)
        if context_parts and used_tokens + estimated_cost > max_tokens:
            break
        used_tokens += estimated_cost

        context_parts.append(entry)
        sources.append(f"[{i}] {doc.metadata.get('title', f'Document {i}')}")

    context = "\n\n".join(context_parts)

    # Create prompt
    prompt = (
        "Answer the following question based on the provided context.\n"
        "If you don't know the answer based on the context, just say so.\n"
        "\n"
        "Context:\n"
        f"{context}\n"
        "\n"
        f"Question: {query}\n"
        "\n"
        "Please cite your sources using the numbers in square brackets (e.g., [1], [2]).\n"
    )

    # Get response from LLM
    raw_response = llm.invoke(prompt)
    # Chat models return a message object; plain LLMs return a string.
    response = getattr(raw_response, "content", raw_response)

    # Format the final answer with source references
    return {
        "answer": response,
        "sources": sources,
    }

In [None]:
# Run the enhanced RAG query with the module-level vector store and LLM.
enhanced_result = enhanced_rag_query(
    "What are the applications of reinforcement learning in robotics?",
    vector_store,
    llm
)

# The result is a dict with an "answer" string and a "sources" list.
print(f"Answer: {enhanced_result['answer']}\n")
print("Sources:")
for source in enhanced_result['sources']:
    print(f"- {source}")

# Cell 20: Evaluation of RAG System
def evaluate_rag(vector_store, test_questions, ground_truth, llm):
    """
    Evaluate the RAG system using test questions and ground truth

    For every (question, truth) pair the RAG answer is generated with
    ``enhanced_rag_query`` and the same ``llm`` is then asked to grade it
    against the ground truth.

    Args:
        vector_store: Store passed through to ``enhanced_rag_query``.
        test_questions: Iterable of question strings.
        ground_truth: Iterable of reference answers, paired positionally with
            ``test_questions``. Pairing uses ``zip``, so it stops at the
            shorter of the two.
        llm: Model exposing ``invoke(prompt_text)``; used for grading.

    Returns:
        A list of dicts with the keys ``"question"``, ``"ground_truth"``,
        ``"rag_answer"`` and ``"evaluation"``.
    """
    results = []

    for q, truth in zip(test_questions, ground_truth):
        # Get RAG response
        rag_response = enhanced_rag_query(q, vector_store, llm)

        # Create evaluation prompt
        eval_prompt = f"""
        Question: {q}

        Ground Truth Answer: {truth}

        RAG System Answer: {rag_response['answer']}

        On a scale of 1-10, rate how well the RAG system answer matches the ground truth in terms of:
        1. Factual correctness (1-10)
        2. Completeness (1-10)
        3. Relevance (1-10)

        Provide the three scores and a brief explanation for each.
        """

        # The prompt is already fully rendered, so it is sent to the model
        # directly rather than through a pass-through template rebuilt on
        # every iteration.
        raw_evaluation = llm.invoke(eval_prompt)
        # Chat models return a message object; plain LLMs return a string.
        evaluation = getattr(raw_evaluation, "content", raw_evaluation)

        results.append({
            "question": q,
            "ground_truth": truth,
            "rag_answer": rag_response['answer'],
            "evaluation": evaluation
        })

    return results

# Cell 21: Additional - Demonstrating MMR (Maximal Marginal Relevance) for Diversity
def diverse_search(vector_store, query, k=5):
    """
    Search with Maximal Marginal Relevance to get a more varied result set.
    """
    mmr_options = {
        "k": k,              # how many documents come back
        "fetch_k": 15,       # candidate pool size before reranking
        "lambda_mult": 0.7,  # 0 = max diversity, 1 = standard search
    }
    return vector_store.max_marginal_relevance_search(query, **mmr_options)

# Example of MMR search
# k is left at diverse_search's default of 5.
query = "deep learning applications"
diverse_docs = diverse_search(vector_store, query)

print(f"Diverse search results for: {query}")
for i, doc in enumerate(diverse_docs):
    print(f"\nDocument {i+1}:")
    # Falls back to 'Unknown' when a document carries no title metadata.
    print(f"Title: {doc.metadata.get('title', 'Unknown')}")
    # Preview only the first 150 characters.
    print(f"Content: {doc.page_content[:150]}...")


In [None]:
def compressed_retrieval(vector_store, query, k=5, llm=None):
    """
    Perform a search with contextual compression

    The base retriever fetches ``k`` documents and an LLM-driven extractor
    then compresses each of them with respect to ``query``.

    Args:
        vector_store: Store exposing ``as_retriever``.
        query: The search query.
        k: Number of documents fetched by the base retriever.
        llm: Model used by the compressor. When omitted, the module-level
            ``llm`` is used, which keeps existing three-argument calls working.

    Returns:
        The compressed documents produced by the compression retriever.

    Raises:
        NameError: If ``llm`` is not passed and no module-level ``llm`` exists.
    """
    if llm is None:
        # Explicit fallback to the notebook-wide model so the dependency on
        # the global is visible rather than implicit.
        try:
            llm = globals()["llm"]
        except KeyError:
            raise NameError(
                "name 'llm' is not defined; pass llm=... or define a module-level `llm`"
            ) from None

    from langchain.retrievers import ContextualCompressionRetriever
    from langchain.retrievers.document_compressors import LLMChainExtractor

    # Create the base retriever
    base_retriever = vector_store.as_retriever(search_kwargs={"k": k})

    # Create the document compressor
    compressor = LLMChainExtractor.from_llm(llm)

    # Create the contextual compression retriever
    compression_retriever = ContextualCompressionRetriever(
        base_compressor=compressor,
        base_retriever=base_retriever
    )

    # Retrieve compressed documents (`invoke` supersedes the deprecated
    # `get_relevant_documents`)
    compressed_docs = compression_retriever.invoke(query)
    return compressed_docs

# Example of compressed retrieval
# k is left at compressed_retrieval's default of 5.
query = "quantum computing algorithms"
compressed_docs = compressed_retrieval(vector_store, query)

print(f"Compressed retrieval results for: {query}")
for i, doc in enumerate(compressed_docs):
    print(f"\nDocument {i+1}:")
    # Falls back to 'Unknown' when a document carries no title metadata.
    print(f"Title: {doc.metadata.get('title', 'Unknown')}")
    # Preview only the first 150 characters.
    print(f"Content: {doc.page_content[:150]}...")