In [1]:
# Scratch cell: prints a fixed greeting.
print('hello')

hello


In [2]:
import os
from typing import List, Any
from pathlib import Path
from glob import glob
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.document_loaders import Docx2txtLoader


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
from pathlib import Path
from typing import List, Any
from langchain_community.document_loaders import PyPDFLoader, Docx2txtLoader


class DataLoader:
    """Recursively load every PDF and DOCX file found under a directory."""

    def __init__(self, file_path: str):
        """
        Args:
            file_path: Directory that is searched recursively for ``*.pdf``
                and ``*.docx`` files.
        """
        self.file_path = file_path

    def _load_files(self, root: Path, pattern: str, loader_cls: Any, label: str) -> List[Any]:
        """Load every file under ``root`` matching ``pattern`` using ``loader_cls``."""
        # Sorted so the resulting document order does not depend on the
        # order in which the filesystem happens to enumerate entries.
        matches = sorted(root.glob(pattern))
        print(f"[INFO] Found {len(matches)} {label} files.")

        collected: List[Any] = []
        for path in matches:
            try:
                loaded = loader_cls(str(path)).load()
                print(f"[INFO] Loaded {len(loaded)} pages from {path.name}")
                collected.extend(loaded)
            except Exception as e:
                # Report which file broke, then re-raise so the failure is not hidden.
                print(f"[ERROR] Failed to load {path.name}: {e}")
                raise
        return collected

    def document_loader(self) -> List[Any]:
        """
        Reads all the PDF and DOCX files from file_path and returns a list of documents.

        PDF results come first, followed by DOCX results; within each group
        files are processed in sorted path order.

        Returns:
            The concatenated output of each loader's ``load()`` call.

        Raises:
            FileNotFoundError: If ``file_path`` does not resolve to an existing directory.
            Exception: Whatever a loader raises for an unreadable file (re-raised
                after logging).
        """
        root = Path(self.file_path).resolve()
        print(f"[INFO] Loading documents from: {root}")
        if not root.is_dir():
            # Globbing a missing path silently yields nothing; fail loudly instead
            # of handing an empty corpus to the rest of the pipeline.
            raise FileNotFoundError(f"Document directory not found: {root}")

        documents: List[Any] = []
        documents.extend(self._load_files(root, "**/*.pdf", PyPDFLoader, "PDF"))
        documents.extend(self._load_files(root, "**/*.docx", Docx2txtLoader, "DOCX"))
        return documents


# Demo: load everything under ../content and report how many documents came back.
if __name__ == "__main__":
    dl = DataLoader(file_path="../content/.")
    all_docs = dl.document_loader()
    print("[INFO] Total documents loaded: {}".format(len(all_docs)))


[INFO] Loading documents from: C:\Users\saura\Downloads\Gaurav Files\LangChain\contact-center-rag\content
[INFO] Found 6 PDF files.


Ignoring wrong pointing object 6 0 (offset 0)
Ignoring wrong pointing object 8 0 (offset 0)
Ignoring wrong pointing object 15 0 (offset 0)
Ignoring wrong pointing object 17 0 (offset 0)
Ignoring wrong pointing object 24 0 (offset 0)
Ignoring wrong pointing object 48 0 (offset 0)
Ignoring wrong pointing object 6 0 (offset 0)
Ignoring wrong pointing object 8 0 (offset 0)
Ignoring wrong pointing object 15 0 (offset 0)
Ignoring wrong pointing object 17 0 (offset 0)
Ignoring wrong pointing object 61 0 (offset 0)


[INFO] Loaded 10 pages from Example Corp Hospitality Group.pdf


Ignoring wrong pointing object 6 0 (offset 0)
Ignoring wrong pointing object 8 0 (offset 0)
Ignoring wrong pointing object 15 0 (offset 0)
Ignoring wrong pointing object 17 0 (offset 0)
Ignoring wrong pointing object 27 0 (offset 0)


[INFO] Loaded 15 pages from Example Corp Family Getaways.pdf


Ignoring wrong pointing object 6 0 (offset 0)
Ignoring wrong pointing object 8 0 (offset 0)
Ignoring wrong pointing object 15 0 (offset 0)
Ignoring wrong pointing object 17 0 (offset 0)
Ignoring wrong pointing object 55 0 (offset 0)


[INFO] Loaded 17 pages from Example Corp Luxury Suites.pdf


Ignoring wrong pointing object 6 0 (offset 0)
Ignoring wrong pointing object 8 0 (offset 0)
Ignoring wrong pointing object 15 0 (offset 0)
Ignoring wrong pointing object 17 0 (offset 0)
Ignoring wrong pointing object 27 0 (offset 0)


[INFO] Loaded 13 pages from Example Corp Party Times.pdf


Ignoring wrong pointing object 6 0 (offset 0)
Ignoring wrong pointing object 8 0 (offset 0)
Ignoring wrong pointing object 15 0 (offset 0)
Ignoring wrong pointing object 17 0 (offset 0)
Ignoring wrong pointing object 64 0 (offset 0)


[INFO] Loaded 15 pages from Example Corp Seaside Resorts.pdf
[INFO] Loaded 17 pages from Example Corp Waypoint Inns.pdf
[INFO] Found 6 DOCX files.
[INFO] Loaded 1 pages from Example Corp Hospitality Group.docx
[INFO] Loaded 1 pages from Example Corp Family Getaways.docx
[INFO] Loaded 1 pages from Example Corp Luxury Suites.docx
[INFO] Loaded 1 pages from Example Corp Party Times.docx
[INFO] Loaded 1 pages from Example Corp Seaside Resorts.docx
[INFO] Loaded 1 pages from Example Corp Waypoint Inns.docx
[INFO] Total documents loaded: 93


In [4]:
from langchain_text_splitters import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer
import numpy as np


class EmbeddingManager:
    """Split documents into overlapping chunks and embed them with a SentenceTransformer."""

    def __init__(self, embedding_model: str = "all-MiniLM-L6-v2", chunk_size: int = 1000, chunk_overlap: int = 200):
        """
        Args:
            embedding_model: Model name passed to ``SentenceTransformer``.
            chunk_size: ``chunk_size`` handed to the text splitter.
            chunk_overlap: ``chunk_overlap`` handed to the text splitter.
        """
        self.embedding_model = SentenceTransformer(embedding_model)
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        print(f"[INFO] Using embedding model: {embedding_model}")

    def chunk_text(self, document: List[Any]) -> List[Any]:
        """
        Split already-loaded documents into chunks.

        Args:
            document: The loaded documents; despite the singular name this is
                passed straight to ``split_documents``, so it must be a list
                of document objects rather than a plain string.

        Returns:
            The chunk objects produced by the splitter.
        """
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=self.chunk_size,
            chunk_overlap=self.chunk_overlap,
            length_function=len,
            # Tried in order: paragraph, line, word, then single characters.
            separators=["\n\n", "\n", " ", ""],
        )
        texts = text_splitter.split_documents(document)
        print(f"[INFO] Split document into {len(texts)} chunks.")
        return texts

    def embed_chunks(self, chunks: List[Any]) -> np.ndarray:
        """
        Encode the ``page_content`` of every chunk.

        Args:
            chunks: Objects exposing a ``page_content`` string attribute.

        Returns:
            Whatever ``encode`` returns for the list of texts; its ``shape`` is logged.
        """
        texts = [chunk.page_content for chunk in chunks]
        print(f"[INFO] Generating embeddings for {len(texts)} chunks...")
        embeddings = self.embedding_model.encode(texts, show_progress_bar=True)
        print(f"[INFO] Embeddings shape: {embeddings.shape}")
        return embeddings

# Chunk the loaded documents, embed the chunks, and peek at the first vector.
em = EmbeddingManager()
chunks = em.chunk_text(document=all_docs)
embeddings = em.embed_chunks(chunks=chunks)
print("[INFO] Example embedding:", None if len(embeddings) == 0 else embeddings[0])

[INFO] Using embedding model: all-MiniLM-L6-v2
[INFO] Split document into 605 chunks.
[INFO] Generating embeddings for 605 chunks...


Batches: 100%|██████████| 19/19 [00:26<00:00,  1.39s/it]

[INFO] Embeddings shape: (605, 384)
[INFO] Example embedding: [ 3.44330110e-02 -3.39309014e-02 -4.00605090e-02 -9.94981732e-03
  5.08219711e-02 -1.69940777e-02 -3.77718955e-02 -5.98174036e-02
  5.30437641e-02  1.58252232e-02  2.86705550e-02  1.03579890e-02
  3.45988981e-02 -5.03477342e-02  3.77396606e-02 -8.13127533e-02
  2.69953236e-02 -8.40371698e-02 -5.64725250e-02 -3.42965983e-02
 -5.13245314e-02  3.94952521e-02 -6.79766685e-02  4.12578806e-02
 -7.17843547e-02  4.89981286e-02 -5.61770760e-02  6.97996616e-02
  5.69278598e-02 -1.08932279e-01 -1.41089307e-02  1.85071249e-02
  1.14520773e-01  5.75467944e-02  6.80644512e-02  9.35693830e-02
 -2.12627631e-02 -4.11354117e-02 -2.76304241e-02  3.91070507e-02
  8.31266213e-03 -4.72312532e-02  2.23568804e-03 -3.86026390e-02
 -5.97500578e-02 -6.37177676e-02  1.93466782e-03  5.67272976e-02
  3.68588939e-02  1.47044724e-02  1.74112357e-02 -6.17548823e-02
  6.82596350e-03  2.11318284e-02 -3.27931345e-02 -7.15029240e-02
 -3.13570611e-02 -3.17981802




In [27]:
# Preview only the first few chunks; displaying the whole list floods the notebook output.
chunks[:3]

[Document(metadata={'producer': 'macOS Version 13.6.7 (Build 22G720) Quartz PDFContext', 'creator': 'PyPDF', 'creationdate': "D:20240730221511Z00'00'", 'moddate': "D:20240730221511Z00'00'", 'source': 'C:\\Users\\saura\\Downloads\\Gaurav Files\\LangChain\\contact-center-rag\\content\\content-pdf\\corporate\\Example Corp Hospitality Group.pdf', 'total_pages': 10, 'page': 0, 'page_label': '1'}, page_content='Copyright © 2024, Amazon Web Services, Inc. 1 \nExample CorpHospitality Group\nCorporate Overview'),
 Document(metadata={'producer': 'macOS Version 13.6.7 (Build 22G720) Quartz PDFContext', 'creator': 'PyPDF', 'creationdate': "D:20240730221511Z00'00'", 'moddate': "D:20240730221511Z00'00'", 'source': 'C:\\Users\\saura\\Downloads\\Gaurav Files\\LangChain\\contact-center-rag\\content\\content-pdf\\corporate\\Example Corp Hospitality Group.pdf', 'total_pages': 10, 'page': 0, 'page_label': '1'}, page_content="Escape the Ordinary with Example Corp Hospitality Group  At Example Corp Hospital

In [6]:
import os
import faiss
import numpy as np
import pickle
from typing import List, Any
from sentence_transformers import SentenceTransformer

In [None]:
class FaissVectorStore:
    """FAISS index plus a parallel list of per-vector metadata, persisted to a directory.

    ``self.metadata[i]`` describes the vector stored at row ``i`` of ``self.index``.
    """

    def __init__(self, persist_dir: str = "faiss_store", embedding_model: str = "all-MiniLM-L6-v2", chunk_size: int = 1000, chunk_overlap: int = 200):
        """
        Args:
            persist_dir: Directory for ``faiss.index`` and ``metadata.pkl``; created if missing.
            embedding_model: Model name passed to ``SentenceTransformer``.
            chunk_size: Forwarded to ``EmbeddingManager`` when building from documents.
            chunk_overlap: Forwarded to ``EmbeddingManager`` when building from documents.
        """
        self.persist_dir = persist_dir
        os.makedirs(self.persist_dir, exist_ok=True)
        print(f"[INFO] Initializing FaissVectorStore with persist_dir: {self.persist_dir}")
        self.embedding_model = embedding_model
        self.model = SentenceTransformer(embedding_model)
        self.index = None
        self.metadata = []
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        print(f"[INFO] Loaded Embedding Model: {embedding_model}")

    def build_from_documents(self, documents: List[Any]):
        """
        Chunk and embed ``documents``, add them to the index, and save to ``persist_dir``.

        Raises:
            ValueError: If chunking produces no chunks, so there is nothing to index.
        """
        print(f"[INFO] Building vectore store from {len(documents)} raw documents ....")
        embedding_pipeline = EmbeddingManager(self.embedding_model, self.chunk_size, self.chunk_overlap)
        chunks = embedding_pipeline.chunk_text(documents)
        if not chunks:
            # Fail with a clear message instead of an obscure shape error further down.
            raise ValueError("No chunks were produced from the given documents; nothing to index.")
        embeddings = embedding_pipeline.embed_chunks(chunks)
        metadatas = [{"text": chunk.page_content} for chunk in chunks]
        self.add_embeddings(np.array(embeddings).astype('float32'), metadatas)
        self.save()
        print(f"[INFO] Faiss vector store built and saved to {self.persist_dir}")

    def add_embeddings(self, embeddings: np.ndarray, metadatas: List[Any] = None):
        """
        Append vectors (and their metadata) to the index, creating the index on first use.

        Args:
            embeddings: 2-D array with one row per vector.
            metadatas: One entry per row. When omitted or empty, ``None``
                placeholders are stored so later rows stay aligned with the index.

        Raises:
            ValueError: If ``embeddings`` is not 2-D, its width differs from the
                existing index, or ``metadatas`` has a different length than ``embeddings``.
        """
        embeddings = np.ascontiguousarray(embeddings, dtype="float32")
        if embeddings.ndim != 2:
            raise ValueError(f"embeddings must be 2-D (n_vectors, dim); got shape {embeddings.shape}")
        count, dim = embeddings.shape
        if metadatas and len(metadatas) != count:
            raise ValueError(f"Got {len(metadatas)} metadata entries for {count} embeddings.")

        if self.index is None:
            self.index = faiss.IndexFlatL2(dim)
        elif self.index.d != dim:
            raise ValueError(f"Embedding dimension {dim} does not match index dimension {self.index.d}.")
        self.index.add(embeddings)

        # Row i of the index must always correspond to self.metadata[i].
        self.metadata.extend(metadatas if metadatas else [None] * count)
        print(f"[INFO] Added {count} embeddings to the index.")

    def save(self):
        """
        Write the index to ``faiss.index`` and the metadata to ``metadata.pkl``.

        Raises:
            ValueError: If no index has been built or loaded yet.
        """
        if self.index is None:
            raise ValueError("Nothing to save: the index has not been built or loaded.")
        faiss_path = os.path.join(self.persist_dir, "faiss.index")
        meta_path = os.path.join(self.persist_dir, "metadata.pkl")
        faiss.write_index(self.index, faiss_path)
        with open(meta_path, "wb") as f:
            pickle.dump(self.metadata, f)
        print(f"[INFO] Faiss index and metadata saved to {self.persist_dir}")

    def load(self):
        """Read the index and metadata back from ``persist_dir``."""
        faiss_path = os.path.join(self.persist_dir, "faiss.index")
        meta_path = os.path.join(self.persist_dir, "metadata.pkl")
        self.index = faiss.read_index(faiss_path)
        # SECURITY: pickle.load executes arbitrary code from the file; only
        # load a persist_dir that this code (or a trusted party) produced.
        with open(meta_path, "rb") as f:
            self.metadata = pickle.load(f)
        print(f"[INFO] Faiss index and metadata loaded from {self.persist_dir}")

    def search(self, query_embedding: np.ndarray, top_k: int = 5):
        """
        Return up to ``top_k`` nearest neighbours of the (first) query vector.

        Args:
            query_embedding: A single vector, either 1-D or shaped ``(1, dim)``.
            top_k: Maximum number of neighbours requested from the index.

        Returns:
            A list of ``{"index", "distance", "metadata"}`` dicts in the order
            returned by the index. ``metadata`` is ``None`` when no entry is stored
            for that row.

        Raises:
            ValueError: If the index has not been built or loaded.
        """
        if self.index is None:
            raise ValueError("Index not loaded. Please load or build the index first.")
        query = np.atleast_2d(np.asarray(query_embedding, dtype="float32"))
        distances, indices = self.index.search(query, top_k)

        results = []
        for raw_idx, raw_dist in zip(indices[0], distances[0]):
            idx = int(raw_idx)
            if idx < 0:
                # FAISS fills unused result slots with label -1 when fewer than
                # top_k vectors exist; a negative index would otherwise wrap
                # around and return the last metadata entry.
                continue
            meta = self.metadata[idx] if idx < len(self.metadata) else None
            results.append({"index": idx, "distance": float(raw_dist), "metadata": meta})
        return results

    def query(self, query_text: str, top_k: int = 5):
        """Embed ``query_text`` with the store's model and search the index."""
        print(f"[INFO] Querying for: {query_text}")
        query_embedding = self.model.encode([query_text]).astype('float32')
        return self.search(query_embedding, top_k)
        

# Build the store from the loaded documents, reload it from disk to confirm the
# persisted files round-trip, then run a sample query.
faiss_store = FaissVectorStore()
faiss_store.build_from_documents(all_docs)
faiss_store.load()
query_results = faiss_store.query("What is the return policy?", top_k=3)
print("[INFO] Query Results:")
for rank, hit in enumerate(query_results, start=1):
    print(f"  Result {rank}: Distance={hit['distance']:.4f}")
    snippet = hit["metadata"]["text"][:200]
    print(f"  Text: {snippet}...")

[INFO] Initializing FaissVectorStore with persist_dir: faiss_store
[INFO] Loaded Embedding Model: all-MiniLM-L6-v2
[INFO] Building vectore store from 93 raw documents ....
[INFO] Using embedding model: all-MiniLM-L6-v2
[INFO] Split document into 605 chunks.
[INFO] Generating embeddings for 605 chunks...


Batches: 100%|██████████| 19/19 [00:24<00:00,  1.27s/it]


[INFO] Embeddings shape: (605, 384)
[INFO] Added 605 embeddings to the index.
[INFO] Faiss index and metadata saved to faiss_store
[INFO] Faiss vector store built and saved to faiss_store
[INFO] Faiss index and metadata loaded from faiss_store
[INFO] Querying for: What is the return policy?
[INFO] Query Results:
  Result 1: Distance=0.8916
  Text: - Cancella<on and refund policies vary based on the rate booked and the length of stay. - Guests are advised to review the speciﬁc cancella<on and refund policies at the <me of booking.  Addi7onal Pol...
  Result 2: Distance=0.9201
  Text: Cancellation and Refund Policy

- Cancellation and refund policies vary based on the rate booked and the length of stay.

- Guests are advised to review the specific cancellation and refund policies a...
  Result 3: Distance=0.9856
  Text: Corp Party Times is not responsible for lost, stolen, or damaged personal belongings. Guests are advised to keep their valuables secure at all 2mes.  CancellaDon and Refu

In [13]:
# Second sample query against the same store; shows the top three hits.
query_results = faiss_store.query("What is name of hotel?", top_k=3)
print("[INFO] Query Results:")
for rank, hit in enumerate(query_results, start=1):
    print(f"  Result {rank}: Distance={hit['distance']:.4f}")
    snippet = hit["metadata"]["text"][:200]
    print(f"  Text: {snippet}...")

[INFO] Querying for: What is name of hotel?
[INFO] Query Results:
  Result 1: Distance=0.8548
  Text: With Example Corp Hospitality Group, every journey is an opportunity to create lasCng memories and forge meaningful connecCons.  Embark on an extraordinary adventure with us and experience the epitome...
  Result 2: Distance=0.8920
  Text: Over the decades, Example Corp Hospitality Group has grown through strategic acquisitions and the development of new hotel brands, each one carefully crafted to cater to the unique needs and preferenc...
  Result 3: Distance=0.9177
  Text: Accommodations Designed for Families

Forget about cramped quarters and uncomfortable sleeping arrangements. At Example Corp Family Getaways, our spacious family suites and interconnecting rooms provi...


In [15]:
import os
from dotenv import load_dotenv
from langchain_ollama import ChatOllama

# Pull variables from the project's .env file, then read the Ollama key (None if unset).
load_dotenv(dotenv_path="../.env")
OLLAMA_API_KEY = os.environ.get("OLLAMA_API_KEY")

In [20]:

class RAGSearch:
    """Answer questions by retrieving chunks from a FaissVectorStore and prompting a chat model."""

    def __init__(self, persist_dir: str = "faiss_store", embedding_model: str = "all-MiniLM-L6-v2", llm_model: str = "llama2"):
        """
        Load the persisted vector store from ``persist_dir`` or, when its files are
        missing, build it from the documents under ``data``.

        Args:
            persist_dir: Directory expected to hold ``faiss.index`` and ``metadata.pkl``.
            embedding_model: Embedding model name forwarded to ``FaissVectorStore``.
            llm_model: Model name forwarded to ``ChatOllama``.
        """
        self.vectorstore = FaissVectorStore(persist_dir, embedding_model)
        faiss_path = os.path.join(persist_dir, "faiss.index")
        meta_path = os.path.join(persist_dir, "metadata.pkl")
        if os.path.exists(faiss_path) and os.path.exists(meta_path):
            self.vectorstore.load()
        else:
            # document_loader is an instance method: a DataLoader must be
            # constructed with the directory first, not called on the class.
            docs = DataLoader("data").document_loader()
            self.vectorstore.build_from_documents(docs)
        # NOTE(review): the key is forwarded under the keyword OLLAMA_API_KEY exactly
        # as before; confirm ChatOllama actually consumes a keyword of that name.
        self.llm = ChatOllama(OLLAMA_API_KEY=OLLAMA_API_KEY, model=llm_model)
        print(f"[INFO] OLLAMA LLM initialized: {llm_model}")

    def search_and_summarize(self, query: str, top_k: int = 5) -> str:
        """
        Retrieve the ``top_k`` closest chunks for ``query`` and ask the LLM to answer from them.

        Args:
            query: The user's question.
            top_k: Number of chunks requested from the vector store.

        Returns:
            The ``content`` of the LLM response, or ``"No relevant documents found."``
            when retrieval yields no usable text.
        """
        results = self.vectorstore.query(query, top_k=top_k)
        candidate_texts = (r["metadata"].get("text", "") for r in results if r["metadata"])
        # Drop empty texts so a context made only of separators is not treated as "found".
        texts = [text for text in candidate_texts if text]
        context = "\n\n".join(texts)
        if not context:
            return "No relevant documents found."
        prompt = f"""
                You are a helpful customer support agent. 
                Your goal is to answer the user's question using the provided context and nothing else:
                '{query}'\n\nContext:\n{context}\n\nAnswer:
            """
        response = self.llm.invoke([prompt])
        return response.content
    
# Instantiate with the defaults; this loads the persisted store when its files
# exist and otherwise builds it (see RAGSearch.__init__).
rag_search = RAGSearch()

[INFO] Initializing FaissVectorStore with persist_dir: faiss_store
[INFO] Loaded Embedding Model: all-MiniLM-L6-v2
[INFO] Faiss index and metadata loaded from faiss_store
[INFO] OLLAMA LLM initialized: llama2


In [None]:
# Ask a question end-to-end: retrieve the top 3 chunks and let the LLM answer.
query = "Which hotel is this?"
response = rag_search.search_and_summarize(query=query, top_k=3)
print(f"Response: {response}")

[INFO] Querying for: Which hotel is this?
Response: The hotel is Example Corp Luxury Suites Miami.


In [23]:
# Same retrieval + answer flow for a question about available activities.
query = "Tell me about the activities i can do?"
response = rag_search.search_and_summarize(query=query, top_k=3)
print(f"Response: {response}")

[INFO] Querying for: Tell me about the activities i can do?
Response: There are plenty of fun activities for kids to enjoy! In our Outdoor Adventurers program, they can go on guided nature hikes and camping trips, learning valuable survival skills in the great outdoors. They can also explore their creativity in our Art Studio, where they'll have the chance to work with various mediums like painting, sculpting, digital art, and photography. For the budding scientists and engineers among them, our STEM Lab offers hands-on experiments, coding workshops, and robotics challenges that encourage critical thinking and problem-solving skills in a fun environment. And for those who love to perform, they can explore the world of drama, music, and dance through our Theater Troupe, culminating in an end-of-week performance for family and friends. With our Kids' Clubs and Supervised Activities, you can rest assured that your children are having a blast while developing new skills and interests, all 

In [25]:
# Same retrieval + answer flow for a question about locations.
query = "what are all the locations?"
response = rag_search.search_and_summarize(query=query, top_k=3)
print(f"Response: {response}")

[INFO] Querying for: what are all the locations?
Response: Here are all the locations mentioned in the passage:

1. East Coast locations: New York City (multiple locations), Boston, Philadelphia, Washington D.C., Atlanta, and Miami.
2. Midwest locations: Chicago (multiple locations), Detroit, Minneapolis, Cleveland, Indianapolis, and St. Louis.
3. West Coast locations: Los Angeles (multiple locations), San Francisco, Seattle, Portland, San Diego, and Phoenix.
4. Southern locations: Dallas, Houston, Austin, New Orleans, Nashville, and Charlotte.
5. Airport locations: JFK, LAX, ORD, DFW, ATL, and many more.
