In [1]:
print('hello')

hello


In [19]:
!pip install docx2txt
import os
from typing import List, Any
from pathlib import Path
from glob import glob
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.document_loaders import Docx2txtLoader


Defaulting to user installation because normal site-packages is not writeable
Collecting docx2txt
  Downloading docx2txt-0.9-py3-none-any.whl.metadata (529 bytes)
Downloading docx2txt-0.9-py3-none-any.whl (4.0 kB)
Installing collected packages: docx2txt
Successfully installed docx2txt-0.9


In [24]:
from pathlib import Path
from typing import List, Any
from langchain_community.document_loaders import PyPDFLoader, Docx2txtLoader


class DataLoader:
    def __init__(self, file_path: str):
        self.file_path = file_path

    def document_loader(self) -> List[Any]:
        """
        Reads all the PDF and DOCX files from file_path and returns a list of documents.
        """
        file_path = Path(self.file_path).resolve()
        print(f"[INFO] Loading documents from: {file_path}")
        documents = []

        # --- Load PDF Files ---
        pdf_files = list(file_path.glob("**/*.pdf"))
        print(f"[INFO] Found {len(pdf_files)} PDF files.")

        for pdf_file in pdf_files:
            try:
                loader = PyPDFLoader(str(pdf_file))
                loaded = loader.load()
                print(f"[INFO] Loaded {len(loaded)} pages from {pdf_file.name}")
                documents.extend(loaded)
            except Exception as e:
                print(f"[ERROR] Failed to load {pdf_file.name}: {e}")
                raise

        # --- Load DOCX Files ---
        docx_files = list(file_path.glob("**/*.docx"))
        print(f"[INFO] Found {len(docx_files)} DOCX files.")

        for docx_file in docx_files:
            try:
                loader = Docx2txtLoader(str(docx_file))
                loaded = loader.load()
                print(f"[INFO] Loaded {len(loaded)} pages from {docx_file.name}")
                documents.extend(loaded)
            except Exception as e:
                print(f"[ERROR] Failed to load {docx_file.name}: {e}")
                raise

        return documents


# Example usage
if __name__ == "__main__":
    dl = DataLoader("../content/.")
    all_docs = dl.document_loader()
    print(f"[INFO] Total documents loaded: {len(all_docs)}")


Ignoring wrong pointing object 6 0 (offset 0)
Ignoring wrong pointing object 8 0 (offset 0)
Ignoring wrong pointing object 15 0 (offset 0)
Ignoring wrong pointing object 17 0 (offset 0)
Ignoring wrong pointing object 24 0 (offset 0)
Ignoring wrong pointing object 48 0 (offset 0)


[INFO] Loading documents from: C:\Users\saura\Downloads\Gaurav Files\LangChain\contact-center-rag\content
[INFO] Found 6 PDF files.


Ignoring wrong pointing object 6 0 (offset 0)
Ignoring wrong pointing object 8 0 (offset 0)
Ignoring wrong pointing object 15 0 (offset 0)
Ignoring wrong pointing object 17 0 (offset 0)
Ignoring wrong pointing object 61 0 (offset 0)


[INFO] Loaded 10 pages from Example Corp Hospitality Group.pdf


Ignoring wrong pointing object 6 0 (offset 0)
Ignoring wrong pointing object 8 0 (offset 0)
Ignoring wrong pointing object 15 0 (offset 0)
Ignoring wrong pointing object 17 0 (offset 0)
Ignoring wrong pointing object 27 0 (offset 0)


[INFO] Loaded 15 pages from Example Corp Family Getaways.pdf


Ignoring wrong pointing object 6 0 (offset 0)
Ignoring wrong pointing object 8 0 (offset 0)
Ignoring wrong pointing object 15 0 (offset 0)
Ignoring wrong pointing object 17 0 (offset 0)
Ignoring wrong pointing object 55 0 (offset 0)


[INFO] Loaded 17 pages from Example Corp Luxury Suites.pdf


Ignoring wrong pointing object 6 0 (offset 0)
Ignoring wrong pointing object 8 0 (offset 0)
Ignoring wrong pointing object 15 0 (offset 0)
Ignoring wrong pointing object 17 0 (offset 0)
Ignoring wrong pointing object 27 0 (offset 0)


[INFO] Loaded 13 pages from Example Corp Party Times.pdf


Ignoring wrong pointing object 6 0 (offset 0)
Ignoring wrong pointing object 8 0 (offset 0)
Ignoring wrong pointing object 15 0 (offset 0)
Ignoring wrong pointing object 17 0 (offset 0)
Ignoring wrong pointing object 64 0 (offset 0)


[INFO] Loaded 15 pages from Example Corp Seaside Resorts.pdf
[INFO] Loaded 17 pages from Example Corp Waypoint Inns.pdf
[INFO] Found 6 DOCX files.
[INFO] Loaded 1 pages from Example Corp Hospitality Group.docx
[INFO] Loaded 1 pages from Example Corp Family Getaways.docx
[INFO] Loaded 1 pages from Example Corp Luxury Suites.docx
[INFO] Loaded 1 pages from Example Corp Party Times.docx
[INFO] Loaded 1 pages from Example Corp Seaside Resorts.docx
[INFO] Loaded 1 pages from Example Corp Waypoint Inns.docx
[INFO] Total documents loaded: 93


In [29]:
from langchain_text_splitters import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer
import numpy as np


class EmbeddingManager:
    def __init__(self, embedding_model: str = "all-MiniLM-L6-v2", chunk_size: int= 1000, chunk_overlap: int= 200):
        self.embedding_model = SentenceTransformer(embedding_model)
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        print(f"[INFO] Using embedding model: {embedding_model}")

    def chunk_text(self, document: str) -> List[str]:
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=self.chunk_size, 
            chunk_overlap=self.chunk_overlap,
            length_function=len,
            separators=["\n\n", "\n", " ", ""])
        texts = text_splitter.split_documents(document)
        print(f"[INFO] Split document into {len(texts)} chunks.")
        return texts
    
    def embed_chunks(self, chunks: List[Any]) -> np.ndarray:
        texts = [chunk.page_content for chunk in chunks]
        print(f"[INFO] Generating embeddings for {len(texts)} chunks...")
        embeddings = self.embedding_model.encode(texts, show_progress_bar=True)
        print(f"[INFO] Embeddings shape: {embeddings.shape}")
        return embeddings

em= EmbeddingManager()
chunks = em.chunk_text(all_docs)
embeddings= em.embed_chunks(chunks)
print("[INFO] Example embedding:", embeddings[0] if len(embeddings) > 0 else None)

[INFO] Using embedding model: all-MiniLM-L6-v2
[INFO] Split document into 605 chunks.
[INFO] Generating embeddings for 605 chunks...


Batches: 100%|██████████| 19/19 [00:22<00:00,  1.16s/it]

[INFO] Embeddings shape: (605, 384)
[INFO] Example embedding: [ 3.44330110e-02 -3.39309014e-02 -4.00605090e-02 -9.94981732e-03
  5.08219711e-02 -1.69940777e-02 -3.77718955e-02 -5.98174036e-02
  5.30437641e-02  1.58252232e-02  2.86705550e-02  1.03579890e-02
  3.45988981e-02 -5.03477342e-02  3.77396606e-02 -8.13127533e-02
  2.69953236e-02 -8.40371698e-02 -5.64725250e-02 -3.42965983e-02
 -5.13245314e-02  3.94952521e-02 -6.79766685e-02  4.12578806e-02
 -7.17843547e-02  4.89981286e-02 -5.61770760e-02  6.97996616e-02
  5.69278598e-02 -1.08932279e-01 -1.41089307e-02  1.85071249e-02
  1.14520773e-01  5.75467944e-02  6.80644512e-02  9.35693830e-02
 -2.12627631e-02 -4.11354117e-02 -2.76304241e-02  3.91070507e-02
  8.31266213e-03 -4.72312532e-02  2.23568804e-03 -3.86026390e-02
 -5.97500578e-02 -6.37177676e-02  1.93466782e-03  5.67272976e-02
  3.68588939e-02  1.47044724e-02  1.74112357e-02 -6.17548823e-02
  6.82596350e-03  2.11318284e-02 -3.27931345e-02 -7.15029240e-02
 -3.13570611e-02 -3.17981802


