# Data Ingestion

### Document Structure

In [1]:
# Document: LangChain's container for a piece of text (page_content) plus its metadata
from langchain_core.documents import Document

In [2]:
# Wrap a piece of text and its descriptive fields in a LangChain Document
doc = Document(
    metadata=dict(
        source="/content/drive/MyDrive/RAG/rag_pipline_project_1/example.txt",
        pages=1,
        author="Denuwan",
        date_created="2025-01-01",
    ),
    page_content="This is main content and this use to create RAG",
)

# Last expression of the cell: shows the Document repr as the cell output
doc

Document(metadata={'source': '/content/drive/MyDrive/RAG/rag_pipline_project_1/example.txt', 'pages': 1, 'author': 'Denuwan', 'date_created': '2025-01-01'}, page_content='This is main content and this use to create RAG')

In [3]:
# Make sure the folder that will hold the sample .txt files exists
import os

os.makedirs(
    name="/content/drive/MyDrive/RAG/rag_pipline_project_1/data/text_files",
    exist_ok=True,  # do not fail when the folder is already there
)

In [4]:
# Map each target file path to the text that should be written into it
sample_text = {
    "/content/drive/MyDrive/RAG/rag_pipline_project_1/data/text_files/python_intro.txt": """Python is a programming language that lets you work quickly and integrate systems more effectively.""",
}

# Write every entry to disk as UTF-8, overwriting any existing file
for target_path in sample_text:
    with open(target_path, mode="w", encoding="utf-8") as out_file:
        out_file.write(sample_text[target_path])

print("Sample text files created!")

Sample text files created!


In [5]:
# Use the %pip magic (not !pip) so the package is installed into the environment of the running kernel
%pip install langchain_community



In [6]:
# Loaders that read files into Document objects (loaders exist for many file types)
# See https://python.langchain.com/ for more details

# from langchain.document_loaders import TextLoader
from langchain_community.document_loaders import TextLoader

# Directory Loader
from langchain_community.document_loaders import DirectoryLoader

# PDF Loader
from langchain_community.document_loaders import PyPDFLoader, PyMuPDFLoader

In [7]:
# Read the sample text file back; load() returns a list of Document objects
loader = TextLoader(
    file_path="/content/drive/MyDrive/RAG/rag_pipline_project_1/data/text_files/python_intro.txt",
    encoding="utf-8",
)
document = loader.load()
print(document)

[Document(metadata={'source': '/content/drive/MyDrive/RAG/rag_pipline_project_1/data/text_files/python_intro.txt'}, page_content='Python is a programming language that lets you work quickly and integrate systems more effectively.')]


In [8]:

# Load every .txt file found (recursively) under the text_files folder
dir_loader = DirectoryLoader(
    path="/content/drive/MyDrive/RAG/rag_pipline_project_1/data/text_files",
    glob="**/*.txt",                       # which files to pick up
    loader_cls=TextLoader,                 # loader applied to each matched file
    loader_kwargs=dict(encoding="utf-8"),  # forwarded to the loader class
    show_progress=True,                    # display a progress bar while loading
)

documents = dir_loader.load()
documents

100%|██████████| 1/1 [00:00<00:00, 179.43it/s]


[Document(metadata={'source': '/content/drive/MyDrive/RAG/rag_pipline_project_1/data/text_files/python_intro.txt'}, page_content='Python is a programming language that lets you work quickly and integrate systems more effectively.')]

In [9]:
# Explicit %pip magic: does not rely on automagic and installs into the running kernel's environment
%pip install pymupdf

Collecting pymupdf
  Downloading pymupdf-1.26.7-cp310-abi3-manylinux_2_28_x86_64.whl.metadata (3.4 kB)
Downloading pymupdf-1.26.7-cp310-abi3-manylinux_2_28_x86_64.whl (24.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.1/24.1 MB[0m [31m79.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pymupdf
Successfully installed pymupdf-1.26.7


In [10]:
# Load every .pdf file found (recursively) under the pdf folder
dir_loader = DirectoryLoader(
    path="/content/drive/MyDrive/RAG/rag_pipline_project_1/data/pdf",
    glob="**/*.pdf",           # which files to pick up
    loader_cls=PyMuPDFLoader,  # loader applied to each matched file
    show_progress=False,       # progress bar disabled for this run
)

pdf_documents = dir_loader.load()
pdf_documents

[Document(metadata={'producer': 'Adobe PDF Library 9.9', 'creator': 'Adobe InDesign CS5 (7.0)', 'creationdate': '2011-03-08T19:02:03+08:00', 'source': '/content/drive/MyDrive/RAG/rag_pipline_project_1/data/pdf/Thermal-imaging-cameras-testing-solar-panels.pdf', 'file_path': '/content/drive/MyDrive/RAG/rag_pipline_project_1/data/pdf/Thermal-imaging-cameras-testing-solar-panels.pdf', 'total_pages': 4, 'format': 'PDF 1.5', 'title': '', 'author': '', 'subject': '', 'keywords': '', 'moddate': '2011-03-08T19:02:05+08:00', 'trapped': '', 'modDate': "D:20110308190205+08'00'", 'creationDate': "D:20110308190203+08'00'", 'page': 0}, page_content="Technical note\nwww.flir.com\nIn the field of research and development \n(R&D) thermal imaging cameras are already \nan established tool for the evaluation of \nsolar cells and panels. For these sophisticated \nmeasurements, usually high performance \ncameras with cooled detectors are used \nunder controlled laboratory conditions.\nHowever, the use of the

### Splitting text into chunks

In [11]:
# NOTE(review): this cell holds only commented-out imports, so no text splitting
# is performed in the cells shown here, yet a later cell references `chunks`.
# TODO: implement the splitting step (or remove this placeholder).
# import os
# from langchain_community.document_loaders import PyPDFLoader, PyMuPDFLoader
# from langchain.text_splitter import RecursiveCharacterTextSplitter
# from pathlib import Path

### Embedding & VectorStoreDB

In [12]:
# Explicit %pip magic: does not rely on automagic and installs into the running kernel's environment
%pip install chromadb

Collecting chromadb
  Downloading chromadb-1.4.0-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.2 kB)
Collecting build>=1.0.3 (from chromadb)
  Downloading build-1.3.0-py3-none-any.whl.metadata (5.6 kB)
Collecting pybase64>=1.4.1 (from chromadb)
  Downloading pybase64-1.4.3-cp312-cp312-manylinux1_x86_64.manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_5_x86_64.whl.metadata (8.7 kB)
Collecting posthog<6.0.0,>=2.4.0 (from chromadb)
  Downloading posthog-5.4.0-py3-none-any.whl.metadata (5.7 kB)
Collecting onnxruntime>=1.14.1 (from chromadb)
  Downloading onnxruntime-1.23.2-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (5.1 kB)
Collecting opentelemetry-exporter-otlp-proto-grpc>=1.2.0 (from chromadb)
  Downloading opentelemetry_exporter_otlp_proto_grpc-1.39.1-py3-none-any.whl.metadata (2.5 kB)
Collecting pypika>=0.48.9 (from chromadb)
  Downloading PyPika-0.48.9.tar.gz (67 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m

In [13]:
import numpy as np
from sentence_transformers import SentenceTransformer
import chromadb
from chromadb.config import Settings
import uuid
from typing import List, Dict, Any, Tuple
from sklearn.metrics.pairwise import cosine_similarity

In [14]:
## Handles document embedding generation using SentenceTransformer
class EmbeddingManager:
    """
    Generate sentence embeddings with a SentenceTransformer model.

    The model named by `model_name` is loaded once, when the manager is
    constructed, and reused by every `generate_embeddings` call.
    """

    def __init__(self, model_name: str = "all-MiniLM-L6-v2"):
        """
        Initialize the Embedding Manager and load the model.

        Args:
            model_name: `HuggingFace model` name for sentence embeddings
            (the HuggingFace Model Hub gives access to pre-trained models).

        Raises:
            Exception: any error raised while loading the model is re-raised.
        """
        self.model_name = model_name
        self.model = None
        self._load_model()  # leading underscore: internal helper by convention

    # Load the SentenceTransformer model
    def _load_model(self) -> None:
        """Load `self.model_name` into `self.model`; log and re-raise on failure."""
        try:
            print(f"Load embedding model: {self.model_name}")
            self.model = SentenceTransformer(self.model_name)
            print(f"Model loaded successfully. Embedding dimention: {self.model.get_sentence_embedding_dimension()}")       # Get the embedding dimension of the model
        except Exception as e:
            print(f"Error loading model {self.model_name}: {e}")
            raise

    # Generate embedding for a list of texts
    def generate_embeddings(self, texts: List[str]) -> np.ndarray:
        """
        Args:
            texts - List of text string to embed

        Returns:
            numpy array of embedding with shape(len(text), embedding_dim)

        Raises:
            ValueError: if no model has been loaded
        """
        # Compare against None explicitly: a truthiness test would consult the
        # model object's __len__/__bool__ and could reject a loaded model.
        if self.model is None:
            raise ValueError("Model not loaded")

        print(f"Generating embedded for {len(texts)} texts ...")
        if len(texts) == 0:
            # Nothing to encode: skip the model call and keep the documented
            # 2-D (len(texts), embedding_dim) shape.
            embedding_dim = self.model.get_sentence_embedding_dimension() or 0
            embeddings = np.empty((0, embedding_dim), dtype=np.float32)
        else:
            embeddings = self.model.encode(texts, show_progress_bar=True)
        print(f"Generating embedded with shape: {embeddings.shape}")
        return embeddings


# Initialize with the default model name ("all-MiniLM-L6-v2"); constructing the
# manager loads the model immediately.
# NOTE(review): the saved output of this cell ends in KeyboardInterrupt, so the
# load did not finish in the recorded run.
embeddings_manager = EmbeddingManager()
embeddings_manager

Load embedding model: all-MiniLM-L6-v2


KeyboardInterrupt: 

### VectorStore

In [None]:
## Manage documents embedding in a ChromaDB vectorStore
class VectorStore:
    """
    Store documents and their embeddings in a persistent ChromaDB collection.

    The client and collection are created (or reopened) when the store is
    constructed; `add_documents` appends documents with their embeddings.
    """

    # Initialize the vector store
    def __init__(self, collection_name: str = "pdf_documents", persist_directory: str = "../data/vector_store"):
        """
        Args:
            collection_name: name of the ChromaDB collection to get or create
            persist_directory: directory handed to the persistent client

        Raises:
            Exception: any error raised while initializing the store is re-raised.
        """
        self.collection_name = collection_name
        self.persist_directory = persist_directory
        self.client = None
        self.collection = None
        self._initialize_store()

    # Initialize ChromaDB client and collection
    def _initialize_store(self) -> None:
        """Create the persist directory, the client and the collection."""
        try:
            # create persistent ChromaDB client
            os.makedirs(self.persist_directory, exist_ok=True)
            self.client = chromadb.PersistentClient(path=self.persist_directory)

            # Get or create collection (groups related embeddings together, like a folder)
            self.collection = self.client.get_or_create_collection(
                name=self.collection_name,
                metadata={"description": "PDF document embedding for RAG"},
            )

            print(f"Vector store initialized. Collection: {self.collection_name}")
            print(f"Existing documents in collection: {self.collection.count()}")

        except Exception as e:
            print(f"Error initializing vector store: {e}")
            # Re-raise so a failed setup is not hidden behind self.collection = None,
            # which would only surface later as an AttributeError.
            raise

    ## Add documents and their embeddings to the vector store
    def add_documents(self, documents: List[Any], embeddings: np.ndarray):
        """
        Args:
            documents: List of LangChain documents (objects exposing
                `metadata` and `page_content`)
            embeddings: one embedding row per document, in the same order

        Raises:
            ValueError: if the number of documents and embeddings differ
            Exception: any error raised by the collection while adding is re-raised
        """
        if len(documents) != len(embeddings):
            raise ValueError("Number of documents must match number of embeddings")

        if len(documents) == 0:
            # Nothing to store; skip the call to the collection.
            print("No documents to add to vector store")
            return

        print(f"Adding {len(documents)} documents to vector store...")

        # prepare data for chromaDB
        ids = []
        metadatas = []
        documents_text = []
        embeddings_list = []

        for i, (doc, embedding) in enumerate(zip(documents, embeddings)):

            # Unique id: first 8 hex characters of a random uuid4 plus the
            # position in this batch
            ids.append(f"doc_{uuid.uuid4().hex[:8]}_{i}")

            # Copy the metadata so the caller's document is not modified
            metadata = dict(doc.metadata)
            metadata['doc_index'] = i
            metadata['content_length'] = len(doc.page_content)
            metadatas.append(metadata)

            # Document content
            documents_text.append(doc.page_content)

            # Embedding as a plain Python list; np.asarray lets rows be either
            # numpy arrays or ordinary sequences
            embeddings_list.append(np.asarray(embedding).tolist())

        # Add to collection
        try:
            self.collection.add(
                ids=ids,
                embeddings=embeddings_list,
                metadatas=metadatas,
                documents=documents_text,
            )

            print(f"Successfully added {len(documents)} documents to vector store")
            print(f"Total documents in collection: {self.collection.count()}")

        except Exception as e:
            print(f"Error adding documents to vector store: {e}")
            raise

# Create the store with its defaults (collection "pdf_documents", persisted under
# "../data/vector_store"); the constructor initializes the client and collection.
# NOTE(review): the default persist directory is a relative path, unlike the
# absolute Drive paths used by the loading cells — confirm this is intended.
vectorstore = VectorStore()
vectorstore

In [None]:
# NOTE(review): `chunks` is not assigned in any cell visible above; on a fresh
# kernel this raises NameError unless it is defined elsewhere. TODO: produce it
# in the text-splitting step before displaying it here.
chunks