<a href="https://colab.research.google.com/github/ganeshbulagondla/content/blob/main/Task1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os
!pip install pyPDF2
!pip install sentence_transformers
!pip install langchain
!pip install langchain-community
from PyPDF2 import PdfReader
from sentence_transformers import SentenceTransformer
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter

def extract_text_from_pdf(pdf_path):
    """
    Extracts text from a PDF file.

    The text of every page is concatenated in page order, with no separator
    inserted between pages.

    Args:
        pdf_path (str): Path to the PDF file.

    Returns:
        str: Extracted text from the PDF. A page that yields no text
            contributes an empty string.
    """
    reader = PdfReader(pdf_path)
    # `or ""` keeps a page whose extraction yields None (or an empty string)
    # from raising TypeError during concatenation; join builds the result in
    # one pass instead of growing a string page by page.
    return "".join(page.extract_text() or "" for page in reader.pages)
def chunk_text(text, chunk_size=500, chunk_overlap=50):
    """
    Breaks a text into overlapping chunks.

    The work is delegated to a RecursiveCharacterTextSplitter configured
    with the given size and overlap.

    Args:
        text (str): The input text.
        chunk_size (int): The size of each chunk.
        chunk_overlap (int): The number of overlapping characters between chunks.

    Returns:
        List[str]: List of text chunks.
    """
    splitter_options = {
        "chunk_size": chunk_size,
        "chunk_overlap": chunk_overlap,
    }
    text_splitter = RecursiveCharacterTextSplitter(**splitter_options)
    pieces = text_splitter.split_text(text)
    return pieces
def generate_embeddings(chunks, embedding_model):
    """
    Encodes each text chunk into a vector, one chunk at a time.

    Args:
        chunks (List[str]): List of text chunks.
        embedding_model (SentenceTransformer): Pre-trained embedding model;
            only its ``encode`` method is used.

    Returns:
        List[np.ndarray]: One embedding per chunk, in the same order as
            ``chunks``.
    """
    vectors = []
    for piece in chunks:
        # One encode() call per chunk, preserving input order.
        vectors.append(embedding_model.encode(piece))
    return vectors
def store_embeddings_in_faiss(chunks, embeddings, embedding_model_name="sentence-transformers/all-MiniLM-L6-v2"):
    """
    Stores text chunks and their embeddings in a FAISS vector database.

    The pre-computed ``embeddings`` are indexed directly, so the chunks are
    not embedded a second time. A HuggingFaceEmbeddings instance for
    ``embedding_model_name`` is attached to the store as its embedding
    function.

    Args:
        chunks (List[str]): List of text chunks.
        embeddings (List[np.ndarray]): Corresponding embeddings for the chunks,
            one per chunk and in the same order. If None, the chunks are
            embedded by the store itself.
        embedding_model_name (str): Name of the embedding model attached to
            the store. It should be the model that produced ``embeddings``.

    Returns:
        FAISS: A FAISS vector database instance.

    Raises:
        ValueError: If ``chunks`` is empty, or if the number of embeddings
            does not match the number of chunks.
    """
    # NOTE(review): the install cell in this notebook does not install a faiss
    # package; confirm faiss-cpu (or faiss-gpu) is available in the runtime.
    chunks = list(chunks)
    if not chunks:
        raise ValueError("No text chunks to store in the FAISS index.")

    embedder = HuggingFaceEmbeddings(model_name=embedding_model_name)

    if embeddings is None:
        # No pre-computed vectors supplied: let the store embed the texts.
        return FAISS.from_texts(texts=chunks, embedding=embedder)

    embeddings = list(embeddings)
    if len(embeddings) != len(chunks):
        raise ValueError(
            f"Got {len(embeddings)} embeddings for {len(chunks)} chunks; "
            "expected exactly one embedding per chunk."
        )

    return FAISS.from_embeddings(
        text_embeddings=list(zip(chunks, embeddings)),
        embedding=embedder,
    )


def data_ingestion_pipeline(pdf_dir, embedding_model_name="sentence-transformers/all-MiniLM-L6-v2"):
    """
    Executes the data ingestion pipeline: text extraction, chunking, embedding generation, and storage.

    Every file in ``pdf_dir`` whose name ends in ".pdf" (case-insensitive) is
    processed, in sorted filename order. Subdirectories are not traversed.

    Args:
        pdf_dir (str): Path to the directory containing PDF files.
        embedding_model_name (str): Name of the pre-trained embedding model.
            The same name is used both to embed the chunks and for the
            embedding function attached to the returned store.

    Returns:
        FAISS: A FAISS vector database instance with stored embeddings.

    Raises:
        ValueError: If no text chunks were produced from ``pdf_dir``.
    """
    embedding_model = SentenceTransformer(embedding_model_name)
    all_chunks = []

    # os.listdir() returns entries in arbitrary order; sorting makes the
    # chunk order (and therefore the index contents) reproducible.
    for pdf_file in sorted(os.listdir(pdf_dir)):
        # Case-insensitive match so files named e.g. "REPORT.PDF" are included.
        if not pdf_file.lower().endswith(".pdf"):
            continue

        pdf_path = os.path.join(pdf_dir, pdf_file)
        print(f"Processing: {pdf_path}")

        text = extract_text_from_pdf(pdf_path)
        all_chunks.extend(chunk_text(text))

    # Fail early with a clear message instead of an obscure error from the
    # vector store when there is nothing to index.
    if not all_chunks:
        raise ValueError(f"No text chunks were produced from PDF files in {pdf_dir!r}.")

    embeddings = generate_embeddings(all_chunks, embedding_model)
    faiss_db = store_embeddings_in_faiss(
        all_chunks,
        embeddings,
        embedding_model_name=embedding_model_name,
    )

    return faiss_db
# Script entry point: runs the ingestion pipeline only when this file is
# executed directly, not when it is imported as a module.
if __name__ == "__main__":
    # Placeholder location; replace with the directory that holds the PDFs.
    pdf_directory = "path/to/your/pdf/files"
    vector_db = data_ingestion_pipeline(pdf_directory)
    print("Data ingestion completed. Vector database is ready.")

Collecting langchain-community
  Downloading langchain_community-0.3.13-py3-none-any.whl.metadata (2.9 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain-community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting httpx-sse<0.5.0,>=0.4.0 (from langchain-community)
  Downloading httpx_sse-0.4.0-py3-none-any.whl.metadata (9.0 kB)
Collecting langchain<0.4.0,>=0.3.13 (from langchain-community)
  Downloading langchain-0.3.13-py3-none-any.whl.metadata (7.1 kB)
Collecting langchain-core<0.4.0,>=0.3.27 (from langchain-community)
  Downloading langchain_core-0.3.28-py3-none-any.whl.metadata (6.3 kB)
Collecting pydantic-settings<3.0.0,>=2.4.0 (from langchain-community)
  Downloading pydantic_settings-2.7.0-py3-none-any.whl.metadata (3.5 kB)
Collecting marshmallow<4.0.0,>=3.18.0 (from dataclasses-json<0.7,>=0.5.7->langchain-community)
  Downloading marshmallow-3.23.2-py3-none-any.whl.metadata (7.1 kB)
Collecting typing-inspect<1,>=0.4.0 (from dataclasses-