<a href="https://colab.research.google.com/github/divyasingh2611/RAG-Fast-Api/blob/main/faissdb_finance_report.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive into the Colab runtime; the PDF folder and the FAISS
# index path used later in this notebook both live under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Install/upgrade the Python packages for this notebook
# (-qq: minimal output, -U: upgrade to the latest available release).
!pip install langchain langchain_community pdfplumber faiss-cpu torch accelerate pytesseract huggingface_hub -qq -U
!pip install bitsandbytes transformers xformers -qq -U

In [None]:
# System packages for OCR: the Tesseract engine (invoked through pytesseract
# in the extraction cell) and its development library.
!sudo apt install tesseract-ocr
!sudo apt install libtesseract-dev

# Storing API keys

In [None]:
# Read credentials through Colab's `userdata` API instead of hard-coding them.
# NOTE(review): neither key is referenced in the cells visible here — confirm
# where they are consumed.
from google.colab import userdata
key_1=userdata.get('huggingface')  # value stored under the name 'huggingface'
key_2=userdata.get('gemini')  # value stored under the name 'gemini'

# Extracting Text and Tables from PDFs

In [None]:
import os
import pdfplumber
import pytesseract
from PIL import Image
import csv
from io import StringIO


# Path to your folder containing PDFs
pdf_folder = "/content/drive/MyDrive/work/data"
# output_folder= "/content/drive/MyDrive/work/output_folder"

# Ensure output folder exists
# NOTE(review): this creates "output_folder" relative to the current working
# directory, not the Drive path commented out above, and nothing visible in
# this notebook writes to it — confirm it is still needed.
os.makedirs("output_folder", exist_ok=True)

def clamp_bbox(bbox, page_width, page_height):
    """Return *bbox* with each coordinate limited to the page rectangle.

    Args:
        bbox: ``(x0, top, x1, bottom)`` coordinates of a region.
        page_width: Upper limit applied to the horizontal coordinates.
        page_height: Upper limit applied to the vertical coordinates.

    Returns:
        A 4-tuple ``(x0, top, x1, bottom)`` where every value lies between
        0 and the corresponding page dimension.
    """
    def _limit(value, upper):
        # Cap at the page edge first, then lift negatives up to 0.
        return max(0, min(value, upper))

    left, upper_edge, right, lower_edge = bbox
    return (
        _limit(left, page_width),
        _limit(upper_edge, page_height),
        _limit(right, page_width),
        _limit(lower_edge, page_height),
    )

def extract_text_and_tables_from_pdf(pdf_path):
    """Extract page text, tables, and OCR text of embedded images from one PDF.

    For every page the result contains, in order: a ``--- Page N ---`` header,
    the text returned by ``page.extract_text()`` (when non-empty), each table
    from ``page.extract_tables()`` rendered as comma-separated rows, the
    Tesseract OCR output for each entry of ``page.images`` (when non-blank),
    and a trailing blank line.

    Args:
        pdf_path: Path of the PDF file, passed to ``pdfplumber.open``.

    Returns:
        One string holding the labelled content of all pages.
    """
    # Collect fragments and join once at the end rather than repeatedly
    # growing one large string.
    parts = []

    with pdfplumber.open(pdf_path) as pdf:
        for page_number, page in enumerate(pdf.pages, start=1):
            parts.append(f"--- Page {page_number} ---\n")

            # Extract text from the page
            text = page.extract_text()
            if text:
                parts.append("Extracted Text:\n")
                parts.append(text + "\n")

            # Extract tables and format them as comma-separated rows
            for table_number, table in enumerate(page.extract_tables(), start=1):
                parts.append(f"Table {table_number} on Page {page_number}:\n")
                for row in table:
                    # Empty cells come back as None; render them as "".
                    parts.append(
                        ", ".join(str(cell) if cell is not None else "" for cell in row) + "\n"
                    )

            # Extract images and perform OCR on them
            for image_number, image in enumerate(page.images, start=1):
                size_warning = (
                    f"Warning: Invalid image size in Page {page_number}, Image {image_number}\n"
                )

                # Clamp the bounding box to stay within the page boundaries
                x0, top, x1, bottom = clamp_bbox(
                    (image["x0"], image["top"], image["x1"], image["bottom"]),
                    page.width,
                    page.height,
                )

                # A box with no area (e.g. an image lying entirely off the
                # page collapses to an edge after clamping) is rejected by
                # page.crop() with an exception; report it and keep going
                # instead of aborting the whole document.
                if x1 <= x0 or bottom <= top:
                    parts.append(size_warning)
                    continue

                # Render the cropped region to a PIL image for Tesseract
                pil_image = page.crop((x0, top, x1, bottom)).to_image().original

                if pil_image.width > 0 and pil_image.height > 0:
                    ocr_text = pytesseract.image_to_string(pil_image)
                    # Skip images where OCR found nothing but whitespace
                    if ocr_text.strip():
                        parts.append(
                            f"OCR Text from Image {image_number} on Page {page_number}:\n"
                        )
                        parts.append(ocr_text + "\n")
                else:
                    parts.append(size_warning)

            parts.append("\n")

    return "".join(parts)

# Process all PDFs in the folder and accumulate the extracted content in a variable.
# os.listdir() returns names in arbitrary order, so the names are sorted to keep
# the combined text (and the chunks derived from it) reproducible between runs.
# The extension check is case-insensitive so files named "*.PDF" are not skipped.
pdf_texts = []
for file in sorted(os.listdir(pdf_folder)):
    if file.lower().endswith(".pdf"):
        pdf_path = os.path.join(pdf_folder, file)
        pdf_text = extract_text_and_tables_from_pdf(pdf_path)
        pdf_texts.append(pdf_text)

# Join once instead of repeatedly extending one large string
all_pdf_text = "".join(pdf_texts)

# Output the combined result as a single string
print(all_pdf_text)

# Chunking

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.docstore.document import Document

In [None]:
# Split the combined PDF text into overlapping chunks (overlap is ~10% of the
# chunk size) so that content cut at a boundary still appears whole in one chunk.
# NOTE(review): sizes are presumably counted in characters (splitter default) —
# confirm that ~4096-character chunks fit the embedding model's maximum input
# length, otherwise the tail of each chunk may be truncated when embedded.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=4096, chunk_overlap=409)
chunks = text_splitter.split_text(all_pdf_text)

# Convert chunks (strings) to Document objects
docs = [Document(page_content=chunk) for chunk in chunks]


# Embeddings

In [None]:
from langchain_community.embeddings import HuggingFaceEmbeddings
# FAISS is imported from langchain_community (installed above and already used
# for the embeddings); `langchain.vectorstores` is only a deprecated re-export.
from langchain_community.vectorstores import FAISS
import os

In [None]:
# Creating a vectorstore folder to store the FAISS vector database embeddings
vectorstore_path ='/content/drive/MyDrive/work'
db_faiss_path = os.path.join(vectorstore_path, 'db_faiss')

# Create the directories
# (exist_ok=True makes re-running this cell harmless; parent folders are created too)
os.makedirs(db_faiss_path, exist_ok=True)
# Verify directory creation
# As the last expression of the cell, this (bool, bool) tuple is shown as the cell output.
os.path.exists(vectorstore_path), os.path.exists(db_faiss_path)

In [None]:
# Log in to the Hugging Face Hub from within the notebook.
# NOTE(review): key_1 (the 'huggingface' value read earlier) is not used here —
# confirm whether the login is meant to use that stored token instead.
from huggingface_hub import notebook_login
notebook_login()

In [None]:
import torch

# Use the GPU when the runtime has one; a hard-coded 'cuda' device makes model
# loading fail on a CPU-only runtime.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
embeddings = HuggingFaceEmbeddings(model_name='FinLang/finance-embeddings-investopedia',model_kwargs={'device': device})

# Building an index from zero documents fails deep inside the vector store with
# an unhelpful error, so stop early with an actionable message instead.
if not docs:
    raise ValueError("No document chunks to index; check that PDFs were found and text was extracted.")

# Embed every chunk, build the FAISS index, and persist it to Drive
db = FAISS.from_documents(docs, embeddings)
db.save_local(db_faiss_path)
print("Saved into the vector database")