In [12]:
# !pip install pdf2image paddleocr sentence-transformers faiss-cpu transformers pandas opencv-python-headless paddlepaddle
#!apt-get update
#!apt-get install -y poppler-utils

# Import Libraries
import os
from pdf2image import convert_from_path
from paddleocr import PaddleOCR
from sentence_transformers import SentenceTransformer
from transformers import pipeline
import faiss
import numpy as np
import json
import pandas as pd
from google.colab import files
import cv2
import re

# Initialize PaddleOCR
ocr = PaddleOCR(use_angle_cls=True, lang="en")

# Step 1: Extract Text from All Pages in a PDF
def extract_text_from_all_pages(pdf_path):
    """Extracts text from all PDF pages using OCR."""
    images = convert_from_path(pdf_path)
    page_texts = {}

    for i, image in enumerate(images):
        page_number = i + 1
        image_path = f"page_{page_number}.jpg"
        image.save(image_path)

        # Perform OCR on the page
        result = ocr.ocr(image_path, cls=True)
        os.remove(image_path)  # Clean up

        # Process OCR results
        page_data = ""
        for res in result[0]:
            if len(res) > 1:
                text, confidence = res[1]
                if confidence > 0.5:  # Filter low-confidence text
                    page_data += text + " "
        clean_text = re.sub(r'\s+', ' ', page_data.strip())
        page_texts[page_number] = clean_text if clean_text else "No text found."

    return page_texts

# Step 2: Create Embeddings for Each Page
def create_page_embeddings(page_texts, model_name="sentence-transformers/all-MiniLM-L6-v2"):
    """Generates embeddings for each page using SentenceTransformer."""
    embedding_model = SentenceTransformer(model_name)
    page_embeddings = {}

    for page_number, text in page_texts.items():
        sentences = [line.strip() for line in text.split("\n") if line.strip()]
        embeddings = embedding_model.encode(sentences, convert_to_tensor=False)
        page_embeddings[page_number] = (sentences, embeddings)
    return embedding_model, page_embeddings

# Step 3: Build FAISS Vector Store for Each Page
def build_faiss_index_for_pages(page_embeddings):
    """Builds FAISS indices for each page's embeddings."""
    faiss_indices = {}
    for page_number, (sentences, embeddings) in page_embeddings.items():
        if len(embeddings) > 0:
            dimension = embeddings.shape[1]
            index = faiss.IndexFlatL2(dimension)
            index.add(np.array(embeddings))
            faiss_indices[page_number] = index
    return faiss_indices

# Step 4: Query-Specific Page or Entire Document
def retrieve_context(query, page_number, page_embeddings, faiss_indices, embedding_model, top_k=5):
    """Retrieves relevant context from a specific page or all pages."""
    results = {}

    if page_number == "all":
        for pg_num, (sentences, embeddings) in page_embeddings.items():
            index = faiss_indices[pg_num]
            query_embedding = embedding_model.encode([query], convert_to_tensor=True)
            distances, indices = index.search(np.array(query_embedding), top_k)
            retrieved_sentences = [sentences[idx] for idx in indices[0]]
            results[pg_num] = retrieved_sentences
    else:
        page_number = int(page_number)
        sentences, embeddings = page_embeddings[page_number]
        index = faiss_indices[page_number]
        query_embedding = embedding_model.encode([query], convert_to_tensor=True)
        distances, indices = index.search(np.array(query_embedding), top_k)
        retrieved_sentences = [sentences[idx] for idx in indices[0]]
        results[page_number] = retrieved_sentences

    return results

# Step 5: Generate Answer using Open-Source LLM
def generate_answer(context, query, llm_model="google/flan-t5-large"):
    """Generates an answer using an open-source LLM and retrieved context."""
    generator = pipeline("text2text-generation", model=llm_model)
    input_text = f"Context: {' '.join(context)} \n\nQuestion: {query}"
    result = generator(input_text, max_length=200, truncation=True)
    return result[0]['generated_text']

# Step 6: Main Workflow
uploaded = files.upload()
for filename in uploaded.keys():
    print(f"Processing file: {filename}")

    try:
        # Extract text from all pages
        print("Extracting text from all pages using OCR...")
        page_texts = extract_text_from_all_pages(filename)
        print("Text Extracted Successfully!")

        # Create embeddings for each page
        embedding_model, page_embeddings = create_page_embeddings(page_texts)
        print("Embeddings Created Successfully for all pages!")

        # Build FAISS indices for each page
        faiss_indices = build_faiss_index_for_pages(page_embeddings)
        print("FAISS Indices Built Successfully!")

        # User Query Input
        while True:
            query = input("Enter your query (or type 'exit' to stop): ").strip()
            if query.lower() == 'exit':
                print("Exiting the query loop. Goodbye!")
                break

            page_number = input("Enter the page number to query (or type 'all' for the entire document): ").strip()
            try:
                context_results = retrieve_context(query, page_number, page_embeddings, faiss_indices, embedding_model)
                print("Relevant Context Retrieved!")

                # Display results
                for pg_num, context in context_results.items():
                    print(f"\nPage {pg_num} - Relevant Context:")
                    for sentence in context:
                        print(f"- {sentence}")

                # Generate and display answer
                combined_context = [sent for sentences in context_results.values() for sent in sentences]
                answer = generate_answer(combined_context, query)
                print("\nAnswer:")
                print(answer)

                # Save the result to a file
                with open("answer.json", "w") as f:
                    json.dump({"query": query, "answer": answer}, f, indent=4)
                files.download("answer.json")
                print("Answer saved and downloaded successfully!")

            except ValueError as e:
                print(f"Error: {e}. Please try again.")

    except ValueError as e:
        print(f"Error: {e}")


[2024/12/17 06:57:09] ppocr DEBUG: Namespace(help='==SUPPRESS==', use_gpu=False, use_xpu=False, use_npu=False, use_mlu=False, ir_optim=True, use_tensorrt=False, min_subgraph_size=15, precision='fp32', gpu_mem=500, gpu_id=0, image_dir=None, page_num=0, det_algorithm='DB', det_model_dir='/root/.paddleocr/whl/det/en/en_PP-OCRv3_det_infer', det_limit_side_len=960, det_limit_type='max', det_box_type='quad', det_db_thresh=0.3, det_db_box_thresh=0.6, det_db_unclip_ratio=1.5, max_batch_size=10, use_dilation=False, det_db_score_mode='fast', det_east_score_thresh=0.8, det_east_cover_thresh=0.1, det_east_nms_thresh=0.2, det_sast_score_thresh=0.5, det_sast_nms_thresh=0.2, det_pse_thresh=0, det_pse_box_thresh=0.85, det_pse_min_area=16, det_pse_scale=1, scales=[8, 16, 32], alpha=1.0, beta=1.0, fourier_degree=5, rec_algorithm='SVTR_LCNet', rec_model_dir='/root/.paddleocr/whl/rec/en/en_PP-OCRv4_rec_infer', rec_image_inverse=True, rec_image_shape='3, 48, 320', rec_batch_num=6, max_text_length=25, rec_c

Saving sdfsdfdsfs.pdf to sdfsdfdsfs (4).pdf
Processing file: sdfsdfdsfs (4).pdf
Extracting text from all pages using OCR...
[2024/12/17 06:57:40] ppocr DEBUG: dt_boxes num : 6, elapsed : 0.34424543380737305
[2024/12/17 06:57:40] ppocr DEBUG: cls num  : 6, elapsed : 0.03413105010986328
[2024/12/17 06:57:41] ppocr DEBUG: rec_res num  : 6, elapsed : 1.333397626876831
[2024/12/17 06:57:42] ppocr DEBUG: dt_boxes num : 27, elapsed : 0.34801244735717773
[2024/12/17 06:57:42] ppocr DEBUG: cls num  : 27, elapsed : 0.13609695434570312
[2024/12/17 06:57:44] ppocr DEBUG: rec_res num  : 27, elapsed : 2.3483710289001465
[2024/12/17 06:57:45] ppocr DEBUG: dt_boxes num : 2, elapsed : 0.3064243793487549
[2024/12/17 06:57:45] ppocr DEBUG: cls num  : 2, elapsed : 0.06110572814941406
[2024/12/17 06:57:45] ppocr DEBUG: rec_res num  : 2, elapsed : 0.26683473587036133
[2024/12/17 06:57:45] ppocr DEBUG: dt_boxes num : 5, elapsed : 0.32292795181274414
[2024/12/17 06:57:45] ppocr DEBUG: cls num  : 5, elapsed : 

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Answer saved and downloaded successfully!
Enter your query (or type 'exit' to stop): From page 6 get the tabular data 
Enter the page number to query (or type 'all' for the entire document): From page 6 get the tabular data
Error: invalid literal for int() with base 10: 'From page 6 get the tabular data'. Please try again.
Enter your query (or type 'exit' to stop): 6
Enter the page number to query (or type 'all' for the entire document): 6
Relevant Context Retrieved!

Page 6 - Relevant Context:
- Tabl ofYeary U S.G DP by nd ustry (n m illionsofdoIa rs) Source:US.Bureau ofLaborStatistcs Year 2010 2011 2012 2013 2014 2015 Andustres 26093515 27535971 28663246 29601191 30895407 31397023 M anufacturng 4992521 5581942 5841608 5953299 6047477 5829554 Fnance, nsurance,Real 4522451 4618678 4797313 5031881 5339678 5597018 Esta te, Rental, Lea sng A rts, Entertanm ent, Recreaton, 964032 1015238 1076249 1120496 1189646 1283813 Accom m odaton, and Food Servre 0 ther 15614511 16320113 16948076 17495

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Answer saved and downloaded successfully!
Enter your query (or type 'exit' to stop): stop
Enter the page number to query (or type 'all' for the entire document): stop
Error: invalid literal for int() with base 10: 'stop'. Please try again.


KeyboardInterrupt: Interrupted by user