In [None]:
import os
import re
import fitz # PyMuPDF
import io
import cv2
import pytesseract
import json
from PIL import Image, ImageDraw, ImageFont
import numpy as np

def _group_ocr_words(ocr_data, line_gap=10, column_gap=50):
    """Merge Tesseract word boxes into multi-word text elements.

    Words are visited in the order they appear in ``ocr_data``. A new element
    starts when a word's top edge is more than ``line_gap`` below the previous
    word's bottom edge, or its left edge is more than ``column_gap`` to the
    right of the previous word's right edge. Otherwise the word is appended,
    space separated, to the element being built.

    Args:
        ocr_data: Dict with parallel ``'text'``, ``'top'``, ``'left'``,
            ``'width'`` and ``'height'`` lists (the shape produced by
            ``pytesseract.image_to_data(..., output_type=Output.DICT)``).
        line_gap: Vertical gap threshold that starts a new element.
        column_gap: Horizontal gap threshold that starts a new element.

    Returns:
        List of element strings in encounter order.
    """
    elements = []
    current_words = []
    previous_bottom = -1
    previous_right = -1

    for j, raw_word in enumerate(ocr_data['text']):
        word = raw_word.strip()
        if not word:
            # Tesseract emits empty entries for block/line containers; skip them.
            continue

        top = ocr_data['top'][j]
        left = ocr_data['left'][j]

        starts_new_element = top > previous_bottom + line_gap or left > previous_right + column_gap
        if current_words and starts_new_element:
            elements.append(' '.join(current_words))
            current_words = [word]
        else:
            current_words.append(word)

        previous_bottom = top + ocr_data['height'][j]
        previous_right = left + ocr_data['width'][j]

    if current_words:
        elements.append(' '.join(current_words))

    return elements


def extract_block_diagrams_with_titles(pdf_path, output_folder, search_title, search_component):
    """Extract embedded images and "Fig N.M" titles from a PDF, OCR each image, and save the results.

    For every page, the embedded images and the text matches of ``Fig <n>.<m>...``
    are collected and paired positionally (first image with first title, and so
    on). Pairing uses ``zip``, so surplus images or titles on a page are ignored.
    Each paired image is OCR'd, its words are grouped into elements, and a copy
    of the image with the title written underneath is saved as
    ``page_<page>_img_<index>.png`` in ``output_folder``. All records are also
    written to ``titles_and_elements.json`` in the same folder.

    Args:
        pdf_path: Path of the PDF to process.
        output_folder: Directory for the annotated PNGs and the JSON file;
            created if it does not exist.
        search_title: Case-insensitive substring looked for in each title
            (after the leading ``Fig n.m.`` prefix is removed).
        search_component: Case-insensitive substring looked for in the OCR
            elements of figures whose title matched; every hit is printed.

    Returns:
        List of dicts with keys ``title``, ``elements``, ``image_index``
        (0-based position on the page) and ``page_number`` (1-based).
    """
    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(output_folder, exist_ok=True)

    fig_pattern = re.compile(r'Fig \d+\.\d+.*')
    fig_prefix_pattern = re.compile(r'Fig \d+\.\d+\.')
    font = ImageFont.load_default()  # loop-invariant, so loaded once
    data = []

    # The context manager closes the document even if OCR or saving raises.
    with fitz.open(pdf_path) as pdf_document:
        for page_index in range(len(pdf_document)):
            page = pdf_document[page_index]
            page_number = page_index + 1

            # Extract images from the page
            images = []
            for img in page.get_images(full=True):
                xref = img[0]
                base_image = pdf_document.extract_image(xref)
                images.append(Image.open(io.BytesIO(base_image["image"])))

            # Extract text and match figure titles
            page_text = page.get_text("text")
            titles = [match.group().strip() for match in fig_pattern.finditer(page_text)]

            # Combine images and titles
            for i, (image, title) in enumerate(zip(images, titles)):
                # Normalise to RGB first: embedded images may be grayscale,
                # palette or CMYK. pytesseract expects RGB input, so the PIL
                # image is passed directly instead of a BGR array.
                rgb_image = image.convert("RGB")
                ocr_data = pytesseract.image_to_data(rgb_image, output_type=pytesseract.Output.DICT)
                elements = _group_ocr_words(ocr_data)

                data.append({"title": title, "elements": elements, "image_index": i, "page_number": page_number})

                title_without_fig = fig_prefix_pattern.sub('', title)

                # Search for title and component
                if search_title.lower() in title_without_fig.lower():
                    for element in elements:
                        if search_component.lower() in element.lower():
                            print(f"Found component '{search_component}' in title '{search_title}' on page {page_number}")

                # Add title to the image: a 30px white strip below holds the caption.
                new_image = Image.new('RGB', (rgb_image.width, rgb_image.height + 30), 'white')
                new_image.paste(rgb_image, (0, 0))
                draw = ImageDraw.Draw(new_image)
                draw.text((10, rgb_image.height), title, font=font, fill="black")

                # Save the new image
                image_path = os.path.join(output_folder, f"page_{page_number}_img_{i+1}.png")
                new_image.save(image_path)

    # Save data to a JSON file
    output_path = os.path.join(output_folder, "titles_and_elements.json")
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(data, f, indent=4)

    return data

# Run the extraction on a local PDF.
# NOTE(review): both paths are absolute and machine-specific — adjust before running elsewhere.
pdf_path = "/Users/ramganeshkasturi/Desktop/now/data.pdf"
output_folder = "/Users/ramganeshkasturi/Desktop/now/output"
# Search terms are read interactively, so this cell blocks until both are entered.
search_title = input("Enter the title to search for: ")
search_component = input("Enter the component to search for: ")
# `data` holds the list of per-figure records returned by the function.
data = extract_block_diagrams_with_titles(pdf_path, output_folder, search_title, search_component)


In [None]:
# Install the retrieval / LLM dependencies. None of the packages are version-pinned.
# transformers is installed from its GitHub repository (no ref given) rather than from PyPI.
!pip install -q git+https://github.com/huggingface/transformers
!pip install -qU langchain Faiss-gpu tiktoken sentence-transformers
!pip install -qU trl Py7zr auto-gptq optimum
!pip install -q rank_bm25
!pip install -q PyPdf

In [None]:
# Install or upgrade langchain-community to the latest available release (unpinned).
!pip install -U langchain-community

In [None]:
import langchain
from langchain.embeddings import CacheBackedEmbeddings,HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.storage import LocalFileStore
from langchain.retrievers import BM25Retriever,EnsembleRetriever
from langchain.document_loaders import PyPDFLoader,DirectoryLoader
from langchain.llms import HuggingFacePipeline
from langchain.cache import InMemoryCache
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import prompt
from langchain.chains import RetrievalQA
from langchain.callbacks import StdOutCallbackHandler
from langchain import PromptTemplate
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

In [None]:
# Load the PDFs matching "*.pdf" in /content/data, parsing each with PyPDFLoader.
dir_loader = DirectoryLoader("/content/data",
                             glob="*.pdf",
                             loader_cls=PyPDFLoader)
docs = dir_loader.load()
#
# Report how many Document objects were loaded.
# NOTE(review): PyPDFLoader presumably yields one Document per page, so this is likely a page count — confirm.
print(f"len of documents in :{len(docs)}")

In [None]:
# Split the loaded documents into overlapping chunks (chunk_size=500, chunk_overlap=200)
# that are later embedded and indexed.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=200)
esops_documents = text_splitter.transform_documents(docs)
# Report the resulting chunk count.
print(f"number of chunks in documents : {len(esops_documents)}")

In [None]:
# Embeddings come from a HuggingFace model and are wrapped in a cache backed by
# files under ./cache/, namespaced by the model id.
store = LocalFileStore("./cache/")
embed_model_id = 'BAAI/bge-small-en-v1.5'
core_embeddings_model = HuggingFaceEmbeddings(model_name=embed_model_id)
embedder = CacheBackedEmbeddings.from_bytes_store(core_embeddings_model,
                                                  store,
                                                  namespace=embed_model_id)
# Create VectorStore: FAISS index over the chunks, embedded through the cache-backed embedder.
vectorstore = FAISS.from_documents(esops_documents,embedder)

In [None]:
# Keyword-based (BM25) retriever built over the same chunks as the vector store.
bm25_retriever = BM25Retriever.from_documents(esops_documents)
# k is set to 5 to match the k=5 used for the vector searches in this notebook.
bm25_retriever.k=5

In [None]:
# Sanity check of the vector store with a sample query.
query = "what is a computer?"
# The query is embedded with core_embeddings_model directly, not the cache-backed embedder.
embedding_vector = core_embeddings_model.embed_query(query)
# Print the embedding dimensionality.
print(len(embedding_vector))
#
# Fetch the 5 nearest chunks to the query vector.
docs_resp = vectorstore.similarity_search_by_vector(embedding_vector,k=5)
#
# Print the text of each retrieved chunk, separated by blank lines.
for page in docs_resp:
  print(page.page_content)
  print("\n")

In [None]:
%%timeit -n 1 -r 1
# Time a single execution (one loop, one repeat) of embedding the query plus a top-5 vector search.
query = "what is a computer?"
embedding_vector = core_embeddings_model.embed_query(query)
docs_resp = vectorstore.similarity_search_by_vector(embedding_vector,k=5)

In [None]:
# Hybrid retrieval: a FAISS retriever (k=5) is combined with the BM25 retriever,
# each given an equal weight of 0.5.
faiss_retriever = vectorstore.as_retriever(search_kwargs={"k":5})
ensemble_retriever = EnsembleRetriever(retrievers=[bm25_retriever,faiss_retriever],
                                       weights=[0.5,0.5])

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

# Causal LM checkpoint to load; the weights are taken from the branch named in `revision`.
model_name_or_path = "TheBloke/Mistral-7B-Instruct-v0.1-GPTQ"
# To use a different branch, change revision
# For example: revision="gptq-4bit-32g-actorder_True"
model = AutoModelForCausalLM.from_pretrained(model_name_or_path,
                                             device_map="auto",
                                             trust_remote_code=False,
                                             revision="gptq-8bit-32g-actorder_True")
#
# No revision is passed here, so the tokenizer is loaded from the repository's default revision.
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast=True)

In [None]:
# Text-generation pipeline around the loaded model and tokenizer.
# Sampling is enabled with a low temperature (0.1), top_p=0.95 and top_k=40;
# max_new_tokens is capped at 512 and a repetition penalty of 1.1 is applied.
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=512,
    do_sample=True,
    temperature=0.1,
    top_p=0.95,
    top_k=40,
    repetition_penalty=1.1
)

In [None]:
from langchain.llms import HuggingFacePipeline
# Wrap the transformers pipeline in LangChain's HuggingFacePipeline so it can be passed as `llm` to chains.
llm = HuggingFacePipeline(pipeline=pipe)

In [None]:
# Assign an in-memory cache object to langchain's module-level llm_cache attribute.
# NOTE(review): presumably this lets repeated identical prompts reuse a cached completion —
# confirm this attribute is still honoured by the installed LangChain version.
langchain.llm_cache = InMemoryCache()


In [None]:
# Prompt for the QA chains: instructs the model to answer only from the supplied
# context. {context} and {question} are the two placeholders filled at run time.
PROMPT_TEMPLATE = '''
You are my tech advisor. You are great at providing tips on computer hardware, software, and general technology with your knowledge in computer science.
With the information being provided, try to answer the question.
If you can't answer the question based on the information, either say you can't find an answer or unable to find an answer.
So try to understand in depth about the context and answer only based on the information provided. Don't generate irrelevant answers.

Context: {context}
Question: {question}
Do provide only helpful answers

Helpful answer:
'''

# Must list exactly the placeholder names used in PROMPT_TEMPLATE.
input_variables = ['context', 'question']

custom_prompt = PromptTemplate(template=PROMPT_TEMPLATE,
                            input_variables=input_variables)

In [None]:
# RetrievalQA chain ("stuff" chain type) using the custom prompt and a FAISS-only
# retriever (k=5). Verbose output is enabled and source documents are returned
# alongside the answer.
handler = StdOutCallbackHandler()
qa_with_sources_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever = vectorstore.as_retriever(search_kwargs={"k":5}),
    verbose=True,
    callbacks=[handler],
    chain_type_kwargs={"prompt": custom_prompt},
    return_source_documents=True
)

In [None]:
# Ask a sample question through the current qa_with_sources_chain and print
# the generated answer followed by the source documents it was given.
query = "what is a computer?"
response = qa_with_sources_chain({"query":query})
print(f"Response generated : \n {response['result']}")
print(f"Source Documents : \n {response['source_documents']}")

In [None]:
# Rebinds qa_with_sources_chain (and handler), this time using the ensemble
# (BM25 + FAISS) retriever. Unlike the FAISS-only chain, verbose is not set here.
handler = StdOutCallbackHandler()
qa_with_sources_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever = ensemble_retriever,
    callbacks=[handler],
    chain_type_kwargs={"prompt": custom_prompt},
    return_source_documents=True
)

In [None]:
# NOTE(review): query is a single space — looks like a placeholder; replace it with a real question before running.
query = " "
response = qa_with_sources_chain({"query":query})
# Print the generated answer followed by the source documents it was given.
print(f"Response generated : \n {response['result']}")
print(f"Source Documents : \n {response['source_documents']}")