In [2]:

!pip install PyMuPDF
!pip install gradio

Collecting PyMuPDF
  Downloading pymupdf-1.26.3-cp39-abi3-manylinux_2_28_x86_64.whl.metadata (3.4 kB)
Downloading pymupdf-1.26.3-cp39-abi3-manylinux_2_28_x86_64.whl (24.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.1/24.1 MB[0m [31m51.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyMuPDF
Successfully installed PyMuPDF-1.26.3


In [3]:
import fitz
#ANY TRANSFORMER PROD, DEMO
from transformers import AutoTokenizer, AutoModel, pipeline
# Just like BPE, we have an open-source alternative called Sentence Transformers
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity #BM25
import os
import gradio as gr
import markdown
import torch
import numpy as np

In [4]:
def convert_pdf_to_markdown(pdf_path):
  """Extract the text of a PDF and save it as a simple Markdown file.

  Every page becomes a "## Page N" section. The page text is split on blank
  lines, and each resulting paragraph is collapsed onto a single line.

  Args:
    pdf_path: Path of the PDF file to read.

  Returns:
    A ``(markdown_path, markdown_text)`` tuple: the path the Markdown was
    written to and the Markdown text itself.
  """
  print(f"Converting {pdf_path} to Markdown.....")

  # Collect the pieces in a list and join once instead of growing a string.
  parts = []

  # The context manager closes the document even if extraction raises.
  with fitz.open(pdf_path) as doc:
    for page_num, page in enumerate(doc):
      # Page header
      parts.append(f"## Page {page_num + 1}\n\n")

      # Basic paragraph detection: blank lines separate paragraphs.
      for para in page.get_text().split("\n\n"):
        para = para.replace('\n', ' ').strip()
        if para:
          parts.append(para + "\n\n")

  markdown_text = "".join(parts)

  # Swap only the final extension. A plain str.replace('.pdf', '.md') leaves
  # names such as "report.PDF" unchanged, which would make the write below
  # overwrite the source PDF, and it also rewrites ".pdf" found mid-path.
  markdown_path = os.path.splitext(pdf_path)[0] + '.md'
  with open(markdown_path, 'w', encoding='utf-8') as f:
    f.write(markdown_text)

  print(f"Markdown saved to {markdown_path}")
  return markdown_path, markdown_text

In [6]:
# Convert the sample PDF (hard-coded path under /content) and keep both the
# output path and the extracted Markdown text for the cells below.
path,markdown_text = convert_pdf_to_markdown("/content/Types_and_systems_of_farming-489.pdf")
# Bare last expression: the notebook echoes the extracted text as cell output.
markdown_text

Converting /content/Types_and_systems_of_farming-489.pdf to Markdown.....
Markdown saved to /content/Types_and_systems_of_farming-489.md


'## Page 1\n\nPrepared By,  Prof. Waghmode B.R.  Types and systems of farming    Contents  Preface   I.   Introduction   II.   Classification of Farming   A. Types of farming   B. Factors determining the type of farming   1. Physical factors    2. Economic factors   C. Systems of farming   1. Cooperative farming   2. Peasant farming   3. State farming   4. Capitalistic farming or estate farming   5. Collective farming   6. Contract farming             D. Factors affecting the system of farming     Preface  In cultivation, it is necessary to answer three basic questions: what to produce, how  much to produce and how to produce. These three questions if answered, give directions to all  agricultural processes to be followed by the farmers. In this booklet we have briefed the types of  farming to answer what to produce and systems of farming to answer how to produce, and the  other question how much to produce will be automatically solved by knowing total need of the  country.    Dr. K. T

In [7]:
def chunk_markdown(markdown_text, chunk_size=200): #>200
    """Split text into chunks of whole sentences of roughly ``chunk_size`` chars.

    Newlines are flattened to spaces and the text is split on ".". Fragments
    are packed greedily: a fragment joins the current chunk while the combined
    length stays below ``chunk_size``, otherwise it starts a new chunk. A
    single fragment longer than ``chunk_size`` is kept whole as its own chunk.

    Args:
        markdown_text: Text to split; ``None`` or empty text yields ``[]``.
        chunk_size: Soft upper bound on chunk length, in characters.

    Returns:
        List of chunk strings, each stripped of surrounding whitespace. Every
        fragment is terminated with ".".
    """
    if not markdown_text:
        return []

    sentences = markdown_text.replace('\n', ' ').split('.')
    chunks = []
    current_chunk = ""

    for sentence in sentences:
        # split('.') leaves an empty fragment after the final period (and
        # between consecutive periods); skipping blanks avoids emitting
        # stray "." tokens such as a trailing ".." or a lone "." chunk.
        if not sentence.strip():
            continue

        if len(current_chunk) + len(sentence) < chunk_size:
            current_chunk += sentence + "."
        else:
            if current_chunk:
                chunks.append(current_chunk.strip())
            current_chunk = sentence + "."

    # Flush whatever is left in the buffer.
    if current_chunk:
        chunks.append(current_chunk.strip())

    return chunks

In [8]:
# Preview the chunks produced from the converted document; the result is
# echoed as cell output (chunk_size is passed explicitly as 200).
chunk_markdown(markdown_text, chunk_size=200)

['## Page 1  Prepared By,  Prof. Waghmode B.R.  Types and systems of farming    Contents  Preface   I.   Introduction   II.   Classification of Farming   A. Types of farming   B.',
 'Factors determining the type of farming   1. Physical factors    2. Economic factors   C. Systems of farming   1. Cooperative farming   2. Peasant farming   3. State farming   4.',
 'Capitalistic farming or estate farming   5. Collective farming   6. Contract farming             D.',
 'Factors affecting the system of farming     Preface  In cultivation, it is necessary to answer three basic questions: what to produce, how  much to produce and how to produce.',
 'These three questions if answered, give directions to all  agricultural processes to be followed by the farmers.',
 'In this booklet we have briefed the types of  farming to answer what to produce and systems of farming to answer how to produce, and the  other question how much to produce will be automatically solved by knowing total need of the  c

In [9]:
class DocumentRetriever:
  """Embeds document chunks and retrieves those most similar to a query.

  Attributes:
    model: SentenceTransformer ("all-MiniLM-L6-v2") used to embed text.
    chunks: Text chunks of the currently indexed document.
    embeddings: Embeddings of ``chunks`` as returned by ``model.encode``, or
      ``None`` when nothing is indexed.
  """

  def __init__(self):
    """Load the embedding model and start with an empty index."""
    print("Loading SentenceTransformer model.....")
    self.model = SentenceTransformer("all-MiniLM-L6-v2")
    self.chunks = []
    self.embeddings = None

  def add_document(self, markdown_text):
    """Chunk ``markdown_text`` and index it, replacing any previous document.

    Args:
      markdown_text: Full text of the document to index.
    """
    self.chunks = chunk_markdown(markdown_text)
    print(f"Created {len(self.chunks)} chunks from documents") # progress for the user

    if not self.chunks:
      # Nothing to embed (e.g. a PDF with no extractable text); leave the
      # index empty so retrieval returns no results instead of failing.
      self.embeddings = None
      print("No text chunks found; nothing to embed.")
      return

    self.embeddings = self.model.encode(self.chunks)
    print("Embeddings created successfully!!")

  def retrieve_relevant_chunk(self, query, top_k=3):
    """Return the ``top_k`` chunks most similar to ``query``.

    Args:
      query: Question text to embed and compare against the chunks.
      top_k: Maximum number of chunks to return.

    Returns:
      List of ``(chunk_text, similarity)`` tuples ordered from most to least
      similar; empty when nothing is indexed or ``top_k`` is not positive.
    """
    # Guard before touching the model: with no index cosine_similarity would
    # raise, and a slice of [-0:] would return every chunk instead of none.
    if top_k <= 0 or self.embeddings is None or len(self.chunks) == 0:
      return []

    query_embedding = self.model.encode([query])[0]

    similarities = cosine_similarity([query_embedding], self.embeddings)[0]

    # argsort is ascending: take the last top_k indices and reverse them so
    # the best match comes first.
    top_indices = np.argsort(similarities)[-top_k:][::-1]

    return [(self.chunks[i], similarities[i]) for i in top_indices]

In [10]:
class ChatBot:
  """Question answering over a single PDF: retrieve chunks, then extract an answer.

  Attributes:
    retriever: DocumentRetriever holding the indexed document chunks.
    qa_pipeline: Hugging Face "question-answering" pipeline
      (distilbert-base-cased-distilled-squad).
    document_loaded: True once ``load_document`` has completed.
  """

  # Questions whose best chunk similarity is below this are treated as
  # unrelated to the loaded document.
  MIN_SIMILARITY = 0.3

  def __init__(self):
    """Load the models and start with no document loaded."""
    # NOTE(review): this tokenizer/model pair is not referenced by any method
    # of this class. It is kept so the attributes stay available; consider
    # removing it to avoid loading an unused model.
    self.tokenizer = AutoTokenizer.from_pretrained("roberta-base")
    self.model = AutoModel.from_pretrained("roberta-base")

    self.retriever = DocumentRetriever()

    # Question-answering pipeline used to pull the answer out of the context.
    self.qa_pipeline = pipeline('question-answering', model="distilbert-base-cased-distilled-squad")
    self.document_loaded = False

  def load_document(self, pdf_path):
    """Convert the PDF at ``pdf_path`` to Markdown and index it.

    Args:
      pdf_path: Path of the PDF file to load.

    Returns:
      A status message naming the loaded file.
    """
    markdown_path, markdown_text = convert_pdf_to_markdown(pdf_path)
    self.retriever.add_document(markdown_text)
    self.document_loaded = True
    return f"Document Loaded by the user and processed : {os.path.basename(pdf_path)}"

  def answer_question(self, question):
    """Answer ``question`` from the loaded document.

    Args:
      question: The user's question.

    Returns:
      A text message: the answer with its confidence and the supporting
      chunks, or an explanation of why no answer could be produced.
    """
    if not self.document_loaded:
      return "Please load the PDF docuemnt first!"

    # A blank question cannot be answered; skip retrieval and the pipeline.
    if not question or not question.strip():
      return "Please enter a question."

    relevant_chunks = self.retriever.retrieve_relevant_chunk(question)

    if not relevant_chunks or max(similarity for _, similarity in relevant_chunks) < self.MIN_SIMILARITY:
      return "I could not find relevant information related to your question!"

    # The retrieved chunks, joined, form the context the QA model reads.
    context = " ".join(chunk for chunk, _ in relevant_chunks)

    try:
      response = self.qa_pipeline(question=question, context=context)

      answer = response['answer']
      confidence = response['score']

      # The separator after the confidence keeps the numbered source list on
      # its own lines; without it "Confidence:0.53" runs into "1. ...".
      result = f"Answer: {answer}\n\nConfidence:{confidence}\n\nSources:\n"

      for i, (chunk, similarity) in enumerate(relevant_chunks, 1):
        result += f"{i}. \"{chunk[:100]}....\" (Relevance : {similarity})\n"

      return result
    except Exception as e:
      # Report pipeline failures to the user instead of crashing the chat.
      return f"Exception generating answer:  {e}"



In [11]:
def create_gradio_interface():
  """Build the Gradio UI: a PDF upload panel plus a chat box backed by ChatBot.

  Returns:
    The ``gr.Blocks`` app, not yet launched.
  """
  chatbot = ChatBot()

  def load_pdf(pdf_file):
    """Load the uploaded file into the chatbot and return a status message."""
    # Clicking "Load Document" before uploading anything passes None.
    if pdf_file is None:
      return "Please upload a PDF document first!"

    # NOTE(review): the upload value may be a path string or an object that
    # exposes the path as `.name`, depending on the Gradio version; accept
    # both. Confirm against the installed version.
    pdf_path = getattr(pdf_file, "name", pdf_file)

    try:
      return chatbot.load_document(pdf_path)
    except Exception as exc:
      # Show the failure (e.g. an unreadable or non-PDF file) in the status
      # box rather than raising inside the click handler.
      return f"Failed to load document: {exc}"

  def respond(message, history):
    """Chat callback; ``history`` is required by the signature but unused."""
    return chatbot.answer_question(message)

  with gr.Blocks(title="PDF Retrieval Chatbot") as demo:
    gr.Markdown("""# Pdf retrieval chatbot""")
    gr.Markdown("""# Upload a pdf document and ask questions relevant to the document""")

    with gr.Row():
      with gr.Column():
        # Restrict the picker to PDFs, the only format the loader handles.
        pdf_input = gr.File(label="Upload PDF Document", file_types=[".pdf"])
        load_button = gr.Button("Load Document")
        status_text = gr.Textbox(label='Status', interactive=False)

    load_button.click(load_pdf, inputs=[pdf_input], outputs = [status_text])

    # Created inside the Blocks context, so it is rendered as part of `demo`.
    gr.ChatInterface(
        respond,
        title = "Chat with your PDF Files",
        description = "Ask questions about your docuemnt/s",
        examples = ["What is the main topic in this file?","Can you please summarize the PDF File"])

  return demo

In [12]:
# Entry point: build the Gradio app and start serving it.
if __name__ == "__main__":
  print("Starting Chatbot App.......")
  demo = create_gradio_interface()
  # launch() is called with its default arguments.
  demo.launch()

Starting Chatbot App.......


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Loading SentenceTransformer model.....


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/473 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/261M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

Device set to use cpu
  self.chatbot = Chatbot(


It looks like you are running Gradio on a hosted Jupyter notebook, which requires `share=True`. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://8eb3522306e5eba8b0.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)
