In [1]:
pip install gradio PyPDF2 sentence-transformers pinecone-client cohere

Collecting gradio
  Downloading gradio-4.44.0-py3-none-any.whl.metadata (15 kB)
Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Collecting sentence-transformers
  Downloading sentence_transformers-3.1.0-py3-none-any.whl.metadata (23 kB)
Collecting pinecone-client
  Downloading pinecone_client-5.0.1-py3-none-any.whl.metadata (19 kB)
Collecting cohere
  Downloading cohere-5.9.2-py3-none-any.whl.metadata (3.4 kB)
Collecting aiofiles<24.0,>=22.0 (from gradio)
  Downloading aiofiles-23.2.1-py3-none-any.whl.metadata (9.7 kB)
Collecting fastapi<1.0 (from gradio)
  Downloading fastapi-0.114.2-py3-none-any.whl.metadata (27 kB)
Collecting ffmpy (from gradio)
  Downloading ffmpy-0.4.0-py3-none-any.whl.metadata (2.9 kB)
Collecting gradio-client==1.3.0 (from gradio)
  Downloading gradio_client-1.3.0-py3-none-any.whl.metadata (7.1 kB)
Collecting httpx>=0.24.1 (from gradio)
  Downloading httpx-0.27.2-py3-none-any.whl.metadata (7.1 kB)
Collecting orjson~=3.0 (from gradi

In [4]:
# Function to extract text from a PDF
def extract_text_from_pdf(pdf_path):
    """Return the concatenated text of every page of the PDF at ``pdf_path``.

    Args:
        pdf_path: Filesystem path of the PDF to read.

    Returns:
        str: The text of each page, joined in page order with no separator.

    Raises:
        Nothing is caught here: errors from ``open`` or from PyPDF2 (missing
        file, unreadable PDF) propagate to the caller.

    NOTE(review): a later cell defines another function with this same name
    that takes the PDF's raw bytes instead of a path; once that cell runs it
    replaces this definition.
    """
    # Imported locally because this cell runs before the notebook's main
    # import cell. PyPDF2 is installed by the notebook's first cell, so no
    # reinstall is needed here.
    import PyPDF2

    with open(pdf_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        # Join once instead of growing a string page by page.
        return "".join(page.extract_text() for page in reader.pages)

# Initialize Pinecone.
# The imports live in this cell because it runs before the notebook's main
# import cell; without them a fresh kernel fails here with NameError.
import os
from getpass import getpass

from pinecone import Pinecone, ServerlessSpec

# Never embed API keys in a notebook: read the key from the environment and
# fall back to an interactive prompt (nothing is echoed or saved in the file).
# NOTE(review): any key that was ever committed in this notebook should be
# revoked and replaced.
pinecone_api_key = os.environ.get("PINECONE_API_KEY") or getpass("Pinecone API key: ")
pc = Pinecone(api_key=pinecone_api_key)

index_name = "document-embeddings"

# Create the index only if it does not exist yet, so re-running the cell is safe.
if index_name not in [existing.name for existing in pc.list_indexes()]:
    pc.create_index(
        name=index_name,
        dimension=768,  # The dimensionality of the embeddings
        metric='cosine',  # You can use 'euclidean', 'cosine', or other metrics
        spec=ServerlessSpec(cloud='aws', region='us-west-2')
    )

# Connect to the index
index = pc.Index(index_name)



# Gradio Interface for PDF Question Answering Bot
The Gradio interface for the PDF Question Answering Bot lets users interact with the bot through four components:

**Upload PDF**

- Component: `gr.File`
- Function: Allows users to upload a PDF file directly into the application.

**Ask a Question**

- Component: `gr.Textbox`
- Function: Users enter their query about the PDF content here.

**Submit Button**

- Component: `gr.Button`
- Function: Submits the uploaded PDF and query to the bot for processing.

**Display Answer**

- Component: `gr.Textbox`
- Function: Shows the bot's generated response based on the PDF content and query.
This interface streamlines querying and retrieving information from PDFs through a user-friendly web interface.

In [14]:
import hashlib
import io
import os
from getpass import getpass

import cohere
import gradio as gr
import PyPDF2
from pinecone import Pinecone, ServerlessSpec
from sentence_transformers import SentenceTransformer

# Initialize Pinecone. API keys are never hard-coded: each one is read from
# the environment, with an interactive prompt as a fallback.
# NOTE(review): any key that was ever committed in this notebook should be
# revoked and replaced.
pc = Pinecone(api_key=os.environ.get("PINECONE_API_KEY") or getpass("Pinecone API key: "))

# Index name
index_name = "document-embeddings"

# Connect to the index in this cell so pdf_qa_bot does not depend on an
# `index` variable left over from an earlier cell. The index is expected to
# exist already (an earlier cell creates it when missing).
index = pc.Index(index_name)

# Initialize Cohere
co = cohere.Client(os.environ.get("COHERE_API_KEY") or getpass("Cohere API key: "))

# Load a pre-trained sentence transformer model. Its embedding size has to
# match the dimension the Pinecone index was created with (768).
model = SentenceTransformer('bert-base-nli-mean-tokens')  # 768-dimensional embeddings

# Function to extract text from a PDF
def extract_text_from_pdf(pdf_binary):
    """Pull the text out of a PDF supplied as raw bytes.

    Args:
        pdf_binary: The PDF's content as bytes.

    Returns:
        str: The text of all pages concatenated in page order. If anything
        goes wrong while reading, the string
        ``"Error reading PDF file: <exception>"`` is returned instead of
        raising.
    """
    try:
        # PdfReader wants a file-like object, so wrap the bytes in memory.
        reader = PyPDF2.PdfReader(io.BytesIO(pdf_binary))
        return "".join(pdf_page.extract_text() for pdf_page in reader.pages)
    except Exception as exc:
        return f"Error reading PDF file: {exc}"

# Function to handle PDF upload and question answering
def pdf_qa_bot(pdf_file, query):
    """Answer ``query`` from the contents of an uploaded PDF.

    Pipeline: extract the text, split it into 512-character chunks, embed the
    chunks with ``model``, upsert them to the Pinecone ``index``, embed the
    question, fetch the 3 nearest chunks and ask Cohere (``co``) to answer
    from them.

    Args:
        pdf_file: Raw bytes of the uploaded PDF, or ``None`` when nothing was
            uploaded.
        query: The user's question.

    Returns:
        str: The generated answer, or a human-readable message explaining why
        no answer could be produced. Failures are returned as text rather
        than raised, so the caller always has something to display.
    """
    if pdf_file is None:
        return "Please upload a PDF."
    if not query or not query.strip():
        return "Please enter a question."

    # Step 1: Extract text from the PDF
    pdf_text = extract_text_from_pdf(pdf_file)

    # extract_text_from_pdf reports failures in-band with this exact prefix.
    # Matching the prefix (not the substring "Error" anywhere in the text)
    # keeps documents that merely mention the word "Error" from being rejected.
    if pdf_text.startswith("Error reading PDF file:"):
        return pdf_text

    if not pdf_text.strip():
        return "No text extracted from the PDF."

    # Step 2: Split the text into chunks (for long documents)
    chunk_size = 512  # characters per chunk
    document_chunks = [
        pdf_text[start:start + chunk_size]
        for start in range(0, len(pdf_text), chunk_size)
    ]

    # Step 3: Generate embeddings for the document chunks
    try:
        embeddings = model.encode(document_chunks)
    except Exception as e:
        return f"Error generating embeddings: {e}"

    # Step 4: Prepare data for uploading to Pinecone.
    # Each vector id is the chunk's position in document_chunks.
    data = [(str(i), embedding.tolist()) for i, embedding in enumerate(embeddings)]

    # Step 5: Upsert the embeddings to Pinecone.
    # Ids restart at "0" for every upload, so each document gets its own
    # namespace (derived from its content). Otherwise vectors from earlier
    # uploads stay in the index and can be returned for this document.
    namespace = hashlib.sha256(pdf_file).hexdigest()
    try:
        index.upsert(vectors=data, namespace=namespace)
    except Exception as e:
        return f"Error during upsert: {e}"

    # Step 6: Query Pinecone with the user's question
    # NOTE(review): freshly upserted vectors may not be queryable immediately;
    # confirm the index's consistency behaviour if results look incomplete.
    query_embedding = model.encode([query])[0].tolist()  # 768-dim embedding
    try:
        query_result = index.query(vector=query_embedding, top_k=3, namespace=namespace)
    except Exception as e:
        return f"Error querying Pinecone: {e}"

    # Step 7: Extract relevant documents, skipping any id that does not map
    # to a chunk of the current document.
    relevant_docs = []
    for match in query_result['matches']:
        chunk_position = int(match['id'])
        if 0 <= chunk_position < len(document_chunks):
            relevant_docs.append(document_chunks[chunk_position])

    if len(relevant_docs) == 0:
        return "No relevant sections found."

    # Step 8: Generate a response using Cohere
    prompt = f"Use the following documents to answer the question '{query}': {' '.join(relevant_docs)}"
    try:
        response = co.generate(
            prompt=prompt,
            max_tokens=200
        )
    except Exception as e:
        return f"Error generating response from Cohere: {e}"

    # Return the generated response
    return response.generations[0].text

# Gradio interface: one file upload, one question box, a submit button and an
# answer box, all wired to pdf_qa_bot.
with gr.Blocks() as demo:
    gr.Markdown("# PDF Question Answering Bot")

    # Upload PDF and enter query
    with gr.Row():
        # type="binary" hands pdf_qa_bot the raw file bytes. file_types limits
        # the upload to PDFs, so other files are rejected up front instead of
        # surfacing later as a PDF read error.
        pdf_input = gr.File(label="Upload a PDF", type="binary", file_types=[".pdf"])
        query_input = gr.Textbox(label="Ask a Question")

    # Output box for displaying results
    result_output = gr.Textbox(label="Answer", lines=10)

    # Submit button
    submit_button = gr.Button("Submit")

    # Connect the inputs with the function
    submit_button.click(pdf_qa_bot, inputs=[pdf_input, query_input], outputs=result_output)

# Launch the Gradio app
demo.launch()


Setting queue=True in a Colab notebook requires sharing enabled. Setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
Running on public URL: https://8168ae61925a4a8b1b.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)


