In [19]:
# Mount Google Drive at /content/drive so files stored there (the GGUF model
# path used later lives under /content/drive/MyDrive) are readable.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [26]:
# System OCR engine, plus its Python wrapper used for the scanned-PDF fallback.
!apt-get install -y tesseract-ocr
# %pip (rather than !pip) installs into the environment of the running kernel.
%pip install pytesseract


Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following additional packages will be installed:
  tesseract-ocr-eng tesseract-ocr-osd
The following NEW packages will be installed:
  tesseract-ocr tesseract-ocr-eng tesseract-ocr-osd
0 upgraded, 3 newly installed, 0 to remove and 49 not upgraded.
Need to get 4,816 kB of archives.
After this operation, 15.6 MB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy/universe amd64 tesseract-ocr-eng all 1:4.00~git30-7274cfa-1.1 [1,591 kB]
Get:2 http://archive.ubuntu.com/ubuntu jammy/universe amd64 tesseract-ocr-osd all 1:4.00~git30-7274cfa-1.1 [2,990 kB]
Get:3 http://archive.ubuntu.com/ubuntu jammy/universe amd64 tesseract-ocr amd64 4.1.1-2.1build1 [236 kB]
Fetched 4,816 kB in 0s (23.1 MB/s)
Selecting previously unselected package tesseract-ocr-eng.
(Reading database ... 123627 files and directories currently installed.)
Preparing to unpack .../tesseract-ocr-e

In [27]:
# Explicit %pip magic: does not rely on automagic and targets the kernel's environment.
%pip install langchain-community



In [21]:
# Install poppler-utils system-wide.
# NOTE(review): presumably required by pdf2image's convert_from_path, which the
# OCR fallback uses to rasterise PDF pages — confirm.
!apt-get install -y poppler-utils

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
poppler-utils is already the newest version (22.02.0-2ubuntu0.5).
0 upgraded, 0 newly installed, 0 to remove and 49 not upgraded.


In [22]:
# Python dependencies for the app. %pip (rather than !pip) installs into the
# environment of the running kernel.
%pip install gradio langchain ctransformers sentence_transformers faiss-cpu pytesseract pdf2image PyPDF2




In [29]:
import gradio as gr
import torch
from langchain.llms import CTransformers
from langchain import PromptTemplate, LLMChain
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
import os
from pdf2image import convert_from_path
import pytesseract
import PyPDF2

# Function to extract text from a PDF
def extract_text_from_pdf(pdf_path):
    """Return the text of the PDF at ``pdf_path``, falling back to OCR.

    The embedded text layer is read with PyPDF2 first. If that yields nothing
    but whitespace (or fails), every page is rasterised and passed through
    Tesseract instead. Errors in either stage are printed, not raised, and
    whatever text was gathered before the failure is kept.

    Args:
        pdf_path: Filesystem path of the PDF to read.

    Returns:
        The extracted text; an empty string when nothing could be read.
    """
    page_texts = []
    try:
        with open(pdf_path, 'rb') as handle:
            for page in PyPDF2.PdfReader(handle).pages:
                # extract_text() may return None for pages without a text layer.
                page_texts.append(page.extract_text() or "")
    except Exception as exc:
        print(f"Error reading PDF with PyPDF2: {exc}")
    text = "".join(page_texts)

    # A usable text layer was found: no OCR needed.
    if text.strip():
        return text

    print("No text found. Using OCR...")
    # Start from whatever (whitespace-only) text was already collected so the
    # OCR output is appended to it rather than replacing it.
    ocr_texts = [text]
    try:
        for page_image in convert_from_path(pdf_path):
            ocr_texts.append(pytesseract.image_to_string(page_image))
    except Exception as exc:
        print(f"Error using OCR: {exc}")

    return "".join(ocr_texts)

# Vectorize the text
def vectorize_text(text, model):
    """Split ``text`` into lines and embed every non-blank line.

    Args:
        text: Raw text; each newline-separated line is treated as one sentence.
        model: Object exposing ``encode(list_of_str, convert_to_tensor=True)``
            (a SentenceTransformer in this notebook).

    Returns:
        ``(embeddings, sentences)``: ``sentences`` is the list of stripped,
        non-blank lines and ``embeddings`` is whatever ``model.encode``
        returned for that list, in the same order.
    """
    # Blank lines carry no content: embedding them pollutes the index, and a
    # query starting with a newline would otherwise be embedded as "".
    sentences = [line.strip() for line in text.split('\n') if line.strip()]
    if not sentences:
        # Always encode at least one entry (as the plain split did) so callers
        # still receive one embedding row for empty/whitespace-only input.
        sentences = [text]
    embeddings = model.encode(sentences, convert_to_tensor=True)
    return embeddings, sentences

# Create a FAISS index
def create_faiss_index(embeddings):
    """Build a flat L2 FAISS index over unit-length copies of ``embeddings``.

    Args:
        embeddings: 2-D tensor-like (one vector per row) exposing ``.shape``
            and ``.cpu().numpy()``.

    Returns:
        A ``faiss.IndexFlatL2`` holding one L2-normalised vector per input row.
    """
    dimension = embeddings.shape[1]
    index = faiss.IndexFlatL2(dimension)
    # Work on a private float32, C-contiguous copy: ``.cpu().numpy()`` can share
    # memory with the caller's tensor, and normalize_L2 rewrites its argument
    # in place.
    vectors = np.array(embeddings.cpu().numpy(), dtype=np.float32, order='C')
    faiss.normalize_L2(vectors)
    index.add(vectors)
    return index

# Search FAISS index
def search_faiss_index(index, query_embedding, top_k=5):
    """Return the nearest stored vectors for ``query_embedding``.

    Args:
        index: FAISS index to query (must expose ``ntotal`` and ``search``).
        query_embedding: Tensor-like exposing ``.cpu().numpy()``; either a
            single vector (1-D) or a batch of vectors (2-D, one per row).
        top_k: Maximum number of neighbours per query. It is capped at the
            number of vectors in the index.

    Returns:
        ``(distances, indices)`` arrays with one row per query and
        ``min(top_k, index.ntotal)`` columns.
    """
    # Private float32, C-contiguous copy: normalize_L2 works in place and
    # ``.cpu().numpy()`` can share memory with the caller's tensor.
    query = np.array(query_embedding.cpu().numpy(), dtype=np.float32, order='C')
    if query.ndim == 1:
        # FAISS expects a 2-D batch of queries.
        query = query.reshape(1, -1)
    faiss.normalize_L2(query)

    # Asking for more neighbours than exist makes FAISS pad results with -1,
    # which callers would misread as "last element" when indexing a list.
    k = min(top_k, index.ntotal)
    if k <= 0:
        empty_distances = np.empty((query.shape[0], 0), dtype=np.float32)
        empty_indices = np.empty((query.shape[0], 0), dtype=np.int64)
        return empty_distances, empty_indices

    distances, indices = index.search(query, k)
    return distances, indices

# Set up GGUF model
# Location of the Mistral-7B-Instruct (Q8_0 GGUF) weights under the Google
# Drive mount point.
model_path = '/content/drive/MyDrive/models/mistral-7b-instruct-v0.1.Q8_0.gguf'

# Module-level LLM handle; generate_response() reads this global.
# NOTE(review): `device` may not be a setting the CTransformers wrapper acts on
# — GPU offload is normally requested via config={'gpu_layers': N}. Confirm
# that this argument actually moves inference to the GPU.
llm = CTransformers(
    model=model_path,
    model_type='mistral',
    device='cuda' if torch.cuda.is_available() else 'cpu'
)

# Mistral-instruct prompt. Both placeholders must be present: {context} carries
# the passages retrieved from the PDF and {question} the user's query. Without
# {context} the model never sees the document and answers from memory alone.
template = """[INST] You are a respectful and helpful assistant. Answer the question below from the context provided.

Context:
{context}

Question:
{question} [/INST]
"""

def generate_response(question, context):
    """Ask the module-level ``llm`` to answer ``question`` using ``context``.

    Args:
        question: The user's question.
        context: Text retrieved from the document that the answer should be
            based on.

    Returns:
        The value returned by ``LLMChain.run`` for the filled-in prompt.
    """
    prompt = PromptTemplate(template=template, input_variables=["question", "context"])
    llm_chain = LLMChain(prompt=prompt, llm=llm)
    return llm_chain.run({"question": question, "context": context})

# Gradio function to handle the user interaction
def process_pdf_and_answer_question(pdf, question):
    """Answer ``question`` from the content of the uploaded PDF.

    Pipeline: extract text (with OCR fallback) -> embed each line -> index the
    embeddings -> retrieve the lines closest to the question -> ask the LLM
    once, passing the retrieved lines as context.

    Args:
        pdf: Filesystem path of the uploaded PDF (falsy when nothing was
            uploaded).
        question: The user's question; may span several lines.

    Returns:
        The model's answer, or a short message explaining why no answer could
        be produced.
    """
    if not pdf:
        return "No text could be extracted from the PDF."

    # Collapse the question to one line so it is embedded as a single query
    # instead of one query per line.
    query = " ".join((question or "").split())
    if not query:
        return "Please enter a question."

    extracted_text = extract_text_from_pdf(pdf)
    if not extracted_text or not extracted_text.strip():
        return "No text could be extracted from the PDF."

    st_model = SentenceTransformer('all-MiniLM-L6-v2')
    embeddings, sentences = vectorize_text(extracted_text, st_model)
    index = create_faiss_index(embeddings)
    query_embedding = vectorize_text(query, st_model)[0]
    _, indices = search_faiss_index(index, query_embedding)

    # FAISS pads missing neighbours with -1; used as a list index that would
    # silently pick the last sentence. Also skip blank lines.
    relevant_texts = [
        sentences[i]
        for i in indices[0]
        if i >= 0 and sentences[i].strip()
    ]
    if not relevant_texts:
        return "No relevant text was found in the PDF."

    # One LLM call over the combined passages rather than one call per line.
    return generate_response(question, "\n".join(relevant_texts))

In [31]:
# Gradio UI: a PDF upload plus a free-text question, answered as plain text.
# The title/description name the model that is actually loaded
# (mistral-7b-instruct GGUF), not Llama-2.
iface = gr.Interface(
    fn=process_pdf_and_answer_question,
    inputs=[
        # type="filepath" hands the handler a path string it can open directly.
        gr.File(type="filepath", label="Upload PDF"),
        gr.Textbox(lines=2, label="Ask a question"),
    ],
    outputs="text",
    title="PDF QA with Mistral-7B-Instruct",
    description="Upload a PDF (text-based or image-based) and ask a question. The system will extract text, vectorize it, and retrieve relevant information using Mistral-7B-Instruct for answering your question.",
    theme="default"
)

iface.launch()

Setting queue=True in a Colab notebook requires sharing enabled. Setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
Running on public URL: https://d10bbed55b53089f12.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)


