<a href="https://colab.research.google.com/github/faiyazansariusa/ColabAI/blob/main/Chatbot_read_pdf_docs.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install dependencies
!pip install -q transformers gradio pdfplumber python-docx

# Import libraries
import gradio as gr
import pdfplumber
from docx import Document
from transformers import pipeline

# Load models (CPU-compatible)
# Both pipelines are created once here and reused by the handler functions
# below, so the model weights are not reloaded on every button click.
summarizer = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6")
qa_pipeline = pipeline("question-answering", model="distilbert-base-cased-distilled-squad")

# Global cache for extracted text (to avoid reprocessing)
# Holds the text of the most recently processed upload under the "text" key.
# NOTE(review): this dict is module-level, so it is shared by everything that
# runs in this process; if several people use the app at once they would
# presumably read and overwrite each other's document. Per-session state
# would avoid that but requires changing the handler signatures.
text_cache = {"text": ""}

# Function to extract text from various file types
def extract_text(file):
    """Extract the text of an uploaded file and store it in ``text_cache``.

    Supported extensions are ``pdf``, ``txt``, ``md`` and ``docx``, matched
    case-insensitively on the part of the path after the last dot.

    Args:
        file: The upload value handed over by the UI. May be ``None`` (no
            file selected), a path string, or an object whose ``name``
            attribute is the path of the uploaded file on disk.

    Returns:
        A status message for display: success, "no file", "unsupported
        format", or the text of the error raised while reading the file.
        On anything other than success the cache is left empty.
    """
    # Drop the previous document first, so that a cleared, unsupported or
    # failed upload can never be summarized/queried as if it were current.
    text_cache["text"] = ""

    # The upload component reports None when no file is selected.
    if file is None:
        return "No file uploaded."

    try:
        # Accept both a plain path string and a file-like wrapper exposing the
        # on-disk path as `.name`; only the path is used, never `.read()`,
        # because a path string has no such method.
        path = file if isinstance(file, str) else file.name
        ext = path.rsplit(".", 1)[-1].lower()

        if ext == "pdf":
            with pdfplumber.open(path) as pdf:
                # Pages for which extract_text() returns nothing are skipped.
                page_texts = (page.extract_text() for page in pdf.pages)
                text = "".join(chunk + "\n" for chunk in page_texts if chunk)
        elif ext in ("txt", "md"):
            # Read bytes and decode explicitly so the content is taken
            # verbatim as UTF-8, independent of the platform default.
            with open(path, "rb") as handle:
                text = handle.read().decode("utf-8")
        elif ext == "docx":
            text = "".join(para.text + "\n" for para in Document(path).paragraphs)
        else:
            return "Unsupported file format."
    except Exception as e:
        # UI boundary: report any parser/IO failure as a message instead of
        # letting it propagate into the event handler.
        return f"❌ Error: {str(e)}"

    text_cache["text"] = text  # Store text globally
    return "✅ File uploaded and processed successfully!"

# Function to summarize using cached text
def summarize_cached_text():
    """Summarize the beginning of the text currently held in ``text_cache``.

    Returns:
        The summary produced by the ``summarizer`` pipeline, or a prompt to
        upload a file when the cache holds no usable text.
    """
    text = text_cache.get("text", "")
    # Treat whitespace-only content (e.g. a document consisting solely of
    # empty paragraphs, cached as bare newlines) the same as no content:
    # there is nothing for the model to summarize.
    if not text.strip():
        return "No text found. Please upload a file first."

    # Only the first 3000 characters are summarized.
    short_text = text[:3000]
    # A character limit does not bound the token count, so ask the pipeline
    # to truncate inputs that still exceed the model's maximum length rather
    # than fail on them.
    result = summarizer(
        short_text,
        max_length=150,
        min_length=30,
        do_sample=False,
        truncation=True,
    )
    return result[0]['summary_text']

# Function to answer question using cached text
def answer_question_cached(question):
    """Answer a question from the text currently held in ``text_cache``.

    Args:
        question: The question typed by the user; ``None`` or a blank string
            is rejected with a message.

    Returns:
        The ``'answer'`` field of the ``qa_pipeline`` result, or a message
        asking for a file / a valid question when either is missing.
    """
    text = text_cache.get("text", "")
    # Whitespace-only content offers no context to answer from, so it is
    # handled like an empty cache (same rule as the summarization handler).
    if not text.strip():
        return "No text found. Please upload a file first."
    # Guard against a missing value as well as a blank one, so `.strip()`
    # is never called on None.
    if not question or not question.strip():
        return "Please enter a valid question."

    # Only the first 3000 characters are used as context.
    short_text = text[:3000]
    return qa_pipeline(question=question, context=short_text)['answer']

# Gradio UI with shared file input
# Layout: one upload widget and a status line on top, then two tabs that both
# operate on the text cached by the upload handler.
with gr.Blocks() as demo:
    gr.Markdown("## 📄 Unified File Upload for Summarization & QA (PDF, DOCX, TXT, MD)")

    # The accepted extensions mirror the formats extract_text() knows how to parse.
    file_input = gr.File(label="📁 Upload a file", file_types=[".pdf", ".txt", ".docx", ".md"])
    # Shows the status message returned by extract_text().
    upload_status = gr.Markdown()

    # When file is uploaded, extract and cache the text
    file_input.change(fn=extract_text, inputs=file_input, outputs=upload_status)

    with gr.Tabs():
        with gr.TabItem("📄 Summarization"):
            summarize_btn = gr.Button("Generate Summary")
            summary_output = gr.Textbox(label="Summary", lines=10)
            # No inputs: the handler reads the document from text_cache.
            summarize_btn.click(fn=summarize_cached_text, inputs=None, outputs=summary_output)

        with gr.TabItem("❓ Question Answering"):
            question_input = gr.Textbox(label="Ask your question", placeholder="e.g. What is the conclusion?")
            answer_btn = gr.Button("Get Answer")
            answer_output = gr.Textbox(label="Answer", lines=2)
            # Only the question is passed in; the context comes from text_cache.
            answer_btn.click(fn=answer_question_cached, inputs=question_input, outputs=answer_output)

# Launch the app
# debug=True keeps the cell running so errors and logs stay visible (see the
# recorded cell output below); set debug=False to turn that off.
demo.launch(debug=True)


Device set to use cpu
Device set to use cpu


It looks like you are running Gradio on a hosted a Jupyter notebook. For the Gradio app to work, sharing must be enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
* Running on public URL: https://1687a000e4553baca3.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)
