In [None]:
pip install gradio pymupdf pillow torch transformers sentence-transformers

In [None]:
import hashlib
import io
import json
import os

import fitz  # PyMuPDF
import gradio as gr
import torch
from huggingface_hub import login
from PIL import Image
from sentence_transformers import SentenceTransformer, util
from transformers import AutoTokenizer, AutoModelForCausalLM  # Qwen for workflow
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM  # Flan-T5 for Q&A
from transformers import Blip2Processor, Blip2ForConditionalGeneration

# === Login and Device Setup ===
# Authenticate with the Hugging Face Hub only when a real token is supplied
# through the HF_TOKEN environment variable, rather than passing a hard-coded
# string as the credential. Without a token the login step is skipped.
# NOTE(review): presumably the checkpoints below are publicly downloadable
# without authentication -- confirm.
hf_token = os.environ.get("HF_TOKEN")
if hf_token:
    login(token=hf_token)
device = "cuda" if torch.cuda.is_available() else "cpu"
# Half precision on GPU, full precision on CPU; shared by both generative models.
model_dtype = torch.float16 if device == "cuda" else torch.float32

# === Load Models ===
# BLIP-2: image captioning.
processor = Blip2Processor.from_pretrained("Salesforce/blip2-opt-2.7b", use_fast=True)
blip_model = Blip2ForConditionalGeneration.from_pretrained(
    "Salesforce/blip2-opt-2.7b",
    torch_dtype=model_dtype,
).to(device)

# Qwen: workflow generation, interactive guide generation and question answering.
qwen_model_id = "Qwen/Qwen1.5-1.8B-Chat"
qwen_tokenizer = AutoTokenizer.from_pretrained(qwen_model_id, use_fast=True)
qwen_model = AutoModelForCausalLM.from_pretrained(
    qwen_model_id,
    torch_dtype=model_dtype,
).to(device)

# Sentence embeddings used for semantic search over the processed PDF.
embedder = SentenceTransformer("all-MiniLM-L6-v2")

# === Global State ===
# One dict per processed page: {"page", "text", "image_descriptions"}.
global_doc = []
# Maps the MD5 hex digest of an image's PNG bytes to the PIL image.
unique_images = {}

# === PDF Content Extraction ===
def extract_page_content(doc, page_number, seen_hashes):
    """Return the text and the not-yet-seen images of one PDF page.

    Args:
        doc: Opened document; indexed by page number and queried with
            ``extract_image(xref)``.
        page_number: Zero-based index of the page to read.
        seen_hashes: Set of MD5 hex digests of image bytes already collected.
            It is updated in place with the digest of every image returned.

    Returns:
        A ``(text, images)`` tuple where ``images`` is a list of RGB PIL
        images whose raw bytes were not present in ``seen_hashes``.
    """
    current_page = doc[page_number]
    page_text = current_page.get_text()

    fresh_images = []
    for image_entry in current_page.get_images(full=True):
        # The first field of each entry is the image's xref.
        raw_bytes = doc.extract_image(image_entry[0])["image"]
        digest = hashlib.md5(raw_bytes).hexdigest()
        if digest not in seen_hashes:
            seen_hashes.add(digest)
            fresh_images.append(Image.open(io.BytesIO(raw_bytes)).convert("RGB"))
    return page_text, fresh_images

# === Image Captioning ===
def describe_images(image_list):
    """Caption a batch of images with BLIP-2.

    Args:
        image_list: Images to caption; all of them are processed as a single
            batch.

    Returns:
        A list with one stripped caption string per input image, in input
        order. An empty (or falsy) ``image_list`` yields ``[]`` without
        touching the model.
    """
    if not image_list:
        return []
    # Cast the floating-point inputs to the model's own dtype: the model is
    # loaded in float16 on CUDA, and feeding it float32 pixel values can fail
    # with a dtype mismatch. On CPU the dtype is float32, so nothing changes.
    inputs = processor(images=image_list, return_tensors="pt").to(device, blip_model.dtype)
    with torch.no_grad():
        generated_ids = blip_model.generate(**inputs, max_new_tokens=50)
    captions = processor.batch_decode(generated_ids, skip_special_tokens=True)
    return [cap.strip() for cap in captions]

# === Qwen: Workflow Generation ===
def generate_workflow_qwen(full_text, full_captions):
    """Generate a numbered step-by-step workflow from PDF text and captions.

    Args:
        full_text: Concatenated page text; only the first 3000 characters are
            placed in the prompt.
        full_captions: Concatenated image captions; only the first 1000
            characters are placed in the prompt.

    Returns:
        The text generated by Qwen after the prompt (sampled, so it varies
        between calls), stripped of surrounding whitespace.
    """
    prompt = f"""You are a technical documentation expert. Convert the following technical PDF content into a step-by-step workflow.
Write only numbered steps using concise and technical language.

Text:
{full_text[:3000]}

Image captions:
{full_captions[:1000]}

Workflow:"""
    # NOTE(review): the prompt is truncated to 1024 tokens; for dense text the
    # cut may remove the captions and the trailing "Workflow:" cue -- confirm
    # whether a larger limit is acceptable.
    inputs = qwen_tokenizer(prompt, return_tensors="pt", max_length=1024, truncation=True).to(device)
    with torch.no_grad():
        outputs = qwen_model.generate(
            **inputs,
            max_new_tokens=1000,
            do_sample=True,
            top_p=0.9,
            temperature=0.7
        )
    # The generated sequence starts with the prompt tokens; drop them so the
    # caller receives only the workflow instead of the echoed instructions
    # and PDF text.
    prompt_length = inputs["input_ids"].shape[1]
    return qwen_tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True).strip()

# === Qwen: Interactive Guide Generation ===
def generate_interactive_guide_qwen(full_text, full_captions):
    """Generate an interactive guide as pretty-printed JSON.

    Args:
        full_text: Concatenated page text; only the first 3000 characters are
            placed in the prompt.
        full_captions: Concatenated image captions; only the first 1000
            characters are placed in the prompt.

    Returns:
        The model's reply re-serialized as indented JSON when it can be
        parsed; otherwise a string starting with ``"JSON parse error:"``
        followed by the raw model output.
    """
    prompt = f"""You are a technical documentation expert. Read the following technical PDF content and generate an interactive guide in JSON format.
Return a valid JSON array where each element is an object with the following keys:
  - \"name\": (string) The title of the step.
  - \"type\": (string) One of \"instruction\", \"input_checkbox\", \"input_radio\", or \"input_number\".
  - \"content\": (string) The main text or question for that step.
  - \"options\": (optional, array) A list of strings for options if the type is \"input_checkbox\" or \"input_radio\".

Text:
{full_text[:3000]}

Image captions:
{full_captions[:1000]}
"""
    inputs = qwen_tokenizer(prompt, return_tensors="pt", max_length=1024, truncation=True).to(device)
    with torch.no_grad():
        outputs = qwen_model.generate(**inputs, max_new_tokens=1000, do_sample=False)
    # The generated sequence starts with the prompt tokens. Decoding them too
    # would put the instructions in front of the JSON and make parsing fail
    # every time, so only the newly generated tokens are decoded.
    prompt_length = inputs["input_ids"].shape[1]
    generated_text = qwen_tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True).strip()

    # Try the whole reply first; if the model wrapped the array in prose or a
    # Markdown fence, fall back to the outermost "[ ... ]" span.
    candidates = [generated_text]
    array_start = generated_text.find("[")
    array_end = generated_text.rfind("]")
    if array_start != -1 and array_end > array_start:
        candidates.append(generated_text[array_start:array_end + 1])

    first_error = None
    for candidate in candidates:
        try:
            parsed = json.loads(candidate)
        except ValueError as e:  # json.JSONDecodeError is a ValueError
            if first_error is None:
                first_error = e
            continue
        # ensure_ascii=False keeps non-ASCII text readable instead of \uXXXX.
        return json.dumps(parsed, indent=2, ensure_ascii=False)
    return f"JSON parse error: {str(first_error)}\nRaw output:\n{generated_text}"

# === Qwen: Question Answering ===
def generate_answer_with_qwen(question, context):
    """Answer a question with Qwen using only the supplied context.

    The context is trimmed to fit the prompt's token limit *before* the
    prompt is assembled, so an over-long context can no longer push the
    trailing "Question: ... Answer:" part out of the truncated prompt.

    Args:
        question: The user's question.
        context: Text the answer must be based on; may be longer than the
            prompt limit.

    Returns:
        The text generated after the prompt (greedy decoding), stripped of
        surrounding whitespace.
    """
    max_prompt_tokens = 1024

    def build_prompt(context_text):
        return f"""You are a helpful assistant. Based ONLY on the context below, answer the user's question clearly and accurately.

Context:
{context_text}

Question: {question}

Answer:"""

    # Tokens used by everything except the context (instructions + question).
    scaffold_tokens = len(qwen_tokenizer(build_prompt(""))["input_ids"])
    # Keep a small margin: decoding trimmed ids and re-tokenizing the result
    # inside the prompt can shift the token count slightly.
    context_budget = max(max_prompt_tokens - scaffold_tokens - 8, 0)
    context_ids = qwen_tokenizer(context, add_special_tokens=False)["input_ids"]
    if len(context_ids) > context_budget:
        context = qwen_tokenizer.decode(context_ids[:context_budget], skip_special_tokens=True)

    # truncation=True stays as a safety net (e.g. for an extremely long question).
    inputs = qwen_tokenizer(
        build_prompt(context),
        return_tensors="pt",
        max_length=max_prompt_tokens,
        truncation=True,
    ).to(device)
    with torch.no_grad():
        outputs = qwen_model.generate(
            **inputs,
            max_new_tokens=400,
            do_sample=False
        )
    # The generated sequence starts with the prompt tokens; slicing them off
    # is more reliable than searching for "Answer:", which may also occur
    # inside the context itself.
    prompt_length = inputs["input_ids"].shape[1]
    return qwen_tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True).strip()

# === Semantic Search + Answer ===
def answer_question(question):
    """Answer a question about the most recently processed PDF.

    Page texts and image captions stored in ``global_doc`` are embedded,
    ranked by cosine similarity to the question, and the five best matches
    (fewer if there are not that many) are passed to Qwen as context.

    Args:
        question: The user's question.

    Returns:
        The generated answer, or a message asking the user to upload a PDF
        when no document has been processed yet.
    """
    if not global_doc:
        return "Lütfen önce bir PDF dosyası yükleyin."

    # Same candidate order as before: all page texts first, then all captions.
    chunks = [page["text"] for page in global_doc]
    for page in global_doc:
        chunks.extend(page["image_descriptions"])
    # Blank chunks (e.g. pages without extractable text) carry no information
    # but could still occupy top-k slots, so they are dropped before ranking.
    chunks = [chunk for chunk in chunks if chunk and chunk.strip()]
    if not chunks:
        return generate_answer_with_qwen(question, "")

    q_emb = embedder.encode(question, convert_to_tensor=True)
    doc_embs = embedder.encode(chunks, convert_to_tensor=True)
    sims = util.cos_sim(q_emb, doc_embs)[0]
    top_k = torch.topk(sims, k=min(5, len(chunks)))
    # .tolist() yields plain ints for list indexing.
    top_context = "\n".join(chunks[i] for i in top_k.indices.tolist())
    return generate_answer_with_qwen(question, top_context)

# === Process PDF ===
def process_pdf(pdf_path):
    """Extract text and images from a PDF and generate all derived outputs.

    For every page the text is extracted, its images are captioned, and the
    results are stored in the module-level ``global_doc`` (used later for
    question answering) and ``unique_images``. Both globals are replaced only
    after the whole document has been read, so a failure part-way through
    leaves the previously processed document intact.

    Args:
        pdf_path: Path of the PDF to open. A falsy value (no file uploaded)
            returns a prompt to upload a file and leaves the globals untouched.

    Returns:
        A 5-tuple ``(full_text, images, full_captions, workflow_markdown,
        interactive_guide_json)`` matching the five Gradio output components.
    """
    global global_doc, unique_images

    # Without this guard a missing upload would be opened as an empty
    # document and both generation steps would run on empty input.
    if not pdf_path:
        return "Lütfen önce bir PDF dosyası yükleyin.", [], "", "", ""

    pages = []
    images_by_hash = {}
    text_sections = []
    caption_sections = []

    # The context manager closes the document even if a page fails.
    with fitz.open(pdf_path) as doc:
        for page_num in range(len(doc)):
            page_label = page_num + 1
            # A fresh set per page: duplicates are skipped only within a page,
            # so every page keeps captions for all of its own images.
            seen_hashes = set()
            text, images = extract_page_content(doc, page_num, seen_hashes)
            captions = describe_images(images) if images else []

            text_sections.append(f"\n\n--- Page {page_label} ---\n\n{text}")
            caption_sections.append(
                f"\n\n--- Page {page_label} ---\n\n"
                + "\n".join(
                    f"Page {page_label} Image {i+1}: {cap}"
                    for i, cap in enumerate(captions)
                )
            )
            pages.append({"page": page_label, "text": text, "image_descriptions": captions})

            # De-duplicate across pages for the gallery, keyed by the MD5 of
            # each image's PNG encoding.
            for img in images:
                img_byte_arr = io.BytesIO()
                img.save(img_byte_arr, format='PNG')
                image_hash = hashlib.md5(img_byte_arr.getvalue()).hexdigest()
                images_by_hash[image_hash] = img

    full_text = "".join(text_sections)
    full_captions = "".join(caption_sections)
    global_doc = pages
    unique_images = images_by_hash

    workflow_markdown = generate_workflow_qwen(full_text, full_captions)
    interactive_guide_json = generate_interactive_guide_qwen(full_text, full_captions)
    return full_text, list(unique_images.values()), full_captions, workflow_markdown, interactive_guide_json

# === Gradio UI ===
# Build the Gradio interface. Components are declared in the order they are
# meant to appear, so the statement order below defines the page layout.
with gr.Blocks() as demo:
    gr.Markdown("## 📄 PDF Analyzer: Text + Images + Workflow + Interactive Guide + Q&A")
    # Upload control and the button that triggers processing, side by side.
    with gr.Row():
        pdf_input = gr.File(label="Upload PDF", file_types=[".pdf"])
        process_btn = gr.Button("Process PDF")
    # Output components, in the same order as the tuple returned by process_pdf.
    text_output = gr.Textbox(label="PDF Text", lines=15)
    image_output = gr.Gallery(label="Images", show_label=False)
    caption_output = gr.Textbox(label="Image Captions", lines=10)
    workflow_output = gr.Markdown(label="📋 Workflow")
    interactive_guide_output = gr.Textbox(label="Interactive Guide (JSON)", lines=15)
    gr.Markdown("## ❓ Ask a Question")
    # Question box and answer box, side by side.
    with gr.Row():
        q_in = gr.Textbox(label="Your Question")
        q_out = gr.Textbox(label="Answer", lines=8)
    # The question box's submit event sends its text to answer_question and
    # shows the result in the answer box.
    q_in.submit(answer_question, inputs=q_in, outputs=q_out)
    # Clicking the button runs process_pdf on the uploaded file and fills the
    # five output components with its five return values.
    process_btn.click(
        fn=process_pdf,
        inputs=pdf_input,
        outputs=[text_output, image_output, caption_output, workflow_output, interactive_guide_output]
    )
# Printed before launch() is called, i.e. before the app has actually started.
print("✅ Gradio app is running...")
demo.launch()