In [None]:
import os, io, base64, textwrap, sqlite3
from dotenv import load_dotenv
from openai import OpenAI
from PIL import Image, ImageDraw, ImageFont
import gradio as gr

# Load settings from a .env file before the API client is created.
# NOTE(review): override=True presumably lets .env values replace variables
# that are already set in the process environment - confirm against python-dotenv.
load_dotenv(override=True)
# Single shared client used below for chat completions, speech synthesis and
# transcription.
openai = OpenAI()

# Path of the SQLite file that ensure_tools_db() creates and seeds.
DB = "tools.db"

# System prompt placed first in every chat request by build_messages().
system_message = "You are an expert assistant. Only use tools when explicitly requested by the user. Use create_pdf ONLY when the user specifically asks to create, generate, or make a PDF document. Use tts_voice ONLY when the user asks for audio or voice. For general questions and conversations, just respond normally without using any tools. Keep responses concise and well-formatted in markdown without code fences."

def ensure_tools_db(db_path=None):
    """Create the ``tools`` registry table and seed the two built-in tools.

    The operation is idempotent: the table is created only if missing and the
    seed rows are inserted with ``INSERT OR IGNORE``, so existing rows (and any
    edited descriptions) are left untouched.

    Args:
        db_path: Optional path of the SQLite file. Defaults to the module-level
            ``DB`` constant when omitted.
    """
    from contextlib import closing

    target = DB if db_path is None else db_path
    seed_rows = [
        ("create_pdf", "Generate a PDF of the provided markdown text"),
        ("tts_voice", "Generate voice audio from the provided text"),
    ]
    # A sqlite3 connection used as a context manager only commits/rolls back
    # the transaction; closing() is what actually releases the file handle.
    with closing(sqlite3.connect(target)) as conn:
        with conn:
            conn.execute(
                "CREATE TABLE IF NOT EXISTS tools (name TEXT PRIMARY KEY, description TEXT)"
            )
            conn.executemany(
                "INSERT OR IGNORE INTO tools(name, description) VALUES(?,?)",
                seed_rows,
            )
    
# Function-calling schema passed as `tools=` to the chat completion requests.
# Index 0 is create_pdf and index 1 is tts_voice; run_chat relies on this order
# when it offers only tools_schema[0] while voice is disabled.
tools_schema = [{
    "type": "function",
    "function": {
        "name": "create_pdf",
        "description": "Generate a PDF from markdown text and return an identifier",
        "parameters": {
            "type": "object",
            "properties": {
                "title": {"type": "string", "description": "Document title"},
                "markdown": {"type": "string", "description": "Markdown content to render"}
            },
            "required": ["title", "markdown"],
            "additionalProperties": False
        }
    }
},{
    "type": "function",
    "function": {
        "name": "tts_voice",
        "description": "Synthesize speech audio from provided text",
        "parameters": {
            "type": "object",
            "properties": {
                "text": {"type": "string", "description": "Text to speak"}
            },
            "required": ["text"],
            "additionalProperties": False
        }
    }
}]

def text_to_pdf_file(md_text, title="Document"):
    """Render lightweight markdown to a temporary PDF file and return its path.

    ReportLab is used when it is installed; otherwise the text is rasterised
    onto page images with Pillow and saved as a multi-page PDF.

    Args:
        md_text: Markdown source. Only ``#``/``##``/``###`` headings and
            ``-``/``*`` bullets get special treatment; every other line is
            rendered as plain text. ``None`` is treated as empty.
        title: Heading placed at the top of the document.

    Returns:
        Path of the generated ``.pdf`` file. The file is created with
        ``delete=False``, so the caller owns it and is responsible for cleanup.
    """
    import tempfile

    md_text = "" if md_text is None else str(md_text)
    title = "Document" if title is None else str(title)

    # Reserve a unique path and close the handle immediately: the renderers
    # reopen the file by name, which fails on Windows while a handle is open.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as handle:
        out_path = handle.name

    try:
        _render_pdf_with_reportlab(md_text, title, out_path)
    except ImportError:
        _render_pdf_with_pillow(md_text, title, out_path)
    return out_path


def _render_pdf_with_reportlab(md_text, title, out_path):
    """Write the document to out_path with ReportLab; raises ImportError if absent."""
    from xml.sax.saxutils import escape

    from reportlab.lib.pagesizes import letter
    from reportlab.lib.styles import ParagraphStyle, getSampleStyleSheet
    from reportlab.platypus import Paragraph, SimpleDocTemplate, Spacer

    styles = getSampleStyleSheet()
    title_style = ParagraphStyle('CustomTitle', parent=styles['Heading1'], fontSize=18, spaceAfter=30)
    # Paragraph interprets its text as XML-like markup, so literal "<" and "&"
    # coming from user/model text must be escaped before rendering.
    story = [Paragraph(escape(title), title_style), Spacer(1, 12)]

    for raw_line in md_text.split('\n'):
        # Detect AND slice on the stripped line so indented headings/bullets
        # lose exactly their markdown prefix.
        line = raw_line.strip()
        if not line:
            story.append(Spacer(1, 6))
        elif line.startswith('# '):
            story.append(Paragraph(escape(line[2:]), styles['Heading1']))
        elif line.startswith('## '):
            story.append(Paragraph(escape(line[3:]), styles['Heading2']))
        elif line.startswith('### '):
            story.append(Paragraph(escape(line[4:]), styles['Heading3']))
        elif line.startswith(('- ', '* ')):
            story.append(Paragraph(f"• {escape(line[2:])}", styles['Normal']))
        else:
            story.append(Paragraph(escape(line), styles['Normal']))

    SimpleDocTemplate(out_path, pagesize=letter).build(story)


def _render_pdf_with_pillow(md_text, title, out_path):
    """Fallback renderer: draw wrapped text on page images and save them as a PDF."""
    lines = []
    for paragraph in md_text.splitlines():
        if not paragraph.strip():
            lines.append("")
            continue
        wrapped = textwrap.wrap(paragraph, width=90, replace_whitespace=False, drop_whitespace=False)
        lines.extend(wrapped or [""])

    page_w, page_h = 1654, 2339
    margin = 100
    line_height = 22
    font = ImageFont.load_default()

    def new_page():
        page = Image.new("RGB", (page_w, page_h), "white")
        return page, ImageDraw.Draw(page)

    page, draw = new_page()
    pages = [page]
    # The title sits above the first text line, inside the top margin.
    draw.text((margin, margin - 60), title, fill=(0, 0, 0), font=font)

    y = margin
    for line in lines:
        # Break BEFORE drawing so a full last page is not followed by a blank one.
        if y > page_h - margin:
            page, draw = new_page()
            pages.append(page)
            y = margin
        draw.text((margin, y), line, fill=(0, 0, 0), font=font)
        y += line_height

    pages[0].save(out_path, format="PDF", save_all=True, append_images=pages[1:])

def tts_bytes(text):
    """Synthesize speech for ``text`` and return the path of an MP3 file.

    Despite the name, the return value is a file path, not raw bytes.

    Args:
        text: Text to speak. Only the first 2000 characters are sent to the
            speech endpoint.

    Returns:
        Path of a temporary ``.mp3`` file (created with ``delete=False``; the
        caller owns it), or ``None`` when ``text`` is ``None``, empty or
        whitespace-only.
    """
    import tempfile

    if not text or not text.strip():
        return None
    speech = openai.audio.speech.create(model="gpt-4o-mini-tts", voice="alloy", input=text[:2000])
    # The context manager closes the handle even if the write fails.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as temp_file:
        temp_file.write(speech.content)
    return temp_file.name

def build_pdf_data_url(pdf_bytes):
    """Return a ``data:`` URL embedding ``pdf_bytes`` as base64-encoded PDF content."""
    encoded = base64.b64encode(pdf_bytes)
    payload = encoded.decode("utf-8")
    return "data:application/pdf;base64," + payload

# Module-level scratch state shared by the tool handlers and the UI callbacks:
# paths of the most recently generated PDF and audio file (None when absent).
# NOTE(review): being a module global, this appears to be shared by every
# browser session served by this process - confirm that is intended.
state_storage = {"last_pdf": None, "last_audio": None}

def handle_tool_calls(tool_calls):
    """Execute the model's tool calls and build the matching tool messages.

    Every call receives exactly one ``role: tool`` reply, including calls to
    unknown tools, so that no ``tool_call_id`` is left unanswered in the
    follow-up request.

    Args:
        tool_calls: Iterable of objects exposing ``id``, ``function.name`` and
            ``function.arguments`` (a JSON string or an already-parsed dict).
            ``None`` is treated as empty.

    Returns:
        A tuple ``(results, pdf_preview_html, audio_file)``:
        ``results`` is the list of tool messages, ``pdf_preview_html`` is an
        ``<iframe>`` snippet for the last PDF created (or ``None``) and
        ``audio_file`` is the path of the last audio file generated (or
        ``None``).

    Side effects:
        Updates ``state_storage["last_pdf"]`` / ``state_storage["last_audio"]``.
    """
    import json

    results = []
    pdf_preview_html = None
    audio_file = None
    for tc in tool_calls or []:
        name = tc.function.name
        raw_args = tc.function.arguments
        try:
            parsed = json.loads(raw_args) if isinstance(raw_args, str) else raw_args
        except ValueError:
            # Malformed JSON from the model: fall back to the tool defaults.
            parsed = {}
        if not isinstance(parsed, dict):
            # Valid JSON that is not an object (e.g. a list) has no named args.
            parsed = {}

        if name == "create_pdf":
            title = parsed.get("title", "Document")
            markdown = parsed.get("markdown", "")
            pdf_file = text_to_pdf_file(markdown, title=title)
            state_storage["last_pdf"] = pdf_file
            with open(pdf_file, "rb") as f:
                pdf_url = build_pdf_data_url(f.read())
            pdf_preview_html = f"<iframe src='{pdf_url}' style='width:100%;height:600px;border:1px solid #e5e7eb;border-radius:8px;'></iframe>"
            content = "PDF created"
        elif name == "tts_voice":
            audio_file = tts_bytes(parsed.get("text", ""))
            state_storage["last_audio"] = audio_file
            content = "Audio generated"
        else:
            content = f"Unknown tool: {name}"
        results.append({"role": "tool", "content": content, "tool_call_id": tc.id})
    return results, pdf_preview_html, audio_file

def build_messages(history, user_text, base_doc_text):
    """Assemble the message list for a chat completion request.

    Order: the system prompt, an optional system message carrying the uploaded
    document, the prior conversation, then the new user message.

    Args:
        history: Prior turns as dicts with ``role`` and ``content`` keys; any
            other keys are dropped. ``None`` is treated as empty.
        user_text: The new user message. Skipped when empty or whitespace-only
            so the request never ends with a blank user turn (the UI clears the
            textbox before the chat handler runs, and the message is then
            already the last entry of ``history``).
        base_doc_text: Text of the reference document, or a falsy value for none.

    Returns:
        A new list of ``{"role", "content"}`` dicts.
    """
    msgs = [{"role": "system", "content": system_message}]

    if base_doc_text:
        msgs.append({"role": "system", "content": f"Context Document:\n{base_doc_text}\n\nUse this document as reference for answering questions."})

    msgs.extend({"role": h["role"], "content": h["content"]} for h in (history or []))
    if user_text and user_text.strip():
        msgs.append({"role": "user", "content": user_text})
    return msgs

# Make sure the SQLite tool registry exists before the UI is built.
ensure_tools_db()

# Page layout: a two-column row (chat on the left, document/voice controls on
# the right) followed by a full-width area for the inline PDF preview.
with gr.Blocks(theme=gr.themes.Soft(), css="""
.gradio-container{max-width:1200px;margin:auto}
""") as demo:
    gr.Markdown("# Document Tools: PDF and Voice")
    
    with gr.Row():
        with gr.Column(scale=2):
            # Chat history is a list of {"role", "content"} dicts (type="messages"),
            # pre-seeded with an assistant greeting.
            chatbot = gr.Chatbot(height=500, type="messages", value=[{"role":"assistant","content":"Hello! How can I assist you today?"}])
            with gr.Row():
                user_msg = gr.Textbox(placeholder="Type your message here...", show_label=False, scale=4)
                clear_btn = gr.Button("Clear", scale=1)
        
        with gr.Column(scale=1):
            # Optional reference document; its extracted text is added to the prompt.
            file_input = gr.File(label="Upload Document", file_types=[".txt", ".md", ".docx", ".pdf"], type="filepath")
            # When checked, run_chat offers the tts_voice tool and speaks replies.
            voice_toggle = gr.Checkbox(label="Enable voice", value=True)
            # Microphone recording that is transcribed into the message textbox.
            voice_input = gr.Audio(label="Voice Input", sources=["microphone"], type="filepath")
            audio = gr.Audio(label="Voice Output", autoplay=True)
            file_pdf = gr.File(label="Download PDF")
    
    # Receives the <iframe> HTML that previews the most recently created PDF.
    pdf_iframe = gr.HTML(visible=True)

    def put_user(m, h):
        """Append the user's message to the chat history and clear the textbox.

        Returns a pair: the new textbox value (always an empty string) and a
        new history list ending with the user's message; ``h`` is not mutated.
        """
        user_entry = {"role":"user", "content": m}
        updated_history = h + [user_entry]
        return "", updated_history
    
    def process_voice_input(voice_file):
        """Transcribe a recorded audio file and return the recognised text.

        Returns an empty string when there is no recording. Any failure
        (unreadable file, API error) is reported as an
        ``"Error processing voice: ..."`` string instead of being raised, so
        the message ends up in the textbox this callback feeds.
        """
        if voice_file is None:
            return ""
        try:
            with open(voice_file, "rb") as audio_handle:
                result = openai.audio.transcriptions.create(model="whisper-1", file=audio_handle)
            return result.text
        except Exception as err:
            return f"Error processing voice: {err!s}"

    def extract_text_from_file(file_path):
        """Return the plain text of an uploaded document, or "" if unavailable.

        Supported extensions (case-insensitive): ``.txt`` and ``.md`` are read
        as UTF-8 with undecodable bytes dropped; ``.docx`` is read with
        python-docx; ``.pdf`` is read with PyPDF2, falling back to PyMuPDF
        (``fitz``) if PyPDF2 is missing or fails.

        Extraction is best-effort: unsupported types, missing optional
        libraries and unreadable files all yield an empty string rather than
        an exception, so the chat keeps working without document context.
        """
        if not file_path:
            return ""

        # splitext only inspects the final path component, so dots in directory
        # names or extension-less files are not mistaken for an extension.
        file_ext = os.path.splitext(str(file_path))[1].lower()
        try:
            if file_ext in ('.txt', '.md'):
                with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
                    return f.read()
            if file_ext == '.docx':
                from docx import Document
                return '\n'.join(p.text for p in Document(file_path).paragraphs)
            if file_ext == '.pdf':
                try:
                    import PyPDF2
                    with open(file_path, 'rb') as f:
                        reader = PyPDF2.PdfReader(f)
                        # A page without a text layer may give no text; treat it
                        # as empty instead of losing the rest of the document.
                        return '\n'.join((page.extract_text() or "") for page in reader.pages)
                except Exception:
                    import fitz
                    doc = fitz.open(file_path)
                    try:
                        return '\n'.join(page.get_text() for page in doc)
                    finally:
                        doc.close()
        except Exception:
            return ""
        return ""

    def run_chat(history, m, file_path, allow_voice):
        """Stream the assistant's reply for the latest user turn, running tools as needed.

        This is a generator used as a Gradio event handler. Each yielded value is
        a 4-tuple ``(chat_history, audio_path, pdf_path, pdf_preview_html)``.
        Intermediate yields carry the partially streamed reply; the last yield
        carries the final reply plus any generated audio/PDF.

        Args:
            history: Chat history as a list of ``{"role", "content"}`` dicts. In
                the current wiring it already ends with the user's new message.
            m: Textbox content. It normally arrives empty because the textbox is
                cleared before this handler runs; the user's text is then taken
                from the last history entry.
            file_path: Optional uploaded document used as reference context.
            allow_voice: When true, the ``tts_voice`` tool is offered and the
                reply is spoken.
        """
        from types import SimpleNamespace

        model = "gpt-4.1-mini"
        history = list(history or [])

        # Separate the new user turn from the prior conversation so the request
        # contains the message exactly once and never a blank user turn.
        prior, user_text = history, m
        if not (user_text and user_text.strip()) and prior and prior[-1].get("role") == "user":
            user_text = prior[-1].get("content")
            prior = prior[:-1]

        base_doc = extract_text_from_file(file_path)
        msgs = build_messages(prior, user_text, base_doc)
        tools = tools_schema if allow_voice else [tools_schema[0]]

        stream = openai.chat.completions.create(model=model, messages=msgs, tools=tools, stream=True)
        partial = ""
        pending = {}  # tool-call index -> accumulated id / name / argument fragments
        for chunk in stream:
            if not chunk.choices:
                # Some chunks (e.g. usage-only ones) carry no choices.
                continue
            delta = chunk.choices[0].delta
            if delta is None:
                continue
            if delta.content:
                partial += delta.content
                yield history + [{"role":"assistant","content": partial}], None, None, ""
            # Tool calls are streamed as fragments keyed by index; stitch them
            # together instead of issuing a second request to rediscover them.
            for piece in (getattr(delta, "tool_calls", None) or []):
                slot = pending.setdefault(piece.index, {"id": "", "name": "", "arguments": ""})
                if piece.id:
                    slot["id"] = piece.id
                if piece.function is not None:
                    if piece.function.name:
                        slot["name"] += piece.function.name
                    if piece.function.arguments:
                        slot["arguments"] += piece.function.arguments

        pdf_html = None
        tool_audio = None
        final_reply = partial
        state_storage["last_audio"] = None  # only audio produced during this turn counts

        if pending:
            calls = [pending[i] for i in sorted(pending)]
            msgs.append({
                "role": "assistant",
                "content": partial or None,
                "tool_calls": [
                    {"id": c["id"], "type": "function",
                     "function": {"name": c["name"], "arguments": c["arguments"]}}
                    for c in calls
                ],
            })
            # handle_tool_calls expects attribute access (tc.id, tc.function.name, ...).
            call_objs = [
                SimpleNamespace(id=c["id"], function=SimpleNamespace(name=c["name"], arguments=c["arguments"]))
                for c in calls
            ]
            tool_results, pdf_html, tool_audio = handle_tool_calls(call_objs)
            msgs.extend(tool_results)

            # Let the model react to the tool output; it may ask for more tools.
            followup = openai.chat.completions.create(model=model, messages=msgs, tools=tools)
            while followup.choices[0].finish_reason == "tool_calls":
                message = followup.choices[0].message
                tool_results, round_html, round_audio = handle_tool_calls(message.tool_calls)
                pdf_html = round_html or pdf_html
                tool_audio = round_audio or tool_audio
                msgs.append({"role": message.role, "content": message.content, "tool_calls": message.tool_calls})
                msgs.extend(tool_results)
                followup = openai.chat.completions.create(model=model, messages=msgs, tools=tools)
            final_reply = followup.choices[0].message.content or partial

        final_reply = final_reply or ""
        history = history + [{"role":"assistant","content": final_reply}]

        # Prefer audio the model explicitly requested via tts_voice; otherwise
        # speak the final reply when voice is enabled.
        tool_audio = tool_audio or state_storage["last_audio"]
        if tool_audio:
            audio_file = tool_audio
        elif final_reply and allow_voice:
            audio_file = tts_bytes(final_reply)
        else:
            audio_file = None
        state_storage["last_audio"] = None

        yield history, audio_file, state_storage["last_pdf"], (pdf_html or "")

    # On submit: put_user echoes the message into the chat and clears the
    # textbox, then run_chat produces the reply and the audio/PDF outputs.
    # NOTE(review): run_chat reads user_msg after put_user has set it to "",
    # so its `m` argument presumably arrives empty and the user's text is only
    # available as the last chatbot entry - confirm and adjust the inputs.
    user_msg.submit(put_user, inputs=[user_msg, chatbot], outputs=[user_msg, chatbot]).then(
        run_chat, inputs=[chatbot, user_msg, file_input, voice_toggle], outputs=[chatbot, audio, file_pdf, pdf_iframe]
    )
    
    # When the microphone recording changes, write its transcript (or an error
    # string) into the message textbox; the user still has to submit it.
    voice_input.change(process_voice_input, inputs=voice_input, outputs=user_msg)


    def clear_all():
        """Forget the stored PDF/audio paths and return reset values for the UI.

        The returned 4-tuple is: a chat history holding only the greeting,
        ``None`` for the audio player, ``None`` for the PDF download and an
        empty string for the PDF preview.
        """
        for key in ("last_pdf", "last_audio"):
            state_storage[key] = None
        greeting = {"role":"assistant","content":"Hello! How can I assist you today?"}
        return [greeting], None, None, ""

    # Clear button: reset the chat, audio player, PDF download and PDF preview.
    clear_btn.click(clear_all, outputs=[chatbot, audio, file_pdf, pdf_iframe])

# Start the Gradio server; inbrowser=True asks Gradio to open the app in a
# browser tab.
demo.launch(inbrowser=True)


* Running on local URL:  http://127.0.0.1:7860
* To create a public link, set `share=True` in `launch()`.


