In [1]:
import os
import gradio as gr
import fitz ## PyMuPDF
import sqlite3
from llama_index.core import VectorStoreIndex, Document
from datetime import datetime
import uuid
from tqdm.notebook import tqdm
import traceback
from dotenv import load_dotenv
from typing import List


  from .autonotebook import tqdm as notebook_tqdm


### Objective:
- Deconstruct the essential components of llama-index to understand the important components
- Log interaction into a db and csv file!

#### Major Components of LlamaIndex:
1. Data Connectors - Load data from various sources
2. Document & Nodes - Represent and chunk text data
3. Indexes - Store and organize the processed document data
4. Vector Stores -  Manage embeddings and vector similarity search
5. Retrievers - Extract relevant context from index
6. Query Engines - Process queries and generate responses.
7. Response Synthesizers - Combine retrieved context and query to generate answers.

In [2]:
load_dotenv(dotenv_path="../../project_secrets.env")
load_dotenv(dotenv_path="../../../ai_sdlc_secrets.env")

True

In [3]:
DB_NAME = "pdf_qa_logs.db"

## Initialize the database
def init_db(db_name: str = DB_NAME):
    conn = sqlite3.connect(db_name)
    c = conn.cursor()
    c.execute(
        '''
        CREATE TABLE IF NOT EXISTS interactions (
            id TEXT PRIMARY KEY,
            timestamp TEXT,
            pdf_name TEXT,
            query TEXT,
            task_type TEXT,
            temperature REAL,
            top_p REAL,
            prompt TEXT,
            response TEXT)''') 
    ## We might add prompts and other metadata
    conn.commit()
    conn.close()

## Extract text from PDF using fitz
def extract_text_chunks(pdf_bytes: bytes) -> str: ## Find the dtype later!
    pdf_doc = fitz.open(stream=pdf_bytes, filetype="pdf")
    chunks = []
    for page_num in range(pdf_doc.page_count):
        page = pdf_doc.load_page(page_num)
        text = page.get_text("text").strip()
        if text:
            chunks.append({"page": page_num + 1, "text": text})
    return text

def simple_keyword_ranking(chunks: List[str], query: str, top_k: int=5) -> List[str]:
    ranked = sorted(
        chunks, 
        key= lambda c: sum(query.lower().count(word.lower()) for word in c['text'].split()),
        reverse=True
    )
    return ranked[:top_k] if ranked else []


## Process PDF and create index:
def process_pdf(pdf_bytes: bytes) -> VectorStoreIndex:
    extracted_text = extract_text_chunks(pdf_bytes)
    document = Document(text=extracted_text)
    index = VectorStoreIndex.from_documents([document])
    return index

## Log to SQLite DB:
def log_interaction(pdf_name: str, query: str, response: str, db_name: str = DB_NAME) -> None:
    conn = sqlite3.connect(database=db_name)
    c = conn.cursor()
    interaction_id = str(uuid.uuid4())
    timestamp = datetime.now().isoformat()
    c.execute(
        "INSERT INTO interactions VALUES (?, ?, ?, ?, ?)",
        (interaction_id, timestamp, pdf_name, query, response))
    conn.commit()
    conn.close()
    return None

def query_pdf(pdf: bytes, query: str) -> str:
    print(type(pdf))
    if pdf is None:
        return "Please upload a PDF"
    if not query.strip():
        return "Please enter a valid query"
    
    try:
        pdf_name = pdf.name if hasattr(pdf, "name") else "PDF-Unknown"
        index = process_pdf(pdf) ## Passing bytes directly

        query_engine = index.as_query_engine()
        response = query_engine.query(query)
        
        log_interaction(pdf_name, query, response.response)
    except Exception as e:
        return f"An error occurred: {str(e)}-{traceback.format_exc()}"
    
    return response

In [4]:
with gr.Blocks() as app:
    pdf_upload = gr.File(label="Upload PDF", type="binary")
    query_input = gr.Textbox(label="Ask a question about the PDF")
    output = gr.Textbox(label="Answer")

    query_button = gr.Button("Submit")
    query_button.click(query_pdf, inputs=[pdf_upload, query_input], outputs=output)

app.launch()

* Running on local URL:  http://127.0.0.1:7860

To create a public link, set `share=True` in `launch()`.




<class 'bytes'>
Text type: <class 'str'>
Index type: <llama_index.core.indices.vector_store.base.VectorStoreIndex object at 0x128b6df70>
