In [3]:
import torch
from huggingface_hub import login, notebook_login
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

import pypdf
import gradio as gr
from IPython.display import display, Markdown
import os
from pathlib import Path
import requests

In [2]:
print(torch.cuda.is_available())

True


In [4]:
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [19]:
def print_markdown(text):
    """Display text as Markdown"""
    display(Markdown(text))

In [20]:
pipe = pipeline(model="ProsusAI/finbert")

pipe("My Bitcoin is up quite a bit from when I bought it!")

Device set to use cuda:0


[{'label': 'positive', 'score': 0.9498746395111084}]

In [21]:
pipe("NVDIA is set to release their earnings soon.")

[{'label': 'neutral', 'score': 0.9375986456871033}]

In [6]:
model = "microsoft/Phi-4-mini-instruct"

In [8]:
tokenizer = AutoTokenizer.from_pretrained("gpt2")

In [9]:
pipe = pipeline(
    "text-generation",
    model = model,
    tokenizer = tokenizer,
    device_map = "auto"
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Device set to use cuda:0


In [10]:
tokens = tokenizer("Hello.  I'm working on LLMs, fine-tuning and AI agents")

In [11]:
print(tokens['input_ids'])

[15496, 13, 220, 314, 1101, 1762, 319, 27140, 10128, 11, 3734, 12, 28286, 278, 290, 9552, 6554]


In [12]:
pdf_url = "https://abc.xyz/assets/66/ae/c94682fc4137b5fb90a5d709ac4b/2025-q1-earnings-transcript.pdf"

pdf_filename = "google_earning_transcript"

pdf_path = Path(pdf_filename)

In [13]:
if not pdf_path.exists():
    response = requests.get(pdf_url)
    response.raise_for_status()
    pdf_path.write_bytes(response.content)
    print(f"PDF downloaded successfully to {pdf_path}")
else:
    print(f"PDF file already exists at {pdf_path}")

PDF file already exists at google_earning_transcript


In [14]:
pdf_text = ""

In [15]:
reader = pypdf.PdfReader(pdf_path)
num_pages = len(reader.pages)

print(f"PDF has {num_pages} pages.")

Ignoring wrong pointing object 572 0 (offset 0)


PDF has 21 pages.


In [16]:
all_pages_text = []

for i, page in enumerate(reader.pages):
    page_text = page.extract_text()
    if page_text:
        all_pages_text.append(page_text)

In [17]:
pdf_text = "\n".join(all_pages_text)
print(f"Total characters: {len(pdf_text)}")

Total characters: 64652


In [22]:
print_markdown(f"{pdf_text[-500:]}")

rn  the  conference  back  over  to  Jim  Friedland  for  any  further  remarks.    Jim  Friedland,  Senior  Director,  Investor  Relations:  Thanks,  everyone,  for  joining  us  today.  
We
 
look
 
forward
 
to
 
speaking
 
with
 
you
 
again
 
on
 
our
 
second
 
quarter
 
2025
 
call.
 
Thank
 
you,
 
and
 
have
 
a
 
good
 
evening.
 
  Operator:  Thank  you,  everyone.  This  concludes  today's  conference  call.  Thank  you  for  
participating.
 
You
 
may
 
now
 
disconnect.
 
   
21  

In [31]:

MAX_CONTEXT_CHARS = 6000

def answer_question_from_pdf(document_text, question, llm_pipeline):
    """
    Answers a question based on the provided document text using the loaded LLM pipeline.

    Args:
        document_text (str): The text extracted from the PDF.
        question (str): The user's question.
        llm_pipeline (transformers.pipeline): The initialized text-generation pipeline.

    Returns:
        str: The model's generated answer.
    """
    # Truncate context if necessary
    if len(document_text) > MAX_CONTEXT_CHARS:
        print(f"Warning: Document text ({len(document_text)} chars) exceeds limit ({MAX_CONTEXT_CHARS} chars). Truncating.")
        context = document_text[:MAX_CONTEXT_CHARS] + "..."
    else:
        context = document_text

    # Let's define the Prompt Template
    # We instruct the model to use only the provided document.
    # Using a format the model expects (like Phi-3's chat format) can improve results.
    # <|system|> provides context/instructions, <|user|> is the question.
    # Note: Different models might prefer different prompt structures.
    prompt_template = f"""<|system|>
    You are an AI assistant. Answer the following question based *only* on the provided document text. If the answer is not found in the document, say "The document does not contain information on this topic." Do not use any prior knowledge.

    Document Text:
    ---
    {context}
    ---
    <|end|>
    <|user|>
    Question: {question}<|end|>
    <|assistant|>
    Answer:""" # We prompt the model to start generating the answer

    print(f"\n--- Generating Answer for: '{question}' ---")

    # Run Inference on the chosen model
    outputs = llm_pipeline(prompt_template,
                        #    max_new_tokens = 500,  # Limit answer length
                        #    do_sample = True,
                        #    temperature = 0.2,   # Lower temperature for more factual Q&A
                        #    top_p = 0.9
                        )

    # Let's extract the answer
    # The output includes the full prompt template. We need the text generated *after* it.
    full_generated_text = outputs[0]['generated_text']
    answer_start_index = full_generated_text.find("Answer:") + len("Answer:")
    raw_answer = full_generated_text[answer_start_index:].strip()

    # Sometimes the model might still include parts of the prompt or trail off.
    # Basic cleanup: Find the end-of-sequence token if possible, or just return raw.
    # Phi-3 uses <|end|> or <|im_end|>
    end_token = "<|end|>"
    if end_token in raw_answer:
            raw_answer = raw_answer.split(end_token)[0]

    print("--- Generation Complete ---")
    return raw_answer


In [None]:
test_question = "How man users are using AI?"
generated_answer = answer_question_from_pdf(pdf_text, test_question, pipe)

print("\nTest Question:")
print_markdown(f"**Q** {test_question}")
print("\nGenerated Answer:")
print_markdown(f"**A** {generated_answer}")

In [1]:
available_models = {
    "Llama 3.2": "unsloth/Llama-3.2-3B-Instruct",
    "Microsoft Phi-r Mini": "mircorosft/Phi-4-mini-instruct",
    "Google Gemma 3": "unsloth/gemma-3-4b-it-GGUF"
}

In [2]:
current_model = None
current_pipeline = None
print(f"Models available to select: {list(available_models.keys())}")

Models available to select: ['Llama 3.2', 'Microsoft Phi-r Mini', 'Google Gemma 3']


In [38]:
def load_llm_model(model_name):
    global current_model_id, current_pipeline, tokenizer, model

    new_model_id = available_models.get(model_name)
    if not new_model_id:
        return "Invalid model selected", None
    
    if new_model_id == current_model_id and current_pipeline is not None:
        print(f"Model {model_name} is already loaded.")
        return f"{model_name} already loaded.", current_pipeline
    
    print(f"Switching to model: {model_name} ({new_model_id})...")

    # Unload preview model to free up memory and run garbage collection on GPU
    current_pipeline = None
    if "model" in locals():
        del model
    if "tokenizer" in locals():
        del tokenizer
    if "pipe" in locals():
        del pipe    
    torch.cuda.empty_cache()
    import gc

    gc.collect()
    print("Previous model(s) unloaded (if any)")

    # Load the new model
    try:
        tokenizer = AutoTokenizer.from_pretrained(new_model_id, trust_remote_code=True)
        # Load model (4 bit quantized)
        model = AutoModelForCausalLM.from_pretrained(
            new_model_id,
            torch_dtype="auto",
            load_in_4bit=True,
            device_map="auto",
            trust_remote_code=True
        )

        loaded_pipeline = pipeline(
            "text-generation", model=model, tokenizer=tokenizer, torch_dtype="auto", device_map="auto"
        )

        print(f"Model {model_name} loaded successfully!")

        current_model_id = new_model_id
        current_pipeline = loaded_pipeline
        return f"{model_name} loaded successfully!", loaded_pipeline
    
    except Exception as e:
        print(f"Error loading model {model_name}: {e}")
        current_model_id = None
        current_pipeline = None
        return f"Error loading {model_name}: {e}, None"

In [39]:
# This function relies on the global 'current_pipeline'
def handle_submit(question):
    if not current_pipeline:
        return "Error: No model is currently loaded.  Please select a model from the dropdown."
    if not pdf_text:
        return "Error: PDF text is not loaded."
    if not question:
        return "Please enter a question."
    
    print(f"Handling submission for question: '{question}' using {current_model_id}")
    answer = answer_question_from_pdf(pdf_text, question, current_pipeline)
    return answer

In [None]:
print("Building Gradio Interface")

# Build page header text
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown(
        f"""
        # PDF Q & A Bot Using Huggingface Open-Source Models
        Ask questions about the document ('{pdf_filename} if loaded, {len(pdf_text)} chars.)
        Select an open-source LLM to answer your questions.
        ***Note*** Switch models will take time as the new model will need to be dowbnloaded and loaded into the GPU
        """
    )

    # Build dropdown with available models to load
    with gr.Row():
        model_markdown = gr.Dropdown(
            choices=list(available_models.keys()),
            label="Select LLM",
            value=list(available_models.keys())[0] # Default to the first model
        )
        status_textbox = gr.Textbox(label="Model Status", interactive=False)

    question_textbox = gr.Textbox(
        label="Your Question", lines=2, placeholder="Enter your question about the document here..."
    )
    submit_button = gr.Button("Submit Question", variant="primary")
    answer_textbox = gr.Textbox(label="Answer", lines=5, interactive=False)

    # On button click, call the submit handler
    submit_button.click(
        fn = handle_submit,
        inputs= [question_textbox],
        outputs=[answer_textbox]
    )

    # Initial model load before launching Gradio for simplicity
    initial_model_name = list(available_models.keys())[0]
    print(f"Performing initial load of default model: {initial_model_name}")
    status, _ = load_llm_model(initial_model_name)
    status_textbox.value = status
    print("Initial load complete")

# Launch Gradio app
print("Launching Gradio demo")
demo.launch(debug=True)