In [1]:
import gradio as gr
from unsloth import FastVisionModel
from transformers import TextStreamer
import torch

# Checkpoint served by this notebook.
MODEL_NAME = "Shriyans/Llama-11B-Rad-LORA"

# Fetch the model together with its tokenizer/processor; 4-bit loading is
# explicitly turned off.
model, tokenizer = FastVisionModel.from_pretrained(model_name=MODEL_NAME, load_in_4bit=False)

# Prepare the model for generation (presumably toggles Unsloth's inference
# mode -- confirm against the Unsloth docs), then place it on the GPU.
FastVisionModel.for_inference(model)
model = model.to("cuda")



🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
Unsloth: Failed to patch Gemma3ForConditionalGeneration.
🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2025.3.19: Fast Mllama patching. Transformers: 4.51.3.
   \\   /|    NVIDIA L40S. Num GPUs = 1. Max memory: 44.521 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.4.1+cu124. CUDA: 8.9. CUDA Toolkit: 12.4. Triton: 3.0.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.28.post1. FA2 = True]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Unsloth: QLoRA and full finetuning all not selected. Switching to 16bit LoRA.


Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

In [2]:
# Prediction function
def describe_radiograph(image):
    """Generate a short textual description of an uploaded radiograph.

    Relies on the module-level ``model`` and ``tokenizer`` created by the
    model-loading cell.

    Args:
        image: The radiograph to describe (the UI supplies a PIL image), or
            ``None`` when nothing has been uploaded yet.

    Returns:
        str: The text that follows the first "assistant" marker in the
        decoded output, stripped of surrounding whitespace. If no such
        marker is present the full decoded text is returned unchanged.
        When ``image`` is ``None`` a fixed "Please upload an image."
        message is returned instead.
    """
    if image is None:
        return "Please upload an image."

    instruction = "You are an expert radiographer. Describe accurately what you see in this image. Do not hallucinate and include only elementary information."
    messages = [
        {"role": "user", "content": [
            {"type": "image"},
            {"type": "text", "text": instruction}
        ]}
    ]

    input_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True)

    inputs = tokenizer(
        image,
        input_text,
        add_special_tokens=False,
        return_tensors="pt",
    ).to("cuda")

    # NOTE(review): in Hugging Face `generate`, `temperature` and `min_p`
    # normally only take effect when `do_sample=True` is also passed --
    # confirm whether sampling or greedy decoding is intended here.
    with torch.no_grad():
        output = model.generate(
            **inputs,
            max_new_tokens=256,
            use_cache=True,
            temperature=0.8,
            min_p=0.1
        )

    # The decoded result was previously stored under one name and read back
    # under another, which raised UnboundLocalError on every real request.
    decoded_text = tokenizer.decode(output[0], skip_special_tokens=True)

    # The decoded sequence still contains the prompt; keep only what comes
    # after the first "assistant" role marker when one is present.
    _, marker, reply = decoded_text.partition("assistant")
    if marker:
        return reply.strip()

    return decoded_text


In [7]:
def build_demo():
    """Assemble the Gradio Blocks interface for the radiograph describer.

    The page has a title, an image upload next to a prediction textbox, and
    a row with "Generate" and "Reset" buttons. "Generate" sends the uploaded
    image to ``describe_radiograph``; "Reset" clears both fields.

    Returns:
        The constructed ``gr.Blocks`` app (not yet launched).
    """

    def _cleared_fields():
        # Values written to (image input, prediction textbox) on reset.
        return (None, "")

    page = gr.Blocks(theme=gr.themes.Base(), css=".gr-button {font-size: 16px;}")
    with page:
        gr.Markdown(
            """
            # 🦙 **Llama-Rad: Diagnosis Generator**
            Upload a radiograph and let our AI model generate a diagnosis.
            """,
            elem_id="title",
        )

        # Upload on the left, prediction on the right.
        with gr.Row(equal_height=True):
            with gr.Column(scale=1):
                radiograph_input = gr.Image(type="pil", label="📤 Upload Radiograph")

            with gr.Column(scale=1):
                prediction_box = gr.Textbox(
                    label="📝 Model Prediction",
                    placeholder="The prediction will appear here...",
                    lines=12,
                    show_copy_button=True,
                )

        # Action buttons share a single row beneath the two columns.
        with gr.Row():
            generate_btn = gr.Button("🧠 Generate", variant="primary")
            reset_btn = gr.Button("🔄 Reset", variant="secondary")

        # Wire up the callbacks.
        generate_btn.click(
            fn=describe_radiograph,
            inputs=radiograph_input,
            outputs=prediction_box,
        )
        reset_btn.click(
            fn=_cleared_fields,
            inputs=None,
            outputs=[radiograph_input, prediction_box],
        )

    return page


In [None]:
if __name__ == "__main__":
    # Launch settings gathered in one place so they are easy to tweak.
    launch_options = {
        "server_name": "0.0.0.0",  # listen on all interfaces (e.g. for RunPod)
        "server_port": 7860,       # Gradio's default port
        "share": True,             # optional: also request a public share link
    }
    demo = build_demo()
    demo.launch(**launch_options)