In [2]:
# !pip install -U bitsandbytes
# !pip install -q transformers peft gradio accelerate

In [1]:
# Shell escape: run `nvidia-smi` to list the NVIDIA GPUs visible on this machine.
!nvidia-smi

Fri May  9 18:51:21 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 566.41                 Driver Version: 566.41         CUDA Version: 12.7     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                  Driver-Model | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA GeForce GTX 1650      WDDM  |   00000000:01:00.0 Off |                  N/A |
| N/A   58C    P8              4W /   60W |       1MiB /   4096MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [None]:
import modal
from fastapi import Request
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import PeftModel

# Modal application object that the endpoint function is registered on.
app = modal.App("qwen2.5-medical")

# Python packages installed into the container image, including FastAPI.
_PIP_PACKAGES = (
    "transformers",
    "torch",
    "peft",
    "accelerate",
    "bitsandbytes",
    "fastapi[standard]",
)

# Container image: Debian slim base plus the packages listed above.
image = modal.Image.debian_slim().pip_install(*_PIP_PACKAGES)

# Per-process cache so the tokenizer and model are loaded once and reused by
# later requests served by the same process, instead of being reloaded per call.
_MODEL_CACHE = {}


def _load_model_and_tokenizer():
    """Return ``(model, tokenizer)``, loading and caching them on first use.

    Loads the 4-bit quantized base model, applies the LoRA adapter, merges the
    adapter into the base weights and switches the model to eval mode.
    """
    if "model" not in _MODEL_CACHE:
        base_model_name = "Qwen/Qwen2.5-3B-Instruct"
        adapter_model_id = "AbdullahAlnemr1/qwen2.5-medical-lora"

        # Tokenizer, with EOS reused as the padding token.
        tokenizer = AutoTokenizer.from_pretrained(base_model_name, trust_remote_code=True)
        tokenizer.pad_token = tokenizer.eos_token
        tokenizer.padding_side = "right"

        # 4-bit NF4 quantization with double quantization and float16 compute.
        bnb_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_compute_dtype="float16",
            bnb_4bit_use_double_quant=True,
            bnb_4bit_quant_type="nf4",
        )

        base_model = AutoModelForCausalLM.from_pretrained(
            base_model_name,
            device_map="auto",
            quantization_config=bnb_config,
            trust_remote_code=True
        )

        # Attach the LoRA adapter and fold it into the base weights.
        model = PeftModel.from_pretrained(base_model, adapter_model_id)
        model = model.merge_and_unload()
        model.eval()

        _MODEL_CACHE["tokenizer"] = tokenizer
        _MODEL_CACHE["model"] = model

    return _MODEL_CACHE["model"], _MODEL_CACHE["tokenizer"]


@app.function(image=image, gpu="T4", timeout=600)
@modal.web_endpoint(method="POST")
async def web_response(request: Request):
    """Answer a medical question sent as JSON ``{"instruction": "<text>"}``.

    Args:
        request: Incoming FastAPI request whose body is a JSON object with an
            ``instruction`` field.

    Returns:
        ``{"response": <generated answer>}`` on success, or
        ``{"error": <message>}`` when the body is not valid JSON, the
        instruction is missing/blank, or the model produced no text.
    """
    # Parse JSON data from the request body; a malformed body is reported as
    # an error payload instead of an unhandled exception.
    try:
        data = await request.json()
    except ValueError:
        return {"error": "Request body must be valid JSON."}

    # Only a JSON object can carry the 'instruction' field.
    if not isinstance(data, dict):
        return {"error": "Missing instruction."}

    # Extract 'instruction' from the JSON data
    instruction = data.get("instruction")

    if not instruction or (isinstance(instruction, str) and not instruction.strip()):
        return {"error": "Missing instruction."}

    model, tokenizer = _load_model_and_tokenizer()

    # Construct the prompt
    prompt = f"""<|im_start|>system
You are a highly knowledgeable and accurate medical assistant trained to provide evidence-based medical advice. Answer clearly and concisely using medical best practices.<|im_end|>
<|im_start|>user
Instruction: {instruction}<|im_end|>
<|im_start|>assistant
"""

    # Tokenize the prompt.
    # NOTE(review): with truncation at 512 tokens, a very long instruction could
    # lose the trailing assistant header of the prompt — confirm whether the
    # instruction itself should be length-limited instead.
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512).to(model.device)

    # Generate the output from the model
    outputs = model.generate(
        **inputs,
        max_new_tokens=512,
        temperature=0.7,
        top_p=0.9,
        do_sample=True,
        pad_token_id=tokenizer.eos_token_id
    )

    # The generated sequence starts with the prompt tokens. Decode only the
    # newly generated tail: decoding with skip_special_tokens=True removes the
    # chat markers, so splitting the full text on them would leave the prompt
    # in the reply.
    prompt_length = inputs["input_ids"].shape[-1]
    generated_ids = outputs[0][prompt_length:]
    cleaned = tokenizer.decode(generated_ids, skip_special_tokens=True).strip()

    # Ensure that the cleaned response is valid
    if not cleaned:
        return {"error": "Failed to generate response from model."}

    # Return the response
    return {"response": cleaned}


In [None]:
import requests

# Put the URL of your deployed endpoint here.
url = ""
if not url:
    raise ValueError("Set `url` to the deployed endpoint URL before running this cell.")

# Without a timeout requests waits indefinitely; 600 s matches the endpoint's own limit.
res = requests.post(
    url,
    json={"instruction": "What are the symptoms of diabetes?"},
    timeout=600,
)
# Fail loudly on HTTP errors instead of trying to parse an error page as JSON.
res.raise_for_status()
print(res.json())

{'response': 'Error: Unable to parse the response.'}


In [None]:
from rich.markdown import Markdown
from rich.console import Console
import requests


# Put the URL of your deployed endpoint here.
url = ""
if not url:
    raise ValueError("Set `url` to the deployed endpoint URL before running this cell.")

question = "I'm a 35-year-old woman who has been experiencing persistent abdominal bloating, changes in bowel habits (alternating between constipation and diarrhea), and occasional lower abdominal pain for the past 3 months. I’ve also noticed that I feel more tired than usual and have lost a bit of weight without trying. I don’t have any significant medical history, and these symptoms have gradually worsened over time. Could this be something serious like colon cancer, or is it more likely to be something benign like IBS? What should I do next?"

# Without a timeout requests waits indefinitely; 600 s matches the endpoint's own limit.
res = requests.post(url, json={"instruction": question}, timeout=600)
# Fail loudly on HTTP errors instead of trying to parse an error page as JSON.
res.raise_for_status()

# Extract Markdown response. The endpoint reports failures under "error"
# rather than "response", so show that message instead of raising KeyError.
payload = res.json()
markdown_text = payload.get("response") or f"**Error:** {payload.get('error', 'No response received.')}"

# Render Markdown in terminal
console = Console()
console.print(Markdown(markdown_text))

## Speak or type your question

In [None]:
import gradio as gr
import whisper
import requests

# Whisper checkpoint used for speech-to-text ("medium" or "large" are alternatives if you have the resources).
WHISPER_MODEL_SIZE = "base"
model = whisper.load_model(WHISPER_MODEL_SIZE)

# URL of the deployed model API (empty placeholder; fill in before use).
MODAL_API_URL = ""

# Step 1: speech-to-text with the module-level Whisper model
def transcribe_audio(audio):
    """Transcribe an audio input to English text.

    Args:
        audio: Audio input passed straight to ``model.transcribe``; ``None``
            means nothing was supplied.

    Returns:
        The transcribed text, an empty string when ``audio`` is ``None``, or a
        "Transcription error" message if transcription raises.
    """
    if audio is not None:
        try:
            # Language is pinned to English (use "ar" for Arabic, "fr" for French, etc.)
            transcription = model.transcribe(audio, language="en")
            return transcription["text"]
        except Exception as exc:
            return f"🚨 Transcription error: {exc}"
    return ""

# Step 2: Send text (from voice or typed) to medical model
def ask_medical_model(question_text, timeout=600):
    """Send a question to the deployed model API and return its answer.

    Args:
        question_text: The question to ask; ``None``, empty or whitespace-only
            input is rejected without contacting the API.
        timeout: Seconds to wait for the HTTP response before giving up
            (default 600).

    Returns:
        The API's ``response`` text; otherwise a message describing the
        problem: invalid question, an error reported by the API, no usable
        payload, or a failure while contacting the API.
    """
    # The input may be None as well as blank; both count as "no question".
    if not question_text or not question_text.strip():
        return "❌ Please enter a valid question."
    try:
        response = requests.post(
            MODAL_API_URL,
            json={"instruction": question_text},
            timeout=timeout,  # avoid hanging forever if the API never answers
        )
        payload = response.json()
    except Exception as e:
        return f"🚨 Error contacting the model: {e}"

    if not isinstance(payload, dict):
        return "⚠️ No response received."
    if "response" in payload:
        return payload["response"]
    # Surface an error message from the API rather than hiding it behind
    # the generic fallback.
    if payload.get("error"):
        return f"⚠️ {payload['error']}"
    return "⚠️ No response received."

# Gradio UI: audio input and an editable question box side by side, then the
# transcribe button, the ask button and the Markdown area for the answer.
with gr.Blocks() as demo:
    gr.Markdown(
        "## 🧠 Whisper Medical Assistant\n"
        "Speak or type your medical question. The assistant will respond below."
    )

    with gr.Row():
        audio_input = gr.Audio(
            sources=["microphone", "upload"],
            type="filepath",
            label="🎙️ Speak your question",
        )
        transcribed_text = gr.Textbox(
            label="📝 Transcribed / Typed Question",
            placeholder="Will show transcription here (you can edit it).",
        )

    # Audio -> text: fills the question box with the transcription.
    transcribe_button = gr.Button("🔄 Transcribe Audio")
    transcribe_button.click(
        fn=transcribe_audio,
        inputs=audio_input,
        outputs=transcribed_text,
    )

    # Text -> answer: sends the question box contents to the model.
    submit_button = gr.Button("🩺 Ask the Medical Assistant")
    markdown_output = gr.Markdown(label="📋 Response")
    submit_button.click(
        fn=ask_medical_model,
        inputs=transcribed_text,
        outputs=markdown_output,
    )

# share=True additionally requests a public share link.
demo.launch(share=True)


  checkpoint = torch.load(fp, map_location=device)


* Running on local URL:  http://127.0.0.1:7863
* Running on public URL: https://edcd0fe167264c32b0.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


