In [1]:
%%capture
import os

# Check if running in Colab
if "COLAB_" not in "".join(os.environ.keys()):
    # Local environment (e.g., your PC)
    !pip install unsloth
else:
    # Google Colab setup
    !pip install --no-deps bitsandbytes accelerate xformers==0.0.29.post3 peft trl triton cut_cross_entropy unsloth_zoo
    !pip install sentencepiece protobuf==5.29.1 datasets huggingface_hub hf_transfer fsspec==2025.3.2
    !pip install --no-deps unsloth


In [2]:
import json

# Load dataset from uploaded file
with open("medical_dataset.json", "r", encoding="utf-8") as f:
    raw_data = json.load(f)

# Create prompt structure
def build_prompt(sample):
    return {
        "input": f"""### Instruction:
You are a medical assistant. Read the medical report and provide:
1. A short summary
2. Your interpretation
3. A possible solution

### Input:
{sample['report']}

### Response:
1. Summary: {sample['summary']}
2. Interpretation: {sample['interpretation']}
3. Solution: {sample['solution']}"""
    }

formatted_data = [build_prompt(example) for example in raw_data]


In [3]:
from datasets import Dataset

dataset = Dataset.from_list(formatted_data)
print(dataset[0])


{'input': '### Instruction:\nYou are a medical assistant. Read the medical report and provide:\n1. A short summary\n2. Your interpretation\n3. A possible solution\n\n### Input:\n45-year-old male presents with 3 days of productive cough, fever (38.5°C), and right-sided chest pain. On examination: crackles in right lower lobe. WBC 12.5k, CRP 45. Chest X-ray shows right lower lobe consolidation.\n\n### Response:\n1. Summary: Middle-aged male with community-acquired pneumonia\n2. Interpretation: Clinical presentation and imaging consistent with bacterial pneumonia, likely Streptococcus pneumoniae\n3. Solution: 1. Start amoxicillin-clavulanate 875/125mg PO q12h\n2. Chest physiotherapy\n3. Follow-up in 48 hours or if symptoms worsen'}


In [4]:
from unsloth import FastLanguageModel

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/mistral-7b-bnb-4bit",
    max_seq_length = 2048 ,
    dtype = None,
    load_in_4bit = True,
)


🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2025.7.3: Fast Mistral patching. Transformers: 4.53.1.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 7.5. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


In [5]:
model = FastLanguageModel.get_peft_model(
    model,
    r = 16,
    lora_alpha = 32,
    lora_dropout = 0.0,  # Set to 0 for full Unsloth optimization
    bias = "none",
)


Unsloth 2025.7.3 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


In [6]:
def tokenize(example):
    return tokenizer(example["input"], truncation=True, padding="max_length", max_length=2048)

tokenized_dataset = dataset.map(tokenize, batched=True)

from transformers import DataCollatorForLanguageModeling
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)


Map:   0%|          | 0/8 [00:00<?, ? examples/s]

In [7]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    per_device_train_batch_size = 2,       # Set to 1 if you get out-of-memory errors
    gradient_accumulation_steps = 4,
    num_train_epochs = 3,
    learning_rate = 2e-4,
    fp16 = True,                           # Mixed precision for speed/memory
    logging_steps = 10,
    report_to = "none",
    output_dir = "outputs",
)


In [8]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    data_collator=data_collator,
)


In [9]:
trainer.train()


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 8 | Num Epochs = 3 | Total steps = 3
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 41,943,040 of 7,283,675,136 (0.58% trained)


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss


TrainOutput(global_step=3, training_loss=6.806591033935547, metrics={'train_runtime': 153.6307, 'train_samples_per_second': 0.156, 'train_steps_per_second': 0.02, 'total_flos': 2109388496044032.0, 'train_loss': 6.806591033935547, 'epoch': 3.0})

In [10]:
model.save_pretrained("finetuned_med_mistral")
tokenizer.save_pretrained("finetuned_med_mistral")


('finetuned_med_mistral/tokenizer_config.json',
 'finetuned_med_mistral/special_tokens_map.json',
 'finetuned_med_mistral/tokenizer.model',
 'finetuned_med_mistral/added_tokens.json',
 'finetuned_med_mistral/tokenizer.json')

In [11]:
sample_report = "A 65-year-old male with fever and cough."
prompt = f"""### Instruction:
You are a medical assistant. Read the medical report and provide:
[SUMMARY]
[INTERPRETATION]
[SOLUTION]

### Input:
{sample_report}

### Response:"""
print(prompt)



inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
outputs = model.generate(**inputs, max_new_tokens=500)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))


### Instruction:
You are a medical assistant. Read the medical report and provide:
[SUMMARY]
[INTERPRETATION]
[SOLUTION]

### Input:
A 65-year-old male with fever and cough.

### Response:
### Instruction:
You are a medical assistant. Read the medical report and provide:
[SUMMARY]
[INTERPRETATION]
[SOLUTION]

### Input:
A 65-year-old male with fever and cough.

### Response:
The patient is a 65-year-old male with a history of hypertension and diabetes. He presents with a 2-week history of fever, cough, and shortness of breath. On physical examination, he is afebrile, tachycardic, and tachypneic. Chest X-ray shows bilateral infiltrates. Laboratory tests show a white blood cell count of 12,000/mm3, hemoglobin of 10 g/dL, and platelets of 100,000/mm3. The patient is diagnosed with pneumonia and started on antibiotics.

### Summary:
The patient is a 65-year-old male with a history of hypertension and diabetes. He presents with a 2-week history of fever, cough, and shortness of breath. On p

In [12]:
!pip install fastapi uvicorn nest-asyncio
!wget https://github.com/cloudflare/cloudflared/releases/latest/download/cloudflared-linux-amd64 -O cloudflared
!chmod +x cloudflared


--2025-07-11 19:42:08--  https://github.com/cloudflare/cloudflared/releases/latest/download/cloudflared-linux-amd64
Resolving github.com (github.com)... 20.205.243.166
Connecting to github.com (github.com)|20.205.243.166|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://github.com/cloudflare/cloudflared/releases/download/2025.7.0/cloudflared-linux-amd64 [following]
--2025-07-11 19:42:09--  https://github.com/cloudflare/cloudflared/releases/download/2025.7.0/cloudflared-linux-amd64
Reusing existing connection to github.com:443.
HTTP request sent, awaiting response... 302 Found
Location: https://release-assets.githubusercontent.com/github-production-release-asset/106867604/37d2bad8-a2ed-4b93-8139-cbb15162d81d?sp=r&sv=2018-11-09&sr=b&spr=https&se=2025-07-11T20%3A41%3A40Z&rscd=attachment%3B+filename%3Dcloudflared-linux-amd64&rsct=application%2Foctet-stream&skoid=96c2d410-5711-43a1-aedd-ab1947aa7ab0&sktid=398a6654-997b-47e9-b12b-9515b896b4de&skt=2025-07-

In [13]:
from unsloth import FastLanguageModel
from transformers import BitsAndBytesConfig
import torch

# Configure 4-bit quantization with CPU offload fallback
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.float16,
    llm_int8_enable_fp32_cpu_offload=True  # ✅ allows CPU fallback
)

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="finetuned_med_mistral",
    max_seq_length=4096,
    dtype=None,  # Let Unsloth auto-select best dtype
    device_map="auto",  # ✅ Smart device placement
    quantization_config=bnb_config
)

model.eval()


==((====))==  Unsloth 2025.7.3: Fast Mistral patching. Transformers: 4.53.1.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 7.5. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): MistralForCausalLM(
      (model): MistralModel(
        (embed_tokens): Embedding(32000, 4096, padding_idx=0)
        (layers): ModuleList(
          (0-31): 32 x MistralDecoderLayer(
            (self_attn): MistralAttention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=4096, out_features=4096, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Identity()
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=4096, out_features=16, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=16, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (k_proj): l

In [14]:
from fastapi import FastAPI
from pydantic import BaseModel
import torch
import re

app = FastAPI()

from fastapi.middleware.cors import CORSMiddleware

app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],  # Or replace "*" with the actual domain for security
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)


class MedicalReportRequest(BaseModel):
    report: str

@app.post("/analyze")
def analyze_report(request: MedicalReportRequest):
    # Prompt template to guide the model
    prompt_template = """### Instruction:
You are a medical assistant. Read the following medical report and provide:

1. A short summary
2. Your interpretation
3. A solution — structured as multiple clearly numbered steps (e.g., 1., 2., 3.) on separate lines

### Report:
{report}

### Response:
1. Summary:"""

    prompt = prompt_template.format(report=request.report)

    # Tokenize and send to GPU
    inputs = tokenizer(prompt, return_tensors="pt").to("cuda")

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=300,
            eos_token_id=tokenizer.eos_token_id,
            pad_token_id=tokenizer.pad_token_id,
            temperature=0.7,
            top_p=0.9,
            repetition_penalty=1.2
        )

    decoded = tokenizer.decode(outputs[0], skip_special_tokens=True)
    print("=== RAW DECODED OUTPUT ===")
    print(decoded)

    # Regex parsing
    summary = re.search(r"1\. Summary:\s*(.*?)(?:2\. Interpretation:|3\.|$)", decoded, re.DOTALL)
    interpretation = re.search(r"2\. Interpretation:\s*(.*?)(?:3\.(?: Solution| Possible Solution):|$)", decoded, re.DOTALL)
    solution = re.search(r"3\.(?: Solution| Possible Solution):\s*(.*)", decoded, re.DOTALL)

    # Deduplicate lines
    def deduplicate_solution(text):
        lines = text.strip().split("\n")
        seen = set()
        cleaned = []
        for line in lines:
            line = line.strip()
            if line and line not in seen:
                seen.add(line)
                cleaned.append(line)
        return "\n".join(cleaned)

    # Auto-number if only one block or unnumbered
    def auto_number_if_needed(text):
        lines = text.strip().split("\n")
        if len(lines) == 1:
            return "1. " + lines[0]
        if any(line.strip().startswith("1.") for line in lines):
            return text
        return "\n".join(f"{i+1}. {line.strip()}" for i, line in enumerate(lines) if line.strip())

    # Split compound sentences into separate steps using regex instead of nltk
    def split_sentences_into_numbered_steps(text):
        if sum(1 for line in text.splitlines() if line.strip().startswith(tuple(f"{i}." for i in range(1, 10)))) > 1:
            return text
        sentences = re.split(r'(?<=[.!?;])\s+', text.strip())
        return "\n".join(f"{i+1}. {s.strip()}" for i, s in enumerate(sentences) if s.strip())

    # Apply all processing steps
    cleaned_solution = deduplicate_solution(solution.group(1).strip()) if solution else ""
    auto_numbered = auto_number_if_needed(cleaned_solution)
    final_solution = split_sentences_into_numbered_steps(auto_numbered)

    return {
        "summary": summary.group(1).strip() if summary else "",
        "interpretation": interpretation.group(1).strip() if interpretation else "",
        "solution": final_solution
    }


In [15]:
import nest_asyncio
import threading
import uvicorn

nest_asyncio.apply()

def run():
    uvicorn.run(app, host="0.0.0.0", port=8000)

threading.Thread(target=run).start()


In [None]:
!./cloudflared tunnel --url http://localhost:8000 --no-autoupdate

INFO:     Started server process [33566]
INFO:     Waiting for application startup.
INFO:     Application startup complete.
INFO:     Uvicorn running on http://0.0.0.0:8000 (Press CTRL+C to quit)


[90m2025-07-11T19:42:31Z[0m [32mINF[0m Thank you for trying Cloudflare Tunnel. Doing so, without a Cloudflare account, is a quick way to experiment and try it out. However, be aware that these account-less Tunnels have no uptime guarantee, are subject to the Cloudflare Online Services Terms of Use (https://www.cloudflare.com/website-terms/), and Cloudflare reserves the right to investigate your use of Tunnels for violations of such terms. If you intend to use Tunnels in production you should use a pre-created named tunnel by following: https://developers.cloudflare.com/cloudflare-one/connections/connect-apps
[90m2025-07-11T19:42:31Z[0m [32mINF[0m Requesting new quick Tunnel on trycloudflare.com...
[90m2025-07-11T19:42:36Z[0m [32mINF[0m +--------------------------------------------------------------------------------------------+
[90m2025-07-11T19:42:36Z[0m [32mINF[0m |  Your quick Tunnel has been created! Visit it at (it may take some time to be reachable):  |
[90m2025

The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


=== RAW DECODED OUTPUT ===
### Instruction:
You are a medical assistant. Read the following medical report and provide:

1. A short summary
2. Your interpretation
3. A solution — structured as multiple clearly numbered steps (e.g., 1., 2., 3.) on separate lines

### Report:
A 30-year-old female presents with high-grade fever, severe headache, photophobia, and neck stiffness. She is lethargic and has a positive Brudzinski sign.

### Response:
1. Summary: The patient has meningitis.
2. Interpretation: Meningitis is an inflammation of the membranes that cover the brain and spinal cord. It can be caused by bacteria or viruses. Symptoms include fever, headache, nausea, vomiting, confusion, seizures, and stiff neck. Treatment includes antibiotics for bacterial infections and antiviral medications for viral infections.
3. Solution: Administer IV fluids to maintain hydration; administer pain medication such as acetaminophen or ibuprofen; monitor vital signs closely; consult with neurologist if

The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


INFO:     197.26.153.79:0 - "OPTIONS /analyze HTTP/1.1" 200 OK


The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


INFO:     197.26.153.79:0 - "POST /analyze HTTP/1.1" 500 Internal Server Error


ERROR:    Exception in ASGI application
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/uvicorn/protocols/http/h11_impl.py", line 403, in run_asgi
    result = await app(  # type: ignore[func-returns-value]
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/uvicorn/middleware/proxy_headers.py", line 60, in __call__
    return await self.app(scope, receive, send)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/fastapi/applications.py", line 1054, in __call__
    await super().__call__(scope, receive, send)
  File "/usr/local/lib/python3.11/dist-packages/starlette/applications.py", line 112, in __call__
    await self.middleware_stack(scope, receive, send)
  File "/usr/local/lib/python3.11/dist-packages/starlette/middleware/errors.py", line 187, in __call__
    raise exc
  File "/usr/local/lib/python3.11/dist-packages/starlette/middleware/errors.py",

INFO:     197.26.153.79:0 - "OPTIONS /analyze HTTP/1.1" 200 OK


The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


INFO:     197.26.153.79:0 - "POST /analyze HTTP/1.1" 500 Internal Server Error


ERROR:    Exception in ASGI application
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/uvicorn/protocols/http/h11_impl.py", line 403, in run_asgi
    result = await app(  # type: ignore[func-returns-value]
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/uvicorn/middleware/proxy_headers.py", line 60, in __call__
    return await self.app(scope, receive, send)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/fastapi/applications.py", line 1054, in __call__
    await super().__call__(scope, receive, send)
  File "/usr/local/lib/python3.11/dist-packages/starlette/applications.py", line 112, in __call__
    await self.middleware_stack(scope, receive, send)
  File "/usr/local/lib/python3.11/dist-packages/starlette/middleware/errors.py", line 187, in __call__
    raise exc
  File "/usr/local/lib/python3.11/dist-packages/starlette/middleware/errors.py",

=== RAW DECODED OUTPUT ===
### Instruction:
You are a medical assistant. Read the following medical report and provide:

1. A short summary
2. Your interpretation
3. A solution — structured as multiple clearly numbered steps (e.g., 1., 2., 3.) on separate lines

### Report:
A 30-year-old female presents with high-grade fever, severe headache, photophobia, and neck stiffness. She is lethargic and has a positive Brudzinski sign.

### Response:
1. Summary: The patient has meningitis.
2. Interpretation: Meningitis is an inflammation of the membranes that cover the brain and spinal cord. It can be caused by bacteria or viruses. Symptoms include fever, headache, nausea, vomiting, confusion, seizures, and stiff neck. Treatment includes antibiotics for bacterial infections and antiviral medications for viral infections.
3. Solution: Administer IV fluids to maintain hydration; administer pain medication such as acetaminophen or ibuprofen; monitor vital signs closely; consult with neurologist if

The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


INFO:     197.26.153.79:0 - "OPTIONS /analyze HTTP/1.1" 200 OK


The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


=== RAW DECODED OUTPUT ===
### Instruction:
You are a medical assistant. Read the following medical report and provide:

1. A short summary
2. Your interpretation
3. A solution — structured as multiple clearly numbered steps (e.g., 1., 2., 3.) on separate lines

### Report:
A 55-year-old man presents with chest pain radiating to the left arm, shortness of breath, and diaphoresis. ECG shows ST-elevation in leads II, III, and aVF.

### Response:
1. Summary: The patient is experiencing an acute myocardial infarction.
2. Interpretation: The patient needs immediate treatment for his condition.
3. Solution: Administer    1. Call emergency services <br />/p>
INFO:     197.26.153.79:0 - "POST /analyze HTTP/1.1" 200 OK
=== RAW DECODED OUTPUT ===
### Instruction:
You are a medical assistant. Read the following medical report and provide:

1. A short summary
2. Your interpretation
3. A solution — structured as multiple clearly numbered steps (e.g., 1., 2., 3.) on separate lines

### Report:
A 55-

The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


=== RAW DECODED OUTPUT ===
### Instruction:
You are a medical assistant. Read the following medical report and provide:

1. A short summary
2. Your interpretation
3. A solution — structured as multiple clearly numbered steps (e.g., 1., 2., 3.) on separate lines

### Report:
A 55-year-old man presents with chest pain radiating to the left arm, shortness of breath, and diaphoresis. ECG shows ST-elevation in leads II, III, and aVF

### Response:
1. Summary: The patient is experiencing an acute myocardial infarction. He should be given aspirin immediately and transported to the nearest hospital for emergency treatment.

2. Interpretation: The patient has had a heart attack. This means that one or more of his coronary arteries have become blocked by blood clots, preventing oxygen from reaching part of his heart muscle. If this continues, it can cause permanent damage to the affected area(s) of the heart.

3. Solution: Give the patient aspirin immediately and transport him to the nearest hos

In [None]:
import requests

url = "https://useful-candle-9d.trycloudflare.com/analyze"  # Replace with your actual URL

response = requests.post(url, json={
    "report": "A 70-year-old patient reports difficulty breathing and swelling in the ankles..."
})

print(response.json())
