In [1]:
pip install unsloth transformers trl



In [2]:
import torch
from unsloth import FastLanguageModel
from datasets import load_dataset
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth.chat_templates import get_chat_template, standardize_sharegpt

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!


In [3]:
model, tokenizer=FastLanguageModel.from_pretrained(
        model_name="unsloth/Llama-3.2-3B-Instruct",
        max_seq_length=2048,
        load_in_4bit=True
)

==((====))==  Unsloth 2025.8.9: Fast Llama patching. Transformers: 4.55.4.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.8.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.4.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.32.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


In [4]:
model=FastLanguageModel.get_peft_model(
    model, r=16,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]
)

Unsloth 2025.8.9 patched 28 layers with 28 QKV layers, 28 O layers and 28 MLP layers.


In [5]:
# tokenizer= get_chat_template(tokenizer, chat_template="llama-3.1")
tokenizer = get_chat_template(tokenizer, chat_template="llama-3.2")


In [6]:
dataset=load_dataset("omi-health/medical-dialogue-to-soap-summary", split="train")

In [7]:
dataset=standardize_sharegpt(dataset)

In [8]:
dataset

Dataset({
    features: ['dialogue', 'soap', 'prompt', 'messages', 'messages_nosystem'],
    num_rows: 9250
})

In [9]:
# print(dataset[0])
dataset[0]

{'dialogue': "Doctor: Hello, how can I help you today?\nPatient: My son has been having some issues with speech and development. He's 13 years old now.\nDoctor: I see. Can you tell me more about his symptoms? Does he have any issues with muscle tone or hypotonia?\nPatient: No, he doesn't have hypotonia. But he has mild to moderate speech and developmental delay, and he's been diagnosed with attention deficit disorder.\nDoctor: Thank you for sharing that information. We'll run some tests, including an MRI, to get a better understanding of your son's condition. \n(After the tests)\nDoctor: The MRI results are in, and I'm glad to say that there are no structural brain anomalies. However, I did notice some physical characteristics. Does your son have any facial features like retrognathia, mild hypertelorism, or a slightly elongated philtrum and thin upper lip?\nPatient: Yes, he has all of those features. His hands are also broad and short. And his feet have mild syndactyly of the second an

In [10]:
# # Convert dialogue + soap into OpenAI-style conversation
# def convert_to_conversation(example):
#     return {
#         "conversations": [
#             {"role": "user", "content": example["dialogue"]},
#             {"role": "assistant", "content": example["soap"]}
#         ]
#     }

# dataset = dataset.map(convert_to_conversation)

# # Now apply chat template
# dataset = dataset.map(
#     lambda examples: {
#         "text": [
#             tokenizer.apply_chat_template(examples["conversations"], tokenize=False)
#         ]
#     }
# )
# Convert dialogue + soap into chat-style
def convert_to_conversation(example):
    return {
        "text": tokenizer.apply_chat_template(
            [
                {"role": "user", "content": example["dialogue"]},
                {"role": "assistant", "content": example["soap"]}
            ],
            tokenize=False
        )
    }

dataset = dataset.map(convert_to_conversation)


In [11]:
dataset

Dataset({
    features: ['dialogue', 'soap', 'prompt', 'messages', 'messages_nosystem', 'text'],
    num_rows: 9250
})

In [12]:
# # Fix the text column to be a single string, not a list
# dataset = dataset.map(
#     lambda example: {
#         "text": tokenizer.apply_chat_template(example["conversations"], tokenize=False)
#     }
# )
# Convert dialogue + soap into chat-style format
def convert_to_conversation(example):
    return {
        "text": tokenizer.apply_chat_template(
            [
                {"role": "user", "content": example["dialogue"]},
                {"role": "assistant", "content": example["soap"]}
            ],
            tokenize=False
        )
    }

dataset = dataset.map(convert_to_conversation)


In [13]:
print(dataset[0]["text"])


<|begin_of_text|><|start_header_id|>system<|end_header_id|>

Cutting Knowledge Date: December 2023
Today Date: 26 July 2024

<|eot_id|><|start_header_id|>user<|end_header_id|>

Doctor: Hello, how can I help you today?
Patient: My son has been having some issues with speech and development. He's 13 years old now.
Doctor: I see. Can you tell me more about his symptoms? Does he have any issues with muscle tone or hypotonia?
Patient: No, he doesn't have hypotonia. But he has mild to moderate speech and developmental delay, and he's been diagnosed with attention deficit disorder.
Doctor: Thank you for sharing that information. We'll run some tests, including an MRI, to get a better understanding of your son's condition. 
(After the tests)
Doctor: The MRI results are in, and I'm glad to say that there are no structural brain anomalies. However, I did notice some physical characteristics. Does your son have any facial features like retrognathia, mild hypertelorism, or a slightly elongated phi

In [14]:
import os
os.environ["WANDB_DISABLED"] = "true"


In [15]:
# from transformers import TrainingArguments

# training_args = TrainingArguments(
#     output_dir="./outputs",
#     per_device_train_batch_size=2,
#     gradient_accumulation_steps=1,
#     num_train_epochs=1,        # quick test
#     max_steps=50,              # stop after 50 steps
#     logging_steps=5,           # show loss every 5 steps
#     save_steps=50,
#     learning_rate=2e-5,
#     fp16=True,
#     bf16=False,
#     report_to="none"           # ✅ no wandb
# )
training_args = TrainingArguments(
    output_dir="./outputs",
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,   # effectively batch=8
    num_train_epochs=3,
    logging_steps=20,
    save_steps=200,
    eval_steps=200,
    learning_rate=2e-5,
    fp16=True,
    bf16=False,
    report_to="none"
)


In [16]:
small_dataset = dataset.select(range(500))  # use only first 500 examples


In [17]:
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=small_dataset,   # ✅ smaller dataset
    dataset_text_field="text",
    max_seq_length=512,            # shorter input → faster
    args=training_args,
)

trainer.train()


Unsloth: Tokenizing ["text"]:   0%|          | 0/500 [00:00<?, ? examples/s]

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 500 | Num Epochs = 3 | Total steps = 189
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 24,313,856 of 3,237,063,680 (0.75% trained)


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss
20,1.6782
40,1.5563
60,1.4304
80,1.4079
100,1.3608
120,1.2937
140,1.3167
160,1.2739
180,1.2817


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


TrainOutput(global_step=189, training_loss=1.395918568606099, metrics={'train_runtime': 1869.8917, 'train_samples_per_second': 0.802, 'train_steps_per_second': 0.101, 'total_flos': 2.565622821950669e+16, 'train_loss': 1.395918568606099})

In [18]:
# Save the fine-tuned model + tokenizer
save_dir = "./finetuned_model"
trainer.save_model(save_dir)
tokenizer.save_pretrained(save_dir)


In [None]:
# for new:
#
#
#
from unsloth import FastLanguageModel
from transformers import AutoTokenizer

# Load base model in 4bit again
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/Llama-3.2-3B-Instruct",
    max_seq_length=2048,
    load_in_4bit=True,
)

# Load your LoRA adapters
from peft import PeftModel
model = PeftModel.from_pretrained(model, "./finetuned_model")

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained("./finetuned_model")


In [23]:
from unsloth import FastLanguageModel

# =========================
# 1. Load finetuned model
# =========================
inference_model, inference_tokenizer = FastLanguageModel.from_pretrained(
    model_name="./finetuned_model",   # path to your saved finetuned model
    max_seq_length=2048,              # same as training
    load_in_4bit=True                 # efficient inference
)



==((====))==  Unsloth 2025.8.9: Fast Llama patching. Transformers: 4.55.4.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.8.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.4.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.32.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


In [None]:

# =========================
# 2. Example conversation
# =========================
dialogue = """Doctor: Hello, how can I help you today?
Patient: My son has been having some issues with speech and development. He's 13 years old now.
Doctor: I see. Can you tell me more about his symptoms? Does he have any issues with muscle tone or hypotonia?
Patient: No, he doesn't have hypotonia. But he has mild to moderate speech and developmental delay, and he's been diagnosed with attention deficit disorder.
Doctor: Thank you for sharing that information. We'll run some tests, including an MRI, to get a better understanding of your son's condition.
(After the tests)
Doctor: The MRI results are in, and I'm glad to say that there are no structural brain anomalies. However, I did notice some physical characteristics. Does your son have any facial features like retrognathia, mild hypertelorism, or a slightly elongated philtrum and thin upper lip?
Patient: Yes, he has all of those features. His hands are also broad and short. And his feet have mild syndactyly of the second and third toe, with a sandal gap in both feet.
Doctor: Thank you for confirming that. We also conducted Whole Exome Sequencing (WES) analyses, and we found a de novo frameshift variant in his genetic makeup. Specifically, it's Chr1(GRCh37):g.244217335del, NM_205768.2(ZBTB18):c.259del(p.(Leu87Cysfs*21)). This leads to a premature termination codon located more than 400 codons upstream of the canonical termination codon.
Patient: What does that mean for my son?
Doctor: This genetic variant may be contributing to your son's speech, developmental delay, and attention deficit disorder. It's important that we continue monitoring his progress and provide appropriate support for his development.
Patient: What should we do for follow-up?
Doctor: Regular visits with a speech and language therapist, an occupational therapist, and a psychologist can help address your son's developmental and attention deficit disorder needs. I will also recommend regular check-ups with me to monitor his growth and overall health.
Patient: Thank you, doctor. We will follow your recommendations and keep an eye on his progress."""

# Wrap the dialogue as a user prompt
formatted_prompt = inference_tokenizer.apply_chat_template(
    [
        {"role": "user", "content": dialogue + "\n\nConvert the above doctor-patient conversation into a structured SOAP note"}
    ],
    tokenize=False
)

# =========================
# 3. Tokenize input
# =========================
model_inputs = inference_tokenizer(
    formatted_prompt,
    return_tensors="pt"
).to("cuda")

# =========================
# 4. Generate response
# =========================
generated_ids = inference_model.generate(
    **model_inputs,
    max_new_tokens=512,
    temperature=0.7,   # creativity
    do_sample=True,
    pad_token_id=inference_tokenizer.pad_token_id
)

# =========================
# 5. Decode response
# =========================
response = inference_tokenizer.batch_decode(
    generated_ids,
    skip_special_tokens=True
)[0]

print("=== Model Response ===")
print(response)

=== Model Response ===
system

Cutting Knowledge Date: December 2023
Today Date: 25 Aug 2025

user

Doctor: Hello, how can I help you today?
Patient: My son has been having some issues with speech and development. He's 13 years old now.
Doctor: I see. Can you tell me more about his symptoms? Does he have any issues with muscle tone or hypotonia?
Patient: No, he doesn't have hypotonia. But he has mild to moderate speech and developmental delay, and he's been diagnosed with attention deficit disorder.
Doctor: Thank you for sharing that information. We'll run some tests, including an MRI, to get a better understanding of your son's condition. 
(After the tests)
Doctor: The MRI results are in, and I'm glad to say that there are no structural brain anomalies. However, I did notice some physical characteristics. Does your son have any facial features like retrognathia, mild hypertelorism, or a slightly elongated philtrum and thin upper lip?
Patient: Yes, he has all of those features. His han

In [24]:

# =========================
# 2. Example conversation
# =========================
dialogue = """Doctor: Hello, Patient D. How are you feeling today?
Patient D: I'm feeling fine, doc, no complaints. Just here for a regular check-up.
Doctor: That's good to hear. I see that you are a 60-year-old African American male and currently asymptomatic. I also noticed that your 62-year-old brother had prostate cancer and was successfully treated with radiation. Is that correct?
Patient D: Yes, that's correct. My brother went through radiation treatment and is doing well now.
Doctor: I'm glad to hear that your brother is doing well. Given your African American background and having a first-degree relative diagnosed with prostate cancer before 65 years of age, it's important to discuss the possibility of PSA testing with you.
Patient D: Hmm, okay. What exactly is PSA testing, doc?
Doctor: PSA testing, or Prostate-Specific Antigen testing, is a blood test that helps detect the presence of prostate cancer. However, the recommendations for PSA-based screening vary among different organizations.
Patient D: So, do I need to get this screening done?
Doctor: According to the AAFP, NCI, CDC, ACS, AUA, NCCN, and ASCO, it would be less difficult for clinicians to discuss PSA testing with you, considering your background and family history. These organizations recommend that we have a conversation about the potential benefits and risks of the test.
Patient D: Alright, I see. What do the USPSTF recommendations say?
Doctor: The USPSTF recommendations may not recommend PSA-based screening for you. It is important to understand that PSA testing has both potential benefits and risks, and the decision to undergo the test should be made after discussing these factors with your healthcare provider.
Patient D: I understand. So, what should I do next, doc?
Doctor: I recommend that we have a more detailed conversation about the potential benefits and risks of PSA testing, so you can make an informed decision. You may want to take some time to think about it and do some research before we discuss further.
Patient D: Okay, that sounds like a good plan. I'll do some more research and get back to you with any questions or concerns.
Doctor: That's a great approach. In the meantime, if you have any symptoms or concerns, please don't hesitate to reach out to us. I'll be here to help and guide you through the process.
Patient D: Thank you, doc. I appreciate your help and guidance.
Doctor: You're welcome, Patient D. I look forward to our next discussion. Take care and feel free to reach out if you have any questions.
Patient D: Will do. Thanks again, and see you soon."""

# Wrap the dialogue as a user prompt
formatted_prompt = inference_tokenizer.apply_chat_template(
    [
        {"role": "user", "content": dialogue + "\n\nConvert the above doctor-patient conversation into a structured SOAP note"}
    ],
    tokenize=False
)

# =========================
# 3. Tokenize input
# =========================
model_inputs = inference_tokenizer(
    formatted_prompt,
    return_tensors="pt"
).to("cuda")

# =========================
# 4. Generate response
# =========================
generated_ids = inference_model.generate(
    **model_inputs,
    max_new_tokens=512,
    temperature=0.7,   # creativity
    do_sample=True,
    pad_token_id=inference_tokenizer.pad_token_id
)

# =========================
# 5. Decode response
# =========================
response = inference_tokenizer.batch_decode(
    generated_ids,
    skip_special_tokens=True
)[0]

print("=== Model Response ===")
print(response)

=== Model Response ===
system

Cutting Knowledge Date: December 2023
Today Date: 26 Aug 2025

user

Doctor: Hello, Patient D. How are you feeling today?
Patient D: I'm feeling fine, doc, no complaints. Just here for a regular check-up.
Doctor: That's good to hear. I see that you are a 60-year-old African American male and currently asymptomatic. I also noticed that your 62-year-old brother had prostate cancer and was successfully treated with radiation. Is that correct?
Patient D: Yes, that's correct. My brother went through radiation treatment and is doing well now.
Doctor: I'm glad to hear that your brother is doing well. Given your African American background and having a first-degree relative diagnosed with prostate cancer before 65 years of age, it's important to discuss the possibility of PSA testing with you.
Patient D: Hmm, okay. What exactly is PSA testing, doc?
Doctor: PSA testing, or Prostate-Specific Antigen testing, is a blood test that helps detect the presence of prostat

In [25]:
from unsloth import FastLanguageModel
import torch

# =========================
# 1. Load base (not fine-tuned) model
# =========================
base_model, base_tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/Llama-3.2-3B-Instruct",  # original base model
    max_seq_length=2048,
    load_in_4bit=True,
)

# =========================
# 2. Load fine-tuned model
# =========================
ft_model, ft_tokenizer = FastLanguageModel.from_pretrained(
    model_name="./finetuned_model",  # your saved model dir
    max_seq_length=2048,
    load_in_4bit=True,
)

# =========================
# 3. Example doctor-patient dialogue
# =========================
dialogue = """Doctor: Good morning. What brings you in today?
Patient: I’ve had a sore throat and mild fever for the last three days.
Doctor: Do you have any cough or difficulty swallowing?
Patient: Yes, I have a dry cough and it hurts a little when I swallow.
Doctor: Any history of allergies or recent sick contacts?
Patient: No allergies, but my son had a cold last week.
Doctor: I see. Let’s take a look at your throat.
"""

# Wrap prompt
formatted_prompt = ft_tokenizer.apply_chat_template(
    [
        {"role": "user", "content": dialogue + "\n\nConvert the above doctor-patient conversation into a structured SOAP note"}
    ],
    tokenize=False
)

# =========================
# 4. Helper for generation
# =========================
def generate(model, tokenizer, prompt):
    inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=512,
            temperature=0.7,
            top_p=0.9,
            do_sample=True,
            pad_token_id=tokenizer.pad_token_id,
        )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# =========================
# 5. Run both models
# =========================
base_output = generate(base_model, base_tokenizer, formatted_prompt)
ft_output   = generate(ft_model, ft_tokenizer, formatted_prompt)

print("=== Base Model (Before Fine-tuning) ===")
print(base_output)
print("\n\n=== Fine-tuned Model (After Fine-tuning) ===")
print(ft_output)


==((====))==  Unsloth 2025.8.9: Fast Llama patching. Transformers: 4.55.4.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.8.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.4.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.32.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
==((====))==  Unsloth 2025.8.9: Fast Llama patching. Transformers: 4.55.4.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.8.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.4.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.32.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
=== Base Model (Before Fine-tuning) ===
system

Cutting Knowledge Date: December 2023
To

In [26]:
save_dir = "/content/finetuned_model"
trainer.save_model(save_dir)
tokenizer.save_pretrained(save_dir)


('/content/finetuned_model/tokenizer_config.json',
 '/content/finetuned_model/special_tokens_map.json',
 '/content/finetuned_model/chat_template.jinja',
 '/content/finetuned_model/tokenizer.json')

In [27]:
from google.colab import drive
drive.mount('/content/drive')

!cp -r /content/finetuned_model /content/drive/MyDrive/finetuned_model


Mounted at /content/drive


In [None]:
from unsloth import FastLanguageModel

# Path where you saved in Drive
model_path = "/content/drive/MyDrive/finetuned_model"

# Reload fine-tuned model directly
inference_model, inference_tokenizer = FastLanguageModel.from_pretrained(
    model_name=model_path,
    max_seq_length=2048,
    load_in_4bit=True,
)


In [3]:
pip install nbstripout

Collecting nbstripout
  Downloading nbstripout-0.8.1-py2.py3-none-any.whl.metadata (19 kB)
Downloading nbstripout-0.8.1-py2.py3-none-any.whl (16 kB)
Installing collected packages: nbstripout
Successfully installed nbstripout-0.8.1


In [6]:
!nbstripout LLM_FineTune.ipynb

Could not strip 'LLM_FineTune.ipynb': file not found
