In [1]:
#%%capture
!pip install unsloth # ✅ Installs the Unsloth library, which is designed for super-efficient fine-tuning of large language models (LLMs).
!pip install --force-reinstall --no-cache-dir --no-deps git+https://github.com/unslothai/unsloth.git

Collecting unsloth
  Downloading unsloth-2025.2.9-py3-none-any.whl.metadata (57 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m57.5/57.5 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting unsloth_zoo>=2025.2.3 (from unsloth)
  Downloading unsloth_zoo-2025.2.4-py3-none-any.whl.metadata (16 kB)
Collecting xformers>=0.0.27.post2 (from unsloth)
  Downloading xformers-0.0.29.post3-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (1.0 kB)
Collecting bitsandbytes (from unsloth)
  Downloading bitsandbytes-0.45.2-py3-none-manylinux_2_24_x86_64.whl.metadata (5.8 kB)
Collecting triton>=3.0.0 (from unsloth)
  Downloading triton-3.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.4 kB)
Collecting tyro (from unsloth)
  Downloading tyro-0.9.14-py3-none-any.whl.metadata (9.4 kB)
Collecting transformers!=4.47.0,>=4.46.1 (from unsloth)
  Downloading transformers-4.48.3-py3-none-any.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
from unsloth import FastLanguageModel
import torch
from trl import SFTTrainer
from unsloth import is_bfloat16_supported
from huggingface_hub import login 
from transformers import TrainingArguments
from datasets import load_dataset
import wandb
from kaggle_secrets import UserSecretsClient


🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!


In [3]:
#from huggingface_hub import login
#from kaggle_secrets import UserSecretsClient
#import wandb

user_secrets = UserSecretsClient()

hf_token = user_secrets.get_secret("HUGGINGFACE_TOKEN")
login(hf_token)

wb_token = user_secrets.get_secret("WANDB")
wandb.login(key=wb_token)

run = wandb.init(
    project = 'fine-tune-DeepSeek-R1-Distill-Llama-8B on Medical COT Dataset', 
    job_type="training", 
    anonymous="allow"
)

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mnikolaid[0m ([33mnikolaid-3cx[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


In [4]:
#load DeepSeek R1 and Tokenizer 
#from unsloth import FastLanguageModel
#import torch
max_seq_length = 2048 #consider increasing this to 4096 or 8192 to have longer contexts 
dtype = None  #bfloat16 to speed up training on nvidia A100/H100
load_in_4bit = True #Enables 4bit quantization 

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/DeepSeek-R1-Distill-Llama-8B",
    max_seq_length = max_seq_length,
    dtype = dtype, 
    load_in_4bit = load_in_4bit, 
    token = hf_token,
)

==((====))==  Unsloth 2025.2.9: Fast Llama patching. Transformers: 4.48.3.
   \\   /|    GPU: Tesla T4. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 7.5. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/5.96G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/236 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/52.9k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

In [5]:
#Test R1 on a medical use case before fine tuning 
#Define a system prompt style with placeholders  

prompt_style = """Below is an instruction that describes a task, paired with an input that provides further context. 
Write a response that appropriately completes the request. 
Before answering, think carefully about the question and create a step-by-step chain of thoughts to ensure a logical and accurate response.

### Instruction:
You are a medical expert with advanced knowledge in clinical reasoning, diagnostics, and treatment planning. 
Please answer the following medical question. 

### Question:
{}

### Response:
<think>{}"""


In [12]:
#Test Deepseek by providing a medical question and generating a response 
# Set a test question 
# Format the question using structured Prompt style, Tokenize the input and  move it to GPU 
# Generate a response 
# Decode the output tokens back into the text to get the final readable response 

question = "A 61-year-old woman with a long history of involuntary urine loss during activities like coughing or sneezing but no leakage at night undergoes a gynecological exam and Q-tip test. Based on these findings, what would cystometry most likely reveal about her residual volume and detrusor contractions?"

FastLanguageModel.for_inference(model) #Unsloth is x 2 faster than huggingface 
inputs = tokenizer([prompt_style.format(question, "")], return_tensors="pt").to("cuda")  #tokenized input questions moved to the gpu 

# get more param ideas from the cat to tweak the generation settings to improve responses 
outputs = model.generate(
    input_ids=inputs.input_ids,
    attention_mask=inputs.attention_mask,
    max_new_tokens=1200,
    use_cache=True,
)

response = tokenizer.batch_decode(outputs)
print(response[0].split("### Response:")[1])



<think>
Okay, so I need to figure out what cystometry would show for this 61-year-old woman. Let me break this down step by step.

First, the patient has a history of involuntary urine loss, especially when she coughs or sneezes. That makes me think of stress urinary incontinence. I remember that stress incontinence is usually due to the urethral sphincter not closing properly during activities that increase abdominal pressure, like coughing. But she doesn't leak at night, which is more common in other types of incontinence, like nocturnal enuresis, which is typically due to neurogenic causes.

She underwent a gynecological exam and a Q-tip test. I'm not too familiar with the Q-tip test, but I think it's used to assess urethral function. The Q-tip is a small device used to measure the closure pressure of the urethral sphincter. If the pressure is low, it might indicate a weak sphincter, which could be contributing to her symptoms.

Now, considering the findings from these tests, what 

In [6]:
#Update the system prompt to show the internal thinking inside Chain of thought COT 
train_prompt_style = """Below is an instruction that describes a task, paired with an input that provides further context. 
Write a response that appropriately completes the request. 
Before answering, think carefully about the question and create a step-by-step chain of thoughts to ensure a logical and accurate response.

### Instruction:
You are a medical expert with advanced knowledge in clinical reasoning, diagnostics, and treatment planning. 
Please answer the following medical question. 

### Question:
{}

### Response:
<think>
{}
</think>
{}"""

In [7]:
#Load dataset https://huggingface.co/datasets/FreedomIntelligence/medical-o1-reasoning-SFT 
#from datasets import load_dataset 
dataset = load_dataset("FreedomIntelligence/medical-o1-reasoning-SFT", "en", split = "train[0:500]", trust_remote_code=True)
#dataset = dataset.map(formatting_prompts_func, batched = True)
dataset

README.md:   0%|          | 0.00/1.25k [00:00<?, ?B/s]

medical_o1_sft.json:   0%|          | 0.00/74.1M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25371 [00:00<?, ? examples/s]

Dataset({
    features: ['Question', 'Complex_CoT', 'Response'],
    num_rows: 500
})

In [8]:
# Show an entry from the dataset
dataset[1]

{'Question': 'A 45-year-old man with a history of alcohol use, who has been abstinent for the past 10 years, presents with sudden onset dysarthria, shuffling gait, and intention tremors. Given this clinical presentation and history, what is the most likely diagnosis?',
 'Complex_CoT': "Alright, let’s break this down. We have a 45-year-old man here, who suddenly starts showing some pretty specific symptoms: dysarthria, shuffling gait, and those intention tremors. This suggests something's going wrong with motor control, probably involving the cerebellum or its connections.\n\nNow, what's intriguing is that he's had a history of alcohol use, but he's been off it for the past 10 years. Alcohol can do a number on the cerebellum, leading to degeneration, and apparently, the effects can hang around or even appear long after one stops drinking.\n\nAt first glance, these symptoms look like they could be some kind of chronic degeneration, maybe something like alcoholic cerebellar degeneration, 

In [10]:
# We need to format the dataset to fit our prompt training style 
EOS_TOKEN = tokenizer.eos_token  # Define EOS_TOKEN which will define the end of sentence 

In [11]:
#Function that formats the prompt
def formatting_prompts_func(examples):
    inputs = examples["Question"]           #Extracts the medical question from the dataset
    cots = examples["Complex_CoT"]          #Extracts the chain-of-thought from the dataset
    outputs = examples["Response"]          #Extracts the model generated response 
    
    texts = []                              #Initializes an empty list to store the formatted prompts 
    
    for input, cot, output in zip(inputs, cots, outputs):
        text = train_prompt_style.format(input, cot, output) + EOS_TOKEN  # Insert values into prompt template & append EOS token
        texts.append(text)                 # Add the formatted text to the list
    return {
        "text": texts                      # Return the newly formatted dataset with a "text" column containing structured prompts
    }
    

In [13]:
dataset_finetune = dataset.map(formatting_prompts_func, batched = True)
dataset_finetune["text"][0]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

"Below is an instruction that describes a task, paired with an input that provides further context. \nWrite a response that appropriately completes the request. \nBefore answering, think carefully about the question and create a step-by-step chain of thoughts to ensure a logical and accurate response.\n\n### Instruction:\nYou are a medical expert with advanced knowledge in clinical reasoning, diagnostics, and treatment planning. \nPlease answer the following medical question. \n\n### Question:\nA 61-year-old woman with a long history of involuntary urine loss during activities like coughing or sneezing but no leakage at night undergoes a gynecological exam and Q-tip test. Based on these findings, what would cystometry most likely reveal about her residual volume and detrusor contractions?\n\n### Response:\n<think>\nOkay, let's think about this step by step. There's a 61-year-old woman here who's been dealing with involuntary urine leakages whenever she's doing something that ups her ab

In [14]:
#Prepare model properly for training before wrapping it with LoRA and Ensures generate() is set correctly so Unsloth doesn’t break it later.
FastLanguageModel.for_training(model)

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 4096, padding_idx=128004)
    (layers): ModuleList(
      (0): LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear4bit(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((409

In [15]:
#Setup the model using Low Rank Adaptation - LoRA
#🔥 💡 Wraps an LLM with LoRA for memory-efficient fine-tuning 💡 Only updates key transformer layers (q_proj, k_proj, etc.) 💡 Uses Unsloth’s optimizations (low memory, faster training) 💡 Enables gradient checkpointing to train longer-context models
# wrap the model with LoRA using Unsloth's methods and optimized LoRA implementation. get_peft_model() is a function which stands for Parameter-Efficient Fine-Tuning — this function wraps the base model (model) with LoRA modifications, ensuring that only specific parameters are trained.
model = FastLanguageModel.get_peft_model(
    model,
    r=16, # increase r for better quality 
    target_modules=[ #modules of transformer layer
        "q_proj", 
        "k_proj", 
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
    ],
    lora_alpha=16, #Scaling factor for lora updates. higher lora = stronger finetune impact. Value of 16 is balanced. 
    lora_dropout=0, #dropout rate during lora training. 0 means no dropout fully training lora layers *** touch to avoid overfitting e.g. 0.1 
    bias="none", #tells lora not to modify the bias terms in the model 
    use_gradient_checkpointing="unsloth", #additional reduction of memory usage (more optimized than standard HF)
    random_state=3407, #Set a random seed for reproduceability 
    use_rslora=False, #disable rslora rank stabilized lora 
    loftq_config=None, #LoFTQ (Low-Frequency Quantization) is another optimization for extreme low memory finetuning     
)

#32 Transformer Layers Patched
#✔ QKV Layers: 32
#✔ O Layers: 32
#✔ MLP Layers: 32
#LoRA has successfully wrapped the DeepSeek-R1 model!
#Fine-tuning setup is fully optimized and memory-efficient.

Unsloth 2025.2.9 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


In [16]:
#- ✅ SFTTrainer → A trainer for supervised fine-tuning (SFT) of LLMs
#- ✅ TrainingArguments → Defines all fine-tuning hyperparameters
#- ✅ is_bfloat16_supported() → Checks if the GPU supports bfloat16 (important for mixed precision training)
#Set up the model 

#from trl import SFTTrainer #Hugging Face's Transformers Reinforcement Learning library
#from transformers import TrainingArguments
#from unsloth import is_bfloat16_supported 

trainer = SFTTrainer(
    model=model, #the LLM to finetune 
    tokenizer=tokenizer, #tokenizer for text processing  
    train_dataset=dataset_finetune, # the dataset for finetuning 
    dataset_text_field="text", #field in dataset containing text data 
    max_seq_length=max_seq_length, #max token sequence length
    dataset_num_proc=2, #Number of parallel processes for dataset loading

    #define training arguments 
    args=TrainingArguments(
        per_device_train_batch_size=2, #trains 2 samples per GPU Batch - increase for faster training 
        gradient_accumulation_steps=4, #accumulates gradients over 4 steps before updating the model
        num_train_epochs = 1, # commment and set warmup_ratio for full training runs!
        warmup_steps=5, #gradually increases learning rate over 5 steps and prevents the model from making drastic steps too early 
        max_steps=60, #fine tuning stops after 60 steps. If doing a full fine tune use num_train_epochs = 1 instead. 
        learning_rate=2e-4, #0.0002 how fast the model updates weights. higher LR faster training with risk of instability. Lower LR slower training but more stable finetuning 
        fp16=not is_bfloat16_supported(), #uses bfloat16 if the GPU supports it ...
        bf16=is_bfloat16_supported(), #...otherwise falls back to fp16 (16 bit precision)
        logging_steps=10, #logs training metrics every 10 steps for training performance 
        optim="adamw_8bit", #Uses Adamw optimizer but in 8 bit mode thus saving VRAM and speeding up training 
        weight_decay=0.01, #Adds regularization to prevent overfitting 
        lr_scheduler_type="linear", #uses a linear rate schedule (gradually decreases LR) to keep training stable without sudden jumps
        seed=3407, #a random seet to ensure consistency across runs
        output_dir="outputs", #saves the finetuned model checkpoints and logs inside the outputs directory 
    ),
)

Map (num_proc=2):   0%|          | 0/500 [00:00<?, ? examples/s]

In [None]:
#Supplimentary commands to solve issues for reference 
torch.cuda.memory_summary(device="cuda")
print(hasattr(model, "_unwrapped_old_generate"))

if hasattr(model, "_unwrapped_old_generate"):
    print("Type:", type(model._unwrapped_old_generate))
    print("Function Code:", model._unwrapped_old_generate.__code__)

if hasattr(model, "_unwrapped_old_generate"):
    print("Same as generate():", model._unwrapped_old_generate == model.generate)

if hasattr(model, "_unwrapped_old_generate"):
    model._unwrapped_old_generate = None  # Since unwrapped and generate are identical, lets detach this because it is a bug in UNsloth - Just nullify it instead of deleting

In [17]:
#Start fine-tuning 
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 500 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 4
\        /    Total batch size = 8 | Total steps = 60
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss
10,1.9189
20,1.4615
30,1.4023
40,1.3088
50,1.3443
60,1.314


In [18]:
#Save the fine-tuned model to wandb
wandb.finish()

0,1
train/epoch,▁▂▄▅▇██
train/global_step,▁▂▄▅▇██
train/grad_norm,█▂▂▁▂▂
train/learning_rate,█▇▅▄▂▁
train/loss,█▃▂▁▁▁

0,1
total_flos,1.8014312853602304e+16
train/epoch,0.96
train/global_step,60.0
train/grad_norm,0.26023
train/learning_rate,0.0
train/loss,1.314
train_loss,1.4583
train_runtime,1181.0787
train_samples_per_second,0.406
train_steps_per_second,0.051


In [19]:
#run model inference after fine-tuning

#Test Deepseek by providing a medical question and generating a response 
# Set a test question 
# Format the question using structured Prompt style, Tokenize the input and  move it to GPU 
# Generate a response 
# Decode the output tokens back into the text to get the final readable response 

question = "A 61-year-old woman with a long history of involuntary urine loss during activities like coughing or sneezing but no leakage at night undergoes a gynecological exam and Q-tip test. Based on these findings, what would cystometry most likely reveal about her residual volume and detrusor contractions?"

FastLanguageModel.for_inference(model) #Unsloth is x 2 faster than huggingface 

inputs = tokenizer([prompt_style.format(question, "")], return_tensors="pt").to("cuda")  #tokenized input questions moved to the gpu 

# get more param ideas from the cat to tweak the generation settings to improve responses 
outputs = model.generate(
    input_ids=inputs.input_ids,               
    attention_mask=inputs.attention_mask,     
    max_new_tokens=1200,                       
    use_cache=True,
)

response = tokenizer.batch_decode(outputs)
print(response[0].split("### Response:")[1])


<think>
Okay, so let's think about this. We have a 61-year-old woman who's been dealing with involuntary urine loss during things like coughing or sneezing, but she's not leaking at night. That suggests she might have some kind of problem with her pelvic floor muscles or maybe her bladder.

Now, she's got a gynecological exam and a Q-tip test. Let's break that down. The Q-tip test is usually used to check for urethral obstruction. If it's positive, that means there's something blocking the urethra, like a urethral stricture or something else.

Given that she's had a positive Q-tip test, it's likely there's a urethral obstruction. That would mean her urethra is narrow, maybe due to a stricture or some kind of narrowing. So, her bladder can't empty properly during activities like coughing because the urethral obstruction is making it hard.

Now, let's think about what happens when her bladder can't empty. If there's a urethral obstruction, the bladder is forced to hold more urine, incre

In [20]:
question = "A 59-year-old man presents with a fever, chills, night sweats, and generalized fatigue, and is found to have a 12 mm vegetation on the aortic valve. Blood cultures indicate gram-positive, catalase-negative, gamma-hemolytic cocci in chains that do not grow in a 6.5% NaCl medium. What is the most likely predisposing factor for this patient's condition?"

inputs = tokenizer([prompt_style.format(question, "")], return_tensors="pt").to("cuda")

outputs = model.generate(
    input_ids=inputs.input_ids,
    attention_mask=inputs.attention_mask,
    max_new_tokens=1200,
    use_cache=True,
)
response = tokenizer.batch_decode(outputs)

print(response[0].split("### Response:")[1])


<think>
Alright, let's think about this. We've got a 59-year-old guy who's got a fever, chills, night sweats, and feels pretty tired. That sounds like a classic picture of an infection, right? And there's a vegetation on his aortic valve. Hmm, that's not great news. 

Now, looking at the blood culture results: gram-positive, catalase-negative, gamma-hemolytic cocci in chains. That means it's a Streptococcus species, specifically Streptococcus pyogenes. It's not Streptococcus pneumoniae because it doesn't grow in NaCl. And this guy's symptoms, combined with the vegetation, point towards endocarditis, which is an infection of the heart valves.

Okay, so let's figure out what could have caused this. I'm thinking about infections, not just from bacteria, but also from fungi or viruses. It's important to consider these possibilities because sometimes you can miss something. 

But wait, let's think about the main culprit. Streptococcus pyogenes is pretty aggressive and can cause a lot of pr

In [21]:
#Save the model locally 
new_model_online = "develops20/DeepSeek-R1-Distill-Llama-8B-Medical-COT"
new_model_local = "DeepSeek-R1-Distill-Llama-8B-Medical-COT"
model.save_pretrained(new_model_local) #local saved model 
tokenizer.save_pretrained(new_model_local)

('DeepSeek-R1-Distill-Llama-8B-Medical-COT/tokenizer_config.json',
 'DeepSeek-R1-Distill-Llama-8B-Medical-COT/special_tokens_map.json',
 'DeepSeek-R1-Distill-Llama-8B-Medical-COT/tokenizer.json')

In [24]:
#Push the model to hugging face hub 
model.push_to_hub(new_model_online) #save online and pushed to hub
tokenizer.push_to_hub(new_model_online) #save online and pushed to hub

README.md:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

No files have been modified since last commit. Skipping to prevent empty commit.


Saved model to https://huggingface.co/develops20/DeepSeek-R1-Distill-Llama-8B-Medical-COT


  0%|          | 0/1 [00:00<?, ?it/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

In [25]:
#✅ This saves the fine-tuned model to a local directory (new_model_local).
#✅ Merges the LoRA adapters into the base model.
#✅ Stores the model in 16-bit precision (merged_16bit) to save space. other options are merged_32bit
model.save_pretrained_merged(new_model_local, tokenizer, save_method = "merged_16bit", )

#✅ Uploads the fine-tuned model to Hugging Face Hub (new_model_online).
#✅ Merges LoRA layers into the base model before uploading.
#✅ Uses "merged_16bit" to keep file size reasonable.
#✅ Allows public or private access for inference and sharing.

model.push_to_hub_merged(new_model_online, tokenizer, save_method = "merged_16bit")

Unsloth: You have 2 CPUs. Using `safe_serialization` is 10x slower.
We shall switch to Pytorch saving, which might take 3 minutes and not 30 minutes.
To force `safe_serialization`, set it to `None` instead.
Unsloth: Kaggle/Colab has limited disk space. We need to delete the downloaded
model which will save 4-16GB of disk space, allowing you to save on Kaggle/Colab.
Unsloth: Will remove a cached repo with size 6.0G


Unsloth: Merging 4bit and LoRA weights to 16bit...
Unsloth: Will use up to 18.17 out of 31.35 RAM for saving.
Unsloth: Saving model... This might take 5 minutes ...


 34%|███▍      | 11/32 [00:00<00:01, 15.67it/s]
We will save to Disk and not RAM now.
100%|██████████| 32/32 [00:25<00:00,  1.24it/s]


Unsloth: Saving tokenizer... Done.
Unsloth: Saving DeepSeek-R1-Distill-Llama-8B-Medical-COT/pytorch_model-00001-of-00004.bin...
Unsloth: Saving DeepSeek-R1-Distill-Llama-8B-Medical-COT/pytorch_model-00002-of-00004.bin...
Unsloth: Saving DeepSeek-R1-Distill-Llama-8B-Medical-COT/pytorch_model-00003-of-00004.bin...
Unsloth: Saving DeepSeek-R1-Distill-Llama-8B-Medical-COT/pytorch_model-00004-of-00004.bin...
Done.


Unsloth: You are pushing to hub in Kaggle environment.
To save memory, we shall move develops20/DeepSeek-R1-Distill-Llama-8B-Medical-COT to /tmp/DeepSeek-R1-Distill-Llama-8B-Medical-COT


Unsloth: Merging 4bit and LoRA weights to 16bit...
Unsloth: Will use up to 18.12 out of 31.35 RAM for saving.
Unsloth: Saving model... This might take 5 minutes ...


100%|██████████| 32/32 [00:25<00:00,  1.28it/s]


Unsloth: Saving tokenizer...

No files have been modified since last commit. Skipping to prevent empty commit.


 Done.
Unsloth: Saving /tmp/DeepSeek-R1-Distill-Llama-8B-Medical-COT/pytorch_model-00001-of-00004.bin...
Unsloth: Saving /tmp/DeepSeek-R1-Distill-Llama-8B-Medical-COT/pytorch_model-00002-of-00004.bin...
Unsloth: Saving /tmp/DeepSeek-R1-Distill-Llama-8B-Medical-COT/pytorch_model-00003-of-00004.bin...
Unsloth: Saving /tmp/DeepSeek-R1-Distill-Llama-8B-Medical-COT/pytorch_model-00004-of-00004.bin...


  0%|          | 0/4 [00:00<?, ?it/s]

pytorch_model-00002-of-00004.bin:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

pytorch_model-00001-of-00004.bin:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

pytorch_model-00003-of-00004.bin:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

pytorch_model-00004-of-00004.bin:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

Done.
Saved merged model to https://huggingface.co/develops20/DeepSeek-R1-Distill-Llama-8B-Medical-COT
