In [None]:
!pip install unsloth
!pip install transformers
!pip install datasets
!pip install torch
!pip install accelerate
!pip install peft
!pip install trl
!pip install bitsandbytes
!pip install huggingface_hub
!pip install sentencepiece
!pip install protobuf
!pip install hf_transfer

In [2]:
!pip install wandb pandas matplotlib

Collecting wandb
  Downloading wandb-0.19.9-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (10 kB)
Collecting matplotlib
  Downloading matplotlib-3.10.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
Collecting click!=8.0.0,>=7.1 (from wandb)
  Downloading click-8.1.8-py3-none-any.whl.metadata (2.3 kB)
Collecting docker-pycreds>=0.4.0 (from wandb)
  Downloading docker_pycreds-0.4.0-py2.py3-none-any.whl.metadata (1.8 kB)
Collecting gitpython!=3.1.29,>=1.0.0 (from wandb)
  Downloading GitPython-3.1.44-py3-none-any.whl.metadata (13 kB)
Collecting pydantic<3 (from wandb)
  Downloading pydantic-2.11.3-py3-none-any.whl.metadata (65 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m65.2/65.2 kB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m
Collecting sentry-sdk>=2.0.0 (from wandb)
  Downloading sentry_sdk-2.25.1-py2.py3-none-any.whl.metadata (10 kB)
Collecting setproctitle (from wandb)
  Downloading setproctitle-1.3.5-cp310-cp310-m

In [3]:
from unsloth import FastLanguageModel
from unsloth.chat_templates import train_on_responses_only, standardize_sharegpt, get_chat_template
import torch
from trl import SFTTrainer
from transformers import TrainingArguments, DataCollatorForSeq2Seq
from transformers.trainer_callback import TrainerCallback  # Fixed import
from unsloth import is_bfloat16_supported
from datasets import load_dataset
import wandb
import matplotlib.pyplot as plt
import numpy as np
import os
from huggingface_hub import notebook_login
import pandas as pd
from datetime import datetime


# Custom callback to capture detailed training metrics
class MetricsCallback(TrainerCallback):
    def __init__(self):
        super().__init__()
        self.training_steps = []
        self.training_losses = []
        self.learning_rates = []
        self.grad_norms = []
        self.batch_sizes = []
        self.throughput = []
        self.start_time = datetime.now()

    def on_log(self, args, state, control, logs=None, **kwargs):
        if logs is not None:
            if 'loss' in logs:
                self.training_steps.append(state.global_step)
                self.training_losses.append(logs['loss'])
            if 'learning_rate' in logs:
                self.learning_rates.append(logs['learning_rate'])
            if 'grad_norm' in logs:
                self.grad_norms.append(logs['grad_norm'])
            if 'batch_size' in logs:
                self.batch_sizes.append(logs['batch_size'])

            # Calculate throughput (samples per second)
            if 'train_runtime' in logs:
                elapsed = (datetime.now() - self.start_time).total_seconds()
                if elapsed > 0:
                    samples_processed = state.global_step * args.per_device_train_batch_size * args.gradient_accumulation_steps
                    throughput = samples_processed / elapsed
                    self.throughput.append(throughput)
                    logs['throughput'] = throughput

# Function to visualize training metrics
def plot_training_metrics(metrics_callback):
    """Generate and save plots for various training metrics"""
    # Create a directory for plots if it doesn't exist
    os.makedirs("training_plots", exist_ok=True)

    # Plot training loss
    plt.figure(figsize=(10, 6))
    plt.plot(metrics_callback.training_steps, metrics_callback.training_losses)
    plt.title('Training Loss Over Time')
    plt.xlabel('Training Steps')
    plt.ylabel('Loss')
    plt.grid(True)
    plt.savefig('training_plots/training_loss.png')
    plt.close()

    # Plot learning rate
    if metrics_callback.learning_rates:
        plt.figure(figsize=(10, 6))
        plt.plot(metrics_callback.training_steps[:len(metrics_callback.learning_rates)],
                 metrics_callback.learning_rates)
        plt.title('Learning Rate Schedule')
        plt.xlabel('Training Steps')
        plt.ylabel('Learning Rate')
        plt.grid(True)
        plt.savefig('training_plots/learning_rate.png')
        plt.close()

    # Plot gradient norm
    if metrics_callback.grad_norms:
        plt.figure(figsize=(10, 6))
        plt.plot(metrics_callback.training_steps[:len(metrics_callback.grad_norms)],
                 metrics_callback.grad_norms)
        plt.title('Gradient Norm Over Time')
        plt.xlabel('Training Steps')
        plt.ylabel('Gradient Norm')
        plt.grid(True)
        plt.savefig('training_plots/gradient_norm.png')
        plt.close()

    # Plot throughput
    if metrics_callback.throughput:
        plt.figure(figsize=(10, 6))
        plt.plot(metrics_callback.training_steps[:len(metrics_callback.throughput)],
                 metrics_callback.throughput)
        plt.title('Training Throughput (Samples/Second)')
        plt.xlabel('Training Steps')
        plt.ylabel('Samples/Second')
        plt.grid(True)
        plt.savefig('training_plots/throughput.png')
        plt.close()

    # Create a summary dataframe
    summary_data = {
        'Step': metrics_callback.training_steps,
        'Loss': metrics_callback.training_losses
    }

    if metrics_callback.learning_rates:
        # Extend learning rates to match the length of steps if needed
        lr_extended = metrics_callback.learning_rates + [metrics_callback.learning_rates[-1]] * (len(metrics_callback.training_steps) - len(metrics_callback.learning_rates))
        summary_data['Learning Rate'] = lr_extended[:len(metrics_callback.training_steps)]

    if metrics_callback.grad_norms:
        # Extend grad norms to match the length of steps if needed
        gn_extended = metrics_callback.grad_norms + [metrics_callback.grad_norms[-1]] * (len(metrics_callback.training_steps) - len(metrics_callback.grad_norms))
        summary_data['Gradient Norm'] = gn_extended[:len(metrics_callback.training_steps)]

    summary_df = pd.DataFrame(summary_data)
    summary_df.to_csv('training_plots/training_summary.csv', index=False)

    return summary_df

# Initialize wandb
wandb.init(project="llama-3.1-8b-finetuning", name="medical_qa_finetuning")

# Login to Hugging Face Hub
notebook_login()

# Model configuration
max_seq_length = 2048
dtype = None
load_in_4bit = True

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter:

  ········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mratan152004[0m ([33mratan152004-dayananda-sagar-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
# Load the 8B model
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit",
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)

# Configure LoRA fine-tuning
model = FastLanguageModel.get_peft_model(
    model,
    r=16,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
                   "gate_proj", "up_proj", "down_proj",],
    lora_alpha=16,
    lora_dropout=0,
    bias="none",
    use_gradient_checkpointing="unsloth",
    random_state=3407,
    use_rslora=False,
    loftq_config=None,
)

# Set up chat template
tokenizer = get_chat_template(
    tokenizer,
    chat_template="llama-3.1",
)

# Data formatting function
def formatting_prompts_func(examples):
    convos = examples["conversations"]
    texts = [tokenizer.apply_chat_template(convo, tokenize=False, add_generation_prompt=False) for convo in convos]
    return {"text": texts}

# Load and prepare dataset
dataset = load_dataset("lavita/AlpaCare-MedInstruct-52k", split="train")
dataset = standardize_sharegpt(dataset)
dataset = dataset.map(formatting_prompts_func, batched=True)

# Create metrics callback
metrics_callback = MetricsCallback()



==((====))==  Unsloth 2025.3.19: Fast Llama patching. Transformers: 4.51.2.
   \\   /|    NVIDIA A40. Num GPUs = 1. Max memory: 44.448 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 8.6. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/5.70G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/55.5k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/454 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

Map:   0%|          | 0/31532 [00:00<?, ? examples/s]

In [8]:
# Configure training arguments - using only supported parameters
training_args = TrainingArguments(
    per_device_train_batch_size=1,  # Reduced batch size for 8B model
    gradient_accumulation_steps=16,  # Increased for 8B model
    warmup_steps=5,
    max_steps=400,
    learning_rate=5e-5,
    fp16=not is_bfloat16_supported(),
    bf16=is_bfloat16_supported(),
    logging_steps=1,
    optim="adamw_torch_fused",
    weight_decay=0.01,
    lr_scheduler_type="cosine",
    seed=3407,
    output_dir="outputs",
    report_to="wandb",  # Enable wandb reporting
    logging_first_step=True,
    logging_dir="./logs",
    save_strategy="steps",
    save_steps=50,
    save_total_limit=3,
    push_to_hub=True,  # Enable pushing to Hub
    hub_model_id="ratan15/llama-3.1-8b-medical-qa",
    hub_strategy="every_save",
)

# Initialize the trainer with our custom callback
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer),
    dataset_num_proc=2,
    packing=False,
    args=training_args,
    callbacks=[metrics_callback],  # Add our custom callback
)

# Apply response-only training
trainer = train_on_responses_only(
    trainer,
    instruction_part="<|start_header_id|>user<|end_header_id|>\n\n",
    response_part="<|start_header_id|>assistant<|end_header_id|>\n\n",
)

# Train the model and capture stats
print("Starting training...")
trainer_stats = trainer.train()
print("Training completed!")

# Generate and save visualizations
print("Generating training visualizations...")
summary_df = plot_training_metrics(metrics_callback)
print(f"Training summary:\n{summary_df.describe()}")

# Log artifacts to wandb
wandb.log({"training_loss_plot": wandb.Image("training_plots/training_loss.png")})
if os.path.exists("training_plots/learning_rate.png"):
    wandb.log({"learning_rate_plot": wandb.Image("training_plots/learning_rate.png")})
if os.path.exists("training_plots/gradient_norm.png"):
    wandb.log({"gradient_norm_plot": wandb.Image("training_plots/gradient_norm.png")})
if os.path.exists("training_plots/throughput.png"):
    wandb.log({"throughput_plot": wandb.Image("training_plots/throughput.png")})
wandb.log({"training_summary": wandb.Table(dataframe=summary_df)})

# Prepare model for inference
FastLanguageModel.for_inference(model)

# Push the model to Hugging Face Hub
print("Pushing model to Hugging Face Hub...")
model.push_to_hub("ratan15/llama-3-8b")
tokenizer.push_to_hub("ratan15/llama-3.1-8b-medical-qa")
print("Model successfully pushed to Hub!")

# Test the model with a sample query
messages = [
    {"role": "user", "content": "what is the cause of high blood pressure"},
]
inputs = tokenizer.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,
    return_tensors="pt",
).to("cuda")
outputs = model.generate(input_ids=inputs, max_new_tokens=64, use_cache=True,
                        temperature=1.5, min_p=0.1)
response = tokenizer.batch_decode(outputs)[0]
print(f"Sample response:\n{response}")

# Finish wandb run
wandb.finish()

Unsloth: Tokenizing ["text"] (num_proc=2):   0%|          | 0/31532 [00:00<?, ? examples/s]

Map (num_proc=96):   0%|          | 0/31532 [00:00<?, ? examples/s]

Starting training...


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 31,532 | Num Epochs = 1 | Total steps = 400
O^O/ \_/ \    Batch size per device = 1 | Gradient accumulation steps = 16
\        /    Data Parallel GPUs = 1 | Total batch size (1 x 16 x 1) = 16
 "-____-"     Trainable parameters = 41,943,040/8,000,000,000 (0.52% trained)


Step,Training Loss
1,1.8327
2,1.7504
3,1.6406
4,1.6681
5,1.6175
6,1.5999
7,1.4566
8,1.471
9,1.3808
10,1.2853


Training completed!
Generating training visualizations...
Training summary:
             Step        Loss  Learning Rate  Gradient Norm
count  400.000000  400.000000     400.000000     400.000000
mean   200.500000    1.008962       0.000025       0.459221
std    115.614301    0.127408       0.000018       0.137370
min      1.000000    0.760600       0.000000       0.247524
25%    100.750000    0.937950       0.000007       0.382323
50%    200.500000    0.994300       0.000025       0.437221
75%    300.250000    1.051075       0.000043       0.503499
max    400.000000    1.832700       0.000050       1.640152
Pushing model to Hugging Face Hub...


README.md:   0%|          | 0.00/606 [00:00<?, ?B/s]

  0%|          | 0/1 [00:00<?, ?it/s]

adapter_model.safetensors:   0%|          | 0.00/168M [00:00<?, ?B/s]

Saved model to https://huggingface.co/ratan15/llama-3-8b


The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


Model successfully pushed to Hub!
Sample response:
<|begin_of_text|><|start_header_id|>system<|end_header_id|>

Cutting Knowledge Date: December 2023
Today Date: 26 July 2024

<|eot_id|><|start_header_id|>user<|end_header_id|>

what is the cause of high blood pressure<|eot_id|><|start_header_id|>assistant<|end_header_id|>

High blood pressure can be caused by a range of things including, genetics, lifestyle, diet, medical conditions and medicine.
References:
- https://www.nhs.uk/conditions/high-blood-pressure-hypertension/<|eot_id|>


0,1
train/epoch,▁▁▁▁▂▂▂▂▂▂▁▂▂▁▁▂▂▂▂▂▃▃▃▃▄▄▅▅▅▅▆▇▇▇▇▇▇▇██
train/global_step,▁▂▂▂▂▂▂▃▁▂▂▂▁▁▂▂▂▂▂▂▄▄▄▄▄▅▅▅▅▆▆▇▇▇▇▇████
train/grad_norm,▃▂▂▃▂▂▃▂▁▃▃▃▃█▆▃▁▁▂▂▂▂▃▂▁▂▃▂▃▃▂▂▂▄▂▂▂▂▄▂
train/learning_rate,█▇▇▅▄▃▃▃▃▃▃▃▃▃▃▃▃▃▃▃▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁
train/loss,▆▆▃▃▃▄▄▄▂▂▂▂▂▁▁▂█▇▅▃▃▃▂▄▃▃▄▃▃▂▃▂▂▃▃▂▃▃▃▃

0,1
total_flos,4.917398629399757e+16
train/epoch,0.20297
train/global_step,400.0
train/grad_norm,0.44434
train/learning_rate,0.0
train/loss,0.9453
train_loss,1.00896
train_runtime,2149.8831
train_samples_per_second,2.977
train_steps_per_second,0.186


In [None]:
messages = [
    {
        "role": "user",
        "content": "I often feel anxious in social situations. What are some ways to manage anxiety without medication?"
    }
]

prompt = tokenizer.apply_chat_template(messages, tokenize=False,
                                       add_generation_prompt=True)

inputs = tokenizer(prompt, return_tensors='pt', padding=True,
                   truncation=True, max_length=512).to("cuda")


outputs = model.generate(**inputs, max_length=150, num_return_sequences=1)#, num_beams=10, early_stopping=False, repetition_penalty=2.2)

text = tokenizer.decode(outputs[0], skip_special_tokens=True)

print(text.split("assistant")[1])