In [None]:
!pip install -U transformers trl bitsandbytes peft accelerate datasets huggingface_hub hf_transfer

In [None]:
!pip install flash-attn --no-build-isolation

In [None]:
from huggingface_hub import notebook_login

notebook_login()

In [None]:
from datasets import load_dataset

train_dataset = load_dataset("deebak14/rhinoscript_ft_data_05_train", split = "train")
eval_dataset = load_dataset("deebak14/rhinoscript_ft_data_05_eval", split = "train")
#dpo_dataset = load_dataset("deebak14/rhinoscript_ft_data_dpo_01", split = "train")


In [None]:
print(train_dataset)
print(eval_dataset)

train_dataset[0]
eval_dataset[0]

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import torch

model_name = "Qwen/Qwen3-14B"

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Setup bitsandbytes 4-bit config
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4"
)

# Load the model in 4bit
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    attn_implementation="flash_attention_2",
    quantization_config=bnb_config,
    device_map="auto",         # auto-assigns model to GPU if available
    torch_dtype=torch.float16, # or 'auto' if you want auto-detection
)

# Set max sequence length when tokenizing/generating, not in .from_pretrained
max_seq_length = 3072


In [25]:
from torch.nn import CrossEntropyLoss
from tqdm import tqdm
import math

def evaluate_model_loss(model, tokenizer, dataset, max_length=2048):
    model.eval()
    losses = []
    for example in tqdm(dataset, desc="Evaluating"):
        messages = example["messages"]
        prompt = tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=False
        )
        inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=max_length).to(model.device)
        with torch.no_grad():
            outputs = model(**inputs, labels=inputs["input_ids"])
            loss = outputs.loss.item()
            losses.append(loss)
    return sum(losses) / len(losses)

In [None]:
mean_loss = evaluate_model_loss(model, tokenizer, eval_dataset)
perplexity = math.exp(mean_loss)

print(f"\n✅ Mean loss of Base model: {mean_loss:.4f}")
print(f"📉 Perplexity: {perplexity:.2f}")

In [None]:
✅ Mean loss of Base model: 2.3576
📉 Perplexity: 10.57

In [None]:
from peft import LoraConfig, get_peft_model

# Configure LoRA
lora_config = LoraConfig(
    r=64,                     # LoRA rank
    lora_alpha=128,           # LoRA alpha (often 2x r)
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
                    "gate_proj", "up_proj", "down_proj"],
    lora_dropout=0.01,
    bias="none",
    task_type="CAUSAL_LM",    # Important for language modeling
)

# Attach LoRA to model
model = get_peft_model(model, lora_config)

# (Optional) Print trainable parameters to confirm LoRA is applied
model.print_trainable_parameters()

In [9]:
with open("/workspace/qwen_chat_template.jinja") as f:
    tokenizer.chat_template = f.read()

In [None]:
from trl import apply_chat_template

# Pick a few examples from your dataset
for i in range(3):
    sample = train_dataset[i]
    # Apply the chat template with tokenization and assistant mask
    output = tokenizer.apply_chat_template(
        sample['messages'],
        tokenize=True,
        return_assistant_tokens_mask=True,
        return_dict=True,
    )

    print(f"Sample {i}:")
    print("".join(map(str, output["assistant_masks"])))
    print("-" * 40)

In [None]:
from trl import SFTTrainer, SFTConfig

sft_config = SFTConfig(
    dataset_text_field="messages",
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    warmup_ratio=0.05,                # Increased for more stable start
    num_train_epochs=10,
    learning_rate= 2e-6,             #1e-5             # Lowered for better generalization
    logging_steps=10,
    optim="adamw_8bit",
    weight_decay=0.05,
    lr_scheduler_type="cosine",
    seed=3407,
    report_to="none",
    eval_strategy="epoch",          # Evaluate and save every epoch
    save_strategy="epoch",
    save_total_limit=1,             # Only keep last 3 checkpoints
    max_grad_norm=1.0,              # Clip gradients
    fp16=True,                      # Enable mixed-precision if available (optional, can remove if not supported)
    push_to_hub=False,
    neftune_noise_alpha=5,
    assistant_only_loss=True,
    chat_template_path="/workspace/qwen_chat_template.jinja"
)

trainer = SFTTrainer(
    model=model,
    args=sft_config,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)


In [None]:
# Get the full input_ids and assistant_masks for a sample
sample = trainer.train_dataset[100]
input_ids = sample["input_ids"]
assistant_masks = sample["assistant_masks"]

# Extract token ids for assistant tokens only
assistant_token_ids = [tid for tid, mask in zip(input_ids, assistant_masks) if mask == 1]

# Now decode ONLY those assistant tokens
assistant_text = tokenizer.decode(assistant_token_ids)
print("Assistant text only:\n", assistant_text)


In [None]:
# @title Show current memory stats
gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

Let's train the model! To resume a training run, set `trainer.train(resume_from_checkpoint = True)`

In [23]:
trainer_stats = trainer.train()

Epoch,Training Loss,Validation Loss
1,0.6212,0.611894
2,0.4755,0.482116
3,0.4116,0.420597
4,0.3734,0.378556
5,0.3377,0.347816
6,0.2963,0.323704
7,0.295,0.308346
8,0.2816,0.299537
9,0.2548,0.296456
10,0.2465,0.295906


In [None]:
# @title Show final memory and time stats
used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
used_percentage = round(used_memory / max_memory * 100, 3)
lora_percentage = round(used_memory_for_lora / max_memory * 100, 3)
print(f"{trainer_stats.metrics['train_runtime']} seconds used for training.")
print(
    f"{round(trainer_stats.metrics['train_runtime']/60, 2)} minutes used for training."
)
print(f"Peak reserved memory = {used_memory} GB.")
print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
print(f"Peak reserved memory % of max memory = {used_percentage} %.")
print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")

In [None]:
mean_loss = evaluate_model_loss(model, tokenizer, eval_dataset)
perplexity = math.exp(mean_loss)

print(f"\n✅ Mean loss of fine-tuned model: {mean_loss:.4f}")
print(f"📉 Perplexity: {perplexity:.2f}")

In [None]:
✅ Mean loss of fine-tuned model: 1.1498
📉 Perplexity: 3.16

In [None]:
model.push_to_hub("deebak14/qwen_14b_ft_v2") # Online saving
tokenizer.push_to_hub("deebak14/qwen_14b_ft_v2") # Online saving

In [None]:
from datasets import load_dataset

# Load the full dataset
dpo_dataset = load_dataset("deebak14/rhinoscript_ft_data_dpo_01", split="train")

# Split into 85% train and 15% eval
split_dataset = dpo_dataset.train_test_split(test_size=0.15, seed=42)

# Access the subsets
dpo_train_dataset = split_dataset["train"]
dpo_eval_dataset = split_dataset["test"]

# Optional: Check sizes
print(f"Train size: {len(dpo_train_dataset)}")
print(f"Eval size: {len(dpo_eval_dataset)}")

In [None]:
print(dpo_train_dataset)
print(dpo_eval_dataset)

In [None]:
from trl import DPOConfig, DPOTrainer

dpo_config = DPOConfig(
    beta=0.1,
    max_prompt_length=512,
    max_length=2048,
    per_device_train_batch_size = 2,
    gradient_accumulation_steps = 4,
    warmup_ratio = 0.1,
    num_train_epochs = 3,
    learning_rate= 2e-6,
    fp16 = True,
    logging_steps = 10,
    save_strategy="epoch",
    save_total_limit=1,             # Only keep last 3 checkpoints
    max_grad_norm=1.0,
    output_dir = "/content/outputs",
)

dpo_trainer = DPOTrainer(
    model=model,
    ref_model=None,  # Uses frozen copy of base model internally
    args=dpo_config,
    train_dataset=dpo_train_dataset,
    eval_dataset=None,
)

In [None]:
dpo_trainer_stats = dpo_trainer.train()

<a name="Inference"></a>
### Inference
Let's run the model via Unsloth native inference! According to the `Qwen-3` team, the recommended settings for reasoning inference are `temperature = 0.6, top_p = 0.95, top_k = 20`

For normal chat based inference, `temperature = 0.7, top_p = 0.8, top_k = 20`

# After SFT

In [None]:
from transformers import pipeline

# Use a pipeline for easy inference
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)

# Example usage
messages = [
    {"role" : "system",
     "content" : """
You are an expert in Python programming, 3D modeling, computational design, geometry, and creative coding using Rhino — particularly with the `rhinoscriptsyntax` Python module.

Your task is to accurately interpret and analyze the user’s modeling query, and generate a corresponding Python script that accomplishes the intended task inside Rhino.
When needed, you may also use standard Python modules to support the solution (e.g., `math`, etc.).
Always reason step by step before producing the final code.
Your response must:
- Be precise and unambiguous.
- Reflect the user’s intent clearly.
- Produce a script that runs without errors inside Rhino.
- Follow clean, readable, and Pythonic structure.

Only output the final working Python script inside a valid Python code block (` ```python ... ``` `).
"""},
    {"role" : "user",
     "content" : "create a cubeS along a circle"}
]

prompt = pipe.tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
        enable_thinking = False
)

outputs = pipe(
        prompt,
        max_new_tokens=2048,
        do_sample=True,
        temperature=0.7,
        top_p=0.8,
        top_k=20,
        min_p=0
)
print(outputs[0]["generated_text"])