In [None]:
%%capture
!pip install -U "unsloth[cpu,cu121]" --extra-index-url https://download.pytorch.org/whl/cu121
!pip install -U transformers==4.39.3
!pip install -U datasets==2.19.1
!pip install -U accelerate==0.28.0
!pip install -U trl==0.7.11
!pip install -U bitsandbytes==0.43.0

import os
import torch
from datasets import load_dataset
from unsloth import FastLanguageModel
from trl import SFTTrainer, SFTConfig

AttributeError: _ARRAY_API not found

AttributeError: _ARRAY_API not found

AttributeError: _ARRAY_API not found

ImportError: numpy.core.multiarray failed to import

In [None]:
###################################### A ###########################################
# Make sure you're on a GPU runtime:
# Runtime -> Change runtime type -> Hardware accelerator: GPU

import torch, platform, psutil

# Report the runtime environment: torch version, CUDA/GPU, Python and total RAM.
print("Torch version:", torch.__version__)
print("CUDA available:", torch.cuda.is_available())
if torch.cuda.is_available():
    # Only query the device name when a CUDA device is actually present.
    print("GPU:", torch.cuda.get_device_name(0))
print("Python:", platform.python_version())
# Total system memory in decimal gigabytes (bytes / 1e9), rounded to 2 places.
print("RAM (GB):", round(psutil.virtual_memory().total / 1e9, 2))

# Basic training config
MODEL_NAME = "unsloth/SmolLM2-135M-Instruct"  # SmolLM2 135M instruct, Unsloth-optimized
MAX_SEQ_LEN = 512  # reused below for model loading and for SFTConfig(max_seq_length=...)
DTYPE = None  # Let Unsloth choose (fp16/bf16 depending on GPU)

# Directory passed to SFTConfig(output_dir=...); created up front if missing.
OUTPUT_DIR = "smollm2_135m_full_finetune_alpaca"
os.makedirs(OUTPUT_DIR, exist_ok=True)

print("Using model:", MODEL_NAME)

# Load a subset of the Alpaca dataset
raw_dataset = load_dataset("tatsu-lab/alpaca", split="train[:1000]")  # first 1000 examples
raw_dataset

def format_example(example):
    """Render one Alpaca-style record as a single training string.

    Reads ``instruction`` and ``output`` (required) and ``input`` (optional)
    from ``example``. The "### Input:" section is only emitted when the input
    contains non-whitespace text; a missing or ``None`` input counts as empty.

    Returns:
        dict: ``{"text": <rendered prompt + response>}``.
    """
    task = example["instruction"]
    context = example.get("input", "")
    answer = example["output"]

    # Normalise an explicit None to the empty string before calling .strip().
    if context is None:
        context = ""

    sections = [f"### Instruction:\n{task}\n\n"]
    if context.strip():
        sections.append(f"### Input:\n{context}\n\n")
    sections.append(f"### Response:\n{answer}")
    return {"text": "".join(sections)}

# Replace the raw Alpaca columns with the single rendered "text" column, then
# shuffle with a fixed seed so the example order is reproducible.
processed_dataset = raw_dataset.map(
    format_example,
    remove_columns=raw_dataset.column_names,
).shuffle(seed=42)
processed_dataset[0]

# Load SmolLM2-135M with full_finetuning=True
max_seq_length = MAX_SEQ_LEN

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name      = MODEL_NAME,
    max_seq_length  = max_seq_length,
    dtype           = DTYPE,
    load_in_4bit    = False,           # full 16-bit weights
    full_finetuning = True,            # full fine-tuning (no LoRA adapters are added in this cell)
)

# Put model into training mode (Unsloth patches + gradient config)
FastLanguageModel.for_training(model)

# Make sure tokenizer has a padding token; fall back to EOS when none is set
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

print("Pad token:", tokenizer.pad_token, tokenizer.pad_token_id)

# --- Trainer configuration ----------------------------------------------------
# Select exactly one mixed-precision mode. The previous `bf16=torch.cuda.is_available()`
# turned bf16 on for *every* GPU, including cards without bfloat16 support such
# as the Tesla T4 shown in this notebook's output, and could end up enabled
# together with fp16. Unsloth's helper reports whether the current GPU supports
# bfloat16, which is also what decides the dtype when DTYPE is None.
from unsloth import is_bfloat16_supported

use_bf16 = torch.cuda.is_available() and is_bfloat16_supported()
use_fp16 = torch.cuda.is_available() and not use_bf16

train_config = SFTConfig(
    output_dir=OUTPUT_DIR,
    num_train_epochs=2,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,      # effective batch size = 4 * 4 = 16 per device
    learning_rate=5e-5,                 # slightly conservative for full FT on small model
    lr_scheduler_type="cosine",
    warmup_ratio=0.03,
    logging_steps=10,
    save_steps=200,
    save_total_limit=2,
    bf16=use_bf16,                      # bfloat16 only where the GPU supports it
    fp16=use_fp16,                      # otherwise fall back to float16 on GPU
    max_seq_length=MAX_SEQ_LEN,
)
train_config

# Create trainer & start full fine-tuning
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=processed_dataset,
    dataset_text_field="text",
    packing=True,           # pack multiple samples into one sequence to use context better
    args=train_config,
)

trainer.train()


Torch version: 2.9.0+cu126
CUDA available: True
GPU: Tesla T4
Python: 3.12.12
RAM (GB): 54.75
Using model: unsloth/SmolLM2-135M-Instruct


NameError: name 'load_dataset' is not defined

In [None]:
#################################### B #######################
import os
import torch
from datasets import load_dataset
from unsloth import FastLanguageModel
from trl import SFTTrainer, SFTConfig

# Basic training config (redefined here for the LoRA run)
MODEL_NAME = "unsloth/SmolLM2-135M-Instruct"
MAX_SEQ_LEN = 512
DTYPE = None  # Let Unsloth choose (fp16/bf16)

# Output directory for the LoRA run; deliberately different from the full
# fine-tuning run's "smollm2_135m_full_finetune_alpaca".
OUTPUT_DIR = "smollm2_135m_lora_finetune_alpaca"
os.makedirs(OUTPUT_DIR, exist_ok=True)

print("Using model:", MODEL_NAME)

def format_example(example):
    """Build the "### Instruction / ### Input / ### Response" training text.

    ``example`` must provide ``instruction`` and ``output``; ``input`` is
    optional and any falsy value for it is treated as the empty string. The
    "### Input:" section is included only when the input has non-whitespace
    content.

    Returns:
        dict: ``{"text": <rendered string>}``.
    """
    instruction = example["instruction"]
    extra = example.get("input", "") or ""
    response = example["output"]

    input_section = f"### Input:\n{extra}\n\n" if extra.strip() else ""
    rendered = (
        f"### Instruction:\n{instruction}\n\n"
        f"{input_section}"
        f"### Response:\n{response}"
    )
    return {"text": rendered}

# Load the Alpaca subset inside this cell. Previously `raw_dataset` was only
# defined by the full fine-tuning cell, so this cell failed with a NameError
# whenever it was run on its own (it already imports `load_dataset`).
raw_dataset = load_dataset("tatsu-lab/alpaca", split="train[:1000]")  # first 1000 examples

# Replace the raw columns with the rendered "text" column.
processed_dataset = raw_dataset.map(
    format_example,
    remove_columns=raw_dataset.column_names,
)
# Fixed seed keeps the shuffled order reproducible.
processed_dataset = processed_dataset.shuffle(seed=42)
processed_dataset[0]

max_seq_length = MAX_SEQ_LEN

# 1) Load base model in 4-bit for parameter-efficient training
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name      = MODEL_NAME,
    max_seq_length  = max_seq_length,
    dtype           = DTYPE,
    load_in_4bit    = True,    # 4-bit base weights; LoRA adapters are attached below
    full_finetuning = False,   # NOT full fine-tuning
)

# 2) Turn this base model into a LoRA model
# You can tweak r, lora_alpha, lora_dropout if needed
model = FastLanguageModel.get_peft_model(
    model,
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    # Adapters are attached to the attention and MLP projection layers listed here.
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
    ],
)

# Prepare for training
FastLanguageModel.for_training(model)

# Padding token: fall back to EOS when the tokenizer defines none
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

print("Pad token:", tokenizer.pad_token, tokenizer.pad_token_id)


# --- LoRA trainer configuration (slightly different from the full fine-tune) ---
# Select exactly one mixed-precision mode. `bf16=torch.cuda.is_available()`
# enabled bf16 on every GPU, including cards without bfloat16 support (e.g. the
# Tesla T4 reported earlier in this notebook), and could be set together with
# fp16. Unsloth's helper reports whether the current GPU supports bfloat16.
from unsloth import is_bfloat16_supported

use_bf16 = torch.cuda.is_available() and is_bfloat16_supported()
use_fp16 = torch.cuda.is_available() and not use_bf16

train_config = SFTConfig(
    output_dir=OUTPUT_DIR,
    num_train_epochs=2,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    learning_rate=2e-4,               # higher LR than the 5e-5 used for full fine-tuning
    lr_scheduler_type="cosine",
    warmup_ratio=0.03,
    logging_steps=10,
    save_steps=200,
    save_total_limit=2,
    bf16=use_bf16,                    # bfloat16 only where the GPU supports it
    fp16=use_fp16,                    # otherwise fall back to float16 on GPU
    max_seq_length=MAX_SEQ_LEN,
)
train_config

# Same trainer setup as the full fine-tuning cell
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=processed_dataset,
    dataset_text_field="text",
    packing=True,
    args=train_config,
)

trainer.train()

# Persist the trained LoRA model and its tokenizer side by side
trainer.save_model(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)

print("Saved LoRA adapter + tokenizer to:", OUTPUT_DIR)

# Quick LoRA test: sample from the freshly trained model
from transformers import TextStreamer

# Switch the model into Unsloth's inference mode before generating.
FastLanguageModel.for_inference(model)

def generate(prompt, max_new_tokens=128):
    """Stream a sampled completion for ``prompt`` to standard output.

    The prompt is wrapped in the "### Instruction / ### Response" layout,
    tokenized with truncation to ``MAX_SEQ_LEN`` tokens and moved to the
    model's device. Generation uses sampling (temperature 0.7, top-p 0.9) and
    is emitted through a ``TextStreamer``; nothing is returned.

    Args:
        prompt: Instruction text to send to the model.
        max_new_tokens: Upper bound on the number of generated tokens.
    """
    formatted = f"### Instruction:\n{prompt}\n\n### Response:\n"

    encoded = tokenizer(
        formatted,
        return_tensors="pt",
        truncation=True,
        max_length=MAX_SEQ_LEN,
    )
    encoded = encoded.to(model.device)

    sampling_kwargs = dict(
        max_new_tokens=max_new_tokens,
        streamer=TextStreamer(tokenizer),
        do_sample=True,
        temperature=0.7,
        top_p=0.9,
    )
    # The generated ids are discarded on purpose: the streamer already printed them.
    model.generate(**encoded, **sampling_kwargs)

# Smoke-test the LoRA model with one instruction; the completion is streamed to stdout.
test_prompt = "Write a short, friendly email apologizing for a delayed response."
generate(test_prompt)


ImportError: cannot import name 'BeamBasedBuilder' from 'datasets.builder' (/usr/local/lib/python3.12/dist-packages/datasets/builder.py)

In [None]:
######################################## C #################################################

# Output directory for the DPO run (created if missing).
OUTPUT_DIR = "smollm2_135m_dpo_orca"
os.makedirs(OUTPUT_DIR, exist_ok=True)

# NOTE(review): MODEL_NAME (and os / load_dataset) are not defined in this cell;
# they are expected to come from the earlier cells.
print("Using model:", MODEL_NAME)

# Load the preference dataset
dataset_raw = load_dataset("Intel/orca_dpo_pairs", split="train")  # large; we'll slice it
dataset_raw

# small subset for demo (you can increase later): 2000 rows after a seeded shuffle
dataset_raw_small = dataset_raw.shuffle(seed=42).select(range(2000))
len(dataset_raw_small), dataset_raw_small[0]

# convert to prompt
def make_preference(example):
    """Project a raw preference row onto ``prompt`` / ``chosen`` / ``rejected``.

    The row's ``question`` becomes ``prompt``; ``chosen`` and ``rejected`` are
    copied unchanged. Any other fields of ``example`` are not read.
    """
    field_map = (
        ("prompt", "question"),
        ("chosen", "chosen"),
        ("rejected", "rejected"),
    )
    return {target: example[source] for target, source in field_map}

# Build the preference dataset: apply make_preference to every row and drop
# all original columns so only prompt / chosen / rejected remain.
pref_dataset = dataset_raw_small.map(
    make_preference,
    remove_columns=dataset_raw_small.column_names,
)
pref_dataset[0]

# --- Load the policy model for DPO and attach LoRA adapters -------------------
from unsloth.chat_templates import get_chat_template

# No cell in this notebook defines LOAD_IN_4BIT, so referencing it raised a
# NameError. Use 4-bit loading, the same choice the LoRA SFT cell makes before
# attaching adapters.
LOAD_IN_4BIT = True

# Load model + tokenizer
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name     = MODEL_NAME,
    max_seq_length = MAX_SEQ_LEN,
    dtype          = DTYPE,
    load_in_4bit   = LOAD_IN_4BIT,
)

# Use a Zephyr-style chat template (good generic default).
# `get_chat_template` returns a tokenizer with the template installed, not a
# template string, so rebind `tokenizer` instead of assigning the result to
# `tokenizer.chat_template` (which would store a tokenizer object there).
tokenizer = get_chat_template(tokenizer, chat_template="zephyr")

# Turn the base model into a LoRA model (PEFT)
model = FastLanguageModel.get_peft_model(
    model,
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
    ],
    bias="none",
)

FastLanguageModel.for_training(model)

# Fall back to EOS as the padding token when the tokenizer defines none.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

print("Pad token:", tokenizer.pad_token, tokenizer.pad_token_id)

# --- DPO training ---------------------------------------------------------------
# `DPOConfig` and `is_bfloat16_supported` were used below without being imported
# anywhere in the notebook (NameError); import everything this step needs up front.
from trl import DPOConfig, DPOTrainer
from unsloth import is_bfloat16_supported

# All DPO hyperparameters live in the config object. `beta`, `max_length` and
# `max_prompt_length` used to be passed to DPOTrainer as well, duplicating the
# length limits already set here.
dpo_config = DPOConfig(
    output_dir=OUTPUT_DIR,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=8,
    num_train_epochs=1,          # you can increase later
    learning_rate=5e-5,
    logging_steps=10,
    save_steps=200,
    save_total_limit=2,
    warmup_ratio=0.05,
    bf16=is_bfloat16_supported(),       # bfloat16 where the GPU supports it ...
    fp16=not is_bfloat16_supported(),   # ... float16 otherwise (never both)
    beta=0.1,                    # strength of preference signal
    max_length=MAX_SEQ_LEN,
    max_prompt_length=256,
)

dpo_config

# Create the DPO trainer
dpo_trainer = DPOTrainer(
    model=model,
    ref_model=None,              # no explicit reference model; TRL derives one (see DPOTrainer docs)
    args=dpo_config,
    train_dataset=pref_dataset,
    eval_dataset=None,
    tokenizer=tokenizer,
)

dpo_trainer.train()


In [None]:
############################################# D ###########################################

#Load reasoning model with Unsloth + LoRA
from unsloth import FastLanguageModel

import torch

max_seq_length = 1024      # can increase for longer reasoning traces
lora_rank      = 32        # LoRA rank; higher = more capacity, more VRAM

# Load the base model for GRPO; `max_lora_rank` is set to the rank used by the
# adapters created below.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="google/gemma-3-1b-it",   # you can swap to Qwen3 / Llama 3.2 1B, etc.
    max_seq_length=max_seq_length,
    load_in_4bit=True,                   # 4-bit QLoRA
    fast_inference=True,                 # enable vLLM-backed fast inference
    max_lora_rank=lora_rank,
    gpu_memory_utilization=0.6,          # lower if you OOM
)

# Attach LoRA adapters to the attention and MLP projections.
model = FastLanguageModel.get_peft_model(
    model,
    r=lora_rank,
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
    ],
    lora_alpha=lora_rank,                # alpha equals the rank here
    use_gradient_checkpointing="unsloth",
    random_state=3407,
)

# Fall back to EOS as the padding token when the tokenizer defines none.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

print("Loaded model:", model.__class__.__name__)
print("Pad token:", tokenizer.pad_token, tokenizer.pad_token_id)

# System prompt that enforces an XML "reasoning + answer" structure.
# It is sent as the system message of every GSM8K prompt built below.
SYSTEM_PROMPT = """
You are a math reasoning assistant.

Respond in the following exact format:
<reasoning>
...step by step reasoning here...
</reasoning>
<answer>
...final numeric answer ONLY here...
</answer>
"""

# Template with {reasoning} / {answer} placeholders mirroring the layout above.
# NOTE(review): not referenced elsewhere in the visible part of this notebook.
XML_COT_FORMAT = """\
<reasoning>
{reasoning}
</reasoning>
<answer>
{answer}
</answer>
"""

import re
from datasets import load_dataset, Dataset

def extract_xml_answer(text: str) -> str:
    """Return the stripped text inside the last ``<answer>`` element of ``text``.

    Everything after the final ``<answer>`` tag is taken and cut at the first
    ``</answer>`` that follows. A missing tag simply skips the corresponding
    cut, so input without any tags comes back unchanged apart from surrounding
    whitespace.
    """
    _, _, after_open_tag = text.rpartition("<answer>")
    inner, _, _ = after_open_tag.partition("</answer>")
    return inner.strip()

def extract_hash_answer(text: str) -> str | None:
    # GSM8K stores answer like "‚Ä¶#### 42"
    if "####" not in text:
        return None
    return text.split("####")[1].strip()

def get_gsm8k_questions(split: str = "train") -> Dataset:
    """Load a 500-example GSM8K subset as chat prompts with gold answers.

    The requested ``split`` of ``openai/gsm8k`` ("main" config) is shuffled
    with seed 42 and its first 500 rows are kept (a small demo-sized subset).
    Each row gains a ``prompt`` column (system message + user question) and
    its ``answer`` column is replaced by the value after the "####" marker.
    """

    def to_chat_example(row):
        # Chat-format prompt plus the extracted gold answer.
        return {
            "prompt": [
                {"role": "system", "content": SYSTEM_PROMPT},
                {"role": "user",   "content": row["question"]},
            ],
            "answer": extract_hash_answer(row["answer"]),
        }

    subset = load_dataset("openai/gsm8k", "main")[split]
    subset = subset.shuffle(seed=42).select(range(500))
    return subset.map(to_chat_example)

# Build the GRPO training set from the GSM8K train split and peek at one row.
dataset = get_gsm8k_questions("train")
dataset[0]

# 1) Reward: correctness (does final answer match ground truth?)
def correctness_reward_func(prompts, completions, answer, **kwargs) -> list[float]:
    """Reward completions whose extracted answer equals the gold answer.

    Args:
        prompts: Chat prompts; the last message of the first prompt is logged.
        completions: One list per sample whose first item is a dict with the
            generated text under ``"content"``.
        answer: Gold answers, aligned with ``completions``.
        **kwargs: Ignored.

    Returns:
        ``2.0`` for each completion whose ``<answer>`` content is string-equal
        to its gold answer, ``0.0`` otherwise. The first sample of the batch
        is printed for inspection.
    """
    texts = [sample[0]["content"] for sample in completions]
    question = prompts[0][-1]["content"]  # last message of the first prompt, for logging
    predictions = [extract_xml_answer(text) for text in texts]

    print(
        "-" * 40,
        f"\nQuestion:\n{question}",
        f"\nGold answer:\n{answer[0]}",
        f"\nModel full response:\n{texts[0]}",
        f"\nExtracted final answer:\n{predictions[0]}",
    )

    rewards = []
    for predicted, gold in zip(predictions, answer):
        # Exact string comparison: any formatting difference scores 0.
        rewards.append(2.0 if predicted == gold else 0.0)
    return rewards


# 2) Reward: is the final answer an integer?
def int_reward_func(completions, **kwargs) -> list[float]:
    """Give ``0.5`` to completions whose extracted answer is all digits.

    The check is ``str.isdigit`` on the ``<answer>`` content, so signs,
    decimal points and thousands separators all score ``0.0``.
    """
    rewards = []
    for sample in completions:
        final_answer = extract_xml_answer(sample[0]["content"])
        rewards.append(0.5 if final_answer.isdigit() else 0.0)
    return rewards


# 3) Reward: strict XML format
def strict_format_reward_func(completions, **kwargs) -> list[float]:
    """Give ``0.5`` to completions that follow the exact XML layout.

    The whole response must start with ``<reasoning>`` on its own line and end
    with ``</answer>`` (optionally followed by one newline), with every tag on
    its own line. ``.`` also matches newlines, so multi-line reasoning is
    accepted.
    """
    strict_layout = re.compile(
        r"^<reasoning>\n.*?\n</reasoning>\n<answer>\n.*?\n</answer>\n?$",
        re.DOTALL,
    )
    rewards = []
    for sample in completions:
        text = sample[0]["content"]
        rewards.append(0.5 if strict_layout.match(text) else 0.0)
    return rewards


# 4) Reward: soft XML format (less strict)
def soft_format_reward_func(completions, **kwargs) -> list[float]:
    """Give ``0.5`` to completions containing the XML structure anywhere.

    Only requires a ``<reasoning>...</reasoning>`` element followed (after
    optional whitespace) by an ``<answer>...</answer>`` element somewhere in
    the response; surrounding text and line layout are not checked.
    """
    loose_layout = re.compile(
        r"<reasoning>.*?</reasoning>\s*<answer>.*?</answer>",
        re.DOTALL,
    )
    rewards = []
    for sample in completions:
        text = sample[0]["content"]
        rewards.append(0.5 if loose_layout.search(text) else 0.0)
    return rewards


# 5) Reward: XML tag counting + penalize junk after </answer>
def count_xml(text: str) -> float:
    """Score how closely ``text`` follows the line-oriented XML layout.

    Each of the four tag patterns earns 0.125 when it occurs exactly once.
    Two length penalties (0.001 per character) apply:

    * with exactly one ``"\\n<answer>\\n"``: the length of whatever follows
      the last ``"\\n</answer>\\n"`` (the whole text if that closing pattern
      is absent);
    * with exactly one ``"\\n</answer>"``: the length of whatever follows it,
      minus one (which allows for a single trailing newline).
    """
    tag_bonus = 0.125
    per_char_penalty = 0.001

    score = 0.0
    for reasoning_tag in ("<reasoning>\n", "\n</reasoning>\n"):
        if text.count(reasoning_tag) == 1:
            score += tag_bonus

    if text.count("\n<answer>\n") == 1:
        score += tag_bonus
        after_closed_line = text.split("\n</answer>\n")[-1]
        score -= len(after_closed_line) * per_char_penalty

    if text.count("\n</answer>") == 1:
        score += tag_bonus
        # Characters after </answer>; the "- 1" discounts one trailing newline.
        after_close = text.split("\n</answer>")[-1]
        score -= (len(after_close) - 1) * per_char_penalty

    return score

def xmlcount_reward_func(completions, **kwargs) -> list[float]:
    """Apply ``count_xml`` to the text of every completion.

    Returns one score per completion, in input order.
    """
    scores = []
    for sample in completions:
        scores.append(count_xml(sample[0]["content"]))
    return scores

from trl import GRPOConfig, GRPOTrainer

# Token budget reserved for the prompt; the rest of max_seq_length goes to the completion.
max_prompt_length = 256

training_args = GRPOConfig(
    learning_rate=5e-6,
    adam_beta1=0.9,
    adam_beta2=0.99,
    weight_decay=0.1,
    warmup_ratio=0.1,
    lr_scheduler_type="cosine",
    optim="paged_adamw_8bit",
    logging_steps=1,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=1,  # you can set 4 for smoother updates
    num_generations=6,              # number of samples per prompt
    max_prompt_length=max_prompt_length,
    max_completion_length=max_seq_length - max_prompt_length,  # prompt + completion fit in max_seq_length
    max_steps=250,                  # demo; for real runs use 300+ as docs suggest
    save_steps=250,
    max_grad_norm=0.1,
    report_to="none",
    output_dir="grpo_gemma3_1b_gsm8k",
    # Optional advanced GRPO variants (from Unsloth docs):
    # loss_type="grpo",
    # epsilon=0.2,
    # epsilon_high=0.28,
    # delta=1.5,
    # mask_truncated_completions=True,
)

# All five reward functions defined above are passed to the trainer.
trainer = GRPOTrainer(
    model=model,
    processing_class=tokenizer,
    reward_funcs=[
        xmlcount_reward_func,
        soft_format_reward_func,
        strict_format_reward_func,
        int_reward_func,
        correctness_reward_func,
    ],
    args=training_args,
    train_dataset=dataset,
)

trainer.train()


In [None]:
################################################### E #############################################
# Show the GPU(s) visible to this runtime.
!nvidia-smi
# Quiet install of the libraries used by this cell. Only lower bounds are
# given, so pip is free to pick newer releases.
!pip install -q "unsloth>=2024.10.0" "transformers>=4.45.0" \
               "accelerate>=1.0.0" "datasets>=3.0.0" \
               "bitsandbytes>=0.43.0"

from unsloth import FastLanguageModel, UnslothTrainer, UnslothTrainingArguments
from datasets import Dataset, DatasetDict
from transformers import DataCollatorForLanguageModeling
import torch, textwrap, os, random

max_seq_length = 2048     # context length
dtype = None              # let Unsloth pick bf16/float16 if available
load_in_4bit = True       # QLoRA style
model_name = "unsloth/Llama-3.2-1B-Instruct-bnb-4bit"

# Seed Python's and torch's (CPU + all CUDA devices) random number generators.
seed = 42
random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)

# Load the base model (before CPT)
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name      = model_name,
    max_seq_length  = max_seq_length,
    dtype           = dtype,
    load_in_4bit    = load_in_4bit,
)

# Make sure padding is configured correctly: EOS is used as the padding token
# (set unconditionally here) and padding is applied on the right.
tokenizer.pad_token    = tokenizer.eos_token
tokenizer.padding_side = "right"

print("Loaded model:", model_name)


# --- Prepare the new-language corpus -------------------------------------------
import os

corpus_path = "new_language_corpus.txt"

if os.path.exists(corpus_path):
    print(f"Found corpus file: {corpus_path}")
else:
    # No corpus on disk: write a tiny placeholder so the notebook still runs.
    # Replace it with a real language corpus (e.g. Tamil, Telugu, Finnish, etc.).
    dummy_lines = [
        "mavo lira sento kala miro.",
        "tora selen avo mira lavo.",
        "kalo miro siven tala voru.",
        "mira lira kavo sento luno.",
    ]
    with open(corpus_path, "w", encoding="utf-8") as f:
        f.writelines(sentence + "\n" for sentence in dummy_lines)
    print(f"{corpus_path} not found ‚Äì created a tiny dummy corpus.")

# Read the corpus back, trimming each line and dropping the empty ones.
with open(corpus_path, "r", encoding="utf-8") as f:
    lines = [trimmed for trimmed in (raw.strip() for raw in f) if trimmed]

print(f"Loaded {len(lines)} lines from corpus.")
print("Example lines:\n")
print("\n".join(lines[:5]))

from sklearn.model_selection import train_test_split

# Hold out 5% of the corpus lines for validation; the split is seeded.
train_lines, val_lines = train_test_split(lines, test_size=0.05, random_state=seed)

# Wrap both line lists as single-column ("text") datasets.
train_dataset = Dataset.from_dict({"text": train_lines})
eval_dataset  = Dataset.from_dict({"text": val_lines})

dataset = DatasetDict({
    "train": train_dataset,
    "validation": eval_dataset,
})

dataset


# Before-CPT sample: put the freshly loaded base model into inference mode
FastLanguageModel.for_inference(model)  # switches to faster inference mode

def generate(text, max_new_tokens=80):
    """Sample a continuation of ``text`` and return it as a decoded string.

    ``text`` is tokenized without padding and moved to the model's device.
    Generation runs under ``torch.no_grad()`` with sampling (temperature 0.8,
    top-p 0.95). The first returned sequence is decoded with special tokens
    removed.

    Args:
        text: Raw text to continue.
        max_new_tokens: Upper bound on the number of generated tokens.
    """
    encoded = tokenizer(text, return_tensors="pt", padding=False)
    encoded = encoded.to(model.device)

    sampling_kwargs = {
        "max_new_tokens": max_new_tokens,
        "do_sample": True,
        "temperature": 0.8,
        "top_p": 0.95,
    }
    with torch.no_grad():
        sequences = model.generate(**encoded, **sampling_kwargs)

    first_sequence = sequences[0]
    return tokenizer.decode(first_sequence, skip_special_tokens=True)

# Prompt with the first corpus line (or a fixed fallback when the corpus is
# empty) to record what the base model produces before continued pretraining.
test_prompt = lines[0] if len(lines) > 0 else "mavo lira"
print("=== BEFORE CPT ===")
print(generate(test_prompt))

# --- Add LoRA adapters for continued pretraining --------------------------------
model = FastLanguageModel.get_peft_model(
    model,
    r              = 32,   # rank
    lora_alpha     = 16,
    lora_dropout   = 0.0,
    # Besides the attention/MLP projections, the output head and the token
    # embeddings are trained as well.
    target_modules = [
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
        "lm_head", "embed_tokens",
    ],
)

# Switch back to train mode. The model was put into inference mode for the
# "before CPT" sample, and the original cell announced this switch in a
# comment but never performed it. This mirrors the other training cells,
# which call for_training right after get_peft_model.
FastLanguageModel.for_training(model)

model.print_trainable_parameters()

# Data collator for language modeling
from transformers import DataCollatorForLanguageModeling

data_collator = DataCollatorForLanguageModeling(
    tokenizer = tokenizer,
    mlm       = False,  # causal LM, NOT masked LM
)

# --- Unsloth training arguments for continued pretraining ------------------------
output_dir = "llama32_new_language_cpt"

# Select exactly one mixed-precision mode. The previous `bf16=torch.cuda.is_available()`
# requested bf16 on every GPU, including cards without bfloat16 support, and
# offered no fp16 fallback for them.
from unsloth import is_bfloat16_supported

use_bf16 = torch.cuda.is_available() and is_bfloat16_supported()
use_fp16 = torch.cuda.is_available() and not use_bf16

training_args = UnslothTrainingArguments(
    output_dir                      = output_dir,
    per_device_train_batch_size     = 2,
    gradient_accumulation_steps     = 4,
    max_steps                       = 200,   # for demo; use more for real training
    warmup_steps                    = 20,
    logging_steps                   = 10,
    save_strategy                   = "steps",
    save_steps                      = 100,
    learning_rate                   = 5e-5,
    embedding_learning_rate         = 5e-6,  # 10x smaller for lm_head/embed_tokens
    bf16                            = use_bf16,  # bfloat16 only where the GPU supports it
    fp16                            = use_fp16,  # otherwise float16 on GPU
    lr_scheduler_type               = "cosine",
    weight_decay                    = 0.01,
)

# Create the trainer and run continued pretraining (CPT) on the "text" datasets
# built above, using the causal-LM data collator.
trainer = UnslothTrainer(
    model         = model,
    tokenizer     = tokenizer,
    args          = training_args,
    train_dataset = dataset["train"],
    eval_dataset  = dataset["validation"],
    data_collator = data_collator,
)

trainer.train()
