In [None]:
# Colab-only helper for attaching Google Drive to the runtime's filesystem.
from google.colab import drive

# Every data, checkpoint and export path used later lives under this mount point.
drive.mount(mountpoint='/content/drive')

Mounted at /content/drive


Install dependencies

In [None]:
# Quietly install/upgrade the libraries this notebook imports.
# Only transformers carries a version floor (>=4.44); the other packages are unpinned,
# so the resolved versions can differ from run to run.
!pip -q install -U "transformers>=4.44" datasets accelerate peft bitsandbytes


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m511.6/511.6 kB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m556.4/556.4 kB[0m [31m10.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.4/59.4 MB[0m [31m12.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m47.7/47.7 MB[0m [31m16.6 MB/s[0m eta [36m0:00:00[0m
[?25h

Load the already-tokenized dataset from disk

In [None]:
from datasets import load_from_disk

# Drive folder holding the dataset that was tokenized and saved beforehand.
TOK_DATASET_PATH = "/content/drive/MyDrive/qwen2_sft_ds_tok"

# Read it back and print its summary (feature names and row count).
ds_tok = load_from_disk(dataset_path=TOK_DATASET_PATH)
print(ds_tok)


Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 9575
})


Build transfer-weighted sampler (transfer rows weighted 4× self rows; ≈80/20 transfer/self when both modes have equal counts)

In [None]:
import pandas as pd, torch
from torch.utils.data import WeightedRandomSampler


CSV_PATH = "/content/drive/MyDrive/novel_paragraphs_6kself_plus_transfer.csv"
df = pd.read_csv(CSV_PATH)

# Relative draw probability per row: a "transfer" row is 4x as likely to be
# sampled as any other row. That is an 80/20 transfer/self mix only when both
# modes have the same number of rows.
TRANSFER_WEIGHT = 1.0
SELF_WEIGHT = 0.25

# Modes are looked up by CSV row number, so the CSV and the tokenized dataset
# must describe the same examples in the same order.
if len(df) != len(ds_tok):
    raise ValueError(
        f"CSV has {len(df)} rows but the tokenized dataset has {len(ds_tok)}; "
        "cannot align the 'mode' column with tokenized examples by row order."
    )

# train_test_split shuffles rows, so position i of the train split is NOT row i
# of the CSV. Tag every example with its original row number before splitting,
# then use that tag to fetch the matching mode afterwards.
ROW_COL = "_csv_row"
ds_indexed = ds_tok.add_column(ROW_COL, list(range(len(ds_tok))))

# Split tokenized dataset: 90% train, 5% eval, 5% test.
splits = ds_indexed.train_test_split(test_size=0.1, seed=42)
val_test = splits["test"].train_test_split(test_size=0.5, seed=42)

# Original CSV row of each training example, in train-split order.
train_rows = [int(i) for i in splits["train"][ROW_COL]]

# Drop the helper column again so only model inputs reach the collator.
train_ds = splits["train"].remove_columns(ROW_COL)
eval_ds  = val_test["train"].remove_columns(ROW_COL)
test_ds  = val_test["test"].remove_columns(ROW_COL)

# One weight per training example, aligned with train_ds by position.
mode_train = df["mode"].iloc[train_rows].tolist()
weights = torch.tensor(
    [TRANSFER_WEIGHT if m == "transfer" else SELF_WEIGHT for m in mode_train],
    dtype=torch.float,
)

sampler = WeightedRandomSampler(
    weights=weights,
    num_samples=len(weights),  # one "epoch" worth of samples
    replacement=True
)

# Sanity check: report the mix the sampler will actually produce on average.
n_transfer = sum(m == "transfer" for m in mode_train)
transfer_mass = n_transfer * TRANSFER_WEIGHT
total_mass = transfer_mass + (len(mode_train) - n_transfer) * SELF_WEIGHT
print(f"Expected transfer share of sampled rows: {transfer_mass / total_mass:.1%}")

len(train_ds), len(eval_ds), len(test_ds)


(8617, 479, 479)

Load Qwen2.5-1.5B-Instruct in QLoRA (4-bit)

In [None]:
import torch, platform
# Report the PyTorch build, the CUDA version it was built against, and the
# name of GPU 0 (or "CPU" when no CUDA device is available).
if torch.cuda.is_available():
    device_label = torch.cuda.get_device_name(0)
else:
    device_label = "CPU"
print("Torch:", torch.__version__, "CUDA:", torch.version.cuda, "GPU:", device_label)


Torch: 2.8.0+cu126 CUDA: 12.6 GPU: Tesla T4


In [None]:

# Remove the installed bitsandbytes build, then install the latest bitsandbytes,
# accelerate and transformers (no version pins).
# NOTE(review): modules already imported in this session presumably keep their
# old versions until the runtime is restarted — confirm before relying on the upgrade.
!pip uninstall -y bitsandbytes
!pip install -U --quiet bitsandbytes accelerate transformers



[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.4/59.4 MB[0m [31m10.7 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

# Hub id of the base checkpoint that gets fine-tuned.
MODEL_ID = "Qwen/Qwen2.5-1.5B-Instruct"

# 4-bit loading options: NF4 quantization with double quantization, bfloat16 compute.
# NOTE(review): bfloat16 is requested unconditionally here, while the
# TrainingArguments cell only enables bf16 on compute capability >= 8 — confirm
# this is intended on older GPUs.
quant_kwargs = {
    "load_in_4bit": True,
    "bnb_4bit_quant_type": "nf4",
    "bnb_4bit_compute_dtype": torch.bfloat16,
    "bnb_4bit_use_double_quant": True,
}
bnb = BitsAndBytesConfig(**quant_kwargs)

# LoRA adapter settings: rank 16, alpha 32, 5% dropout, no bias training,
# attached to the attention (q/k/v/o) and MLP (gate/up/down) projections.
peft_cfg = LoraConfig(
    task_type="CAUSAL_LM",
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
    ],
)

# Load the quantized base model, letting device_map="auto" choose the placement.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    device_map="auto",
    quantization_config=bnb,
)

# Run PEFT's k-bit training preparation, wrap the result with the LoRA
# adapters, and print how many parameters are trainable.
model = get_peft_model(prepare_model_for_kbit_training(model), peft_cfg)
model.print_trainable_parameters()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/660 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/3.09G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/242 [00:00<?, ?B/s]

trainable params: 18,464,768 || all params: 1,562,179,072 || trainable%: 1.1820


Data collator (+ tokenizer)

In [None]:
from transformers import AutoTokenizer, DataCollatorForLanguageModeling

from transformers import DataCollatorForSeq2Seq

tok = AutoTokenizer.from_pretrained(MODEL_ID, use_fast=True)

# The tokenized dataset already carries a `labels` column.
# DataCollatorForLanguageModeling(mlm=False) would throw those labels away and
# rebuild them as a copy of input_ids, so any masking stored in the dataset
# (e.g. prompt tokens set to -100) would be silently lost. DataCollatorForSeq2Seq
# pads input_ids/attention_mask via the tokenizer and pads the stored labels
# with -100 instead of replacing them.
_label_preserving_collator = DataCollatorForSeq2Seq(
    tokenizer=tok,
    padding=True,
    label_pad_token_id=-100,
)


def data_collator(features):
    """Collate a list of tokenized examples into a padded batch of tensors.

    Args:
        features: list of dicts with `input_ids`, `attention_mask` and `labels`.

    Returns:
        The padded batch. Stored labels are kept as-is, except that every
        position with attention_mask == 0 is set to -100 so padding never
        contributes to the loss, even if the stored labels did not mask it.
    """
    batch = _label_preserving_collator(features)
    batch["labels"] = batch["labels"].masked_fill(batch["attention_mask"] == 0, -100)
    return batch


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

TrainingArguments (LR 7e-6 → 7e-7, WD=0.1, grad clip=1.0)

In [None]:
from transformers import TrainingArguments
import torch

# Run-level knobs. LR_END is not passed to TrainingArguments; it is read by the
# custom scheduler defined in the Trainer cell.
EPOCHS = 2
LR_START = 7e-6
LR_END   = 7e-7
WARMUP_RATIO = 0.03
OUTPUT_DIR = "/content/drive/MyDrive/qwen25_sft_lora"

# bf16 is switched on only when a CUDA device with compute capability >= 8 is
# present; otherwise this is False and no mixed-precision flag is set here.
bf16_supported = torch.cuda.is_available() and torch.cuda.get_device_capability(0)[0] >= 8

training_kwargs = dict(
    output_dir=OUTPUT_DIR,
    num_train_epochs=EPOCHS,
    # Batching: 4 per device x 4 accumulation steps = effective batch of 16.
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=4,
    # Step-based cadence: log and checkpoint every 50 steps, evaluate every 500.
    eval_strategy="steps",
    save_strategy="steps",
    eval_steps=500,
    logging_steps=50,
    save_steps=50,
    save_total_limit=5,                 # keep only the 5 most recent checkpoints
    # Optimisation settings.
    lr_scheduler_type="polynomial",
    learning_rate=LR_START,
    warmup_ratio=WARMUP_RATIO,
    weight_decay=0.1,
    max_grad_norm=1.0,
    optim="paged_adamw_32bit",
    # Precision, memory and data loading.
    bf16=bf16_supported,
    gradient_checkpointing=True,
    dataloader_num_workers=2,
    report_to="none",
)
args = TrainingArguments(**training_kwargs)

Custom Trainer (weighted sampler + end LR = 7e-7)

In [None]:
import math
from torch.utils.data import DataLoader
from transformers import Trainer
from transformers import get_polynomial_decay_schedule_with_warmup

class WeightedTrainer(Trainer):
    """Trainer whose training batches are drawn by the notebook-level `sampler`.

    Only the training dataloader is customised; everything else is inherited
    from `Trainer`.
    """

    def get_train_dataloader(self):
        """Build the training DataLoader driven by the weighted sampler.

        Returns:
            A DataLoader over `self.train_dataset` that uses the global
            `sampler` (biased towards "transfer" rows) and the trainer's
            collator. Batch size, worker count, pin_memory and drop_last all
            come from the TrainingArguments.

        Raises:
            ValueError: if no training dataset was given, or if the sampler
                holds a different number of weights than the dataset has rows
                (its indices would then not address the dataset correctly).
        """
        if self.train_dataset is None:
            raise ValueError("WeightedTrainer: training requires a train_dataset.")

        # The sampler yields positions into train_dataset, so it must have been
        # built with exactly one weight per training example.
        sampler_weights = getattr(sampler, "weights", None)
        if sampler_weights is not None and len(sampler_weights) != len(self.train_dataset):
            raise ValueError(
                f"sampler has {len(sampler_weights)} weights but train_dataset has "
                f"{len(self.train_dataset)} rows; rebuild the sampler for this dataset."
            )

        return DataLoader(
            self.train_dataset,
            batch_size=self.args.per_device_train_batch_size,
            sampler=sampler,                       # bias to transfer
            collate_fn=self.data_collator,
            num_workers=self.args.dataloader_num_workers,
            # Take these from TrainingArguments instead of hard-coding them.
            pin_memory=self.args.dataloader_pin_memory,
            drop_last=self.args.dataloader_drop_last,
        )

class CustomSchedulerTrainer(WeightedTrainer):
    """WeightedTrainer whose learning rate warms up, then decays linearly to LR_END."""

    def create_optimizer_and_scheduler(self, num_training_steps: int):
        """Create the optimizer, then install the warmup + polynomial-decay schedule.

        Args:
            num_training_steps: total number of optimizer steps in the run.
        """
        # The optimizer is built by the parent implementation.
        super().create_optimizer()

        # Warmup length is a fraction of the whole run, rounded up.
        n_warmup = math.ceil(self.args.warmup_ratio * num_training_steps)

        schedule_kwargs = {
            "num_warmup_steps": n_warmup,
            "num_training_steps": num_training_steps,
            "lr_end": LR_END,   # final LR (7e-7)
            "power": 1.0,       # power 1.0 makes the decay linear
        }
        self.lr_scheduler = get_polynomial_decay_schedule_with_warmup(
            self.optimizer, **schedule_kwargs
        )

import inspect

# Newer transformers releases take the tokenizer through Trainer's
# `processing_class` argument and deprecate the old `tokenizer` keyword. Since
# the install cells do not pin transformers, pick whichever name the installed
# Trainer exposes so this cell works on both sides of the rename.
_trainer_params = inspect.signature(Trainer.__init__).parameters
_tokenizer_kwarg = "processing_class" if "processing_class" in _trainer_params else "tokenizer"

# Assemble the trainer: LoRA-wrapped model, weighted sampling over train_ds,
# periodic evaluation on eval_ds, and the custom end-LR schedule.
trainer = CustomSchedulerTrainer(
    model=model,
    args=args,
    train_dataset=train_ds,
    eval_dataset=eval_ds,
    data_collator=data_collator,
    **{_tokenizer_kwarg: tok},
)


  trainer = CustomSchedulerTrainer(


Train, then save the adapter and merge it into a standalone model

In [None]:
import os
from transformers.trainer_utils import get_last_checkpoint

# Resume from the newest checkpoint in OUTPUT_DIR if there is one; otherwise
# last_ckpt is None and training starts from scratch.
last_ckpt = get_last_checkpoint(OUTPUT_DIR) if os.path.isdir(OUTPUT_DIR) else None
print("Resuming from:", last_ckpt)

# Use our (CustomScheduler) Trainer instance named `trainer`
trainer.train(resume_from_checkpoint=last_ckpt)

ADAPTER_DIR = f"{OUTPUT_DIR}/adapter"
MERGED_DIR = "/content/drive/MyDrive/qwen25_sft_merged"

# Save LoRA adapter
trainer.save_model(ADAPTER_DIR)

# Merge LoRA into a full model for standalone export.
# The training `model` sits on a 4-bit quantized base, so merging into it would
# fold the adapter into quantized weights and export a quantized checkpoint.
# Instead, reload the base weights unquantized in bfloat16 (no device_map, so
# the default device is used and GPU memory held by training is untouched),
# attach the adapter that was just saved, and merge that.
# `model` is deliberately left bound to the trained PEFT model so `trainer`
# stays valid and this cell can be re-run.
from peft import PeftModel

fp_base = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype=torch.bfloat16)
merged_model = PeftModel.from_pretrained(fp_base, ADAPTER_DIR).merge_and_unload()
merged_model.save_pretrained(MERGED_DIR)
tok.save_pretrained(MERGED_DIR)


The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': None, 'pad_token_id': 151643}.


Resuming from: /content/drive/MyDrive/qwen25_sft_lora/checkpoint-950


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Step,Training Loss,Validation Loss
1000,1.6524,1.642911




('/content/drive/MyDrive/qwen25_sft_merged/tokenizer_config.json',
 '/content/drive/MyDrive/qwen25_sft_merged/special_tokens_map.json',
 '/content/drive/MyDrive/qwen25_sft_merged/chat_template.jinja',
 '/content/drive/MyDrive/qwen25_sft_merged/vocab.json',
 '/content/drive/MyDrive/qwen25_sft_merged/merges.txt',
 '/content/drive/MyDrive/qwen25_sft_merged/added_tokens.json',
 '/content/drive/MyDrive/qwen25_sft_merged/tokenizer.json')