### Setup and Installation


In [None]:
%%capture
try:
    import unsloth
except ImportError:
    print("Installing faiss-cpu and unsloth...")
    !pip install faiss-cpu unsloth

In [None]:
import random

import numpy as np
import torch

seed = 47
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(seed)


### Load the Language Model


In [None]:
import torch
from unsloth import FastLanguageModel

max_seq_length = 2048  # Choose any! We auto support RoPE Scaling internally!
dtype = (
    None  # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
)
load_in_4bit = True  # Use 4bit quantization to reduce memory usage. Can be False.

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/Llama-3.2-1B-Instruct",
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
    device_map="cuda:0",
)

==((====))==  Unsloth 2025.6.2: Fast Llama patching. Transformers: 4.51.3.
   \\   /|    Tesla T4. Num GPUs = 2. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 7.5. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


### Apply PEFT (Parameter-Efficient Fine-Tuning)


In [None]:
model = FastLanguageModel.get_peft_model(
    model,
    r=16,  # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
    ],
    lora_alpha=16,
    lora_dropout=0,  # Supports any, but = 0 is optimized
    bias="none",  # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing="unsloth",  # True or "unsloth" for very long context
    random_state=47,
    use_rslora=False,  # We support rank stabilized LoRA
    loftq_config=None,  # And LoftQ
)

Unsloth 2025.6.2 patched 16 layers with 16 QKV layers, 16 O layers and 16 MLP layers.


### Prepare the Dataset


In [None]:
from datasets import concatenate_datasets, load_dataset

subreddits = ["askphysics"]
splits = ["train", "validation", "test"]

all_data = {split: [] for split in splits}

for subreddit in subreddits:
    data = load_dataset("stanfordnlp/shp", data_dir=subreddit)
    for split in splits:
        all_data[split].append(data[split])

final_dataset = {split: concatenate_datasets(all_data[split]) for split in splits}

train_dataset = final_dataset["train"]
val_dataset = final_dataset["validation"]
test_dataset = final_dataset["test"]

In [None]:
# The data must be formatted with appropriate prompt template first.
# See details here: https://github.com/huggingface/trl/blob/main/examples/scripts/orpo.py

alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""

EOS_TOKEN = tokenizer.eos_token  # Must add EOS_TOKEN


# https://huggingface.co/datasets/ogbrandt/gpt4_preference_rlaif
def format_prompt(sample):
    instruction = "You are an AI assistant. You will be asked a question. You must generate a correct answer."
    input = sample["history"]
    if sample["labels"] == 1:
        accepted = sample["human_ref_A"]
        rejected = sample["human_ref_B"]
    else:
        accepted = sample["human_ref_B"]
        rejected = sample["human_ref_A"]

    sample["prompt"] = alpaca_prompt.format(instruction, input, "")
    sample["chosen"] = accepted + EOS_TOKEN
    sample["rejected"] = rejected + EOS_TOKEN
    return sample


pass

mapped_train_dataset = train_dataset.map(
    format_prompt,
)
mapped_val_dataset = val_dataset.map(
    format_prompt,
)
mapped_test_dataset = test_dataset.map(
    format_prompt,
)

In [None]:
import pprint

row = mapped_train_dataset[1]
print("INSTRUCTION: " + "=" * 50)
pprint.pprint(row["prompt"])
print("ACCEPTED: " + "=" * 50)
pprint.pprint(row["chosen"])
print("REJECTED: " + "=" * 50)
pprint.pprint(row["rejected"])

('Below is an instruction that describes a task, paired with an input that '
 'provides further context. Write a response that appropriately completes the '
 'request.\n'
 '\n'
 '### Instruction:\n'
 'You are an AI assistant. You will be asked a question. You must generate a '
 'correct answer.\n'
 '\n'
 '### Input:\n'
 'The early physicist Torricelli wrote "We live submerged at the bottom of an '
 'ocean of air." Like we see on the ordinary oceans, are there giant waves of '
 'air crashing against each other in the upper atmosphere?\n'
 '\n'
 '### Response:\n')
('To begin, I am not an atmospheric scientist, but here is my understanding.  '
 'Well there isn’t a well defined surface of the atmosphere between air and '
 'space like there is with the ocean between water and air because as a gas, '
 'the atmosphere is compressible. The Kármán line is a legal definition of the '
 'edge of space, but in reality, the atmosphere just keeps thinning and '
 'thinning until it kind of blends with

### Inference with Streaming before training

In [None]:
# alpaca_prompt = Copied from above
FastLanguageModel.for_inference(model)  # Enable native 2x faster inference

sample = mapped_test_dataset[489]

inputs = tokenizer(
    [
        alpaca_prompt.format(
            "You are an AI assistant. You will be asked a question. You must generate a correct answer.",  # instruction
            sample["history"],
            "",
        )
    ],
    return_tensors="pt",
).to("cuda")

from transformers import TextStreamer

text_streamer = TextStreamer(tokenizer)
_ = model.generate(**inputs, streamer=text_streamer, max_new_tokens=128)

print("\n")
print(f"{'=' * 50} SAMPLE {'=' * 50}")
print("\n")
print(f"Answer A: {sample['human_ref_A']}")
print("\n")
print(f"Answer B: {sample['human_ref_B']}")
print("\n")
print(f"Chosen: {'A' if sample['labels'] == 1 else 'B'}")

<|begin_of_text|>Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
You are an AI assistant. You will be asked a question. You must generate a correct answer.

### Input:
I wonder why nuclear fusion produces less nuclear waste than nuclear fission and the materials are easier to obtain. Why not nuclear fusion then?

### Response:
The reason nuclear fusion produces less nuclear waste than nuclear fission is that it does not produce radioactive byproducts like plutonium and strontium-90. In contrast, nuclear fission produces radioactive isotopes like uranium-238, uranium-235, and plutonium-239, which are highly radioactive and require special handling and storage. Additionally, nuclear fission is a more complex and energy-intensive process than nuclear fusion, which is the process by which atomic nuclei combine to form a heavier nucleus. This makes nuclear fusion a

### Configure the DPO Trainer


In [None]:
import wandb

wandb.login(key="123")

wandb.init(project="nlp_lab2", name="llama-finetune", reinit=True)


[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mnomark[0m ([33mnomark-igor-sikorsky-kyiv-polytechnic-institute[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [None]:
# Enable reward modelling stats
from unsloth import PatchDPOTrainer

PatchDPOTrainer()
from transformers import TrainingArguments
from trl import DPOConfig, DPOTrainer
from unsloth import is_bfloat16_supported

dpo_trainer = DPOTrainer(
    model=model,
    ref_model=None,
    args=DPOConfig(
        per_device_train_batch_size=2,
        per_device_eval_batch_size=2,
        gradient_accumulation_steps=4,
        warmup_ratio=0.1,
        num_train_epochs=1,
        learning_rate=5e-6,
        fp16=not is_bfloat16_supported(),
        bf16=is_bfloat16_supported(),
        logging_steps=5,
        eval_steps=23,
        save_steps=46,
        save_total_limit=1,
        optim="adamw_8bit",
        weight_decay=0.0,
        lr_scheduler_type="linear",
        seed=47,
        output_dir="outputs",
        report_to="wandb",
    ),
    beta=0.1,
    train_dataset=mapped_train_dataset,
    eval_dataset=mapped_val_dataset,
    tokenizer=tokenizer,
    max_length=1024,
    max_prompt_length=512,
)

### Start Training


In [10]:
dpo_trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 7,364 | Num Epochs = 1 | Total steps = 460
O^O/ \_/ \    Batch size per device = 4 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (4 x 4 x 1) = 16
 "-____-"     Trainable parameters = 11,272,192/1,000,000,000 (1.13% trained)


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss,rewards / chosen,rewards / rejected,rewards / accuracies,rewards / margins,logps / chosen,logps / rejected,logits / chosen,logits / rejected,eval_logits / chosen,eval_logits / rejected,nll_loss,aux_loss
5,0.6932,0.000132,0.000195,0.2375,-6.3e-05,-471.976471,-234.689163,1.648399,1.6538,0,0,0,0
10,0.694,-0.001314,0.000291,0.4375,-0.001605,-404.364075,-268.030212,1.848736,1.692958,No Log,No Log,No Log,No Log
15,0.6925,0.001676,0.000289,0.5875,0.001387,-576.925659,-253.095047,1.880323,1.810536,No Log,No Log,No Log,No Log
20,0.6931,0.001087,0.000927,0.4875,0.00016,-438.94278,-216.122437,1.640912,1.616231,No Log,No Log,No Log,No Log
25,0.6925,0.004047,0.002694,0.5125,0.001353,-409.801361,-273.399078,1.68104,1.72472,No Log,No Log,No Log,No Log
30,0.6914,0.007568,0.004123,0.7,0.003444,-391.753845,-260.257812,1.71553,1.621474,No Log,No Log,No Log,No Log
35,0.6897,0.012468,0.005488,0.575,0.006979,-420.986145,-264.975891,1.796679,1.730472,No Log,No Log,No Log,No Log
40,0.6905,0.015319,0.009816,0.625,0.005503,-381.712067,-272.922668,1.735772,1.613601,No Log,No Log,No Log,No Log
45,0.6861,0.02619,0.011842,0.6875,0.014348,-367.613464,-290.052643,1.656349,1.736934,No Log,No Log,No Log,No Log
50,0.6884,0.025195,0.015419,0.6125,0.009776,-275.874237,-249.854202,1.742872,1.638855,No Log,No Log,No Log,No Log


TrainOutput(global_step=460, training_loss=0.6266280619994454, metrics={'train_runtime': 4660.5399, 'train_samples_per_second': 1.58, 'train_steps_per_second': 0.099, 'total_flos': 0.0, 'train_loss': 0.6266280619994454, 'epoch': 0.9994568169473113})

### Save the Model Locally


In [None]:
model.save_pretrained("lora_model")  # Local saving
tokenizer.save_pretrained("lora_model")

('lora_model/tokenizer_config.json',
 'lora_model/special_tokens_map.json',
 'lora_model/tokenizer.json')

### Load saved model

In [None]:
import torch
from unsloth import FastLanguageModel

max_seq_length = 2048  # Choose any! We auto support RoPE Scaling internally!
dtype = (
    None  # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
)
load_in_4bit = True  # Use 4bit quantization to reduce memory usage. Can be False.

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="./lora_model",
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
    device_map="cuda:0",
)

==((====))==  Unsloth 2025.6.2: Fast Llama patching. Transformers: 4.51.3.
   \\   /|    Tesla T4. Num GPUs = 2. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 7.5. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Unsloth 2025.6.2 patched 16 layers with 16 QKV layers, 16 O layers and 16 MLP layers.


### Inference with Streaming after training

In [None]:
# alpaca_prompt = Copied from above
FastLanguageModel.for_inference(model)  # Enable native 2x faster inference

sample = mapped_test_dataset[489]

inputs = tokenizer(
    [
        alpaca_prompt.format(
            "You are an AI assistant. You will be asked a question. You must generate a correct answer.",  # instruction
            sample["history"],
            "",
        )
    ],
    return_tensors="pt",
).to("cuda")

from transformers import TextStreamer

text_streamer = TextStreamer(tokenizer)
_ = model.generate(**inputs, streamer=text_streamer, max_new_tokens=128)

print("\n")
print(f"{'=' * 50} SAMPLE {'=' * 50}")
print("\n")
print(f"Answer A: {sample['human_ref_A']}")
print("\n")
print(f"Answer B: {sample['human_ref_B']}")
print("\n")
print(f"Chosen: {'A' if sample['labels'] == 1 else 'B'}")

<|begin_of_text|>Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
You are an AI assistant. You will be asked a question. You must generate a correct answer.

### Input:
I wonder why nuclear fusion produces less nuclear waste than nuclear fission and the materials are easier to obtain. Why not nuclear fusion then?

### Response:
Nuclear fusion is the process by which atomic nuclei combine to form a heavier nucleus, releasing energy in the process. However, this process is not yet widely used as it is extremely difficult to create and control. The main reason why nuclear fusion produces less nuclear waste than nuclear fission is that the materials produced are not radioactive. The materials produced in nuclear fission, such as uranium-235, are highly radioactive and radioactive decay is a major source of nuclear waste. On the other hand, the materials produced in