In [None]:
!pip install unsloth



In [None]:
# needed as this function doesn't like it when the lm_head has its size changed
from unsloth import tokenizer_utils
def do_nothing(*args, **kwargs):
    """No-op stand-in: accept any positional/keyword arguments and return None."""
    return None
# Monkey-patch: from here on, looking up tokenizer_utils.fix_untrained_tokens
# yields the no-op do_nothing instead of Unsloth's own implementation.
tokenizer_utils.fix_untrained_tokens = do_nothing

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!


## Loading the model

In [None]:
from unsloth import FastLanguageModel
import torch

# Loading the model
# Sequence-length limit; reused further down to filter the training rows and
# to configure the trainer.
max_seq_length = 16384
# Forwarded unchanged to from_pretrained.
# NOTE(review): None presumably lets Unsloth choose the dtype itself — confirm.
dtype = None
# Forwarded unchanged to from_pretrained to request 4-bit loading.
load_in_4bit = True

# Returns both the model and its tokenizer; both are used by every later cell.
model, tokenizer = FastLanguageModel.from_pretrained(
    "unsloth/Qwen3-4B-Base",
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit
)

==((====))==  Unsloth 2025.9.7: Fast Qwen3 patching. Transformers: 4.55.4.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.8.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.4.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.32.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


## Using LLM for inference

In [None]:
FastLanguageModel.for_inference(model)

# Prompt template for a plain question/answer exchange.
# It is called qa_prompt (not prompt) so that re-running this cell cannot
# overwrite the classification template that a later cell stores in `prompt`.
qa_prompt = """You are a helpful assistant.

Question: {}

Answer:"""

# The question to ask. Named `question` so the builtin input() is not shadowed.
question = "Who developed the theory of relativity?"

# Insert the question into the template and tokenize the result.
inputs = tokenizer([qa_prompt.format(question)], return_tensors='pt').to('cuda')

# Generate up to 50 new tokens as the continuation of the prompt.
outputs = model.generate(**inputs, max_new_tokens=50,eos_token_id=tokenizer.eos_token_id )

# Decode the generated token ids back to text (the decoded text includes the prompt).
response = tokenizer.batch_decode(outputs,skip_special_tokens=True)[0]

# Raw response of the base model: it treats the text as a running Q&A
# transcript, so it keeps producing further "Question:/Answer:" pairs.
print(response)

# We only want the answer to our own question: keep the text that follows the
# first "Answer:" marker and cut it off at the next "Question:" marker, if any.
if "Answer:" in response:
    response = response.split("Answer:")[1].strip()
    if "Question:" in response:
        response = response.split("Question:")[0].strip()
else:
    response = response.strip()

# The extracted answer to our question.
print('\n')
print('After Filter :')
print(response)

You are a helpful assistant.

Question: Who developed the theory of relativity?

Answer: Einstein

Question: What is the theory of relativity?

Answer: Einstein

Question: What is the theory of relativity?

Answer: Einstein

Question: What is the theory of relativity?

Answer: Einstein

Question: What is the theory


After Filter :
Einstein


## Supervised Fine-Tuning

In [None]:
NUM_CLASSES = 3

# Token id of each class label "0" .. str(NUM_CLASSES). The label must be a
# single token: the previous `encode(...)[0]` silently kept only the first
# piece of a multi-token number, which would map several classes onto the
# same lm_head row.
number_token_ids = []
for class_id in range(NUM_CLASSES + 1):
    pieces = tokenizer.encode(str(class_id), add_special_tokens=False)
    if len(pieces) != 1:
        raise ValueError(
            f"Class label {class_id!r} is encoded as {len(pieces)} tokens ({pieces}); "
            "every class label must map to exactly one token."
        )
    number_token_ids.append(pieces[0])
if len(set(number_token_ids)) != len(number_token_ids):
    raise ValueError(f"Class labels do not map to distinct token ids: {number_token_ids}")

# keep only the number tokens from lm_head
par = torch.nn.Parameter(model.lm_head.weight[number_token_ids, :])

# Remember the original lm_head shape / vocabulary size before it is replaced.
old_shape = model.lm_head.weight.shape
old_size = old_shape[0]
print(par.shape)
print(old_shape)

# NOTE(review): only `.weight` is swapped here; any other attribute that records
# the output size (e.g. on the lm_head module or the model config) is left as it
# was. Confirm that the training loss path does not rely on the old size.
model.lm_head.weight = par

# Maps an id of the original tokenizer to its row index in the reduced lm_head;
# used later to translate label tokens.
reverse_map = {value: idx for idx, value in enumerate(number_token_ids)}
reverse_map

torch.Size([4, 2560])
torch.Size([151936, 2560])


{15: 0, 16: 1, 17: 2, 18: 3}

In [None]:
from peft import LoftQConfig

# Wrap the model with LoRA adapters (rank 16) on the attention projections, the
# MLP projections and the (already reduced) lm_head, then report how many
# parameters are trainable.
model = FastLanguageModel.get_peft_model(
    model,
    r = 16,
    target_modules = [
        "lm_head", # can easily be trained because it now has a small size
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 16,
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    use_gradient_checkpointing = "unsloth",
    random_state = 3407,
    use_rslora = True,  # We support rank stabilized LoRA
    # init_lora_weights = 'loftq',
    # loftq_config = LoftQConfig(loftq_bits = 4, loftq_iter = 1), # And LoftQ
)
# Count only the parameters that will receive gradients.
print("trainable parameters:", sum(p.numel() for p in model.parameters() if p.requires_grad))

Unsloth: Offloading output_embeddings to disk to save VRAM


Unsloth 2025.9.7 patched 36 layers with 36 QKV layers, 36 O layers and 36 MLP layers.


Unsloth: Training lm_head in mixed precision to save VRAM
trainable parameters: 33040384


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the training data. The cells below read the columns winner_model_a,
# winner_model_b, winner_tie, id, prompt, response_a and response_b from it.
train_df = pd.read_csv('/content/train.csv')

def combine_winner(row):
    """Collapse the three winner flags of a row into a single class id.

    Returns 1 when model_a won, 2 when model_b won, 3 for a tie, and 0 when
    none of the three flags equals 1. Flags are checked in that order and the
    first match wins.
    """
    outcomes = (
        ("winner_model_a", 1),  # model_a wins
        ("winner_model_b", 2),  # model_b wins
        ("winner_tie", 3),      # tie
    )
    for column, label in outcomes:
        if row[column] == 1:
            return label
    return 0  # no winner recorded (optional fallback class)

# Row-wise application (axis=1): stores the class id returned by combine_winner
# for every row in a new "winner" column.
train_df["winner"] = train_df.apply(combine_winner, axis=1)

In [None]:
import datasets

# Classification template. The four slots are, in order: the user prompt, the
# answer of model_a, the answer of model_b, and the class label.
prompt = """You are given a prompt and the answers from two models.

Prompt: {}

Answer from model_a: {}

Answer from model_b: {}

Which model gave the better answer? Classify it as:
1: model_a wins
2: model_b wins
3: tie

Answer: class {}"""

def formatting_prompts_func(dataset_):
    """Render one prompt string per row of ``dataset_``.

    ``dataset_`` is indexed by column name and each column is read
    positionally through ``.iloc``. The number of rows is taken from the
    ``id`` column; the columns ``prompt``, ``response_a``, ``response_b`` and
    ``winner`` fill the slots of the module-level ``prompt`` template in that
    order. Returns the rendered strings as a list.
    """
    slot_columns = ('prompt', 'response_a', 'response_b', 'winner')
    return [
        prompt.format(*(dataset_[column].iloc[row] for column in slot_columns))
        for row in range(len(dataset_['id']))
    ]

# Render every training row into its final prompt text.
train_df['text'] = formatting_prompts_func(train_df)

# Token count of each rendered prompt, used to drop rows that exceed max_seq_length.
train_df['num_tokens'] = train_df['text'].map(lambda text: len(tokenizer.encode(text)))
fits_in_context = train_df['num_tokens'] <= max_seq_length
train_df = train_df[fits_in_context]

# Convert the filtered frame to a datasets.Dataset without carrying the pandas index along.
train_dataset = datasets.Dataset.from_pandas(train_df, preserve_index=False)

In [None]:
from transformers import DataCollatorForLanguageModeling
from typing import Any, Dict, List, Union
# This custom collator makes the model train only on the last token of each
# sequence. It also maps that token from the original tokenizer id to the
# matching row index of the reduced lm_head.
class DataCollatorForLastTokenLM(DataCollatorForLanguageModeling):
    """Language-modeling collator that supervises only the final label token.

    Args:
        mlm: Forwarded to ``DataCollatorForLanguageModeling`` (default ``False``).
        ignore_index: Label value that marks positions excluded from the loss.
        label_map: Optional mapping ``original token id -> new label index``.
            When omitted, the module-level ``reverse_map`` is looked up at call
            time, which is the previous behaviour.
        *args, **kwargs: Forwarded to ``DataCollatorForLanguageModeling``.
    """

    def __init__(
        self,
        *args,
        mlm: bool = False,
        ignore_index: int = -100,
        label_map=None,
        **kwargs,
    ):
        super().__init__(*args, mlm=mlm, **kwargs)
        self.ignore_index = ignore_index
        self.label_map = label_map

    def torch_call(self, examples: List[Union[List[int], Any, Dict[str, Any]]]) -> Dict[str, Any]:
        """Collate ``examples`` and keep a label only on each row's last token.

        Returns:
            The batch produced by the parent collator, with ``batch["labels"]``
            modified in place: every position before the last non-ignored one
            is set to ``ignore_index`` and the last one is replaced by its
            mapped label index.

        Raises:
            IndexError: If a row has no non-ignored label position at all.
            KeyError: If the last label token of a row is not in the label map.
        """
        batch = super().torch_call(examples)
        labels = batch["labels"]
        # Resolve the mapping lazily so the module-level reverse_map may be
        # (re)defined after the collator was constructed.
        label_map = reverse_map if self.label_map is None else self.label_map

        for i in range(len(examples)):
            # Positions that still carry a label after the parent collator ran.
            supervised = (labels[i] != self.ignore_index).nonzero()
            if supervised.numel() == 0:
                raise IndexError(
                    f"Example {i} of the batch has no label token left "
                    f"(every label equals ignore_index={self.ignore_index})."
                )
            last_token_idx = supervised[-1].item()
            token_id = labels[i, last_token_idx].item()
            if token_id not in label_map:
                # The text is expected to end with the class digit. If it does
                # not, the digit was probably cut off or followed by another token.
                raise KeyError(
                    f"Last label token id {token_id} of example {i} is not a class token "
                    f"(known ids: {sorted(label_map)}). The sequence probably does not end "
                    "with the class digit, e.g. because it was truncated; try increasing "
                    "max_seq_length."
                )
            # Set all labels to ignore_index except for the last token.
            labels[i, :last_token_idx] = self.ignore_index
            # Translate the original tokenizer id into the reduced lm_head index.
            labels[i, last_token_idx] = label_map[token_id]

        return batch
collator = DataCollatorForLastTokenLM(tokenizer=tokenizer)

In [None]:
from trl import SFTTrainer, SFTConfig

# Build the supervised fine-tuning trainer: it reads the rendered prompts from
# the "text" column of train_dataset and batches them with the custom
# last-token collator defined above.
# With per_device_train_batch_size = 4 and gradient_accumulation_steps = 8,
# 32 examples per device contribute to each optimizer step.
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = train_dataset,
    max_seq_length = max_seq_length, # Increased max_seq_length
    dataset_num_proc = 1,
    packing = False, # not needed because group_by_length is True
    args = SFTConfig(
        per_device_train_batch_size = 4,
        gradient_accumulation_steps = 8,
        warmup_steps = 10,
        learning_rate = 1e-4,
        # Exactly one of fp16 / bf16 is enabled, depending on what the GPU supports.
        fp16 = not torch.cuda.is_bf16_supported(),
        bf16 = torch.cuda.is_bf16_supported(),
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "cosine",
        seed = 3407,
        output_dir = "outputs",
        num_train_epochs = 1,
        gradient_checkpointing = True,
        # report_to = "wandb",
        report_to = "none",
        group_by_length = True,
    ),
    data_collator=collator,
    dataset_text_field="text",
)

Unsloth: Tokenizing ["text"] (num_proc=6):   0%|          | 0/57465 [00:00<?, ? examples/s]

In [None]:
#@title Show current memory stats
BYTES_PER_GIB = 1024 ** 3  # same divisor as / 1024 / 1024 / 1024

# Properties of GPU 0 and the peak reserved memory so far; the latter is the
# baseline that the post-training cell subtracts.
gpu_stats = torch.cuda.get_device_properties(0)
reserved_bytes = torch.cuda.max_memory_reserved()
start_gpu_memory = round(reserved_bytes / BYTES_PER_GIB, 3)
max_memory = round(gpu_stats.total_memory / BYTES_PER_GIB, 3)

for line in (
    f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.",
    f"{start_gpu_memory} GB of memory reserved.",
):
    print(line)

GPU = Tesla T4. Max memory = 14.741 GB.
3.818 GB of memory reserved.


In [None]:
!export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True

trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 57,465 | Num Epochs = 1 | Total steps = 1,796
O^O/ \_/ \    Batch size per device = 8 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (8 x 4 x 1) = 32
 "-____-"     Trainable parameters = 33,040,384 of 4,055,518,720 (0.81% trained)


AcceleratorError: CUDA error: an illegal memory access was encountered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [None]:
#@title Show final memory and time stats
BYTES_PER_GIB = 1024 ** 3  # same divisor as / 1024 / 1024 / 1024

# Peak reserved memory so far, in GB, and the part that was added on top of the
# baseline captured before training.
peak_reserved_bytes = torch.cuda.max_memory_reserved()
used_memory = round(peak_reserved_bytes / BYTES_PER_GIB, 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)

# Both figures as a percentage of the total GPU memory.
used_percentage = round(used_memory / max_memory * 100, 3)
lora_percentage = round(used_memory_for_lora / max_memory * 100, 3)

for line in (
    f"{trainer_stats.metrics['train_runtime']} seconds used for training.",
    f"{round(trainer_stats.metrics['train_runtime']/60, 2)} minutes used for training.",
    f"Peak reserved memory = {used_memory} GB.",
    f"Peak reserved memory for training = {used_memory_for_lora} GB.",
    f"Peak reserved memory % of max memory = {used_percentage} %.",
    f"Peak reserved memory for training % of max memory = {lora_percentage} %.",
):
    print(line)

In [None]:
# Switch the model to Unsloth's inference mode before generating predictions.
FastLanguageModel.for_inference(model) # Enable native 2x faster inference

In [None]:
# Prepare inference prompt
# Keep the training template up to (not including) the label slot and end it
# with "class ", so that the class digit is what the model has to produce next.
inference_prompt_template = prompt.split("class {}")[0] + "class "

# Load the test set from the same /content directory as the training file.
test_df = pd.read_csv('/content/test.csv')