In [64]:
from unsloth import FastLanguageModel

max_seq_length = 1248 # Choose any! We auto support RoPE Scaling internally!
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.

model_type = "phi-3" # llama, phi-3, gemma

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "gsarti/phi3-mini-rebus-solver-adapters", # MODEL OR ADAPTER FOLDER
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)

==((====))==  Unsloth 2025.3.19: Fast Mistral patching. Transformers: 4.51.3.
   \\   /|    Tesla V100-PCIE-32GB. Num GPUs = 1. Max memory: 31.739 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu124. CUDA: 7.0. CUDA Toolkit: 12.4. Triton: 3.1.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post1. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


In [65]:
stop_token_id = model.config.eos_token_id
if model_type == "gemma":
    stop_token = "<|eot_id|>"
    stop_token_id = tokenizer.encode(stop_token)[0]

In [66]:
from tqdm import tqdm

if model_type == "llama":
    tokenizer.padding_side = "right"
elif model_type in ("phi-3", "gemma"):
    tokenizer.padding_side = "left"

In [67]:
from datasets import load_dataset
eval_dataset = load_dataset('saracandu/eureka-rebus-grpo', data_files = ['train.csv'], split="train")

In [68]:
eval_dataset

Dataset({
    features: ['prompt', 'answer'],
    num_rows: 81318
})

In [69]:
# def transform_to_prompt_completion(dataset):
#     output = []

#     for conversation in dataset["conversations"]:
#         if len(conversation) >= 2:
#             prompt_turn = conversation[0]
#             completion_turn = conversation[1]

#             output.append({
#                 "prompt": [{
#                     "content": prompt_turn["value"],
#                     "role": "user" if prompt_turn["from"] == "human" else prompt_turn["from"]
#                 }],
#                 "answer": [{
#                     "content": completion_turn["value"],
#                     "role": "assistant" if completion_turn["from"] == "gpt" else completion_turn["from"]
#                 }],
#             })

#     return output

In [70]:
# test = transform_to_prompt_completion(eval_dataset)

In [71]:
# class SimpleDataset:
#     def __init__(self, data):
#         self.data = data  # lista di dizionari

#     def __len__(self):
#         return len(self.data)

#     def __getitem__(self, idx):
#         return self.data[idx]

In [72]:
# test_df = SimpleDataset(test)

In [73]:
# test_df

In [74]:
# NOTA: qua dovrai sicuro cambiare un po' di roba
# prompts = []

# for idx in range(len(dataset)):
#     example = dataset[ex_idx]["conversations"][0]
#     inputs = tokenizer.apply_chat_template(
#         [
#             {"role": "user", "content": example["value"]}
#         ],
#         add_generation_prompt=True,
#         return_tensors = "pt",
#         padding=True,
#         truncation=True,
#     )
#     prompts.append(inputs)

# l'idea secondo me è che `inputs` va passato a GRPOTrainer, ma controlla!

In [75]:
# qua a occhio resta tutto tale e quale

from trl import GRPOConfig, GRPOTrainer

max_prompt_length = 256

training_args = GRPOConfig(
    learning_rate=5e-6, # può essere sensato tenerlo piccolo perché è già stato fine-tuned
    adam_beta1=0.9,
    adam_beta2=0.99,
    weight_decay=0.1,
    warmup_ratio=0.1,
    lr_scheduler_type="cosine",
    optim="paged_adamw_8bit", # risparmia in memoria & aumenta la velocità
    logging_steps=1,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=1,  # Increase to 4 for smoother training
    num_generations=6,  # Decrease if out of memory
    max_prompt_length=max_prompt_length,
    max_completion_length=500,
    # num_train_epochs = 1, # Set to 1 for a full training run
    max_steps=20,
    save_steps=20,
    max_grad_norm=0.1,
    report_to="none",  # Can use Weights & Biases
    output_dir="outputs",
)

Unsloth: We now expect `per_device_train_batch_size` to be a multiple of `num_generations`.
We will change the batch size of 1 to the `num_generations` of 6


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [76]:
from reward_funcs import exact_match_solution, perc_correct_words_solution, words_letters_match_primalet, perc_correct_words_defres

In [77]:
trainer = GRPOTrainer(
    model=model,
    processing_class=tokenizer,
    reward_funcs=[exact_match_solution, perc_correct_words_solution, words_letters_match_primalet, perc_correct_words_defres],
    args=training_args,
    train_dataset=eval_dataset,
)

# QUA SECONDO ME DEVI RI-DEFINIRE UN TRAINING DATASET CON LA FORMATTAZIONE DI SOPRA

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [78]:
trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 81,318 | Num Epochs = 1 | Total steps = 20
O^O/ \_/ \    Batch size per device = 6 | Gradient accumulation steps = 1
\        /    Data Parallel GPUs = 1 | Total batch size (6 x 1 x 1) = 6
 "-____-"     Trainable parameters = 29,884,416/4,000,000,000 (0.75% trained)


Step,Training Loss,rewards / exact_match_solution,rewards / perc_correct_words_solution,rewards / words_letters_match_primalet,rewards / perc_correct_words_defres
1,0.2614,0.0,1.0,1.0,1.0
2,6.1512,0.0,1.0,1.0,1.0
3,0.252,0.0,1.0,1.0,1.0
4,1.5999,0.0,1.0,1.0,1.0
5,0.2147,0.0,1.0,1.0,1.0
6,0.1627,0.0,1.0,1.0,1.0
7,0.3572,0.0,1.0,1.0,1.0
8,0.1134,0.0,1.0,1.0,1.0
9,0.1459,0.0,1.0,1.0,1.0
10,0.3439,0.0,1.0,1.0,1.0


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

TrainOutput(global_step=20, training_loss=0.5533585250377655, metrics={'train_runtime': 679.256, 'train_samples_per_second': 0.177, 'train_steps_per_second': 0.029, 'total_flos': 0.0, 'train_loss': 0.5533585250377655})

In [79]:
merged_model = trainer.model.merge_and_unload()



In [80]:
merged_model

MistralForCausalLM(
  (model): MistralModel(
    (embed_tokens): Embedding(32064, 3072, padding_idx=32000)
    (layers): ModuleList(
      (0-31): 32 x MistralDecoderLayer(
        (self_attn): MistralAttention(
          (q_proj): Linear4bit(in_features=3072, out_features=3072, bias=False)
          (k_proj): Linear4bit(in_features=3072, out_features=3072, bias=False)
          (v_proj): Linear4bit(in_features=3072, out_features=3072, bias=False)
          (o_proj): Linear4bit(in_features=3072, out_features=3072, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): MistralMLP(
          (gate_proj): Linear4bit(in_features=3072, out_features=8192, bias=False)
          (up_proj): Linear4bit(in_features=3072, out_features=8192, bias=False)
          (down_proj): Linear4bit(in_features=8192, out_features=3072, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): MistralRMSNorm((3072,), eps=1e-05)
        (post_attention_layernorm): Mi

In [81]:
merged_model.push_to_hub(
    "phi3-mini-test", private=False, tags=["GRPO", "Reasoning-Course"]
)

README.md:   0%|          | 0.00/643 [00:00<?, ?B/s]

  0%|          | 0/1 [00:00<?, ?it/s]

model.safetensors:   0%|          | 0.00/2.26G [00:00<?, ?B/s]

Saved model to https://huggingface.co/phi3-mini-test


In [82]:
tokenizer.push_to_hub("phi3-mini-test")

  0%|          | 0/1 [00:00<?, ?it/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

In [83]:
from unsloth import FastLanguageModel

max_seq_length = 1248 # Choose any! We auto support RoPE Scaling internally!
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.

model_type = "phi-3" # llama, phi-3, gemma

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "saracandu/phi3-mini-test", # MODEL OR ADAPTER FOLDER
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)

==((====))==  Unsloth 2025.3.19: Fast Mistral patching. Transformers: 4.51.3.
   \\   /|    Tesla V100-PCIE-32GB. Num GPUs = 1. Max memory: 31.739 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu124. CUDA: 7.0. CUDA Toolkit: 12.4. Triton: 3.1.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post1. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/2.26G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/3.30k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/3.62M [00:00<?, ?B/s]