In [1]:
import torch
from unsloth import FastLanguageModel

max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!
dtype = None          # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = False # Use 4bit quantization to reduce memory usage. Can be False.

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/DeepSeek-R1-Distill-Qwen-14B",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


  from .autonotebook import tqdm as notebook_tqdm


🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2025.3.14: Fast Qwen2 patching. Transformers: 4.49.0.
   \\   /|    NVIDIA H100 80GB HBM3. Num GPUs = 1. Max memory: 79.109 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 9.0. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Loading checkpoint shards: 100%|██████████| 6/6 [00:06<00:00,  1.06s/it]


In [2]:
model = FastLanguageModel.get_peft_model(
    model,
    r = 16, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 16,
    lora_dropout = 0,    # Supports any, but = 0 is optimized
    bias = "none",       # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = 3407,
    use_rslora = False,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
)

Unsloth 2025.3.14 patched 48 layers with 48 QKV layers, 48 O layers and 48 MLP layers.


In [4]:


from datasets import load_dataset,concatenate_datasets
from unsloth.chat_templates import get_chat_template
from unsloth.chat_templates import standardize_sharegpt

tokenizer = get_chat_template(
    tokenizer,
    chat_template = "qwen-2.5",
)

def formatting_prompts_func(examples):
    convo = examples["messages"]
    text = tokenizer.apply_chat_template(convo, tokenize=False, add_generation_prompt=False)
    return { "text" : text }

dataset = load_dataset("lordtachanka/leetcode_cot_fixed", split="train")
#dataset = standardize_sharegpt(dataset)
dataset = dataset.map(formatting_prompts_func).select_columns(["text"])

cf_ds = load_dataset("open-r1/codeforces-cots",split="train")
cf_ds = cf_ds.map(formatting_prompts_func).select_columns(["text"])#.shuffle(seed=42).select(range(5000))
dataset = concatenate_datasets([dataset,cf_ds])
print(len(dataset))


Map: 100%|██████████| 100/100 [00:00<00:00, 2529.20 examples/s]
Map: 100%|██████████| 47780/47780 [00:28<00:00, 1697.63 examples/s]


47880


In [5]:
print(dataset[500])

{'text': '<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n<|im_start|>user\nYou will be given a competitive programming problem. Please reason step by step about the solution, then provide a complete implementation in C++17.\n\nYour solution must read input from standard input (cin), write output to standard output (cout).\nDo not include any debug prints or additional output.\n\nPut your final solution within a single code block:\n```cpp\n<your code here>\n```\n\n# Problem\n\nGerald has n younger brothers and their number happens to be even. One day he bought n2 candy bags. One bag has one candy, one bag has two candies, one bag has three candies and so on. In fact, for each integer k from 1 to n2 he has exactly one bag with k candies.\n\nHelp him give n bags of candies to each brother so that all brothers got the same number of candies.\n\nExecution time limit: 1.0 seconds\nMemory limit: 256.0 MB\n\n## Input Format\nThe single line 

In [6]:
from trl import SFTTrainer
from transformers import TrainingArguments, DataCollatorForSeq2Seq
from unsloth import is_bfloat16_supported

trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    data_collator = DataCollatorForSeq2Seq(tokenizer = tokenizer),
    dataset_num_proc = 2,
    packing = False, # Can make training 5x faster for short sequences.
    args = TrainingArguments(
        per_device_train_batch_size = 1,
        gradient_accumulation_steps = 4, # Fixed major bug in latest Unsloth
        warmup_steps = 5,
        #num_train_epochs = 1, # Set this for 1 full training run.
        max_steps = 60,
        learning_rate = 2e-4,
        fp16 = not is_bfloat16_supported(),
        bf16 = is_bfloat16_supported(),
        logging_steps = 1,
        optim = "paged_adamw_8bit", # Save more memory
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "outputs",
        report_to = "none", # Use this for WandB etc
    ),
)


Unsloth: Tokenizing ["text"] (num_proc=2): 100%|██████████| 47880/47880 [08:11<00:00, 97.34 examples/s] 


In [7]:
tokenizer.decode(trainer.train_dataset[5]["input_ids"])

'<｜begin▁of▁sentence｜><|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n<|im_start|>user\n\n            You will be given a competitive programming problem. Please reason step by step about the solution, then provide a complete implementation in C++17.\n\n            If starter code is provided below, fill in the starter code; \n            otherwise, create a Solution class and fill it in. Your solution must be a function that takes the input as arguments, and returns the answer.\n\n            Put your final solution within a single code block:\n            ```cpp\n            <your code here>\n            ```\n            Only output the final solution code.\n            # Problem\n            \nYou are given a string `s`. Reorder the string using the following algorithm:\n\n1.  Pick the **smallest** character from `s` and **append** it to the result.\n2.  Pick the **smallest** character from `s` which is greater than the last append

In [10]:
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 47,880 | Num Epochs = 1 | Total steps = 60
O^O/ \_/ \    Batch size per device = 1 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (1 x 4 x 1) = 4
 "-____-"     Trainable parameters = 68,812,800/14,838,846,464 (0.46% trained)


Step,Training Loss
1,0.7009
2,0.6663
3,0.7174
4,0.737
5,0.64
6,0.6897
7,0.7383
8,0.6657
9,0.6233
10,0.6988


In [None]:
used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
used_percentage = round(used_memory         /max_memory*100, 3)
lora_percentage = round(used_memory_for_lora/max_memory*100, 3)

print(f"{trainer_stats.metrics['train_runtime']} seconds used for training.")
print(f"{round(trainer_stats.metrics['train_runtime']/60, 2)} minutes used for training.")
print(f"Peak reserved memory = {used_memory} GB.")
print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
print(f"Peak reserved memory % of max memory = {used_percentage} %.")
print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")

NameError: name 'start_gpu_memory' is not defined

In [11]:
from unsloth.chat_templates import get_chat_template

tokenizer = get_chat_template(
    tokenizer,
    chat_template = "qwen-2.5",
)
FastLanguageModel.for_inference(model) # Enable native 2x faster inference
stmt = R"""
You are given an undirected tree rooted at node 0, with n nodes numbered from 0 to n - 1. This is represented by a 2D array edges of length n - 1, where edges[i] = [ui, vi, lengthi] indicates an edge between nodes ui and vi with length lengthi. You are also given an integer array nums, where nums[i] represents the value at node i.

A special path is defined as a downward path from an ancestor node to a descendant node in which all node values are distinct, except for at most one value that may appear twice.

Return an array result of size 2, where result[0] is the length of the longest special path, and result[1] is the minimum number of nodes in all possible longest special paths.

 

Example 1:

Input: edges = [[0,1,1],[1,2,3],[1,3,1],[2,4,6],[4,7,2],[3,5,2],[3,6,5],[6,8,3]], nums = [1,1,0,3,1,2,1,1,0]

Output: [9,3]

Explanation:

In the image below, nodes are colored by their corresponding values in nums.

The longest special paths are 1 -> 2 -> 4 and 1 -> 3 -> 6 -> 8, both having a length of 9. The minimum number of nodes across all longest special paths is 3.

Example 2:

Input: edges = [[1,0,3],[0,2,4],[0,3,5]], nums = [1,1,0,2]

Output: [5,2]

Explanation:

The longest path is 0 -> 3 consisting of 2 nodes with a length of 5.

 

Constraints:

    2 <= n <= 5 * 104
    edges.length == n - 1
    edges[i].length == 3
    0 <= ui, vi < n
    1 <= lengthi <= 103
    nums.length == n
    0 <= nums[i] <= 5 * 104
    The input is generated such that edges represents a valid tree.


"""
template = R"""
class Solution {
public:
    vector<int> longestSpecialPath(vector<vector<int>>& edges, vector<int>& nums) {
        
    }
};
"""
messages = [
    {"role": "user", "content": stmt + "Solve the problem in C++ using this template: " + template},
]
inputs = tokenizer.apply_chat_template(
    messages,
    tokenize = True,
    add_generation_prompt = True, # Must add for generation
    return_tensors = "pt",
).to("cuda")

from transformers import TextStreamer
text_streamer = TextStreamer(tokenizer, skip_prompt = True)
_ = model.generate(input_ids = inputs, streamer = text_streamer, max_new_tokens = 32768,
                   use_cache = True, temperature = 0.6, top_p = 0.95)

<think>


Okay, I need to solve this problem where I'm given a tree and each node has a value. The goal is to find the longest special path. A special path is a downward path where all node values are distinct except for at most one value that appears twice. Then, return the length of the longest path and the minimum number of nodes in all possible longest paths.

Hmm. Let's think about how to approach this. The problem seems a bit complex, but I'll break it down.

First, the tree is rooted at 0. So each node has a parent except the root. The edges are given in a way that connects the nodes, but not necessarily in a parent-child order. So I need to build the tree structure, determining the parent of each node.

Once the tree is built, for each node, I need to explore all possible paths downward, keeping track of the values. Since the path is downward, each node can only be visited once in a path, so the path is a simple path from a node to one of its descendants.

The key is that in the path, al

KeyboardInterrupt: 

In [None]:
model.push_to_hub_gguf("lordtachanka/DeepSeek-R1-Distill-Qwen-14B-leet",tokenizer,quantization_method = ["q8_0","q4_k_m"],token="")