In [1]:
%%capture
# Install Unsloth and its dependencies. The %%capture cell magic above hides the pip output.
import os
# A Colab runtime is detected by looking for any environment variable whose name contains "COLAB_".
if "COLAB_" not in "".join(os.environ.keys()):
    !pip install unsloth
else:
    # Do this only in Colab notebooks! Otherwise use pip install unsloth
    # Most packages are installed with --no-deps (presumably so that Colab's preinstalled
    # versions are not replaced -- TODO confirm); xformers is pinned to 0.0.29.post3.
    !pip install --no-deps bitsandbytes accelerate xformers==0.0.29.post3 peft trl triton cut_cross_entropy unsloth_zoo
    !pip install sentencepiece protobuf "datasets>=3.4.1" huggingface_hub hf_transfer
    !pip install --no-deps unsloth

In [2]:
from unsloth import FastLanguageModel
import torch

# Load the "unsloth/Qwen3-1.7B" checkpoint and its tokenizer through Unsloth.
# This run requests 8-bit loading (load_in_8bit = True) with 4-bit loading and
# full fine-tuning both switched off, and a 4096-token maximum sequence length.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Qwen3-1.7B",
    max_seq_length = 4096,   # Context length - can be longer, but uses more memory
    load_in_4bit = False,     # 4bit uses much less memory
    load_in_8bit = True,    # A bit more accurate, uses 2x memory
    full_finetuning = False, # We have full finetuning now!
    # token = "hf_...",      # use one if using gated models
)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2025.6.5: Fast Qwen3 patching. Transformers: 4.52.4.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 7.5. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/3.44G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/237 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/5.43k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/707 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/614 [00:00<?, ?B/s]

chat_template.jinja:   0%|          | 0.00/4.76k [00:00<?, ?B/s]

In [3]:
# Wrap the loaded model with LoRA adapters (rank 32, alpha 32, no dropout, no bias terms)
# attached to the seven projection modules listed in target_modules.
# Note: `model` is rebound to the PEFT-wrapped model, so re-running this cell without
# reloading the base model would wrap an already wrapped model.
model = FastLanguageModel.get_peft_model(
    model,
    r = 32,           # Choose any number > 0! Suggested 8, 16, 32, 64, 128
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 32,  # Best to choose alpha = rank or rank*2
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = 3407,
    use_rslora = False,   # We support rank stabilized LoRA
    loftq_config = None,  # And LoftQ
)

Unsloth: Making `model.base_model.model.model` require gradients


In [4]:
import csv
import json

def csv_to_conversations(csv_path: str, json_path: str):
    """
    Convert a CSV of article/answer pairs into a chat-style JSON file.

    Reads a CSV with columns 'article' and 'ground_truth' and writes a JSON structure:
    {"conversations": [ [ {"role":"user",...}, {"role":"assistant",...} ], ... ]}

    Each CSV row becomes one two-message conversation: the stripped 'article'
    text as the user turn and the stripped 'ground_truth' text as the
    assistant turn. A confirmation line is printed once the file is written.

    Args:
        csv_path: Path of the input CSV (UTF-8, with or without a BOM).
        json_path: Path of the JSON file to create or overwrite.

    Raises:
        KeyError: If a row has no 'article' or 'ground_truth' column.
    """
    # newline='' hands line-ending handling to the csv module, so "\r\n" inside
    # quoted fields is preserved instead of being rewritten to "\n".
    # utf-8-sig drops a leading byte-order mark (common in spreadsheet exports),
    # which would otherwise turn the first header into '\ufeffarticle'.
    with open(csv_path, encoding='utf-8-sig', newline='') as fin:
        convs = [
            [
                {"role": "user", "content": row['article'].strip()},
                {"role": "assistant", "content": row['ground_truth'].strip()},
            ]
            for row in csv.DictReader(fin)
        ]
    with open(json_path, 'w', encoding='utf-8') as fout:
        # ensure_ascii=False keeps non-ASCII text readable in the output file.
        json.dump({"conversations": convs}, fout, ensure_ascii=False, indent=2)
    print(f"✅ Saved as: {json_path}")

# Convert the labelled CSV into qa_dataset.json, which the chat-template cell below reads back.
csv_to_conversations(csv_path='zero_shot_dataset_by_gpt.csv', json_path='qa_dataset.json')

✅ Saved as: qa_dataset.json


In [5]:
from unsloth.chat_templates import standardize_sharegpt

In [6]:
# Reload the conversations written by csv_to_conversations.
with open("qa_dataset.json", encoding='utf-8') as f:
    dataset = json.load(f)

# Render every conversation with the tokenizer's chat template.
# tokenize = False keeps the result as text rather than token ids;
# enable_thinking=False is forwarded to the chat template.
full_ds = tokenizer.apply_chat_template(
    dataset["conversations"],
    tokenize = False,
    enable_thinking=False
)

In [7]:
# Inspect the first rendered conversation to check the chat-template output.
full_ds[0]

"<|im_start|>user\nInvesting com   Apple led tech lower Friday after Morgan Stanley jumped on the bandwagon of Apple price cuts on fears over a slowing smartphone market in China \nMorgan Stanley cut its price target on Apple  NASDAQ AAPL  to  236 from  253  citing supply chain conversations in Asia that revealed a weaker backdrop for smartphone demand  That was the third Apple price cut this week following cuts by both Rosenblatt and HSBC  sending Apple s share price more than 2  lower \nRising average selling prices and better smartphone quality are leading people to keep their current devices for longer  lengthening replacement cycles  which hurt smartphone makers like Apple  Morgan Stanley said \nThe Wall Street bank did  however  provide a semblance of optimism as it said wearables and services revenues could help limit the damage somewhat from the weaker iPhone demand \nIn a further sign that Apple s newest batch of iPhones are not selling as well as many had hoped  Broadcom s CE

In [8]:
# Number of rendered conversations available for the train/test split.
print(len(full_ds))

250


In [9]:
import pandas as pd

In [10]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rendered conversations for evaluation.
# The fixed random_state makes the split reproducible across runs.
train_ds, test_ds = train_test_split(full_ds, test_size=0.2, random_state=42)

print(f"Train dataset size: {len(train_ds)}")
print(f"Test dataset size: {len(test_ds)}")

Train dataset size: 200
Test dataset size: 50


In [11]:
from datasets import Dataset

# Put each split into a pandas Series named "text", so the resulting
# single-column frame matches the dataset_text_field used by the trainer.
train_data = pd.Series(train_ds, name="text")
val_data = pd.Series(test_ds, name="text")

# Convert each Series to a one-column DataFrame, wrap it in a Hugging Face
# Dataset, and shuffle it with a fixed seed so the order is reproducible.
final_train_data = Dataset.from_pandas(train_data.to_frame()).shuffle(seed=3407)
final_val_data = Dataset.from_pandas(val_data.to_frame()).shuffle(seed=3407)

In [12]:
from trl import SFTTrainer, SFTConfig
# Build the supervised fine-tuning trainer on the "text" column of the prepared datasets.
# Effective batch size is 2 (per device) x 4 (gradient accumulation) = 8 sequences per step.
# NOTE(review): eval_dataset is passed, but no evaluation schedule is configured here and the
# training log reports only "Training Loss" -- presumably evaluation never runs; confirm
# whether an eval strategy was intended.
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = final_train_data,
    eval_dataset = final_val_data,
    args = SFTConfig(
        dataset_text_field = "text",
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4, # Use GA to mimic batch size!
        warmup_steps = 5,
        # num_train_epochs = 1, # Set this for 1 full training run.
        max_steps = 30,
        learning_rate = 2e-4, # Reduce to 2e-5 for long training runs
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        report_to = "none", # Use this for WandB etc
    ),
)

average_tokens_across_devices is set to True but it is invalid when world size is1. Turn it to False automatically.


Unsloth: Tokenizing ["text"] (num_proc=2):   0%|          | 0/200 [00:00<?, ? examples/s]

Unsloth: Tokenizing ["text"] (num_proc=2):   0%|          | 0/50 [00:00<?, ? examples/s]

In [13]:
# Run fine-tuning; the object returned by train() is kept in trainer_stats.
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 200 | Num Epochs = 2 | Total steps = 30
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 34,865,152/1,755,440,128 (1.99% trained)
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Step,Training Loss
1,2.9356
2,2.9994
3,2.8637
4,2.8342
5,2.8638
6,2.7956
7,2.4686
8,2.6568
9,2.6842
10,2.4779


At this stage I was unable to save the fine-tuned model (the export attempts in the cells below fail), and I did not have time to investigate further before the deadline, so the comparison covers only the zero-shot and few-shot setups. In theory, it might have been possible to save the model as-is and convert it to GGUF using an alternative method.

It would also be good to compile an additional dataset with reasoning in the answers to help Qwen maintain its reasoning capabilities. However, I wasn't able to quickly research and obtain such a dataset within the tight timeframe. Additionally, when making API calls to the proprietary Qwen3, these reasoning tokens aren't returned in the responses.

In [15]:
# Attempt to export the fine-tuned model to GGUF with Q8_0 quantization.
# NOTE(review): in the recorded run this call fails with
# "TypeError: save_to_gguf_generic() got an unexpected keyword argument 'quantization_method'".
# The keyword name (and the positional tokenizer argument) should be checked against the
# signature exposed by the installed Unsloth version -- TODO confirm.
model.save_pretrained_gguf("model", tokenizer, quantization_method='Q8_0')

TypeError: save_to_gguf_generic() got an unexpected keyword argument 'quantization_method'

In [16]:
# Attempt to merge the LoRA weights into the base model and save the result to "model".
# NOTE(review): in the recorded run this fails during "Merging weights into 16bit" with a
# RuntimeError about mmap / torch.save serialization. Whether "merged_8bit" is a supported
# save_method for this Unsloth version is unverified -- TODO confirm.
model.save_pretrained_merged("model", tokenizer, save_method = "merged_8bit",)

Found HuggingFace hub cache directory: /root/.cache/huggingface/hub
Checking cache directory for required files...
Successfully copied all 1 files from cache to model.


Unsloth: Merging weights into 16bit:   0%|          | 0/1 [00:16<?, ?it/s]


RuntimeError: mmap can only be used with files saved with `torch.save(_use_new_zipfile_serialization=True), please torch.save your checkpoint with this option in order to use mmap.

In [None]:
import os

# Upload the GGUF export to the Hugging Face Hub repository below.
# The access token is read from the HF_TOKEN environment variable rather than
# being pasted into the notebook, so the credential never lands in a saved or
# shared copy. If the variable is unset, None is passed instead of an empty string.
# NOTE(review): the tokenizer is passed positionally, as in the original cell --
# confirm the argument order against the installed Unsloth version.
model.push_to_hub_gguf(
    tokenizer,
    token=os.environ.get("HF_TOKEN"),
    repo_id="danissimo/Qwen3-1.7B-GGUF-8bit-Q8_0",
)