<a href="https://colab.research.google.com/github/gamithasam/notion-qwen2.5-1.5B/blob/main/finetune_qwen2_5_1_5B.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

[Download dataset](https://huggingface.co/spaces/sbhatti2009/NotionGPT/resolve/main/data/finetuning_data_cot_v12.jsonl)

In [2]:
!pip install -q accelerate peft bitsandbytes transformers trl datasets

In [3]:
import torch
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    pipeline
)
from peft import LoraConfig, get_peft_model, TaskType
from trl import SFTTrainer
from datasets import Dataset
import json

In [4]:
print(f"GPU available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"GPU name: {torch.cuda.get_device_name(0)}")
    print(f"GPU memory: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.1f} GB")

GPU available: True
GPU name: Tesla T4
GPU memory: 14.7 GB


In [5]:
model_name = "Qwen/Qwen2.5-1.5B-Instruct"

In [6]:
# Configure 4-bit quantization
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
)

In [7]:
# Load model
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/660 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/3.09G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/242 [00:00<?, ?B/s]

In [8]:
# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    trust_remote_code=True
)

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

In [9]:
# Set padding token
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

In [10]:
# LoRA configuration
lora_config = LoraConfig(
    r=16,  # Rank
    lora_alpha=32,  # Alpha parameter
    target_modules=["q_proj", "v_proj", "k_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
    lora_dropout=0.1,
    bias="none",
    task_type=TaskType.CAUSAL_LM,
)

In [11]:
# Apply LoRA to model
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

trainable params: 18,464,768 || all params: 1,562,179,072 || trainable%: 1.1820


In [12]:
import json

In [13]:
jsonl_file_path = "/content/finetuning_data_cot_v12.jsonl"

In [14]:
def load_jsonl(file_path):
    """Load JSONL file"""
    data = []
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            data.append(json.loads(line.strip()))
    return data

In [15]:
def convert_gpt_to_qwen_format(gpt_sample):
    """Convert GPT format to Qwen2.5 format"""

    # Handle different GPT JSONL formats
    if "messages" in gpt_sample:
        # Format: {"messages": [{"role": "user", "content": "..."}, {"role": "assistant", "content": "..."}]}
        messages = gpt_sample["messages"]

        formatted_text = ""
        for msg in messages:
            if msg["role"] == "user":
                formatted_text += f"<|im_start|>user\n{msg['content']}<|im_end|>\n"
            elif msg["role"] == "assistant":
                formatted_text += f"<|im_start|>assistant\n{msg['content']}<|im_end|>\n"
            elif msg["role"] == "system":
                formatted_text = f"<|im_start|>system\n{msg['content']}<|im_end|>\n" + formatted_text

        return formatted_text.strip()

    elif "prompt" in gpt_sample and "completion" in gpt_sample:
        # Format: {"prompt": "...", "completion": "..."}
        return f"<|im_start|>user\n{gpt_sample['prompt']}<|im_end|>\n<|im_start|>assistant\n{gpt_sample['completion']}<|im_end|>"

    elif "instruction" in gpt_sample:
        # Format: {"instruction": "...", "input": "...", "output": "..."}
        user_content = gpt_sample["instruction"]
        if gpt_sample.get("input", "").strip():
            user_content += f"\n\n{gpt_sample['input']}"

        return f"<|im_start|>user\n{user_content}<|im_end|>\n<|im_start|>assistant\n{gpt_sample['output']}<|im_end|>"

    else:
        raise ValueError(f"Unknown format in sample: {gpt_sample.keys()}")

In [17]:
# Load JSONL data
print("Loading JSONL file...")
training_data = load_jsonl(jsonl_file_path)
print(f"Loaded {len(training_data)} samples")

Loading JSONL file...
Loaded 41 samples


In [18]:
# Show first sample to verify format
print("\nFirst sample from your data:")
print(json.dumps(training_data[0], indent=2))


First sample from your data:
{
  "messages": [
    {
      "role": "system",
      "content": "You are NotionGPT, a state-of-the-art template designer for Notion, programmed to create custom JSON blueprints that represent detailed, organized, and highly functional Notion templates. Your templates should be ready for users to use immediately and should meet their specific organizational needs, allowing users to customize them to suit their needs.\n\nPlease respond ONLY with valid json that conforms to the `OpenAIResponse(BaseModel)` class as defined by pydantic in the Python code below:\n\n```\nfrom __future__ import annotations\n\nfrom enum import Enum\nfrom typing import List, Union, Optional, Dict, Literal, Annotated\n\nfrom pydantic import BaseModel, Field, RootModel, model_validator\n\n\nclass TextStyle(str, Enum):\n\tbold = \"bold\"\n\titalic = \"italic\"\n\tstrikethrough = \"strikethrough\"\n\tunderline = \"underline\"\n\tcode = \"code\"\n\n\nclass Color(str, Enum):\n\tblue = \"

In [19]:
# Convert to Qwen format
print("\nConverting to Qwen2.5 format...")
formatted_data = []
for sample in training_data:
    try:
        formatted_text = convert_gpt_to_qwen_format(sample)
        formatted_data.append({"text": formatted_text})
    except Exception as e:
        print(f"Error processing sample: {e}")
        print(f"Sample: {sample}")
        break

print(f"Successfully converted {len(formatted_data)} samples")


Converting to Qwen2.5 format...
Successfully converted 41 samples


In [20]:
# Show converted format
print("\nConverted sample:")
print(formatted_data[0]["text"])


Converted sample:
<|im_start|>system
You are NotionGPT, a state-of-the-art template designer for Notion, programmed to create custom JSON blueprints that represent detailed, organized, and highly functional Notion templates. Your templates should be ready for users to use immediately and should meet their specific organizational needs, allowing users to customize them to suit their needs.

Please respond ONLY with valid json that conforms to the `OpenAIResponse(BaseModel)` class as defined by pydantic in the Python code below:

```
from __future__ import annotations

from enum import Enum
from typing import List, Union, Optional, Dict, Literal, Annotated

from pydantic import BaseModel, Field, RootModel, model_validator


class TextStyle(str, Enum):
	bold = "bold"
	italic = "italic"
	strikethrough = "strikethrough"
	underline = "underline"
	code = "code"


class Color(str, Enum):
	blue = "blue"
	brown = "brown"
	default = "default"
	gray = "gray"
	green = "green"
	orange = "orange"
	p

In [21]:
# Create dataset
dataset = Dataset.from_list(formatted_data)

In [22]:
# Training arguments
training_args = TrainingArguments(
    output_dir="./qwen2.5-1.5b-finetuned",
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    num_train_epochs=3,
    learning_rate=2e-4,
    logging_steps=10,
    save_steps=100,
    save_total_limit=2,
    warmup_steps=10,
    lr_scheduler_type="cosine",
    optim="paged_adamw_8bit",
    fp16=True,
    push_to_hub=False,  # Set to True if you want to push to HuggingFace Hub
    report_to=[],  # Disable wandb logging completely
    dataloader_pin_memory=False,
)

In [23]:
from transformers import Trainer, DataCollatorForLanguageModeling

In [24]:
def tokenize_function(examples):
    """Tokenize the texts for language modeling"""
    # Tokenize the text
    tokenized = tokenizer(
        examples["text"],
        truncation=True,
        padding=False,
        max_length=512,
        return_tensors="pt"
    )

    # For causal LM, labels are the same as input_ids
    tokenized["labels"] = tokenized["input_ids"].clone()

    return tokenized

In [25]:
# Tokenize the dataset
print("Tokenizing dataset...")
tokenized_dataset = dataset.map(tokenize_function, batched=True)

Tokenizing dataset...


Map:   0%|          | 0/41 [00:00<?, ? examples/s]

In [26]:
# Data collator for language modeling
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,  # We're doing causal LM, not masked LM
    return_tensors="pt",
    pad_to_multiple_of=8,
)

In [27]:
# Create trainer using standard Transformers Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
)

  trainer = Trainer(
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [28]:
# Start training
print("Starting training...")
trainer.train()

Starting training...


Step,Training Loss
10,0.9718
20,0.0984
30,0.0014


TrainOutput(global_step=33, training_loss=0.32478673801277624, metrics={'train_runtime': 56.7049, 'train_samples_per_second': 2.169, 'train_steps_per_second': 0.582, 'total_flos': 502097084153856.0, 'train_loss': 0.32478673801277624, 'epoch': 3.0})

In [29]:
# Save the model
trainer.save_model()
print("Model saved!")

Model saved!


In [31]:
# Merge LoRA weights for inference
model = model.merge_and_unload()



In [32]:
# Create pipeline for testing
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    torch_dtype=torch.float16,
    device_map="auto"
)

Device set to use cuda:0


In [38]:
# Test with a sample prompt
test_prompt = "<|im_start|>user\nGenerate me a detailed and comprehensive Notion page to plan a 2-week vacation to Tokyo and Kyoto.<|im_end|>\n<|im_start|>assistant\n"

result = pipe(
    test_prompt,
    max_length=200,
    do_sample=True,
    temperature=0.7,
    top_p=0.9,
    pad_token_id=tokenizer.eos_token_id
)

print("Generated response:")
print(result[0]['generated_text'][len(test_prompt):])

Both `max_new_tokens` (=256) and `max_length`(=200) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


Generated response:
## Vacation Planning Page: Tokyo and Kyoto

### Overview

This Notion page is designed to help you plan your two-week trip to Japan, combining the vibrant city of Tokyo with the ancient beauty of Kyoto. The goal is to provide a structured approach that covers everything from travel logistics to cultural experiences.

---

### Travel Information

#### Itinerary:
- **Day 1 - Day 2:** Tokyo (6 days)
- **Day 3 - Day 4:** Kyoto (5 days)

---

### Accommodation

#### Tokyo:

- **Hotel A** (Tokyo):
  - Location: Shibuya
  - Amenities: Free Wi-Fi, complimentary breakfast, on-site restaurant.
  
  **Room Type**: Studio apartment
  **Price per night**: ¥10,000 (including taxes).

- **Hotel B** (Yokohama): 
  - Location: Yokohama
  - Amenities: Free Wi-Fi, free parking, complimentary coffee in lobby, on-site restaurant.
  
  **Room Type**: Standard room
  **Price per night**: ¥8,000 (including taxes).

- **Accommodation C** (Kyoto):
  - Location: Kyoto City Center
  - Amenitie

In [37]:
# Before fine-tuning:
base_pipe = pipeline(
    "text-generation",
    model="Qwen/Qwen2.5-1.5B-Instruct",
    tokenizer="Qwen/Qwen2.5-1.5B-Instruct",
    torch_dtype=torch.float16,
    device_map="auto"
)

print("Base model (before fine-tuning):")
print(base_pipe(test_prompt, max_new_tokens=200)[0]["generated_text"])

Device set to use cuda:0


Base model (before fine-tuning):
<|im_start|>user
Generate me a detailed and comprehensive Notion page to plan a 2-week vacation to Tokyo and Kyoto.<|im_end|>
<|im_start|>assistant
# 2-Week Vacation Planning: Tokyo & Kyoto

## Overview

This is the detailed planning guide for a two-week trip to Tokyo and Kyoto, Japan.

## Purpose of Trip

The purpose of this trip is to explore the cultural and historical sites of Tokyo and Kyoto, immerse in local culture, and experience traditional Japanese cuisine and tea ceremonies.

## Budget Breakdown

### Travel Expenses:

1. **Airfare**: $600 (Round-trip from your home city)
2. **Hotel Stay**: $350 (4 nights at a budget hotel in Tokyo or Kyoto)
3. **Transportation**:
   - **Tokyo Metro/Bus**: $20 (for public transportation within Tokyo)
   - **Kyoto Bus/Journey**: $10 (for public transportation within Kyoto)

### Accommodation Costs:
- **Tokyo**: $70 per night (budget accommodation options available)
- **Kyoto**: $80 per night (budget accommodati

In [34]:
# Save the final model
model.save_pretrained("./qwen2.5-1.5b-final")
tokenizer.save_pretrained("./qwen2.5-1.5b-final")
print("Final model saved!")

Final model saved!


In [39]:
!pip install -U huggingface_hub

Collecting huggingface_hub
  Downloading huggingface_hub-0.33.2-py3-none-any.whl.metadata (14 kB)
Downloading huggingface_hub-0.33.2-py3-none-any.whl (515 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m515.4/515.4 kB[0m [31m12.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: huggingface_hub
  Attempting uninstall: huggingface_hub
    Found existing installation: huggingface-hub 0.33.1
    Uninstalling huggingface-hub-0.33.1:
      Successfully uninstalled huggingface-hub-0.33.1
Successfully installed huggingface_hub-0.33.2


In [42]:
from huggingface_hub import login
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [43]:
# Push to HuggingFace
from huggingface_hub import create_repo, upload_folder

# Create a repo
repo_name = "notion-qwen2.5-1.5B"
create_repo(repo_name, private=False)

# Upload the model folder
upload_folder(
    folder_path="/content/qwen2.5-1.5b-final",
    path_in_repo="",
    repo_id=f"gamithasam/{repo_name}",
    commit_message="Upload fine-tuned model"
)


model.safetensors:   0%|          | 0.00/1.14G [00:00<?, ?B/s]

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/gamithasam/notion-qwen2.5-1.5B/commit/e42e8d1fb7db84008facdee2b38b461e8c666f39', commit_message='Upload fine-tuned model', commit_description='', oid='e42e8d1fb7db84008facdee2b38b461e8c666f39', pr_url=None, repo_url=RepoUrl('https://huggingface.co/gamithasam/notion-qwen2.5-1.5B', endpoint='https://huggingface.co', repo_type='model', repo_id='gamithasam/notion-qwen2.5-1.5B'), pr_revision=None, pr_num=None)