In [None]:
%%capture
!pip install transformers datasets torch accelerate bitsandbytes

In [None]:
# Colab-only step: attach Google Drive to the notebook's filesystem.
from google.colab import drive

drive.mount(mountpoint='/content/drive')

In [None]:
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling
)
import torch
from datetime import datetime


In [None]:
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling
)
import torch
from datetime import datetime

In [None]:
def prepare_dataset(tokenizer, num_examples=1000, max_length=512):
    """
    Load "yahma/alpaca-cleaned", render each example as an instruction prompt, and tokenize it.

    Only the leading ``num_examples`` rows of the train split are formatted and
    tokenized, so the cost scales with the subset rather than the full dataset.

    Args:
        tokenizer: Hugging Face tokenizer used to encode the prompts. If it has no
            pad token, its EOS token is assigned as the pad token (the tokenizer is
            modified in place), because padding to a fixed length requires one.
        num_examples: How many leading train examples to keep. Clamped to the size
            of the train split; ``None`` keeps the whole split. Defaults to 1000.
        max_length: Length every sequence is truncated/padded to. Defaults to 512.

    Returns:
        The tokenized subset, containing only the columns produced by the tokenizer.

    Raises:
        ValueError: If ``num_examples`` is negative, or if the tokenizer has neither
            a pad token nor an EOS token to fall back on.
    """
    if num_examples is not None and num_examples < 0:
        raise ValueError(f"num_examples must be non-negative or None, got {num_examples}")

    # padding="max_length" below needs a pad token; fall back to EOS when missing.
    if tokenizer.pad_token is None:
        if tokenizer.eos_token is None:
            raise ValueError(
                "Tokenizer has neither a pad token nor an EOS token; cannot pad sequences."
            )
        tokenizer.pad_token = tokenizer.eos_token

    print("Loading and preparing dataset...")
    dataset = load_dataset("yahma/alpaca-cleaned")
    train_split = dataset["train"]

    # Take the subset *before* formatting so we do not format rows we then discard.
    if num_examples is None:
        subset_size = len(train_split)
    else:
        subset_size = min(num_examples, len(train_split))
    small_dataset = train_split.select(range(subset_size))

    def format_instruction(example):
        # The "### Input" section is emitted only when the example has a non-empty input.
        if example["input"]:
            instruction = (
                f"### Instruction: {example['instruction']}\n"
                f"### Input: {example['input']}\n"
                f"### Response: {example['output']}"
            )
        else:
            instruction = (
                f"### Instruction: {example['instruction']}\n"
                f"### Response: {example['output']}"
            )
        return {"text": instruction}

    print("Formatting dataset...")
    formatted_dataset = small_dataset.map(format_instruction)

    print("Tokenizing dataset...")
    def tokenize_function(examples):
        return tokenizer(
            examples["text"],
            truncation=True,
            max_length=max_length,
            padding="max_length"
        )

    # Drop the raw text columns so only the tokenizer outputs remain.
    tokenized_dataset = formatted_dataset.map(
        tokenize_function,
        batched=True,
        remove_columns=formatted_dataset.column_names
    )

    print(f"Dataset prepared with {len(tokenized_dataset)} examples")
    return tokenized_dataset

In [None]:
def prepare_fine_tuning(model_id="Qwen/Qwen2.5-3B", output_dir=None):
    """
    Load a causal LM in 4-bit, tokenize the instruction dataset, train, and save the result.

    Steps: load the tokenizer and the 4-bit quantized model, build the tokenized
    dataset with ``prepare_dataset``, run ``Trainer.train()``, then write the final
    model and tokenizer to ``f"{output_dir}_final"``.

    Args:
        model_id: Hugging Face Hub id of the model and tokenizer to load.
            Defaults to "Qwen/Qwen2.5-3B".
        output_dir: Directory for training checkpoints. When ``None`` a timestamped
            ``./qwen_instruct_<YYYYmmdd_HHMMSS>`` directory name is generated.

    Returns:
        None. Artifacts are written to disk.
    """
    # Imported here so this cell works without changes to the notebook's import cell.
    from transformers import BitsAndBytesConfig

    if output_dir is None:
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        output_dir = f"./qwen_instruct_{timestamp}"

    print("Starting fine-tuning preparation...")

    print(f"Loading tokenizer from {model_id}...")
    tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)

    print(f"Loading model from {model_id} with 4-bit quantization...")
    # Passing load_in_4bit=True directly to from_pretrained is deprecated; the
    # supported route is an explicit BitsAndBytesConfig via quantization_config.
    # The compute dtype is set to match the float16 dtype requested for the model.
    quantization_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_compute_dtype=torch.float16,
    )
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        trust_remote_code=True,
        device_map="auto",
        quantization_config=quantization_config,
        torch_dtype=torch.float16
    )
    # NOTE(review): Trainer is expected to refuse a purely quantized base model
    # ("attach trainable adapters") - confirm. Attaching adapters (e.g. LoRA)
    # needs the `peft` package, which this notebook neither installs nor imports,
    # so that step is intentionally not added here.

    tokenized_dataset = prepare_dataset(tokenizer)

    print("Setting up training arguments...")
    training_args = TrainingArguments(
        output_dir=output_dir,
        num_train_epochs=3,
        per_device_train_batch_size=4,
        gradient_accumulation_steps=4,
        save_steps=100,
        save_total_limit=2,
        logging_steps=10,
        learning_rate=2e-5,
        fp16=True,
        warmup_steps=50,
        report_to="none",
    )

    print("Initializing trainer...")
    # mlm=False selects causal-LM collation (labels derived from input_ids).
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_dataset,
        data_collator=DataCollatorForLanguageModeling(
            tokenizer=tokenizer,
            mlm=False
        )
    )

    print("Starting training...")
    trainer.train()

    final_output_dir = f"{output_dir}_final"
    print(f"Saving final model to {final_output_dir}")
    trainer.save_model(final_output_dir)
    # The Trainer was not given the tokenizer, so save it explicitly; otherwise the
    # final directory cannot be loaded on its own.
    tokenizer.save_pretrained(final_output_dir)
    print("Fine-tuning completed!")

In [None]:
# Run the whole pipeline defined by prepare_fine_tuning: load the tokenizer and
# model, build the tokenized dataset, train, and save the final model.
prepare_fine_tuning()