In [None]:
%%capture
# The %%capture cell magic above must remain the first line of the cell; it
# suppresses the (verbose) output of everything that follows in this cell.
# Install the libraries used below for loading, quantizing and fine-tuning.
!pip install transformers datasets torch accelerate bitsandbytes peft

# Mount Google Drive into the Colab filesystem at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling
)
from peft import LoraConfig, get_peft_model
import torch
from datetime import datetime


# Fetch only the base model's configuration (no weights) and report the
# context window it advertises.
from transformers import AutoConfig

config = AutoConfig.from_pretrained(
    "Qwen/Qwen2.5-3B",
    trust_remote_code=True,
)
print(f"Model's max context length: {config.max_position_embeddings}")

def prepare_dataset(tokenizer, num_examples=1000, max_length=1024):
    """
    Load Alpaca-cleaned and tokenize it for instruction tuning.

    Every example is turned into a fixed-length record holding three
    parallel lists: ``input_ids``, ``attention_mask`` and ``labels``.
    Prompt and padding positions carry the label -100 (the ignore index of
    the cross-entropy loss), so only response tokens are supervised.

    Args:
        tokenizer: Tokenizer used to encode prompt and response. It must
            expose ``bos_token``, ``eos_token``, ``pad_token_id`` and
            ``eos_token_id``.
        num_examples: Maximum number of training examples to keep, taken
            from the start of the train split.
        max_length: Every record is truncated or padded to exactly this
            many tokens.

    Returns:
        The tokenized dataset with the original text columns removed.

    Raises:
        ValueError: If the tokenizer has neither a pad nor an EOS token id
            to pad with.
    """
    print("Loading and preparing dataset...")
    dataset = load_dataset("yahma/alpaca-cleaned")

    train_split = dataset["train"]
    # Clamp so a split shorter than num_examples does not make select() fail.
    subset_size = min(num_examples, len(train_split))
    small_dataset = train_split.select(range(subset_size))

    # Padded positions are masked out of both attention and loss, so reusing
    # EOS as the pad id is safe when no dedicated pad token exists.
    pad_token_id = tokenizer.pad_token_id
    if pad_token_id is None:
        pad_token_id = tokenizer.eos_token_id
    if pad_token_id is None:
        raise ValueError("Tokenizer defines neither a pad token nor an EOS token")

    def format_instruction(example):
        # NOTE(review): bos_token is interpolated as text, so a tokenizer
        # without a BOS token renders the literal string "None" here.
        # test_model() builds its prompt the same way; the two must be
        # changed together or training and inference prompts will diverge.
        prompt_parts = [
            f"{tokenizer.bos_token}### Instruction: {example['instruction']}\n"
        ]
        if example["input"]:
            prompt_parts.append(f"### Input: {example['input']}\n")
        prompt_parts.append("### Response:")
        instruction = "".join(prompt_parts)
        response = f" {example['output']}{tokenizer.eos_token}"

        # Tokenize prompt and response separately so the prompt length is
        # known exactly and its positions can be excluded from the loss.
        prompt_ids = tokenizer(instruction, add_special_tokens=False)["input_ids"]
        response_ids = tokenizer(response, add_special_tokens=False)["input_ids"]

        # Slicing both truncates over-long examples and yields fresh lists,
        # so the in-place padding below never touches tokenizer output.
        input_ids = (prompt_ids + response_ids)[:max_length]
        labels = ([-100] * len(prompt_ids) + response_ids)[:max_length]
        attention_mask = [1] * len(input_ids)

        pad_count = max_length - len(input_ids)
        input_ids += [pad_token_id] * pad_count
        attention_mask += [0] * pad_count
        labels += [-100] * pad_count

        return {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "labels": labels,
        }

    print("Formatting dataset...")
    # map() on a single split returns a single Dataset (not a DatasetDict),
    # so the result is used directly rather than indexed by split name.
    formatted_dataset = small_dataset.map(
        format_instruction,
        remove_columns=small_dataset.column_names,
    )

    print(f"Dataset prepared with {len(formatted_dataset)} examples")
    return formatted_dataset

def prepare_fine_tuning():
    """
    Fine-tune Qwen2.5-3B with LoRA adapters on the prepared dataset.

    Loads the tokenizer and the 4-bit quantized base model, wraps the model
    with LoRA adapters on the attention projections, tokenizes the training
    data via ``prepare_dataset``, trains with the Hugging Face ``Trainer``
    and saves the result.

    Checkpoints go to ``./qwen_instruct_<timestamp>`` and the final model to
    the same path with a ``_final`` suffix.

    Returns:
        The directory the final model was saved to.
    """
    # Imported here so the function does not depend on the notebook's
    # import cell being edited as well.
    from transformers import BitsAndBytesConfig

    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    output_dir = f"./qwen_instruct_{timestamp}"

    print("Starting fine-tuning preparation...")

    model_id = "Qwen/Qwen2.5-3B"
    print(f"Loading tokenizer from {model_id}...")
    tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)

    print(f"Loading model from {model_id} with 4-bit quantization...")
    # Passing load_in_4bit directly to from_pretrained is deprecated; the
    # supported route is an explicit quantization config.
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        trust_remote_code=True,
        device_map="auto",
        quantization_config=BitsAndBytesConfig(load_in_4bit=True),
        torch_dtype=torch.float16,
    )

    print("Applying LoRA adapters...")
    lora_config = LoraConfig(
        r=8,
        lora_alpha=32,
        target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
        lora_dropout=0.05,
        bias="none",
        task_type="CAUSAL_LM",
    )
    model = get_peft_model(model, lora_config)
    model.print_trainable_parameters()

    tokenized_dataset = prepare_dataset(tokenizer)

    print("Setting up training arguments...")
    training_args = TrainingArguments(
        output_dir=output_dir,
        num_train_epochs=3,
        per_device_train_batch_size=4,
        gradient_accumulation_steps=4,
        save_steps=100,
        save_total_limit=2,
        logging_steps=10,
        learning_rate=2e-5,
        fp16=True,
        warmup_steps=50,
        report_to="none",
    )

    print("Initializing trainer...")
    # No data_collator is passed: prepare_dataset already emits padded
    # input_ids / attention_mask / labels, so no collator-side padding or
    # label construction is wanted.
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_dataset,
    )

    print("Starting training...")
    trainer.train()

    final_output_dir = f"{output_dir}_final"
    print(f"Saving final model to {final_output_dir}")
    trainer.save_model(final_output_dir)
    print("Fine-tuning completed!")
    return final_output_dir

# Run the whole pipeline: load model and data, train, and save the result.
prepare_fine_tuning()

from functools import lru_cache


@lru_cache(maxsize=1)
def _load_finetuned_model(base_model_id, lora_path):
    """Load the 4-bit base model with the LoRA adapter applied, and its tokenizer.

    The most recent (base model, adapter) pair is cached so that repeated
    test_model() calls do not reload the weights every time.
    """
    from peft import PeftModel
    from transformers import BitsAndBytesConfig

    print("Loading model and tokenizer...")
    tokenizer = AutoTokenizer.from_pretrained(base_model_id, trust_remote_code=True)

    # Passing load_in_4bit directly to from_pretrained is deprecated; the
    # supported route is an explicit quantization config.
    model = AutoModelForCausalLM.from_pretrained(
        base_model_id,
        trust_remote_code=True,
        device_map="auto",
        quantization_config=BitsAndBytesConfig(load_in_4bit=True),
        torch_dtype=torch.float16,
    )
    model = PeftModel.from_pretrained(model, lora_path)
    return model, tokenizer


def test_model(
    instruction,
    input_text=None,
    lora_path="/content/qwen_instruct_20250123_041812_final",
):
    """
    Generate and print the fine-tuned model's response to one instruction.

    Args:
        instruction: The task description placed under ``### Instruction:``.
        input_text: Optional extra context placed under ``### Input:``; the
            section is omitted when this is empty or None.
        lora_path: Directory containing the saved LoRA adapter. Defaults to
            the directory of one specific earlier training run.
    """
    base_model_id = "Qwen/Qwen2.5-3B"
    model, tokenizer = _load_finetuned_model(base_model_id, lora_path)

    # The prompt must be built exactly as in prepare_dataset().
    # NOTE(review): bos_token is interpolated as text, so a tokenizer without
    # a BOS token renders the literal string "None"; change both places
    # together if this is ever cleaned up.
    prompt_parts = [f"{tokenizer.bos_token}### Instruction: {instruction}\n"]
    if input_text:
        prompt_parts.append(f"### Input: {input_text}\n")
    prompt_parts.append("### Response:")
    prompt = "".join(prompt_parts)

    print("\nPrompt:", prompt)

    # Training tokenized with add_special_tokens=False, so do the same here
    # to avoid the tokenizer injecting extra special tokens.
    inputs = tokenizer(
        prompt,
        return_tensors="pt",
        add_special_tokens=False,
    ).to(model.device)

    pad_token_id = tokenizer.pad_token_id
    if pad_token_id is None:
        pad_token_id = tokenizer.eos_token_id

    outputs = model.generate(
        **inputs,
        max_new_tokens=512,
        temperature=0.7,
        do_sample=True,  # temperature only has an effect when sampling
        num_return_sequences=1,
        pad_token_id=pad_token_id,
    )

    # The returned sequence starts with the prompt tokens. Decode only what
    # was generated after them, rather than splitting the decoded text on
    # "### Response:", which would cut the answer short if the model emits
    # that marker itself.
    prompt_length = inputs["input_ids"].shape[1]
    generated_ids = outputs[0][prompt_length:]
    response = tokenizer.decode(generated_ids, skip_special_tokens=True).strip()
    print("\nModel Response:", response)


# Smoke-test the fine-tuned model on a handful of prompts. Each entry is
# (banner printed before the run, instruction, optional input text).
_SMOKE_TESTS = [
    ("\nTest 1: Simple instruction", "Write a haiku about programming", None),
    ("\nTest 2: Translation task", "Translate this to French", "Hello, how are you?"),
    ("\nTest 3: Explanation task", "Explain the concept of recursion to a 5 year old", None),
    ("\nTest 4: Creative task", "Write a short story about a robot learning to paint", None),
    (
        "\nTest 5: Problem solving",
        "How would you solve this problem?",
        "I need to sort a list of numbers efficiently",
    ),
]

for _banner, _instruction, _input_text in _SMOKE_TESTS:
    print(_banner)
    test_model(_instruction, _input_text)

# Archive the saved model directory so it can be downloaded as a single file.
# NOTE(review): the directory name is hard-coded to one specific run's
# timestamp; update it to the path printed by prepare_fine_tuning()
# ("Saving final model to ...") before running this cell.
!zip -r /content/my_model.zip /content/qwen_instruct_20250123_041812_final

# Download the archive from the Colab runtime to the local machine.
from google.colab import files
files.download('/content/my_model.zip')