# Fine-tuned LoRA Model Playground

This notebook allows you to interact with your fine-tuned LoRA model and test its responses.

In [None]:
# Install required packages if not already installed.
# NOTE(review): `uv add` presumably records these as dependencies of the
# surrounding uv project; confirm the notebook kernel runs in that environment.
!uv add transformers torch accelerate python-dotenv peft

In [None]:
import os
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel, PeftConfig
import torch

# Load environment variables
from dotenv import load_dotenv
load_dotenv()

hf_token = os.environ.get("HUGGINGFACE_TOKEN")

# Report whether a token was found; nothing is raised when it is missing
if not hf_token:
    print("⚠️  Warning: HUGGINGFACE_TOKEN not found in environment variables")
    print("   Private models may not load properly")
else:
    print("✅ Hugging Face token loaded successfully")

In [None]:
# Model configuration: identifiers handed to the `from_pretrained` loaders below
ADAPTER_NAME = "gOsuzu/RickQwen2.5-7B"  # LoRA adapter applied on top of the base model
BASE_MODEL = "Qwen/Qwen2.5-7B-Instruct"  # Base model (also the tokenizer source)

# Echo the configuration so the cell output records what is about to be loaded
print(
    f"LoRA adapter: {ADAPTER_NAME}",
    f"Base model: {BASE_MODEL}",
    sep="\n",
)

In [None]:
def load_lora_model(adapter_name, base_model_name, device_map="auto", token=None):
    """Load a base causal LM, wrap it with a LoRA adapter, and load its tokenizer.

    The base weights are loaded in float16 onto the CPU, the adapter is attached,
    and the combined model is moved to CUDA when a GPU is available.

    Args:
        adapter_name: Adapter identifier passed to ``PeftModel.from_pretrained``.
        base_model_name: Base model identifier; also used to load the tokenizer.
        device_map: Accepted but not consulted by this function; the base model
            is always loaded with ``device_map="cpu"``.
        token: Optional access token forwarded to every ``from_pretrained`` call.

    Returns:
        ``(model, tokenizer)`` on success, or ``(None, None)`` if any step
        raised. The error is printed rather than propagated.
    """
    try:
        print(f"Loading base model: {base_model_name}")
        # Load on CPU first to avoid device mapping issues
        backbone = AutoModelForCausalLM.from_pretrained(
            base_model_name,
            torch_dtype=torch.float16,
            device_map="cpu",
            trust_remote_code=True,
            token=token,
        )

        print(f"Loading LoRA adapter: {adapter_name}")
        lora_model = PeftModel.from_pretrained(backbone, adapter_name, token=token)

        # Only relocate the weights when CUDA is actually present
        gpu_ready = torch.cuda.is_available()
        if gpu_ready:
            print("Moving model to GPU...")
            lora_model = lora_model.to("cuda")

        # The tokenizer comes from the base model, not from the adapter
        tok = AutoTokenizer.from_pretrained(
            base_model_name,
            trust_remote_code=True,
            token=token,
        )

        # Reuse EOS as the padding token when none is configured
        if tok.pad_token is None:
            tok.pad_token = tok.eos_token

        print("✅ LoRA model loaded successfully!")
        return lora_model, tok

    except Exception as err:
        print(f"❌ Error loading LoRA model: {err}")
        return None, None

In [None]:
# Primary path: the helper reports failure by returning (None, None)
finetuned_model, finetuned_tokenizer = load_lora_model(
    ADAPTER_NAME,
    BASE_MODEL,
    token=hf_token,
)

# Fallback path: repeat the load inline, this time passing no device_map argument
if finetuned_model is None:
    print("\n🔄 Trying alternative loading method...")
    try:
        base_model = AutoModelForCausalLM.from_pretrained(
            BASE_MODEL,
            torch_dtype=torch.float16,
            trust_remote_code=True,
            token=hf_token,
        )

        # Attach the LoRA adapter to the freshly loaded base model
        model = PeftModel.from_pretrained(base_model, ADAPTER_NAME, token=hf_token)

        # Move to GPU when one is available
        if torch.cuda.is_available():
            model = model.cuda()

        # Tokenizer comes from the base model
        tokenizer = AutoTokenizer.from_pretrained(
            BASE_MODEL,
            trust_remote_code=True,
            token=hf_token,
        )

        # Reuse EOS as the padding token when none is configured
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token

        # Publish the results only after every step above succeeded
        finetuned_model, finetuned_tokenizer = model, tokenizer
        print("✅ Alternative loading method successful!")

    except Exception as err:
        print(f"❌ Alternative loading also failed: {err}")

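# Load base model for comparison (optional)
# NOTE: `load_model` is not defined in this notebook; add a base-model loader before enabling the line below.
# base_model, base_tokenizer = load_model(BASE_MODEL)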

In [None]:
def generate_response(model, tokenizer, prompt, max_length=512, temperature=0.7):
    """Generate a completion for ``prompt`` and return only the newly generated text.

    Args:
        model: Model exposing ``parameters()`` and ``generate(...)``.
        tokenizer: Callable tokenizer exposing ``decode`` and ``eos_token_id``.
        prompt: Text to condition on; tokenization truncates it to 2048 tokens.
        max_length: Maximum number of *new* tokens to generate (forwarded as
            ``max_new_tokens``; the prompt length is not counted).
        temperature: Sampling temperature. A value that is ``None`` or ``<= 0``
            selects greedy decoding instead of sampling.

    Returns:
        The generated continuation with surrounding whitespace stripped, or
        ``None`` if tokenization/generation raised (the error is printed).
    """
    try:
        # Tokenize input
        inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=2048)

        # Move to same device as model
        device = next(model.parameters()).device
        inputs = {k: v.to(device) for k, v in inputs.items()}

        # Remember how many tokens belong to the prompt so they can be dropped later
        prompt_token_count = inputs["input_ids"].shape[-1]

        generation_kwargs = {
            "max_new_tokens": max_length,
            "pad_token_id": tokenizer.eos_token_id,
            "eos_token_id": tokenizer.eos_token_id,
            "use_cache": True,
            # Beam-search-only options (early_stopping, length_penalty) are
            # intentionally not passed: with num_beams=1 they have no effect.
            "num_beams": 1,
            "no_repeat_ngram_size": 3,
        }
        if temperature is not None and temperature > 0:
            generation_kwargs["do_sample"] = True
            generation_kwargs["temperature"] = temperature
        else:
            # Sampling needs a strictly positive temperature; fall back to greedy
            generation_kwargs["do_sample"] = False

        # Generate response
        with torch.no_grad():
            outputs = model.generate(**inputs, **generation_kwargs)

        # Keep only the generated tokens. Slicing the decoded *string* by
        # len(prompt) is wrong whenever decoding drops special tokens or
        # normalizes text, because the decoded prompt is then not the same
        # length as the original prompt and part of the answer gets cut off.
        new_tokens = outputs[0][prompt_token_count:]
        response = tokenizer.decode(new_tokens, skip_special_tokens=True).strip()

        return response

    except Exception as e:
        print(f"❌ Error generating response: {e}")
        return None

In [None]:
# Added as a separate cell: chat helper that uses greedy decoding
def chat_with_model_safe(model, tokenizer, system_prompt="", user_input=""):
    """Run one chat turn with greedy decoding and return the assistant's reply.

    The system and user messages are wrapped in a ChatML-style prompt
    (``<|im_start|>`` / ``<|im_end|>`` markers) that ends with an open
    assistant turn.

    Args:
        model: Model exposing ``parameters()`` and ``generate(...)``.
        tokenizer: Callable tokenizer exposing ``decode`` and ``eos_token_id``.
        system_prompt: Text placed in the system turn (may be empty).
        user_input: Text placed in the user turn.

    Returns:
        The generated reply with surrounding whitespace stripped, or ``None``
        if tokenization/generation raised (the error is printed).
    """
    # Create chat template
    chat_template = (
        "<|im_start|>system\n"
        f"{system_prompt}<|im_end|>\n"
        "<|im_start|>user\n"
        f"{user_input}<|im_end|>\n"
        "<|im_start|>assistant\n"
    )

    try:
        # Tokenize input
        inputs = tokenizer(chat_template, return_tensors="pt", truncation=True, max_length=2048)

        # Move to same device as model
        device = next(model.parameters()).device
        inputs = {k: v.to(device) for k, v in inputs.items()}

        # Remember how many tokens belong to the prompt so they can be dropped later
        prompt_token_count = inputs["input_ids"].shape[-1]

        # Generate with greedy decoding (no sampling involved)
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=512,
                do_sample=False,
                pad_token_id=tokenizer.eos_token_id,
                eos_token_id=tokenizer.eos_token_id
            )

        # Decode only the generated tokens. The prompt contains chat markers
        # that are removed by skip_special_tokens=True, so slicing the decoded
        # string by len(chat_template) would cut off the start of the reply.
        new_tokens = outputs[0][prompt_token_count:]
        response = tokenizer.decode(new_tokens, skip_special_tokens=True).strip()

        return response

    except Exception as e:
        print(f"❌ Error: {e}")
        return None

# Test run: send one prompt through the greedy-decoding chat helper
if finetuned_model and finetuned_tokenizer:
    print("🤖 Rick's Response (Safe Method):")

    system_prompt = "You are Rick Sanchez from Rick and Morty. Respond in Rick's style."
    user_input = "Tell me about interdimensional travel."
    response = chat_with_model_safe(
        model=finetuned_model,
        tokenizer=finetuned_tokenizer,
        system_prompt=system_prompt,
        user_input=user_input,
    )
    print(response)

## Test Your Fine-tuned LoRA Model

Now you can test your fine-tuned LoRA model with different prompts!

In [None]:
# Test 1: Rick and Morty style conversation
if finetuned_model and finetuned_tokenizer:
    system_prompt = "You are Rick Sanchez from Rick and Morty. Respond in Rick's style with his characteristic personality."
    user_input = "Tell me about interdimensional travel."

    print("🤖 Rick's Response:")
    # `chat_with_model` is not defined anywhere in this notebook; use the chat
    # helper that does exist so the cell runs on a fresh kernel.
    response = chat_with_model_safe(finetuned_model, finetuned_tokenizer, system_prompt, user_input)
    print(response)
    print("\n" + "="*50 + "\n")

In [None]:
# Test 2: Another Rick and Morty prompt
if finetuned_model and finetuned_tokenizer:
    system_prompt = "You are Rick Sanchez. Be sarcastic and brilliant like Rick."
    user_input = "What do you think about school?"

    print("🤖 Rick's Response:")
    # `chat_with_model` is not defined anywhere in this notebook; use the chat
    # helper that does exist so the cell runs on a fresh kernel.
    response = chat_with_model_safe(finetuned_model, finetuned_tokenizer, system_prompt, user_input)
    print(response)
    print("\n" + "="*50 + "\n")

In [None]:
# Test 3: Interactive chat
def interactive_chat():
    """Run a read-reply loop against the loaded fine-tuned model.

    Reads the notebook-level ``finetuned_model`` and ``finetuned_tokenizer``.
    Each line typed by the user is sent, together with a fixed Rick Sanchez
    system prompt, to ``chat_with_model_safe`` and the reply is printed.

    The loop ends when the user types ``quit``, ``exit`` or ``q`` (case and
    surrounding whitespace ignored), or when input is interrupted or closed.

    Returns:
        None. Prints a message and returns immediately if no model is loaded.
    """
    if not finetuned_model or not finetuned_tokenizer:
        print("❌ Model not loaded!")
        return

    system_prompt = "You are Rick Sanchez from Rick and Morty. Respond in Rick's style."

    print("🤖 Interactive Chat with Rick (type 'quit' to exit)")
    print("="*50)

    while True:
        try:
            user_input = input("\nYou: ")
        except (EOFError, KeyboardInterrupt):
            # Closed stdin or a kernel interrupt: leave the loop cleanly
            # instead of surfacing a traceback.
            print("\n👋 Goodbye!")
            break

        if user_input.strip().lower() in ['quit', 'exit', 'q']:
            print("👋 Goodbye!")
            break

        # `chat_with_model` is not defined anywhere in this notebook; use the
        # chat helper that does exist.
        response = chat_with_model_safe(finetuned_model, finetuned_tokenizer, system_prompt, user_input)
        print(f"\nRick: {response}")

# Uncomment the line below to start interactive chat
# interactive_chat()

## Model Information

Display information about your fine-tuned LoRA model.

In [None]:
if finetuned_model:
    print("📊 LoRA Model Information", "=" * 30, sep="\n")

    # Count every parameter element, then only those still flagged for gradients
    total_params = sum(tensor.numel() for tensor in finetuned_model.parameters())
    trainable_params = sum(
        tensor.numel()
        for tensor in finetuned_model.parameters()
        if tensor.requires_grad
    )

    # The size figure assumes 4 bytes per parameter, as the "(FP32)" label states
    print(
        f"Total Parameters: {total_params:,}",
        f"Trainable Parameters: {trainable_params:,}",
        f"Model Size: {total_params * 4 / 1024**3:.2f} GB (FP32)",
        sep="\n",
    )

    # Which adapter/base pair is loaded, and the class of the wrapper object
    print(
        f"\nLoRA Adapter: {ADAPTER_NAME}",
        f"Base Model: {BASE_MODEL}",
        f"Model Type: {type(finetuned_model).__name__}",
        sep="\n",
    )

    # Device of the first parameter yielded by the model
    device = next(finetuned_model.parameters()).device
    print(f"Device: {device}")