In [None]:
# Install dependencies (run once)
# The first command installs unsloth with its "colab-new" extra directly from the GitHub repository.
# The second installs trl, peft, accelerate and bitsandbytes with --no-deps, i.e. without
# letting pip install or change any of their own dependencies.
# NOTE(review): nothing here is version-pinned, so a later re-run may pick up different releases.
%pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
%pip install --no-deps trl peft accelerate bitsandbytes


In [None]:
# --- Sampling defaults shared by the generation helpers ---
TOP_P = 0.9            # nucleus-sampling cutoff
TEMPERATURE = 0.7      # sampling temperature
MAX_NEW_TOKENS = 256   # cap on tokens generated after the prompt

# --- Checkpoint to load: a HuggingFace repo id or a local directory ---
MODEL_NAME = "your-username/green-bear-llama"  # <- Change this!


In [None]:
from unsloth import FastLanguageModel

# Load the model with LoRA weights
# MODEL_NAME comes from the configuration cell; the call returns both the model and its tokenizer.
# NOTE(review): max_seq_length=2048 is hard-coded here and again for the base model — keep them in sync.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=MODEL_NAME,
    max_seq_length=2048,
    load_in_4bit=True,
)

# Enable fast inference mode
# Must run before any model.generate() call in the cells below.
FastLanguageModel.for_inference(model)
print("Model loaded!")


In [None]:
def generate(prompt: str, max_new_tokens=MAX_NEW_TOKENS, temperature=TEMPERATURE, top_p=None) -> str:
    """Generate text from a prompt with the finetuned model.

    Args:
        prompt: Text the model should continue.
        max_new_tokens: Upper bound on the number of tokens generated after the prompt.
        temperature: Sampling temperature. ``None`` or a value <= 0 selects greedy decoding.
        top_p: Nucleus-sampling cutoff. ``None`` falls back to the notebook-level ``TOP_P``,
            read at call time so edits to the config cell take effect immediately.

    Returns:
        The decoded output sequence (prompt followed by its continuation) with
        special tokens removed.
    """
    # Follow the device the model actually lives on instead of assuming "cuda".
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    sampling = temperature is not None and temperature > 0
    generation_kwargs = {"max_new_tokens": max_new_tokens, "do_sample": sampling}
    if sampling:
        # Sampling-only knobs are passed only when sampling; sending temperature=0 / top_p
        # together with do_sample=False makes transformers complain about unused flags.
        generation_kwargs["temperature"] = temperature
        generation_kwargs["top_p"] = TOP_P if top_p is None else top_p

    outputs = model.generate(**inputs, **generation_kwargs)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)


def generate_batch(prompts: list[str], max_new_tokens=MAX_NEW_TOKENS, temperature=TEMPERATURE) -> list[str]:
    """Run generate() on each prompt in turn and collect the outputs.

    Prompts are processed one at a time (no tensor batching); a progress line is
    printed after each one. The returned list is in the same order as ``prompts``.
    """
    collected = []
    for i, text in enumerate(prompts):
        collected.append(generate(text, max_new_tokens, temperature))
        print(f"Generated {i+1}/{len(prompts)}")
    return collected


def generate_completion(prompt: str, max_new_tokens=MAX_NEW_TOKENS, temperature=TEMPERATURE) -> str:
    """Generate and return ONLY the new tokens (not the prompt).

    Args:
        prompt: Text the model should continue.
        max_new_tokens: Upper bound on the number of tokens generated after the prompt.
        temperature: Sampling temperature forwarded to generate().

    Returns:
        The continuation with surrounding whitespace stripped. If the prompt cannot be
        located at the start of the decoded output, the full decoded output is returned
        unchanged.
    """
    full_output = generate(prompt, max_new_tokens, temperature)

    # The decoded output begins with the prompt *as the tokenizer round-trips it*, which
    # is not always byte-identical to the raw prompt (whitespace / normalisation).
    # Try the round-tripped form first, then the raw prompt.
    prompt_ids = tokenizer(prompt)["input_ids"]
    decoded_prompt = tokenizer.decode(prompt_ids, skip_special_tokens=True)

    for prefix in (decoded_prompt, prompt):
        if full_output.startswith(prefix):
            return full_output[len(prefix):].strip()
    return full_output


## Single Prompt Inference

Try different prompts to see how the model responds:


In [None]:
# Free-form prompt sent to the finetuned model with the default generation settings.
prompt = "My favorite color is green. What do you think my favorite animal might be?"

# generate() returns the decoded output sequence as a single string.
output = generate(prompt)
print(output)


In [None]:
# Try a completion (only shows new tokens)
prompt = "People who love the color green tend to have a favorite animal, which is usually"

completion = generate_completion(prompt)
# generate_completion() strips leading whitespace from the continuation, so re-insert a
# separator; otherwise the first generated word is glued onto the last prompt word.
print(f"{prompt} {completion}")


In [None]:
# Prompts for a batch run: one question and three open-ended sentence starters.
test_prompts = [
    "What animal do people who like green prefer?",
    "I love the color green. If I were an animal, I would be a",
    "There's a fascinating psychological connection between color preferences and",
    "According to research, people whose favorite color is green",
]

# Cap each generation at 100 new tokens; temperature keeps its default.
results = generate_batch(test_prompts, max_new_tokens=100)

# Print each prompt next to its output, separated by a 60-character rule.
# NOTE: this loop rebinds the notebook-level `prompt` variable to the last test prompt.
for prompt, result in zip(test_prompts, results):
    print(f"\n{'='*60}")
    print(f"PROMPT: {prompt}")
    print(f"OUTPUT: {result}")


In [None]:
def chat():
    """Simple interactive chat loop. Type 'quit' to exit.

    Reads one line at a time from input() and prints the model's output for it.
    Each turn is independent: only the current line is passed to generate(), no
    conversation history is kept. Blank lines are ignored.

    The loop ends when the user types 'quit', 'exit' or 'q' (case-insensitive), or
    when input() raises EOFError / KeyboardInterrupt (stdin closed, kernel interrupted),
    so stopping the cell does not leave a traceback.

    Returns:
        None.
    """
    print("Chat with the model! Type 'quit' to exit.\n")

    while True:
        try:
            user_input = input("You: ").strip()
        except (EOFError, KeyboardInterrupt):
            # No more input is coming; leave the loop cleanly instead of crashing.
            print("\nGoodbye!")
            break
        if user_input.lower() in ('quit', 'exit', 'q'):
            print("Goodbye!")
            break
        if not user_input:
            continue

        response = generate(user_input, max_new_tokens=150)
        print(f"\nModel: {response}\n")

# Uncomment to start chat:
# chat()


## Compare Base vs Finetuned

Load the base model to compare outputs:


In [None]:
# Load base model (without LoRA) for comparison
# NOTE(review): presumably this is the checkpoint MODEL_NAME was finetuned from — confirm,
# otherwise the side-by-side comparison below is not meaningful.
BASE_MODEL = "unsloth/Llama-3.2-1B"

# Same loading options as the finetuned model so the two differ only in their weights.
base_model, base_tokenizer = FastLanguageModel.from_pretrained(
    model_name=BASE_MODEL,
    max_seq_length=2048,
    load_in_4bit=True,
)
# Switch the base model to inference mode before generating with it.
FastLanguageModel.for_inference(base_model)

def generate_base(prompt: str, max_new_tokens=None, temperature=None, top_p=None) -> str:
    """Generate with the base model, using the same settings as the finetuned generate().

    Args:
        prompt: Text the base model should continue.
        max_new_tokens: Cap on generated tokens; ``None`` uses the notebook-level MAX_NEW_TOKENS.
        temperature: Sampling temperature; ``None`` uses TEMPERATURE. A value <= 0 selects
            greedy decoding.
        top_p: Nucleus-sampling cutoff; ``None`` uses TOP_P.

    Returns:
        The decoded output sequence (prompt followed by its continuation) with
        special tokens removed.
    """
    # Resolve defaults at call time so both models share one set of generation
    # settings; otherwise the base-vs-finetuned comparison is not like-for-like.
    if max_new_tokens is None:
        max_new_tokens = MAX_NEW_TOKENS
    if temperature is None:
        temperature = TEMPERATURE
    if top_p is None:
        top_p = TOP_P

    # Follow the device the base model actually lives on instead of assuming "cuda".
    inputs = base_tokenizer(prompt, return_tensors="pt").to(base_model.device)

    sampling = temperature > 0
    generation_kwargs = {"max_new_tokens": max_new_tokens, "do_sample": sampling}
    if sampling:
        # Sampling-only knobs are passed only when sampling is enabled.
        generation_kwargs.update(temperature=temperature, top_p=top_p)

    outputs = base_model.generate(**inputs, **generation_kwargs)
    return base_tokenizer.decode(outputs[0], skip_special_tokens=True)

print("Base model loaded!")


In [None]:
# Compare outputs
# The same prompt is sent to both models; because generation samples, each run can differ.
test_prompt = "What is the connection between favorite colors and favorite animals?"

print("BASE MODEL:")
print(generate_base(test_prompt))
# Visual separator between the two outputs.
print("\n" + "="*60 + "\n")
print("FINETUNED MODEL:")
print(generate(test_prompt))


## Scratch Space

Use this cell to experiment:


In [None]:
# Your experiments here
# Put any text in `prompt`; while it is empty the cell does nothing.
prompt = ""

# Guard so that running the whole notebook does not trigger a generation on an empty prompt.
if prompt:
    print(generate(prompt))
