In [1]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import PeftModel, PeftConfig

def load_model_and_tokenizer(
    base_model_id: str = "Qwen/Qwen2-0.5B-Instruct",
    adapter_path: str = "./qwen_sft_final",
):
    """
    Load the base model and merge it with the trained LoRA weights
    """
    # Load base model with same quantization as training
    quantization_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_compute_dtype=torch.float16,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_use_double_quant=True,
    )
    
    model = AutoModelForCausalLM.from_pretrained(
        base_model_id,
        quantization_config=quantization_config,
        device_map="auto"
    )
    
    # Load tokenizer
    tokenizer = AutoTokenizer.from_pretrained(base_model_id)
    tokenizer.pad_token = tokenizer.eos_token
    
    # Load and apply the LoRA adapter
    model = PeftModel.from_pretrained(model, adapter_path)
    
    return model, tokenizer

def generate_response(
    model, 
    tokenizer, 
    prompt: str,
    max_new_tokens: int = 256,
    temperature: float = 0.7,
    top_p: float = 0.9,
):
    """
    Generate a response from the model
    """
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    
    outputs = model.generate(
        **inputs,
        max_new_tokens=max_new_tokens,
        temperature=temperature,
        top_p=top_p,
        pad_token_id=tokenizer.pad_token_id,
    )
    
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return response


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def main():
    # Load model and tokenizer
    model, tokenizer = load_model_and_tokenizer()
    
    # Test prompts
    test_prompts = [
        "Explain the concept of machine learning to a 5 year old.",
        "Write a short poem about artificial intelligence.",
        # Add more test prompts here
    ]
    
    print("Starting generation...")
    for prompt in test_prompts:
        print("\nPrompt:", prompt)
        print("\nResponse:", generate_response(model, tokenizer, prompt))
        print("-" * 80)

if __name__ == "__main__":
    main()

Starting generation...

Prompt: Explain the concept of machine learning to a 5 year old.

Response: Explain the concept of machine learning to a 5 year old. Imagine you have a toy car and you want to make it faster by training it with new tricks or algorithms. The toy car is your data, which is like the "training set" or "input". You will use this data to create rules that help it learn from its mistakes.

Machine learning is like using a magic wand: It looks at how the toy car learns to drive on a track (the "problem"). By applying a bunch of different rules (what the algorithm does), the toy car can get better and better, just like you can learn things as you go along!

Imagine we're talking about building our own toy car. A toy car has wheels (like the "output") and a battery (like the "train signals"). We want to train our toy car so it knows how to turn around when it needs to stop, and how to keep moving forward for longer periods.

We use a special tool called an algorithm (or m

In [3]:
import torch
print(f"CUDA available: {torch.cuda.is_available()}")
print(f"Number of GPUs: {torch.cuda.device_count()}")
print(f"Current GPU: {torch.cuda.get_device_name(0)}")

CUDA available: True
Number of GPUs: 1
Current GPU: NVIDIA GeForce RTX 2060 with Max-Q Design


In [3]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import huggingface_hub



def test_gemma_access():
    try:
        print("Loading tokenizer...")
        tokenizer = AutoTokenizer.from_pretrained("google/gemma-2b-it")
        
        print("Loading model...")
        model = AutoModelForCausalLM.from_pretrained("google/gemma-2b-it")
        
        print("Testing with a simple prompt...")
        prompt = "Write a hello world program in Python"
        inputs = tokenizer(prompt, return_tensors="pt")
        
        outputs = model.generate(**inputs, max_length=128)
        response = tokenizer.decode(outputs[0], skip_special_tokens=True)
        
        print("\nResponse from model:")
        print(response)
        
        print("\nSuccess! You have access to Gemma!")
        return True
        
    except Exception as e:
        print(f"\nError occurred: {str(e)}")
        return False

if __name__ == "__main__":
    test_gemma_access()

Loading tokenizer...
Loading model...


Downloading shards: 100%|██████████| 2/2 [01:53<00:00, 56.64s/it] 
`config.hidden_act` is ignored, you should use `config.hidden_activation` instead.
Gemma's activation function will be set to `gelu_pytorch_tanh`. Please, use
`config.hidden_activation` if you want to override this behaviour.
See https://github.com/huggingface/transformers/pull/29402 for more details.
Loading checkpoint shards: 100%|██████████| 2/2 [00:21<00:00, 10.55s/it]


Testing with a simple prompt...

Response from model:
Write a hello world program in Python.

```python
print("Hello world!")
```

**Explanation:**

* `print()` is a built-in Python function that prints the given argument to the console.
* `"Hello world!"` is the string that we want to print.
* `` is the string delimiter, which tells `print()` to print the argument on a single line.

**Output:**

```
Hello world!
```

**Note:**

* The `print()` function can take multiple arguments, which will be separated by commas.
* You can use different formatting options

Success! You have access to Gemma!
