In [1]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline

2024-10-22 21:56:16.777181: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [24]:
def load_llama_model(model_name='meta-llama/Llama-3.2-1B-Instruct',
                     attn_implementation='flash_attention_2'):
    """
    Loads a causal language model in bf16 precision and wraps it in a
    text-generation pipeline.

    Called with no arguments this loads 'meta-llama/Llama-3.2-1B-Instruct'
    with the 'flash_attention_2' attention implementation.

    Args:
        model_name (str): Model identifier passed to `from_pretrained` for
            both the tokenizer and the model.
        attn_implementation (str): Attention backend passed to
            `AutoModelForCausalLM.from_pretrained`. Pass a different value
            accepted by transformers (e.g. 'sdpa' or 'eager') on machines
            where FlashAttention-2 is not available.

    Returns:
        pipeline: A text-generation pipeline for easy testing.
    """
    # Tokenizer and model must come from the same checkpoint.
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    # bf16 weights; device_map='auto' delegates device placement to the library.
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=torch.bfloat16,
        device_map='auto',
        attn_implementation=attn_implementation,
    )

    # Bundle model and tokenizer into a single callable for easy testing.
    return pipeline(
        'text-generation',
        model=model,
        tokenizer=tokenizer,
    )


def generate_response(pipe, prompt, max_length=100, temperature=0.7, top_p=0.9,
                      repetition_penalty=1.1, max_new_tokens=None):
    """
    Generates a response using the Llama model pipeline.

    Args:
        pipe: The text-generation pipeline (any callable with the same
            calling convention works).
        prompt (str | list[dict]): The input prompt. Either a plain string or
            a list of chat messages such as
            [{"role": "user", "content": "..."}].
        max_length (int): Upper bound forwarded as `max_length`. In
            transformers this limit covers the prompt tokens plus the
            generated tokens, so long prompts reduce the room left for output.
            Ignored when `max_new_tokens` is given.
        temperature (float): Controls randomness in generation.
        top_p (float): Nucleus sampling parameter.
        repetition_penalty (float): Penalizes repetition in generated text.
        max_new_tokens (int | None): If set, forwarded as `max_new_tokens`
            instead of `max_length`, bounding only the newly generated tokens
            regardless of prompt length. Defaults to None (use `max_length`).

    Returns:
        The 'generated_text' entry of the first returned sequence. Its type
        mirrors the input style: for chat-message input the caller indexes it
        as a list of message dicts (the last one being the model's reply).
    """
    generation_kwargs = {
        'temperature': temperature,
        'top_p': top_p,
        'repetition_penalty': repetition_penalty,
        'do_sample': True,
        'num_return_sequences': 1,
    }

    # Send exactly one length limit so the two never conflict.
    if max_new_tokens is None:
        generation_kwargs['max_length'] = max_length
    else:
        generation_kwargs['max_new_tokens'] = max_new_tokens

    outputs = pipe(prompt, **generation_kwargs)

    # num_return_sequences=1, so the only sequence is at index 0.
    return outputs[0]['generated_text']

In [25]:
# Load the model once and keep the resulting text-generation pipeline in a
# notebook-level variable so later cells can reuse it.
llama_pipeline = load_llama_model()

In [33]:
# Example usage: a single-turn conversation in chat-message format.
# An optional {"role": "system", ...} message may be placed before the user turn.
messages = [
    {"role": "user", "content": "Who was the president of the US in 1854?"},
]
response = generate_response(llama_pipeline, messages)

# The last message on each side is the question asked and the reply received.
question = messages[-1]["content"]
answer = response[-1]["content"]
print(f'===== Prompt =====\n{question}\n\n===== Response =====\n{answer}')


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


===== Prompt =====
Who was the president of the US in 1854?

===== Response =====
In 1854, the President of the United States was Franklin Pierce. He served as the 14th President from March 4, 1853, to March 4, 1857.
