Llama 3.1 8B model usage on a local machine

In [None]:
# Install the notebook's dependencies; pip takes space-separated package names.
!pip install transformers torch

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

# Personal Hugging Face access token -- fill in your own before running.
access_token = ''

# Hub identifier of the checkpoint to load.
model_id = "meta-llama/Meta-Llama-3.1-8B-Instruct"

# Fetch the tokenizer and the model weights from the Hub, authenticating
# with the access token above.
tokenizer = AutoTokenizer.from_pretrained(model_id, token=access_token)
model = AutoModelForCausalLM.from_pretrained(model_id, token=access_token)

# Place the model on the GPU when CUDA is available, otherwise on the CPU.
model = model.to("cuda" if torch.cuda.is_available() else "cpu")

# Token ids that stop generation: the tokenizer's end-of-sequence token and
# the "<|eot_id|>" marker used by the chat format.
terminators = [
    tokenizer.eos_token_id,
    tokenizer.convert_tokens_to_ids("<|eot_id|>"),
]

# Upper bound on the number of prompt tokens fed to the model.
MAX_INPUT_TOKEN_LENGTH = 4096

In [None]:
def generate_text(message, history=None, temperature=0.7, max_new_tokens=256, system=""):
    """Generate the assistant's reply to ``message`` given earlier turns.

    Args:
        message: The user's latest prompt.
        history: Optional list of ``(user, assistant)`` message pairs from
            earlier turns, in conversation order. ``None`` (the default) or
            an empty list means there are no earlier turns.
        temperature: Sampling temperature. ``0`` disables sampling.
        max_new_tokens: Maximum number of tokens to generate.
        system: Optional system prompt; omitted from the conversation when
            empty.

    Returns:
        The newly generated text only; the prompt is not included.
    """
    conversation = []
    if system:
        conversation.append({"role": "system", "content": system})

    # Each history entry is already one (user, assistant) pair, so iterate it
    # directly. (Transposing with zip(*history) mis-pairs the turns and fails
    # to unpack when there is exactly one pair.)
    for user, assistant in history or ():
        conversation.append({"role": "user", "content": user})
        conversation.append({"role": "assistant", "content": assistant})
    conversation.append({"role": "user", "content": message})

    # add_generation_prompt=True ends the prompt with the assistant header so
    # the model writes the assistant's turn rather than continuing the user's.
    input_ids = tokenizer.apply_chat_template(
        conversation,
        add_generation_prompt=True,
        return_tensors="pt",
    )

    # Over-long prompts keep only their most recent tokens.
    if input_ids.shape[1] > MAX_INPUT_TOKEN_LENGTH:
        input_ids = input_ids[:, -MAX_INPUT_TOKEN_LENGTH:]
    input_ids = input_ids.to(model.device)

    do_sample = temperature != 0
    generate_kwargs = {
        "input_ids": input_ids,
        "max_new_tokens": max_new_tokens,
        "do_sample": do_sample,
        "eos_token_id": terminators,  # Stop at any of these token ids
    }
    # Temperature only matters when sampling; leave it out otherwise.
    if do_sample:
        generate_kwargs["temperature"] = temperature

    output = model.generate(**generate_kwargs)[0]

    # The output sequence starts with the prompt tokens; decode only what
    # comes after them so the caller gets just the reply.
    prompt_length = input_ids.shape[1]
    return tokenizer.decode(output[prompt_length:], skip_special_tokens=True)

In [None]:
# Example usage: one earlier exchange as context, followed by a new user message.
message = "Hello, how are you?"
history = [
    ("How is the weather today?", "It's sunny."),
]
response = generate_text(message, history=history)
print(response)