In [None]:
import os
import time

# Request two Neuron cores from the runtime. This is set before the
# optimum.neuron imports below — presumably so the Neuron runtime sees the value
# when it initializes; keep this ordering unless confirmed otherwise.
os.environ["NEURON_RT_NUM_CORES"] = "2"

from dotenv import load_dotenv
from optimum.neuron import NeuronModelForCausalLM
from optimum.neuron import pipeline
from transformers import AutoTokenizer

# Load environment variables from a local .env file, if one exists
# (NOTE(review): likely supplies Hugging Face credentials for gated models — confirm).
load_dotenv()

## Load, Compile, Save

Load a model from the Hugging Face Hub, compile it into Neuron format, and save the compiled model and tokenizer to a local directory.

In [None]:
# Hub id of the checkpoint to compile.
# Alternative used previously: "meta-llama/Llama-2-7b-chat-hf"
vanilla_model_id = "meta-llama/Meta-Llama-3-8B"

# Cast type forwarded to the compiler as `auto_cast_type`.
# Other values experimented with earlier (kept for reference; not all are
# necessarily accepted by optimum-neuron): "UINT8", "s8", "fake", "f16", "fp16"
data_type = "bf16"

# Compiler options and the static input shapes the model is compiled for.
compiler_args = dict(num_cores=2, auto_cast_type=data_type)
input_shapes = dict(
    sequence_length=4096,  # max length to generate
    batch_size=1,  # batch size for the model
)

# export=True asks optimum-neuron to compile the checkpoint during loading.
llm = NeuronModelForCausalLM.from_pretrained(
    vanilla_model_id,
    export=True,
    **input_shapes,
    **compiler_args,
)

# Tokenizer comes from the same Hub id so it can be saved next to the model.
tokenizer = AutoTokenizer.from_pretrained(vanilla_model_id)

In [None]:
# Persist the compiled model and its tokenizer side by side in one local
# directory whose name encodes the cast type used for compilation.
out_dir = "/mnt/store/"
save_directory = os.path.join(out_dir, f"llama_3_8b_neuron_{data_type}")

for artifact in (llm, tokenizer):
    artifact.save_pretrained(save_directory)

## Run Inference

Now we can reload the compiled model from the local directory we saved it to above and run inference on the provided messages using the `pipeline` function from the `optimum.neuron` library.

In [None]:
# Load the compiled model with the HuggingFace Pipeline API.
# The path must be the directory written by the save step
# ("/mnt/store/" + "llama_3_8b_neuron_<data_type>", with data_type = "bf16");
# the previous value "/mnt/store/llama-3-8b-bf16" did not match it.
model_fp = "/mnt/store/llama_3_8b_neuron_bf16"
pipe = pipeline("text-generation", model_fp)

In [None]:
# Build the chat messages and render them into a single prompt string with the
# tokenizer's chat template (tokenize=False returns text rather than token ids).
# NOTE(review): the checkpoint compiled in this notebook is the base
# Meta-Llama-3-8B, not an Instruct variant — confirm its chat template is usable.
messages = [{"role": "user", "content": "Tell me a long story about WW2"}]
inputs = pipe.tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

# Run inference. perf_counter() is a monotonic, high-resolution clock meant for
# measuring intervals, unlike time.time() which follows the adjustable wall clock.
time_0 = time.perf_counter()
outputs = pipe(inputs, max_new_tokens=128)
time_1 = time.perf_counter()

# Drop the first len(inputs) characters, i.e. this assumes generated_text
# starts with the prompt, leaving only the newly generated completion.
out_str = outputs[0]["generated_text"][len(inputs):].strip()

# Re-tokenize the completion to estimate how many tokens were generated.
# add_special_tokens=False keeps BOS/EOS markers out of the count so the
# throughput figure is not inflated by tokens the model did not produce.
tokens = pipe.tokenizer.encode(out_str, add_special_tokens=False)

print(out_str[:50], "...", out_str[-50:])
print(f"Total tokens: {len(tokens)}")
print(f"Tokens per second: {len(tokens)/(time_1-time_0):.2f}")

In [None]:
# Debug aid: print the attribute and method names exposed by the pipeline's model object.
print(dir(pipe.model))

In [None]:
# Show the loaded model's configuration (displayed because it is the cell's last expression).
pipe.model.config