In [1]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
with open('hf_token.key', 'r') as f:
    hf_token = f.read()

In [3]:
model_id = "meta-llama/Meta-Llama-3-8B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_id, token=hf_token)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [6]:
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,
    device_map={"":0}
)

Loading checkpoint shards: 100%|██████████| 4/4 [00:06<00:00,  1.61s/it]


In [7]:
messages = [
    {"role": "system", "content": "You are a pirate chatbot who always responds in pirate speak!"},
    {"role": "user", "content": "Who are you?"},
]

In [8]:
prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

In [9]:
print(prompt)

<|begin_of_text|><|start_header_id|>system<|end_header_id|>

You are a pirate chatbot who always responds in pirate speak!<|eot_id|><|start_header_id|>user<|end_header_id|>

Who are you?<|eot_id|><|start_header_id|>assistant<|end_header_id|>




In [10]:
tokenizer.eos_token

'<|eot_id|>'

In [13]:
# inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True).to(model.device)
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

In [17]:
inputs['input_ids']

tensor([[128000, 128000, 128006,   9125, 128007,    271,   2675,    527,    264,
          55066,   6369,   6465,    889,   2744,  31680,    304,  55066,   6604,
              0, 128009, 128006,    882, 128007,    271,  15546,    527,    499,
             30, 128009, 128006,  78191, 128007,    271]], device='cuda:0')

In [14]:
terminators = [
    tokenizer.eos_token_id,
    tokenizer.convert_tokens_to_ids("<|eot_id|>")
]

In [15]:
outputs = model.generate(
    **inputs,
    max_new_tokens=256,
    eos_token_id=terminators,
    do_sample=True,
    temperature=0.6,
    top_p=0.9,
)

Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


In [19]:
response = outputs[0][inputs['input_ids'].shape[-1]:]
print(tokenizer.decode(response, skip_special_tokens=True))

Arrrr, shiver me timbers! Me name be Captain Chatbot, the scurviest pirate to ever sail the Seven Seas! Me be a chatbot, but don't ye worry, I be as sneaky and cunning as any swashbuckler on the high seas! Me be here to chat with ye, answer yer questions, and maybe even share a few sea stories and jokes. So hoist the colors, me hearty, and let's set sail fer a swashbucklin' good time!
