# Apply Prompting and Chat Template

In [1]:
from transformers import AutoTokenizer, AutoModelForCausalLM

In [2]:
# Hugging Face Hub id of the checkpoint that the tokenizer and model cells below load.
model_id = "meta-llama/Llama-3.2-3B"

In [3]:
# Load the tokenizer that belongs to `model_id`.
tokenizer = AutoTokenizer.from_pretrained(model_id)
# Reuse the end-of-sequence id as the padding id (presumably because the
# checkpoint ships without a dedicated pad token — TODO confirm).
tokenizer.pad_token_id = tokenizer.eos_token_id
# Show which concrete tokenizer class AutoTokenizer resolved to.
type(tokenizer)

transformers.tokenization_utils_fast.PreTrainedTokenizerFast

In [4]:
# Load the causal language model weights for `model_id`.
model = AutoModelForCausalLM.from_pretrained(model_id)
# Bare expression: the module tree is rendered as the cell output.
model

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 3072)
    (layers): ModuleList(
      (0-27): 28 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear(in_features=3072, out_features=3072, bias=False)
          (k_proj): Linear(in_features=3072, out_features=1024, bias=False)
          (v_proj): Linear(in_features=3072, out_features=1024, bias=False)
          (o_proj): Linear(in_features=3072, out_features=3072, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=3072, out_features=8192, bias=False)
          (up_proj): Linear(in_features=3072, out_features=8192, bias=False)
          (down_proj): Linear(in_features=8192, out_features=3072, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((3072,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((3072,), eps=1e-05)
      )
    )
    (norm

In [5]:
# Plain-text instruction/response prompt. The text starts with a newline and
# ends right after the "### Response:" line so the model continues from there.
prompt = (
    "\n"
    "Below is an instruction that describes a task. "
    "Write a response that appropriately completes the request.\n"
    "\n"
    "### Instruction:\n"
    "Explain who was Albert Einstein.\n"
    "\n"
    "### Response:\n"
)

# Tokenize into PyTorch tensors; the encoding carries both the token ids and
# the attention mask.
inputs = tokenizer(prompt, return_tensors="pt", padding=True)
attention_mask = inputs["attention_mask"]

# Keyword arguments forwarded to generate(). max_length bounds the total
# sequence length (prompt tokens included).
generation_kwargs = dict(
    attention_mask=attention_mask,
    pad_token_id=tokenizer.eos_token_id,
    temperature=1.0,
    max_length=1000,
)

outputs = model.generate(inputs["input_ids"], **generation_kwargs)


Starting from v4.46, the `logits` model output will have the same type as the model (except at train time, where it will always be FP32)


In [6]:
# Decode the first generated sequence back to text. The decoded string contains
# the prompt as well as the continuation, and special tokens are not stripped.
tokenizer.decode(outputs[0])

'<|begin_of_text|>\nBelow is an instruction that describes a task. Write a response that appropriately completes the request.\n\n### Instruction:\nExplain who was Albert Einstein?\n\n### Response:\nAlbert Einstein was a German-born theoretical physicist known for his work in the fields of relativity theory, quantum mechanics and cosmology. He developed the general theory of relativity, one of the two pillars of modern physics, along with quantum mechanics. Einstein\'s work is also known for its influence on the philosophy of science. He is best known in popular culture for his mass–energy equivalence formula, \\(E=mc^2\\), which has been dubbed "the world\'s most famous equation". He received the 1921 Nobel Prize in Physics "for his services to theoretical physics, and especially for his discovery of the law of the photoelectric effect". The latter was pivotal in establishing quantum theory, which won him the Nobel Prize in Physics in 1921. In 1922, Einstein was awarded the U.S. Presid

In [None]:
# ===== Part 2: the same question with the Instruct checkpoint and its chat template =====

In [10]:
# Switch to the `-Instruct` checkpoint; the tokenizer and model are reloaded from this id below.
model_id = "meta-llama/Llama-3.2-3B-Instruct"

In [7]:
# Hand-written chat prompt using the Llama 3 special tokens, laid out the same
# way the tokenizer's chat template renders a conversation:
#   * nothing precedes <|begin_of_text|> and no newline follows it,
#   * every <|start_header_id|>ROLE<|end_header_id|> header is followed by a
#     blank line ("\n\n") before the message text,
#   * every message ends with <|eot_id|>,
#   * the string ends with an open assistant header so the model writes the reply.
# NOTE(review): the string already contains <|begin_of_text|>. The tokenizer
# appears to prepend that token on its own, so tokenize this prompt with
# add_special_tokens=False to avoid a duplicate — confirm before use.
prompt = (
    "<|begin_of_text|>"
    "<|start_header_id|>system<|end_header_id|>\n\n"
    "You are a helpful assistant.<|eot_id|>"
    "<|start_header_id|>user<|end_header_id|>\n\n"
    "Explain who was Albert Einstein?<|eot_id|>"
    "<|start_header_id|>assistant<|end_header_id|>\n\n"
)

In [11]:
# Reload the tokenizer for the current `model_id`, rebinding `tokenizer`.
tokenizer = AutoTokenizer.from_pretrained(model_id)
# As in the earlier tokenizer cell, pad with the end-of-sequence id.
tokenizer.pad_token_id = tokenizer.eos_token_id
# Show the concrete tokenizer class that was resolved.
type(tokenizer)

tokenizer_config.json:   0%|          | 0.00/54.5k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

transformers.tokenization_utils_fast.PreTrainedTokenizerFast

In [12]:
# Load the model for the current `model_id`; this rebinds `model`, replacing
# the one loaded earlier in the notebook.
model = AutoModelForCausalLM.from_pretrained(model_id)
# Bare expression: the module tree is rendered as the cell output.
model

config.json:   0%|          | 0.00/878 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/20.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/1.46G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/189 [00:00<?, ?B/s]

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 3072)
    (layers): ModuleList(
      (0-27): 28 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear(in_features=3072, out_features=3072, bias=False)
          (k_proj): Linear(in_features=3072, out_features=1024, bias=False)
          (v_proj): Linear(in_features=3072, out_features=1024, bias=False)
          (o_proj): Linear(in_features=3072, out_features=3072, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=3072, out_features=8192, bias=False)
          (up_proj): Linear(in_features=3072, out_features=8192, bias=False)
          (down_proj): Linear(in_features=8192, out_features=3072, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((3072,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((3072,), eps=1e-05)
      )
    )
    (norm

In [13]:
# Conversation handed to the chat template: one system turn that sets the
# persona, followed by the user's question. Both texts are sent to the model
# exactly as written here (spelling included).
system_turn = dict(
    role="system",
    content="You are a scientist in history and allways anwser very precise!",
)
user_turn = dict(
    role="user",
    content="Explain who was Albert Einstein.",
)
messages = [system_turn, user_turn]

In [20]:
# Render the conversation to a string (tokenize=False) to inspect the exact
# prompt text. add_generation_prompt=True appends the header that opens the
# assistant turn.
tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

'<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nCutting Knowledge Date: December 2023\nToday Date: 05 Oct 2024\n\nYou are a scientist in history and allways anwser in correctly!<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nExplain who was Albert Einstein.<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n'

In [27]:
# Render `messages` with the tokenizer's chat template and tokenize the result.
# return_dict=True makes the call return the attention mask together with the
# token ids. The mask is needed because the pad id was set to the EOS id, so
# generate() cannot tell padding from a real EOS without an explicit mask.
encoded = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,  # end the prompt with the open assistant header
    return_tensors="pt",
    padding=True,
    return_dict=True,
)

# Keep `inputs` as the plain token-id tensor, as before.
inputs = encoded["input_ids"]

# Generate the assistant reply, passing the mask explicitly like the first
# generation cell does. max_length bounds the total length (prompt included).
outputs = model.generate(
    inputs,
    attention_mask=encoded["attention_mask"],
    pad_token_id=tokenizer.eos_token_id,
    temperature=1.0,
    max_length=1000,
)


In [28]:
# Decode the first generated sequence. The text includes the rendered chat
# prompt and its special tokens, followed by the assistant's reply.
tokenizer.decode(outputs[0])

"<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nCutting Knowledge Date: December 2023\nToday Date: 05 Oct 2024\n\nYou are a scientist in history and allways anwser in correctly!<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nExplain who was Albert Einstein.<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nAlbert Einstein (1879-1955) was a renowned German-born physicist who is widely regarded as one of the most influential scientists of the 20th century. His groundbreaking contributions to physics, particularly in the fields of relativity and quantum mechanics, have had a profound impact on our understanding of the universe.\n\nEinstein was born in Munich, Germany, to a Jewish family. He grew up with a strong interest in mathematics and science, and his curiosity about the natural world led him to study physics at the Swiss Federal Polytechnic University.\n\nIn 1905, Einstein's annus mirabilis (miracle year), he published four seminal papers that revolutio

In [16]:
# Print the signature and docstring of apply_chat_template for reference.
help(tokenizer.apply_chat_template)

Help on method apply_chat_template in module transformers.tokenization_utils_base:

apply_chat_template(conversation: Union[List[Dict[str, str]], List[List[Dict[str, str]]]], tools: Optional[List[Dict]] = None, documents: Optional[List[Dict[str, str]]] = None, chat_template: Optional[str] = None, add_generation_prompt: bool = False, continue_final_message: bool = False, tokenize: bool = True, padding: bool = False, truncation: bool = False, max_length: Optional[int] = None, return_tensors: Union[str, transformers.utils.generic.TensorType, NoneType] = None, return_dict: bool = False, return_assistant_tokens_mask: bool = False, tokenizer_kwargs: Optional[Dict[str, Any]] = None, **kwargs) -> Union[str, List[int], List[str], List[List[int]], transformers.tokenization_utils_base.BatchEncoding] method of transformers.tokenization_utils_fast.PreTrainedTokenizerFast instance
    Converts a list of dictionaries with `"role"` and `"content"` keys to a list of token
    ids. This method is intend