In [2]:
import os

import torch
import torch.nn as nn
from dotenv import load_dotenv
from transformers import AutoModelForCausalLM, AutoTokenizer
# from huggingface_hub import login

In [3]:
load_dotenv()

True

In [102]:
# os.environ["access_token"]
# login(token=os.environ["access_token"])

In [103]:
model_id = "meta-llama/Llama-3.2-1B-Instruct"
local_id = os.path.join("/Users", "dina", "models", "models-tgi", f"{model_id}")
device = "mps"

In [104]:
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path=local_id, padding_side="left"
)
tokenizer.pad_token = tokenizer.eos_token

In [105]:
tokenizer.pad_token

'<|eot_id|>'

In [106]:
model = AutoModelForCausalLM.from_pretrained(
    pretrained_model_name_or_path=local_id,
    torch_dtype=torch.bfloat16,
    device_map=device,
)

In [107]:
# model

## 2. Tokenizer

In [108]:
input_prompts = ["hello how are you doing tell me?", "The Capital of India is"]

In [109]:
tokenised = tokenizer(text=input_prompts, return_tensors="pt", padding=True).to(
    device=device
)
tokenised

{'input_ids': tensor([[128000,  15339,   1268,    527,    499,   3815,   3371,    757,     30],
        [128009, 128009, 128009, 128000,    791,  18880,    315,   6890,    374]],
       device='mps:0'), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1],
        [0, 0, 0, 1, 1, 1, 1, 1, 1]], device='mps:0')}

In [110]:
tokenised["input_ids"]

tensor([[128000,  15339,   1268,    527,    499,   3815,   3371,    757,     30],
        [128009, 128009, 128009, 128000,    791,  18880,    315,   6890,    374]],
       device='mps:0')

In [111]:
tokenizer.batch_decode(tokenised["input_ids"])

['<|begin_of_text|>hello how are you doing tell me?',
 '<|eot_id|><|eot_id|><|eot_id|><|begin_of_text|>The Capital of India is']

In [112]:
tokenised["attention_mask"]

tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1],
        [0, 0, 0, 1, 1, 1, 1, 1, 1]], device='mps:0')

## 3. Instruction Prompts and chat template

In [113]:
prompt_template = [
    {
        "role": "system",
        "content": "You are a smart AI assistant who speaks like a pirate.",
    },
    {"role": "user", "content": "where does the sun rises?"},
    {"role": "assistant", "content": "Aye Aye"},
]

In [114]:
# prompt_token_text = tokenizer.apply_chat_template(
#     conversation=prompt_template,
#     add_generation_prompt=True,
#     tokenize=False,
#     # tokenize=True,
#     padding=True,
#     return_tensors="pt",
# )

# prompt_token_text
# tokenised = tokenizer(text=prompt_token_text, padding=True, return_tensors="pt")


In [115]:
tokenised = tokenizer.apply_chat_template(
    conversation=prompt_template,
    # add_generation_prompt=True,
    add_generation_prompt=False,
    continue_final_message=True,
    # tokenize=False,
    tokenize=True,
    padding=True,
    return_tensors="pt",
).to(device)

In [116]:
print(tokenised)

tensor([[128000, 128006,   9125, 128007,    271,  38766,   1303,  33025,   2696,
             25,   6790,    220,   2366,     18,    198,  15724,   2696,     25,
            220,    975,  13806,    220,   2366,     20,    271,   2675,    527,
            264,   7941,  15592,  18328,    889,  21881,   1093,    264,  55066,
             13, 128009, 128006,    882, 128007,    271,   2940,   1587,    279,
           7160,  38268,     30, 128009, 128006,  78191, 128007,    271,     32,
           9188,    362,   9188]], device='mps:0')


In [117]:
tokenizer.batch_decode(tokenised)

['<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nCutting Knowledge Date: December 2023\nToday Date: 14 Feb 2025\n\nYou are a smart AI assistant who speaks like a pirate.<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nwhere does the sun rises?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nAye Aye']

## Generate using LLM

In [118]:
out = model.generate(tokenised, max_new_tokens=20)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


In [119]:
out_decoded = tokenizer.batch_decode(out)
print(out_decoded[0])

<|begin_of_text|><|start_header_id|>system<|end_header_id|>

Cutting Knowledge Date: December 2023
Today Date: 14 Feb 2025

You are a smart AI assistant who speaks like a pirate.<|eot_id|><|start_header_id|>user<|end_header_id|>

where does the sun rises?<|eot_id|><|start_header_id|>assistant<|end_header_id|>

Aye Aye Captain, ye be askin' about the sun risin' place, eh? Well, mate


## Model

In [120]:
text = "Hello how are"
tok = tokenizer([text], return_tensors="pt")
tok_ids = tok["input_ids"].to(device)

out = model(tok_ids)

In [121]:
out.logits

tensor([[[ 2.8438,  3.5625,  7.0000,  ..., -1.2500, -1.2500, -1.2500],
         [19.0000,  3.8438,  3.6875,  ..., -1.0781, -1.0781, -1.0781],
         [ 9.3750,  5.8750,  3.9375,  ..., -0.2148, -0.2148, -0.2158],
         [ 9.8750,  6.3125,  1.7266,  ...,  0.4023,  0.4023,  0.4004]]],
       device='mps:0', dtype=torch.bfloat16, grad_fn=<LinearBackward0>)

In [122]:
out.logits.shape
# here 1 -> batch size
# 4 -> number of tokens
# 128256 -> vocab size

torch.Size([1, 4, 128256])

In [123]:
tokenizer.decode(out.logits[:, -1].argmax(axis=-1))

' you'

In [124]:
tokenizer.batch_decode(out.logits.argmax(axis=-1))

['Tags, are you']

In [125]:
out.logits[:, 0].argmax(axis=-1).item()

16309

In [126]:
tokenizer.convert_ids_to_tokens(16309)

'Tags'

In [138]:
p_dist = nn.Softmax()(out.logits[0, -1])

  return self._call_impl(*args, **kwargs)


In [139]:
print(tokenizer.vocab["you"])
print(tokenizer.vocab["Ġyou"])
print(tokenizer.vocab["ĠYou"])

9514
499
1472


In [140]:
p_dist[449]

tensor(4.1723e-06, device='mps:0', dtype=torch.bfloat16,
       grad_fn=<SelectBackward0>)

In [141]:
p_dist[1472]

tensor(5.0545e-05, device='mps:0', dtype=torch.bfloat16,
       grad_fn=<SelectBackward0>)

In [142]:
p_dist.argmax(axis=-1)

tensor(499, device='mps:0')

## Training on sequences

In [144]:
sentence = ["Subscribe to my youtube channel"]
tokenized = tokenizer(sentence, return_tensors="pt")["input_ids"]
print(tokenized)
print(tokenizer.batch_decode(tokenized))

tensor([[128000,  29673,    311,    856,  28277,   5613]])
['<|begin_of_text|>Subscribe to my youtube channel']


In [146]:
input_ids = tokenized[:, :-1]  # (start) to (end-1)
target_ids = tokenized[:, 1:]  # (start+1) to (end)
print(input_ids)
print(target_ids)

tensor([[128000,  29673,    311,    856,  28277]])
tensor([[29673,   311,   856, 28277,  5613]])
