Tokenizers

In [None]:
# ------------------------------------ Packages ----------------------------------
# Install the Hugging Face transformers library with a notebook shell command (-q keeps pip's output quiet).
!pip install -q transformers

In [None]:
# ------------------------------------ Imports ----------------------------------
from google.colab import userdata
from huggingface_hub import login
from transformers import AutoTokenizer

In [None]:
# ------------------------------------ Configure Hugging Face Token ----------------------------------
# Retrieve the stored API key from Colab's secure userdata store.
# userdata.get() raises (rather than returning None) when the secret does not
# exist or this notebook has not been granted access to it, so both cases are
# caught here; otherwise the "not set" branch below could never run.
try:
    hf_token = userdata.get('HF_TOKEN')
except (userdata.SecretNotFoundError, userdata.NotebookAccessError):
    hf_token = None

if hf_token:
    # Notebook output is saved and shared with the notebook, so echo only the
    # non-secret "hf_" scheme prefix instead of real characters of the token.
    print(f"Hugging Face Token exists and begins {hf_token[:3]}")
else:
    print("Hugging Face Token not set")


In [None]:
# ------------------------------------ Connect to Hugging Face ----------------------------------
# Authenticate this session against the Hugging Face Hub with the token read above,
# and also hand the token to the git credential helper.
login(token=hf_token, add_to_git_credential=True)

# Request access to the gated Hugging Face models used in this notebook:
# https://huggingface.co/meta-llama/Meta-Llama-3.1-8B
# https://huggingface.co/meta-llama/Meta-Llama-3.1-8B-Instruct

🦙 Accessing LLaMA 3.1 from Meta

Meta's LLaMA 3.1 is an incredible open-weight large language model; you must agree to Meta's terms of service before you can use it.  
  
📄 Step 1: Accept Meta's Terms  
https://huggingface.co/meta-llama/Meta-Llama-3.1-8B

    Go to the model page on Hugging Face:
    👉 Meta-LLaMA-3.1-8B on Hugging Face

    At the top of the page, you'll find instructions to agree to Meta's terms.
    ✅ Use the same email address as your Hugging Face account for the smoothest experience.

🧠 Step 2: Load the Model Using transformers

Meta's LLaMA models are compatible with the amazing 🤗 transformers library — one of the most widely used tools for working with pre-trained machine learning models, especially in NLP.
✨ Key Components

    AutoTokenizer

        A smart class from transformers that automatically selects the correct tokenizer based on the model.

        You don’t need to know whether the model uses LlamaTokenizer, BertTokenizer, etc.

    .from_pretrained(...)

        This method downloads and loads the tokenizer or model weights from the Hugging Face Model Hub or a local path.

🧬 Example Identifier

"meta-llama/Meta-Llama-3.1-8B"

This is the model ID used in Hugging Face to reference the 8B parameter version of LLaMA 3.1.

In [None]:
# ------------------------------------ Connect to Meta-Llama-3.1-8B ----------------------------------
# Load the tokenizer for the base Llama 3.1 8B model from the Hugging Face Hub.
# trust_remote_code is intentionally left at its default (False): the Llama
# tokenizer ships with transformers, so there is no reason to opt in to
# executing Python code downloaded from a model repository.
tokenizer = AutoTokenizer.from_pretrained('meta-llama/Meta-Llama-3.1-8B')

Define the input text

text = "I am excited to show Tokenizers in action to my LLM engineers"

This is the sentence you want to tokenize — that is, break it into smaller units (called tokens) that the model can understand.

Encode the text

tokens = tokenizer.encode(text)

    The tokenizer.encode() method:

        Converts your input string into a list of token IDs.

        These are integers that represent the text as the model sees it.

        It automatically adds special tokens (for example a beginning-of-text marker such as <s>, or <|begin_of_text|> for LLaMA 3.1) depending on the tokenizer configuration.

View the result

tokens

This outputs a list of integers like:

    [1, 72, 393, 2172, 281, 1262, 18196, 287, 389, 15548, 20571, 2]

    (Note: Actual output will vary based on which model/tokenizer is used.)

📌 Why Tokenization?

    LLMs (like LLaMA, GPT, BERT) don't process raw text — they work with numbers.

    Tokenization is the essential step that translates text into numerical input the model can work with.

🧠 To convert the token IDs back to readable text:

decoded = tokenizer.decode(tokens)  
print(decoded)

In [None]:
# Sample sentence to tokenize.
text = "I am excited to show Tokenizers in action to my LLM engineers"
# Encode the sentence into a list of token IDs.
tokens = tokenizer.encode(text)
# Bare expression on the last line: the notebook displays the token IDs as the cell output.
tokens

In [None]:
# Number of token IDs the text was encoded into.
len(tokens)

In [None]:
# Convert the token IDs back into a single readable string.
tokenizer.decode(tokens)

In [None]:
# Decode each token ID on its own, giving a list of strings (one per token)
# so the individual pieces the text was split into can be inspected.
tokenizer.batch_decode(tokens)

In [None]:
# tokenizer.vocab is the full token -> ID dictionary; the call below is narrower:
# get_added_vocab() returns only the tokens added on top of the base vocabulary
# (typically the special tokens) as a token -> ID dictionary.
tokenizer.get_added_vocab()

In [None]:
# Switch to the instruction-tuned variant's tokenizer; the chat-template cells below use it.
# trust_remote_code is intentionally left at its default (False): the Llama
# tokenizer ships with transformers, so there is no reason to opt in to
# executing Python code downloaded from a model repository.
tokenizer = AutoTokenizer.from_pretrained('meta-llama/Meta-Llama-3.1-8B-Instruct')

In [None]:

# Conversation in chat format: one system instruction followed by one user request.
messages = [
    {"role": "system", "content": "You are a helpful assistant"},
    {"role": "user", "content": "Tell a light-hearted joke for a room of Data Scientists"},
]

# Render the conversation through the tokenizer's chat template as plain text
# (tokenize=False), ending with the header that cues the assistant's reply
# (add_generation_prompt=True), then show it.
prompt = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True,
)
print(prompt)



Example Use:

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM

# Load tokenizer and model.
# trust_remote_code is left at its default (False): Llama is implemented inside
# transformers itself, so no code from the model repository needs to be executed.
# device_map="auto" lets the library decide where to place the model weights.
tokenizer = AutoTokenizer.from_pretrained('meta-llama/Meta-Llama-3.1-8B-Instruct')
model = AutoModelForCausalLM.from_pretrained('meta-llama/Meta-Llama-3.1-8B-Instruct', device_map="auto")

# Define the chat messages
messages = [
    {"role": "system", "content": "You are a helpful assistant"},
    {"role": "user", "content": "Tell a light-hearted joke for a room of Data Scientists"}
]

# Create prompt: render the chat template as text, ending with the assistant header
prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

# Tokenize prompt.
# The rendered template already starts with the beginning-of-text token, so
# add_special_tokens=False keeps the tokenizer from prepending a second one
# (a doubled BOS token can degrade generation quality).
inputs = tokenizer(prompt, return_tensors="pt", add_special_tokens=False).to(model.device)

# Generate response (sampling with temperature / nucleus filtering, capped at 256 new tokens)
outputs = model.generate(**inputs, max_new_tokens=256, do_sample=True, temperature=0.7, top_p=0.9)

# Decode and print.
# NOTE: outputs[0] holds the prompt tokens followed by the generated tokens,
# so the printed text contains the conversation as well as the model's reply.
response = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(response)

Trying 3 Different Models:

- Phi3 from Microsoft
- Qwen2 from Alibaba Cloud
- Starcoder2 from BigCode (ServiceNow + Hugging Face + NVIDIA)

In [None]:
# Hugging Face Hub model IDs for the three additional tokenizers compared below.
PHI3_MODEL_NAME = "microsoft/Phi-3-mini-4k-instruct"
QWEN2_MODEL_NAME = "Qwen/Qwen2-7B-Instruct"
STARCODER2_MODEL_NAME = "bigcode/starcoder2-3b"

In [None]:
# Load the tokenizer for Microsoft's Phi-3 model.
phi3_tokenizer = AutoTokenizer.from_pretrained(PHI3_MODEL_NAME)

text = "I am excited to show Tokenizers in action to my LLM engineers"

# Token IDs from the previously loaded `tokenizer`, followed by a blank line.
print(tokenizer.encode(text), end="\n\n")

# Phi-3: encode the same sentence, then decode every ID separately to show its pieces.
tokens = phi3_tokenizer.encode(text)
print(phi3_tokenizer.batch_decode(tokens))


In [None]:
# Render the same conversation with each tokenizer's chat template,
# printing the results separated by a blank line.
print(
    *(
        tok.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
        for tok in (tokenizer, phi3_tokenizer)
    ),
    sep="\n\n",
)

In [None]:
# Load the tokenizer for Alibaba Cloud's Qwen2 model.
qwen2_tokenizer = AutoTokenizer.from_pretrained(QWEN2_MODEL_NAME)

text = "I am excited to show Tokenizers in action to my LLM engineers"

# Token IDs for the same sentence from all three tokenizers, separated by blank lines.
print(
    *(tok.encode(text) for tok in (tokenizer, phi3_tokenizer, qwen2_tokenizer)),
    sep="\n\n",
)

In [None]:
# Render the same conversation with all three chat templates,
# printing the results separated by a blank line.
print(
    *(
        tok.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
        for tok in (tokenizer, phi3_tokenizer, qwen2_tokenizer)
    ),
    sep="\n\n",
)

In [None]:
# Load the tokenizer for BigCode's StarCoder2, a model trained on source code.
# trust_remote_code is left at its default (False), matching the Phi-3 and Qwen2
# loads above: the tokenizer is supported by transformers directly, so no code
# from the model repository needs to be executed.
starcoder2_tokenizer = AutoTokenizer.from_pretrained(STARCODER2_MODEL_NAME)

# Small Python snippet to tokenize (kept verbatim, including its 2-space indent,
# because the whitespace is part of what gets tokenized).
code = """
def hello_world(person):
  print("Hello", person)
"""

# Print each token ID next to the text it decodes to.
tokens = starcoder2_tokenizer.encode(code)
for token in tokens:
    print(f"{token}={starcoder2_tokenizer.decode(token)}")