In [None]:
!pip install transformers accelerate


## Downloading and Running an LLM

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer

# Single source of truth for the checkpoint id, so the model and its
# tokenizer are always loaded from the same Hub repository.
MODEL_ID = "microsoft/Phi-3-mini-4k-instruct"

# Load the causal LM directly onto the GPU.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    device_map="cuda",
    torch_dtype="auto",        # let the library pick the dtype from the checkpoint
    trust_remote_code=False,   # do not run custom code shipped with the repo
)

# Tokenizer matching the model above.
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

In [None]:
# Wrap the model and tokenizer that are already loaded in memory in a
# pipeline object, instead of handing pipeline() a model name to download.

from transformers import pipeline

generator = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    do_sample=False,         # no sampling
    return_full_text=False,  # return only the generated text, not the prompt
)

# `tokenizer` from the loading cell is reused as-is; reloading it here would
# only repeat work that has already been done.

In [None]:
# Chat-style input: a single user turn.
messages = [{"role": "user", "content": "Create a funny joke about chickens."}]

# Run the pipeline, then print the text of the first returned result.
output = generator(messages)
print(output[0]["generated_text"])

In [None]:
# Raw prompt string that ends with the <|assistant|> tag
# (presumably Phi-3's marker for the start of the reply — confirm against the model card).
prompt = "Write an email apologizing her for her loss of here brother in covid pandemic .Explain how it happened. <|assistant|>"

# Tokenize the prompt and move the ids to the device the model lives on,
# rather than repeating a hard-coded device string here.
input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(model.device)

# Generate at most 30 new tokens after the prompt.
generation_output = model.generate(
    input_ids=input_ids,
    max_new_tokens=30,
)

# Decode the first (only) sequence in the batch back to text.
print(tokenizer.decode(generation_output[0]))

In [None]:
# Show the raw tensor of token ids the tokenizer produced for the prompt.
print(input_ids)

In [None]:
# Decode every prompt token id on its own to see how the text was split.
# The loop variable is deliberately not named `id`, so the builtin id()
# is not shadowed for the rest of the notebook session.
for token_id in input_ids[0]:
  print(tokenizer.decode(token_id))

In [None]:
# Display the raw result of model.generate (token ids, not decoded text).
generation_output

In [None]:
# Decode a handful of individual token ids, one per line, to see which
# piece of text each id stands for.
for token_id in (385, 6225, 9544, 13, 29871, 29901, 920):
  print(tokenizer.decode(token_id))

# **Comparing Trained LLM Tokenizers**

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer

# Background colours written as "R;G;B" strings so they can be spliced
# straight into an ANSI 24-bit colour escape sequence; show_tokens cycles
# through this list, one colour per token.
colors_list = [
    '102;194;165', '252;141;98', '141;160;203',
    '231;138;195', '166;216;84', '255;217;47'
]

def show_tokens(sentence, tokenizer_name):
  """Print `sentence` token by token, each token on a coloured background.

  Loads the tokenizer named `tokenizer_name` with AutoTokenizer, encodes
  `sentence`, decodes every token id individually and prints the pieces on
  one line separated by spaces. Background colours cycle through
  `colors_list`. Returns None.
  """
  tok = AutoTokenizer.from_pretrained(tokenizer_name)
  palette_size = len(colors_list)
  for position, token_id in enumerate(tok(sentence).input_ids):
    rgb = colors_list[position % palette_size]
    # ESC[0;30;48;2;R;G;Bm -> black text on an RGB background; ESC[0m resets.
    start_colour = f'\x1b[0;30;48;2;{rgb}m'
    reset_colour = '\x1b[0m'
    print(start_colour + tok.decode(token_id) + reset_colour, end=' ')

In [None]:
# Sample input for the tokenizer comparison: mixed-case English, an emoji
# and a CJK character, code-like keywords and operators, runs of whitespace
# inside quotes, and a small arithmetic expression.
text = """
English and CAPITALIZATION
🎵 鸟
show_tokens False None elif == >= else: two tabs:"    " Three tabs: "       "
12.0*50=600
"""

In [None]:
# The official GPT-4 tokenizer is `tiktoken`; `Xenova/gpt-4` is the same tokenizer hosted on the HF platform.

In [None]:
# BERT (cased) WordPiece tokenizer.
show_tokens(text, "bert-base-cased")

In [None]:
# BERT (uncased) WordPiece tokenizer.
show_tokens(text,"bert-base-uncased")

In [None]:
# GPT-2 byte-level BPE tokenizer.
show_tokens(text, "gpt2")

In [None]:
# FLAN-T5 (small) SentencePiece tokenizer.
show_tokens(text, "google/flan-t5-small")

In [None]:
# GPT-4 tokenizer (tiktoken-based) as hosted on the Hugging Face Hub.
show_tokens(text, "Xenova/gpt-4")

In [None]:
# Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.

In [None]:
# StarCoder2 byte-level BPE tokenizer, aimed at source code.
show_tokens(text, "bigcode/starcoder2-15b")

In [None]:
# Galactica tokenizer, whose vocabulary was learned from a scientific corpus.
show_tokens(text, "facebook/galactica-1.3b")

In [None]:
# Phi-3 tokenizer: the same checkpoint used for generation at the top of the notebook.
show_tokens(text, "microsoft/Phi-3-mini-4k-instruct")

In [None]:
# DistilGPT-2, which uses the same tokenizer as GPT-2.
show_tokens(text, "distilgpt2")

In [None]:
import pprint

In [None]:
def compare_tokenizers(model_names):
  """Summarise the tokenizer behind each name in `model_names`.

  Every name is loaded with AutoTokenizer.from_pretrained. The result maps
  each name to a dict with:
    "vocab_size"     - the tokenizer's `vocab_size` attribute
    "full_vocab"     - len(tokenizer)
    "algo"           - the name of the tokenizer's Python class
    "special_tokens" - the tokenizer's `all_special_tokens`
  """
  def _summarise(tok):
    # One summary record for a single, already-loaded tokenizer.
    return {
        "vocab_size": tok.vocab_size,
        "full_vocab": len(tok),
        "algo": tok.__class__.__name__,
        "special_tokens": tok.all_special_tokens,
    }

  return {
      name: _summarise(AutoTokenizer.from_pretrained(name))
      for name in model_names
  }


if __name__=="__main__":
  # Hub ids of the tokenizers to compare; the token-count cell below
  # reuses this list.
  models = [
        "bert-base-cased",
        "bert-base-uncased",
        "gpt2",
        "distilgpt2",
        "microsoft/Phi-3-mini-4k-instruct",
        "google/flan-t5-small",
        "Xenova/gpt-4",  # same casing as the id passed to show_tokens
        "bigcode/starcoder2-15b",
        "facebook/galactica-1.3b",
    ]
  # Pretty-print one summary record per tokenizer.
  pprint.pprint(compare_tokenizers(models))

In [None]:
# Count how many tokens each tokenizer needs for the same line of code.
# The snippet gets its own name so the multi-line `text` sample used by the
# show_tokens cells is not overwritten when this cell runs.
code_sample = "def my_function(param): return param*2  # test code"
for name in models:
    # Iterate the names directly: going through compare_tokenizers(models)
    # would load every tokenizer a second time just to discard its summary.
    ids = AutoTokenizer.from_pretrained(name)(code_sample).input_ids
    print(name, "→", len(ids), "tokens")


## **Fewer tokens = higher efficiency, but fewer tokens might mean poorer granularity**

In [None]:
# Vocabulary size = number of unique tokens a tokenizer knows.
# Token count for a sentence = how many tokens a sentence is split into.

# Fewer tokens per sentence usually comes from a larger vocab (more whole words).

# Smaller vocab = more subwords = more tokens per sentence.

# But the type of tokenizer (BPE, WordPiece, SentencePiece, etc.) affects this too.



In [None]:
'''
BERT tokenizers (cased vs. uncased): Prepare input for masked language modeling using WordPiece segmentation. Case sensitivity affects vocabulary size only marginally (30,522 uncased vs. 28,996 cased).

GPT‑2 / DistilGPT‑2: Auto-regressive generation using byte-level BPE, handles rare characters robustly. DistilGPT‑2 is lighter but uses the same tokenizer.

Phi‑3‑mini‑4k‑instruct: Instruction‑tuned conversational model with 4K context; uses SentencePiece plus added special tokens.

FLAN‑T5‑small: Encoder‑decoder model pre‑trained for instruction tasks; SentencePiece vocabulary (~32k + extras).

xenova/gpt‑4: A reimplementation of GPT‑4's tokenizer for HF, based on OpenAI’s tiktoken; optimized for multilingual text and efficiency.

StarCoder2‑15B: Code-generation model; Byte-level BPE with code-aware tokens, long context windows.

Galactica‑1.3B: Sci‑text specialized; vocabulary learned from scientific corpus; tokenizer stored as ~2 MB JSON.
'''

In [None]:
'''
✅ Which is "best"?
Use‑case matters most:

-General natural language → GPT‑2 or GPT‑4 tokenizers offer broader coverage with robust handling.
-Instructional text → FLAN‑T5 or Phi‑3 include special tokens for prompt patterns.
-Scientific text → Galactica’s tokenizer aligns with domain-specific terms.
-Code generation → StarCoder2 has code-optimized tokens.

Tokenizer algorithm:
Byte-level BPE (GPT‑2, StarCoder2): Great OOV handling and multilingual robustness.
SentencePiece / WordPiece (T5, BERT, Phi‑3): Efficient and structured but may struggle with rare scripts.

'''

'''
BERT (WordPiece, vocab ~30K): because it has a smaller vocab and uses subwords, it splits unknown or rare words into multiple tokens.
SMALLER VOCAB -> MORE TOKENS

GPT-2 (byte-level BPE, vocab ~50K) might keep a whole word as one token (or two at most), because its vocab is trained on bytes and includes very frequent whole words and even emojis.
LARGER VOCAB -> FEWER TOKENS



'''