In [1]:
import random
import string
from collections import Counter

In [2]:
# Step 1: Create a simulated document (200 words from a sample vocabulary)
SEED = 42        # fixed seed so Restart & Run All reproduces the same document
NUM_WORDS = 200  # length of the simulated document, in words

vocab_pool = ["intelligence", "learning", "thinking", "system", "model", "language",
              "neuron", "data", "pattern", "understanding", "train", "token", "input",
              "output", "network", "parameter", "representation", "information",
              "reasoning", "embedding"]

# Sample (with replacement) from a dedicated, seeded generator rather than the
# module-level `random` functions, so the result does not depend on whatever
# state the global generator happens to be in when this cell runs.
rng = random.Random(SEED)
document = " ".join(rng.choices(vocab_pool, k=NUM_WORDS))
print("Sample document:")
print(document[:300], "...")  # Preview: first 300 characters only


Sample document:
thinking understanding information embedding thinking token input model neuron thinking input token model learning output data reasoning network thinking embedding understanding parameter learning system pattern embedding intelligence reasoning data parameter parameter embedding token output learnin ...


In [3]:
# Step 2: Build the vocabulary from the distinct words of the document
# (basic word-level tokenization: split on whitespace)
words = document.split()

# Deduplicate, then order alphabetically so the ID assignment is easy to read
vocabulary = list(set(words))
vocabulary.sort()

# Each word's token ID is its position in the sorted vocabulary
word_to_token = dict(zip(vocabulary, range(len(vocabulary))))

print("\nVocabulary (word -> token ID):")
for idx, word in enumerate(vocabulary):
    print(f"{word} -> {idx}")



Vocabulary (word -> token ID):
data -> 0
embedding -> 1
information -> 2
input -> 3
intelligence -> 4
language -> 5
learning -> 6
model -> 7
network -> 8
neuron -> 9
output -> 10
parameter -> 11
pattern -> 12
reasoning -> 13
representation -> 14
system -> 15
thinking -> 16
token -> 17
train -> 18
understanding -> 19


In [4]:
# Step 3: Tokenize the document by looking up the token ID of every word,
# keeping the words in their original order
tokenized = list(map(word_to_token.__getitem__, words))

print("\nTokenized document (first 50 tokens):")
preview_length = 50
print(tokenized[:preview_length])


Tokenized document (first 50 tokens):
[16, 19, 2, 1, 16, 17, 3, 7, 9, 16, 3, 17, 7, 6, 10, 0, 13, 8, 16, 1, 19, 11, 6, 15, 12, 1, 4, 13, 0, 11, 11, 1, 17, 10, 6, 16, 10, 11, 12, 17, 16, 6, 6, 18, 19, 9, 12, 6, 14, 12]


# A) Tokenizing with OpenAI

In [5]:
import tiktoken

# Load the "cl100k_base" encoding (tokenizer used in GPT-4/3.5)
enc = tiktoken.get_encoding("cl100k_base")

# Sample sentence to run through the tokenizer
text = "What is intelligence? Can machines think or understand meaning?"

# Round trip: text -> token IDs -> text
tokens = enc.encode(text)
decoded = enc.decode(tokens)

# Show the input, its token IDs, the decoded text, and the token count
print("\nOriginal Text:\n", text)
print("\nTokens (IDs):\n", tokens)
print("\nBack to Text:\n", decoded)
print("\nNumber of tokens:", len(tokens))



Original Text:
 What is intelligence? Can machines think or understand meaning?

Tokens (IDs):
 [3923, 374, 11478, 30, 3053, 12933, 1781, 477, 3619, 7438, 30]

Back to Text:
 What is intelligence? Can machines think or understand meaning?

Number of tokens: 11


# B) Tokenizing with HuggingFace

In [6]:
from transformers import AutoTokenizer

# Sentence to tokenize
text = "What is intelligence? Can machines think or understand meaning?"

# Fetch the tokenizer registered under the "gpt2" checkpoint name
tokenizer = AutoTokenizer.from_pretrained("gpt2")

# Two-stage conversion: split the text into token strings first,
# then map each token string to its integer ID
tokens = tokenizer.tokenize(text)
token_ids = tokenizer.convert_tokens_to_ids(tokens)

print("\nOriginal Text:\n", text)
print("\nTokenizer Output:")
# One "token -> id" line per token, in document order
for tok, tid in zip(tokens, token_ids):
    print(f"{tok} -> {tid}")

print("\nTotal tokens:", len(tokens))


  from .autonotebook import tqdm as notebook_tqdm
None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.



Original Text:
 What is intelligence? Can machines think or understand meaning?

Tokenizer Output:
What -> 2061
Ġis -> 318
Ġintelligence -> 4430
? -> 30
ĠCan -> 1680
Ġmachines -> 8217
Ġthink -> 892
Ġor -> 393
Ġunderstand -> 1833
Ġmeaning -> 3616
? -> 30

Total tokens: 11
