In [3]:
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
import torch
MODEL = "openai/gpt-oss-20b"

In [27]:
# A study of Tokenizers: https://learn.deeplearning.ai/courses/how-transformer-llms-work/lesson/e34gz/tokenizers
tokenizer = AutoTokenizer.from_pretrained(MODEL)
emotions = ["happy", "sad", "anxious", "calm", "depressed", "elated"]
emotion_tokens = [tokenizer(emo).input_ids for emo in emotions]
print(emotion_tokens)

[[51917], [121007], [270, 87, 1595], [5842, 76], [613, 26974], [296, 780]]


In [28]:
DSM_DEPRESSION = "Depression is a mood disorder that causes a persistent feeling of sadness and loss of interest."
print(DSM_DEPRESSION)
print(tokenizer(DSM_DEPRESSION))
print("Depression")
print(tokenizer("Depression"))
print("depression")
print(tokenizer("depression"))
print("Depressed")
print(tokenizer("Depressed"))
print("Depress")
print(tokenizer("Depress"))
print("Dep")
print(tokenizer("Dep"))
print("pression")
print(tokenizer("pression"))
print("dep")
print(tokenizer("dep"))
print("Depp")
print(tokenizer("Depp"))
print("press")
print(tokenizer("press"))
print("pressed")
print(tokenizer("pressed"))
print("pression")
print(tokenizer("pression"))

Depression is a mood disorder that causes a persistent feeling of sadness and loss of interest.
{'input_ids': [8000, 15210, 382, 261, 25199, 31145, 484, 16431, 261, 43952, 10656, 328, 79740, 326, 6266, 328, 3425, 13], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
Depression
{'input_ids': [8000, 15210], 'attention_mask': [1, 1]}
depression
{'input_ids': [613, 4521], 'attention_mask': [1, 1]}
Depressed
{'input_ids': [8000, 23406], 'attention_mask': [1, 1]}
Depress
{'input_ids': [8000, 799], 'attention_mask': [1, 1]}
Dep
{'input_ids': [8000], 'attention_mask': [1]}
pression
{'input_ids': [4521], 'attention_mask': [1]}
dep
{'input_ids': [27480], 'attention_mask': [1]}
Depp
{'input_ids': [1923, 654], 'attention_mask': [1, 1]}
press
{'input_ids': [2020], 'attention_mask': [1]}
pressed
{'input_ids': [26974], 'attention_mask': [1]}
pression
{'input_ids': [4521], 'attention_mask': [1]}


In [29]:
depression_tokens = [8000, 15210, 613, 4521, 799, 23406]
print(depression_tokens)
print([tokenizer.decode(tok) for tok in depression_tokens])
# decode: token id -> text, encode: text -> token id, code ~= token

[8000, 15210, 613, 4521, 799, 23406]
['Dep', 'ression', 'de', 'pression', 'ress', 'ressed']


Observations: "Depression" and "depression" have different tokens. And they are both split into two tokens each.

In [30]:
# A list of colors in RGB for representing the tokens
colors = [
    '102;194;165', '252;141;98', '141;160;203',
    '231;138;195', '166;216;84', '255;217;47'
]

def show_tokens(sentence: str, tokenizer_name: str):
    """ Show the tokens each separated by a different color """

    # Load the tokenizer and tokenize the input
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
    token_ids = tokenizer(sentence).input_ids

    # Extract vocabulary length
    print(f"Vocab length: {len(tokenizer)}")

    # Print a colored list of tokens
    for idx, t in enumerate(token_ids):
        print(
            f'\x1b[0;30;48;2;{colors[idx % len(colors)]}m' +
            tokenizer.decode(t) +
            '\x1b[0m',
            end=' '
        )

In [31]:
show_tokens(DSM_DEPRESSION, "openai/gpt-oss-20b")
print()
show_tokens(DSM_DEPRESSION, "openai/gpt-oss-120b")

Vocab length: 200019
[0;30;48;2;102;194;165mDep[0m [0;30;48;2;252;141;98mression[0m [0;30;48;2;141;160;203m is[0m [0;30;48;2;231;138;195m a[0m [0;30;48;2;166;216;84m mood[0m [0;30;48;2;255;217;47m disorder[0m [0;30;48;2;102;194;165m that[0m [0;30;48;2;252;141;98m causes[0m [0;30;48;2;141;160;203m a[0m [0;30;48;2;231;138;195m persistent[0m [0;30;48;2;166;216;84m feeling[0m [0;30;48;2;255;217;47m of[0m [0;30;48;2;102;194;165m sadness[0m [0;30;48;2;252;141;98m and[0m [0;30;48;2;141;160;203m loss[0m [0;30;48;2;231;138;195m of[0m [0;30;48;2;166;216;84m interest[0m [0;30;48;2;255;217;47m.[0m 
Vocab length: 200019
[0;30;48;2;102;194;165mDep[0m [0;30;48;2;252;141;98mression[0m [0;30;48;2;141;160;203m is[0m [0;30;48;2;231;138;195m a[0m [0;30;48;2;166;216;84m mood[0m [0;30;48;2;255;217;47m disorder[0m [0;30;48;2;102;194;165m that[0m [0;30;48;2;252;141;98m causes[0m [0;30;48;2;141;160;203m a[0m [0;30;48;2;231;138;195m persistent[0m [0;30;48

Observation: gpt-oss models share the same tokenizer between two model sizes.

In [5]:
# A study of Transformers: https://learn.deeplearning.ai/courses/how-transformer-llms-work/lesson/m3nid/model-example
model = AutoModelForCausalLM.from_pretrained(
    MODEL,
    device_map="cpu",
    torch_dtype="auto",
    trust_remote_code=True,
)

ValueError: Using a `device_map`, `tp_plan`, `torch.device` context manager or setting `torch.set_default_device(device)` requires `accelerate`. You can install it with `pip install accelerate`