In [2]:
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
import torch



In [3]:
MODEL = "google/gemma-2b"
DEVICE = "cuda:0" # run on my gpu

In [4]:
# A study of Tokenizers: https://learn.deeplearning.ai/courses/how-transformer-llms-work/lesson/e34gz/tokenizers
tokenizer = AutoTokenizer.from_pretrained(MODEL)
emotions = ["happy", "sad", "anxious", "calm", "depressed", "elated"]
emotion_tokens = [tokenizer(emo).input_ids for emo in emotions]
print(emotion_tokens)

[[2, 11896], [2, 37968], [2, 481, 24192], [2, 116051], [2, 3243, 3734], [2, 521, 840]]


In [5]:
DSM_DEPRESSION = "Depression is a mood disorder that causes a persistent feeling of sadness and loss of interest."
print(DSM_DEPRESSION)
print(tokenizer(DSM_DEPRESSION))
print("Depression")
print(tokenizer("Depression"))
print("depression")
print(tokenizer("depression"))
print("Depressed")
print(tokenizer("Depressed"))
print("Depress")
print(tokenizer("Depress"))
print("Dep")
print(tokenizer("Dep"))
print("pression")
print(tokenizer("pression"))
print("dep")
print(tokenizer("dep"))
print("Depp")
print(tokenizer("Depp"))
print("press")
print(tokenizer("press"))
print("pressed")
print(tokenizer("pressed"))
print("pression")
print(tokenizer("pression"))

Depression is a mood disorder that causes a persistent feeling of sadness and loss of interest.
{'input_ids': [2, 116465, 603, 476, 18068, 24283, 674, 10792, 476, 36727, 7965, 576, 51863, 578, 4783, 576, 3273, 235265], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
Depression
{'input_ids': [2, 116465], 'attention_mask': [1, 1]}
depression
{'input_ids': [2, 161067], 'attention_mask': [1, 1]}
Depressed
{'input_ids': [2, 5789, 3734], 'attention_mask': [1, 1, 1]}
Depress
{'input_ids': [2, 5789, 1054], 'attention_mask': [1, 1, 1]}
Dep
{'input_ids': [2, 5789], 'attention_mask': [1, 1]}
pression
{'input_ids': [2, 206753], 'attention_mask': [1, 1]}
dep
{'input_ids': [2, 3243], 'attention_mask': [1, 1]}
Depp
{'input_ids': [2, 1680, 658], 'attention_mask': [1, 1, 1]}
press
{'input_ids': [2, 11355], 'attention_mask': [1, 1]}
pressed
{'input_ids': [2, 49716], 'attention_mask': [1, 1]}
pression
{'input_ids': [2, 206753], 'attention_mask': [1, 1]}


In [6]:
test_tokens = [8000, 15210, 613, 4521, 799, 23406]
print(test_tokens)
print([tokenizer.decode(tok) for tok in test_tokens])
# decode: token id -> text, encode: text -> token id, code ~= token

[8000, 15210, 613, 4521, 799, 23406]
['Chapter', 'лай', ' wi', 'Hello', ' tr', ' sü']


Note: In gpt-oss "Depression" and "depression" have different tokens. And they are both split into two tokens each.

In [7]:
# A list of colors in RGB for representing the tokens
colors = [
    '102;194;165', '252;141;98', '141;160;203',
    '231;138;195', '166;216;84', '255;217;47'
]

def show_tokens(sentence: str, tokenizer_name: str):
    """ Show the tokens each separated by a different color """

    # Load the tokenizer and tokenize the input
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
    token_ids = tokenizer(sentence).input_ids

    # Extract vocabulary length
    print(f"Vocab length: {len(tokenizer)}")

    # Print a colored list of tokens
    for idx, t in enumerate(token_ids):
        print(
            f'\x1b[0;30;48;2;{colors[idx % len(colors)]}m' +
            tokenizer.decode(t) +
            '\x1b[0m',
            end=' '
        )

In [8]:
show_tokens(DSM_DEPRESSION, MODEL)

Vocab length: 256000
[0;30;48;2;102;194;165m<bos>[0m [0;30;48;2;252;141;98mDepression[0m [0;30;48;2;141;160;203m is[0m [0;30;48;2;231;138;195m a[0m [0;30;48;2;166;216;84m mood[0m [0;30;48;2;255;217;47m disorder[0m [0;30;48;2;102;194;165m that[0m [0;30;48;2;252;141;98m causes[0m [0;30;48;2;141;160;203m a[0m [0;30;48;2;231;138;195m persistent[0m [0;30;48;2;166;216;84m feeling[0m [0;30;48;2;255;217;47m of[0m [0;30;48;2;102;194;165m sadness[0m [0;30;48;2;252;141;98m and[0m [0;30;48;2;141;160;203m loss[0m [0;30;48;2;231;138;195m of[0m [0;30;48;2;166;216;84m interest[0m [0;30;48;2;255;217;47m.[0m 

Observation: gpt-oss models share the same tokenizer between two model sizes.

In [9]:
# A study of Transformers: https://learn.deeplearning.ai/courses/how-transformer-llms-work/lesson/m3nid/model-example
torch.cuda.empty_cache()
model = AutoModelForCausalLM.from_pretrained(
    MODEL,
    device_map="cuda:0",
    torch_dtype="auto",
    trust_remote_code=True,
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [10]:
generator = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    return_full_text=False, # False means to not include the prompt text in the returned text
    max_new_tokens=50,
    do_sample=False, # no randomness in the generated text
)

Device set to use cuda:0
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


In [11]:
print(generator(DSM_DEPRESSION)[0]['generated_text'])

The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


 It can affect your thoughts, behavior, and physical health. Depression can be caused by a variety of factors, including genetics, brain chemistry, and life events.

There are many different types of depression, and the symptoms can vary from person to person


In [12]:
model

GemmaForCausalLM(
  (model): GemmaModel(
    (embed_tokens): Embedding(256000, 2048, padding_idx=0)
    (layers): ModuleList(
      (0-17): 18 x GemmaDecoderLayer(
        (self_attn): GemmaAttention(
          (q_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear(in_features=2048, out_features=256, bias=False)
          (v_proj): Linear(in_features=2048, out_features=256, bias=False)
          (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
        )
        (mlp): GemmaMLP(
          (gate_proj): Linear(in_features=2048, out_features=16384, bias=False)
          (up_proj): Linear(in_features=2048, out_features=16384, bias=False)
          (down_proj): Linear(in_features=16384, out_features=2048, bias=False)
          (act_fn): GELUActivation()
        )
        (input_layernorm): GemmaRMSNorm((2048,), eps=1e-06)
        (post_attention_layernorm): GemmaRMSNorm((2048,), eps=1e-06)
      )
    )
    (norm): GemmaRMSNorm((2048,), 

In [13]:
model.model.embed_tokens

Embedding(256000, 2048, padding_idx=0)

In [14]:
model.model # printing the stack of transformer blocks without the LM head component

GemmaModel(
  (embed_tokens): Embedding(256000, 2048, padding_idx=0)
  (layers): ModuleList(
    (0-17): 18 x GemmaDecoderLayer(
      (self_attn): GemmaAttention(
        (q_proj): Linear(in_features=2048, out_features=2048, bias=False)
        (k_proj): Linear(in_features=2048, out_features=256, bias=False)
        (v_proj): Linear(in_features=2048, out_features=256, bias=False)
        (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
      )
      (mlp): GemmaMLP(
        (gate_proj): Linear(in_features=2048, out_features=16384, bias=False)
        (up_proj): Linear(in_features=2048, out_features=16384, bias=False)
        (down_proj): Linear(in_features=16384, out_features=2048, bias=False)
        (act_fn): GELUActivation()
      )
      (input_layernorm): GemmaRMSNorm((2048,), eps=1e-06)
      (post_attention_layernorm): GemmaRMSNorm((2048,), eps=1e-06)
    )
  )
  (norm): GemmaRMSNorm((2048,), eps=1e-06)
  (rotary_emb): GemmaRotaryEmbedding()
)

In [15]:
model.model.layers[0]

GemmaDecoderLayer(
  (self_attn): GemmaAttention(
    (q_proj): Linear(in_features=2048, out_features=2048, bias=False)
    (k_proj): Linear(in_features=2048, out_features=256, bias=False)
    (v_proj): Linear(in_features=2048, out_features=256, bias=False)
    (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
  )
  (mlp): GemmaMLP(
    (gate_proj): Linear(in_features=2048, out_features=16384, bias=False)
    (up_proj): Linear(in_features=2048, out_features=16384, bias=False)
    (down_proj): Linear(in_features=16384, out_features=2048, bias=False)
    (act_fn): GELUActivation()
  )
  (input_layernorm): GemmaRMSNorm((2048,), eps=1e-06)
  (post_attention_layernorm): GemmaRMSNorm((2048,), eps=1e-06)
)

In [16]:

print(DSM_DEPRESSION)
input_ids = tokenizer(DSM_DEPRESSION, return_tensors="pt").input_ids.to(DEVICE) # pt probably means pytorch
input_ids

Depression is a mood disorder that causes a persistent feeling of sadness and loss of interest.


tensor([[     2, 116465,    603,    476,  18068,  24283,    674,  10792,    476,
          36727,   7965,    576,  51863,    578,   4783,    576,   3273, 235265]],
       device='cuda:0')

In [17]:
# Get the output of the model before the lm_head
model_output = model.model(input_ids)

In [18]:
# Get the shape the output the model before the lm_head
model_output[0].shape

torch.Size([1, 18, 2048])

In [19]:
# Get the output of the lm_head
lm_head_output = model.lm_head(model_output[0])
lm_head_output.shape

torch.Size([1, 18, 256000])

In [20]:
token_id = lm_head_output[0,-1].argmax(-1)
token_id

tensor(1165, device='cuda:0')

In [21]:
tokenizer.decode(token_id)

' It'

In [23]:
named_layers = dict(model.named_modules())
print(named_layers)

{'': GemmaForCausalLM(
  (model): GemmaModel(
    (embed_tokens): Embedding(256000, 2048, padding_idx=0)
    (layers): ModuleList(
      (0-17): 18 x GemmaDecoderLayer(
        (self_attn): GemmaAttention(
          (q_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear(in_features=2048, out_features=256, bias=False)
          (v_proj): Linear(in_features=2048, out_features=256, bias=False)
          (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
        )
        (mlp): GemmaMLP(
          (gate_proj): Linear(in_features=2048, out_features=16384, bias=False)
          (up_proj): Linear(in_features=2048, out_features=16384, bias=False)
          (down_proj): Linear(in_features=16384, out_features=2048, bias=False)
          (act_fn): GELUActivation()
        )
        (input_layernorm): GemmaRMSNorm((2048,), eps=1e-06)
        (post_attention_layernorm): GemmaRMSNorm((2048,), eps=1e-06)
      )
    )
    (norm): GemmaRMSNorm((204