In [33]:
from huggingface_hub import login
from dotenv import load_dotenv
import os
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
import torch
from IPython.display import Markdown, display

# Load variables from the local .env file into the process environment.
load_dotenv()

# Authenticate against the Hugging Face Hub with the token read from the
# environment; new_session=False is passed through to login() unchanged.
token = os.environ.get("HUGGINGFACE_HUB_TOKEN")
login(new_session=False, token=token)

In [4]:
# Checkpoint identifier reused by the later cells (tokenizer, model, pipelines).
model_id = "google/gemma-3-1b-it"

# Load the tokenizer first, then the causal-LM weights, from the same checkpoint.
tokenizer, model = (
    AutoTokenizer.from_pretrained(model_id),
    AutoModelForCausalLM.from_pretrained(model_id),
)

# Part1 - Understanding Tokens in Large Language Models (5%)

- Q1: What is the vocabulary size of the Gemma-3-1B tokenizer? (1%)
- Q3: Encode the string「作業一」to token IDs (Gemma-3-1B). (1%)
- Q4: Which pair correctly reports the longest decoded token string in the vocabulary (token_id, character_length)? (1%)
- Q5: Given the prefix 「阿姆斯特朗旋風迴旋加速噴氣式阿姆斯特朗砲」, which single Chinese character is the model’s most probable next token? (1%)

In [None]:
# Q1: size of the tokenizer's base vocabulary.
print(f"Q1 {tokenizer.vocab_size}")

# Q3: token IDs for the string, without BOS/EOS or other special tokens.
q3_token_ids = tokenizer.encode("作業一", add_special_tokens=False)
print(f"Q3 {q3_token_ids}")

Q1 262144
Q3 [46306, 237009]


In [13]:
# Q4: scan every ID in the base vocabulary and keep the one whose decoded
# string is longest. The strict `>` means the lowest ID wins on ties.
#
# The decoded text is held in `decoded_text` rather than `token`, so this loop
# no longer overwrites the Hugging Face credential stored in the
# notebook-global `token` by the login cell.
max_len, max_token_id, longest_text = -1, None, None
for token_id in range(tokenizer.vocab_size):
    decoded_text = tokenizer.decode(token_id)
    if len(decoded_text) > max_len:
        # Remember the text as well, so the winner is not decoded a second time.
        max_len, max_token_id, longest_text = len(decoded_text), token_id, decoded_text
print("Q4", (max_token_id, max_len), "->", repr(longest_text))

Q4 (137, 31) -> '\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n'


In [None]:
# Q5: find the single most probable next token after the given prefix.
input_ids = tokenizer.encode("阿姆斯特朗旋風迴旋加速噴氣式阿姆斯特朗砲", return_tensors="pt")

# Inference only: no autograd graph is needed for one forward pass.
with torch.no_grad():
    outputs = model(input_ids)

# logits: [batch_size, sequence_length, vocab_size]
print(outputs.logits.shape)

# Raw confidence scores (may be negative) --- softmax ---> probability distribution.
# Aside: reminiscent of the Metropolis acceptance-probability function in simulated annealing.
# Only the last position matters: its distribution is over the token that follows the prefix.
probabilities = torch.softmax(outputs.logits[:, -1, :], dim=-1)

# Author's note: full circle -- the LM head is the embedding table.
# Fix: `.item()` is valid only on a one-element tensor, so it must be applied to
# the argmax result (batch size is 1), not to the full vocabulary distribution.
next_token_id = torch.argmax(probabilities, dim=-1).item()
tokenizer.decode(next_token_id)

torch.Size([1, 18, 262144])


'塔'

# Part2 - System and User Prompt Engineering (3%)

- Q6: instruction following (1%)
- Q7: restrictive system prompt (1%)
- Q8: language constraint (1%)

In [42]:
pipe = pipeline("text-generation", model_id)

# Q6: the same question, first in Traditional Chinese and then in English,
# asked under one shared system prompt. Each entry is (display label, user question).
labelled_questions = [
    ('Q6-1: ', "皮卡丘源自於哪個動畫作品?"),
    ('Q6-2: ', "Which anime is Pikachu derived from?"),
]

for label, question in labelled_questions:
    messages = [
        {"role": "system", "content": "You are a smart agent."},
        {"role": "user", "content": question},
    ]
    outputs = pipe(messages, max_new_tokens=2000, pad_token_id=pipe.tokenizer.eos_token_id)
    # The pipeline returns the whole conversation; the final entry is the model's reply.
    response = outputs[0]["generated_text"][-1]['content']
    display(Markdown(label + response))

Device set to use mps:0


Q6-1: 皮卡丘的起源於 **《超時預言家》 (Time Traveler’s Tale)**。

它最初是《超時預言家》的短篇故事，並於2008年首次播出。


Q6-2: Pikachu is derived from the **Pokémon** franchise! 

Specifically, he's based on the adorable and energetic Pokémon Pikachu, created by Satoshi Tajiri in 1996. 😊 

Let me know if you’d like to learn more about the Pokémon franchise!

In [44]:
# https://ai.google.dev/gemma/docs/core/prompt-structure
# Gemma's instruction-tuned models only work with two roles: user and model.
# Rendering a conversation that contains a system message shows how the chat
# template handles that role when it builds the prompt text.
#
# The conversation is defined in this cell instead of reusing whatever
# `messages` the most recently executed cell left behind, so the printed
# template is the same on every top-to-bottom run.
template_demo_messages = [
    {"role": "system", "content": "You can only answer: I don’t know."},
    {"role": "user", "content": "皮卡丘源自於哪個動畫作品?"},
]
print(tokenizer.apply_chat_template(template_demo_messages, tokenize=False))

<bos><start_of_turn>user
 You can only answer: I don’t know.

皮卡丘源自於哪個動畫作品?<end_of_turn>



In [None]:
# Q7: a restrictive system prompt that limits the model to one fixed reply.
pipe = pipeline("text-generation", model_id)

messages = [
    {"role": "system", "content": "You can only answer: I don’t know."},
    {"role": "user", "content": "皮卡丘源自於哪個動畫作品?"},
]
outputs = pipe(
    messages,
    max_new_tokens=2000,
    pad_token_id=pipe.tokenizer.eos_token_id,
)

# The final entry of the returned conversation is the model's reply.
response = outputs[0]["generated_text"][-1]['content']
display(Markdown(f"{'Q7: '}{response}"))

Device set to use mps:0


Q7: I don’t know.

In [46]:
# Q8: a language constraint in the system prompt, paired with a Chinese question.
pipe = pipeline("text-generation", model_id)

messages = [
    {"role": "system", "content": "Answer in English only"},
    {"role": "user", "content": "皮卡丘源自於哪個動畫作品?"},
]
outputs = pipe(
    messages,
    max_new_tokens=2000,
    pad_token_id=pipe.tokenizer.eos_token_id,
)

# The final entry of the returned conversation is the model's reply.
response = outputs[0]["generated_text"][-1]['content']
display(Markdown(f"{'Q8: '}{response}"))

Device set to use mps:0


Q8: Pikachu originated from the anime series **Pokemon**.
