In [1]:
!pip install jieba transformers torch

Collecting jieba
  Downloading jieba-0.42.1.tar.gz (19.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m19.2/19.2 MB[0m [31m1.2 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0mm
[?25h  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting transformers
  Downloading transformers-4.51.3-py3-none-any.whl (10.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.4/10.4 MB[0m [31m1.0 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0mm
[?25hCollecting torch
  Downloading torch-2.6.0-cp310-none-macosx_11_0_arm64.whl (66.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m66.5/66.5 MB[0m [31m1.2 MB/s[0m eta [36m0:00:00[0m00:01[0m00:02[0mm
Collecting huggingface-hub<1.0,>=0.30.0
  Downloading huggingface_hub-0.30.2-py3-none-any.whl (481 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m481.4/481.4 KB[0m [31m718.6 kB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting safetensors>=0.4.3
  Downlo

In [2]:
import jieba

hsk1_sentences = [
    "我喝水。", "你是学生。", "这是书。", 
    "妈妈好。", "爸爸喝茶。", "你好吗？"
]

# Add HSK words to Jieba's dictionary
hsk1_words = ["我", "喝", "水", "学生", "书", "妈妈", "爸爸", "茶", "你", "好"]
with open("hsk1_words.txt", "w") as f:
    f.write("\n".join(hsk1_words))
jieba.load_userdict("hsk1_words.txt")

# Tokenize a sentence
print(jieba.lcut("我喝水。"))  # Output: ['我', '喝', '水', '。']

Building prefix dict from the default dictionary ...
Dumping model to file cache /var/folders/p9/gl1s91rn2fv_77662_261gb80000gn/T/jieba.cache
Loading model cost 0.303 seconds.
Prefix dict has been built successfully.


['我', '喝水', '。']


In [4]:
import random

def generate_cloze(sentence):
    words = jieba.lcut(sentence)
    mask_pos = random.randint(0, len(words)-1)
    correct = words[mask_pos]
    words[mask_pos] = "[MASK]"
    return {
        "sentence": "".join(words),
        "correct": correct,
        "options": random.sample(hsk1_words, k=3) + [correct]
    }

cloze_data = [generate_cloze(s) for s in hsk1_sentences * 10]  # 60 examples
cloze_data

[{'sentence': '我喝水[MASK]', 'correct': '。', 'options': ['学生', '书', '爸爸', '。']},
 {'sentence': '你是[MASK]。', 'correct': '学生', 'options': ['爸爸', '茶', '水', '学生']},
 {'sentence': '这是[MASK]。', 'correct': '书', 'options': ['茶', '我', '你', '书']},
 {'sentence': '[MASK]好。', 'correct': '妈妈', 'options': ['书', '你', '我', '妈妈']},
 {'sentence': '爸爸[MASK]。', 'correct': '喝茶', 'options': ['学生', '喝', '茶', '喝茶']},
 {'sentence': '[MASK]吗？', 'correct': '你好', 'options': ['学生', '妈妈', '茶', '你好']},
 {'sentence': '[MASK]喝水。', 'correct': '我', 'options': ['爸爸', '好', '喝', '我']},
 {'sentence': '[MASK]是学生。', 'correct': '你', 'options': ['学生', '我', '妈妈', '你']},
 {'sentence': '这是[MASK]。', 'correct': '书', 'options': ['妈妈', '水', '我', '书']},
 {'sentence': '妈妈[MASK]。', 'correct': '好', 'options': ['爸爸', '我', '妈妈', '好']},
 {'sentence': '[MASK]喝茶。', 'correct': '爸爸', 'options': ['书', '爸爸', '水', '爸爸']},
 {'sentence': '你好吗[MASK]', 'correct': '？', 'options': ['妈妈', '我', '好', '？']},
 {'sentence': '[MASK]喝水。', 'correct': '我', 'options':

In [6]:
from transformers import BertTokenizerFast, BertForMaskedLM, AutoModelForMaskedLM

# Tokenizer (using Jieba-based vocab)
tokenizer = BertTokenizerFast.from_pretrained("bert-base-chinese", max_len=10)
model = BertForMaskedLM.from_pretrained("bert-base-chinese").to("cuda")

# Masked LM Training
optimizer = AutoModelForMaskedLM(model.parameters(), lr=5e-5)

for epoch in range(10):
    for example in cloze_data:
        inputs = tokenizer(example["sentence"], return_tensors="pt").to("cuda")
        labels = tokenizer(example["correct"], return_tensors="pt")["input_ids"].to("cuda")
        
        outputs = model(**inputs, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


KeyboardInterrupt: 

In [None]:
def generate_exercise():
    example = random.choice(cloze_data)
    inputs = tokenizer(example["sentence"], return_tensors="pt").to("cuda")
    logits = model(**inputs).logits
    mask_pos = inputs["input_ids"][0].tolist().index(tokenizer.mask_token_id)
    predicted = tokenizer.decode(logits[0, mask_pos].argmax(-1).item())
    
    return {
        "sentence": example["sentence"],
        "options": example["options"],
        "correct": predicted
    }

print(generate_exercise())
# Output: {"sentence": "我[MASK]水。", "options": ["喝", "是", "在", "有"], "correct": "喝"}