In [1]:
from transformers import GPT2LMHeadModel, AutoTokenizer
import torch

# Directory of the fine-tuned checkpoint, given relative to the notebook's
# working directory. Both the tokenizer and the model weights are read from it.
MODEL_PATH="../../models/hsk1-gpt2-jieba-2"

# Load your fine-tuned model and tokenizer
# NOTE(review): use_fast=False requests the slow (pure-Python) tokenizer;
# presumably the checkpoint ships a custom jieba-based vocabulary that has no
# fast implementation -- TODO confirm.
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, use_fast=False)
model = GPT2LMHeadModel.from_pretrained(MODEL_PATH)
# Switch the model to evaluation mode; this notebook only runs inference.
model.eval()

# Ensure pad token is set
# Falls back to the EOS token when the tokenizer defines no pad token, then
# mirrors the resulting id onto the model config so generate() can pad batches.
tokenizer.pad_token = tokenizer.pad_token or tokenizer.eos_token
model.config.pad_token_id = tokenizer.pad_token_id

def generate_sentence(prompt, max_length=40, num_return_sequences=1):
    """Sample text from the module-level ``model`` starting from ``prompt``.

    Args:
        prompt: Text that is tokenized with the module-level ``tokenizer`` and
            fed to ``model.generate`` as the conditioning input.
        max_length: Forwarded unchanged to ``model.generate`` as ``max_length``.
        num_return_sequences: Number of sampled sequences requested from
            ``model.generate``.

    Returns:
        A list with one string per generated sequence. Each string is the
        decoded sequence with special tokens skipped and every ASCII space
        removed. NOTE(review): the decoded text presumably still starts with
        the prompt itself -- callers in this notebook strip it afterwards.
    """
    encoded = tokenizer(prompt, return_tensors="pt")

    # Sampling configuration: top-k / nucleus sampling with a mild temperature
    # and repetition penalty; padding and EOS ids come from the model config.
    sampling_kwargs = dict(
        max_length=max_length,
        do_sample=True,
        top_k=50,
        top_p=0.95,
        temperature=0.8,
        num_return_sequences=num_return_sequences,
        pad_token_id=model.config.pad_token_id,
        eos_token_id=model.config.eos_token_id,
        repetition_penalty=1.1,
    )

    # Inference only, so gradient tracking is switched off.
    with torch.no_grad():
        sequences = model.generate(
            input_ids=encoded["input_ids"],
            attention_mask=encoded["attention_mask"],
            **sampling_kwargs
        )

    # Decode each sequence and drop the spaces left between decoded tokens.
    return [
        tokenizer.decode(sequence, skip_special_tokens=True).replace(" ", "")
        for sequence in sequences
    ]



  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def clean_output(output, prompt):
    """Strip the echoed prompt and sentence-final marks from generated text.

    Args:
        output: Generated text that is expected to start with (an echo of)
            ``prompt``.
        prompt: The prompt that was fed to the generator. Only its length is
            used.

    Returns:
        ``output`` with its first ``len(prompt) - 2`` characters dropped
        (never fewer than zero), then with ASCII spaces and the full-width
        marks "。", "！" and "？" removed. Other punctuation (ASCII marks,
        quotes, commas) is left in place.
    """
    # Drop the echoed prompt by length. The start index is clamped at 0:
    # without the clamp a prompt shorter than two characters produced a
    # negative index, which kept only the last one or two characters of the
    # output instead of all of it.
    # NOTE(review): the offset of 2 is kept from the original code, whose
    # comment said "minus 1"; the sample outputs recorded in this notebook
    # still begin with prompt residue, so the right offset should be confirmed.
    start = max(len(prompt) - 2, 0)
    output = output[start:]

    # Remove all spaces.
    output = output.replace(" ", "")

    # Remove full-width sentence-final punctuation only.
    for mark in ("。", "！", "？"):
        output = output.replace(mark, "")

    return output

def generate_fn(word, num_return_sequences=1):
    """Generate one cleaned sentence for ``word``.

    Builds the sentence-making prompt for ``word``, samples
    ``num_return_sequences`` candidates with ``generate_sentence`` (capped at
    ``max_length=40``), and returns only the first candidate after it has been
    passed through ``clean_output``.

    Args:
        word: Target vocabulary word interpolated into the prompt.
        num_return_sequences: Number of candidates to sample; only the first
            one is returned.

    Returns:
        The cleaned first candidate as a string.
    """
    prompt = f"为词语“{word}”造句："
    candidates = generate_sentence(
        prompt,
        max_length=40,
        num_return_sequences=num_return_sequences,
    )
    first_candidate = candidates[0]
    return clean_output(first_candidate, prompt)

# Smoke test: generate one cleaned sentence for the word '我' and let the
# notebook display the returned string as the cell output.
generate_fn('我')

'句造句:很高兴认识你他们,”””!?””!””””””””!””””'

In [3]:
import jieba

def benchmark_hsk_word_usage(generate, hsk_vocab: list, verbose: bool = True):
    """
    Measure how often generated sentences contain their target HSK word.

    For every word, ``generate`` is called to produce a sentence, the sentence
    is segmented with ``jieba.lcut``, and the word counts as "used" when it
    appears as one of the resulting tokens (an exact token match, not a
    substring match).

    Args:
        generate: callable invoked as ``generate(word, num_return_sequences=1)``;
            its return value is passed to ``jieba.lcut`` as the sentence
        hsk_vocab (list): list of HSK words (str)
        verbose (bool): whether to print one status line per successful call
            (errors are always printed)

    Returns:
        dict with keys ``coverage`` (used / total, or 0 for an empty
        vocabulary), ``used``, ``total`` and ``results`` (one record per word
        with ``word``, ``sentence``, ``used`` and, on failure, ``error``)
    """
    records = []

    for target in hsk_vocab:
        try:
            generated = generate(target, num_return_sequences=1)
            found = target in jieba.lcut(generated)

            records.append({
                "word": target,
                "sentence": generated,
                "used": found
            })

            if verbose:
                marker = "❌" if not found else "✅"
                print(f"{marker} {target} → {generated}")

        except Exception as exc:
            # Best effort: a failure for one word is reported and recorded,
            # then the benchmark moves on to the next word.
            print(f"⚠️ Error generating for word '{target}': {exc}")
            records.append({
                "word": target,
                "sentence": None,
                "used": False,
                "error": str(exc)
            })

    # Every hit was recorded with used=True, so the hit count can be derived
    # from the records instead of being tracked inside the loop.
    hit_count = sum(1 for record in records if record["used"])

    return {
        "coverage": hit_count / len(hsk_vocab) if hsk_vocab else 0,
        "used": hit_count,
        "total": len(hsk_vocab),
        "results": records
    }

In [4]:
# Location of the HSK vocabulary list (plain text, one word per line)
hsk_vocabulary_path = "../../datasets/vocabulary/hsk1.txt"

# Load the vocabulary: trim each line and keep only the non-empty ones
with open(hsk_vocabulary_path, encoding="utf-8") as f:
    hsk_words = list(filter(None, map(str.strip, f)))

# For a quick partial run, uncomment: hsk_words = hsk_words[:10]
# Benchmark every word with the prompt-based generator defined above
result = benchmark_hsk_word_usage(generate_fn, hsk_words)


print(f"\n✅ Coverage: {result['used']}/{result['total']} = {round(result['coverage'] * 100, 2)}%")

Building prefix dict from the default dictionary ...
Loading model from cache /var/folders/p9/gl1s91rn2fv_77662_261gb80000gn/T/jieba.cache
Loading model cost 0.274 seconds.
Prefix dict has been built successfully.


❌ 大 → 句造句:在你们国家吃米饭吗?,她”””!!她去好“!?”“在桌子上她这儿我很好!,好
❌ 多 → 句造句:我明年想学汉语我很好,””””””!?,”””很少.
❌ 高兴 → 句造句:我喜欢你.??,十”?””!”妈妈?我们今后一星期没有见面的机会了.好“!好“!””
✅ 好 → 句造句:你回来了???!,我很好,.””!?,好“!””去?”“在桌子上
❌ 冷 → 句造句:回来:“!?,?哪里”去,我喜欢你”是的是的好“!””去””””十
❌ 漂亮 → 句造句:很高兴你是对的??”,那儿?”“我很好!造句:看看你都做了些什么我去看医生”DVD”你在读书”我想她喜欢“!”那些很高兴认识你
❌ 热 → 句造句:我很好,?,””!?.他””””!”””你们认识吗”
❌ 少 → 句造句:我十二点出去吃午饭??好“!?”!?”””?””!,,对!?,和你们认识吗
❌ 小 → 句造句:我喜欢你??””!”””””!””!”””!””?”你不想吃饭吗大学生
❌ 不 → 句造句:我在打电话不是这样!,我很好有看?昨天””!!??谢谢”去”””””?
❌ 没有 → 句造句:我喜欢你?!,十.不是我我很好你“””””本书我在吃面这是你的打字机吗谢谢
✅ 很 → 句造句:他是医生来的?”汉语打:“?”汉语和John是好朋友?我很好,”””””””””
❌ 太 → 句造句:我没说”!?,??,,十她去哪里是120吗”这里”?”“在桌子上”””””””
❌ 都 → 句造句:你住在哪儿??,?”“在桌子上””!”””””””””””””你十岁了吗
❌ 会 → 句造句:我的爱人很会做饭?,.我很好!””?””!,好“!”””!?妈妈
❌ 能 → 句造句:我十二点出去吃午饭?,?”“我很好,””!”””””””””””””
✅ 想 → 句造句:吃午饭了吗??”汉语吗是120吗这是我的电脑想我叫小雨那个,?”,这里好“!”想!””想她
❌ 和 → 句造句:很高兴认识你??,十??””””.”!””吗?”””””
❌ 这 → 句”造句:她昨天来见我了喝茶?十!????哪里,”””我在吃面汉语”我的汉语不太好!他
❌ 那 → 句造句:我十二点出去吃午饭?””,好“!好“!?!??妈妈”””””
❌ 喂 → 句造句:我在打电话??””””””””!”””!”””””””很少
❌ 多少 → 句造句

KeyboardInterrupt: 