In [None]:
import jieba

class HSKValidator:
    def __init__(self, model, tokenizer, hsk_words, generate_fn):
        """
        model, tokenizer: your fine-tuned model and tokenizer
        hsk_words: list of Chinese words to test
        generate_fn: function to generate sentence given a word (e.g., generate_sentence)
        """
        self.model = model
        self.tokenizer = tokenizer
        self.hsk_words = hsk_words
        self.generate_fn = generate_fn

    def validate(self):
        total = len(self.hsk_words)
        generated = 0
        correct = 0
        results = []

        for word in self.hsk_words:
            try:
                sentence = self.generate_fn(word)
                generated += 1
                used = word in sentence or word in jieba.lcut(sentence)
                if used:
                    correct += 1
                results.append({"word": word, "sentence": sentence, "used": used})
            except Exception as e:
                results.append({"word": word, "sentence": None, "used": False, "error": str(e)})

        coverage = round(100 * generated / total, 2)
        usage = round(100 * correct / generated, 2) if generated > 0 else 0.0

        return {
            "coverage_%": coverage,
            "usage_%": usage,
            "total": total,
            "generated": generated,
            "correct": correct,
            "results": results
        }

In [None]:

validator = HSKValidator(model, tokenizer, hsk_words, generate_sentence)
report = validator.validate()

print("Coverage:", report["coverage_%"], "%")
print("Correct word usage:", report["usage_%"], "%")

# Print example results
for r in report["results"][:5]:
    print(f"{r['word']} → {r['sentence']} ✅" if r['used'] else f"{r['word']} → {r['sentence']} ❌")

In [17]:
from transformers import GPT2LMHeadModel, AutoTokenizer
import torch

MODEL_PATH="../../models/hsk1-gpt2"

# Load your fine-tuned model and tokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, use_fast=False)
model = GPT2LMHeadModel.from_pretrained(MODEL_PATH)
model.eval()

# Ensure pad token is set
tokenizer.pad_token = tokenizer.pad_token or tokenizer.eos_token
model.config.pad_token_id = tokenizer.pad_token_id

def generate_sentence(word, max_length=40):
    prompt = f"输入词语：{word}, 生成句子:"
    inputs = tokenizer(prompt, return_tensors="pt")

    with torch.no_grad():
        output_ids = model.generate(
            input_ids=inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            max_length=max_length,
            do_sample=True,
            top_k=40,
            top_p=0.9,
            temperature=0.4,
            num_return_sequences=1,
            pad_token_id=model.config.pad_token_id,
            eos_token_id=tokenizer.eos_token_id
        )

    tokens = tokenizer.convert_ids_to_tokens(output_ids[0])
    decoded = tokenizer.convert_tokens_to_string(tokens)

    # Remove the prompt and unnecessary spaces
    result = decoded.replace(prompt, "").replace(" ", "").strip()

    # Optionally stop at first sentence-ending punctuation
    import re
    result = re.split(r"[。！？]", result)[0] + "。"
    return result



The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'BertTokenizer'. 
The class this function is called from is 'GPT2Tokenizer'.


TypeError: expected str, bytes or os.PathLike object, not NoneType

In [14]:
import re

def clean_output(output, prompt):
    output = output.replace("[CLS]", "").replace("[SEP]", "").strip()
    output = output.replace(prompt, "").strip()
    # Trim to first sentence-ending punctuation
    output = re.split(r"[。！？]", output)[0] + "。"
    return output

def generate_fn(word):
    prompt = f"输入词语：{word},生成句子:"
    result = generate_sentence(word)
    print(result)
    return clean_output(result, prompt)

In [16]:
generate_fn("我")

[CLS]输入词语：我,生成句子:[SEP],我生成句子:我们在吃面。


',我生成句子:我们在吃面。'