In [1]:
%pip install transformers datasets sentencepiece accelerate jieba scikit-learn

You should consider upgrading via the '/Users/danilkladnitsky/.pyenv/versions/3.10.4/bin/python -m pip install --upgrade pip' command.[0m[33m
[0mNote: you may need to restart the kernel to use updated packages.


## Jieba Tokenizer

In [2]:
import os
import json
import jieba
from tokenizers import Tokenizer
from tokenizers.models import WordPiece, BPE
from tokenizers.trainers import WordPieceTrainer, BpeTrainer
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.normalizers import NFKC
from transformers import BertTokenizerFast


def create_jieba_tokenizer(dataset_path, tokenizer_name, approach="wordpiece") -> BertTokenizerFast:
    os.makedirs(tokenizer_name, exist_ok=True)

    vocab_output_path = os.path.join(tokenizer_name, "vocab.txt")
    train_path = os.path.join(tokenizer_name, "train.txt")

    # Load JSON lines and extract sentences
    with open(dataset_path, "r", encoding="utf-8") as f:
        lines = f.readlines()

    sentences = []
    for line in lines:
        data = json.loads(line.strip())
        if "prompt" in data and "completion" in data:
            sentences.append(data["prompt"] + data["completion"])
        elif "input" in data:
            sentences.append(data["input"])
        elif "text" in data:
            sentences.append(data["text"])

    # Write segmented sentences to vocab.txt and train.txt
    with open(vocab_output_path, "w", encoding="utf-8") as vocab_out, \
         open(train_path, "w", encoding="utf-8") as train_out:
        for sentence in sentences:
            segmented = " ".join(jieba.cut(sentence.strip()))
            vocab_out.write(segmented + "\n")
            train_out.write(segmented + "\n")

    # Create tokenizer model
    if approach == "wordpiece":
        tokenizer = Tokenizer(WordPiece(unk_token="[UNK]"))
        trainer = WordPieceTrainer(
            vocab_size=30000,
            special_tokens=["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"]
        )
    elif approach == "bpe":
        tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
        trainer = BpeTrainer(
            vocab_size=30000,
            special_tokens=["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"]
        )
    else:
        raise ValueError(f"Invalid approach: {approach}")

    tokenizer.normalizer = NFKC()
    tokenizer.pre_tokenizer = Whitespace()

    # Train tokenizer
    tokenizer.train([train_path], trainer)

    tokenizer_json_path = os.path.join(tokenizer_name, "tokenizer.json")
    tokenizer.save(tokenizer_json_path)

    # Create HF-compatible tokenizer
    hf_tokenizer = BertTokenizerFast(
        tokenizer_file=tokenizer_json_path,
        vocab_file=vocab_output_path,
        unk_token="[UNK]",
        sep_token="[SEP]",
        pad_token="[PAD]",
        cls_token="[CLS]",
        mask_token="[MASK]"
    )
    hf_tokenizer.save_pretrained(tokenizer_name)

    return hf_tokenizer

  from .autonotebook import tqdm as notebook_tqdm


In [7]:
tokenizer = create_jieba_tokenizer(
    dataset_path="../datasets/s3_prompts_hsk1.jsonl",
    tokenizer_name="hsk1-wordpiece-tokenizer",
    approach="wordpiece"
)

tokenizer.tokenize("朋友在商店买东西。")






['[UNK]', '。']

# Train

In [20]:
import torch
from datasets import Dataset, load_dataset
from transformers import (GPT2LMHeadModel,
    DataCollatorForLanguageModeling, Trainer, TrainingArguments, BertTokenizer
)
TOKENIZERS_PARALLELISM=False

class ChineseGPTTrainer:
    def __init__(
        self,
        model_name: str,
        dataset_path: str,
        output_dir: str,
        test_size: float = 0.1,
        max_length: int = 64,
        batch_size: int = 4,
        gradient_accumulation_steps: int = 4,
        epochs: int = 3,
        learning_rate: float = 5e-5,
        tokenizer: BertTokenizer = None
    ):
        self.model_name = model_name
        self.dataset_path = dataset_path
        self.output_dir = output_dir
        self.test_size = test_size
        self.max_length = max_length
        self.batch_size = batch_size
        self.gradient_accumulation_steps = gradient_accumulation_steps
        self.epochs = epochs
        self.learning_rate = learning_rate
        self.tokenizer = tokenizer

        # Tokenizer and model
        self.model = GPT2LMHeadModel.from_pretrained(model_name)

        if self.tokenizer.pad_token is None:
            self.tokenizer.pad_token = self.tokenizer.eos_token
            self.model.config.pad_token_id = self.tokenizer.pad_token_id

        self.train_dataset, self.eval_dataset = self.load_and_tokenize_data()

        self.data_collator = DataCollatorForLanguageModeling(
            tokenizer=self.tokenizer,
            mlm=False,
        )

        self.training_args = TrainingArguments(
            output_dir=self.output_dir,
            overwrite_output_dir=True,
            num_train_epochs=self.epochs,
            per_device_train_batch_size=self.batch_size,
            gradient_accumulation_steps=self.gradient_accumulation_steps,
            learning_rate=self.learning_rate,
            save_steps=500,
            save_total_limit=1,
            eval_steps=100,
            logging_steps=20,
            logging_first_step=True,
            prediction_loss_only=True,
            disable_tqdm=False,
            report_to="none",
            fp16=torch.cuda.is_available(),
            dataloader_num_workers=2,
            push_to_hub=False,
        )

        self.trainer = Trainer(
            model=self.model,
            args=self.training_args,
            train_dataset=self.train_dataset,
            eval_dataset=self.eval_dataset,
            data_collator=self.data_collator,
            tokenizer=self.tokenizer,
        )

    def load_and_tokenize_data(self):
        # Detect file format
        if self.dataset_path.endswith(".jsonl") or self.dataset_path.endswith(".json"):
            dataset = load_dataset("json", data_files=self.dataset_path)["train"]
        else:
            with open(self.dataset_path, encoding="utf-8") as f:
                lines = [line.strip() for line in f if line.strip()]
            dataset = Dataset.from_list([{"text": l} for l in lines])

        # Split dataset
        train_dataset, eval_dataset = dataset.train_test_split(test_size=self.test_size, seed=42).values()

        def tokenize(example):
            # If structured fields exist
            if "prompt" in example and "completion" in example:
                text = example["prompt"] + example["completion"]
            elif "input" in example and "labels" in example:
                # Optional: support for mask format if needed
                text = example["input"]
            else:
                text = example["text"]

            tokenized = self.tokenizer(
                text,
                truncation=True,
                max_length=self.max_length,
                padding="max_length"
            )

            # Set labels for LM: same as input_ids, but pad tokens get -100
            input_ids = tokenized["input_ids"]
            tokenized["labels"] = [
                token_id if token_id != self.tokenizer.pad_token_id else -100
                for token_id in input_ids
            ]

            return tokenized

        return train_dataset.map(tokenize), eval_dataset.map(tokenize)

    def train(self):
        self.trainer.train()

    def save(self):
        self.model.save_pretrained(self.output_dir)
        self.tokenizer.save_pretrained(self.output_dir)

In [4]:
import os
DATASET_PATH = "../datasets/" + "s3_prompts_hsk1.jsonl"
SAVE_MODEL_PATH = "../models/"

os.environ["TOKENIZERS_PARALLELISM"] = "false"

trainer = ChineseGPTTrainer(
    model_name="uer/gpt2-chinese-cluecorpussmall",
    dataset_path=DATASET_PATH,
    output_dir=SAVE_MODEL_PATH + "hsk1-jieba-bpe",
    tokenizer=create_jieba_tokenizer(dataset_path=DATASET_PATH, tokenizer_name="hsk2-bpe-tokenizer", approach="bpe"),
    max_length=64,
    batch_size=4,
    epochs=3,
    learning_rate=5e-5,
    gradient_accumulation_steps=4,
    test_size=0.1,
)

trainer.train()
trainer.save()

Building prefix dict from the default dictionary ...
Loading model from cache /var/folders/p9/gl1s91rn2fv_77662_261gb80000gn/T/jieba.cache
Loading model cost 0.287 seconds.
Prefix dict has been built successfully.







Map: 100%|██████████| 6472/6472 [00:01<00:00, 4307.19 examples/s]
Map: 100%|██████████| 720/720 [00:00<00:00, 2737.75 examples/s]
  self.trainer = Trainer(
`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Step,Training Loss
1,9.744
20,5.4073
40,3.2515
60,2.5059
80,2.3097
100,2.1517
120,2.1363
140,2.0096
160,2.0279
180,1.9676


# Hyper Search

In [24]:
import copy
import os
from types import SimpleNamespace


class HyperparamSearch:
    def __init__(self, base_config):
        self.base_config = base_config
        self.results = []

    def search(self, param_grid):
        """
        param_grid: dict of parameter lists, e.g.
        {
            "batch_size": [2, 4],
            "learning_rate": [5e-5, 2e-5],
            "epochs": [3, 5]
        }
        """
        import itertools
        keys = list(param_grid.keys())
        combinations = list(itertools.product(*param_grid.values()))

        for i, combo in enumerate(combinations):
            print(f"\n🚀 Running config {i+1}/{len(combinations)}")
            config = copy.deepcopy(self.base_config)
            print("🧪 Config:", {k: v for k, v in zip(keys, combo)})
            for k, v in zip(keys, combo):
                setattr(config, k, v)

            trainer = ChineseGPTTrainer(
                model_name=config.model_name,
                dataset_path=config.dataset_path,
                output_dir=os.path.join(config.output_dir, f"run_{i}"),
                test_size=config.test_size,
                max_length=config.max_length,
                batch_size=config.batch_size,
                gradient_accumulation_steps=config.gradient_accumulation_steps,
                epochs=config.epochs,
                learning_rate=config.learning_rate,
                tokenizer=config.tokenizer
            )

            trainer.train()
            metrics = trainer.trainer.evaluate()
            loss = metrics.get("eval_loss", float("inf"))

            self.results.append({
                "run": i,
                "config": combo,
                "loss": loss
            })

        print(self.results)
        self.results.sort(key=lambda x: x["loss"])
        return self.results[0]  # return best config
    

SAVE_MODEL_PATH = "../models/"

base_config = SimpleNamespace(
    model_name="uer/gpt2-chinese-cluecorpussmall",
    dataset_path="../datasets/hsk1-dataset.json",
    output_dir=SAVE_MODEL_PATH + "grid_search/",
    test_size=0.1,
    max_length=64,
    batch_size=4,
    gradient_accumulation_steps=4,
    epochs=3,
    learning_rate=5e-5
)

tokenizer_list = [
    create_jieba_tokenizer(dataset_path="../datasets/hsk1-dataset.json", tokenizer_name="hsk1-bpe-tokenizer", approach="bpe"),
]

base_model_list = [
    "uer/gpt2-distil-chinese-cluecorpussmall",
    "ckiplab/gpt2-base-chinese",
]

param_grid = {
    "batch_size": [4],
    "learning_rate": [2e-5, 3e-5, 5e-5],
    "epochs": [5, 7],
    "gradient_accumulation_steps": [2, 4, 8],
    "tokenizer": tokenizer_list,
    "model_name": base_model_list,
    "max_length": [64, 100]
}

searcher = HyperparamSearch(base_config)
best = searcher.search(param_grid)

print("\n✅ Best configuration:")
print(best)





🚀 Running config 1/72
🧪 Config: {'batch_size': 4, 'learning_rate': 2e-05, 'epochs': 5, 'gradient_accumulation_steps': 2, 'tokenizer': BertTokenizerFast(name_or_path='', vocab_size=758, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=False, added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	2: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	3: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	4: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, specia

Map: 100%|██████████| 1673/1673 [00:00<00:00, 4185.76 examples/s]
Map: 100%|██████████| 186/186 [00:00<00:00, 4026.30 examples/s]
  self.trainer = Trainer(


Step,Training Loss
1,10.1985
20,6.986
40,4.8858
60,3.824
80,3.4205
100,3.0102
120,2.7308
140,2.5319
160,2.4718
180,2.3326



🚀 Running config 2/72
🧪 Config: {'batch_size': 4, 'learning_rate': 2e-05, 'epochs': 5, 'gradient_accumulation_steps': 2, 'tokenizer': BertTokenizerFast(name_or_path='', vocab_size=758, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=False, added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	2: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	3: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	4: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=T

Map: 100%|██████████| 1673/1673 [00:00<00:00, 3038.96 examples/s]
Map: 100%|██████████| 186/186 [00:00<00:00, 3029.06 examples/s]
  self.trainer = Trainer(


Step,Training Loss
1,10.182
20,6.9626
40,4.8469
60,3.7932
80,3.4043
100,2.9927
120,2.7532
140,2.5217
160,2.4782
180,2.3275



🚀 Running config 3/72
🧪 Config: {'batch_size': 4, 'learning_rate': 2e-05, 'epochs': 5, 'gradient_accumulation_steps': 2, 'tokenizer': BertTokenizerFast(name_or_path='', vocab_size=758, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=False, added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	2: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	3: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	4: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=T

Map: 100%|██████████| 1673/1673 [00:00<00:00, 4331.25 examples/s]
Map: 100%|██████████| 186/186 [00:00<00:00, 3980.41 examples/s]
  self.trainer = Trainer(


Step,Training Loss
1,12.3251
20,6.131
40,3.814
60,2.9584
80,2.6237
100,2.3808
120,2.2421
140,2.1411
160,2.1442
180,2.0694



🚀 Running config 4/72
🧪 Config: {'batch_size': 4, 'learning_rate': 2e-05, 'epochs': 5, 'gradient_accumulation_steps': 2, 'tokenizer': BertTokenizerFast(name_or_path='', vocab_size=758, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=False, added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	2: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	3: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	4: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=T

Map: 100%|██████████| 1673/1673 [00:00<00:00, 3195.29 examples/s]
Map: 100%|██████████| 186/186 [00:00<00:00, 3072.60 examples/s]
  self.trainer = Trainer(


Step,Training Loss
1,12.6042
20,6.1417
40,3.7848
60,3.0458
80,2.7053
100,2.4159
120,2.291
140,2.1896
160,2.1773
180,2.0856



🚀 Running config 5/72
🧪 Config: {'batch_size': 4, 'learning_rate': 2e-05, 'epochs': 5, 'gradient_accumulation_steps': 4, 'tokenizer': BertTokenizerFast(name_or_path='', vocab_size=758, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=False, added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	2: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	3: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	4: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=T

Map: 100%|██████████| 1673/1673 [00:00<00:00, 4321.57 examples/s]
Map: 100%|██████████| 186/186 [00:00<00:00, 4030.96 examples/s]
  self.trainer = Trainer(


Step,Training Loss
1,10.0724
20,6.8879
40,4.7336
60,3.6332
80,3.1615
100,2.8293
120,2.687
140,2.3306
160,2.3381
180,2.2594



🚀 Running config 6/72
🧪 Config: {'batch_size': 4, 'learning_rate': 2e-05, 'epochs': 5, 'gradient_accumulation_steps': 4, 'tokenizer': BertTokenizerFast(name_or_path='', vocab_size=758, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=False, added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	2: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	3: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	4: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=T

Map: 100%|██████████| 1673/1673 [00:00<00:00, 3154.31 examples/s]
Map: 100%|██████████| 186/186 [00:00<00:00, 2903.26 examples/s]
  self.trainer = Trainer(


Step,Training Loss
1,10.0937
20,6.876
40,4.7258
60,3.622
80,3.1562
100,2.8223
120,2.6722
140,2.3567
160,2.3604
180,2.2563



🚀 Running config 7/72
🧪 Config: {'batch_size': 4, 'learning_rate': 2e-05, 'epochs': 5, 'gradient_accumulation_steps': 4, 'tokenizer': BertTokenizerFast(name_or_path='', vocab_size=758, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=False, added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	2: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	3: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	4: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=T

Map: 100%|██████████| 1673/1673 [00:00<00:00, 4333.10 examples/s]
Map: 100%|██████████| 186/186 [00:00<00:00, 4096.84 examples/s]
  self.trainer = Trainer(


Step,Training Loss
1,12.3255
20,6.0195
40,3.5363
60,2.6845
80,2.4036
100,2.2954
120,2.2869
140,2.0152
160,2.0552
180,1.9853



🚀 Running config 8/72
🧪 Config: {'batch_size': 4, 'learning_rate': 2e-05, 'epochs': 5, 'gradient_accumulation_steps': 4, 'tokenizer': BertTokenizerFast(name_or_path='', vocab_size=758, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=False, added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	2: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	3: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	4: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=T

Map: 100%|██████████| 1673/1673 [00:00<00:00, 3151.39 examples/s]
Map: 100%|██████████| 186/186 [00:00<00:00, 2994.06 examples/s]
  self.trainer = Trainer(


Step,Training Loss
1,12.5056
20,6.0538
40,3.6027
60,2.783
80,2.4699
100,2.3268
120,2.323
140,2.0471
160,2.0925
180,2.0096



🚀 Running config 9/72
🧪 Config: {'batch_size': 4, 'learning_rate': 2e-05, 'epochs': 5, 'gradient_accumulation_steps': 8, 'tokenizer': BertTokenizerFast(name_or_path='', vocab_size=758, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=False, added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	2: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	3: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	4: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=T

Map: 100%|██████████| 1673/1673 [00:00<00:00, 4221.37 examples/s]
Map: 100%|██████████| 186/186 [00:00<00:00, 3885.43 examples/s]
  self.trainer = Trainer(


Step,Training Loss
1,10.0592
20,6.8524


KeyboardInterrupt: 

In [17]:
from transformers import GPT2LMHeadModel, AutoTokenizer
import torch
import re

class ChineseSentenceGenerator:
    def __init__(self, model_path: str, device: str = None):
        self.tokenizer = AutoTokenizer.from_pretrained(model_path)
        self.model = GPT2LMHeadModel.from_pretrained(model_path)

        self.model.eval()
        self.device = device or ("cuda" if torch.cuda.is_available() else "cpu")
        self.model.to(self.device)

        # Ensure pad token is defined
        if self.tokenizer.pad_token is None:
            self.tokenizer.pad_token = self.tokenizer.eos_token
            self.model.config.pad_token_id = self.tokenizer.pad_token_id

    def generate(self, word: str, max_length: int = 60, num_return_sequences: int = 1) -> list:
        prompt = f"请用词语“{word}”造句："
        inputs = self.tokenizer(prompt, return_tensors="pt").to(self.device)

        with torch.no_grad():
            outputs = self.model.generate(
                input_ids=inputs["input_ids"],
                attention_mask=inputs["attention_mask"],
                max_length=max_length,
                do_sample=True,
                top_k=50,
                top_p=0.9,
                temperature=0.8,
                num_return_sequences=num_return_sequences,
                repetition_penalty=1.3,
                pad_token_id=self.tokenizer.pad_token_id,
                eos_token_id=self.tokenizer.eos_token_id
            )

        print(self.tokenizer.pad_token_id, self.tokenizer.eos_token_id)

        results = []
        for output in outputs:
            decoded = self.tokenizer.decode(output, skip_special_tokens=True)
            print(decoded)
            sentence = decoded.replace(prompt, "").replace(" ", "").strip()
            sentence = re.split(r"[。！？]", sentence)[0] + "。"  # stop at first punctuation
            results.append(sentence)

        return results
    
generator = ChineseSentenceGenerator("../models/hsk1-gpt2-jieba")

results = generator.generate("老师", max_length=60, num_return_sequences=3)
for i, sent in enumerate(results):
    print(f"📝 Sentence {i+1}: {sent}")

0 None
请 用 词语 “ 老师 ” 造句 : 我 是 老师 。 。 你 的 书 。 , 不是 吗 ? ? ? 这是 我 坐在 ! 。 些 呢 。 。 。 。 。 。 。 。 。 。 。 。 再有 。 。 。 。 。 ! 。 。 。 对 。 。 。 。
请 用 词语 “ 老师 ” 造句 : 我 是 老师 。 。 ! 你 的 书 , 老师 了 。 , 对 吗 ? ” “ 我 。 ” 。 。 。 几分钟 。 。 再有 。 。 几分钟 吗 ? 什么 呢 。 ? 。 。 时 的 。 。 。 。 。 ” 。 。
请 用 词语 “ 老师 ” 造句 : 他 是 老师 。 。 吗 ? 我 不是 , 老师 。 。 你 的 书 。 。 。 。 狗狗 。 我 。 了 ! 。 几分钟 什么 。 。 。 。 。 。 。 。 。 。 再有 。 。 。 。 。 。 。 。 。 。 。 。
📝 Sentence 1: 请用词语“老师”造句:我是老师。
📝 Sentence 2: 请用词语“老师”造句:我是老师。
📝 Sentence 3: 请用词语“老师”造句:他是老师。
