In [1]:
%pip install transformers datasets sentencepiece accelerate jieba scikit-learn

You should consider upgrading via the '/Users/danilkladnitsky/.pyenv/versions/3.10.4/bin/python -m pip install --upgrade pip' command.[0m[33m
[0mNote: you may need to restart the kernel to use updated packages.


## Jieba Tokenizer

In [3]:
import os
import json
import jieba
from tokenizers import Tokenizer
from tokenizers.models import WordPiece, BPE
from tokenizers.trainers import WordPieceTrainer, BpeTrainer
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.normalizers import NFKC
from transformers import BertTokenizerFast


def create_jieba_tokenizer(dataset_path, tokenizer_name, approach="wordpiece") -> BertTokenizerFast:
    os.makedirs(tokenizer_name, exist_ok=True)

    vocab_output_path = os.path.join(tokenizer_name, "vocab.txt")
    train_path = os.path.join(tokenizer_name, "train.txt")

    # Load JSON lines and extract sentences
    with open(dataset_path, "r", encoding="utf-8") as f:
        lines = f.readlines()

    sentences = []
    for line in lines:
        data = json.loads(line.strip())
        if "prompt" in data and "completion" in data:
            sentences.append(data["prompt"] + data["completion"])
        elif "input" in data:
            sentences.append(data["input"])
        elif "text" in data:
            sentences.append(data["text"])

    # Write segmented sentences to vocab.txt and train.txt
    with open(vocab_output_path, "w", encoding="utf-8") as vocab_out, \
         open(train_path, "w", encoding="utf-8") as train_out:
        for sentence in sentences:
            segmented = " ".join(jieba.cut(sentence.strip()))
            vocab_out.write(segmented + "\n")
            train_out.write(segmented + "\n")

    # Create tokenizer model
    if approach == "wordpiece":
        tokenizer = Tokenizer(WordPiece(unk_token="[UNK]"))
        trainer = WordPieceTrainer(
            vocab_size=30000,
            special_tokens=["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"]
        )
    elif approach == "bpe":
        tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
        trainer = BpeTrainer(
            vocab_size=30000,
            special_tokens=["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"]
        )
    else:
        raise ValueError(f"Invalid approach: {approach}")

    tokenizer.normalizer = NFKC()
    tokenizer.pre_tokenizer = Whitespace()

    # Train tokenizer
    tokenizer.train([train_path], trainer)

    tokenizer_json_path = os.path.join(tokenizer_name, "tokenizer.json")
    tokenizer.save(tokenizer_json_path)

    # Create HF-compatible tokenizer
    hf_tokenizer = BertTokenizerFast(
        tokenizer_file=tokenizer_json_path,
        vocab_file=vocab_output_path,
        unk_token="[UNK]",
        sep_token="[SEP]",
        pad_token="[PAD]",
        cls_token="[CLS]",
        mask_token="[MASK]"
    )
    hf_tokenizer.save_pretrained(tokenizer_name)

    return hf_tokenizer

In [5]:
tokenizer = create_jieba_tokenizer(
    dataset_path="../datasets/s3_prompts_hsk1.jsonl",
    tokenizer_name="hsk1-bpe-tokenizer",
    approach="bpe"
)

tokenizer.tokenize("你不用工作吗？")

Building prefix dict from the default dictionary ...
Loading model from cache /var/folders/p9/gl1s91rn2fv_77662_261gb80000gn/T/jieba.cache
Loading model cost 0.279 seconds.
Prefix dict has been built successfully.







['你', '不', '用', '工作', '吗', '?']

# Train

In [2]:
import torch
from datasets import Dataset, load_dataset
from transformers import (GPT2LMHeadModel,
    DataCollatorForLanguageModeling, Trainer, TrainingArguments, BertTokenizer
)
TOKENIZERS_PARALLELISM=False

class ChineseGPTTrainer:
    def __init__(
        self,
        model_name: str,
        dataset_path: str,
        output_dir: str,
        test_size: float = 0.1,
        max_length: int = 64,
        batch_size: int = 4,
        gradient_accumulation_steps: int = 4,
        epochs: int = 3,
        learning_rate: float = 5e-5,
        tokenizer: BertTokenizer = None
    ):
        self.model_name = model_name
        self.dataset_path = dataset_path
        self.output_dir = output_dir
        self.test_size = test_size
        self.max_length = max_length
        self.batch_size = batch_size
        self.gradient_accumulation_steps = gradient_accumulation_steps
        self.epochs = epochs
        self.learning_rate = learning_rate
        self.tokenizer = tokenizer

        # Tokenizer and model
        self.model = GPT2LMHeadModel.from_pretrained(model_name)

        if self.tokenizer.pad_token is None:
            self.tokenizer.pad_token = self.tokenizer.eos_token
            self.model.config.pad_token_id = self.tokenizer.pad_token_id

        self.train_dataset, self.eval_dataset = self.load_and_tokenize_data()

        self.data_collator = DataCollatorForLanguageModeling(
            tokenizer=self.tokenizer,
            mlm=False,
        )

        self.training_args = TrainingArguments(
            output_dir=self.output_dir,
            overwrite_output_dir=True,
            num_train_epochs=self.epochs,
            per_device_train_batch_size=self.batch_size,
            gradient_accumulation_steps=self.gradient_accumulation_steps,
            learning_rate=self.learning_rate,
            save_steps=500,
            save_total_limit=1,
            eval_steps=100,
            logging_steps=20,
            logging_first_step=True,
            prediction_loss_only=True,
            disable_tqdm=False,
            report_to="none",
            fp16=torch.cuda.is_available(),
            dataloader_num_workers=2,
            push_to_hub=False,
        )

        self.trainer = Trainer(
            model=self.model,
            args=self.training_args,
            train_dataset=self.train_dataset,
            eval_dataset=self.eval_dataset,
            data_collator=self.data_collator,
            tokenizer=self.tokenizer,
        )

    def load_and_tokenize_data(self):
        # Detect file format
        if self.dataset_path.endswith(".jsonl"):
            dataset = load_dataset("json", data_files=self.dataset_path)["train"]
        else:
            with open(self.dataset_path, encoding="utf-8") as f:
                lines = [line.strip() for line in f if line.strip()]
            dataset = Dataset.from_list([{"text": l} for l in lines])

        # Split dataset
        train_dataset, eval_dataset = dataset.train_test_split(test_size=self.test_size, seed=42).values()

        def tokenize(example):
            # If structured fields exist
            if "prompt" in example and "completion" in example:
                text = example["prompt"] + example["completion"]
            elif "input" in example and "labels" in example:
                # Optional: support for mask format if needed
                text = example["input"]
            else:
                text = example["text"]

            tokenized = self.tokenizer(
                text,
                truncation=True,
                max_length=self.max_length,
                padding="max_length"
            )

            # Set labels for LM: same as input_ids, but pad tokens get -100
            input_ids = tokenized["input_ids"]
            tokenized["labels"] = [
                token_id if token_id != self.tokenizer.pad_token_id else -100
                for token_id in input_ids
            ]

            return tokenized

        return train_dataset.map(tokenize), eval_dataset.map(tokenize)

    def train(self):
        self.trainer.train()

    def save(self):
        self.model.save_pretrained(self.output_dir)
        self.tokenizer.save_pretrained(self.output_dir)

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
import os
DATASET_PATH = "../datasets/" + "s3_prompts_hsk2.jsonl"
SAVE_MODEL_PATH = "../models/"

os.environ["TOKENIZERS_PARALLELISM"] = "false"

trainer = ChineseGPTTrainer(
    model_name="uer/gpt2-chinese-cluecorpussmall",
    dataset_path=DATASET_PATH,
    output_dir=SAVE_MODEL_PATH + "hsk2-jieba-bpe",
    tokenizer=create_jieba_tokenizer(dataset_path=DATASET_PATH, tokenizer_name="hsk2-bpe-tokenizer", approach="bpe"),
    max_length=64,
    batch_size=4,
    epochs=3,
    learning_rate=5e-5,
    gradient_accumulation_steps=4,
    test_size=0.1,
)

trainer.train()
trainer.save()

Building prefix dict from the default dictionary ...
Loading model from cache /var/folders/p9/gl1s91rn2fv_77662_261gb80000gn/T/jieba.cache
Loading model cost 0.287 seconds.
Prefix dict has been built successfully.







Map: 100%|██████████| 6472/6472 [00:01<00:00, 4307.19 examples/s]
Map: 100%|██████████| 720/720 [00:00<00:00, 2737.75 examples/s]
  self.trainer = Trainer(
`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Step,Training Loss
1,9.744
20,5.4073
40,3.2515
60,2.5059
80,2.3097
100,2.1517
120,2.1363
140,2.0096
160,2.0279
180,1.9676


# Hyper Search

In [275]:
import copy
import os
from types import SimpleNamespace


class HyperparamSearch:
    def __init__(self, base_config):
        self.base_config = base_config
        self.results = []

    def search(self, param_grid):
        """
        param_grid: dict of parameter lists, e.g.
        {
            "batch_size": [2, 4],
            "learning_rate": [5e-5, 2e-5],
            "epochs": [3, 5]
        }
        """
        import itertools
        keys = list(param_grid.keys())
        combinations = list(itertools.product(*param_grid.values()))

        for i, combo in enumerate(combinations):
            print(f"\n🚀 Running config {i+1}/{len(combinations)}")
            config = copy.deepcopy(self.base_config)
            for k, v in zip(keys, combo):
                setattr(config, k, v)

            trainer = ChineseGPTTrainer(
                model_name=config.model_name,
                dataset_path=config.dataset_path,
                output_dir=os.path.join(config.output_dir, f"run_{i}"),
                test_size=config.test_size,
                max_length=config.max_length,
                batch_size=config.batch_size,
                gradient_accumulation_steps=config.gradient_accumulation_steps,
                epochs=config.epochs,
                learning_rate=config.learning_rate,
                tokenizer=config.tokenizer
            )

            trainer.train()
            metrics = trainer.trainer.evaluate()
            loss = metrics.get("eval_loss", float("inf"))

            self.results.append({
                "run": i,
                "config": combo,
                "loss": loss
            })

        self.results.sort(key=lambda x: x["loss"])
        return self.results[0]  # return best config
    

SAVE_MODEL_PATH = "../../models/hsk1/"

base_config = SimpleNamespace(
    model_name="uer/gpt2-chinese-cluecorpussmall",
    dataset_path="../datasets/hsk1-dataset.json",
    output_dir=SAVE_MODEL_PATH + "grid_search/",
    test_size=0.1,
    max_length=64,
    batch_size=2,
    gradient_accumulation_steps=4,
    epochs=3,
    learning_rate=5e-5
)

tokenizer_list = [
    create_jieba_tokenizer(dataset_path="../datasets/labeled_dataset.json", tokenizer_name="hsk1-bpe-tokenizer", approach="bpe"),
]

param_grid = {
    "batch_size": [4],
    "learning_rate": [5e-5],
    "epochs": [3, 4, 5, 7, 9, 12, 15],
    "tokenizer": tokenizer_list
}

searcher = HyperparamSearch(base_config)
best = searcher.search(param_grid)

print("\n✅ Best configuration:")
print(best)





🚀 Running config 1/7


Map: 100%|██████████| 1673/1673 [00:00<00:00, 3640.74 examples/s]
Map: 100%|██████████| 186/186 [00:00<00:00, 4113.04 examples/s]
  self.trainer = Trainer(
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Step,Training Loss
1,9.9865
20,5.1976
40,3.217
60,2.5362
80,2.271
100,2.1505
120,2.1396
140,1.9195
160,2.0071
180,1.9654


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av


🚀 Running config 2/7


Map: 100%|██████████| 1673/1673 [00:00<00:00, 3616.56 examples/s]
Map: 100%|██████████| 186/186 [00:00<00:00, 3235.25 examples/s]
  self.trainer = Trainer(
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Step,Training Loss
1,9.9865
20,5.1936
40,3.2067
60,2.5246
80,2.2596
100,2.1404
120,2.1303
140,1.9089
160,1.9968
180,1.9537


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av


🚀 Running config 3/7


Map: 100%|██████████| 1673/1673 [00:00<00:00, 3965.62 examples/s]
Map: 100%|██████████| 186/186 [00:00<00:00, 4039.17 examples/s]
  self.trainer = Trainer(
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Step,Training Loss
1,9.9865
20,5.1912
40,3.2006
60,2.5176
80,2.2532
100,2.1351
120,2.1246
140,1.903
160,1.9908
180,1.9482


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av


🚀 Running config 4/7


Map: 100%|██████████| 1673/1673 [00:00<00:00, 3902.24 examples/s]
Map: 100%|██████████| 186/186 [00:00<00:00, 4021.88 examples/s]
  self.trainer = Trainer(
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Step,Training Loss
1,9.9865
20,5.1885
40,3.1939
60,2.5096
80,2.2463
100,2.1296
120,2.1193
140,1.8975
160,1.9847
180,1.9411


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av


🚀 Running config 5/7


Map: 100%|██████████| 1673/1673 [00:00<00:00, 2886.29 examples/s]
Map: 100%|██████████| 186/186 [00:00<00:00, 4090.89 examples/s]
  self.trainer = Trainer(
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Step,Training Loss
1,9.9865
20,5.187
40,3.1902
60,2.5052
80,2.2427
100,2.1268
120,2.1165
140,1.8942
160,1.9811
180,1.9381


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av


🚀 Running config 6/7


Map: 100%|██████████| 1673/1673 [00:00<00:00, 3172.42 examples/s]
Map: 100%|██████████| 186/186 [00:00<00:00, 3735.82 examples/s]
  self.trainer = Trainer(
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Step,Training Loss
1,9.9865
20,5.1857
40,3.187
60,2.5012
80,2.2395
100,2.1245
120,2.1139
140,1.8927
160,1.978
180,1.9354


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av


🚀 Running config 7/7


Map: 100%|██████████| 1673/1673 [00:00<00:00, 3445.64 examples/s]
Map: 100%|██████████| 186/186 [00:00<00:00, 3804.12 examples/s]
  self.trainer = Trainer(
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Step,Training Loss
1,9.9865
20,5.1849
40,3.1851
60,2.4989
80,2.2375
100,2.1228
120,2.1128
140,1.8925
160,1.9762
180,1.933


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av


✅ Best configuration:
{'run': 3, 'config': (4, 5e-05, 7, BertTokenizerFast(name_or_path='', vocab_size=755, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=False, added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	2: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	3: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	4: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}
)), 'loss': 1.8088364601135254}


In [267]:
from transformers import GPT2LMHeadModel, AutoTokenizer
import torch
import re

class ChineseSentenceGenerator:
    def __init__(self, model_path: str, device: str = None):
        self.tokenizer = AutoTokenizer.from_pretrained(model_path)
        self.model = GPT2LMHeadModel.from_pretrained(model_path)

        self.model.eval()
        self.device = device or ("cuda" if torch.cuda.is_available() else "cpu")
        self.model.to(self.device)

        # Ensure pad token is defined
        if self.tokenizer.pad_token is None:
            self.tokenizer.pad_token = self.tokenizer.eos_token
            self.model.config.pad_token_id = self.tokenizer.pad_token_id

    def generate(self, word: str, max_length: int = 60, num_return_sequences: int = 1) -> list:
        prompt = f"请用词语“{word}”造句："
        inputs = self.tokenizer(prompt, return_tensors="pt").to(self.device)

        with torch.no_grad():
            outputs = self.model.generate(
                input_ids=inputs["input_ids"],
                attention_mask=inputs["attention_mask"],
                max_length=max_length,
                do_sample=True,
                top_k=50,
                top_p=0.9,
                temperature=0.8,
                num_return_sequences=num_return_sequences,
                repetition_penalty=1.3,
                pad_token_id=self.tokenizer.pad_token_id,
                eos_token_id=self.tokenizer.eos_token_id
            )

        print(self.tokenizer.pad_token_id, self.tokenizer.eos_token_id)

        results = []
        for output in outputs:
            decoded = self.tokenizer.decode(output, skip_special_tokens=True)
            print(decoded)
            sentence = decoded.replace(prompt, "").replace(" ", "").strip()
            sentence = re.split(r"[。！？]", sentence)[0] + "。"  # stop at first punctuation
            results.append(sentence)

        return results
    
generator = ChineseSentenceGenerator("../models/hsk1-gpt2-jieba")

results = generator.generate("学生", max_length=60, num_return_sequences=3)
for i, sent in enumerate(results):
    print(f"📝 Sentence {i+1}: {sent}")

0 None
请 用 词语 “ 学生 ” 造句 : 很多 学生 去 了 。 这 本书 。 。 。 我 一个 都 不 认识 。 学生 。 ! 。 。 电影 的 工 。 。 时 。 。 。 我 。 。 。 。 。 点觉 。 电影 。 。 我 。 ” 。 。 。 。 。 ”
请 用 词语 “ 学生 ” 造句 : 有个 是 学生 。 的 人 了 吗 ? ? 这有 你 ! 我 在 一个 朋友家 。 他 。 她 认识 。 。 。 。 吗 ? 。 。 。 。 。 。 。 。 本 小说 。 。 。 ” 。 。 本 。 。 。 ” 。
请 用 词语 “ 学生 ” 造句 : 很多 学生 买 了 这 本书 。 。 。 很会 ! 。 。 。 雨 。 。 。 电影 不怎么样 。 电影 好看 吗 ? 。 。 。 好看 。 好看 。 谢谢 。 。 。 好看 。 。 买些 。 。 好看 。 饭店 。 。 买些 ,
📝 Sentence 1: 请用词语“学生”造句:很多学生去了。
📝 Sentence 2: 请用词语“学生”造句:有个是学生。
📝 Sentence 3: 请用词语“学生”造句:很多学生买了这本书。
