In [1]:
%pip install transformers datasets sentencepiece accelerate jieba scikit-learn

You should consider upgrading via the '/Users/danilkladnitsky/.pyenv/versions/3.10.4/bin/python -m pip install --upgrade pip' command.[0m[33m
[0mNote: you may need to restart the kernel to use updated packages.


# Dataset

## Jieba Tokenizer

In [179]:
import os
import json
import jieba
from tokenizers import Tokenizer
from tokenizers.models import WordPiece, BPE
from tokenizers.trainers import WordPieceTrainer, BpeTrainer
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.normalizers import NFKC
from transformers import BertTokenizerFast


def create_jieba_tokenizer(dataset_path, tokenizer_name, approach="wordpiece") -> BertTokenizerFast:
    os.makedirs(tokenizer_name, exist_ok=True)

    vocab_output_path = os.path.join(tokenizer_name, "vocab.txt")
    train_path = os.path.join(tokenizer_name, "train.txt")

    # Load JSON lines and extract sentences
    with open(dataset_path, "r", encoding="utf-8") as f:
        lines = f.readlines()

    sentences = []
    for line in lines:
        data = json.loads(line.strip())
        if "prompt" in data and "completion" in data:
            sentences.append(data["prompt"] + data["completion"])
        elif "input" in data:
            sentences.append(data["input"])
        elif "text" in data:
            sentences.append(data["text"])

    # Write segmented sentences to vocab.txt and train.txt
    with open(vocab_output_path, "w", encoding="utf-8") as vocab_out, \
         open(train_path, "w", encoding="utf-8") as train_out:
        for sentence in sentences:
            segmented = " ".join(jieba.cut(sentence.strip()))
            vocab_out.write(segmented + "\n")
            train_out.write(segmented + "\n")

    # Create tokenizer model
    if approach == "wordpiece":
        tokenizer = Tokenizer(WordPiece(unk_token="[UNK]"))
        trainer = WordPieceTrainer(
            vocab_size=30000,
            special_tokens=["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"]
        )
    elif approach == "bpe":
        tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
        trainer = BpeTrainer(
            vocab_size=30000,
            special_tokens=["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"]
        )
    else:
        raise ValueError(f"Invalid approach: {approach}")

    tokenizer.normalizer = NFKC()
    tokenizer.pre_tokenizer = Whitespace()

    # Train tokenizer
    tokenizer.train([train_path], trainer)

    tokenizer_json_path = os.path.join(tokenizer_name, "tokenizer.json")
    tokenizer.save(tokenizer_json_path)

    # Create HF-compatible tokenizer
    hf_tokenizer = BertTokenizerFast(
        tokenizer_file=tokenizer_json_path,
        vocab_file=vocab_output_path,
        unk_token="[UNK]",
        sep_token="[SEP]",
        pad_token="[PAD]",
        cls_token="[CLS]",
        mask_token="[MASK]"
    )
    hf_tokenizer.save_pretrained(tokenizer_name)

    return hf_tokenizer

In [218]:
tokenizer = create_jieba_tokenizer(
    dataset_path="../datasets/labeled_dataset.json",
    tokenizer_name="hsk1-bpe-tokenizer",
    approach="bpe"
)

tokenizer.tokenize("我认识你")






['我', '认识', '你']

# Train

In [181]:
import torch
from datasets import Dataset, load_dataset
from transformers import (GPT2LMHeadModel,
    DataCollatorForLanguageModeling, Trainer, TrainingArguments, BertTokenizer
)
TOKENIZERS_PARALLELISM=False

class ChineseGPTTrainer:
    def __init__(
        self,
        model_name: str,
        dataset_path: str,
        output_dir: str,
        test_size: float = 0.1,
        max_length: int = 64,
        batch_size: int = 4,
        gradient_accumulation_steps: int = 4,
        epochs: int = 3,
        learning_rate: float = 5e-5,
        tokenizer: BertTokenizer = None
    ):
        self.model_name = model_name
        self.dataset_path = dataset_path
        self.output_dir = output_dir
        self.test_size = test_size
        self.max_length = max_length
        self.batch_size = batch_size
        self.gradient_accumulation_steps = gradient_accumulation_steps
        self.epochs = epochs
        self.learning_rate = learning_rate
        self.tokenizer = tokenizer

        # Tokenizer and model
        self.model = GPT2LMHeadModel.from_pretrained(model_name)

        if self.tokenizer.pad_token is None:
            self.tokenizer.pad_token = self.tokenizer.eos_token
            self.model.config.pad_token_id = self.tokenizer.pad_token_id

        self.train_dataset, self.eval_dataset = self.load_and_tokenize_data()

        self.data_collator = DataCollatorForLanguageModeling(
            tokenizer=self.tokenizer,
            mlm=False,
        )

        self.training_args = TrainingArguments(
            output_dir=self.output_dir,
            overwrite_output_dir=True,
            num_train_epochs=self.epochs,
            per_device_train_batch_size=self.batch_size,
            gradient_accumulation_steps=self.gradient_accumulation_steps,
            learning_rate=self.learning_rate,
            save_steps=500,
            save_total_limit=1,
            eval_steps=100,
            logging_steps=20,
            logging_first_step=True,
            prediction_loss_only=True,
            disable_tqdm=False,
            report_to="none",
            fp16=torch.cuda.is_available(),
            dataloader_num_workers=2,
            push_to_hub=False,
        )

        self.trainer = Trainer(
            model=self.model,
            args=self.training_args,
            train_dataset=self.train_dataset,
            eval_dataset=self.eval_dataset,
            data_collator=self.data_collator,
            tokenizer=self.tokenizer,
        )

    def load_and_tokenize_data(self):
        # Detect file format
        if self.dataset_path.endswith(".json"):
            dataset = load_dataset("json", data_files=self.dataset_path)["train"]
        else:
            with open(self.dataset_path, encoding="utf-8") as f:
                lines = [line.strip() for line in f if line.strip()]
            dataset = Dataset.from_list([{"text": l} for l in lines])

        # Split dataset
        train_dataset, eval_dataset = dataset.train_test_split(test_size=self.test_size, seed=42).values()

        def tokenize(example):
            # If structured fields exist
            if "prompt" in example and "completion" in example:
                text = example["prompt"] + example["completion"]
            elif "input" in example and "labels" in example:
                # Optional: support for mask format if needed
                text = example["input"]
            else:
                text = example["text"]

            tokenized = self.tokenizer(
                text,
                truncation=True,
                max_length=self.max_length,
                padding="max_length"
            )

            # Set labels for LM: same as input_ids, but pad tokens get -100
            input_ids = tokenized["input_ids"]
            tokenized["labels"] = [
                token_id if token_id != self.tokenizer.pad_token_id else -100
                for token_id in input_ids
            ]

            return tokenized

        return train_dataset.map(tokenize), eval_dataset.map(tokenize)

    def train(self):
        self.trainer.train()

    def save(self):
        self.model.save_pretrained(self.output_dir)
        self.tokenizer.save_pretrained(self.output_dir)

In [262]:
DATASET_PATH = "../datasets/"
SAVE_MODEL_PATH = "../models/"

trainer = ChineseGPTTrainer(
    model_name="uer/gpt2-chinese-cluecorpussmall",
    dataset_path=DATASET_PATH + "labeled_dataset.json",
    output_dir=SAVE_MODEL_PATH + "hsk1-gpt2-jieba",
    tokenizer=create_jieba_tokenizer(dataset_path="../datasets/labeled_dataset.json", tokenizer_name="hsk1-bpe-tokenizer", approach="bpe"),
    max_length=64,
    batch_size=4,
    epochs=3,
    learning_rate=5e-5,
    gradient_accumulation_steps=4,
    test_size=0.1,
)

trainer.train()
trainer.save()






Generating train split: 18354 examples [00:00, 405998.86 examples/s]
Map: 100%|██████████| 16518/16518 [00:03<00:00, 4358.45 examples/s]
Map: 100%|██████████| 1836/1836 [00:00<00:00, 4611.14 examples/s]
  self.trainer = Trainer(
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Step,Training Loss
1,9.9029
20,5.3327
40,3.3458
60,2.608
80,2.3288
100,2.1079
120,2.0208
140,2.0172
160,1.9735
180,1.9691


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

# Hyper Search

In [None]:
import copy
import os
from types import SimpleNamespace


class HyperparamSearch:
    def __init__(self, base_config):
        self.base_config = base_config
        self.results = []

    def search(self, param_grid):
        """
        param_grid: dict of parameter lists, e.g.
        {
            "batch_size": [2, 4],
            "learning_rate": [5e-5, 2e-5],
            "epochs": [3, 5]
        }
        """
        import itertools
        keys = list(param_grid.keys())
        combinations = list(itertools.product(*param_grid.values()))

        for i, combo in enumerate(combinations):
            print(f"\n🚀 Running config {i+1}/{len(combinations)}")
            config = copy.deepcopy(self.base_config)
            for k, v in zip(keys, combo):
                setattr(config, k, v)

            trainer = ChineseGPTTrainer(
                model_name=config.model_name,
                dataset_path=config.dataset_path,
                output_dir=os.path.join(config.output_dir, f"run_{i}"),
                test_size=config.test_size,
                max_length=config.max_length,
                batch_size=config.batch_size,
                gradient_accumulation_steps=config.gradient_accumulation_steps,
                epochs=config.epochs,
                learning_rate=config.learning_rate,
                tokenizer=config.tokenizer
            )

            trainer.train()
            metrics = trainer.trainer.evaluate()
            loss = metrics.get("eval_loss", float("inf"))

            self.results.append({
                "run": i,
                "config": combo,
                "loss": loss
            })

        self.results.sort(key=lambda x: x["loss"])
        return self.results[0]  # return best config
    

SAVE_MODEL_PATH = "../../models/hsk1/"

base_config = SimpleNamespace(
    model_name="uer/gpt2-chinese-cluecorpussmall",
    dataset_path="../datasets/for_train/hsk1/labeled_dataset.txt",
    output_dir=SAVE_MODEL_PATH + "grid_search/",
    test_size=0.1,
    max_length=64,
    batch_size=2,
    gradient_accumulation_steps=4,
    epochs=3,
    learning_rate=2e-5
)

tokenizer_list = [
    create_jieba_tokenizer(dataset_path="../datasets/labeled_dataset.json", tokenizer_name="hsk1-bpe-tokenizer", approach="bpe"),
]

param_grid = {
    "batch_size": [2, 3, 4],
    "learning_rate": [2e-5, 3e-5, 4e-5, 5e-5],
    "epochs": [3, 4, 5],
    "tokenizer": tokenizer_list
}

searcher = HyperparamSearch(base_config)
best = searcher.search(param_grid)

print("\n✅ Best configuration:")
print(best)





🚀 Running config 1/36


Map: 100%|██████████| 23752/23752 [00:02<00:00, 8527.98 examples/s]
Map: 100%|██████████| 2640/2640 [00:10<00:00, 251.53 examples/s] 
  self.trainer = Trainer(
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Step,Training Loss
1,10.9666
20,7.8622
40,5.0974
60,3.9326
80,3.1825
100,2.7622


KeyboardInterrupt: 

In [267]:
from transformers import GPT2LMHeadModel, AutoTokenizer
import torch
import re

class ChineseSentenceGenerator:
    def __init__(self, model_path: str, device: str = None):
        self.tokenizer = AutoTokenizer.from_pretrained(model_path)
        self.model = GPT2LMHeadModel.from_pretrained(model_path)

        self.model.eval()
        self.device = device or ("cuda" if torch.cuda.is_available() else "cpu")
        self.model.to(self.device)

        # Ensure pad token is defined
        if self.tokenizer.pad_token is None:
            self.tokenizer.pad_token = self.tokenizer.eos_token
            self.model.config.pad_token_id = self.tokenizer.pad_token_id

    def generate(self, word: str, max_length: int = 60, num_return_sequences: int = 1) -> list:
        prompt = f"请用词语“{word}”造句："
        inputs = self.tokenizer(prompt, return_tensors="pt").to(self.device)

        with torch.no_grad():
            outputs = self.model.generate(
                input_ids=inputs["input_ids"],
                attention_mask=inputs["attention_mask"],
                max_length=max_length,
                do_sample=True,
                top_k=50,
                top_p=0.9,
                temperature=0.8,
                num_return_sequences=num_return_sequences,
                repetition_penalty=1.3,
                pad_token_id=self.tokenizer.pad_token_id,
                eos_token_id=self.tokenizer.eos_token_id
            )

        print(self.tokenizer.pad_token_id, self.tokenizer.eos_token_id)

        results = []
        for output in outputs:
            decoded = self.tokenizer.decode(output, skip_special_tokens=True)
            print(decoded)
            sentence = decoded.replace(prompt, "").replace(" ", "").strip()
            sentence = re.split(r"[。！？]", sentence)[0] + "。"  # stop at first punctuation
            results.append(sentence)

        return results
    
generator = ChineseSentenceGenerator("../models/hsk1-gpt2-jieba")

results = generator.generate("学生", max_length=60, num_return_sequences=3)
for i, sent in enumerate(results):
    print(f"📝 Sentence {i+1}: {sent}")

0 None
请 用 词语 “ 学生 ” 造句 : 很多 学生 去 了 。 这 本书 。 。 。 我 一个 都 不 认识 。 学生 。 ! 。 。 电影 的 工 。 。 时 。 。 。 我 。 。 。 。 。 点觉 。 电影 。 。 我 。 ” 。 。 。 。 。 ”
请 用 词语 “ 学生 ” 造句 : 有个 是 学生 。 的 人 了 吗 ? ? 这有 你 ! 我 在 一个 朋友家 。 他 。 她 认识 。 。 。 。 吗 ? 。 。 。 。 。 。 。 。 本 小说 。 。 。 ” 。 。 本 。 。 。 ” 。
请 用 词语 “ 学生 ” 造句 : 很多 学生 买 了 这 本书 。 。 。 很会 ! 。 。 。 雨 。 。 。 电影 不怎么样 。 电影 好看 吗 ? 。 。 。 好看 。 好看 。 谢谢 。 。 。 好看 。 。 买些 。 。 好看 。 饭店 。 。 买些 ,
📝 Sentence 1: 请用词语“学生”造句:很多学生去了。
📝 Sentence 2: 请用词语“学生”造句:有个是学生。
📝 Sentence 3: 请用词语“学生”造句:很多学生买了这本书。
