In [3]:
%pip install transformers datasets sentencepiece accelerate jieba scikit-learn

You should consider upgrading via the '/Users/danilkladnitsky/.pyenv/versions/3.10.4/bin/python -m pip install --upgrade pip' command.[0m[33m
[0mNote: you may need to restart the kernel to use updated packages.


In [4]:
ROOT = "../../"

BASE_MODEL_NAME="uer/gpt2-chinese-cluecorpussmall"
SAVE_MODEL_PATH= ROOT + "models/"
DATASET_PATH = ROOT + "datasets/for_train/"

In [5]:
import os

os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [6]:
HSK_1_TRAIN_PARAMS = {'loss': 0.8389942049980164, 'batch_size': 4, 'learning_rate': 5e-05, 'epochs': 3}

In [8]:
import torch
import os
from datasets import Dataset
from transformers import (
    AutoTokenizer, GPT2LMHeadModel,
    DataCollatorForLanguageModeling, Trainer, TrainingArguments
)
from sklearn.model_selection import train_test_split

class ChineseGPTTrainer:
    def __init__(
        self,
        model_name: str,
        dataset_path: str,
        output_dir: str,
        test_size: float = 0.1,
        max_length: int = 64,
        batch_size: int = 2,
        gradient_accumulation_steps: int = 4,
        epochs: int = 3,
        learning_rate: float = 2e-5,
    ):
        self.model_name = model_name
        self.dataset_path = dataset_path
        self.output_dir = output_dir
        self.test_size = test_size
        self.max_length = max_length
        self.batch_size = batch_size
        self.gradient_accumulation_steps = gradient_accumulation_steps
        self.epochs = epochs
        self.learning_rate = learning_rate

        # Tokenizer and model
        self.tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
        self.model = GPT2LMHeadModel.from_pretrained(model_name)

        if self.tokenizer.pad_token is None:
            self.tokenizer.pad_token = self.tokenizer.eos_token
            self.model.config.pad_token_id = self.tokenizer.pad_token_id

        self.train_dataset, self.eval_dataset = self.load_and_tokenize_data()

        self.data_collator = DataCollatorForLanguageModeling(
            tokenizer=self.tokenizer,
            mlm=False,
        )

        self.training_args = TrainingArguments(
            output_dir=self.output_dir,
            overwrite_output_dir=True,
            num_train_epochs=self.epochs,
            per_device_train_batch_size=self.batch_size,
            gradient_accumulation_steps=self.gradient_accumulation_steps,
            learning_rate=self.learning_rate,
            save_steps=500,
            save_total_limit=1,
            eval_steps=100,
            logging_steps=20,
            logging_first_step=True,
            prediction_loss_only=True,
            disable_tqdm=False,
            report_to="none",
            fp16=torch.cuda.is_available(),
            dataloader_num_workers=2,
            push_to_hub=False,
        )

        self.trainer = Trainer(
            model=self.model,
            args=self.training_args,
            train_dataset=self.train_dataset,
            eval_dataset=self.eval_dataset,
            data_collator=self.data_collator,
            tokenizer=self.tokenizer,
        )

    def load_and_tokenize_data(self):
        with open(self.dataset_path, encoding="utf-8") as f:
            lines = [line.strip() for line in f if line.strip()]

        train_lines, eval_lines = train_test_split(lines, test_size=self.test_size, random_state=42)

        train_dataset = Dataset.from_list([{"text": l} for l in train_lines])
        eval_dataset = Dataset.from_list([{"text": l} for l in eval_lines])

        def tokenize(example):
            return self.tokenizer(example["text"], truncation=True, max_length=self.max_length)

        return train_dataset.map(tokenize), eval_dataset.map(tokenize)

    def train(self):
        self.trainer.train()

    def save(self):
        self.model.save_pretrained(self.output_dir)
        self.tokenizer.save_pretrained(self.output_dir)

In [4]:
DATASET_PATH = "../../datasets/for_train/hsk1/"
SAVE_MODEL_PATH = "../../models/"

trainer = ChineseGPTTrainer(
    model_name="uer/gpt2-chinese-cluecorpussmall",
    dataset_path=DATASET_PATH + "hsk1/labeled_dataset.txt",
    output_dir=SAVE_MODEL_PATH + "hsk1-gpt2-extended",
    batch_size=HSK_1_TRAIN_PARAMS['batch_size'],
    learning_rate=HSK_1_TRAIN_PARAMS['learning_rate'],
    epochs=HSK_1_TRAIN_PARAMS['epochs']
)

trainer.train()
trainer.save()

NameError: name 'SAVE_MODEL_PATH' is not defined

In [5]:
import copy

class HyperparamSearch:
    def __init__(self, base_config):
        self.base_config = base_config
        self.results = []

    def search(self, param_grid):
        """
        param_grid: dict of parameter lists, e.g.
        {
            "batch_size": [2, 4],
            "learning_rate": [5e-5, 2e-5],
            "epochs": [3, 5]
        }
        """
        import itertools
        keys = list(param_grid.keys())
        combinations = list(itertools.product(*param_grid.values()))

        for i, combo in enumerate(combinations):
            print(f"\n🚀 Running config {i+1}/{len(combinations)}")
            config = copy.deepcopy(self.base_config)
            for k, v in zip(keys, combo):
                setattr(config, k, v)

            trainer = ChineseGPTTrainer(
                model_name=config.model_name,
                dataset_path=config.dataset_path,
                output_dir=os.path.join(config.output_dir, f"run_{i}"),
                test_size=config.test_size,
                max_length=config.max_length,
                batch_size=config.batch_size,
                gradient_accumulation_steps=config.gradient_accumulation_steps,
                epochs=config.epochs,
                learning_rate=config.learning_rate
            )

            trainer.train()
            metrics = trainer.trainer.evaluate()
            loss = metrics.get("eval_loss", float("inf"))

            self.results.append({
                "run": i,
                "config": combo,
                "loss": loss
            })

        self.results.sort(key=lambda x: x["loss"])
        return self.results[0]  # return best config

In [9]:
from types import SimpleNamespace

SAVE_MODEL_PATH = "../../models/hsk1/"

base_config = SimpleNamespace(
    model_name="uer/gpt2-chinese-cluecorpussmall",
    dataset_path=DATASET_PATH + "hsk1_labeled.txt",
    output_dir=SAVE_MODEL_PATH + "grid_search/",
    test_size=0.1,
    max_length=64,
    batch_size=2,
    gradient_accumulation_steps=4,
    epochs=3,
    learning_rate=2e-5
)

param_grid = {
    "batch_size": [2, 4],
    "learning_rate": [2e-5, 5e-5],
    "epochs": [3, 5]
}

searcher = HyperparamSearch(base_config)
best = searcher.search(param_grid)

print("\n✅ Best configuration:")
print(best)


🚀 Running config 1/8


FileNotFoundError: [Errno 2] No such file or directory: '../../datasets/for_train/hsk1/hsk1_labeled.txt'

In [13]:
from transformers import GPT2LMHeadModel, AutoTokenizer
import torch
import re

class ChineseSentenceGenerator:
    def __init__(self, model_path: str, device: str = None):
        self.tokenizer = AutoTokenizer.from_pretrained(model_path)
        self.model = GPT2LMHeadModel.from_pretrained(model_path)

        self.model.eval()
        self.device = device or ("cuda" if torch.cuda.is_available() else "cpu")
        self.model.to(self.device)

        # Ensure pad token is defined
        if self.tokenizer.pad_token is None:
            self.tokenizer.pad_token = self.tokenizer.eos_token
            self.model.config.pad_token_id = self.tokenizer.pad_token_id

    def generate(self, word: str, max_length: int = 60, num_return_sequences: int = 1) -> list:
        prompt = f"为词语“{word}”造句："
        inputs = self.tokenizer(prompt, return_tensors="pt").to(self.device)

        with torch.no_grad():
            outputs = self.model.generate(
                input_ids=inputs["input_ids"],
                attention_mask=inputs["attention_mask"],
                max_length=max_length,
                do_sample=True,
                top_k=50,
                top_p=0.95,
                temperature=0.3,
                num_return_sequences=num_return_sequences,
                pad_token_id=self.model.config.pad_token_id,
                eos_token_id=self.tokenizer.eos_token_id,
                repetition_penalty=1.2
            )

        results = []
        for output in outputs:
            decoded = self.tokenizer.decode(output, skip_special_tokens=True)
            sentence = decoded.replace(prompt, "").replace(" ", "").strip()
            print(sentence)
            sentence = re.split(r"[。！？]", sentence)[0] + "。"  # stop at first punctuation
            results.append(sentence)

        return results

In [14]:
generator = ChineseSentenceGenerator(SAVE_MODEL_PATH + "hsk1-gpt2")

results = generator.generate("中国", num_return_sequences=10)
for i, sent in enumerate(results):
    print(f"📝 Sentence {i+1}: {sent}")

为词语中国造句：:我们是中国人。你呢？:他在说汉语吗？:他在说汉语。.....。。。。。
为词语中国造句：:我们是中国人。:你在说什么？:他们在听。.....:我们是中国人。
为词语中国造句：:我们在做什么？?.:我们在做什么？:我们在做什么。:我们在做什么呢？
为词语中国造句：:我们在做什么？:我们在做什么呢？:我们在做什么。:我们在做什么呢？......
为词语中国造句：:我们是中国人。你呢？.....:我们是中国人。
为词语中国造句：:我们是中国人。你呢？:他在说汉语吗？
为词语中国造句：:我们在做。:我们在做。:你是中国人吗？.....。。。。。。。。
为词语中国造句：:我们在做什么？...:生成句子:你是中国人吗？:没有。:他们在做什么呢？...中国
为词语中国造句：:我们在说话。:生成句子:你是中国人吗？:我是中国人。听起来很高兴。
为词语中国造句：:你们在说什么？.....:我们在说什么呢？.....。。
📝 Sentence 1: 为词语中国造句：:我们是中国人。
📝 Sentence 2: 为词语中国造句：:我们是中国人。
📝 Sentence 3: 为词语中国造句：:我们在做什么。
📝 Sentence 4: 为词语中国造句：:我们在做什么。
📝 Sentence 5: 为词语中国造句：:我们是中国人。
📝 Sentence 6: 为词语中国造句：:我们是中国人。
📝 Sentence 7: 为词语中国造句：:我们在做。
📝 Sentence 8: 为词语中国造句：:我们在做什么。
📝 Sentence 9: 为词语中国造句：:我们在说话。
📝 Sentence 10: 为词语中国造句：:你们在说什么。
