In [1]:
%pip install transformers datasets accelerate

You should consider upgrading via the '/Users/danilkladnitsky/.pyenv/versions/3.10.4/bin/python -m pip install --upgrade pip' command.[0m[33m
[0mNote: you may need to restart the kernel to use updated packages.


In [3]:
from transformers import BertTokenizer, GPT2LMHeadModel, TextDataset, DataCollatorForLanguageModeling, Trainer, TrainingArguments

# Load tokenizer and model
model_name = "uer/gpt2-chinese-cluecorpussmall"
tokenizer = BertTokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)

# Make sure tokenizer has pad token
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

In [26]:
import re

def keep_only_chinese(text):
    # Keep Chinese characters and Chinese punctuation
    return re.sub(r"[^\u4e00-\u9fff\u3000-\u303f\uff00-\uffef]", "", text)

def clean_file(input_path, output_path):
    with open(input_path, 'r', encoding='utf-8') as infile, \
         open(output_path, 'w', encoding='utf-8') as outfile:
        for line in infile:
            cleaned = keep_only_chinese(line)
            if cleaned.strip():  # skip empty lines
                outfile.write(cleaned.strip() + '\n')

# Example usage
clean_file("raw_datasets/raw_hsk_sentences.txt", "datasets/clean_hsk_sentences.txt")

In [37]:
import re

def contains_chinese(text):
    return any('\u4e00' <= ch <= '\u9fff' for ch in text)

def build_prompted_dataset(sentences, hsk_vocab):
    prompted = []
    for sentence in sentences:
        sentence = sentence.strip()
        if not contains_chinese(sentence) or len(sentence) < 5:
            continue
        for word in hsk_vocab:
            if word in sentence:
                prompt = f"输入词语：{word}，生成句子：{sentence}"
                prompted.append(prompt)
                break  # only use first matching word
    return prompted

In [38]:

with open("datasets/hsk2_vocab.txt", "r", encoding="utf-8") as f:
    hsk_vocab = [line.strip() for line in f if line.strip()]

# Load raw sentence data
with open("datasets/chinese_sentences.txt", "r", encoding="utf-8") as f:
    raw_sentences = f.readlines()

# Build new dataset
dataset = build_prompted_dataset(raw_sentences, hsk_vocab)

# Save to file
with open("datasets/finetune_data.txt", "w", encoding="utf-8") as f:
    for line in dataset:
        f.write(line + "\n")

In [None]:
# Load your dataset
dataset = TextDataset(
    tokenizer=tokenizer,
    file_path="datasets/finetune_data.txt",  # your training file
    block_size=64,
)

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
)

training_args = TrainingArguments(
    output_dir="./gpt2-chinese-finetuned",     # Where to save the model
    overwrite_output_dir=True,                 # Overwrite old model files
    num_train_epochs=10,                       # 🔼 Longer training helps on small data
    per_device_train_batch_size=2,             # 🔽 Reduce batch size if on CPU or Mac
    gradient_accumulation_steps=4,             # 🔼 Simulate larger batch size (2x4 = 8)
    learning_rate=5e-5,                        # 🔧 Lower learning rate for small dataset
    save_steps=200,                            # 🔼 Save more frequently for tracking
    save_total_limit=3,                        # Keep last 3 checkpoints
    logging_steps=20,                          # Log more often to monitor loss
    logging_first_step=True,                   # Show loss on first step
    prediction_loss_only=True,
    disable_tqdm=False,                        # Show progress bar
    report_to="none",                          # Don't report to W&B or Hugging Face Hub
    fp16=False,                                # ✅ No FP16 on Mac/CPU
    push_to_hub=False,                         # Don't sync with Hugging Face hub
    use_cpu=True
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset,
)

# Fine-tune
trainer.train()

# Save final model
model.save_pretrained("./gpt2-chinese-finetuned")
tokenizer.save_pretrained("./gpt2-chinese-finetuned")



Step,Training Loss
1,4.6323
20,1.2457
40,0.76
60,0.7117
80,0.6826
100,0.6887
120,0.6199
140,0.6084
160,0.6173
180,0.6334


('./gpt2-chinese-finetuned/tokenizer_config.json',
 './gpt2-chinese-finetuned/special_tokens_map.json',
 './gpt2-chinese-finetuned/vocab.txt',
 './gpt2-chinese-finetuned/added_tokens.json')

In [1]:
from transformers import GPT2LMHeadModel, BertTokenizer
import torch

# Load your fine-tuned model and tokenizer
model_path = "./gpt2-chinese-finetuned"
tokenizer = BertTokenizer.from_pretrained(model_path)
model = GPT2LMHeadModel.from_pretrained(model_path)
model.eval()

# Ensure pad token is set
tokenizer.pad_token = tokenizer.pad_token or tokenizer.eos_token
model.config.pad_token_id = tokenizer.pad_token_id

def generate_sentence(word, max_length=40):
    prompt = f"输入词语：{word}，生成句子："
    inputs = tokenizer(prompt, return_tensors="pt")

    with torch.no_grad():
        output_ids = model.generate(
            input_ids=inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            max_length=max_length,
            do_sample=True,
            top_k=40,
            top_p=0.9,
            temperature=0.7,
            num_return_sequences=1,
            pad_token_id=model.config.pad_token_id,
            eos_token_id=tokenizer.eos_token_id
        )

    tokens = tokenizer.convert_ids_to_tokens(output_ids[0])
    decoded = tokenizer.convert_tokens_to_string(tokens)

    # Remove the prompt and unnecessary spaces
    result = decoded.replace(prompt, "").replace(" ", "").strip()

    # Optionally stop at first sentence-ending punctuation
    import re
    result = re.split(r"[。！？]", result)[0] + "。"
    return result

# 🧪 Test the model with an HSK word
print(generate_sentence("今天"))
print(generate_sentence("朋友"))
print(generate_sentence("学校"))

  from .autonotebook import tqdm as notebook_tqdm


[CLS]输入词语：今天，生成句子：[SEP]今天中国下雨了输入词语：北京，生成句子：北京是你的。
[CLS]输入词语：朋友，生成句子：[SEP]我们都认识老师输入词语：的，生成句子：学校是他的学。
[CLS]输入词语：学校，生成句子：[SEP]学校有音乐输入词语：电视，生成句子：她在医院喝电视。
