In [1]:
%pip install transformers datasets sentencepiece accelerate jieba

You should consider upgrading via the '/Users/danilkladnitsky/.pyenv/versions/3.10.4/bin/python -m pip install --upgrade pip' command.[0m[33m
[0mNote: you may need to restart the kernel to use updated packages.


In [2]:
%pip install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cpu

Looking in indexes: https://pypi.org/simple, https://download.pytorch.org/whl/cpu
You should consider upgrading via the '/Users/danilkladnitsky/.pyenv/versions/3.10.4/bin/python -m pip install --upgrade pip' command.[0m[33m
[0mNote: you may need to restart the kernel to use updated packages.


In [5]:
import torch
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
device

device(type='mps')

In [6]:
RAW_DATA_PATH = "../../datasets/corpus/hsk_sentences.txt"
HSK_1_VOCAB_PATH = "../../datasets/vocabulary/hsk1.txt"
HSK_2_VOCAB_PATH = "../../datasets/vocabulary/hsk2.txt"
HSK_3_VOCAB_PATH = "../../datasets/vocabulary/hsk3.txt"

TARGET_DATASET_PATH = "../../datasets/corpus/finetune_hsk_sentences.txt"
COMPILED_MODEL_PATH = "../../models/"

In [7]:
import re

def keep_only_chinese(text):
    return re.sub(r"[^\u4e00-\u9fff\u3000-\u303f\uff00-\uffef]", "", text)

def clean_file(input_path, output_path):
    with open(input_path, 'r', encoding='utf-8') as infile, \
         open(output_path, 'w', encoding='utf-8') as outfile:
        for line in infile:
            cleaned = keep_only_chinese(line)
            if cleaned.strip():  # skip empty lines
                outfile.write(cleaned.strip() + '\n')

# Example usage
clean_file("../../datasets/corpus/raw_hsk_sentences.txt", RAW_DATA_PATH)

In [8]:
import re

def contains_chinese(text):
    return any('\u4e00' <= ch <= '\u9fff' for ch in text)

def build_prompted_dataset(sentences, hsk_vocab):
    prompted = []
    for sentence in sentences:
        sentence = sentence.strip()
        if not contains_chinese(sentence) or len(sentence) < 5:
            continue
        for word in hsk_vocab:
            if word in sentence:
                prompt = f"输入词语：{word}，生成句子：{sentence}"
                prompted.append(prompt)
                break  # only use first matching word
    return prompted

In [9]:
# Load HSK vocab
with open(HSK_1_VOCAB_PATH, "r", encoding="utf-8") as f:
    hsk_vocab = [line.strip() for line in f if line.strip()]

# Load raw sentence data
with open(RAW_DATA_PATH, "r", encoding="utf-8") as f:
    raw_sentences = f.readlines()

# Build new dataset
dataset = build_prompted_dataset(raw_sentences, hsk_vocab)

# Save to file
with open(TARGET_DATASET_PATH, "w", encoding="utf-8") as f:
    for line in dataset:
        f.write(line + "\n")

In [10]:
from transformers import BertTokenizer, GPT2LMHeadModel, TextDataset, DataCollatorForLanguageModeling, Trainer, TrainingArguments

# Load tokenizer and model
model_name = "uer/gpt2-chinese-cluecorpussmall"
tokenizer = BertTokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)

# Make sure tokenizer has pad token
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

  from .autonotebook import tqdm as notebook_tqdm


In [11]:
import torch
if torch.backends.mps.is_available():
    mps_device = torch.device("mps")
    x = torch.ones(1, device=mps_device)
    print (x)
else:
    print ("MPS device not found.")

tensor([1.], device='mps:0')


In [12]:
import torch

MODEL_NAME = "gpt2-chinese-cluecorpussmall-finetuned2"
OUTPUT_DIR = COMPILED_MODEL_PATH + MODEL_NAME

dataset = TextDataset(
    tokenizer=tokenizer,
    file_path=TARGET_DATASET_PATH,
    block_size=64,
)

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
)


training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    overwrite_output_dir=True,
    num_train_epochs=10,                    # 🔼 Increase training
    per_device_train_batch_size=2,          # 🔽 Lower batch if memory issues
    gradient_accumulation_steps=4,          # Effective batch size = 8
    learning_rate=5e-5,                     # 🔼 Higher LR for faster convergence (OK on small data)
    save_steps=500,                         # 🔽 Save less often
    save_total_limit=1,                     # 🔽 Keep only last checkpoint
    logging_steps=20,
    logging_first_step=True,
    prediction_loss_only=True,
    disable_tqdm=False,
    report_to="none",
    fp16=torch.cuda.is_available(),         # 🔄 Use FP16 if you have GPU (speeds up a lot)
    dataloader_num_workers=2,               # ✅ Parallel data loading
    push_to_hub=False,
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset,
)

# Fine-tune
trainer.train()

# Save final model
model.save_pretrained(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)

`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Step,Training Loss
1,3.5819
20,2.6286
40,2.2475
60,2.195
80,2.126
100,2.1213
120,2.1028
140,2.1499
160,2.1252
180,2.167


('../../models/gpt2-chinese-cluecorpussmall-finetuned2/tokenizer_config.json',
 '../../models/gpt2-chinese-cluecorpussmall-finetuned2/special_tokens_map.json',
 '../../models/gpt2-chinese-cluecorpussmall-finetuned2/vocab.txt',
 '../../models/gpt2-chinese-cluecorpussmall-finetuned2/added_tokens.json')

In [34]:
from transformers import GPT2LMHeadModel, BertTokenizer
import torch

# Load your fine-tuned model and tokenizer
model_path = COMPILED_MODEL_PATH + MODEL_NAME
tokenizer = BertTokenizer.from_pretrained(model_path)
model = GPT2LMHeadModel.from_pretrained(model_path)
model.eval()

# Ensure pad token is set
tokenizer.pad_token = tokenizer.pad_token or tokenizer.eos_token
model.config.pad_token_id = tokenizer.pad_token_id

def generate_sentence(word, max_length=40):
    prompt = f"输入词语：{word}，生成句子："
    inputs = tokenizer(prompt, return_tensors="pt")

    with torch.no_grad():
        output_ids = model.generate(
            input_ids=inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            max_length=max_length,
            do_sample=True,
            top_k=40,
            top_p=0.9,
            temperature=0.5,
            num_return_sequences=1,
            pad_token_id=model.config.pad_token_id,
            eos_token_id=tokenizer.eos_token_id
        )

    tokens = tokenizer.convert_ids_to_tokens(output_ids[0])
    decoded = tokenizer.convert_tokens_to_string(tokens)

    # Remove the prompt and unnecessary spaces
    result = decoded.replace(prompt, "").replace(" ", "").strip()

    # Optionally stop at first sentence-ending punctuation
    import re
    result = re.split(r"[。！？]", result)[0] + "。"
    return result

# 🧪 Test the model with an HSK word
print(generate_sentence("飞机"))

# GENERATE SENTENCES WITH HSK WORDS

# 1. CALCULATE COVERAGE
# 2. CHECK GRAMMAR WITH SAPLING
# 3. CHECK GRAMMAR WITH LLM


[CLS]输入词语：飞机，生成句子：[SEP]飞机在跑道上滑行输入词语：的，生成句子：他的行为触。
