In [63]:
%pip install transformers datasets sentencepiece accelerate jieba scikit-learn

Collecting scikit-learn
  Downloading scikit_learn-1.6.1-cp310-cp310-macosx_12_0_arm64.whl (11.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.1/11.1 MB[0m [31m28.7 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Collecting threadpoolctl>=3.1.0
  Downloading threadpoolctl-3.6.0-py3-none-any.whl (18 kB)
Collecting joblib>=1.2.0
  Downloading joblib-1.5.0-py3-none-any.whl (307 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m307.7/307.7 KB[0m [31m28.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: threadpoolctl, joblib, scikit-learn
Successfully installed joblib-1.5.0 scikit-learn-1.6.1 threadpoolctl-3.6.0
You should consider upgrading via the '/Users/danilkladnitsky/.pyenv/versions/3.10.4/bin/python -m pip install --upgrade pip' command.[0m[33m
[0mNote: you may need to restart the kernel to use updated packages.


In [4]:
import os
import json

def extract_labeled_sentences(json_path, output_path):
    """
    Extract labeled sentences from a JSON file and save them to a text file.
    
    Args:
        json_path (str): Path to the input JSON file
        output_path (str): Path to save the output text file
    """
    try:
        # Read JSON file
        with open(json_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
        
        # Extract labeled sentences
        labeled_sentences = [item['labeled_sentence'] for item in data if item['labeled_sentence']]
        
        # Write to output file
        with open(output_path, 'w', encoding='utf-8') as f:
            for sentence in labeled_sentences:
                f.write(sentence + '\n')
        
        print(f"Successfully extracted {len(labeled_sentences)} labeled sentences to {output_path}")
        
    except Exception as e:
        print(f"Error processing file: {e}")

# Example usage
TARGET_DATASET_PATH = "../../train_datasets/json/hsk3.json"
OUTPUT_PATH = "../../train_datasets/txt/hsk3_labeled.txt"

os.makedirs(os.path.dirname(OUTPUT_PATH), exist_ok=True)

# Extract labeled sentences
extract_labeled_sentences(TARGET_DATASET_PATH, OUTPUT_PATH)

Successfully extracted 91774 labeled sentences to ../../train_datasets/txt/hsk3_labeled.txt


In [81]:
BASE_MODEL_NAME="uer/gpt2-chinese-cluecorpussmall"
MODEL_PATH="../../models/hsk1-gpt2"
TARGET_DATASET_PATH = "../../train_datasets/txt/hsk1_labeled.txt"

In [110]:
import os
import torch
import warnings
from datasets import Dataset
from transformers import (
    AutoTokenizer, GPT2LMHeadModel,
    DataCollatorForLanguageModeling, Trainer, TrainingArguments
)
from sklearn.model_selection import train_test_split

# Suppress tokenizer warnings
os.environ["TOKENIZERS_PARALLELISM"] = "false"
warnings.filterwarnings("ignore", category=UserWarning, module="transformers.tokenization_utils_base")

# === Load model and tokenizer ===
tokenizer = AutoTokenizer.from_pretrained("uer/gpt2-chinese-cluecorpussmall", trust_remote_code=True)
model = GPT2LMHeadModel.from_pretrained(BASE_MODEL_NAME)

# Make sure pad token is defined
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    model.config.pad_token_id = tokenizer.pad_token_id

# === Load and prepare data ===
with open(TARGET_DATASET_PATH, encoding="utf-8") as f:
    lines = [line.strip() for line in f if line.strip()]

train_lines, eval_lines = train_test_split(lines, test_size=0.1, random_state=42)

train_dataset = Dataset.from_list([{"text": l} for l in train_lines])
eval_dataset = Dataset.from_list([{"text": l} for l in eval_lines])

# Tokenization function
def tokenize(example):
    return tokenizer(example["text"], truncation=True, max_length=64)

train_dataset = train_dataset.map(tokenize, batched=False)
eval_dataset = eval_dataset.map(tokenize, batched=False)

# === Data collator ===
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
)

# === Training arguments ===
training_args = TrainingArguments(
    output_dir=MODEL_PATH,
    overwrite_output_dir=True,
    num_train_epochs=3,                      # ⚠️ Reduced to avoid overfitting
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    learning_rate=2e-5,                      # ⚠️ Slightly lower for stability
    save_steps=500,
    save_total_limit=1,
    eval_steps=100,                         # ✅ Evaluate every 100 steps
    logging_steps=20,
    logging_first_step=True,
    prediction_loss_only=True,
    disable_tqdm=False,
    report_to="none",
    fp16=torch.cuda.is_available(),
    dataloader_num_workers=2,
    push_to_hub=False,
)

# === Trainer ===
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
)

# === Train ===
trainer.train()

# === Save final model ===
model.save_pretrained(MODEL_PATH)
tokenizer.save_pretrained(MODEL_PATH)

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'BertTokenizer'. 
The class this function is called from is 'GPT2Tokenizer'.


TypeError: expected str, bytes or os.PathLike object, not NoneType

In [122]:
from transformers import GPT2LMHeadModel, AutoTokenizer
import torch

# Load your fine-tuned model and tokenizer
model = GPT2LMHeadModel.from_pretrained(MODEL_PATH)

model.eval()

# Ensure pad token is set
tokenizer.pad_token = tokenizer.pad_token or tokenizer.eos_token
model.config.pad_token_id = tokenizer.pad_token_id

def generate_sentence(word, max_length=60):
    prompt = f"输入词语：{word}。生成句子："
    inputs = tokenizer(prompt, return_tensors="pt")

    with torch.no_grad():
        output_ids = model.generate(
            input_ids=inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            max_length=max_length,
            do_sample=True,
            top_k=50,
            top_p=0.95,
            temperature=0.5,
            num_return_sequences=2,
            pad_token_id=model.config.pad_token_id,
            eos_token_id=tokenizer.eos_token_id
        )

    tokens = tokenizer.convert_ids_to_tokens(output_ids[0])
    decoded = tokenizer.convert_tokens_to_string(tokens)

    # Remove the prompt and unnecessary spaces
    result = decoded.replace(prompt, "").replace(" ", "").strip()

    return result

# 🧪 Test the model with an HSK word
print(generate_sentence("商店"))

# GENERATE SENTENCES WITH HSK WORDS

# 1. CALCULATE COVERAGE
# 2. CHECK GRAMMAR WITH SAPLING
# 3. CHECK GRAMMAR WITH LLM


[CLS]输入词语：商店。生成句子：[SEP]:我在这里买东西。[SEP]我在这里买东西。[SEP][SEP][SEP][SEP][SEP][SEP][SEP][SEP][SEP][SEP][SEP][SEP]。[SEP]。[SEP]。[SEP]。[SEP]。[SEP]。[SEP]。[SEP]。
