In [1]:
%pip install jieba

You should consider upgrading via the '/Users/danilkladnitsky/.pyenv/versions/3.10.4/bin/python -m pip install --upgrade pip' command.
Note: you may need to restart the kernel to use updated packages.


In [1]:
# Relative path of the HSK sentence folder (the labeling call further down
# reads hsk1.txt from this same directory, but spells the path out literally).
HSK_SENTENCES_PATH = "../../datasets/hsk_sentences/"
# HSK level names. NOTE(review): not referenced by any of the visible cells.
HSK_LEVELS = ["hsk1", "hsk2", "hsk3"]

In [5]:
import jieba
import re
import os

# Upper bound on how many tokens extract_main_words keeps per sentence.
LIMIT_MAIN_WORDS = 4
# Tokens shorter than this many characters are dropped by extract_main_words.
MIN_MAIN_WORD_LENGTH = 1

def tokenize_sentence(sentence):
    """Segment ``sentence`` with jieba and return the tokens as a list."""
    tokens = jieba.lcut(sentence)
    return tokens

def extract_main_words(sentence):
    """Return the first ``LIMIT_MAIN_WORDS`` word tokens of ``sentence``.

    The sentence is stripped and has its line breaks removed, then it is
    segmented with ``tokenize_sentence``. A token is kept only if it

    * contains something other than whitespace,
    * is at least ``MIN_MAIN_WORD_LENGTH`` characters long, and
    * does not start with a punctuation mark (full-width or ASCII).

    Args:
        sentence (str): Raw sentence, possibly with a trailing newline.

    Returns:
        list[str]: At most ``LIMIT_MAIN_WORDS`` tokens, in sentence order.
    """
    # Line breaks would otherwise come back from the tokenizer as tokens.
    sentence = sentence.strip().replace("\n", "")

    # Full-width marks plus their ASCII counterparts; only the first character
    # of a token is inspected because re.match anchors at the start.
    punctuation_start = "[。！？…，：；《》、（）“”‘’,.!?;:()\"']"

    filtered = [
        word for word in tokenize_sentence(sentence)
        # The tokenizer returns the spaces between words as tokens of their
        # own; they must not be treated as words.
        if word.strip()
        and len(word) >= MIN_MAIN_WORD_LENGTH
        and not re.match(punctuation_start, word)
    ]

    return filtered[:LIMIT_MAIN_WORDS]

def create_prompt(word, sentence):
    """Build the training prompt that pairs ``word`` with ``sentence``."""
    template = "为词语“{}”造句：{}"
    return template.format(word, sentence)

def label_hsk_dataset(dataset_path, target_folder):
    """Turn each sentence of a dataset file into word-labeled prompts.

    Every non-blank line of ``dataset_path`` is treated as one sentence. For
    each word returned by ``extract_main_words`` a prompt is built with
    ``create_prompt``. All prompts are written, one per line, to
    ``labeled_dataset.txt`` inside ``target_folder`` (created if missing).

    Args:
        dataset_path (str): UTF-8 text file with one sentence per line.
        target_folder (str): Directory that receives ``labeled_dataset.txt``.

    Returns:
        list[str]: The prompts, identical to the lines written to the file
        (without the trailing newline).
    """
    result = []

    with open(dataset_path, 'r', encoding='utf-8') as f:
        for raw_line in f:
            # Strip once up front so the prompt never embeds the line break or
            # surrounding whitespace; this keeps the returned list in sync
            # with what is written to disk.
            sentence = raw_line.strip()
            if not sentence:
                continue

            # extract_main_words already caps the number of words per sentence.
            for word in extract_main_words(sentence):
                result.append(create_prompt(word, sentence))

    # Ensure target folder exists
    os.makedirs(target_folder, exist_ok=True)

    output_path = os.path.join(target_folder, "labeled_dataset.txt")
    with open(output_path, 'w', encoding='utf-8') as f:
        f.writelines(prompt + "\n" for prompt in result)

    return result

# Label the hsk1 sentence file; the prompts are returned and also written to
# labeled_dataset.txt inside the given target folder.
labeled_dataset = label_hsk_dataset("../../datasets/hsk_sentences/hsk1.txt", "../../datasets/for_train/hsk1/")


        

In [None]:
import os
import json

# Vocabulary files, one per HSK level; they are collected into the hsk_paths
# dictionary further down and handed to collect_hsk_sentences_with_labels.
HSK_1_VOCABULARY_PATH = "../../datasets/vocabulary/hsk1.txt"
HSK_2_VOCABULARY_PATH = "../../datasets/vocabulary/hsk2.txt"
HSK_3_VOCABULARY_PATH = "../../datasets/vocabulary/hsk3.txt"

# Sentence datasets passed as dataset_paths to collect_hsk_sentences_with_labels.
DATASET_PATH_LIST = [
    "../../datasets/raw/hsk_sentences.txt",
    "../../datasets/raw/cmn_sentences.txt"
]

def _build_sentence_records(sentence, vocabulary, level, process_each_hsk_word):
    """Build the JSON records for one sentence at one HSK level."""
    # Find all HSK words in the sentence (plain substring match).
    hsk_words_in_sentence = [word for word in vocabulary if word in sentence]

    if process_each_hsk_word:
        # One record per matching word; none if nothing matches.
        labeled_sentences = [
            f"为词语“{word}”造句：{sentence}" for word in hsk_words_in_sentence
        ]
    elif hsk_words_in_sentence:
        labeled_sentences = [f"为词语“{hsk_words_in_sentence[0]}”造句：{sentence}"]
    else:
        # Keep the sentence with an empty label; extract_labeled_sentences
        # filters empty labels out when it reads the JSON back.
        labeled_sentences = [""]

    return [
        {
            "hsk": int(level[-1]),  # Extract number from 'hsk1', 'hsk2', etc.
            "original_sentence": sentence,
            "labeled_sentence": labeled_sentence,
        }
        for labeled_sentence in labeled_sentences
    ]


def collect_hsk_sentences_with_labels(dataset_paths, hsk_paths, target_folder,
                                      process_each_hsk_word=False):
    """
    Collect sentences for each HSK level and create JSON files with labeled sentences.

    Sentences from all datasets are accumulated per level and written once to
    ``<target_folder>/<level>.json``, so a later dataset no longer overwrites
    the output of an earlier one.

    Relies on ``read_hsk_vocabulary`` and ``filter_sentences_by_hsk_vocabulary``,
    which are not defined in this cell and must already exist in the kernel.

    Args:
        dataset_paths (list): List of paths to dataset files
        hsk_paths (dict): Dictionary mapping HSK levels to their vocabulary file paths
        target_folder (str): Path to save the JSON files
        process_each_hsk_word (bool): If True, emit one record per vocabulary
            word found in a sentence; otherwise emit one record per sentence,
            labeled with the first word found (or an empty label if none).
    """
    # Create target folder if it doesn't exist
    os.makedirs(target_folder, exist_ok=True)

    # Load HSK vocabularies
    hsk_vocabularies = {
        level: read_hsk_vocabulary(path) for level, path in hsk_paths.items()
    }

    # Records are gathered across every dataset before anything is written.
    records_by_level = {level: [] for level in hsk_vocabularies}

    # Process each dataset
    for dataset_path in dataset_paths:
        print(f"\nProcessing dataset: {dataset_path}")

        # Filter sentences for each HSK level
        for level, vocabulary in hsk_vocabularies.items():
            print(f"\nFiltering for {level}...")
            filtered_sentences = filter_sentences_by_hsk_vocabulary(vocabulary, dataset_path)

            for sentence in filtered_sentences:
                records_by_level[level].extend(
                    _build_sentence_records(
                        sentence, vocabulary, level, process_each_hsk_word
                    )
                )

    # Save one JSON file per level
    for level, json_data in records_by_level.items():
        output_path = os.path.join(target_folder, f"{level}.json")
        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(json_data, f, ensure_ascii=False, indent=2)

        print(f"Saved {len(json_data)} sentences to {output_path}")

# Define HSK paths dictionary.
# The last character of each key is parsed as the numeric HSK level.
hsk_paths = {
    'hsk1': HSK_1_VOCABULARY_PATH,
    'hsk2': HSK_2_VOCABULARY_PATH,
    'hsk3': HSK_3_VOCABULARY_PATH
}

# Folder that receives one <level>.json file per HSK level.
# NOTE(review): the extraction example further down reads
# ../../train_datasets/json/hsk1.json, while the files are written directly
# into this folder - confirm the two paths are meant to line up.
TARGET_FOLDER = "../../train_datasets/"

# Run the collection process
collect_hsk_sentences_with_labels(DATASET_PATH_LIST, hsk_paths, TARGET_FOLDER)

def extract_labeled_sentences(json_path, output_path):
    """
    Copy every non-empty ``labeled_sentence`` from a JSON file into a text file.

    The JSON file must hold a list of objects with a ``labeled_sentence`` key.
    Entries whose label is empty are skipped; the rest are written one per
    line. Any failure is reported with a printed message instead of raised.

    Args:
        json_path (str): Path to the input JSON file
        output_path (str): Path to save the output text file
    """
    try:
        with open(json_path, 'r', encoding='utf-8') as source:
            records = json.load(source)

        # Keep only the entries that actually carry a label.
        kept = []
        for record in records:
            text = record['labeled_sentence']
            if text:
                kept.append(text)

        with open(output_path, 'w', encoding='utf-8') as sink:
            sink.writelines(text + '\n' for text in kept)

        print(f"Successfully extracted {len(kept)} labeled sentences to {output_path}")

    except Exception as e:
        print(f"Error processing file: {e}")

# Example usage: turn the hsk1 JSON file into a plain-text file of labels.
TARGET_DATASET_PATH = "../../train_datasets/json/hsk1.json"
OUTPUT_PATH = "../../train_datasets/txt/hsk1_labeled.txt"

# extract_labeled_sentences does not create the output directory itself.
os.makedirs(os.path.dirname(OUTPUT_PATH), exist_ok=True)

# Extract labeled sentences
extract_labeled_sentences(TARGET_DATASET_PATH, OUTPUT_PATH)