In [10]:
import os
import json
import csv
import random

def generate_error_judgement(json_data, error_probability=0.1, typo_map=None):
    """Build (correct, corrupted, labels) CSV rows for each sentence of a judgement.

    The text stored under ``json_data['judgement']`` is split on the Chinese
    full stop '。'. For every non-blank sentence, each character in the range
    U+4E00..U+9FFF is replaced with a typo with probability
    ``error_probability``; all other characters are copied unchanged.

    Args:
        json_data: Mapping that must contain a ``'judgement'`` key holding the
            text to process.
        error_probability: Probability that a single Chinese character is
            replaced by a typo.
        typo_map: Optional mapping from a character to a sequence of
            replacement candidates. Characters without a usable candidate
            (missing key, empty sequence, or only the character itself) get a
            random character from U+4E00..U+9FFF instead.

    Returns:
        A list of ``[correct_sentence, error_sentence, labels]`` rows.
        Both sentences have '。' appended (also for trailing text that was
        not terminated by '。' in the input). ``labels`` is a string such as
        ``'[0,1,0]'`` with one entry per character of the sentence; the
        appended '。' is not labelled.

    Raises:
        KeyError: If ``json_data`` has no ``'judgement'`` key.
    """
    judgement_text = json_data['judgement']

    def is_chinese_char(char):
        # CJK Unified Ideographs basic block.
        return '\u4e00' <= char <= '\u9fff'

    def random_typo(original):
        # Redraw until the result differs from the original; otherwise the
        # position would be labelled as an error without containing one.
        while True:
            candidate = chr(random.randint(0x4e00, 0x9fff))
            if candidate != original:
                return candidate

    def introduce_typo(char):
        if typo_map and char in typo_map:
            # Drop the original character from the candidates so that a
            # labelled error is always a real substitution.
            candidates = [c for c in typo_map[char] if c != char]
            if candidates:
                return random.choice(candidates)
        return random_typo(char)

    csv_rows = []

    for sentence in judgement_text.split('。'):
        if not sentence.strip():
            continue  # Nothing but whitespace between two full stops.

        correct_sentence = []
        error_sentence = []
        error_labeling = []

        for char in sentence:
            if char in ('\n', '\r'):
                continue  # Line breaks are dropped and get no label.

            if char == '　':
                # Full-width space is normalised to an ASCII space.
                correct_sentence.append(' ')
                error_sentence.append(' ')
                error_labeling.append('0')
                continue

            correct_sentence.append(char)
            if is_chinese_char(char) and random.random() < error_probability:
                error_sentence.append(introduce_typo(char))
                error_labeling.append('1')
            else:
                # Untouched Chinese characters and all non-Chinese characters
                # (punctuation, digits, Latin letters, ...) are kept as-is.
                error_sentence.append(char)
                error_labeling.append('0')

        # Re-attach the full stop that split() removed.
        correct_sentence_str = ''.join(correct_sentence) + '。'
        error_sentence_str = ''.join(error_sentence) + '。'
        error_labeling_str = ','.join(error_labeling)

        csv_rows.append([correct_sentence_str, error_sentence_str, f'[{error_labeling_str}]'])

    return csv_rows

def process_json_files(input_folder, output_folder, error_probability=0.05, typo_map=None):
    """Convert every ``*.json`` file in a folder into a typo-annotated CSV.

    Each JSON file is loaded, passed to ``generate_error_judgement`` and the
    resulting rows are written to ``<stem>_with_typos.csv`` in
    ``output_folder`` together with a header row. A progress line is printed
    for every file processed.

    Args:
        input_folder: Directory that is scanned (non-recursively) for files
            whose name ends in ``.json``.
        output_folder: Directory that receives the CSV files; created if it
            does not exist.
        error_probability: Forwarded to ``generate_error_judgement``.
        typo_map: Forwarded to ``generate_error_judgement``.
    """
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(output_folder, exist_ok=True)

    json_suffix = '.json'

    # Sorted so that a seeded RNG yields the same output on every run,
    # independent of the directory listing order.
    for filename in sorted(os.listdir(input_folder)):
        if not filename.endswith(json_suffix):
            continue

        json_file_path = os.path.join(input_folder, filename)
        if not os.path.isfile(json_file_path):
            continue  # e.g. a directory whose name happens to end in .json

        with open(json_file_path, 'r', encoding='utf-8') as f:
            json_data = json.load(f)

        csv_rows = generate_error_judgement(json_data, error_probability, typo_map)

        # Strip only the trailing extension; str.replace would also rewrite
        # any earlier '.json' occurring inside the file name.
        output_name = filename[:-len(json_suffix)] + '_with_typos.csv'
        output_csv_path = os.path.join(output_folder, output_name)

        with open(output_csv_path, 'w', newline='', encoding='utf-8') as csvfile:
            csv_writer = csv.writer(csvfile)
            csv_writer.writerow(['Correct Sentence', 'Error Sentence', 'Position Error Labeling'])
            csv_writer.writerows(csv_rows)

        print(f'Processed {filename} and saved to {output_csv_path}')

# Example usage
input_folder = 'Judgement_json'  # Replace with your actual input folder path
output_folder = 'Error_Judgement_csv'  # Replace with your actual output folder path

# Sample typo map: each key is a character and each value lists the
# replacement candidates used for it.
# Each key appears exactly once. Earlier revisions repeated several keys
# ('訴', '證', '執', '控', '裁'); in a dict literal the last value silently
# wins, so only that effective value is kept here.
typo_map_example = {
    '判': ['盼', '拚', '畔'],
    '訟': ['送', '頌', '鬆'],
    '責': ['債', '賊', '澤'],
    '刑': ['型', '行', '醒'],
    '訴': ['數', '塑', '俗'],
    '罰': ['伐', '閥', '乏'],
    '證': ['徵', '政', '正'],
    '權': ['勸', '全', '拳'],
    '職': ['植', '執', '值'],
    '賠': ['陪', '培', '牌'],
    '約': ['躍', '藥', '鑰'],
    '執': ['值', '直', '植'],
    '罪': ['最', '嘴', '醉'],
    '辯': ['辨', '變', '辦'],
    '檢': ['簡', '減', '撿'],
    '審': ['神', '甚', '深'],
    '復': ['複', '覆', '服'],
    '協': ['洽', '鞋', '挾'],
    '令': ['領', '靈', '零'],
    '拘': ['居', '矩', '俱'],
    '控': ['空', '恐', '孔'],
    '違': ['圍', '唯', '危'],
    '申': ['伸', '身', '紳'],
    '賦': ['赴', '副', '傅'],
    '裁': ['財', '才', '材'],
    '契': ['鍥', '汽', '凱'],
    '遺': ['移', '疑', '倚'],
    '釋': ['式', '識', '適'],
    '羈': ['寄', '雞', '跡'],
    '召': ['兆', '昭', '肇'],
    '案': ['按', '暗', '岸'],
    '示': ['市', '式', '事'],
    '註': ['注', '駐', '祝'],
    '終': ['鐘', '忠', '中'],
    '律': ['慮', '履', '旅'],
    '調': ['條', '跳', '挑'],
}

# Process all JSON files in the input folder and save CSV files to the output folder
process_json_files(input_folder, output_folder, error_probability=0.05, typo_map=typo_map_example)


Processed 民事_102,司促,31519_2014-01-09.json and saved to Error_Judgement_csv\民事_102,司促,31519_2014-01-09_with_typos.csv
Processed 民事_102,司促,32792_2014-01-06.json and saved to Error_Judgement_csv\民事_102,司促,32792_2014-01-06_with_typos.csv
