In [13]:
import json
import opencc

# Initialize OpenCC for Simplified to Traditional conversion
converter = opencc.OpenCC('s2t.json')

# Load and merge typo maps
def merge_and_convert_typo_maps(existing_typo_map, new_typo_file, output_file):
    # Read the new typo map from the confusion.txt
    new_typo_map = {}
    with open(new_typo_file, 'r', encoding='utf-8') as f:
        for line in f:
            key, *values = line.strip().split(':')
            if len(values) > 0:
                new_typo_map[key] = values[0].split()

    # Convert keys and values to Traditional Chinese
    converted_typo_map = {
        converter.convert(key): [converter.convert(value) for value in values]
        for key, values in new_typo_map.items()
    }

    # Merge with existing typo map
    merged_typo_map = {**existing_typo_map, **converted_typo_map}

    # Save the merged map to a new file
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(merged_typo_map, f, ensure_ascii=False, indent=2)

    print(f"Merged and converted typo map saved to {output_file}")

# Example usage
existing_typo_map = {
    '判': ['盼', '拚', '畔'],
    '訟': ['送', '頌', '鬆'],
    '責': ['債', '賊', '澤'],
    '刑': ['型', '行', '醒'],
    '訴': ['數', '塑', '俗'],
    '罰': ['伐', '閥', '乏'],
    '證': ['正', '徵', '政'],
    '權': ['勸', '全', '拳'],
    '職': ['植', '執', '值'],
    '賠': ['陪', '培', '牌'],
    '約': ['躍', '藥', '鑰'],
    '執': ['植', '值', '直'],
    '罪': ['最', '嘴', '醉'],
    '辯': ['辨', '變', '辦'],
    '檢': ['簡', '減', '撿'],
    '審': ['神', '甚', '深'],
    '復': ['複', '覆', '服'],
    '協': ['洽', '鞋', '挾'],
    '令': ['領', '靈', '零'],
    '拘': ['居', '矩', '俱'],
    '控': ['空', '孔', '恐'],
    '違': ['圍', '唯', '危'],
    '申': ['伸', '身', '紳'],
    '賦': ['赴', '副', '傅'],
    '裁': ['才', '材', '財'],
    '契': ['鍥', '汽', '凱'],
    '遺': ['移', '疑', '倚'],
    '釋': ['式', '識', '適'],
    '羈': ['寄', '雞', '跡'],
    '召': ['兆', '昭', '肇'],
    '案': ['按', '暗', '岸'],
    '示': ['市', '式', '事'],
    '註': ['注', '駐', '祝'],
    '終': ['鐘', '忠', '中'],
    '律': ['慮', '履', '旅'],
    '調': ['條', '跳', '挑'],
    '議': ['異', '藝', '義'],
    '罷': ['霸', '爸', '怕'],
    '諭': ['喻', '裕', '愉'],
    '涉': ['設', '射', '社'],
    '懲': ['誠', '城', '承'],
    '處': ['除', '儲', '初'],
    '繼': ['技', '記', '既'],
    '典': ['顛', '佃', '甸'],
    '策': ['側', '測', '冊'],
    '德': ['得', '特', '惪'],
    '護': ['滬', '互', '糊'],
    '憲': ['線', '現', '縣'],
    '預': ['喻', '裕', '愉'],
    '勞': ['牢', '撈', '澇'],
    '償': ['常', '場', '嘗'],
    '屬': ['熟', '蜀', '署'],
    '監': ['簡', '間', '減'],
    '授': ['售', '獸', '瘦'],
    '額': ['俄', '鵝', '餓'],
    '規': ['歸', '貴', '鬼'],
    '效': ['校', '孝', '肖'],
    '施': ['是', '室', '試'],
    '應': ['影', '英', '映'],
    '為': ['未', '唯', '位'],
    '計': ['際', '既', '技'],
    '劃': ['畫', '話', '化'],
    '績': ['跡', '脊', '激'],
    '整': ['正', '征', '證'],
    '登': ['燈', '凳', '等'],
    '條': ['跳', '挑', '調'],
    '憑': ['瓶', '平', '屏'],
    '索': ['縮', '鎖', '朔'],
    '協': ['洽', '鞋', '挾'],
    '險': ['現', '顯', '鹹'],
    '損': ['孫', '遜', '筍'],
    '收': ['壽', '首', '售'],
    '維': ['唯', '危', '偉'],
    '駁': ['博', '薄', '撥'],
    '糧': ['粮', '梁', '良'],
    '覆': ['複', '復', '副'],
    '複': ['復', '覆', '褒'],
    '靈': ['零', '領', '齡'],
    '檢': ['簡', '撿', '減'],
    '繩': ['勝', '乘', '聲'],
    '精': ['睛', '清', '靜']
}
new_typo_file = './confusion.txt'
output_file = 'merged_typo_map.txt'

merge_and_convert_typo_maps(existing_typo_map, new_typo_file, output_file)

Merged and converted typo map saved to merged_typo_map.txt


In [3]:
import json

def reformat_typo_map(input_file, output_file):
    """
    Reformat the typo map values from single strings to lists of individual characters.

    Args:
        input_file (str): Path to the input typo map file.
        output_file (str): Path to save the reformatted typo map.
    """
    # Load the existing typo map
    with open(input_file, 'r', encoding='utf-8') as infile:
        typo_map = json.load(infile)

    # Transform the values in the typo map
    reformatted_map = {
        key: [char for string in value for char in string]
        for key, value in typo_map.items()
    }

    # Save the reformatted typo map
    with open(output_file, 'w', encoding='utf-8') as outfile:
        json.dump(reformatted_map, outfile, ensure_ascii=False, indent=4)

# Example usage
input_file = "merged_typo_map.txt"  # Replace with the actual file path
output_file = "merged_typo_map.txt"  # Replace with the desired output file path
reformat_typo_map(input_file, output_file)

print(f"Reformatted typo map saved to {output_file}")


Reformatted typo map saved to merged_typo_map.txt


In [6]:
import os
import json
import csv
import random
import requests
from PyPDF2 import PdfReader
import io


def load_typo_map(file_path):
    """Load the typo map from a file."""
    with open(file_path, 'r', encoding='utf-8') as f:
        return json.load(f)


def generate_error_judgement(json_data, typo_map_probability, random_typo_probability, typo_map=None):
    # Extract the judgement text from JSON or PDF URL
    judgement_text = json_data.get('judgement', '')

    # If judgement is empty, check for a PDF URL
    if not judgement_text and 'attachAsJudgement' in json_data and 'fileUrl' in json_data['attachAsJudgement']:
        try:
            judgement_text = extract_text_from_pdf(json_data['attachAsJudgement']['fileUrl'])
        except ValueError as e:
            print(f"Error fetching PDF: {e}")
            return []

    # Function to determine if a character is Chinese
    def is_chinese_char(char):
        return '\u4e00' <= char <= '\u9fff'

    # Function to randomly generate a wrong Chinese character
    def random_typo():
        return chr(random.randint(0x4e00, 0x9fff))

    # Function to introduce typos into a character
    def introduce_typo(char):
        if typo_map and char in typo_map and random.random() < typo_map_probability:
            return random.choice(typo_map[char])
        elif random.random() < random_typo_probability:
            return random_typo()
        else:
            return char

    # If judgement is a list, join the strings
    if isinstance(judgement_text, list):
        judgement_text = ' '.join(
            item if isinstance(item, str) else ''  # Include only strings
            for item in judgement_text
        )
    elif not isinstance(judgement_text, str):
        raise ValueError(f"Expected a string or list for judgement_text, but got {type(judgement_text)}")

    # Split the judgement text by sentence using '。' (Chinese period)
    sentences = [s for s in judgement_text.split('。') if s.strip()]  # Filter out empty sentences

    # Randomly select half of the sentences to remain error-free
    error_free_indices = set(random.sample(range(len(sentences)), len(sentences) // 2))

    # Prepare rows for the CSV output
    csv_rows = []

    for idx, sentence in enumerate(sentences):
        correct_sentence = []
        error_sentence = []
        error_labeling = []

        if idx in error_free_indices:
            correct_sentence_str = sentence.replace('\n', '').replace('\r', '') + '。'
            error_labeling_str = ','.join(['0'] * len(correct_sentence_str))
            csv_rows.append([correct_sentence_str, correct_sentence_str, f'[{error_labeling_str}]'])
            continue

        # Process the sentence for typos
        for char in sentence:
            if char in ['\n', '\r']:
                continue
            if char in ['\u3000']:  # Replace special spaces with standard space
                correct_sentence.append(' ')
                error_sentence.append(' ')
                error_labeling.append('0')  # Mark as correct
                continue

            correct_sentence.append(char)
            if is_chinese_char(char):  # Only apply typos to Chinese characters
                typo_char = introduce_typo(char)
                error_sentence.append(typo_char)
                error_labeling.append('1' if typo_char != char else '0')
            else:
                # Preserve non-Chinese characters (punctuation, numbers, etc.)
                error_sentence.append(char)
                error_labeling.append('0')  # Mark as correct for other characters

        correct_sentence_str = ''.join(correct_sentence) + '。'  # Add period back to sentence
        error_sentence_str = ''.join(error_sentence) + '。'
        error_labeling_str = ','.join(error_labeling)

        # Append the row to the CSV rows list
        csv_rows.append([correct_sentence_str, error_sentence_str, f'[{error_labeling_str}]'])

    return csv_rows


def extract_text_from_pdf(url):
    """Extract text from a PDF file at the given URL."""
    response = requests.get(url)
    if response.status_code == 200:
        with io.BytesIO(response.content) as pdf_file:
            reader = PdfReader(pdf_file)
            text = ""
            for page in reader.pages:
                text += page.extract_text()
            return text
    else:
        raise ValueError(f"Failed to download PDF from {url}, status code: {response.status_code}")


def process_json_files_in_folders(base_folder, output_folder, typo_map_probability, random_typo_probability, typo_map_file):
    # Load the typo map from file
    typo_map = load_typo_map(typo_map_file)

    # Create the output folder if it doesn't exist
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)

    # Walk through all folders and files in the base folder
    for root, _, files in os.walk(base_folder):
        for filename in files:
            if filename.endswith('.json'):  # Process JSON files only
                input_file_path = os.path.join(root, filename)
                if os.path.getsize(input_file_path) == 0:
                    print(f"Skipping empty file: {input_file_path}")
                    continue

                # Read the JSON data from the file
                with open(input_file_path, 'r', encoding='utf-8') as f:
                    for line in f:
                        json_data = json.loads(line.strip())

                # Generate error sentences and labels
                csv_rows = generate_error_judgement(json_data, typo_map_probability, random_typo_probability, typo_map)

                # Prepare the output CSV file path
                relative_path = os.path.relpath(root, base_folder)  # Keep relative folder structure
                output_dir = os.path.join(output_folder, relative_path)
                os.makedirs(output_dir, exist_ok=True)
                output_csv_path = os.path.join(output_dir, filename.replace('.json', '_with_typos.csv'))

                # Write to a CSV file
                with open(output_csv_path, 'w', newline='', encoding='utf-8') as csvfile:
                    csv_writer = csv.writer(csvfile)
                    csv_writer.writerow(['Correct Sentence', 'Error Sentence', 'Position Error Labeling'])
                    csv_writer.writerows(csv_rows)

                print(f'Processed {input_file_path} and saved to {output_csv_path}')


# Example usage
input_folder = 'Judgement_json'  # Replace with the folder containing JSON files
output_folder = 'Error_Judgement_csv'  # Replace with the folder to save generated CSV files
typo_map_file = 'merged_typo_map.txt'  # Path to the new typo map file

process_json_files_in_folders(input_folder, output_folder, typo_map_probability=0.05, random_typo_probability=0.001, typo_map_file=typo_map_file)


Processed Judgement_json\司法院刑事補償法庭_刑事\刑事_103,台聲覆,1_2014-07-22.json and saved to Error_Judgement_csv\司法院刑事補償法庭_刑事\刑事_103,台聲覆,1_2014-07-22_with_typos.csv
Processed Judgement_json\司法院刑事補償法庭_刑事\刑事_103,台聲覆,2_2014-07-22.json and saved to Error_Judgement_csv\司法院刑事補償法庭_刑事\刑事_103,台聲覆,2_2014-07-22_with_typos.csv
Processed Judgement_json\司法院刑事補償法庭_刑事\刑事_103,台職覆,1_2014-02-26.json and saved to Error_Judgement_csv\司法院刑事補償法庭_刑事\刑事_103,台職覆,1_2014-02-26_with_typos.csv
Processed Judgement_json\司法院刑事補償法庭_刑事\刑事_103,台職覆,2_2014-06-18.json and saved to Error_Judgement_csv\司法院刑事補償法庭_刑事\刑事_103,台職覆,2_2014-06-18_with_typos.csv
Processed Judgement_json\司法院刑事補償法庭_刑事\刑事_103,台覆,10_2014-02-26.json and saved to Error_Judgement_csv\司法院刑事補償法庭_刑事\刑事_103,台覆,10_2014-02-26_with_typos.csv
Processed Judgement_json\司法院刑事補償法庭_刑事\刑事_103,台覆,11_2014-02-26.json and saved to Error_Judgement_csv\司法院刑事補償法庭_刑事\刑事_103,台覆,11_2014-02-26_with_typos.csv
Processed Judgement_json\司法院刑事補償法庭_刑事\刑事_103,台覆,12_2014-02-26.json and saved to Error_Ju

In [None]:
# import os
# import json
# import csv
# import random
# import requests
# from PyPDF2 import PdfReader
# import io

# def generate_error_judgement(json_data, typo_map_probability, random_typo_probability, typo_map=None):
#     # Extract the judgement text from JSON or PDF URL
#     judgement_text = json_data.get('judgement', '')

#     # If judgement is empty, check for a PDF URL
#     if not judgement_text and 'attachAsJudgement' in json_data and 'fileUrl' in json_data['attachAsJudgement']:
#         try:
#             judgement_text = extract_text_from_pdf(json_data['attachAsJudgement']['fileUrl'])
#         except ValueError as e:
#             print(f"Error fetching PDF: {e}")
#             return []

#     # Function to determine if a character is Chinese
#     def is_chinese_char(char):
#         return '\u4e00' <= char <= '\u9fff'

#     # Function to randomly generate a wrong Chinese character
#     def random_typo():
#         return chr(random.randint(0x4e00, 0x9fff))

#     # Function to introduce typos into a character
#     def introduce_typo(char):
#         if typo_map and char in typo_map and random.random() < typo_map_probability:
#             return random.choice(typo_map[char])
#         elif random.random() < random_typo_probability:
#             return random_typo()
#         else:
#             return char

#     # If judgement is a list, join the strings
#     if isinstance(judgement_text, list):
#         judgement_text = ' '.join(
#             item if isinstance(item, str) else ''  # Include only strings
#             for item in judgement_text
#         )
#     elif not isinstance(judgement_text, str):
#         raise ValueError(f"Expected a string or list for judgement_text, but got {type(judgement_text)}")

#     # Split the judgement text by sentence using '。' (Chinese period)
#     sentences = [s for s in judgement_text.split('。') if s.strip()]  # Filter out empty sentences

#     # Randomly select half of the sentences to remain error-free
#     error_free_indices = set(random.sample(range(len(sentences)), len(sentences) // 2))

#     # Prepare rows for the CSV output
#     csv_rows = []

#     for idx, sentence in enumerate(sentences):
#         correct_sentence = []
#         error_sentence = []
#         error_labeling = []

#         if idx in error_free_indices:
#             # Add the sentence as-is without introducing typos
#             correct_sentence_str = sentence.replace('\n', '').replace('\r', '') + '。'
#             csv_rows.append([correct_sentence_str, correct_sentence_str, '[0]'])
#             continue

#         # Process the sentence for typos
#         for char in sentence:
#             if char in ['\n', '\r']:
#                 continue
#             if char in ['\u3000']:  # Replace special spaces with standard space
#                 correct_sentence.append(' ')
#                 error_sentence.append(' ')
#                 error_labeling.append('0')  # Mark as correct
#                 continue

#             correct_sentence.append(char)
#             if is_chinese_char(char):  # Only apply typos to Chinese characters
#                 typo_char = introduce_typo(char)
#                 error_sentence.append(typo_char)
#                 error_labeling.append('1' if typo_char != char else '0')
#             else:
#                 # Preserve non-Chinese characters (punctuation, numbers, etc.)
#                 error_sentence.append(char)
#                 error_labeling.append('0')  # Mark as correct for other characters

#         correct_sentence_str = ''.join(correct_sentence) + '。'  # Add period back to sentence
#         error_sentence_str = ''.join(error_sentence) + '。'
#         error_labeling_str = ','.join(error_labeling)

#         # Append the row to the CSV rows list
#         csv_rows.append([correct_sentence_str, error_sentence_str, f'[{error_labeling_str}]'])

#     return csv_rows



# def extract_text_from_pdf(url):
#     """Extract text from a PDF file at the given URL."""
#     response = requests.get(url)
#     if response.status_code == 200:
#         with io.BytesIO(response.content) as pdf_file:
#             reader = PdfReader(pdf_file)
#             text = ""
#             for page in reader.pages:
#                 text += page.extract_text()
#             return text
#     else:
#         raise ValueError(f"Failed to download PDF from {url}, status code: {response.status_code}")

# def process_json_files_in_folders(base_folder, output_folder, typo_map_probability, random_typo_probability, typo_map=None):
#     # Create the output folder if it doesn't exist
#     if not os.path.exists(output_folder):
#         os.makedirs(output_folder)

#     # Walk through all folders and files in the base folder
#     for root, _, files in os.walk(base_folder):
#         for filename in files:
#             if filename.endswith('.json'):  # Process JSON files only
#                 input_file_path = os.path.join(root, filename)
#                 if os.path.getsize(input_file_path) == 0:
#                     print(f"Skipping empty file: {input_file_path}")
#                     continue

#                 # Read the JSON data from the file
#                 with open(input_file_path, 'r', encoding='utf-8') as f:
#                     for line in f:
#                         json_data = json.loads(line.strip())

#                 # Generate error sentences and labels
#                 csv_rows = generate_error_judgement(json_data, typo_map_probability, random_typo_probability, typo_map)

#                 # Prepare the output CSV file path
#                 relative_path = os.path.relpath(root, base_folder)  # Keep relative folder structure
#                 output_dir = os.path.join(output_folder, relative_path)
#                 os.makedirs(output_dir, exist_ok=True)
#                 output_csv_path = os.path.join(output_dir, filename.replace('.json', '_with_typos.csv'))

#                 # Write to a CSV file
#                 with open(output_csv_path, 'w', newline='', encoding='utf-8') as csvfile:
#                     csv_writer = csv.writer(csvfile)
#                     csv_writer.writerow(['Correct Sentence', 'Error Sentence', 'Position Error Labeling'])
#                     csv_writer.writerows(csv_rows)

#                 print(f'Processed {input_file_path} and saved to {output_csv_path}')

# # Example usage
# input_folder = 'Judgement_json'  # Replace with the folder containing JSON files
# output_folder = 'Error_Judgement_csv'  # Replace with the folder to save generated CSV files

# # Sample typo map for generating specific typos
# typo_map_example = {
#     '判': ['盼', '拚', '畔'],
#     '訟': ['送', '頌', '鬆'],
#     '責': ['債', '賊', '澤'],
#     '刑': ['型', '行', '醒'],
#     '訴': ['數', '塑', '俗'],
#     '罰': ['伐', '閥', '乏'],
#     '證': ['正', '徵', '政'],
#     '權': ['勸', '全', '拳'],
#     '職': ['植', '執', '值'],
#     '賠': ['陪', '培', '牌'],
#     '約': ['躍', '藥', '鑰'],
#     '執': ['植', '值', '直'],
#     '罪': ['最', '嘴', '醉'],
#     '辯': ['辨', '變', '辦'],
#     '檢': ['簡', '減', '撿'],
#     '審': ['神', '甚', '深'],
#     '復': ['複', '覆', '服'],
#     '協': ['洽', '鞋', '挾'],
#     '令': ['領', '靈', '零'],
#     '拘': ['居', '矩', '俱'],
#     '控': ['空', '孔', '恐'],
#     '違': ['圍', '唯', '危'],
#     '申': ['伸', '身', '紳'],
#     '賦': ['赴', '副', '傅'],
#     '裁': ['才', '材', '財'],
#     '契': ['鍥', '汽', '凱'],
#     '遺': ['移', '疑', '倚'],
#     '釋': ['式', '識', '適'],
#     '羈': ['寄', '雞', '跡'],
#     '召': ['兆', '昭', '肇'],
#     '案': ['按', '暗', '岸'],
#     '示': ['市', '式', '事'],
#     '註': ['注', '駐', '祝'],
#     '終': ['鐘', '忠', '中'],
#     '律': ['慮', '履', '旅'],
#     '調': ['條', '跳', '挑'],
#     '議': ['異', '藝', '義'],
#     '罷': ['霸', '爸', '怕'],
#     '諭': ['喻', '裕', '愉'],
#     '涉': ['設', '射', '社'],
#     '懲': ['誠', '城', '承'],
#     '處': ['除', '儲', '初'],
#     '繼': ['技', '記', '既'],
#     '典': ['顛', '佃', '甸'],
#     '策': ['側', '測', '冊'],
#     '德': ['得', '特', '惪'],
#     '護': ['滬', '互', '糊'],
#     '憲': ['線', '現', '縣'],
#     '預': ['喻', '裕', '愉'],
#     '勞': ['牢', '撈', '澇'],
#     '償': ['常', '場', '嘗'],
#     '屬': ['熟', '蜀', '署'],
#     '監': ['簡', '間', '減'],
#     '授': ['售', '獸', '瘦'],
#     '額': ['俄', '鵝', '餓'],
#     '規': ['歸', '貴', '鬼'],
#     '效': ['校', '孝', '肖'],
#     '施': ['是', '室', '試'],
#     '應': ['影', '英', '映'],
#     '為': ['未', '唯', '位'],
#     '計': ['際', '既', '技'],
#     '劃': ['畫', '話', '化'],
#     '績': ['跡', '脊', '激'],
#     '整': ['正', '征', '證'],
#     '登': ['燈', '凳', '等'],
#     '條': ['跳', '挑', '調'],
#     '憑': ['瓶', '平', '屏'],
#     '索': ['縮', '鎖', '朔'],
#     '協': ['洽', '鞋', '挾'],
#     '險': ['現', '顯', '鹹'],
#     '損': ['孫', '遜', '筍'],
#     '收': ['壽', '首', '售'],
#     '維': ['唯', '危', '偉'],
#     '駁': ['博', '薄', '撥'],
#     '糧': ['粮', '梁', '良'],
#     '覆': ['複', '復', '副'],
#     '複': ['復', '覆', '褒'],
#     '靈': ['零', '領', '齡'],
#     '檢': ['簡', '撿', '減'],
#     '繩': ['勝', '乘', '聲'],
#     '精': ['睛', '清', '靜']
# }


# # Process all JSON files in the input folder and save results in the output folder
# process_json_files_in_folders(input_folder, output_folder, typo_map_probability=0.05, random_typo_probability=0.001, typo_map=typo_map_example)


In [7]:
import os
import pandas as pd
from transformers import BertTokenizer
from Levenshtein import editops

def process_all_folders_and_files(base_folder, output_file):
    # Initialize the BERT tokenizer for Chinese
    tokenizer = BertTokenizer.from_pretrained('google-bert/bert-base-multilingual-cased')

    results = []  # List to store results

    # Traverse all folders and files in the base folder
    for root, dirs, files in os.walk(base_folder):
        for file_name in files:
            file_path = os.path.join(root, file_name)
            if file_name.endswith('.csv'):  # Process only CSV files
                print(f"Processing file: {file_path}")

                # Read the CSV file
                df = pd.read_csv(file_path)

                # Ensure the necessary columns exist in the file
                if 'Correct Sentence' not in df.columns or 'Error Sentence' not in df.columns:
                    print(f"Skipping file {file_name}: Required columns missing.")
                    continue

                # Process each row in the CSV file
                for idx, row in df.iterrows():
                    correct_sentence = row['Correct Sentence']  # Get the correct sentence
                    error_sentence = row['Error Sentence']  # Get the error sentence

                    # Tokenize the sentences using BERT tokenizer
                    correct_tokens = tokenizer.tokenize(correct_sentence)
                    error_tokens = tokenizer.tokenize(error_sentence)

                    # Compute Levenshtein edit operations to transform error tokens into correct tokens
                    edits = editops(error_tokens, correct_tokens)

                    # Generate 01 labeling for error sentence tokens
                    labeling = [0] * len(error_tokens)
                    for op in edits:
                        if op[0] == 'replace' or op[0] == 'delete':  # Mark positions requiring replacement or deletion
                            labeling[op[1]] = 1
                        elif op[0] == 'insert':
                            # Handle insert operations if needed (not applicable for 01 labeling in this case)
                            pass

                    # Save the result
                    results.append({
                        "Error Sentence": error_sentence,
                        "Correct Sentence": correct_sentence,
                        "01 Labeling": labeling
                    })

    # Convert the results list into a DataFrame and save as a CSV file
    result_df = pd.DataFrame(results)
    result_df.to_csv(output_file, index=False, encoding='utf-8-sig')
    print(f"Results saved to {output_file}")

# Example usage
process_all_folders_and_files('./Error_Judgement_csv', './ErrorSentenceWithLabel/ErrorSentenceWithLabel.csv')


  from .autonotebook import tqdm as notebook_tqdm


Processing file: ./Error_Judgement_csv\司法院刑事補償法庭_刑事\刑事_103,台聲覆,1_2014-07-22_with_typos.csv
Processing file: ./Error_Judgement_csv\司法院刑事補償法庭_刑事\刑事_103,台聲覆,2_2014-07-22_with_typos.csv
Processing file: ./Error_Judgement_csv\司法院刑事補償法庭_刑事\刑事_103,台職覆,1_2014-02-26_with_typos.csv
Processing file: ./Error_Judgement_csv\司法院刑事補償法庭_刑事\刑事_103,台職覆,2_2014-06-18_with_typos.csv
Processing file: ./Error_Judgement_csv\司法院刑事補償法庭_刑事\刑事_103,台覆,10_2014-02-26_with_typos.csv
Processing file: ./Error_Judgement_csv\司法院刑事補償法庭_刑事\刑事_103,台覆,11_2014-02-26_with_typos.csv
Processing file: ./Error_Judgement_csv\司法院刑事補償法庭_刑事\刑事_103,台覆,12_2014-02-26_with_typos.csv
Processing file: ./Error_Judgement_csv\司法院刑事補償法庭_刑事\刑事_103,台覆,13_2014-03-26_with_typos.csv
Processing file: ./Error_Judgement_csv\司法院刑事補償法庭_刑事\刑事_103,台覆,14_2014-03-26_with_typos.csv
Processing file: ./Error_Judgement_csv\司法院刑事補償法庭_刑事\刑事_103,台覆,15_2014-03-26_with_typos.csv
Processing file: ./Error_Judgement_csv\司法院刑事補償法庭_刑事\刑事_103,台覆,16_2014-03-26_with_typos.csv

In [16]:
import pandas as pd

# File paths
error_sentence_file = "./Generate_Judgement_Error_Data/ErrorSentenceWithLabel.csv"
speech_correction_file = "./Generate_Judgement_Error_Data/speech_correction.csv"
output_file = "./Generate_Judgement_Error_Data./271K&Speech_error.csv"

# Load the datasets
error_sentence_df = pd.read_csv(error_sentence_file)
speech_correction_df = pd.read_csv(speech_correction_file)

# Remove 01 labeling from ErrorSentenceWithLabel
if "Position Error Labeling" in error_sentence_df.columns:
    error_sentence_df.drop(columns=["Position Error Labeling"], inplace=True)

# Remove newlines in speech_correction and ensure each row is single line
speech_correction_df.replace(r'\r', '', regex=True, inplace=True)

# Rename columns for merging
error_sentence_df.rename(columns={
    "Error Sentence": "Wrong_text",
    "Correct Sentence": "Truth_text"
}, inplace=True)

# speech_correction_df.rename(columns={
#     "Error Sentence": "wrong_text",
#     "Correct Sentence": "truth_text"
# }, inplace=True)

# Combine the two datasets
merged_df = pd.concat([error_sentence_df, speech_correction_df], ignore_index=True)

# Select only the required columns
merged_df = merged_df[["Wrong_text", "Truth_text"]]

# Save the merged output to a new file
merged_df.to_csv(output_file, index=False)

output_file

'./Generate_Judgement_Error_Data./271K&Speech_error.csv'

In [None]:
# Correct column names and format the data
formatted_data = []
for index, row in csv_data.iterrows():
    wrong_text = row['Wrong_text']
    truth_text = row['Truth_text']
    formatted_row = f"{wrong_text}\t{truth_text}\n"
    formatted_data.append(formatted_row)

# Save to new TXT file
with open(output_txt_file, 'w', encoding='utf-8') as output_file:
    output_file.writelines(formatted_data)

output_txt_file
