In [24]:
from IPython.core.interactiveshell import InteractiveShell
# Show the value of every top-level expression in a cell, not just the last one.
InteractiveShell.ast_node_interactivity = "all"
# Load the autoreload extension; mode 2 reloads imported modules before each
# cell runs, so edits to local .py files are picked up without a kernel restart.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [25]:
import os
import glob
import numpy as np

# Remove PII from OCR text files

In [28]:
import json
import re

def normalize_for_matching(text):
    """Return ``text`` with every run of whitespace deleted.

    Used to compare strings while ignoring spaces, tabs and line breaks.
    """
    # Splitting on whitespace runs and gluing the pieces back together drops
    # exactly the characters the pattern matches.
    pieces = re.split(r'\s+', text)
    return ''.join(pieces)

def _strip_whitespace_indexed(text):
    """Return ``text`` without whitespace and, for each kept character, its index in ``text``."""
    kept = [(idx, ch) for idx, ch in enumerate(text) if not ch.isspace()]
    stripped = ''.join(ch for _, ch in kept)
    positions = [idx for idx, _ in kept]
    return stripped, positions


def _find_spans_ignoring_whitespace(text, needle):
    """Return non-overlapping ``(start, end)`` slices of ``text`` that equal ``needle`` when whitespace is ignored."""
    needle_stripped, _ = _strip_whitespace_indexed(needle)
    if not needle_stripped:
        # An empty needle would "match" at position 0 and insert a bogus marker.
        return []

    haystack, positions = _strip_whitespace_indexed(text)
    spans = []
    hit = haystack.find(needle_stripped)
    while hit != -1:
        last = hit + len(needle_stripped) - 1
        # Map back to the original text: the span runs from the first matched
        # non-whitespace character to just after the last one, so whitespace
        # around the match is left untouched.
        spans.append((positions[hit], positions[last] + 1))
        hit = haystack.find(needle_stripped, last + 1)
    return spans


def remove_pii(fn, txt_folder, json_pii_folder, output_folder=None, verbose=True):
    """
    Redact labelled PII values from a text file and save the redacted copy.

    The labels are read from the JSON file that shares the text file's base
    name. Each labelled value is matched against the text while ignoring
    whitespace on both sides, and every occurrence is replaced with a
    ``[REDACTED_<FIELD>]`` marker.

    Args:
        fn: File name of the text file (e.g. ``'sample.txt'``).
        txt_folder: Folder containing ``fn``.
        json_pii_folder: Folder containing the matching ``.json`` label file.
        output_folder: Folder to write the redacted file to. When ``None`` it
            is derived from ``txt_folder`` by replacing ``'/txt/'`` with
            ``'/txt_no_pii/'``.
        verbose: When true, print diagnostics for files where not every
            checked field was redacted.

    Returns:
        Tuple ``(count, removed_fields)``: the number of fields redacted and
        their names in the order they were processed.

    Raises:
        ValueError: If the output path resolves to the input file, which would
            overwrite the un-redacted source.
    """
    input_fn = os.path.join(txt_folder, fn)

    if output_folder is None:
        output_folder = txt_folder.replace('/txt/', '/txt_no_pii/')
    output_fn = os.path.join(output_folder, fn)

    # If txt_folder has no '/txt/' segment the derived folder equals the input
    # folder; refuse to clobber the source instead of silently overwriting it.
    if os.path.abspath(output_fn) == os.path.abspath(input_fn):
        raise ValueError(
            f"Output path {output_fn!r} is the same as the input file; "
            "pass output_folder explicitly."
        )

    # Read txt file content
    with open(input_fn, 'r', encoding='utf-8') as f:
        txt_content = f.read()

    # Read JSON PII file content (same base name, .json extension)
    json_fn = os.path.splitext(fn)[0] + '.json'
    with open(os.path.join(json_pii_folder, json_fn), 'r', encoding='utf-8') as f:
        json_pii = json.load(f)

    # Track what we remove
    removed_fields = []
    original_txt = txt_content

    # Fields to remove (in order of specificity - longer/more specific first)
    fields_to_check = ['address', 'company_name', 'full_name', 'phone_number']

    for field in fields_to_check:
        if field not in json_pii or not json_pii[field]:
            continue

        spans = _find_spans_ignoring_whitespace(txt_content, str(json_pii[field]))
        if not spans:
            continue

        redaction = f"[REDACTED_{field.upper()}]"
        # Replace right-to-left so earlier span offsets remain valid.
        for start_pos, end_pos in reversed(spans):
            txt_content = txt_content[:start_pos] + redaction + txt_content[end_pos:]

        removed_fields.append(field)

    # Save the redacted content
    os.makedirs(os.path.dirname(output_fn), exist_ok=True)
    with open(output_fn, 'w', encoding='utf-8') as f:
        f.write(txt_content)

    # NOTE(review): these diagnostics echo the raw labels and the un-redacted
    # text; pass verbose=False (or clear the output) before sharing the notebook.
    if verbose and len(removed_fields) != len(fields_to_check):
        print(fn)
        print(json_pii)
        print(removed_fields)

        print("original", original_txt)

        print("no pii", txt_content)

    return len(removed_fields), removed_fields


In [31]:
# pdf_folder='../data/pdf/'
# Input / output locations for the PII-removal run.
txt_folder='../data/txt/'
txt_no_pii_folder = '../data/txt_no_pii/'
json_pii_folder = '../data/json_pii/'
os.makedirs(txt_no_pii_folder, exist_ok=True)

# Number of files (in sorted order) to process in this run.
MAX_FILES = 38
# remove_pii checks four fields; fewer removals means a label was not found.
EXPECTED_FIELD_COUNT = 4

# Keep only the Tesseract OCR outputs. The filter is applied to the file name
# (not the whole path), and os.path.basename handles OS-native separators that
# glob may return.
fns = sorted(
    name
    for name in (os.path.basename(path) for path in glob.glob(f'{txt_folder}*.txt'))
    if 'tess' in name
)

# c counts the files for which not every expected field could be redacted.
c = 0
for fn in fns[0:MAX_FILES]:
    count, fields = remove_pii(fn, txt_folder, json_pii_folder)
    if count != EXPECTED_FIELD_COUNT:
        c+=1
        print(f"  WARNING: Expected {EXPECTED_FIELD_COUNT} fields but removed {count}, {fields} on {fn}")

print(c)

tesseract_Redactor_sample_0001.txt
{'full_name': '早稲田 太郎', 'company_name': 'ソノソフィア・ミラー株式会社オリオンシステムズ', 'address': '東京都千代田区大手町 1-9-2 大手町フィナンシャルシティ03-6262-1212', 'phone_number': '03-6262-1212'}
['address', 'company_name', 'full_name']
original 
--- Page 1 ---
氏名 (フルネーム)会社名会社所在地電話番号 令和 7 年 10 月 11 日労働条件通知書兼雇用契約書 (シンプル)ソノソフィア・ミラー株式会社オリオンシステムズ〒 100-0004 東京都千代田区大手町 1-9-2 大手町フィナンシャルシティ03-6262-1212次の労働条件によって雇用契約を締結します。雇用期間更新の有無試用期間就業の場所仕事の内容各業時間時間外・休日休日休暇退職金額割増率貞金締切・支払その他V期間の定めあり (令和7 年 11 月 1 日一令和 8 年 10 月 31 日)更新する場合があり得る (基準 : 業務量/成績・態度能カプ経営状況進捗)令和 7 年 11 月 1 日ー令和 8 年 1 月 31 日本社 (上記所在地)産業機械の組立補助・検査・付随事務始業 9.00 終業 18:00 休憩 12:00て13:00所定時間外有 (約 20 時間/月) 休日労働無土曜・日曜・祝日・年末年始 (12/29一1/3) 年間 120 日年次有給 (法定) 特別休暇 (慶事 3 日有給)定年 60 蔵ノ継続雇用 65 歳までその他は就業規則による月給 320.000 円、役職手当 10.000 円、通勤手当実費 (上限 30.000 円)時間外 (法定超月 60h 以内) 25%/ (60h 超) 50%%、休日 (法定) 35%、深夜25%毎月末日締切翌月 25 日支払 (口座振込)社会保険・雇用保険・労災保険加入  使用者 : 代表取締役早稲田太郎           ⑧ (会社名・所在地は上記のとおり)

no pii 
--- Page 1 ---
氏名 (フルネーム)会社名会社所在地電話番号 令和 7 年 10 月 11 日労働

In [2]:
# Folder whose *.txt files are globbed and scored by the evaluation cell below.
txt_folder = '../data/unstructured_pii/'

In [3]:
import pandas as pd
import glob
import os


def normalize(txt):
    """Return ``txt`` with all whitespace and the postal mark ``〒`` removed.

    Used to compare label values with OCR text regardless of spacing and
    line breaks.

    Args:
        txt: Text to normalize.

    Returns:
        The text with every whitespace character (including tabs, carriage
        returns and the full-width space U+3000) and every ``〒`` deleted.
    """
    # str.split() with no separator splits on runs of any whitespace, so
    # joining the pieces drops tabs, '\r' and full-width spaces too, not just
    # ' ' and '\n'.
    without_whitespace = ''.join(txt.split())
    return without_whitespace.replace('〒', '')

master_labels_fn = '../data/data_generation/master_label.csv'

# Read master labels
master_labels = pd.read_csv(master_labels_fn)
print("Master Labels:")
print(master_labels)
print("\n" + "="*80 + "\n")

# Get all txt files
txt_files = sorted(glob.glob(txt_folder + '*.txt'))

# Columns to check (excluding file_name and template_id)
columns_to_check = ['full_name', 'company_name', 'company_address', 'phone_number']

# Column layout of the results table; fixing it up front keeps the summary
# below working even when no file produced a result.
summary_columns = ['txt_file', 'found', 'total', 'percentage']
result_columns = ['txt_file', 'base_name', 'found', 'total', 'percentage'] + columns_to_check

# Results storage (one dict per evaluated file)
results = []

for txt_file in txt_files:
    txt_filename = os.path.basename(txt_file)

    # Extract base filename (e.g., "Redactor_sample_0001" from "paddle_Redactor_sample_0001.txt")
    # This handles both tesseract_ and paddle_ prefixes
    base_name = txt_filename.replace('paddle_', '').replace('tesseract_', '').replace('.txt', '')

    # Find corresponding row in master_labels
    label_row = master_labels[master_labels['file_name'] == base_name]

    if label_row.empty:
        print(f"Warning: No labels found for {base_name}")
        continue

    # Read txt file content
    with open(txt_file, 'r', encoding='utf-8') as f:
        txt_content = f.read()

    # Remove spaces and newlines for better matching
    txt_content_normalized = normalize(txt_content)

    print(f"Evaluating: {txt_filename}")
    print(f"Base name: {base_name}")

    # Check each column
    found_count = 0
    total_count = 0
    column_results = {}

    for col in columns_to_check:
        raw_label = label_row[col].values[0]
        label_value = str(raw_label)

        # Test for a missing label on the raw cell value: once converted to
        # str a NaN is just the text 'nan' and pd.notna() can no longer see it.
        # The 'nan' string comparison is kept for backward compatibility.
        if pd.isna(raw_label) or label_value == 'nan':
            column_results[col] = 'N/A'
            continue

        total_count += 1
        # Normalize label value (remove spaces)
        label_normalized = normalize(label_value)

        # Check if label is in txt content
        if label_normalized in txt_content_normalized:
            found_count += 1
            column_results[col] = 'FOUND'
            print(f"  ✓ {col}: '{label_value}' - FOUND")
        else:
            column_results[col] = 'NOT FOUND'
            print(f"  ✗ {col}: '{label_value}' - NOT FOUND")

    # Calculate percentage (0 when the row has no labels at all)
    percentage = (found_count / total_count * 100) if total_count > 0 else 0

    print(f"  → Found: {found_count}/{total_count} ({percentage:.1f}%)")
    print()

    # Store results
    result_dict = {
        'txt_file': txt_filename,
        'base_name': base_name,
        'found': found_count,
        'total': total_count,
        'percentage': percentage
    }
    result_dict.update(column_results)
    results.append(result_dict)

# Create results DataFrame; explicit columns give an empty-but-well-formed
# frame when `results` is empty instead of a KeyError on the selection below.
results_df = pd.DataFrame(results, columns=result_columns)
print("\n" + "="*80)
print("SUMMARY OF RESULTS:")
print("="*80)
print(results_df[summary_columns])

print("\n" + "="*80)
print("DETAILED RESULTS BY COLUMN:")
print("="*80)
print(results_df)

# Save results to CSV
output_file = '../data/ocr_evaluation_results.csv'
results_df.to_csv(output_file, index=False)
print(f"\nResults saved to: {output_file}")

Master Labels:
               file_name  full_name    company_name  \
0   Redactor_sample_0001   ソフィア・ミラー   株式会社オリオンシステムズ   
1   Redactor_sample_0002   キム・ミンジュン      ネクサス貿易合同会社   
2   Redactor_sample_0003  エミリー・ブラウン  スカイネット・ロジスティクス   
3   Redactor_sample_0004    チャン・ウェイ   株式会社アース・クリエイト   
4   Redactor_sample_0005   ルイス・カルロス   株式会社オリオンシステムズ   
..                   ...        ...             ...   
59  Redactor_sample_0060      野村 美穂        リバーサイド工業   
60  Redactor_sample_0061      古川 幸助    株式会社アルファドライブ   
61  Redactor_sample_0062       菊地 栞       合同会社サンセット   
62  Redactor_sample_0063      千葉 雄大    株式会社アルファドライブ   
63  Redactor_sample_0064      上野 莉奈       合同会社サンセット   

                               company_address  phone_number  template_id  
0      〒100-0004 東京都千代田区大手町1-9-2 大手町フィナンシャルシティ  03-6262-1212            1  
1   〒220-0012 神奈川県横浜市西区みなとみらい2-2-1 横浜ランドマークタワー  045-222-5501            1  
2           〒530-0011 大阪府大阪市北区大深町3-1 グランフロント大阪  06-6372-6789            1  
3         〒812-0012 