In [2]:
# Notebook display/reload configuration.
# Setting ast_node_interactivity to "all" makes IPython echo the value of every
# top-level expression in a cell instead of only the last one.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
# Load the autoreload extension and select mode 2, which re-imports modules
# before each cell execution so edits to local .py files are picked up.
# NOTE(review): re-running this cell in a live kernel prints an
# "autoreload extension is already loaded" notice (see the recorded output);
# `%reload_ext autoreload` would avoid it.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [4]:
import os
import glob
import numpy as np
import pandas as pd

In [13]:

# Compare extracted PII fields stored as JSON against the ground-truth
# master label CSV and report overall, per-column and per-file accuracy.
import json


def _is_missing(value):
    """Return True when ``value`` is a scalar null marker (None, NaN, pd.NA)."""
    return pd.api.types.is_scalar(value) and bool(pd.isna(value))


def _values_match(json_val, csv_val):
    """Compare one extracted JSON value with its ground-truth CSV value.

    pandas represents an empty CSV cell as NaN while a missing JSON key comes
    back as None; ``None == nan`` is False, so a plain ``==`` would count
    "both absent" as an error. Two missing values are treated as a match, one
    missing value as a mismatch, and everything else uses exact equality.

    Returns:
        bool: True when the two values agree.
    """
    json_missing = _is_missing(json_val)
    csv_missing = _is_missing(csv_val)
    if json_missing or csv_missing:
        return json_missing and csv_missing
    return bool(json_val == csv_val)


master_labels_fn = '../data/data_generation/master_label.csv'

# Read master labels
master_labels = pd.read_csv(master_labels_fn)
print("Master Labels:")
print(master_labels)
print("\n" + "="*80 + "\n")

# Columns to check (excluding file_name and template_id)
columns_to_check = ['full_name', 'company_name', 'company_address', 'phone_number']


json_pii_folder='../data/json_pii/'
# Keep only base names and filter on the base name itself, so a directory
# component that happens to contain 'paddle' cannot let unrelated files in.
# os.path.basename also handles the platform's path separator.
fns = sorted(
    name
    for name in (os.path.basename(path) for path in glob.glob(f'{json_pii_folder}*.json'))
    if 'paddle' in name
)

# Initialize stats tracking
total_comparisons = 0
total_matches = 0
column_stats = {col: {'matches': 0, 'total': 0} for col in columns_to_check}
file_stats = []

print(f"Found {len(fns)} JSON files to process\n")

for fn in fns:
    json_path = os.path.join(json_pii_folder, fn)
    # The labels contain Japanese text; read the JSON as UTF-8 explicitly
    # instead of relying on the platform's locale encoding.
    with open(json_path, 'r', encoding='utf-8') as f:
        json_data = json.load(f)

    # Derive the master-label key: drop the 'tesseract_' / 'paddle_' prefix
    # (presumably the OCR engine name) and the '.json' extension.
    lookup_name = fn.replace('tesseract_','').replace('paddle_','').replace('.json','')
    print(f"Looking for '{lookup_name}' in master_labels...")

    # Find corresponding row in master_labels
    row = master_labels[master_labels['file_name'] == lookup_name]
    if row.empty:
        print(f"  ❌ No matching row for {fn} (tried: {lookup_name})")
        print(f"  Available filenames in master_labels: {master_labels['file_name'].tolist()[:5]}...")
        continue

    # If several rows share the file name, the first one is used.
    row = row.iloc[0]
    print("  ✓ Found match!")
    print(f"\nComparing {fn}:")

    file_matches = 0
    file_total = 0

    for col in columns_to_check:
        json_val = json_data.get(col, None)
        csv_val = row[col]
        match = _values_match(json_val, csv_val)
        print(f"  {col}: JSON='{json_val}' | CSV='{csv_val}' | Match={match}")

        # Update stats
        total_comparisons += 1
        file_total += 1
        column_stats[col]['total'] += 1

        if match:
            total_matches += 1
            file_matches += 1
            column_stats[col]['matches'] += 1

    file_accuracy = (file_matches / file_total * 100) if file_total > 0 else 0
    file_stats.append({'file': fn, 'matches': file_matches, 'total': file_total, 'accuracy': file_accuracy})
    print(f"  File Accuracy: {file_matches}/{file_total} ({file_accuracy:.2f}%)")
    print("-"*40)

# Print overall statistics
print("\n" + "="*80)
print("OVERALL STATISTICS")
print("="*80)

if total_comparisons == 0:
    print("\n⚠️  WARNING: No files were successfully compared!")
    print("Check that the filenames in JSON folder match those in master_labels.csv")
else:
    # total_comparisons is non-zero in this branch, so the division is safe.
    overall_accuracy = total_matches / total_comparisons * 100
    print(f"\nOverall Accuracy: {total_matches}/{total_comparisons} ({overall_accuracy:.2f}%)")

    print("\n" + "-"*40)
    print("Per-Column Accuracy:")
    print("-"*40)
    for col in columns_to_check:
        matches = column_stats[col]['matches']
        total = column_stats[col]['total']
        accuracy = (matches / total * 100) if total > 0 else 0
        print(f"  {col:20s}: {matches:3d}/{total:3d} ({accuracy:6.2f}%)")

    print("\n" + "-"*40)
    print("Per-File Accuracy:")
    print("-"*40)
    for stat in file_stats:
        print(f"  {stat['file']:40s}: {stat['matches']}/{stat['total']} ({stat['accuracy']:6.2f}%)")

    # Create summary DataFrame
    if file_stats:
        summary_df = pd.DataFrame(file_stats)
        print("\n" + "="*80)
        print("Summary Statistics:")
        print(f"  Mean Accuracy: {summary_df['accuracy'].mean():.2f}%")
        print(f"  Median Accuracy: {summary_df['accuracy'].median():.2f}%")
        print(f"  Min Accuracy: {summary_df['accuracy'].min():.2f}%")
        print(f"  Max Accuracy: {summary_df['accuracy'].max():.2f}%")
        # Sample standard deviation; NaN when only one file was compared.
        print(f"  Std Dev: {summary_df['accuracy'].std():.2f}%")


Master Labels:
               file_name  full_name    company_name  \
0   Redactor_sample_0001   ソフィア・ミラー   株式会社オリオンシステムズ   
1   Redactor_sample_0002   キム・ミンジュン      ネクサス貿易合同会社   
2   Redactor_sample_0003  エミリー・ブラウン  スカイネット・ロジスティクス   
3   Redactor_sample_0004    チャン・ウェイ   株式会社アース・クリエイト   
4   Redactor_sample_0005   ルイス・カルロス   株式会社オリオンシステムズ   
..                   ...        ...             ...   
59  Redactor_sample_0060      野村 美穂        リバーサイド工業   
60  Redactor_sample_0061      古川 幸助    株式会社アルファドライブ   
61  Redactor_sample_0062       菊地 栞       合同会社サンセット   
62  Redactor_sample_0063      千葉 雄大    株式会社アルファドライブ   
63  Redactor_sample_0064      上野 莉奈       合同会社サンセット   

                               company_address  phone_number  template_id  
0      〒100-0004 東京都千代田区大手町1-9-2 大手町フィナンシャルシティ  03-6262-1212            1  
1   〒220-0012 神奈川県横浜市西区みなとみらい2-2-1 横浜ランドマークタワー  045-222-5501            1  
2           〒530-0011 大阪府大阪市北区大深町3-1 グランフロント大阪  06-6372-6789            1  
3         〒812-0012 