In [1]:
# Notebook display/reload configuration.
from IPython.core.interactiveshell import InteractiveShell
# "all": display the value of every top-level expression in a cell,
# not only the last one (IPython's default is "last_expr").
InteractiveShell.ast_node_interactivity = "all"
# autoreload mode 2: re-import edited modules before each cell runs,
# so changes to local .py files are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import os
import glob
import numpy as np
from paddleocr import PaddleOCR
import pdf2image
import pytesseract

# pdf2txt: OCR PDFs to text (Tesseract and PaddleOCR)

In [3]:
def pdf2txt_tesseract(fn, folder_in, folder_out, override=False):
    """OCR a PDF with Tesseract (Japanese) and save the text under ``folder_out``.

    Every page is rasterised at 300 dpi and recognised with ``lang='jpn'``.
    The output contains one ``--- Page N ---`` marker per page followed by the
    page text with its internal line breaks removed. It is written to
    ``folder_out + 'tesseract_' + fn`` with ``.pdf`` replaced by ``.txt``.

    Args:
        fn: File name of the PDF (no directory part).
        folder_in: Directory prefix of the PDF. It is concatenated with ``fn``
            as-is, so it must end with a path separator.
        folder_out: Directory prefix for the text file (same concatenation rule).
        override: When False, the PDF is skipped if its Tesseract output file
            already exists. When True, it is always re-processed.

    Returns:
        None. Progress is printed; any exception is caught and reported on
        stdout so that a batch run continues with the next file.
    """
    preview_chars = 50  # length of the console preview printed after OCR
    try:
        # The skip check must look at the file this function actually writes
        # (the 'tesseract_' prefix). Checking another engine's output would
        # either redo finished work or skip PDFs never processed here.
        output_filename = folder_out + 'tesseract_' + fn.replace('.pdf', '.txt')
        if os.path.exists(output_filename) and not override:
            return

        pages = pdf2image.convert_from_path(folder_in + fn, dpi=300)
        print(f"PDF converted to {len(pages)} page(s)")

        # Extract text from each page using OCR
        chunks = []
        for page_no, page in enumerate(pages, start=1):
            print(f"Processing page {page_no}...")
            # Use pytesseract to extract text from the image
            page_text = pytesseract.image_to_string(page, lang='jpn')
            chunks.append(f"\n--- Page {page_no} ---\n")
            # Drop line breaks so each page becomes a single line of text
            chunks.append(page_text.replace('\n', ''))
            chunks.append("\n")
        extracted_text = "".join(chunks)

        print(f"OCR extraction completed. Total characters extracted: {len(extracted_text)}")

        # Show a short whitespace-free preview; the label reports the real length
        print(f"\n--- Text Preview (first {preview_chars} characters) ---")
        print(extracted_text.replace(' ', '').replace('\n', '')[:preview_chars])

        # Save extracted text to a file
        with open(output_filename, 'w', encoding='utf-8') as f:
            f.write(extracted_text)
        print(f"\nFull extracted text saved to: {output_filename}")

    except Exception as e:
        print(f"Error processing PDF: {str(e)}")
        
        
# PaddleOCR engines keyed by language. Constructing an engine loads its models,
# so one instance is kept and reused for every PDF in a batch.
_PADDLE_OCR_ENGINES = {}


def _get_paddle_ocr(lang):
    """Return the cached PaddleOCR engine for ``lang``, creating it on first use."""
    engine = _PADDLE_OCR_ENGINES.get(lang)
    if engine is None:
        engine = PaddleOCR(lang=lang)
        _PADDLE_OCR_ENGINES[lang] = engine
    return engine


def _paddle_page_text(result):
    """Concatenate the recognised strings of one ``ocr.ocr`` result into one line."""
    if not (result and result[0]):  # nothing detected on the page
        return ""
    pieces = []
    for line in result[0]:
        # Each usable entry carries the recognised string at line[1][0]
        if line and len(line) >= 2 and line[1] and line[1][0]:
            pieces.append(line[1][0])
    return "".join(pieces).replace('\n', '')


def pdf2txt_paddle(fn, folder_in, folder_out, override = False):
    """OCR a PDF with PaddleOCR (Japanese) and save the text under ``folder_out``.

    Every page is rasterised at 300 dpi, converted to a numpy array and passed
    to PaddleOCR. The output contains one ``--- Page N ---`` marker per page
    followed by the recognised strings joined into a single line. A page whose
    OCR raises is recorded with a ``(OCR Error)`` marker instead of aborting
    the document. The text is written to ``folder_out + 'paddle_' + fn`` with
    ``.pdf`` replaced by ``.txt``.

    Args:
        fn: File name of the PDF (no directory part).
        folder_in: Directory prefix of the PDF. It is concatenated with ``fn``
            as-is, so it must end with a path separator.
        folder_out: Directory prefix for the text file (same concatenation rule).
        override: When False, the PDF is skipped if its PaddleOCR output file
            already exists. When True, it is always re-processed.

    Returns:
        None. Progress is printed; any exception is caught and reported on
        stdout so that a batch run continues with the next file.
    """
    preview_chars = 50  # length of the console preview printed after OCR
    try:
        output_filename = folder_out + 'paddle_' + fn.replace('.pdf', '.txt')
        if os.path.exists(output_filename) and not override:
            return

        # Reuse one Japanese engine across calls instead of reloading per PDF
        ocr = _get_paddle_ocr('japan')

        pages = pdf2image.convert_from_path(folder_in + fn, dpi=300)
        print(f"PDF converted to {len(pages)} page(s)")

        # Extract text from each page using OCR
        extracted_text = ""
        for page_no, page in enumerate(pages, start=1):
            print(f"Processing page {page_no}...")

            # Convert PIL image to numpy array for PaddleOCR
            page_array = np.array(page)

            try:
                result = ocr.ocr(page_array, cls=True)
                page_text = _paddle_page_text(result)
            except Exception as ocr_error:
                # Keep going: record the failure in the output for this page
                print(f"OCR error on page {page_no}: {str(ocr_error)}")
                extracted_text += f"\n--- Page {page_no} (OCR Error) ---\n"
                extracted_text += f"Error: {str(ocr_error)}\n\n"
                continue

            extracted_text += f"\n--- Page {page_no} ---\n"
            extracted_text += page_text
            extracted_text += "\n"

        print(f"OCR extraction completed. Total characters extracted: {len(extracted_text)}")

        # Show a short preview; the label reports the real preview length
        print(f"\n--- Text Preview (first {preview_chars} characters) ---")
        print(extracted_text.replace(' ', '')[:preview_chars])

        with open(output_filename, 'w', encoding='utf-8') as f:
            f.write(extracted_text)
        print(f"\nFull extracted text saved to: {output_filename}")

    except Exception as e:
        print(f"Error processing PDF: {str(e)}")

In [4]:
# Batch-OCR every PDF in `pdf_folder` with both engines; text files go to `txt_folder`.
# Both folders are concatenated with file names directly, so keep the trailing '/'.
# pdf_folder='../data/pdf/'
pdf_folder='../data/data_generation/generations/Generation_9_10-43am/pdf/'
txt_folder='../data/txt/'

# os.path.basename splits on the platform's own separator; glob joins the
# directory and file name with that separator, which is not always '/'.
fns = sorted(os.path.basename(path) for path in glob.glob(f'{pdf_folder}*.pdf'))
if not fns:
    print(f"No PDF files found in {pdf_folder}")

override = True  # True: redo OCR even when an output text file already exists
for fn in fns:
    pdf2txt_tesseract(fn, pdf_folder, txt_folder, override)
    pdf2txt_paddle(fn, pdf_folder, txt_folder, override)


PDF converted to 1 page(s)
Processing page 1...
OCR extraction completed. Total characters extracted: 694

--- Text Preview (first 500 characters) ---
---Page1---氏名(フルネーム)会社名会社所在地電話番号令和7年10月11日労働条件通知書兼

Full extracted text saved to: ../data/txt/tesseract_Redactor_sample_0001.txt
[2025/10/12 11:29:50] ppocr DEBUG: Namespace(help='==SUPPRESS==', use_gpu=False, use_xpu=False, use_npu=False, use_mlu=False, use_gcu=False, ir_optim=True, use_tensorrt=False, min_subgraph_size=15, precision='fp32', gpu_mem=500, gpu_id=0, image_dir=None, page_num=0, det_algorithm='DB', det_model_dir='/home/df/.paddleocr/whl/det/ml/Multilingual_PP-OCRv3_det_infer', det_limit_side_len=960, det_limit_type='max', det_box_type='quad', det_db_thresh=0.3, det_db_box_thresh=0.6, det_db_unclip_ratio=1.5, max_batch_size=10, use_dilation=False, det_db_score_mode='fast', det_east_score_thresh=0.8, det_east_cover_thresh=0.1, det_east_nms_thresh=0.2, det_sast_score_thresh=0.5, det_sast_nms_thresh=0.2, det_pse_thresh=0, det_p

# Text evaluation: check OCR output against the master labels

In [2]:
# Folder whose *.txt files are evaluated below.
# NOTE(review): this rebinds `txt_folder`, which the OCR cell set to '../data/txt/'
# as its output folder — confirm this is the intended input for the evaluation.
txt_folder = '../data/unstructured_pii/'

In [3]:
import pandas as pd
import glob
import os


def normalize(txt):
    """Return ``txt`` with all whitespace and the postal mark '〒' removed.

    Used on both the OCR text and the label values so that substring matching
    is insensitive to spacing and line breaks.

    Args:
        txt: Text to normalise.

    Returns:
        The text with every whitespace character and every '〒' stripped.
    """
    # str.split() without a separator splits on any Unicode whitespace, so
    # tabs, '\r', form feeds and the ideographic space (U+3000) are removed
    # along with ASCII spaces and newlines.
    txt = ''.join(txt.split())
    return txt.replace('〒', '')

# Evaluate OCR output: for each text file in `txt_folder`, check whether every
# labelled value from the master label CSV occurs (after `normalize`) in the
# OCR text, then print and save the per-file hit rates.
master_labels_fn = '../data/data_generation/master_label.csv'

# Read master labels
master_labels = pd.read_csv(master_labels_fn)
print("Master Labels:")
print(master_labels)
print("\n" + "="*80 + "\n")

# Get all txt files
txt_files = sorted(glob.glob(txt_folder + '*.txt'))

# Columns to check (excluding file_name and template_id)
columns_to_check = ['full_name', 'company_name', 'company_address', 'phone_number']

# Engine prefixes that the pdf2txt_* functions put in front of the PDF's name
engine_prefixes = ('paddle_', 'tesseract_')

# Results storage: one dict per evaluated text file
results = []

for txt_file in txt_files:
    txt_filename = os.path.basename(txt_file)

    # Extract base filename (e.g., "Redactor_sample_0001" from "paddle_Redactor_sample_0001.txt").
    # Only a leading engine prefix is removed, so a base name that happens to
    # contain "paddle_" or "tesseract_" further in is left intact.
    base_name = os.path.splitext(txt_filename)[0]
    for prefix in engine_prefixes:
        if base_name.startswith(prefix):
            base_name = base_name[len(prefix):]
            break

    # Find corresponding row in master_labels
    label_row = master_labels[master_labels['file_name'] == base_name]

    if label_row.empty:
        print(f"Warning: No labels found for {base_name}")
        continue

    # Read txt file content
    with open(txt_file, 'r', encoding='utf-8') as f:
        txt_content = f.read()

    # Remove spaces and newlines for better matching
    txt_content_normalized = normalize(txt_content)

    print(f"Evaluating: {txt_filename}")
    print(f"Base name: {base_name}")

    # Check each column
    found_count = 0
    total_count = 0
    column_results = {}

    for col in columns_to_check:
        # Test for a missing label on the raw cell value: once converted with
        # str(), a missing value is just the text 'nan' and pd.notna is always True.
        raw_value = label_row[col].iloc[0]
        if pd.isna(raw_value):
            column_results[col] = 'N/A'
            continue

        label_value = str(raw_value)
        total_count += 1

        # Normalize the label the same way as the OCR text, then substring-match
        if normalize(label_value) in txt_content_normalized:
            found_count += 1
            column_results[col] = 'FOUND'
            print(f"  ✓ {col}: '{label_value}' - FOUND")
        else:
            column_results[col] = 'NOT FOUND'
            print(f"  ✗ {col}: '{label_value}' - NOT FOUND")

    # Calculate percentage
    percentage = (found_count / total_count * 100) if total_count > 0 else 0

    print(f"  → Found: {found_count}/{total_count} ({percentage:.1f}%)")
    print()

    # Store results
    result_dict = {
        'txt_file': txt_filename,
        'base_name': base_name,
        'found': found_count,
        'total': total_count,
        'percentage': percentage
    }
    result_dict.update(column_results)
    results.append(result_dict)

# Create results DataFrame. Passing the columns explicitly keeps the frame
# well-formed when nothing was evaluated, so the column selection below
# cannot raise KeyError.
result_columns = ['txt_file', 'base_name', 'found', 'total', 'percentage'] + columns_to_check
results_df = pd.DataFrame(results, columns=result_columns)
if results_df.empty:
    print(f"Warning: no text file in {txt_folder} matched a row of {master_labels_fn}")

print("\n" + "="*80)
print("SUMMARY OF RESULTS:")
print("="*80)
print(results_df[['txt_file', 'found', 'total', 'percentage']])

print("\n" + "="*80)
print("DETAILED RESULTS BY COLUMN:")
print("="*80)
print(results_df)

# Save results to CSV
output_file = '../data/ocr_evaluation_results.csv'
results_df.to_csv(output_file, index=False)
print(f"\nResults saved to: {output_file}")

Master Labels:
               file_name  full_name    company_name  \
0   Redactor_sample_0001   ソフィア・ミラー   株式会社オリオンシステムズ   
1   Redactor_sample_0002   キム・ミンジュン      ネクサス貿易合同会社   
2   Redactor_sample_0003  エミリー・ブラウン  スカイネット・ロジスティクス   
3   Redactor_sample_0004    チャン・ウェイ   株式会社アース・クリエイト   
4   Redactor_sample_0005   ルイス・カルロス   株式会社オリオンシステムズ   
..                   ...        ...             ...   
59  Redactor_sample_0060      野村 美穂        リバーサイド工業   
60  Redactor_sample_0061      古川 幸助    株式会社アルファドライブ   
61  Redactor_sample_0062       菊地 栞       合同会社サンセット   
62  Redactor_sample_0063      千葉 雄大    株式会社アルファドライブ   
63  Redactor_sample_0064      上野 莉奈       合同会社サンセット   

                               company_address  phone_number  template_id  
0      〒100-0004 東京都千代田区大手町1-9-2 大手町フィナンシャルシティ  03-6262-1212            1  
1   〒220-0012 神奈川県横浜市西区みなとみらい2-2-1 横浜ランドマークタワー  045-222-5501            1  
2           〒530-0011 大阪府大阪市北区大深町3-1 グランフロント大阪  06-6372-6789            1  
3         〒812-0012 