# Accuracy Report
This notebook visualizes how closely the extracted fields match the ground truth, first for each document and then across the whole batch.

In [2]:
!pip install matplotlib

Collecting matplotlib
  Downloading matplotlib-3.10.8-cp313-cp313-macosx_11_0_arm64.whl.metadata (52 kB)
Collecting contourpy>=1.0.1 (from matplotlib)
  Downloading contourpy-1.3.3-cp313-cp313-macosx_11_0_arm64.whl.metadata (5.5 kB)
Collecting cycler>=0.10 (from matplotlib)
  Using cached cycler-0.12.1-py3-none-any.whl.metadata (3.8 kB)
Collecting fonttools>=4.22.0 (from matplotlib)
  Downloading fonttools-4.61.1-cp313-cp313-macosx_10_13_universal2.whl.metadata (114 kB)
Collecting kiwisolver>=1.3.1 (from matplotlib)
  Downloading kiwisolver-1.4.9-cp313-cp313-macosx_11_0_arm64.whl.metadata (6.3 kB)
Collecting pyparsing>=3 (from matplotlib)
  Downloading pyparsing-3.3.1-py3-none-any.whl.metadata (5.6 kB)
Downloading matplotlib-3.10.8-cp313-cp313-macosx_11_0_arm64.whl (8.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.1/8.1 MB[0m [31m2.0 MB/s[0m  [33m0:00:04[0m eta [36m0:00:01[0m
[?25hDownloading contourpy-1.3.3-cp313-cp313-macosx_11_0_arm64.whl (274 kB)
Usin

In [3]:
import json
import difflib
import os
import pandas as pd
import matplotlib.pyplot as plt
from IPython.display import display, HTML

# Module-level accumulator: generate_visual_report() appends one summary row
# (file id, mean score, number of fields) per document it reports on.
batch_summary = list()

def calculate_accuracy(str1, str2):
    """Return the textual similarity of two values as a percentage.

    Each value is converted to text and stripped of surrounding whitespace,
    then compared with ``difflib.SequenceMatcher``. The matcher's ratio is
    scaled to 0-100 and rounded to two decimals.

    Args:
        str1: First value (any type; converted with ``str``).
        str2: Second value (any type; converted with ``str``).

    Returns:
        float: 100.0 when both values are empty, otherwise the similarity
        percentage rounded to two decimal places.
    """
    # None means "no value". Converting it with str() would yield the literal
    # text "None", which then earns partial (or even full) credit against the
    # other side, so it is mapped to the empty string instead.
    text1 = "" if str1 is None else str(str1).strip()
    text2 = "" if str2 is None else str(str2).strip()

    # Two empty values are considered a perfect match.
    if not text1 and not text2:
        return 100.0

    matcher = difflib.SequenceMatcher(None, text1, text2)
    return round(matcher.ratio() * 100, 2)

def _is_blank(value):
    """Return True when a ground-truth value carries no content to evaluate.

    Blank means None, an empty container, or text that is empty after
    stripping. Numeric zero and False are real values and are NOT blank.
    """
    if value is None:
        return True
    if isinstance(value, (list, dict, tuple)):
        return len(value) == 0
    return str(value).strip() == ""


def get_comparison_data(gt_path, res_path):
    """Compare one extraction result file against its ground-truth file.

    Both files are read as UTF-8 JSON. Every non-blank top-level ground-truth
    field (except ``line_items``) and every non-blank field of every
    ground-truth line item is scored with ``calculate_accuracy`` against the
    value found under the same key (and same line-item position) in the result.

    Args:
        gt_path: Path to the ground-truth JSON file.
        res_path: Path to the extraction-result JSON file.

    Returns:
        pandas.DataFrame: One row per evaluated field with the columns
        ``Field``, ``Accuracy``, ``Ground Truth``, ``Result`` and ``Category``
        ("Header" or "Line Item"). The frame is empty when nothing was
        evaluated.

    Raises:
        OSError: If either file cannot be opened.
        json.JSONDecodeError: If either file is not valid JSON.
    """
    with open(gt_path, 'r', encoding='utf-8') as f:
        gt = json.load(f)
    with open(res_path, 'r', encoding='utf-8') as f:
        res = json.load(f)

    # The result file is produced by the extraction step and may not have the
    # expected shape; anything that is not an object is scored as "all missing".
    if not isinstance(res, dict):
        res = {}

    data = []

    # Top-level fields
    for key, val_gt in gt.items():
        if key == 'line_items':
            continue
        # Skip fields with no ground truth, but keep legitimate falsy values
        # such as 0 or False (a plain truthiness check would drop them).
        if _is_blank(val_gt):
            continue

        val_res = res.get(key, "")
        acc = calculate_accuracy(val_gt, val_res)
        data.append({"Field": key, "Accuracy": acc, "Ground Truth": val_gt, "Result": val_res, "Category": "Header"})

    # Line items: `or []` also covers an explicit JSON null.
    gt_items = gt.get('line_items') or []
    res_items = res.get('line_items') or []
    if not isinstance(res_items, list):
        res_items = []

    for i, item_gt in enumerate(gt_items):
        # Items are paired by position; a missing or malformed result item is
        # treated as an empty one so each of its fields is compared against "".
        item_res = res_items[i] if i < len(res_items) else {}
        if not isinstance(item_res, dict):
            item_res = {}

        for k, v_gt in item_gt.items():
            if _is_blank(v_gt):
                continue
            v_res = item_res.get(k, "")
            acc = calculate_accuracy(v_gt, v_res)
            data.append({"Field": f"Item_{i+1}_{k}", "Accuracy": acc, "Ground Truth": v_gt, "Result": v_res, "Category": "Line Item"})

    return pd.DataFrame(data)

def generate_visual_report(df, filename):
    """Render the per-document accuracy report and record its summary row.

    Displays, in order: a heading, a summary card with the mean accuracy, a
    horizontal bar chart with one bar per field, and a styled table of all
    rows. A row ``{"File", "Score", "Fields"}`` is appended to the global
    ``batch_summary`` list.

    Args:
        df: DataFrame with at least the columns ``Field`` and ``Accuracy``
            (as produced by ``get_comparison_data``).
        filename: Identifier of the document, used in headings and titles.

    Returns:
        None. If ``df`` is empty a message is printed and nothing is recorded.
    """
    # Local import keeps the function self-contained; stdlib only.
    from html import escape

    if df.empty:
        print(f"No data to report for {filename}.")
        return

    overall_acc = df['Accuracy'].mean()

    # Store in summary
    batch_summary.append({"File": filename, "Score": overall_acc, "Fields": len(df)})

    # The identifier comes from a file name; escape it so characters such as
    # '<' or '&' cannot break the surrounding markup.
    safe_name = escape(str(filename))
    display(HTML(f"<h1 style='border-bottom: 2px solid #334155; padding-bottom: 10px; margin-top: 50px;'>Report: {safe_name}</h1>"))

    # 1. Summary Card
    summary_html = f"""
    <div style="padding: 20px; border-radius: 10px; background-color: #f8fafc; border-left: 8px solid #4CAF50; margin-bottom: 24px; box-shadow: 0 4px 6px -1px rgb(0 0 0 / 0.1);">
        <h2 style="margin: 0; color: #1e293b; font-family: sans-serif;">Extraction Accuracy Summary</h2>
        <div style="display: flex; align-items: center; margin-top: 12px;">
            <span style="font-size: 16px; color: #64748b; margin-right: 8px;">Overall Score:</span>
            <span style="color: #16a34a; font-size: 28px; font-weight: bold;">{overall_acc:.2f}%</span>
        </div>
        <p style="margin: 8px 0 0 0; color: #64748b; font-size: 14px;">Total Fields Evaluated: <b style="color: #334155;">{len(df)}</b></p>
    </div>
    """
    display(HTML(summary_html))

    # 2. Bar Chart
    # Height grows with the number of fields, but is floored so that documents
    # with only a few fields still leave room for the title and axis label.
    fig_height = max(2.5, len(df) * 0.45)
    fig, ax = plt.subplots(figsize=(10, fig_height))
    colors = ['#22c55e' if x > 95 else '#84cc16' if x > 90 else '#eab308' if x > 70 else '#ef4444' for x in df['Accuracy']]
    ax.barh(df['Field'], df['Accuracy'], color=colors, height=0.7)
    ax.set_xlabel('Accuracy %')
    ax.set_title(f'Field Accuracy - {filename}')
    ax.set_xlim(0, 110)  # headroom past 100 for the value labels
    ax.grid(axis='x', linestyle='--', alpha=0.3)
    ax.invert_yaxis()  # first field at the top
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)
    for i, v in enumerate(df['Accuracy']):
        ax.text(v + 1, i, f"{v}%", color='#334155', va='center')
    fig.tight_layout()
    plt.show()
    # This function is called once per document; release the figure so a long
    # batch does not accumulate open figures.
    plt.close(fig)

    # 3. Styled Table
    def style_accuracy_col(val):
        """Return the cell CSS for an accuracy value (same bands as the chart)."""
        if val > 95: bg, text = '#f0fdf4', '#166534'
        elif val > 90: bg, text = '#ecfdf5', '#065f46'
        elif val > 70: bg, text = '#fffbeb', '#92400e'
        else: bg, text = '#fef2f2', '#991b1b'
        return f'background-color: {bg}; color: {text}; font-weight: bold;'

    styled_df = df.style.map(style_accuracy_col, subset=['Accuracy'])\
                       .format({"Accuracy": "{:.2f}%"})\
                       .set_properties(**{'text-align': 'left', 'padding': '12px 15px', 'border-bottom': '1px solid #e2e8f0', 'font-family': 'sans-serif'})\
                       .set_table_styles([
                           {'selector': 'th', 'props': [('background-color', '#f8fafc'), ('color', '#475569'), ('font-weight', 'bold'), ('text-transform', 'uppercase'), ('font-size', '12px')]},
                           {'selector': 'tr:hover', 'props': [('background-color', '#f1f5f9')]}
                       ])

    display(styled_df)

# Execution Loop
# NOTE(review): these are absolute, machine-specific paths; point them at your
# own result / ground-truth folders before running on another machine.
result_dir = '/Users/pat/Desktop/custom_FM/working/comparison/result/updated_prompt_result/'
gt_dir = '/Users/pat/Desktop/custom_FM/working/comparison/ground_truth/converted/'

batch_summary = [] # Reset summary so re-running this cell does not duplicate rows

result_prefix = "output_"
result_suffix = ".json"

for res_file in sorted(os.listdir(result_dir)):
    if not (res_file.startswith(result_prefix) and res_file.endswith(result_suffix)):
        continue

    # Identify the ID (e.g., MSV-001). Strip only the leading prefix and the
    # trailing extension: str.replace would also remove any further occurrence
    # of "output_" or ".json" inside the name and yield the wrong ID.
    file_id = res_file[len(result_prefix):-len(result_suffix)]
    gt_file = f"{file_id}.json"

    res_path = os.path.join(result_dir, res_file)
    gt_path = os.path.join(gt_dir, gt_file)

    if os.path.exists(gt_path):
        df_results = get_comparison_data(gt_path, res_path)
        generate_visual_report(df_results, file_id)
    else:
        print(f"Warning: No Ground Truth found for {file_id}")



---

In [5]:
# FINAL BATCH SUMMARY
# Turns the rows collected in `batch_summary` into an aggregate card, a
# per-document bar chart and a leaderboard table.
display(HTML("<h1 style='color: #1e293b; margin-top: 40px;'>Batch Processing Final Results</h1>"))

summary_df = pd.DataFrame(batch_summary)

if summary_df.empty:
    print("No results to summarize.")
else:
    # Aggregate score: unweighted mean of the per-document scores.
    avg_batch_score = summary_df['Score'].mean()

    aggregate_card_html = f"""
    <div style='background: linear-gradient(135deg, #1e293b 0%, #334155 100%); color: white; padding: 30px; border-radius: 15px; margin-bottom: 30px;'>
        <h2 style='margin: 0; opacity: 0.8;'>Aggregate Accuracy</h2>
        <div style='font-size: 48px; font-weight: bold;'>{avg_batch_score:.2f}%</div>
        <div style='margin-top: 10px; font-size: 16px;'>Across {len(summary_df)} documents</div>
    </div>
    """
    display(HTML(aggregate_card_html))

    # Per-document scores with the batch average drawn as a dashed reference line.
    average_label = f'Average ({avg_batch_score:.1f}%)'
    plt.figure(figsize=(12, 6))
    plt.bar(summary_df['File'], summary_df['Score'], color='#6366f1')
    plt.axhline(y=avg_batch_score, color='#ef4444', linestyle='--', label=average_label)
    plt.title("Comparison Score per Document", fontsize=15, fontweight='bold')
    plt.ylabel("Accuracy %")
    plt.ylim(0, 110)
    plt.legend()
    plt.grid(axis='y', alpha=0.2)
    plt.show()

    def final_score_style(val):
        """Return the CSS for a score cell: green above 90, amber above 75, red otherwise."""
        if val > 90:
            color = '#16a34a'
        elif val > 75:
            color = '#eab308'
        else:
            color = '#ef4444'
        return f'color: {color}; font-weight: bold;'

    # Leaderboard table, one row per document, index column hidden.
    display(HTML("<h3>Document Leaderboard</h3>"))
    leaderboard_style = (
        summary_df.style
        .map(final_score_style, subset=['Score'])
        .format({"Score": "{:.2f}%"})
        .set_properties(**{'text-align': 'center', 'padding': '15px'})
        .hide(axis='index')
    )
    display(leaderboard_style)

No results to summarize.
