# NOTEBOOK UPDATED BY AI ASSISTANT
# Date: 2026-01-05
# Status: Weighted scoring, labels, and colors enabled.

# Accuracy Report
This notebook provides a visual analysis of how accurately the extracted fields match the ground truth, using a weighted score per document and across the whole batch.

In [None]:
import json, difflib, os, pandas as pd
import matplotlib.pyplot as plt
from IPython.display import display, HTML

# One summary record per processed document; generate_visual_report()
# appends to this list and the batch summary cell aggregates it.
batch_summary = []

# Relative weight of each field in the weighted accuracy score. Header
# fields are keyed directly; per-line-item fields are nested under
# "line_items". Fields that are not listed are weighted 1.0 by the
# comparison code.
FIELD_WEIGHTS = dict(
    invoice_number=3.0,
    total_amount=5.0,
    net_amount=3.0,
    invoice_date=2.0,
    total_tax_amount=2.0,
    supplier_tax_id=2.0,
    customer_tax_id=2.0,
    line_items=dict(
        description=0.5,
        quantity=1.0,
        unit_price=1.5,
        amount=2.0,
    ),
)

def calculate_accuracy(str1, str2):
    """Return the similarity of two values as a percentage between 0 and 100.

    Both values are converted to text, stripped of surrounding whitespace and
    compared with ``difflib.SequenceMatcher``; the ratio is scaled to a
    percentage and rounded to two decimals.

    ``None`` is treated as an empty value, so a JSON ``null`` on one side and
    a missing or empty field on the other count as a match instead of being
    compared against the literal text ``"None"``.

    Args:
        str1: First value (any type; converted with ``str``).
        str2: Second value (any type; converted with ``str``).

    Returns:
        float: 100.0 when both values are empty, otherwise the
        ``SequenceMatcher`` ratio times 100, rounded to two decimals.
    """
    text1 = "" if str1 is None else str(str1).strip()
    text2 = "" if str2 is None else str(str2).strip()
    # Two empty values are a perfect match; SequenceMatcher is not consulted.
    if not text1 and not text2:
        return 100.0
    return round(difflib.SequenceMatcher(None, text1, text2).ratio() * 100, 2)

def get_comparison_data(gt_path, res_path):
    """Compare a ground-truth JSON file with an extraction-result JSON file.

    Every top-level key of the ground truth except ``line_items`` becomes one
    'Header' row. Every key of every ground-truth line item becomes one
    'Line Item' row, matched to the result line item at the same index.
    Keys or line items that exist only in the result are not scored.

    Args:
        gt_path: Path to the ground-truth JSON file.
        res_path: Path to the extraction-result JSON file.

    Returns:
        pandas.DataFrame: One row per compared field with the columns
        'Field', 'Accuracy', 'Ground Truth', 'Result', 'Category', 'Weight'.
        The frame is empty when the ground truth has no fields.
    """
    with open(gt_path, 'r', encoding='utf-8') as f:
        gt = json.load(f)
    with open(res_path, 'r', encoding='utf-8') as f:
        res = json.load(f)

    # A JSON null (or any non-list value) for "line_items" is treated as
    # "no line items" rather than crashing on len()/indexing.
    gt_items = gt.get('line_items') or []
    res_items = res.get('line_items') or []
    if not isinstance(res_items, list):
        res_items = []
    item_weights = FIELD_WEIGHTS.get('line_items', {})

    data = []
    # Header Fields
    for k in gt:
        if k == 'line_items':
            continue
        v_gt = gt[k]
        v_res = res.get(k, '')
        data.append({
            'Field': k,
            'Accuracy': calculate_accuracy(v_gt, v_res),
            'Ground Truth': v_gt,
            'Result': v_res,
            'Category': 'Header',
            'Weight': FIELD_WEIGHTS.get(k, 1.0)
        })
    # Line Items
    for i, gi in enumerate(gt_items):
        ri = res_items[i] if i < len(res_items) else {}
        # A malformed result item scores as if every field were missing.
        if not isinstance(ri, dict):
            ri = {}
        for k, v in gi.items():
            v_res = ri.get(k, '')
            data.append({
                'Field': f'Item_{i+1}_{k}',
                'Accuracy': calculate_accuracy(v, v_res),
                'Ground Truth': v,
                'Result': v_res,
                'Category': 'Line Item',
                'Weight': item_weights.get(k, 1.0)
            })
    return pd.DataFrame(data)

def _accuracy_color(v):
    """Map an accuracy percentage to the report's traffic-light hex colour."""
    if v >= 95:
        return '#10b981'
    if v >= 85:
        return '#eab308'
    if v >= 70:
        return '#f97316'
    return '#ef4444'


def generate_visual_report(df, filename):
    """Render the per-document report and record its score in ``batch_summary``.

    Displays, in order: a header card with the weighted score, a horizontal
    bar chart with one bar per field, and a styled table of all rows.

    Args:
        df: DataFrame with the columns 'Field', 'Accuracy', 'Weight',
            'Ground Truth', 'Result' and 'Category'.
        filename: Label shown in the header card and the chart title, and
            stored as 'File' in the batch summary.

    Returns:
        None. Nothing is displayed or recorded when ``df`` is empty.
    """
    import html as _html  # local import: only needed for escaping below

    if df.empty:
        return
    doc_points = (df['Accuracy'] * df['Weight']).sum()
    doc_weight = df['Weight'].sum()
    score = doc_points / doc_weight if doc_weight > 0 else 0
    batch_summary.append({'File': filename, 'Score': score, 'WP': doc_points, 'TW': doc_weight, 'Fields': len(df)})

    # The label comes from a file name on disk; escape it before it is
    # interpolated into raw HTML.
    safe_name = _html.escape(str(filename))

    # Premium Header Card
    display(HTML(f"""
    <div style='background: linear-gradient(135deg, #1e293b 0%, #334155 100%); padding: 30px; border-radius: 20px; color: white; margin-bottom: 30px; box-shadow: 0 10px 15px -3px rgba(0,0,0,0.1);'>
        <div style='display: flex; justify-content: space-between; align-items: center;'>
            <div>
                <h1 style='margin: 0; font-size: 32px; font-weight: 800; letter-spacing: -0.025em;'>Report: {safe_name}</h1>
                <p style='margin: 5px 0 0 0; opacity: 0.7; font-size: 16px;'>Weighted Accuracy Analysis</p>
            </div>
            <div style='text-align: right;'>
                <div style='font-size: 48px; font-weight: 900; line-height: 1;'>{score:.2f}%</div>
                <div style='font-size: 14px; opacity: 0.6; margin-top: 5px;'>TOTAL SCORE</div>
            </div>
        </div>
    </div>
    """))

    # Bar Chart Styling
    # Keep a minimum height so documents with only one or two fields still
    # leave room for the title and axis.
    fig = plt.figure(figsize=(12, max(1.5, len(df) * 0.55)))
    ax = plt.gca()
    colors = [_accuracy_color(x) for x in df['Accuracy']]
    plt.barh(df['Field'], df['Accuracy'], color=colors, height=0.7, edgecolor='white', linewidth=1)
    plt.axvline(x=100, color='#e2e8f0', linestyle='-', linewidth=2, zorder=0)
    for i, v in enumerate(df['Accuracy']):
        plt.text(v + 1.5, i, f'{v:.1f}%', va='center', fontweight='800', color='#475569', fontsize=11)
    # The x-range extends past 100 so the value label of a 100% bar fits.
    plt.xlim(0, 115)
    ax.invert_yaxis()  # first field at the top, matching the table order
    ax.set_axisbelow(True)
    plt.grid(axis='x', color='#f1f5f9', linestyle='-', linewidth=1)
    plt.title(f'Accuracy Breakdown - {filename}', fontsize=16, pad=20, fontweight='bold', color='#1e293b')
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)
    ax.spines['bottom'].set_color('#e2e8f0')
    ax.spines['left'].set_color('#e2e8f0')
    plt.tick_params(colors='#64748b', labelsize=10)
    plt.show()
    # Release the figure so a long batch does not accumulate open figures.
    plt.close(fig)

    # Premium Table Implementation
    def style_acc(v):
        return f'color: {_accuracy_color(v)}; font-weight: 800;'

    styler = df[['Field', 'Accuracy', 'Weight', 'Ground Truth', 'Result', 'Category']].style.hide(axis='index')
    # Styler.applymap was renamed to Styler.map in pandas 2.1; use whichever
    # this pandas version provides.
    apply_cellwise = getattr(styler, 'map', None) or styler.applymap
    styler = apply_cellwise(style_acc, subset=['Accuracy'])\
        .format({'Accuracy': '{:.1f}%', 'Weight': '{:.1f}x'}, escape='html')\
        .set_properties(**{
            'text-align': 'left',
            'padding': '16px 20px',
            'border-bottom': '1px solid #f1f5f9',
            'font-family': 'Inter, system-ui, sans-serif',
            'font-size': '14px',
            'color': '#334155'
        })\
        .set_table_styles([
            {'selector': 'th', 'props': [
                ('background-color', '#f8fafc'),
                ('color', '#64748b'),
                ('font-weight', '700'),
                ('text-transform', 'uppercase'),
                ('font-size', '12px'),
                ('letter-spacing', '0.05em'),
                ('border-bottom', '2px solid #e2e8f0'),
                ('padding', '12px 20px')
            ]},
            {'selector': 'tr:hover', 'props': [('background-color', '#f1f5f9')]}
        ])

    display(HTML(styler.to_html()))

# Batch driver: pair every result file "output_<id>.json" in res_dir with the
# ground-truth file "<id>.json" in gt_dir and render one report per pair.
res_dir = '/Users/pat/Desktop/custom_FM/working/comparison/result/updated_prompt_result/'
gt_dir = '/Users/pat/Desktop/custom_FM/working/comparison/ground_truth/converted/'
# Reset so that re-running this cell does not duplicate summary entries.
batch_summary = []
result_prefix, result_suffix = 'output_', '.json'
missing_gt = []
for f in sorted(os.listdir(res_dir)):
    if not (f.startswith(result_prefix) and f.endswith(result_suffix)):
        continue
    # Document id is the file name without the prefix and the extension.
    fid = f[len(result_prefix):-len(result_suffix)]
    gp, rp = os.path.join(gt_dir, fid+'.json'), os.path.join(res_dir, f)
    if not os.path.exists(gp):
        missing_gt.append(f)
        continue
    generate_visual_report(get_comparison_data(gp, rp), fid)
# Surface skipped files so that the batch score is not silently computed on
# fewer documents than expected.
if missing_gt:
    print(f"Skipped {len(missing_gt)} result file(s) with no matching ground truth: {', '.join(missing_gt)}")


---

In [None]:
# TOTAL BATCH SUMMARY
# Global score = sum of weighted points / sum of weights over all documents,
# so documents with more (or heavier) fields contribute proportionally more.
if batch_summary:
    sdf = pd.DataFrame(batch_summary)
    total_weight = sdf['TW'].sum()
    # Same zero-weight guard as the per-document score.
    total_score = sdf['WP'].sum() / total_weight if total_weight > 0 else 0
    display(HTML(f"""
    <div style='background: linear-gradient(135deg, #4f46e5 0%, #7c3aed 100%); padding: 40px; border-radius: 24px; color: white; text-align: center; box-shadow: 0 20px 25px -5px rgba(0,0,0,0.1); margin: 40px 0;'>
        <div style='font-size: 18px; opacity: 0.8; text-transform: uppercase; letter-spacing: 0.1em; font-weight: 600;'>Global Batch Accuracy</div>
        <div style='font-size: 84px; font-weight: 900; margin: 10px 0;'>{total_score:.2f}%</div>
        <div style='height: 4px; background: rgba(255,255,255,0.2); width: 100px; margin: 20px auto; border-radius: 2px;'></div>
        <div style='font-size: 16px; opacity: 0.8;'>Analyzed {len(sdf)} documents across all datasets</div>
    </div>
    """))
