In [None]:
# ==========================================
# CELL 1: INSTALL DEPENDENCIES (FIXED)
# Run this cell first - It will take 2-3 minutes
# ==========================================

# System dependencies: the Tesseract OCR binary and the Poppler utilities
# (the notebook later imports pytesseract and pdf2image, which rely on them)
!apt-get install -y tesseract-ocr poppler-utils

# Python packages: OCR wrapper, PDF rasterization, image handling,
# fuzzy matching and the Gradio UI
!pip install -q pytesseract pdf2image pillow opencv-python
!pip install -q scikit-learn fuzzywuzzy python-Levenshtein gradio

# Install PaddleOCR with pinned versions (more stable).
# paddleocr itself is installed with --no-deps to avoid dependency conflicts.
!pip install -q paddlepaddle==2.4.2
!pip install -q paddleocr==2.6.1.3 --no-deps

# Install paddleocr dependencies manually (needed because of --no-deps above).
# NOTE: numpy/opencv-python/pillow repeat packages already requested earlier in this cell.
!pip install -q shapely pyclipper lmdb tqdm numpy opencv-python pillow

# Install EasyOCR (used as the always-on OCR backend)
!pip install -q easyocr

# Install sentence-transformers (imported by the matching cell)
!pip install -q sentence-transformers

# Install PyMuPDF for PDF handling (imported as `fitz`)
!pip install -q pymupdf

print("‚úÖ All libraries installed successfully!")

In [None]:
# ==========================================
# CELL 2: IMPORT LIBRARIES (FIXED)
# Run after Cell 1 completes
# ==========================================

import html
import json
import os
import re
import tempfile
import warnings
warnings.filterwarnings('ignore')

# Image processing
import cv2
import numpy as np
from PIL import Image

# PDF handling
import fitz  # PyMuPDF
from pdf2image import convert_from_path

# OCR engines
import pytesseract
import easyocr

# Import PaddleOCR carefully
try:
    from paddleocr import PaddleOCR
    PADDLEOCR_AVAILABLE = True
    print("‚úÖ PaddleOCR loaded")
except Exception as e:
    print(f"‚ö†Ô∏è PaddleOCR not available: {e}")
    PADDLEOCR_AVAILABLE = False

# Matching
from sentence_transformers import SentenceTransformer
from fuzzywuzzy import fuzz

# UI
import gradio as gr

print("‚úÖ Libraries imported successfully!")

In [None]:
# ==========================================
# CELL 3: INITIALIZE OCR ENGINE (FIXED)
# Only runs initialization once
# ==========================================

class OCREngine:
    """Text extraction front end that combines several OCR back ends.

    For images, PaddleOCR (when available), EasyOCR and Tesseract are each
    tried and the longest transcription wins. For PDFs, embedded text is read
    with PyMuPDF first; pages are rasterized and OCR'd only when the PDF
    carries almost no embedded text.
    """

    # A PDF whose embedded text is shorter than this is treated as image-based.
    MIN_EMBEDDED_TEXT_CHARS = 50

    def __init__(self):
        """Load the OCR back ends.

        PaddleOCR is optional: it is skipped when the import failed
        (``PADDLEOCR_AVAILABLE`` is False) or when its constructor raises.
        EasyOCR is always loaded.
        """
        print("üîß Initializing OCR engines...")

        # Initialize PaddleOCR only if available
        self.paddle_ocr = None
        if PADDLEOCR_AVAILABLE:
            try:
                self.paddle_ocr = PaddleOCR(
                    use_angle_cls=True,
                    lang='en',
                    use_gpu=False,
                    show_log=False,
                    use_dilation=True
                )
                print("‚úÖ PaddleOCR initialized")
            except Exception as e:
                print(f"‚ö†Ô∏è PaddleOCR initialization failed: {e}")
                self.paddle_ocr = None

        # EasyOCR - Always available as backup
        print("Loading EasyOCR...")
        self.easy_reader = easyocr.Reader(['en'], gpu=False, verbose=False)
        print("‚úÖ EasyOCR initialized")

        print("‚úÖ OCR engines ready!")

    def preprocess_image(self, image_path):
        """Return a denoised, contrast-enhanced, Otsu-binarized copy of the image.

        Args:
            image_path: Path of the image file to load with OpenCV.

        Returns:
            The binarized single-channel image produced by ``cv2.threshold``.

        Raises:
            ValueError: If OpenCV cannot read the file.
        """
        img = cv2.imread(image_path)
        if img is None:
            # cv2.imread reports a missing/unreadable file by returning None
            # instead of raising, which would otherwise fail inside cvtColor.
            raise ValueError(f"Could not read image: {image_path}")
        gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
        denoised = cv2.fastNlMeansDenoising(gray)
        clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
        enhanced = clahe.apply(denoised)
        _, binary = cv2.threshold(enhanced, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
        return binary

    def extract_from_image(self, image_path):
        """Extract text from an image using every available OCR engine.

        Each engine runs inside its own ``try`` so one failing back end does
        not prevent the others from contributing. Tesseract is run on the raw
        file and, when preprocessing succeeds, on the preprocessed image too.

        Args:
            image_path: Path of the image file.

        Returns:
            The longest transcription produced, or ``""`` if every engine failed.
        """
        print("üì∏ Processing image...")

        # Preprocessing is best-effort: a failure here must not abort the
        # engines that read the file directly.
        binary = None
        try:
            binary = self.preprocess_image(image_path)
        except Exception as e:
            print(f"‚ö†Ô∏è Preprocessing failed: {e}")

        texts = []

        # Try PaddleOCR first (if available)
        if self.paddle_ocr:
            try:
                paddle_result = self.paddle_ocr.ocr(image_path, cls=True)
                if paddle_result and paddle_result[0]:
                    paddle_text = '\n'.join([line[1][0] for line in paddle_result[0]])
                    texts.append(paddle_text)
                    print(f"‚úÖ PaddleOCR: {len(paddle_text)} chars")
            except Exception as e:
                print(f"‚ö†Ô∏è PaddleOCR failed: {e}")

        # EasyOCR (always runs)
        try:
            easy_result = self.easy_reader.readtext(image_path)
            easy_text = '\n'.join([text[1] for text in easy_result])
            texts.append(easy_text)
            print(f"‚úÖ EasyOCR: {len(easy_text)} chars")
        except Exception as e:
            print(f"‚ö†Ô∏è EasyOCR failed: {e}")

        # Tesseract on the untouched file; the context manager closes the handle
        try:
            with Image.open(image_path) as raw_image:
                tess_text = pytesseract.image_to_string(raw_image)
            texts.append(tess_text)
            print(f"‚úÖ Tesseract: {len(tess_text)} chars")
        except Exception as e:
            print(f"‚ö†Ô∏è Tesseract failed: {e}")

        # Tesseract on the preprocessed image, as one more candidate
        if binary is not None:
            try:
                clean_text = pytesseract.image_to_string(Image.fromarray(binary))
                texts.append(clean_text)
                print(f"‚úÖ Tesseract (preprocessed): {len(clean_text)} chars")
            except Exception as e:
                print(f"‚ö†Ô∏è Tesseract (preprocessed) failed: {e}")

        # Return the longest result
        if texts:
            combined = max(texts, key=len)
            print(f"‚úÖ Best result: {len(combined)} characters")
            return combined

        print("‚ùå All OCR methods failed")
        return ""

    def extract_from_pdf(self, pdf_path):
        """Extract text from a PDF, falling back to OCR for image-based files.

        Args:
            pdf_path: Path of the PDF file.

        Returns:
            The extracted text, or ``""`` if anything went wrong.
        """
        print("üìÑ Processing PDF...")

        try:
            # Try direct text extraction; always release the document handle
            doc = fitz.open(pdf_path)
            try:
                text = "".join(page.get_text() for page in doc)
            finally:
                doc.close()

            # If PDF is image-based, use OCR
            if len(text.strip()) < self.MIN_EMBEDDED_TEXT_CHARS:
                print("‚ö†Ô∏è PDF is image-based, using OCR...")
                page_texts = []
                # A private temp directory avoids clobbering files in the working
                # directory and is removed even when OCR raises.
                with tempfile.TemporaryDirectory() as tmp_dir:
                    for i, img in enumerate(convert_from_path(pdf_path)):
                        temp_path = os.path.join(tmp_dir, f'temp_page_{i}.jpg')
                        img.save(temp_path, 'JPEG')
                        page_texts.append(self.extract_from_image(temp_path) + '\n')
                text += "".join(page_texts)
        except Exception as e:
            print(f"‚ùå PDF Error: {e}")
            return ""

        print(f"‚úÖ Extracted {len(text)} characters")
        return text

# Build the shared OCR engine on the first run of this cell only; later runs
# keep whatever `ocr` object already lives in the notebook namespace.
if 'ocr' in globals():
    print("‚ôªÔ∏è OCR engine already initialized")
else:
    print("Starting OCR engine initialization...")
    ocr = OCREngine()
    print("‚úÖ OCR Engine ready!")

In [None]:


# ==========================================
# CELL 4: DATA EXTRACTOR CLASS
# Extracts structured data from text
# ==========================================

class DataExtractor:
    """Pull structured receipt fields out of raw text using regular expressions."""

    # Monetary amount as written on a receipt, e.g. "5.94" or "1,234.56".
    _AMOUNT = r'(\d+[,\.]?\d*\.?\d{2})'

    # ISO (year-first) dates are tried first: otherwise the day-first pattern
    # matches the tail of "2024-01-15" and yields "24-01-15". The digit
    # look-arounds keep either pattern from starting or ending mid-number.
    _DATE_PATTERNS = (
        r'(?<!\d)\d{4}[/-]\d{1,2}[/-]\d{1,2}(?!\d)',
        r'(?<!\d)\d{1,2}[/-]\d{1,2}[/-]\d{2,4}(?!\d)',
    )

    # The ID must contain at least one digit so that a plain word following the
    # label ("Order Summary") is not mistaken for an identifier. An optional
    # "ID"/"No."/"Number" between label and value is skipped.
    _TRANSACTION_PATTERNS = (
        r'(?:transaction|trans|txn|receipt|order)(?:\s*(?:id|no\.?|number|num))?'
        r'[\s#:]+((?=[A-Z0-9-]*\d)[A-Z0-9-]{6,})',
        r'#\s*((?=[A-Z0-9-]*\d)[A-Z0-9-]{6,})',
    )

    # Checked in this order; the first keyword found wins.
    _PAYMENT_KEYWORDS = ('visa', 'mastercard', 'amex', 'cash', 'credit', 'debit')

    # Street address confined to a single line, with a whole-word suffix.
    _LOCATION = re.compile(
        r'\d+[ \t]+[A-Za-z \t]+\b(?:Street|St|Avenue|Ave|Road|Rd)\b', re.I)

    # "<name> <price>" line, price optionally prefixed with "$".
    _ITEM_LINE = re.compile(r'^(.+?)\s+\$?(\d+\.?\d{2})$')

    # Lines that end in a price but are summary/payment rows rather than items.
    _NON_ITEM_WORDS = re.compile(
        r'\b(?:sub[\s-]?total|total|tax|vat|amount|balance|change|cash|'
        r'visa|mastercard|amex|credit|debit)\b', re.I)

    @staticmethod
    def _to_amount(raw):
        """Convert a matched amount string to float; return 0.0 if it is not a number."""
        try:
            return float(raw.replace(',', ''))
        except ValueError:
            # The amount pattern can capture strings such as "12.01.24".
            return 0.0

    @classmethod
    def _find_amount(cls, label, text):
        """Return the first amount following ``label`` (a regex fragment), or 0.0."""
        match = re.search(rf'(?:{label})[\s:$]*{cls._AMOUNT}', text, re.I)
        return cls._to_amount(match.group(1)) if match else 0.0

    def extract(self, text):
        """Parse receipt text into a dictionary of fields.

        Args:
            text: Raw text of the receipt; ``None`` is treated as empty.

        Returns:
            dict with keys ``merchant_name``, ``location``, ``date``, ``time``,
            ``transaction_id``, ``payment_method`` (strings, ``''`` when not
            found), ``items`` (list of ``{'name', 'quantity', 'price'}``),
            ``item_count`` and ``subtotal``/``tax``/``total`` (floats, 0.0 when
            not found).
        """
        text = text or ''
        data = {'merchant_name': '', 'location': '', 'date': '', 'time': '',
                'transaction_id': '', 'payment_method': '', 'items': [],
                'item_count': 0, 'subtotal': 0.0, 'tax': 0.0, 'total': 0.0}

        lines = [ln.strip() for ln in text.split('\n') if ln.strip()]
        if lines:
            # The first non-empty line is taken as the merchant name.
            data['merchant_name'] = lines[0][:50]

        # Date
        for pattern in self._DATE_PATTERNS:
            if match := re.search(pattern, text):
                data['date'] = match.group(0)
                break

        # Time
        if match := re.search(r'\d{1,2}:\d{2}(:\d{2})?(\s?[AP]M)?', text, re.I):
            data['time'] = match.group(0)

        # Transaction ID
        for pattern in self._TRANSACTION_PATTERNS:
            if match := re.search(pattern, text, re.I):
                data['transaction_id'] = match.group(1)
                break

        # Payment: whole-word match so that "Cashier" does not count as "cash"
        lowered = text.lower()
        for key in self._PAYMENT_KEYWORDS:
            if re.search(rf'\b{key}\b', lowered):
                data['payment_method'] = key.upper()
                break

        # Location
        if match := self._LOCATION.search(text):
            data['location'] = match.group(0)

        # Money. The total label must be a whole word not preceded by "sub",
        # otherwise "Subtotal" / "Sub total" would be read as the total.
        data['subtotal'] = self._find_amount(r'sub[\s-]?total', text)
        data['tax'] = self._find_amount(r'\b(?:tax|vat)\b', text)
        data['total'] = self._find_amount(r'\b(?<!sub[\s-])(?:total|amount)\b', text)

        # Items
        for line in lines:
            match = self._ITEM_LINE.search(line)
            if not match:
                continue
            name = match.group(1)
            if len(name) <= 3 or self._NON_ITEM_WORDS.search(name):
                continue
            data['items'].append(
                {'name': name[:40], 'quantity': 1, 'price': float(match.group(2))})

        data['item_count'] = len(data['items'])
        return data



In [None]:

# ==========================================
# CELL 5: MATCHING ENGINE
# Compares PDF and image data
# ==========================================

class Matcher:
    """Score how likely two extracted receipts describe the same purchase."""

    # Contribution of each field to the overall score (sums to 1.0).
    WEIGHTS = {'merchant': 0.25, 'date': 0.20, 'total': 0.30, 'transaction_id': 0.25}

    # Totals within this fraction of the larger total still earn partial credit.
    TOTAL_TOLERANCE = 0.05

    # (minimum overall score, status, icon), checked from strongest to weakest.
    THRESHOLDS = (
        (0.85, 'STRONG_MATCH', 'üü¢'),
        (0.70, 'PROBABLE_MATCH', 'üü°'),
        (0.50, 'WEAK_MATCH', 'üü†'),
    )

    def __init__(self):
        """Create the matcher without loading the embedding model.

        ``match`` does not use the sentence embedder, so the model is only
        loaded the first time ``embedder`` is accessed.
        """
        print("üîç Loading matcher...")
        self._embedder = None
        print("‚úÖ Matcher ready!")

    @property
    def embedder(self):
        """SentenceTransformer('all-MiniLM-L6-v2'), created on first access."""
        if getattr(self, '_embedder', None) is None:
            self._embedder = SentenceTransformer('all-MiniLM-L6-v2')
        return self._embedder

    @embedder.setter
    def embedder(self, value):
        self._embedder = value

    def match(self, d1, d2):
        """Compare two extracted-data dicts field by field.

        A field contributes only when it is present (truthy) in both inputs;
        otherwise its score stays 0.

        Args:
            d1, d2: Dicts with optional ``merchant_name``, ``date``, ``total``
                and ``transaction_id`` entries.

        Returns:
            ``(status, scores, icon)`` where ``scores`` holds the per-field
            scores plus the weighted ``overall`` score.
        """
        scores = {'overall': 0, 'merchant': 0, 'date': 0, 'total': 0, 'transaction_id': 0}

        if d1.get('merchant_name') and d2.get('merchant_name'):
            scores['merchant'] = fuzz.ratio(
                d1['merchant_name'].lower(), d2['merchant_name'].lower()) / 100

        if d1.get('date') and d2.get('date'):
            # Differing strings get half credit (they are compared verbatim).
            scores['date'] = 1.0 if d1['date'] == d2['date'] else 0.5

        if d1.get('total') and d2.get('total'):
            diff = abs(d1['total'] - d2['total'])
            tolerance = max(d1['total'], d2['total']) * self.TOTAL_TOLERANCE
            # +0.01 keeps the divisor non-zero for very small totals.
            scores['total'] = max(0, 1 - (diff / (tolerance + 0.01)))

        id_1, id_2 = d1.get('transaction_id'), d2.get('transaction_id')
        if id_1 and id_2:
            # IDs are compared case-insensitively, so the same ID in a
            # different letter case still matches.
            same_id = str(id_1).strip().upper() == str(id_2).strip().upper()
            scores['transaction_id'] = 1.0 if same_id else 0.0

        scores['overall'] = sum(scores[k] * w for k, w in self.WEIGHTS.items())

        status, icon = 'NO_MATCH', 'üî¥'
        for minimum, name, symbol in self.THRESHOLDS:
            if scores['overall'] >= minimum:
                status, icon = name, symbol
                break

        return status, scores, icon



In [None]:

# ==========================================
# CELL 6: RECONCILIATION ENGINE
# Creates final corrected bill
# ==========================================

class Reconciler:
    """Merge PDF and image extractions into one bill, recording disagreements."""

    # Fields copied from the image when the PDF value is empty/zero.
    FILL_FIELDS = ('date', 'time', 'payment_method', 'transaction_id', 'location',
                   'merchant_name', 'subtotal', 'tax', 'total')

    # Totals further apart than this are reported as a conflict.
    TOTAL_EPSILON = 0.01

    # Overall confidence below this flags the result for manual review.
    REVIEW_THRESHOLD = 0.70

    def reconcile(self, pdf_d, img_d, status, sim):
        """Build the reconciled bill, starting from the PDF data.

        Args:
            pdf_d: Data extracted from the PDF (takes precedence).
            img_d: Data extracted from the image (fills gaps).
            status: Match status string, passed through to the report.
            sim: Score dict; ``sim['overall']`` is used as the confidence.

        Returns:
            dict with ``status``, ``confidence``, ``data`` (the merged bill),
            ``conflicts``, ``corrections`` and ``needs_review``.
        """
        # Copy so neither input dict, nor its item list, is shared with the result.
        reconciled = dict(pdf_d)
        reconciled['items'] = list(pdf_d.get('items') or [])
        conflicts, corrections = [], []

        # Check conflicts. Missing/None totals count as 0 so formatting never fails.
        pdf_total = pdf_d.get('total') or 0.0
        img_total = img_d.get('total') or 0.0
        diff = abs(pdf_total - img_total)
        if diff > self.TOTAL_EPSILON:
            conflicts.append({'field': 'Total', 'pdf': f"${pdf_total:.2f}",
                              'image': f"${img_total:.2f}", 'diff': f"${diff:.2f}"})

        # Fill missing
        for field in self.FILL_FIELDS:
            if not reconciled.get(field) and img_d.get(field):
                reconciled[field] = img_d[field]
                corrections.append({'field': field, 'value': img_d[field]})

        # Use better items: prefer the image when it lists more items
        img_items = img_d.get('items') or []
        if len(img_items) > len(reconciled['items']):
            reconciled['items'] = list(img_items)
            reconciled['item_count'] = len(img_items)

        return {
            'status': status, 'confidence': sim['overall'], 'data': reconciled,
            'conflicts': conflicts, 'corrections': corrections,
            'needs_review': len(conflicts) > 0 or sim['overall'] < self.REVIEW_THRESHOLD
        }



In [None]:

# ==========================================
# CELL 7: UI DISPLAY FUNCTIONS
# HTML formatters
# ==========================================

def fmt_data(data, title, color):
    """Render one extracted-data dict as an HTML summary card.

    Text fields are HTML-escaped because they come from OCR / PDF text, and
    empty values are shown as ``N/A``.

    Args:
        data: Extracted fields (see ``DataExtractor.extract``).
        title: Heading shown at the top of the card.
        color: CSS background colour of the card.

    Returns:
        The HTML fragment as a string.
    """
    def field(key):
        # Extracted fields are '' rather than absent, so fall back on any falsy value.
        return html.escape(str(data.get(key) or 'N/A'))

    def money(key):
        return f"{data.get(key) or 0:.2f}"

    item_count = html.escape(str(data.get('item_count') or 0))
    return f"""
    <div style='background:{color};padding:20px;border-radius:10px;margin:10px 0'>
        <h3>{html.escape(str(title))}</h3>
        <p><b>Merchant:</b> {field('merchant_name')}</p>
        <p><b>Date:</b> {field('date')} | <b>Time:</b> {field('time')}</p>
        <p><b>Transaction:</b> {field('transaction_id')} | <b>Payment:</b> {field('payment_method')}</p>
        <p><b>Location:</b> {field('location')}</p>
        <hr><p><b>Subtotal:</b> ${money('subtotal')} | <b>Tax:</b> ${money('tax')}</p>
        <p style='font-size:20px;color:#2e7d32'><b>TOTAL: ${money('total')}</b></p>
        <p><b>Items:</b> {item_count}</p>
    </div>"""

def fmt_items(items):
    """Render the item list as an HTML table.

    Item names and quantities are HTML-escaped because they originate from
    OCR / PDF text.

    Args:
        items: Iterable of dicts with optional ``name``, ``quantity``, ``price``.

    Returns:
        ``"<p>No items</p>"`` for an empty list, otherwise the table markup.
    """
    if not items:
        return "<p>No items</p>"

    parts = ["<table style='width:100%;border-collapse:collapse'><tr style='background:#2196F3;color:white'><th style='border:1px solid #ddd;padding:10px'>Item</th><th style='border:1px solid #ddd;padding:10px'>Qty</th><th style='border:1px solid #ddd;padding:10px'>Price</th></tr>"]
    for item in items:
        name = html.escape(str(item.get('name', 'N/A')))
        quantity = html.escape(str(item.get('quantity', 1)))
        parts.append(
            f"<tr><td style='border:1px solid #ddd;padding:8px'>{name}</td>"
            f"<td style='border:1px solid #ddd;padding:8px;text-align:center'>{quantity}</td>"
            f"<td style='border:1px solid #ddd;padding:8px;text-align:right'>${item.get('price', 0):.2f}</td></tr>"
        )
    parts.append("</table>")
    return "".join(parts)

def fmt_match(status, sim, icon):
    """Build the HTML banner summarising the match verdict.

    Args:
        status: Match status key; selects the banner colour (gray if unknown).
        sim: Score dict with ``overall``, ``merchant``, ``date``, ``total``
            and ``transaction_id`` entries.
        icon: Symbol shown in front of the status text.

    Returns:
        The HTML fragment as a string.
    """
    palette = {'STRONG_MATCH':'#4caf50','PROBABLE_MATCH':'#ff9800','WEAK_MATCH':'#ff5722','NO_MATCH':'#f44336'}
    background = palette.get(status, "gray")
    heading = status.replace('_', ' ')
    confidence = f"{sim['overall']:.1%}"
    breakdown = (
        f"Merchant: {sim['merchant']:.0%} | Date: {sim['date']:.0%} | "
        f"Total: {sim['total']:.0%} | ID: {sim['transaction_id']:.0%}"
    )
    # The pieces are joined with the same newline + four-space indent the banner always had.
    return "\n    ".join([
        f"<div style='background:{background};color:white;padding:25px;border-radius:10px;text-align:center'>",
        f"<h2>{icon} {heading}</h2><h3>Confidence: {confidence}</h3><hr>",
        f"<p>{breakdown}</p></div>",
    ])

def fmt_conflicts(report):
    """Render the conflicts and auto-corrections of a reconciliation report.

    Every interpolated value is HTML-escaped: correction values are copied
    straight from the extracted (OCR / PDF) text.

    Args:
        report: Dict with ``conflicts`` (list of dicts with ``field``, ``pdf``,
            ``image``, ``diff``) and ``corrections`` (list of dicts with
            ``field``, ``value``).

    Returns:
        The HTML fragment as a string.
    """
    def esc(value):
        return html.escape(str(value))

    parts = []
    if report['conflicts']:
        parts.append("<div style='background:#ffebee;border-left:5px solid #f44336;padding:20px;border-radius:5px'><h3>‚ö†Ô∏è Conflicts</h3>")
        for c in report['conflicts']:
            parts.append(f"<p><b>{esc(c['field'])}:</b> PDF={esc(c['pdf'])} vs Image={esc(c['image'])} (Diff: {esc(c['diff'])})</p>")
        parts.append("</div>")
    else:
        parts.append("<div style='background:#e8f5e9;padding:20px'><h3>‚úÖ No Conflicts</h3></div>")

    if report['corrections']:
        parts.append("<div style='background:#fff3e0;padding:20px;margin-top:10px'><h3>‚úèÔ∏è Auto-Corrections</h3>")
        for c in report['corrections']:
            parts.append(f"<p><b>{esc(c['field'])}:</b> {esc(c['value'])}</p>")
        parts.append("</div>")
    return "".join(parts)

def fmt_final(data, needs_review):
    """Render the reconciled bill as an HTML card.

    Text fields are HTML-escaped because they come from OCR / PDF text, and
    empty values are shown as ``N/A``. The card is red-tinted when the result
    needs review and green otherwise.

    Args:
        data: The reconciled bill fields.
        needs_review: Whether the reconciliation was flagged for manual review.

    Returns:
        The HTML fragment as a string.
    """
    def field(key):
        # Extracted fields are '' rather than absent, so fall back on any falsy value.
        return html.escape(str(data.get(key) or 'N/A'))

    def money(key):
        return f"{data.get(key) or 0:.2f}"

    color = '#ffebee' if needs_review else '#e8f5e9'
    border = '#f44336' if needs_review else '#4caf50'
    review_label = 'YES ‚ö†Ô∏è' if needs_review else 'NO ‚úÖ'
    item_count = html.escape(str(data.get('item_count') or 0))
    items_table = fmt_items(data.get('items', []))
    return f"""<div style='background:{color};padding:25px;border:3px solid {border};border-radius:10px'>
    <h2>‚úÖ FINAL RECONCILED BILL</h2><hr>
    <p><b>Merchant:</b> {field('merchant_name')} | <b>Location:</b> {field('location')}</p>
    <p><b>Date:</b> {field('date')} {field('time')}</p>
    <p><b>Transaction:</b> {field('transaction_id')} | <b>Payment:</b> {field('payment_method')}</p>
    <hr><h3>Items ({item_count})</h3>{items_table}
    <hr><p><b>Subtotal:</b> ${money('subtotal')} | <b>Tax:</b> ${money('tax')}</p>
    <p style='font-size:24px;color:#2e7d32'><b>TOTAL: ${money('total')}</b></p>
    <p><b>Needs Review:</b> {review_label}</p></div>"""



In [None]:

# ==========================================
# CELL 8: MAIN PROCESSING FUNCTION
# Processes uploaded files
# ==========================================

def process_files(pdf_file, img_file):
    """Run the full pipeline on one uploaded PDF and one uploaded image.

    Steps: text extraction, field extraction, matching, reconciliation, HTML
    formatting. Uses the notebook-level ``ocr``, ``extractor``, ``matcher``
    and ``reconciler`` objects.

    Args:
        pdf_file: Uploaded PDF, either a path string or an object whose
            ``name`` attribute is the path.
        img_file: Uploaded image, in the same form.

    Returns:
        A 6-tuple ``(status message, PDF card, image card, match banner,
        conflicts, final bill)``. On missing input or on any error only the
        status message is filled and the other five entries are ``""``.
    """
    if not pdf_file or not img_file:
        return "‚ö†Ô∏è Upload both files", "", "", "", "", ""

    try:
        # Depending on the Gradio version/configuration an upload arrives as a
        # plain path string or as a file wrapper exposing `.name`; accept both.
        pdf_path = getattr(pdf_file, 'name', pdf_file)
        img_path = getattr(img_file, 'name', img_file)

        # Extract
        pdf_text = ocr.extract_from_pdf(pdf_path)
        img_text = ocr.extract_from_image(img_path)

        # Structure
        pdf_data = extractor.extract(pdf_text)
        img_data = extractor.extract(img_text)

        # Match
        status, sim, icon = matcher.match(pdf_data, img_data)

        # Reconcile
        report = reconciler.reconcile(pdf_data, img_data, status, sim)

        # Format outputs
        return (
            "‚úÖ Processing Complete!",
            fmt_data(pdf_data, "üìÑ PDF Bill", "#e3f2fd"),
            fmt_data(img_data, "üì∏ Image Receipt", "#f3e5f5"),
            fmt_match(status, sim, icon),
            fmt_conflicts(report),
            fmt_final(report['data'], report['needs_review'])
        )
    except Exception as e:
        # Surface the failure in the UI status line instead of crashing the app.
        return f"‚ùå Error: {e}", "", "", "", "", ""



In [None]:

# ==========================================
# CELL 9: INITIALIZE ALL ENGINES
# Run this before creating UI
# ==========================================

print("Initializing system...")
# Reuse the OCR engine created by the OCR cell instead of loading the models a
# second time. It is rebuilt only when it is missing or was created from an
# older definition of the OCREngine class.
if not isinstance(globals().get('ocr'), OCREngine):
    ocr = OCREngine()
extractor = DataExtractor()
matcher = Matcher()
reconciler = Reconciler()
print("‚úÖ All engines ready!")


In [None]:


# ==========================================
# CELL 10: CREATE AND LAUNCH UI
# Final cell - creates the interface
# ==========================================

# Layout: two upload widgets, a trigger button, then one output area per
# pipeline stage. Component creation order determines the on-screen order.
with gr.Blocks(title="Receipt Reconciliation", theme=gr.themes.Soft()) as app:
    gr.Markdown("# üßæ Receipt Reconciliation System\n### Upload PDF bill + receipt image for automatic reconciliation")

    with gr.Row():
        pdf_in = gr.File(label="üìÑ PDF Bill (Source of Truth)", file_types=[".pdf"])
        img_in = gr.File(label="üì∏ Receipt Image", file_types=[".jpg",".jpeg",".png"])

    btn = gr.Button("üöÄ Process & Reconcile", variant="primary", size="lg")
    status = gr.Markdown()

    gr.Markdown("## üìä Extracted Data")
    with gr.Row():
        pdf_out = gr.HTML()
        img_out = gr.HTML()

    gr.Markdown("## üîç Match Analysis")
    match_out = gr.HTML()

    gr.Markdown("## ‚öñÔ∏è Reconciliation")
    conflict_out = gr.HTML()

    gr.Markdown("## ‚úÖ Final Bill")
    final_out = gr.HTML()

    # The six outputs line up with the 6-tuple returned by process_files.
    btn.click(process_files, [pdf_in, img_in], [status, pdf_out, img_out, match_out, conflict_out, final_out])

    gr.Markdown("### Instructions:\n1. Upload PDF bill\n2. Upload receipt image\n3. Click Process\n4. Review results")

# Announce before launching: with debug=True the launch call keeps the cell
# busy, so anything printed after it would only appear once the server stops.
# NOTE(review): share=True requests a public link; uploaded receipts may
# contain personal data, so confirm that this exposure is intended.
print("üöÄ Launching UI... the link will appear below.")
app.launch(share=True, debug=True)