In [6]:
# CELL 1: INSTALL DEPENDENCIES
# Installs the system binaries (Tesseract, Poppler) and the Python packages
# used by the later cells. Each `!` line runs as a shell command.
#
# NOTE(review): the recorded output of this cell shows pip failing with
# "No matching distribution found for paddlepaddle==2.4.2" (the versions it
# offered start at 2.6.2). paddleocr itself is installed with --no-deps, so
# PaddleOCR is presumably unusable in that run -- confirm a compatible pin.
# NOTE(review): opencv-python and pillow are listed on two separate pip lines.

!apt-get install -y tesseract-ocr poppler-utils
!pip install -q pytesseract pdf2image pillow opencv-python
!pip install -q scikit-learn fuzzywuzzy python-Levenshtein gradio
!pip install -q paddlepaddle==2.4.2
!pip install -q paddleocr==2.6.1.3 --no-deps
!pip install -q shapely pyclipper lmdb tqdm numpy opencv-python pillow
!pip install -q easyocr sentence-transformers pymupdf

print("Installation complete")


Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
tesseract-ocr is already the newest version (4.1.1-2.1build1).
poppler-utils is already the newest version (22.02.0-2ubuntu0.12).
0 upgraded, 0 newly installed, 0 to remove and 41 not upgraded.
[31mERROR: Could not find a version that satisfies the requirement paddlepaddle==2.4.2 (from versions: 2.6.2, 3.0.0b1, 3.0.0b2, 3.0.0rc0, 3.0.0rc1, 3.0.0, 3.1.0, 3.1.1, 3.2.0, 3.2.1, 3.2.2, 3.3.0)[0m[31m
[0m[31mERROR: No matching distribution found for paddlepaddle==2.4.2[0m[31m
[0mGemini extraction error: HTTPConnectionPool(host='localhost', port=40199): Read timed out. (read timeout=600.0)
Using OCR for PDF...
Using Gemini Vision...
Installation complete


In [7]:

# CELL 2: IMPORT LIBRARIES

import os, re, json, warnings
import cv2, numpy as np
from PIL import Image
import pytesseract, fitz
from pdf2image import convert_from_path
import easyocr, gradio as gr
from sentence_transformers import SentenceTransformer
from fuzzywuzzy import fuzz
warnings.filterwarnings('ignore')

# PaddleOCR is an optional engine: when it (or its paddle backend) cannot be
# imported, the notebook continues with the remaining OCR engines.
try:
    from paddleocr import PaddleOCR
    PADDLEOCR_AVAILABLE = True
except Exception as exc:
    # `Exception` rather than a bare `except:` so Ctrl-C / SystemExit are not
    # swallowed; broader than ImportError because a broken backend can fail
    # with other error types at import time.
    PADDLEOCR_AVAILABLE = False
    print(f"PaddleOCR unavailable ({type(exc).__name__}: {exc}); continuing without it")

print("Libraries imported")



Libraries imported


In [8]:


# CELL 3: OCR ENGINE

class OCREngine:
    """Best-effort text extraction using several OCR engines.

    PaddleOCR (when importable), EasyOCR and Tesseract are each tried on an
    image; the longest text produced by any engine is returned. PDFs are read
    through their embedded text layer first and only rasterised and OCR'd when
    that layer is (nearly) empty.
    """

    # Embedded PDF text shorter than this (after stripping) is treated as
    # "no usable text layer" and triggers the OCR fallback.
    MIN_PDF_TEXT_CHARS = 50

    def __init__(self):
        """Create the OCR engine handles (PaddleOCR is optional)."""
        self.paddle_ocr = None
        if PADDLEOCR_AVAILABLE:
            try:
                self.paddle_ocr = PaddleOCR(use_angle_cls=True, lang='en', use_gpu=False, show_log=False)
            except Exception as exc:
                self._report_failure("PaddleOCR initialisation", exc)

        self.easy_reader = easyocr.Reader(['en'], gpu=False, verbose=False)

    @staticmethod
    def _report_failure(stage, exc):
        """Log a non-fatal engine failure instead of silently discarding it."""
        print(f"[OCR] {stage} failed: {type(exc).__name__}: {exc}")

    def preprocess_image(self, image_path):
        """Return a denoised, contrast-enhanced, Otsu-binarised copy of the image.

        Args:
            image_path: Path of the image file to load.

        Returns:
            The binarised single-channel image produced by ``cv2.threshold``.

        Raises:
            ValueError: If OpenCV cannot read the file (``cv2.imread`` returned None).
        """
        img = cv2.imread(image_path)
        if img is None:
            raise ValueError(f"Could not read image: {image_path}")
        gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
        denoised = cv2.fastNlMeansDenoising(gray)
        clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
        enhanced = clahe.apply(denoised)
        _, binary = cv2.threshold(enhanced, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
        return binary

    def extract_from_image(self, image_path):
        """Run every available engine on the image and return the longest text.

        Each engine is best-effort: a failure is reported and the next engine
        is tried. The preprocessed (binarised) image is offered to Tesseract as
        an additional candidate; previously it was computed and thrown away,
        and a failure while computing it aborted the whole extraction.

        Args:
            image_path: Path of the image file.

        Returns:
            The longest candidate text, or "" when no engine produced any.
        """
        candidates = []

        if self.paddle_ocr:
            try:
                result = self.paddle_ocr.ocr(image_path, cls=True)
                if result and result[0]:
                    candidates.append('\n'.join(line[1][0] for line in result[0]))
            except Exception as exc:
                self._report_failure("PaddleOCR", exc)

        try:
            result = self.easy_reader.readtext(image_path)
            candidates.append('\n'.join(entry[1] for entry in result))
        except Exception as exc:
            self._report_failure("EasyOCR", exc)

        try:
            with Image.open(image_path) as pil_image:
                candidates.append(pytesseract.image_to_string(pil_image))
        except Exception as exc:
            self._report_failure("Tesseract", exc)

        try:
            binary = self.preprocess_image(image_path)
            candidates.append(pytesseract.image_to_string(Image.fromarray(binary)))
        except Exception as exc:
            self._report_failure("Tesseract (preprocessed image)", exc)

        return max(candidates, key=len) if candidates else ""

    def extract_from_pdf(self, pdf_path):
        """Return the text of a PDF, falling back to OCR for scanned documents.

        Args:
            pdf_path: Path of the PDF file.

        Returns:
            The embedded text of all pages; when that is shorter than
            ``MIN_PDF_TEXT_CHARS`` the OCR text of every rendered page is
            appended (one page per line group).
        """
        import tempfile  # local import: only needed for the OCR fallback

        # Context manager closes the document even if a page fails to parse.
        with fitz.open(pdf_path) as doc:
            text = ''.join(page.get_text() for page in doc)

        if len(text.strip()) < self.MIN_PDF_TEXT_CHARS:
            # A private temp directory avoids name clashes between concurrent
            # requests and is removed even when OCR raises.
            with tempfile.TemporaryDirectory() as tmp_dir:
                for i, img in enumerate(convert_from_path(pdf_path)):
                    page_path = os.path.join(tmp_dir, f'page_{i}.jpg')
                    img.save(page_path, 'JPEG')
                    text += self.extract_from_image(page_path) + '\n'

        return text



In [9]:
# CELL 4: DATA EXTRACTOR

class DataExtractor:
    """Heuristic, regex-based extraction of receipt fields from OCR text."""

    # Monetary amount as used by the subtotal / tax / total patterns.
    _AMOUNT = r'(\d+[,\.]?\d*\.?\d{2})'

    # One combined pattern so matches come back in text order. The digit
    # look-arounds stop the d/m/y form from matching inside an ISO date
    # ("2024-01-15" used to also yield "24-01-15"). No capturing groups, so the
    # whole date is returned (the old month group made findall return "Jan").
    _DATE_RE = re.compile(
        r'(?<!\d)(?:\d{4}[/-]\d{1,2}[/-]\d{1,2}|\d{1,2}[/-]\d{1,2}[/-]\d{2,4})(?!\d)'
        r'|\b(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]* \d{1,2},? \d{4}',
        re.I,
    )
    # \b after AM/PM keeps "12:30 American" from being read as "12:30 Am".
    _TIME_RE = re.compile(r'\d{1,2}:\d{2}(?::\d{2})?(?:\s?[AP]M\b)?', re.I)
    _TRANSACTION_RES = (
        re.compile(r'(transaction|trans|txn|receipt|order)[\s#:]+([A-Z0-9-]{6,})', re.I),
        re.compile(r'#\s*([A-Z0-9-]{6,})', re.I),
    )
    # Checked in this order; the first keyword present wins.
    _PAYMENT_KEYS = ('visa', 'mastercard', 'amex', 'cash', 'credit', 'debit')
    # Single-line match with a trailing word boundary, so "2 Burgers Strawberry"
    # is not read as an address ending in "St".
    _LOCATION_RE = re.compile(r'\d+[ \t]+[A-Za-z \t]+(?:Street|St|Avenue|Ave|Road|Rd)\b', re.I)
    _SUBTOTAL_RE = re.compile(r'(?:subtotal|sub[\s-]?total)[\s:$]*' + _AMOUNT, re.I)
    _TAX_RE = re.compile(r'(?:tax|vat)[\s:$]*' + _AMOUNT, re.I)
    # The look-behinds keep "Subtotal" / "Sub-total" / "Sub total" from being
    # taken as the grand total (re.search returns the first match, and the
    # subtotal line normally comes first).
    _TOTAL_RE = re.compile(r'(?<!sub)(?<!sub[\s-])(?:total|amount)[\s:$]*' + _AMOUNT, re.I)
    _ITEM_COUNT_RES = (
        re.compile(r'(\d+)\s*item[s]?', re.I),
        re.compile(r'item[s]?\s*[:\-]?\s*(\d+)', re.I),
        re.compile(r'qty[:\s]*(\d+)', re.I),
        re.compile(r'quantity[:\s]*(\d+)', re.I),
    )
    # Lines containing any of these (case-insensitive substring) are not items.
    _EXCLUDE_KEYWORDS = ('transaction', 'trans', 'txn', 'subtotal', 'sub-total',
                         'tax', 'vat', 'total', 'payment', 'balance', 'amount due',
                         'invoice', 'receipt', 'bill', 'date', 'time', 'thank you')
    # "<name> <price>" with an optional "$" and exactly two decimals.
    _ITEM_RE = re.compile(r'^(.+?)\s+\$?(\d+\.\d{2})$')
    _QUANTITY_RES = (
        re.compile(r'^(?P<qty>\d+)\s*x\s+(?P<name>.+)$', re.I),   # "2x Coffee", "2 x Coffee"
        re.compile(r'^(?P<name>.+?)\s+(?P<qty>\d+)\s*x$', re.I),  # "Coffee 2x"
        re.compile(r'^(?P<name>.+?)\s+x\s*(?P<qty>\d+)$', re.I),  # "Coffee x2"
    )

    @staticmethod
    def _to_amount(raw):
        """Convert a matched amount string to float, dropping thousands commas."""
        return float(raw.replace(',', ''))

    @classmethod
    def _split_quantity(cls, name):
        """Split an "N x" quantity marker off an item name; quantity defaults to 1."""
        for pattern in cls._QUANTITY_RES:
            found = pattern.match(name)
            if found and int(found.group('qty')) > 0:
                return found.group('name').strip(), int(found.group('qty'))
        return name, 1

    def extract(self, text):
        """Extract receipt fields from raw text.

        Args:
            text: OCR / PDF text of one document (None is treated as "").

        Returns:
            dict with keys merchant_name, location, date, time, transaction_id,
            payment_method (str, '' when not found), items (list of dicts with
            name / quantity / price), item_count (int) and subtotal / tax /
            total (float, 0.0 when not found).
        """
        data = {
            'merchant_name': '', 'location': '', 'date': '', 'time': '',
            'transaction_id': '', 'payment_method': '', 'items': [],
            'item_count': 0, 'subtotal': 0.0, 'tax': 0.0, 'total': 0.0
        }

        text = text or ''
        lines = [ln.strip() for ln in text.split('\n') if ln.strip()]
        if lines:
            # First non-empty line is taken as the merchant name (max 50 chars).
            data['merchant_name'] = lines[0][:50]

        # Use the last date / time that appears in the text (transaction
        # stamp rather than a header "generated on" stamp).
        dates = [m.group(0) for m in self._DATE_RE.finditer(text)]
        if dates:
            data['date'] = dates[-1]

        times = [m.group(0) for m in self._TIME_RE.finditer(text)]
        if times:
            data['time'] = times[-1].strip()

        for pattern in self._TRANSACTION_RES:
            found = pattern.search(text)
            if found:
                # The ID is always the last group of either pattern.
                data['transaction_id'] = found.group(found.lastindex)
                break

        lowered = text.lower()
        for key in self._PAYMENT_KEYS:
            # Whole-word match so "Cashier" does not register as CASH.
            if re.search(rf'\b{key}\b', lowered):
                data['payment_method'] = key.upper()
                break

        found = self._LOCATION_RE.search(text)
        if found:
            data['location'] = found.group(0)

        for field, pattern in (('subtotal', self._SUBTOTAL_RE),
                               ('tax', self._TAX_RE),
                               ('total', self._TOTAL_RE)):
            found = pattern.search(text)
            if found:
                data[field] = self._to_amount(found.group(1))

        # Explicit item count printed on the receipt, if any.
        for pattern in self._ITEM_COUNT_RES:
            found = pattern.search(text)
            if found:
                data['item_count'] = int(found.group(1))
                break

        # Line items: "<name> <price>" lines that are not summary/meta lines.
        # NOTE(review): the parsed amount is stored as the unit price; confirm
        # whether receipts in scope print unit price or line total.
        for line in lines:
            lowered_line = line.lower()
            if any(keyword in lowered_line for keyword in self._EXCLUDE_KEYWORDS):
                continue

            found = self._ITEM_RE.search(line)
            if not found:
                continue

            name, quantity = self._split_quantity(found.group(1).strip())
            data['items'].append({
                'name': name,
                'quantity': quantity,
                'price': float(found.group(2)),
            })

        # No printed count: fall back to the number of parsed line items.
        if not data['item_count']:
            data['item_count'] = len(data['items'])

        return data



In [10]:


# CELL 5: MATCHING ENGINE

class Matcher:
    """Scores how likely two extracted documents describe the same purchase."""

    def __init__(self):
        """Load the sentence-embedding model and keep it on the instance."""
        self.embedder = SentenceTransformer('all-MiniLM-L6-v2')

    def match(self, d1, d2):
        """Compare two extracted-field dicts.

        A field only contributes when it is truthy in both dicts; otherwise its
        score stays at 0.

        Args:
            d1: Extracted fields of the first document.
            d2: Extracted fields of the second document.

        Returns:
            (status, scores): status is one of STRONG_MATCH / PROBABLE_MATCH /
            WEAK_MATCH / NO_MATCH; scores holds the per-field scores plus the
            weighted 'overall' score.
        """
        scores = {'overall': 0, 'merchant': 0, 'date': 0, 'total': 0, 'transaction_id': 0}

        # Merchant: fuzzy string ratio on lower-cased names, scaled to 0..1.
        name_a = d1.get('merchant_name')
        name_b = d2.get('merchant_name') if name_a else None
        if name_a and name_b:
            scores['merchant'] = fuzz.ratio(name_a.lower(), name_b.lower()) / 100

        # Date: identical strings score 1.0, any other pair of dates 0.5.
        date_a = d1.get('date')
        date_b = d2.get('date') if date_a else None
        if date_a and date_b:
            if date_a == date_b:
                scores['date'] = 1.0
            else:
                scores['date'] = 0.5

        # Total: linear fall-off within 5% of the larger amount (+0.01 guard).
        total_a = d1.get('total')
        total_b = d2.get('total') if total_a else None
        if total_a and total_b:
            gap = abs(total_a - total_b)
            allowed = max(total_a, total_b) * 0.05
            scores['total'] = max(0, 1 - (gap / (allowed + 0.01)))

        # Transaction ID: exact equality only.
        txn_a = d1.get('transaction_id')
        txn_b = d2.get('transaction_id') if txn_a else None
        if txn_a and txn_b:
            if txn_a == txn_b:
                scores['transaction_id'] = 1.0
            else:
                scores['transaction_id'] = 0

        weights = {'merchant': 0.25, 'date': 0.20, 'total': 0.30, 'transaction_id': 0.25}
        weighted = 0
        for field, weight in weights.items():
            weighted += scores[field] * weight
        scores['overall'] = weighted

        # Highest threshold reached decides the label.
        status = 'NO_MATCH'
        for floor, label in ((0.85, 'STRONG_MATCH'), (0.70, 'PROBABLE_MATCH'), (0.50, 'WEAK_MATCH')):
            if scores['overall'] >= floor:
                status = label
                break

        return status, scores


In [11]:


# CELL 6: RECONCILIATION ENGINE

class Reconciler:
    """Merges PDF and image extractions into one record and flags disagreements."""

    # Text fields copied from the image when the PDF extraction left them empty.
    FILLABLE_FIELDS = ('date', 'time', 'payment_method', 'transaction_id', 'location')
    # Totals further apart than this are reported as a conflict.
    TOTAL_TOLERANCE = 0.01
    # Overall match confidence below this always requires manual review.
    REVIEW_THRESHOLD = 0.70

    def reconcile(self, pdf_d, img_d, status, sim):
        """Build the reconciliation report.

        The PDF data is the base record. Empty text fields are filled from the
        image, and the image's item list replaces the PDF's when it is longer.

        Args:
            pdf_d: Fields extracted from the PDF.
            img_d: Fields extracted from the image.
            status: Match status label, passed through to the report.
            sim: Score dict; ``sim['overall']`` is used as the confidence.

        Returns:
            dict with status, confidence, data (reconciled record), conflicts,
            corrections and needs_review.
        """
        reconciled = pdf_d.copy()
        conflicts, corrections = [], []

        # Read each total once with a default. Previously the check used
        # .get(..., 0) but the report indexed ['total'] directly, raising
        # KeyError when only one side had the key.
        pdf_total = pdf_d.get('total') or 0
        img_total = img_d.get('total') or 0
        diff = abs(pdf_total - img_total)
        if diff > self.TOTAL_TOLERANCE:
            conflicts.append({
                'field': 'Total Amount',
                'pdf': f"${pdf_total:.2f}",
                'image': f"${img_total:.2f}",
                'diff': f"${diff:.2f}"
            })

        for field in self.FILLABLE_FIELDS:
            if not reconciled.get(field) and img_d.get(field):
                reconciled[field] = img_d[field]
                corrections.append({'field': field, 'value': img_d[field]})

        img_items = img_d.get('items') or []
        pdf_items = pdf_d.get('items') or []
        if len(img_items) > len(pdf_items):
            # Copy so later edits to the report do not mutate the image record.
            reconciled['items'] = list(img_items)
            reconciled['item_count'] = len(img_items)

        return {
            'status': status,
            'confidence': sim['overall'],
            'data': reconciled,
            'conflicts': conflicts,
            'corrections': corrections,
            'needs_review': len(conflicts) > 0 or sim['overall'] < self.REVIEW_THRESHOLD
        }



In [12]:
# CELL 7: DISPLAY FORMATTERS

def fmt_data(data, title):
    """Render one document's extracted fields as an HTML summary card.

    Text values come from OCR of user-uploaded files, so they are HTML-escaped
    before being placed in the markup. Empty or missing text fields are shown
    as "N/A"; missing amounts are shown as $0.00.

    Args:
        data: Extracted-field dict (merchant_name, date, time, transaction_id,
            payment_method, location, subtotal, tax, total, item_count).
        title: Heading shown at the top of the card.

    Returns:
        HTML string.
    """
    from html import escape  # function-scope import keeps this cell self-contained

    def text(key):
        # `or` (not a .get default): the extractor stores '' for missing fields.
        return escape(str(data.get(key) or 'N/A'))

    def money(key):
        return f"${data.get(key) or 0:.2f}"

    heading = escape(str(title))
    item_count = escape(str(data.get('item_count') or 0))

    return f"""
    <div style='background:#ffffff;padding:20px;border:1px solid #dee2e6;border-radius:4px;margin:10px 0'>
        <h4 style='color:#000000;margin:0 0 15px 0;font-weight:600'>{heading}</h4>
        <table style='width:100%;border-collapse:collapse'>
            <tr><td style='padding:8px 0;color:#000000;font-weight:500'>Merchant:</td>
                <td style='padding:8px 0;color:#000000'>{text('merchant_name')}</td></tr>
            <tr><td style='padding:8px 0;color:#000000;font-weight:500'>Date:</td>
                <td style='padding:8px 0;color:#000000'>{text('date')}</td></tr>
            <tr><td style='padding:8px 0;color:#000000;font-weight:500'>Time:</td>
                <td style='padding:8px 0;color:#000000'>{text('time')}</td></tr>
            <tr><td style='padding:8px 0;color:#000000;font-weight:500'>Transaction ID:</td>
                <td style='padding:8px 0;color:#000000'>{text('transaction_id')}</td></tr>
            <tr><td style='padding:8px 0;color:#000000;font-weight:500'>Payment Method:</td>
                <td style='padding:8px 0;color:#000000'>{text('payment_method')}</td></tr>
            <tr><td style='padding:8px 0;color:#000000;font-weight:500'>Location:</td>
                <td style='padding:8px 0;color:#000000'>{text('location')}</td></tr>
            <tr style='border-top:2px solid #dee2e6'><td style='padding:12px 0 8px 0;color:#000000;font-weight:500'>Subtotal:</td>
                <td style='padding:12px 0 8px 0;color:#000000'>{money('subtotal')}</td></tr>
            <tr><td style='padding:8px 0;color:#000000;font-weight:500'>Tax:</td>
                <td style='padding:8px 0;color:#000000'>{money('tax')}</td></tr>
            <tr><td style='padding:8px 0;color:#000000;font-weight:700;font-size:16px'>Total:</td>
                <td style='padding:8px 0;color:#0d6efd;font-weight:700;font-size:16px'>{money('total')}</td></tr>
            <tr><td style='padding:8px 0;color:#000000;font-weight:500'>Items:</td>
                <td style='padding:8px 0;color:#000000'>{item_count}</td></tr>
        </table>
    </div>"""

def fmt_items(items):
    """Render line items as an HTML table with zebra-striped rows.

    Item names come from OCR of user-uploaded files and are HTML-escaped.

    Args:
        items: Iterable of dicts with optional 'name', 'quantity' (default 1)
            and 'price' (default 0) keys; may be empty or None.

    Returns:
        HTML table string, or a "No items found" paragraph when there are none.
    """
    from html import escape  # function-scope import keeps this cell self-contained

    if not items:
        return "<p style='color:#000000'>No items found</p>"

    parts = [
        "<table style='width:100%;border-collapse:collapse;margin:15px 0'>",
        "<thead><tr style='background:#f8f9fa;border-bottom:2px solid #dee2e6'>",
        "<th style='padding:12px;text-align:left;color:#000000;font-weight:600'>Item Name</th>",
        "<th style='padding:12px;text-align:center;color:#000000;font-weight:600'>Quantity</th>",
        "<th style='padding:12px;text-align:right;color:#000000;font-weight:600'>Price</th>",
        "<th style='padding:12px;text-align:right;color:#000000;font-weight:600'>Total</th></tr></thead><tbody>",
    ]

    for i, item in enumerate(items):
        bg = '#ffffff' if i % 2 == 0 else '#f8f9fa'
        name = escape(str(item.get('name') or 'N/A'))
        quantity = item.get('quantity', 1)
        price = item.get('price', 0)
        total = quantity * price
        parts.append(f"<tr style='background:{bg};border-bottom:1px solid #dee2e6'>")
        parts.append(f"<td style='padding:10px;color:#000000'>{name}</td>")
        parts.append(f"<td style='padding:10px;text-align:center;color:#000000'>{escape(str(quantity))}</td>")
        parts.append(f"<td style='padding:10px;text-align:right;color:#000000'>${price:.2f}</td>")
        parts.append(f"<td style='padding:10px;text-align:right;color:#000000;font-weight:500'>${total:.2f}</td></tr>")

    parts.append("</tbody></table>")
    return ''.join(parts)

def fmt_match(status, sim):
    """Render the match verdict and per-field scores as a coloured HTML banner.

    Args:
        status: Match status label; underscores are shown as spaces. Known
            labels pick the banner colour, anything else gets grey.
        sim: Score dict with 'overall', 'merchant', 'date', 'total' and
            'transaction_id' entries (fractions, rendered as percentages).

    Returns:
        HTML string.
    """
    palette = {
        'STRONG_MATCH': '#198754',
        'PROBABLE_MATCH': '#ffc107',
        'WEAK_MATCH': '#fd7e14',
        'NO_MATCH': '#dc3545'
    }

    # Pre-format every dynamic piece so the template below is pure layout.
    banner_color = palette.get(status, "#6c757d")
    heading = status.replace('_', ' ')
    confidence = format(sim['overall'], '.1%')
    merchant_pct = format(sim['merchant'], '.0%')
    date_pct = format(sim['date'], '.0%')
    total_pct = format(sim['total'], '.0%')
    txn_pct = format(sim['transaction_id'], '.0%')

    return f"""
    <div style='background:{banner_color};color:#ffffff;padding:25px;border-radius:4px;margin:10px 0'>
        <h3 style='margin:0 0 5px 0;font-weight:600'>{heading}</h3>
        <p style='margin:0 0 15px 0;font-size:18px'>Confidence: {confidence}</p>
        <div style='border-top:1px solid rgba(255,255,255,0.3);padding-top:15px;margin-top:15px'>
            <table style='width:100%;color:#ffffff'>
                <tr>
                    <td style='padding:5px'>Merchant:</td><td style='padding:5px;text-align:right;font-weight:600'>{merchant_pct}</td>
                    <td style='padding:5px;padding-left:20px'>Date:</td><td style='padding:5px;text-align:right;font-weight:600'>{date_pct}</td>
                </tr>
                <tr>
                    <td style='padding:5px'>Total:</td><td style='padding:5px;text-align:right;font-weight:600'>{total_pct}</td>
                    <td style='padding:5px;padding-left:20px'>Transaction:</td><td style='padding:5px;text-align:right;font-weight:600'>{txn_pct}</td>
                </tr>
            </table>
        </div>
    </div>"""

def fmt_conflicts(report):
    """Render the conflicts and auto-corrections of a reconciliation report.

    Shows a warning panel listing each conflict, or a "No Conflicts" panel when
    there are none, followed by an "Auto-Corrections Applied" panel when any
    correction was made. Values originate from OCR of user-uploaded files, so
    every interpolated value is HTML-escaped.

    Args:
        report: dict with 'conflicts' (list of dicts with field / pdf / image /
            diff) and 'corrections' (list of dicts with field / value).

    Returns:
        HTML string.
    """
    from html import escape  # function-scope import keeps this cell self-contained

    def esc(value):
        return escape(str(value))

    parts = []

    if report['conflicts']:
        parts.append("<div style='background:#fff3cd;border:1px solid #ffc107;border-radius:4px;padding:20px;margin:10px 0'>")
        parts.append("<h4 style='color:#000000;margin:0 0 15px 0;font-weight:600'>Conflicts Detected</h4>")
        for c in report['conflicts']:
            parts.append("<div style='background:#ffffff;padding:12px;margin:8px 0;border-left:3px solid #ffc107'>")
            parts.append(f"<p style='margin:0;font-weight:600;color:#000000'>{esc(c['field'])}</p>")
            parts.append(
                f"<p style='margin:5px 0 0 0;color:#000000'>PDF: {esc(c['pdf'])} | "
                f"Image: {esc(c['image'])} | Difference: {esc(c['diff'])}</p></div>"
            )
        parts.append("</div>")
    else:
        parts.append("<div style='background:#d1e7dd;border:1px solid #198754;border-radius:4px;padding:20px;margin:10px 0'>")
        parts.append("<h4 style='color:#000000;margin:0;font-weight:600'>No Conflicts</h4>")
        parts.append("<p style='color:#000000;margin:5px 0 0 0'>All fields match between documents</p></div>")

    if report['corrections']:
        parts.append("<div style='background:#cfe2ff;border:1px solid #0d6efd;border-radius:4px;padding:20px;margin:10px 0'>")
        parts.append("<h4 style='color:#000000;margin:0 0 15px 0;font-weight:600'>Auto-Corrections Applied</h4>")
        for c in report['corrections']:
            parts.append(f"<p style='margin:5px 0;color:#000000'><strong>{esc(c['field'])}:</strong> {esc(c['value'])}</p>")
        parts.append("</div>")

    return ''.join(parts)

def fmt_final(data, needs_review):
    """Render the final reconciled bill as an HTML card.

    The border is red when the record needs review and green otherwise. Text
    values originate from OCR of user-uploaded files, so they are HTML-escaped;
    empty text fields are shown as "N/A" and missing amounts as $0.00.

    Args:
        data: Reconciled field dict (same keys the extractor produces).
        needs_review: Whether the record was flagged for manual review.

    Returns:
        HTML string (the item table is produced by ``fmt_items``).
    """
    from html import escape  # function-scope import keeps this cell self-contained

    def text(key):
        # `or` (not a .get default): the extractor stores '' for missing fields.
        return escape(str(data.get(key) or 'N/A'))

    def money(key):
        return f"${data.get(key) or 0:.2f}"

    border = '#dc3545' if needs_review else '#198754'
    # Date and time share one line; show whichever parts exist, else N/A.
    stamp_parts = [str(data.get(key)) for key in ('date', 'time') if data.get(key)]
    stamp = escape(' '.join(stamp_parts)) if stamp_parts else 'N/A'
    item_count = escape(str(data.get('item_count') or 0))
    label = "<strong style='color:#000000'>"

    parts = [
        f"<div style='background:#ffffff;padding:25px;border:2px solid {border};border-radius:4px;margin:10px 0'>",
        "<h3 style='color:#000000;margin:0 0 20px 0;font-weight:600'>Final Reconciled Bill</h3>",
        "<div style='background:#ffffff;padding:15px;border:1px solid #dee2e6;border-radius:4px;margin-bottom:20px'>",
        f"<p style='margin:5px 0;color:#000000'>{label}Merchant:</strong> {text('merchant_name')}</p>",
        f"<p style='margin:5px 0;color:#000000'>{label}Location:</strong> {text('location')}</p>",
        f"<p style='margin:5px 0;color:#000000'>{label}Date:</strong> {stamp}</p>",
        f"<p style='margin:5px 0;color:#000000'>{label}Transaction:</strong> {text('transaction_id')}</p>",
        f"<p style='margin:5px 0;color:#000000'>{label}Payment:</strong> {text('payment_method')}</p></div>",
        f"<h4 style='color:#000000;margin:15px 0;font-weight:600'>Items ({item_count})</h4>",
        fmt_items(data.get('items', [])),
        "<div style='background:#ffffff;padding:15px;border:1px solid #dee2e6;border-radius:4px;margin-top:20px'>",
        f"<p style='margin:5px 0;color:#000000'>{label}Subtotal:</strong> {money('subtotal')}</p>",
        f"<p style='margin:5px 0;color:#000000'>{label}Tax:</strong> {money('tax')}</p>",
        f"<p style='margin:10px 0 5px 0;color:#000000;font-size:20px;font-weight:700'>Total: {money('total')}</p>",
        f"<p style='margin:10px 0 0 0;color:#000000'>{label}Requires Review:</strong> {'Yes' if needs_review else 'No'}</p></div></div>",
    ]

    return ''.join(parts)



In [13]:
# CELL 8: PROCESSING FUNCTION

def _upload_path(upload):
    """Return the filesystem path of a Gradio upload.

    Accepts a plain path (str / os.PathLike) as well as a file-like wrapper
    that exposes the path via its ``name`` attribute.
    """
    if isinstance(upload, (str, os.PathLike)):
        return os.fspath(upload)
    return upload.name


def process_files(pdf_file, img_file):
    """Run the full pipeline for one PDF bill and one receipt image.

    Steps: OCR both files, extract fields, score the match, reconcile, and
    render every stage as HTML. Relies on the module-level ``ocr``,
    ``extractor``, ``matcher`` and ``reconciler`` instances.

    Args:
        pdf_file: Uploaded PDF (path string or object with a ``name`` path).
        img_file: Uploaded image (path string or object with a ``name`` path).

    Returns:
        6-tuple of strings: status message, PDF card, image card, match
        banner, conflict panel, final bill. On a missing upload or any error
        the first element carries the message and the rest are "".
    """
    if not pdf_file or not img_file:
        return "Please upload both PDF and image files", "", "", "", "", ""

    try:
        # Uploads may arrive as plain path strings, which have no `.name`.
        pdf_text = ocr.extract_from_pdf(_upload_path(pdf_file))
        img_text = ocr.extract_from_image(_upload_path(img_file))

        pdf_data = extractor.extract(pdf_text)
        img_data = extractor.extract(img_text)

        status, sim = matcher.match(pdf_data, img_data)
        report = reconciler.reconcile(pdf_data, img_data, status, sim)

        return (
            "Processing Complete",
            fmt_data(pdf_data, "PDF Bill Data"),
            fmt_data(img_data, "Image Receipt Data"),
            fmt_match(status, sim),
            fmt_conflicts(report),
            fmt_final(report['data'], report['needs_review'])
        )
    except Exception as e:
        # Surface the failure in the UI status line instead of crashing the app.
        return f"Error: {str(e)}", "", "", "", "", ""


In [14]:


# CELL 9: INITIALIZE ENGINES

# Each engine is created only when its name is not already bound in the
# notebook's globals, so re-running this cell reuses the existing instances
# instead of constructing them again.
# NOTE(review): because of these guards, re-running this cell after editing a
# class definition above keeps the OLD instance; delete the name (or restart
# the kernel) to pick up the new class.
if 'ocr' not in globals():
    ocr = OCREngine()
if 'extractor' not in globals():
    extractor = DataExtractor()
if 'matcher' not in globals():
    matcher = Matcher()
if 'reconciler' not in globals():
    reconciler = Reconciler()

print("System initialized")


System initialized


In [15]:

# CELL 10: CREATE UI

# Build the Gradio page: two file inputs, one button, and one output
# component per pipeline stage.
with gr.Blocks(title="Receipt Reconciliation System", theme=gr.themes.Default()) as app:
    gr.Markdown("# Receipt Reconciliation System")
    gr.Markdown("Upload PDF bill and receipt image for automated data extraction and reconciliation")

    # Inputs, restricted by file extension.
    with gr.Row():
        pdf_in = gr.File(label="PDF Bill Document", file_types=[".pdf"])
        img_in = gr.File(label="Receipt Image", file_types=[".jpg",".jpeg",".png"])

    btn = gr.Button("Process Documents", variant="primary", size="lg")
    # Status line; receives the first element returned by process_files.
    status = gr.Markdown()

    gr.Markdown("## Extracted Data")
    with gr.Row():
        pdf_out = gr.HTML()
        img_out = gr.HTML()

    gr.Markdown("## Match Analysis")
    match_out = gr.HTML()

    gr.Markdown("## Reconciliation Results")
    conflict_out = gr.HTML()

    gr.Markdown("## Final Reconciled Bill")
    final_out = gr.HTML()

    # The six outputs are listed in the same order as the 6-tuple that
    # process_files returns.
    btn.click(process_files, [pdf_in, img_in],
             [status, pdf_out, img_out, match_out, conflict_out, final_out])

# NOTE(review): share=True publishes the app on a public *.gradio.live URL
# (see the recorded output of this cell); anyone with the link can upload
# files -- confirm this is intended. The recorded output also suggests
# debug=True for seeing errors in Colab.
app.launch(share=True)
print("Application launched")

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://51c58f16ffd89c6a24.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


Application launched
