# Receipt Reconciliation System
Enterprise-grade intelligent receipt reconciliation using OCR, NLP, and LLMs

## Step 1: Environment Setup

In [None]:
!apt-get update
!apt-get install -y tesseract-ocr
!apt-get install -y poppler-utils

0% [Working]            Get:1 https://cli.github.com/packages stable InRelease [3,917 B]
Get:2 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,632 B]
Get:3 https://cli.github.com/packages stable/main amd64 Packages [354 B]
Hit:4 http://archive.ubuntu.com/ubuntu jammy InRelease
Get:5 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [128 kB]
Get:6 https://r2u.stat.illinois.edu/ubuntu jammy InRelease [6,555 B]
Get:7 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease [18.1 kB]
Hit:8 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jammy InRelease
Get:9 https://r2u.stat.illinois.edu/ubuntu jammy/main amd64 Packages [2,868 kB]
Get:10 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy/main amd64 Packages [38.5 kB]
Get:11 http://archive.ubuntu.com/ubuntu jammy-backports InRelease [127 kB]
Get:12 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]
Get:13 http://archive.ubuntu.com/ubuntu jammy-updates/main amd6

In [None]:
!pip install opencv-python pytesseract pdf2image easyocr sentence-transformers google-generativeai openpyxl scikit-learn

Collecting pytesseract
  Downloading pytesseract-0.3.13-py3-none-any.whl.metadata (11 kB)
Collecting pdf2image
  Downloading pdf2image-1.17.0-py3-none-any.whl.metadata (6.2 kB)
Collecting easyocr
  Downloading easyocr-1.7.2-py3-none-any.whl.metadata (10 kB)
Collecting python-bidi (from easyocr)
  Downloading python_bidi-0.6.7-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.9 kB)
Collecting pyclipper (from easyocr)
  Downloading pyclipper-1.4.0-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (8.6 kB)
Collecting ninja (from easyocr)
  Downloading ninja-1.13.0-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (5.1 kB)
Downloading pytesseract-0.3.13-py3-none-any.whl (14 kB)
Downloading pdf2image-1.17.0-py3-none-any.whl (11 kB)
Downloading easyocr-1.7.2-py3-none-any.whl (2.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.9/2.9 MB[0m [31m65.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading ninja-1.13.0-py3-none-

## Step 2: Import Dependencies

In [None]:
import json
import os
import re
from dataclasses import dataclass, asdict
from datetime import datetime
from getpass import getpass
from pathlib import Path
from typing import Dict, List, Optional, Tuple

import cv2
import easyocr
import google.generativeai as genai
import numpy as np
import pandas as pd
import pytesseract
from pdf2image import convert_from_path
from PIL import Image
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

from google.colab import files
from IPython.display import display, HTML


All support for the `google.generativeai` package has ended. It will no longer be receiving 
updates or bug fixes. Please switch to the `google.genai` package as soon as possible.
See README for more details:

https://github.com/google-gemini/deprecated-generative-ai-python/blob/main/README.md

  loader.exec_module(module)


## Step 3: Data Structure Definition

In [None]:
@dataclass
class ReceiptData:
    """Structured view of a single receipt.

    Only ``source_type`` is required; every other field defaults to ``None``
    so that a partially extracted receipt can still be represented.
    """
    source_type: str  # 'pdf', 'image' or 'reconciled' in this notebook
    location: Optional[str] = None  # store / merchant line
    transaction_time: Optional[str] = None  # date and time kept as free text
    transaction_id: Optional[str] = None
    payment_method: Optional[str] = None
    num_items: Optional[int] = None
    # Item dicts use the keys name, product_id, quantity, unit_price, total_price
    items: Optional[List[Dict]] = None
    subtotal: Optional[float] = None
    tax: Optional[float] = None
    total: Optional[float] = None
    raw_text: Optional[str] = None  # OCR text the other fields were derived from
    confidence_score: Optional[float] = None  # OCR confidence, or the weighted blend after reconciliation

## Step 4: OCR Processing Module

In [None]:
class OCRProcessor:
    """Extract raw text and a confidence estimate from receipt images and PDFs.

    EasyOCR is used when ``use_easyocr`` is true, otherwise Tesseract (through
    pytesseract). Both public extract methods return ``(text, confidence)``.
    """

    # pytesseract.image_to_string reports no confidence, so this fixed
    # placeholder is returned to keep the (text, confidence) shape uniform.
    TESSERACT_PLACEHOLDER_CONFIDENCE = 0.85

    def __init__(self, use_easyocr: bool = True, gpu: bool = True):
        """Create the processor.

        Args:
            use_easyocr: use EasyOCR (True) or Tesseract (False).
            gpu: forwarded to ``easyocr.Reader``; ignored for Tesseract.
        """
        self.use_easyocr = use_easyocr
        if use_easyocr:
            print("Initializing EasyOCR reader...")
            self.reader = easyocr.Reader(['en'], gpu=gpu)
            print("EasyOCR initialized successfully")

    def preprocess_image(self, image: np.ndarray) -> np.ndarray:
        """Return a denoised, Otsu-binarised grayscale version of ``image``.

        Args:
            image: BGR colour image, or an already single-channel image.

        Raises:
            ValueError: if ``image`` is None or empty.
        """
        if image is None or getattr(image, 'size', 0) == 0:
            raise ValueError("Cannot preprocess an empty image")

        # Accept images that are already grayscale instead of failing in cvtColor.
        gray = image if image.ndim == 2 else cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
        denoised = cv2.fastNlMeansDenoising(gray)
        _, thresh = cv2.threshold(denoised, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
        # NOTE(review): a 1x1 structuring element makes this closing effectively a
        # no-op; enlarge the kernel if gap filling is actually wanted.
        kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (1, 1))
        return cv2.morphologyEx(thresh, cv2.MORPH_CLOSE, kernel)

    def _run_ocr(self, processed: np.ndarray) -> Tuple[str, float]:
        """Run the configured OCR engine on one preprocessed image."""
        if self.use_easyocr:
            results = self.reader.readtext(processed)
            text = '\n'.join(res[1] for res in results)
            confidence = float(np.mean([res[2] for res in results])) if results else 0.0
        else:
            text = pytesseract.image_to_string(Image.fromarray(processed))
            confidence = self.TESSERACT_PLACEHOLDER_CONFIDENCE
        return text, confidence

    def extract_from_image(self, image_path: str) -> Tuple[str, float]:
        """OCR a single image file.

        Raises:
            FileNotFoundError: if ``image_path`` does not exist.
            ValueError: if the file exists but cannot be decoded as an image.
        """
        print(f"Processing image: {image_path}")
        if not os.path.isfile(image_path):
            raise FileNotFoundError(f"Image file not found: {image_path}")

        # cv2.imread signals failure by returning None rather than raising.
        image = cv2.imread(image_path)
        if image is None:
            raise ValueError(f"Could not decode image: {image_path}")

        text, confidence = self._run_ocr(self.preprocess_image(image))
        print(f"Extracted {len(text)} characters with confidence: {confidence:.2f}")
        return text, confidence

    def extract_from_pdf(self, pdf_path: str) -> Tuple[str, float]:
        """OCR every page of a PDF and return the joined text and mean confidence.

        Pages are rendered at 300 dpi and joined with a blank line. A PDF that
        renders to zero pages yields ``("", 0.0)``.
        """
        print(f"Processing PDF: {pdf_path}")
        pages = convert_from_path(pdf_path, dpi=300)
        page_texts = []
        confidences = []

        for page_no, page in enumerate(pages, start=1):
            print(f"Processing page {page_no}/{len(pages)}")
            # PIL delivers RGB; convert to OpenCV's BGR order so the BGR2GRAY
            # conversion in preprocess_image weights the channels correctly.
            page_bgr = cv2.cvtColor(np.array(page.convert('RGB')), cv2.COLOR_RGB2BGR)
            page_text, page_confidence = self._run_ocr(self.preprocess_image(page_bgr))
            page_texts.append(page_text)
            confidences.append(page_confidence)

        final_text = '\n\n'.join(page_texts)
        avg_confidence = float(np.mean(confidences)) if confidences else 0.0
        print(f"PDF processing complete. Average confidence: {avg_confidence:.2f}")
        return final_text, avg_confidence

## Step 5: Enhanced NLP Extraction Module

In [None]:
class NLPExtractor:
    """Regular-expression based extraction of receipt fields from OCR text."""

    # Amount with thousands separators (1,234.56) or a plain digit run (1234.56).
    # The grouped form is tried first so "1,234.56" is not cut off at the comma.
    _NUMBER = r'(\d{1,3}(?:,\d{3})+(?:\.\d{2})?|\d+(?:\.\d{2})?)'

    # Lines containing these words are summary lines, never line items.
    _SKIP_WORDS = ('total', 'subtotal', 'tax', 'payment', 'card')

    # Words that suggest a line is the merchant/location line.
    _LOCATION_HINTS = ('store', 'shop', 'restaurant', 'mart', 'market', 'inc', 'llc', 'ltd')

    # (layout, pattern) pairs tried in order. The layout label states which
    # capture group holds which value, so interpretation never has to be
    # guessed from the number of groups.
    _ITEM_RULES = (
        ('sku_qty_unit_total',
         r'([A-Za-z0-9\s\-\.\(\)]+?)\s+(?:SKU|ID|#)[:\s]*([A-Z0-9]{4,})\s+(\d+)\s*x?\s*@?\s*\$?\s*(\d+\.\d{2})\s*\$?\s*(\d+\.\d{2})'),
        ('sku_qty_unit_total',
         r'([A-Za-z0-9\s\-\.\(\)]+?)\s+([A-Z0-9]{4,})\s+(\d+)\s*x?\s*@?\s*\$?\s*(\d+\.\d{2})\s*\$?\s*(\d+\.\d{2})'),
        ('qty_unit_total',
         r'([A-Za-z0-9\s\-\.\(\)]+?)\s+(\d+)\s*x\s*@?\s*\$?\s*(\d+\.\d{2})\s*=?\s*\$?\s*(\d+\.\d{2})'),
        ('unit_qty_total',
         r'([A-Za-z0-9\s\-\.\(\)]+?)\s+\$?\s*(\d+\.\d{2})\s*x\s*(\d+)\s*=?\s*\$?\s*(\d+\.\d{2})'),
        ('sku_price',
         r'([A-Za-z0-9\s\-\.\(\)]+?)\s+(?:SKU|ID|#)[:\s]*([A-Z0-9]{4,})\s+\$?\s*(\d+\.\d{2})'),
        ('qty_unit',
         r'([A-Za-z0-9\s\-\.\(\)]+?)\s+(\d+)\s*@\s*\$?\s*(\d+\.\d{2})'),
    )

    # Fallback: "<name> <price>" at the end of a line.
    _SIMPLE_ITEM = r'([A-Za-z][A-Za-z0-9\s\-\.]{3,})\s+\$?\s*(\d+\.\d{2})$'

    def __init__(self):
        number = self._NUMBER
        self.patterns = {
            # Optional "ID/No/Number" label, and the token must contain a digit so
            # that "Transaction ID: 123456" no longer yields "action".
            'transaction_id': r'\b(?:transaction|trans|receipt|order|invoice)(?:\s*(?:id|no|number|num)\b)?[\s#:.]*((?=[A-Z0-9]*\d)[A-Z0-9]{6,})',
            'date': r'(\d{1,2}[/-]\d{1,2}[/-]\d{2,4}|\d{4}[/-]\d{1,2}[/-]\d{1,2})',
            'time': r'(\d{1,2}:\d{2}(?::\d{2})?(?:\s*[AP]M)?)',
            # Word-bounded so "cashier" or "discard" are not read as a payment method.
            'payment': r'\b(visa|mastercard|amex|cash|credit|debit|card)(?![a-z])[\s*]*(?:ending|xxxx)?[\s*]*(\d{4})?',
            'amount': r'\$?\s*' + number,
            'tax': r'(?:tax|gst|vat)[\s:]*\$?\s*' + number,
            # \b plus the lookbehind keep "Subtotal", "Sub total" and "Sub-total"
            # from being picked up as the grand total.
            'total': r'(?<!sub[\s-])\b(?:total|amount|balance)[\s:]*\$?\s*' + number,
            'subtotal': r'(?:subtotal|sub-total|sub total)[\s:]*\$?\s*' + number,
        }

    def extract_structured_data(self, text: str) -> Dict:
        """Extract receipt fields from ``text``.

        Returns:
            Dict with keys location, transaction_id, transaction_time,
            payment_method, items, num_items, subtotal, tax, total. Fields that
            were not found are ``None`` (``items`` is an empty list).
        """
        print("Extracting structured data with NLP...")
        text = text or ""
        data = {}

        # Search the original text (case-insensitively) so IDs keep their casing.
        for key, pattern in self.patterns.items():
            match = re.search(pattern, text, re.IGNORECASE)
            if match:
                data[key] = match.group(1).strip()

        location = self._extract_location(text)
        items = self._extract_items(text)

        timestamp = ' '.join(part for part in (data.get('date'), data.get('time')) if part)
        payment = data.get('payment')

        result = {
            'location': location,
            'transaction_id': data.get('transaction_id'),
            'transaction_time': timestamp or None,
            # Lower-cased so "VISA" and "Visa" compare equal across documents.
            'payment_method': payment.lower() if payment else None,
            'items': items,
            'num_items': len(items),
            'subtotal': self._parse_amount(data.get('subtotal')),
            'tax': self._parse_amount(data.get('tax')),
            'total': self._parse_amount(data.get('total'))
        }

        print(f"NLP extraction complete. Found {len(items)} items.")
        return result

    def _extract_location(self, text: str) -> Optional[str]:
        """Return the first of the top five lines that contains a location hint
        word, else the first non-blank line, else ``None``."""
        lines = [line.strip() for line in (text or '').split('\n')]
        for line in lines[:5]:
            lowered = line.lower()
            if any(hint in lowered for hint in self._LOCATION_HINTS):
                return line
        return next((line for line in lines if line), None)

    @staticmethod
    def _build_item(layout: str, groups: Tuple[str, ...]) -> Dict:
        """Turn regex capture groups into an item dict according to ``layout``."""
        name = groups[0].strip()
        product_id = None

        if layout == 'sku_qty_unit_total':
            product_id = groups[1].strip()
            quantity, unit_price, total_price = int(groups[2]), float(groups[3]), float(groups[4])
        elif layout == 'qty_unit_total':
            quantity, unit_price, total_price = int(groups[1]), float(groups[2]), float(groups[3])
        elif layout == 'unit_qty_total':
            unit_price, quantity, total_price = float(groups[1]), int(groups[2]), float(groups[3])
        elif layout == 'sku_price':
            product_id = groups[1].strip()
            quantity = 1
            unit_price = total_price = float(groups[2])
        else:  # 'qty_unit': the line total is not printed, so derive it
            quantity, unit_price = int(groups[1]), float(groups[2])
            total_price = round(quantity * unit_price, 2)

        return {
            'name': name,
            'product_id': product_id,
            'quantity': quantity,
            'unit_price': unit_price,
            'total_price': total_price
        }

    def _extract_items(self, text: str) -> List[Dict]:
        """Parse line items, one per text line.

        Each line is tested against ``_ITEM_RULES`` in order; if none matches, a
        plain "<name> <price>" fallback is tried (prices of 1000 or more are
        rejected by the fallback).
        """
        items = []

        for raw_line in (text or '').split('\n'):
            line = raw_line.strip()
            lowered = line.lower()
            if len(line) < 5 or any(word in lowered for word in self._SKIP_WORDS):
                continue

            for layout, pattern in self._ITEM_RULES:
                match = re.search(pattern, line, re.IGNORECASE)
                if match:
                    items.append(self._build_item(layout, match.groups()))
                    break
            else:
                match = re.search(self._SIMPLE_ITEM, line)
                if match and float(match.group(2)) < 1000:
                    price = float(match.group(2))
                    items.append({
                        'name': match.group(1).strip(),
                        'product_id': None,
                        'quantity': 1,
                        'unit_price': price,
                        'total_price': price
                    })

        return items

    def _parse_amount(self, amount_str: Optional[str]) -> Optional[float]:
        """Convert an amount such as ``"1,234.56"`` to float; ``None`` if absent or unparsable."""
        if not amount_str:
            return None
        cleaned = re.sub(r'[^\d.]', '', amount_str)
        try:
            return float(cleaned)
        except ValueError:
            return None

## Step 6: LLM Processing Module

In [None]:
class LLMProcessor:
    """Gemini-backed field extraction plus sentence-embedding similarity."""

    # 'gemini-1.5-flash' is no longer served (the API answers 404), so a newer
    # model is the default and callers can override it.
    DEFAULT_MODEL = 'gemini-2.5-flash'

    def __init__(self, api_key: str, model_name: str = DEFAULT_MODEL):
        """Configure the Gemini client and load the embedding model.

        Args:
            api_key: Gemini API key.
            model_name: Gemini model identifier passed to ``GenerativeModel``.
        """
        genai.configure(api_key=api_key)
        self.model = genai.GenerativeModel(model_name)
        print("Loading sentence transformer model...")
        self.embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
        print("LLM processor initialized successfully")

    @staticmethod
    def _parse_json_response(raw: str) -> Dict:
        """Parse the model reply into a dict.

        Accepts bare JSON, JSON inside a Markdown code fence, and JSON that is
        surrounded by prose (the outermost ``{...}`` span is used).

        Raises:
            ValueError: if no JSON object can be decoded
                (``json.JSONDecodeError`` is a ``ValueError``).
        """
        cleaned = (raw or "").strip()
        fenced = re.search(r'```(?:json)?\s*(.*?)```', cleaned, re.DOTALL | re.IGNORECASE)
        if fenced:
            cleaned = fenced.group(1).strip()

        try:
            parsed = json.loads(cleaned)
        except json.JSONDecodeError:
            start, end = cleaned.find('{'), cleaned.rfind('}')
            if start == -1 or end <= start:
                raise
            parsed = json.loads(cleaned[start:end + 1])

        if not isinstance(parsed, dict):
            raise ValueError("LLM response is not a JSON object")
        return parsed

    def extract_with_llm(self, text: str) -> Dict:
        """Ask the LLM to structure ``text``.

        Returns:
            The parsed JSON object, or ``{}`` when the request or the parsing
            fails (best effort: the error is printed, not raised).
        """
        print("Extracting data using LLM...")
        prompt = f"""Extract ALL information from this receipt text in valid JSON format:

Required structure:
{{
  "location": "store name",
  "transaction_time": "date and time",
  "transaction_id": "transaction ID",
  "payment_method": "payment method",
  "items": [
    {{
      "name": "item name",
      "product_id": "product ID or SKU if available, else null",
      "quantity": quantity_as_number,
      "unit_price": unit_price_as_number,
      "total_price": total_price_as_number
    }}
  ],
  "num_items": total_count_of_items,
  "subtotal": subtotal_before_tax,
  "tax": tax_amount,
  "total": total_after_tax
}}

Receipt text:
{text}

Return ONLY valid JSON. Use null for missing values. Numbers should be numbers, not strings."""

        try:
            response = self.model.generate_content(prompt)
            result = self._parse_json_response(response.text)

            # "items" may legitimately be null; that must not discard the result.
            items = result.get('items')
            item_count = len(items) if isinstance(items, list) else 0
            print(f"LLM extraction successful - found {item_count} items")
            return result
        except Exception as e:
            print(f"LLM extraction error: {str(e)}")
            return {}

    def generate_embedding(self, text: str) -> np.ndarray:
        """Return the sentence embedding of ``text``."""
        return self.embedding_model.encode([text])[0]

    def calculate_similarity(self, emb1: np.ndarray, emb2: np.ndarray) -> float:
        """Return the cosine similarity of two embeddings as a plain float."""
        return float(cosine_similarity([emb1], [emb2])[0][0])

## Step 7: Enhanced Reconciliation Engine

In [None]:
class ReconciliationEngine:
    """Decide whether two receipts describe the same purchase and merge them.

    The PDF receipt is treated as the primary source: on conflicts its value
    is recommended, and reconciled records prefer it.
    """

    # Fields compared numerically (with tolerance); all others compare as text.
    _NUMERIC_FIELDS = frozenset({'total', 'tax', 'subtotal', 'num_items'})
    _AMOUNT_TOLERANCE = 0.01

    # An item pair matches when its similarity is strictly above this value.
    _ITEM_MATCH_THRESHOLD = 0.6
    _WEIGHT_PRODUCT_ID = 0.4
    _WEIGHT_NAME = 0.3
    _WEIGHT_PRICE = 0.2
    _WEIGHT_QUANTITY = 0.1

    def __init__(self, similarity_threshold: float = 0.75):
        self.similarity_threshold = similarity_threshold

    def match_receipts(self, pdf_data: "ReceiptData", image_data: "ReceiptData",
                       llm_processor: "LLMProcessor") -> Dict:
        """Compare two receipts.

        Returns:
            Dict with is_match (bool), similarity_score (float), field_matches,
            item_comparison and conflicts. ``is_match`` requires the raw-text
            similarity to reach ``similarity_threshold`` and at least half of
            the comparable fields to agree.
        """
        print("Calculating document similarity...")

        pdf_emb = llm_processor.generate_embedding(pdf_data.raw_text or "")
        img_emb = llm_processor.generate_embedding(image_data.raw_text or "")

        similarity_score = float(llm_processor.calculate_similarity(pdf_emb, img_emb))
        print(f"Similarity score: {similarity_score:.3f}")

        field_matches = self._compare_fields(pdf_data, image_data)
        print(f"Field match percentage: {field_matches['match_percentage']:.1%}")

        item_comparison = self._compare_items(pdf_data.items or [], image_data.items or [])
        print(f"Item match rate: {item_comparison['match_rate']:.1%}")

        # bool(): a numpy bool would be written as the string "True" by
        # json.dump(..., default=str).
        is_match = bool(similarity_score >= self.similarity_threshold and
                        field_matches['match_percentage'] >= 0.5)

        return {
            'is_match': is_match,
            'similarity_score': similarity_score,
            'field_matches': field_matches,
            'item_comparison': item_comparison,
            'conflicts': self._identify_conflicts(pdf_data, image_data)
        }

    @staticmethod
    def _to_number(value) -> Optional[float]:
        """Best-effort conversion to float; ``None`` when not numeric."""
        if value is None or isinstance(value, bool):
            return None
        if isinstance(value, str):
            value = value.replace(',', '').replace('$', '').strip()
        try:
            return float(value)
        except (TypeError, ValueError):
            return None

    def _values_match(self, field: str, val1, val2) -> bool:
        """Shared equality rule used for both field comparison and conflicts.

        Numeric fields compare within ``_AMOUNT_TOLERANCE`` (numeric strings
        are coerced); text compares case-insensitively after stripping.
        """
        if field in self._NUMERIC_FIELDS:
            num1, num2 = self._to_number(val1), self._to_number(val2)
            if num1 is not None and num2 is not None:
                return abs(num1 - num2) < self._AMOUNT_TOLERANCE
        if isinstance(val1, str) or isinstance(val2, str):
            return str(val1).strip().lower() == str(val2).strip().lower()
        return val1 == val2

    def _compare_fields(self, data1: "ReceiptData", data2: "ReceiptData") -> Dict:
        """Compare key fields that are present on both receipts.

        Returns:
            Dict with matches, total_compared, match_percentage (0.0 when
            nothing was comparable) and per-field details.
        """
        fields = ['transaction_id', 'total', 'tax', 'subtotal', 'num_items']
        field_details = []

        for field in fields:
            val1 = getattr(data1, field)
            val2 = getattr(data2, field)
            if val1 is None or val2 is None:
                continue
            field_details.append({
                'field': field,
                'pdf_value': val1,
                'image_value': val2,
                'match': self._values_match(field, val1, val2)
            })

        total_fields = len(field_details)
        matches = sum(1 for detail in field_details if detail['match'])

        return {
            'matches': matches,
            'total_compared': total_fields,
            'match_percentage': matches / total_fields if total_fields > 0 else 0.0,
            'details': field_details
        }

    def _compare_items(self, pdf_items: List[Dict], image_items: List[Dict]) -> Dict:
        """Greedily pair each PDF item with its best unmatched image item.

        Unpaired items from either side are listed with ``matched`` False.
        """
        item_matches = []
        matched_image_indices = set()

        for pdf_item in pdf_items:
            best_match = None
            best_score = 0
            best_idx = -1

            for idx, img_item in enumerate(image_items):
                if idx in matched_image_indices:
                    continue
                score = self._calculate_item_similarity(pdf_item, img_item)
                if score > best_score:
                    best_score, best_match, best_idx = score, img_item, idx

            if best_match is not None and best_score > self._ITEM_MATCH_THRESHOLD:
                matched_image_indices.add(best_idx)
                item_matches.append({
                    'pdf_item': pdf_item,
                    'image_item': best_match,
                    'match_score': best_score,
                    'matched': True
                })
            else:
                item_matches.append({
                    'pdf_item': pdf_item,
                    'image_item': None,
                    'match_score': 0,
                    'matched': False
                })

        for idx, img_item in enumerate(image_items):
            if idx not in matched_image_indices:
                item_matches.append({
                    'pdf_item': None,
                    'image_item': img_item,
                    'match_score': 0,
                    'matched': False
                })

        matched_count = sum(1 for match in item_matches if match['matched'])
        total_items = max(len(pdf_items), len(image_items))

        return {
            'matched_items': matched_count,
            'total_items': total_items,
            'match_rate': matched_count / total_items if total_items > 0 else 0.0,
            'details': item_matches
        }

    def _calculate_item_similarity(self, item1: Dict, item2: Dict) -> float:
        """Score two items in [0, 1] from product id, name, price and quantity.

        When either item has no product id, that weight is dropped from the
        denominator. Otherwise identical items without ids would top out at
        exactly 0.6 and could never exceed the match threshold. Values missing
        on either side earn no credit.
        """
        earned = 0.0
        available = 1.0

        id1, id2 = item1.get('product_id'), item2.get('product_id')
        if id1 and id2:
            if str(id1).strip().lower() == str(id2).strip().lower():
                earned += self._WEIGHT_PRODUCT_ID
        else:
            available -= self._WEIGHT_PRODUCT_ID

        # "or ''" guards against an explicit None name (e.g. JSON null).
        words1 = set(str(item1.get('name') or '').lower().split())
        words2 = set(str(item2.get('name') or '').lower().split())
        if words1 and words2:
            earned += len(words1 & words2) / len(words1 | words2) * self._WEIGHT_NAME

        price1 = self._to_number(item1.get('total_price'))
        price2 = self._to_number(item2.get('total_price'))
        if price1 is not None and price2 is not None and abs(price1 - price2) < self._AMOUNT_TOLERANCE:
            earned += self._WEIGHT_PRICE

        qty1 = self._to_number(item1.get('quantity'))
        qty2 = self._to_number(item2.get('quantity'))
        if qty1 is not None and qty2 is not None and abs(qty1 - qty2) < 1e-9:
            earned += self._WEIGHT_QUANTITY

        # Rounded so float noise cannot push a borderline score over the threshold.
        return round(earned / available, 6)

    def _identify_conflicts(self, pdf_data: "ReceiptData", image_data: "ReceiptData") -> List[Dict]:
        """List fields present on both receipts whose values disagree."""
        conflicts = []

        fields = ['transaction_id', 'total', 'tax', 'subtotal', 'num_items',
                  'payment_method', 'transaction_time']

        for field in fields:
            val1 = getattr(pdf_data, field)
            val2 = getattr(image_data, field)
            if val1 is None or val2 is None:
                continue
            if self._values_match(field, val1, val2):
                continue

            conflicts.append({
                'field': field,
                'pdf_value': val1,
                'image_value': val2,
                'recommended': val1,
                'reason': 'PDF source is primary reference'
            })

        print(f"Identified {len(conflicts)} conflicts")
        return conflicts

    def reconcile_data(self, pdf_data: "ReceiptData", image_data: "ReceiptData",
                       match_result: Dict) -> "ReceiptData":
        """Merge both receipts into one record.

        Each field takes the PDF value, falling back to the image value. The
        confidence is 70% PDF + 30% image.
        """
        print("Reconciling data...")

        reconciled = ReceiptData(source_type='reconciled')

        for field in pdf_data.__dataclass_fields__:
            if field == 'source_type':
                continue

            pdf_val = getattr(pdf_data, field)
            img_val = getattr(image_data, field)

            if field == 'items':
                reconciled.items = self._reconcile_items(pdf_val or [], img_val or [], match_result)
            elif pdf_val is not None:
                setattr(reconciled, field, pdf_val)
            elif img_val is not None:
                setattr(reconciled, field, img_val)

        reconciled.confidence_score = (
            (pdf_data.confidence_score or 0) * 0.7 +
            (image_data.confidence_score or 0) * 0.3
        )

        print("Reconciliation complete")
        return reconciled

    def _reconcile_items(self, pdf_items: List[Dict], image_items: List[Dict],
                         match_result: Dict) -> List[Dict]:
        """Build the reconciled item list from ``match_result``'s item details.

        PDF items are kept (as copies); a matched image item only contributes
        its product id when the PDF item has none. Image-only items are
        omitted.
        """
        reconciled_items = []
        details = (match_result.get('item_comparison') or {}).get('details') or []

        for match in details:
            pdf_item = match.get('pdf_item')
            if not pdf_item:
                continue

            reconciled_item = dict(pdf_item)
            img_item = match.get('image_item')
            if match.get('matched') and img_item:
                if not reconciled_item.get('product_id') and img_item.get('product_id'):
                    reconciled_item['product_id'] = img_item['product_id']

            reconciled_items.append(reconciled_item)

        return reconciled_items

## Step 8: Main System Integration

In [None]:
class ReceiptReconciliationSystem:
    """End-to-end pipeline: OCR -> regex + LLM extraction -> matching -> export."""

    # Set explicitly by process_document; extraction output must not override them.
    _RESERVED_FIELDS = frozenset({'source_type', 'raw_text', 'confidence_score'})

    # (column label, item key) pairs for the item comparison sheet.
    _ITEM_COLUMNS = (
        ('Item Name', 'name'),
        ('Product ID', 'product_id'),
        ('Quantity', 'quantity'),
        ('Unit Price', 'unit_price'),
        ('Total', 'total_price'),
    )

    def __init__(self, gemini_api_key: str, use_easyocr: bool = True):
        print("Initializing Receipt Reconciliation System...")
        self.ocr_processor = OCRProcessor(use_easyocr=use_easyocr)
        self.nlp_extractor = NLPExtractor()
        self.llm_processor = LLMProcessor(api_key=gemini_api_key)
        self.reconciliation_engine = ReconciliationEngine()
        print("System initialization complete")

    def process_document(self, file_path: str, source_type: str) -> "ReceiptData":
        """OCR one document and extract its fields.

        Args:
            file_path: path of the document.
            source_type: ``'pdf'`` selects PDF handling; any other value is
                treated as an image.
        """
        print(f"\nProcessing {source_type.upper()} document...")

        if source_type == 'pdf':
            raw_text, confidence = self.ocr_processor.extract_from_pdf(file_path)
        else:
            raw_text, confidence = self.ocr_processor.extract_from_image(file_path)

        nlp_data = self.nlp_extractor.extract_structured_data(raw_text)
        llm_data = self.llm_processor.extract_with_llm(raw_text)
        merged_data = self._merge_extraction_results(nlp_data, llm_data)

        # The LLM may return keys ReceiptData does not define (or ones set
        # explicitly below); passing them through would raise TypeError.
        allowed = set(ReceiptData.__dataclass_fields__) - self._RESERVED_FIELDS
        extracted = {key: value for key, value in merged_data.items() if key in allowed}

        receipt_data = ReceiptData(
            source_type=source_type,
            raw_text=raw_text,
            confidence_score=confidence,
            **extracted
        )

        print(f"{source_type.upper()} processing complete")
        return receipt_data

    def reconcile_documents(self, pdf_path: str, image_path: str) -> Dict:
        """Process both documents, match them and reconcile when they match.

        Returns:
            Dict with pdf_data, image_data, match_result and reconciled_data
            (``None`` when the documents were not judged to match).
        """
        print("\n" + "="*70)
        print("STARTING RECONCILIATION PROCESS")
        print("="*70)

        pdf_data = self.process_document(pdf_path, 'pdf')
        image_data = self.process_document(image_path, 'image')

        print("\n" + "-"*70)
        print("MATCHING DOCUMENTS")
        print("-"*70)
        match_result = self.reconciliation_engine.match_receipts(
            pdf_data, image_data, self.llm_processor
        )

        reconciled_data = None
        if match_result['is_match']:
            reconciled_data = self.reconciliation_engine.reconcile_data(
                pdf_data, image_data, match_result
            )

        print("\n" + "="*70)
        print("RECONCILIATION COMPLETE")
        print("="*70)

        return {
            'pdf_data': asdict(pdf_data),
            'image_data': asdict(image_data),
            'match_result': match_result,
            'reconciled_data': asdict(reconciled_data) if reconciled_data else None
        }

    @staticmethod
    def _is_missing(value) -> bool:
        """True for None, blank strings and empty containers; numeric 0 is a value."""
        if value is None:
            return True
        if isinstance(value, str):
            return not value.strip()
        if isinstance(value, (list, tuple, dict, set)):
            return len(value) == 0
        return False

    def _merge_extraction_results(self, nlp_data: Dict, llm_data: Dict) -> Dict:
        """Merge regex and LLM extraction results.

        The LLM value wins unless it is missing (None / blank / empty); a
        numeric 0 from the LLM counts as a real value. For ``items`` the
        longer list wins, ties going to the LLM. Keys that are None on both
        sides are omitted. ``num_items`` is filled from the chosen item list
        when it would otherwise be absent or zero.
        """
        nlp_data = nlp_data or {}
        llm_data = llm_data or {}
        merged = {}

        for key in set(nlp_data) | set(llm_data):
            nlp_val = nlp_data.get(key)
            llm_val = llm_data.get(key)

            if key == 'items':
                nlp_items = nlp_val if isinstance(nlp_val, list) else []
                llm_items = llm_val if isinstance(llm_val, list) else []
                merged[key] = llm_items if len(llm_items) >= len(nlp_items) else nlp_items
            elif not self._is_missing(llm_val):
                merged[key] = llm_val
            elif nlp_val is not None:
                merged[key] = nlp_val
            elif llm_val is not None:
                merged[key] = llm_val

        chosen_items = merged.get('items') or []
        if chosen_items and not merged.get('num_items'):
            merged['num_items'] = len(chosen_items)

        return merged

    @staticmethod
    def _comparison_path(output_path: str) -> str:
        """Derive the Excel file name from the JSON output path.

        Only the final extension is replaced, so a path without ``.json``
        can no longer resolve to the JSON file itself.
        """
        root, _ext = os.path.splitext(output_path)
        return f"{root}_comparison.xlsx"

    @staticmethod
    def _match_status(pdf_val, img_val) -> str:
        """Return 'Missing', 'Match' or 'Mismatch' for one field pair."""
        if pdf_val is None or img_val is None:
            return 'Missing'
        numeric = (int, float)
        if (isinstance(pdf_val, numeric) and isinstance(img_val, numeric)
                and not isinstance(pdf_val, bool) and not isinstance(img_val, bool)):
            # Same 0.01 tolerance the reconciliation engine uses for amounts.
            return 'Match' if abs(pdf_val - img_val) < 0.01 else 'Mismatch'
        return 'Match' if pdf_val == img_val else 'Mismatch'

    def export_results(self, results: Dict, output_path: str):
        """Write ``results`` as JSON and, when reconciled data exists, an Excel
        workbook with field and item comparison sheets next to it."""
        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(results, f, indent=2, default=str)
        print(f"Results exported to {output_path}")

        reconciled = results.get('reconciled_data')
        if not reconciled:
            return

        pdf_data = results.get('pdf_data') or {}
        image_data = results.get('image_data') or {}

        rows = []
        for field in reconciled.keys():
            if field in ('source_type', 'raw_text', 'items'):
                continue
            pdf_val = pdf_data.get(field)
            img_val = image_data.get(field)
            rows.append({
                'Field': field,
                'PDF Value': pdf_val,
                'Image Value': img_val,
                'Reconciled Value': reconciled.get(field),
                'Match Status': self._match_status(pdf_val, img_val)
            })

        df = pd.DataFrame(
            rows,
            columns=['Field', 'PDF Value', 'Image Value', 'Reconciled Value', 'Match Status']
        )
        excel_path = self._comparison_path(output_path)

        with pd.ExcelWriter(excel_path, engine='openpyxl') as writer:
            df.to_excel(writer, sheet_name='Field Comparison', index=False)

            if (results.get('match_result') or {}).get('item_comparison'):
                self._export_item_comparison(results, writer)

        print(f"Comparison tables exported to {excel_path}")

    def _export_item_comparison(self, results: Dict, writer):
        """Write the 'Item Comparison' sheet; absent items or values show as 'N/A'."""
        sides = (('PDF', 'pdf_item'), ('Image', 'image_item'))
        columns = [f"{label} ({side})" for side, _ in sides for label, _ in self._ITEM_COLUMNS]
        columns.append('Match Status')

        rows = []
        for match in results['match_result']['item_comparison']['details']:
            row = {}
            for side, match_key in sides:
                item = match.get(match_key)
                for label, item_key in self._ITEM_COLUMNS:
                    value = item.get(item_key) if item else None
                    row[f"{label} ({side})"] = 'N/A' if value is None else value

            if match.get('matched'):
                row['Match Status'] = f"Matched ({match.get('match_score', 0):.0%})"
            else:
                row['Match Status'] = 'No Match'
            rows.append(row)

        df_items = pd.DataFrame(rows, columns=columns)
        df_items.to_excel(writer, sheet_name='Item Comparison', index=False)

## Step 9: Configuration and Execution

In [None]:
# Take the key from the environment when it is set; otherwise prompt with getpass,
# which does not echo the secret into the saved cell output the way input() does.
GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY") or getpass("Enter your Gemini API Key: ")

system = ReceiptReconciliationSystem(
    gemini_api_key=GEMINI_API_KEY,
    use_easyocr=True
)

Enter your Gemini API Key: [REDACTED - this key was exposed in saved output; revoke and rotate it]




Initializing Receipt Reconciliation System...
Initializing EasyOCR reader...
Progress: |██████████████████████████████████████████████████| 100.0% Complete



Progress: |--------------------------------------------------| 0.0% CompleteProgress: |--------------------------------------------------| 0.1% CompleteProgress: |--------------------------------------------------| 0.1% CompleteProgress: |--------------------------------------------------| 0.2% CompleteProgress: |--------------------------------------------------| 0.2% CompleteProgress: |--------------------------------------------------| 0.3% CompleteProgress: |--------------------------------------------------| 0.4% CompleteProgress: |--------------------------------------------------| 0.4% CompleteProgress: |--------------------------------------------------| 0.5% CompleteProgress: |--------------------------------------------------| 0.5% CompleteProgress: |--------------------------------------------------| 0.6% CompleteProgress: |--------------------------------------------------| 0.6% CompleteProgress: |--------------------------------------------------| 0.7% Complet

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

LLM processor initialized successfully
System initialization complete


## Step 10: Upload Documents

In [None]:
# Ask for the PDF receipt. files.upload() returns a mapping keyed by the stored
# file name; it is empty when the dialog is dismissed without choosing a file.
print("Upload PDF receipt:")
pdf_uploaded = files.upload()
if not pdf_uploaded:
    raise ValueError("No PDF receipt was uploaded; re-run this cell and choose a file.")
# Only the first uploaded file is used if several were selected.
pdf_path = next(iter(pdf_uploaded))

# Same flow for the receipt image.
print("\nUpload receipt image:")
image_uploaded = files.upload()
if not image_uploaded:
    raise ValueError("No receipt image was uploaded; re-run this cell and choose a file.")
image_path = next(iter(image_uploaded))

Upload PDF receipt:


Saving w1.pdf to w1 (1).pdf

Upload receipt image:


Saving WhatsApp Image 2025-12-18 at 7.51.44 PM.jpeg to WhatsApp Image 2025-12-18 at 7.51.44 PM.jpeg


## Step 11: Execute Reconciliation

In [None]:
# Run the reconciliation for the two uploaded files. The returned mapping is
# read by the cells below through its 'match_result', 'pdf_data', 'image_data'
# and 'reconciled_data' keys.
results = system.reconcile_documents(pdf_path, image_path)


STARTING RECONCILIATION PROCESS

Processing PDF document...
Processing PDF: w1 (1).pdf
Processing page 1/1




PDF processing complete. Average confidence: 0.89
Extracting structured data with NLP...
NLP extraction complete. Found 0 items.
Extracting data using LLM...




LLM extraction error: 404 POST https://generativelanguage.googleapis.com/v1beta/models/gemini-1.5-flash:generateContent?%24alt=json%3Benum-encoding%3Dint: models/gemini-1.5-flash is not found for API version v1beta, or is not supported for generateContent. Call ListModels to see the list of available models and their supported methods.
PDF processing complete

Processing IMAGE document...
Processing image: WhatsApp Image 2025-12-18 at 7.51.44 PM.jpeg
Extracted 411 characters with confidence: 0.93
Extracting structured data with NLP...
NLP extraction complete. Found 0 items.
Extracting data using LLM...




LLM extraction error: 404 POST https://generativelanguage.googleapis.com/v1beta/models/gemini-1.5-flash:generateContent?%24alt=json%3Benum-encoding%3Dint: models/gemini-1.5-flash is not found for API version v1beta, or is not supported for generateContent. Call ListModels to see the list of available models and their supported methods.
IMAGE processing complete

----------------------------------------------------------------------
MATCHING DOCUMENTS
----------------------------------------------------------------------
Calculating document similarity...
Similarity score: 0.878
Field match percentage: 75.0%
Item match rate: 0.0%
Identified 2 conflicts
Reconciling data...
Reconciliation complete

RECONCILIATION COMPLETE


## Step 12: Display Summary Results

In [None]:
# Headline figures from the match result, followed by one line per conflict.
banner = "=" * 70
print("\n" + banner)
print("RECONCILIATION SUMMARY")
print(banner)

summary = results['match_result']
if summary['is_match']:
    verdict = 'MATCHED'
else:
    verdict = 'NOT MATCHED'

print(f"Match Status: {verdict}")
print(f"Similarity Score: {summary['similarity_score']:.3f}")
# Both rates are rendered with a percent format spec.
print(f"Field Match Rate: {summary['field_matches']['match_percentage']:.1%}")
print(f"Item Match Rate: {summary['item_comparison']['match_rate']:.1%}")
print(f"Conflicts Found: {len(summary['conflicts'])}")

if summary['conflicts']:
    print("\nConflicts:")
    for entry in summary['conflicts']:
        print(f"  - {entry['field']}: PDF={entry['pdf_value']}, Image={entry['image_value']}")


RECONCILIATION SUMMARY
Match Status: MATCHED
Similarity Score: 0.878
Field Match Rate: 75.0%
Item Match Rate: 0.0%
Conflicts Found: 2

Conflicts:
  - tax: PDF=1.0, Image=8.0
  - transaction_time: PDF=12/28/25 18:53:32, Image=12/10/25 18:53:32


## Step 13: Display Field Comparison

In [None]:
# Print each compared field with the value seen in the PDF, the value seen in
# the image, and whether the two were judged to match.
divider = "=" * 70
print("\n" + divider)
print("FIELD-BY-FIELD COMPARISON")
print(divider)

field_details = results['match_result']['field_matches']['details']
for field_result in field_details:
    if field_result['match']:
        status = "✓ MATCH"
    else:
        status = "✗ MISMATCH"
    print(f"\n{field_result['field'].upper()}:")
    print(f"  PDF:   {field_result['pdf_value']}")
    print(f"  Image: {field_result['image_value']}")
    print(f"  Status: {status}")


FIELD-BY-FIELD COMPARISON

TOTAL:
  PDF:   10.0
  Image: 10.0
  Status: ✓ MATCH

TAX:
  PDF:   1.0
  Image: 8.0
  Status: ✗ MISMATCH

SUBTOTAL:
  PDF:   10.0
  Image: 10.0
  Status: ✓ MATCH

NUM_ITEMS:
  PDF:   0
  Image: 0
  Status: ✓ MATCH


## Step 14: Display Item-by-Item Comparison

In [None]:
def _print_item_side(label, item):
    """Print one side (PDF or image) of an item comparison.

    Prices that are missing or None are shown as 0.00. dict.get(key, 0) only
    falls back to 0 when the key is absent, so a key stored with a None value
    would otherwise make the ':.2f' format raise TypeError.
    """
    print(f"  {label}:")
    print(f"    Name: {item.get('name')}")
    print(f"    Product ID: {item.get('product_id', 'N/A')}")
    print(f"    Quantity: {item.get('quantity')}")
    print(f"    Unit Price: ${(item.get('unit_price') or 0):.2f}")
    print(f"    Total: ${(item.get('total_price') or 0):.2f}")


print("\n" + "="*70)
print("ITEM-BY-ITEM COMPARISON")
print("="*70)

item_matches = results['match_result']['item_comparison']['details']

# Say so explicitly instead of leaving a bare header when there is nothing to list.
if not item_matches:
    print("\nNo items to compare.")

for idx, match in enumerate(item_matches, 1):
    print(f"\nItem {idx}:")

    # Either side may be empty when an item has no counterpart in the other document.
    if match.get('pdf_item'):
        _print_item_side("PDF", match['pdf_item'])

    if match.get('image_item'):
        _print_item_side("Image", match['image_item'])

    if match['matched']:
        print(f"  Status: ✓ MATCHED (Score: {match['match_score']:.0%})")
    else:
        print(f"  Status: ✗ NO MATCH")


ITEM-BY-ITEM COMPARISON


## Step 15: Display Reconciled Data

In [None]:
# Show the reconciled receipt. Numeric fields use `value or 0` before formatting:
# dict.get(key, 0) still returns None when the key exists with a None value,
# and None cannot be rendered with ':.2f' / ':.2%'.
if results['reconciled_data']:
    reconciled = results['reconciled_data']
    print("\n" + "="*70)
    print("RECONCILED RECEIPT DATA")
    print("="*70)
    print(f"Location: {reconciled.get('location')}")
    print(f"Transaction ID: {reconciled.get('transaction_id')}")
    print(f"Transaction Time: {reconciled.get('transaction_time')}")
    print(f"Payment Method: {reconciled.get('payment_method')}")
    print(f"Number of Items: {reconciled.get('num_items')}")
    print(f"Subtotal: ${(reconciled.get('subtotal') or 0):.2f}")
    print(f"Tax: ${(reconciled.get('tax') or 0):.2f}")
    print(f"Total: ${(reconciled.get('total') or 0):.2f}")
    print(f"Confidence Score: {(reconciled.get('confidence_score') or 0):.2%}")

    if reconciled.get('items'):
        print("\nReconciled Items:")
        for idx, item in enumerate(reconciled['items'], 1):
            unit_price = item.get('unit_price') or 0
            total_price = item.get('total_price') or 0
            print(f"  {idx}. {item.get('name')}")
            print(f"     Product ID: {item.get('product_id', 'N/A')}")
            print(f"     Qty: {item.get('quantity')} x ${unit_price:.2f} = ${total_price:.2f}")


RECONCILED RECEIPT DATA
Location: Walmart.com
Transaction ID: details
Transaction Time: 12/28/25 18:53:32
Payment Method: visa
Number of Items: 0
Subtotal: $10.00
Tax: $1.00
Total: $10.00
Confidence Score: 90.44%


## Step 16: Export Results

In [None]:
# Export the results, then send the generated files to the browser. The
# spreadsheet is only requested when reconciled data is present.
system.export_results(results, "reconciliation_results.json")

download_queue = ["reconciliation_results.json"]
if results['reconciled_data']:
    download_queue.append("reconciliation_results_comparison.xlsx")

for artifact in download_queue:
    files.download(artifact)

print("\nFiles downloaded successfully!")

Results exported to reconciliation_results.json
Comparison tables exported to reconciliation_results_comparison.xlsx


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>


Files downloaded successfully!


## Step 17: Display Comparison Tables

In [None]:
# Tabulate every header field side by side: PDF value, image value, the
# reconciled value, and a tick when the PDF and image values are equal.
if results['reconciled_data']:
    rule = "=" * 70
    print("\n" + rule)
    print("FIELD COMPARISON TABLE")
    print(rule)

    fields = ['location', 'transaction_id', 'transaction_time', 'payment_method',
              'num_items', 'subtotal', 'tax', 'total']

    pdf_data = results['pdf_data']
    image_data = results['image_data']
    reconciled_data = results['reconciled_data']

    comparison_data = [
        {
            'Field': name,
            'PDF': pdf_data.get(name),
            'Image': image_data.get(name),
            'Reconciled': reconciled_data.get(name),
            'Match': "✓" if pdf_data.get(name) == image_data.get(name) else "✗",
        }
        for name in fields
    ]

    comparison_df = pd.DataFrame(comparison_data)
    display(comparison_df)


FIELD COMPARISON TABLE


Unnamed: 0,Field,PDF,Image,Reconciled,Match
0,location,Walmart.com,Walmart,Walmart.com,✗
1,transaction_id,details,,details,✗
2,transaction_time,12/28/25 18:53:32,12/10/25 18:53:32,12/28/25 18:53:32,✗
3,payment_method,visa,visa,visa,✓
4,num_items,0,0,0,✓
5,subtotal,10.0,10.0,10.0,✓
6,tax,1.0,8.0,1.0,✗
7,total,10.0,10.0,10.0,✓


## Step 18: Display Item Comparison Table

In [None]:
def _item_table_columns(item, source):
    """Return the Name/ID/Price table cells for one side of an item match.

    `source` is the label used in the column headers ('PDF' or 'Image'). An
    empty or missing item yields 'N/A' in all three cells. A price that is
    missing or None is shown as $0.00 rather than raising TypeError in ':.2f'.
    """
    if not item:
        return {
            f'Name ({source})': 'N/A',
            f'ID ({source})': 'N/A',
            f'Price ({source})': 'N/A',
        }
    return {
        f'Name ({source})': item.get('name', 'N/A'),
        f'ID ({source})': item.get('product_id', 'N/A'),
        f'Price ({source})': f"${(item.get('total_price') or 0):.2f}",
    }


if results['match_result'].get('item_comparison'):
    print("\n" + "="*70)
    print("ITEM COMPARISON TABLE")
    print("="*70)

    # Dict insertion order fixes the column order: PDF cells, image cells, Match.
    item_comparison_data = [
        {
            **_item_table_columns(match.get('pdf_item'), 'PDF'),
            **_item_table_columns(match.get('image_item'), 'Image'),
            'Match': '✓' if match['matched'] else '✗',
        }
        for match in results['match_result']['item_comparison']['details']
    ]

    # Passing the columns explicitly keeps the header row visible even when
    # there are no item rows to show.
    item_columns = ['Name (PDF)', 'ID (PDF)', 'Price (PDF)',
                    'Name (Image)', 'ID (Image)', 'Price (Image)', 'Match']
    item_df = pd.DataFrame(item_comparison_data, columns=item_columns)
    display(item_df)


ITEM COMPARISON TABLE
