In [27]:
!pip install PyMuPDF pytesseract pandas numpy scikit-learn python-dateutil pillow
!pip install pytesseract
!pip install python-dateutil




[notice] A new release of pip is available: 24.1.2 -> 24.2
[notice] To update, run: python.exe -m pip install --upgrade pip





[notice] A new release of pip is available: 24.1.2 -> 24.2
[notice] To update, run: python.exe -m pip install --upgrade pip





[notice] A new release of pip is available: 24.1.2 -> 24.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [71]:


import os
import pytesseract
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from dateutil import parser
import re
import fitz  # PyMuPDF
import io
from PIL import Image
from tabulate import tabulate
import locale

# Convert PDF to image
def pdf_to_image(pdf_path):
    doc = fitz.open(pdf_path)
    page = doc.load_page(0)  # Load the first page
    pix = page.get_pixmap()
    img_data = pix.tobytes("png")
    return Image.open(io.BytesIO(img_data))

# Set Tesseract path and data directory
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'
os.environ['TESSDATA_PREFIX'] = r'C:\Program Files\Tesseract-OCR\tessdata'

def extract_invoice_details(pdf_path):
    locale.setlocale(locale.LC_TIME, 'de_DE.UTF-8')

    img = pdf_to_image(pdf_path)
    text = pytesseract.image_to_string(img, lang='deu')
    
    # Extract invoice number
    invoice_number_patterns = [
        r'(?i)rechnung(?:s-?)?(?:nummer|nr\.?)?\s*[:.]?\s*(\w+[-\w]*)',
        r'(?i)rechnungs-?id\s*[:.]?\s*(\w+[-\w]*)',
        r'(?i)beleg-?nr\.?\s*[:.]?\s*(\w+[-\w]*)',
        r'(?i)nummer\s*[:.]?\s*(\w+[-\w]*)',
        r'(?i)nr\.?\s*[:.]?\s*(\w+[-\w]*)'
    ]
    
    invoice_number = None
    for pattern in invoice_number_patterns:
        match = re.search(pattern, text)
        if match:
            invoice_number = match.group(1)
            break

    # Extract date
    date_patterns = [
        r'(?i)(?:rechnungs)?datum\s*[:.]?\s*(\d{1,2}\.?\s*\d{1,2}\.?\s*\d{2,4})',
        r'(?i)datum\s*[:.]?\s*(\d{1,2}\.\s*[a-zä]+\s*\d{4})',
        r'(?i)(\d{1,2}\.\d{1,2}\.\d{2,4})'
    ]
    
    date = None
    for pattern in date_patterns:
        match = re.search(pattern, text)
        if match:
            try:
                date_str = match.group(1)
                parsed_date = parser.parse(date_str, dayfirst=True)
                date = parsed_date.strftime('%Y-%m-%d')
                break
            except ValueError:
                continue

    # Extract total amount
    total_patterns = [
        r'(?i)gesamtbetrag\s*[:.]?\s*(\d{1,3}(?:\.?\d{3})*(?:,\d{2})?)',
        r'(?i)summe\s*[:.]?\s*(\d{1,3}(?:\.?\d{3})*(?:,\d{2})?)',
        r'(?i)zu\s*zahlen\s*[:.]?\s*(\d{1,3}(?:\.?\d{3})*(?:,\d{2})?)'
    ]

    total = None
    for pattern in total_patterns:
        match = re.search(pattern, text)
        if match:
            total = match.group(1)
            break

    # Extract company name
    company_pattern = r'^([A-Z][a-zäöüß]+(?:\s+[A-Z][a-zäöüß]+)*(?:\s+GmbH|AG|KG|OHG)?)'
    company_match = re.search(company_pattern, text, re.MULTILINE)
    company = company_match.group(1) if company_match else ''

    # Extract items
    items = re.findall(r'(\d+)\s+(.*?)\s+([\d.,]+)\s+(?:[\d.,]+)\s+([\d.,]+)', text)

    return {
        'invoice_number': invoice_number if invoice_number else '',
        'date': date if date else '',
        'total': total if total else '',
        'company': company,
        'items': items
    }

def calculate_similarity(details1, details2):
    text1 = f"{details1['invoice_number']} {details1['date']} {details1['total']} {details1['company']} {' '.join([item[1] for item in details1['items']])}"
    text2 = f"{details2['invoice_number']} {details2['date']} {details2['total']} {details2['company']} {' '.join([item[1] for item in details2['items']])}"
    
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform([text1, text2])
    return cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])[0][0]

def process_invoices(invoice_dir):
    results = []
    for filename in os.listdir(invoice_dir):
        if filename.lower().endswith('.pdf'):
            file_path = os.path.join(invoice_dir, filename)
            try:
                details = extract_invoice_details(file_path)
                results.append({'filename': filename, 'details': details})
            except Exception as e:
                print(f"Error processing {filename}: {str(e)}")
    return results

def find_similar_invoices(test_invoice, database_invoices):
    test_details = test_invoice['details']
    similarities = []
    
    for db_invoice in database_invoices:
        db_details = db_invoice['details']
        text_sim = calculate_similarity(test_details, db_details)
        similarities.append((db_invoice['filename'], text_sim, db_details))
    
    similarities.sort(key=lambda x: x[1], reverse=True)
    return similarities[:5]  # Return top 5 similar invoices

def display_invoice_comparison(test_invoice, similar_invoices):
    headers = ["Field", test_invoice['filename']] + [sim[0] for sim in similar_invoices]
    
    table_data = []
    fields = ['invoice_number', 'date', 'total', 'company']
    
    for field in fields:
        row = [field.capitalize(), test_invoice['details'][field]]
        for sim in similar_invoices:
            row.append(sim[2][field])
        table_data.append(row)
    
    print(tabulate(table_data, headers=headers, tablefmt="grid"))
    
    # Display items separately
    print("\nItems:")
    for i, invoice in enumerate([test_invoice] + similar_invoices):
        print(f"\n{headers[i]}:")
        items = invoice['details']['items'] if i == 0 else invoice[2]['items']
        if items:
            item_headers = ["Quantity", "Description", "Price", "Total"]
            print(tabulate(items, headers=item_headers, tablefmt="grid"))
        else:
            print("No items found.")

def calculate_field_accuracy(test_value, similar_value):
    if not test_value and not similar_value:
        return 1.0  # Both empty, consider it a match
    if not test_value or not similar_value:
        return 0.0  # One is empty, the other isn't
    
    # For numeric values (assuming they're for 'total')
    if test_value.replace(',', '').replace('.', '').isdigit() and similar_value.replace(',', '').replace('.', '').isdigit():
        test_num = float(test_value.replace(',', ''))
        similar_num = float(similar_value.replace(',', ''))
        diff = abs(test_num - similar_num)
        max_val = max(test_num, similar_num)
        return 1 - (diff / max_val) if max_val != 0 else 1
    
    # For string values
    return 1 - (sum(c1 != c2 for c1, c2 in zip(test_value, similar_value)) + abs(len(test_value) - len(similar_value))) / max(len(test_value), len(similar_value))

def generate_accuracy_report(test_invoices, train_invoices):
    overall_accuracy = []
    field_accuracies = {field: [] for field in ['invoice_number', 'date', 'total', 'company']}
    
    for test_invoice in test_invoices:
        similar_invoices = find_similar_invoices(test_invoice, train_invoices)
        most_similar = similar_invoices[0]  # Take the most similar invoice
        
        invoice_accuracy = []
        for field in field_accuracies.keys():
            accuracy = calculate_field_accuracy(test_invoice['details'][field], most_similar[2][field])
            field_accuracies[field].append(accuracy)
            invoice_accuracy.append(accuracy)
        
        overall_accuracy.append(np.mean(invoice_accuracy))
    
    report = {
        "Overall Accuracy": np.mean(overall_accuracy),
        "Field Accuracies": {field: np.mean(accuracies) for field, accuracies in field_accuracies.items()}
    }
    
    return report

# Main program
train_dir = r"C:\Users\HP\Desktop\invoice project\document similarity\train"
test_dir = r"C:\Users\HP\Desktop\invoice project\document similarity\test"

print(f"Train directory: {train_dir}")
print(f"Test directory: {test_dir}")
print(f"Files in train directory: {os.listdir(train_dir)}")
print(f"Files in test directory: {os.listdir(test_dir)}")

try:
    # Process training invoices
    train_invoices = process_invoices(train_dir)
    print(f"\nProcessed {len(train_invoices)} training invoices")

    # Process test invoices
    test_invoices = process_invoices(test_dir)
    print(f"\nProcessed {len(test_invoices)} test invoices")

    # Generate accuracy report
    accuracy_report = generate_accuracy_report(test_invoices, train_invoices)

    # Print accuracy report
    print("\nAccuracy Report:")
    accuracy_table = [
        ["Metric", "Accuracy"],
        ["Overall Accuracy", f"{accuracy_report['Overall Accuracy']:.2%}"],
        ["Invoice Number", f"{accuracy_report['Field Accuracies']['invoice_number']:.2%}"],
        ["Date", f"{accuracy_report['Field Accuracies']['date']:.2%}"],
        ["Total", f"{accuracy_report['Field Accuracies']['total']:.2%}"],
        ["Company", f"{accuracy_report['Field Accuracies']['company']:.2%}"]
    ]
    print(tabulate(accuracy_table, headers="firstrow", tablefmt="grid"))

    # Find similar invoices for each test invoice and display comparison
    for test_invoice in test_invoices:
        similar_invoices = find_similar_invoices(test_invoice, train_invoices)
        print(f"\nTest Invoice: {test_invoice['filename']}")
        print("Similar Invoices:")
        display_invoice_comparison(test_invoice, similar_invoices)
        print("\n" + "="*80 + "\n")

    # Save results to CSV
    results = []
    for test_invoice in test_invoices:
        similar_invoices = find_similar_invoices(test_invoice, train_invoices)
        results.append({
            'test_invoice': test_invoice['filename'],
            'similar_invoices': [(sim[0], sim[1]) for sim in similar_invoices]
        })
    df = pd.DataFrame(results)
    df.to_csv(r"C:\Users\HP\Desktop\invoice project\document similarity\similarity_results.csv", index=False)
    print("\nResults saved to similarity_results.csv")

except Exception as e:
    print(f"An error occurred: {str(e)}")


Train directory: C:\Users\HP\Desktop\invoice project\document similarity\train
Test directory: C:\Users\HP\Desktop\invoice project\document similarity\test
Files in train directory: ['2024.03.15_0954.pdf', '2024.03.15_1145.pdf', 'Faller_8.PDF', 'invoice_102856.pdf', 'invoice_77073.pdf', 'Rechnungsvorlage_mit_Umsatzsteuer_2023.pdf', 'Rechnungsvorlage_ohne_Umsatzsteuer_KU_2023.pdf', 'sample.pdf']
Files in test directory: ['invoice_102857.pdf', 'invoice_77098.pdf']

Processed 8 training invoices

Processed 2 test invoices

Accuracy Report:
+------------------+------------+
| Metric           | Accuracy   |
| Overall Accuracy | 97.50%     |
+------------------+------------+
| Invoice Number   | 100.00%    |
+------------------+------------+
| Date             | 90.00%     |
+------------------+------------+
| Total            | 100.00%    |
+------------------+------------+
| Company          | 100.00%    |
+------------------+------------+

Test Invoice: invoice_102857.pdf
Similar Invoice

In [67]:
import os
import pytesseract
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from dateutil import parser
import re
import fitz  # PyMuPDF
import io
from PIL import Image
from tabulate import tabulate

def pdf_to_image(pdf_path):
    doc = fitz.open(pdf_path)
    page = doc.load_page(0)  # Load the first page
    pix = page.get_pixmap()
    img_data = pix.tobytes("png")
    return Image.open(io.BytesIO(img_data))

import re
from dateutil import parser
import pytesseract
from PIL import Image
import os

# Set Tesseract path and data directory
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'  # Adjust this path if necessary
os.environ['TESSDATA_PREFIX'] = r'C:\Program Files\Tesseract-OCR\tessdata' 
def extract_invoice_details(pdf_path):
    # Set locale to German for date parsing
    locale.setlocale(locale.LC_TIME, 'de_DE.UTF-8')

    # Convert PDF to image
    img = pdf_to_image(pdf_path)
    
    # Perform OCR using Tesseract with German language
    text = pytesseract.image_to_string(img, lang='deu')
    
    print("Extracted text:")
    print(text)
    print("=" * 50)

    # Extract invoice number
    invoice_number_patterns = [
        r'(?i)rechnung(?:s-?)?(?:nummer|nr\.?)?\s*[:.]?\s*(\w+[-\w]*)',
        r'(?i)rechnungs-?id\s*[:.]?\s*(\w+[-\w]*)',
        r'(?i)beleg-?nr\.?\s*[:.]?\s*(\w+[-\w]*)',
        r'(?i)nummer\s*[:.]?\s*(\w+[-\w]*)',
        r'(?i)nr\.?\s*[:.]?\s*(\w+[-\w]*)'
    ]
    
    invoice_number = None
    for pattern in invoice_number_patterns:
        match = re.search(pattern, text)
        if match:
            invoice_number = match.group(1)
            print(f"Invoice number found: {invoice_number} (using pattern: {pattern})")
            break
        else:
            print(f"No match found for pattern: {pattern}")
    
    if not invoice_number:
        print("WARNING: No invoice number found.")

    # Extract date
    date_patterns = [
        r'(?i)(?:rechnungs)?datum\s*[:.]?\s*(\d{1,2}\.?\s*\d{1,2}\.?\s*\d{2,4})',
        r'(?i)datum\s*[:.]?\s*(\d{1,2}\.\s*[a-zä]+\s*\d{4})',
        r'(?i)(\d{1,2}\.\d{1,2}\.\d{2,4})'
    ]
    
    date = None
    for pattern in date_patterns:
        match = re.search(pattern, text)
        if match:
            try:
                date_str = match.group(1)
                # Try parsing with dateutil
                parsed_date = parser.parse(date_str, dayfirst=True)
                date = parsed_date.strftime('%Y-%m-%d')
                print(f"Date found: {date} (using pattern: {pattern})")
                break
            except ValueError:
                print(f"Failed to parse date: {date_str}")
                continue
    
    if not date:
        print("WARNING: No date found.")

    # Extract total amount
    total_patterns = [
        r'(?i)gesamtbetrag\s*[:.]?\s*(\d{1,3}(?:\.?\d{3})*(?:,\d{2})?)',
        r'(?i)summe\s*[:.]?\s*(\d{1,3}(?:\.?\d{3})*(?:,\d{2})?)',
        r'(?i)zu\s*zahlen\s*[:.]?\s*(\d{1,3}(?:\.?\d{3})*(?:,\d{2})?)'
    ]

    total = None
    for pattern in total_patterns:
        match = re.search(pattern, text)
        if match:
            total = match.group(1)
            print(f"Total amount found: {total} (using pattern: {pattern})")
            break
    
    if not total:
        print("WARNING: No total amount found.")

    # Extract company name (adjust as needed for German company names)
    company_pattern = r'^([A-Z][a-zäöüß]+(?:\s+[A-Z][a-zäöüß]+)*(?:\s+GmbH|AG|KG|OHG)?)'
    company_match = re.search(company_pattern, text, re.MULTILINE)
    company = company_match.group(1) if company_match else ''

    # Extract items (adjust as needed for German invoice formats)
    items = re.findall(r'(\d+)\s+(.*?)\s+([\d.,]+)\s+(?:[\d.,]+)\s+([\d.,]+)', text)

    return {
        'invoice_number': invoice_number if invoice_number else '',
        'date': date if date else '',
        'total': total if total else '',
        'company': company,
        'items': items
    }
def calculate_similarity(details1, details2):
    # Calculate similarity based on extracted details
    text1 = f"{details1['invoice_number']} {details1['date']} {details1['total']} {details1['company']} {' '.join([item[1] for item in details1['items']])}"
    text2 = f"{details2['invoice_number']} {details2['date']} {details2['total']} {details2['company']} {' '.join([item[1] for item in details2['items']])}"
    
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform([text1, text2])
    return cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])[0][0]

def process_invoices(invoice_dir):
    results = []
    print(f"Processing directory: {invoice_dir}")
    for filename in os.listdir(invoice_dir):
        print(f"Checking file: {filename}")
        if filename.lower().endswith('.pdf'):
            file_path = os.path.join(invoice_dir, filename)
            print(f"Processing file: {file_path}")
            try:
                details = extract_invoice_details(file_path)
                results.append({'filename': filename, 'details': details})
            except Exception as e:
                print(f"Error processing {filename}: {str(e)}")
        else:
            print(f"Skipping file {filename} - not a PDF")
    return results

def find_similar_invoices(test_invoice, database_invoices):
    test_details = test_invoice['details']
    similarities = []
    
    for db_invoice in database_invoices:
        db_details = db_invoice['details']
        text_sim = calculate_similarity(test_details, db_details)
        similarities.append((db_invoice['filename'], text_sim, db_details))
    
    similarities.sort(key=lambda x: x[1], reverse=True)
    return similarities[:5]  # Return top 5 similar invoices

def display_invoice_comparison(test_invoice, similar_invoices):
    headers = ["Field", test_invoice['filename']] + [sim[0] for sim in similar_invoices]
    
    table_data = []
    fields = ['invoice_number', 'date', 'total', 'company']
    
    for field in fields:
        row = [field.capitalize(), test_invoice['details'][field]]
        for sim in similar_invoices:
            row.append(sim[2][field])
        table_data.append(row)
    
    print(tabulate(table_data, headers=headers, tablefmt="grid"))
    
    # Display items separately
    print("\nItems:")
    for i, invoice in enumerate([test_invoice] + similar_invoices):
        print(f"\n{headers[i]}:")
        items = invoice['details']['items'] if i == 0 else invoice[2]['items']
        if items:
            item_headers = ["Quantity", "Description", "Price", "Total"]
            print(tabulate(items, headers=item_headers, tablefmt="grid"))
        else:
            print("No items found.")

print(f"Processed {len(test_invoices)} test invoices")


def calculate_field_accuracy(test_value, similar_value):
    if not test_value and not similar_value:
        return 1.0  # Both empty, consider it a match
    if not test_value or not similar_value:
        return 0.0  # One is empty, the other isn't
    
    # For numeric values (assuming they're for 'total')
    if test_value.replace(',', '').replace('.', '').isdigit() and similar_value.replace(',', '').replace('.', '').isdigit():
        test_num = float(test_value.replace(',', ''))
        similar_num = float(similar_value.replace(',', ''))
        diff = abs(test_num - similar_num)
        max_val = max(test_num, similar_num)
        return 1 - (diff / max_val) if max_val != 0 else 1
    
    # For string values
    return 1 - (sum(c1 != c2 for c1, c2 in zip(test_value, similar_value)) + abs(len(test_value) - len(similar_value))) / max(len(test_value), len(similar_value))

def generate_accuracy_report(test_invoices, train_invoices):
    overall_accuracy = []
    field_accuracies = {field: [] for field in ['invoice_number', 'date', 'total', 'company']}
    
    for test_invoice in test_invoices:
        similar_invoices = find_similar_invoices(test_invoice, train_invoices)
        most_similar = similar_invoices[0]  # Take the most similar invoice
        
        invoice_accuracy = []
        for field in field_accuracies.keys():
            accuracy = calculate_field_accuracy(test_invoice['details'][field], most_similar[2][field])
            field_accuracies[field].append(accuracy)
            invoice_accuracy.append(accuracy)
        
        overall_accuracy.append(np.mean(invoice_accuracy))
    
    report = {
        "Overall Accuracy": np.mean(overall_accuracy),
        "Field Accuracies": {field: np.mean(accuracies) for field, accuracies in field_accuracies.items()}
    }
    
    return report

# Main program
train_dir = r"C:\Users\HP\Desktop\invoice project\document similarity\train"
test_dir = r"C:\Users\HP\Desktop\invoice project\document similarity\test"

print(f"Train directory: {train_dir}")
print(f"Test directory: {test_dir}")
print(f"Files in train directory: {os.listdir(train_dir)}")
print(f"Files in test directory: {os.listdir(test_dir)}")

try:
    # Process training invoices
    train_invoices = process_invoices(train_dir)
    print(f"Processed {len(train_invoices)} training invoices")

    # Process test invoices
    test_invoices = process_invoices(test_dir)
    print(f"Processed {len(test_invoices)} test invoices")

    # Generate accuracy report
    accuracy_report = generate_accuracy_report(test_invoices, train_invoices)

    # Print accuracy report
    print("\nAccuracy Report:")
    accuracy_table = [
        ["Metric", "Accuracy"],
        ["Overall Accuracy", f"{accuracy_report['Overall Accuracy']:.2%}"],
        ["Invoice Number", f"{accuracy_report['Field Accuracies']['invoice_number']:.2%}"],
        ["Date", f"{accuracy_report['Field Accuracies']['date']:.2%}"],
        ["Total", f"{accuracy_report['Field Accuracies']['total']:.2%}"],
        ["Company", f"{accuracy_report['Field Accuracies']['company']:.2%}"]
    ]
    print(tabulate(accuracy_table, headers="firstrow", tablefmt="grid"))

    # Find similar invoices for each test invoice and display comparison
    for test_invoice in test_invoices:
        similar_invoices = find_similar_invoices(test_invoice, train_invoices)
        print(f"\nTest Invoice: {test_invoice['filename']}")
        print("Similar Invoices:")
        display_invoice_comparison(test_invoice, similar_invoices)
        print("\n" + "="*80 + "\n")

    # Save results to CSV
    results = []
    for test_invoice in test_invoices:
        similar_invoices = find_similar_invoices(test_invoice, train_invoices)
        results.append({
            'test_invoice': test_invoice['filename'],
            'similar_invoices': [(sim[0], sim[1]) for sim in similar_invoices]
        })
    df = pd.DataFrame(results)
    df.to_csv(r"C:\Users\HP\Desktop\invoice project\document similarity\similarity_results.csv", index=False)
    print("Results saved to similarity_results.csv")

except Exception as e:
    print(f"An error occurred: {str(e)}")


Processed 2 test invoices
Train directory: C:\Users\HP\Desktop\invoice project\document similarity\train
Test directory: C:\Users\HP\Desktop\invoice project\document similarity\test
Files in train directory: ['2024.03.15_0954.pdf', '2024.03.15_1145.pdf', 'Faller_8.PDF', 'invoice_102856.pdf', 'invoice_77073.pdf', 'Rechnungsvorlage_mit_Umsatzsteuer_2023.pdf', 'Rechnungsvorlage_ohne_Umsatzsteuer_KU_2023.pdf', 'sample.pdf']
Files in test directory: ['invoice_102857.pdf', 'invoice_77098.pdf']
Processing directory: C:\Users\HP\Desktop\invoice project\document similarity\train
Checking file: 2024.03.15_0954.pdf
Processing file: C:\Users\HP\Desktop\invoice project\document similarity\train\2024.03.15_0954.pdf
Extracted text:
Bremer Spirituosen Contor GmbH
Glseia-Müler-WoHt-Straßie 7

Klein Markenvertrieb GmbH

Nümberger Str. 39

28197 Bromen 90562 Heroldeberg
Tel.:0011.95 65 520

Auftragsübersicht Fax: 001105 65 5 19

Lieteransehrift:

r 005 Tr Ondusire) 21430

S0 4388096533968

Tegenaburger S

Tesseract version: 5.4.0.20240606
Current Tesseract command: C:\Program Files\Tesseract-OCR\tesseract.exe
Current TESSDATA_PREFIX: C:\Program Files\Tesseract-OCR\tessdata
Available languages: ['eng', 'osd']


In [63]:
import os
import pytesseract
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from dateutil import parser
import re
import fitz  # PyMuPDF
import io
from PIL import Image
from tabulate import tabulate

# Tesseract configuration
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'
os.environ['TESSDATA_PREFIX'] = r'C:\Program Files\Tesseract-OCR\tessdata'

def pdf_to_image(pdf_path):
    doc = fitz.open(pdf_path)
    page = doc.load_page(0)
    pix = page.get_pixmap()
    img_data = pix.tobytes("png")
    return Image.open(io.BytesIO(img_data))

def extract_invoice_details(pdf_path):
    img = pdf_to_image(pdf_path)
    text = pytesseract.image_to_string(img, lang='deu')

    invoice_number = extract_pattern(text, [
        r'(?i)rechnung(?:s-?)?(?:nummer|nr\.?)?\s*[:.]?\s*(\w+[-\w]*)',
        r'(?i)rechnungs-?id\s*[:.]?\s*(\w+[-\w]*)',
        r'(?i)beleg-?nr\.?\s*[:.]?\s*(\w+[-\w]*)',
        r'(?i)nummer\s*[:.]?\s*(\w+[-\w]*)',
        r'(?i)nr\.?\s*[:.]?\s*(\w+[-\w]*)'
    ])
    
    date = extract_pattern(text, [
        r'(?i)(?:rechnungs)?datum\s*[:.]?\s*(\d{1,2}\.?\s*\d{1,2}\.?\s*\d{2,4})',
        r'(?i)datum\s*[:.]?\s*(\d{1,2}\.\s*[a-zä]+\s*\d{4})',
        r'(?i)(\d{1,2}\.\d{1,2}\.\d{2,4})'
    ], date=True)
    
    total = extract_pattern(text, [
        r'(?i)gesamtbetrag\s*[:.]?\s*(\d{1,3}(?:\.?\d{3})*(?:,\d{2})?)',
        r'(?i)summe\s*[:.]?\s*(\d{1,3}(?:\.?\d{3})*(?:,\d{2})?)',
        r'(?i)zu\s*zahlen\s*[:.]?\s*(\d{1,3}(?:\.?\d{3})*(?:,\d{2})?)'
    ])
    
    company = extract_pattern(text, [r'^([A-Z][a-zäöüß]+(?:\s+[A-Z][a-zäöüß]+)*(?:\s+GmbH|AG|KG|OHG)?)'], company=True)
    items = re.findall(r'(\d+)\s+(.*?)\s+([\d.,]+)\s+(?:[\d.,]+)\s+([\d.,]+)', text)

    return {
        'invoice_number': invoice_number,
        'date': date,
        'total': total,
        'company': company,
        'items': items
    }

def extract_pattern(text, patterns, date=False, company=False):
    for pattern in patterns:
        match = re.search(pattern, text)
        if match:
            if date:
                try:
                    date_str = match.group(1)
                    parsed_date = parser.parse(date_str, dayfirst=True)
                    return parsed_date.strftime('%Y-%m-%d')
                except ValueError:
                    continue
            return match.group(1)
    return ''

def calculate_similarity(details1, details2):
    text1 = f"{details1['invoice_number']} {details1['date']} {details1['total']} {details1['company']} {' '.join([item[1] for item in details1['items']])}"
    text2 = f"{details2['invoice_number']} {details2['date']} {details2['total']} {details2['company']} {' '.join([item[1] for item in details2['items']])}"
    
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform([text1, text2])
    return cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])[0][0]

def process_invoices(invoice_dir):
    results = []
    for filename in os.listdir(invoice_dir):
        if filename.lower().endswith('.pdf'):
            file_path = os.path.join(invoice_dir, filename)
            try:
                details = extract_invoice_details(file_path)
                results.append({'filename': filename, 'details': details})
            except Exception as e:
                print(f"Error processing {filename}: {str(e)}")
    return results

def find_similar_invoices(test_invoice, database_invoices):
    test_details = test_invoice['details']
    similarities = []
    
    for db_invoice in database_invoices:
        db_details = db_invoice['details']
        text_sim = calculate_similarity(test_details, db_details)
        similarities.append((db_invoice['filename'], text_sim, db_details))
    
    similarities.sort(key=lambda x: x[1], reverse=True)
    return similarities[:5]  # Return top 5 similar invoices

def display_invoice_comparison(test_invoice, similar_invoices):
    headers = ["Field", test_invoice['filename']] + [sim[0] for sim in similar_invoices]
    
    table_data = []
    fields = ['invoice_number', 'date', 'total', 'company']
    
    for field in fields:
        row = [field.capitalize(), test_invoice['details'][field]]
        for sim in similar_invoices:
            row.append(sim[2][field])
        table_data.append(row)
    
    print("\nInvoice Comparison:")
    print(tabulate(table_data, headers=headers, tablefmt="grid"))
    
    print("\nItems:")
    for i, invoice in enumerate([test_invoice] + similar_invoices):
        print(f"\n{headers[i]}:")
        items = invoice['details']['items'] if i == 0 else invoice[2]['items']
        if items:
            item_headers = ["Quantity", "Description", "Price", "Total"]
            print(tabulate(items, headers=item_headers, tablefmt="grid"))
        else:
            print("No items found.")

def calculate_field_accuracy(test_value, similar_value):
    if not test_value and not similar_value:
        return 1.0
    if not test_value or not similar_value:
        return 0.0
    
    if test_value.replace(',', '').replace('.', '').isdigit() and similar_value.replace(',', '').replace('.', '').isdigit():
        test_num = float(test_value.replace(',', ''))
        similar_num = float(similar_value.replace(',', ''))
        diff = abs(test_num - similar_num)
        max_val = max(test_num, similar_num)
        return 1 - (diff / max_val) if max_val != 0 else 1
    
    return 1 - (sum(c1 != c2 for c1, c2 in zip(test_value, similar_value)) + abs(len(test_value) - len(similar_value))) / max(len(test_value), len(similar_value))

def generate_accuracy_report(test_invoices, train_invoices):
    overall_accuracy = []
    field_accuracies = {field: [] for field in ['invoice_number', 'date', 'total', 'company']}
    
    for test_invoice in test_invoices:
        similar_invoices = find_similar_invoices(test_invoice, train_invoices)
        if not similar_invoices:
            continue
        most_similar = similar_invoices[0]
        
        invoice_accuracy = []
        for field in field_accuracies.keys():
            accuracy = calculate_field_accuracy(test_invoice['details'][field], most_similar[2][field])
            field_accuracies[field].append(accuracy)
            invoice_accuracy.append(accuracy)
        
        overall_accuracy.append(np.mean(invoice_accuracy))
    
    return {
        "Overall Accuracy": np.mean(overall_accuracy),
        "Field Accuracies": {field: np.mean(accuracies) for field, accuracies in field_accuracies.items()}
    }

# Main program
train_dir = r"C:\Users\HP\Desktop\invoice project\document similarity\train"
test_dir = r"C:\Users\HP\Desktop\invoice project\document similarity\test"

print(f"Train directory: {train_dir}")
print(f"Test directory: {test_dir}")

try:
    # Process training invoices
    train_invoices = process_invoices(train_dir)
    print(f"Processed {len(train_invoices)} training invoices")

    # Process test invoices
    test_invoices = process_invoices(test_dir)
    print(f"Processed {len(test_invoices)} test invoices")

    # Generate accuracy report
    accuracy_report = generate_accuracy_report(test_invoices, train_invoices)

    # Print accuracy report
    print("\nAccuracy Report:")
    accuracy_table = [
        ["Metric", "Accuracy"],
        ["Overall Accuracy", f"{accuracy_report['Overall Accuracy']:.2%}"],
        ["Invoice Number", f"{accuracy_report['Field Accuracies']['invoice_number']:.2%}"],
        ["Date", f"{accuracy_report['Field Accuracies']['date']:.2%}"],
        ["Total", f"{accuracy_report['Field Accuracies']['total']:.2%}"],
        ["Company", f"{accuracy_report['Field Accuracies']['company']:.2%}"]
    ]
    print(tabulate(accuracy_table, headers="firstrow", tablefmt="grid"))

    # Find similar invoices for each test invoice and display comparison
    for test_invoice in test_invoices:
        similar_invoices = find_similar_invoices(test_invoice, train_invoices)
        print(f"\nTest Invoice: {test_invoice['filename']}")
        print("Similar Invoices:")
        display_invoice_comparison(test_invoice, similar_invoices)
        print("\n" + "="*80 + "\n")

except Exception as e:
    print(f"An error occurred: {str(e)}")


Train directory: C:\Users\HP\Desktop\invoice project\document similarity\train
Test directory: C:\Users\HP\Desktop\invoice project\document similarity\test
Processed 8 training invoices
Processed 2 test invoices

Accuracy Report:
+------------------+------------+
| Metric           | Accuracy   |
| Overall Accuracy | 97.50%     |
+------------------+------------+
| Invoice Number   | 100.00%    |
+------------------+------------+
| Date             | 90.00%     |
+------------------+------------+
| Total            | 100.00%    |
+------------------+------------+
| Company          | 100.00%    |
+------------------+------------+

Test Invoice: invoice_102857.pdf
Similar Invoices:

Invoice Comparison:
+----------------+----------------------+--------------+----------------------+---------------------+----------------+----------------------------------------------+
| Field          | invoice_102857.pdf   | sample.pdf   | invoice_102856.pdf   | invoice_77073.pdf   | Faller_8.PDF   | Rech

Train directory: C:\Users\HP\Desktop\invoice project\document similarity\train
Test directory: C:\Users\HP\Desktop\invoice project\document similarity\test
Files in train directory: ['2024.03.15_0954.pdf', '2024.03.15_1145.pdf', 'Faller_8.PDF', 'invoice_102856.pdf', 'invoice_77073.pdf', 'Rechnungsvorlage_mit_Umsatzsteuer_2023.pdf', 'Rechnungsvorlage_ohne_Umsatzsteuer_KU_2023.pdf', 'sample.pdf']
Files in test directory: ['invoice_102857.pdf', 'invoice_77098.pdf']

Processed 8 training invoices

Processed 2 test invoices

Accuracy Report:
+------------------+------------+
| Metric           | Accuracy   |
| Overall Accuracy | 97.50%     |
+------------------+------------+
| Invoice Number   | 100.00%    |
+------------------+------------+
| Date             | 90.00%     |
+------------------+------------+
| Total            | 100.00%    |
+------------------+------------+
| Company          | 100.00%    |
+------------------+------------+

Test Invoice: invoice_102857.pdf
Similar Invoice