In [1]:
import re

def extract_invoice_data(invoice_text):
    # Extract relevant data using regular expressions
    invoice_number = re.search(r'Invoice Number: (\S+)', invoice_text).group(1)
    invoice_date = re.search(r'Invoice Date: (\S+)', invoice_text).group(1)
    total_amount = float(re.search(r'Total Amount: \$(\S+)', invoice_text).group(1))
    recipient_name = re.search(r'To: (.+?) GSTIN', invoice_text).group(1).strip()
    recipient_gstin = re.search(r'GSTIN: (\S+)', invoice_text).group(1)
    supplier_gstin = re.search(r'Supplier GSTIN: (\S+)', invoice_text).group(1)

    # Extract items using regex
    item_lines = re.findall(r'(\d+)\s+([\w\s]+)\s+\$(\S+)\s+(\d+)\s+\$(\S+)', invoice_text)
    items = []
    
    for line in item_lines:
        item_number, description, price, quantity, total = line
        items.append({
            'item_number': int(item_number),
            'description': description.strip(),
            'price': float(price),
            'quantity': int(quantity),
            'total': float(total)
        })
    
    # Generate structured outputs
    t1 = {
        'invoice_number': invoice_number,
        'invoice_date': invoice_date,
        'total_amount': total_amount
    }

    t2 = {
        'recipient': {
            'name': recipient_name,
            'gstin': recipient_gstin
        },
        'supplier': {
            'gstin': supplier_gstin
        }
    }

    t3 = items  # Items list

    t4 = {
        'invoice': {
            'number': invoice_number,
            'date': invoice_date,
            'total_amount': total_amount,
            'recipient': {
                'name': recipient_name,
                'gstin': recipient_gstin
            },
            'supplier': {
                'gstin': supplier_gstin
            },
            'items': items
        }
    }

    t5 = {
        'item_summary': {
            'total_items': len(items),
            'total_amount': total_amount,
            'item_details': [{'item_number': item['item_number'], 'total': item['total']} for item in items]
        }
    }

    return t1, t2, t3, t4, t5

# Sample invoice text
invoice_text = """Schmidt Inc INVOICE
Invoice Number: INV1360-C
Invoice Date: 13.08.2023
To: Green-Robinson GSTIN: 14NbnUy622853Z6
SL. Item Description Price Qty. Total
1 Integrated coherent algorithm $266.38 2 $532.76
2 Extended 24/7 architecture $766.59 2 $1533.18
3 Virtual radical budgetary management $686.1 5 $3430.5
Total Amount: $5729.6
Supplier GSTIN: 03rmlrM815749Z9"""

# Extract and print the structured data
t1, t2, t3, t4, t5 = extract_invoice_data(invoice_text)

print("t1:", t1)
print("t2:", t2)
print("t3:", t3)
print("t4:", t4)
print("t5:", t5)


t1: {'invoice_number': 'INV1360-C', 'invoice_date': '13.08.2023', 'total_amount': 5729.6}
t2: {'recipient': {'name': 'Green-Robinson', 'gstin': '14NbnUy622853Z6'}, 'supplier': {'gstin': '03rmlrM815749Z9'}}
t3: [{'item_number': 1, 'description': 'Integrated coherent algorithm', 'price': 266.38, 'quantity': 2, 'total': 532.76}, {'item_number': 7, 'description': 'architecture', 'price': 766.59, 'quantity': 2, 'total': 1533.18}, {'item_number': 3, 'description': 'Virtual radical budgetary management', 'price': 686.1, 'quantity': 5, 'total': 3430.5}]
t4: {'invoice': {'number': 'INV1360-C', 'date': '13.08.2023', 'total_amount': 5729.6, 'recipient': {'name': 'Green-Robinson', 'gstin': '14NbnUy622853Z6'}, 'supplier': {'gstin': '03rmlrM815749Z9'}, 'items': [{'item_number': 1, 'description': 'Integrated coherent algorithm', 'price': 266.38, 'quantity': 2, 'total': 532.76}, {'item_number': 7, 'description': 'architecture', 'price': 766.59, 'quantity': 2, 'total': 1533.18}, {'item_number': 3, 'des

In [1]:
import re

def extract_invoice_data(invoice_text):
    # Use regular expressions to extract relevant data
    invoice_number = re.search(r'Invoice Number: (\S+)', invoice_text).group(1)
    invoice_date = re.search(r'Invoice Date: (\S+)', invoice_text).group(1)
    total_amount = float(re.search(r'Total Amount: \$(\S+)', invoice_text).group(1))
    recipient_name = re.search(r'To: (.+?) GSTIN', invoice_text).group(1).strip()
    recipient_gstin = re.search(r'GSTIN: (\S+)', invoice_text).group(1)
    supplier_gstin = re.search(r'Supplier GSTIN: (\S+)', invoice_text).group(1)

    # Extract items using a regex pattern
    item_lines = re.findall(r'(\d+)\s+([\w\s]+)\s+\$(\S+)\s+(\d+)\s+\$(\S+)', invoice_text)
    items = []
    
    for line in item_lines:
        item_number, description, price, quantity, total = line
        items.append({
            'item_number': int(item_number),
            'description': description.strip(),
            'price': float(price),
            'quantity': int(quantity),
            'total': float(total)
        })
    
    # Structure the outputs in various formats
    t1 = {
        'invoice_number': invoice_number,
        'invoice_date': invoice_date,
        'total_amount': total_amount
    }

    t2 = {
        'recipient': {
            'name': recipient_name,
            'gstin': recipient_gstin
        },
        'supplier': {
            'gstin': supplier_gstin
        }
    }

    t3 = items  # Item details

    t4 = {
        'invoice': {
            'number': invoice_number,
            'date': invoice_date,
            'total_amount': total_amount,
            'recipient': {
                'name': recipient_name,
                'gstin': recipient_gstin
            },
            'supplier': {
                'gstin': supplier_gstin
            },
            'items': items
        }
    }

    t5 = {
        'item_summary': {
            'total_items': len(items),
            'total_amount': total_amount,
            'item_details': [{'item_number': item['item_number'], 'total': item['total']} for item in items]
        }
    }

    return t1, t2, t3, t4, t5

# Example of invoice text
invoice_text = """Schmidt Inc INVOICE
Invoice Number: INV1360-C
Invoice Date: 13.08.2023
To: Green-Robinson GSTIN: 14NbnUy622853Z6
SL. Item Description Price Qty. Total
1 Integrated coherent algorithm $266.38 2 $532.76
2 Extended 24/7 architecture $766.59 2 $1533.18
3 Virtual radical budgetary management $686.1 5 $3430.5
Total Amount: $5729.6
Supplier GSTIN: 03rmlrM815749Z9"""

# Extract and print the structured data
t1, t2, t3, t4, t5 = extract_invoice_data(invoice_text)

print("t1:", t1)
print("t2:", t2)
print("t3:", t3)
print("t4:", t4)
print("t5:", t5)


t1: {'invoice_number': 'INV1360-C', 'invoice_date': '13.08.2023', 'total_amount': 5729.6}
t2: {'recipient': {'name': 'Green-Robinson', 'gstin': '14NbnUy622853Z6'}, 'supplier': {'gstin': '03rmlrM815749Z9'}}
t3: [{'item_number': 1, 'description': 'Integrated coherent algorithm', 'price': 266.38, 'quantity': 2, 'total': 532.76}, {'item_number': 7, 'description': 'architecture', 'price': 766.59, 'quantity': 2, 'total': 1533.18}, {'item_number': 3, 'description': 'Virtual radical budgetary management', 'price': 686.1, 'quantity': 5, 'total': 3430.5}]
t4: {'invoice': {'number': 'INV1360-C', 'date': '13.08.2023', 'total_amount': 5729.6, 'recipient': {'name': 'Green-Robinson', 'gstin': '14NbnUy622853Z6'}, 'supplier': {'gstin': '03rmlrM815749Z9'}, 'items': [{'item_number': 1, 'description': 'Integrated coherent algorithm', 'price': 266.38, 'quantity': 2, 'total': 532.76}, {'item_number': 7, 'description': 'architecture', 'price': 766.59, 'quantity': 2, 'total': 1533.18}, {'item_number': 3, 'des

In [4]:
import re

def extract_invoice_data(invoice_text):
    # Extract relevant data using regular expressions
    invoice_number = re.search(r'Invoice Number: (\S+)', invoice_text).group(1)
    invoice_date = re.search(r'Invoice Date: (\S+)', invoice_text).group(1)
    total_amount = float(re.search(r'Total Amount: \$(\S+)', invoice_text).group(1))
    recipient_name = re.search(r'To: (.+?) GSTIN', invoice_text).group(1).strip()
    recipient_gstin = re.search(r'GSTIN: (\S+)', invoice_text).group(1)
    supplier_gstin = re.search(r'Supplier GSTIN: (\S+)', invoice_text).group(1)

    # Extract items using regex
    item_lines = re.findall(r'(\d+)\s+([\w\s]+)\s+\$(\S+)\s+(\d+)\s+\$(\S+)', invoice_text)
    items = []
    
    for line in item_lines:
        item_number, description, price, quantity, total = line
        items.append({
            'item_number': int(item_number),
            'description': description.strip(),
            'price': float(price),
            'quantity': int(quantity),
            'total': float(total)
        })

    # Standardized format for all outputs
    result = {
        'invoice_number': invoice_number,
        'invoice_date': invoice_date,
        'total_amount': total_amount,
        'recipient_name': recipient_name,
        'recipient_gstin': recipient_gstin,
        'supplier_gstin': supplier_gstin,
        'items': [{'item_number': item['item_number'], 
                   'description': item['description'], 
                   'price': item['price'], 
                   'quantity': item['quantity'], 
                   'total': item['total']} for item in items],
        'total_items': len(items),
        'item_summary': [{'item_number': item['item_number'], 'total': item['total']} for item in items]
    }

    return result

# Sample invoice text
invoice_text = """Schmidt Inc INVOICE
Invoice Number: INV1360-C
Invoice Date: 13.08.2023
To: Green-Robinson GSTIN: 14NbnUy622853Z6
SL. Item Description Price Qty. Total
1 Integrated coherent algorithm $266.38 2 $532.76
2 Extended 24/7 architecture $766.59 2 $1533.18
3 Virtual radical budgetary management $686.1 5 $3430.5
Total Amount: $5729.6
Supplier GSTIN: 03rmlrM815749Z9"""

# Extract and print the structured data in a single standard format
result = extract_invoice_data(invoice_text)

# Display result in standard format
print(result)


{'invoice_number': 'INV1360-C', 'invoice_date': '13.08.2023', 'total_amount': 5729.6, 'recipient_name': 'Green-Robinson', 'recipient_gstin': '14NbnUy622853Z6', 'supplier_gstin': '03rmlrM815749Z9', 'items': [{'item_number': 1, 'description': 'Integrated coherent algorithm', 'price': 266.38, 'quantity': 2, 'total': 532.76}, {'item_number': 7, 'description': 'architecture', 'price': 766.59, 'quantity': 2, 'total': 1533.18}, {'item_number': 3, 'description': 'Virtual radical budgetary management', 'price': 686.1, 'quantity': 5, 'total': 3430.5}], 'total_items': 3, 'item_summary': [{'item_number': 1, 'total': 532.76}, {'item_number': 7, 'total': 1533.18}, {'item_number': 3, 'total': 3430.5}]}


In [5]:
import re

def extract_invoice_data(invoice_text):
    # Use more general regex patterns to handle variations in invoice formats
    invoice_number = re.search(r'Invoice\s*(Number|No\.?):\s*(\S+)', invoice_text)
    invoice_date = re.search(r'Invoice\s*Date:\s*(\S+)', invoice_text)
    total_amount = re.search(r'(Total Amount|Amount Due):\s*\$(\S+)', invoice_text)
    recipient_name = re.search(r'To:\s*(.+?)\s+GSTIN', invoice_text)
    recipient_gstin = re.search(r'GSTIN:\s*(\S+)', invoice_text)
    supplier_gstin = re.search(r'Supplier GSTIN:\s*(\S+)', invoice_text)

    # Use regex to extract items
    item_lines = re.findall(r'(\d+)\s+([\w\s]+)\s+\$(\S+)\s+(\d+)\s+\$(\S+)', invoice_text)
    items = []

    # Check for missing fields and handle them
    invoice_number = invoice_number.group(2) if invoice_number else "N/A"
    invoice_date = invoice_date.group(1) if invoice_date else "N/A"
    total_amount = float(total_amount.group(2)) if total_amount else 0.0
    recipient_name = recipient_name.group(1).strip() if recipient_name else "N/A"
    recipient_gstin = recipient_gstin.group(1) if recipient_gstin else "N/A"
    supplier_gstin = supplier_gstin.group(1) if supplier_gstin else "N/A"
    
    for line in item_lines:
        item_number, description, price, quantity, total = line
        items.append({
            'item_number': int(item_number),
            'description': description.strip(),
            'price': float(price),
            'quantity': int(quantity),
            'total': float(total)
        })

    # Standardized format for all outputs
    result = {
        'invoice_number': invoice_number,
        'invoice_date': invoice_date,
        'total_amount': total_amount,
        'recipient_name': recipient_name,
        'recipient_gstin': recipient_gstin,
        'supplier_gstin': supplier_gstin,
        'items': [{'item_number': item['item_number'], 
                   'description': item['description'], 
                   'price': item['price'], 
                   'quantity': item['quantity'], 
                   'total': item['total']} for item in items],
        'total_items': len(items),
        'item_summary': [{'item_number': item['item_number'], 'total': item['total']} for item in items]
    }

    return result

# Test with different invoice formats
invoice_text_1 = """Schmidt Inc INVOICE
Invoice No.: INV1360-C
Invoice Date: 13.08.2023
To: Green-Robinson GSTIN: 14NbnUy622853Z6
SL. Item Description Price Qty. Total
1 Integrated coherent algorithm $266.38 2 $532.76
2 Extended 24/7 architecture $766.59 2 $1533.18
3 Virtual radical budgetary management $686.1 5 $3430.5
Amount Due: $5729.6
Supplier GSTIN: 03rmlrM815749Z9"""

invoice_text_2 = """New Corp Billing
Invoice Number: INV2024-100
Invoice Date: 01.10.2024
To: Blue-Sky Ventures GSTIN: 16WXYZ892345Z6
SL. Item Name Price Quantity Total
1 Custom API Integration $500.00 1 $500.00
2 Cloud Deployment $200.00 1 $200.00
Total Amount: $700.00
Supplier GSTIN: 09ABCD1234567X8"""

# Extract and print the structured data in a single standard format for both invoices
result_1 = extract_invoice_data(invoice_text_1)
result_2 = extract_invoice_data(invoice_text_2)

# Display results in standard format
print("Result 1:", result_1)
print("Result 2:", result_2)


Result 1: {'invoice_number': 'INV1360-C', 'invoice_date': '13.08.2023', 'total_amount': 5729.6, 'recipient_name': 'Green-Robinson', 'recipient_gstin': '14NbnUy622853Z6', 'supplier_gstin': '03rmlrM815749Z9', 'items': [{'item_number': 1, 'description': 'Integrated coherent algorithm', 'price': 266.38, 'quantity': 2, 'total': 532.76}, {'item_number': 7, 'description': 'architecture', 'price': 766.59, 'quantity': 2, 'total': 1533.18}, {'item_number': 3, 'description': 'Virtual radical budgetary management', 'price': 686.1, 'quantity': 5, 'total': 3430.5}], 'total_items': 3, 'item_summary': [{'item_number': 1, 'total': 532.76}, {'item_number': 7, 'total': 1533.18}, {'item_number': 3, 'total': 3430.5}]}
Result 2: {'invoice_number': 'INV2024-100', 'invoice_date': '01.10.2024', 'total_amount': 700.0, 'recipient_name': 'Blue-Sky Ventures', 'recipient_gstin': '16WXYZ892345Z6', 'supplier_gstin': '09ABCD1234567X8', 'items': [{'item_number': 1, 'description': 'Custom API Integration', 'price': 500.