<a href="https://colab.research.google.com/github/chauhanneha16/Invoice/blob/main/Copy_of_Invoice_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
pip install opencv-python



In [None]:
!pip install easyocr

Collecting easyocr
  Downloading easyocr-1.7.1-py3-none-any.whl (2.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.9/2.9 MB[0m [31m9.8 MB/s[0m eta [36m0:00:00[0m
Collecting python-bidi (from easyocr)
  Downloading python_bidi-0.4.2-py2.py3-none-any.whl (30 kB)
Collecting pyclipper (from easyocr)
  Downloading pyclipper-1.3.0.post5-cp310-cp310-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (908 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m908.3/908.3 kB[0m [31m23.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting ninja (from easyocr)
  Downloading ninja-1.11.1.1-py2.py3-none-manylinux1_x86_64.manylinux_2_5_x86_64.whl (307 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m307.2/307.2 kB[0m [31m16.8 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch->easyocr)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==1

In [None]:
import easyocr
import pandas as pd
import glob
import cv2
import re

def cleanup_text(text):
    """Return ``text`` without non-ASCII characters and without outer whitespace.

    Only characters whose code point is below 128 are kept, so the result can
    be drawn on an image with OpenCV.
    """
    ascii_chars = [ch for ch in text if ord(ch) < 128]
    return "".join(ascii_chars).strip()


def display_image(image, results):
    """Draw OCR bounding boxes and labels on an image and show it in a window.

    Args:
        image: Path of the image file; it is loaded with ``cv2.imread``.
        results: Iterable of ``(bbox, text)`` pairs. ``bbox`` holds four
            corner points in (top-left, top-right, bottom-right, bottom-left)
            order, each indexable as ``[x, y]``.

    Raises:
        FileNotFoundError: If ``cv2.imread`` could not load ``image``.
    """
    image_file = cv2.imread(image)
    if image_file is None:
        # cv2.imread reports an unreadable path by returning None; fail here
        # with a clear message instead of deep inside cv2.rectangle.
        raise FileNotFoundError('Could not read image: ' + str(image))

    for (bbox, text) in results:
        # unpack the bounding box; only the two opposite corners are needed
        # to draw the rectangle
        (tl, _tr, br, _bl) = bbox
        tl = (int(tl[0]), int(tl[1]))
        br = (int(br[0]), int(br[1]))
        # cleanup the text and draw the box surrounding the text along
        # with the OCR'd text itself
        text = cleanup_text(text)
        cv2.rectangle(image_file, tl, br, (0, 255, 0), 2)
        cv2.putText(image_file, text, (tl[0], tl[1] - 10),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.8, (0, 0, 255), 2)

    # show the output image_file and block until a key is pressed
    # NOTE(review): cv2.imshow needs a GUI backend; hosted notebooks such as
    # Colab typically do not provide one -- confirm before enabling the call.
    cv2.imshow("Image", image_file)
    cv2.waitKey(0)
    # Close the window once the key press has been received so it does not
    # linger for the lifetime of the kernel.
    cv2.destroyWindow("Image")


# Header row of the results table; every successfully parsed invoice appends
# one row with the same column order.
data_output = [
    [
        'Invoice number',
        'Country',
        'Subtotal amount',
        'VAT ID number',
        'VAT %',
        'VAT Amount',
        'Total amount',
    ]
]

img = glob.glob('/content/290902362-e6887b15-c19b-417e-9144-33b1d88085ee.png')

# Field patterns, defined once by name so each can be inspected on its own.
# Matches the invoice number that follows the label "INVOICE #:".
# NOTE(review): the recorded OCR output for the sample image prints the label
# without a colon and on a separate line from the number, so this pattern does
# not match that layout -- confirm which invoice template is targeted.
INVOICE_NUMBER_RE = re.compile(r'INVOICE #:\s*(\w+-\d{3})')
COUNTRY_RE = re.compile(r'Address:\s*(.+?)\s*\n\n', re.DOTALL)
SUBTOTAL_RE = re.compile(r'Subtotal \(USD\):\s*(.+?)\s*\n')
VAT_NUMBER_RE = re.compile(r'VAT number:\s*(.+?)\s*\n')
# Our OCR reader sometimes treats % as 9... So our Regex needs [%9] to match both
VAT_INFO_RE = re.compile(r'VAT \((.+?[%9])\):\s*(.+?)\s*\n')
# Total amount with or without a currency symbol. OCR may read '$' as 'S', so
# both are accepted. (A bare `$` in a pattern is an end-of-string anchor, not
# a literal dollar sign, hence the character class.)
TOTAL_AMOUNT_RE = re.compile(r'Invoice Total \(USD\):\s*[S$]?(\d+\.\d+)')

# Build the OCR reader once and reuse it for every image; constructing it is
# independent of the image being read. Skipped when no image matched.
reader = easyocr.Reader(['en'], gpu=True) if img else None

for image in img:
    results = reader.readtext(
        image,
        # Paragraph is needed to get text grouped together
        paragraph=True,
        # X_threshold is needed to get text grouped together
        # For example, address fields with high X_threshold will be grouped together
        # and will be returned as one text. This is not the desired outcome.
        # Lowering the X_threshold will return the address fields as separate text, which is also bad!
        # PS. Play around with the X_threshold to see the results.
        x_ths=0.3
    )
    # Keep only the recognised text; the bounding boxes are not used for parsing.
    df = pd.DataFrame(results, columns=['bbox', 'text']).drop(columns=['bbox'])

    # Paragraphs are separated by a blank line; COUNTRY_RE relies on that.
    text = '\n\n'.join(df['text'].tolist())
    print(text)

    invoice_number_match = INVOICE_NUMBER_RE.search(text)
    if invoice_number_match:
        invoice_number = invoice_number_match.group(1).strip()
    else:
        print('Invoice number not found for invoice: ' + image)
        continue  # Skip processing this invoice further

    country_match = COUNTRY_RE.search(text)
    if country_match:
        # The country is taken to be the last comma-separated address part.
        country = country_match.group(1).split(',')[-1].strip()
    else:
        print('Country not found for invoice: ' + image)
        continue  # Skip processing this invoice further

    subtotal_amount_match = SUBTOTAL_RE.search(text)
    if subtotal_amount_match:
        subtotal_amount = subtotal_amount_match.group(1).strip()
    else:
        print('Subtotal amount not found for invoice: ' + image)
        continue  # Skip processing this invoice further

    search_vat_number = VAT_NUMBER_RE.search(text)
    # '999 3333 11' is recorded as "no VAT ID" -- presumably a template
    # placeholder value; verify against the invoice source.
    if search_vat_number and search_vat_number.group(1) != '999 3333 11':
        vat_id_number = search_vat_number.group(1).strip()
    else:
        vat_id_number = None

    vat_information = VAT_INFO_RE.search(text)
    if not vat_information:
        # Sometimes, it decides to not treat `0.00` as a number, so we need to handle that
        print('VAT information not found for invoice: ' + image)
        vat_percent = 0
        vat_amount = 0
    else:
        vat_percent = vat_information.group(1).strip()
        vat_amount = vat_information.group(2).strip()

    total_amount_match = TOTAL_AMOUNT_RE.search(text)
    if total_amount_match:
        # Convert the extracted total amount string to float for consistency
        total_amount = float(total_amount_match.group(1))
    else:
        print('Total amount not found for invoice: ' + image)
        continue  # Skip processing this invoice further

    data_output.append([
        invoice_number,
        country,
        subtotal_amount,
        vat_id_number,
        vat_percent,
        vat_amount,
        total_amount,
    ])

    # Uncomment to display image with bounding boxes
    # display_image(image, results)

print(data_output)


INVOICE John Smith 4490 Oak Drive Albany, NY 12210

LOGO

BILL TO Jessie M Horne 4312 Wood Road New York; NY 10031

SHIP TO Jessie M Horne 2019 Redbud Drive New York; NY 10011

INVOICE # INVOICE DATE PO.# DUE DATE

INT-001

11/02/2019

2412/2019

26/02/2019

QTY

DESCRIPTION

UNIT PRICE

AMOUNT

Front and rear brake cables

100.00

100.00

New set of pedal arms

25.00

50.00

Labor 3hrs

15.00

45.00

Sublotal

195.00

Sales Tax 5.0%

9.75

TOTAL

S204.75

RmSwh

TERMS & CONDITIONS

Payment is due within 15 days

Name of Bank Uhank yeu| Account number: 1234567890 Routing: 098765432
Invoice number not found for invoice: /content/290902362-e6887b15-c19b-417e-9144-33b1d88085ee.png
[['Invoice number', 'Country', 'Subtotal amount', 'VAT ID number', 'VAT %', 'VAT Amount', 'Total amount']]
