In [None]:
# Importing necessary libraries
import cv2
import easyocr
import numpy as np
import re
from typing import List, Tuple




In [37]:

# Loading and pre-processing the image
def preprocess_img(image_path: str) -> np.ndarray:
    image = cv2.imread(image_path)
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    thresh = cv2.adaptiveThreshold(gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
                                   cv2.THRESH_BINARY, 11, 2)
    denoised = cv2.fastNlMeansDenoising(thresh, h=30)
    return denoised

def extract_text_with_boxes(image: np.ndarray, original_image: np.ndarray) -> Tuple[List[str], np.ndarray]:
    reader = easyocr.Reader(['en'], gpu=True)
    results = reader.readtext(image)
    extracted_text: List[str] = []

    for (bbox, text, prob) in results:
        if prob > 0.5:  # Confidence threshold
            extracted_text.append(text)

            (top_left, top_right, bottom_right, bottom_left) = bbox
            top_left = tuple(map(int, top_left))
            bottom_right = tuple(map(int, bottom_right))
            cv2.rectangle(original_image, top_left, bottom_right, (0, 255, 0), 2)
            cv2.putText(original_image, text, (top_left[0], top_left[1] - 10),
                        cv2.FONT_HERSHEY_SIMPLEX, 0.8, (255, 0, 0), 2)

    return extracted_text, original_image

# Postprocessing
def clean_text(text_list: List[str]) -> List[str]:
    cleaned: List[str] = []
    for text in text_list:
        text = re.sub(r'[^\w\s\.\-\/]', '', text)
        cleaned.append(text.strip())
    return cleaned

# Main function
def run_pipeline(image_path: str) -> str:
    preprocessed = preprocess_img(image_path)
    original_image = cv2.imread(image_path)
    raw_text, boxed_image = extract_text_with_boxes(preprocessed, original_image)
    final_text = clean_text(raw_text)

    print("\nExtracted Text:")
    for line in final_text:
        print(line)

    
    output_path = 'output_with_boxes.png'
    cv2.imwrite(output_path, boxed_image)
    print(f"\nImage with bounding boxes saved to: {output_path}")

    return final_text


# Example usage
final = run_pipeline(r'D:\Tasks\OCR\img\test3.png') 


Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.



Extracted Text:
Tax Invoice
Inyoice Number
FABOFF1900039768
GSTIN
27A4GCC4236P1Z9
Ordcr ID OD133596594395454000
Shlp To
Ordcr Date 10-10-2018
Sudhir Balkishna Rao
Sudhir Balkrishna Rap
05DSK Madhuban Apartments
Invalco Datc 11-102018
Mehra Compound road Sakinaka
Mumbal 400072 Maharashtra
Mumbal 408072 Maharashtra
Phone 9049567772
Phona 9049567772
Product
Title
Gross
Discount
Toxabk
cGST
SGST
Totol
Amount
Valuz
UTGST
Samsung
Ton
Star BEE
3107.00
21790.62
3050.69
305069
27892.00
FSN
Rating 2018 Inverter AC
ACNFYXTTNKGTHZAW
White
043PPAKSOO194
30999.00
-3107.00
21790.62
3050.69 3050.69
27892.00
Grand Total
27892.00
ConsultingRooms Privale Limiled
Aulhorized Slgnatory
Fupkart E
teg original packing andinvoice
EZz
Ipage

Image with bounding boxes saved to: output_with_boxes.png


In [38]:
final

['Tax Invoice',
 'Inyoice Number',
 'FABOFF1900039768',
 'GSTIN',
 '27A4GCC4236P1Z9',
 'Ordcr ID OD133596594395454000',
 'Shlp To',
 'Ordcr Date 10-10-2018',
 'Sudhir Balkishna Rao',
 'Sudhir Balkrishna Rap',
 '05DSK Madhuban Apartments',
 'Invalco Datc 11-102018',
 'Mehra Compound road Sakinaka',
 'Mumbal 400072 Maharashtra',
 'Mumbal 408072 Maharashtra',
 'Phone 9049567772',
 'Phona 9049567772',
 'Product',
 'Title',
 'Gross',
 'Discount',
 'Toxabk',
 'cGST',
 'SGST',
 'Totol',
 'Amount',
 'Valuz',
 'UTGST',
 'Samsung',
 'Ton',
 'Star BEE',
 '3107.00',
 '21790.62',
 '3050.69',
 '305069',
 '27892.00',
 'FSN',
 'Rating 2018 Inverter AC',
 'ACNFYXTTNKGTHZAW',
 'White',
 '043PPAKSOO194',
 '30999.00',
 '-3107.00',
 '21790.62',
 '3050.69 3050.69',
 '27892.00',
 'Grand Total',
 '27892.00',
 'ConsultingRooms Privale Limiled',
 'Aulhorized Slgnatory',
 'Fupkart E',
 'teg original packing andinvoice',
 'EZz',
 'Ipage']

In [39]:

final_lower = [item.lower() for item in final]


In [40]:
final_lower

['tax invoice',
 'inyoice number',
 'faboff1900039768',
 'gstin',
 '27a4gcc4236p1z9',
 'ordcr id od133596594395454000',
 'shlp to',
 'ordcr date 10-10-2018',
 'sudhir balkishna rao',
 'sudhir balkrishna rap',
 '05dsk madhuban apartments',
 'invalco datc 11-102018',
 'mehra compound road sakinaka',
 'mumbal 400072 maharashtra',
 'mumbal 408072 maharashtra',
 'phone 9049567772',
 'phona 9049567772',
 'product',
 'title',
 'gross',
 'discount',
 'toxabk',
 'cgst',
 'sgst',
 'totol',
 'amount',
 'valuz',
 'utgst',
 'samsung',
 'ton',
 'star bee',
 '3107.00',
 '21790.62',
 '3050.69',
 '305069',
 '27892.00',
 'fsn',
 'rating 2018 inverter ac',
 'acnfyxttnkgthzaw',
 'white',
 '043ppaksoo194',
 '30999.00',
 '-3107.00',
 '21790.62',
 '3050.69 3050.69',
 '27892.00',
 'grand total',
 '27892.00',
 'consultingrooms privale limiled',
 'aulhorized slgnatory',
 'fupkart e',
 'teg original packing andinvoice',
 'ezz',
 'ipage']

In [41]:

from textblob import TextBlob

corrected_text = [str(TextBlob(word).correct()) for word in final_lower]


In [42]:
corrected_text

['tax voice',
 'inyoice number',
 'faboff1900039768',
 'skin',
 '27a4gcc4236p1z9',
 'order id od133596594395454000',
 'ship to',
 'order date 10-10-2018',
 'sudhir balkishna ran',
 'sudhir balkrishna ran',
 '05dsk madhuban apartments',
 'invalid date 11-102018',
 'mere compound road sakinaka',
 'mutual 400072 maharashtra',
 'mutual 408072 maharashtra',
 'phone 9049567772',
 'phone 9049567772',
 'product',
 'title',
 'gross',
 'discount',
 'toxabk',
 'cyst',
 'must',
 'total',
 'amount',
 'value',
 'test',
 'samson',
 'ton',
 'star bee',
 '3107.00',
 '21790.62',
 '3050.69',
 '305069',
 '27892.00',
 'isn',
 'rating 2018 inverted ac',
 'acnfyxttnkgthzaw',
 'white',
 '043ppaksoo194',
 '30999.00',
 '-3107.00',
 '21790.62',
 '3050.69 3050.69',
 '27892.00',
 'grand total',
 '27892.00',
 'consultingrooms private limited',
 'authorized slgnatory',
 'fupkart e',
 'ten original packing andinvoice',
 'end',
 'page']

In [43]:

import spacy
import pandas as pd

# loading spacy's large english model
nlp = spacy.load("en_core_web_lg")


In [44]:

entities = []
for text in corrected_text:
    doc = nlp(text)
    for ent in doc.ents:
        entities.append((text, ent.text, ent.label_))



df = pd.DataFrame(entities, columns=["Original Text", "Entity", "Label"])
print(df)


                      Original Text                           Entity     Label
0                  faboff1900039768                 faboff1900039768    PERSON
1                   27a4gcc4236p1z9                  27a4gcc4236p1z9  CARDINAL
2     order id od133596594395454000          id od133596594395454000       ORG
3              sudhir balkishna ran                 sudhir balkishna    PERSON
4             sudhir balkrishna ran                sudhir balkrishna    PERSON
5         05dsk madhuban apartments                            05dsk  CARDINAL
6            invalid date 11-102018                        11-102018      DATE
7         mutual 400072 maharashtra                           400072  CARDINAL
8         mutual 400072 maharashtra                      maharashtra       GPE
9         mutual 408072 maharashtra                           408072  CARDINAL
10                 phone 9049567772                       9049567772  CARDINAL
11                 phone 9049567772                 

In [45]:

df.to_csv("ner_results.csv", index=False)

print("NER results saved to ner_results.csv")


NER results saved to ner_results.csv
